author     Jonathan Herman <hermanjl@cs.unc.edu>    2012-09-29 13:04:40 -0400
committer  Jonathan Herman <hermanjl@cs.unc.edu>    2012-09-29 13:04:40 -0400
commit     daf1e620bff2cb6d830ef66725369bba9c858f62 (patch)
tree       1aed8f7cb55371c70d2139b6754d90ea89a26147 /kernel/sched.c
parent     451ed3b075c2a8e322e5a44f177e2470426a821d (diff)
parent     1cb90226816c7af7808be4c0de866c54da17ecc9 (diff)
Merge branch 'wip-color' into wip-mc
Conflicts:
include/litmus/budget.h
include/litmus/litmus.h
include/litmus/rt_param.h
include/litmus/sched_trace.h
include/litmus/trace.h
include/trace/events/litmus.h
litmus/Makefile
litmus/budget.c
litmus/ftdev.c
litmus/jobs.c
litmus/litmus.c
litmus/locking.c
litmus/preempt.c
litmus/rt_domain.c
litmus/sched_gsn_edf.c
litmus/trace.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  3440
1 file changed, 1804 insertions, 1636 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 1db6b746845c..d9d591e70b03 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -32,7 +32,6 @@ | |||
32 | #include <linux/init.h> | 32 | #include <linux/init.h> |
33 | #include <linux/uaccess.h> | 33 | #include <linux/uaccess.h> |
34 | #include <linux/highmem.h> | 34 | #include <linux/highmem.h> |
35 | #include <linux/smp_lock.h> | ||
36 | #include <asm/mmu_context.h> | 35 | #include <asm/mmu_context.h> |
37 | #include <linux/interrupt.h> | 36 | #include <linux/interrupt.h> |
38 | #include <linux/capability.h> | 37 | #include <linux/capability.h> |
@@ -75,9 +74,14 @@ | |||
75 | 74 | ||
76 | #include <asm/tlb.h> | 75 | #include <asm/tlb.h> |
77 | #include <asm/irq_regs.h> | 76 | #include <asm/irq_regs.h> |
77 | #include <asm/mutex.h> | ||
78 | 78 | ||
79 | #include "sched_cpupri.h" | 79 | #include "sched_cpupri.h" |
80 | #include "workqueue_sched.h" | 80 | #include "workqueue_sched.h" |
81 | #include "sched_autogroup.h" | ||
82 | |||
83 | #define CREATE_TRACE_POINTS | ||
84 | #include <trace/events/sched.h> | ||
81 | 85 | ||
82 | #define CREATE_TRACE_POINTS | 86 | #define CREATE_TRACE_POINTS |
83 | #include <trace/events/sched.h> | 87 | #include <trace/events/sched.h> |
@@ -235,7 +239,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
235 | #endif | 239 | #endif |
236 | 240 | ||
237 | /* | 241 | /* |
238 | * sched_domains_mutex serializes calls to arch_init_sched_domains, | 242 | * sched_domains_mutex serializes calls to init_sched_domains, |
239 | * detach_destroy_domains and partition_sched_domains. | 243 | * detach_destroy_domains and partition_sched_domains. |
240 | */ | 244 | */ |
241 | static DEFINE_MUTEX(sched_domains_mutex); | 245 | static DEFINE_MUTEX(sched_domains_mutex); |
@@ -258,6 +262,8 @@ struct task_group { | |||
258 | /* runqueue "owned" by this group on each cpu */ | 262 | /* runqueue "owned" by this group on each cpu */ |
259 | struct cfs_rq **cfs_rq; | 263 | struct cfs_rq **cfs_rq; |
260 | unsigned long shares; | 264 | unsigned long shares; |
265 | |||
266 | atomic_t load_weight; | ||
261 | #endif | 267 | #endif |
262 | 268 | ||
263 | #ifdef CONFIG_RT_GROUP_SCHED | 269 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -273,25 +279,18 @@ struct task_group { | |||
273 | struct task_group *parent; | 279 | struct task_group *parent; |
274 | struct list_head siblings; | 280 | struct list_head siblings; |
275 | struct list_head children; | 281 | struct list_head children; |
276 | }; | ||
277 | 282 | ||
278 | #define root_task_group init_task_group | 283 | #ifdef CONFIG_SCHED_AUTOGROUP |
284 | struct autogroup *autogroup; | ||
285 | #endif | ||
286 | }; | ||
279 | 287 | ||
280 | /* task_group_lock serializes add/remove of task groups and also changes to | 288 | /* task_group_lock serializes the addition/removal of task groups */ |
281 | * a task group's cpu shares. | ||
282 | */ | ||
283 | static DEFINE_SPINLOCK(task_group_lock); | 289 | static DEFINE_SPINLOCK(task_group_lock); |
284 | 290 | ||
285 | #ifdef CONFIG_FAIR_GROUP_SCHED | 291 | #ifdef CONFIG_FAIR_GROUP_SCHED |
286 | 292 | ||
287 | #ifdef CONFIG_SMP | 293 | # define ROOT_TASK_GROUP_LOAD NICE_0_LOAD |
288 | static int root_task_group_empty(void) | ||
289 | { | ||
290 | return list_empty(&root_task_group.children); | ||
291 | } | ||
292 | #endif | ||
293 | |||
294 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD | ||
295 | 294 | ||
296 | /* | 295 | /* |
297 | * A weight of 0 or 1 can cause arithmetics problems. | 296 | * A weight of 0 or 1 can cause arithmetics problems. |
@@ -301,16 +300,16 @@ static int root_task_group_empty(void) | |||
301 | * (The default weight is 1024 - so there's no practical | 300 | * (The default weight is 1024 - so there's no practical |
302 | * limitation from this.) | 301 | * limitation from this.) |
303 | */ | 302 | */ |
304 | #define MIN_SHARES 2 | 303 | #define MIN_SHARES (1UL << 1) |
305 | #define MAX_SHARES (1UL << 18) | 304 | #define MAX_SHARES (1UL << 18) |
306 | 305 | ||
307 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; | 306 | static int root_task_group_load = ROOT_TASK_GROUP_LOAD; |
308 | #endif | 307 | #endif |
309 | 308 | ||
310 | /* Default task group. | 309 | /* Default task group. |
311 | * Every task in system belong to this group at bootup. | 310 | * Every task in system belong to this group at bootup. |
312 | */ | 311 | */ |
313 | struct task_group init_task_group; | 312 | struct task_group root_task_group; |
314 | 313 | ||
315 | #endif /* CONFIG_CGROUP_SCHED */ | 314 | #endif /* CONFIG_CGROUP_SCHED */ |
316 | 315 | ||
@@ -321,6 +320,9 @@ struct cfs_rq { | |||
321 | 320 | ||
322 | u64 exec_clock; | 321 | u64 exec_clock; |
323 | u64 min_vruntime; | 322 | u64 min_vruntime; |
323 | #ifndef CONFIG_64BIT | ||
324 | u64 min_vruntime_copy; | ||
325 | #endif | ||
324 | 326 | ||
325 | struct rb_root tasks_timeline; | 327 | struct rb_root tasks_timeline; |
326 | struct rb_node *rb_leftmost; | 328 | struct rb_node *rb_leftmost; |
@@ -332,9 +334,11 @@ struct cfs_rq { | |||
332 | * 'curr' points to currently running entity on this cfs_rq. | 334 | * 'curr' points to currently running entity on this cfs_rq. |
333 | * It is set to NULL otherwise (i.e when none are currently running). | 335 | * It is set to NULL otherwise (i.e when none are currently running). |
334 | */ | 336 | */ |
335 | struct sched_entity *curr, *next, *last; | 337 | struct sched_entity *curr, *next, *last, *skip; |
336 | 338 | ||
339 | #ifdef CONFIG_SCHED_DEBUG | ||
337 | unsigned int nr_spread_over; | 340 | unsigned int nr_spread_over; |
341 | #endif | ||
338 | 342 | ||
339 | #ifdef CONFIG_FAIR_GROUP_SCHED | 343 | #ifdef CONFIG_FAIR_GROUP_SCHED |
340 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | 344 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ |
@@ -347,6 +351,7 @@ struct cfs_rq { | |||
347 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | 351 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This |
348 | * list is used during load balance. | 352 | * list is used during load balance. |
349 | */ | 353 | */ |
354 | int on_list; | ||
350 | struct list_head leaf_cfs_rq_list; | 355 | struct list_head leaf_cfs_rq_list; |
351 | struct task_group *tg; /* group that "owns" this runqueue */ | 356 | struct task_group *tg; /* group that "owns" this runqueue */ |
352 | 357 | ||
@@ -365,14 +370,17 @@ struct cfs_rq { | |||
365 | unsigned long h_load; | 370 | unsigned long h_load; |
366 | 371 | ||
367 | /* | 372 | /* |
368 | * this cpu's part of tg->shares | 373 | * Maintaining per-cpu shares distribution for group scheduling |
374 | * | ||
375 | * load_stamp is the last time we updated the load average | ||
376 | * load_last is the last time we updated the load average and saw load | ||
377 | * load_unacc_exec_time is currently unaccounted execution time | ||
369 | */ | 378 | */ |
370 | unsigned long shares; | 379 | u64 load_avg; |
380 | u64 load_period; | ||
381 | u64 load_stamp, load_last, load_unacc_exec_time; | ||
371 | 382 | ||
372 | /* | 383 | unsigned long load_contribution; |
373 | * load.weight at the time we set shares | ||
374 | */ | ||
375 | unsigned long rq_weight; | ||
376 | #endif | 384 | #endif |
377 | #endif | 385 | #endif |
378 | }; | 386 | }; |
@@ -428,6 +436,7 @@ struct litmus_rq { | |||
428 | */ | 436 | */ |
429 | struct root_domain { | 437 | struct root_domain { |
430 | atomic_t refcount; | 438 | atomic_t refcount; |
439 | struct rcu_head rcu; | ||
431 | cpumask_var_t span; | 440 | cpumask_var_t span; |
432 | cpumask_var_t online; | 441 | cpumask_var_t online; |
433 | 442 | ||
@@ -437,9 +446,7 @@ struct root_domain { | |||
437 | */ | 446 | */ |
438 | cpumask_var_t rto_mask; | 447 | cpumask_var_t rto_mask; |
439 | atomic_t rto_count; | 448 | atomic_t rto_count; |
440 | #ifdef CONFIG_SMP | ||
441 | struct cpupri cpupri; | 449 | struct cpupri cpupri; |
442 | #endif | ||
443 | }; | 450 | }; |
444 | 451 | ||
445 | /* | 452 | /* |
@@ -448,7 +455,7 @@ struct root_domain { | |||
448 | */ | 455 | */ |
449 | static struct root_domain def_root_domain; | 456 | static struct root_domain def_root_domain; |
450 | 457 | ||
451 | #endif | 458 | #endif /* CONFIG_SMP */ |
452 | 459 | ||
453 | /* | 460 | /* |
454 | * This is the main, per-CPU runqueue data structure. | 461 | * This is the main, per-CPU runqueue data structure. |
@@ -473,7 +480,7 @@ struct rq { | |||
473 | u64 nohz_stamp; | 480 | u64 nohz_stamp; |
474 | unsigned char nohz_balance_kick; | 481 | unsigned char nohz_balance_kick; |
475 | #endif | 482 | #endif |
476 | unsigned int skip_clock_update; | 483 | int skip_clock_update; |
477 | 484 | ||
478 | /* capture load from *all* tasks on this cpu: */ | 485 | /* capture load from *all* tasks on this cpu: */ |
479 | struct load_weight load; | 486 | struct load_weight load; |
@@ -500,11 +507,12 @@ struct rq { | |||
500 | */ | 507 | */ |
501 | unsigned long nr_uninterruptible; | 508 | unsigned long nr_uninterruptible; |
502 | 509 | ||
503 | struct task_struct *curr, *idle; | 510 | struct task_struct *curr, *idle, *stop; |
504 | unsigned long next_balance; | 511 | unsigned long next_balance; |
505 | struct mm_struct *prev_mm; | 512 | struct mm_struct *prev_mm; |
506 | 513 | ||
507 | u64 clock; | 514 | u64 clock; |
515 | u64 clock_task; | ||
508 | 516 | ||
509 | atomic_t nr_iowait; | 517 | atomic_t nr_iowait; |
510 | 518 | ||
@@ -532,6 +540,10 @@ struct rq { | |||
532 | u64 avg_idle; | 540 | u64 avg_idle; |
533 | #endif | 541 | #endif |
534 | 542 | ||
543 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
544 | u64 prev_irq_time; | ||
545 | #endif | ||
546 | |||
535 | /* calc_load related fields */ | 547 | /* calc_load related fields */ |
536 | unsigned long calc_load_update; | 548 | unsigned long calc_load_update; |
537 | long calc_load_active; | 549 | long calc_load_active; |
@@ -561,32 +573,17 @@ struct rq { | |||
561 | /* try_to_wake_up() stats */ | 573 | /* try_to_wake_up() stats */ |
562 | unsigned int ttwu_count; | 574 | unsigned int ttwu_count; |
563 | unsigned int ttwu_local; | 575 | unsigned int ttwu_local; |
576 | #endif | ||
564 | 577 | ||
565 | /* BKL stats */ | 578 | #ifdef CONFIG_SMP |
566 | unsigned int bkl_count; | 579 | struct task_struct *wake_list; |
567 | #endif | 580 | #endif |
568 | }; | 581 | }; |
569 | 582 | ||
570 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 583 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
571 | 584 | ||
572 | static inline | ||
573 | void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | ||
574 | { | ||
575 | rq->curr->sched_class->check_preempt_curr(rq, p, flags); | ||
576 | 585 | ||
577 | /* | 586 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); |
578 | * A queue event has occurred, and we're going to schedule. In | ||
579 | * this case, we can save a useless back to back clock update. | ||
580 | */ | ||
581 | /* LITMUS^RT: turning off the clock update is buggy in Linux 2.6.36; | ||
582 | * the scheduler can "forget" to renable the runqueue clock in some | ||
583 | * cases. LITMUS^RT amplifies the effects of this problem. Hence, we | ||
584 | * turn it off to avoid stalling clocks. */ | ||
585 | /* | ||
586 | if (test_tsk_need_resched(p)) | ||
587 | rq->skip_clock_update = 1; | ||
588 | */ | ||
589 | } | ||
590 | 587 | ||
591 | static inline int cpu_of(struct rq *rq) | 588 | static inline int cpu_of(struct rq *rq) |
592 | { | 589 | { |
@@ -599,7 +596,7 @@ static inline int cpu_of(struct rq *rq) | |||
599 | 596 | ||
600 | #define rcu_dereference_check_sched_domain(p) \ | 597 | #define rcu_dereference_check_sched_domain(p) \ |
601 | rcu_dereference_check((p), \ | 598 | rcu_dereference_check((p), \ |
602 | rcu_read_lock_sched_held() || \ | 599 | rcu_read_lock_held() || \ |
603 | lockdep_is_held(&sched_domains_mutex)) | 600 | lockdep_is_held(&sched_domains_mutex)) |
604 | 601 | ||
605 | /* | 602 | /* |
@@ -623,18 +620,22 @@ static inline int cpu_of(struct rq *rq) | |||
623 | /* | 620 | /* |
624 | * Return the group to which this tasks belongs. | 621 | * Return the group to which this tasks belongs. |
625 | * | 622 | * |
626 | * We use task_subsys_state_check() and extend the RCU verification | 623 | * We use task_subsys_state_check() and extend the RCU verification with |
627 | * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() | 624 | * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each |
628 | * holds that lock for each task it moves into the cgroup. Therefore | 625 | * task it moves into the cgroup. Therefore by holding either of those locks, |
629 | * by holding that lock, we pin the task to the current cgroup. | 626 | * we pin the task to the current cgroup. |
630 | */ | 627 | */ |
631 | static inline struct task_group *task_group(struct task_struct *p) | 628 | static inline struct task_group *task_group(struct task_struct *p) |
632 | { | 629 | { |
630 | struct task_group *tg; | ||
633 | struct cgroup_subsys_state *css; | 631 | struct cgroup_subsys_state *css; |
634 | 632 | ||
635 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, | 633 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, |
634 | lockdep_is_held(&p->pi_lock) || | ||
636 | lockdep_is_held(&task_rq(p)->lock)); | 635 | lockdep_is_held(&task_rq(p)->lock)); |
637 | return container_of(css, struct task_group, css); | 636 | tg = container_of(css, struct task_group, css); |
637 | |||
638 | return autogroup_task_group(p, tg); | ||
638 | } | 639 | } |
639 | 640 | ||
640 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | 641 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ |
@@ -661,10 +662,18 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
661 | 662 | ||
662 | #endif /* CONFIG_CGROUP_SCHED */ | 663 | #endif /* CONFIG_CGROUP_SCHED */ |
663 | 664 | ||
664 | inline void update_rq_clock(struct rq *rq) | 665 | static void update_rq_clock_task(struct rq *rq, s64 delta); |
666 | |||
667 | static void update_rq_clock(struct rq *rq) | ||
665 | { | 668 | { |
666 | if (!rq->skip_clock_update) | 669 | s64 delta; |
667 | rq->clock = sched_clock_cpu(cpu_of(rq)); | 670 | |
671 | if (rq->skip_clock_update > 0) | ||
672 | return; | ||
673 | |||
674 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; | ||
675 | rq->clock += delta; | ||
676 | update_rq_clock_task(rq, delta); | ||
668 | } | 677 | } |
669 | 678 | ||
670 | /* | 679 | /* |
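The rewritten update_rq_clock() above stops overwriting rq->clock and instead accumulates a delta, handing it to update_rq_clock_task() so that time spent servicing interrupts can be kept out of rq->clock_task (the clamping itself appears further down, in the CONFIG_IRQ_TIME_ACCOUNTING hunk). Below is a minimal userspace sketch of that delta/clamp idea, with both steps folded into one function and every name invented for the illustration:

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for rq->clock / rq->clock_task / rq->prev_irq_time. */
struct fake_rq {
    uint64_t clock;          /* total wall-clock nanoseconds observed    */
    uint64_t clock_task;     /* nanoseconds actually available to tasks  */
    uint64_t prev_irq_time;  /* irq time already folded into clock_task  */
};

/* Mirror of the idea above: clamp the irq share so clock_task stays monotonic. */
static void fake_update_rq_clock(struct fake_rq *rq, uint64_t now,
                                 uint64_t irq_time_total)
{
    int64_t delta = (int64_t)(now - rq->clock);
    int64_t irq_delta = (int64_t)(irq_time_total - rq->prev_irq_time);

    rq->clock += delta;

    if (irq_delta > delta)       /* irq sample can run ahead of rq->clock */
        irq_delta = delta;

    rq->prev_irq_time += irq_delta;
    rq->clock_task += delta - irq_delta;  /* only non-irq time is charged */
}

int main(void)
{
    struct fake_rq rq = {0};
    fake_update_rq_clock(&rq, 1000, 200);  /* 1000ns elapsed, 200ns in irq  */
    fake_update_rq_clock(&rq, 1500, 800);  /* irq_delta (600) > delta (500) */
    printf("clock=%llu clock_task=%llu\n",
           (unsigned long long)rq.clock, (unsigned long long)rq.clock_task);
    return 0;
}

The clamp is what keeps clock_task monotonic when the sampled irq time runs ahead of the last rq->clock sample.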
@@ -677,10 +686,9 @@ inline void update_rq_clock(struct rq *rq) | |||
677 | #endif | 686 | #endif |
678 | 687 | ||
679 | /** | 688 | /** |
680 | * runqueue_is_locked | 689 | * runqueue_is_locked - Returns true if the current cpu runqueue is locked |
681 | * @cpu: the processor in question. | 690 | * @cpu: the processor in question. |
682 | * | 691 | * |
683 | * Returns true if the current cpu runqueue is locked. | ||
684 | * This interface allows printk to be called with the runqueue lock | 692 | * This interface allows printk to be called with the runqueue lock |
685 | * held and know whether or not it is OK to wake up the klogd. | 693 | * held and know whether or not it is OK to wake up the klogd. |
686 | */ | 694 | */ |
@@ -741,7 +749,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
741 | size_t cnt, loff_t *ppos) | 749 | size_t cnt, loff_t *ppos) |
742 | { | 750 | { |
743 | char buf[64]; | 751 | char buf[64]; |
744 | char *cmp = buf; | 752 | char *cmp; |
745 | int neg = 0; | 753 | int neg = 0; |
746 | int i; | 754 | int i; |
747 | 755 | ||
@@ -752,16 +760,15 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
752 | return -EFAULT; | 760 | return -EFAULT; |
753 | 761 | ||
754 | buf[cnt] = 0; | 762 | buf[cnt] = 0; |
763 | cmp = strstrip(buf); | ||
755 | 764 | ||
756 | if (strncmp(buf, "NO_", 3) == 0) { | 765 | if (strncmp(cmp, "NO_", 3) == 0) { |
757 | neg = 1; | 766 | neg = 1; |
758 | cmp += 3; | 767 | cmp += 3; |
759 | } | 768 | } |
760 | 769 | ||
761 | for (i = 0; sched_feat_names[i]; i++) { | 770 | for (i = 0; sched_feat_names[i]; i++) { |
762 | int len = strlen(sched_feat_names[i]); | 771 | if (strcmp(cmp, sched_feat_names[i]) == 0) { |
763 | |||
764 | if (strncmp(cmp, sched_feat_names[i], len) == 0) { | ||
765 | if (neg) | 772 | if (neg) |
766 | sysctl_sched_features &= ~(1UL << i); | 773 | sysctl_sched_features &= ~(1UL << i); |
767 | else | 774 | else |
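The hunk above changes the sched_features sysctl parser from a length-limited strncmp() to strstrip() plus an exact strcmp(), so a write such as "NO_FOO_BAR" can no longer toggle a feature named "FOO". A small standalone sketch of the new matching behaviour; the feature names and helper below are made up for the example:

#include <stdio.h>
#include <string.h>

static const char *fake_feat_names[] = { "START_DEBIT", "HRTICK", "LB_BIAS", NULL };
static unsigned long fake_features = 0x3;

static int fake_feat_write(char *buf)
{
    int neg = 0;
    char *cmp = buf;

    /* strstrip() equivalent: drop the trailing newline from the write. */
    size_t len = strlen(cmp);
    if (len && cmp[len - 1] == '\n')
        cmp[len - 1] = '\0';

    if (strncmp(cmp, "NO_", 3) == 0) {
        neg = 1;
        cmp += 3;
    }

    for (int i = 0; fake_feat_names[i]; i++) {
        if (strcmp(cmp, fake_feat_names[i]) == 0) {  /* exact match only */
            if (neg)
                fake_features &= ~(1UL << i);
            else
                fake_features |= 1UL << i;
            return 0;
        }
    }
    return -1;  /* unknown feature name */
}

int main(void)
{
    char buf[] = "NO_HRTICK\n";
    printf("%d features=%#lx\n", fake_feat_write(buf), fake_features);
    return 0;
}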
@@ -811,20 +818,6 @@ late_initcall(sched_init_debug); | |||
811 | const_debug unsigned int sysctl_sched_nr_migrate = 32; | 818 | const_debug unsigned int sysctl_sched_nr_migrate = 32; |
812 | 819 | ||
813 | /* | 820 | /* |
814 | * ratelimit for updating the group shares. | ||
815 | * default: 0.25ms | ||
816 | */ | ||
817 | unsigned int sysctl_sched_shares_ratelimit = 250000; | ||
818 | unsigned int normalized_sysctl_sched_shares_ratelimit = 250000; | ||
819 | |||
820 | /* | ||
821 | * Inject some fuzzyness into changing the per-cpu group shares | ||
822 | * this avoids remote rq-locks at the expense of fairness. | ||
823 | * default: 4 | ||
824 | */ | ||
825 | unsigned int sysctl_sched_shares_thresh = 4; | ||
826 | |||
827 | /* | ||
828 | * period over which we average the RT time consumption, measured | 821 | * period over which we average the RT time consumption, measured |
829 | * in ms. | 822 | * in ms. |
830 | * | 823 | * |
@@ -871,18 +864,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p) | |||
871 | return rq->curr == p; | 864 | return rq->curr == p; |
872 | } | 865 | } |
873 | 866 | ||
874 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
875 | static inline int task_running(struct rq *rq, struct task_struct *p) | 867 | static inline int task_running(struct rq *rq, struct task_struct *p) |
876 | { | 868 | { |
869 | #ifdef CONFIG_SMP | ||
870 | return p->on_cpu; | ||
871 | #else | ||
877 | return task_current(rq, p); | 872 | return task_current(rq, p); |
873 | #endif | ||
878 | } | 874 | } |
879 | 875 | ||
876 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
880 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 877 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
881 | { | 878 | { |
879 | #ifdef CONFIG_SMP | ||
880 | /* | ||
881 | * We can optimise this out completely for !SMP, because the | ||
882 | * SMP rebalancing from interrupt is the only thing that cares | ||
883 | * here. | ||
884 | */ | ||
885 | next->on_cpu = 1; | ||
886 | #endif | ||
882 | } | 887 | } |
883 | 888 | ||
884 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | 889 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
885 | { | 890 | { |
891 | #ifdef CONFIG_SMP | ||
892 | /* | ||
893 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | ||
894 | * We must ensure this doesn't happen until the switch is completely | ||
895 | * finished. | ||
896 | */ | ||
897 | smp_wmb(); | ||
898 | prev->on_cpu = 0; | ||
899 | #endif | ||
886 | #ifdef CONFIG_DEBUG_SPINLOCK | 900 | #ifdef CONFIG_DEBUG_SPINLOCK |
887 | /* this is a valid case when another task releases the spinlock */ | 901 | /* this is a valid case when another task releases the spinlock */ |
888 | rq->lock.owner = current; | 902 | rq->lock.owner = current; |
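The hunk above moves task_running() out of the __ARCH_WANT_UNLOCKED_CTXSW special case and bases it on the renamed ->on_cpu flag, which prepare_lock_switch() sets and finish_lock_switch() clears only after an smp_wmb(). Here is a userspace analogue of that handshake using C11 atomics; it is an illustration only, and the names and the atomic flavour are simplifications rather than the kernel's barrier primitives:

#include <stdatomic.h>
#include <stdbool.h>

struct fake_task {
    atomic_bool on_cpu;   /* "task is still running on some CPU" flag */
};

static void fake_prepare_lock_switch(struct fake_task *next)
{
    /* Mark next as running before the switch becomes visible elsewhere. */
    atomic_store_explicit(&next->on_cpu, true, memory_order_relaxed);
}

static void fake_finish_lock_switch(struct fake_task *prev)
{
    /* Everything the switch did must be visible before on_cpu is cleared,
     * because a remote CPU may migrate prev as soon as it sees on_cpu == 0. */
    atomic_store_explicit(&prev->on_cpu, false, memory_order_release);
}

static bool fake_task_running(struct fake_task *p)
{
    /* Mirrors the new task_running(): with SMP the flag itself is the answer. */
    return atomic_load_explicit(&p->on_cpu, memory_order_acquire);
}

int main(void)
{
    struct fake_task prev, next;
    atomic_init(&prev.on_cpu, true);
    atomic_init(&next.on_cpu, false);
    fake_prepare_lock_switch(&next);
    fake_finish_lock_switch(&prev);
    return fake_task_running(&prev) ? 1 : 0;  /* prev may now be migrated */
}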
@@ -898,15 +912,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
898 | } | 912 | } |
899 | 913 | ||
900 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | 914 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ |
901 | static inline int task_running(struct rq *rq, struct task_struct *p) | ||
902 | { | ||
903 | #ifdef CONFIG_SMP | ||
904 | return p->oncpu; | ||
905 | #else | ||
906 | return task_current(rq, p); | ||
907 | #endif | ||
908 | } | ||
909 | |||
910 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 915 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
911 | { | 916 | { |
912 | #ifdef CONFIG_SMP | 917 | #ifdef CONFIG_SMP |
@@ -915,7 +920,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | |||
915 | * SMP rebalancing from interrupt is the only thing that cares | 920 | * SMP rebalancing from interrupt is the only thing that cares |
916 | * here. | 921 | * here. |
917 | */ | 922 | */ |
918 | next->oncpu = 1; | 923 | next->on_cpu = 1; |
919 | #endif | 924 | #endif |
920 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 925 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
921 | raw_spin_unlock_irq(&rq->lock); | 926 | raw_spin_unlock_irq(&rq->lock); |
@@ -928,12 +933,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
928 | { | 933 | { |
929 | #ifdef CONFIG_SMP | 934 | #ifdef CONFIG_SMP |
930 | /* | 935 | /* |
931 | * After ->oncpu is cleared, the task can be moved to a different CPU. | 936 | * After ->on_cpu is cleared, the task can be moved to a different CPU. |
932 | * We must ensure this doesn't happen until the switch is completely | 937 | * We must ensure this doesn't happen until the switch is completely |
933 | * finished. | 938 | * finished. |
934 | */ | 939 | */ |
935 | smp_wmb(); | 940 | smp_wmb(); |
936 | prev->oncpu = 0; | 941 | prev->on_cpu = 0; |
937 | #endif | 942 | #endif |
938 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 943 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
939 | local_irq_enable(); | 944 | local_irq_enable(); |
@@ -942,23 +947,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
942 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 947 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
943 | 948 | ||
944 | /* | 949 | /* |
945 | * Check whether the task is waking, we use this to synchronize ->cpus_allowed | 950 | * __task_rq_lock - lock the rq @p resides on. |
946 | * against ttwu(). | ||
947 | */ | ||
948 | static inline int task_is_waking(struct task_struct *p) | ||
949 | { | ||
950 | return unlikely(p->state == TASK_WAKING); | ||
951 | } | ||
952 | |||
953 | /* | ||
954 | * __task_rq_lock - lock the runqueue a given task resides on. | ||
955 | * Must be called interrupts disabled. | ||
956 | */ | 951 | */ |
957 | static inline struct rq *__task_rq_lock(struct task_struct *p) | 952 | static inline struct rq *__task_rq_lock(struct task_struct *p) |
958 | __acquires(rq->lock) | 953 | __acquires(rq->lock) |
959 | { | 954 | { |
960 | struct rq *rq; | 955 | struct rq *rq; |
961 | 956 | ||
957 | lockdep_assert_held(&p->pi_lock); | ||
958 | |||
962 | for (;;) { | 959 | for (;;) { |
963 | rq = task_rq(p); | 960 | rq = task_rq(p); |
964 | raw_spin_lock(&rq->lock); | 961 | raw_spin_lock(&rq->lock); |
@@ -969,22 +966,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) | |||
969 | } | 966 | } |
970 | 967 | ||
971 | /* | 968 | /* |
972 | * task_rq_lock - lock the runqueue a given task resides on and disable | 969 | * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. |
973 | * interrupts. Note the ordering: we can safely lookup the task_rq without | ||
974 | * explicitly disabling preemption. | ||
975 | */ | 970 | */ |
976 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | 971 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) |
972 | __acquires(p->pi_lock) | ||
977 | __acquires(rq->lock) | 973 | __acquires(rq->lock) |
978 | { | 974 | { |
979 | struct rq *rq; | 975 | struct rq *rq; |
980 | 976 | ||
981 | for (;;) { | 977 | for (;;) { |
982 | local_irq_save(*flags); | 978 | raw_spin_lock_irqsave(&p->pi_lock, *flags); |
983 | rq = task_rq(p); | 979 | rq = task_rq(p); |
984 | raw_spin_lock(&rq->lock); | 980 | raw_spin_lock(&rq->lock); |
985 | if (likely(rq == task_rq(p))) | 981 | if (likely(rq == task_rq(p))) |
986 | return rq; | 982 | return rq; |
987 | raw_spin_unlock_irqrestore(&rq->lock, *flags); | 983 | raw_spin_unlock(&rq->lock); |
984 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); | ||
988 | } | 985 | } |
989 | } | 986 | } |
990 | 987 | ||
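task_rq_lock() above now nests p->pi_lock outside rq->lock and retries if the task migrated between reading task_rq(p) and acquiring the runqueue lock. Below is a compilable userspace sketch of that acquire-check-retry pattern using pthread mutexes; every name is invented for the illustration:

#include <pthread.h>

struct fake_rq   { pthread_mutex_t lock; };
struct fake_task { pthread_mutex_t pi_lock; struct fake_rq *rq; };

static struct fake_rq *fake_task_rq_lock(struct fake_task *p)
{
    for (;;) {
        pthread_mutex_lock(&p->pi_lock);       /* outer lock: pins wakeups */
        struct fake_rq *rq = p->rq;            /* snapshot the current rq  */
        pthread_mutex_lock(&rq->lock);         /* inner lock: the runqueue */
        if (rq == p->rq)                       /* still on the same rq?    */
            return rq;                         /* both locks held          */
        pthread_mutex_unlock(&rq->lock);       /* raced with a migration:  */
        pthread_mutex_unlock(&p->pi_lock);     /* drop both and retry      */
    }
}

static void fake_task_rq_unlock(struct fake_rq *rq, struct fake_task *p)
{
    pthread_mutex_unlock(&rq->lock);           /* release in reverse order */
    pthread_mutex_unlock(&p->pi_lock);
}

int main(void)
{
    struct fake_rq rq0 = { PTHREAD_MUTEX_INITIALIZER };
    struct fake_task p = { PTHREAD_MUTEX_INITIALIZER, &rq0 };
    struct fake_rq *rq = fake_task_rq_lock(&p);
    fake_task_rq_unlock(rq, &p);
    return 0;
}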
@@ -994,10 +991,13 @@ static void __task_rq_unlock(struct rq *rq) | |||
994 | raw_spin_unlock(&rq->lock); | 991 | raw_spin_unlock(&rq->lock); |
995 | } | 992 | } |
996 | 993 | ||
997 | static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) | 994 | static inline void |
995 | task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) | ||
998 | __releases(rq->lock) | 996 | __releases(rq->lock) |
997 | __releases(p->pi_lock) | ||
999 | { | 998 | { |
1000 | raw_spin_unlock_irqrestore(&rq->lock, *flags); | 999 | raw_spin_unlock(&rq->lock); |
1000 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); | ||
1001 | } | 1001 | } |
1002 | 1002 | ||
1003 | /* | 1003 | /* |
@@ -1227,11 +1227,17 @@ int get_nohz_timer_target(void) | |||
1227 | int i; | 1227 | int i; |
1228 | struct sched_domain *sd; | 1228 | struct sched_domain *sd; |
1229 | 1229 | ||
1230 | rcu_read_lock(); | ||
1230 | for_each_domain(cpu, sd) { | 1231 | for_each_domain(cpu, sd) { |
1231 | for_each_cpu(i, sched_domain_span(sd)) | 1232 | for_each_cpu(i, sched_domain_span(sd)) { |
1232 | if (!idle_cpu(i)) | 1233 | if (!idle_cpu(i)) { |
1233 | return i; | 1234 | cpu = i; |
1235 | goto unlock; | ||
1236 | } | ||
1237 | } | ||
1234 | } | 1238 | } |
1239 | unlock: | ||
1240 | rcu_read_unlock(); | ||
1235 | return cpu; | 1241 | return cpu; |
1236 | } | 1242 | } |
1237 | /* | 1243 | /* |
@@ -1341,15 +1347,27 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, | |||
1341 | { | 1347 | { |
1342 | u64 tmp; | 1348 | u64 tmp; |
1343 | 1349 | ||
1350 | /* | ||
1351 | * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched | ||
1352 | * entities since MIN_SHARES = 2. Treat weight as 1 if less than | ||
1353 | * 2^SCHED_LOAD_RESOLUTION. | ||
1354 | */ | ||
1355 | if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) | ||
1356 | tmp = (u64)delta_exec * scale_load_down(weight); | ||
1357 | else | ||
1358 | tmp = (u64)delta_exec; | ||
1359 | |||
1344 | if (!lw->inv_weight) { | 1360 | if (!lw->inv_weight) { |
1345 | if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) | 1361 | unsigned long w = scale_load_down(lw->weight); |
1362 | |||
1363 | if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) | ||
1346 | lw->inv_weight = 1; | 1364 | lw->inv_weight = 1; |
1365 | else if (unlikely(!w)) | ||
1366 | lw->inv_weight = WMULT_CONST; | ||
1347 | else | 1367 | else |
1348 | lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) | 1368 | lw->inv_weight = WMULT_CONST / w; |
1349 | / (lw->weight+1); | ||
1350 | } | 1369 | } |
1351 | 1370 | ||
1352 | tmp = (u64)delta_exec * weight; | ||
1353 | /* | 1371 | /* |
1354 | * Check whether we'd overflow the 64-bit multiplication: | 1372 | * Check whether we'd overflow the 64-bit multiplication: |
1355 | */ | 1373 | */ |
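calc_delta_mine() above computes delta_exec * weight / lw->weight without a 64-bit division on the hot path: the reciprocal of the (scaled-down) weight is cached in lw->inv_weight and the divide becomes a multiply plus a shift. A simplified worked example of that fixed-point trick follows; the constants are illustrative, not the kernel's WMULT_CONST/WMULT_SHIFT, and the overflow handling the real function performs is omitted:

#include <stdint.h>
#include <stdio.h>

#define FIX_SHIFT 32
#define FIX_ONE   (1ULL << FIX_SHIFT)

struct fake_load_weight {
    unsigned long weight;
    uint64_t inv_weight;          /* ~ FIX_ONE / weight, computed lazily */
};

static uint64_t fake_calc_delta(uint64_t delta_exec, unsigned long weight,
                                struct fake_load_weight *lw)
{
    if (!lw->inv_weight)
        lw->inv_weight = FIX_ONE / lw->weight;   /* cache the reciprocal */

    /* delta_exec * weight / lw->weight, via one multiply and one shift. */
    return (delta_exec * weight * lw->inv_weight) >> FIX_SHIFT;
}

int main(void)
{
    struct fake_load_weight lw = { .weight = 3072, .inv_weight = 0 };
    /* A nice-0 task (weight 1024) on a queue of total weight 3072 gets ~1/3. */
    printf("%llu\n", (unsigned long long)fake_calc_delta(3000000, 1024, &lw));
    return 0;
}

With weight 1024 against a total queue weight of 3072, the task is credited roughly a third of the elapsed time, which is the proportional-share property the real function preserves.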
@@ -1374,6 +1392,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec) | |||
1374 | lw->inv_weight = 0; | 1392 | lw->inv_weight = 0; |
1375 | } | 1393 | } |
1376 | 1394 | ||
1395 | static inline void update_load_set(struct load_weight *lw, unsigned long w) | ||
1396 | { | ||
1397 | lw->weight = w; | ||
1398 | lw->inv_weight = 0; | ||
1399 | } | ||
1400 | |||
1377 | /* | 1401 | /* |
1378 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | 1402 | * To aid in avoiding the subversion of "niceness" due to uneven distribution |
1379 | * of tasks with abnormal "nice" values across CPUs the contribution that | 1403 | * of tasks with abnormal "nice" values across CPUs the contribution that |
@@ -1562,101 +1586,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
1562 | 1586 | ||
1563 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1587 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1564 | 1588 | ||
1565 | static __read_mostly unsigned long __percpu *update_shares_data; | ||
1566 | |||
1567 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | ||
1568 | |||
1569 | /* | ||
1570 | * Calculate and set the cpu's group shares. | ||
1571 | */ | ||
1572 | static void update_group_shares_cpu(struct task_group *tg, int cpu, | ||
1573 | unsigned long sd_shares, | ||
1574 | unsigned long sd_rq_weight, | ||
1575 | unsigned long *usd_rq_weight) | ||
1576 | { | ||
1577 | unsigned long shares, rq_weight; | ||
1578 | int boost = 0; | ||
1579 | |||
1580 | rq_weight = usd_rq_weight[cpu]; | ||
1581 | if (!rq_weight) { | ||
1582 | boost = 1; | ||
1583 | rq_weight = NICE_0_LOAD; | ||
1584 | } | ||
1585 | |||
1586 | /* | ||
1587 | * \Sum_j shares_j * rq_weight_i | ||
1588 | * shares_i = ----------------------------- | ||
1589 | * \Sum_j rq_weight_j | ||
1590 | */ | ||
1591 | shares = (sd_shares * rq_weight) / sd_rq_weight; | ||
1592 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); | ||
1593 | |||
1594 | if (abs(shares - tg->se[cpu]->load.weight) > | ||
1595 | sysctl_sched_shares_thresh) { | ||
1596 | struct rq *rq = cpu_rq(cpu); | ||
1597 | unsigned long flags; | ||
1598 | |||
1599 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
1600 | tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; | ||
1601 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; | ||
1602 | __set_se_shares(tg->se[cpu], shares); | ||
1603 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
1604 | } | ||
1605 | } | ||
1606 | |||
1607 | /* | ||
1608 | * Re-compute the task group their per cpu shares over the given domain. | ||
1609 | * This needs to be done in a bottom-up fashion because the rq weight of a | ||
1610 | * parent group depends on the shares of its child groups. | ||
1611 | */ | ||
1612 | static int tg_shares_up(struct task_group *tg, void *data) | ||
1613 | { | ||
1614 | unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; | ||
1615 | unsigned long *usd_rq_weight; | ||
1616 | struct sched_domain *sd = data; | ||
1617 | unsigned long flags; | ||
1618 | int i; | ||
1619 | |||
1620 | if (!tg->se[0]) | ||
1621 | return 0; | ||
1622 | |||
1623 | local_irq_save(flags); | ||
1624 | usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); | ||
1625 | |||
1626 | for_each_cpu(i, sched_domain_span(sd)) { | ||
1627 | weight = tg->cfs_rq[i]->load.weight; | ||
1628 | usd_rq_weight[i] = weight; | ||
1629 | |||
1630 | rq_weight += weight; | ||
1631 | /* | ||
1632 | * If there are currently no tasks on the cpu pretend there | ||
1633 | * is one of average load so that when a new task gets to | ||
1634 | * run here it will not get delayed by group starvation. | ||
1635 | */ | ||
1636 | if (!weight) | ||
1637 | weight = NICE_0_LOAD; | ||
1638 | |||
1639 | sum_weight += weight; | ||
1640 | shares += tg->cfs_rq[i]->shares; | ||
1641 | } | ||
1642 | |||
1643 | if (!rq_weight) | ||
1644 | rq_weight = sum_weight; | ||
1645 | |||
1646 | if ((!shares && rq_weight) || shares > tg->shares) | ||
1647 | shares = tg->shares; | ||
1648 | |||
1649 | if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) | ||
1650 | shares = tg->shares; | ||
1651 | |||
1652 | for_each_cpu(i, sched_domain_span(sd)) | ||
1653 | update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); | ||
1654 | |||
1655 | local_irq_restore(flags); | ||
1656 | |||
1657 | return 0; | ||
1658 | } | ||
1659 | |||
1660 | /* | 1589 | /* |
1661 | * Compute the cpu's hierarchical load factor for each task group. | 1590 | * Compute the cpu's hierarchical load factor for each task group. |
1662 | * This needs to be done in a top-down fashion because the load of a child | 1591 | * This needs to be done in a top-down fashion because the load of a child |
@@ -1671,7 +1600,7 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
1671 | load = cpu_rq(cpu)->load.weight; | 1600 | load = cpu_rq(cpu)->load.weight; |
1672 | } else { | 1601 | } else { |
1673 | load = tg->parent->cfs_rq[cpu]->h_load; | 1602 | load = tg->parent->cfs_rq[cpu]->h_load; |
1674 | load *= tg->cfs_rq[cpu]->shares; | 1603 | load *= tg->se[cpu]->load.weight; |
1675 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; | 1604 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; |
1676 | } | 1605 | } |
1677 | 1606 | ||
@@ -1680,34 +1609,11 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
1680 | return 0; | 1609 | return 0; |
1681 | } | 1610 | } |
1682 | 1611 | ||
1683 | static void update_shares(struct sched_domain *sd) | ||
1684 | { | ||
1685 | s64 elapsed; | ||
1686 | u64 now; | ||
1687 | |||
1688 | if (root_task_group_empty()) | ||
1689 | return; | ||
1690 | |||
1691 | now = local_clock(); | ||
1692 | elapsed = now - sd->last_update; | ||
1693 | |||
1694 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | ||
1695 | sd->last_update = now; | ||
1696 | walk_tg_tree(tg_nop, tg_shares_up, sd); | ||
1697 | } | ||
1698 | } | ||
1699 | |||
1700 | static void update_h_load(long cpu) | 1612 | static void update_h_load(long cpu) |
1701 | { | 1613 | { |
1702 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 1614 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); |
1703 | } | 1615 | } |
1704 | 1616 | ||
1705 | #else | ||
1706 | |||
1707 | static inline void update_shares(struct sched_domain *sd) | ||
1708 | { | ||
1709 | } | ||
1710 | |||
1711 | #endif | 1617 | #endif |
1712 | 1618 | ||
1713 | #ifdef CONFIG_PREEMPT | 1619 | #ifdef CONFIG_PREEMPT |
@@ -1827,15 +1733,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | |||
1827 | __release(rq2->lock); | 1733 | __release(rq2->lock); |
1828 | } | 1734 | } |
1829 | 1735 | ||
1830 | #endif | 1736 | #else /* CONFIG_SMP */ |
1831 | 1737 | ||
1832 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1738 | /* |
1833 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | 1739 | * double_rq_lock - safely lock two runqueues |
1740 | * | ||
1741 | * Note this does not disable interrupts like task_rq_lock, | ||
1742 | * you need to do so manually before calling. | ||
1743 | */ | ||
1744 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
1745 | __acquires(rq1->lock) | ||
1746 | __acquires(rq2->lock) | ||
1834 | { | 1747 | { |
1835 | #ifdef CONFIG_SMP | 1748 | BUG_ON(!irqs_disabled()); |
1836 | cfs_rq->shares = shares; | 1749 | BUG_ON(rq1 != rq2); |
1837 | #endif | 1750 | raw_spin_lock(&rq1->lock); |
1751 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
1752 | } | ||
1753 | |||
1754 | /* | ||
1755 | * double_rq_unlock - safely unlock two runqueues | ||
1756 | * | ||
1757 | * Note this does not restore interrupts like task_rq_unlock, | ||
1758 | * you need to do so manually after calling. | ||
1759 | */ | ||
1760 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
1761 | __releases(rq1->lock) | ||
1762 | __releases(rq2->lock) | ||
1763 | { | ||
1764 | BUG_ON(rq1 != rq2); | ||
1765 | raw_spin_unlock(&rq1->lock); | ||
1766 | __release(rq2->lock); | ||
1838 | } | 1767 | } |
1768 | |||
1839 | #endif | 1769 | #endif |
1840 | 1770 | ||
1841 | static void calc_load_account_idle(struct rq *this_rq); | 1771 | static void calc_load_account_idle(struct rq *this_rq); |
@@ -1877,23 +1807,20 @@ static void dec_nr_running(struct rq *rq) | |||
1877 | 1807 | ||
1878 | static void set_load_weight(struct task_struct *p) | 1808 | static void set_load_weight(struct task_struct *p) |
1879 | { | 1809 | { |
1880 | if (task_has_rt_policy(p)) { | 1810 | int prio = p->static_prio - MAX_RT_PRIO; |
1881 | p->se.load.weight = 0; | 1811 | struct load_weight *load = &p->se.load; |
1882 | p->se.load.inv_weight = WMULT_CONST; | ||
1883 | return; | ||
1884 | } | ||
1885 | 1812 | ||
1886 | /* | 1813 | /* |
1887 | * SCHED_IDLE tasks get minimal weight: | 1814 | * SCHED_IDLE tasks get minimal weight: |
1888 | */ | 1815 | */ |
1889 | if (p->policy == SCHED_IDLE) { | 1816 | if (p->policy == SCHED_IDLE) { |
1890 | p->se.load.weight = WEIGHT_IDLEPRIO; | 1817 | load->weight = scale_load(WEIGHT_IDLEPRIO); |
1891 | p->se.load.inv_weight = WMULT_IDLEPRIO; | 1818 | load->inv_weight = WMULT_IDLEPRIO; |
1892 | return; | 1819 | return; |
1893 | } | 1820 | } |
1894 | 1821 | ||
1895 | p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; | 1822 | load->weight = scale_load(prio_to_weight[prio]); |
1896 | p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; | 1823 | load->inv_weight = prio_to_wmult[prio]; |
1897 | } | 1824 | } |
1898 | 1825 | ||
1899 | static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | 1826 | static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) |
@@ -1901,7 +1828,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | |||
1901 | update_rq_clock(rq); | 1828 | update_rq_clock(rq); |
1902 | sched_info_queued(p); | 1829 | sched_info_queued(p); |
1903 | p->sched_class->enqueue_task(rq, p, flags); | 1830 | p->sched_class->enqueue_task(rq, p, flags); |
1904 | p->se.on_rq = 1; | ||
1905 | } | 1831 | } |
1906 | 1832 | ||
1907 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | 1833 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) |
@@ -1909,7 +1835,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | |||
1909 | update_rq_clock(rq); | 1835 | update_rq_clock(rq); |
1910 | sched_info_dequeued(p); | 1836 | sched_info_dequeued(p); |
1911 | p->sched_class->dequeue_task(rq, p, flags); | 1837 | p->sched_class->dequeue_task(rq, p, flags); |
1912 | p->se.on_rq = 0; | ||
1913 | } | 1838 | } |
1914 | 1839 | ||
1915 | /* | 1840 | /* |
@@ -1936,14 +1861,227 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
1936 | dec_nr_running(rq); | 1861 | dec_nr_running(rq); |
1937 | } | 1862 | } |
1938 | 1863 | ||
1864 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
1865 | |||
1866 | /* | ||
1867 | * There are no locks covering percpu hardirq/softirq time. | ||
1868 | * They are only modified in account_system_vtime, on corresponding CPU | ||
1869 | * with interrupts disabled. So, writes are safe. | ||
1870 | * They are read and saved off onto struct rq in update_rq_clock(). | ||
1871 | * This may result in other CPU reading this CPU's irq time and can | ||
1872 | * race with irq/account_system_vtime on this CPU. We would either get old | ||
1873 | * or new value with a side effect of accounting a slice of irq time to wrong | ||
1874 | * task when irq is in progress while we read rq->clock. That is a worthy | ||
1875 | * compromise in place of having locks on each irq in account_system_time. | ||
1876 | */ | ||
1877 | static DEFINE_PER_CPU(u64, cpu_hardirq_time); | ||
1878 | static DEFINE_PER_CPU(u64, cpu_softirq_time); | ||
1879 | |||
1880 | static DEFINE_PER_CPU(u64, irq_start_time); | ||
1881 | static int sched_clock_irqtime; | ||
1882 | |||
1883 | void enable_sched_clock_irqtime(void) | ||
1884 | { | ||
1885 | sched_clock_irqtime = 1; | ||
1886 | } | ||
1887 | |||
1888 | void disable_sched_clock_irqtime(void) | ||
1889 | { | ||
1890 | sched_clock_irqtime = 0; | ||
1891 | } | ||
1892 | |||
1893 | #ifndef CONFIG_64BIT | ||
1894 | static DEFINE_PER_CPU(seqcount_t, irq_time_seq); | ||
1895 | |||
1896 | static inline void irq_time_write_begin(void) | ||
1897 | { | ||
1898 | __this_cpu_inc(irq_time_seq.sequence); | ||
1899 | smp_wmb(); | ||
1900 | } | ||
1901 | |||
1902 | static inline void irq_time_write_end(void) | ||
1903 | { | ||
1904 | smp_wmb(); | ||
1905 | __this_cpu_inc(irq_time_seq.sequence); | ||
1906 | } | ||
1907 | |||
1908 | static inline u64 irq_time_read(int cpu) | ||
1909 | { | ||
1910 | u64 irq_time; | ||
1911 | unsigned seq; | ||
1912 | |||
1913 | do { | ||
1914 | seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); | ||
1915 | irq_time = per_cpu(cpu_softirq_time, cpu) + | ||
1916 | per_cpu(cpu_hardirq_time, cpu); | ||
1917 | } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); | ||
1918 | |||
1919 | return irq_time; | ||
1920 | } | ||
1921 | #else /* CONFIG_64BIT */ | ||
1922 | static inline void irq_time_write_begin(void) | ||
1923 | { | ||
1924 | } | ||
1925 | |||
1926 | static inline void irq_time_write_end(void) | ||
1927 | { | ||
1928 | } | ||
1929 | |||
1930 | static inline u64 irq_time_read(int cpu) | ||
1931 | { | ||
1932 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | ||
1933 | } | ||
1934 | #endif /* CONFIG_64BIT */ | ||
1935 | |||
1936 | /* | ||
1937 | * Called before incrementing preempt_count on {soft,}irq_enter | ||
1938 | * and before decrementing preempt_count on {soft,}irq_exit. | ||
1939 | */ | ||
1940 | void account_system_vtime(struct task_struct *curr) | ||
1941 | { | ||
1942 | unsigned long flags; | ||
1943 | s64 delta; | ||
1944 | int cpu; | ||
1945 | |||
1946 | if (!sched_clock_irqtime) | ||
1947 | return; | ||
1948 | |||
1949 | local_irq_save(flags); | ||
1950 | |||
1951 | cpu = smp_processor_id(); | ||
1952 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); | ||
1953 | __this_cpu_add(irq_start_time, delta); | ||
1954 | |||
1955 | irq_time_write_begin(); | ||
1956 | /* | ||
1957 | * We do not account for softirq time from ksoftirqd here. | ||
1958 | * We want to continue accounting softirq time to ksoftirqd thread | ||
1959 | * in that case, so as not to confuse scheduler with a special task | ||
1960 | * that do not consume any time, but still wants to run. | ||
1961 | */ | ||
1962 | if (hardirq_count()) | ||
1963 | __this_cpu_add(cpu_hardirq_time, delta); | ||
1964 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) | ||
1965 | __this_cpu_add(cpu_softirq_time, delta); | ||
1966 | |||
1967 | irq_time_write_end(); | ||
1968 | local_irq_restore(flags); | ||
1969 | } | ||
1970 | EXPORT_SYMBOL_GPL(account_system_vtime); | ||
1971 | |||
1972 | static void update_rq_clock_task(struct rq *rq, s64 delta) | ||
1973 | { | ||
1974 | s64 irq_delta; | ||
1975 | |||
1976 | irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; | ||
1977 | |||
1978 | /* | ||
1979 | * Since irq_time is only updated on {soft,}irq_exit, we might run into | ||
1980 | * this case when a previous update_rq_clock() happened inside a | ||
1981 | * {soft,}irq region. | ||
1982 | * | ||
1983 | * When this happens, we stop ->clock_task and only update the | ||
1984 | * prev_irq_time stamp to account for the part that fit, so that a next | ||
1985 | * update will consume the rest. This ensures ->clock_task is | ||
1986 | * monotonic. | ||
1987 | * | ||
1988 | * It does however cause some slight miss-attribution of {soft,}irq | ||
1989 | * time, a more accurate solution would be to update the irq_time using | ||
1990 | * the current rq->clock timestamp, except that would require using | ||
1991 | * atomic ops. | ||
1992 | */ | ||
1993 | if (irq_delta > delta) | ||
1994 | irq_delta = delta; | ||
1995 | |||
1996 | rq->prev_irq_time += irq_delta; | ||
1997 | delta -= irq_delta; | ||
1998 | rq->clock_task += delta; | ||
1999 | |||
2000 | if (irq_delta && sched_feat(NONIRQ_POWER)) | ||
2001 | sched_rt_avg_update(rq, irq_delta); | ||
2002 | } | ||
2003 | |||
2004 | static int irqtime_account_hi_update(void) | ||
2005 | { | ||
2006 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
2007 | unsigned long flags; | ||
2008 | u64 latest_ns; | ||
2009 | int ret = 0; | ||
2010 | |||
2011 | local_irq_save(flags); | ||
2012 | latest_ns = this_cpu_read(cpu_hardirq_time); | ||
2013 | if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) | ||
2014 | ret = 1; | ||
2015 | local_irq_restore(flags); | ||
2016 | return ret; | ||
2017 | } | ||
2018 | |||
2019 | static int irqtime_account_si_update(void) | ||
2020 | { | ||
2021 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
2022 | unsigned long flags; | ||
2023 | u64 latest_ns; | ||
2024 | int ret = 0; | ||
2025 | |||
2026 | local_irq_save(flags); | ||
2027 | latest_ns = this_cpu_read(cpu_softirq_time); | ||
2028 | if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) | ||
2029 | ret = 1; | ||
2030 | local_irq_restore(flags); | ||
2031 | return ret; | ||
2032 | } | ||
2033 | |||
2034 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
2035 | |||
2036 | #define sched_clock_irqtime (0) | ||
2037 | |||
2038 | static void update_rq_clock_task(struct rq *rq, s64 delta) | ||
2039 | { | ||
2040 | rq->clock_task += delta; | ||
2041 | } | ||
2042 | |||
2043 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
2044 | |||
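The irq_time_read()/irq_time_write_begin()/irq_time_write_end() helpers above exist because a 32-bit kernel cannot read a 64-bit per-cpu time value atomically: the writer bumps a sequence count around the update, and readers retry if the count was odd or changed. The sketch below shows only that retry logic; it uses invented names and leaves out the smp_wmb()/read-barrier pairing the kernel relies on, so it is not itself SMP-safe:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct fake_irqtime {
    atomic_uint seq;           /* odd while an update is in progress */
    uint64_t    hardirq_time;  /* plain fields; real code adds barriers */
    uint64_t    softirq_time;
};

static void fake_irqtime_add(struct fake_irqtime *t, uint64_t delta, int hard)
{
    atomic_fetch_add_explicit(&t->seq, 1, memory_order_release); /* now odd  */
    if (hard)
        t->hardirq_time += delta;
    else
        t->softirq_time += delta;
    atomic_fetch_add_explicit(&t->seq, 1, memory_order_release); /* even again */
}

static uint64_t fake_irqtime_read(struct fake_irqtime *t)
{
    unsigned int s1, s2;
    uint64_t sum;

    do {
        s1  = atomic_load_explicit(&t->seq, memory_order_acquire);
        sum = t->hardirq_time + t->softirq_time;
        s2  = atomic_load_explicit(&t->seq, memory_order_acquire);
    } while (s1 != s2 || (s1 & 1));   /* writer active or raced: retry */

    return sum;
}

int main(void)
{
    struct fake_irqtime t = {0};
    fake_irqtime_add(&t, 100, 1);
    fake_irqtime_add(&t, 40, 0);
    printf("%llu\n", (unsigned long long)fake_irqtime_read(&t));
    return 0;
}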
1939 | #include "sched_idletask.c" | 2045 | #include "sched_idletask.c" |
1940 | #include "sched_fair.c" | 2046 | #include "sched_fair.c" |
1941 | #include "sched_rt.c" | 2047 | #include "sched_rt.c" |
2048 | #include "sched_autogroup.c" | ||
2049 | #include "sched_stoptask.c" | ||
1942 | #include "../litmus/sched_litmus.c" | 2050 | #include "../litmus/sched_litmus.c" |
1943 | #ifdef CONFIG_SCHED_DEBUG | 2051 | #ifdef CONFIG_SCHED_DEBUG |
1944 | # include "sched_debug.c" | 2052 | # include "sched_debug.c" |
1945 | #endif | 2053 | #endif |
1946 | 2054 | ||
2055 | void sched_set_stop_task(int cpu, struct task_struct *stop) | ||
2056 | { | ||
2057 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; | ||
2058 | struct task_struct *old_stop = cpu_rq(cpu)->stop; | ||
2059 | |||
2060 | if (stop) { | ||
2061 | /* | ||
2062 | * Make it appear like a SCHED_FIFO task, its something | ||
2063 | * userspace knows about and won't get confused about. | ||
2064 | * | ||
2065 | * Also, it will make PI more or less work without too | ||
2066 | * much confusion -- but then, stop work should not | ||
2067 | * rely on PI working anyway. | ||
2068 | */ | ||
2069 | sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); | ||
2070 | |||
2071 | stop->sched_class = &stop_sched_class; | ||
2072 | } | ||
2073 | |||
2074 | cpu_rq(cpu)->stop = stop; | ||
2075 | |||
2076 | if (old_stop) { | ||
2077 | /* | ||
2078 | * Reset it back to a normal scheduling class so that | ||
2079 | * it can die in pieces. | ||
2080 | */ | ||
2081 | old_stop->sched_class = &rt_sched_class; | ||
2082 | } | ||
2083 | } | ||
2084 | |||
1947 | /* | 2085 | /* |
1948 | * __normal_prio - return the priority that is based on the static prio | 2086 | * __normal_prio - return the priority that is based on the static prio |
1949 | */ | 2087 | */ |
@@ -2001,14 +2139,43 @@ inline int task_curr(const struct task_struct *p) | |||
2001 | 2139 | ||
2002 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, | 2140 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, |
2003 | const struct sched_class *prev_class, | 2141 | const struct sched_class *prev_class, |
2004 | int oldprio, int running) | 2142 | int oldprio) |
2005 | { | 2143 | { |
2006 | if (prev_class != p->sched_class) { | 2144 | if (prev_class != p->sched_class) { |
2007 | if (prev_class->switched_from) | 2145 | if (prev_class->switched_from) |
2008 | prev_class->switched_from(rq, p, running); | 2146 | prev_class->switched_from(rq, p); |
2009 | p->sched_class->switched_to(rq, p, running); | 2147 | p->sched_class->switched_to(rq, p); |
2010 | } else | 2148 | } else if (oldprio != p->prio) |
2011 | p->sched_class->prio_changed(rq, p, oldprio, running); | 2149 | p->sched_class->prio_changed(rq, p, oldprio); |
2150 | } | ||
2151 | |||
2152 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | ||
2153 | { | ||
2154 | const struct sched_class *class; | ||
2155 | |||
2156 | if (p->sched_class == rq->curr->sched_class) { | ||
2157 | rq->curr->sched_class->check_preempt_curr(rq, p, flags); | ||
2158 | } else { | ||
2159 | for_each_class(class) { | ||
2160 | if (class == rq->curr->sched_class) | ||
2161 | break; | ||
2162 | if (class == p->sched_class) { | ||
2163 | resched_task(rq->curr); | ||
2164 | break; | ||
2165 | } | ||
2166 | } | ||
2167 | } | ||
2168 | |||
2169 | /* | ||
2170 | * A queue event has occurred, and we're going to schedule. In | ||
2171 | * this case, we can save a useless back to back clock update. | ||
2172 | */ | ||
2173 | /* LITMUS^RT: | ||
2174 | * The "disable-clock-update" approach was buggy in Linux 2.6.36. | ||
2175 | * The issue has been solved in 2.6.37. | ||
2176 | */ | ||
2177 | if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) | ||
2178 | rq->skip_clock_update = 1; | ||
2012 | } | 2179 | } |
2013 | 2180 | ||
2014 | #ifdef CONFIG_SMP | 2181 | #ifdef CONFIG_SMP |
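The new check_preempt_curr() above only delegates to the class's own check_preempt_curr() when the woken task and the current task share a scheduling class; otherwise it walks the classes in priority order and reschedules if the woken task's class ranks higher. Below is a standalone sketch of that ordering walk, with an invented class list standing in for stop/rt/fair/idle:

#include <stdbool.h>
#include <stdio.h>

struct fake_class { const char *name; };

static const struct fake_class stop_class = { "stop" };
static const struct fake_class rt_class   = { "rt"   };
static const struct fake_class fair_class = { "fair" };
static const struct fake_class idle_class = { "idle" };

/* Highest-priority class first, NULL-terminated. */
static const struct fake_class *class_order[] = {
    &stop_class, &rt_class, &fair_class, &idle_class, NULL
};

/* True if a newly woken task of class @woken should preempt a running task
 * of class @curr purely on class priority. */
static bool preempts_by_class(const struct fake_class *curr,
                              const struct fake_class *woken)
{
    for (int i = 0; class_order[i]; i++) {
        if (class_order[i] == curr)
            return false;   /* current's class ranks at least as high */
        if (class_order[i] == woken)
            return true;    /* woken task's class ranks higher        */
    }
    return false;
}

int main(void)
{
    printf("rt wakes over fair: %d\n", preempts_by_class(&fair_class, &rt_class));
    printf("fair wakes over rt: %d\n", preempts_by_class(&rt_class, &fair_class));
    return 0;
}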
@@ -2023,6 +2190,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
2023 | if (p->sched_class != &fair_sched_class) | 2190 | if (p->sched_class != &fair_sched_class) |
2024 | return 0; | 2191 | return 0; |
2025 | 2192 | ||
2193 | if (unlikely(p->policy == SCHED_IDLE)) | ||
2194 | return 0; | ||
2195 | |||
2026 | /* | 2196 | /* |
2027 | * Buddy candidates are cache hot: | 2197 | * Buddy candidates are cache hot: |
2028 | */ | 2198 | */ |
@@ -2050,6 +2220,21 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
2050 | */ | 2220 | */ |
2051 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && | 2221 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && |
2052 | !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); | 2222 | !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); |
2223 | |||
2224 | #ifdef CONFIG_LOCKDEP | ||
2225 | /* | ||
2226 | * The caller should hold either p->pi_lock or rq->lock, when changing | ||
2227 | * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. | ||
2228 | * | ||
2229 | * sched_move_task() holds both and thus holding either pins the cgroup, | ||
2230 | * see set_task_rq(). | ||
2231 | * | ||
2232 | * Furthermore, all task_rq users should acquire both locks, see | ||
2233 | * task_rq_lock(). | ||
2234 | */ | ||
2235 | WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || | ||
2236 | lockdep_is_held(&task_rq(p)->lock))); | ||
2237 | #endif | ||
2053 | #endif | 2238 | #endif |
2054 | 2239 | ||
2055 | trace_sched_migrate_task(p, new_cpu); | 2240 | trace_sched_migrate_task(p, new_cpu); |
@@ -2070,21 +2255,6 @@ struct migration_arg { | |||
2070 | static int migration_cpu_stop(void *data); | 2255 | static int migration_cpu_stop(void *data); |
2071 | 2256 | ||
2072 | /* | 2257 | /* |
2073 | * The task's runqueue lock must be held. | ||
2074 | * Returns true if you have to wait for migration thread. | ||
2075 | */ | ||
2076 | static bool migrate_task(struct task_struct *p, int dest_cpu) | ||
2077 | { | ||
2078 | struct rq *rq = task_rq(p); | ||
2079 | |||
2080 | /* | ||
2081 | * If the task is not on a runqueue (and not running), then | ||
2082 | * the next wake-up will properly place the task. | ||
2083 | */ | ||
2084 | return p->se.on_rq || task_running(rq, p); | ||
2085 | } | ||
2086 | |||
2087 | /* | ||
2088 | * wait_task_inactive - wait for a thread to unschedule. | 2258 | * wait_task_inactive - wait for a thread to unschedule. |
2089 | * | 2259 | * |
2090 | * If @match_state is nonzero, it's the @p->state value just checked and | 2260 | * If @match_state is nonzero, it's the @p->state value just checked and |
@@ -2141,11 +2311,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
2141 | rq = task_rq_lock(p, &flags); | 2311 | rq = task_rq_lock(p, &flags); |
2142 | trace_sched_wait_task(p); | 2312 | trace_sched_wait_task(p); |
2143 | running = task_running(rq, p); | 2313 | running = task_running(rq, p); |
2144 | on_rq = p->se.on_rq; | 2314 | on_rq = p->on_rq; |
2145 | ncsw = 0; | 2315 | ncsw = 0; |
2146 | if (!match_state || p->state == match_state) | 2316 | if (!match_state || p->state == match_state) |
2147 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ | 2317 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ |
2148 | task_rq_unlock(rq, &flags); | 2318 | task_rq_unlock(rq, p, &flags); |
2149 | 2319 | ||
2150 | /* | 2320 | /* |
2151 | * If it changed from the expected state, bail out now. | 2321 | * If it changed from the expected state, bail out now. |
@@ -2174,7 +2344,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
2174 | * yield - it could be a while. | 2344 | * yield - it could be a while. |
2175 | */ | 2345 | */ |
2176 | if (unlikely(on_rq)) { | 2346 | if (unlikely(on_rq)) { |
2177 | schedule_timeout_uninterruptible(1); | 2347 | ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); |
2348 | |||
2349 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
2350 | schedule_hrtimeout(&to, HRTIMER_MODE_REL); | ||
2178 | continue; | 2351 | continue; |
2179 | } | 2352 | } |
2180 | 2353 | ||
@@ -2196,7 +2369,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
2196 | * Cause a process which is running on another CPU to enter | 2369 | * Cause a process which is running on another CPU to enter |
2197 | * kernel-mode, without any delay. (to get signals handled.) | 2370 | * kernel-mode, without any delay. (to get signals handled.) |
2198 | * | 2371 | * |
2199 | * NOTE: this function doesnt have to take the runqueue lock, | 2372 | * NOTE: this function doesn't have to take the runqueue lock, |
2200 | * because all it wants to ensure is that the remote task enters | 2373 | * because all it wants to ensure is that the remote task enters |
2201 | * the kernel. If the IPI races and the task has been migrated | 2374 | * the kernel. If the IPI races and the task has been migrated |
2202 | * to another CPU then no harm is done and the purpose has been | 2375 | * to another CPU then no harm is done and the purpose has been |
@@ -2215,30 +2388,9 @@ void kick_process(struct task_struct *p) | |||
2215 | EXPORT_SYMBOL_GPL(kick_process); | 2388 | EXPORT_SYMBOL_GPL(kick_process); |
2216 | #endif /* CONFIG_SMP */ | 2389 | #endif /* CONFIG_SMP */ |
2217 | 2390 | ||
2218 | /** | ||
2219 | * task_oncpu_function_call - call a function on the cpu on which a task runs | ||
2220 | * @p: the task to evaluate | ||
2221 | * @func: the function to be called | ||
2222 | * @info: the function call argument | ||
2223 | * | ||
2224 | * Calls the function @func when the task is currently running. This might | ||
2225 | * be on the current CPU, which just calls the function directly | ||
2226 | */ | ||
2227 | void task_oncpu_function_call(struct task_struct *p, | ||
2228 | void (*func) (void *info), void *info) | ||
2229 | { | ||
2230 | int cpu; | ||
2231 | |||
2232 | preempt_disable(); | ||
2233 | cpu = task_cpu(p); | ||
2234 | if (task_curr(p)) | ||
2235 | smp_call_function_single(cpu, func, info, 1); | ||
2236 | preempt_enable(); | ||
2237 | } | ||
2238 | |||
2239 | #ifdef CONFIG_SMP | 2391 | #ifdef CONFIG_SMP |
2240 | /* | 2392 | /* |
2241 | * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. | 2393 | * ->cpus_allowed is protected by both rq->lock and p->pi_lock |
2242 | */ | 2394 | */ |
2243 | static int select_fallback_rq(int cpu, struct task_struct *p) | 2395 | static int select_fallback_rq(int cpu, struct task_struct *p) |
2244 | { | 2396 | { |
@@ -2256,30 +2408,27 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
2256 | return dest_cpu; | 2408 | return dest_cpu; |
2257 | 2409 | ||
2258 | /* No more Mr. Nice Guy. */ | 2410 | /* No more Mr. Nice Guy. */ |
2259 | if (unlikely(dest_cpu >= nr_cpu_ids)) { | 2411 | dest_cpu = cpuset_cpus_allowed_fallback(p); |
2260 | dest_cpu = cpuset_cpus_allowed_fallback(p); | 2412 | /* |
2261 | /* | 2413 | * Don't tell them about moving exiting tasks or |
2262 | * Don't tell them about moving exiting tasks or | 2414 | * kernel threads (both mm NULL), since they never |
2263 | * kernel threads (both mm NULL), since they never | 2415 | * leave kernel. |
2264 | * leave kernel. | 2416 | */ |
2265 | */ | 2417 | if (p->mm && printk_ratelimit()) { |
2266 | if (p->mm && printk_ratelimit()) { | 2418 | printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", |
2267 | printk(KERN_INFO "process %d (%s) no " | 2419 | task_pid_nr(p), p->comm, cpu); |
2268 | "longer affine to cpu%d\n", | ||
2269 | task_pid_nr(p), p->comm, cpu); | ||
2270 | } | ||
2271 | } | 2420 | } |
2272 | 2421 | ||
2273 | return dest_cpu; | 2422 | return dest_cpu; |
2274 | } | 2423 | } |
2275 | 2424 | ||
2276 | /* | 2425 | /* |
2277 | * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. | 2426 | * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. |
2278 | */ | 2427 | */ |
2279 | static inline | 2428 | static inline |
2280 | int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) | 2429 | int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) |
2281 | { | 2430 | { |
2282 | int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); | 2431 | int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); |
2283 | 2432 | ||
2284 | /* | 2433 | /* |
2285 | * In order not to call set_task_cpu() on a blocking task we need | 2434 | * In order not to call set_task_cpu() on a blocking task we need |
@@ -2305,27 +2454,63 @@ static void update_avg(u64 *avg, u64 sample) | |||
2305 | } | 2454 | } |
2306 | #endif | 2455 | #endif |
2307 | 2456 | ||
2308 | static inline void ttwu_activate(struct task_struct *p, struct rq *rq, | 2457 | static void |
2309 | bool is_sync, bool is_migrate, bool is_local, | 2458 | ttwu_stat(struct task_struct *p, int cpu, int wake_flags) |
2310 | unsigned long en_flags) | ||
2311 | { | 2459 | { |
2312 | schedstat_inc(p, se.statistics.nr_wakeups); | 2460 | #ifdef CONFIG_SCHEDSTATS |
2313 | if (is_sync) | 2461 | struct rq *rq = this_rq(); |
2314 | schedstat_inc(p, se.statistics.nr_wakeups_sync); | 2462 | |
2315 | if (is_migrate) | 2463 | #ifdef CONFIG_SMP |
2316 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); | 2464 | int this_cpu = smp_processor_id(); |
2317 | if (is_local) | 2465 | |
2466 | if (cpu == this_cpu) { | ||
2467 | schedstat_inc(rq, ttwu_local); | ||
2318 | schedstat_inc(p, se.statistics.nr_wakeups_local); | 2468 | schedstat_inc(p, se.statistics.nr_wakeups_local); |
2319 | else | 2469 | } else { |
2470 | struct sched_domain *sd; | ||
2471 | |||
2320 | schedstat_inc(p, se.statistics.nr_wakeups_remote); | 2472 | schedstat_inc(p, se.statistics.nr_wakeups_remote); |
2473 | rcu_read_lock(); | ||
2474 | for_each_domain(this_cpu, sd) { | ||
2475 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
2476 | schedstat_inc(sd, ttwu_wake_remote); | ||
2477 | break; | ||
2478 | } | ||
2479 | } | ||
2480 | rcu_read_unlock(); | ||
2481 | } | ||
2482 | |||
2483 | if (wake_flags & WF_MIGRATED) | ||
2484 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); | ||
2485 | |||
2486 | #endif /* CONFIG_SMP */ | ||
2487 | |||
2488 | schedstat_inc(rq, ttwu_count); | ||
2489 | schedstat_inc(p, se.statistics.nr_wakeups); | ||
2321 | 2490 | ||
2491 | if (wake_flags & WF_SYNC) | ||
2492 | schedstat_inc(p, se.statistics.nr_wakeups_sync); | ||
2493 | |||
2494 | #endif /* CONFIG_SCHEDSTATS */ | ||
2495 | } | ||
2496 | |||
2497 | static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) | ||
2498 | { | ||
2322 | activate_task(rq, p, en_flags); | 2499 | activate_task(rq, p, en_flags); |
2500 | p->on_rq = 1; | ||
2501 | |||
2502 | /* if a worker is waking up, notify workqueue */ | ||
2503 | if (p->flags & PF_WQ_WORKER) | ||
2504 | wq_worker_waking_up(p, cpu_of(rq)); | ||
2323 | } | 2505 | } |
2324 | 2506 | ||
2325 | static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | 2507 | /* |
2326 | int wake_flags, bool success) | 2508 | * Mark the task runnable and perform wakeup-preemption. |
2509 | */ | ||
2510 | static void | ||
2511 | ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) | ||
2327 | { | 2512 | { |
2328 | trace_sched_wakeup(p, success); | 2513 | trace_sched_wakeup(p, true); |
2329 | check_preempt_curr(rq, p, wake_flags); | 2514 | check_preempt_curr(rq, p, wake_flags); |
2330 | 2515 | ||
2331 | p->state = TASK_RUNNING; | 2516 | p->state = TASK_RUNNING; |
@@ -2344,9 +2529,156 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | |||
2344 | rq->idle_stamp = 0; | 2529 | rq->idle_stamp = 0; |
2345 | } | 2530 | } |
2346 | #endif | 2531 | #endif |
2347 | /* if a worker is waking up, notify workqueue */ | 2532 | } |
2348 | if ((p->flags & PF_WQ_WORKER) && success) | 2533 | |
2349 | wq_worker_waking_up(p, cpu_of(rq)); | 2534 | static void |
2535 | ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) | ||
2536 | { | ||
2537 | #ifdef CONFIG_SMP | ||
2538 | if (p->sched_contributes_to_load) | ||
2539 | rq->nr_uninterruptible--; | ||
2540 | #endif | ||
2541 | |||
2542 | ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); | ||
2543 | ttwu_do_wakeup(rq, p, wake_flags); | ||
2544 | } | ||
2545 | |||
2546 | /* | ||
2547 | * Called in case the task @p isn't fully descheduled from its runqueue, | ||
2548 | * in this case we must do a remote wakeup. It's a 'light' wakeup though, | ||
2549 | * since all we need to do is flip p->state to TASK_RUNNING, since | ||
2550 | * the task is still ->on_rq. | ||
2551 | */ | ||
2552 | static int ttwu_remote(struct task_struct *p, int wake_flags) | ||
2553 | { | ||
2554 | struct rq *rq; | ||
2555 | int ret = 0; | ||
2556 | |||
2557 | rq = __task_rq_lock(p); | ||
2558 | if (p->on_rq) { | ||
2559 | ttwu_do_wakeup(rq, p, wake_flags); | ||
2560 | ret = 1; | ||
2561 | } | ||
2562 | __task_rq_unlock(rq); | ||
2563 | |||
2564 | return ret; | ||
2565 | } | ||
2566 | |||
2567 | #ifdef CONFIG_SMP | ||
2568 | static void sched_ttwu_do_pending(struct task_struct *list) | ||
2569 | { | ||
2570 | struct rq *rq = this_rq(); | ||
2571 | |||
2572 | raw_spin_lock(&rq->lock); | ||
2573 | |||
2574 | while (list) { | ||
2575 | struct task_struct *p = list; | ||
2576 | list = list->wake_entry; | ||
2577 | ttwu_do_activate(rq, p, 0); | ||
2578 | } | ||
2579 | |||
2580 | raw_spin_unlock(&rq->lock); | ||
2581 | } | ||
2582 | |||
2583 | #ifdef CONFIG_HOTPLUG_CPU | ||
2584 | |||
2585 | static void sched_ttwu_pending(void) | ||
2586 | { | ||
2587 | struct rq *rq = this_rq(); | ||
2588 | struct task_struct *list = xchg(&rq->wake_list, NULL); | ||
2589 | |||
2590 | if (!list) | ||
2591 | return; | ||
2592 | |||
2593 | sched_ttwu_do_pending(list); | ||
2594 | } | ||
2595 | |||
2596 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
2597 | |||
2598 | void scheduler_ipi(void) | ||
2599 | { | ||
2600 | struct rq *rq = this_rq(); | ||
2601 | struct task_struct *list = xchg(&rq->wake_list, NULL); | ||
2602 | |||
2603 | if (!list) | ||
2604 | return; | ||
2605 | |||
2606 | /* | ||
2607 | * Not all reschedule IPI handlers call irq_enter/irq_exit, since | ||
2608 | * traditionally all their work was done from the interrupt return | ||
2609 | * path. Now that we actually do some work, we need to make sure | ||
2610 | * we do call them. | ||
2611 | * | ||
2612 | * Some archs already do call them, luckily irq_enter/exit nest | ||
2613 | * properly. | ||
2614 | * | ||
2615 | * Arguably we should visit all archs and update all handlers, | ||
2616 | * however a fair share of IPIs are still resched only so this would | ||
2617 | * somewhat pessimize the simple resched case. | ||
2618 | */ | ||
2619 | irq_enter(); | ||
2620 | sched_ttwu_do_pending(list); | ||
2621 | irq_exit(); | ||
2622 | } | ||
2623 | |||
2624 | static void ttwu_queue_remote(struct task_struct *p, int cpu) | ||
2625 | { | ||
2626 | struct rq *rq = cpu_rq(cpu); | ||
2627 | struct task_struct *next = rq->wake_list; | ||
2628 | |||
2629 | for (;;) { | ||
2630 | struct task_struct *old = next; | ||
2631 | |||
2632 | p->wake_entry = next; | ||
2633 | next = cmpxchg(&rq->wake_list, old, p); | ||
2634 | if (next == old) | ||
2635 | break; | ||
2636 | } | ||
2637 | |||
2638 | if (!next) | ||
2639 | smp_send_reschedule(cpu); | ||
2640 | } | ||
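The ttwu_queue_remote()/scheduler_ipi() pair above is a lock-free producer/consumer handoff: wakers push a task onto rq->wake_list with cmpxchg, only the waker that finds the list empty sends the reschedule IPI, and the IPI handler detaches the whole list in one xchg. A minimal user-space sketch of that pattern, assuming C11 atomics; queue_remote() and drain_pending() are illustrative names, not part of the patch:

#include <stdatomic.h>
#include <stdio.h>

struct task { int pid; struct task *wake_entry; };

static _Atomic(struct task *) wake_list;

/* Push p onto the pending list; returns 1 when the list was empty,
 * i.e. this is the wakeup that must send the "IPI". */
static int queue_remote(struct task *p)
{
        struct task *next = atomic_load(&wake_list);

        do {
                p->wake_entry = next;
        } while (!atomic_compare_exchange_weak(&wake_list, &next, p));

        return next == NULL;
}

/* The "IPI handler" side: detach the whole list atomically, then walk it. */
static void drain_pending(void)
{
        struct task *list = atomic_exchange(&wake_list, NULL);

        while (list) {
                struct task *p = list;

                list = list->wake_entry;
                printf("activate pid %d\n", p->pid);
        }
}

int main(void)
{
        struct task a = { .pid = 1 }, b = { .pid = 2 };

        printf("send IPI? %d\n", queue_remote(&a));     /* 1: list was empty */
        printf("send IPI? %d\n", queue_remote(&b));     /* 0: IPI already pending */
        drain_pending();                                /* LIFO: pid 2 then pid 1 */
        return 0;
}

Drain order is LIFO, which is harmless here since each entry is activated independently under the runqueue lock.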
2641 | |||
2642 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
2643 | static int ttwu_activate_remote(struct task_struct *p, int wake_flags) | ||
2644 | { | ||
2645 | struct rq *rq; | ||
2646 | int ret = 0; | ||
2647 | |||
2648 | rq = __task_rq_lock(p); | ||
2649 | if (p->on_cpu) { | ||
2650 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); | ||
2651 | ttwu_do_wakeup(rq, p, wake_flags); | ||
2652 | ret = 1; | ||
2653 | } | ||
2654 | __task_rq_unlock(rq); | ||
2655 | |||
2656 | return ret; | ||
2657 | |||
2658 | } | ||
2659 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | ||
2660 | #endif /* CONFIG_SMP */ | ||
2661 | |||
2662 | static void ttwu_queue(struct task_struct *p, int cpu) | ||
2663 | { | ||
2664 | struct rq *rq = cpu_rq(cpu); | ||
2665 | |||
2666 | #if defined(CONFIG_SMP) | ||
2667 | /* | ||
2668 | * LITMUS^RT: whether to send an IPI to the remote CPU | ||
2669 | * is plugin specific. | ||
2670 | */ | ||
2671 | if (!is_realtime(p) && | ||
2672 | sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { | ||
2673 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ | ||
2674 | ttwu_queue_remote(p, cpu); | ||
2675 | return; | ||
2676 | } | ||
2677 | #endif | ||
2678 | |||
2679 | raw_spin_lock(&rq->lock); | ||
2680 | ttwu_do_activate(rq, p, 0); | ||
2681 | raw_spin_unlock(&rq->lock); | ||
2350 | } | 2682 | } |
2351 | 2683 | ||
2352 | /** | 2684 | /** |
@@ -2364,97 +2696,79 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | |||
2364 | * Returns %true if @p was woken up, %false if it was already running | 2696 | * Returns %true if @p was woken up, %false if it was already running |
2365 | * or @state didn't match @p's state. | 2697 | * or @state didn't match @p's state. |
2366 | */ | 2698 | */ |
2367 | static int try_to_wake_up(struct task_struct *p, unsigned int state, | 2699 | static int |
2368 | int wake_flags) | 2700 | try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) |
2369 | { | 2701 | { |
2370 | int cpu, orig_cpu, this_cpu, success = 0; | ||
2371 | unsigned long flags; | 2702 | unsigned long flags; |
2372 | unsigned long en_flags = ENQUEUE_WAKEUP; | 2703 | int cpu, success = 0; |
2373 | struct rq *rq; | ||
2374 | 2704 | ||
2375 | if (is_realtime(p)) | 2705 | if (is_realtime(p)) |
2376 | TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state); | 2706 | TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state); |
2377 | 2707 | ||
2378 | this_cpu = get_cpu(); | ||
2379 | |||
2380 | smp_wmb(); | 2708 | smp_wmb(); |
2381 | rq = task_rq_lock(p, &flags); | 2709 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2382 | if (!(p->state & state)) | 2710 | if (!(p->state & state)) |
2383 | goto out; | 2711 | goto out; |
2384 | 2712 | ||
2385 | if (p->se.on_rq) | 2713 | success = 1; /* we're going to change ->state */ |
2386 | goto out_running; | ||
2387 | |||
2388 | cpu = task_cpu(p); | 2714 | cpu = task_cpu(p); |
2389 | orig_cpu = cpu; | ||
2390 | 2715 | ||
2391 | #ifdef CONFIG_SMP | 2716 | if (p->on_rq && ttwu_remote(p, wake_flags)) |
2392 | if (unlikely(task_running(rq, p)) || is_realtime(p)) | 2717 | goto stat; |
2393 | goto out_activate; | ||
2394 | 2718 | ||
2719 | #ifdef CONFIG_SMP | ||
2395 | /* | 2720 | /* |
2396 | * In order to handle concurrent wakeups and release the rq->lock | 2721 | * If the owning (remote) cpu is still in the middle of schedule() with |
2397 | * we put the task in TASK_WAKING state. | 2722 | * this task as prev, wait until its done referencing the task. |
2398 | * | ||
2399 | * First fix up the nr_uninterruptible count: | ||
2400 | */ | 2723 | */ |
2401 | if (task_contributes_to_load(p)) { | 2724 | while (p->on_cpu) { |
2402 | if (likely(cpu_online(orig_cpu))) | 2725 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
2403 | rq->nr_uninterruptible--; | 2726 | /* |
2404 | else | 2727 | * In case the architecture enables interrupts in |
2405 | this_rq()->nr_uninterruptible--; | 2728 | * context_switch(), we cannot busy wait, since that |
2406 | } | 2729 | * would lead to deadlocks when an interrupt hits and |
2407 | p->state = TASK_WAKING; | 2730 | * tries to wake up @prev. So bail and do a complete |
2408 | 2731 | * remote wakeup. | |
2409 | if (p->sched_class->task_waking) { | 2732 | */ |
2410 | p->sched_class->task_waking(rq, p); | 2733 | if (ttwu_activate_remote(p, wake_flags)) |
2411 | en_flags |= ENQUEUE_WAKING; | 2734 | goto stat; |
2735 | #else | ||
2736 | cpu_relax(); | ||
2737 | #endif | ||
2412 | } | 2738 | } |
2739 | /* | ||
2740 | * Pairs with the smp_wmb() in finish_lock_switch(). | ||
2741 | */ | ||
2742 | smp_rmb(); | ||
2413 | 2743 | ||
2414 | cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); | 2744 | /* LITMUS^RT: once the task can be safely referenced by this |
2415 | if (cpu != orig_cpu) | 2745 | * CPU, don't mess with Linux load balancing stuff. |
2416 | set_task_cpu(p, cpu); | 2746 | */ |
2417 | __task_rq_unlock(rq); | 2747 | if (is_realtime(p)) |
2748 | goto litmus_out_activate; | ||
2418 | 2749 | ||
2419 | rq = cpu_rq(cpu); | 2750 | p->sched_contributes_to_load = !!task_contributes_to_load(p); |
2420 | raw_spin_lock(&rq->lock); | 2751 | p->state = TASK_WAKING; |
2421 | 2752 | ||
2422 | /* | 2753 | if (p->sched_class->task_waking) |
2423 | * We migrated the task without holding either rq->lock, however | 2754 | p->sched_class->task_waking(p); |
2424 | * since the task is not on the task list itself, nobody else | ||
2425 | * will try and migrate the task, hence the rq should match the | ||
2426 | * cpu we just moved it to. | ||
2427 | */ | ||
2428 | WARN_ON(task_cpu(p) != cpu); | ||
2429 | WARN_ON(p->state != TASK_WAKING); | ||
2430 | 2755 | ||
2431 | #ifdef CONFIG_SCHEDSTATS | 2756 | cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); |
2432 | schedstat_inc(rq, ttwu_count); | 2757 | if (task_cpu(p) != cpu) { |
2433 | if (cpu == this_cpu) | 2758 | wake_flags |= WF_MIGRATED; |
2434 | schedstat_inc(rq, ttwu_local); | 2759 | set_task_cpu(p, cpu); |
2435 | else { | ||
2436 | struct sched_domain *sd; | ||
2437 | for_each_domain(this_cpu, sd) { | ||
2438 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
2439 | schedstat_inc(sd, ttwu_wake_remote); | ||
2440 | break; | ||
2441 | } | ||
2442 | } | ||
2443 | } | 2760 | } |
2444 | #endif /* CONFIG_SCHEDSTATS */ | ||
2445 | 2761 | ||
2446 | out_activate: | 2762 | litmus_out_activate: |
2447 | #endif /* CONFIG_SMP */ | 2763 | #endif /* CONFIG_SMP */ |
2448 | ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, | 2764 | |
2449 | cpu == this_cpu, en_flags); | 2765 | ttwu_queue(p, cpu); |
2450 | success = 1; | 2766 | stat: |
2451 | out_running: | 2767 | ttwu_stat(p, cpu, wake_flags); |
2452 | ttwu_post_activation(p, rq, wake_flags, success); | ||
2453 | out: | 2768 | out: |
2454 | if (is_realtime(p)) | 2769 | if (is_realtime(p)) |
2455 | TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state); | 2770 | TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state); |
2456 | task_rq_unlock(rq, &flags); | 2771 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
2457 | put_cpu(); | ||
2458 | 2772 | ||
2459 | return success; | 2773 | return success; |
2460 | } | 2774 | } |
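The new while (p->on_cpu) spin followed by smp_rmb() is one half of an acquire/release handshake with finish_lock_switch(): the CPU still scheduling away from @p finishes its last accesses, then clears on_cpu, and the waker must not touch the task before it has observed that clear. A user-space C11 rendition of the same ordering, with a pthread standing in for the remote CPU; names are illustrative, build with -pthread:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int on_cpu = 1;
static int prev_state;                  /* plain data protected by the handshake */

static void *scheduler_side(void *arg)
{
        (void)arg;
        prev_state = 42;                /* last use of "prev" by the old CPU */
        atomic_store_explicit(&on_cpu, 0, memory_order_release);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, scheduler_side, NULL);

        /* The waker: spin until the remote side is done with the task. */
        while (atomic_load_explicit(&on_cpu, memory_order_acquire))
                ;                       /* cpu_relax() stand-in */

        printf("prev_state = %d\n", prev_state);        /* guaranteed to print 42 */
        pthread_join(t, NULL);
        return 0;
}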
@@ -2463,31 +2777,34 @@ out: | |||
2463 | * try_to_wake_up_local - try to wake up a local task with rq lock held | 2777 | * try_to_wake_up_local - try to wake up a local task with rq lock held |
2464 | * @p: the thread to be awakened | 2778 | * @p: the thread to be awakened |
2465 | * | 2779 | * |
2466 | * Put @p on the run-queue if it's not alredy there. The caller must | 2780 | * Put @p on the run-queue if it's not already there. The caller must |
2467 | * ensure that this_rq() is locked, @p is bound to this_rq() and not | 2781 | * ensure that this_rq() is locked, @p is bound to this_rq() and not |
2468 | * the current task. this_rq() stays locked over invocation. | 2782 | * the current task. |
2469 | */ | 2783 | */ |
2470 | static void try_to_wake_up_local(struct task_struct *p) | 2784 | static void try_to_wake_up_local(struct task_struct *p) |
2471 | { | 2785 | { |
2472 | struct rq *rq = task_rq(p); | 2786 | struct rq *rq = task_rq(p); |
2473 | bool success = false; | ||
2474 | 2787 | ||
2475 | BUG_ON(rq != this_rq()); | 2788 | BUG_ON(rq != this_rq()); |
2476 | BUG_ON(p == current); | 2789 | BUG_ON(p == current); |
2477 | lockdep_assert_held(&rq->lock); | 2790 | lockdep_assert_held(&rq->lock); |
2478 | 2791 | ||
2792 | if (!raw_spin_trylock(&p->pi_lock)) { | ||
2793 | raw_spin_unlock(&rq->lock); | ||
2794 | raw_spin_lock(&p->pi_lock); | ||
2795 | raw_spin_lock(&rq->lock); | ||
2796 | } | ||
2797 | |||
2479 | if (!(p->state & TASK_NORMAL)) | 2798 | if (!(p->state & TASK_NORMAL)) |
2480 | return; | 2799 | goto out; |
2481 | 2800 | ||
2482 | if (!p->se.on_rq) { | 2801 | if (!p->on_rq) |
2483 | if (likely(!task_running(rq, p))) { | 2802 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); |
2484 | schedstat_inc(rq, ttwu_count); | 2803 | |
2485 | schedstat_inc(rq, ttwu_local); | 2804 | ttwu_do_wakeup(rq, p, 0); |
2486 | } | 2805 | ttwu_stat(p, smp_processor_id(), 0); |
2487 | ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); | 2806 | out: |
2488 | success = true; | 2807 | raw_spin_unlock(&p->pi_lock); |
2489 | } | ||
2490 | ttwu_post_activation(p, rq, 0, success); | ||
2491 | } | 2808 | } |
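try_to_wake_up_local() runs with rq->lock already held, but the canonical ordering is now p->pi_lock before rq->lock, hence the trylock-or-drop-and-retake dance at the top. The same pattern in plain pthreads, as a sketch only; the mutex names mirror the kernel ones but nothing else does:

#include <pthread.h>

static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;

/* Caller already holds rq_lock; returns with both locks held. */
static void lock_pi_under_rq(void)
{
        if (pthread_mutex_trylock(&pi_lock) != 0) {
                /* Would invert the lock order: back off and retake both. */
                pthread_mutex_unlock(&rq_lock);
                pthread_mutex_lock(&pi_lock);
                pthread_mutex_lock(&rq_lock);
        }
}

int main(void)
{
        pthread_mutex_lock(&rq_lock);
        lock_pi_under_rq();
        pthread_mutex_unlock(&rq_lock);
        pthread_mutex_unlock(&pi_lock);
        return 0;
}

The price of the backoff is that p->state can change while both locks are briefly dropped, which is presumably why the TASK_NORMAL check comes after the locking dance.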
2492 | 2809 | ||
2493 | /** | 2810 | /** |
@@ -2520,18 +2837,21 @@ int wake_up_state(struct task_struct *p, unsigned int state) | |||
2520 | */ | 2837 | */ |
2521 | static void __sched_fork(struct task_struct *p) | 2838 | static void __sched_fork(struct task_struct *p) |
2522 | { | 2839 | { |
2840 | p->on_rq = 0; | ||
2841 | |||
2842 | p->se.on_rq = 0; | ||
2523 | p->se.exec_start = 0; | 2843 | p->se.exec_start = 0; |
2524 | p->se.sum_exec_runtime = 0; | 2844 | p->se.sum_exec_runtime = 0; |
2525 | p->se.prev_sum_exec_runtime = 0; | 2845 | p->se.prev_sum_exec_runtime = 0; |
2526 | p->se.nr_migrations = 0; | 2846 | p->se.nr_migrations = 0; |
2847 | p->se.vruntime = 0; | ||
2848 | INIT_LIST_HEAD(&p->se.group_node); | ||
2527 | 2849 | ||
2528 | #ifdef CONFIG_SCHEDSTATS | 2850 | #ifdef CONFIG_SCHEDSTATS |
2529 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); | 2851 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
2530 | #endif | 2852 | #endif |
2531 | 2853 | ||
2532 | INIT_LIST_HEAD(&p->rt.run_list); | 2854 | INIT_LIST_HEAD(&p->rt.run_list); |
2533 | p->se.on_rq = 0; | ||
2534 | INIT_LIST_HEAD(&p->se.group_node); | ||
2535 | 2855 | ||
2536 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2856 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
2537 | INIT_HLIST_HEAD(&p->preempt_notifiers); | 2857 | INIT_HLIST_HEAD(&p->preempt_notifiers); |
@@ -2541,8 +2861,9 @@ static void __sched_fork(struct task_struct *p) | |||
2541 | /* | 2861 | /* |
2542 | * fork()/clone()-time setup: | 2862 | * fork()/clone()-time setup: |
2543 | */ | 2863 | */ |
2544 | void sched_fork(struct task_struct *p, int clone_flags) | 2864 | void sched_fork(struct task_struct *p) |
2545 | { | 2865 | { |
2866 | unsigned long flags; | ||
2546 | int cpu = get_cpu(); | 2867 | int cpu = get_cpu(); |
2547 | 2868 | ||
2548 | __sched_fork(p); | 2869 | __sched_fork(p); |
@@ -2594,22 +2915,24 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2594 | * | 2915 | * |
2595 | * Silence PROVE_RCU. | 2916 | * Silence PROVE_RCU. |
2596 | */ | 2917 | */ |
2597 | rcu_read_lock(); | 2918 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2598 | set_task_cpu(p, cpu); | 2919 | set_task_cpu(p, cpu); |
2599 | rcu_read_unlock(); | 2920 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
2600 | 2921 | ||
2601 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 2922 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
2602 | if (likely(sched_info_on())) | 2923 | if (likely(sched_info_on())) |
2603 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 2924 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
2604 | #endif | 2925 | #endif |
2605 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 2926 | #if defined(CONFIG_SMP) |
2606 | p->oncpu = 0; | 2927 | p->on_cpu = 0; |
2607 | #endif | 2928 | #endif |
2608 | #ifdef CONFIG_PREEMPT | 2929 | #ifdef CONFIG_PREEMPT |
2609 | /* Want to start with kernel preemption disabled. */ | 2930 | /* Want to start with kernel preemption disabled. */ |
2610 | task_thread_info(p)->preempt_count = 1; | 2931 | task_thread_info(p)->preempt_count = 1; |
2611 | #endif | 2932 | #endif |
2933 | #ifdef CONFIG_SMP | ||
2612 | plist_node_init(&p->pushable_tasks, MAX_PRIO); | 2934 | plist_node_init(&p->pushable_tasks, MAX_PRIO); |
2935 | #endif | ||
2613 | 2936 | ||
2614 | put_cpu(); | 2937 | put_cpu(); |
2615 | } | 2938 | } |
@@ -2621,41 +2944,31 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2621 | * that must be done for every newly created context, then puts the task | 2944 | * that must be done for every newly created context, then puts the task |
2622 | * on the runqueue and wakes it. | 2945 | * on the runqueue and wakes it. |
2623 | */ | 2946 | */ |
2624 | void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | 2947 | void wake_up_new_task(struct task_struct *p) |
2625 | { | 2948 | { |
2626 | unsigned long flags; | 2949 | unsigned long flags; |
2627 | struct rq *rq; | 2950 | struct rq *rq; |
2628 | int cpu __maybe_unused = get_cpu(); | ||
2629 | 2951 | ||
2952 | raw_spin_lock_irqsave(&p->pi_lock, flags); | ||
2630 | #ifdef CONFIG_SMP | 2953 | #ifdef CONFIG_SMP |
2631 | rq = task_rq_lock(p, &flags); | ||
2632 | p->state = TASK_WAKING; | ||
2633 | |||
2634 | /* | 2954 | /* |
2635 | * Fork balancing, do it here and not earlier because: | 2955 | * Fork balancing, do it here and not earlier because: |
2636 | * - cpus_allowed can change in the fork path | 2956 | * - cpus_allowed can change in the fork path |
2637 | * - any previously selected cpu might disappear through hotplug | 2957 | * - any previously selected cpu might disappear through hotplug |
2638 | * | ||
2639 | * We set TASK_WAKING so that select_task_rq() can drop rq->lock | ||
2640 | * without people poking at ->cpus_allowed. | ||
2641 | */ | 2958 | */ |
2642 | cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); | 2959 | set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); |
2643 | set_task_cpu(p, cpu); | ||
2644 | |||
2645 | p->state = TASK_RUNNING; | ||
2646 | task_rq_unlock(rq, &flags); | ||
2647 | #endif | 2960 | #endif |
2648 | 2961 | ||
2649 | rq = task_rq_lock(p, &flags); | 2962 | rq = __task_rq_lock(p); |
2650 | activate_task(rq, p, 0); | 2963 | activate_task(rq, p, 0); |
2651 | trace_sched_wakeup_new(p, 1); | 2964 | p->on_rq = 1; |
2965 | trace_sched_wakeup_new(p, true); | ||
2652 | check_preempt_curr(rq, p, WF_FORK); | 2966 | check_preempt_curr(rq, p, WF_FORK); |
2653 | #ifdef CONFIG_SMP | 2967 | #ifdef CONFIG_SMP |
2654 | if (p->sched_class->task_woken) | 2968 | if (p->sched_class->task_woken) |
2655 | p->sched_class->task_woken(rq, p); | 2969 | p->sched_class->task_woken(rq, p); |
2656 | #endif | 2970 | #endif |
2657 | task_rq_unlock(rq, &flags); | 2971 | task_rq_unlock(rq, p, &flags); |
2658 | put_cpu(); | ||
2659 | } | 2972 | } |
2660 | 2973 | ||
2661 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2974 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
@@ -2733,9 +3046,12 @@ static inline void | |||
2733 | prepare_task_switch(struct rq *rq, struct task_struct *prev, | 3046 | prepare_task_switch(struct rq *rq, struct task_struct *prev, |
2734 | struct task_struct *next) | 3047 | struct task_struct *next) |
2735 | { | 3048 | { |
3049 | sched_info_switch(prev, next); | ||
3050 | perf_event_task_sched_out(prev, next); | ||
2736 | fire_sched_out_preempt_notifiers(prev, next); | 3051 | fire_sched_out_preempt_notifiers(prev, next); |
2737 | prepare_lock_switch(rq, next); | 3052 | prepare_lock_switch(rq, next); |
2738 | prepare_arch_switch(next); | 3053 | prepare_arch_switch(next); |
3054 | trace_sched_switch(prev, next); | ||
2739 | } | 3055 | } |
2740 | 3056 | ||
2741 | /** | 3057 | /** |
@@ -2879,7 +3195,7 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2879 | struct mm_struct *mm, *oldmm; | 3195 | struct mm_struct *mm, *oldmm; |
2880 | 3196 | ||
2881 | prepare_task_switch(rq, prev, next); | 3197 | prepare_task_switch(rq, prev, next); |
2882 | trace_sched_switch(prev, next); | 3198 | |
2883 | mm = next->mm; | 3199 | mm = next->mm; |
2884 | oldmm = prev->active_mm; | 3200 | oldmm = prev->active_mm; |
2885 | /* | 3201 | /* |
@@ -2889,14 +3205,14 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2889 | */ | 3205 | */ |
2890 | arch_start_context_switch(prev); | 3206 | arch_start_context_switch(prev); |
2891 | 3207 | ||
2892 | if (likely(!mm)) { | 3208 | if (!mm) { |
2893 | next->active_mm = oldmm; | 3209 | next->active_mm = oldmm; |
2894 | atomic_inc(&oldmm->mm_count); | 3210 | atomic_inc(&oldmm->mm_count); |
2895 | enter_lazy_tlb(oldmm, next); | 3211 | enter_lazy_tlb(oldmm, next); |
2896 | } else | 3212 | } else |
2897 | switch_mm(oldmm, mm, next); | 3213 | switch_mm(oldmm, mm, next); |
2898 | 3214 | ||
2899 | if (likely(!prev->mm)) { | 3215 | if (!prev->mm) { |
2900 | prev->active_mm = NULL; | 3216 | prev->active_mm = NULL; |
2901 | rq->prev_mm = oldmm; | 3217 | rq->prev_mm = oldmm; |
2902 | } | 3218 | } |
@@ -3011,6 +3327,15 @@ static long calc_load_fold_active(struct rq *this_rq) | |||
3011 | return delta; | 3327 | return delta; |
3012 | } | 3328 | } |
3013 | 3329 | ||
3330 | static unsigned long | ||
3331 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
3332 | { | ||
3333 | load *= exp; | ||
3334 | load += active * (FIXED_1 - exp); | ||
3335 | load += 1UL << (FSHIFT - 1); | ||
3336 | return load >> FSHIFT; | ||
3337 | } | ||
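calc_load() is a fixed-point exponential moving average; the new 1UL << (FSHIFT - 1) term makes it round to nearest instead of truncating. A standalone sketch of the arithmetic, assuming the usual mainline constants (FSHIFT = 11, so FIXED_1 = 2048, and EXP_1 = 1884 for a 1-minute average sampled every 5 seconds):

#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)
#define EXP_1   1884UL          /* ~ 2048 / e^(5s/1min) */

static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        load += 1UL << (FSHIFT - 1);    /* round to nearest */
        return load >> FSHIFT;
}

int main(void)
{
        unsigned long avg = 0;
        unsigned long active = 3 * FIXED_1;     /* 3 runnable tasks */

        for (int i = 0; i < 24; i++)            /* two minutes of 5s samples */
                avg = calc_load(avg, EXP_1, active);

        printf("1-min loadavg ~ %lu.%02lu\n", avg >> FSHIFT,
               (avg & (FIXED_1 - 1)) * 100 / FIXED_1);
        return 0;
}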
3338 | |||
3014 | #ifdef CONFIG_NO_HZ | 3339 | #ifdef CONFIG_NO_HZ |
3015 | /* | 3340 | /* |
3016 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. | 3341 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. |
@@ -3040,6 +3365,128 @@ static long calc_load_fold_idle(void) | |||
3040 | 3365 | ||
3041 | return delta; | 3366 | return delta; |
3042 | } | 3367 | } |
3368 | |||
3369 | /** | ||
3370 | * fixed_power_int - compute: x^n, in O(log n) time | ||
3371 | * | ||
3372 | * @x: base of the power | ||
3373 | * @frac_bits: fractional bits of @x | ||
3374 | * @n: power to raise @x to. | ||
3375 | * | ||
3376 | * By exploiting the relation between the definition of the natural power | ||
3377 | * function: x^n := x*x*...*x (x multiplied by itself for n times), and | ||
3378 | * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, | ||
3379 | * (where: n_i \elem {0, 1}, the binary vector representing n), | ||
3380 | * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is | ||
3381 | * of course trivially computable in O(log_2 n), the length of our binary | ||
3382 | * vector. | ||
3383 | */ | ||
3384 | static unsigned long | ||
3385 | fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) | ||
3386 | { | ||
3387 | unsigned long result = 1UL << frac_bits; | ||
3388 | |||
3389 | if (n) for (;;) { | ||
3390 | if (n & 1) { | ||
3391 | result *= x; | ||
3392 | result += 1UL << (frac_bits - 1); | ||
3393 | result >>= frac_bits; | ||
3394 | } | ||
3395 | n >>= 1; | ||
3396 | if (!n) | ||
3397 | break; | ||
3398 | x *= x; | ||
3399 | x += 1UL << (frac_bits - 1); | ||
3400 | x >>= frac_bits; | ||
3401 | } | ||
3402 | |||
3403 | return result; | ||
3404 | } | ||
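A quick standalone harness for the routine above (body copied verbatim) that cross-checks the fixed-point powers against floating point; purely illustrative, build with -lm:

#include <math.h>
#include <stdio.h>

static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
        unsigned long result = 1UL << frac_bits;

        if (n) for (;;) {
                if (n & 1) {
                        result *= x;
                        result += 1UL << (frac_bits - 1);
                        result >>= frac_bits;
                }
                n >>= 1;
                if (!n)
                        break;
                x *= x;
                x += 1UL << (frac_bits - 1);
                x >>= frac_bits;
        }

        return result;
}

int main(void)
{
        const unsigned long e1 = 1884;  /* EXP_1 in 11-bit fixed point */

        for (unsigned int n = 1; n <= 16; n <<= 1)
                printf("e1^%-2u  fixed=%4lu  float=%7.1f\n", n,
                       fixed_power_int(e1, 11, n),
                       2048.0 * pow(1884.0 / 2048.0, n));
        return 0;
}

Only O(log n) multiplies are needed, which is what lets the NO_HZ code below age the load averages over an arbitrary number of missed cycles in one shot.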
3405 | |||
3406 | /* | ||
3407 | * a1 = a0 * e + a * (1 - e) | ||
3408 | * | ||
3409 | * a2 = a1 * e + a * (1 - e) | ||
3410 | * = (a0 * e + a * (1 - e)) * e + a * (1 - e) | ||
3411 | * = a0 * e^2 + a * (1 - e) * (1 + e) | ||
3412 | * | ||
3413 | * a3 = a2 * e + a * (1 - e) | ||
3414 | * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) | ||
3415 | * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) | ||
3416 | * | ||
3417 | * ... | ||
3418 | * | ||
3419 | * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] | ||
3420 | * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) | ||
3421 | * = a0 * e^n + a * (1 - e^n) | ||
3422 | * | ||
3423 | * [1] application of the geometric series: | ||
3424 | * | ||
3425 | * n 1 - x^(n+1) | ||
3426 | * S_n := \Sum x^i = ------------- | ||
3427 | * i=0 1 - x | ||
3428 | */ | ||
3429 | static unsigned long | ||
3430 | calc_load_n(unsigned long load, unsigned long exp, | ||
3431 | unsigned long active, unsigned int n) | ||
3432 | { | ||
3433 | |||
3434 | return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); | ||
3435 | } | ||
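For reference, the closed form the ASCII derivation above reaches, written out:

\[
  a_n = a_0 e^{n} + a (1 - e) \sum_{i=0}^{n-1} e^{i}
      = a_0 e^{n} + a (1 - e) \frac{1 - e^{n}}{1 - e}
      = a_0 e^{n} + a \left(1 - e^{n}\right),
\]

which is exactly what calc_load_n() evaluates, with e^n computed by fixed_power_int().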
3436 | |||
3437 | /* | ||
3438 | * NO_HZ can leave us missing all per-cpu ticks calling | ||
3439 | * calc_load_account_active(), but since an idle CPU folds its delta into | ||
3440 | * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold | ||
3441 | * in the pending idle delta if our idle period crossed a load cycle boundary. | ||
3442 | * | ||
3443 | * Once we've updated the global active value, we need to apply the exponential | ||
3444 | * weights adjusted to the number of cycles missed. | ||
3445 | */ | ||
3446 | static void calc_global_nohz(unsigned long ticks) | ||
3447 | { | ||
3448 | long delta, active, n; | ||
3449 | |||
3450 | if (time_before(jiffies, calc_load_update)) | ||
3451 | return; | ||
3452 | |||
3453 | /* | ||
3454 | * If we crossed a calc_load_update boundary, make sure to fold | ||
3455 | * any pending idle changes, the respective CPUs might have | ||
3456 | * missed the tick driven calc_load_account_active() update | ||
3457 | * due to NO_HZ. | ||
3458 | */ | ||
3459 | delta = calc_load_fold_idle(); | ||
3460 | if (delta) | ||
3461 | atomic_long_add(delta, &calc_load_tasks); | ||
3462 | |||
3463 | /* | ||
3464 | * If we were idle for multiple load cycles, apply them. | ||
3465 | */ | ||
3466 | if (ticks >= LOAD_FREQ) { | ||
3467 | n = ticks / LOAD_FREQ; | ||
3468 | |||
3469 | active = atomic_long_read(&calc_load_tasks); | ||
3470 | active = active > 0 ? active * FIXED_1 : 0; | ||
3471 | |||
3472 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | ||
3473 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | ||
3474 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
3475 | |||
3476 | calc_load_update += n * LOAD_FREQ; | ||
3477 | } | ||
3478 | |||
3479 | /* | ||
3480 | * It's possible the remainder of the above division also crosses | ||
3481 | * a LOAD_FREQ period, the regular check in calc_global_load() | ||
3482 | * which comes after this will take care of that. | ||
3483 | * | ||
3484 | * Consider us being 11 ticks before a cycle completion, and us | ||
3485 | * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will | ||
3486 | * age us 4 cycles, and the test in calc_global_load() will | ||
3487 | * pick up the final one. | ||
3488 | */ | ||
3489 | } | ||
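The 11-tick example in the closing comment checks out with a few lines of arithmetic, assuming HZ = 1000 and the usual LOAD_FREQ of 5*HZ + 1:

#include <stdio.h>

int main(void)
{
        unsigned long hz = 1000, load_freq = 5 * hz + 1;
        unsigned long before = 11;                      /* ticks to the next boundary */
        unsigned long ticks = 4 * load_freq + 22;       /* length of the idle period */

        unsigned long n = ticks / load_freq;            /* folded by calc_global_nohz(): 4 */
        unsigned long rem = ticks % load_freq;          /* 22 ticks left over */

        printf("folded %lu cycles, remainder %lu ticks\n", n, rem);
        printf("remainder still crosses a boundary: %s\n",
               rem > before ? "yes, the regular check picks it up" : "no");
        return 0;
}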
3043 | #else | 3490 | #else |
3044 | static void calc_load_account_idle(struct rq *this_rq) | 3491 | static void calc_load_account_idle(struct rq *this_rq) |
3045 | { | 3492 | { |
@@ -3049,6 +3496,10 @@ static inline long calc_load_fold_idle(void) | |||
3049 | { | 3496 | { |
3050 | return 0; | 3497 | return 0; |
3051 | } | 3498 | } |
3499 | |||
3500 | static void calc_global_nohz(unsigned long ticks) | ||
3501 | { | ||
3502 | } | ||
3052 | #endif | 3503 | #endif |
3053 | 3504 | ||
3054 | /** | 3505 | /** |
@@ -3066,24 +3517,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | |||
3066 | loads[2] = (avenrun[2] + offset) << shift; | 3517 | loads[2] = (avenrun[2] + offset) << shift; |
3067 | } | 3518 | } |
3068 | 3519 | ||
3069 | static unsigned long | ||
3070 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
3071 | { | ||
3072 | load *= exp; | ||
3073 | load += active * (FIXED_1 - exp); | ||
3074 | return load >> FSHIFT; | ||
3075 | } | ||
3076 | |||
3077 | /* | 3520 | /* |
3078 | * calc_load - update the avenrun load estimates 10 ticks after the | 3521 | * calc_load - update the avenrun load estimates 10 ticks after the |
3079 | * CPUs have updated calc_load_tasks. | 3522 | * CPUs have updated calc_load_tasks. |
3080 | */ | 3523 | */ |
3081 | void calc_global_load(void) | 3524 | void calc_global_load(unsigned long ticks) |
3082 | { | 3525 | { |
3083 | unsigned long upd = calc_load_update + 10; | ||
3084 | long active; | 3526 | long active; |
3085 | 3527 | ||
3086 | if (time_before(jiffies, upd)) | 3528 | calc_global_nohz(ticks); |
3529 | |||
3530 | if (time_before(jiffies, calc_load_update + 10)) | ||
3087 | return; | 3531 | return; |
3088 | 3532 | ||
3089 | active = atomic_long_read(&calc_load_tasks); | 3533 | active = atomic_long_read(&calc_load_tasks); |
@@ -3244,27 +3688,22 @@ void sched_exec(void) | |||
3244 | { | 3688 | { |
3245 | struct task_struct *p = current; | 3689 | struct task_struct *p = current; |
3246 | unsigned long flags; | 3690 | unsigned long flags; |
3247 | struct rq *rq; | ||
3248 | int dest_cpu; | 3691 | int dest_cpu; |
3249 | 3692 | ||
3250 | rq = task_rq_lock(p, &flags); | 3693 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
3251 | dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); | 3694 | dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); |
3252 | if (dest_cpu == smp_processor_id()) | 3695 | if (dest_cpu == smp_processor_id()) |
3253 | goto unlock; | 3696 | goto unlock; |
3254 | 3697 | ||
3255 | /* | 3698 | if (likely(cpu_active(dest_cpu))) { |
3256 | * select_task_rq() can race against ->cpus_allowed | ||
3257 | */ | ||
3258 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && | ||
3259 | likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { | ||
3260 | struct migration_arg arg = { p, dest_cpu }; | 3699 | struct migration_arg arg = { p, dest_cpu }; |
3261 | 3700 | ||
3262 | task_rq_unlock(rq, &flags); | 3701 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
3263 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); | 3702 | stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); |
3264 | return; | 3703 | return; |
3265 | } | 3704 | } |
3266 | unlock: | 3705 | unlock: |
3267 | task_rq_unlock(rq, &flags); | 3706 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
3268 | } | 3707 | } |
3269 | 3708 | ||
3270 | #endif | 3709 | #endif |
@@ -3285,7 +3724,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) | |||
3285 | 3724 | ||
3286 | if (task_current(rq, p)) { | 3725 | if (task_current(rq, p)) { |
3287 | update_rq_clock(rq); | 3726 | update_rq_clock(rq); |
3288 | ns = rq->clock - p->se.exec_start; | 3727 | ns = rq->clock_task - p->se.exec_start; |
3289 | if ((s64)ns < 0) | 3728 | if ((s64)ns < 0) |
3290 | ns = 0; | 3729 | ns = 0; |
3291 | } | 3730 | } |
@@ -3301,7 +3740,7 @@ unsigned long long task_delta_exec(struct task_struct *p) | |||
3301 | 3740 | ||
3302 | rq = task_rq_lock(p, &flags); | 3741 | rq = task_rq_lock(p, &flags); |
3303 | ns = do_task_delta_exec(p, rq); | 3742 | ns = do_task_delta_exec(p, rq); |
3304 | task_rq_unlock(rq, &flags); | 3743 | task_rq_unlock(rq, p, &flags); |
3305 | 3744 | ||
3306 | return ns; | 3745 | return ns; |
3307 | } | 3746 | } |
@@ -3319,7 +3758,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
3319 | 3758 | ||
3320 | rq = task_rq_lock(p, &flags); | 3759 | rq = task_rq_lock(p, &flags); |
3321 | ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); | 3760 | ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); |
3322 | task_rq_unlock(rq, &flags); | 3761 | task_rq_unlock(rq, p, &flags); |
3323 | 3762 | ||
3324 | return ns; | 3763 | return ns; |
3325 | } | 3764 | } |
@@ -3343,7 +3782,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p) | |||
3343 | rq = task_rq_lock(p, &flags); | 3782 | rq = task_rq_lock(p, &flags); |
3344 | thread_group_cputime(p, &totals); | 3783 | thread_group_cputime(p, &totals); |
3345 | ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); | 3784 | ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); |
3346 | task_rq_unlock(rq, &flags); | 3785 | task_rq_unlock(rq, p, &flags); |
3347 | 3786 | ||
3348 | return ns; | 3787 | return ns; |
3349 | } | 3788 | } |
@@ -3408,6 +3847,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, | |||
3408 | } | 3847 | } |
3409 | 3848 | ||
3410 | /* | 3849 | /* |
3850 | * Account system cpu time to a process and desired cpustat field | ||
3851 | * @p: the process that the cpu time gets accounted to | ||
3852 | * @cputime: the cpu time spent in kernel space since the last update | ||
3853 | * @cputime_scaled: cputime scaled by cpu frequency | ||
3854 | * @target_cputime64: pointer to cpustat field that has to be updated | ||
3855 | */ | ||
3856 | static inline | ||
3857 | void __account_system_time(struct task_struct *p, cputime_t cputime, | ||
3858 | cputime_t cputime_scaled, cputime64_t *target_cputime64) | ||
3859 | { | ||
3860 | cputime64_t tmp = cputime_to_cputime64(cputime); | ||
3861 | |||
3862 | /* Add system time to process. */ | ||
3863 | p->stime = cputime_add(p->stime, cputime); | ||
3864 | p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); | ||
3865 | account_group_system_time(p, cputime); | ||
3866 | |||
3867 | /* Add system time to cpustat. */ | ||
3868 | *target_cputime64 = cputime64_add(*target_cputime64, tmp); | ||
3869 | cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); | ||
3870 | |||
3871 | /* Account for system time used */ | ||
3872 | acct_update_integrals(p); | ||
3873 | } | ||
3874 | |||
3875 | /* | ||
3411 | * Account system cpu time to a process. | 3876 | * Account system cpu time to a process. |
3412 | * @p: the process that the cpu time gets accounted to | 3877 | * @p: the process that the cpu time gets accounted to |
3413 | * @hardirq_offset: the offset to subtract from hardirq_count() | 3878 | * @hardirq_offset: the offset to subtract from hardirq_count() |
@@ -3418,36 +3883,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
3418 | cputime_t cputime, cputime_t cputime_scaled) | 3883 | cputime_t cputime, cputime_t cputime_scaled) |
3419 | { | 3884 | { |
3420 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3885 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
3421 | cputime64_t tmp; | 3886 | cputime64_t *target_cputime64; |
3422 | 3887 | ||
3423 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { | 3888 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { |
3424 | account_guest_time(p, cputime, cputime_scaled); | 3889 | account_guest_time(p, cputime, cputime_scaled); |
3425 | return; | 3890 | return; |
3426 | } | 3891 | } |
3427 | 3892 | ||
3428 | /* Add system time to process. */ | ||
3429 | p->stime = cputime_add(p->stime, cputime); | ||
3430 | p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); | ||
3431 | account_group_system_time(p, cputime); | ||
3432 | |||
3433 | /* Add system time to cpustat. */ | ||
3434 | tmp = cputime_to_cputime64(cputime); | ||
3435 | if (hardirq_count() - hardirq_offset) | 3893 | if (hardirq_count() - hardirq_offset) |
3436 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | 3894 | target_cputime64 = &cpustat->irq; |
3437 | else if (softirq_count()) | 3895 | else if (in_serving_softirq()) |
3438 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | 3896 | target_cputime64 = &cpustat->softirq; |
3439 | else | 3897 | else |
3440 | cpustat->system = cputime64_add(cpustat->system, tmp); | 3898 | target_cputime64 = &cpustat->system; |
3441 | |||
3442 | cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); | ||
3443 | 3899 | ||
3444 | /* Account for system time used */ | 3900 | __account_system_time(p, cputime, cputime_scaled, target_cputime64); |
3445 | acct_update_integrals(p); | ||
3446 | } | 3901 | } |
3447 | 3902 | ||
3448 | /* | 3903 | /* |
3449 | * Account for involuntary wait time. | 3904 | * Account for involuntary wait time. |
3450 | * @steal: the cpu time spent in involuntary wait | 3905 | * @cputime: the cpu time spent in involuntary wait |
3451 | */ | 3906 | */ |
3452 | void account_steal_time(cputime_t cputime) | 3907 | void account_steal_time(cputime_t cputime) |
3453 | { | 3908 | { |
@@ -3475,6 +3930,73 @@ void account_idle_time(cputime_t cputime) | |||
3475 | 3930 | ||
3476 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 3931 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
3477 | 3932 | ||
3933 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
3934 | /* | ||
3935 | * Account a tick to a process and cpustat | ||
3936 | * @p: the process that the cpu time gets accounted to | ||
3937 | * @user_tick: is the tick from userspace | ||
3938 | * @rq: the pointer to rq | ||
3939 | * | ||
3940 | * Tick demultiplexing follows the order | ||
3941 | * - pending hardirq update | ||
3942 | * - pending softirq update | ||
3943 | * - user_time | ||
3944 | * - idle_time | ||
3945 | * - system time | ||
3946 | * - check for guest_time | ||
3947 | * - else account as system_time | ||
3948 | * | ||
3949 | * Check for hardirq is done both for system and user time as there is | ||
3950 | * no timer going off while we are on hardirq and hence we may never get an | ||
3951 | * opportunity to update it solely in system time. | ||
3952 | * p->stime and friends are only updated on system time and not on irq | ||
3953 | * softirq as those do not count in task exec_runtime any more. | ||
3954 | */ | ||
3955 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
3956 | struct rq *rq) | ||
3957 | { | ||
3958 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | ||
3959 | cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); | ||
3960 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
3961 | |||
3962 | if (irqtime_account_hi_update()) { | ||
3963 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | ||
3964 | } else if (irqtime_account_si_update()) { | ||
3965 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | ||
3966 | } else if (this_cpu_ksoftirqd() == p) { | ||
3967 | /* | ||
3968 | * ksoftirqd time do not get accounted in cpu_softirq_time. | ||
3969 | * So, we have to handle it separately here. | ||
3970 | * Also, p->stime needs to be updated for ksoftirqd. | ||
3971 | */ | ||
3972 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
3973 | &cpustat->softirq); | ||
3974 | } else if (user_tick) { | ||
3975 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
3976 | } else if (p == rq->idle) { | ||
3977 | account_idle_time(cputime_one_jiffy); | ||
3978 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ | ||
3979 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
3980 | } else { | ||
3981 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
3982 | &cpustat->system); | ||
3983 | } | ||
3984 | } | ||
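The demultiplexing order spelled out in the comment is easiest to read as a plain priority cascade. A user-space sketch with stand-in predicates; none of these helpers exist in the kernel, they only mirror the order of the checks:

#include <stdio.h>

enum bucket { HARDIRQ, SOFTIRQ, USER, IDLE, GUEST, SYSTEM };

struct tick_ctx {
        int hi_pending, si_pending, is_ksoftirqd;
        int user_tick, is_idle, is_vcpu;
};

static enum bucket classify_tick(const struct tick_ctx *c)
{
        if (c->hi_pending)
                return HARDIRQ;         /* pending hardirq time first */
        if (c->si_pending)
                return SOFTIRQ;         /* then pending softirq time */
        if (c->is_ksoftirqd)
                return SOFTIRQ;         /* ksoftirqd handled separately, via p->stime */
        if (c->user_tick)
                return USER;
        if (c->is_idle)
                return IDLE;
        if (c->is_vcpu)
                return GUEST;
        return SYSTEM;
}

int main(void)
{
        struct tick_ctx c = { .user_tick = 1 };

        printf("bucket = %d (USER = %d)\n", classify_tick(&c), USER);
        return 0;
}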
3985 | |||
3986 | static void irqtime_account_idle_ticks(int ticks) | ||
3987 | { | ||
3988 | int i; | ||
3989 | struct rq *rq = this_rq(); | ||
3990 | |||
3991 | for (i = 0; i < ticks; i++) | ||
3992 | irqtime_account_process_tick(current, 0, rq); | ||
3993 | } | ||
3994 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
3995 | static void irqtime_account_idle_ticks(int ticks) {} | ||
3996 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
3997 | struct rq *rq) {} | ||
3998 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
3999 | |||
3478 | /* | 4000 | /* |
3479 | * Account a single tick of cpu time. | 4001 | * Account a single tick of cpu time. |
3480 | * @p: the process that the cpu time gets accounted to | 4002 | * @p: the process that the cpu time gets accounted to |
@@ -3485,6 +4007,11 @@ void account_process_tick(struct task_struct *p, int user_tick) | |||
3485 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | 4007 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); |
3486 | struct rq *rq = this_rq(); | 4008 | struct rq *rq = this_rq(); |
3487 | 4009 | ||
4010 | if (sched_clock_irqtime) { | ||
4011 | irqtime_account_process_tick(p, user_tick, rq); | ||
4012 | return; | ||
4013 | } | ||
4014 | |||
3488 | if (user_tick) | 4015 | if (user_tick) |
3489 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | 4016 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); |
3490 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | 4017 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) |
@@ -3510,6 +4037,12 @@ void account_steal_ticks(unsigned long ticks) | |||
3510 | */ | 4037 | */ |
3511 | void account_idle_ticks(unsigned long ticks) | 4038 | void account_idle_ticks(unsigned long ticks) |
3512 | { | 4039 | { |
4040 | |||
4041 | if (sched_clock_irqtime) { | ||
4042 | irqtime_account_idle_ticks(ticks); | ||
4043 | return; | ||
4044 | } | ||
4045 | |||
3513 | account_idle_time(jiffies_to_cputime(ticks)); | 4046 | account_idle_time(jiffies_to_cputime(ticks)); |
3514 | } | 4047 | } |
3515 | 4048 | ||
@@ -3603,9 +4136,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
3603 | /* | 4136 | /* |
3604 | * This function gets called by the timer code, with HZ frequency. | 4137 | * This function gets called by the timer code, with HZ frequency. |
3605 | * We call it with interrupts disabled. | 4138 | * We call it with interrupts disabled. |
3606 | * | ||
3607 | * It also gets called by the fork code, when changing the parent's | ||
3608 | * timeslices. | ||
3609 | */ | 4139 | */ |
3610 | void scheduler_tick(void) | 4140 | void scheduler_tick(void) |
3611 | { | 4141 | { |
@@ -3627,7 +4157,7 @@ void scheduler_tick(void) | |||
3627 | 4157 | ||
3628 | raw_spin_unlock(&rq->lock); | 4158 | raw_spin_unlock(&rq->lock); |
3629 | 4159 | ||
3630 | perf_event_task_tick(curr); | 4160 | perf_event_task_tick(); |
3631 | 4161 | ||
3632 | #ifdef CONFIG_SMP | 4162 | #ifdef CONFIG_SMP |
3633 | rq->idle_at_tick = idle_cpu(cpu); | 4163 | rq->idle_at_tick = idle_cpu(cpu); |
@@ -3733,19 +4263,12 @@ static inline void schedule_debug(struct task_struct *prev) | |||
3733 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 4263 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
3734 | 4264 | ||
3735 | schedstat_inc(this_rq(), sched_count); | 4265 | schedstat_inc(this_rq(), sched_count); |
3736 | #ifdef CONFIG_SCHEDSTATS | ||
3737 | if (unlikely(prev->lock_depth >= 0)) { | ||
3738 | schedstat_inc(this_rq(), bkl_count); | ||
3739 | schedstat_inc(prev, sched_info.bkl_count); | ||
3740 | } | ||
3741 | #endif | ||
3742 | } | 4266 | } |
3743 | 4267 | ||
3744 | static void put_prev_task(struct rq *rq, struct task_struct *prev) | 4268 | static void put_prev_task(struct rq *rq, struct task_struct *prev) |
3745 | { | 4269 | { |
3746 | if (prev->se.on_rq) | 4270 | if (prev->on_rq || rq->skip_clock_update < 0) |
3747 | update_rq_clock(rq); | 4271 | update_rq_clock(rq); |
3748 | rq->skip_clock_update = 0; | ||
3749 | prev->sched_class->put_prev_task(rq, prev); | 4272 | prev->sched_class->put_prev_task(rq, prev); |
3750 | } | 4273 | } |
3751 | 4274 | ||
@@ -3776,17 +4299,13 @@ pick_next_task(struct rq *rq) | |||
3776 | } | 4299 | } |
3777 | */ | 4300 | */ |
3778 | 4301 | ||
3779 | class = sched_class_highest; | 4302 | for_each_class(class) { |
3780 | for ( ; ; ) { | ||
3781 | p = class->pick_next_task(rq); | 4303 | p = class->pick_next_task(rq); |
3782 | if (p) | 4304 | if (p) |
3783 | return p; | 4305 | return p; |
3784 | /* | ||
3785 | * Will never be NULL as the idle class always | ||
3786 | * returns a non-NULL p: | ||
3787 | */ | ||
3788 | class = class->next; | ||
3789 | } | 4306 | } |
4307 | |||
4308 | BUG(); /* the idle class will always have a runnable task */ | ||
3790 | } | 4309 | } |
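pick_next_task() now walks the class list with for_each_class() and relies on the idle class to terminate the walk. The shape of that loop, reduced to an array of picker functions with illustrative names:

#include <stddef.h>
#include <stdio.h>

struct task { const char *comm; };

typedef struct task *(*pick_fn)(void);

static struct task *pick_rt(void)   { return NULL; }   /* nothing runnable */
static struct task *pick_fair(void) { static struct task t = { "cfs_task" }; return &t; }
static struct task *pick_idle(void) { static struct task t = { "swapper" };  return &t; }

static pick_fn classes[] = { pick_rt, pick_fair, pick_idle };   /* highest first */

int main(void)
{
        for (size_t i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
                struct task *p = classes[i]();

                if (p) {
                        printf("next: %s\n", p->comm);
                        return 0;
                }
        }
        return 1;       /* "unreachable": the idle class always returns a task */
}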
3791 | 4310 | ||
3792 | /* | 4311 | /* |
@@ -3807,8 +4326,10 @@ need_resched: | |||
3807 | rcu_note_context_switch(cpu); | 4326 | rcu_note_context_switch(cpu); |
3808 | prev = rq->curr; | 4327 | prev = rq->curr; |
3809 | 4328 | ||
3810 | release_kernel_lock(prev); | 4329 | /* LITMUS^RT: quickly re-evaluate the scheduling decision |
3811 | need_resched_nonpreemptible: | 4330 | * if the previous one is no longer valid after CTX. |
4331 | */ | ||
4332 | litmus_need_resched_nonpreemptible: | ||
3812 | TS_SCHED_START; | 4333 | TS_SCHED_START; |
3813 | TS_LVLA_SCHED_START; | 4334 | TS_LVLA_SCHED_START; |
3814 | TS_LVLB_SCHED_START; | 4335 | TS_LVLB_SCHED_START; |
@@ -3821,18 +4342,19 @@ need_resched_nonpreemptible: | |||
3821 | hrtick_clear(rq); | 4342 | hrtick_clear(rq); |
3822 | 4343 | ||
3823 | raw_spin_lock_irq(&rq->lock); | 4344 | raw_spin_lock_irq(&rq->lock); |
3824 | clear_tsk_need_resched(prev); | ||
3825 | 4345 | ||
3826 | switch_count = &prev->nivcsw; | 4346 | switch_count = &prev->nivcsw; |
3827 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 4347 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
3828 | if (unlikely(signal_pending_state(prev->state, prev))) { | 4348 | if (unlikely(signal_pending_state(prev->state, prev))) { |
3829 | prev->state = TASK_RUNNING; | 4349 | prev->state = TASK_RUNNING; |
3830 | } else { | 4350 | } else { |
4351 | deactivate_task(rq, prev, DEQUEUE_SLEEP); | ||
4352 | prev->on_rq = 0; | ||
4353 | |||
3831 | /* | 4354 | /* |
3832 | * If a worker is going to sleep, notify and | 4355 | * If a worker went to sleep, notify and ask workqueue |
3833 | * ask workqueue whether it wants to wake up a | 4356 | * whether it wants to wake up a task to maintain |
3834 | * task to maintain concurrency. If so, wake | 4357 | * concurrency. |
3835 | * up the task. | ||
3836 | */ | 4358 | */ |
3837 | if (prev->flags & PF_WQ_WORKER) { | 4359 | if (prev->flags & PF_WQ_WORKER) { |
3838 | struct task_struct *to_wakeup; | 4360 | struct task_struct *to_wakeup; |
@@ -3841,7 +4363,16 @@ need_resched_nonpreemptible: | |||
3841 | if (to_wakeup) | 4363 | if (to_wakeup) |
3842 | try_to_wake_up_local(to_wakeup); | 4364 | try_to_wake_up_local(to_wakeup); |
3843 | } | 4365 | } |
3844 | deactivate_task(rq, prev, DEQUEUE_SLEEP); | 4366 | |
4367 | /* | ||
4368 | * If we are going to sleep and we have plugged IO | ||
4369 | * queued, make sure to submit it to avoid deadlocks. | ||
4370 | */ | ||
4371 | if (blk_needs_flush_plug(prev)) { | ||
4372 | raw_spin_unlock(&rq->lock); | ||
4373 | blk_schedule_flush_plug(prev); | ||
4374 | raw_spin_lock(&rq->lock); | ||
4375 | } | ||
3845 | } | 4376 | } |
3846 | switch_count = &prev->nvcsw; | 4377 | switch_count = &prev->nvcsw; |
3847 | } | 4378 | } |
@@ -3853,11 +4384,10 @@ need_resched_nonpreemptible: | |||
3853 | 4384 | ||
3854 | put_prev_task(rq, prev); | 4385 | put_prev_task(rq, prev); |
3855 | next = pick_next_task(rq); | 4386 | next = pick_next_task(rq); |
4387 | clear_tsk_need_resched(prev); | ||
4388 | rq->skip_clock_update = 0; | ||
3856 | 4389 | ||
3857 | if (likely(prev != next)) { | 4390 | if (likely(prev != next)) { |
3858 | sched_info_switch(prev, next); | ||
3859 | perf_event_task_sched_out(prev, next); | ||
3860 | |||
3861 | rq->nr_switches++; | 4391 | rq->nr_switches++; |
3862 | rq->curr = next; | 4392 | rq->curr = next; |
3863 | ++*switch_count; | 4393 | ++*switch_count; |
@@ -3886,8 +4416,8 @@ need_resched_nonpreemptible: | |||
3886 | 4416 | ||
3887 | post_schedule(rq); | 4417 | post_schedule(rq); |
3888 | 4418 | ||
3889 | if (sched_state_validate_switch() || unlikely(reacquire_kernel_lock(prev))) | 4419 | if (sched_state_validate_switch()) |
3890 | goto need_resched_nonpreemptible; | 4420 | goto litmus_need_resched_nonpreemptible; |
3891 | 4421 | ||
3892 | preempt_enable_no_resched(); | 4422 | preempt_enable_no_resched(); |
3893 | if (need_resched()) | 4423 | if (need_resched()) |
@@ -3898,70 +4428,53 @@ need_resched_nonpreemptible: | |||
3898 | EXPORT_SYMBOL(schedule); | 4428 | EXPORT_SYMBOL(schedule); |
3899 | 4429 | ||
3900 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 4430 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
4431 | |||
4432 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) | ||
4433 | { | ||
4434 | bool ret = false; | ||
4435 | |||
4436 | rcu_read_lock(); | ||
4437 | if (lock->owner != owner) | ||
4438 | goto fail; | ||
4439 | |||
4440 | /* | ||
4441 | * Ensure we emit the owner->on_cpu, dereference _after_ checking | ||
4442 | * lock->owner still matches owner, if that fails, owner might | ||
4443 | * point to free()d memory, if it still matches, the rcu_read_lock() | ||
4444 | * ensures the memory stays valid. | ||
4445 | */ | ||
4446 | barrier(); | ||
4447 | |||
4448 | ret = owner->on_cpu; | ||
4449 | fail: | ||
4450 | rcu_read_unlock(); | ||
4451 | |||
4452 | return ret; | ||
4453 | } | ||
4454 | |||
3901 | /* | 4455 | /* |
3902 | * Look out! "owner" is an entirely speculative pointer | 4456 | * Look out! "owner" is an entirely speculative pointer |
3903 | * access and not reliable. | 4457 | * access and not reliable. |
3904 | */ | 4458 | */ |
3905 | int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) | 4459 | int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) |
3906 | { | 4460 | { |
3907 | unsigned int cpu; | ||
3908 | struct rq *rq; | ||
3909 | |||
3910 | if (!sched_feat(OWNER_SPIN)) | 4461 | if (!sched_feat(OWNER_SPIN)) |
3911 | return 0; | 4462 | return 0; |
3912 | 4463 | ||
3913 | #ifdef CONFIG_DEBUG_PAGEALLOC | 4464 | while (owner_running(lock, owner)) { |
3914 | /* | 4465 | if (need_resched()) |
3915 | * Need to access the cpu field knowing that | 4466 | return 0; |
3916 | * DEBUG_PAGEALLOC could have unmapped it if | ||
3917 | * the mutex owner just released it and exited. | ||
3918 | */ | ||
3919 | if (probe_kernel_address(&owner->cpu, cpu)) | ||
3920 | return 0; | ||
3921 | #else | ||
3922 | cpu = owner->cpu; | ||
3923 | #endif | ||
3924 | 4467 | ||
3925 | /* | 4468 | arch_mutex_cpu_relax(); |
3926 | * Even if the access succeeded (likely case), | 4469 | } |
3927 | * the cpu field may no longer be valid. | ||
3928 | */ | ||
3929 | if (cpu >= nr_cpumask_bits) | ||
3930 | return 0; | ||
3931 | 4470 | ||
3932 | /* | 4471 | /* |
3933 | * We need to validate that we can do a | 4472 | * If the owner changed to another task there is likely |
3934 | * get_cpu() and that we have the percpu area. | 4473 | * heavy contention, stop spinning. |
3935 | */ | 4474 | */ |
3936 | if (!cpu_online(cpu)) | 4475 | if (lock->owner) |
3937 | return 0; | 4476 | return 0; |
3938 | 4477 | ||
3939 | rq = cpu_rq(cpu); | ||
3940 | |||
3941 | for (;;) { | ||
3942 | /* | ||
3943 | * Owner changed, break to re-assess state. | ||
3944 | */ | ||
3945 | if (lock->owner != owner) { | ||
3946 | /* | ||
3947 | * If the lock has switched to a different owner, | ||
3948 | * we likely have heavy contention. Return 0 to quit | ||
3949 | * optimistic spinning and not contend further: | ||
3950 | */ | ||
3951 | if (lock->owner) | ||
3952 | return 0; | ||
3953 | break; | ||
3954 | } | ||
3955 | |||
3956 | /* | ||
3957 | * Is that owner really running on that cpu? | ||
3958 | */ | ||
3959 | if (task_thread_info(rq->curr) != owner || need_resched()) | ||
3960 | return 0; | ||
3961 | |||
3962 | cpu_relax(); | ||
3963 | } | ||
3964 | |||
3965 | return 1; | 4478 | return 1; |
3966 | } | 4479 | } |
3967 | #endif | 4480 | #endif |
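The rewritten mutex_spin_on_owner() spins only while the same owner is still on a CPU, gives up as soon as the spinner itself should reschedule, and treats an owner change as heavy contention. A compact user-space sketch of that policy, with plain atomics standing in for the RCU-protected checks and sched_yield() standing in for arch_mutex_cpu_relax():

#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct owner { atomic_int on_cpu; };
struct lock  { _Atomic(struct owner *) owner; };

static bool owner_running(struct lock *lock, struct owner *owner)
{
        return atomic_load(&lock->owner) == owner && atomic_load(&owner->on_cpu);
}

/* Returns 1 if it is worth retrying the lock, 0 if the caller should block. */
static int spin_on_owner(struct lock *lock, struct owner *owner,
                         atomic_int *need_resched)
{
        while (owner_running(lock, owner)) {
                if (atomic_load(need_resched))
                        return 0;
                sched_yield();
        }

        /* Owner changed to another task: likely heavy contention, stop. */
        return atomic_load(&lock->owner) == NULL;
}

int main(void)
{
        struct owner o = { 0 };         /* previous owner is off the CPU... */
        struct lock  l = { NULL };      /* ...and has released the lock */
        atomic_int need_resched = 0;

        return !spin_on_owner(&l, &o, &need_resched);   /* exit 0: retry the lock */
}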
@@ -4091,6 +4604,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) | |||
4091 | { | 4604 | { |
4092 | __wake_up_common(q, mode, 1, 0, key); | 4605 | __wake_up_common(q, mode, 1, 0, key); |
4093 | } | 4606 | } |
4607 | EXPORT_SYMBOL_GPL(__wake_up_locked_key); | ||
4094 | 4608 | ||
4095 | /** | 4609 | /** |
4096 | * __wake_up_sync_key - wake up threads blocked on a waitqueue. | 4610 | * __wake_up_sync_key - wake up threads blocked on a waitqueue. |
@@ -4282,7 +4796,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible); | |||
4282 | * This waits for either a completion of a specific task to be signaled or for a | 4796 | * This waits for either a completion of a specific task to be signaled or for a |
4283 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. | 4797 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. |
4284 | */ | 4798 | */ |
4285 | unsigned long __sched | 4799 | long __sched |
4286 | wait_for_completion_interruptible_timeout(struct completion *x, | 4800 | wait_for_completion_interruptible_timeout(struct completion *x, |
4287 | unsigned long timeout) | 4801 | unsigned long timeout) |
4288 | { | 4802 | { |
@@ -4315,7 +4829,7 @@ EXPORT_SYMBOL(wait_for_completion_killable); | |||
4315 | * signaled or for a specified timeout to expire. It can be | 4829 | * signaled or for a specified timeout to expire. It can be |
4316 | * interrupted by a kill signal. The timeout is in jiffies. | 4830 | * interrupted by a kill signal. The timeout is in jiffies. |
4317 | */ | 4831 | */ |
4318 | unsigned long __sched | 4832 | long __sched |
4319 | wait_for_completion_killable_timeout(struct completion *x, | 4833 | wait_for_completion_killable_timeout(struct completion *x, |
4320 | unsigned long timeout) | 4834 | unsigned long timeout) |
4321 | { | 4835 | { |
@@ -4431,18 +4945,18 @@ EXPORT_SYMBOL(sleep_on_timeout); | |||
4431 | */ | 4945 | */ |
4432 | void rt_mutex_setprio(struct task_struct *p, int prio) | 4946 | void rt_mutex_setprio(struct task_struct *p, int prio) |
4433 | { | 4947 | { |
4434 | unsigned long flags; | ||
4435 | int oldprio, on_rq, running; | 4948 | int oldprio, on_rq, running; |
4436 | struct rq *rq; | 4949 | struct rq *rq; |
4437 | const struct sched_class *prev_class; | 4950 | const struct sched_class *prev_class; |
4438 | 4951 | ||
4439 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 4952 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
4440 | 4953 | ||
4441 | rq = task_rq_lock(p, &flags); | 4954 | rq = __task_rq_lock(p); |
4442 | 4955 | ||
4956 | trace_sched_pi_setprio(p, prio); | ||
4443 | oldprio = p->prio; | 4957 | oldprio = p->prio; |
4444 | prev_class = p->sched_class; | 4958 | prev_class = p->sched_class; |
4445 | on_rq = p->se.on_rq; | 4959 | on_rq = p->on_rq; |
4446 | running = task_current(rq, p); | 4960 | running = task_current(rq, p); |
4447 | if (on_rq) | 4961 | if (on_rq) |
4448 | dequeue_task(rq, p, 0); | 4962 | dequeue_task(rq, p, 0); |
@@ -4458,12 +4972,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4458 | 4972 | ||
4459 | if (running) | 4973 | if (running) |
4460 | p->sched_class->set_curr_task(rq); | 4974 | p->sched_class->set_curr_task(rq); |
4461 | if (on_rq) { | 4975 | if (on_rq) |
4462 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); | 4976 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); |
4463 | 4977 | ||
4464 | check_class_changed(rq, p, prev_class, oldprio, running); | 4978 | check_class_changed(rq, p, prev_class, oldprio); |
4465 | } | 4979 | __task_rq_unlock(rq); |
4466 | task_rq_unlock(rq, &flags); | ||
4467 | } | 4980 | } |
4468 | 4981 | ||
4469 | #endif | 4982 | #endif |
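Editor's note: rt_mutex_setprio() is the kernel half of priority inheritance. From userspace the same boosting is reached through PI futexes, e.g. a PTHREAD_PRIO_INHERIT mutex; a small sketch (not part of the patch):

    #include <pthread.h>

    /* A priority-inheritance mutex: when a higher-priority thread blocks on
     * it, the kernel boosts the current owner along the rt_mutex_setprio()
     * path shown above. */
    static pthread_mutex_t pi_lock;

    static int init_pi_lock(void)
    {
            pthread_mutexattr_t attr;
            int ret = pthread_mutexattr_init(&attr);

            if (ret)
                    return ret;
            ret = pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
            if (!ret)
                    ret = pthread_mutex_init(&pi_lock, &attr);
            pthread_mutexattr_destroy(&attr);
            return ret;
    }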
@@ -4491,7 +5004,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4491 | p->static_prio = NICE_TO_PRIO(nice); | 5004 | p->static_prio = NICE_TO_PRIO(nice); |
4492 | goto out_unlock; | 5005 | goto out_unlock; |
4493 | } | 5006 | } |
4494 | on_rq = p->se.on_rq; | 5007 | on_rq = p->on_rq; |
4495 | if (on_rq) | 5008 | if (on_rq) |
4496 | dequeue_task(rq, p, 0); | 5009 | dequeue_task(rq, p, 0); |
4497 | 5010 | ||
@@ -4511,7 +5024,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4511 | resched_task(rq->curr); | 5024 | resched_task(rq->curr); |
4512 | } | 5025 | } |
4513 | out_unlock: | 5026 | out_unlock: |
4514 | task_rq_unlock(rq, &flags); | 5027 | task_rq_unlock(rq, p, &flags); |
4515 | } | 5028 | } |
4516 | EXPORT_SYMBOL(set_user_nice); | 5029 | EXPORT_SYMBOL(set_user_nice); |
4517 | 5030 | ||
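Editor's note: for reference, the userspace path into set_user_nice() is the setpriority() syscall; a minimal caller (not part of the patch):

    #include <sys/resource.h>
    #include <stdio.h>

    int main(void)
    {
            /* Lower the calling process's priority to nice 10. */
            if (setpriority(PRIO_PROCESS, 0, 10) == -1) {
                    perror("setpriority");
                    return 1;
            }
            printf("nice value set to 10\n");
            return 0;
    }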
@@ -4625,8 +5138,6 @@ static struct task_struct *find_process_by_pid(pid_t pid) | |||
4625 | static void | 5138 | static void |
4626 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | 5139 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) |
4627 | { | 5140 | { |
4628 | BUG_ON(p->se.on_rq); | ||
4629 | |||
4630 | p->policy = policy; | 5141 | p->policy = policy; |
4631 | p->rt_priority = prio; | 5142 | p->rt_priority = prio; |
4632 | p->normal_prio = normal_prio(p); | 5143 | p->normal_prio = normal_prio(p); |
@@ -4651,14 +5162,17 @@ static bool check_same_owner(struct task_struct *p) | |||
4651 | 5162 | ||
4652 | rcu_read_lock(); | 5163 | rcu_read_lock(); |
4653 | pcred = __task_cred(p); | 5164 | pcred = __task_cred(p); |
4654 | match = (cred->euid == pcred->euid || | 5165 | if (cred->user->user_ns == pcred->user->user_ns) |
4655 | cred->euid == pcred->uid); | 5166 | match = (cred->euid == pcred->euid || |
5167 | cred->euid == pcred->uid); | ||
5168 | else | ||
5169 | match = false; | ||
4656 | rcu_read_unlock(); | 5170 | rcu_read_unlock(); |
4657 | return match; | 5171 | return match; |
4658 | } | 5172 | } |
4659 | 5173 | ||
4660 | static int __sched_setscheduler(struct task_struct *p, int policy, | 5174 | static int __sched_setscheduler(struct task_struct *p, int policy, |
4661 | struct sched_param *param, bool user) | 5175 | const struct sched_param *param, bool user) |
4662 | { | 5176 | { |
4663 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 5177 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
4664 | unsigned long flags; | 5178 | unsigned long flags; |
@@ -4714,12 +5228,15 @@ recheck: | |||
4714 | param->sched_priority > rlim_rtprio) | 5228 | param->sched_priority > rlim_rtprio) |
4715 | return -EPERM; | 5229 | return -EPERM; |
4716 | } | 5230 | } |
5231 | |||
4717 | /* | 5232 | /* |
4718 | * Like positive nice levels, dont allow tasks to | 5233 | * Treat SCHED_IDLE as nice 20. Only allow a switch to |
4719 | * move out of SCHED_IDLE either: | 5234 | * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. |
4720 | */ | 5235 | */ |
4721 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) | 5236 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { |
4722 | return -EPERM; | 5237 | if (!can_nice(p, TASK_NICE(p))) |
5238 | return -EPERM; | ||
5239 | } | ||
4723 | 5240 | ||
4724 | /* can't change other user's priorities */ | 5241 | /* can't change other user's priorities */ |
4725 | if (!check_same_owner(p)) | 5242 | if (!check_same_owner(p)) |
@@ -4731,7 +5248,7 @@ recheck: | |||
4731 | } | 5248 | } |
4732 | 5249 | ||
4733 | if (user) { | 5250 | if (user) { |
4734 | retval = security_task_setscheduler(p, policy, param); | 5251 | retval = security_task_setscheduler(p); |
4735 | if (retval) | 5252 | if (retval) |
4736 | return retval; | 5253 | return retval; |
4737 | } | 5254 | } |
@@ -4745,13 +5262,30 @@ recheck: | |||
4745 | /* | 5262 | /* |
4746 | * make sure no PI-waiters arrive (or leave) while we are | 5263 | * make sure no PI-waiters arrive (or leave) while we are |
4747 | * changing the priority of the task: | 5264 | * changing the priority of the task: |
5265 | * | ||
5266 | * To be able to change p->policy safely, the appropriate | ||
5267 | * runqueue lock must be held. | ||
4748 | */ | 5268 | */ |
4749 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 5269 | rq = task_rq_lock(p, &flags); |
5270 | |||
4750 | /* | 5271 | /* |
4751 | * To be able to change p->policy safely, the apropriate | 5272 | * Changing the policy of the stop threads is a very bad idea |
4752 | * runqueue lock must be held. | ||
4753 | */ | 5273 | */ |
4754 | rq = __task_rq_lock(p); | 5274 | if (p == rq->stop) { |
5275 | task_rq_unlock(rq, p, &flags); | ||
5276 | return -EINVAL; | ||
5277 | } | ||
5278 | |||
5279 | /* | ||
5280 | * If not changing anything there's no need to proceed further: | ||
5281 | */ | ||
5282 | if (unlikely(policy == p->policy && (!rt_policy(policy) || | ||
5283 | param->sched_priority == p->rt_priority))) { | ||
5284 | |||
5285 | __task_rq_unlock(rq); | ||
5286 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
5287 | return 0; | ||
5288 | } | ||
4755 | 5289 | ||
4756 | #ifdef CONFIG_RT_GROUP_SCHED | 5290 | #ifdef CONFIG_RT_GROUP_SCHED |
4757 | if (user) { | 5291 | if (user) { |
@@ -4760,9 +5294,9 @@ recheck: | |||
4760 | * assigned. | 5294 | * assigned. |
4761 | */ | 5295 | */ |
4762 | if (rt_bandwidth_enabled() && rt_policy(policy) && | 5296 | if (rt_bandwidth_enabled() && rt_policy(policy) && |
4763 | task_group(p)->rt_bandwidth.rt_runtime == 0) { | 5297 | task_group(p)->rt_bandwidth.rt_runtime == 0 && |
4764 | __task_rq_unlock(rq); | 5298 | !task_group_is_autogroup(task_group(p))) { |
4765 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 5299 | task_rq_unlock(rq, p, &flags); |
4766 | return -EPERM; | 5300 | return -EPERM; |
4767 | } | 5301 | } |
4768 | } | 5302 | } |
@@ -4771,11 +5305,10 @@ recheck: | |||
4771 | /* recheck policy now with rq lock held */ | 5305 | /* recheck policy now with rq lock held */ |
4772 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 5306 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
4773 | policy = oldpolicy = -1; | 5307 | policy = oldpolicy = -1; |
4774 | __task_rq_unlock(rq); | 5308 | task_rq_unlock(rq, p, &flags); |
4775 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
4776 | goto recheck; | 5309 | goto recheck; |
4777 | } | 5310 | } |
4778 | on_rq = p->se.on_rq; | 5311 | on_rq = p->on_rq; |
4779 | running = task_current(rq, p); | 5312 | running = task_current(rq, p); |
4780 | if (on_rq) | 5313 | if (on_rq) |
4781 | deactivate_task(rq, p, 0); | 5314 | deactivate_task(rq, p, 0); |
@@ -4799,13 +5332,11 @@ recheck: | |||
4799 | 5332 | ||
4800 | if (running) | 5333 | if (running) |
4801 | p->sched_class->set_curr_task(rq); | 5334 | p->sched_class->set_curr_task(rq); |
4802 | if (on_rq) { | 5335 | if (on_rq) |
4803 | activate_task(rq, p, 0); | 5336 | activate_task(rq, p, 0); |
4804 | 5337 | ||
4805 | check_class_changed(rq, p, prev_class, oldprio, running); | 5338 | check_class_changed(rq, p, prev_class, oldprio); |
4806 | } | 5339 | task_rq_unlock(rq, p, &flags); |
4807 | __task_rq_unlock(rq); | ||
4808 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
4809 | 5340 | ||
4810 | rt_mutex_adjust_pi(p); | 5341 | rt_mutex_adjust_pi(p); |
4811 | 5342 | ||
@@ -4821,7 +5352,7 @@ recheck: | |||
4821 | * NOTE that the task may be already dead. | 5352 | * NOTE that the task may be already dead. |
4822 | */ | 5353 | */ |
4823 | int sched_setscheduler(struct task_struct *p, int policy, | 5354 | int sched_setscheduler(struct task_struct *p, int policy, |
4824 | struct sched_param *param) | 5355 | const struct sched_param *param) |
4825 | { | 5356 | { |
4826 | return __sched_setscheduler(p, policy, param, true); | 5357 | return __sched_setscheduler(p, policy, param, true); |
4827 | } | 5358 | } |
@@ -4839,7 +5370,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); | |||
4839 | * but our caller might not have that capability. | 5370 | * but our caller might not have that capability. |
4840 | */ | 5371 | */ |
4841 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, | 5372 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, |
4842 | struct sched_param *param) | 5373 | const struct sched_param *param) |
4843 | { | 5374 | { |
4844 | return __sched_setscheduler(p, policy, param, false); | 5375 | return __sched_setscheduler(p, policy, param, false); |
4845 | } | 5376 | } |
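Editor's note: the userspace entry point into __sched_setscheduler() is sched_setscheduler(2); a minimal caller requesting SCHED_FIFO (not part of the patch; normally needs CAP_SYS_NICE or a suitable RLIMIT_RTPRIO):

    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            struct sched_param sp = { .sched_priority = 10 };

            if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
                    perror("sched_setscheduler");   /* typically EPERM */
                    return 1;
            }
            printf("now SCHED_FIFO at priority %d\n", sp.sched_priority);
            return 0;
    }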
@@ -4986,16 +5517,16 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
4986 | goto out_free_cpus_allowed; | 5517 | goto out_free_cpus_allowed; |
4987 | } | 5518 | } |
4988 | retval = -EPERM; | 5519 | retval = -EPERM; |
4989 | if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) | 5520 | if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) |
4990 | goto out_unlock; | 5521 | goto out_unlock; |
4991 | 5522 | ||
4992 | retval = security_task_setscheduler(p, 0, NULL); | 5523 | retval = security_task_setscheduler(p); |
4993 | if (retval) | 5524 | if (retval) |
4994 | goto out_unlock; | 5525 | goto out_unlock; |
4995 | 5526 | ||
4996 | cpuset_cpus_allowed(p, cpus_allowed); | 5527 | cpuset_cpus_allowed(p, cpus_allowed); |
4997 | cpumask_and(new_mask, in_mask, cpus_allowed); | 5528 | cpumask_and(new_mask, in_mask, cpus_allowed); |
4998 | again: | 5529 | again: |
4999 | retval = set_cpus_allowed_ptr(p, new_mask); | 5530 | retval = set_cpus_allowed_ptr(p, new_mask); |
5000 | 5531 | ||
5001 | if (!retval) { | 5532 | if (!retval) { |
@@ -5057,7 +5588,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
5057 | { | 5588 | { |
5058 | struct task_struct *p; | 5589 | struct task_struct *p; |
5059 | unsigned long flags; | 5590 | unsigned long flags; |
5060 | struct rq *rq; | ||
5061 | int retval; | 5591 | int retval; |
5062 | 5592 | ||
5063 | get_online_cpus(); | 5593 | get_online_cpus(); |
@@ -5072,9 +5602,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
5072 | if (retval) | 5602 | if (retval) |
5073 | goto out_unlock; | 5603 | goto out_unlock; |
5074 | 5604 | ||
5075 | rq = task_rq_lock(p, &flags); | 5605 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
5076 | cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); | 5606 | cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); |
5077 | task_rq_unlock(rq, &flags); | 5607 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
5078 | 5608 | ||
5079 | out_unlock: | 5609 | out_unlock: |
5080 | rcu_read_unlock(); | 5610 | rcu_read_unlock(); |
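Editor's note: the affinity paths above are driven from userspace by sched_setaffinity(2)/sched_getaffinity(2); a minimal example pinning the caller to CPU 0 and reading the mask back (not part of the patch):

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            cpu_set_t set;

            CPU_ZERO(&set);
            CPU_SET(0, &set);
            if (sched_setaffinity(0, sizeof(set), &set) == -1)
                    perror("sched_setaffinity");

            if (sched_getaffinity(0, sizeof(set), &set) == 0)
                    printf("allowed on %d CPU(s)\n", CPU_COUNT(&set));
            return 0;
    }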
@@ -5221,6 +5751,67 @@ void __sched yield(void) | |||
5221 | } | 5751 | } |
5222 | EXPORT_SYMBOL(yield); | 5752 | EXPORT_SYMBOL(yield); |
5223 | 5753 | ||
5754 | /** | ||
5755 | * yield_to - yield the current processor to another thread in | ||
5756 | * your thread group, or accelerate that thread toward the | ||
5757 | * processor it's on. | ||
5758 | * @p: target task | ||
5759 | * @preempt: whether task preemption is allowed or not | ||
5760 | * | ||
5761 | * It's the caller's job to ensure that the target task struct | ||
5762 | * can't go away on us before we can do any checks. | ||
5763 | * | ||
5764 | * Returns true if we indeed boosted the target task. | ||
5765 | */ | ||
5766 | bool __sched yield_to(struct task_struct *p, bool preempt) | ||
5767 | { | ||
5768 | struct task_struct *curr = current; | ||
5769 | struct rq *rq, *p_rq; | ||
5770 | unsigned long flags; | ||
5771 | bool yielded = 0; | ||
5772 | |||
5773 | local_irq_save(flags); | ||
5774 | rq = this_rq(); | ||
5775 | |||
5776 | again: | ||
5777 | p_rq = task_rq(p); | ||
5778 | double_rq_lock(rq, p_rq); | ||
5779 | while (task_rq(p) != p_rq) { | ||
5780 | double_rq_unlock(rq, p_rq); | ||
5781 | goto again; | ||
5782 | } | ||
5783 | |||
5784 | if (!curr->sched_class->yield_to_task) | ||
5785 | goto out; | ||
5786 | |||
5787 | if (curr->sched_class != p->sched_class) | ||
5788 | goto out; | ||
5789 | |||
5790 | if (task_running(p_rq, p) || p->state) | ||
5791 | goto out; | ||
5792 | |||
5793 | yielded = curr->sched_class->yield_to_task(rq, p, preempt); | ||
5794 | if (yielded) { | ||
5795 | schedstat_inc(rq, yld_count); | ||
5796 | /* | ||
5797 | * Make p's CPU reschedule; pick_next_entity takes care of | ||
5798 | * fairness. | ||
5799 | */ | ||
5800 | if (preempt && rq != p_rq) | ||
5801 | resched_task(p_rq->curr); | ||
5802 | } | ||
5803 | |||
5804 | out: | ||
5805 | double_rq_unlock(rq, p_rq); | ||
5806 | local_irq_restore(flags); | ||
5807 | |||
5808 | if (yielded) | ||
5809 | schedule(); | ||
5810 | |||
5811 | return yielded; | ||
5812 | } | ||
5813 | EXPORT_SYMBOL_GPL(yield_to); | ||
5814 | |||
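Editor's note: yield_to() is aimed at directed-yield users such as virtualization, where the host wants to accelerate the thread believed to hold a contended guest lock. A hedged sketch of such a caller; the demo_vcpu structure and its fields are invented for illustration and are not part of this patch:

    static void demo_boost_lock_holder(struct demo_vcpu *me, struct demo_vcpu *target)
    {
            /* Donate our timeslice to the thread we think holds the lock. */
            if (yield_to(target->task, false))
                    me->directed_yield_ok++;
    }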
5224 | /* | 5815 | /* |
5225 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so | 5816 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so |
5226 | * that process accounting knows that this is a task in IO wait state. | 5817 | * that process accounting knows that this is a task in IO wait state. |
@@ -5231,6 +5822,7 @@ void __sched io_schedule(void) | |||
5231 | 5822 | ||
5232 | delayacct_blkio_start(); | 5823 | delayacct_blkio_start(); |
5233 | atomic_inc(&rq->nr_iowait); | 5824 | atomic_inc(&rq->nr_iowait); |
5825 | blk_flush_plug(current); | ||
5234 | current->in_iowait = 1; | 5826 | current->in_iowait = 1; |
5235 | schedule(); | 5827 | schedule(); |
5236 | current->in_iowait = 0; | 5828 | current->in_iowait = 0; |
@@ -5246,6 +5838,7 @@ long __sched io_schedule_timeout(long timeout) | |||
5246 | 5838 | ||
5247 | delayacct_blkio_start(); | 5839 | delayacct_blkio_start(); |
5248 | atomic_inc(&rq->nr_iowait); | 5840 | atomic_inc(&rq->nr_iowait); |
5841 | blk_flush_plug(current); | ||
5249 | current->in_iowait = 1; | 5842 | current->in_iowait = 1; |
5250 | ret = schedule_timeout(timeout); | 5843 | ret = schedule_timeout(timeout); |
5251 | current->in_iowait = 0; | 5844 | current->in_iowait = 0; |
@@ -5336,7 +5929,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, | |||
5336 | 5929 | ||
5337 | rq = task_rq_lock(p, &flags); | 5930 | rq = task_rq_lock(p, &flags); |
5338 | time_slice = p->sched_class->get_rr_interval(rq, p); | 5931 | time_slice = p->sched_class->get_rr_interval(rq, p); |
5339 | task_rq_unlock(rq, &flags); | 5932 | task_rq_unlock(rq, p, &flags); |
5340 | 5933 | ||
5341 | rcu_read_unlock(); | 5934 | rcu_read_unlock(); |
5342 | jiffies_to_timespec(time_slice, &t); | 5935 | jiffies_to_timespec(time_slice, &t); |
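Editor's note: the syscall above is reachable from userspace as sched_rr_get_interval(2); a minimal caller printing the round-robin timeslice (not part of the patch):

    #include <sched.h>
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            struct timespec ts;

            if (sched_rr_get_interval(0, &ts) == 0)
                    printf("RR timeslice: %ld.%09ld s\n",
                           (long)ts.tv_sec, ts.tv_nsec);
            return 0;
    }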
@@ -5356,7 +5949,7 @@ void sched_show_task(struct task_struct *p) | |||
5356 | unsigned state; | 5949 | unsigned state; |
5357 | 5950 | ||
5358 | state = p->state ? __ffs(p->state) + 1 : 0; | 5951 | state = p->state ? __ffs(p->state) + 1 : 0; |
5359 | printk(KERN_INFO "%-13.13s %c", p->comm, | 5952 | printk(KERN_INFO "%-15.15s %c", p->comm, |
5360 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); | 5953 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); |
5361 | #if BITS_PER_LONG == 32 | 5954 | #if BITS_PER_LONG == 32 |
5362 | if (state == TASK_RUNNING) | 5955 | if (state == TASK_RUNNING) |
@@ -5394,7 +5987,7 @@ void show_state_filter(unsigned long state_filter) | |||
5394 | do_each_thread(g, p) { | 5987 | do_each_thread(g, p) { |
5395 | /* | 5988 | /* |
5396 | * reset the NMI-timeout, listing all files on a slow | 5989 | * reset the NMI-timeout, listing all files on a slow |
5397 | * console might take alot of time: | 5990 | * console might take a lot of time: |
5398 | */ | 5991 | */ |
5399 | touch_nmi_watchdog(); | 5992 | touch_nmi_watchdog(); |
5400 | if (!state_filter || (p->state & state_filter)) | 5993 | if (!state_filter || (p->state & state_filter)) |
@@ -5438,26 +6031,35 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
5438 | idle->state = TASK_RUNNING; | 6031 | idle->state = TASK_RUNNING; |
5439 | idle->se.exec_start = sched_clock(); | 6032 | idle->se.exec_start = sched_clock(); |
5440 | 6033 | ||
5441 | cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); | 6034 | do_set_cpus_allowed(idle, cpumask_of(cpu)); |
6035 | /* | ||
6036 | * We're having a chicken and egg problem, even though we are | ||
6037 | * holding rq->lock, the cpu isn't yet set to this cpu so the | ||
6038 | * lockdep check in task_group() will fail. | ||
6039 | * | ||
6040 | * Similar case to sched_fork(). / Alternatively we could | ||
6041 | * use task_rq_lock() here and obtain the other rq->lock. | ||
6042 | * | ||
6043 | * Silence PROVE_RCU | ||
6044 | */ | ||
6045 | rcu_read_lock(); | ||
5442 | __set_task_cpu(idle, cpu); | 6046 | __set_task_cpu(idle, cpu); |
6047 | rcu_read_unlock(); | ||
5443 | 6048 | ||
5444 | rq->curr = rq->idle = idle; | 6049 | rq->curr = rq->idle = idle; |
5445 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 6050 | #if defined(CONFIG_SMP) |
5446 | idle->oncpu = 1; | 6051 | idle->on_cpu = 1; |
5447 | #endif | 6052 | #endif |
5448 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 6053 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
5449 | 6054 | ||
5450 | /* Set the preempt count _outside_ the spinlocks! */ | 6055 | /* Set the preempt count _outside_ the spinlocks! */ |
5451 | #if defined(CONFIG_PREEMPT) | ||
5452 | task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); | ||
5453 | #else | ||
5454 | task_thread_info(idle)->preempt_count = 0; | 6056 | task_thread_info(idle)->preempt_count = 0; |
5455 | #endif | 6057 | |
5456 | /* | 6058 | /* |
5457 | * The idle tasks have their own, simple scheduling class: | 6059 | * The idle tasks have their own, simple scheduling class: |
5458 | */ | 6060 | */ |
5459 | idle->sched_class = &idle_sched_class; | 6061 | idle->sched_class = &idle_sched_class; |
5460 | ftrace_graph_init_task(idle); | 6062 | ftrace_graph_init_idle_task(idle, cpu); |
5461 | } | 6063 | } |
5462 | 6064 | ||
5463 | /* | 6065 | /* |
@@ -5508,7 +6110,6 @@ static void update_sysctl(void) | |||
5508 | SET_SYSCTL(sched_min_granularity); | 6110 | SET_SYSCTL(sched_min_granularity); |
5509 | SET_SYSCTL(sched_latency); | 6111 | SET_SYSCTL(sched_latency); |
5510 | SET_SYSCTL(sched_wakeup_granularity); | 6112 | SET_SYSCTL(sched_wakeup_granularity); |
5511 | SET_SYSCTL(sched_shares_ratelimit); | ||
5512 | #undef SET_SYSCTL | 6113 | #undef SET_SYSCTL |
5513 | } | 6114 | } |
5514 | 6115 | ||
@@ -5518,6 +6119,16 @@ static inline void sched_init_granularity(void) | |||
5518 | } | 6119 | } |
5519 | 6120 | ||
5520 | #ifdef CONFIG_SMP | 6121 | #ifdef CONFIG_SMP |
6122 | void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | ||
6123 | { | ||
6124 | if (p->sched_class && p->sched_class->set_cpus_allowed) | ||
6125 | p->sched_class->set_cpus_allowed(p, new_mask); | ||
6126 | else { | ||
6127 | cpumask_copy(&p->cpus_allowed, new_mask); | ||
6128 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); | ||
6129 | } | ||
6130 | } | ||
6131 | |||
5521 | /* | 6132 | /* |
5522 | * This is how migration works: | 6133 | * This is how migration works: |
5523 | * | 6134 | * |
@@ -5548,52 +6159,38 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
5548 | unsigned int dest_cpu; | 6159 | unsigned int dest_cpu; |
5549 | int ret = 0; | 6160 | int ret = 0; |
5550 | 6161 | ||
5551 | /* | ||
5552 | * Serialize against TASK_WAKING so that ttwu() and wunt() can | ||
5553 | * drop the rq->lock and still rely on ->cpus_allowed. | ||
5554 | */ | ||
5555 | again: | ||
5556 | while (task_is_waking(p)) | ||
5557 | cpu_relax(); | ||
5558 | rq = task_rq_lock(p, &flags); | 6162 | rq = task_rq_lock(p, &flags); |
5559 | if (task_is_waking(p)) { | 6163 | |
5560 | task_rq_unlock(rq, &flags); | 6164 | if (cpumask_equal(&p->cpus_allowed, new_mask)) |
5561 | goto again; | 6165 | goto out; |
5562 | } | ||
5563 | 6166 | ||
5564 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { | 6167 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { |
5565 | ret = -EINVAL; | 6168 | ret = -EINVAL; |
5566 | goto out; | 6169 | goto out; |
5567 | } | 6170 | } |
5568 | 6171 | ||
5569 | if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && | 6172 | if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { |
5570 | !cpumask_equal(&p->cpus_allowed, new_mask))) { | ||
5571 | ret = -EINVAL; | 6173 | ret = -EINVAL; |
5572 | goto out; | 6174 | goto out; |
5573 | } | 6175 | } |
5574 | 6176 | ||
5575 | if (p->sched_class->set_cpus_allowed) | 6177 | do_set_cpus_allowed(p, new_mask); |
5576 | p->sched_class->set_cpus_allowed(p, new_mask); | ||
5577 | else { | ||
5578 | cpumask_copy(&p->cpus_allowed, new_mask); | ||
5579 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); | ||
5580 | } | ||
5581 | 6178 | ||
5582 | /* Can the task run on the task's current CPU? If so, we're done */ | 6179 | /* Can the task run on the task's current CPU? If so, we're done */ |
5583 | if (cpumask_test_cpu(task_cpu(p), new_mask)) | 6180 | if (cpumask_test_cpu(task_cpu(p), new_mask)) |
5584 | goto out; | 6181 | goto out; |
5585 | 6182 | ||
5586 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); | 6183 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); |
5587 | if (migrate_task(p, dest_cpu)) { | 6184 | if (p->on_rq) { |
5588 | struct migration_arg arg = { p, dest_cpu }; | 6185 | struct migration_arg arg = { p, dest_cpu }; |
5589 | /* Need help from migration thread: drop lock and wait. */ | 6186 | /* Need help from migration thread: drop lock and wait. */ |
5590 | task_rq_unlock(rq, &flags); | 6187 | task_rq_unlock(rq, p, &flags); |
5591 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); | 6188 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); |
5592 | tlb_migrate_finish(p->mm); | 6189 | tlb_migrate_finish(p->mm); |
5593 | return 0; | 6190 | return 0; |
5594 | } | 6191 | } |
5595 | out: | 6192 | out: |
5596 | task_rq_unlock(rq, &flags); | 6193 | task_rq_unlock(rq, p, &flags); |
5597 | 6194 | ||
5598 | return ret; | 6195 | return ret; |
5599 | } | 6196 | } |
@@ -5621,6 +6218,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5621 | rq_src = cpu_rq(src_cpu); | 6218 | rq_src = cpu_rq(src_cpu); |
5622 | rq_dest = cpu_rq(dest_cpu); | 6219 | rq_dest = cpu_rq(dest_cpu); |
5623 | 6220 | ||
6221 | raw_spin_lock(&p->pi_lock); | ||
5624 | double_rq_lock(rq_src, rq_dest); | 6222 | double_rq_lock(rq_src, rq_dest); |
5625 | /* Already moved. */ | 6223 | /* Already moved. */ |
5626 | if (task_cpu(p) != src_cpu) | 6224 | if (task_cpu(p) != src_cpu) |
@@ -5633,7 +6231,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5633 | * If we're not on a rq, the next wake-up will ensure we're | 6231 | * If we're not on a rq, the next wake-up will ensure we're |
5634 | * placed properly. | 6232 | * placed properly. |
5635 | */ | 6233 | */ |
5636 | if (p->se.on_rq) { | 6234 | if (p->on_rq) { |
5637 | deactivate_task(rq_src, p, 0); | 6235 | deactivate_task(rq_src, p, 0); |
5638 | set_task_cpu(p, dest_cpu); | 6236 | set_task_cpu(p, dest_cpu); |
5639 | activate_task(rq_dest, p, 0); | 6237 | activate_task(rq_dest, p, 0); |
@@ -5643,6 +6241,7 @@ done: | |||
5643 | ret = 1; | 6241 | ret = 1; |
5644 | fail: | 6242 | fail: |
5645 | double_rq_unlock(rq_src, rq_dest); | 6243 | double_rq_unlock(rq_src, rq_dest); |
6244 | raw_spin_unlock(&p->pi_lock); | ||
5646 | return ret; | 6245 | return ret; |
5647 | } | 6246 | } |
5648 | 6247 | ||
@@ -5666,29 +6265,20 @@ static int migration_cpu_stop(void *data) | |||
5666 | } | 6265 | } |
5667 | 6266 | ||
5668 | #ifdef CONFIG_HOTPLUG_CPU | 6267 | #ifdef CONFIG_HOTPLUG_CPU |
6268 | |||
5669 | /* | 6269 | /* |
5670 | * Figure out where task on dead CPU should go, use force if necessary. | 6270 | * Ensures that the idle task is using init_mm right before its cpu goes |
6271 | * offline. | ||
5671 | */ | 6272 | */ |
5672 | void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | 6273 | void idle_task_exit(void) |
5673 | { | 6274 | { |
5674 | struct rq *rq = cpu_rq(dead_cpu); | 6275 | struct mm_struct *mm = current->active_mm; |
5675 | int needs_cpu, uninitialized_var(dest_cpu); | ||
5676 | unsigned long flags; | ||
5677 | 6276 | ||
5678 | local_irq_save(flags); | 6277 | BUG_ON(cpu_online(smp_processor_id())); |
5679 | 6278 | ||
5680 | raw_spin_lock(&rq->lock); | 6279 | if (mm != &init_mm) |
5681 | needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); | 6280 | switch_mm(mm, &init_mm, current); |
5682 | if (needs_cpu) | 6281 | mmdrop(mm); |
5683 | dest_cpu = select_fallback_rq(dead_cpu, p); | ||
5684 | raw_spin_unlock(&rq->lock); | ||
5685 | /* | ||
5686 | * It can only fail if we race with set_cpus_allowed(), | ||
5687 | * in the racer should migrate the task anyway. | ||
5688 | */ | ||
5689 | if (needs_cpu) | ||
5690 | __migrate_task(p, dead_cpu, dest_cpu); | ||
5691 | local_irq_restore(flags); | ||
5692 | } | 6282 | } |
5693 | 6283 | ||
5694 | /* | 6284 | /* |
@@ -5701,128 +6291,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | |||
5701 | static void migrate_nr_uninterruptible(struct rq *rq_src) | 6291 | static void migrate_nr_uninterruptible(struct rq *rq_src) |
5702 | { | 6292 | { |
5703 | struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); | 6293 | struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); |
5704 | unsigned long flags; | ||
5705 | 6294 | ||
5706 | local_irq_save(flags); | ||
5707 | double_rq_lock(rq_src, rq_dest); | ||
5708 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; | 6295 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; |
5709 | rq_src->nr_uninterruptible = 0; | 6296 | rq_src->nr_uninterruptible = 0; |
5710 | double_rq_unlock(rq_src, rq_dest); | ||
5711 | local_irq_restore(flags); | ||
5712 | } | ||
5713 | |||
5714 | /* Run through task list and migrate tasks from the dead cpu. */ | ||
5715 | static void migrate_live_tasks(int src_cpu) | ||
5716 | { | ||
5717 | struct task_struct *p, *t; | ||
5718 | |||
5719 | read_lock(&tasklist_lock); | ||
5720 | |||
5721 | do_each_thread(t, p) { | ||
5722 | if (p == current) | ||
5723 | continue; | ||
5724 | |||
5725 | if (task_cpu(p) == src_cpu) | ||
5726 | move_task_off_dead_cpu(src_cpu, p); | ||
5727 | } while_each_thread(t, p); | ||
5728 | |||
5729 | read_unlock(&tasklist_lock); | ||
5730 | } | 6297 | } |
5731 | 6298 | ||
5732 | /* | 6299 | /* |
5733 | * Schedules idle task to be the next runnable task on current CPU. | 6300 | * remove the tasks which were accounted by rq from calc_load_tasks. |
5734 | * It does so by boosting its priority to highest possible. | ||
5735 | * Used by CPU offline code. | ||
5736 | */ | 6301 | */ |
5737 | void sched_idle_next(void) | 6302 | static void calc_global_load_remove(struct rq *rq) |
5738 | { | 6303 | { |
5739 | int this_cpu = smp_processor_id(); | 6304 | atomic_long_sub(rq->calc_load_active, &calc_load_tasks); |
5740 | struct rq *rq = cpu_rq(this_cpu); | 6305 | rq->calc_load_active = 0; |
5741 | struct task_struct *p = rq->idle; | ||
5742 | unsigned long flags; | ||
5743 | |||
5744 | /* cpu has to be offline */ | ||
5745 | BUG_ON(cpu_online(this_cpu)); | ||
5746 | |||
5747 | /* | ||
5748 | * Strictly not necessary since rest of the CPUs are stopped by now | ||
5749 | * and interrupts disabled on the current cpu. | ||
5750 | */ | ||
5751 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
5752 | |||
5753 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); | ||
5754 | |||
5755 | activate_task(rq, p, 0); | ||
5756 | |||
5757 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
5758 | } | 6306 | } |
5759 | 6307 | ||
5760 | /* | 6308 | /* |
5761 | * Ensures that the idle task is using init_mm right before its cpu goes | 6309 | * Migrate all tasks from the rq, sleeping tasks will be migrated by |
5762 | * offline. | 6310 | * try_to_wake_up()->select_task_rq(). |
6311 | * | ||
6312 | * Called with rq->lock held even though we're in stop_machine() and | ||
6313 | * there's no concurrency possible, we hold the required locks anyway | ||
6314 | * because of lock validation efforts. | ||
5763 | */ | 6315 | */ |
5764 | void idle_task_exit(void) | 6316 | static void migrate_tasks(unsigned int dead_cpu) |
5765 | { | ||
5766 | struct mm_struct *mm = current->active_mm; | ||
5767 | |||
5768 | BUG_ON(cpu_online(smp_processor_id())); | ||
5769 | |||
5770 | if (mm != &init_mm) | ||
5771 | switch_mm(mm, &init_mm, current); | ||
5772 | mmdrop(mm); | ||
5773 | } | ||
5774 | |||
5775 | /* called under rq->lock with disabled interrupts */ | ||
5776 | static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) | ||
5777 | { | 6317 | { |
5778 | struct rq *rq = cpu_rq(dead_cpu); | 6318 | struct rq *rq = cpu_rq(dead_cpu); |
5779 | 6319 | struct task_struct *next, *stop = rq->stop; | |
5780 | /* Must be exiting, otherwise would be on tasklist. */ | 6320 | int dest_cpu; |
5781 | BUG_ON(!p->exit_state); | ||
5782 | |||
5783 | /* Cannot have done final schedule yet: would have vanished. */ | ||
5784 | BUG_ON(p->state == TASK_DEAD); | ||
5785 | |||
5786 | get_task_struct(p); | ||
5787 | 6321 | ||
5788 | /* | 6322 | /* |
5789 | * Drop lock around migration; if someone else moves it, | 6323 | * Fudge the rq selection such that the below task selection loop |
5790 | * that's OK. No task can be added to this CPU, so iteration is | 6324 | * doesn't get stuck on the currently eligible stop task. |
5791 | * fine. | 6325 | * |
6326 | * We're currently inside stop_machine() and the rq is either stuck | ||
6327 | * in the stop_machine_cpu_stop() loop, or we're executing this code, | ||
6328 | * either way we should never end up calling schedule() until we're | ||
6329 | * done here. | ||
5792 | */ | 6330 | */ |
5793 | raw_spin_unlock_irq(&rq->lock); | 6331 | rq->stop = NULL; |
5794 | move_task_off_dead_cpu(dead_cpu, p); | ||
5795 | raw_spin_lock_irq(&rq->lock); | ||
5796 | |||
5797 | put_task_struct(p); | ||
5798 | } | ||
5799 | |||
5800 | /* release_task() removes task from tasklist, so we won't find dead tasks. */ | ||
5801 | static void migrate_dead_tasks(unsigned int dead_cpu) | ||
5802 | { | ||
5803 | struct rq *rq = cpu_rq(dead_cpu); | ||
5804 | struct task_struct *next; | ||
5805 | 6332 | ||
5806 | for ( ; ; ) { | 6333 | for ( ; ; ) { |
5807 | if (!rq->nr_running) | 6334 | /* |
6335 | * There's this thread running, bail when that's the only | ||
6336 | * remaining thread. | ||
6337 | */ | ||
6338 | if (rq->nr_running == 1) | ||
5808 | break; | 6339 | break; |
6340 | |||
5809 | next = pick_next_task(rq); | 6341 | next = pick_next_task(rq); |
5810 | if (!next) | 6342 | BUG_ON(!next); |
5811 | break; | ||
5812 | next->sched_class->put_prev_task(rq, next); | 6343 | next->sched_class->put_prev_task(rq, next); |
5813 | migrate_dead(dead_cpu, next); | ||
5814 | 6344 | ||
6345 | /* Find suitable destination for @next, with force if needed. */ | ||
6346 | dest_cpu = select_fallback_rq(dead_cpu, next); | ||
6347 | raw_spin_unlock(&rq->lock); | ||
6348 | |||
6349 | __migrate_task(next, dead_cpu, dest_cpu); | ||
6350 | |||
6351 | raw_spin_lock(&rq->lock); | ||
5815 | } | 6352 | } |
5816 | } | ||
5817 | 6353 | ||
5818 | /* | 6354 | rq->stop = stop; |
5819 | * remove the tasks which were accounted by rq from calc_load_tasks. | ||
5820 | */ | ||
5821 | static void calc_global_load_remove(struct rq *rq) | ||
5822 | { | ||
5823 | atomic_long_sub(rq->calc_load_active, &calc_load_tasks); | ||
5824 | rq->calc_load_active = 0; | ||
5825 | } | 6355 | } |
6356 | |||
5826 | #endif /* CONFIG_HOTPLUG_CPU */ | 6357 | #endif /* CONFIG_HOTPLUG_CPU */ |
5827 | 6358 | ||
5828 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) | 6359 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) |
@@ -6032,15 +6563,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6032 | unsigned long flags; | 6563 | unsigned long flags; |
6033 | struct rq *rq = cpu_rq(cpu); | 6564 | struct rq *rq = cpu_rq(cpu); |
6034 | 6565 | ||
6035 | switch (action) { | 6566 | switch (action & ~CPU_TASKS_FROZEN) { |
6036 | 6567 | ||
6037 | case CPU_UP_PREPARE: | 6568 | case CPU_UP_PREPARE: |
6038 | case CPU_UP_PREPARE_FROZEN: | ||
6039 | rq->calc_load_update = calc_load_update; | 6569 | rq->calc_load_update = calc_load_update; |
6040 | break; | 6570 | break; |
6041 | 6571 | ||
6042 | case CPU_ONLINE: | 6572 | case CPU_ONLINE: |
6043 | case CPU_ONLINE_FROZEN: | ||
6044 | /* Update our root-domain */ | 6573 | /* Update our root-domain */ |
6045 | raw_spin_lock_irqsave(&rq->lock, flags); | 6574 | raw_spin_lock_irqsave(&rq->lock, flags); |
6046 | if (rq->rd) { | 6575 | if (rq->rd) { |
@@ -6052,33 +6581,26 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6052 | break; | 6581 | break; |
6053 | 6582 | ||
6054 | #ifdef CONFIG_HOTPLUG_CPU | 6583 | #ifdef CONFIG_HOTPLUG_CPU |
6055 | case CPU_DEAD: | ||
6056 | case CPU_DEAD_FROZEN: | ||
6057 | migrate_live_tasks(cpu); | ||
6058 | /* Idle task back to normal (off runqueue, low prio) */ | ||
6059 | raw_spin_lock_irq(&rq->lock); | ||
6060 | deactivate_task(rq, rq->idle, 0); | ||
6061 | __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); | ||
6062 | rq->idle->sched_class = &idle_sched_class; | ||
6063 | migrate_dead_tasks(cpu); | ||
6064 | raw_spin_unlock_irq(&rq->lock); | ||
6065 | migrate_nr_uninterruptible(rq); | ||
6066 | BUG_ON(rq->nr_running != 0); | ||
6067 | calc_global_load_remove(rq); | ||
6068 | break; | ||
6069 | |||
6070 | case CPU_DYING: | 6584 | case CPU_DYING: |
6071 | case CPU_DYING_FROZEN: | 6585 | sched_ttwu_pending(); |
6072 | /* Update our root-domain */ | 6586 | /* Update our root-domain */ |
6073 | raw_spin_lock_irqsave(&rq->lock, flags); | 6587 | raw_spin_lock_irqsave(&rq->lock, flags); |
6074 | if (rq->rd) { | 6588 | if (rq->rd) { |
6075 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); | 6589 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); |
6076 | set_rq_offline(rq); | 6590 | set_rq_offline(rq); |
6077 | } | 6591 | } |
6592 | migrate_tasks(cpu); | ||
6593 | BUG_ON(rq->nr_running != 1); /* the migration thread */ | ||
6078 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 6594 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
6595 | |||
6596 | migrate_nr_uninterruptible(rq); | ||
6597 | calc_global_load_remove(rq); | ||
6079 | break; | 6598 | break; |
6080 | #endif | 6599 | #endif |
6081 | } | 6600 | } |
6601 | |||
6602 | update_max_interval(); | ||
6603 | |||
6082 | return NOTIFY_OK; | 6604 | return NOTIFY_OK; |
6083 | } | 6605 | } |
6084 | 6606 | ||
@@ -6139,6 +6661,8 @@ early_initcall(migration_init); | |||
6139 | 6661 | ||
6140 | #ifdef CONFIG_SMP | 6662 | #ifdef CONFIG_SMP |
6141 | 6663 | ||
6664 | static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ | ||
6665 | |||
6142 | #ifdef CONFIG_SCHED_DEBUG | 6666 | #ifdef CONFIG_SCHED_DEBUG |
6143 | 6667 | ||
6144 | static __read_mostly int sched_domain_debug_enabled; | 6668 | static __read_mostly int sched_domain_debug_enabled; |
@@ -6189,7 +6713,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
6189 | break; | 6713 | break; |
6190 | } | 6714 | } |
6191 | 6715 | ||
6192 | if (!group->cpu_power) { | 6716 | if (!group->sgp->power) { |
6193 | printk(KERN_CONT "\n"); | 6717 | printk(KERN_CONT "\n"); |
6194 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 6718 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
6195 | "set\n"); | 6719 | "set\n"); |
@@ -6213,9 +6737,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
6213 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); | 6737 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); |
6214 | 6738 | ||
6215 | printk(KERN_CONT " %s", str); | 6739 | printk(KERN_CONT " %s", str); |
6216 | if (group->cpu_power != SCHED_LOAD_SCALE) { | 6740 | if (group->sgp->power != SCHED_POWER_SCALE) { |
6217 | printk(KERN_CONT " (cpu_power = %d)", | 6741 | printk(KERN_CONT " (cpu_power = %d)", |
6218 | group->cpu_power); | 6742 | group->sgp->power); |
6219 | } | 6743 | } |
6220 | 6744 | ||
6221 | group = group->next; | 6745 | group = group->next; |
@@ -6234,7 +6758,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
6234 | 6758 | ||
6235 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | 6759 | static void sched_domain_debug(struct sched_domain *sd, int cpu) |
6236 | { | 6760 | { |
6237 | cpumask_var_t groupmask; | ||
6238 | int level = 0; | 6761 | int level = 0; |
6239 | 6762 | ||
6240 | if (!sched_domain_debug_enabled) | 6763 | if (!sched_domain_debug_enabled) |
@@ -6247,20 +6770,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
6247 | 6770 | ||
6248 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | 6771 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); |
6249 | 6772 | ||
6250 | if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { | ||
6251 | printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); | ||
6252 | return; | ||
6253 | } | ||
6254 | |||
6255 | for (;;) { | 6773 | for (;;) { |
6256 | if (sched_domain_debug_one(sd, cpu, level, groupmask)) | 6774 | if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) |
6257 | break; | 6775 | break; |
6258 | level++; | 6776 | level++; |
6259 | sd = sd->parent; | 6777 | sd = sd->parent; |
6260 | if (!sd) | 6778 | if (!sd) |
6261 | break; | 6779 | break; |
6262 | } | 6780 | } |
6263 | free_cpumask_var(groupmask); | ||
6264 | } | 6781 | } |
6265 | #else /* !CONFIG_SCHED_DEBUG */ | 6782 | #else /* !CONFIG_SCHED_DEBUG */ |
6266 | # define sched_domain_debug(sd, cpu) do { } while (0) | 6783 | # define sched_domain_debug(sd, cpu) do { } while (0) |
@@ -6317,12 +6834,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
6317 | return 1; | 6834 | return 1; |
6318 | } | 6835 | } |
6319 | 6836 | ||
6320 | static void free_rootdomain(struct root_domain *rd) | 6837 | static void free_rootdomain(struct rcu_head *rcu) |
6321 | { | 6838 | { |
6322 | synchronize_sched(); | 6839 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); |
6323 | 6840 | ||
6324 | cpupri_cleanup(&rd->cpupri); | 6841 | cpupri_cleanup(&rd->cpupri); |
6325 | |||
6326 | free_cpumask_var(rd->rto_mask); | 6842 | free_cpumask_var(rd->rto_mask); |
6327 | free_cpumask_var(rd->online); | 6843 | free_cpumask_var(rd->online); |
6328 | free_cpumask_var(rd->span); | 6844 | free_cpumask_var(rd->span); |
@@ -6363,7 +6879,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
6363 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 6879 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
6364 | 6880 | ||
6365 | if (old_rd) | 6881 | if (old_rd) |
6366 | free_rootdomain(old_rd); | 6882 | call_rcu_sched(&old_rd->rcu, free_rootdomain); |
6367 | } | 6883 | } |
6368 | 6884 | ||
6369 | static int init_rootdomain(struct root_domain *rd) | 6885 | static int init_rootdomain(struct root_domain *rd) |
@@ -6414,6 +6930,53 @@ static struct root_domain *alloc_rootdomain(void) | |||
6414 | return rd; | 6930 | return rd; |
6415 | } | 6931 | } |
6416 | 6932 | ||
6933 | static void free_sched_groups(struct sched_group *sg, int free_sgp) | ||
6934 | { | ||
6935 | struct sched_group *tmp, *first; | ||
6936 | |||
6937 | if (!sg) | ||
6938 | return; | ||
6939 | |||
6940 | first = sg; | ||
6941 | do { | ||
6942 | tmp = sg->next; | ||
6943 | |||
6944 | if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) | ||
6945 | kfree(sg->sgp); | ||
6946 | |||
6947 | kfree(sg); | ||
6948 | sg = tmp; | ||
6949 | } while (sg != first); | ||
6950 | } | ||
6951 | |||
6952 | static void free_sched_domain(struct rcu_head *rcu) | ||
6953 | { | ||
6954 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); | ||
6955 | |||
6956 | /* | ||
6957 | * If it's an overlapping domain it has private groups, iterate and | ||
6958 | * nuke them all. | ||
6959 | */ | ||
6960 | if (sd->flags & SD_OVERLAP) { | ||
6961 | free_sched_groups(sd->groups, 1); | ||
6962 | } else if (atomic_dec_and_test(&sd->groups->ref)) { | ||
6963 | kfree(sd->groups->sgp); | ||
6964 | kfree(sd->groups); | ||
6965 | } | ||
6966 | kfree(sd); | ||
6967 | } | ||
6968 | |||
6969 | static void destroy_sched_domain(struct sched_domain *sd, int cpu) | ||
6970 | { | ||
6971 | call_rcu(&sd->rcu, free_sched_domain); | ||
6972 | } | ||
6973 | |||
6974 | static void destroy_sched_domains(struct sched_domain *sd, int cpu) | ||
6975 | { | ||
6976 | for (; sd; sd = sd->parent) | ||
6977 | destroy_sched_domain(sd, cpu); | ||
6978 | } | ||
6979 | |||
6417 | /* | 6980 | /* |
6418 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 6981 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
6419 | * hold the hotplug lock. | 6982 | * hold the hotplug lock. |
@@ -6424,9 +6987,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6424 | struct rq *rq = cpu_rq(cpu); | 6987 | struct rq *rq = cpu_rq(cpu); |
6425 | struct sched_domain *tmp; | 6988 | struct sched_domain *tmp; |
6426 | 6989 | ||
6427 | for (tmp = sd; tmp; tmp = tmp->parent) | ||
6428 | tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); | ||
6429 | |||
6430 | /* Remove the sched domains which do not contribute to scheduling. */ | 6990 | /* Remove the sched domains which do not contribute to scheduling. */ |
6431 | for (tmp = sd; tmp; ) { | 6991 | for (tmp = sd; tmp; ) { |
6432 | struct sched_domain *parent = tmp->parent; | 6992 | struct sched_domain *parent = tmp->parent; |
@@ -6437,12 +6997,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6437 | tmp->parent = parent->parent; | 6997 | tmp->parent = parent->parent; |
6438 | if (parent->parent) | 6998 | if (parent->parent) |
6439 | parent->parent->child = tmp; | 6999 | parent->parent->child = tmp; |
7000 | destroy_sched_domain(parent, cpu); | ||
6440 | } else | 7001 | } else |
6441 | tmp = tmp->parent; | 7002 | tmp = tmp->parent; |
6442 | } | 7003 | } |
6443 | 7004 | ||
6444 | if (sd && sd_degenerate(sd)) { | 7005 | if (sd && sd_degenerate(sd)) { |
7006 | tmp = sd; | ||
6445 | sd = sd->parent; | 7007 | sd = sd->parent; |
7008 | destroy_sched_domain(tmp, cpu); | ||
6446 | if (sd) | 7009 | if (sd) |
6447 | sd->child = NULL; | 7010 | sd->child = NULL; |
6448 | } | 7011 | } |
@@ -6450,7 +7013,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6450 | sched_domain_debug(sd, cpu); | 7013 | sched_domain_debug(sd, cpu); |
6451 | 7014 | ||
6452 | rq_attach_root(rq, rd); | 7015 | rq_attach_root(rq, rd); |
7016 | tmp = rq->sd; | ||
6453 | rcu_assign_pointer(rq->sd, sd); | 7017 | rcu_assign_pointer(rq->sd, sd); |
7018 | destroy_sched_domains(tmp, cpu); | ||
6454 | } | 7019 | } |
6455 | 7020 | ||
6456 | /* cpus with isolated domains */ | 7021 | /* cpus with isolated domains */ |
@@ -6466,56 +7031,6 @@ static int __init isolated_cpu_setup(char *str) | |||
6466 | 7031 | ||
6467 | __setup("isolcpus=", isolated_cpu_setup); | 7032 | __setup("isolcpus=", isolated_cpu_setup); |
6468 | 7033 | ||
6469 | /* | ||
6470 | * init_sched_build_groups takes the cpumask we wish to span, and a pointer | ||
6471 | * to a function which identifies what group(along with sched group) a CPU | ||
6472 | * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids | ||
6473 | * (due to the fact that we keep track of groups covered with a struct cpumask). | ||
6474 | * | ||
6475 | * init_sched_build_groups will build a circular linked list of the groups | ||
6476 | * covered by the given span, and will set each group's ->cpumask correctly, | ||
6477 | * and ->cpu_power to 0. | ||
6478 | */ | ||
6479 | static void | ||
6480 | init_sched_build_groups(const struct cpumask *span, | ||
6481 | const struct cpumask *cpu_map, | ||
6482 | int (*group_fn)(int cpu, const struct cpumask *cpu_map, | ||
6483 | struct sched_group **sg, | ||
6484 | struct cpumask *tmpmask), | ||
6485 | struct cpumask *covered, struct cpumask *tmpmask) | ||
6486 | { | ||
6487 | struct sched_group *first = NULL, *last = NULL; | ||
6488 | int i; | ||
6489 | |||
6490 | cpumask_clear(covered); | ||
6491 | |||
6492 | for_each_cpu(i, span) { | ||
6493 | struct sched_group *sg; | ||
6494 | int group = group_fn(i, cpu_map, &sg, tmpmask); | ||
6495 | int j; | ||
6496 | |||
6497 | if (cpumask_test_cpu(i, covered)) | ||
6498 | continue; | ||
6499 | |||
6500 | cpumask_clear(sched_group_cpus(sg)); | ||
6501 | sg->cpu_power = 0; | ||
6502 | |||
6503 | for_each_cpu(j, span) { | ||
6504 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) | ||
6505 | continue; | ||
6506 | |||
6507 | cpumask_set_cpu(j, covered); | ||
6508 | cpumask_set_cpu(j, sched_group_cpus(sg)); | ||
6509 | } | ||
6510 | if (!first) | ||
6511 | first = sg; | ||
6512 | if (last) | ||
6513 | last->next = sg; | ||
6514 | last = sg; | ||
6515 | } | ||
6516 | last->next = first; | ||
6517 | } | ||
6518 | |||
6519 | #define SD_NODES_PER_DOMAIN 16 | 7034 | #define SD_NODES_PER_DOMAIN 16 |
6520 | 7035 | ||
6521 | #ifdef CONFIG_NUMA | 7036 | #ifdef CONFIG_NUMA |
@@ -6532,7 +7047,7 @@ init_sched_build_groups(const struct cpumask *span, | |||
6532 | */ | 7047 | */ |
6533 | static int find_next_best_node(int node, nodemask_t *used_nodes) | 7048 | static int find_next_best_node(int node, nodemask_t *used_nodes) |
6534 | { | 7049 | { |
6535 | int i, n, val, min_val, best_node = 0; | 7050 | int i, n, val, min_val, best_node = -1; |
6536 | 7051 | ||
6537 | min_val = INT_MAX; | 7052 | min_val = INT_MAX; |
6538 | 7053 | ||
@@ -6556,7 +7071,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) | |||
6556 | } | 7071 | } |
6557 | } | 7072 | } |
6558 | 7073 | ||
6559 | node_set(best_node, *used_nodes); | 7074 | if (best_node != -1) |
7075 | node_set(best_node, *used_nodes); | ||
6560 | return best_node; | 7076 | return best_node; |
6561 | } | 7077 | } |
6562 | 7078 | ||
@@ -6582,293 +7098,197 @@ static void sched_domain_node_span(int node, struct cpumask *span) | |||
6582 | 7098 | ||
6583 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | 7099 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { |
6584 | int next_node = find_next_best_node(node, &used_nodes); | 7100 | int next_node = find_next_best_node(node, &used_nodes); |
6585 | 7101 | if (next_node < 0) | |
7102 | break; | ||
6586 | cpumask_or(span, span, cpumask_of_node(next_node)); | 7103 | cpumask_or(span, span, cpumask_of_node(next_node)); |
6587 | } | 7104 | } |
6588 | } | 7105 | } |
7106 | |||
7107 | static const struct cpumask *cpu_node_mask(int cpu) | ||
7108 | { | ||
7109 | lockdep_assert_held(&sched_domains_mutex); | ||
7110 | |||
7111 | sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); | ||
7112 | |||
7113 | return sched_domains_tmpmask; | ||
7114 | } | ||
7115 | |||
7116 | static const struct cpumask *cpu_allnodes_mask(int cpu) | ||
7117 | { | ||
7118 | return cpu_possible_mask; | ||
7119 | } | ||
6589 | #endif /* CONFIG_NUMA */ | 7120 | #endif /* CONFIG_NUMA */ |
6590 | 7121 | ||
6591 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | 7122 | static const struct cpumask *cpu_cpu_mask(int cpu) |
7123 | { | ||
7124 | return cpumask_of_node(cpu_to_node(cpu)); | ||
7125 | } | ||
6592 | 7126 | ||
6593 | /* | 7127 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; |
6594 | * The cpus mask in sched_group and sched_domain hangs off the end. | ||
6595 | * | ||
6596 | * ( See the the comments in include/linux/sched.h:struct sched_group | ||
6597 | * and struct sched_domain. ) | ||
6598 | */ | ||
6599 | struct static_sched_group { | ||
6600 | struct sched_group sg; | ||
6601 | DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); | ||
6602 | }; | ||
6603 | 7128 | ||
6604 | struct static_sched_domain { | 7129 | struct sd_data { |
6605 | struct sched_domain sd; | 7130 | struct sched_domain **__percpu sd; |
6606 | DECLARE_BITMAP(span, CONFIG_NR_CPUS); | 7131 | struct sched_group **__percpu sg; |
7132 | struct sched_group_power **__percpu sgp; | ||
6607 | }; | 7133 | }; |
6608 | 7134 | ||
6609 | struct s_data { | 7135 | struct s_data { |
6610 | #ifdef CONFIG_NUMA | 7136 | struct sched_domain ** __percpu sd; |
6611 | int sd_allnodes; | ||
6612 | cpumask_var_t domainspan; | ||
6613 | cpumask_var_t covered; | ||
6614 | cpumask_var_t notcovered; | ||
6615 | #endif | ||
6616 | cpumask_var_t nodemask; | ||
6617 | cpumask_var_t this_sibling_map; | ||
6618 | cpumask_var_t this_core_map; | ||
6619 | cpumask_var_t send_covered; | ||
6620 | cpumask_var_t tmpmask; | ||
6621 | struct sched_group **sched_group_nodes; | ||
6622 | struct root_domain *rd; | 7137 | struct root_domain *rd; |
6623 | }; | 7138 | }; |
6624 | 7139 | ||
6625 | enum s_alloc { | 7140 | enum s_alloc { |
6626 | sa_sched_groups = 0, | ||
6627 | sa_rootdomain, | 7141 | sa_rootdomain, |
6628 | sa_tmpmask, | 7142 | sa_sd, |
6629 | sa_send_covered, | 7143 | sa_sd_storage, |
6630 | sa_this_core_map, | ||
6631 | sa_this_sibling_map, | ||
6632 | sa_nodemask, | ||
6633 | sa_sched_group_nodes, | ||
6634 | #ifdef CONFIG_NUMA | ||
6635 | sa_notcovered, | ||
6636 | sa_covered, | ||
6637 | sa_domainspan, | ||
6638 | #endif | ||
6639 | sa_none, | 7144 | sa_none, |
6640 | }; | 7145 | }; |
6641 | 7146 | ||
6642 | /* | 7147 | struct sched_domain_topology_level; |
6643 | * SMT sched-domains: | ||
6644 | */ | ||
6645 | #ifdef CONFIG_SCHED_SMT | ||
6646 | static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); | ||
6647 | static DEFINE_PER_CPU(struct static_sched_group, sched_groups); | ||
6648 | 7148 | ||
6649 | static int | 7149 | typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); |
6650 | cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, | 7150 | typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); |
6651 | struct sched_group **sg, struct cpumask *unused) | ||
6652 | { | ||
6653 | if (sg) | ||
6654 | *sg = &per_cpu(sched_groups, cpu).sg; | ||
6655 | return cpu; | ||
6656 | } | ||
6657 | #endif /* CONFIG_SCHED_SMT */ | ||
6658 | 7151 | ||
6659 | /* | 7152 | #define SDTL_OVERLAP 0x01 |
6660 | * multi-core sched-domains: | ||
6661 | */ | ||
6662 | #ifdef CONFIG_SCHED_MC | ||
6663 | static DEFINE_PER_CPU(struct static_sched_domain, core_domains); | ||
6664 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); | ||
6665 | #endif /* CONFIG_SCHED_MC */ | ||
6666 | 7153 | ||
6667 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 7154 | struct sched_domain_topology_level { |
6668 | static int | 7155 | sched_domain_init_f init; |
6669 | cpu_to_core_group(int cpu, const struct cpumask *cpu_map, | 7156 | sched_domain_mask_f mask; |
6670 | struct sched_group **sg, struct cpumask *mask) | 7157 | int flags; |
6671 | { | 7158 | struct sd_data data; |
6672 | int group; | 7159 | }; |
6673 | 7160 | ||
6674 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | ||
6675 | group = cpumask_first(mask); | ||
6676 | if (sg) | ||
6677 | *sg = &per_cpu(sched_group_core, group).sg; | ||
6678 | return group; | ||
6679 | } | ||
6680 | #elif defined(CONFIG_SCHED_MC) | ||
6681 | static int | 7161 | static int |
6682 | cpu_to_core_group(int cpu, const struct cpumask *cpu_map, | 7162 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) |
6683 | struct sched_group **sg, struct cpumask *unused) | ||
6684 | { | 7163 | { |
6685 | if (sg) | 7164 | struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; |
6686 | *sg = &per_cpu(sched_group_core, cpu).sg; | 7165 | const struct cpumask *span = sched_domain_span(sd); |
6687 | return cpu; | 7166 | struct cpumask *covered = sched_domains_tmpmask; |
6688 | } | 7167 | struct sd_data *sdd = sd->private; |
6689 | #endif | 7168 | struct sched_domain *child; |
7169 | int i; | ||
6690 | 7170 | ||
6691 | static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); | 7171 | cpumask_clear(covered); |
6692 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); | ||
6693 | 7172 | ||
6694 | static int | 7173 | for_each_cpu(i, span) { |
6695 | cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, | 7174 | struct cpumask *sg_span; |
6696 | struct sched_group **sg, struct cpumask *mask) | ||
6697 | { | ||
6698 | int group; | ||
6699 | #ifdef CONFIG_SCHED_MC | ||
6700 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); | ||
6701 | group = cpumask_first(mask); | ||
6702 | #elif defined(CONFIG_SCHED_SMT) | ||
6703 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | ||
6704 | group = cpumask_first(mask); | ||
6705 | #else | ||
6706 | group = cpu; | ||
6707 | #endif | ||
6708 | if (sg) | ||
6709 | *sg = &per_cpu(sched_group_phys, group).sg; | ||
6710 | return group; | ||
6711 | } | ||
6712 | 7175 | ||
6713 | #ifdef CONFIG_NUMA | 7176 | if (cpumask_test_cpu(i, covered)) |
6714 | /* | 7177 | continue; |
6715 | * The init_sched_build_groups can't handle what we want to do with node | ||
6716 | * groups, so roll our own. Now each node has its own list of groups which | ||
6717 | * gets dynamically allocated. | ||
6718 | */ | ||
6719 | static DEFINE_PER_CPU(struct static_sched_domain, node_domains); | ||
6720 | static struct sched_group ***sched_group_nodes_bycpu; | ||
6721 | 7178 | ||
6722 | static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); | 7179 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), |
6723 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); | 7180 | GFP_KERNEL, cpu_to_node(i)); |
6724 | 7181 | ||
6725 | static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, | 7182 | if (!sg) |
6726 | struct sched_group **sg, | 7183 | goto fail; |
6727 | struct cpumask *nodemask) | ||
6728 | { | ||
6729 | int group; | ||
6730 | 7184 | ||
6731 | cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); | 7185 | sg_span = sched_group_cpus(sg); |
6732 | group = cpumask_first(nodemask); | ||
6733 | 7186 | ||
6734 | if (sg) | 7187 | child = *per_cpu_ptr(sdd->sd, i); |
6735 | *sg = &per_cpu(sched_group_allnodes, group).sg; | 7188 | if (child->child) { |
6736 | return group; | 7189 | child = child->child; |
6737 | } | 7190 | cpumask_copy(sg_span, sched_domain_span(child)); |
7191 | } else | ||
7192 | cpumask_set_cpu(i, sg_span); | ||
6738 | 7193 | ||
6739 | static void init_numa_sched_groups_power(struct sched_group *group_head) | 7194 | cpumask_or(covered, covered, sg_span); |
6740 | { | ||
6741 | struct sched_group *sg = group_head; | ||
6742 | int j; | ||
6743 | 7195 | ||
6744 | if (!sg) | 7196 | sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); |
6745 | return; | 7197 | atomic_inc(&sg->sgp->ref); |
6746 | do { | ||
6747 | for_each_cpu(j, sched_group_cpus(sg)) { | ||
6748 | struct sched_domain *sd; | ||
6749 | 7198 | ||
6750 | sd = &per_cpu(phys_domains, j).sd; | 7199 | if (cpumask_test_cpu(cpu, sg_span)) |
6751 | if (j != group_first_cpu(sd->groups)) { | 7200 | groups = sg; |
6752 | /* | ||
6753 | * Only add "power" once for each | ||
6754 | * physical package. | ||
6755 | */ | ||
6756 | continue; | ||
6757 | } | ||
6758 | 7201 | ||
6759 | sg->cpu_power += sd->groups->cpu_power; | 7202 | if (!first) |
6760 | } | 7203 | first = sg; |
6761 | sg = sg->next; | 7204 | if (last) |
6762 | } while (sg != group_head); | 7205 | last->next = sg; |
7206 | last = sg; | ||
7207 | last->next = first; | ||
7208 | } | ||
7209 | sd->groups = groups; | ||
7210 | |||
7211 | return 0; | ||
7212 | |||
7213 | fail: | ||
7214 | free_sched_groups(first, 0); | ||
7215 | |||
7216 | return -ENOMEM; | ||
6763 | } | 7217 | } |
6764 | 7218 | ||
6765 | static int build_numa_sched_groups(struct s_data *d, | 7219 | static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) |
6766 | const struct cpumask *cpu_map, int num) | ||
6767 | { | 7220 | { |
6768 | struct sched_domain *sd; | 7221 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); |
6769 | struct sched_group *sg, *prev; | 7222 | struct sched_domain *child = sd->child; |
6770 | int n, j; | ||
6771 | 7223 | ||
6772 | cpumask_clear(d->covered); | 7224 | if (child) |
6773 | cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); | 7225 | cpu = cpumask_first(sched_domain_span(child)); |
6774 | if (cpumask_empty(d->nodemask)) { | 7226 | |
6775 | d->sched_group_nodes[num] = NULL; | 7227 | if (sg) { |
6776 | goto out; | 7228 | *sg = *per_cpu_ptr(sdd->sg, cpu); |
7229 | (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); | ||
7230 | atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ | ||
6777 | } | 7231 | } |
6778 | 7232 | ||
6779 | sched_domain_node_span(num, d->domainspan); | 7233 | return cpu; |
6780 | cpumask_and(d->domainspan, d->domainspan, cpu_map); | 7234 | } |
6781 | 7235 | ||
6782 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | 7236 | /* |
6783 | GFP_KERNEL, num); | 7237 | * build_sched_groups will build a circular linked list of the groups |
6784 | if (!sg) { | 7238 | * covered by the given span, and will set each group's ->cpumask correctly, |
6785 | printk(KERN_WARNING "Can not alloc domain group for node %d\n", | 7239 | * and ->cpu_power to 0. |
6786 | num); | 7240 | * |
6787 | return -ENOMEM; | 7241 | * Assumes the sched_domain tree is fully constructed |
6788 | } | 7242 | */ |
6789 | d->sched_group_nodes[num] = sg; | 7243 | static int |
7244 | build_sched_groups(struct sched_domain *sd, int cpu) | ||
7245 | { | ||
7246 | struct sched_group *first = NULL, *last = NULL; | ||
7247 | struct sd_data *sdd = sd->private; | ||
7248 | const struct cpumask *span = sched_domain_span(sd); | ||
7249 | struct cpumask *covered; | ||
7250 | int i; | ||
6790 | 7251 | ||
6791 | for_each_cpu(j, d->nodemask) { | 7252 | get_group(cpu, sdd, &sd->groups); |
6792 | sd = &per_cpu(node_domains, j).sd; | 7253 | atomic_inc(&sd->groups->ref); |
6793 | sd->groups = sg; | ||
6794 | } | ||
6795 | 7254 | ||
6796 | sg->cpu_power = 0; | 7255 | if (cpu != cpumask_first(sched_domain_span(sd))) |
6797 | cpumask_copy(sched_group_cpus(sg), d->nodemask); | 7256 | return 0; |
6798 | sg->next = sg; | ||
6799 | cpumask_or(d->covered, d->covered, d->nodemask); | ||
6800 | 7257 | ||
6801 | prev = sg; | 7258 | lockdep_assert_held(&sched_domains_mutex); |
6802 | for (j = 0; j < nr_node_ids; j++) { | 7259 | covered = sched_domains_tmpmask; |
6803 | n = (num + j) % nr_node_ids; | ||
6804 | cpumask_complement(d->notcovered, d->covered); | ||
6805 | cpumask_and(d->tmpmask, d->notcovered, cpu_map); | ||
6806 | cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); | ||
6807 | if (cpumask_empty(d->tmpmask)) | ||
6808 | break; | ||
6809 | cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); | ||
6810 | if (cpumask_empty(d->tmpmask)) | ||
6811 | continue; | ||
6812 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
6813 | GFP_KERNEL, num); | ||
6814 | if (!sg) { | ||
6815 | printk(KERN_WARNING | ||
6816 | "Can not alloc domain group for node %d\n", j); | ||
6817 | return -ENOMEM; | ||
6818 | } | ||
6819 | sg->cpu_power = 0; | ||
6820 | cpumask_copy(sched_group_cpus(sg), d->tmpmask); | ||
6821 | sg->next = prev->next; | ||
6822 | cpumask_or(d->covered, d->covered, d->tmpmask); | ||
6823 | prev->next = sg; | ||
6824 | prev = sg; | ||
6825 | } | ||
6826 | out: | ||
6827 | return 0; | ||
6828 | } | ||
6829 | #endif /* CONFIG_NUMA */ | ||
6830 | 7260 | ||
6831 | #ifdef CONFIG_NUMA | 7261 | cpumask_clear(covered); |
6832 | /* Free memory allocated for various sched_group structures */ | ||
6833 | static void free_sched_groups(const struct cpumask *cpu_map, | ||
6834 | struct cpumask *nodemask) | ||
6835 | { | ||
6836 | int cpu, i; | ||
6837 | 7262 | ||
6838 | for_each_cpu(cpu, cpu_map) { | 7263 | for_each_cpu(i, span) { |
6839 | struct sched_group **sched_group_nodes | 7264 | struct sched_group *sg; |
6840 | = sched_group_nodes_bycpu[cpu]; | 7265 | int group = get_group(i, sdd, &sg); |
7266 | int j; | ||
6841 | 7267 | ||
6842 | if (!sched_group_nodes) | 7268 | if (cpumask_test_cpu(i, covered)) |
6843 | continue; | 7269 | continue; |
6844 | 7270 | ||
6845 | for (i = 0; i < nr_node_ids; i++) { | 7271 | cpumask_clear(sched_group_cpus(sg)); |
6846 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | 7272 | sg->sgp->power = 0; |
6847 | 7273 | ||
6848 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | 7274 | for_each_cpu(j, span) { |
6849 | if (cpumask_empty(nodemask)) | 7275 | if (get_group(j, sdd, NULL) != group) |
6850 | continue; | 7276 | continue; |
6851 | 7277 | ||
6852 | if (sg == NULL) | 7278 | cpumask_set_cpu(j, covered); |
6853 | continue; | 7279 | cpumask_set_cpu(j, sched_group_cpus(sg)); |
6854 | sg = sg->next; | ||
6855 | next_sg: | ||
6856 | oldsg = sg; | ||
6857 | sg = sg->next; | ||
6858 | kfree(oldsg); | ||
6859 | if (oldsg != sched_group_nodes[i]) | ||
6860 | goto next_sg; | ||
6861 | } | 7280 | } |
6862 | kfree(sched_group_nodes); | 7281 | |
6863 | sched_group_nodes_bycpu[cpu] = NULL; | 7282 | if (!first) |
7283 | first = sg; | ||
7284 | if (last) | ||
7285 | last->next = sg; | ||
7286 | last = sg; | ||
6864 | } | 7287 | } |
7288 | last->next = first; | ||
7289 | |||
7290 | return 0; | ||
6865 | } | 7291 | } |
6866 | #else /* !CONFIG_NUMA */ | ||
6867 | static void free_sched_groups(const struct cpumask *cpu_map, | ||
6868 | struct cpumask *nodemask) | ||
6869 | { | ||
6870 | } | ||
6871 | #endif /* CONFIG_NUMA */ | ||
6872 | 7292 | ||
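Both group builders above, build_overlap_sched_groups() and the new build_sched_groups(), chain their per-span groups with the same first/last/next idiom and close the ring with last->next = first; init_sched_groups_power() later walks that ring with a do/while loop. A minimal userspace sketch of the idiom follows; struct group, build_ring() and the sizes used are illustrative stand-ins, not kernel code.

    #include <stdio.h>
    #include <stdlib.h>

    struct group {
        int id;
        struct group *next;
    };

    /* Chain n groups into a circular list, mirroring the first/last/next
     * idiom used by build_sched_groups(): append each new group after
     * 'last', then close the ring with last->next = first. */
    static struct group *build_ring(int n)
    {
        struct group *first = NULL, *last = NULL;
        for (int i = 0; i < n; i++) {
            struct group *g = calloc(1, sizeof(*g));
            if (!g)
                exit(1);
            g->id = i;
            if (!first)
                first = g;
            if (last)
                last->next = g;
            last = g;
        }
        if (last)
            last->next = first;   /* close the circle */
        return first;
    }

    int main(void)
    {
        struct group *first = build_ring(4), *g = first;
        /* Walk the ring exactly once, the same way
         * init_sched_groups_power() iterates sd->groups. */
        do {
            printf("group %d\n", g->id);
            g = g->next;
        } while (g != first);
        return 0;
    }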
6873 | /* | 7293 | /* |
6874 | * Initialize sched groups cpu_power. | 7294 | * Initialize sched groups cpu_power. |
@@ -6882,46 +7302,19 @@ static void free_sched_groups(const struct cpumask *cpu_map, | |||
6882 | */ | 7302 | */ |
6883 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) | 7303 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) |
6884 | { | 7304 | { |
6885 | struct sched_domain *child; | 7305 | struct sched_group *sg = sd->groups; |
6886 | struct sched_group *group; | ||
6887 | long power; | ||
6888 | int weight; | ||
6889 | |||
6890 | WARN_ON(!sd || !sd->groups); | ||
6891 | |||
6892 | if (cpu != group_first_cpu(sd->groups)) | ||
6893 | return; | ||
6894 | 7306 | ||
6895 | child = sd->child; | 7307 | WARN_ON(!sd || !sg); |
6896 | 7308 | ||
6897 | sd->groups->cpu_power = 0; | 7309 | do { |
7310 | sg->group_weight = cpumask_weight(sched_group_cpus(sg)); | ||
7311 | sg = sg->next; | ||
7312 | } while (sg != sd->groups); | ||
6898 | 7313 | ||
6899 | if (!child) { | 7314 | if (cpu != group_first_cpu(sg)) |
6900 | power = SCHED_LOAD_SCALE; | ||
6901 | weight = cpumask_weight(sched_domain_span(sd)); | ||
6902 | /* | ||
6903 | * SMT siblings share the power of a single core. | ||
6904 | * Usually multiple threads get a better yield out of | ||
6905 | * that one core than a single thread would have, | ||
6906 | * reflect that in sd->smt_gain. | ||
6907 | */ | ||
6908 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | ||
6909 | power *= sd->smt_gain; | ||
6910 | power /= weight; | ||
6911 | power >>= SCHED_LOAD_SHIFT; | ||
6912 | } | ||
6913 | sd->groups->cpu_power += power; | ||
6914 | return; | 7315 | return; |
6915 | } | ||
6916 | 7316 | ||
6917 | /* | 7317 | update_group_power(sd, cpu); |
6918 | * Add cpu_power of each child group to this groups cpu_power. | ||
6919 | */ | ||
6920 | group = child->groups; | ||
6921 | do { | ||
6922 | sd->groups->cpu_power += group->cpu_power; | ||
6923 | group = group->next; | ||
6924 | } while (group != child->groups); | ||
6925 | } | 7318 | } |
6926 | 7319 | ||
6927 | /* | 7320 | /* |
@@ -6935,15 +7328,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
6935 | # define SD_INIT_NAME(sd, type) do { } while (0) | 7328 | # define SD_INIT_NAME(sd, type) do { } while (0) |
6936 | #endif | 7329 | #endif |
6937 | 7330 | ||
6938 | #define SD_INIT(sd, type) sd_init_##type(sd) | 7331 | #define SD_INIT_FUNC(type) \ |
6939 | 7332 | static noinline struct sched_domain * \ | |
6940 | #define SD_INIT_FUNC(type) \ | 7333 | sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ |
6941 | static noinline void sd_init_##type(struct sched_domain *sd) \ | 7334 | { \ |
6942 | { \ | 7335 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ |
6943 | memset(sd, 0, sizeof(*sd)); \ | 7336 | *sd = SD_##type##_INIT; \ |
6944 | *sd = SD_##type##_INIT; \ | 7337 | SD_INIT_NAME(sd, type); \ |
6945 | sd->level = SD_LV_##type; \ | 7338 | sd->private = &tl->data; \ |
6946 | SD_INIT_NAME(sd, type); \ | 7339 | return sd; \ |
6947 | } | 7340 | } |
6948 | 7341 | ||
6949 | SD_INIT_FUNC(CPU) | 7342 | SD_INIT_FUNC(CPU) |
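SD_INIT_FUNC() now stamps out one constructor per topology level, each copying a static SD_<type>_INIT template into storage owned by the level (tl->data). The sketch below models that generate-a-family-of-constructors pattern in plain userspace C; DOM_INIT_FUNC, struct domain and the level numbers are invented for illustration and are not the kernel macro.

    #include <stdio.h>
    #include <string.h>

    struct domain { char name[8]; int level; };

    /* Tiny userspace model of the SD_INIT_FUNC() pattern: one macro
     * stamps out a family of typed constructors that fill in a
     * caller-supplied slot from a per-type template. */
    #define DOM_INIT_FUNC(type, lvl)                         \
    static struct domain *dom_init_##type(struct domain *d)  \
    {                                                        \
        memset(d, 0, sizeof(*d));                            \
        strcpy(d->name, #type);                              \
        d->level = (lvl);                                    \
        return d;                                            \
    }

    DOM_INIT_FUNC(CPU, 0)
    DOM_INIT_FUNC(MC, 1)

    int main(void)
    {
        struct domain d;
        dom_init_MC(&d);
        printf("%s level=%d\n", d.name, d.level);
        dom_init_CPU(&d);
        printf("%s level=%d\n", d.name, d.level);
        return 0;
    }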
@@ -6957,15 +7350,19 @@ SD_INIT_FUNC(CPU) | |||
6957 | #ifdef CONFIG_SCHED_MC | 7350 | #ifdef CONFIG_SCHED_MC |
6958 | SD_INIT_FUNC(MC) | 7351 | SD_INIT_FUNC(MC) |
6959 | #endif | 7352 | #endif |
7353 | #ifdef CONFIG_SCHED_BOOK | ||
7354 | SD_INIT_FUNC(BOOK) | ||
7355 | #endif | ||
6960 | 7356 | ||
6961 | static int default_relax_domain_level = -1; | 7357 | static int default_relax_domain_level = -1; |
7358 | int sched_domain_level_max; | ||
6962 | 7359 | ||
6963 | static int __init setup_relax_domain_level(char *str) | 7360 | static int __init setup_relax_domain_level(char *str) |
6964 | { | 7361 | { |
6965 | unsigned long val; | 7362 | unsigned long val; |
6966 | 7363 | ||
6967 | val = simple_strtoul(str, NULL, 0); | 7364 | val = simple_strtoul(str, NULL, 0); |
6968 | if (val < SD_LV_MAX) | 7365 | if (val < sched_domain_level_max) |
6969 | default_relax_domain_level = val; | 7366 | default_relax_domain_level = val; |
6970 | 7367 | ||
6971 | return 1; | 7368 | return 1; |
@@ -6993,35 +7390,20 @@ static void set_domain_attribute(struct sched_domain *sd, | |||
6993 | } | 7390 | } |
6994 | } | 7391 | } |
6995 | 7392 | ||
7393 | static void __sdt_free(const struct cpumask *cpu_map); | ||
7394 | static int __sdt_alloc(const struct cpumask *cpu_map); | ||
7395 | |||
6996 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | 7396 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, |
6997 | const struct cpumask *cpu_map) | 7397 | const struct cpumask *cpu_map) |
6998 | { | 7398 | { |
6999 | switch (what) { | 7399 | switch (what) { |
7000 | case sa_sched_groups: | ||
7001 | free_sched_groups(cpu_map, d->tmpmask); /* fall through */ | ||
7002 | d->sched_group_nodes = NULL; | ||
7003 | case sa_rootdomain: | 7400 | case sa_rootdomain: |
7004 | free_rootdomain(d->rd); /* fall through */ | 7401 | if (!atomic_read(&d->rd->refcount)) |
7005 | case sa_tmpmask: | 7402 | free_rootdomain(&d->rd->rcu); /* fall through */ |
7006 | free_cpumask_var(d->tmpmask); /* fall through */ | 7403 | case sa_sd: |
7007 | case sa_send_covered: | 7404 | free_percpu(d->sd); /* fall through */ |
7008 | free_cpumask_var(d->send_covered); /* fall through */ | 7405 | case sa_sd_storage: |
7009 | case sa_this_core_map: | 7406 | __sdt_free(cpu_map); /* fall through */ |
7010 | free_cpumask_var(d->this_core_map); /* fall through */ | ||
7011 | case sa_this_sibling_map: | ||
7012 | free_cpumask_var(d->this_sibling_map); /* fall through */ | ||
7013 | case sa_nodemask: | ||
7014 | free_cpumask_var(d->nodemask); /* fall through */ | ||
7015 | case sa_sched_group_nodes: | ||
7016 | #ifdef CONFIG_NUMA | ||
7017 | kfree(d->sched_group_nodes); /* fall through */ | ||
7018 | case sa_notcovered: | ||
7019 | free_cpumask_var(d->notcovered); /* fall through */ | ||
7020 | case sa_covered: | ||
7021 | free_cpumask_var(d->covered); /* fall through */ | ||
7022 | case sa_domainspan: | ||
7023 | free_cpumask_var(d->domainspan); /* fall through */ | ||
7024 | #endif | ||
7025 | case sa_none: | 7407 | case sa_none: |
7026 | break; | 7408 | break; |
7027 | } | 7409 | } |
@@ -7030,270 +7412,233 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | |||
7030 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, | 7412 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, |
7031 | const struct cpumask *cpu_map) | 7413 | const struct cpumask *cpu_map) |
7032 | { | 7414 | { |
7033 | #ifdef CONFIG_NUMA | 7415 | memset(d, 0, sizeof(*d)); |
7034 | if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) | 7416 | |
7035 | return sa_none; | 7417 | if (__sdt_alloc(cpu_map)) |
7036 | if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) | 7418 | return sa_sd_storage; |
7037 | return sa_domainspan; | 7419 | d->sd = alloc_percpu(struct sched_domain *); |
7038 | if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) | 7420 | if (!d->sd) |
7039 | return sa_covered; | 7421 | return sa_sd_storage; |
7040 | /* Allocate the per-node list of sched groups */ | ||
7041 | d->sched_group_nodes = kcalloc(nr_node_ids, | ||
7042 | sizeof(struct sched_group *), GFP_KERNEL); | ||
7043 | if (!d->sched_group_nodes) { | ||
7044 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | ||
7045 | return sa_notcovered; | ||
7046 | } | ||
7047 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; | ||
7048 | #endif | ||
7049 | if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) | ||
7050 | return sa_sched_group_nodes; | ||
7051 | if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) | ||
7052 | return sa_nodemask; | ||
7053 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) | ||
7054 | return sa_this_sibling_map; | ||
7055 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | ||
7056 | return sa_this_core_map; | ||
7057 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) | ||
7058 | return sa_send_covered; | ||
7059 | d->rd = alloc_rootdomain(); | 7422 | d->rd = alloc_rootdomain(); |
7060 | if (!d->rd) { | 7423 | if (!d->rd) |
7061 | printk(KERN_WARNING "Cannot alloc root domain\n"); | 7424 | return sa_sd; |
7062 | return sa_tmpmask; | ||
7063 | } | ||
7064 | return sa_rootdomain; | 7425 | return sa_rootdomain; |
7065 | } | 7426 | } |
7066 | 7427 | ||
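__visit_domain_allocation_hell() and __free_domain_allocs() above pair an enum of allocation stages with a fall-through switch, so a failure at any stage unwinds exactly what was already set up. A self-contained sketch of that rollback pattern, assuming made-up resources (ctx_alloc(), ctx_free() and the st_* stages are illustrative, not kernel names):

    #include <stdio.h>
    #include <stdlib.h>

    enum alloc_stage { st_none, st_bufs, st_table, st_all };

    struct ctx { int *bufs; int *table; };

    /* Roll back everything allocated up to and including 'stage';
     * each case intentionally falls through to the one below it,
     * mirroring __free_domain_allocs(). */
    static void ctx_free(struct ctx *c, enum alloc_stage stage)
    {
        switch (stage) {
        case st_all:
        case st_table:
            free(c->table);     /* fall through */
        case st_bufs:
            free(c->bufs);      /* fall through */
        case st_none:
            break;
        }
    }

    /* Returns the last stage successfully completed. */
    static enum alloc_stage ctx_alloc(struct ctx *c)
    {
        c->bufs = calloc(16, sizeof(int));
        if (!c->bufs)
            return st_none;
        c->table = calloc(16, sizeof(int));
        if (!c->table)
            return st_bufs;
        return st_all;
    }

    int main(void)
    {
        struct ctx c = { 0 };
        enum alloc_stage got = ctx_alloc(&c);
        if (got != st_all) {
            ctx_free(&c, got);          /* partial failure: unwind */
            return 1;
        }
        puts("all allocations succeeded");
        ctx_free(&c, st_all);           /* normal teardown */
        return 0;
    }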
7067 | static struct sched_domain *__build_numa_sched_domains(struct s_data *d, | 7428 | /* |
7068 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) | 7429 | * NULL the sd_data elements we've used to build the sched_domain and |
7430 | * sched_group structure so that the subsequent __free_domain_allocs() | ||
7431 | * will not free the data we're using. | ||
7432 | */ | ||
7433 | static void claim_allocations(int cpu, struct sched_domain *sd) | ||
7069 | { | 7434 | { |
7070 | struct sched_domain *sd = NULL; | 7435 | struct sd_data *sdd = sd->private; |
7071 | #ifdef CONFIG_NUMA | ||
7072 | struct sched_domain *parent; | ||
7073 | |||
7074 | d->sd_allnodes = 0; | ||
7075 | if (cpumask_weight(cpu_map) > | ||
7076 | SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { | ||
7077 | sd = &per_cpu(allnodes_domains, i).sd; | ||
7078 | SD_INIT(sd, ALLNODES); | ||
7079 | set_domain_attribute(sd, attr); | ||
7080 | cpumask_copy(sched_domain_span(sd), cpu_map); | ||
7081 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7082 | d->sd_allnodes = 1; | ||
7083 | } | ||
7084 | parent = sd; | ||
7085 | |||
7086 | sd = &per_cpu(node_domains, i).sd; | ||
7087 | SD_INIT(sd, NODE); | ||
7088 | set_domain_attribute(sd, attr); | ||
7089 | sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); | ||
7090 | sd->parent = parent; | ||
7091 | if (parent) | ||
7092 | parent->child = sd; | ||
7093 | cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); | ||
7094 | #endif | ||
7095 | return sd; | ||
7096 | } | ||
7097 | 7436 | ||
7098 | static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, | 7437 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); |
7099 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 7438 | *per_cpu_ptr(sdd->sd, cpu) = NULL; |
7100 | struct sched_domain *parent, int i) | ||
7101 | { | ||
7102 | struct sched_domain *sd; | ||
7103 | sd = &per_cpu(phys_domains, i).sd; | ||
7104 | SD_INIT(sd, CPU); | ||
7105 | set_domain_attribute(sd, attr); | ||
7106 | cpumask_copy(sched_domain_span(sd), d->nodemask); | ||
7107 | sd->parent = parent; | ||
7108 | if (parent) | ||
7109 | parent->child = sd; | ||
7110 | cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7111 | return sd; | ||
7112 | } | ||
7113 | 7439 | ||
7114 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, | 7440 | if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) |
7115 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 7441 | *per_cpu_ptr(sdd->sg, cpu) = NULL; |
7116 | struct sched_domain *parent, int i) | 7442 | |
7117 | { | 7443 | if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) |
7118 | struct sched_domain *sd = parent; | 7444 | *per_cpu_ptr(sdd->sgp, cpu) = NULL; |
7119 | #ifdef CONFIG_SCHED_MC | ||
7120 | sd = &per_cpu(core_domains, i).sd; | ||
7121 | SD_INIT(sd, MC); | ||
7122 | set_domain_attribute(sd, attr); | ||
7123 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); | ||
7124 | sd->parent = parent; | ||
7125 | parent->child = sd; | ||
7126 | cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7127 | #endif | ||
7128 | return sd; | ||
7129 | } | 7445 | } |
7130 | 7446 | ||
7131 | static struct sched_domain *__build_smt_sched_domain(struct s_data *d, | ||
7132 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
7133 | struct sched_domain *parent, int i) | ||
7134 | { | ||
7135 | struct sched_domain *sd = parent; | ||
7136 | #ifdef CONFIG_SCHED_SMT | 7447 | #ifdef CONFIG_SCHED_SMT |
7137 | sd = &per_cpu(cpu_domains, i).sd; | 7448 | static const struct cpumask *cpu_smt_mask(int cpu) |
7138 | SD_INIT(sd, SIBLING); | 7449 | { |
7139 | set_domain_attribute(sd, attr); | 7450 | return topology_thread_cpumask(cpu); |
7140 | cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); | ||
7141 | sd->parent = parent; | ||
7142 | parent->child = sd; | ||
7143 | cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7144 | #endif | ||
7145 | return sd; | ||
7146 | } | 7451 | } |
7452 | #endif | ||
7147 | 7453 | ||
7148 | static void build_sched_groups(struct s_data *d, enum sched_domain_level l, | 7454 | /* |
7149 | const struct cpumask *cpu_map, int cpu) | 7455 | * Topology list, bottom-up. |
7150 | { | 7456 | */ |
7151 | switch (l) { | 7457 | static struct sched_domain_topology_level default_topology[] = { |
7152 | #ifdef CONFIG_SCHED_SMT | 7458 | #ifdef CONFIG_SCHED_SMT |
7153 | case SD_LV_SIBLING: /* set up CPU (sibling) groups */ | 7459 | { sd_init_SIBLING, cpu_smt_mask, }, |
7154 | cpumask_and(d->this_sibling_map, cpu_map, | ||
7155 | topology_thread_cpumask(cpu)); | ||
7156 | if (cpu == cpumask_first(d->this_sibling_map)) | ||
7157 | init_sched_build_groups(d->this_sibling_map, cpu_map, | ||
7158 | &cpu_to_cpu_group, | ||
7159 | d->send_covered, d->tmpmask); | ||
7160 | break; | ||
7161 | #endif | 7460 | #endif |
7162 | #ifdef CONFIG_SCHED_MC | 7461 | #ifdef CONFIG_SCHED_MC |
7163 | case SD_LV_MC: /* set up multi-core groups */ | 7462 | { sd_init_MC, cpu_coregroup_mask, }, |
7164 | cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); | ||
7165 | if (cpu == cpumask_first(d->this_core_map)) | ||
7166 | init_sched_build_groups(d->this_core_map, cpu_map, | ||
7167 | &cpu_to_core_group, | ||
7168 | d->send_covered, d->tmpmask); | ||
7169 | break; | ||
7170 | #endif | 7463 | #endif |
7171 | case SD_LV_CPU: /* set up physical groups */ | 7464 | #ifdef CONFIG_SCHED_BOOK |
7172 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); | 7465 | { sd_init_BOOK, cpu_book_mask, }, |
7173 | if (!cpumask_empty(d->nodemask)) | 7466 | #endif |
7174 | init_sched_build_groups(d->nodemask, cpu_map, | 7467 | { sd_init_CPU, cpu_cpu_mask, }, |
7175 | &cpu_to_phys_group, | ||
7176 | d->send_covered, d->tmpmask); | ||
7177 | break; | ||
7178 | #ifdef CONFIG_NUMA | 7468 | #ifdef CONFIG_NUMA |
7179 | case SD_LV_ALLNODES: | 7469 | { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, |
7180 | init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, | 7470 | { sd_init_ALLNODES, cpu_allnodes_mask, }, |
7181 | d->send_covered, d->tmpmask); | ||
7182 | break; | ||
7183 | #endif | 7471 | #endif |
7184 | default: | 7472 | { NULL, }, |
7185 | break; | 7473 | }; |
7474 | |||
7475 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; | ||
7476 | |||
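default_topology[] is a bottom-up, NULL-terminated table pairing each level's constructor with its cpumask function, and every consumer walks it with for (tl = sched_domain_topology; tl->init; tl++). The following is a small userspace model of that sentinel-terminated table of callbacks; the level names and span numbers are illustrative only.

    #include <stdio.h>

    struct level {
        const char *(*name)(void);   /* stands in for tl->init */
        int span;                    /* stands in for tl->mask */
    };

    static const char *smt_name(void)  { return "SMT";  }
    static const char *mc_name(void)   { return "MC";   }
    static const char *cpu_name(void)  { return "CPU";  }

    /* Bottom-up and NULL-terminated, like default_topology[]. */
    static struct level topology[] = {
        { smt_name, 2 },
        { mc_name,  8 },
        { cpu_name, 64 },
        { NULL, },
    };

    int main(void)
    {
        /* Same loop shape as: for (tl = sched_domain_topology; tl->init; tl++) */
        for (struct level *tl = topology; tl->name; tl++)
            printf("level %s spans %d cpus\n", tl->name(), tl->span);
        return 0;
    }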
7477 | static int __sdt_alloc(const struct cpumask *cpu_map) | ||
7478 | { | ||
7479 | struct sched_domain_topology_level *tl; | ||
7480 | int j; | ||
7481 | |||
7482 | for (tl = sched_domain_topology; tl->init; tl++) { | ||
7483 | struct sd_data *sdd = &tl->data; | ||
7484 | |||
7485 | sdd->sd = alloc_percpu(struct sched_domain *); | ||
7486 | if (!sdd->sd) | ||
7487 | return -ENOMEM; | ||
7488 | |||
7489 | sdd->sg = alloc_percpu(struct sched_group *); | ||
7490 | if (!sdd->sg) | ||
7491 | return -ENOMEM; | ||
7492 | |||
7493 | sdd->sgp = alloc_percpu(struct sched_group_power *); | ||
7494 | if (!sdd->sgp) | ||
7495 | return -ENOMEM; | ||
7496 | |||
7497 | for_each_cpu(j, cpu_map) { | ||
7498 | struct sched_domain *sd; | ||
7499 | struct sched_group *sg; | ||
7500 | struct sched_group_power *sgp; | ||
7501 | |||
7502 | sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), | ||
7503 | GFP_KERNEL, cpu_to_node(j)); | ||
7504 | if (!sd) | ||
7505 | return -ENOMEM; | ||
7506 | |||
7507 | *per_cpu_ptr(sdd->sd, j) = sd; | ||
7508 | |||
7509 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
7510 | GFP_KERNEL, cpu_to_node(j)); | ||
7511 | if (!sg) | ||
7512 | return -ENOMEM; | ||
7513 | |||
7514 | *per_cpu_ptr(sdd->sg, j) = sg; | ||
7515 | |||
7516 | sgp = kzalloc_node(sizeof(struct sched_group_power), | ||
7517 | GFP_KERNEL, cpu_to_node(j)); | ||
7518 | if (!sgp) | ||
7519 | return -ENOMEM; | ||
7520 | |||
7521 | *per_cpu_ptr(sdd->sgp, j) = sgp; | ||
7522 | } | ||
7523 | } | ||
7524 | |||
7525 | return 0; | ||
7526 | } | ||
7527 | |||
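__sdt_alloc() above reserves, for every topology level, a per-cpu slot for the sched_domain, sched_group and sched_group_power objects, and __sdt_free() releases them again; a failed allocation just returns -ENOMEM and relies on the caller to unwind. A rough userspace stand-in for that allocate-one-object-per-cpu storage (NR_CPUS, struct sd_storage and the helpers are invented for the sketch):

    #include <stdio.h>
    #include <stdlib.h>

    #define NR_CPUS 4

    struct domain { int cpu; };

    /* Userspace stand-in for the per-cpu pointer storage that
     * __sdt_alloc() sets up with alloc_percpu()/kzalloc_node(). */
    struct sd_storage { struct domain *sd[NR_CPUS]; };

    static int storage_alloc(struct sd_storage *s)
    {
        for (int j = 0; j < NR_CPUS; j++) {
            s->sd[j] = calloc(1, sizeof(struct domain));
            if (!s->sd[j])
                return -1;          /* caller unwinds, like __sdt_free() */
            s->sd[j]->cpu = j;
        }
        return 0;
    }

    static void storage_free(struct sd_storage *s)
    {
        for (int j = 0; j < NR_CPUS; j++) {
            free(s->sd[j]);         /* free(NULL) is a no-op */
            s->sd[j] = NULL;
        }
    }

    int main(void)
    {
        struct sd_storage s = { { 0 } };
        if (storage_alloc(&s)) {
            storage_free(&s);
            return 1;
        }
        printf("allocated %d per-cpu domains\n", NR_CPUS);
        storage_free(&s);
        return 0;
    }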
7528 | static void __sdt_free(const struct cpumask *cpu_map) | ||
7529 | { | ||
7530 | struct sched_domain_topology_level *tl; | ||
7531 | int j; | ||
7532 | |||
7533 | for (tl = sched_domain_topology; tl->init; tl++) { | ||
7534 | struct sd_data *sdd = &tl->data; | ||
7535 | |||
7536 | for_each_cpu(j, cpu_map) { | ||
7537 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); | ||
7538 | if (sd && (sd->flags & SD_OVERLAP)) | ||
7539 | free_sched_groups(sd->groups, 0); | ||
7540 | kfree(*per_cpu_ptr(sdd->sg, j)); | ||
7541 | kfree(*per_cpu_ptr(sdd->sgp, j)); | ||
7542 | } | ||
7543 | free_percpu(sdd->sd); | ||
7544 | free_percpu(sdd->sg); | ||
7545 | free_percpu(sdd->sgp); | ||
7546 | } | ||
7547 | } | ||
7548 | |||
7549 | struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | ||
7550 | struct s_data *d, const struct cpumask *cpu_map, | ||
7551 | struct sched_domain_attr *attr, struct sched_domain *child, | ||
7552 | int cpu) | ||
7553 | { | ||
7554 | struct sched_domain *sd = tl->init(tl, cpu); | ||
7555 | if (!sd) | ||
7556 | return child; | ||
7557 | |||
7558 | set_domain_attribute(sd, attr); | ||
7559 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | ||
7560 | if (child) { | ||
7561 | sd->level = child->level + 1; | ||
7562 | sched_domain_level_max = max(sched_domain_level_max, sd->level); | ||
7563 | child->parent = sd; | ||
7186 | } | 7564 | } |
7565 | sd->child = child; | ||
7566 | |||
7567 | return sd; | ||
7187 | } | 7568 | } |
7188 | 7569 | ||
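build_sched_domain() above stacks each topology level on top of the previous one: the child's parent pointer is set, sd->child records the level below, and sd->level becomes child->level + 1; build_sched_domains() then descends back to the base level and later walks upward via sd->parent. A trimmed-down sketch of that bottom-up chaining (struct dom and push_level() are illustrative, not kernel code):

    #include <stdio.h>
    #include <stdlib.h>

    struct dom {
        int level;
        struct dom *parent;
        struct dom *child;
    };

    /* Stack a new domain on top of 'child', the way build_sched_domain()
     * does while walking the topology table bottom-up. */
    static struct dom *push_level(struct dom *child)
    {
        struct dom *sd = calloc(1, sizeof(*sd));
        if (!sd)
            exit(1);
        if (child) {
            sd->level = child->level + 1;
            child->parent = sd;
        }
        sd->child = child;
        return sd;
    }

    int main(void)
    {
        struct dom *sd = NULL;
        for (int i = 0; i < 3; i++)       /* e.g. SMT -> MC -> CPU */
            sd = push_level(sd);

        /* Descend back to the base level, as build_sched_domains() does
         * with "while (sd->child) sd = sd->child;". */
        while (sd->child)
            sd = sd->child;
        printf("base level = %d\n", sd->level);

        /* And walk upward, as the group-building loop does. */
        for (; sd; sd = sd->parent)
            printf("level %d\n", sd->level);
        return 0;
    }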
7189 | /* | 7570 | /* |
7190 | * Build sched domains for a given set of cpus and attach the sched domains | 7571 | * Build sched domains for a given set of cpus and attach the sched domains |
7191 | * to the individual cpus | 7572 | * to the individual cpus |
7192 | */ | 7573 | */ |
7193 | static int __build_sched_domains(const struct cpumask *cpu_map, | 7574 | static int build_sched_domains(const struct cpumask *cpu_map, |
7194 | struct sched_domain_attr *attr) | 7575 | struct sched_domain_attr *attr) |
7195 | { | 7576 | { |
7196 | enum s_alloc alloc_state = sa_none; | 7577 | enum s_alloc alloc_state = sa_none; |
7197 | struct s_data d; | ||
7198 | struct sched_domain *sd; | 7578 | struct sched_domain *sd; |
7199 | int i; | 7579 | struct s_data d; |
7200 | #ifdef CONFIG_NUMA | 7580 | int i, ret = -ENOMEM; |
7201 | d.sd_allnodes = 0; | ||
7202 | #endif | ||
7203 | 7581 | ||
7204 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); | 7582 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); |
7205 | if (alloc_state != sa_rootdomain) | 7583 | if (alloc_state != sa_rootdomain) |
7206 | goto error; | 7584 | goto error; |
7207 | alloc_state = sa_sched_groups; | ||
7208 | |||
7209 | /* | ||
7210 | * Set up domains for cpus specified by the cpu_map. | ||
7211 | */ | ||
7212 | for_each_cpu(i, cpu_map) { | ||
7213 | cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), | ||
7214 | cpu_map); | ||
7215 | |||
7216 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); | ||
7217 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); | ||
7218 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); | ||
7219 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); | ||
7220 | } | ||
7221 | 7585 | ||
7586 | /* Set up domains for cpus specified by the cpu_map. */ | ||
7222 | for_each_cpu(i, cpu_map) { | 7587 | for_each_cpu(i, cpu_map) { |
7223 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); | 7588 | struct sched_domain_topology_level *tl; |
7224 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); | 7589 | |
7225 | } | 7590 | sd = NULL; |
7226 | 7591 | for (tl = sched_domain_topology; tl->init; tl++) { | |
7227 | /* Set up physical groups */ | 7592 | sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); |
7228 | for (i = 0; i < nr_node_ids; i++) | 7593 | if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) |
7229 | build_sched_groups(&d, SD_LV_CPU, cpu_map, i); | 7594 | sd->flags |= SD_OVERLAP; |
7230 | 7595 | if (cpumask_equal(cpu_map, sched_domain_span(sd))) | |
7231 | #ifdef CONFIG_NUMA | 7596 | break; |
7232 | /* Set up node groups */ | 7597 | } |
7233 | if (d.sd_allnodes) | ||
7234 | build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); | ||
7235 | 7598 | ||
7236 | for (i = 0; i < nr_node_ids; i++) | 7599 | while (sd->child) |
7237 | if (build_numa_sched_groups(&d, cpu_map, i)) | 7600 | sd = sd->child; |
7238 | goto error; | ||
7239 | #endif | ||
7240 | 7601 | ||
7241 | /* Calculate CPU power for physical packages and nodes */ | 7602 | *per_cpu_ptr(d.sd, i) = sd; |
7242 | #ifdef CONFIG_SCHED_SMT | ||
7243 | for_each_cpu(i, cpu_map) { | ||
7244 | sd = &per_cpu(cpu_domains, i).sd; | ||
7245 | init_sched_groups_power(i, sd); | ||
7246 | } | ||
7247 | #endif | ||
7248 | #ifdef CONFIG_SCHED_MC | ||
7249 | for_each_cpu(i, cpu_map) { | ||
7250 | sd = &per_cpu(core_domains, i).sd; | ||
7251 | init_sched_groups_power(i, sd); | ||
7252 | } | 7603 | } |
7253 | #endif | ||
7254 | 7604 | ||
7605 | /* Build the groups for the domains */ | ||
7255 | for_each_cpu(i, cpu_map) { | 7606 | for_each_cpu(i, cpu_map) { |
7256 | sd = &per_cpu(phys_domains, i).sd; | 7607 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { |
7257 | init_sched_groups_power(i, sd); | 7608 | sd->span_weight = cpumask_weight(sched_domain_span(sd)); |
7609 | if (sd->flags & SD_OVERLAP) { | ||
7610 | if (build_overlap_sched_groups(sd, i)) | ||
7611 | goto error; | ||
7612 | } else { | ||
7613 | if (build_sched_groups(sd, i)) | ||
7614 | goto error; | ||
7615 | } | ||
7616 | } | ||
7258 | } | 7617 | } |
7259 | 7618 | ||
7260 | #ifdef CONFIG_NUMA | 7619 | /* Calculate CPU power for physical packages and nodes */ |
7261 | for (i = 0; i < nr_node_ids; i++) | 7620 | for (i = nr_cpumask_bits-1; i >= 0; i--) { |
7262 | init_numa_sched_groups_power(d.sched_group_nodes[i]); | 7621 | if (!cpumask_test_cpu(i, cpu_map)) |
7263 | 7622 | continue; | |
7264 | if (d.sd_allnodes) { | ||
7265 | struct sched_group *sg; | ||
7266 | 7623 | ||
7267 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, | 7624 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { |
7268 | d.tmpmask); | 7625 | claim_allocations(i, sd); |
7269 | init_numa_sched_groups_power(sg); | 7626 | init_sched_groups_power(i, sd); |
7627 | } | ||
7270 | } | 7628 | } |
7271 | #endif | ||
7272 | 7629 | ||
7273 | /* Attach the domains */ | 7630 | /* Attach the domains */ |
7631 | rcu_read_lock(); | ||
7274 | for_each_cpu(i, cpu_map) { | 7632 | for_each_cpu(i, cpu_map) { |
7275 | #ifdef CONFIG_SCHED_SMT | 7633 | sd = *per_cpu_ptr(d.sd, i); |
7276 | sd = &per_cpu(cpu_domains, i).sd; | ||
7277 | #elif defined(CONFIG_SCHED_MC) | ||
7278 | sd = &per_cpu(core_domains, i).sd; | ||
7279 | #else | ||
7280 | sd = &per_cpu(phys_domains, i).sd; | ||
7281 | #endif | ||
7282 | cpu_attach_domain(sd, d.rd, i); | 7634 | cpu_attach_domain(sd, d.rd, i); |
7283 | } | 7635 | } |
7636 | rcu_read_unlock(); | ||
7284 | 7637 | ||
7285 | d.sched_group_nodes = NULL; /* don't free this we still need it */ | 7638 | ret = 0; |
7286 | __free_domain_allocs(&d, sa_tmpmask, cpu_map); | ||
7287 | return 0; | ||
7288 | |||
7289 | error: | 7639 | error: |
7290 | __free_domain_allocs(&d, alloc_state, cpu_map); | 7640 | __free_domain_allocs(&d, alloc_state, cpu_map); |
7291 | return -ENOMEM; | 7641 | return ret; |
7292 | } | ||
7293 | |||
7294 | static int build_sched_domains(const struct cpumask *cpu_map) | ||
7295 | { | ||
7296 | return __build_sched_domains(cpu_map, NULL); | ||
7297 | } | 7642 | } |
7298 | 7643 | ||
7299 | static cpumask_var_t *doms_cur; /* current sched domains */ | 7644 | static cpumask_var_t *doms_cur; /* current sched domains */ |
@@ -7348,7 +7693,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) | |||
7348 | * For now this just excludes isolated cpus, but could be used to | 7693 | * For now this just excludes isolated cpus, but could be used to |
7349 | * exclude other special cases in the future. | 7694 | * exclude other special cases in the future. |
7350 | */ | 7695 | */ |
7351 | static int arch_init_sched_domains(const struct cpumask *cpu_map) | 7696 | static int init_sched_domains(const struct cpumask *cpu_map) |
7352 | { | 7697 | { |
7353 | int err; | 7698 | int err; |
7354 | 7699 | ||
@@ -7359,32 +7704,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map) | |||
7359 | doms_cur = &fallback_doms; | 7704 | doms_cur = &fallback_doms; |
7360 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); | 7705 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); |
7361 | dattr_cur = NULL; | 7706 | dattr_cur = NULL; |
7362 | err = build_sched_domains(doms_cur[0]); | 7707 | err = build_sched_domains(doms_cur[0], NULL); |
7363 | register_sched_domain_sysctl(); | 7708 | register_sched_domain_sysctl(); |
7364 | 7709 | ||
7365 | return err; | 7710 | return err; |
7366 | } | 7711 | } |
7367 | 7712 | ||
7368 | static void arch_destroy_sched_domains(const struct cpumask *cpu_map, | ||
7369 | struct cpumask *tmpmask) | ||
7370 | { | ||
7371 | free_sched_groups(cpu_map, tmpmask); | ||
7372 | } | ||
7373 | |||
7374 | /* | 7713 | /* |
7375 | * Detach sched domains from a group of cpus specified in cpu_map | 7714 | * Detach sched domains from a group of cpus specified in cpu_map |
7376 | * These cpus will now be attached to the NULL domain | 7715 | * These cpus will now be attached to the NULL domain |
7377 | */ | 7716 | */ |
7378 | static void detach_destroy_domains(const struct cpumask *cpu_map) | 7717 | static void detach_destroy_domains(const struct cpumask *cpu_map) |
7379 | { | 7718 | { |
7380 | /* Save because hotplug lock held. */ | ||
7381 | static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); | ||
7382 | int i; | 7719 | int i; |
7383 | 7720 | ||
7721 | rcu_read_lock(); | ||
7384 | for_each_cpu(i, cpu_map) | 7722 | for_each_cpu(i, cpu_map) |
7385 | cpu_attach_domain(NULL, &def_root_domain, i); | 7723 | cpu_attach_domain(NULL, &def_root_domain, i); |
7386 | synchronize_sched(); | 7724 | rcu_read_unlock(); |
7387 | arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); | ||
7388 | } | 7725 | } |
7389 | 7726 | ||
7390 | /* handle null as "default" */ | 7727 | /* handle null as "default" */ |
@@ -7473,8 +7810,7 @@ match1: | |||
7473 | goto match2; | 7810 | goto match2; |
7474 | } | 7811 | } |
7475 | /* no match - add a new doms_new */ | 7812 | /* no match - add a new doms_new */ |
7476 | __build_sched_domains(doms_new[i], | 7813 | build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); |
7477 | dattr_new ? dattr_new + i : NULL); | ||
7478 | match2: | 7814 | match2: |
7479 | ; | 7815 | ; |
7480 | } | 7816 | } |
@@ -7493,7 +7829,7 @@ match2: | |||
7493 | } | 7829 | } |
7494 | 7830 | ||
7495 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 7831 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
7496 | static void arch_reinit_sched_domains(void) | 7832 | static void reinit_sched_domains(void) |
7497 | { | 7833 | { |
7498 | get_online_cpus(); | 7834 | get_online_cpus(); |
7499 | 7835 | ||
@@ -7526,7 +7862,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | |||
7526 | else | 7862 | else |
7527 | sched_mc_power_savings = level; | 7863 | sched_mc_power_savings = level; |
7528 | 7864 | ||
7529 | arch_reinit_sched_domains(); | 7865 | reinit_sched_domains(); |
7530 | 7866 | ||
7531 | return count; | 7867 | return count; |
7532 | } | 7868 | } |
@@ -7645,14 +7981,9 @@ void __init sched_init_smp(void) | |||
7645 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); | 7981 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); |
7646 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); | 7982 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); |
7647 | 7983 | ||
7648 | #if defined(CONFIG_NUMA) | ||
7649 | sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), | ||
7650 | GFP_KERNEL); | ||
7651 | BUG_ON(sched_group_nodes_bycpu == NULL); | ||
7652 | #endif | ||
7653 | get_online_cpus(); | 7984 | get_online_cpus(); |
7654 | mutex_lock(&sched_domains_mutex); | 7985 | mutex_lock(&sched_domains_mutex); |
7655 | arch_init_sched_domains(cpu_active_mask); | 7986 | init_sched_domains(cpu_active_mask); |
7656 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); | 7987 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); |
7657 | if (cpumask_empty(non_isolated_cpus)) | 7988 | if (cpumask_empty(non_isolated_cpus)) |
7658 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); | 7989 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); |
@@ -7697,8 +8028,15 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | |||
7697 | INIT_LIST_HEAD(&cfs_rq->tasks); | 8028 | INIT_LIST_HEAD(&cfs_rq->tasks); |
7698 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8029 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7699 | cfs_rq->rq = rq; | 8030 | cfs_rq->rq = rq; |
8031 | /* allow initial update_cfs_load() to truncate */ | ||
8032 | #ifdef CONFIG_SMP | ||
8033 | cfs_rq->load_stamp = 1; | ||
8034 | #endif | ||
7700 | #endif | 8035 | #endif |
7701 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | 8036 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); |
8037 | #ifndef CONFIG_64BIT | ||
8038 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | ||
8039 | #endif | ||
7702 | } | 8040 | } |
7703 | 8041 | ||
7704 | static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | 8042 | static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) |
@@ -7739,18 +8077,16 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
7739 | 8077 | ||
7740 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8078 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7741 | static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | 8079 | static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, |
7742 | struct sched_entity *se, int cpu, int add, | 8080 | struct sched_entity *se, int cpu, |
7743 | struct sched_entity *parent) | 8081 | struct sched_entity *parent) |
7744 | { | 8082 | { |
7745 | struct rq *rq = cpu_rq(cpu); | 8083 | struct rq *rq = cpu_rq(cpu); |
7746 | tg->cfs_rq[cpu] = cfs_rq; | 8084 | tg->cfs_rq[cpu] = cfs_rq; |
7747 | init_cfs_rq(cfs_rq, rq); | 8085 | init_cfs_rq(cfs_rq, rq); |
7748 | cfs_rq->tg = tg; | 8086 | cfs_rq->tg = tg; |
7749 | if (add) | ||
7750 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
7751 | 8087 | ||
7752 | tg->se[cpu] = se; | 8088 | tg->se[cpu] = se; |
7753 | /* se could be NULL for init_task_group */ | 8089 | /* se could be NULL for root_task_group */ |
7754 | if (!se) | 8090 | if (!se) |
7755 | return; | 8091 | return; |
7756 | 8092 | ||
@@ -7760,15 +8096,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
7760 | se->cfs_rq = parent->my_q; | 8096 | se->cfs_rq = parent->my_q; |
7761 | 8097 | ||
7762 | se->my_q = cfs_rq; | 8098 | se->my_q = cfs_rq; |
7763 | se->load.weight = tg->shares; | 8099 | update_load_set(&se->load, 0); |
7764 | se->load.inv_weight = 0; | ||
7765 | se->parent = parent; | 8100 | se->parent = parent; |
7766 | } | 8101 | } |
7767 | #endif | 8102 | #endif |
7768 | 8103 | ||
7769 | #ifdef CONFIG_RT_GROUP_SCHED | 8104 | #ifdef CONFIG_RT_GROUP_SCHED |
7770 | static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | 8105 | static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, |
7771 | struct sched_rt_entity *rt_se, int cpu, int add, | 8106 | struct sched_rt_entity *rt_se, int cpu, |
7772 | struct sched_rt_entity *parent) | 8107 | struct sched_rt_entity *parent) |
7773 | { | 8108 | { |
7774 | struct rq *rq = cpu_rq(cpu); | 8109 | struct rq *rq = cpu_rq(cpu); |
@@ -7777,8 +8112,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | |||
7777 | init_rt_rq(rt_rq, rq); | 8112 | init_rt_rq(rt_rq, rq); |
7778 | rt_rq->tg = tg; | 8113 | rt_rq->tg = tg; |
7779 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | 8114 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; |
7780 | if (add) | ||
7781 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | ||
7782 | 8115 | ||
7783 | tg->rt_se[cpu] = rt_se; | 8116 | tg->rt_se[cpu] = rt_se; |
7784 | if (!rt_se) | 8117 | if (!rt_se) |
@@ -7813,18 +8146,18 @@ void __init sched_init(void) | |||
7813 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); | 8146 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); |
7814 | 8147 | ||
7815 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8148 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7816 | init_task_group.se = (struct sched_entity **)ptr; | 8149 | root_task_group.se = (struct sched_entity **)ptr; |
7817 | ptr += nr_cpu_ids * sizeof(void **); | 8150 | ptr += nr_cpu_ids * sizeof(void **); |
7818 | 8151 | ||
7819 | init_task_group.cfs_rq = (struct cfs_rq **)ptr; | 8152 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; |
7820 | ptr += nr_cpu_ids * sizeof(void **); | 8153 | ptr += nr_cpu_ids * sizeof(void **); |
7821 | 8154 | ||
7822 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 8155 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
7823 | #ifdef CONFIG_RT_GROUP_SCHED | 8156 | #ifdef CONFIG_RT_GROUP_SCHED |
7824 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; | 8157 | root_task_group.rt_se = (struct sched_rt_entity **)ptr; |
7825 | ptr += nr_cpu_ids * sizeof(void **); | 8158 | ptr += nr_cpu_ids * sizeof(void **); |
7826 | 8159 | ||
7827 | init_task_group.rt_rq = (struct rt_rq **)ptr; | 8160 | root_task_group.rt_rq = (struct rt_rq **)ptr; |
7828 | ptr += nr_cpu_ids * sizeof(void **); | 8161 | ptr += nr_cpu_ids * sizeof(void **); |
7829 | 8162 | ||
7830 | #endif /* CONFIG_RT_GROUP_SCHED */ | 8163 | #endif /* CONFIG_RT_GROUP_SCHED */ |
@@ -7844,20 +8177,16 @@ void __init sched_init(void) | |||
7844 | global_rt_period(), global_rt_runtime()); | 8177 | global_rt_period(), global_rt_runtime()); |
7845 | 8178 | ||
7846 | #ifdef CONFIG_RT_GROUP_SCHED | 8179 | #ifdef CONFIG_RT_GROUP_SCHED |
7847 | init_rt_bandwidth(&init_task_group.rt_bandwidth, | 8180 | init_rt_bandwidth(&root_task_group.rt_bandwidth, |
7848 | global_rt_period(), global_rt_runtime()); | 8181 | global_rt_period(), global_rt_runtime()); |
7849 | #endif /* CONFIG_RT_GROUP_SCHED */ | 8182 | #endif /* CONFIG_RT_GROUP_SCHED */ |
7850 | 8183 | ||
7851 | #ifdef CONFIG_CGROUP_SCHED | 8184 | #ifdef CONFIG_CGROUP_SCHED |
7852 | list_add(&init_task_group.list, &task_groups); | 8185 | list_add(&root_task_group.list, &task_groups); |
7853 | INIT_LIST_HEAD(&init_task_group.children); | 8186 | INIT_LIST_HEAD(&root_task_group.children); |
7854 | 8187 | autogroup_init(&init_task); | |
7855 | #endif /* CONFIG_CGROUP_SCHED */ | 8188 | #endif /* CONFIG_CGROUP_SCHED */ |
7856 | 8189 | ||
7857 | #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP | ||
7858 | update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), | ||
7859 | __alignof__(unsigned long)); | ||
7860 | #endif | ||
7861 | for_each_possible_cpu(i) { | 8190 | for_each_possible_cpu(i) { |
7862 | struct rq *rq; | 8191 | struct rq *rq; |
7863 | 8192 | ||
@@ -7869,38 +8198,34 @@ void __init sched_init(void) | |||
7869 | init_cfs_rq(&rq->cfs, rq); | 8198 | init_cfs_rq(&rq->cfs, rq); |
7870 | init_rt_rq(&rq->rt, rq); | 8199 | init_rt_rq(&rq->rt, rq); |
7871 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8200 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7872 | init_task_group.shares = init_task_group_load; | 8201 | root_task_group.shares = root_task_group_load; |
7873 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 8202 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
7874 | #ifdef CONFIG_CGROUP_SCHED | ||
7875 | /* | 8203 | /* |
7876 | * How much cpu bandwidth does init_task_group get? | 8204 | * How much cpu bandwidth does root_task_group get? |
7877 | * | 8205 | * |
7878 | * In case of task-groups formed thr' the cgroup filesystem, it | 8206 | * In case of task-groups formed thr' the cgroup filesystem, it |
7879 | * gets 100% of the cpu resources in the system. This overall | 8207 | * gets 100% of the cpu resources in the system. This overall |
7880 | * system cpu resource is divided among the tasks of | 8208 | * system cpu resource is divided among the tasks of |
7881 | * init_task_group and its child task-groups in a fair manner, | 8209 | * root_task_group and its child task-groups in a fair manner, |
7882 | * based on each entity's (task or task-group's) weight | 8210 | * based on each entity's (task or task-group's) weight |
7883 | * (se->load.weight). | 8211 | * (se->load.weight). |
7884 | * | 8212 | * |
7885 | * In other words, if init_task_group has 10 tasks of weight | 8213 | * In other words, if root_task_group has 10 tasks of weight |
7886 | * 1024) and two child groups A0 and A1 (of weight 1024 each), | 8214 | * 1024) and two child groups A0 and A1 (of weight 1024 each), |
7887 | * then A0's share of the cpu resource is: | 8215 | * then A0's share of the cpu resource is: |
7888 | * | 8216 | * |
7889 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% | 8217 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% |
7890 | * | 8218 | * |
7891 | * We achieve this by letting init_task_group's tasks sit | 8219 | * We achieve this by letting root_task_group's tasks sit |
7892 | * directly in rq->cfs (i.e init_task_group->se[] = NULL). | 8220 | * directly in rq->cfs (i.e root_task_group->se[] = NULL). |
7893 | */ | 8221 | */ |
7894 | init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); | 8222 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); |
7895 | #endif | ||
7896 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 8223 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
7897 | 8224 | ||
7898 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; | 8225 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; |
7899 | #ifdef CONFIG_RT_GROUP_SCHED | 8226 | #ifdef CONFIG_RT_GROUP_SCHED |
7900 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | 8227 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); |
7901 | #ifdef CONFIG_CGROUP_SCHED | 8228 | init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); |
7902 | init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); | ||
7903 | #endif | ||
7904 | #endif | 8229 | #endif |
7905 | 8230 | ||
7906 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 8231 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
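The comment above init_tg_cfs_entry() works the bandwidth split out by hand: with ten weight-1024 tasks in root_task_group plus child groups A0 and A1 of weight 1024 each, A0 receives 1024 / (10*1024 + 1024 + 1024), about 8.33% of the CPU. A few lines to reproduce that arithmetic, using the example weights from the comment (nothing here is read from the kernel):

    #include <stdio.h>

    int main(void)
    {
        double w_a0  = 1024.0;                         /* group A0's weight */
        double total = 10 * 1024.0 + 1024.0 + 1024.0;  /* 10 root tasks + A0 + A1 */
        printf("A0 bandwidth = %.2f%%\n", 100.0 * w_a0 / total);  /* prints 8.33% */
        return 0;
    }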
@@ -7911,7 +8236,7 @@ void __init sched_init(void) | |||
7911 | #ifdef CONFIG_SMP | 8236 | #ifdef CONFIG_SMP |
7912 | rq->sd = NULL; | 8237 | rq->sd = NULL; |
7913 | rq->rd = NULL; | 8238 | rq->rd = NULL; |
7914 | rq->cpu_power = SCHED_LOAD_SCALE; | 8239 | rq->cpu_power = SCHED_POWER_SCALE; |
7915 | rq->post_schedule = 0; | 8240 | rq->post_schedule = 0; |
7916 | rq->active_balance = 0; | 8241 | rq->active_balance = 0; |
7917 | rq->next_balance = jiffies; | 8242 | rq->next_balance = jiffies; |
@@ -7968,6 +8293,7 @@ void __init sched_init(void) | |||
7968 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ | 8293 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ |
7969 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); | 8294 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); |
7970 | #ifdef CONFIG_SMP | 8295 | #ifdef CONFIG_SMP |
8296 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); | ||
7971 | #ifdef CONFIG_NO_HZ | 8297 | #ifdef CONFIG_NO_HZ |
7972 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); | 8298 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); |
7973 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); | 8299 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); |
@@ -7980,8 +8306,6 @@ void __init sched_init(void) | |||
7980 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 8306 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
7981 | #endif /* SMP */ | 8307 | #endif /* SMP */ |
7982 | 8308 | ||
7983 | perf_event_init(); | ||
7984 | |||
7985 | scheduler_running = 1; | 8309 | scheduler_running = 1; |
7986 | } | 8310 | } |
7987 | 8311 | ||
@@ -7990,7 +8314,7 @@ static inline int preempt_count_equals(int preempt_offset) | |||
7990 | { | 8314 | { |
7991 | int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); | 8315 | int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); |
7992 | 8316 | ||
7993 | return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); | 8317 | return (nested == preempt_offset); |
7994 | } | 8318 | } |
7995 | 8319 | ||
7996 | void __might_sleep(const char *file, int line, int preempt_offset) | 8320 | void __might_sleep(const char *file, int line, int preempt_offset) |
@@ -8025,9 +8349,11 @@ EXPORT_SYMBOL(__might_sleep); | |||
8025 | #ifdef CONFIG_MAGIC_SYSRQ | 8349 | #ifdef CONFIG_MAGIC_SYSRQ |
8026 | static void normalize_task(struct rq *rq, struct task_struct *p) | 8350 | static void normalize_task(struct rq *rq, struct task_struct *p) |
8027 | { | 8351 | { |
8352 | const struct sched_class *prev_class = p->sched_class; | ||
8353 | int old_prio = p->prio; | ||
8028 | int on_rq; | 8354 | int on_rq; |
8029 | 8355 | ||
8030 | on_rq = p->se.on_rq; | 8356 | on_rq = p->on_rq; |
8031 | if (on_rq) | 8357 | if (on_rq) |
8032 | deactivate_task(rq, p, 0); | 8358 | deactivate_task(rq, p, 0); |
8033 | __setscheduler(rq, p, SCHED_NORMAL, 0); | 8359 | __setscheduler(rq, p, SCHED_NORMAL, 0); |
@@ -8035,6 +8361,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p) | |||
8035 | activate_task(rq, p, 0); | 8361 | activate_task(rq, p, 0); |
8036 | resched_task(rq->curr); | 8362 | resched_task(rq->curr); |
8037 | } | 8363 | } |
8364 | |||
8365 | check_class_changed(rq, p, prev_class, old_prio); | ||
8038 | } | 8366 | } |
8039 | 8367 | ||
8040 | void normalize_rt_tasks(void) | 8368 | void normalize_rt_tasks(void) |
@@ -8150,7 +8478,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8150 | { | 8478 | { |
8151 | struct cfs_rq *cfs_rq; | 8479 | struct cfs_rq *cfs_rq; |
8152 | struct sched_entity *se; | 8480 | struct sched_entity *se; |
8153 | struct rq *rq; | ||
8154 | int i; | 8481 | int i; |
8155 | 8482 | ||
8156 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); | 8483 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); |
@@ -8163,8 +8490,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8163 | tg->shares = NICE_0_LOAD; | 8490 | tg->shares = NICE_0_LOAD; |
8164 | 8491 | ||
8165 | for_each_possible_cpu(i) { | 8492 | for_each_possible_cpu(i) { |
8166 | rq = cpu_rq(i); | ||
8167 | |||
8168 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), | 8493 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), |
8169 | GFP_KERNEL, cpu_to_node(i)); | 8494 | GFP_KERNEL, cpu_to_node(i)); |
8170 | if (!cfs_rq) | 8495 | if (!cfs_rq) |
@@ -8175,26 +8500,32 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8175 | if (!se) | 8500 | if (!se) |
8176 | goto err_free_rq; | 8501 | goto err_free_rq; |
8177 | 8502 | ||
8178 | init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); | 8503 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); |
8179 | } | 8504 | } |
8180 | 8505 | ||
8181 | return 1; | 8506 | return 1; |
8182 | 8507 | ||
8183 | err_free_rq: | 8508 | err_free_rq: |
8184 | kfree(cfs_rq); | 8509 | kfree(cfs_rq); |
8185 | err: | 8510 | err: |
8186 | return 0; | 8511 | return 0; |
8187 | } | 8512 | } |
8188 | 8513 | ||
8189 | static inline void register_fair_sched_group(struct task_group *tg, int cpu) | ||
8190 | { | ||
8191 | list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, | ||
8192 | &cpu_rq(cpu)->leaf_cfs_rq_list); | ||
8193 | } | ||
8194 | |||
8195 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | 8514 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) |
8196 | { | 8515 | { |
8197 | list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); | 8516 | struct rq *rq = cpu_rq(cpu); |
8517 | unsigned long flags; | ||
8518 | |||
8519 | /* | ||
8520 | * Only empty task groups can be destroyed; so we can speculatively | ||
8521 | * check on_list without danger of it being re-added. | ||
8522 | */ | ||
8523 | if (!tg->cfs_rq[cpu]->on_list) | ||
8524 | return; | ||
8525 | |||
8526 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8527 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); | ||
8528 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8198 | } | 8529 | } |
8199 | #else /* !CONFG_FAIR_GROUP_SCHED */ | 8530 | #else /* !CONFG_FAIR_GROUP_SCHED */ |
8200 | static inline void free_fair_sched_group(struct task_group *tg) | 8531 | static inline void free_fair_sched_group(struct task_group *tg) |
@@ -8207,10 +8538,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8207 | return 1; | 8538 | return 1; |
8208 | } | 8539 | } |
8209 | 8540 | ||
8210 | static inline void register_fair_sched_group(struct task_group *tg, int cpu) | ||
8211 | { | ||
8212 | } | ||
8213 | |||
8214 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | 8541 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) |
8215 | { | 8542 | { |
8216 | } | 8543 | } |
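The reworked unregister_fair_sched_group() above tests on_list outside the lock, which is safe only because an empty task group can no longer be re-added concurrently, and then takes rq->lock for the actual list removal. A userspace analogue of that speculative-check-then-lock removal, with a pthread mutex standing in for the runqueue spinlock (struct node and the helpers are invented for the sketch):

    #include <pthread.h>
    #include <stdio.h>

    struct node { int on_list; struct node *prev, *next; };

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct node head = { 0, &head, &head };

    static void list_add_locked(struct node *n)
    {
        pthread_mutex_lock(&list_lock);
        n->next = head.next;
        n->prev = &head;
        head.next->prev = n;
        head.next = n;
        n->on_list = 1;
        pthread_mutex_unlock(&list_lock);
    }

    /* Mirror of unregister_fair_sched_group(): the on_list test happens
     * outside the lock, which is only safe because the caller guarantees
     * the node cannot be re-added behind our back. */
    static void list_del_if_needed(struct node *n)
    {
        if (!n->on_list)
            return;                       /* speculative check, no lock */

        pthread_mutex_lock(&list_lock);
        n->prev->next = n->next;
        n->next->prev = n->prev;
        n->on_list = 0;
        pthread_mutex_unlock(&list_lock);
    }

    int main(void)
    {
        struct node n = { 0, NULL, NULL };
        list_add_locked(&n);
        list_del_if_needed(&n);
        printf("on_list = %d\n", n.on_list);
        return 0;
    }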
@@ -8239,7 +8566,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8239 | { | 8566 | { |
8240 | struct rt_rq *rt_rq; | 8567 | struct rt_rq *rt_rq; |
8241 | struct sched_rt_entity *rt_se; | 8568 | struct sched_rt_entity *rt_se; |
8242 | struct rq *rq; | ||
8243 | int i; | 8569 | int i; |
8244 | 8570 | ||
8245 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); | 8571 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); |
@@ -8253,8 +8579,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8253 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); | 8579 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); |
8254 | 8580 | ||
8255 | for_each_possible_cpu(i) { | 8581 | for_each_possible_cpu(i) { |
8256 | rq = cpu_rq(i); | ||
8257 | |||
8258 | rt_rq = kzalloc_node(sizeof(struct rt_rq), | 8582 | rt_rq = kzalloc_node(sizeof(struct rt_rq), |
8259 | GFP_KERNEL, cpu_to_node(i)); | 8583 | GFP_KERNEL, cpu_to_node(i)); |
8260 | if (!rt_rq) | 8584 | if (!rt_rq) |
@@ -8265,27 +8589,16 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8265 | if (!rt_se) | 8589 | if (!rt_se) |
8266 | goto err_free_rq; | 8590 | goto err_free_rq; |
8267 | 8591 | ||
8268 | init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); | 8592 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); |
8269 | } | 8593 | } |
8270 | 8594 | ||
8271 | return 1; | 8595 | return 1; |
8272 | 8596 | ||
8273 | err_free_rq: | 8597 | err_free_rq: |
8274 | kfree(rt_rq); | 8598 | kfree(rt_rq); |
8275 | err: | 8599 | err: |
8276 | return 0; | 8600 | return 0; |
8277 | } | 8601 | } |
8278 | |||
8279 | static inline void register_rt_sched_group(struct task_group *tg, int cpu) | ||
8280 | { | ||
8281 | list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, | ||
8282 | &cpu_rq(cpu)->leaf_rt_rq_list); | ||
8283 | } | ||
8284 | |||
8285 | static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | ||
8286 | { | ||
8287 | list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); | ||
8288 | } | ||
8289 | #else /* !CONFIG_RT_GROUP_SCHED */ | 8602 | #else /* !CONFIG_RT_GROUP_SCHED */ |
8290 | static inline void free_rt_sched_group(struct task_group *tg) | 8603 | static inline void free_rt_sched_group(struct task_group *tg) |
8291 | { | 8604 | { |
@@ -8296,14 +8609,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8296 | { | 8609 | { |
8297 | return 1; | 8610 | return 1; |
8298 | } | 8611 | } |
8299 | |||
8300 | static inline void register_rt_sched_group(struct task_group *tg, int cpu) | ||
8301 | { | ||
8302 | } | ||
8303 | |||
8304 | static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | ||
8305 | { | ||
8306 | } | ||
8307 | #endif /* CONFIG_RT_GROUP_SCHED */ | 8612 | #endif /* CONFIG_RT_GROUP_SCHED */ |
8308 | 8613 | ||
8309 | #ifdef CONFIG_CGROUP_SCHED | 8614 | #ifdef CONFIG_CGROUP_SCHED |
@@ -8311,6 +8616,7 @@ static void free_sched_group(struct task_group *tg) | |||
8311 | { | 8616 | { |
8312 | free_fair_sched_group(tg); | 8617 | free_fair_sched_group(tg); |
8313 | free_rt_sched_group(tg); | 8618 | free_rt_sched_group(tg); |
8619 | autogroup_free(tg); | ||
8314 | kfree(tg); | 8620 | kfree(tg); |
8315 | } | 8621 | } |
8316 | 8622 | ||
@@ -8319,7 +8625,6 @@ struct task_group *sched_create_group(struct task_group *parent) | |||
8319 | { | 8625 | { |
8320 | struct task_group *tg; | 8626 | struct task_group *tg; |
8321 | unsigned long flags; | 8627 | unsigned long flags; |
8322 | int i; | ||
8323 | 8628 | ||
8324 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); | 8629 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); |
8325 | if (!tg) | 8630 | if (!tg) |
@@ -8332,10 +8637,6 @@ struct task_group *sched_create_group(struct task_group *parent) | |||
8332 | goto err; | 8637 | goto err; |
8333 | 8638 | ||
8334 | spin_lock_irqsave(&task_group_lock, flags); | 8639 | spin_lock_irqsave(&task_group_lock, flags); |
8335 | for_each_possible_cpu(i) { | ||
8336 | register_fair_sched_group(tg, i); | ||
8337 | register_rt_sched_group(tg, i); | ||
8338 | } | ||
8339 | list_add_rcu(&tg->list, &task_groups); | 8640 | list_add_rcu(&tg->list, &task_groups); |
8340 | 8641 | ||
8341 | WARN_ON(!parent); /* root should already exist */ | 8642 | WARN_ON(!parent); /* root should already exist */ |
@@ -8365,11 +8666,11 @@ void sched_destroy_group(struct task_group *tg) | |||
8365 | unsigned long flags; | 8666 | unsigned long flags; |
8366 | int i; | 8667 | int i; |
8367 | 8668 | ||
8368 | spin_lock_irqsave(&task_group_lock, flags); | 8669 | /* end participation in shares distribution */ |
8369 | for_each_possible_cpu(i) { | 8670 | for_each_possible_cpu(i) |
8370 | unregister_fair_sched_group(tg, i); | 8671 | unregister_fair_sched_group(tg, i); |
8371 | unregister_rt_sched_group(tg, i); | 8672 | |
8372 | } | 8673 | spin_lock_irqsave(&task_group_lock, flags); |
8373 | list_del_rcu(&tg->list); | 8674 | list_del_rcu(&tg->list); |
8374 | list_del_rcu(&tg->siblings); | 8675 | list_del_rcu(&tg->siblings); |
8375 | spin_unlock_irqrestore(&task_group_lock, flags); | 8676 | spin_unlock_irqrestore(&task_group_lock, flags); |
@@ -8392,57 +8693,30 @@ void sched_move_task(struct task_struct *tsk) | |||
8392 | rq = task_rq_lock(tsk, &flags); | 8693 | rq = task_rq_lock(tsk, &flags); |
8393 | 8694 | ||
8394 | running = task_current(rq, tsk); | 8695 | running = task_current(rq, tsk); |
8395 | on_rq = tsk->se.on_rq; | 8696 | on_rq = tsk->on_rq; |
8396 | 8697 | ||
8397 | if (on_rq) | 8698 | if (on_rq) |
8398 | dequeue_task(rq, tsk, 0); | 8699 | dequeue_task(rq, tsk, 0); |
8399 | if (unlikely(running)) | 8700 | if (unlikely(running)) |
8400 | tsk->sched_class->put_prev_task(rq, tsk); | 8701 | tsk->sched_class->put_prev_task(rq, tsk); |
8401 | 8702 | ||
8402 | set_task_rq(tsk, task_cpu(tsk)); | ||
8403 | |||
8404 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8703 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8405 | if (tsk->sched_class->moved_group) | 8704 | if (tsk->sched_class->task_move_group) |
8406 | tsk->sched_class->moved_group(tsk, on_rq); | 8705 | tsk->sched_class->task_move_group(tsk, on_rq); |
8706 | else | ||
8407 | #endif | 8707 | #endif |
8708 | set_task_rq(tsk, task_cpu(tsk)); | ||
8408 | 8709 | ||
8409 | if (unlikely(running)) | 8710 | if (unlikely(running)) |
8410 | tsk->sched_class->set_curr_task(rq); | 8711 | tsk->sched_class->set_curr_task(rq); |
8411 | if (on_rq) | 8712 | if (on_rq) |
8412 | enqueue_task(rq, tsk, 0); | 8713 | enqueue_task(rq, tsk, 0); |
8413 | 8714 | ||
8414 | task_rq_unlock(rq, &flags); | 8715 | task_rq_unlock(rq, tsk, &flags); |
8415 | } | 8716 | } |
8416 | #endif /* CONFIG_CGROUP_SCHED */ | 8717 | #endif /* CONFIG_CGROUP_SCHED */ |
8417 | 8718 | ||
8418 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8719 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8419 | static void __set_se_shares(struct sched_entity *se, unsigned long shares) | ||
8420 | { | ||
8421 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
8422 | int on_rq; | ||
8423 | |||
8424 | on_rq = se->on_rq; | ||
8425 | if (on_rq) | ||
8426 | dequeue_entity(cfs_rq, se, 0); | ||
8427 | |||
8428 | se->load.weight = shares; | ||
8429 | se->load.inv_weight = 0; | ||
8430 | |||
8431 | if (on_rq) | ||
8432 | enqueue_entity(cfs_rq, se, 0); | ||
8433 | } | ||
8434 | |||
8435 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | ||
8436 | { | ||
8437 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
8438 | struct rq *rq = cfs_rq->rq; | ||
8439 | unsigned long flags; | ||
8440 | |||
8441 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8442 | __set_se_shares(se, shares); | ||
8443 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8444 | } | ||
8445 | |||
8446 | static DEFINE_MUTEX(shares_mutex); | 8720 | static DEFINE_MUTEX(shares_mutex); |
8447 | 8721 | ||
8448 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | 8722 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) |
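Note on the sched_move_task() hunk above: the rewrite drops the unconditional set_task_rq() call and instead invokes the scheduling class's task_move_group hook when one exists, falling back to set_task_rq() otherwise. The ordering visible in the diff (dequeue if queued, put_prev if running, regroup, set_curr if running, re-enqueue) is what keeps the task's runqueue state consistent across the move. The standalone C sketch below only models that ordering with stubbed hooks; the struct and function names are illustrative, not the kernel's.

#include <stdio.h>
#include <stdbool.h>

/* Minimal stand-ins for the kernel structures (illustrative only). */
struct task {
	const char *name;
	bool on_rq;    /* queued on a runqueue? */
	bool running;  /* currently executing?  */
	int group;     /* current task group id */
};

struct sched_class_model {
	/* Optional hook, shaped like ->task_move_group in the patched code. */
	void (*task_move_group)(struct task *t, bool on_rq, int new_group);
};

static void default_set_group(struct task *t, int new_group)
{
	t->group = new_group;                 /* models the set_task_rq() fallback */
}

static void fair_task_move_group(struct task *t, bool on_rq, int new_group)
{
	/* A class-specific hook can fix up per-group accounting here. */
	t->group = new_group;
	printf("  fair hook moved %s (on_rq=%d)\n", t->name, on_rq);
}

static void move_task(struct task *t, struct sched_class_model *cls, int new_group)
{
	bool on_rq = t->on_rq, running = t->running;

	if (on_rq)
		printf("  dequeue %s\n", t->name);   /* dequeue_task() */
	if (running)
		printf("  put_prev %s\n", t->name);  /* put_prev_task() */

	if (cls->task_move_group)
		cls->task_move_group(t, on_rq, new_group);
	else
		default_set_group(t, new_group);

	if (running)
		printf("  set_curr %s\n", t->name);  /* set_curr_task() */
	if (on_rq)
		printf("  enqueue %s\n", t->name);   /* enqueue_task() */
}

int main(void)
{
	struct task t = { "worker", true, true, 0 };
	struct sched_class_model fair = { fair_task_move_group };

	move_task(&t, &fair, 1);
	printf("task %s now in group %d\n", t.name, t.group);
	return 0;
}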
@@ -8456,46 +8730,25 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
8456 | if (!tg->se[0]) | 8730 | if (!tg->se[0]) |
8457 | return -EINVAL; | 8731 | return -EINVAL; |
8458 | 8732 | ||
8459 | if (shares < MIN_SHARES) | 8733 | shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); |
8460 | shares = MIN_SHARES; | ||
8461 | else if (shares > MAX_SHARES) | ||
8462 | shares = MAX_SHARES; | ||
8463 | 8734 | ||
8464 | mutex_lock(&shares_mutex); | 8735 | mutex_lock(&shares_mutex); |
8465 | if (tg->shares == shares) | 8736 | if (tg->shares == shares) |
8466 | goto done; | 8737 | goto done; |
8467 | 8738 | ||
8468 | spin_lock_irqsave(&task_group_lock, flags); | ||
8469 | for_each_possible_cpu(i) | ||
8470 | unregister_fair_sched_group(tg, i); | ||
8471 | list_del_rcu(&tg->siblings); | ||
8472 | spin_unlock_irqrestore(&task_group_lock, flags); | ||
8473 | |||
8474 | /* wait for any ongoing reference to this group to finish */ | ||
8475 | synchronize_sched(); | ||
8476 | |||
8477 | /* | ||
8478 | * Now we are free to modify the group's share on each cpu | ||
8479 | * w/o tripping rebalance_share or load_balance_fair. | ||
8480 | */ | ||
8481 | tg->shares = shares; | 8739 | tg->shares = shares; |
8482 | for_each_possible_cpu(i) { | 8740 | for_each_possible_cpu(i) { |
8483 | /* | 8741 | struct rq *rq = cpu_rq(i); |
8484 | * force a rebalance | 8742 | struct sched_entity *se; |
8485 | */ | 8743 | |
8486 | cfs_rq_set_shares(tg->cfs_rq[i], 0); | 8744 | se = tg->se[i]; |
8487 | set_se_shares(tg->se[i], shares); | 8745 | /* Propagate contribution to hierarchy */ |
8746 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8747 | for_each_sched_entity(se) | ||
8748 | update_cfs_shares(group_cfs_rq(se)); | ||
8749 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8488 | } | 8750 | } |
8489 | 8751 | ||
8490 | /* | ||
8491 | * Enable load balance activity on this group, by inserting it back on | ||
8492 | * each cpu's rq->leaf_cfs_rq_list. | ||
8493 | */ | ||
8494 | spin_lock_irqsave(&task_group_lock, flags); | ||
8495 | for_each_possible_cpu(i) | ||
8496 | register_fair_sched_group(tg, i); | ||
8497 | list_add_rcu(&tg->siblings, &tg->parent->children); | ||
8498 | spin_unlock_irqrestore(&task_group_lock, flags); | ||
8499 | done: | 8752 | done: |
8500 | mutex_unlock(&shares_mutex); | 8753 | mutex_unlock(&shares_mutex); |
8501 | return 0; | 8754 | return 0; |
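The sched_group_set_shares() hunk above replaces the open-coded MIN_SHARES/MAX_SHARES bounds check with a single clamp() on scale_load()-adjusted values, and updates each cpu's entity hierarchy under rq->lock instead of unregistering, waiting for a grace period, and re-registering the group. The userspace snippet below models only the clamping step; the resolution shift and the limit values are assumptions for illustration, not the kernel's definitions.

#include <stdio.h>

/* Illustrative limits and resolution; the kernel's values may differ. */
#define MIN_SHARES        2UL
#define MAX_SHARES        (1UL << 18)
#define RESOLUTION_SHIFT  10            /* assumed scale factor */

static unsigned long scale_load(unsigned long w)
{
	return w << RESOLUTION_SHIFT;        /* models scale_load() */
}

static unsigned long scale_load_down(unsigned long w)
{
	return w >> RESOLUTION_SHIFT;        /* models scale_load_down() */
}

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	unsigned long requests[] = { 0, 1024, 1UL << 20 };

	for (int i = 0; i < 3; i++) {
		/* cpu.shares is scaled up on write, clamped, and scaled back
		 * down when read, as in the cpu_shares_*_u64 hunks below. */
		unsigned long shares = clamp_ul(scale_load(requests[i]),
						scale_load(MIN_SHARES),
						scale_load(MAX_SHARES));
		printf("request %lu -> internal %lu (user view %lu)\n",
		       requests[i], shares, scale_load_down(shares));
	}
	return 0;
}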
@@ -8630,7 +8883,7 @@ static int tg_set_bandwidth(struct task_group *tg, | |||
8630 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | 8883 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
8631 | } | 8884 | } |
8632 | raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); | 8885 | raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); |
8633 | unlock: | 8886 | unlock: |
8634 | read_unlock(&tasklist_lock); | 8887 | read_unlock(&tasklist_lock); |
8635 | mutex_unlock(&rt_constraints_mutex); | 8888 | mutex_unlock(&rt_constraints_mutex); |
8636 | 8889 | ||
@@ -8794,7 +9047,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
8794 | 9047 | ||
8795 | if (!cgrp->parent) { | 9048 | if (!cgrp->parent) { |
8796 | /* This is early initialization for the top cgroup */ | 9049 | /* This is early initialization for the top cgroup */ |
8797 | return &init_task_group.css; | 9050 | return &root_task_group.css; |
8798 | } | 9051 | } |
8799 | 9052 | ||
8800 | parent = cgroup_tg(cgrp->parent); | 9053 | parent = cgroup_tg(cgrp->parent); |
@@ -8827,56 +9080,39 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
8827 | return 0; | 9080 | return 0; |
8828 | } | 9081 | } |
8829 | 9082 | ||
8830 | static int | 9083 | static void |
8831 | cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | 9084 | cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
8832 | struct task_struct *tsk, bool threadgroup) | ||
8833 | { | 9085 | { |
8834 | int retval = cpu_cgroup_can_attach_task(cgrp, tsk); | 9086 | sched_move_task(tsk); |
8835 | if (retval) | ||
8836 | return retval; | ||
8837 | if (threadgroup) { | ||
8838 | struct task_struct *c; | ||
8839 | rcu_read_lock(); | ||
8840 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | ||
8841 | retval = cpu_cgroup_can_attach_task(cgrp, c); | ||
8842 | if (retval) { | ||
8843 | rcu_read_unlock(); | ||
8844 | return retval; | ||
8845 | } | ||
8846 | } | ||
8847 | rcu_read_unlock(); | ||
8848 | } | ||
8849 | return 0; | ||
8850 | } | 9087 | } |
8851 | 9088 | ||
8852 | static void | 9089 | static void |
8853 | cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | 9090 | cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, |
8854 | struct cgroup *old_cont, struct task_struct *tsk, | 9091 | struct cgroup *old_cgrp, struct task_struct *task) |
8855 | bool threadgroup) | ||
8856 | { | 9092 | { |
8857 | sched_move_task(tsk); | 9093 | /* |
8858 | if (threadgroup) { | 9094 | * cgroup_exit() is called in the copy_process() failure path. |
8859 | struct task_struct *c; | 9095 | * Ignore this case since the task hasn't run yet; this avoids |
8860 | rcu_read_lock(); | 9096 | * trying to poke a half freed task state from generic code. |
8861 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | 9097 | */ |
8862 | sched_move_task(c); | 9098 | if (!(task->flags & PF_EXITING)) |
8863 | } | 9099 | return; |
8864 | rcu_read_unlock(); | 9100 | |
8865 | } | 9101 | sched_move_task(task); |
8866 | } | 9102 | } |
8867 | 9103 | ||
8868 | #ifdef CONFIG_FAIR_GROUP_SCHED | 9104 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8869 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, | 9105 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, |
8870 | u64 shareval) | 9106 | u64 shareval) |
8871 | { | 9107 | { |
8872 | return sched_group_set_shares(cgroup_tg(cgrp), shareval); | 9108 | return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); |
8873 | } | 9109 | } |
8874 | 9110 | ||
8875 | static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) | 9111 | static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) |
8876 | { | 9112 | { |
8877 | struct task_group *tg = cgroup_tg(cgrp); | 9113 | struct task_group *tg = cgroup_tg(cgrp); |
8878 | 9114 | ||
8879 | return (u64) tg->shares; | 9115 | return (u64) scale_load_down(tg->shares); |
8880 | } | 9116 | } |
8881 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 9117 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
8882 | 9118 | ||
@@ -8935,8 +9171,9 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
8935 | .name = "cpu", | 9171 | .name = "cpu", |
8936 | .create = cpu_cgroup_create, | 9172 | .create = cpu_cgroup_create, |
8937 | .destroy = cpu_cgroup_destroy, | 9173 | .destroy = cpu_cgroup_destroy, |
8938 | .can_attach = cpu_cgroup_can_attach, | 9174 | .can_attach_task = cpu_cgroup_can_attach_task, |
8939 | .attach = cpu_cgroup_attach, | 9175 | .attach_task = cpu_cgroup_attach_task, |
9176 | .exit = cpu_cgroup_exit, | ||
8940 | .populate = cpu_cgroup_populate, | 9177 | .populate = cpu_cgroup_populate, |
8941 | .subsys_id = cpu_cgroup_subsys_id, | 9178 | .subsys_id = cpu_cgroup_subsys_id, |
8942 | .early_init = 1, | 9179 | .early_init = 1, |
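The cpu_cgroup_subsys hunks above drop the threadgroup-iterating can_attach/attach callbacks in favour of per-task can_attach_task/attach_task hooks plus an exit callback, presumably leaving threadgroup iteration to the cgroup core. The sketch below models that split with a hypothetical ops table and a driver loop, so the subsystem side reduces to per-task work, much as cpu_cgroup_attach_task() now just calls sched_move_task().

#include <stdio.h>

struct task { const char *comm; };

/* Hypothetical per-task ops table, mirroring the new callback shape. */
struct cgroup_ops_model {
	int  (*can_attach_task)(struct task *t);
	void (*attach_task)(struct task *t);
};

static int cpu_can_attach_task(struct task *t)
{
	(void)t;
	return 0;                                /* always allow, like the cpu subsys */
}

static void cpu_attach_task(struct task *t)
{
	printf("  moving %s to new group\n", t->comm);  /* models sched_move_task() */
}

/* The "core" iterates the thread group; the subsystem only sees one task. */
static int attach_threadgroup(struct cgroup_ops_model *ops,
			      struct task *tasks, int n)
{
	for (int i = 0; i < n; i++) {
		int ret = ops->can_attach_task(&tasks[i]);
		if (ret)
			return ret;
	}
	for (int i = 0; i < n; i++)
		ops->attach_task(&tasks[i]);
	return 0;
}

int main(void)
{
	struct cgroup_ops_model cpu_ops = { cpu_can_attach_task, cpu_attach_task };
	struct task group[] = { { "main" }, { "worker-1" }, { "worker-2" } };

	return attach_threadgroup(&cpu_ops, group, 3);
}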
@@ -9221,72 +9458,3 @@ struct cgroup_subsys cpuacct_subsys = { | |||
9221 | }; | 9458 | }; |
9222 | #endif /* CONFIG_CGROUP_CPUACCT */ | 9459 | #endif /* CONFIG_CGROUP_CPUACCT */ |
9223 | 9460 | ||
9224 | #ifndef CONFIG_SMP | ||
9225 | |||
9226 | void synchronize_sched_expedited(void) | ||
9227 | { | ||
9228 | barrier(); | ||
9229 | } | ||
9230 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
9231 | |||
9232 | #else /* #ifndef CONFIG_SMP */ | ||
9233 | |||
9234 | static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0); | ||
9235 | |||
9236 | static int synchronize_sched_expedited_cpu_stop(void *data) | ||
9237 | { | ||
9238 | /* | ||
9239 | * There must be a full memory barrier on each affected CPU | ||
9240 | * between the time that try_stop_cpus() is called and the | ||
9241 | * time that it returns. | ||
9242 | * | ||
9243 | * In the current initial implementation of cpu_stop, the | ||
9244 | * above condition is already met when the control reaches | ||
9245 | * this point and the following smp_mb() is not strictly | ||
9246 | * necessary. Do smp_mb() anyway for documentation and | ||
9247 | * robustness against future implementation changes. | ||
9248 | */ | ||
9249 | smp_mb(); /* See above comment block. */ | ||
9250 | return 0; | ||
9251 | } | ||
9252 | |||
9253 | /* | ||
9254 | * Wait for an rcu-sched grace period to elapse, but use "big hammer" | ||
9255 | * approach to force grace period to end quickly. This consumes | ||
9256 | * significant time on all CPUs, and is thus not recommended for | ||
9257 | * any sort of common-case code. | ||
9258 | * | ||
9259 | * Note that it is illegal to call this function while holding any | ||
9260 | * lock that is acquired by a CPU-hotplug notifier. Failing to | ||
9261 | * observe this restriction will result in deadlock. | ||
9262 | */ | ||
9263 | void synchronize_sched_expedited(void) | ||
9264 | { | ||
9265 | int snap, trycount = 0; | ||
9266 | |||
9267 | smp_mb(); /* ensure prior mod happens before capturing snap. */ | ||
9268 | snap = atomic_read(&synchronize_sched_expedited_count) + 1; | ||
9269 | get_online_cpus(); | ||
9270 | while (try_stop_cpus(cpu_online_mask, | ||
9271 | synchronize_sched_expedited_cpu_stop, | ||
9272 | NULL) == -EAGAIN) { | ||
9273 | put_online_cpus(); | ||
9274 | if (trycount++ < 10) | ||
9275 | udelay(trycount * num_online_cpus()); | ||
9276 | else { | ||
9277 | synchronize_sched(); | ||
9278 | return; | ||
9279 | } | ||
9280 | if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) { | ||
9281 | smp_mb(); /* ensure test happens before caller kfree */ | ||
9282 | return; | ||
9283 | } | ||
9284 | get_online_cpus(); | ||
9285 | } | ||
9286 | atomic_inc(&synchronize_sched_expedited_count); | ||
9287 | smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */ | ||
9288 | put_online_cpus(); | ||
9289 | } | ||
9290 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
9291 | |||
9292 | #endif /* #else #ifndef CONFIG_SMP */ | ||
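The final hunk removes the synchronize_sched_expedited() implementation from this file; the diff records only the removal here. The interesting piece of the removed code is its snapshot-and-retry pattern: take a counter snapshot, try the expensive stop-CPUs path, and if the counter has advanced past the snapshot while retrying, another caller has already completed a full expedited pass, so the current caller can return without doing the work itself. The C11 sketch below models only that piggy-backing idea with a plain atomic counter; it is a simplified illustration, not the RCU implementation.

#include <stdio.h>
#include <stdatomic.h>
#include <stdbool.h>

/* Counter of completed expedited passes (models the removed
 * synchronize_sched_expedited_count). */
static atomic_int expedited_count;

static void do_expedited(bool simulate_concurrent_caller)
{
	int snap = atomic_load(&expedited_count) + 1;

	/* Pretend the try_stop_cpus() stand-in is busy for two attempts. */
	for (int attempt = 0; attempt < 2; attempt++) {
		if (simulate_concurrent_caller) {
			/* Model other CPUs completing two expedited passes. */
			atomic_fetch_add(&expedited_count, 2);
		}
		/* Counter moved well past our snapshot: a full expedited pass
		 * began and ended after we started, so piggy-back on it. */
		if (atomic_load(&expedited_count) - snap > 0) {
			printf("piggy-backed on a concurrent caller after %d retries\n",
			       attempt);
			return;
		}
		/* Otherwise the removed code would back off (udelay) or fall
		 * back to a normal grace period before retrying. */
	}

	atomic_fetch_add(&expedited_count, 1);
	printf("completed our own expedited pass\n");
}

int main(void)
{
	do_expedited(false);  /* no contention: does the work itself */
	do_expedited(true);   /* contention: returns early via the snapshot check */
	return 0;
}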