Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 3432 |
1 file changed, 1796 insertions(+), 1636 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index c5d775079027..935f8e8e6160 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -32,7 +32,6 @@ | |||
32 | #include <linux/init.h> | 32 | #include <linux/init.h> |
33 | #include <linux/uaccess.h> | 33 | #include <linux/uaccess.h> |
34 | #include <linux/highmem.h> | 34 | #include <linux/highmem.h> |
35 | #include <linux/smp_lock.h> | ||
36 | #include <asm/mmu_context.h> | 35 | #include <asm/mmu_context.h> |
37 | #include <linux/interrupt.h> | 36 | #include <linux/interrupt.h> |
38 | #include <linux/capability.h> | 37 | #include <linux/capability.h> |
@@ -75,9 +74,11 @@ | |||
75 | 74 | ||
76 | #include <asm/tlb.h> | 75 | #include <asm/tlb.h> |
77 | #include <asm/irq_regs.h> | 76 | #include <asm/irq_regs.h> |
77 | #include <asm/mutex.h> | ||
78 | 78 | ||
79 | #include "sched_cpupri.h" | 79 | #include "sched_cpupri.h" |
80 | #include "workqueue_sched.h" | 80 | #include "workqueue_sched.h" |
81 | #include "sched_autogroup.h" | ||
81 | 82 | ||
82 | #include <litmus/sched_trace.h> | 83 | #include <litmus/sched_trace.h> |
83 | #include <litmus/trace.h> | 84 | #include <litmus/trace.h> |
@@ -235,7 +236,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
235 | #endif | 236 | #endif |
236 | 237 | ||
237 | /* | 238 | /* |
238 | * sched_domains_mutex serializes calls to arch_init_sched_domains, | 239 | * sched_domains_mutex serializes calls to init_sched_domains, |
239 | * detach_destroy_domains and partition_sched_domains. | 240 | * detach_destroy_domains and partition_sched_domains. |
240 | */ | 241 | */ |
241 | static DEFINE_MUTEX(sched_domains_mutex); | 242 | static DEFINE_MUTEX(sched_domains_mutex); |
@@ -258,6 +259,8 @@ struct task_group { | |||
258 | /* runqueue "owned" by this group on each cpu */ | 259 | /* runqueue "owned" by this group on each cpu */ |
259 | struct cfs_rq **cfs_rq; | 260 | struct cfs_rq **cfs_rq; |
260 | unsigned long shares; | 261 | unsigned long shares; |
262 | |||
263 | atomic_t load_weight; | ||
261 | #endif | 264 | #endif |
262 | 265 | ||
263 | #ifdef CONFIG_RT_GROUP_SCHED | 266 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -273,25 +276,18 @@ struct task_group { | |||
273 | struct task_group *parent; | 276 | struct task_group *parent; |
274 | struct list_head siblings; | 277 | struct list_head siblings; |
275 | struct list_head children; | 278 | struct list_head children; |
276 | }; | ||
277 | 279 | ||
278 | #define root_task_group init_task_group | 280 | #ifdef CONFIG_SCHED_AUTOGROUP |
281 | struct autogroup *autogroup; | ||
282 | #endif | ||
283 | }; | ||
279 | 284 | ||
280 | /* task_group_lock serializes add/remove of task groups and also changes to | 285 | /* task_group_lock serializes the addition/removal of task groups */ |
281 | * a task group's cpu shares. | ||
282 | */ | ||
283 | static DEFINE_SPINLOCK(task_group_lock); | 286 | static DEFINE_SPINLOCK(task_group_lock); |
284 | 287 | ||
285 | #ifdef CONFIG_FAIR_GROUP_SCHED | 288 | #ifdef CONFIG_FAIR_GROUP_SCHED |
286 | 289 | ||
287 | #ifdef CONFIG_SMP | 290 | # define ROOT_TASK_GROUP_LOAD NICE_0_LOAD |
288 | static int root_task_group_empty(void) | ||
289 | { | ||
290 | return list_empty(&root_task_group.children); | ||
291 | } | ||
292 | #endif | ||
293 | |||
294 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD | ||
295 | 291 | ||
296 | /* | 292 | /* |
297 | * A weight of 0 or 1 can cause arithmetics problems. | 293 | * A weight of 0 or 1 can cause arithmetics problems. |
@@ -301,16 +297,16 @@ static int root_task_group_empty(void) | |||
301 | * (The default weight is 1024 - so there's no practical | 297 | * (The default weight is 1024 - so there's no practical |
302 | * limitation from this.) | 298 | * limitation from this.) |
303 | */ | 299 | */ |
304 | #define MIN_SHARES 2 | 300 | #define MIN_SHARES (1UL << 1) |
305 | #define MAX_SHARES (1UL << 18) | 301 | #define MAX_SHARES (1UL << 18) |
306 | 302 | ||
307 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; | 303 | static int root_task_group_load = ROOT_TASK_GROUP_LOAD; |
308 | #endif | 304 | #endif |
309 | 305 | ||
310 | /* Default task group. | 306 | /* Default task group. |
311 | * Every task in system belong to this group at bootup. | 307 | * Every task in system belong to this group at bootup. |
312 | */ | 308 | */ |
313 | struct task_group init_task_group; | 309 | struct task_group root_task_group; |
314 | 310 | ||
315 | #endif /* CONFIG_CGROUP_SCHED */ | 311 | #endif /* CONFIG_CGROUP_SCHED */ |
316 | 312 | ||
@@ -321,6 +317,9 @@ struct cfs_rq { | |||
321 | 317 | ||
322 | u64 exec_clock; | 318 | u64 exec_clock; |
323 | u64 min_vruntime; | 319 | u64 min_vruntime; |
320 | #ifndef CONFIG_64BIT | ||
321 | u64 min_vruntime_copy; | ||
322 | #endif | ||
324 | 323 | ||
325 | struct rb_root tasks_timeline; | 324 | struct rb_root tasks_timeline; |
326 | struct rb_node *rb_leftmost; | 325 | struct rb_node *rb_leftmost; |
@@ -332,9 +331,11 @@ struct cfs_rq { | |||
332 | * 'curr' points to currently running entity on this cfs_rq. | 331 | * 'curr' points to currently running entity on this cfs_rq. |
333 | * It is set to NULL otherwise (i.e when none are currently running). | 332 | * It is set to NULL otherwise (i.e when none are currently running). |
334 | */ | 333 | */ |
335 | struct sched_entity *curr, *next, *last; | 334 | struct sched_entity *curr, *next, *last, *skip; |
336 | 335 | ||
336 | #ifdef CONFIG_SCHED_DEBUG | ||
337 | unsigned int nr_spread_over; | 337 | unsigned int nr_spread_over; |
338 | #endif | ||
338 | 339 | ||
339 | #ifdef CONFIG_FAIR_GROUP_SCHED | 340 | #ifdef CONFIG_FAIR_GROUP_SCHED |
340 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | 341 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ |
@@ -347,6 +348,7 @@ struct cfs_rq { | |||
347 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | 348 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This |
348 | * list is used during load balance. | 349 | * list is used during load balance. |
349 | */ | 350 | */ |
351 | int on_list; | ||
350 | struct list_head leaf_cfs_rq_list; | 352 | struct list_head leaf_cfs_rq_list; |
351 | struct task_group *tg; /* group that "owns" this runqueue */ | 353 | struct task_group *tg; /* group that "owns" this runqueue */ |
352 | 354 | ||
@@ -365,14 +367,17 @@ struct cfs_rq { | |||
365 | unsigned long h_load; | 367 | unsigned long h_load; |
366 | 368 | ||
367 | /* | 369 | /* |
368 | * this cpu's part of tg->shares | 370 | * Maintaining per-cpu shares distribution for group scheduling |
371 | * | ||
372 | * load_stamp is the last time we updated the load average | ||
373 | * load_last is the last time we updated the load average and saw load | ||
374 | * load_unacc_exec_time is currently unaccounted execution time | ||
369 | */ | 375 | */ |
370 | unsigned long shares; | 376 | u64 load_avg; |
377 | u64 load_period; | ||
378 | u64 load_stamp, load_last, load_unacc_exec_time; | ||
371 | 379 | ||
372 | /* | 380 | unsigned long load_contribution; |
373 | * load.weight at the time we set shares | ||
374 | */ | ||
375 | unsigned long rq_weight; | ||
376 | #endif | 381 | #endif |
377 | #endif | 382 | #endif |
378 | }; | 383 | }; |
@@ -428,6 +433,7 @@ struct litmus_rq { | |||
428 | */ | 433 | */ |
429 | struct root_domain { | 434 | struct root_domain { |
430 | atomic_t refcount; | 435 | atomic_t refcount; |
436 | struct rcu_head rcu; | ||
431 | cpumask_var_t span; | 437 | cpumask_var_t span; |
432 | cpumask_var_t online; | 438 | cpumask_var_t online; |
433 | 439 | ||
@@ -437,9 +443,7 @@ struct root_domain { | |||
437 | */ | 443 | */ |
438 | cpumask_var_t rto_mask; | 444 | cpumask_var_t rto_mask; |
439 | atomic_t rto_count; | 445 | atomic_t rto_count; |
440 | #ifdef CONFIG_SMP | ||
441 | struct cpupri cpupri; | 446 | struct cpupri cpupri; |
442 | #endif | ||
443 | }; | 447 | }; |
444 | 448 | ||
445 | /* | 449 | /* |
@@ -448,7 +452,7 @@ struct root_domain { | |||
448 | */ | 452 | */ |
449 | static struct root_domain def_root_domain; | 453 | static struct root_domain def_root_domain; |
450 | 454 | ||
451 | #endif | 455 | #endif /* CONFIG_SMP */ |
452 | 456 | ||
453 | /* | 457 | /* |
454 | * This is the main, per-CPU runqueue data structure. | 458 | * This is the main, per-CPU runqueue data structure. |
@@ -473,7 +477,7 @@ struct rq { | |||
473 | u64 nohz_stamp; | 477 | u64 nohz_stamp; |
474 | unsigned char nohz_balance_kick; | 478 | unsigned char nohz_balance_kick; |
475 | #endif | 479 | #endif |
476 | unsigned int skip_clock_update; | 480 | int skip_clock_update; |
477 | 481 | ||
478 | /* capture load from *all* tasks on this cpu: */ | 482 | /* capture load from *all* tasks on this cpu: */ |
479 | struct load_weight load; | 483 | struct load_weight load; |
@@ -500,11 +504,12 @@ struct rq { | |||
500 | */ | 504 | */ |
501 | unsigned long nr_uninterruptible; | 505 | unsigned long nr_uninterruptible; |
502 | 506 | ||
503 | struct task_struct *curr, *idle; | 507 | struct task_struct *curr, *idle, *stop; |
504 | unsigned long next_balance; | 508 | unsigned long next_balance; |
505 | struct mm_struct *prev_mm; | 509 | struct mm_struct *prev_mm; |
506 | 510 | ||
507 | u64 clock; | 511 | u64 clock; |
512 | u64 clock_task; | ||
508 | 513 | ||
509 | atomic_t nr_iowait; | 514 | atomic_t nr_iowait; |
510 | 515 | ||
@@ -532,6 +537,10 @@ struct rq { | |||
532 | u64 avg_idle; | 537 | u64 avg_idle; |
533 | #endif | 538 | #endif |
534 | 539 | ||
540 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
541 | u64 prev_irq_time; | ||
542 | #endif | ||
543 | |||
535 | /* calc_load related fields */ | 544 | /* calc_load related fields */ |
536 | unsigned long calc_load_update; | 545 | unsigned long calc_load_update; |
537 | long calc_load_active; | 546 | long calc_load_active; |
@@ -561,32 +570,17 @@ struct rq { | |||
561 | /* try_to_wake_up() stats */ | 570 | /* try_to_wake_up() stats */ |
562 | unsigned int ttwu_count; | 571 | unsigned int ttwu_count; |
563 | unsigned int ttwu_local; | 572 | unsigned int ttwu_local; |
573 | #endif | ||
564 | 574 | ||
565 | /* BKL stats */ | 575 | #ifdef CONFIG_SMP |
566 | unsigned int bkl_count; | 576 | struct task_struct *wake_list; |
567 | #endif | 577 | #endif |
568 | }; | 578 | }; |
569 | 579 | ||
570 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 580 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
571 | 581 | ||
572 | static inline | ||
573 | void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | ||
574 | { | ||
575 | rq->curr->sched_class->check_preempt_curr(rq, p, flags); | ||
576 | 582 | ||
577 | /* | 583 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); |
578 | * A queue event has occurred, and we're going to schedule. In | ||
579 | * this case, we can save a useless back to back clock update. | ||
580 | */ | ||
581 | /* LITMUS^RT: turning off the clock update is buggy in Linux 2.6.36; | ||
582 | * the scheduler can "forget" to renable the runqueue clock in some | ||
583 | * cases. LITMUS^RT amplifies the effects of this problem. Hence, we | ||
584 | * turn it off to avoid stalling clocks. */ | ||
585 | /* | ||
586 | if (test_tsk_need_resched(p)) | ||
587 | rq->skip_clock_update = 1; | ||
588 | */ | ||
589 | } | ||
590 | 584 | ||
591 | static inline int cpu_of(struct rq *rq) | 585 | static inline int cpu_of(struct rq *rq) |
592 | { | 586 | { |
@@ -599,7 +593,7 @@ static inline int cpu_of(struct rq *rq) | |||
599 | 593 | ||
600 | #define rcu_dereference_check_sched_domain(p) \ | 594 | #define rcu_dereference_check_sched_domain(p) \ |
601 | rcu_dereference_check((p), \ | 595 | rcu_dereference_check((p), \ |
602 | rcu_read_lock_sched_held() || \ | 596 | rcu_read_lock_held() || \ |
603 | lockdep_is_held(&sched_domains_mutex)) | 597 | lockdep_is_held(&sched_domains_mutex)) |
604 | 598 | ||
605 | /* | 599 | /* |
@@ -623,18 +617,22 @@ static inline int cpu_of(struct rq *rq) | |||
623 | /* | 617 | /* |
624 | * Return the group to which this tasks belongs. | 618 | * Return the group to which this tasks belongs. |
625 | * | 619 | * |
626 | * We use task_subsys_state_check() and extend the RCU verification | 620 | * We use task_subsys_state_check() and extend the RCU verification with |
627 | * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() | 621 | * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each |
628 | * holds that lock for each task it moves into the cgroup. Therefore | 622 | * task it moves into the cgroup. Therefore by holding either of those locks, |
629 | * by holding that lock, we pin the task to the current cgroup. | 623 | * we pin the task to the current cgroup. |
630 | */ | 624 | */ |
631 | static inline struct task_group *task_group(struct task_struct *p) | 625 | static inline struct task_group *task_group(struct task_struct *p) |
632 | { | 626 | { |
627 | struct task_group *tg; | ||
633 | struct cgroup_subsys_state *css; | 628 | struct cgroup_subsys_state *css; |
634 | 629 | ||
635 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, | 630 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, |
631 | lockdep_is_held(&p->pi_lock) || | ||
636 | lockdep_is_held(&task_rq(p)->lock)); | 632 | lockdep_is_held(&task_rq(p)->lock)); |
637 | return container_of(css, struct task_group, css); | 633 | tg = container_of(css, struct task_group, css); |
634 | |||
635 | return autogroup_task_group(p, tg); | ||
638 | } | 636 | } |
639 | 637 | ||
640 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | 638 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ |
@@ -661,10 +659,18 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
661 | 659 | ||
662 | #endif /* CONFIG_CGROUP_SCHED */ | 660 | #endif /* CONFIG_CGROUP_SCHED */ |
663 | 661 | ||
664 | inline void update_rq_clock(struct rq *rq) | 662 | static void update_rq_clock_task(struct rq *rq, s64 delta); |
663 | |||
664 | static void update_rq_clock(struct rq *rq) | ||
665 | { | 665 | { |
666 | if (!rq->skip_clock_update) | 666 | s64 delta; |
667 | rq->clock = sched_clock_cpu(cpu_of(rq)); | 667 | |
668 | if (rq->skip_clock_update > 0) | ||
669 | return; | ||
670 | |||
671 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; | ||
672 | rq->clock += delta; | ||
673 | update_rq_clock_task(rq, delta); | ||
668 | } | 674 | } |
669 | 675 | ||
670 | /* | 676 | /* |
@@ -677,10 +683,9 @@ inline void update_rq_clock(struct rq *rq) | |||
677 | #endif | 683 | #endif |
678 | 684 | ||
679 | /** | 685 | /** |
680 | * runqueue_is_locked | 686 | * runqueue_is_locked - Returns true if the current cpu runqueue is locked |
681 | * @cpu: the processor in question. | 687 | * @cpu: the processor in question. |
682 | * | 688 | * |
683 | * Returns true if the current cpu runqueue is locked. | ||
684 | * This interface allows printk to be called with the runqueue lock | 689 | * This interface allows printk to be called with the runqueue lock |
685 | * held and know whether or not it is OK to wake up the klogd. | 690 | * held and know whether or not it is OK to wake up the klogd. |
686 | */ | 691 | */ |
@@ -741,7 +746,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
741 | size_t cnt, loff_t *ppos) | 746 | size_t cnt, loff_t *ppos) |
742 | { | 747 | { |
743 | char buf[64]; | 748 | char buf[64]; |
744 | char *cmp = buf; | 749 | char *cmp; |
745 | int neg = 0; | 750 | int neg = 0; |
746 | int i; | 751 | int i; |
747 | 752 | ||
@@ -752,16 +757,15 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
752 | return -EFAULT; | 757 | return -EFAULT; |
753 | 758 | ||
754 | buf[cnt] = 0; | 759 | buf[cnt] = 0; |
760 | cmp = strstrip(buf); | ||
755 | 761 | ||
756 | if (strncmp(buf, "NO_", 3) == 0) { | 762 | if (strncmp(cmp, "NO_", 3) == 0) { |
757 | neg = 1; | 763 | neg = 1; |
758 | cmp += 3; | 764 | cmp += 3; |
759 | } | 765 | } |
760 | 766 | ||
761 | for (i = 0; sched_feat_names[i]; i++) { | 767 | for (i = 0; sched_feat_names[i]; i++) { |
762 | int len = strlen(sched_feat_names[i]); | 768 | if (strcmp(cmp, sched_feat_names[i]) == 0) { |
763 | |||
764 | if (strncmp(cmp, sched_feat_names[i], len) == 0) { | ||
765 | if (neg) | 769 | if (neg) |
766 | sysctl_sched_features &= ~(1UL << i); | 770 | sysctl_sched_features &= ~(1UL << i); |
767 | else | 771 | else |
@@ -811,20 +815,6 @@ late_initcall(sched_init_debug); | |||
811 | const_debug unsigned int sysctl_sched_nr_migrate = 32; | 815 | const_debug unsigned int sysctl_sched_nr_migrate = 32; |
812 | 816 | ||
813 | /* | 817 | /* |
814 | * ratelimit for updating the group shares. | ||
815 | * default: 0.25ms | ||
816 | */ | ||
817 | unsigned int sysctl_sched_shares_ratelimit = 250000; | ||
818 | unsigned int normalized_sysctl_sched_shares_ratelimit = 250000; | ||
819 | |||
820 | /* | ||
821 | * Inject some fuzzyness into changing the per-cpu group shares | ||
822 | * this avoids remote rq-locks at the expense of fairness. | ||
823 | * default: 4 | ||
824 | */ | ||
825 | unsigned int sysctl_sched_shares_thresh = 4; | ||
826 | |||
827 | /* | ||
828 | * period over which we average the RT time consumption, measured | 818 | * period over which we average the RT time consumption, measured |
829 | * in ms. | 819 | * in ms. |
830 | * | 820 | * |
@@ -871,18 +861,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p) | |||
871 | return rq->curr == p; | 861 | return rq->curr == p; |
872 | } | 862 | } |
873 | 863 | ||
874 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
875 | static inline int task_running(struct rq *rq, struct task_struct *p) | 864 | static inline int task_running(struct rq *rq, struct task_struct *p) |
876 | { | 865 | { |
866 | #ifdef CONFIG_SMP | ||
867 | return p->on_cpu; | ||
868 | #else | ||
877 | return task_current(rq, p); | 869 | return task_current(rq, p); |
870 | #endif | ||
878 | } | 871 | } |
879 | 872 | ||
873 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
880 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 874 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
881 | { | 875 | { |
876 | #ifdef CONFIG_SMP | ||
877 | /* | ||
878 | * We can optimise this out completely for !SMP, because the | ||
879 | * SMP rebalancing from interrupt is the only thing that cares | ||
880 | * here. | ||
881 | */ | ||
882 | next->on_cpu = 1; | ||
883 | #endif | ||
882 | } | 884 | } |
883 | 885 | ||
884 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | 886 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
885 | { | 887 | { |
888 | #ifdef CONFIG_SMP | ||
889 | /* | ||
890 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | ||
891 | * We must ensure this doesn't happen until the switch is completely | ||
892 | * finished. | ||
893 | */ | ||
894 | smp_wmb(); | ||
895 | prev->on_cpu = 0; | ||
896 | #endif | ||
886 | #ifdef CONFIG_DEBUG_SPINLOCK | 897 | #ifdef CONFIG_DEBUG_SPINLOCK |
887 | /* this is a valid case when another task releases the spinlock */ | 898 | /* this is a valid case when another task releases the spinlock */ |
888 | rq->lock.owner = current; | 899 | rq->lock.owner = current; |
@@ -898,15 +909,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
898 | } | 909 | } |
899 | 910 | ||
900 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | 911 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ |
901 | static inline int task_running(struct rq *rq, struct task_struct *p) | ||
902 | { | ||
903 | #ifdef CONFIG_SMP | ||
904 | return p->oncpu; | ||
905 | #else | ||
906 | return task_current(rq, p); | ||
907 | #endif | ||
908 | } | ||
909 | |||
910 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 912 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
911 | { | 913 | { |
912 | #ifdef CONFIG_SMP | 914 | #ifdef CONFIG_SMP |
@@ -915,7 +917,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | |||
915 | * SMP rebalancing from interrupt is the only thing that cares | 917 | * SMP rebalancing from interrupt is the only thing that cares |
916 | * here. | 918 | * here. |
917 | */ | 919 | */ |
918 | next->oncpu = 1; | 920 | next->on_cpu = 1; |
919 | #endif | 921 | #endif |
920 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 922 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
921 | raw_spin_unlock_irq(&rq->lock); | 923 | raw_spin_unlock_irq(&rq->lock); |
@@ -928,12 +930,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
928 | { | 930 | { |
929 | #ifdef CONFIG_SMP | 931 | #ifdef CONFIG_SMP |
930 | /* | 932 | /* |
931 | * After ->oncpu is cleared, the task can be moved to a different CPU. | 933 | * After ->on_cpu is cleared, the task can be moved to a different CPU. |
932 | * We must ensure this doesn't happen until the switch is completely | 934 | * We must ensure this doesn't happen until the switch is completely |
933 | * finished. | 935 | * finished. |
934 | */ | 936 | */ |
935 | smp_wmb(); | 937 | smp_wmb(); |
936 | prev->oncpu = 0; | 938 | prev->on_cpu = 0; |
937 | #endif | 939 | #endif |
938 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 940 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
939 | local_irq_enable(); | 941 | local_irq_enable(); |
@@ -942,23 +944,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
942 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 944 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
943 | 945 | ||
944 | /* | 946 | /* |
945 | * Check whether the task is waking, we use this to synchronize ->cpus_allowed | 947 | * __task_rq_lock - lock the rq @p resides on. |
946 | * against ttwu(). | ||
947 | */ | ||
948 | static inline int task_is_waking(struct task_struct *p) | ||
949 | { | ||
950 | return unlikely(p->state == TASK_WAKING); | ||
951 | } | ||
952 | |||
953 | /* | ||
954 | * __task_rq_lock - lock the runqueue a given task resides on. | ||
955 | * Must be called interrupts disabled. | ||
956 | */ | 948 | */ |
957 | static inline struct rq *__task_rq_lock(struct task_struct *p) | 949 | static inline struct rq *__task_rq_lock(struct task_struct *p) |
958 | __acquires(rq->lock) | 950 | __acquires(rq->lock) |
959 | { | 951 | { |
960 | struct rq *rq; | 952 | struct rq *rq; |
961 | 953 | ||
954 | lockdep_assert_held(&p->pi_lock); | ||
955 | |||
962 | for (;;) { | 956 | for (;;) { |
963 | rq = task_rq(p); | 957 | rq = task_rq(p); |
964 | raw_spin_lock(&rq->lock); | 958 | raw_spin_lock(&rq->lock); |
@@ -969,22 +963,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) | |||
969 | } | 963 | } |
970 | 964 | ||
971 | /* | 965 | /* |
972 | * task_rq_lock - lock the runqueue a given task resides on and disable | 966 | * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. |
973 | * interrupts. Note the ordering: we can safely lookup the task_rq without | ||
974 | * explicitly disabling preemption. | ||
975 | */ | 967 | */ |
976 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | 968 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) |
969 | __acquires(p->pi_lock) | ||
977 | __acquires(rq->lock) | 970 | __acquires(rq->lock) |
978 | { | 971 | { |
979 | struct rq *rq; | 972 | struct rq *rq; |
980 | 973 | ||
981 | for (;;) { | 974 | for (;;) { |
982 | local_irq_save(*flags); | 975 | raw_spin_lock_irqsave(&p->pi_lock, *flags); |
983 | rq = task_rq(p); | 976 | rq = task_rq(p); |
984 | raw_spin_lock(&rq->lock); | 977 | raw_spin_lock(&rq->lock); |
985 | if (likely(rq == task_rq(p))) | 978 | if (likely(rq == task_rq(p))) |
986 | return rq; | 979 | return rq; |
987 | raw_spin_unlock_irqrestore(&rq->lock, *flags); | 980 | raw_spin_unlock(&rq->lock); |
981 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); | ||
988 | } | 982 | } |
989 | } | 983 | } |
990 | 984 | ||
@@ -994,10 +988,13 @@ static void __task_rq_unlock(struct rq *rq) | |||
994 | raw_spin_unlock(&rq->lock); | 988 | raw_spin_unlock(&rq->lock); |
995 | } | 989 | } |
996 | 990 | ||
997 | static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) | 991 | static inline void |
992 | task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) | ||
998 | __releases(rq->lock) | 993 | __releases(rq->lock) |
994 | __releases(p->pi_lock) | ||
999 | { | 995 | { |
1000 | raw_spin_unlock_irqrestore(&rq->lock, *flags); | 996 | raw_spin_unlock(&rq->lock); |
997 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); | ||
1001 | } | 998 | } |
1002 | 999 | ||
1003 | /* | 1000 | /* |
@@ -1227,11 +1224,17 @@ int get_nohz_timer_target(void) | |||
1227 | int i; | 1224 | int i; |
1228 | struct sched_domain *sd; | 1225 | struct sched_domain *sd; |
1229 | 1226 | ||
1227 | rcu_read_lock(); | ||
1230 | for_each_domain(cpu, sd) { | 1228 | for_each_domain(cpu, sd) { |
1231 | for_each_cpu(i, sched_domain_span(sd)) | 1229 | for_each_cpu(i, sched_domain_span(sd)) { |
1232 | if (!idle_cpu(i)) | 1230 | if (!idle_cpu(i)) { |
1233 | return i; | 1231 | cpu = i; |
1232 | goto unlock; | ||
1233 | } | ||
1234 | } | ||
1234 | } | 1235 | } |
1236 | unlock: | ||
1237 | rcu_read_unlock(); | ||
1235 | return cpu; | 1238 | return cpu; |
1236 | } | 1239 | } |
1237 | /* | 1240 | /* |
@@ -1341,15 +1344,27 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, | |||
1341 | { | 1344 | { |
1342 | u64 tmp; | 1345 | u64 tmp; |
1343 | 1346 | ||
1347 | /* | ||
1348 | * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched | ||
1349 | * entities since MIN_SHARES = 2. Treat weight as 1 if less than | ||
1350 | * 2^SCHED_LOAD_RESOLUTION. | ||
1351 | */ | ||
1352 | if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) | ||
1353 | tmp = (u64)delta_exec * scale_load_down(weight); | ||
1354 | else | ||
1355 | tmp = (u64)delta_exec; | ||
1356 | |||
1344 | if (!lw->inv_weight) { | 1357 | if (!lw->inv_weight) { |
1345 | if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) | 1358 | unsigned long w = scale_load_down(lw->weight); |
1359 | |||
1360 | if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) | ||
1346 | lw->inv_weight = 1; | 1361 | lw->inv_weight = 1; |
1362 | else if (unlikely(!w)) | ||
1363 | lw->inv_weight = WMULT_CONST; | ||
1347 | else | 1364 | else |
1348 | lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) | 1365 | lw->inv_weight = WMULT_CONST / w; |
1349 | / (lw->weight+1); | ||
1350 | } | 1366 | } |
1351 | 1367 | ||
1352 | tmp = (u64)delta_exec * weight; | ||
1353 | /* | 1368 | /* |
1354 | * Check whether we'd overflow the 64-bit multiplication: | 1369 | * Check whether we'd overflow the 64-bit multiplication: |
1355 | */ | 1370 | */ |
@@ -1374,6 +1389,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec) | |||
1374 | lw->inv_weight = 0; | 1389 | lw->inv_weight = 0; |
1375 | } | 1390 | } |
1376 | 1391 | ||
1392 | static inline void update_load_set(struct load_weight *lw, unsigned long w) | ||
1393 | { | ||
1394 | lw->weight = w; | ||
1395 | lw->inv_weight = 0; | ||
1396 | } | ||
1397 | |||
1377 | /* | 1398 | /* |
1378 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | 1399 | * To aid in avoiding the subversion of "niceness" due to uneven distribution |
1379 | * of tasks with abnormal "nice" values across CPUs the contribution that | 1400 | * of tasks with abnormal "nice" values across CPUs the contribution that |
@@ -1562,101 +1583,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
1562 | 1583 | ||
1563 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1584 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1564 | 1585 | ||
1565 | static __read_mostly unsigned long __percpu *update_shares_data; | ||
1566 | |||
1567 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | ||
1568 | |||
1569 | /* | ||
1570 | * Calculate and set the cpu's group shares. | ||
1571 | */ | ||
1572 | static void update_group_shares_cpu(struct task_group *tg, int cpu, | ||
1573 | unsigned long sd_shares, | ||
1574 | unsigned long sd_rq_weight, | ||
1575 | unsigned long *usd_rq_weight) | ||
1576 | { | ||
1577 | unsigned long shares, rq_weight; | ||
1578 | int boost = 0; | ||
1579 | |||
1580 | rq_weight = usd_rq_weight[cpu]; | ||
1581 | if (!rq_weight) { | ||
1582 | boost = 1; | ||
1583 | rq_weight = NICE_0_LOAD; | ||
1584 | } | ||
1585 | |||
1586 | /* | ||
1587 | * \Sum_j shares_j * rq_weight_i | ||
1588 | * shares_i = ----------------------------- | ||
1589 | * \Sum_j rq_weight_j | ||
1590 | */ | ||
1591 | shares = (sd_shares * rq_weight) / sd_rq_weight; | ||
1592 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); | ||
1593 | |||
1594 | if (abs(shares - tg->se[cpu]->load.weight) > | ||
1595 | sysctl_sched_shares_thresh) { | ||
1596 | struct rq *rq = cpu_rq(cpu); | ||
1597 | unsigned long flags; | ||
1598 | |||
1599 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
1600 | tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; | ||
1601 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; | ||
1602 | __set_se_shares(tg->se[cpu], shares); | ||
1603 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
1604 | } | ||
1605 | } | ||
1606 | |||
1607 | /* | ||
1608 | * Re-compute the task group their per cpu shares over the given domain. | ||
1609 | * This needs to be done in a bottom-up fashion because the rq weight of a | ||
1610 | * parent group depends on the shares of its child groups. | ||
1611 | */ | ||
1612 | static int tg_shares_up(struct task_group *tg, void *data) | ||
1613 | { | ||
1614 | unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; | ||
1615 | unsigned long *usd_rq_weight; | ||
1616 | struct sched_domain *sd = data; | ||
1617 | unsigned long flags; | ||
1618 | int i; | ||
1619 | |||
1620 | if (!tg->se[0]) | ||
1621 | return 0; | ||
1622 | |||
1623 | local_irq_save(flags); | ||
1624 | usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); | ||
1625 | |||
1626 | for_each_cpu(i, sched_domain_span(sd)) { | ||
1627 | weight = tg->cfs_rq[i]->load.weight; | ||
1628 | usd_rq_weight[i] = weight; | ||
1629 | |||
1630 | rq_weight += weight; | ||
1631 | /* | ||
1632 | * If there are currently no tasks on the cpu pretend there | ||
1633 | * is one of average load so that when a new task gets to | ||
1634 | * run here it will not get delayed by group starvation. | ||
1635 | */ | ||
1636 | if (!weight) | ||
1637 | weight = NICE_0_LOAD; | ||
1638 | |||
1639 | sum_weight += weight; | ||
1640 | shares += tg->cfs_rq[i]->shares; | ||
1641 | } | ||
1642 | |||
1643 | if (!rq_weight) | ||
1644 | rq_weight = sum_weight; | ||
1645 | |||
1646 | if ((!shares && rq_weight) || shares > tg->shares) | ||
1647 | shares = tg->shares; | ||
1648 | |||
1649 | if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) | ||
1650 | shares = tg->shares; | ||
1651 | |||
1652 | for_each_cpu(i, sched_domain_span(sd)) | ||
1653 | update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); | ||
1654 | |||
1655 | local_irq_restore(flags); | ||
1656 | |||
1657 | return 0; | ||
1658 | } | ||
1659 | |||
1660 | /* | 1586 | /* |
1661 | * Compute the cpu's hierarchical load factor for each task group. | 1587 | * Compute the cpu's hierarchical load factor for each task group. |
1662 | * This needs to be done in a top-down fashion because the load of a child | 1588 | * This needs to be done in a top-down fashion because the load of a child |
@@ -1671,7 +1597,7 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
1671 | load = cpu_rq(cpu)->load.weight; | 1597 | load = cpu_rq(cpu)->load.weight; |
1672 | } else { | 1598 | } else { |
1673 | load = tg->parent->cfs_rq[cpu]->h_load; | 1599 | load = tg->parent->cfs_rq[cpu]->h_load; |
1674 | load *= tg->cfs_rq[cpu]->shares; | 1600 | load *= tg->se[cpu]->load.weight; |
1675 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; | 1601 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; |
1676 | } | 1602 | } |
1677 | 1603 | ||
@@ -1680,34 +1606,11 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
1680 | return 0; | 1606 | return 0; |
1681 | } | 1607 | } |
1682 | 1608 | ||
1683 | static void update_shares(struct sched_domain *sd) | ||
1684 | { | ||
1685 | s64 elapsed; | ||
1686 | u64 now; | ||
1687 | |||
1688 | if (root_task_group_empty()) | ||
1689 | return; | ||
1690 | |||
1691 | now = local_clock(); | ||
1692 | elapsed = now - sd->last_update; | ||
1693 | |||
1694 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | ||
1695 | sd->last_update = now; | ||
1696 | walk_tg_tree(tg_nop, tg_shares_up, sd); | ||
1697 | } | ||
1698 | } | ||
1699 | |||
1700 | static void update_h_load(long cpu) | 1609 | static void update_h_load(long cpu) |
1701 | { | 1610 | { |
1702 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 1611 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); |
1703 | } | 1612 | } |
1704 | 1613 | ||
1705 | #else | ||
1706 | |||
1707 | static inline void update_shares(struct sched_domain *sd) | ||
1708 | { | ||
1709 | } | ||
1710 | |||
1711 | #endif | 1614 | #endif |
1712 | 1615 | ||
1713 | #ifdef CONFIG_PREEMPT | 1616 | #ifdef CONFIG_PREEMPT |
@@ -1827,15 +1730,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | |||
1827 | __release(rq2->lock); | 1730 | __release(rq2->lock); |
1828 | } | 1731 | } |
1829 | 1732 | ||
1830 | #endif | 1733 | #else /* CONFIG_SMP */ |
1831 | 1734 | ||
1832 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1735 | /* |
1833 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | 1736 | * double_rq_lock - safely lock two runqueues |
1737 | * | ||
1738 | * Note this does not disable interrupts like task_rq_lock, | ||
1739 | * you need to do so manually before calling. | ||
1740 | */ | ||
1741 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
1742 | __acquires(rq1->lock) | ||
1743 | __acquires(rq2->lock) | ||
1834 | { | 1744 | { |
1835 | #ifdef CONFIG_SMP | 1745 | BUG_ON(!irqs_disabled()); |
1836 | cfs_rq->shares = shares; | 1746 | BUG_ON(rq1 != rq2); |
1837 | #endif | 1747 | raw_spin_lock(&rq1->lock); |
1748 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
1838 | } | 1749 | } |
1750 | |||
1751 | /* | ||
1752 | * double_rq_unlock - safely unlock two runqueues | ||
1753 | * | ||
1754 | * Note this does not restore interrupts like task_rq_unlock, | ||
1755 | * you need to do so manually after calling. | ||
1756 | */ | ||
1757 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
1758 | __releases(rq1->lock) | ||
1759 | __releases(rq2->lock) | ||
1760 | { | ||
1761 | BUG_ON(rq1 != rq2); | ||
1762 | raw_spin_unlock(&rq1->lock); | ||
1763 | __release(rq2->lock); | ||
1764 | } | ||
1765 | |||
1839 | #endif | 1766 | #endif |
1840 | 1767 | ||
1841 | static void calc_load_account_idle(struct rq *this_rq); | 1768 | static void calc_load_account_idle(struct rq *this_rq); |
@@ -1877,23 +1804,20 @@ static void dec_nr_running(struct rq *rq) | |||
1877 | 1804 | ||
1878 | static void set_load_weight(struct task_struct *p) | 1805 | static void set_load_weight(struct task_struct *p) |
1879 | { | 1806 | { |
1880 | if (task_has_rt_policy(p)) { | 1807 | int prio = p->static_prio - MAX_RT_PRIO; |
1881 | p->se.load.weight = 0; | 1808 | struct load_weight *load = &p->se.load; |
1882 | p->se.load.inv_weight = WMULT_CONST; | ||
1883 | return; | ||
1884 | } | ||
1885 | 1809 | ||
1886 | /* | 1810 | /* |
1887 | * SCHED_IDLE tasks get minimal weight: | 1811 | * SCHED_IDLE tasks get minimal weight: |
1888 | */ | 1812 | */ |
1889 | if (p->policy == SCHED_IDLE) { | 1813 | if (p->policy == SCHED_IDLE) { |
1890 | p->se.load.weight = WEIGHT_IDLEPRIO; | 1814 | load->weight = scale_load(WEIGHT_IDLEPRIO); |
1891 | p->se.load.inv_weight = WMULT_IDLEPRIO; | 1815 | load->inv_weight = WMULT_IDLEPRIO; |
1892 | return; | 1816 | return; |
1893 | } | 1817 | } |
1894 | 1818 | ||
1895 | p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; | 1819 | load->weight = scale_load(prio_to_weight[prio]); |
1896 | p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; | 1820 | load->inv_weight = prio_to_wmult[prio]; |
1897 | } | 1821 | } |
1898 | 1822 | ||
1899 | static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | 1823 | static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) |
@@ -1901,7 +1825,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | |||
1901 | update_rq_clock(rq); | 1825 | update_rq_clock(rq); |
1902 | sched_info_queued(p); | 1826 | sched_info_queued(p); |
1903 | p->sched_class->enqueue_task(rq, p, flags); | 1827 | p->sched_class->enqueue_task(rq, p, flags); |
1904 | p->se.on_rq = 1; | ||
1905 | } | 1828 | } |
1906 | 1829 | ||
1907 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | 1830 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) |
@@ -1909,7 +1832,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | |||
1909 | update_rq_clock(rq); | 1832 | update_rq_clock(rq); |
1910 | sched_info_dequeued(p); | 1833 | sched_info_dequeued(p); |
1911 | p->sched_class->dequeue_task(rq, p, flags); | 1834 | p->sched_class->dequeue_task(rq, p, flags); |
1912 | p->se.on_rq = 0; | ||
1913 | } | 1835 | } |
1914 | 1836 | ||
1915 | /* | 1837 | /* |
@@ -1936,14 +1858,227 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
1936 | dec_nr_running(rq); | 1858 | dec_nr_running(rq); |
1937 | } | 1859 | } |
1938 | 1860 | ||
1861 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
1862 | |||
1863 | /* | ||
1864 | * There are no locks covering percpu hardirq/softirq time. | ||
1865 | * They are only modified in account_system_vtime, on corresponding CPU | ||
1866 | * with interrupts disabled. So, writes are safe. | ||
1867 | * They are read and saved off onto struct rq in update_rq_clock(). | ||
1868 | * This may result in other CPU reading this CPU's irq time and can | ||
1869 | * race with irq/account_system_vtime on this CPU. We would either get old | ||
1870 | * or new value with a side effect of accounting a slice of irq time to wrong | ||
1871 | * task when irq is in progress while we read rq->clock. That is a worthy | ||
1872 | * compromise in place of having locks on each irq in account_system_time. | ||
1873 | */ | ||
1874 | static DEFINE_PER_CPU(u64, cpu_hardirq_time); | ||
1875 | static DEFINE_PER_CPU(u64, cpu_softirq_time); | ||
1876 | |||
1877 | static DEFINE_PER_CPU(u64, irq_start_time); | ||
1878 | static int sched_clock_irqtime; | ||
1879 | |||
1880 | void enable_sched_clock_irqtime(void) | ||
1881 | { | ||
1882 | sched_clock_irqtime = 1; | ||
1883 | } | ||
1884 | |||
1885 | void disable_sched_clock_irqtime(void) | ||
1886 | { | ||
1887 | sched_clock_irqtime = 0; | ||
1888 | } | ||
1889 | |||
1890 | #ifndef CONFIG_64BIT | ||
1891 | static DEFINE_PER_CPU(seqcount_t, irq_time_seq); | ||
1892 | |||
1893 | static inline void irq_time_write_begin(void) | ||
1894 | { | ||
1895 | __this_cpu_inc(irq_time_seq.sequence); | ||
1896 | smp_wmb(); | ||
1897 | } | ||
1898 | |||
1899 | static inline void irq_time_write_end(void) | ||
1900 | { | ||
1901 | smp_wmb(); | ||
1902 | __this_cpu_inc(irq_time_seq.sequence); | ||
1903 | } | ||
1904 | |||
1905 | static inline u64 irq_time_read(int cpu) | ||
1906 | { | ||
1907 | u64 irq_time; | ||
1908 | unsigned seq; | ||
1909 | |||
1910 | do { | ||
1911 | seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); | ||
1912 | irq_time = per_cpu(cpu_softirq_time, cpu) + | ||
1913 | per_cpu(cpu_hardirq_time, cpu); | ||
1914 | } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); | ||
1915 | |||
1916 | return irq_time; | ||
1917 | } | ||
1918 | #else /* CONFIG_64BIT */ | ||
1919 | static inline void irq_time_write_begin(void) | ||
1920 | { | ||
1921 | } | ||
1922 | |||
1923 | static inline void irq_time_write_end(void) | ||
1924 | { | ||
1925 | } | ||
1926 | |||
1927 | static inline u64 irq_time_read(int cpu) | ||
1928 | { | ||
1929 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | ||
1930 | } | ||
1931 | #endif /* CONFIG_64BIT */ | ||
1932 | |||
1933 | /* | ||
1934 | * Called before incrementing preempt_count on {soft,}irq_enter | ||
1935 | * and before decrementing preempt_count on {soft,}irq_exit. | ||
1936 | */ | ||
1937 | void account_system_vtime(struct task_struct *curr) | ||
1938 | { | ||
1939 | unsigned long flags; | ||
1940 | s64 delta; | ||
1941 | int cpu; | ||
1942 | |||
1943 | if (!sched_clock_irqtime) | ||
1944 | return; | ||
1945 | |||
1946 | local_irq_save(flags); | ||
1947 | |||
1948 | cpu = smp_processor_id(); | ||
1949 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); | ||
1950 | __this_cpu_add(irq_start_time, delta); | ||
1951 | |||
1952 | irq_time_write_begin(); | ||
1953 | /* | ||
1954 | * We do not account for softirq time from ksoftirqd here. | ||
1955 | * We want to continue accounting softirq time to ksoftirqd thread | ||
1956 | * in that case, so as not to confuse scheduler with a special task | ||
1957 | * that do not consume any time, but still wants to run. | ||
1958 | */ | ||
1959 | if (hardirq_count()) | ||
1960 | __this_cpu_add(cpu_hardirq_time, delta); | ||
1961 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) | ||
1962 | __this_cpu_add(cpu_softirq_time, delta); | ||
1963 | |||
1964 | irq_time_write_end(); | ||
1965 | local_irq_restore(flags); | ||
1966 | } | ||
1967 | EXPORT_SYMBOL_GPL(account_system_vtime); | ||
1968 | |||
1969 | static void update_rq_clock_task(struct rq *rq, s64 delta) | ||
1970 | { | ||
1971 | s64 irq_delta; | ||
1972 | |||
1973 | irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; | ||
1974 | |||
1975 | /* | ||
1976 | * Since irq_time is only updated on {soft,}irq_exit, we might run into | ||
1977 | * this case when a previous update_rq_clock() happened inside a | ||
1978 | * {soft,}irq region. | ||
1979 | * | ||
1980 | * When this happens, we stop ->clock_task and only update the | ||
1981 | * prev_irq_time stamp to account for the part that fit, so that a next | ||
1982 | * update will consume the rest. This ensures ->clock_task is | ||
1983 | * monotonic. | ||
1984 | * | ||
1985 | * It does however cause some slight miss-attribution of {soft,}irq | ||
1986 | * time, a more accurate solution would be to update the irq_time using | ||
1987 | * the current rq->clock timestamp, except that would require using | ||
1988 | * atomic ops. | ||
1989 | */ | ||
1990 | if (irq_delta > delta) | ||
1991 | irq_delta = delta; | ||
1992 | |||
1993 | rq->prev_irq_time += irq_delta; | ||
1994 | delta -= irq_delta; | ||
1995 | rq->clock_task += delta; | ||
1996 | |||
1997 | if (irq_delta && sched_feat(NONIRQ_POWER)) | ||
1998 | sched_rt_avg_update(rq, irq_delta); | ||
1999 | } | ||
2000 | |||
2001 | static int irqtime_account_hi_update(void) | ||
2002 | { | ||
2003 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
2004 | unsigned long flags; | ||
2005 | u64 latest_ns; | ||
2006 | int ret = 0; | ||
2007 | |||
2008 | local_irq_save(flags); | ||
2009 | latest_ns = this_cpu_read(cpu_hardirq_time); | ||
2010 | if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) | ||
2011 | ret = 1; | ||
2012 | local_irq_restore(flags); | ||
2013 | return ret; | ||
2014 | } | ||
2015 | |||
2016 | static int irqtime_account_si_update(void) | ||
2017 | { | ||
2018 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
2019 | unsigned long flags; | ||
2020 | u64 latest_ns; | ||
2021 | int ret = 0; | ||
2022 | |||
2023 | local_irq_save(flags); | ||
2024 | latest_ns = this_cpu_read(cpu_softirq_time); | ||
2025 | if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) | ||
2026 | ret = 1; | ||
2027 | local_irq_restore(flags); | ||
2028 | return ret; | ||
2029 | } | ||
2030 | |||
2031 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
2032 | |||
2033 | #define sched_clock_irqtime (0) | ||
2034 | |||
2035 | static void update_rq_clock_task(struct rq *rq, s64 delta) | ||
2036 | { | ||
2037 | rq->clock_task += delta; | ||
2038 | } | ||
2039 | |||
2040 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
2041 | |||
1939 | #include "sched_idletask.c" | 2042 | #include "sched_idletask.c" |
1940 | #include "sched_fair.c" | 2043 | #include "sched_fair.c" |
1941 | #include "sched_rt.c" | 2044 | #include "sched_rt.c" |
2045 | #include "sched_autogroup.c" | ||
2046 | #include "sched_stoptask.c" | ||
1942 | #include "../litmus/sched_litmus.c" | 2047 | #include "../litmus/sched_litmus.c" |
1943 | #ifdef CONFIG_SCHED_DEBUG | 2048 | #ifdef CONFIG_SCHED_DEBUG |
1944 | # include "sched_debug.c" | 2049 | # include "sched_debug.c" |
1945 | #endif | 2050 | #endif |
1946 | 2051 | ||
2052 | void sched_set_stop_task(int cpu, struct task_struct *stop) | ||
2053 | { | ||
2054 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; | ||
2055 | struct task_struct *old_stop = cpu_rq(cpu)->stop; | ||
2056 | |||
2057 | if (stop) { | ||
2058 | /* | ||
2059 | * Make it appear like a SCHED_FIFO task, its something | ||
2060 | * userspace knows about and won't get confused about. | ||
2061 | * | ||
2062 | * Also, it will make PI more or less work without too | ||
2063 | * much confusion -- but then, stop work should not | ||
2064 | * rely on PI working anyway. | ||
2065 | */ | ||
2066 | sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); | ||
2067 | |||
2068 | stop->sched_class = &stop_sched_class; | ||
2069 | } | ||
2070 | |||
2071 | cpu_rq(cpu)->stop = stop; | ||
2072 | |||
2073 | if (old_stop) { | ||
2074 | /* | ||
2075 | * Reset it back to a normal scheduling class so that | ||
2076 | * it can die in pieces. | ||
2077 | */ | ||
2078 | old_stop->sched_class = &rt_sched_class; | ||
2079 | } | ||
2080 | } | ||
2081 | |||
1947 | /* | 2082 | /* |
1948 | * __normal_prio - return the priority that is based on the static prio | 2083 | * __normal_prio - return the priority that is based on the static prio |
1949 | */ | 2084 | */ |
@@ -2001,14 +2136,43 @@ inline int task_curr(const struct task_struct *p) | |||
2001 | 2136 | ||
2002 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, | 2137 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, |
2003 | const struct sched_class *prev_class, | 2138 | const struct sched_class *prev_class, |
2004 | int oldprio, int running) | 2139 | int oldprio) |
2005 | { | 2140 | { |
2006 | if (prev_class != p->sched_class) { | 2141 | if (prev_class != p->sched_class) { |
2007 | if (prev_class->switched_from) | 2142 | if (prev_class->switched_from) |
2008 | prev_class->switched_from(rq, p, running); | 2143 | prev_class->switched_from(rq, p); |
2009 | p->sched_class->switched_to(rq, p, running); | 2144 | p->sched_class->switched_to(rq, p); |
2010 | } else | 2145 | } else if (oldprio != p->prio) |
2011 | p->sched_class->prio_changed(rq, p, oldprio, running); | 2146 | p->sched_class->prio_changed(rq, p, oldprio); |
2147 | } | ||
2148 | |||
2149 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | ||
2150 | { | ||
2151 | const struct sched_class *class; | ||
2152 | |||
2153 | if (p->sched_class == rq->curr->sched_class) { | ||
2154 | rq->curr->sched_class->check_preempt_curr(rq, p, flags); | ||
2155 | } else { | ||
2156 | for_each_class(class) { | ||
2157 | if (class == rq->curr->sched_class) | ||
2158 | break; | ||
2159 | if (class == p->sched_class) { | ||
2160 | resched_task(rq->curr); | ||
2161 | break; | ||
2162 | } | ||
2163 | } | ||
2164 | } | ||
2165 | |||
2166 | /* | ||
2167 | * A queue event has occurred, and we're going to schedule. In | ||
2168 | * this case, we can save a useless back to back clock update. | ||
2169 | */ | ||
2170 | /* LITMUS^RT: | ||
2171 | * The "disable-clock-update" approach was buggy in Linux 2.6.36. | ||
2172 | * The issue has been solved in 2.6.37. | ||
2173 | */ | ||
2174 | if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) | ||
2175 | rq->skip_clock_update = 1; | ||
2012 | } | 2176 | } |
2013 | 2177 | ||
2014 | #ifdef CONFIG_SMP | 2178 | #ifdef CONFIG_SMP |
@@ -2023,6 +2187,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
2023 | if (p->sched_class != &fair_sched_class) | 2187 | if (p->sched_class != &fair_sched_class) |
2024 | return 0; | 2188 | return 0; |
2025 | 2189 | ||
2190 | if (unlikely(p->policy == SCHED_IDLE)) | ||
2191 | return 0; | ||
2192 | |||
2026 | /* | 2193 | /* |
2027 | * Buddy candidates are cache hot: | 2194 | * Buddy candidates are cache hot: |
2028 | */ | 2195 | */ |
@@ -2050,6 +2217,21 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
2050 | */ | 2217 | */ |
2051 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && | 2218 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && |
2052 | !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); | 2219 | !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); |
2220 | |||
2221 | #ifdef CONFIG_LOCKDEP | ||
2222 | /* | ||
2223 | * The caller should hold either p->pi_lock or rq->lock, when changing | ||
2224 | * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. | ||
2225 | * | ||
2226 | * sched_move_task() holds both and thus holding either pins the cgroup, | ||
2227 | * see set_task_rq(). | ||
2228 | * | ||
2229 | * Furthermore, all task_rq users should acquire both locks, see | ||
2230 | * task_rq_lock(). | ||
2231 | */ | ||
2232 | WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || | ||
2233 | lockdep_is_held(&task_rq(p)->lock))); | ||
2234 | #endif | ||
2053 | #endif | 2235 | #endif |
2054 | 2236 | ||
2055 | trace_sched_migrate_task(p, new_cpu); | 2237 | trace_sched_migrate_task(p, new_cpu); |
@@ -2070,21 +2252,6 @@ struct migration_arg { | |||
2070 | static int migration_cpu_stop(void *data); | 2252 | static int migration_cpu_stop(void *data); |
2071 | 2253 | ||
2072 | /* | 2254 | /* |
2073 | * The task's runqueue lock must be held. | ||
2074 | * Returns true if you have to wait for migration thread. | ||
2075 | */ | ||
2076 | static bool migrate_task(struct task_struct *p, int dest_cpu) | ||
2077 | { | ||
2078 | struct rq *rq = task_rq(p); | ||
2079 | |||
2080 | /* | ||
2081 | * If the task is not on a runqueue (and not running), then | ||
2082 | * the next wake-up will properly place the task. | ||
2083 | */ | ||
2084 | return p->se.on_rq || task_running(rq, p); | ||
2085 | } | ||
2086 | |||
2087 | /* | ||
2088 | * wait_task_inactive - wait for a thread to unschedule. | 2255 | * wait_task_inactive - wait for a thread to unschedule. |
2089 | * | 2256 | * |
2090 | * If @match_state is nonzero, it's the @p->state value just checked and | 2257 | * If @match_state is nonzero, it's the @p->state value just checked and |
@@ -2141,11 +2308,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
2141 | rq = task_rq_lock(p, &flags); | 2308 | rq = task_rq_lock(p, &flags); |
2142 | trace_sched_wait_task(p); | 2309 | trace_sched_wait_task(p); |
2143 | running = task_running(rq, p); | 2310 | running = task_running(rq, p); |
2144 | on_rq = p->se.on_rq; | 2311 | on_rq = p->on_rq; |
2145 | ncsw = 0; | 2312 | ncsw = 0; |
2146 | if (!match_state || p->state == match_state) | 2313 | if (!match_state || p->state == match_state) |
2147 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ | 2314 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ |
2148 | task_rq_unlock(rq, &flags); | 2315 | task_rq_unlock(rq, p, &flags); |
2149 | 2316 | ||
2150 | /* | 2317 | /* |
2151 | * If it changed from the expected state, bail out now. | 2318 | * If it changed from the expected state, bail out now. |
@@ -2174,7 +2341,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
2174 | * yield - it could be a while. | 2341 | * yield - it could be a while. |
2175 | */ | 2342 | */ |
2176 | if (unlikely(on_rq)) { | 2343 | if (unlikely(on_rq)) { |
2177 | schedule_timeout_uninterruptible(1); | 2344 | ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); |
2345 | |||
2346 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
2347 | schedule_hrtimeout(&to, HRTIMER_MODE_REL); | ||
2178 | continue; | 2348 | continue; |
2179 | } | 2349 | } |
2180 | 2350 | ||
@@ -2196,7 +2366,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
2196 | * Cause a process which is running on another CPU to enter | 2366 | * Cause a process which is running on another CPU to enter |
2197 | * kernel-mode, without any delay. (to get signals handled.) | 2367 | * kernel-mode, without any delay. (to get signals handled.) |
2198 | * | 2368 | * |
2199 | * NOTE: this function doesnt have to take the runqueue lock, | 2369 | * NOTE: this function doesn't have to take the runqueue lock, |
2200 | * because all it wants to ensure is that the remote task enters | 2370 | * because all it wants to ensure is that the remote task enters |
2201 | * the kernel. If the IPI races and the task has been migrated | 2371 | * the kernel. If the IPI races and the task has been migrated |
2202 | * to another CPU then no harm is done and the purpose has been | 2372 | * to another CPU then no harm is done and the purpose has been |
@@ -2215,30 +2385,9 @@ void kick_process(struct task_struct *p) | |||
2215 | EXPORT_SYMBOL_GPL(kick_process); | 2385 | EXPORT_SYMBOL_GPL(kick_process); |
2216 | #endif /* CONFIG_SMP */ | 2386 | #endif /* CONFIG_SMP */ |
2217 | 2387 | ||
2218 | /** | ||
2219 | * task_oncpu_function_call - call a function on the cpu on which a task runs | ||
2220 | * @p: the task to evaluate | ||
2221 | * @func: the function to be called | ||
2222 | * @info: the function call argument | ||
2223 | * | ||
2224 | * Calls the function @func when the task is currently running. This might | ||
2225 | * be on the current CPU, which just calls the function directly | ||
2226 | */ | ||
2227 | void task_oncpu_function_call(struct task_struct *p, | ||
2228 | void (*func) (void *info), void *info) | ||
2229 | { | ||
2230 | int cpu; | ||
2231 | |||
2232 | preempt_disable(); | ||
2233 | cpu = task_cpu(p); | ||
2234 | if (task_curr(p)) | ||
2235 | smp_call_function_single(cpu, func, info, 1); | ||
2236 | preempt_enable(); | ||
2237 | } | ||
2238 | |||
2239 | #ifdef CONFIG_SMP | 2388 | #ifdef CONFIG_SMP |
2240 | /* | 2389 | /* |
2241 | * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. | 2390 | * ->cpus_allowed is protected by both rq->lock and p->pi_lock |
2242 | */ | 2391 | */ |
2243 | static int select_fallback_rq(int cpu, struct task_struct *p) | 2392 | static int select_fallback_rq(int cpu, struct task_struct *p) |
2244 | { | 2393 | { |
@@ -2256,30 +2405,27 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
2256 | return dest_cpu; | 2405 | return dest_cpu; |
2257 | 2406 | ||
2258 | /* No more Mr. Nice Guy. */ | 2407 | /* No more Mr. Nice Guy. */ |
2259 | if (unlikely(dest_cpu >= nr_cpu_ids)) { | 2408 | dest_cpu = cpuset_cpus_allowed_fallback(p); |
2260 | dest_cpu = cpuset_cpus_allowed_fallback(p); | 2409 | /* |
2261 | /* | 2410 | * Don't tell them about moving exiting tasks or |
2262 | * Don't tell them about moving exiting tasks or | 2411 | * kernel threads (both mm NULL), since they never |
2263 | * kernel threads (both mm NULL), since they never | 2412 | * leave kernel. |
2264 | * leave kernel. | 2413 | */ |
2265 | */ | 2414 | if (p->mm && printk_ratelimit()) { |
2266 | if (p->mm && printk_ratelimit()) { | 2415 | printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", |
2267 | printk(KERN_INFO "process %d (%s) no " | 2416 | task_pid_nr(p), p->comm, cpu); |
2268 | "longer affine to cpu%d\n", | ||
2269 | task_pid_nr(p), p->comm, cpu); | ||
2270 | } | ||
2271 | } | 2417 | } |
2272 | 2418 | ||
2273 | return dest_cpu; | 2419 | return dest_cpu; |
2274 | } | 2420 | } |
2275 | 2421 | ||
2276 | /* | 2422 | /* |
2277 | * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. | 2423 | * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. |
2278 | */ | 2424 | */ |
2279 | static inline | 2425 | static inline |
2280 | int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) | 2426 | int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) |
2281 | { | 2427 | { |
2282 | int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); | 2428 | int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); |
2283 | 2429 | ||
2284 | /* | 2430 | /* |
2285 | * In order not to call set_task_cpu() on a blocking task we need | 2431 | * In order not to call set_task_cpu() on a blocking task we need |
@@ -2305,27 +2451,63 @@ static void update_avg(u64 *avg, u64 sample) | |||
2305 | } | 2451 | } |
2306 | #endif | 2452 | #endif |
2307 | 2453 | ||
2308 | static inline void ttwu_activate(struct task_struct *p, struct rq *rq, | 2454 | static void |
2309 | bool is_sync, bool is_migrate, bool is_local, | 2455 | ttwu_stat(struct task_struct *p, int cpu, int wake_flags) |
2310 | unsigned long en_flags) | ||
2311 | { | 2456 | { |
2312 | schedstat_inc(p, se.statistics.nr_wakeups); | 2457 | #ifdef CONFIG_SCHEDSTATS |
2313 | if (is_sync) | 2458 | struct rq *rq = this_rq(); |
2314 | schedstat_inc(p, se.statistics.nr_wakeups_sync); | 2459 | |
2315 | if (is_migrate) | 2460 | #ifdef CONFIG_SMP |
2316 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); | 2461 | int this_cpu = smp_processor_id(); |
2317 | if (is_local) | 2462 | |
2463 | if (cpu == this_cpu) { | ||
2464 | schedstat_inc(rq, ttwu_local); | ||
2318 | schedstat_inc(p, se.statistics.nr_wakeups_local); | 2465 | schedstat_inc(p, se.statistics.nr_wakeups_local); |
2319 | else | 2466 | } else { |
2467 | struct sched_domain *sd; | ||
2468 | |||
2320 | schedstat_inc(p, se.statistics.nr_wakeups_remote); | 2469 | schedstat_inc(p, se.statistics.nr_wakeups_remote); |
2470 | rcu_read_lock(); | ||
2471 | for_each_domain(this_cpu, sd) { | ||
2472 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
2473 | schedstat_inc(sd, ttwu_wake_remote); | ||
2474 | break; | ||
2475 | } | ||
2476 | } | ||
2477 | rcu_read_unlock(); | ||
2478 | } | ||
2479 | |||
2480 | if (wake_flags & WF_MIGRATED) | ||
2481 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); | ||
2482 | |||
2483 | #endif /* CONFIG_SMP */ | ||
2484 | |||
2485 | schedstat_inc(rq, ttwu_count); | ||
2486 | schedstat_inc(p, se.statistics.nr_wakeups); | ||
2487 | |||
2488 | if (wake_flags & WF_SYNC) | ||
2489 | schedstat_inc(p, se.statistics.nr_wakeups_sync); | ||
2321 | 2490 | ||
2491 | #endif /* CONFIG_SCHEDSTATS */ | ||
2492 | } | ||
2493 | |||
2494 | static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) | ||
2495 | { | ||
2322 | activate_task(rq, p, en_flags); | 2496 | activate_task(rq, p, en_flags); |
2497 | p->on_rq = 1; | ||
2498 | |||
2499 | /* if a worker is waking up, notify workqueue */ | ||
2500 | if (p->flags & PF_WQ_WORKER) | ||
2501 | wq_worker_waking_up(p, cpu_of(rq)); | ||
2323 | } | 2502 | } |
2324 | 2503 | ||
2325 | static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | 2504 | /* |
2326 | int wake_flags, bool success) | 2505 | * Mark the task runnable and perform wakeup-preemption. |
2506 | */ | ||
2507 | static void | ||
2508 | ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) | ||
2327 | { | 2509 | { |
2328 | trace_sched_wakeup(p, success); | 2510 | trace_sched_wakeup(p, true); |
2329 | check_preempt_curr(rq, p, wake_flags); | 2511 | check_preempt_curr(rq, p, wake_flags); |
2330 | 2512 | ||
2331 | p->state = TASK_RUNNING; | 2513 | p->state = TASK_RUNNING; |
@@ -2344,9 +2526,151 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | |||
2344 | rq->idle_stamp = 0; | 2526 | rq->idle_stamp = 0; |
2345 | } | 2527 | } |
2346 | #endif | 2528 | #endif |
2347 | /* if a worker is waking up, notify workqueue */ | 2529 | } |
2348 | if ((p->flags & PF_WQ_WORKER) && success) | 2530 | |
2349 | wq_worker_waking_up(p, cpu_of(rq)); | 2531 | static void |
2532 | ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) | ||
2533 | { | ||
2534 | #ifdef CONFIG_SMP | ||
2535 | if (p->sched_contributes_to_load) | ||
2536 | rq->nr_uninterruptible--; | ||
2537 | #endif | ||
2538 | |||
2539 | ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); | ||
2540 | ttwu_do_wakeup(rq, p, wake_flags); | ||
2541 | } | ||
2542 | |||
2543 | /* | ||
2544 | * Called in case the task @p isn't fully descheduled from its runqueue; | ||
2545 | * in this case we must do a remote wakeup. It's a 'light' wakeup though, | ||
2546 | * since all we need to do is flip p->state to TASK_RUNNING, as | ||
2547 | * the task is still ->on_rq. | ||
2548 | */ | ||
2549 | static int ttwu_remote(struct task_struct *p, int wake_flags) | ||
2550 | { | ||
2551 | struct rq *rq; | ||
2552 | int ret = 0; | ||
2553 | |||
2554 | rq = __task_rq_lock(p); | ||
2555 | if (p->on_rq) { | ||
2556 | ttwu_do_wakeup(rq, p, wake_flags); | ||
2557 | ret = 1; | ||
2558 | } | ||
2559 | __task_rq_unlock(rq); | ||
2560 | |||
2561 | return ret; | ||
2562 | } | ||
2563 | |||
2564 | #ifdef CONFIG_SMP | ||
2565 | static void sched_ttwu_do_pending(struct task_struct *list) | ||
2566 | { | ||
2567 | struct rq *rq = this_rq(); | ||
2568 | |||
2569 | raw_spin_lock(&rq->lock); | ||
2570 | |||
2571 | while (list) { | ||
2572 | struct task_struct *p = list; | ||
2573 | list = list->wake_entry; | ||
2574 | ttwu_do_activate(rq, p, 0); | ||
2575 | } | ||
2576 | |||
2577 | raw_spin_unlock(&rq->lock); | ||
2578 | } | ||
2579 | |||
2580 | #ifdef CONFIG_HOTPLUG_CPU | ||
2581 | |||
2582 | static void sched_ttwu_pending(void) | ||
2583 | { | ||
2584 | struct rq *rq = this_rq(); | ||
2585 | struct task_struct *list = xchg(&rq->wake_list, NULL); | ||
2586 | |||
2587 | if (!list) | ||
2588 | return; | ||
2589 | |||
2590 | sched_ttwu_do_pending(list); | ||
2591 | } | ||
2592 | |||
2593 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
2594 | |||
2595 | void scheduler_ipi(void) | ||
2596 | { | ||
2597 | struct rq *rq = this_rq(); | ||
2598 | struct task_struct *list = xchg(&rq->wake_list, NULL); | ||
2599 | |||
2600 | if (!list) | ||
2601 | return; | ||
2602 | |||
2603 | /* | ||
2604 | * Not all reschedule IPI handlers call irq_enter/irq_exit, since | ||
2605 | * traditionally all their work was done from the interrupt return | ||
2606 | * path. Now that we actually do some work, we need to make sure | ||
2607 | * we do call them. | ||
2608 | * | ||
2609 | * Some archs already do call them, luckily irq_enter/exit nest | ||
2610 | * properly. | ||
2611 | * | ||
2612 | * Arguably we should visit all archs and update all handlers, | ||
2613 | * however a fair share of IPIs are still resched-only, so this would | ||
2614 | * somewhat pessimize the simple resched case. | ||
2615 | */ | ||
2616 | irq_enter(); | ||
2617 | sched_ttwu_do_pending(list); | ||
2618 | irq_exit(); | ||
2619 | } | ||
2620 | |||
2621 | static void ttwu_queue_remote(struct task_struct *p, int cpu) | ||
2622 | { | ||
2623 | struct rq *rq = cpu_rq(cpu); | ||
2624 | struct task_struct *next = rq->wake_list; | ||
2625 | |||
2626 | for (;;) { | ||
2627 | struct task_struct *old = next; | ||
2628 | |||
2629 | p->wake_entry = next; | ||
2630 | next = cmpxchg(&rq->wake_list, old, p); | ||
2631 | if (next == old) | ||
2632 | break; | ||
2633 | } | ||
2634 | |||
2635 | if (!next) | ||
2636 | smp_send_reschedule(cpu); | ||
2637 | } | ||
2638 | |||
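The cmpxchg loop in ttwu_queue_remote() above is a lock-free push onto the rq->wake_list pending list; only the pusher that finds the list empty sends the reschedule IPI. Below is a minimal user-space sketch of the same pattern using GCC's __sync_val_compare_and_swap builtin; all names here (pending_head, push_pending) are illustrative, not kernel APIs.

/*
 * Minimal user-space sketch of the cmpxchg push used by ttwu_queue_remote();
 * only the pattern matches the hunk above. Single-threaded demo, a real
 * user would call push_pending() concurrently from several threads.
 */
#include <stddef.h>
#include <stdio.h>

struct node {
        struct node *next;
        int val;
};

static struct node *pending_head;       /* analogous to rq->wake_list */

/* Returns nonzero if the list was empty, i.e. this caller must "send the IPI". */
static int push_pending(struct node *n)
{
        struct node *next = pending_head;

        for (;;) {
                struct node *old = next;

                n->next = next;
                next = __sync_val_compare_and_swap(&pending_head, old, n);
                if (next == old)
                        break;          /* our push landed */
        }
        return next == NULL;
}

int main(void)
{
        struct node a = { .val = 1 }, b = { .val = 2 };

        printf("first push sends IPI? %d\n", push_pending(&a));        /* 1 */
        printf("second push sends IPI? %d\n", push_pending(&b));       /* 0 */
        return 0;
}

The property mirrored here is that only the empty-to-non-empty transition triggers the IPI; later wakeups simply piggy-back on the list the interrupted CPU will drain.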
2639 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
2640 | static int ttwu_activate_remote(struct task_struct *p, int wake_flags) | ||
2641 | { | ||
2642 | struct rq *rq; | ||
2643 | int ret = 0; | ||
2644 | |||
2645 | rq = __task_rq_lock(p); | ||
2646 | if (p->on_cpu) { | ||
2647 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); | ||
2648 | ttwu_do_wakeup(rq, p, wake_flags); | ||
2649 | ret = 1; | ||
2650 | } | ||
2651 | __task_rq_unlock(rq); | ||
2652 | |||
2653 | return ret; | ||
2654 | |||
2655 | } | ||
2656 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | ||
2657 | #endif /* CONFIG_SMP */ | ||
2658 | |||
2659 | static void ttwu_queue(struct task_struct *p, int cpu) | ||
2660 | { | ||
2661 | struct rq *rq = cpu_rq(cpu); | ||
2662 | |||
2663 | #if defined(CONFIG_SMP) | ||
2664 | if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { | ||
2665 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ | ||
2666 | ttwu_queue_remote(p, cpu); | ||
2667 | return; | ||
2668 | } | ||
2669 | #endif | ||
2670 | |||
2671 | raw_spin_lock(&rq->lock); | ||
2672 | ttwu_do_activate(rq, p, 0); | ||
2673 | raw_spin_unlock(&rq->lock); | ||
2350 | } | 2674 | } |
2351 | 2675 | ||
2352 | /** | 2676 | /** |
@@ -2364,97 +2688,79 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | |||
2364 | * Returns %true if @p was woken up, %false if it was already running | 2688 | * Returns %true if @p was woken up, %false if it was already running |
2365 | * or @state didn't match @p's state. | 2689 | * or @state didn't match @p's state. |
2366 | */ | 2690 | */ |
2367 | static int try_to_wake_up(struct task_struct *p, unsigned int state, | 2691 | static int |
2368 | int wake_flags) | 2692 | try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) |
2369 | { | 2693 | { |
2370 | int cpu, orig_cpu, this_cpu, success = 0; | ||
2371 | unsigned long flags; | 2694 | unsigned long flags; |
2372 | unsigned long en_flags = ENQUEUE_WAKEUP; | 2695 | int cpu, success = 0; |
2373 | struct rq *rq; | ||
2374 | 2696 | ||
2375 | if (is_realtime(p)) | 2697 | if (is_realtime(p)) |
2376 | TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state); | 2698 | TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state); |
2377 | 2699 | ||
2378 | this_cpu = get_cpu(); | ||
2379 | |||
2380 | smp_wmb(); | 2700 | smp_wmb(); |
2381 | rq = task_rq_lock(p, &flags); | 2701 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2382 | if (!(p->state & state)) | 2702 | if (!(p->state & state)) |
2383 | goto out; | 2703 | goto out; |
2384 | 2704 | ||
2385 | if (p->se.on_rq) | 2705 | success = 1; /* we're going to change ->state */ |
2386 | goto out_running; | ||
2387 | |||
2388 | cpu = task_cpu(p); | 2706 | cpu = task_cpu(p); |
2389 | orig_cpu = cpu; | ||
2390 | 2707 | ||
2391 | #ifdef CONFIG_SMP | 2708 | if (p->on_rq && ttwu_remote(p, wake_flags)) |
2392 | if (unlikely(task_running(rq, p)) || is_realtime(p)) | 2709 | goto stat; |
2393 | goto out_activate; | ||
2394 | 2710 | ||
2711 | #ifdef CONFIG_SMP | ||
2395 | /* | 2712 | /* |
2396 | * In order to handle concurrent wakeups and release the rq->lock | 2713 | * If the owning (remote) cpu is still in the middle of schedule() with |
2397 | * we put the task in TASK_WAKING state. | 2714 | * this task as prev, wait until it's done referencing the task. |
2398 | * | ||
2399 | * First fix up the nr_uninterruptible count: | ||
2400 | */ | 2715 | */ |
2401 | if (task_contributes_to_load(p)) { | 2716 | while (p->on_cpu) { |
2402 | if (likely(cpu_online(orig_cpu))) | 2717 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
2403 | rq->nr_uninterruptible--; | 2718 | /* |
2404 | else | 2719 | * In case the architecture enables interrupts in |
2405 | this_rq()->nr_uninterruptible--; | 2720 | * context_switch(), we cannot busy wait, since that |
2406 | } | 2721 | * would lead to deadlocks when an interrupt hits and |
2407 | p->state = TASK_WAKING; | 2722 | * tries to wake up @prev. So bail and do a complete |
2408 | 2723 | * remote wakeup. | |
2409 | if (p->sched_class->task_waking) { | 2724 | */ |
2410 | p->sched_class->task_waking(rq, p); | 2725 | if (ttwu_activate_remote(p, wake_flags)) |
2411 | en_flags |= ENQUEUE_WAKING; | 2726 | goto stat; |
2727 | #else | ||
2728 | cpu_relax(); | ||
2729 | #endif | ||
2412 | } | 2730 | } |
2731 | /* | ||
2732 | * Pairs with the smp_wmb() in finish_lock_switch(). | ||
2733 | */ | ||
2734 | smp_rmb(); | ||
2413 | 2735 | ||
2414 | cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); | 2736 | /* LITMUS^RT: once the task can be safely referenced by this |
2415 | if (cpu != orig_cpu) | 2737 | * CPU, don't mess with Linux load balancing stuff. |
2416 | set_task_cpu(p, cpu); | 2738 | */ |
2417 | __task_rq_unlock(rq); | 2739 | if (is_realtime(p)) |
2740 | goto litmus_out_activate; | ||
2418 | 2741 | ||
2419 | rq = cpu_rq(cpu); | 2742 | p->sched_contributes_to_load = !!task_contributes_to_load(p); |
2420 | raw_spin_lock(&rq->lock); | 2743 | p->state = TASK_WAKING; |
2421 | 2744 | ||
2422 | /* | 2745 | if (p->sched_class->task_waking) |
2423 | * We migrated the task without holding either rq->lock, however | 2746 | p->sched_class->task_waking(p); |
2424 | * since the task is not on the task list itself, nobody else | ||
2425 | * will try and migrate the task, hence the rq should match the | ||
2426 | * cpu we just moved it to. | ||
2427 | */ | ||
2428 | WARN_ON(task_cpu(p) != cpu); | ||
2429 | WARN_ON(p->state != TASK_WAKING); | ||
2430 | 2747 | ||
2431 | #ifdef CONFIG_SCHEDSTATS | 2748 | cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); |
2432 | schedstat_inc(rq, ttwu_count); | 2749 | if (task_cpu(p) != cpu) { |
2433 | if (cpu == this_cpu) | 2750 | wake_flags |= WF_MIGRATED; |
2434 | schedstat_inc(rq, ttwu_local); | 2751 | set_task_cpu(p, cpu); |
2435 | else { | ||
2436 | struct sched_domain *sd; | ||
2437 | for_each_domain(this_cpu, sd) { | ||
2438 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
2439 | schedstat_inc(sd, ttwu_wake_remote); | ||
2440 | break; | ||
2441 | } | ||
2442 | } | ||
2443 | } | 2752 | } |
2444 | #endif /* CONFIG_SCHEDSTATS */ | ||
2445 | 2753 | ||
2446 | out_activate: | 2754 | litmus_out_activate: |
2447 | #endif /* CONFIG_SMP */ | 2755 | #endif /* CONFIG_SMP */ |
2448 | ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, | 2756 | |
2449 | cpu == this_cpu, en_flags); | 2757 | ttwu_queue(p, cpu); |
2450 | success = 1; | 2758 | stat: |
2451 | out_running: | 2759 | ttwu_stat(p, cpu, wake_flags); |
2452 | ttwu_post_activation(p, rq, wake_flags, success); | ||
2453 | out: | 2760 | out: |
2454 | if (is_realtime(p)) | 2761 | if (is_realtime(p)) |
2455 | TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state); | 2762 | TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state); |
2456 | task_rq_unlock(rq, &flags); | 2763 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
2457 | put_cpu(); | ||
2458 | 2764 | ||
2459 | return success; | 2765 | return success; |
2460 | } | 2766 | } |
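The while (p->on_cpu) spin above relies on a store/load ordering contract: the CPU that deschedules the task publishes its updates before clearing ->on_cpu (the smp_wmb() in finish_lock_switch()), and the waker issues smp_rmb() after observing ->on_cpu drop to zero so those updates are guaranteed visible. A rough stand-alone sketch of that contract in C11 atomics; the struct and both functions are illustrative only.

/*
 * Sketch of the ordering behind the p->on_cpu spin above, written with
 * C11 atomics; the real code uses plain loads/stores plus smp_wmb()/smp_rmb().
 */
#include <stdatomic.h>
#include <stdbool.h>

struct task {
        int state;                      /* updates that must become visible first */
        atomic_bool on_cpu;             /* cleared last by the descheduling CPU */
};

/* Descheduling CPU, roughly finish_lock_switch(): publish, then clear on_cpu. */
static void descheduling_cpu(struct task *p)
{
        p->state = 0;                                   /* prior updates */
        atomic_thread_fence(memory_order_release);      /* ~ smp_wmb() */
        atomic_store_explicit(&p->on_cpu, false, memory_order_relaxed);
}

/* Waking CPU, roughly the while (p->on_cpu) loop: spin, then order the reads. */
static void waking_cpu(struct task *p)
{
        while (atomic_load_explicit(&p->on_cpu, memory_order_relaxed))
                ;                                       /* ~ cpu_relax() */
        atomic_thread_fence(memory_order_acquire);      /* ~ smp_rmb() */
        /* p->state and friends are now safe to read. */
}

int main(void)
{
        struct task t = { .state = 1 };

        atomic_init(&t.on_cpu, true);
        descheduling_cpu(&t);           /* single-threaded demo of the two halves */
        waking_cpu(&t);
        return 0;
}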
@@ -2463,31 +2769,34 @@ out: | |||
2463 | * try_to_wake_up_local - try to wake up a local task with rq lock held | 2769 | * try_to_wake_up_local - try to wake up a local task with rq lock held |
2464 | * @p: the thread to be awakened | 2770 | * @p: the thread to be awakened |
2465 | * | 2771 | * |
2466 | * Put @p on the run-queue if it's not alredy there. The caller must | 2772 | * Put @p on the run-queue if it's not already there. The caller must |
2467 | * ensure that this_rq() is locked, @p is bound to this_rq() and not | 2773 | * ensure that this_rq() is locked, @p is bound to this_rq() and not |
2468 | * the current task. this_rq() stays locked over invocation. | 2774 | * the current task. |
2469 | */ | 2775 | */ |
2470 | static void try_to_wake_up_local(struct task_struct *p) | 2776 | static void try_to_wake_up_local(struct task_struct *p) |
2471 | { | 2777 | { |
2472 | struct rq *rq = task_rq(p); | 2778 | struct rq *rq = task_rq(p); |
2473 | bool success = false; | ||
2474 | 2779 | ||
2475 | BUG_ON(rq != this_rq()); | 2780 | BUG_ON(rq != this_rq()); |
2476 | BUG_ON(p == current); | 2781 | BUG_ON(p == current); |
2477 | lockdep_assert_held(&rq->lock); | 2782 | lockdep_assert_held(&rq->lock); |
2478 | 2783 | ||
2784 | if (!raw_spin_trylock(&p->pi_lock)) { | ||
2785 | raw_spin_unlock(&rq->lock); | ||
2786 | raw_spin_lock(&p->pi_lock); | ||
2787 | raw_spin_lock(&rq->lock); | ||
2788 | } | ||
2789 | |||
2479 | if (!(p->state & TASK_NORMAL)) | 2790 | if (!(p->state & TASK_NORMAL)) |
2480 | return; | 2791 | goto out; |
2481 | 2792 | ||
2482 | if (!p->se.on_rq) { | 2793 | if (!p->on_rq) |
2483 | if (likely(!task_running(rq, p))) { | 2794 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); |
2484 | schedstat_inc(rq, ttwu_count); | 2795 | |
2485 | schedstat_inc(rq, ttwu_local); | 2796 | ttwu_do_wakeup(rq, p, 0); |
2486 | } | 2797 | ttwu_stat(p, smp_processor_id(), 0); |
2487 | ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); | 2798 | out: |
2488 | success = true; | 2799 | raw_spin_unlock(&p->pi_lock); |
2489 | } | ||
2490 | ttwu_post_activation(p, rq, 0, success); | ||
2491 | } | 2800 | } |
2492 | 2801 | ||
2493 | /** | 2802 | /** |
@@ -2520,18 +2829,21 @@ int wake_up_state(struct task_struct *p, unsigned int state) | |||
2520 | */ | 2829 | */ |
2521 | static void __sched_fork(struct task_struct *p) | 2830 | static void __sched_fork(struct task_struct *p) |
2522 | { | 2831 | { |
2832 | p->on_rq = 0; | ||
2833 | |||
2834 | p->se.on_rq = 0; | ||
2523 | p->se.exec_start = 0; | 2835 | p->se.exec_start = 0; |
2524 | p->se.sum_exec_runtime = 0; | 2836 | p->se.sum_exec_runtime = 0; |
2525 | p->se.prev_sum_exec_runtime = 0; | 2837 | p->se.prev_sum_exec_runtime = 0; |
2526 | p->se.nr_migrations = 0; | 2838 | p->se.nr_migrations = 0; |
2839 | p->se.vruntime = 0; | ||
2840 | INIT_LIST_HEAD(&p->se.group_node); | ||
2527 | 2841 | ||
2528 | #ifdef CONFIG_SCHEDSTATS | 2842 | #ifdef CONFIG_SCHEDSTATS |
2529 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); | 2843 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
2530 | #endif | 2844 | #endif |
2531 | 2845 | ||
2532 | INIT_LIST_HEAD(&p->rt.run_list); | 2846 | INIT_LIST_HEAD(&p->rt.run_list); |
2533 | p->se.on_rq = 0; | ||
2534 | INIT_LIST_HEAD(&p->se.group_node); | ||
2535 | 2847 | ||
2536 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2848 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
2537 | INIT_HLIST_HEAD(&p->preempt_notifiers); | 2849 | INIT_HLIST_HEAD(&p->preempt_notifiers); |
@@ -2541,8 +2853,9 @@ static void __sched_fork(struct task_struct *p) | |||
2541 | /* | 2853 | /* |
2542 | * fork()/clone()-time setup: | 2854 | * fork()/clone()-time setup: |
2543 | */ | 2855 | */ |
2544 | void sched_fork(struct task_struct *p, int clone_flags) | 2856 | void sched_fork(struct task_struct *p) |
2545 | { | 2857 | { |
2858 | unsigned long flags; | ||
2546 | int cpu = get_cpu(); | 2859 | int cpu = get_cpu(); |
2547 | 2860 | ||
2548 | __sched_fork(p); | 2861 | __sched_fork(p); |
@@ -2594,22 +2907,24 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2594 | * | 2907 | * |
2595 | * Silence PROVE_RCU. | 2908 | * Silence PROVE_RCU. |
2596 | */ | 2909 | */ |
2597 | rcu_read_lock(); | 2910 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2598 | set_task_cpu(p, cpu); | 2911 | set_task_cpu(p, cpu); |
2599 | rcu_read_unlock(); | 2912 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
2600 | 2913 | ||
2601 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 2914 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
2602 | if (likely(sched_info_on())) | 2915 | if (likely(sched_info_on())) |
2603 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 2916 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
2604 | #endif | 2917 | #endif |
2605 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 2918 | #if defined(CONFIG_SMP) |
2606 | p->oncpu = 0; | 2919 | p->on_cpu = 0; |
2607 | #endif | 2920 | #endif |
2608 | #ifdef CONFIG_PREEMPT | 2921 | #ifdef CONFIG_PREEMPT |
2609 | /* Want to start with kernel preemption disabled. */ | 2922 | /* Want to start with kernel preemption disabled. */ |
2610 | task_thread_info(p)->preempt_count = 1; | 2923 | task_thread_info(p)->preempt_count = 1; |
2611 | #endif | 2924 | #endif |
2925 | #ifdef CONFIG_SMP | ||
2612 | plist_node_init(&p->pushable_tasks, MAX_PRIO); | 2926 | plist_node_init(&p->pushable_tasks, MAX_PRIO); |
2927 | #endif | ||
2613 | 2928 | ||
2614 | put_cpu(); | 2929 | put_cpu(); |
2615 | } | 2930 | } |
@@ -2621,41 +2936,31 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2621 | * that must be done for every newly created context, then puts the task | 2936 | * that must be done for every newly created context, then puts the task |
2622 | * on the runqueue and wakes it. | 2937 | * on the runqueue and wakes it. |
2623 | */ | 2938 | */ |
2624 | void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | 2939 | void wake_up_new_task(struct task_struct *p) |
2625 | { | 2940 | { |
2626 | unsigned long flags; | 2941 | unsigned long flags; |
2627 | struct rq *rq; | 2942 | struct rq *rq; |
2628 | int cpu __maybe_unused = get_cpu(); | ||
2629 | 2943 | ||
2944 | raw_spin_lock_irqsave(&p->pi_lock, flags); | ||
2630 | #ifdef CONFIG_SMP | 2945 | #ifdef CONFIG_SMP |
2631 | rq = task_rq_lock(p, &flags); | ||
2632 | p->state = TASK_WAKING; | ||
2633 | |||
2634 | /* | 2946 | /* |
2635 | * Fork balancing, do it here and not earlier because: | 2947 | * Fork balancing, do it here and not earlier because: |
2636 | * - cpus_allowed can change in the fork path | 2948 | * - cpus_allowed can change in the fork path |
2637 | * - any previously selected cpu might disappear through hotplug | 2949 | * - any previously selected cpu might disappear through hotplug |
2638 | * | ||
2639 | * We set TASK_WAKING so that select_task_rq() can drop rq->lock | ||
2640 | * without people poking at ->cpus_allowed. | ||
2641 | */ | 2950 | */ |
2642 | cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); | 2951 | set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); |
2643 | set_task_cpu(p, cpu); | ||
2644 | |||
2645 | p->state = TASK_RUNNING; | ||
2646 | task_rq_unlock(rq, &flags); | ||
2647 | #endif | 2952 | #endif |
2648 | 2953 | ||
2649 | rq = task_rq_lock(p, &flags); | 2954 | rq = __task_rq_lock(p); |
2650 | activate_task(rq, p, 0); | 2955 | activate_task(rq, p, 0); |
2651 | trace_sched_wakeup_new(p, 1); | 2956 | p->on_rq = 1; |
2957 | trace_sched_wakeup_new(p, true); | ||
2652 | check_preempt_curr(rq, p, WF_FORK); | 2958 | check_preempt_curr(rq, p, WF_FORK); |
2653 | #ifdef CONFIG_SMP | 2959 | #ifdef CONFIG_SMP |
2654 | if (p->sched_class->task_woken) | 2960 | if (p->sched_class->task_woken) |
2655 | p->sched_class->task_woken(rq, p); | 2961 | p->sched_class->task_woken(rq, p); |
2656 | #endif | 2962 | #endif |
2657 | task_rq_unlock(rq, &flags); | 2963 | task_rq_unlock(rq, p, &flags); |
2658 | put_cpu(); | ||
2659 | } | 2964 | } |
2660 | 2965 | ||
2661 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2966 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
@@ -2733,9 +3038,12 @@ static inline void | |||
2733 | prepare_task_switch(struct rq *rq, struct task_struct *prev, | 3038 | prepare_task_switch(struct rq *rq, struct task_struct *prev, |
2734 | struct task_struct *next) | 3039 | struct task_struct *next) |
2735 | { | 3040 | { |
3041 | sched_info_switch(prev, next); | ||
3042 | perf_event_task_sched_out(prev, next); | ||
2736 | fire_sched_out_preempt_notifiers(prev, next); | 3043 | fire_sched_out_preempt_notifiers(prev, next); |
2737 | prepare_lock_switch(rq, next); | 3044 | prepare_lock_switch(rq, next); |
2738 | prepare_arch_switch(next); | 3045 | prepare_arch_switch(next); |
3046 | trace_sched_switch(prev, next); | ||
2739 | } | 3047 | } |
2740 | 3048 | ||
2741 | /** | 3049 | /** |
@@ -2879,7 +3187,7 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2879 | struct mm_struct *mm, *oldmm; | 3187 | struct mm_struct *mm, *oldmm; |
2880 | 3188 | ||
2881 | prepare_task_switch(rq, prev, next); | 3189 | prepare_task_switch(rq, prev, next); |
2882 | trace_sched_switch(prev, next); | 3190 | |
2883 | mm = next->mm; | 3191 | mm = next->mm; |
2884 | oldmm = prev->active_mm; | 3192 | oldmm = prev->active_mm; |
2885 | /* | 3193 | /* |
@@ -2889,14 +3197,14 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2889 | */ | 3197 | */ |
2890 | arch_start_context_switch(prev); | 3198 | arch_start_context_switch(prev); |
2891 | 3199 | ||
2892 | if (likely(!mm)) { | 3200 | if (!mm) { |
2893 | next->active_mm = oldmm; | 3201 | next->active_mm = oldmm; |
2894 | atomic_inc(&oldmm->mm_count); | 3202 | atomic_inc(&oldmm->mm_count); |
2895 | enter_lazy_tlb(oldmm, next); | 3203 | enter_lazy_tlb(oldmm, next); |
2896 | } else | 3204 | } else |
2897 | switch_mm(oldmm, mm, next); | 3205 | switch_mm(oldmm, mm, next); |
2898 | 3206 | ||
2899 | if (likely(!prev->mm)) { | 3207 | if (!prev->mm) { |
2900 | prev->active_mm = NULL; | 3208 | prev->active_mm = NULL; |
2901 | rq->prev_mm = oldmm; | 3209 | rq->prev_mm = oldmm; |
2902 | } | 3210 | } |
@@ -3011,6 +3319,15 @@ static long calc_load_fold_active(struct rq *this_rq) | |||
3011 | return delta; | 3319 | return delta; |
3012 | } | 3320 | } |
3013 | 3321 | ||
3322 | static unsigned long | ||
3323 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
3324 | { | ||
3325 | load *= exp; | ||
3326 | load += active * (FIXED_1 - exp); | ||
3327 | load += 1UL << (FSHIFT - 1); | ||
3328 | return load >> FSHIFT; | ||
3329 | } | ||
3330 | |||
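calc_load() above is a fixed-point exponential moving average with FSHIFT fractional bits; the added 1UL << (FSHIFT - 1) term rounds to nearest instead of truncating. A small stand-alone demo, restating the usual loadavg constants (FSHIFT = 11, FIXED_1 = 2048, EXP_1 = 1884 from include/linux/sched.h), that feeds a constant three runnable tasks and watches the 1-minute average converge:

/* Stand-alone check of the calc_load() average above; constants restated
 * from include/linux/sched.h for the demo. */
#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)         /* 1.0 in fixed point (2048) */
#define EXP_1   1884                    /* 1/exp(5s/1min) in fixed point */

static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        load += 1UL << (FSHIFT - 1);    /* round to nearest */
        return load >> FSHIFT;
}

int main(void)
{
        unsigned long avenrun = 0;              /* 1-min average, fixed point */
        unsigned long active = 3 * FIXED_1;     /* 3 runnable tasks */
        int i;

        /* Feed the same active count for a minute's worth of 5s periods. */
        for (i = 0; i < 12; i++) {
                avenrun = calc_load(avenrun, EXP_1, active);
                printf("period %2d: load = %lu.%02lu\n", i + 1,
                       avenrun >> FSHIFT,
                       ((avenrun & (FIXED_1 - 1)) * 100) >> FSHIFT);
        }
        return 0;
}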
3014 | #ifdef CONFIG_NO_HZ | 3331 | #ifdef CONFIG_NO_HZ |
3015 | /* | 3332 | /* |
3016 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. | 3333 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. |
@@ -3040,6 +3357,128 @@ static long calc_load_fold_idle(void) | |||
3040 | 3357 | ||
3041 | return delta; | 3358 | return delta; |
3042 | } | 3359 | } |
3360 | |||
3361 | /** | ||
3362 | * fixed_power_int - compute: x^n, in O(log n) time | ||
3363 | * | ||
3364 | * @x: base of the power | ||
3365 | * @frac_bits: fractional bits of @x | ||
3366 | * @n: power to raise @x to. | ||
3367 | * | ||
3368 | * By exploiting the relation between the definition of the natural power | ||
3369 | * function: x^n := x*x*...*x (x multiplied by itself for n times), and | ||
3370 | * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, | ||
3371 | * (where: n_i \elem {0, 1}, the binary vector representing n), | ||
3372 | * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is | ||
3373 | * of course trivially computable in O(log_2 n), the length of our binary | ||
3374 | * vector. | ||
3375 | */ | ||
3376 | static unsigned long | ||
3377 | fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) | ||
3378 | { | ||
3379 | unsigned long result = 1UL << frac_bits; | ||
3380 | |||
3381 | if (n) for (;;) { | ||
3382 | if (n & 1) { | ||
3383 | result *= x; | ||
3384 | result += 1UL << (frac_bits - 1); | ||
3385 | result >>= frac_bits; | ||
3386 | } | ||
3387 | n >>= 1; | ||
3388 | if (!n) | ||
3389 | break; | ||
3390 | x *= x; | ||
3391 | x += 1UL << (frac_bits - 1); | ||
3392 | x >>= frac_bits; | ||
3393 | } | ||
3394 | |||
3395 | return result; | ||
3396 | } | ||
3397 | |||
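fixed_power_int() above is square-and-multiply (binary exponentiation) in fixed point, rounding to nearest after every multiply, so computing e^n for n missed periods costs O(log n) multiplies. A quick cross-check against floating point; the function body is copied from the hunk above and the constants are the loadavg ones (build with -lm):

/* Cross-check of fixed_power_int() against pow(). */
#include <math.h>
#include <stdio.h>

static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
        unsigned long result = 1UL << frac_bits;

        if (n) for (;;) {
                if (n & 1) {
                        result *= x;
                        result += 1UL << (frac_bits - 1);
                        result >>= frac_bits;
                }
                n >>= 1;
                if (!n)
                        break;
                x *= x;
                x += 1UL << (frac_bits - 1);
                x >>= frac_bits;
        }

        return result;
}

int main(void)
{
        unsigned int n;

        /* EXP_1 = 1884 is 1/exp(5s/1min) with 11 fractional bits. */
        for (n = 1; n <= 16; n *= 2)
                printf("e^%-2u fixed=%4lu float=%7.1f\n", n,
                       fixed_power_int(1884, 11, n),
                       pow(1884.0 / 2048.0, n) * 2048.0);
        return 0;
}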
3398 | /* | ||
3399 | * a1 = a0 * e + a * (1 - e) | ||
3400 | * | ||
3401 | * a2 = a1 * e + a * (1 - e) | ||
3402 | * = (a0 * e + a * (1 - e)) * e + a * (1 - e) | ||
3403 | * = a0 * e^2 + a * (1 - e) * (1 + e) | ||
3404 | * | ||
3405 | * a3 = a2 * e + a * (1 - e) | ||
3406 | * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) | ||
3407 | * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) | ||
3408 | * | ||
3409 | * ... | ||
3410 | * | ||
3411 | * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^(n-1)) [1] | ||
3412 | * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) | ||
3413 | * = a0 * e^n + a * (1 - e^n) | ||
3414 | * | ||
3415 | * [1] application of the geometric series: | ||
3416 | * | ||
3417 | * | ||
3418 | *   S_n := \Sum_{i=0}^{n} x^i = (1 - x^(n+1)) / (1 - x) | ||
3419 | * | ||
3420 | */ | ||
3421 | static unsigned long | ||
3422 | calc_load_n(unsigned long load, unsigned long exp, | ||
3423 | unsigned long active, unsigned int n) | ||
3424 | { | ||
3425 | |||
3426 | return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); | ||
3427 | } | ||
3428 | |||
3429 | /* | ||
3430 | * NO_HZ can leave us missing all per-cpu ticks calling | ||
3431 | * calc_load_account_active(), but since an idle CPU folds its delta into | ||
3432 | * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold | ||
3433 | * in the pending idle delta if our idle period crossed a load cycle boundary. | ||
3434 | * | ||
3435 | * Once we've updated the global active value, we need to apply the exponential | ||
3436 | * weights adjusted to the number of cycles missed. | ||
3437 | */ | ||
3438 | static void calc_global_nohz(unsigned long ticks) | ||
3439 | { | ||
3440 | long delta, active, n; | ||
3441 | |||
3442 | if (time_before(jiffies, calc_load_update)) | ||
3443 | return; | ||
3444 | |||
3445 | /* | ||
3446 | * If we crossed a calc_load_update boundary, make sure to fold | ||
3447 | * any pending idle changes, the respective CPUs might have | ||
3448 | * missed the tick driven calc_load_account_active() update | ||
3449 | * due to NO_HZ. | ||
3450 | */ | ||
3451 | delta = calc_load_fold_idle(); | ||
3452 | if (delta) | ||
3453 | atomic_long_add(delta, &calc_load_tasks); | ||
3454 | |||
3455 | /* | ||
3456 | * If we were idle for multiple load cycles, apply them. | ||
3457 | */ | ||
3458 | if (ticks >= LOAD_FREQ) { | ||
3459 | n = ticks / LOAD_FREQ; | ||
3460 | |||
3461 | active = atomic_long_read(&calc_load_tasks); | ||
3462 | active = active > 0 ? active * FIXED_1 : 0; | ||
3463 | |||
3464 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | ||
3465 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | ||
3466 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
3467 | |||
3468 | calc_load_update += n * LOAD_FREQ; | ||
3469 | } | ||
3470 | |||
3471 | /* | ||
3472 | * It's possible the remainder of the above division also crosses | ||
3473 | * a LOAD_FREQ period; the regular check in calc_global_load(), | ||
3474 | * which comes after this, will take care of that. | ||
3475 | * | ||
3476 | * Consider us being 11 ticks before a cycle completion, and us | ||
3477 | * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will | ||
3478 | * age us 4 cycles, and the test in calc_global_load() will | ||
3479 | * pick up the final one. | ||
3480 | */ | ||
3481 | } | ||
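Putting numbers on the comment above: with HZ = 1000, LOAD_FREQ is 5*HZ + 1 = 5001 ticks, so sleeping for 4*LOAD_FREQ + 22 ticks folds four whole cycles here and leaves 22 ticks for the next regular calc_global_load() check. A tiny sketch of that arithmetic (the HZ value is assumed only for the demo):

/* Numeric illustration of the "missed cycles" handling above. */
#include <stdio.h>

#define HZ              1000
#define LOAD_FREQ       (5 * HZ + 1)

int main(void)
{
        unsigned long ticks = 4 * LOAD_FREQ + 22;       /* idle for ~20s + 22 ticks */
        unsigned long n = ticks / LOAD_FREQ;

        printf("idle ticks=%lu -> fold %lu whole cycles, %lu ticks left over\n",
               ticks, n, ticks % LOAD_FREQ);
        printf("calc_load_update advances by %lu ticks\n", n * LOAD_FREQ);
        return 0;
}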
3043 | #else | 3482 | #else |
3044 | static void calc_load_account_idle(struct rq *this_rq) | 3483 | static void calc_load_account_idle(struct rq *this_rq) |
3045 | { | 3484 | { |
@@ -3049,6 +3488,10 @@ static inline long calc_load_fold_idle(void) | |||
3049 | { | 3488 | { |
3050 | return 0; | 3489 | return 0; |
3051 | } | 3490 | } |
3491 | |||
3492 | static void calc_global_nohz(unsigned long ticks) | ||
3493 | { | ||
3494 | } | ||
3052 | #endif | 3495 | #endif |
3053 | 3496 | ||
3054 | /** | 3497 | /** |
@@ -3066,24 +3509,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | |||
3066 | loads[2] = (avenrun[2] + offset) << shift; | 3509 | loads[2] = (avenrun[2] + offset) << shift; |
3067 | } | 3510 | } |
3068 | 3511 | ||
3069 | static unsigned long | ||
3070 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
3071 | { | ||
3072 | load *= exp; | ||
3073 | load += active * (FIXED_1 - exp); | ||
3074 | return load >> FSHIFT; | ||
3075 | } | ||
3076 | |||
3077 | /* | 3512 | /* |
3078 | * calc_load - update the avenrun load estimates 10 ticks after the | 3513 | * calc_load - update the avenrun load estimates 10 ticks after the |
3079 | * CPUs have updated calc_load_tasks. | 3514 | * CPUs have updated calc_load_tasks. |
3080 | */ | 3515 | */ |
3081 | void calc_global_load(void) | 3516 | void calc_global_load(unsigned long ticks) |
3082 | { | 3517 | { |
3083 | unsigned long upd = calc_load_update + 10; | ||
3084 | long active; | 3518 | long active; |
3085 | 3519 | ||
3086 | if (time_before(jiffies, upd)) | 3520 | calc_global_nohz(ticks); |
3521 | |||
3522 | if (time_before(jiffies, calc_load_update + 10)) | ||
3087 | return; | 3523 | return; |
3088 | 3524 | ||
3089 | active = atomic_long_read(&calc_load_tasks); | 3525 | active = atomic_long_read(&calc_load_tasks); |
@@ -3244,27 +3680,22 @@ void sched_exec(void) | |||
3244 | { | 3680 | { |
3245 | struct task_struct *p = current; | 3681 | struct task_struct *p = current; |
3246 | unsigned long flags; | 3682 | unsigned long flags; |
3247 | struct rq *rq; | ||
3248 | int dest_cpu; | 3683 | int dest_cpu; |
3249 | 3684 | ||
3250 | rq = task_rq_lock(p, &flags); | 3685 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
3251 | dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); | 3686 | dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); |
3252 | if (dest_cpu == smp_processor_id()) | 3687 | if (dest_cpu == smp_processor_id()) |
3253 | goto unlock; | 3688 | goto unlock; |
3254 | 3689 | ||
3255 | /* | 3690 | if (likely(cpu_active(dest_cpu))) { |
3256 | * select_task_rq() can race against ->cpus_allowed | ||
3257 | */ | ||
3258 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && | ||
3259 | likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { | ||
3260 | struct migration_arg arg = { p, dest_cpu }; | 3691 | struct migration_arg arg = { p, dest_cpu }; |
3261 | 3692 | ||
3262 | task_rq_unlock(rq, &flags); | 3693 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
3263 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); | 3694 | stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); |
3264 | return; | 3695 | return; |
3265 | } | 3696 | } |
3266 | unlock: | 3697 | unlock: |
3267 | task_rq_unlock(rq, &flags); | 3698 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
3268 | } | 3699 | } |
3269 | 3700 | ||
3270 | #endif | 3701 | #endif |
@@ -3285,7 +3716,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) | |||
3285 | 3716 | ||
3286 | if (task_current(rq, p)) { | 3717 | if (task_current(rq, p)) { |
3287 | update_rq_clock(rq); | 3718 | update_rq_clock(rq); |
3288 | ns = rq->clock - p->se.exec_start; | 3719 | ns = rq->clock_task - p->se.exec_start; |
3289 | if ((s64)ns < 0) | 3720 | if ((s64)ns < 0) |
3290 | ns = 0; | 3721 | ns = 0; |
3291 | } | 3722 | } |
@@ -3301,7 +3732,7 @@ unsigned long long task_delta_exec(struct task_struct *p) | |||
3301 | 3732 | ||
3302 | rq = task_rq_lock(p, &flags); | 3733 | rq = task_rq_lock(p, &flags); |
3303 | ns = do_task_delta_exec(p, rq); | 3734 | ns = do_task_delta_exec(p, rq); |
3304 | task_rq_unlock(rq, &flags); | 3735 | task_rq_unlock(rq, p, &flags); |
3305 | 3736 | ||
3306 | return ns; | 3737 | return ns; |
3307 | } | 3738 | } |
@@ -3319,7 +3750,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
3319 | 3750 | ||
3320 | rq = task_rq_lock(p, &flags); | 3751 | rq = task_rq_lock(p, &flags); |
3321 | ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); | 3752 | ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); |
3322 | task_rq_unlock(rq, &flags); | 3753 | task_rq_unlock(rq, p, &flags); |
3323 | 3754 | ||
3324 | return ns; | 3755 | return ns; |
3325 | } | 3756 | } |
@@ -3343,7 +3774,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p) | |||
3343 | rq = task_rq_lock(p, &flags); | 3774 | rq = task_rq_lock(p, &flags); |
3344 | thread_group_cputime(p, &totals); | 3775 | thread_group_cputime(p, &totals); |
3345 | ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); | 3776 | ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); |
3346 | task_rq_unlock(rq, &flags); | 3777 | task_rq_unlock(rq, p, &flags); |
3347 | 3778 | ||
3348 | return ns; | 3779 | return ns; |
3349 | } | 3780 | } |
@@ -3408,6 +3839,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, | |||
3408 | } | 3839 | } |
3409 | 3840 | ||
3410 | /* | 3841 | /* |
3842 | * Account system cpu time to a process and desired cpustat field | ||
3843 | * @p: the process that the cpu time gets accounted to | ||
3844 | * @cputime: the cpu time spent in kernel space since the last update | ||
3845 | * @cputime_scaled: cputime scaled by cpu frequency | ||
3846 | * @target_cputime64: pointer to cpustat field that has to be updated | ||
3847 | */ | ||
3848 | static inline | ||
3849 | void __account_system_time(struct task_struct *p, cputime_t cputime, | ||
3850 | cputime_t cputime_scaled, cputime64_t *target_cputime64) | ||
3851 | { | ||
3852 | cputime64_t tmp = cputime_to_cputime64(cputime); | ||
3853 | |||
3854 | /* Add system time to process. */ | ||
3855 | p->stime = cputime_add(p->stime, cputime); | ||
3856 | p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); | ||
3857 | account_group_system_time(p, cputime); | ||
3858 | |||
3859 | /* Add system time to cpustat. */ | ||
3860 | *target_cputime64 = cputime64_add(*target_cputime64, tmp); | ||
3861 | cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); | ||
3862 | |||
3863 | /* Account for system time used */ | ||
3864 | acct_update_integrals(p); | ||
3865 | } | ||
3866 | |||
3867 | /* | ||
3411 | * Account system cpu time to a process. | 3868 | * Account system cpu time to a process. |
3412 | * @p: the process that the cpu time gets accounted to | 3869 | * @p: the process that the cpu time gets accounted to |
3413 | * @hardirq_offset: the offset to subtract from hardirq_count() | 3870 | * @hardirq_offset: the offset to subtract from hardirq_count() |
@@ -3418,36 +3875,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
3418 | cputime_t cputime, cputime_t cputime_scaled) | 3875 | cputime_t cputime, cputime_t cputime_scaled) |
3419 | { | 3876 | { |
3420 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3877 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
3421 | cputime64_t tmp; | 3878 | cputime64_t *target_cputime64; |
3422 | 3879 | ||
3423 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { | 3880 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { |
3424 | account_guest_time(p, cputime, cputime_scaled); | 3881 | account_guest_time(p, cputime, cputime_scaled); |
3425 | return; | 3882 | return; |
3426 | } | 3883 | } |
3427 | 3884 | ||
3428 | /* Add system time to process. */ | ||
3429 | p->stime = cputime_add(p->stime, cputime); | ||
3430 | p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); | ||
3431 | account_group_system_time(p, cputime); | ||
3432 | |||
3433 | /* Add system time to cpustat. */ | ||
3434 | tmp = cputime_to_cputime64(cputime); | ||
3435 | if (hardirq_count() - hardirq_offset) | 3885 | if (hardirq_count() - hardirq_offset) |
3436 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | 3886 | target_cputime64 = &cpustat->irq; |
3437 | else if (softirq_count()) | 3887 | else if (in_serving_softirq()) |
3438 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | 3888 | target_cputime64 = &cpustat->softirq; |
3439 | else | 3889 | else |
3440 | cpustat->system = cputime64_add(cpustat->system, tmp); | 3890 | target_cputime64 = &cpustat->system; |
3441 | |||
3442 | cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); | ||
3443 | 3891 | ||
3444 | /* Account for system time used */ | 3892 | __account_system_time(p, cputime, cputime_scaled, target_cputime64); |
3445 | acct_update_integrals(p); | ||
3446 | } | 3893 | } |
3447 | 3894 | ||
3448 | /* | 3895 | /* |
3449 | * Account for involuntary wait time. | 3896 | * Account for involuntary wait time. |
3450 | * @steal: the cpu time spent in involuntary wait | 3897 | * @cputime: the cpu time spent in involuntary wait |
3451 | */ | 3898 | */ |
3452 | void account_steal_time(cputime_t cputime) | 3899 | void account_steal_time(cputime_t cputime) |
3453 | { | 3900 | { |
@@ -3475,6 +3922,73 @@ void account_idle_time(cputime_t cputime) | |||
3475 | 3922 | ||
3476 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 3923 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
3477 | 3924 | ||
3925 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
3926 | /* | ||
3927 | * Account a tick to a process and cpustat | ||
3928 | * @p: the process that the cpu time gets accounted to | ||
3929 | * @user_tick: is the tick from userspace | ||
3930 | * @rq: the pointer to rq | ||
3931 | * | ||
3932 | * Tick demultiplexing follows the order | ||
3933 | * - pending hardirq update | ||
3934 | * - pending softirq update | ||
3935 | * - user_time | ||
3936 | * - idle_time | ||
3937 | * - system time | ||
3938 | * - check for guest_time | ||
3939 | * - else account as system_time | ||
3940 | * | ||
3941 | * Check for hardirq is done both for system and user time as there is | ||
3942 | * no timer going off while we are on hardirq and hence we may never get an | ||
3943 | * opportunity to update it solely in system time. | ||
3944 | * p->stime and friends are only updated on system time and not on irq | ||
3945 | * softirq as those do not count in task exec_runtime any more. | ||
3946 | */ | ||
3947 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
3948 | struct rq *rq) | ||
3949 | { | ||
3950 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | ||
3951 | cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); | ||
3952 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
3953 | |||
3954 | if (irqtime_account_hi_update()) { | ||
3955 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | ||
3956 | } else if (irqtime_account_si_update()) { | ||
3957 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | ||
3958 | } else if (this_cpu_ksoftirqd() == p) { | ||
3959 | /* | ||
3960 | * ksoftirqd time does not get accounted in cpu_softirq_time. | ||
3961 | * So, we have to handle it separately here. | ||
3962 | * Also, p->stime needs to be updated for ksoftirqd. | ||
3963 | */ | ||
3964 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
3965 | &cpustat->softirq); | ||
3966 | } else if (user_tick) { | ||
3967 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
3968 | } else if (p == rq->idle) { | ||
3969 | account_idle_time(cputime_one_jiffy); | ||
3970 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ | ||
3971 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
3972 | } else { | ||
3973 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
3974 | &cpustat->system); | ||
3975 | } | ||
3976 | } | ||
3977 | |||
3978 | static void irqtime_account_idle_ticks(int ticks) | ||
3979 | { | ||
3980 | int i; | ||
3981 | struct rq *rq = this_rq(); | ||
3982 | |||
3983 | for (i = 0; i < ticks; i++) | ||
3984 | irqtime_account_process_tick(current, 0, rq); | ||
3985 | } | ||
3986 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
3987 | static void irqtime_account_idle_ticks(int ticks) {} | ||
3988 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
3989 | struct rq *rq) {} | ||
3990 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
3991 | |||
3478 | /* | 3992 | /* |
3479 | * Account a single tick of cpu time. | 3993 | * Account a single tick of cpu time. |
3480 | * @p: the process that the cpu time gets accounted to | 3994 | * @p: the process that the cpu time gets accounted to |
@@ -3485,6 +3999,11 @@ void account_process_tick(struct task_struct *p, int user_tick) | |||
3485 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | 3999 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); |
3486 | struct rq *rq = this_rq(); | 4000 | struct rq *rq = this_rq(); |
3487 | 4001 | ||
4002 | if (sched_clock_irqtime) { | ||
4003 | irqtime_account_process_tick(p, user_tick, rq); | ||
4004 | return; | ||
4005 | } | ||
4006 | |||
3488 | if (user_tick) | 4007 | if (user_tick) |
3489 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | 4008 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); |
3490 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | 4009 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) |
@@ -3510,6 +4029,12 @@ void account_steal_ticks(unsigned long ticks) | |||
3510 | */ | 4029 | */ |
3511 | void account_idle_ticks(unsigned long ticks) | 4030 | void account_idle_ticks(unsigned long ticks) |
3512 | { | 4031 | { |
4032 | |||
4033 | if (sched_clock_irqtime) { | ||
4034 | irqtime_account_idle_ticks(ticks); | ||
4035 | return; | ||
4036 | } | ||
4037 | |||
3513 | account_idle_time(jiffies_to_cputime(ticks)); | 4038 | account_idle_time(jiffies_to_cputime(ticks)); |
3514 | } | 4039 | } |
3515 | 4040 | ||
@@ -3603,9 +4128,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
3603 | /* | 4128 | /* |
3604 | * This function gets called by the timer code, with HZ frequency. | 4129 | * This function gets called by the timer code, with HZ frequency. |
3605 | * We call it with interrupts disabled. | 4130 | * We call it with interrupts disabled. |
3606 | * | ||
3607 | * It also gets called by the fork code, when changing the parent's | ||
3608 | * timeslices. | ||
3609 | */ | 4131 | */ |
3610 | void scheduler_tick(void) | 4132 | void scheduler_tick(void) |
3611 | { | 4133 | { |
@@ -3627,7 +4149,7 @@ void scheduler_tick(void) | |||
3627 | 4149 | ||
3628 | raw_spin_unlock(&rq->lock); | 4150 | raw_spin_unlock(&rq->lock); |
3629 | 4151 | ||
3630 | perf_event_task_tick(curr); | 4152 | perf_event_task_tick(); |
3631 | 4153 | ||
3632 | #ifdef CONFIG_SMP | 4154 | #ifdef CONFIG_SMP |
3633 | rq->idle_at_tick = idle_cpu(cpu); | 4155 | rq->idle_at_tick = idle_cpu(cpu); |
@@ -3733,19 +4255,12 @@ static inline void schedule_debug(struct task_struct *prev) | |||
3733 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 4255 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
3734 | 4256 | ||
3735 | schedstat_inc(this_rq(), sched_count); | 4257 | schedstat_inc(this_rq(), sched_count); |
3736 | #ifdef CONFIG_SCHEDSTATS | ||
3737 | if (unlikely(prev->lock_depth >= 0)) { | ||
3738 | schedstat_inc(this_rq(), bkl_count); | ||
3739 | schedstat_inc(prev, sched_info.bkl_count); | ||
3740 | } | ||
3741 | #endif | ||
3742 | } | 4258 | } |
3743 | 4259 | ||
3744 | static void put_prev_task(struct rq *rq, struct task_struct *prev) | 4260 | static void put_prev_task(struct rq *rq, struct task_struct *prev) |
3745 | { | 4261 | { |
3746 | if (prev->se.on_rq) | 4262 | if (prev->on_rq || rq->skip_clock_update < 0) |
3747 | update_rq_clock(rq); | 4263 | update_rq_clock(rq); |
3748 | rq->skip_clock_update = 0; | ||
3749 | prev->sched_class->put_prev_task(rq, prev); | 4264 | prev->sched_class->put_prev_task(rq, prev); |
3750 | } | 4265 | } |
3751 | 4266 | ||
@@ -3776,17 +4291,13 @@ pick_next_task(struct rq *rq) | |||
3776 | } | 4291 | } |
3777 | */ | 4292 | */ |
3778 | 4293 | ||
3779 | class = sched_class_highest; | 4294 | for_each_class(class) { |
3780 | for ( ; ; ) { | ||
3781 | p = class->pick_next_task(rq); | 4295 | p = class->pick_next_task(rq); |
3782 | if (p) | 4296 | if (p) |
3783 | return p; | 4297 | return p; |
3784 | /* | ||
3785 | * Will never be NULL as the idle class always | ||
3786 | * returns a non-NULL p: | ||
3787 | */ | ||
3788 | class = class->next; | ||
3789 | } | 4298 | } |
4299 | |||
4300 | BUG(); /* the idle class will always have a runnable task */ | ||
3790 | } | 4301 | } |
3791 | 4302 | ||
3792 | /* | 4303 | /* |
@@ -3807,8 +4318,10 @@ need_resched: | |||
3807 | rcu_note_context_switch(cpu); | 4318 | rcu_note_context_switch(cpu); |
3808 | prev = rq->curr; | 4319 | prev = rq->curr; |
3809 | 4320 | ||
3810 | release_kernel_lock(prev); | 4321 | /* LITMUS^RT: quickly re-evaluate the scheduling decision |
3811 | need_resched_nonpreemptible: | 4322 | * if the previous one is no longer valid after CTX. |
4323 | */ | ||
4324 | litmus_need_resched_nonpreemptible: | ||
3812 | TS_SCHED_START; | 4325 | TS_SCHED_START; |
3813 | sched_trace_task_switch_away(prev); | 4326 | sched_trace_task_switch_away(prev); |
3814 | 4327 | ||
@@ -3818,18 +4331,19 @@ need_resched_nonpreemptible: | |||
3818 | hrtick_clear(rq); | 4331 | hrtick_clear(rq); |
3819 | 4332 | ||
3820 | raw_spin_lock_irq(&rq->lock); | 4333 | raw_spin_lock_irq(&rq->lock); |
3821 | clear_tsk_need_resched(prev); | ||
3822 | 4334 | ||
3823 | switch_count = &prev->nivcsw; | 4335 | switch_count = &prev->nivcsw; |
3824 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 4336 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
3825 | if (unlikely(signal_pending_state(prev->state, prev))) { | 4337 | if (unlikely(signal_pending_state(prev->state, prev))) { |
3826 | prev->state = TASK_RUNNING; | 4338 | prev->state = TASK_RUNNING; |
3827 | } else { | 4339 | } else { |
4340 | deactivate_task(rq, prev, DEQUEUE_SLEEP); | ||
4341 | prev->on_rq = 0; | ||
4342 | |||
3828 | /* | 4343 | /* |
3829 | * If a worker is going to sleep, notify and | 4344 | * If a worker went to sleep, notify and ask workqueue |
3830 | * ask workqueue whether it wants to wake up a | 4345 | * whether it wants to wake up a task to maintain |
3831 | * task to maintain concurrency. If so, wake | 4346 | * concurrency. |
3832 | * up the task. | ||
3833 | */ | 4347 | */ |
3834 | if (prev->flags & PF_WQ_WORKER) { | 4348 | if (prev->flags & PF_WQ_WORKER) { |
3835 | struct task_struct *to_wakeup; | 4349 | struct task_struct *to_wakeup; |
@@ -3838,7 +4352,16 @@ need_resched_nonpreemptible: | |||
3838 | if (to_wakeup) | 4352 | if (to_wakeup) |
3839 | try_to_wake_up_local(to_wakeup); | 4353 | try_to_wake_up_local(to_wakeup); |
3840 | } | 4354 | } |
3841 | deactivate_task(rq, prev, DEQUEUE_SLEEP); | 4355 | |
4356 | /* | ||
4357 | * If we are going to sleep and we have plugged IO | ||
4358 | * queued, make sure to submit it to avoid deadlocks. | ||
4359 | */ | ||
4360 | if (blk_needs_flush_plug(prev)) { | ||
4361 | raw_spin_unlock(&rq->lock); | ||
4362 | blk_schedule_flush_plug(prev); | ||
4363 | raw_spin_lock(&rq->lock); | ||
4364 | } | ||
3842 | } | 4365 | } |
3843 | switch_count = &prev->nvcsw; | 4366 | switch_count = &prev->nvcsw; |
3844 | } | 4367 | } |
@@ -3850,11 +4373,10 @@ need_resched_nonpreemptible: | |||
3850 | 4373 | ||
3851 | put_prev_task(rq, prev); | 4374 | put_prev_task(rq, prev); |
3852 | next = pick_next_task(rq); | 4375 | next = pick_next_task(rq); |
4376 | clear_tsk_need_resched(prev); | ||
4377 | rq->skip_clock_update = 0; | ||
3853 | 4378 | ||
3854 | if (likely(prev != next)) { | 4379 | if (likely(prev != next)) { |
3855 | sched_info_switch(prev, next); | ||
3856 | perf_event_task_sched_out(prev, next); | ||
3857 | |||
3858 | rq->nr_switches++; | 4380 | rq->nr_switches++; |
3859 | rq->curr = next; | 4381 | rq->curr = next; |
3860 | ++*switch_count; | 4382 | ++*switch_count; |
@@ -3880,8 +4402,8 @@ need_resched_nonpreemptible: | |||
3880 | 4402 | ||
3881 | post_schedule(rq); | 4403 | post_schedule(rq); |
3882 | 4404 | ||
3883 | if (sched_state_validate_switch() || unlikely(reacquire_kernel_lock(prev))) | 4405 | if (sched_state_validate_switch()) |
3884 | goto need_resched_nonpreemptible; | 4406 | goto litmus_need_resched_nonpreemptible; |
3885 | 4407 | ||
3886 | preempt_enable_no_resched(); | 4408 | preempt_enable_no_resched(); |
3887 | if (need_resched()) | 4409 | if (need_resched()) |
@@ -3892,70 +4414,53 @@ need_resched_nonpreemptible: | |||
3892 | EXPORT_SYMBOL(schedule); | 4414 | EXPORT_SYMBOL(schedule); |
3893 | 4415 | ||
3894 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 4416 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
4417 | |||
4418 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) | ||
4419 | { | ||
4420 | bool ret = false; | ||
4421 | |||
4422 | rcu_read_lock(); | ||
4423 | if (lock->owner != owner) | ||
4424 | goto fail; | ||
4425 | |||
4426 | /* | ||
4427 | * Ensure we emit the owner->on_cpu dereference _after_ checking | ||
4428 | * that lock->owner still matches owner. If that fails, owner might | ||
4429 | * point to free()d memory; if it still matches, the rcu_read_lock() | ||
4430 | * ensures the memory stays valid. | ||
4431 | */ | ||
4432 | barrier(); | ||
4433 | |||
4434 | ret = owner->on_cpu; | ||
4435 | fail: | ||
4436 | rcu_read_unlock(); | ||
4437 | |||
4438 | return ret; | ||
4439 | } | ||
4440 | |||
3895 | /* | 4441 | /* |
3896 | * Look out! "owner" is an entirely speculative pointer | 4442 | * Look out! "owner" is an entirely speculative pointer |
3897 | * access and not reliable. | 4443 | * access and not reliable. |
3898 | */ | 4444 | */ |
3899 | int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) | 4445 | int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) |
3900 | { | 4446 | { |
3901 | unsigned int cpu; | ||
3902 | struct rq *rq; | ||
3903 | |||
3904 | if (!sched_feat(OWNER_SPIN)) | 4447 | if (!sched_feat(OWNER_SPIN)) |
3905 | return 0; | 4448 | return 0; |
3906 | 4449 | ||
3907 | #ifdef CONFIG_DEBUG_PAGEALLOC | 4450 | while (owner_running(lock, owner)) { |
3908 | /* | 4451 | if (need_resched()) |
3909 | * Need to access the cpu field knowing that | 4452 | return 0; |
3910 | * DEBUG_PAGEALLOC could have unmapped it if | ||
3911 | * the mutex owner just released it and exited. | ||
3912 | */ | ||
3913 | if (probe_kernel_address(&owner->cpu, cpu)) | ||
3914 | return 0; | ||
3915 | #else | ||
3916 | cpu = owner->cpu; | ||
3917 | #endif | ||
3918 | 4453 | ||
3919 | /* | 4454 | arch_mutex_cpu_relax(); |
3920 | * Even if the access succeeded (likely case), | 4455 | } |
3921 | * the cpu field may no longer be valid. | ||
3922 | */ | ||
3923 | if (cpu >= nr_cpumask_bits) | ||
3924 | return 0; | ||
3925 | 4456 | ||
3926 | /* | 4457 | /* |
3927 | * We need to validate that we can do a | 4458 | * If the owner changed to another task there is likely |
3928 | * get_cpu() and that we have the percpu area. | 4459 | * heavy contention, stop spinning. |
3929 | */ | 4460 | */ |
3930 | if (!cpu_online(cpu)) | 4461 | if (lock->owner) |
3931 | return 0; | 4462 | return 0; |
3932 | 4463 | ||
3933 | rq = cpu_rq(cpu); | ||
3934 | |||
3935 | for (;;) { | ||
3936 | /* | ||
3937 | * Owner changed, break to re-assess state. | ||
3938 | */ | ||
3939 | if (lock->owner != owner) { | ||
3940 | /* | ||
3941 | * If the lock has switched to a different owner, | ||
3942 | * we likely have heavy contention. Return 0 to quit | ||
3943 | * optimistic spinning and not contend further: | ||
3944 | */ | ||
3945 | if (lock->owner) | ||
3946 | return 0; | ||
3947 | break; | ||
3948 | } | ||
3949 | |||
3950 | /* | ||
3951 | * Is that owner really running on that cpu? | ||
3952 | */ | ||
3953 | if (task_thread_info(rq->curr) != owner || need_resched()) | ||
3954 | return 0; | ||
3955 | |||
3956 | cpu_relax(); | ||
3957 | } | ||
3958 | |||
3959 | return 1; | 4464 | return 1; |
3960 | } | 4465 | } |
3961 | #endif | 4466 | #endif |
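The rewritten mutex_spin_on_owner() spins only while the lock owner is actually running on a CPU, bailing out on need_resched() or when the owner changes. A hedged sketch of how an adaptive-mutex slowpath might drive it; mutex_try_fastpath() is a hypothetical stand-in for the real cmpxchg acquisition, and this is not the actual __mutex_lock_common() code.

/*
 * Hedged sketch of an optimistic-spin slowpath built on the helper above;
 * mutex_try_fastpath() is hypothetical, everything else mirrors the idea only.
 */
static bool mutex_try_fastpath(struct mutex *lock);     /* hypothetical */

static bool mutex_optimistic_spin(struct mutex *lock)
{
        for (;;) {
                struct task_struct *owner = ACCESS_ONCE(lock->owner);

                /* Spin while the owner keeps running; bail if it blocks or changes. */
                if (owner && !mutex_spin_on_owner(lock, owner))
                        return false;

                /* Owner gone (or there never was one): try to take the lock. */
                if (mutex_try_fastpath(lock))
                        return true;

                if (need_resched())
                        return false;

                arch_mutex_cpu_relax();
        }
}

On failure such a caller would fall back to the normal sleeping slowpath; the point of the rewrite above is that owner_running() lets the loop stop as soon as the owner is preempted, without poking at remote runqueues.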
@@ -4085,6 +4590,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) | |||
4085 | { | 4590 | { |
4086 | __wake_up_common(q, mode, 1, 0, key); | 4591 | __wake_up_common(q, mode, 1, 0, key); |
4087 | } | 4592 | } |
4593 | EXPORT_SYMBOL_GPL(__wake_up_locked_key); | ||
4088 | 4594 | ||
4089 | /** | 4595 | /** |
4090 | * __wake_up_sync_key - wake up threads blocked on a waitqueue. | 4596 | * __wake_up_sync_key - wake up threads blocked on a waitqueue. |
@@ -4276,7 +4782,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible); | |||
4276 | * This waits for either a completion of a specific task to be signaled or for a | 4782 | * This waits for either a completion of a specific task to be signaled or for a |
4277 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. | 4783 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. |
4278 | */ | 4784 | */ |
4279 | unsigned long __sched | 4785 | long __sched |
4280 | wait_for_completion_interruptible_timeout(struct completion *x, | 4786 | wait_for_completion_interruptible_timeout(struct completion *x, |
4281 | unsigned long timeout) | 4787 | unsigned long timeout) |
4282 | { | 4788 | { |
@@ -4309,7 +4815,7 @@ EXPORT_SYMBOL(wait_for_completion_killable); | |||
4309 | * signaled or for a specified timeout to expire. It can be | 4815 | * signaled or for a specified timeout to expire. It can be |
4310 | * interrupted by a kill signal. The timeout is in jiffies. | 4816 | * interrupted by a kill signal. The timeout is in jiffies. |
4311 | */ | 4817 | */ |
4312 | unsigned long __sched | 4818 | long __sched |
4313 | wait_for_completion_killable_timeout(struct completion *x, | 4819 | wait_for_completion_killable_timeout(struct completion *x, |
4314 | unsigned long timeout) | 4820 | unsigned long timeout) |
4315 | { | 4821 | { |
@@ -4425,18 +4931,18 @@ EXPORT_SYMBOL(sleep_on_timeout); | |||
4425 | */ | 4931 | */ |
4426 | void rt_mutex_setprio(struct task_struct *p, int prio) | 4932 | void rt_mutex_setprio(struct task_struct *p, int prio) |
4427 | { | 4933 | { |
4428 | unsigned long flags; | ||
4429 | int oldprio, on_rq, running; | 4934 | int oldprio, on_rq, running; |
4430 | struct rq *rq; | 4935 | struct rq *rq; |
4431 | const struct sched_class *prev_class; | 4936 | const struct sched_class *prev_class; |
4432 | 4937 | ||
4433 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 4938 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
4434 | 4939 | ||
4435 | rq = task_rq_lock(p, &flags); | 4940 | rq = __task_rq_lock(p); |
4436 | 4941 | ||
4942 | trace_sched_pi_setprio(p, prio); | ||
4437 | oldprio = p->prio; | 4943 | oldprio = p->prio; |
4438 | prev_class = p->sched_class; | 4944 | prev_class = p->sched_class; |
4439 | on_rq = p->se.on_rq; | 4945 | on_rq = p->on_rq; |
4440 | running = task_current(rq, p); | 4946 | running = task_current(rq, p); |
4441 | if (on_rq) | 4947 | if (on_rq) |
4442 | dequeue_task(rq, p, 0); | 4948 | dequeue_task(rq, p, 0); |
@@ -4452,12 +4958,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4452 | 4958 | ||
4453 | if (running) | 4959 | if (running) |
4454 | p->sched_class->set_curr_task(rq); | 4960 | p->sched_class->set_curr_task(rq); |
4455 | if (on_rq) { | 4961 | if (on_rq) |
4456 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); | 4962 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); |
4457 | 4963 | ||
4458 | check_class_changed(rq, p, prev_class, oldprio, running); | 4964 | check_class_changed(rq, p, prev_class, oldprio); |
4459 | } | 4965 | __task_rq_unlock(rq); |
4460 | task_rq_unlock(rq, &flags); | ||
4461 | } | 4966 | } |
4462 | 4967 | ||
4463 | #endif | 4968 | #endif |
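Both rt_mutex_setprio() above and __sched_setscheduler() further down follow the same dequeue/modify/requeue idiom; a condensed restatement of that shared pattern as it appears in this patch (a sketch of code visible in the hunks, not a drop-in):

    on_rq = p->on_rq;
    running = task_current(rq, p);
    if (on_rq)
            dequeue_task(rq, p, 0);
    if (running)
            p->sched_class->put_prev_task(rq, p);

    /* ... change p->prio / p->sched_class under rq->lock ... */

    if (running)
            p->sched_class->set_curr_task(rq);
    if (on_rq)
            enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
    check_class_changed(rq, p, prev_class, oldprio);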
@@ -4485,7 +4990,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4485 | p->static_prio = NICE_TO_PRIO(nice); | 4990 | p->static_prio = NICE_TO_PRIO(nice); |
4486 | goto out_unlock; | 4991 | goto out_unlock; |
4487 | } | 4992 | } |
4488 | on_rq = p->se.on_rq; | 4993 | on_rq = p->on_rq; |
4489 | if (on_rq) | 4994 | if (on_rq) |
4490 | dequeue_task(rq, p, 0); | 4995 | dequeue_task(rq, p, 0); |
4491 | 4996 | ||
@@ -4505,7 +5010,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4505 | resched_task(rq->curr); | 5010 | resched_task(rq->curr); |
4506 | } | 5011 | } |
4507 | out_unlock: | 5012 | out_unlock: |
4508 | task_rq_unlock(rq, &flags); | 5013 | task_rq_unlock(rq, p, &flags); |
4509 | } | 5014 | } |
4510 | EXPORT_SYMBOL(set_user_nice); | 5015 | EXPORT_SYMBOL(set_user_nice); |
4511 | 5016 | ||
@@ -4619,8 +5124,6 @@ static struct task_struct *find_process_by_pid(pid_t pid) | |||
4619 | static void | 5124 | static void |
4620 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | 5125 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) |
4621 | { | 5126 | { |
4622 | BUG_ON(p->se.on_rq); | ||
4623 | |||
4624 | p->policy = policy; | 5127 | p->policy = policy; |
4625 | p->rt_priority = prio; | 5128 | p->rt_priority = prio; |
4626 | p->normal_prio = normal_prio(p); | 5129 | p->normal_prio = normal_prio(p); |
@@ -4645,14 +5148,17 @@ static bool check_same_owner(struct task_struct *p) | |||
4645 | 5148 | ||
4646 | rcu_read_lock(); | 5149 | rcu_read_lock(); |
4647 | pcred = __task_cred(p); | 5150 | pcred = __task_cred(p); |
4648 | match = (cred->euid == pcred->euid || | 5151 | if (cred->user->user_ns == pcred->user->user_ns) |
4649 | cred->euid == pcred->uid); | 5152 | match = (cred->euid == pcred->euid || |
5153 | cred->euid == pcred->uid); | ||
5154 | else | ||
5155 | match = false; | ||
4650 | rcu_read_unlock(); | 5156 | rcu_read_unlock(); |
4651 | return match; | 5157 | return match; |
4652 | } | 5158 | } |
4653 | 5159 | ||
4654 | static int __sched_setscheduler(struct task_struct *p, int policy, | 5160 | static int __sched_setscheduler(struct task_struct *p, int policy, |
4655 | struct sched_param *param, bool user) | 5161 | const struct sched_param *param, bool user) |
4656 | { | 5162 | { |
4657 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 5163 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
4658 | unsigned long flags; | 5164 | unsigned long flags; |
@@ -4708,12 +5214,15 @@ recheck: | |||
4708 | param->sched_priority > rlim_rtprio) | 5214 | param->sched_priority > rlim_rtprio) |
4709 | return -EPERM; | 5215 | return -EPERM; |
4710 | } | 5216 | } |
5217 | |||
4711 | /* | 5218 | /* |
4712 | * Like positive nice levels, dont allow tasks to | 5219 | * Treat SCHED_IDLE as nice 20. Only allow a switch to |
4713 | * move out of SCHED_IDLE either: | 5220 | * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. |
4714 | */ | 5221 | */ |
4715 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) | 5222 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { |
4716 | return -EPERM; | 5223 | if (!can_nice(p, TASK_NICE(p))) |
5224 | return -EPERM; | ||
5225 | } | ||
4717 | 5226 | ||
4718 | /* can't change other user's priorities */ | 5227 | /* can't change other user's priorities */ |
4719 | if (!check_same_owner(p)) | 5228 | if (!check_same_owner(p)) |
@@ -4725,7 +5234,7 @@ recheck: | |||
4725 | } | 5234 | } |
4726 | 5235 | ||
4727 | if (user) { | 5236 | if (user) { |
4728 | retval = security_task_setscheduler(p, policy, param); | 5237 | retval = security_task_setscheduler(p); |
4729 | if (retval) | 5238 | if (retval) |
4730 | return retval; | 5239 | return retval; |
4731 | } | 5240 | } |
@@ -4739,13 +5248,30 @@ recheck: | |||
4739 | /* | 5248 | /* |
4740 | * make sure no PI-waiters arrive (or leave) while we are | 5249 | * make sure no PI-waiters arrive (or leave) while we are |
4741 | * changing the priority of the task: | 5250 | * changing the priority of the task: |
5251 | * | ||
5252 | * To be able to change p->policy safely, the appropriate | ||
5253 | * runqueue lock must be held. | ||
4742 | */ | 5254 | */ |
4743 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 5255 | rq = task_rq_lock(p, &flags); |
5256 | |||
4744 | /* | 5257 | /* |
4745 | * To be able to change p->policy safely, the apropriate | 5258 | * Changing the policy of the stop threads is a very bad idea |
4746 | * runqueue lock must be held. | ||
4747 | */ | 5259 | */ |
4748 | rq = __task_rq_lock(p); | 5260 | if (p == rq->stop) { |
5261 | task_rq_unlock(rq, p, &flags); | ||
5262 | return -EINVAL; | ||
5263 | } | ||
5264 | |||
5265 | /* | ||
5266 | * If not changing anything there's no need to proceed further: | ||
5267 | */ | ||
5268 | if (unlikely(policy == p->policy && (!rt_policy(policy) || | ||
5269 | param->sched_priority == p->rt_priority))) { | ||
5270 | |||
5271 | __task_rq_unlock(rq); | ||
5272 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
5273 | return 0; | ||
5274 | } | ||
4749 | 5275 | ||
4750 | #ifdef CONFIG_RT_GROUP_SCHED | 5276 | #ifdef CONFIG_RT_GROUP_SCHED |
4751 | if (user) { | 5277 | if (user) { |
@@ -4754,9 +5280,9 @@ recheck: | |||
4754 | * assigned. | 5280 | * assigned. |
4755 | */ | 5281 | */ |
4756 | if (rt_bandwidth_enabled() && rt_policy(policy) && | 5282 | if (rt_bandwidth_enabled() && rt_policy(policy) && |
4757 | task_group(p)->rt_bandwidth.rt_runtime == 0) { | 5283 | task_group(p)->rt_bandwidth.rt_runtime == 0 && |
4758 | __task_rq_unlock(rq); | 5284 | !task_group_is_autogroup(task_group(p))) { |
4759 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 5285 | task_rq_unlock(rq, p, &flags); |
4760 | return -EPERM; | 5286 | return -EPERM; |
4761 | } | 5287 | } |
4762 | } | 5288 | } |
@@ -4765,11 +5291,10 @@ recheck: | |||
4765 | /* recheck policy now with rq lock held */ | 5291 | /* recheck policy now with rq lock held */ |
4766 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 5292 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
4767 | policy = oldpolicy = -1; | 5293 | policy = oldpolicy = -1; |
4768 | __task_rq_unlock(rq); | 5294 | task_rq_unlock(rq, p, &flags); |
4769 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
4770 | goto recheck; | 5295 | goto recheck; |
4771 | } | 5296 | } |
4772 | on_rq = p->se.on_rq; | 5297 | on_rq = p->on_rq; |
4773 | running = task_current(rq, p); | 5298 | running = task_current(rq, p); |
4774 | if (on_rq) | 5299 | if (on_rq) |
4775 | deactivate_task(rq, p, 0); | 5300 | deactivate_task(rq, p, 0); |
@@ -4793,13 +5318,11 @@ recheck: | |||
4793 | 5318 | ||
4794 | if (running) | 5319 | if (running) |
4795 | p->sched_class->set_curr_task(rq); | 5320 | p->sched_class->set_curr_task(rq); |
4796 | if (on_rq) { | 5321 | if (on_rq) |
4797 | activate_task(rq, p, 0); | 5322 | activate_task(rq, p, 0); |
4798 | 5323 | ||
4799 | check_class_changed(rq, p, prev_class, oldprio, running); | 5324 | check_class_changed(rq, p, prev_class, oldprio); |
4800 | } | 5325 | task_rq_unlock(rq, p, &flags); |
4801 | __task_rq_unlock(rq); | ||
4802 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
4803 | 5326 | ||
4804 | rt_mutex_adjust_pi(p); | 5327 | rt_mutex_adjust_pi(p); |
4805 | 5328 | ||
@@ -4815,7 +5338,7 @@ recheck: | |||
4815 | * NOTE that the task may be already dead. | 5338 | * NOTE that the task may be already dead. |
4816 | */ | 5339 | */ |
4817 | int sched_setscheduler(struct task_struct *p, int policy, | 5340 | int sched_setscheduler(struct task_struct *p, int policy, |
4818 | struct sched_param *param) | 5341 | const struct sched_param *param) |
4819 | { | 5342 | { |
4820 | return __sched_setscheduler(p, policy, param, true); | 5343 | return __sched_setscheduler(p, policy, param, true); |
4821 | } | 5344 | } |
@@ -4833,7 +5356,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); | |||
4833 | * but our caller might not have that capability. | 5356 | * but our caller might not have that capability. |
4834 | */ | 5357 | */ |
4835 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, | 5358 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, |
4836 | struct sched_param *param) | 5359 | const struct sched_param *param) |
4837 | { | 5360 | { |
4838 | return __sched_setscheduler(p, policy, param, false); | 5361 | return __sched_setscheduler(p, policy, param, false); |
4839 | } | 5362 | } |
@@ -4980,16 +5503,16 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
4980 | goto out_free_cpus_allowed; | 5503 | goto out_free_cpus_allowed; |
4981 | } | 5504 | } |
4982 | retval = -EPERM; | 5505 | retval = -EPERM; |
4983 | if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) | 5506 | if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) |
4984 | goto out_unlock; | 5507 | goto out_unlock; |
4985 | 5508 | ||
4986 | retval = security_task_setscheduler(p, 0, NULL); | 5509 | retval = security_task_setscheduler(p); |
4987 | if (retval) | 5510 | if (retval) |
4988 | goto out_unlock; | 5511 | goto out_unlock; |
4989 | 5512 | ||
4990 | cpuset_cpus_allowed(p, cpus_allowed); | 5513 | cpuset_cpus_allowed(p, cpus_allowed); |
4991 | cpumask_and(new_mask, in_mask, cpus_allowed); | 5514 | cpumask_and(new_mask, in_mask, cpus_allowed); |
4992 | again: | 5515 | again: |
4993 | retval = set_cpus_allowed_ptr(p, new_mask); | 5516 | retval = set_cpus_allowed_ptr(p, new_mask); |
4994 | 5517 | ||
4995 | if (!retval) { | 5518 | if (!retval) { |
@@ -5051,7 +5574,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
5051 | { | 5574 | { |
5052 | struct task_struct *p; | 5575 | struct task_struct *p; |
5053 | unsigned long flags; | 5576 | unsigned long flags; |
5054 | struct rq *rq; | ||
5055 | int retval; | 5577 | int retval; |
5056 | 5578 | ||
5057 | get_online_cpus(); | 5579 | get_online_cpus(); |
@@ -5066,9 +5588,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
5066 | if (retval) | 5588 | if (retval) |
5067 | goto out_unlock; | 5589 | goto out_unlock; |
5068 | 5590 | ||
5069 | rq = task_rq_lock(p, &flags); | 5591 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
5070 | cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); | 5592 | cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); |
5071 | task_rq_unlock(rq, &flags); | 5593 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
5072 | 5594 | ||
5073 | out_unlock: | 5595 | out_unlock: |
5074 | rcu_read_unlock(); | 5596 | rcu_read_unlock(); |
@@ -5215,6 +5737,67 @@ void __sched yield(void) | |||
5215 | } | 5737 | } |
5216 | EXPORT_SYMBOL(yield); | 5738 | EXPORT_SYMBOL(yield); |
5217 | 5739 | ||
5740 | /** | ||
5741 | * yield_to - yield the current processor to another thread in | ||
5742 | * your thread group, or accelerate that thread toward the | ||
5743 | * processor it's on. | ||
5744 | * @p: target task | ||
5745 | * @preempt: whether task preemption is allowed or not | ||
5746 | * | ||
5747 | * It's the caller's job to ensure that the target task struct | ||
5748 | * can't go away on us before we can do any checks. | ||
5749 | * | ||
5750 | * Returns true if we indeed boosted the target task. | ||
5751 | */ | ||
5752 | bool __sched yield_to(struct task_struct *p, bool preempt) | ||
5753 | { | ||
5754 | struct task_struct *curr = current; | ||
5755 | struct rq *rq, *p_rq; | ||
5756 | unsigned long flags; | ||
5757 | bool yielded = 0; | ||
5758 | |||
5759 | local_irq_save(flags); | ||
5760 | rq = this_rq(); | ||
5761 | |||
5762 | again: | ||
5763 | p_rq = task_rq(p); | ||
5764 | double_rq_lock(rq, p_rq); | ||
5765 | while (task_rq(p) != p_rq) { | ||
5766 | double_rq_unlock(rq, p_rq); | ||
5767 | goto again; | ||
5768 | } | ||
5769 | |||
5770 | if (!curr->sched_class->yield_to_task) | ||
5771 | goto out; | ||
5772 | |||
5773 | if (curr->sched_class != p->sched_class) | ||
5774 | goto out; | ||
5775 | |||
5776 | if (task_running(p_rq, p) || p->state) | ||
5777 | goto out; | ||
5778 | |||
5779 | yielded = curr->sched_class->yield_to_task(rq, p, preempt); | ||
5780 | if (yielded) { | ||
5781 | schedstat_inc(rq, yld_count); | ||
5782 | /* | ||
5783 | * Make p's CPU reschedule; pick_next_entity takes care of | ||
5784 | * fairness. | ||
5785 | */ | ||
5786 | if (preempt && rq != p_rq) | ||
5787 | resched_task(p_rq->curr); | ||
5788 | } | ||
5789 | |||
5790 | out: | ||
5791 | double_rq_unlock(rq, p_rq); | ||
5792 | local_irq_restore(flags); | ||
5793 | |||
5794 | if (yielded) | ||
5795 | schedule(); | ||
5796 | |||
5797 | return yielded; | ||
5798 | } | ||
5799 | EXPORT_SYMBOL_GPL(yield_to); | ||
5800 | |||
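Per the comment above, the caller must keep the target task from disappearing for the duration of the call; a hedged sketch of a caller that already holds a task reference (helper name hypothetical, in the style of a virtualization lock-holder boost):

    #include <linux/sched.h>

    /* 'target' must be pinned by the caller, e.g. via get_task_struct(). */
    static bool boost_probable_lock_holder(struct task_struct *target)
    {
            /* second argument: allow preempting whatever runs on target's CPU */
            return yield_to(target, true);
    }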
5218 | /* | 5801 | /* |
5219 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so | 5802 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so |
5220 | * that process accounting knows that this is a task in IO wait state. | 5803 | * that process accounting knows that this is a task in IO wait state. |
@@ -5225,6 +5808,7 @@ void __sched io_schedule(void) | |||
5225 | 5808 | ||
5226 | delayacct_blkio_start(); | 5809 | delayacct_blkio_start(); |
5227 | atomic_inc(&rq->nr_iowait); | 5810 | atomic_inc(&rq->nr_iowait); |
5811 | blk_flush_plug(current); | ||
5228 | current->in_iowait = 1; | 5812 | current->in_iowait = 1; |
5229 | schedule(); | 5813 | schedule(); |
5230 | current->in_iowait = 0; | 5814 | current->in_iowait = 0; |
@@ -5240,6 +5824,7 @@ long __sched io_schedule_timeout(long timeout) | |||
5240 | 5824 | ||
5241 | delayacct_blkio_start(); | 5825 | delayacct_blkio_start(); |
5242 | atomic_inc(&rq->nr_iowait); | 5826 | atomic_inc(&rq->nr_iowait); |
5827 | blk_flush_plug(current); | ||
5243 | current->in_iowait = 1; | 5828 | current->in_iowait = 1; |
5244 | ret = schedule_timeout(timeout); | 5829 | ret = schedule_timeout(timeout); |
5245 | current->in_iowait = 0; | 5830 | current->in_iowait = 0; |
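The new blk_flush_plug(current) calls ensure that any block requests the task has queued behind a plug are dispatched before it sleeps waiting on I/O. A sketch of the plugging pattern this interacts with (bio setup omitted; submit_bio() signature as of this kernel series):

    #include <linux/blkdev.h>
    #include <linux/bio.h>
    #include <linux/fs.h>

    static void issue_read(struct bio *bio)
    {
            struct blk_plug plug;

            blk_start_plug(&plug);
            submit_bio(READ, bio);  /* queued behind the plug, not yet dispatched */
            blk_finish_plug(&plug); /* explicit flush; io_schedule() now flushes too */
    }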
@@ -5330,7 +5915,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, | |||
5330 | 5915 | ||
5331 | rq = task_rq_lock(p, &flags); | 5916 | rq = task_rq_lock(p, &flags); |
5332 | time_slice = p->sched_class->get_rr_interval(rq, p); | 5917 | time_slice = p->sched_class->get_rr_interval(rq, p); |
5333 | task_rq_unlock(rq, &flags); | 5918 | task_rq_unlock(rq, p, &flags); |
5334 | 5919 | ||
5335 | rcu_read_unlock(); | 5920 | rcu_read_unlock(); |
5336 | jiffies_to_timespec(time_slice, &t); | 5921 | jiffies_to_timespec(time_slice, &t); |
@@ -5350,7 +5935,7 @@ void sched_show_task(struct task_struct *p) | |||
5350 | unsigned state; | 5935 | unsigned state; |
5351 | 5936 | ||
5352 | state = p->state ? __ffs(p->state) + 1 : 0; | 5937 | state = p->state ? __ffs(p->state) + 1 : 0; |
5353 | printk(KERN_INFO "%-13.13s %c", p->comm, | 5938 | printk(KERN_INFO "%-15.15s %c", p->comm, |
5354 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); | 5939 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); |
5355 | #if BITS_PER_LONG == 32 | 5940 | #if BITS_PER_LONG == 32 |
5356 | if (state == TASK_RUNNING) | 5941 | if (state == TASK_RUNNING) |
@@ -5388,7 +5973,7 @@ void show_state_filter(unsigned long state_filter) | |||
5388 | do_each_thread(g, p) { | 5973 | do_each_thread(g, p) { |
5389 | /* | 5974 | /* |
5390 | * reset the NMI-timeout, listing all files on a slow | 5975 | * reset the NMI-timeout, listing all files on a slow |
5391 | * console might take alot of time: | 5976 | * console might take a lot of time: |
5392 | */ | 5977 | */ |
5393 | touch_nmi_watchdog(); | 5978 | touch_nmi_watchdog(); |
5394 | if (!state_filter || (p->state & state_filter)) | 5979 | if (!state_filter || (p->state & state_filter)) |
@@ -5432,26 +6017,35 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
5432 | idle->state = TASK_RUNNING; | 6017 | idle->state = TASK_RUNNING; |
5433 | idle->se.exec_start = sched_clock(); | 6018 | idle->se.exec_start = sched_clock(); |
5434 | 6019 | ||
5435 | cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); | 6020 | do_set_cpus_allowed(idle, cpumask_of(cpu)); |
6021 | /* | ||
6022 | * We're having a chicken and egg problem, even though we are | ||
6023 | * holding rq->lock, the cpu isn't yet set to this cpu so the | ||
6024 | * lockdep check in task_group() will fail. | ||
6025 | * | ||
6026 | * Similar case to sched_fork(). / Alternatively we could | ||
6027 | * use task_rq_lock() here and obtain the other rq->lock. | ||
6028 | * | ||
6029 | * Silence PROVE_RCU | ||
6030 | */ | ||
6031 | rcu_read_lock(); | ||
5436 | __set_task_cpu(idle, cpu); | 6032 | __set_task_cpu(idle, cpu); |
6033 | rcu_read_unlock(); | ||
5437 | 6034 | ||
5438 | rq->curr = rq->idle = idle; | 6035 | rq->curr = rq->idle = idle; |
5439 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 6036 | #if defined(CONFIG_SMP) |
5440 | idle->oncpu = 1; | 6037 | idle->on_cpu = 1; |
5441 | #endif | 6038 | #endif |
5442 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 6039 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
5443 | 6040 | ||
5444 | /* Set the preempt count _outside_ the spinlocks! */ | 6041 | /* Set the preempt count _outside_ the spinlocks! */ |
5445 | #if defined(CONFIG_PREEMPT) | ||
5446 | task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); | ||
5447 | #else | ||
5448 | task_thread_info(idle)->preempt_count = 0; | 6042 | task_thread_info(idle)->preempt_count = 0; |
5449 | #endif | 6043 | |
5450 | /* | 6044 | /* |
5451 | * The idle tasks have their own, simple scheduling class: | 6045 | * The idle tasks have their own, simple scheduling class: |
5452 | */ | 6046 | */ |
5453 | idle->sched_class = &idle_sched_class; | 6047 | idle->sched_class = &idle_sched_class; |
5454 | ftrace_graph_init_task(idle); | 6048 | ftrace_graph_init_idle_task(idle, cpu); |
5455 | } | 6049 | } |
5456 | 6050 | ||
5457 | /* | 6051 | /* |
@@ -5502,7 +6096,6 @@ static void update_sysctl(void) | |||
5502 | SET_SYSCTL(sched_min_granularity); | 6096 | SET_SYSCTL(sched_min_granularity); |
5503 | SET_SYSCTL(sched_latency); | 6097 | SET_SYSCTL(sched_latency); |
5504 | SET_SYSCTL(sched_wakeup_granularity); | 6098 | SET_SYSCTL(sched_wakeup_granularity); |
5505 | SET_SYSCTL(sched_shares_ratelimit); | ||
5506 | #undef SET_SYSCTL | 6099 | #undef SET_SYSCTL |
5507 | } | 6100 | } |
5508 | 6101 | ||
@@ -5512,6 +6105,16 @@ static inline void sched_init_granularity(void) | |||
5512 | } | 6105 | } |
5513 | 6106 | ||
5514 | #ifdef CONFIG_SMP | 6107 | #ifdef CONFIG_SMP |
6108 | void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | ||
6109 | { | ||
6110 | if (p->sched_class && p->sched_class->set_cpus_allowed) | ||
6111 | p->sched_class->set_cpus_allowed(p, new_mask); | ||
6112 | else { | ||
6113 | cpumask_copy(&p->cpus_allowed, new_mask); | ||
6114 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); | ||
6115 | } | ||
6116 | } | ||
6117 | |||
5515 | /* | 6118 | /* |
5516 | * This is how migration works: | 6119 | * This is how migration works: |
5517 | * | 6120 | * |
@@ -5542,52 +6145,38 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
5542 | unsigned int dest_cpu; | 6145 | unsigned int dest_cpu; |
5543 | int ret = 0; | 6146 | int ret = 0; |
5544 | 6147 | ||
5545 | /* | ||
5546 | * Serialize against TASK_WAKING so that ttwu() and wunt() can | ||
5547 | * drop the rq->lock and still rely on ->cpus_allowed. | ||
5548 | */ | ||
5549 | again: | ||
5550 | while (task_is_waking(p)) | ||
5551 | cpu_relax(); | ||
5552 | rq = task_rq_lock(p, &flags); | 6148 | rq = task_rq_lock(p, &flags); |
5553 | if (task_is_waking(p)) { | 6149 | |
5554 | task_rq_unlock(rq, &flags); | 6150 | if (cpumask_equal(&p->cpus_allowed, new_mask)) |
5555 | goto again; | 6151 | goto out; |
5556 | } | ||
5557 | 6152 | ||
5558 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { | 6153 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { |
5559 | ret = -EINVAL; | 6154 | ret = -EINVAL; |
5560 | goto out; | 6155 | goto out; |
5561 | } | 6156 | } |
5562 | 6157 | ||
5563 | if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && | 6158 | if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { |
5564 | !cpumask_equal(&p->cpus_allowed, new_mask))) { | ||
5565 | ret = -EINVAL; | 6159 | ret = -EINVAL; |
5566 | goto out; | 6160 | goto out; |
5567 | } | 6161 | } |
5568 | 6162 | ||
5569 | if (p->sched_class->set_cpus_allowed) | 6163 | do_set_cpus_allowed(p, new_mask); |
5570 | p->sched_class->set_cpus_allowed(p, new_mask); | ||
5571 | else { | ||
5572 | cpumask_copy(&p->cpus_allowed, new_mask); | ||
5573 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); | ||
5574 | } | ||
5575 | 6164 | ||
5576 | /* Can the task run on the task's current CPU? If so, we're done */ | 6165 | /* Can the task run on the task's current CPU? If so, we're done */ |
5577 | if (cpumask_test_cpu(task_cpu(p), new_mask)) | 6166 | if (cpumask_test_cpu(task_cpu(p), new_mask)) |
5578 | goto out; | 6167 | goto out; |
5579 | 6168 | ||
5580 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); | 6169 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); |
5581 | if (migrate_task(p, dest_cpu)) { | 6170 | if (p->on_rq) { |
5582 | struct migration_arg arg = { p, dest_cpu }; | 6171 | struct migration_arg arg = { p, dest_cpu }; |
5583 | /* Need help from migration thread: drop lock and wait. */ | 6172 | /* Need help from migration thread: drop lock and wait. */ |
5584 | task_rq_unlock(rq, &flags); | 6173 | task_rq_unlock(rq, p, &flags); |
5585 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); | 6174 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); |
5586 | tlb_migrate_finish(p->mm); | 6175 | tlb_migrate_finish(p->mm); |
5587 | return 0; | 6176 | return 0; |
5588 | } | 6177 | } |
5589 | out: | 6178 | out: |
5590 | task_rq_unlock(rq, &flags); | 6179 | task_rq_unlock(rq, p, &flags); |
5591 | 6180 | ||
5592 | return ret; | 6181 | return ret; |
5593 | } | 6182 | } |
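A short sketch of how kernel code drives the path above, pinning a task to a single CPU (helper name hypothetical; the caller is assumed to hold a reference on tsk):

    #include <linux/cpumask.h>
    #include <linux/sched.h>

    static int pin_task_to_cpu(struct task_struct *tsk, int cpu)
    {
            /* Returns 0, or -EINVAL if 'cpu' is not in cpu_active_mask. */
            return set_cpus_allowed_ptr(tsk, cpumask_of(cpu));
    }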
@@ -5615,6 +6204,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5615 | rq_src = cpu_rq(src_cpu); | 6204 | rq_src = cpu_rq(src_cpu); |
5616 | rq_dest = cpu_rq(dest_cpu); | 6205 | rq_dest = cpu_rq(dest_cpu); |
5617 | 6206 | ||
6207 | raw_spin_lock(&p->pi_lock); | ||
5618 | double_rq_lock(rq_src, rq_dest); | 6208 | double_rq_lock(rq_src, rq_dest); |
5619 | /* Already moved. */ | 6209 | /* Already moved. */ |
5620 | if (task_cpu(p) != src_cpu) | 6210 | if (task_cpu(p) != src_cpu) |
@@ -5627,7 +6217,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5627 | * If we're not on a rq, the next wake-up will ensure we're | 6217 | * If we're not on a rq, the next wake-up will ensure we're |
5628 | * placed properly. | 6218 | * placed properly. |
5629 | */ | 6219 | */ |
5630 | if (p->se.on_rq) { | 6220 | if (p->on_rq) { |
5631 | deactivate_task(rq_src, p, 0); | 6221 | deactivate_task(rq_src, p, 0); |
5632 | set_task_cpu(p, dest_cpu); | 6222 | set_task_cpu(p, dest_cpu); |
5633 | activate_task(rq_dest, p, 0); | 6223 | activate_task(rq_dest, p, 0); |
@@ -5637,6 +6227,7 @@ done: | |||
5637 | ret = 1; | 6227 | ret = 1; |
5638 | fail: | 6228 | fail: |
5639 | double_rq_unlock(rq_src, rq_dest); | 6229 | double_rq_unlock(rq_src, rq_dest); |
6230 | raw_spin_unlock(&p->pi_lock); | ||
5640 | return ret; | 6231 | return ret; |
5641 | } | 6232 | } |
5642 | 6233 | ||
@@ -5660,29 +6251,20 @@ static int migration_cpu_stop(void *data) | |||
5660 | } | 6251 | } |
5661 | 6252 | ||
5662 | #ifdef CONFIG_HOTPLUG_CPU | 6253 | #ifdef CONFIG_HOTPLUG_CPU |
6254 | |||
5663 | /* | 6255 | /* |
5664 | * Figure out where task on dead CPU should go, use force if necessary. | 6256 | * Ensures that the idle task is using init_mm right before its cpu goes |
6257 | * offline. | ||
5665 | */ | 6258 | */ |
5666 | void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | 6259 | void idle_task_exit(void) |
5667 | { | 6260 | { |
5668 | struct rq *rq = cpu_rq(dead_cpu); | 6261 | struct mm_struct *mm = current->active_mm; |
5669 | int needs_cpu, uninitialized_var(dest_cpu); | ||
5670 | unsigned long flags; | ||
5671 | 6262 | ||
5672 | local_irq_save(flags); | 6263 | BUG_ON(cpu_online(smp_processor_id())); |
5673 | 6264 | ||
5674 | raw_spin_lock(&rq->lock); | 6265 | if (mm != &init_mm) |
5675 | needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); | 6266 | switch_mm(mm, &init_mm, current); |
5676 | if (needs_cpu) | 6267 | mmdrop(mm); |
5677 | dest_cpu = select_fallback_rq(dead_cpu, p); | ||
5678 | raw_spin_unlock(&rq->lock); | ||
5679 | /* | ||
5680 | * It can only fail if we race with set_cpus_allowed(), | ||
5681 | * in the racer should migrate the task anyway. | ||
5682 | */ | ||
5683 | if (needs_cpu) | ||
5684 | __migrate_task(p, dead_cpu, dest_cpu); | ||
5685 | local_irq_restore(flags); | ||
5686 | } | 6268 | } |
5687 | 6269 | ||
5688 | /* | 6270 | /* |
@@ -5695,128 +6277,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | |||
5695 | static void migrate_nr_uninterruptible(struct rq *rq_src) | 6277 | static void migrate_nr_uninterruptible(struct rq *rq_src) |
5696 | { | 6278 | { |
5697 | struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); | 6279 | struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); |
5698 | unsigned long flags; | ||
5699 | 6280 | ||
5700 | local_irq_save(flags); | ||
5701 | double_rq_lock(rq_src, rq_dest); | ||
5702 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; | 6281 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; |
5703 | rq_src->nr_uninterruptible = 0; | 6282 | rq_src->nr_uninterruptible = 0; |
5704 | double_rq_unlock(rq_src, rq_dest); | ||
5705 | local_irq_restore(flags); | ||
5706 | } | ||
5707 | |||
5708 | /* Run through task list and migrate tasks from the dead cpu. */ | ||
5709 | static void migrate_live_tasks(int src_cpu) | ||
5710 | { | ||
5711 | struct task_struct *p, *t; | ||
5712 | |||
5713 | read_lock(&tasklist_lock); | ||
5714 | |||
5715 | do_each_thread(t, p) { | ||
5716 | if (p == current) | ||
5717 | continue; | ||
5718 | |||
5719 | if (task_cpu(p) == src_cpu) | ||
5720 | move_task_off_dead_cpu(src_cpu, p); | ||
5721 | } while_each_thread(t, p); | ||
5722 | |||
5723 | read_unlock(&tasklist_lock); | ||
5724 | } | 6283 | } |
5725 | 6284 | ||
5726 | /* | 6285 | /* |
5727 | * Schedules idle task to be the next runnable task on current CPU. | 6286 | * remove the tasks which were accounted by rq from calc_load_tasks. |
5728 | * It does so by boosting its priority to highest possible. | ||
5729 | * Used by CPU offline code. | ||
5730 | */ | 6287 | */ |
5731 | void sched_idle_next(void) | 6288 | static void calc_global_load_remove(struct rq *rq) |
5732 | { | 6289 | { |
5733 | int this_cpu = smp_processor_id(); | 6290 | atomic_long_sub(rq->calc_load_active, &calc_load_tasks); |
5734 | struct rq *rq = cpu_rq(this_cpu); | 6291 | rq->calc_load_active = 0; |
5735 | struct task_struct *p = rq->idle; | ||
5736 | unsigned long flags; | ||
5737 | |||
5738 | /* cpu has to be offline */ | ||
5739 | BUG_ON(cpu_online(this_cpu)); | ||
5740 | |||
5741 | /* | ||
5742 | * Strictly not necessary since rest of the CPUs are stopped by now | ||
5743 | * and interrupts disabled on the current cpu. | ||
5744 | */ | ||
5745 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
5746 | |||
5747 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); | ||
5748 | |||
5749 | activate_task(rq, p, 0); | ||
5750 | |||
5751 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
5752 | } | 6292 | } |
5753 | 6293 | ||
5754 | /* | 6294 | /* |
5755 | * Ensures that the idle task is using init_mm right before its cpu goes | 6295 | * Migrate all tasks from the rq, sleeping tasks will be migrated by |
5756 | * offline. | 6296 | * try_to_wake_up()->select_task_rq(). |
6297 | * | ||
6298 | * Called with rq->lock held even though we're in stop_machine() and | ||
6299 | * there's no concurrency possible, we hold the required locks anyway | ||
6300 | * because of lock validation efforts. | ||
5757 | */ | 6301 | */ |
5758 | void idle_task_exit(void) | 6302 | static void migrate_tasks(unsigned int dead_cpu) |
5759 | { | ||
5760 | struct mm_struct *mm = current->active_mm; | ||
5761 | |||
5762 | BUG_ON(cpu_online(smp_processor_id())); | ||
5763 | |||
5764 | if (mm != &init_mm) | ||
5765 | switch_mm(mm, &init_mm, current); | ||
5766 | mmdrop(mm); | ||
5767 | } | ||
5768 | |||
5769 | /* called under rq->lock with disabled interrupts */ | ||
5770 | static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) | ||
5771 | { | 6303 | { |
5772 | struct rq *rq = cpu_rq(dead_cpu); | 6304 | struct rq *rq = cpu_rq(dead_cpu); |
5773 | 6305 | struct task_struct *next, *stop = rq->stop; | |
5774 | /* Must be exiting, otherwise would be on tasklist. */ | 6306 | int dest_cpu; |
5775 | BUG_ON(!p->exit_state); | ||
5776 | |||
5777 | /* Cannot have done final schedule yet: would have vanished. */ | ||
5778 | BUG_ON(p->state == TASK_DEAD); | ||
5779 | |||
5780 | get_task_struct(p); | ||
5781 | 6307 | ||
5782 | /* | 6308 | /* |
5783 | * Drop lock around migration; if someone else moves it, | 6309 | * Fudge the rq selection such that the below task selection loop |
5784 | * that's OK. No task can be added to this CPU, so iteration is | 6310 | * doesn't get stuck on the currently eligible stop task. |
5785 | * fine. | 6311 | * |
6312 | * We're currently inside stop_machine() and the rq is either stuck | ||
6313 | * in the stop_machine_cpu_stop() loop, or we're executing this code, | ||
6314 | * either way we should never end up calling schedule() until we're | ||
6315 | * done here. | ||
5786 | */ | 6316 | */ |
5787 | raw_spin_unlock_irq(&rq->lock); | 6317 | rq->stop = NULL; |
5788 | move_task_off_dead_cpu(dead_cpu, p); | ||
5789 | raw_spin_lock_irq(&rq->lock); | ||
5790 | |||
5791 | put_task_struct(p); | ||
5792 | } | ||
5793 | |||
5794 | /* release_task() removes task from tasklist, so we won't find dead tasks. */ | ||
5795 | static void migrate_dead_tasks(unsigned int dead_cpu) | ||
5796 | { | ||
5797 | struct rq *rq = cpu_rq(dead_cpu); | ||
5798 | struct task_struct *next; | ||
5799 | 6318 | ||
5800 | for ( ; ; ) { | 6319 | for ( ; ; ) { |
5801 | if (!rq->nr_running) | 6320 | /* |
6321 | * There's this thread running, bail when that's the only | ||
6322 | * remaining thread. | ||
6323 | */ | ||
6324 | if (rq->nr_running == 1) | ||
5802 | break; | 6325 | break; |
6326 | |||
5803 | next = pick_next_task(rq); | 6327 | next = pick_next_task(rq); |
5804 | if (!next) | 6328 | BUG_ON(!next); |
5805 | break; | ||
5806 | next->sched_class->put_prev_task(rq, next); | 6329 | next->sched_class->put_prev_task(rq, next); |
5807 | migrate_dead(dead_cpu, next); | ||
5808 | 6330 | ||
6331 | /* Find suitable destination for @next, with force if needed. */ | ||
6332 | dest_cpu = select_fallback_rq(dead_cpu, next); | ||
6333 | raw_spin_unlock(&rq->lock); | ||
6334 | |||
6335 | __migrate_task(next, dead_cpu, dest_cpu); | ||
6336 | |||
6337 | raw_spin_lock(&rq->lock); | ||
5809 | } | 6338 | } |
5810 | } | ||
5811 | 6339 | ||
5812 | /* | 6340 | rq->stop = stop; |
5813 | * remove the tasks which were accounted by rq from calc_load_tasks. | ||
5814 | */ | ||
5815 | static void calc_global_load_remove(struct rq *rq) | ||
5816 | { | ||
5817 | atomic_long_sub(rq->calc_load_active, &calc_load_tasks); | ||
5818 | rq->calc_load_active = 0; | ||
5819 | } | 6341 | } |
6342 | |||
5820 | #endif /* CONFIG_HOTPLUG_CPU */ | 6343 | #endif /* CONFIG_HOTPLUG_CPU */ |
5821 | 6344 | ||
5822 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) | 6345 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) |
@@ -6026,15 +6549,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6026 | unsigned long flags; | 6549 | unsigned long flags; |
6027 | struct rq *rq = cpu_rq(cpu); | 6550 | struct rq *rq = cpu_rq(cpu); |
6028 | 6551 | ||
6029 | switch (action) { | 6552 | switch (action & ~CPU_TASKS_FROZEN) { |
6030 | 6553 | ||
6031 | case CPU_UP_PREPARE: | 6554 | case CPU_UP_PREPARE: |
6032 | case CPU_UP_PREPARE_FROZEN: | ||
6033 | rq->calc_load_update = calc_load_update; | 6555 | rq->calc_load_update = calc_load_update; |
6034 | break; | 6556 | break; |
6035 | 6557 | ||
6036 | case CPU_ONLINE: | 6558 | case CPU_ONLINE: |
6037 | case CPU_ONLINE_FROZEN: | ||
6038 | /* Update our root-domain */ | 6559 | /* Update our root-domain */ |
6039 | raw_spin_lock_irqsave(&rq->lock, flags); | 6560 | raw_spin_lock_irqsave(&rq->lock, flags); |
6040 | if (rq->rd) { | 6561 | if (rq->rd) { |
@@ -6046,33 +6567,26 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6046 | break; | 6567 | break; |
6047 | 6568 | ||
6048 | #ifdef CONFIG_HOTPLUG_CPU | 6569 | #ifdef CONFIG_HOTPLUG_CPU |
6049 | case CPU_DEAD: | ||
6050 | case CPU_DEAD_FROZEN: | ||
6051 | migrate_live_tasks(cpu); | ||
6052 | /* Idle task back to normal (off runqueue, low prio) */ | ||
6053 | raw_spin_lock_irq(&rq->lock); | ||
6054 | deactivate_task(rq, rq->idle, 0); | ||
6055 | __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); | ||
6056 | rq->idle->sched_class = &idle_sched_class; | ||
6057 | migrate_dead_tasks(cpu); | ||
6058 | raw_spin_unlock_irq(&rq->lock); | ||
6059 | migrate_nr_uninterruptible(rq); | ||
6060 | BUG_ON(rq->nr_running != 0); | ||
6061 | calc_global_load_remove(rq); | ||
6062 | break; | ||
6063 | |||
6064 | case CPU_DYING: | 6570 | case CPU_DYING: |
6065 | case CPU_DYING_FROZEN: | 6571 | sched_ttwu_pending(); |
6066 | /* Update our root-domain */ | 6572 | /* Update our root-domain */ |
6067 | raw_spin_lock_irqsave(&rq->lock, flags); | 6573 | raw_spin_lock_irqsave(&rq->lock, flags); |
6068 | if (rq->rd) { | 6574 | if (rq->rd) { |
6069 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); | 6575 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); |
6070 | set_rq_offline(rq); | 6576 | set_rq_offline(rq); |
6071 | } | 6577 | } |
6578 | migrate_tasks(cpu); | ||
6579 | BUG_ON(rq->nr_running != 1); /* the migration thread */ | ||
6072 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 6580 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
6581 | |||
6582 | migrate_nr_uninterruptible(rq); | ||
6583 | calc_global_load_remove(rq); | ||
6073 | break; | 6584 | break; |
6074 | #endif | 6585 | #endif |
6075 | } | 6586 | } |
6587 | |||
6588 | update_max_interval(); | ||
6589 | |||
6076 | return NOTIFY_OK; | 6590 | return NOTIFY_OK; |
6077 | } | 6591 | } |
6078 | 6592 | ||
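The switch statement now masks off CPU_TASKS_FROZEN so one case label covers both the normal and the suspend/resume ("frozen") notifier events; this works because each _FROZEN action is the base action with that bit OR'ed in. A tiny sketch (helper name hypothetical):

    #include <linux/cpu.h>

    static inline unsigned long cpu_notifier_base_action(unsigned long action)
    {
            /* CPU_ONLINE_FROZEN == (CPU_ONLINE | CPU_TASKS_FROZEN), etc., so
             * clearing the bit folds the frozen and non-frozen variants. */
            return action & ~CPU_TASKS_FROZEN;
    }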
@@ -6133,6 +6647,8 @@ early_initcall(migration_init); | |||
6133 | 6647 | ||
6134 | #ifdef CONFIG_SMP | 6648 | #ifdef CONFIG_SMP |
6135 | 6649 | ||
6650 | static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ | ||
6651 | |||
6136 | #ifdef CONFIG_SCHED_DEBUG | 6652 | #ifdef CONFIG_SCHED_DEBUG |
6137 | 6653 | ||
6138 | static __read_mostly int sched_domain_debug_enabled; | 6654 | static __read_mostly int sched_domain_debug_enabled; |
@@ -6183,7 +6699,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
6183 | break; | 6699 | break; |
6184 | } | 6700 | } |
6185 | 6701 | ||
6186 | if (!group->cpu_power) { | 6702 | if (!group->sgp->power) { |
6187 | printk(KERN_CONT "\n"); | 6703 | printk(KERN_CONT "\n"); |
6188 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 6704 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
6189 | "set\n"); | 6705 | "set\n"); |
@@ -6207,9 +6723,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
6207 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); | 6723 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); |
6208 | 6724 | ||
6209 | printk(KERN_CONT " %s", str); | 6725 | printk(KERN_CONT " %s", str); |
6210 | if (group->cpu_power != SCHED_LOAD_SCALE) { | 6726 | if (group->sgp->power != SCHED_POWER_SCALE) { |
6211 | printk(KERN_CONT " (cpu_power = %d)", | 6727 | printk(KERN_CONT " (cpu_power = %d)", |
6212 | group->cpu_power); | 6728 | group->sgp->power); |
6213 | } | 6729 | } |
6214 | 6730 | ||
6215 | group = group->next; | 6731 | group = group->next; |
@@ -6228,7 +6744,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
6228 | 6744 | ||
6229 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | 6745 | static void sched_domain_debug(struct sched_domain *sd, int cpu) |
6230 | { | 6746 | { |
6231 | cpumask_var_t groupmask; | ||
6232 | int level = 0; | 6747 | int level = 0; |
6233 | 6748 | ||
6234 | if (!sched_domain_debug_enabled) | 6749 | if (!sched_domain_debug_enabled) |
@@ -6241,20 +6756,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
6241 | 6756 | ||
6242 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | 6757 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); |
6243 | 6758 | ||
6244 | if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { | ||
6245 | printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); | ||
6246 | return; | ||
6247 | } | ||
6248 | |||
6249 | for (;;) { | 6759 | for (;;) { |
6250 | if (sched_domain_debug_one(sd, cpu, level, groupmask)) | 6760 | if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) |
6251 | break; | 6761 | break; |
6252 | level++; | 6762 | level++; |
6253 | sd = sd->parent; | 6763 | sd = sd->parent; |
6254 | if (!sd) | 6764 | if (!sd) |
6255 | break; | 6765 | break; |
6256 | } | 6766 | } |
6257 | free_cpumask_var(groupmask); | ||
6258 | } | 6767 | } |
6259 | #else /* !CONFIG_SCHED_DEBUG */ | 6768 | #else /* !CONFIG_SCHED_DEBUG */ |
6260 | # define sched_domain_debug(sd, cpu) do { } while (0) | 6769 | # define sched_domain_debug(sd, cpu) do { } while (0) |
@@ -6311,12 +6820,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
6311 | return 1; | 6820 | return 1; |
6312 | } | 6821 | } |
6313 | 6822 | ||
6314 | static void free_rootdomain(struct root_domain *rd) | 6823 | static void free_rootdomain(struct rcu_head *rcu) |
6315 | { | 6824 | { |
6316 | synchronize_sched(); | 6825 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); |
6317 | 6826 | ||
6318 | cpupri_cleanup(&rd->cpupri); | 6827 | cpupri_cleanup(&rd->cpupri); |
6319 | |||
6320 | free_cpumask_var(rd->rto_mask); | 6828 | free_cpumask_var(rd->rto_mask); |
6321 | free_cpumask_var(rd->online); | 6829 | free_cpumask_var(rd->online); |
6322 | free_cpumask_var(rd->span); | 6830 | free_cpumask_var(rd->span); |
@@ -6357,7 +6865,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
6357 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 6865 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
6358 | 6866 | ||
6359 | if (old_rd) | 6867 | if (old_rd) |
6360 | free_rootdomain(old_rd); | 6868 | call_rcu_sched(&old_rd->rcu, free_rootdomain); |
6361 | } | 6869 | } |
6362 | 6870 | ||
6363 | static int init_rootdomain(struct root_domain *rd) | 6871 | static int init_rootdomain(struct root_domain *rd) |
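The root-domain free path now defers destruction through an embedded rcu_head instead of blocking in synchronize_sched(); the generic shape of that pattern, sketched with a hypothetical struct foo:

    #include <linux/kernel.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct foo {
            int data;
            struct rcu_head rcu;    /* embedded callback head, as in root_domain */
    };

    static void foo_free_rcu(struct rcu_head *rcu)
    {
            kfree(container_of(rcu, struct foo, rcu));
    }

    static void foo_release(struct foo *f)
    {
            /* runs foo_free_rcu() once all preemption-disabled readers finish */
            call_rcu_sched(&f->rcu, foo_free_rcu);
    }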
@@ -6408,6 +6916,53 @@ static struct root_domain *alloc_rootdomain(void) | |||
6408 | return rd; | 6916 | return rd; |
6409 | } | 6917 | } |
6410 | 6918 | ||
6919 | static void free_sched_groups(struct sched_group *sg, int free_sgp) | ||
6920 | { | ||
6921 | struct sched_group *tmp, *first; | ||
6922 | |||
6923 | if (!sg) | ||
6924 | return; | ||
6925 | |||
6926 | first = sg; | ||
6927 | do { | ||
6928 | tmp = sg->next; | ||
6929 | |||
6930 | if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) | ||
6931 | kfree(sg->sgp); | ||
6932 | |||
6933 | kfree(sg); | ||
6934 | sg = tmp; | ||
6935 | } while (sg != first); | ||
6936 | } | ||
6937 | |||
6938 | static void free_sched_domain(struct rcu_head *rcu) | ||
6939 | { | ||
6940 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); | ||
6941 | |||
6942 | /* | ||
6943 | * If it's an overlapping domain it has private groups, iterate and | ||
6944 | * nuke them all. | ||
6945 | */ | ||
6946 | if (sd->flags & SD_OVERLAP) { | ||
6947 | free_sched_groups(sd->groups, 1); | ||
6948 | } else if (atomic_dec_and_test(&sd->groups->ref)) { | ||
6949 | kfree(sd->groups->sgp); | ||
6950 | kfree(sd->groups); | ||
6951 | } | ||
6952 | kfree(sd); | ||
6953 | } | ||
6954 | |||
6955 | static void destroy_sched_domain(struct sched_domain *sd, int cpu) | ||
6956 | { | ||
6957 | call_rcu(&sd->rcu, free_sched_domain); | ||
6958 | } | ||
6959 | |||
6960 | static void destroy_sched_domains(struct sched_domain *sd, int cpu) | ||
6961 | { | ||
6962 | for (; sd; sd = sd->parent) | ||
6963 | destroy_sched_domain(sd, cpu); | ||
6964 | } | ||
6965 | |||
6411 | /* | 6966 | /* |
6412 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 6967 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
6413 | * hold the hotplug lock. | 6968 | * hold the hotplug lock. |
@@ -6418,9 +6973,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6418 | struct rq *rq = cpu_rq(cpu); | 6973 | struct rq *rq = cpu_rq(cpu); |
6419 | struct sched_domain *tmp; | 6974 | struct sched_domain *tmp; |
6420 | 6975 | ||
6421 | for (tmp = sd; tmp; tmp = tmp->parent) | ||
6422 | tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); | ||
6423 | |||
6424 | /* Remove the sched domains which do not contribute to scheduling. */ | 6976 | /* Remove the sched domains which do not contribute to scheduling. */ |
6425 | for (tmp = sd; tmp; ) { | 6977 | for (tmp = sd; tmp; ) { |
6426 | struct sched_domain *parent = tmp->parent; | 6978 | struct sched_domain *parent = tmp->parent; |
@@ -6431,12 +6983,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6431 | tmp->parent = parent->parent; | 6983 | tmp->parent = parent->parent; |
6432 | if (parent->parent) | 6984 | if (parent->parent) |
6433 | parent->parent->child = tmp; | 6985 | parent->parent->child = tmp; |
6986 | destroy_sched_domain(parent, cpu); | ||
6434 | } else | 6987 | } else |
6435 | tmp = tmp->parent; | 6988 | tmp = tmp->parent; |
6436 | } | 6989 | } |
6437 | 6990 | ||
6438 | if (sd && sd_degenerate(sd)) { | 6991 | if (sd && sd_degenerate(sd)) { |
6992 | tmp = sd; | ||
6439 | sd = sd->parent; | 6993 | sd = sd->parent; |
6994 | destroy_sched_domain(tmp, cpu); | ||
6440 | if (sd) | 6995 | if (sd) |
6441 | sd->child = NULL; | 6996 | sd->child = NULL; |
6442 | } | 6997 | } |
@@ -6444,7 +6999,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6444 | sched_domain_debug(sd, cpu); | 6999 | sched_domain_debug(sd, cpu); |
6445 | 7000 | ||
6446 | rq_attach_root(rq, rd); | 7001 | rq_attach_root(rq, rd); |
7002 | tmp = rq->sd; | ||
6447 | rcu_assign_pointer(rq->sd, sd); | 7003 | rcu_assign_pointer(rq->sd, sd); |
7004 | destroy_sched_domains(tmp, cpu); | ||
6448 | } | 7005 | } |
6449 | 7006 | ||
6450 | /* cpus with isolated domains */ | 7007 | /* cpus with isolated domains */ |
@@ -6460,56 +7017,6 @@ static int __init isolated_cpu_setup(char *str) | |||
6460 | 7017 | ||
6461 | __setup("isolcpus=", isolated_cpu_setup); | 7018 | __setup("isolcpus=", isolated_cpu_setup); |
6462 | 7019 | ||
6463 | /* | ||
6464 | * init_sched_build_groups takes the cpumask we wish to span, and a pointer | ||
6465 | * to a function which identifies what group(along with sched group) a CPU | ||
6466 | * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids | ||
6467 | * (due to the fact that we keep track of groups covered with a struct cpumask). | ||
6468 | * | ||
6469 | * init_sched_build_groups will build a circular linked list of the groups | ||
6470 | * covered by the given span, and will set each group's ->cpumask correctly, | ||
6471 | * and ->cpu_power to 0. | ||
6472 | */ | ||
6473 | static void | ||
6474 | init_sched_build_groups(const struct cpumask *span, | ||
6475 | const struct cpumask *cpu_map, | ||
6476 | int (*group_fn)(int cpu, const struct cpumask *cpu_map, | ||
6477 | struct sched_group **sg, | ||
6478 | struct cpumask *tmpmask), | ||
6479 | struct cpumask *covered, struct cpumask *tmpmask) | ||
6480 | { | ||
6481 | struct sched_group *first = NULL, *last = NULL; | ||
6482 | int i; | ||
6483 | |||
6484 | cpumask_clear(covered); | ||
6485 | |||
6486 | for_each_cpu(i, span) { | ||
6487 | struct sched_group *sg; | ||
6488 | int group = group_fn(i, cpu_map, &sg, tmpmask); | ||
6489 | int j; | ||
6490 | |||
6491 | if (cpumask_test_cpu(i, covered)) | ||
6492 | continue; | ||
6493 | |||
6494 | cpumask_clear(sched_group_cpus(sg)); | ||
6495 | sg->cpu_power = 0; | ||
6496 | |||
6497 | for_each_cpu(j, span) { | ||
6498 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) | ||
6499 | continue; | ||
6500 | |||
6501 | cpumask_set_cpu(j, covered); | ||
6502 | cpumask_set_cpu(j, sched_group_cpus(sg)); | ||
6503 | } | ||
6504 | if (!first) | ||
6505 | first = sg; | ||
6506 | if (last) | ||
6507 | last->next = sg; | ||
6508 | last = sg; | ||
6509 | } | ||
6510 | last->next = first; | ||
6511 | } | ||
6512 | |||
6513 | #define SD_NODES_PER_DOMAIN 16 | 7020 | #define SD_NODES_PER_DOMAIN 16 |
6514 | 7021 | ||
6515 | #ifdef CONFIG_NUMA | 7022 | #ifdef CONFIG_NUMA |
@@ -6526,7 +7033,7 @@ init_sched_build_groups(const struct cpumask *span, | |||
6526 | */ | 7033 | */ |
6527 | static int find_next_best_node(int node, nodemask_t *used_nodes) | 7034 | static int find_next_best_node(int node, nodemask_t *used_nodes) |
6528 | { | 7035 | { |
6529 | int i, n, val, min_val, best_node = 0; | 7036 | int i, n, val, min_val, best_node = -1; |
6530 | 7037 | ||
6531 | min_val = INT_MAX; | 7038 | min_val = INT_MAX; |
6532 | 7039 | ||
@@ -6550,7 +7057,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) | |||
6550 | } | 7057 | } |
6551 | } | 7058 | } |
6552 | 7059 | ||
6553 | node_set(best_node, *used_nodes); | 7060 | if (best_node != -1) |
7061 | node_set(best_node, *used_nodes); | ||
6554 | return best_node; | 7062 | return best_node; |
6555 | } | 7063 | } |
6556 | 7064 | ||
@@ -6576,293 +7084,197 @@ static void sched_domain_node_span(int node, struct cpumask *span) | |||
6576 | 7084 | ||
6577 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | 7085 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { |
6578 | int next_node = find_next_best_node(node, &used_nodes); | 7086 | int next_node = find_next_best_node(node, &used_nodes); |
6579 | 7087 | if (next_node < 0) | |
7088 | break; | ||
6580 | cpumask_or(span, span, cpumask_of_node(next_node)); | 7089 | cpumask_or(span, span, cpumask_of_node(next_node)); |
6581 | } | 7090 | } |
6582 | } | 7091 | } |
7092 | |||
7093 | static const struct cpumask *cpu_node_mask(int cpu) | ||
7094 | { | ||
7095 | lockdep_assert_held(&sched_domains_mutex); | ||
7096 | |||
7097 | sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); | ||
7098 | |||
7099 | return sched_domains_tmpmask; | ||
7100 | } | ||
7101 | |||
7102 | static const struct cpumask *cpu_allnodes_mask(int cpu) | ||
7103 | { | ||
7104 | return cpu_possible_mask; | ||
7105 | } | ||
6583 | #endif /* CONFIG_NUMA */ | 7106 | #endif /* CONFIG_NUMA */ |
6584 | 7107 | ||
6585 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | 7108 | static const struct cpumask *cpu_cpu_mask(int cpu) |
7109 | { | ||
7110 | return cpumask_of_node(cpu_to_node(cpu)); | ||
7111 | } | ||
6586 | 7112 | ||
6587 | /* | 7113 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; |
6588 | * The cpus mask in sched_group and sched_domain hangs off the end. | ||
6589 | * | ||
6590 | * ( See the the comments in include/linux/sched.h:struct sched_group | ||
6591 | * and struct sched_domain. ) | ||
6592 | */ | ||
6593 | struct static_sched_group { | ||
6594 | struct sched_group sg; | ||
6595 | DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); | ||
6596 | }; | ||
6597 | 7114 | ||
6598 | struct static_sched_domain { | 7115 | struct sd_data { |
6599 | struct sched_domain sd; | 7116 | struct sched_domain **__percpu sd; |
6600 | DECLARE_BITMAP(span, CONFIG_NR_CPUS); | 7117 | struct sched_group **__percpu sg; |
7118 | struct sched_group_power **__percpu sgp; | ||
6601 | }; | 7119 | }; |
6602 | 7120 | ||
6603 | struct s_data { | 7121 | struct s_data { |
6604 | #ifdef CONFIG_NUMA | 7122 | struct sched_domain ** __percpu sd; |
6605 | int sd_allnodes; | ||
6606 | cpumask_var_t domainspan; | ||
6607 | cpumask_var_t covered; | ||
6608 | cpumask_var_t notcovered; | ||
6609 | #endif | ||
6610 | cpumask_var_t nodemask; | ||
6611 | cpumask_var_t this_sibling_map; | ||
6612 | cpumask_var_t this_core_map; | ||
6613 | cpumask_var_t send_covered; | ||
6614 | cpumask_var_t tmpmask; | ||
6615 | struct sched_group **sched_group_nodes; | ||
6616 | struct root_domain *rd; | 7123 | struct root_domain *rd; |
6617 | }; | 7124 | }; |
6618 | 7125 | ||
6619 | enum s_alloc { | 7126 | enum s_alloc { |
6620 | sa_sched_groups = 0, | ||
6621 | sa_rootdomain, | 7127 | sa_rootdomain, |
6622 | sa_tmpmask, | 7128 | sa_sd, |
6623 | sa_send_covered, | 7129 | sa_sd_storage, |
6624 | sa_this_core_map, | ||
6625 | sa_this_sibling_map, | ||
6626 | sa_nodemask, | ||
6627 | sa_sched_group_nodes, | ||
6628 | #ifdef CONFIG_NUMA | ||
6629 | sa_notcovered, | ||
6630 | sa_covered, | ||
6631 | sa_domainspan, | ||
6632 | #endif | ||
6633 | sa_none, | 7130 | sa_none, |
6634 | }; | 7131 | }; |
6635 | 7132 | ||
6636 | /* | 7133 | struct sched_domain_topology_level; |
6637 | * SMT sched-domains: | ||
6638 | */ | ||
6639 | #ifdef CONFIG_SCHED_SMT | ||
6640 | static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); | ||
6641 | static DEFINE_PER_CPU(struct static_sched_group, sched_groups); | ||
6642 | 7134 | ||
6643 | static int | 7135 | typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); |
6644 | cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, | 7136 | typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); |
6645 | struct sched_group **sg, struct cpumask *unused) | ||
6646 | { | ||
6647 | if (sg) | ||
6648 | *sg = &per_cpu(sched_groups, cpu).sg; | ||
6649 | return cpu; | ||
6650 | } | ||
6651 | #endif /* CONFIG_SCHED_SMT */ | ||
6652 | 7137 | ||
6653 | /* | 7138 | #define SDTL_OVERLAP 0x01 |
6654 | * multi-core sched-domains: | ||
6655 | */ | ||
6656 | #ifdef CONFIG_SCHED_MC | ||
6657 | static DEFINE_PER_CPU(struct static_sched_domain, core_domains); | ||
6658 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); | ||
6659 | #endif /* CONFIG_SCHED_MC */ | ||
6660 | 7139 | ||
6661 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 7140 | struct sched_domain_topology_level { |
6662 | static int | 7141 | sched_domain_init_f init; |
6663 | cpu_to_core_group(int cpu, const struct cpumask *cpu_map, | 7142 | sched_domain_mask_f mask; |
6664 | struct sched_group **sg, struct cpumask *mask) | 7143 | int flags; |
6665 | { | 7144 | struct sd_data data; |
6666 | int group; | 7145 | }; |
6667 | 7146 | ||
6668 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | ||
6669 | group = cpumask_first(mask); | ||
6670 | if (sg) | ||
6671 | *sg = &per_cpu(sched_group_core, group).sg; | ||
6672 | return group; | ||
6673 | } | ||
6674 | #elif defined(CONFIG_SCHED_MC) | ||
6675 | static int | 7147 | static int |
6676 | cpu_to_core_group(int cpu, const struct cpumask *cpu_map, | 7148 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) |
6677 | struct sched_group **sg, struct cpumask *unused) | ||
6678 | { | 7149 | { |
6679 | if (sg) | 7150 | struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; |
6680 | *sg = &per_cpu(sched_group_core, cpu).sg; | 7151 | const struct cpumask *span = sched_domain_span(sd); |
6681 | return cpu; | 7152 | struct cpumask *covered = sched_domains_tmpmask; |
6682 | } | 7153 | struct sd_data *sdd = sd->private; |
6683 | #endif | 7154 | struct sched_domain *child; |
7155 | int i; | ||
6684 | 7156 | ||
6685 | static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); | 7157 | cpumask_clear(covered); |
6686 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); | ||
6687 | 7158 | ||
6688 | static int | 7159 | for_each_cpu(i, span) { |
6689 | cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, | 7160 | struct cpumask *sg_span; |
6690 | struct sched_group **sg, struct cpumask *mask) | ||
6691 | { | ||
6692 | int group; | ||
6693 | #ifdef CONFIG_SCHED_MC | ||
6694 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); | ||
6695 | group = cpumask_first(mask); | ||
6696 | #elif defined(CONFIG_SCHED_SMT) | ||
6697 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | ||
6698 | group = cpumask_first(mask); | ||
6699 | #else | ||
6700 | group = cpu; | ||
6701 | #endif | ||
6702 | if (sg) | ||
6703 | *sg = &per_cpu(sched_group_phys, group).sg; | ||
6704 | return group; | ||
6705 | } | ||
6706 | 7161 | ||
6707 | #ifdef CONFIG_NUMA | 7162 | if (cpumask_test_cpu(i, covered)) |
6708 | /* | 7163 | continue; |
6709 | * The init_sched_build_groups can't handle what we want to do with node | ||
6710 | * groups, so roll our own. Now each node has its own list of groups which | ||
6711 | * gets dynamically allocated. | ||
6712 | */ | ||
6713 | static DEFINE_PER_CPU(struct static_sched_domain, node_domains); | ||
6714 | static struct sched_group ***sched_group_nodes_bycpu; | ||
6715 | 7164 | ||
6716 | static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); | 7165 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), |
6717 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); | 7166 | GFP_KERNEL, cpu_to_node(i)); |
6718 | 7167 | ||
6719 | static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, | 7168 | if (!sg) |
6720 | struct sched_group **sg, | 7169 | goto fail; |
6721 | struct cpumask *nodemask) | ||
6722 | { | ||
6723 | int group; | ||
6724 | 7170 | ||
6725 | cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); | 7171 | sg_span = sched_group_cpus(sg); |
6726 | group = cpumask_first(nodemask); | ||
6727 | 7172 | ||
6728 | if (sg) | 7173 | child = *per_cpu_ptr(sdd->sd, i); |
6729 | *sg = &per_cpu(sched_group_allnodes, group).sg; | 7174 | if (child->child) { |
6730 | return group; | 7175 | child = child->child; |
6731 | } | 7176 | cpumask_copy(sg_span, sched_domain_span(child)); |
7177 | } else | ||
7178 | cpumask_set_cpu(i, sg_span); | ||
6732 | 7179 | ||
6733 | static void init_numa_sched_groups_power(struct sched_group *group_head) | 7180 | cpumask_or(covered, covered, sg_span); |
6734 | { | ||
6735 | struct sched_group *sg = group_head; | ||
6736 | int j; | ||
6737 | 7181 | ||
6738 | if (!sg) | 7182 | sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); |
6739 | return; | 7183 | atomic_inc(&sg->sgp->ref); |
6740 | do { | ||
6741 | for_each_cpu(j, sched_group_cpus(sg)) { | ||
6742 | struct sched_domain *sd; | ||
6743 | 7184 | ||
6744 | sd = &per_cpu(phys_domains, j).sd; | 7185 | if (cpumask_test_cpu(cpu, sg_span)) |
6745 | if (j != group_first_cpu(sd->groups)) { | 7186 | groups = sg; |
6746 | /* | ||
6747 | * Only add "power" once for each | ||
6748 | * physical package. | ||
6749 | */ | ||
6750 | continue; | ||
6751 | } | ||
6752 | 7187 | ||
6753 | sg->cpu_power += sd->groups->cpu_power; | 7188 | if (!first) |
6754 | } | 7189 | first = sg; |
6755 | sg = sg->next; | 7190 | if (last) |
6756 | } while (sg != group_head); | 7191 | last->next = sg; |
7192 | last = sg; | ||
7193 | last->next = first; | ||
7194 | } | ||
7195 | sd->groups = groups; | ||
7196 | |||
7197 | return 0; | ||
7198 | |||
7199 | fail: | ||
7200 | free_sched_groups(first, 0); | ||
7201 | |||
7202 | return -ENOMEM; | ||
6757 | } | 7203 | } |
6758 | 7204 | ||
6759 | static int build_numa_sched_groups(struct s_data *d, | 7205 | static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) |
6760 | const struct cpumask *cpu_map, int num) | ||
6761 | { | 7206 | { |
6762 | struct sched_domain *sd; | 7207 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); |
6763 | struct sched_group *sg, *prev; | 7208 | struct sched_domain *child = sd->child; |
6764 | int n, j; | ||
6765 | 7209 | ||
6766 | cpumask_clear(d->covered); | 7210 | if (child) |
6767 | cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); | 7211 | cpu = cpumask_first(sched_domain_span(child)); |
6768 | if (cpumask_empty(d->nodemask)) { | 7212 | |
6769 | d->sched_group_nodes[num] = NULL; | 7213 | if (sg) { |
6770 | goto out; | 7214 | *sg = *per_cpu_ptr(sdd->sg, cpu); |
7215 | (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); | ||
7216 | atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ | ||
6771 | } | 7217 | } |
6772 | 7218 | ||
6773 | sched_domain_node_span(num, d->domainspan); | 7219 | return cpu; |
6774 | cpumask_and(d->domainspan, d->domainspan, cpu_map); | 7220 | } |
6775 | 7221 | ||
6776 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | 7222 | /* |
6777 | GFP_KERNEL, num); | 7223 | * build_sched_groups will build a circular linked list of the groups |
6778 | if (!sg) { | 7224 | * covered by the given span, and will set each group's ->cpumask correctly, |
6779 | printk(KERN_WARNING "Can not alloc domain group for node %d\n", | 7225 | * and ->cpu_power to 0. |
6780 | num); | 7226 | * |
6781 | return -ENOMEM; | 7227 | * Assumes the sched_domain tree is fully constructed |
6782 | } | 7228 | */ |
6783 | d->sched_group_nodes[num] = sg; | 7229 | static int |
7230 | build_sched_groups(struct sched_domain *sd, int cpu) | ||
7231 | { | ||
7232 | struct sched_group *first = NULL, *last = NULL; | ||
7233 | struct sd_data *sdd = sd->private; | ||
7234 | const struct cpumask *span = sched_domain_span(sd); | ||
7235 | struct cpumask *covered; | ||
7236 | int i; | ||
6784 | 7237 | ||
6785 | for_each_cpu(j, d->nodemask) { | 7238 | get_group(cpu, sdd, &sd->groups); |
6786 | sd = &per_cpu(node_domains, j).sd; | 7239 | atomic_inc(&sd->groups->ref); |
6787 | sd->groups = sg; | ||
6788 | } | ||
6789 | 7240 | ||
6790 | sg->cpu_power = 0; | 7241 | if (cpu != cpumask_first(sched_domain_span(sd))) |
6791 | cpumask_copy(sched_group_cpus(sg), d->nodemask); | 7242 | return 0; |
6792 | sg->next = sg; | ||
6793 | cpumask_or(d->covered, d->covered, d->nodemask); | ||
6794 | 7243 | ||
6795 | prev = sg; | 7244 | lockdep_assert_held(&sched_domains_mutex); |
6796 | for (j = 0; j < nr_node_ids; j++) { | 7245 | covered = sched_domains_tmpmask; |
6797 | n = (num + j) % nr_node_ids; | ||
6798 | cpumask_complement(d->notcovered, d->covered); | ||
6799 | cpumask_and(d->tmpmask, d->notcovered, cpu_map); | ||
6800 | cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); | ||
6801 | if (cpumask_empty(d->tmpmask)) | ||
6802 | break; | ||
6803 | cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); | ||
6804 | if (cpumask_empty(d->tmpmask)) | ||
6805 | continue; | ||
6806 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
6807 | GFP_KERNEL, num); | ||
6808 | if (!sg) { | ||
6809 | printk(KERN_WARNING | ||
6810 | "Can not alloc domain group for node %d\n", j); | ||
6811 | return -ENOMEM; | ||
6812 | } | ||
6813 | sg->cpu_power = 0; | ||
6814 | cpumask_copy(sched_group_cpus(sg), d->tmpmask); | ||
6815 | sg->next = prev->next; | ||
6816 | cpumask_or(d->covered, d->covered, d->tmpmask); | ||
6817 | prev->next = sg; | ||
6818 | prev = sg; | ||
6819 | } | ||
6820 | out: | ||
6821 | return 0; | ||
6822 | } | ||
6823 | #endif /* CONFIG_NUMA */ | ||
6824 | 7246 | ||
6825 | #ifdef CONFIG_NUMA | 7247 | cpumask_clear(covered); |
6826 | /* Free memory allocated for various sched_group structures */ | ||
6827 | static void free_sched_groups(const struct cpumask *cpu_map, | ||
6828 | struct cpumask *nodemask) | ||
6829 | { | ||
6830 | int cpu, i; | ||
6831 | 7248 | ||
6832 | for_each_cpu(cpu, cpu_map) { | 7249 | for_each_cpu(i, span) { |
6833 | struct sched_group **sched_group_nodes | 7250 | struct sched_group *sg; |
6834 | = sched_group_nodes_bycpu[cpu]; | 7251 | int group = get_group(i, sdd, &sg); |
7252 | int j; | ||
6835 | 7253 | ||
6836 | if (!sched_group_nodes) | 7254 | if (cpumask_test_cpu(i, covered)) |
6837 | continue; | 7255 | continue; |
6838 | 7256 | ||
6839 | for (i = 0; i < nr_node_ids; i++) { | 7257 | cpumask_clear(sched_group_cpus(sg)); |
6840 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | 7258 | sg->sgp->power = 0; |
6841 | 7259 | ||
6842 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | 7260 | for_each_cpu(j, span) { |
6843 | if (cpumask_empty(nodemask)) | 7261 | if (get_group(j, sdd, NULL) != group) |
6844 | continue; | 7262 | continue; |
6845 | 7263 | ||
6846 | if (sg == NULL) | 7264 | cpumask_set_cpu(j, covered); |
6847 | continue; | 7265 | cpumask_set_cpu(j, sched_group_cpus(sg)); |
6848 | sg = sg->next; | ||
6849 | next_sg: | ||
6850 | oldsg = sg; | ||
6851 | sg = sg->next; | ||
6852 | kfree(oldsg); | ||
6853 | if (oldsg != sched_group_nodes[i]) | ||
6854 | goto next_sg; | ||
6855 | } | 7266 | } |
6856 | kfree(sched_group_nodes); | 7267 | |
6857 | sched_group_nodes_bycpu[cpu] = NULL; | 7268 | if (!first) |
7269 | first = sg; | ||
7270 | if (last) | ||
7271 | last->next = sg; | ||
7272 | last = sg; | ||
6858 | } | 7273 | } |
7274 | last->next = first; | ||
7275 | |||
7276 | return 0; | ||
6859 | } | 7277 | } |
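/*
 * Editorial sketch, not part of this patch: build_sched_groups() above links
 * a domain's groups into a circular singly-linked list via ->next, with
 * sd->groups pointing at the group built for the requesting cpu. Walking all
 * groups of a domain therefore uses the do/while idiom also used by
 * init_sched_groups_power() below; the helper name here is illustrative only.
 */
static unsigned long example_sum_group_power(struct sched_domain *sd)
{
	struct sched_group *sg = sd->groups;
	unsigned long power = 0;

	do {
		power += sg->sgp->power;	/* each group is visited exactly once */
		sg = sg->next;
	} while (sg != sd->groups);

	return power;
}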
6860 | #else /* !CONFIG_NUMA */ | ||
6861 | static void free_sched_groups(const struct cpumask *cpu_map, | ||
6862 | struct cpumask *nodemask) | ||
6863 | { | ||
6864 | } | ||
6865 | #endif /* CONFIG_NUMA */ | ||
6866 | 7278 | ||
6867 | /* | 7279 | /* |
6868 | * Initialize sched groups cpu_power. | 7280 | * Initialize sched groups cpu_power. |
@@ -6876,46 +7288,19 @@ static void free_sched_groups(const struct cpumask *cpu_map, | |||
6876 | */ | 7288 | */ |
6877 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) | 7289 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) |
6878 | { | 7290 | { |
6879 | struct sched_domain *child; | 7291 | struct sched_group *sg = sd->groups; |
6880 | struct sched_group *group; | ||
6881 | long power; | ||
6882 | int weight; | ||
6883 | |||
6884 | WARN_ON(!sd || !sd->groups); | ||
6885 | |||
6886 | if (cpu != group_first_cpu(sd->groups)) | ||
6887 | return; | ||
6888 | 7292 | ||
6889 | child = sd->child; | 7293 | WARN_ON(!sd || !sg); |
6890 | 7294 | ||
6891 | sd->groups->cpu_power = 0; | 7295 | do { |
7296 | sg->group_weight = cpumask_weight(sched_group_cpus(sg)); | ||
7297 | sg = sg->next; | ||
7298 | } while (sg != sd->groups); | ||
6892 | 7299 | ||
6893 | if (!child) { | 7300 | if (cpu != group_first_cpu(sg)) |
6894 | power = SCHED_LOAD_SCALE; | ||
6895 | weight = cpumask_weight(sched_domain_span(sd)); | ||
6896 | /* | ||
6897 | * SMT siblings share the power of a single core. | ||
6898 | * Usually multiple threads get a better yield out of | ||
6899 | * that one core than a single thread would have, | ||
6900 | * reflect that in sd->smt_gain. | ||
6901 | */ | ||
6902 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | ||
6903 | power *= sd->smt_gain; | ||
6904 | power /= weight; | ||
6905 | power >>= SCHED_LOAD_SHIFT; | ||
6906 | } | ||
6907 | sd->groups->cpu_power += power; | ||
6908 | return; | 7301 | return; |
6909 | } | ||
6910 | 7302 | ||
6911 | /* | 7303 | update_group_power(sd, cpu); |
6912 | * Add cpu_power of each child group to this groups cpu_power. | ||
6913 | */ | ||
6914 | group = child->groups; | ||
6915 | do { | ||
6916 | sd->groups->cpu_power += group->cpu_power; | ||
6917 | group = group->next; | ||
6918 | } while (group != child->groups); | ||
6919 | } | 7304 | } |
6920 | 7305 | ||
6921 | /* | 7306 | /* |
@@ -6929,15 +7314,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
6929 | # define SD_INIT_NAME(sd, type) do { } while (0) | 7314 | # define SD_INIT_NAME(sd, type) do { } while (0) |
6930 | #endif | 7315 | #endif |
6931 | 7316 | ||
6932 | #define SD_INIT(sd, type) sd_init_##type(sd) | 7317 | #define SD_INIT_FUNC(type) \ |
6933 | 7318 | static noinline struct sched_domain * \ | |
6934 | #define SD_INIT_FUNC(type) \ | 7319 | sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ |
6935 | static noinline void sd_init_##type(struct sched_domain *sd) \ | 7320 | { \ |
6936 | { \ | 7321 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ |
6937 | memset(sd, 0, sizeof(*sd)); \ | 7322 | *sd = SD_##type##_INIT; \ |
6938 | *sd = SD_##type##_INIT; \ | 7323 | SD_INIT_NAME(sd, type); \ |
6939 | sd->level = SD_LV_##type; \ | 7324 | sd->private = &tl->data; \ |
6940 | SD_INIT_NAME(sd, type); \ | 7325 | return sd; \ |
6941 | } | 7326 | } |
6942 | 7327 | ||
6943 | SD_INIT_FUNC(CPU) | 7328 | SD_INIT_FUNC(CPU) |
@@ -6951,15 +7336,19 @@ SD_INIT_FUNC(CPU) | |||
6951 | #ifdef CONFIG_SCHED_MC | 7336 | #ifdef CONFIG_SCHED_MC |
6952 | SD_INIT_FUNC(MC) | 7337 | SD_INIT_FUNC(MC) |
6953 | #endif | 7338 | #endif |
7339 | #ifdef CONFIG_SCHED_BOOK | ||
7340 | SD_INIT_FUNC(BOOK) | ||
7341 | #endif | ||
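/*
 * Editorial note, not part of this patch: given the new SD_INIT_FUNC()
 * definition above, SD_INIT_FUNC(MC) expands to roughly the following
 * (SD_MC_INIT is the per-level template; SD_INIT_NAME() is a no-op unless
 * CONFIG_SCHED_DEBUG is set):
 */
static noinline struct sched_domain *
sd_init_MC(struct sched_domain_topology_level *tl, int cpu)
{
	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);

	*sd = SD_MC_INIT;		/* copy in the MC-level defaults */
	SD_INIT_NAME(sd, MC);		/* name the domain for debugging */
	sd->private = &tl->data;	/* remember the owning topology level */
	return sd;
}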
6954 | 7342 | ||
6955 | static int default_relax_domain_level = -1; | 7343 | static int default_relax_domain_level = -1; |
7344 | int sched_domain_level_max; | ||
6956 | 7345 | ||
6957 | static int __init setup_relax_domain_level(char *str) | 7346 | static int __init setup_relax_domain_level(char *str) |
6958 | { | 7347 | { |
6959 | unsigned long val; | 7348 | unsigned long val; |
6960 | 7349 | ||
6961 | val = simple_strtoul(str, NULL, 0); | 7350 | val = simple_strtoul(str, NULL, 0); |
6962 | if (val < SD_LV_MAX) | 7351 | if (val < sched_domain_level_max) |
6963 | default_relax_domain_level = val; | 7352 | default_relax_domain_level = val; |
6964 | 7353 | ||
6965 | return 1; | 7354 | return 1; |
@@ -6987,35 +7376,20 @@ static void set_domain_attribute(struct sched_domain *sd, | |||
6987 | } | 7376 | } |
6988 | } | 7377 | } |
6989 | 7378 | ||
7379 | static void __sdt_free(const struct cpumask *cpu_map); | ||
7380 | static int __sdt_alloc(const struct cpumask *cpu_map); | ||
7381 | |||
6990 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | 7382 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, |
6991 | const struct cpumask *cpu_map) | 7383 | const struct cpumask *cpu_map) |
6992 | { | 7384 | { |
6993 | switch (what) { | 7385 | switch (what) { |
6994 | case sa_sched_groups: | ||
6995 | free_sched_groups(cpu_map, d->tmpmask); /* fall through */ | ||
6996 | d->sched_group_nodes = NULL; | ||
6997 | case sa_rootdomain: | 7386 | case sa_rootdomain: |
6998 | free_rootdomain(d->rd); /* fall through */ | 7387 | if (!atomic_read(&d->rd->refcount)) |
6999 | case sa_tmpmask: | 7388 | free_rootdomain(&d->rd->rcu); /* fall through */ |
7000 | free_cpumask_var(d->tmpmask); /* fall through */ | 7389 | case sa_sd: |
7001 | case sa_send_covered: | 7390 | free_percpu(d->sd); /* fall through */ |
7002 | free_cpumask_var(d->send_covered); /* fall through */ | 7391 | case sa_sd_storage: |
7003 | case sa_this_core_map: | 7392 | __sdt_free(cpu_map); /* fall through */ |
7004 | free_cpumask_var(d->this_core_map); /* fall through */ | ||
7005 | case sa_this_sibling_map: | ||
7006 | free_cpumask_var(d->this_sibling_map); /* fall through */ | ||
7007 | case sa_nodemask: | ||
7008 | free_cpumask_var(d->nodemask); /* fall through */ | ||
7009 | case sa_sched_group_nodes: | ||
7010 | #ifdef CONFIG_NUMA | ||
7011 | kfree(d->sched_group_nodes); /* fall through */ | ||
7012 | case sa_notcovered: | ||
7013 | free_cpumask_var(d->notcovered); /* fall through */ | ||
7014 | case sa_covered: | ||
7015 | free_cpumask_var(d->covered); /* fall through */ | ||
7016 | case sa_domainspan: | ||
7017 | free_cpumask_var(d->domainspan); /* fall through */ | ||
7018 | #endif | ||
7019 | case sa_none: | 7393 | case sa_none: |
7020 | break; | 7394 | break; |
7021 | } | 7395 | } |
@@ -7024,270 +7398,233 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | |||
7024 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, | 7398 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, |
7025 | const struct cpumask *cpu_map) | 7399 | const struct cpumask *cpu_map) |
7026 | { | 7400 | { |
7027 | #ifdef CONFIG_NUMA | 7401 | memset(d, 0, sizeof(*d)); |
7028 | if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) | 7402 | |
7029 | return sa_none; | 7403 | if (__sdt_alloc(cpu_map)) |
7030 | if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) | 7404 | return sa_sd_storage; |
7031 | return sa_domainspan; | 7405 | d->sd = alloc_percpu(struct sched_domain *); |
7032 | if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) | 7406 | if (!d->sd) |
7033 | return sa_covered; | 7407 | return sa_sd_storage; |
7034 | /* Allocate the per-node list of sched groups */ | ||
7035 | d->sched_group_nodes = kcalloc(nr_node_ids, | ||
7036 | sizeof(struct sched_group *), GFP_KERNEL); | ||
7037 | if (!d->sched_group_nodes) { | ||
7038 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | ||
7039 | return sa_notcovered; | ||
7040 | } | ||
7041 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; | ||
7042 | #endif | ||
7043 | if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) | ||
7044 | return sa_sched_group_nodes; | ||
7045 | if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) | ||
7046 | return sa_nodemask; | ||
7047 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) | ||
7048 | return sa_this_sibling_map; | ||
7049 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | ||
7050 | return sa_this_core_map; | ||
7051 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) | ||
7052 | return sa_send_covered; | ||
7053 | d->rd = alloc_rootdomain(); | 7408 | d->rd = alloc_rootdomain(); |
7054 | if (!d->rd) { | 7409 | if (!d->rd) |
7055 | printk(KERN_WARNING "Cannot alloc root domain\n"); | 7410 | return sa_sd; |
7056 | return sa_tmpmask; | ||
7057 | } | ||
7058 | return sa_rootdomain; | 7411 | return sa_rootdomain; |
7059 | } | 7412 | } |
7060 | 7413 | ||
7061 | static struct sched_domain *__build_numa_sched_domains(struct s_data *d, | 7414 | /* |
7062 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) | 7415 | * NULL the sd_data elements we've used to build the sched_domain and |
7416 | * sched_group structure so that the subsequent __free_domain_allocs() | ||
7417 | * will not free the data we're using. | ||
7418 | */ | ||
7419 | static void claim_allocations(int cpu, struct sched_domain *sd) | ||
7063 | { | 7420 | { |
7064 | struct sched_domain *sd = NULL; | 7421 | struct sd_data *sdd = sd->private; |
7065 | #ifdef CONFIG_NUMA | ||
7066 | struct sched_domain *parent; | ||
7067 | |||
7068 | d->sd_allnodes = 0; | ||
7069 | if (cpumask_weight(cpu_map) > | ||
7070 | SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { | ||
7071 | sd = &per_cpu(allnodes_domains, i).sd; | ||
7072 | SD_INIT(sd, ALLNODES); | ||
7073 | set_domain_attribute(sd, attr); | ||
7074 | cpumask_copy(sched_domain_span(sd), cpu_map); | ||
7075 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7076 | d->sd_allnodes = 1; | ||
7077 | } | ||
7078 | parent = sd; | ||
7079 | |||
7080 | sd = &per_cpu(node_domains, i).sd; | ||
7081 | SD_INIT(sd, NODE); | ||
7082 | set_domain_attribute(sd, attr); | ||
7083 | sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); | ||
7084 | sd->parent = parent; | ||
7085 | if (parent) | ||
7086 | parent->child = sd; | ||
7087 | cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); | ||
7088 | #endif | ||
7089 | return sd; | ||
7090 | } | ||
7091 | 7422 | ||
7092 | static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, | 7423 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); |
7093 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 7424 | *per_cpu_ptr(sdd->sd, cpu) = NULL; |
7094 | struct sched_domain *parent, int i) | ||
7095 | { | ||
7096 | struct sched_domain *sd; | ||
7097 | sd = &per_cpu(phys_domains, i).sd; | ||
7098 | SD_INIT(sd, CPU); | ||
7099 | set_domain_attribute(sd, attr); | ||
7100 | cpumask_copy(sched_domain_span(sd), d->nodemask); | ||
7101 | sd->parent = parent; | ||
7102 | if (parent) | ||
7103 | parent->child = sd; | ||
7104 | cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7105 | return sd; | ||
7106 | } | ||
7107 | 7425 | ||
7108 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, | 7426 | if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) |
7109 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 7427 | *per_cpu_ptr(sdd->sg, cpu) = NULL; |
7110 | struct sched_domain *parent, int i) | 7428 | |
7111 | { | 7429 | if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) |
7112 | struct sched_domain *sd = parent; | 7430 | *per_cpu_ptr(sdd->sgp, cpu) = NULL; |
7113 | #ifdef CONFIG_SCHED_MC | ||
7114 | sd = &per_cpu(core_domains, i).sd; | ||
7115 | SD_INIT(sd, MC); | ||
7116 | set_domain_attribute(sd, attr); | ||
7117 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); | ||
7118 | sd->parent = parent; | ||
7119 | parent->child = sd; | ||
7120 | cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7121 | #endif | ||
7122 | return sd; | ||
7123 | } | 7431 | } |
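/*
 * Editorial illustration, not part of this patch (standalone user-space C):
 * the claim/free protocol used by claim_allocations() and __sdt_free() in
 * miniature. Allocations are staged in a table of slots; "claiming" one
 * simply NULLs its slot, so the teardown pass frees only what was never
 * claimed.
 */
#include <stdlib.h>

#define NSLOTS 4

static void *slots[NSLOTS];

static void *claim(int i)
{
	void *obj = slots[i];

	slots[i] = NULL;		/* teardown below now skips this slot */
	return obj;
}

static void teardown(void)
{
	int i;

	for (i = 0; i < NSLOTS; i++)
		free(slots[i]);		/* free(NULL) is a harmless no-op */
}

int main(void)
{
	slots[0] = malloc(16);
	slots[1] = malloc(16);

	void *kept = claim(0);		/* ownership leaves the staging table */

	teardown();			/* frees slots[1] only */
	free(kept);			/* the claimed object has a new owner */
	return 0;
}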
7124 | 7432 | ||
7125 | static struct sched_domain *__build_smt_sched_domain(struct s_data *d, | ||
7126 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
7127 | struct sched_domain *parent, int i) | ||
7128 | { | ||
7129 | struct sched_domain *sd = parent; | ||
7130 | #ifdef CONFIG_SCHED_SMT | 7433 | #ifdef CONFIG_SCHED_SMT |
7131 | sd = &per_cpu(cpu_domains, i).sd; | 7434 | static const struct cpumask *cpu_smt_mask(int cpu) |
7132 | SD_INIT(sd, SIBLING); | 7435 | { |
7133 | set_domain_attribute(sd, attr); | 7436 | return topology_thread_cpumask(cpu); |
7134 | cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); | ||
7135 | sd->parent = parent; | ||
7136 | parent->child = sd; | ||
7137 | cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7138 | #endif | ||
7139 | return sd; | ||
7140 | } | 7437 | } |
7438 | #endif | ||
7141 | 7439 | ||
7142 | static void build_sched_groups(struct s_data *d, enum sched_domain_level l, | 7440 | /* |
7143 | const struct cpumask *cpu_map, int cpu) | 7441 | * Topology list, bottom-up. |
7144 | { | 7442 | */ |
7145 | switch (l) { | 7443 | static struct sched_domain_topology_level default_topology[] = { |
7146 | #ifdef CONFIG_SCHED_SMT | 7444 | #ifdef CONFIG_SCHED_SMT |
7147 | case SD_LV_SIBLING: /* set up CPU (sibling) groups */ | 7445 | { sd_init_SIBLING, cpu_smt_mask, }, |
7148 | cpumask_and(d->this_sibling_map, cpu_map, | ||
7149 | topology_thread_cpumask(cpu)); | ||
7150 | if (cpu == cpumask_first(d->this_sibling_map)) | ||
7151 | init_sched_build_groups(d->this_sibling_map, cpu_map, | ||
7152 | &cpu_to_cpu_group, | ||
7153 | d->send_covered, d->tmpmask); | ||
7154 | break; | ||
7155 | #endif | 7446 | #endif |
7156 | #ifdef CONFIG_SCHED_MC | 7447 | #ifdef CONFIG_SCHED_MC |
7157 | case SD_LV_MC: /* set up multi-core groups */ | 7448 | { sd_init_MC, cpu_coregroup_mask, }, |
7158 | cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); | ||
7159 | if (cpu == cpumask_first(d->this_core_map)) | ||
7160 | init_sched_build_groups(d->this_core_map, cpu_map, | ||
7161 | &cpu_to_core_group, | ||
7162 | d->send_covered, d->tmpmask); | ||
7163 | break; | ||
7164 | #endif | 7449 | #endif |
7165 | case SD_LV_CPU: /* set up physical groups */ | 7450 | #ifdef CONFIG_SCHED_BOOK |
7166 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); | 7451 | { sd_init_BOOK, cpu_book_mask, }, |
7167 | if (!cpumask_empty(d->nodemask)) | 7452 | #endif |
7168 | init_sched_build_groups(d->nodemask, cpu_map, | 7453 | { sd_init_CPU, cpu_cpu_mask, }, |
7169 | &cpu_to_phys_group, | ||
7170 | d->send_covered, d->tmpmask); | ||
7171 | break; | ||
7172 | #ifdef CONFIG_NUMA | 7454 | #ifdef CONFIG_NUMA |
7173 | case SD_LV_ALLNODES: | 7455 | { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, |
7174 | init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, | 7456 | { sd_init_ALLNODES, cpu_allnodes_mask, }, |
7175 | d->send_covered, d->tmpmask); | ||
7176 | break; | ||
7177 | #endif | 7457 | #endif |
7178 | default: | 7458 | { NULL, }, |
7179 | break; | 7459 | }; |
7460 | |||
7461 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; | ||
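/*
 * Editorial sketch, not part of this patch: the shape of a topology level is
 * inferred from its uses in this file (tl->init, tl->mask, tl->flags,
 * tl->data). Each entry pairs a domain-init function with a cpumask function,
 * plus optional flags and per-level staging storage, roughly:
 */
struct sched_domain_topology_level_sketch {
	struct sched_domain *(*init)(struct sched_domain_topology_level *tl, int cpu);
	const struct cpumask *(*mask)(int cpu);
	int flags;			/* SDTL_OVERLAP for the NODE level */
	struct sd_data data;		/* per-cpu sd/sg/sgp pointers */
};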
7462 | |||
7463 | static int __sdt_alloc(const struct cpumask *cpu_map) | ||
7464 | { | ||
7465 | struct sched_domain_topology_level *tl; | ||
7466 | int j; | ||
7467 | |||
7468 | for (tl = sched_domain_topology; tl->init; tl++) { | ||
7469 | struct sd_data *sdd = &tl->data; | ||
7470 | |||
7471 | sdd->sd = alloc_percpu(struct sched_domain *); | ||
7472 | if (!sdd->sd) | ||
7473 | return -ENOMEM; | ||
7474 | |||
7475 | sdd->sg = alloc_percpu(struct sched_group *); | ||
7476 | if (!sdd->sg) | ||
7477 | return -ENOMEM; | ||
7478 | |||
7479 | sdd->sgp = alloc_percpu(struct sched_group_power *); | ||
7480 | if (!sdd->sgp) | ||
7481 | return -ENOMEM; | ||
7482 | |||
7483 | for_each_cpu(j, cpu_map) { | ||
7484 | struct sched_domain *sd; | ||
7485 | struct sched_group *sg; | ||
7486 | struct sched_group_power *sgp; | ||
7487 | |||
7488 | sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), | ||
7489 | GFP_KERNEL, cpu_to_node(j)); | ||
7490 | if (!sd) | ||
7491 | return -ENOMEM; | ||
7492 | |||
7493 | *per_cpu_ptr(sdd->sd, j) = sd; | ||
7494 | |||
7495 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
7496 | GFP_KERNEL, cpu_to_node(j)); | ||
7497 | if (!sg) | ||
7498 | return -ENOMEM; | ||
7499 | |||
7500 | *per_cpu_ptr(sdd->sg, j) = sg; | ||
7501 | |||
7502 | sgp = kzalloc_node(sizeof(struct sched_group_power), | ||
7503 | GFP_KERNEL, cpu_to_node(j)); | ||
7504 | if (!sgp) | ||
7505 | return -ENOMEM; | ||
7506 | |||
7507 | *per_cpu_ptr(sdd->sgp, j) = sgp; | ||
7508 | } | ||
7509 | } | ||
7510 | |||
7511 | return 0; | ||
7512 | } | ||
7513 | |||
7514 | static void __sdt_free(const struct cpumask *cpu_map) | ||
7515 | { | ||
7516 | struct sched_domain_topology_level *tl; | ||
7517 | int j; | ||
7518 | |||
7519 | for (tl = sched_domain_topology; tl->init; tl++) { | ||
7520 | struct sd_data *sdd = &tl->data; | ||
7521 | |||
7522 | for_each_cpu(j, cpu_map) { | ||
7523 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); | ||
7524 | if (sd && (sd->flags & SD_OVERLAP)) | ||
7525 | free_sched_groups(sd->groups, 0); | ||
7526 | kfree(*per_cpu_ptr(sdd->sg, j)); | ||
7527 | kfree(*per_cpu_ptr(sdd->sgp, j)); | ||
7528 | } | ||
7529 | free_percpu(sdd->sd); | ||
7530 | free_percpu(sdd->sg); | ||
7531 | free_percpu(sdd->sgp); | ||
7532 | } | ||
7533 | } | ||
7534 | |||
7535 | struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | ||
7536 | struct s_data *d, const struct cpumask *cpu_map, | ||
7537 | struct sched_domain_attr *attr, struct sched_domain *child, | ||
7538 | int cpu) | ||
7539 | { | ||
7540 | struct sched_domain *sd = tl->init(tl, cpu); | ||
7541 | if (!sd) | ||
7542 | return child; | ||
7543 | |||
7544 | set_domain_attribute(sd, attr); | ||
7545 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | ||
7546 | if (child) { | ||
7547 | sd->level = child->level + 1; | ||
7548 | sched_domain_level_max = max(sched_domain_level_max, sd->level); | ||
7549 | child->parent = sd; | ||
7180 | } | 7550 | } |
7551 | sd->child = child; | ||
7552 | |||
7553 | return sd; | ||
7181 | } | 7554 | } |
7182 | 7555 | ||
7183 | /* | 7556 | /* |
7184 | * Build sched domains for a given set of cpus and attach the sched domains | 7557 | * Build sched domains for a given set of cpus and attach the sched domains |
7185 | * to the individual cpus | 7558 | * to the individual cpus |
7186 | */ | 7559 | */ |
7187 | static int __build_sched_domains(const struct cpumask *cpu_map, | 7560 | static int build_sched_domains(const struct cpumask *cpu_map, |
7188 | struct sched_domain_attr *attr) | 7561 | struct sched_domain_attr *attr) |
7189 | { | 7562 | { |
7190 | enum s_alloc alloc_state = sa_none; | 7563 | enum s_alloc alloc_state = sa_none; |
7191 | struct s_data d; | ||
7192 | struct sched_domain *sd; | 7564 | struct sched_domain *sd; |
7193 | int i; | 7565 | struct s_data d; |
7194 | #ifdef CONFIG_NUMA | 7566 | int i, ret = -ENOMEM; |
7195 | d.sd_allnodes = 0; | ||
7196 | #endif | ||
7197 | 7567 | ||
7198 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); | 7568 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); |
7199 | if (alloc_state != sa_rootdomain) | 7569 | if (alloc_state != sa_rootdomain) |
7200 | goto error; | 7570 | goto error; |
7201 | alloc_state = sa_sched_groups; | ||
7202 | |||
7203 | /* | ||
7204 | * Set up domains for cpus specified by the cpu_map. | ||
7205 | */ | ||
7206 | for_each_cpu(i, cpu_map) { | ||
7207 | cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), | ||
7208 | cpu_map); | ||
7209 | |||
7210 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); | ||
7211 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); | ||
7212 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); | ||
7213 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); | ||
7214 | } | ||
7215 | 7571 | ||
7572 | /* Set up domains for cpus specified by the cpu_map. */ | ||
7216 | for_each_cpu(i, cpu_map) { | 7573 | for_each_cpu(i, cpu_map) { |
7217 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); | 7574 | struct sched_domain_topology_level *tl; |
7218 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); | 7575 | |
7219 | } | 7576 | sd = NULL; |
7220 | 7577 | for (tl = sched_domain_topology; tl->init; tl++) { | |
7221 | /* Set up physical groups */ | 7578 | sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); |
7222 | for (i = 0; i < nr_node_ids; i++) | 7579 | if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) |
7223 | build_sched_groups(&d, SD_LV_CPU, cpu_map, i); | 7580 | sd->flags |= SD_OVERLAP; |
7224 | 7581 | if (cpumask_equal(cpu_map, sched_domain_span(sd))) | |
7225 | #ifdef CONFIG_NUMA | 7582 | break; |
7226 | /* Set up node groups */ | 7583 | } |
7227 | if (d.sd_allnodes) | ||
7228 | build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); | ||
7229 | 7584 | ||
7230 | for (i = 0; i < nr_node_ids; i++) | 7585 | while (sd->child) |
7231 | if (build_numa_sched_groups(&d, cpu_map, i)) | 7586 | sd = sd->child; |
7232 | goto error; | ||
7233 | #endif | ||
7234 | 7587 | ||
7235 | /* Calculate CPU power for physical packages and nodes */ | 7588 | *per_cpu_ptr(d.sd, i) = sd; |
7236 | #ifdef CONFIG_SCHED_SMT | ||
7237 | for_each_cpu(i, cpu_map) { | ||
7238 | sd = &per_cpu(cpu_domains, i).sd; | ||
7239 | init_sched_groups_power(i, sd); | ||
7240 | } | ||
7241 | #endif | ||
7242 | #ifdef CONFIG_SCHED_MC | ||
7243 | for_each_cpu(i, cpu_map) { | ||
7244 | sd = &per_cpu(core_domains, i).sd; | ||
7245 | init_sched_groups_power(i, sd); | ||
7246 | } | 7589 | } |
7247 | #endif | ||
7248 | 7590 | ||
7591 | /* Build the groups for the domains */ | ||
7249 | for_each_cpu(i, cpu_map) { | 7592 | for_each_cpu(i, cpu_map) { |
7250 | sd = &per_cpu(phys_domains, i).sd; | 7593 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { |
7251 | init_sched_groups_power(i, sd); | 7594 | sd->span_weight = cpumask_weight(sched_domain_span(sd)); |
7595 | if (sd->flags & SD_OVERLAP) { | ||
7596 | if (build_overlap_sched_groups(sd, i)) | ||
7597 | goto error; | ||
7598 | } else { | ||
7599 | if (build_sched_groups(sd, i)) | ||
7600 | goto error; | ||
7601 | } | ||
7602 | } | ||
7252 | } | 7603 | } |
7253 | 7604 | ||
7254 | #ifdef CONFIG_NUMA | 7605 | /* Calculate CPU power for physical packages and nodes */ |
7255 | for (i = 0; i < nr_node_ids; i++) | 7606 | for (i = nr_cpumask_bits-1; i >= 0; i--) { |
7256 | init_numa_sched_groups_power(d.sched_group_nodes[i]); | 7607 | if (!cpumask_test_cpu(i, cpu_map)) |
7257 | 7608 | continue; | |
7258 | if (d.sd_allnodes) { | ||
7259 | struct sched_group *sg; | ||
7260 | 7609 | ||
7261 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, | 7610 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { |
7262 | d.tmpmask); | 7611 | claim_allocations(i, sd); |
7263 | init_numa_sched_groups_power(sg); | 7612 | init_sched_groups_power(i, sd); |
7613 | } | ||
7264 | } | 7614 | } |
7265 | #endif | ||
7266 | 7615 | ||
7267 | /* Attach the domains */ | 7616 | /* Attach the domains */ |
7617 | rcu_read_lock(); | ||
7268 | for_each_cpu(i, cpu_map) { | 7618 | for_each_cpu(i, cpu_map) { |
7269 | #ifdef CONFIG_SCHED_SMT | 7619 | sd = *per_cpu_ptr(d.sd, i); |
7270 | sd = &per_cpu(cpu_domains, i).sd; | ||
7271 | #elif defined(CONFIG_SCHED_MC) | ||
7272 | sd = &per_cpu(core_domains, i).sd; | ||
7273 | #else | ||
7274 | sd = &per_cpu(phys_domains, i).sd; | ||
7275 | #endif | ||
7276 | cpu_attach_domain(sd, d.rd, i); | 7620 | cpu_attach_domain(sd, d.rd, i); |
7277 | } | 7621 | } |
7622 | rcu_read_unlock(); | ||
7278 | 7623 | ||
7279 | d.sched_group_nodes = NULL; /* don't free this we still need it */ | 7624 | ret = 0; |
7280 | __free_domain_allocs(&d, sa_tmpmask, cpu_map); | ||
7281 | return 0; | ||
7282 | |||
7283 | error: | 7625 | error: |
7284 | __free_domain_allocs(&d, alloc_state, cpu_map); | 7626 | __free_domain_allocs(&d, alloc_state, cpu_map); |
7285 | return -ENOMEM; | 7627 | return ret; |
7286 | } | ||
7287 | |||
7288 | static int build_sched_domains(const struct cpumask *cpu_map) | ||
7289 | { | ||
7290 | return __build_sched_domains(cpu_map, NULL); | ||
7291 | } | 7628 | } |
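/*
 * Editorial summary, not part of this patch: the rewritten
 * build_sched_domains() above proceeds in five passes over cpu_map:
 *  1. __visit_domain_allocation_hell() stages per-cpu sd/sg/sgp storage for
 *     every topology level (__sdt_alloc) and allocates a root domain;
 *  2. per cpu, the topology table is walked bottom-up and each level's
 *     domain is chained to its child by build_sched_domain();
 *  3. per cpu and per domain, the groups are built (overlapping groups for
 *     SD_OVERLAP levels, regular ones otherwise);
 *  4. per cpu, in reverse order, the staged objects actually in use are
 *     claimed via claim_allocations() and group power is initialized;
 *  5. each cpu's lowest domain is attached under rcu_read_lock().
 */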
7292 | 7629 | ||
7293 | static cpumask_var_t *doms_cur; /* current sched domains */ | 7630 | static cpumask_var_t *doms_cur; /* current sched domains */ |
@@ -7342,7 +7679,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) | |||
7342 | * For now this just excludes isolated cpus, but could be used to | 7679 | * For now this just excludes isolated cpus, but could be used to |
7343 | * exclude other special cases in the future. | 7680 | * exclude other special cases in the future. |
7344 | */ | 7681 | */ |
7345 | static int arch_init_sched_domains(const struct cpumask *cpu_map) | 7682 | static int init_sched_domains(const struct cpumask *cpu_map) |
7346 | { | 7683 | { |
7347 | int err; | 7684 | int err; |
7348 | 7685 | ||
@@ -7353,32 +7690,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map) | |||
7353 | doms_cur = &fallback_doms; | 7690 | doms_cur = &fallback_doms; |
7354 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); | 7691 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); |
7355 | dattr_cur = NULL; | 7692 | dattr_cur = NULL; |
7356 | err = build_sched_domains(doms_cur[0]); | 7693 | err = build_sched_domains(doms_cur[0], NULL); |
7357 | register_sched_domain_sysctl(); | 7694 | register_sched_domain_sysctl(); |
7358 | 7695 | ||
7359 | return err; | 7696 | return err; |
7360 | } | 7697 | } |
7361 | 7698 | ||
7362 | static void arch_destroy_sched_domains(const struct cpumask *cpu_map, | ||
7363 | struct cpumask *tmpmask) | ||
7364 | { | ||
7365 | free_sched_groups(cpu_map, tmpmask); | ||
7366 | } | ||
7367 | |||
7368 | /* | 7699 | /* |
7369 | * Detach sched domains from a group of cpus specified in cpu_map | 7700 | * Detach sched domains from a group of cpus specified in cpu_map |
7370 | * These cpus will now be attached to the NULL domain | 7701 | * These cpus will now be attached to the NULL domain |
7371 | */ | 7702 | */ |
7372 | static void detach_destroy_domains(const struct cpumask *cpu_map) | 7703 | static void detach_destroy_domains(const struct cpumask *cpu_map) |
7373 | { | 7704 | { |
7374 | /* Save because hotplug lock held. */ | ||
7375 | static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); | ||
7376 | int i; | 7705 | int i; |
7377 | 7706 | ||
7707 | rcu_read_lock(); | ||
7378 | for_each_cpu(i, cpu_map) | 7708 | for_each_cpu(i, cpu_map) |
7379 | cpu_attach_domain(NULL, &def_root_domain, i); | 7709 | cpu_attach_domain(NULL, &def_root_domain, i); |
7380 | synchronize_sched(); | 7710 | rcu_read_unlock(); |
7381 | arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); | ||
7382 | } | 7711 | } |
7383 | 7712 | ||
7384 | /* handle null as "default" */ | 7713 | /* handle null as "default" */ |
@@ -7467,8 +7796,7 @@ match1: | |||
7467 | goto match2; | 7796 | goto match2; |
7468 | } | 7797 | } |
7469 | /* no match - add a new doms_new */ | 7798 | /* no match - add a new doms_new */ |
7470 | __build_sched_domains(doms_new[i], | 7799 | build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); |
7471 | dattr_new ? dattr_new + i : NULL); | ||
7472 | match2: | 7800 | match2: |
7473 | ; | 7801 | ; |
7474 | } | 7802 | } |
@@ -7487,7 +7815,7 @@ match2: | |||
7487 | } | 7815 | } |
7488 | 7816 | ||
7489 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 7817 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
7490 | static void arch_reinit_sched_domains(void) | 7818 | static void reinit_sched_domains(void) |
7491 | { | 7819 | { |
7492 | get_online_cpus(); | 7820 | get_online_cpus(); |
7493 | 7821 | ||
@@ -7520,7 +7848,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | |||
7520 | else | 7848 | else |
7521 | sched_mc_power_savings = level; | 7849 | sched_mc_power_savings = level; |
7522 | 7850 | ||
7523 | arch_reinit_sched_domains(); | 7851 | reinit_sched_domains(); |
7524 | 7852 | ||
7525 | return count; | 7853 | return count; |
7526 | } | 7854 | } |
@@ -7639,14 +7967,9 @@ void __init sched_init_smp(void) | |||
7639 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); | 7967 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); |
7640 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); | 7968 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); |
7641 | 7969 | ||
7642 | #if defined(CONFIG_NUMA) | ||
7643 | sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), | ||
7644 | GFP_KERNEL); | ||
7645 | BUG_ON(sched_group_nodes_bycpu == NULL); | ||
7646 | #endif | ||
7647 | get_online_cpus(); | 7970 | get_online_cpus(); |
7648 | mutex_lock(&sched_domains_mutex); | 7971 | mutex_lock(&sched_domains_mutex); |
7649 | arch_init_sched_domains(cpu_active_mask); | 7972 | init_sched_domains(cpu_active_mask); |
7650 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); | 7973 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); |
7651 | if (cpumask_empty(non_isolated_cpus)) | 7974 | if (cpumask_empty(non_isolated_cpus)) |
7652 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); | 7975 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); |
@@ -7691,8 +8014,15 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | |||
7691 | INIT_LIST_HEAD(&cfs_rq->tasks); | 8014 | INIT_LIST_HEAD(&cfs_rq->tasks); |
7692 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8015 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7693 | cfs_rq->rq = rq; | 8016 | cfs_rq->rq = rq; |
8017 | /* allow initial update_cfs_load() to truncate */ | ||
8018 | #ifdef CONFIG_SMP | ||
8019 | cfs_rq->load_stamp = 1; | ||
8020 | #endif | ||
7694 | #endif | 8021 | #endif |
7695 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | 8022 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); |
8023 | #ifndef CONFIG_64BIT | ||
8024 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | ||
8025 | #endif | ||
7696 | } | 8026 | } |
7697 | 8027 | ||
7698 | static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | 8028 | static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) |
@@ -7733,18 +8063,16 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
7733 | 8063 | ||
7734 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8064 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7735 | static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | 8065 | static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, |
7736 | struct sched_entity *se, int cpu, int add, | 8066 | struct sched_entity *se, int cpu, |
7737 | struct sched_entity *parent) | 8067 | struct sched_entity *parent) |
7738 | { | 8068 | { |
7739 | struct rq *rq = cpu_rq(cpu); | 8069 | struct rq *rq = cpu_rq(cpu); |
7740 | tg->cfs_rq[cpu] = cfs_rq; | 8070 | tg->cfs_rq[cpu] = cfs_rq; |
7741 | init_cfs_rq(cfs_rq, rq); | 8071 | init_cfs_rq(cfs_rq, rq); |
7742 | cfs_rq->tg = tg; | 8072 | cfs_rq->tg = tg; |
7743 | if (add) | ||
7744 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
7745 | 8073 | ||
7746 | tg->se[cpu] = se; | 8074 | tg->se[cpu] = se; |
7747 | /* se could be NULL for init_task_group */ | 8075 | /* se could be NULL for root_task_group */ |
7748 | if (!se) | 8076 | if (!se) |
7749 | return; | 8077 | return; |
7750 | 8078 | ||
@@ -7754,15 +8082,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
7754 | se->cfs_rq = parent->my_q; | 8082 | se->cfs_rq = parent->my_q; |
7755 | 8083 | ||
7756 | se->my_q = cfs_rq; | 8084 | se->my_q = cfs_rq; |
7757 | se->load.weight = tg->shares; | 8085 | update_load_set(&se->load, 0); |
7758 | se->load.inv_weight = 0; | ||
7759 | se->parent = parent; | 8086 | se->parent = parent; |
7760 | } | 8087 | } |
7761 | #endif | 8088 | #endif |
7762 | 8089 | ||
7763 | #ifdef CONFIG_RT_GROUP_SCHED | 8090 | #ifdef CONFIG_RT_GROUP_SCHED |
7764 | static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | 8091 | static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, |
7765 | struct sched_rt_entity *rt_se, int cpu, int add, | 8092 | struct sched_rt_entity *rt_se, int cpu, |
7766 | struct sched_rt_entity *parent) | 8093 | struct sched_rt_entity *parent) |
7767 | { | 8094 | { |
7768 | struct rq *rq = cpu_rq(cpu); | 8095 | struct rq *rq = cpu_rq(cpu); |
@@ -7771,8 +8098,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | |||
7771 | init_rt_rq(rt_rq, rq); | 8098 | init_rt_rq(rt_rq, rq); |
7772 | rt_rq->tg = tg; | 8099 | rt_rq->tg = tg; |
7773 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | 8100 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; |
7774 | if (add) | ||
7775 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | ||
7776 | 8101 | ||
7777 | tg->rt_se[cpu] = rt_se; | 8102 | tg->rt_se[cpu] = rt_se; |
7778 | if (!rt_se) | 8103 | if (!rt_se) |
@@ -7807,18 +8132,18 @@ void __init sched_init(void) | |||
7807 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); | 8132 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); |
7808 | 8133 | ||
7809 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8134 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7810 | init_task_group.se = (struct sched_entity **)ptr; | 8135 | root_task_group.se = (struct sched_entity **)ptr; |
7811 | ptr += nr_cpu_ids * sizeof(void **); | 8136 | ptr += nr_cpu_ids * sizeof(void **); |
7812 | 8137 | ||
7813 | init_task_group.cfs_rq = (struct cfs_rq **)ptr; | 8138 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; |
7814 | ptr += nr_cpu_ids * sizeof(void **); | 8139 | ptr += nr_cpu_ids * sizeof(void **); |
7815 | 8140 | ||
7816 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 8141 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
7817 | #ifdef CONFIG_RT_GROUP_SCHED | 8142 | #ifdef CONFIG_RT_GROUP_SCHED |
7818 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; | 8143 | root_task_group.rt_se = (struct sched_rt_entity **)ptr; |
7819 | ptr += nr_cpu_ids * sizeof(void **); | 8144 | ptr += nr_cpu_ids * sizeof(void **); |
7820 | 8145 | ||
7821 | init_task_group.rt_rq = (struct rt_rq **)ptr; | 8146 | root_task_group.rt_rq = (struct rt_rq **)ptr; |
7822 | ptr += nr_cpu_ids * sizeof(void **); | 8147 | ptr += nr_cpu_ids * sizeof(void **); |
7823 | 8148 | ||
7824 | #endif /* CONFIG_RT_GROUP_SCHED */ | 8149 | #endif /* CONFIG_RT_GROUP_SCHED */ |
@@ -7838,20 +8163,16 @@ void __init sched_init(void) | |||
7838 | global_rt_period(), global_rt_runtime()); | 8163 | global_rt_period(), global_rt_runtime()); |
7839 | 8164 | ||
7840 | #ifdef CONFIG_RT_GROUP_SCHED | 8165 | #ifdef CONFIG_RT_GROUP_SCHED |
7841 | init_rt_bandwidth(&init_task_group.rt_bandwidth, | 8166 | init_rt_bandwidth(&root_task_group.rt_bandwidth, |
7842 | global_rt_period(), global_rt_runtime()); | 8167 | global_rt_period(), global_rt_runtime()); |
7843 | #endif /* CONFIG_RT_GROUP_SCHED */ | 8168 | #endif /* CONFIG_RT_GROUP_SCHED */ |
7844 | 8169 | ||
7845 | #ifdef CONFIG_CGROUP_SCHED | 8170 | #ifdef CONFIG_CGROUP_SCHED |
7846 | list_add(&init_task_group.list, &task_groups); | 8171 | list_add(&root_task_group.list, &task_groups); |
7847 | INIT_LIST_HEAD(&init_task_group.children); | 8172 | INIT_LIST_HEAD(&root_task_group.children); |
7848 | 8173 | autogroup_init(&init_task); | |
7849 | #endif /* CONFIG_CGROUP_SCHED */ | 8174 | #endif /* CONFIG_CGROUP_SCHED */ |
7850 | 8175 | ||
7851 | #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP | ||
7852 | update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), | ||
7853 | __alignof__(unsigned long)); | ||
7854 | #endif | ||
7855 | for_each_possible_cpu(i) { | 8176 | for_each_possible_cpu(i) { |
7856 | struct rq *rq; | 8177 | struct rq *rq; |
7857 | 8178 | ||
@@ -7863,38 +8184,34 @@ void __init sched_init(void) | |||
7863 | init_cfs_rq(&rq->cfs, rq); | 8184 | init_cfs_rq(&rq->cfs, rq); |
7864 | init_rt_rq(&rq->rt, rq); | 8185 | init_rt_rq(&rq->rt, rq); |
7865 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8186 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7866 | init_task_group.shares = init_task_group_load; | 8187 | root_task_group.shares = root_task_group_load; |
7867 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 8188 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
7868 | #ifdef CONFIG_CGROUP_SCHED | ||
7869 | /* | 8189 | /* |
7870 | * How much cpu bandwidth does init_task_group get? | 8190 | * How much cpu bandwidth does root_task_group get? |
7871 | * | 8191 | * |
7872 | * In case of task-groups formed through the cgroup filesystem, it | 8192 | * In case of task-groups formed through the cgroup filesystem, it |
7873 | * gets 100% of the cpu resources in the system. This overall | 8193 | * gets 100% of the cpu resources in the system. This overall |
7874 | * system cpu resource is divided among the tasks of | 8194 | * system cpu resource is divided among the tasks of |
7875 | * init_task_group and its child task-groups in a fair manner, | 8195 | * root_task_group and its child task-groups in a fair manner, |
7876 | * based on each entity's (task or task-group's) weight | 8196 | * based on each entity's (task or task-group's) weight |
7877 | * (se->load.weight). | 8197 | * (se->load.weight). |
7878 | * | 8198 | * |
7879 | * In other words, if init_task_group has 10 tasks (of weight | 8199 | * In other words, if root_task_group has 10 tasks (of weight |
7880 | * 1024) and two child groups A0 and A1 (of weight 1024 each), | 8200 | * 1024) and two child groups A0 and A1 (of weight 1024 each), |
7881 | * then A0's share of the cpu resource is: | 8201 | * then A0's share of the cpu resource is: |
7882 | * | 8202 | * |
7883 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% | 8203 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% |
7884 | * | 8204 | * |
7885 | * We achieve this by letting init_task_group's tasks sit | 8205 | * We achieve this by letting root_task_group's tasks sit |
7886 | * directly in rq->cfs (i.e. init_task_group->se[] = NULL). | 8206 | * directly in rq->cfs (i.e. root_task_group->se[] = NULL). |
7887 | */ | 8207 | */ |
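/*
 * Editorial check of the example above, not part of this patch: with ten
 * tasks of weight 1024 plus groups A0 and A1 of weight 1024 each, the total
 * root-level weight is 10*1024 + 1024 + 1024 = 12288, so A0's share is
 * 1024 / 12288 = 1/12, i.e. ~8.33%, as stated.
 */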
7888 | init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); | 8208 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); |
7889 | #endif | ||
7890 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 8209 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
7891 | 8210 | ||
7892 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; | 8211 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; |
7893 | #ifdef CONFIG_RT_GROUP_SCHED | 8212 | #ifdef CONFIG_RT_GROUP_SCHED |
7894 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | 8213 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); |
7895 | #ifdef CONFIG_CGROUP_SCHED | 8214 | init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); |
7896 | init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); | ||
7897 | #endif | ||
7898 | #endif | 8215 | #endif |
7899 | 8216 | ||
7900 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 8217 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
@@ -7905,7 +8222,7 @@ void __init sched_init(void) | |||
7905 | #ifdef CONFIG_SMP | 8222 | #ifdef CONFIG_SMP |
7906 | rq->sd = NULL; | 8223 | rq->sd = NULL; |
7907 | rq->rd = NULL; | 8224 | rq->rd = NULL; |
7908 | rq->cpu_power = SCHED_LOAD_SCALE; | 8225 | rq->cpu_power = SCHED_POWER_SCALE; |
7909 | rq->post_schedule = 0; | 8226 | rq->post_schedule = 0; |
7910 | rq->active_balance = 0; | 8227 | rq->active_balance = 0; |
7911 | rq->next_balance = jiffies; | 8228 | rq->next_balance = jiffies; |
@@ -7962,6 +8279,7 @@ void __init sched_init(void) | |||
7962 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ | 8279 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ |
7963 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); | 8280 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); |
7964 | #ifdef CONFIG_SMP | 8281 | #ifdef CONFIG_SMP |
8282 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); | ||
7965 | #ifdef CONFIG_NO_HZ | 8283 | #ifdef CONFIG_NO_HZ |
7966 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); | 8284 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); |
7967 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); | 8285 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); |
@@ -7974,8 +8292,6 @@ void __init sched_init(void) | |||
7974 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 8292 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
7975 | #endif /* SMP */ | 8293 | #endif /* SMP */ |
7976 | 8294 | ||
7977 | perf_event_init(); | ||
7978 | |||
7979 | scheduler_running = 1; | 8295 | scheduler_running = 1; |
7980 | } | 8296 | } |
7981 | 8297 | ||
@@ -7984,7 +8300,7 @@ static inline int preempt_count_equals(int preempt_offset) | |||
7984 | { | 8300 | { |
7985 | int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); | 8301 | int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); |
7986 | 8302 | ||
7987 | return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); | 8303 | return (nested == preempt_offset); |
7988 | } | 8304 | } |
7989 | 8305 | ||
7990 | void __might_sleep(const char *file, int line, int preempt_offset) | 8306 | void __might_sleep(const char *file, int line, int preempt_offset) |
@@ -8019,9 +8335,11 @@ EXPORT_SYMBOL(__might_sleep); | |||
8019 | #ifdef CONFIG_MAGIC_SYSRQ | 8335 | #ifdef CONFIG_MAGIC_SYSRQ |
8020 | static void normalize_task(struct rq *rq, struct task_struct *p) | 8336 | static void normalize_task(struct rq *rq, struct task_struct *p) |
8021 | { | 8337 | { |
8338 | const struct sched_class *prev_class = p->sched_class; | ||
8339 | int old_prio = p->prio; | ||
8022 | int on_rq; | 8340 | int on_rq; |
8023 | 8341 | ||
8024 | on_rq = p->se.on_rq; | 8342 | on_rq = p->on_rq; |
8025 | if (on_rq) | 8343 | if (on_rq) |
8026 | deactivate_task(rq, p, 0); | 8344 | deactivate_task(rq, p, 0); |
8027 | __setscheduler(rq, p, SCHED_NORMAL, 0); | 8345 | __setscheduler(rq, p, SCHED_NORMAL, 0); |
@@ -8029,6 +8347,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p) | |||
8029 | activate_task(rq, p, 0); | 8347 | activate_task(rq, p, 0); |
8030 | resched_task(rq->curr); | 8348 | resched_task(rq->curr); |
8031 | } | 8349 | } |
8350 | |||
8351 | check_class_changed(rq, p, prev_class, old_prio); | ||
8032 | } | 8352 | } |
8033 | 8353 | ||
8034 | void normalize_rt_tasks(void) | 8354 | void normalize_rt_tasks(void) |
@@ -8144,7 +8464,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8144 | { | 8464 | { |
8145 | struct cfs_rq *cfs_rq; | 8465 | struct cfs_rq *cfs_rq; |
8146 | struct sched_entity *se; | 8466 | struct sched_entity *se; |
8147 | struct rq *rq; | ||
8148 | int i; | 8467 | int i; |
8149 | 8468 | ||
8150 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); | 8469 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); |
@@ -8157,8 +8476,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8157 | tg->shares = NICE_0_LOAD; | 8476 | tg->shares = NICE_0_LOAD; |
8158 | 8477 | ||
8159 | for_each_possible_cpu(i) { | 8478 | for_each_possible_cpu(i) { |
8160 | rq = cpu_rq(i); | ||
8161 | |||
8162 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), | 8479 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), |
8163 | GFP_KERNEL, cpu_to_node(i)); | 8480 | GFP_KERNEL, cpu_to_node(i)); |
8164 | if (!cfs_rq) | 8481 | if (!cfs_rq) |
@@ -8169,26 +8486,32 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8169 | if (!se) | 8486 | if (!se) |
8170 | goto err_free_rq; | 8487 | goto err_free_rq; |
8171 | 8488 | ||
8172 | init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); | 8489 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); |
8173 | } | 8490 | } |
8174 | 8491 | ||
8175 | return 1; | 8492 | return 1; |
8176 | 8493 | ||
8177 | err_free_rq: | 8494 | err_free_rq: |
8178 | kfree(cfs_rq); | 8495 | kfree(cfs_rq); |
8179 | err: | 8496 | err: |
8180 | return 0; | 8497 | return 0; |
8181 | } | 8498 | } |
8182 | 8499 | ||
8183 | static inline void register_fair_sched_group(struct task_group *tg, int cpu) | ||
8184 | { | ||
8185 | list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, | ||
8186 | &cpu_rq(cpu)->leaf_cfs_rq_list); | ||
8187 | } | ||
8188 | |||
8189 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | 8500 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) |
8190 | { | 8501 | { |
8191 | list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); | 8502 | struct rq *rq = cpu_rq(cpu); |
8503 | unsigned long flags; | ||
8504 | |||
8505 | /* | ||
8506 | * Only empty task groups can be destroyed; so we can speculatively | ||
8507 | * check on_list without danger of it being re-added. | ||
8508 | */ | ||
8509 | if (!tg->cfs_rq[cpu]->on_list) | ||
8510 | return; | ||
8511 | |||
8512 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8513 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); | ||
8514 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8192 | } | 8515 | } |
8193 | #else /* !CONFIG_FAIR_GROUP_SCHED */ | 8516 | #else /* !CONFIG_FAIR_GROUP_SCHED */ |
8194 | static inline void free_fair_sched_group(struct task_group *tg) | 8517 | static inline void free_fair_sched_group(struct task_group *tg) |
@@ -8201,10 +8524,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8201 | return 1; | 8524 | return 1; |
8202 | } | 8525 | } |
8203 | 8526 | ||
8204 | static inline void register_fair_sched_group(struct task_group *tg, int cpu) | ||
8205 | { | ||
8206 | } | ||
8207 | |||
8208 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | 8527 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) |
8209 | { | 8528 | { |
8210 | } | 8529 | } |
@@ -8233,7 +8552,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8233 | { | 8552 | { |
8234 | struct rt_rq *rt_rq; | 8553 | struct rt_rq *rt_rq; |
8235 | struct sched_rt_entity *rt_se; | 8554 | struct sched_rt_entity *rt_se; |
8236 | struct rq *rq; | ||
8237 | int i; | 8555 | int i; |
8238 | 8556 | ||
8239 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); | 8557 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); |
@@ -8247,8 +8565,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8247 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); | 8565 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); |
8248 | 8566 | ||
8249 | for_each_possible_cpu(i) { | 8567 | for_each_possible_cpu(i) { |
8250 | rq = cpu_rq(i); | ||
8251 | |||
8252 | rt_rq = kzalloc_node(sizeof(struct rt_rq), | 8568 | rt_rq = kzalloc_node(sizeof(struct rt_rq), |
8253 | GFP_KERNEL, cpu_to_node(i)); | 8569 | GFP_KERNEL, cpu_to_node(i)); |
8254 | if (!rt_rq) | 8570 | if (!rt_rq) |
@@ -8259,27 +8575,16 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8259 | if (!rt_se) | 8575 | if (!rt_se) |
8260 | goto err_free_rq; | 8576 | goto err_free_rq; |
8261 | 8577 | ||
8262 | init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); | 8578 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); |
8263 | } | 8579 | } |
8264 | 8580 | ||
8265 | return 1; | 8581 | return 1; |
8266 | 8582 | ||
8267 | err_free_rq: | 8583 | err_free_rq: |
8268 | kfree(rt_rq); | 8584 | kfree(rt_rq); |
8269 | err: | 8585 | err: |
8270 | return 0; | 8586 | return 0; |
8271 | } | 8587 | } |
8272 | |||
8273 | static inline void register_rt_sched_group(struct task_group *tg, int cpu) | ||
8274 | { | ||
8275 | list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, | ||
8276 | &cpu_rq(cpu)->leaf_rt_rq_list); | ||
8277 | } | ||
8278 | |||
8279 | static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | ||
8280 | { | ||
8281 | list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); | ||
8282 | } | ||
8283 | #else /* !CONFIG_RT_GROUP_SCHED */ | 8588 | #else /* !CONFIG_RT_GROUP_SCHED */ |
8284 | static inline void free_rt_sched_group(struct task_group *tg) | 8589 | static inline void free_rt_sched_group(struct task_group *tg) |
8285 | { | 8590 | { |
@@ -8290,14 +8595,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8290 | { | 8595 | { |
8291 | return 1; | 8596 | return 1; |
8292 | } | 8597 | } |
8293 | |||
8294 | static inline void register_rt_sched_group(struct task_group *tg, int cpu) | ||
8295 | { | ||
8296 | } | ||
8297 | |||
8298 | static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | ||
8299 | { | ||
8300 | } | ||
8301 | #endif /* CONFIG_RT_GROUP_SCHED */ | 8598 | #endif /* CONFIG_RT_GROUP_SCHED */ |
8302 | 8599 | ||
8303 | #ifdef CONFIG_CGROUP_SCHED | 8600 | #ifdef CONFIG_CGROUP_SCHED |
@@ -8305,6 +8602,7 @@ static void free_sched_group(struct task_group *tg) | |||
8305 | { | 8602 | { |
8306 | free_fair_sched_group(tg); | 8603 | free_fair_sched_group(tg); |
8307 | free_rt_sched_group(tg); | 8604 | free_rt_sched_group(tg); |
8605 | autogroup_free(tg); | ||
8308 | kfree(tg); | 8606 | kfree(tg); |
8309 | } | 8607 | } |
8310 | 8608 | ||
@@ -8313,7 +8611,6 @@ struct task_group *sched_create_group(struct task_group *parent) | |||
8313 | { | 8611 | { |
8314 | struct task_group *tg; | 8612 | struct task_group *tg; |
8315 | unsigned long flags; | 8613 | unsigned long flags; |
8316 | int i; | ||
8317 | 8614 | ||
8318 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); | 8615 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); |
8319 | if (!tg) | 8616 | if (!tg) |
@@ -8326,10 +8623,6 @@ struct task_group *sched_create_group(struct task_group *parent) | |||
8326 | goto err; | 8623 | goto err; |
8327 | 8624 | ||
8328 | spin_lock_irqsave(&task_group_lock, flags); | 8625 | spin_lock_irqsave(&task_group_lock, flags); |
8329 | for_each_possible_cpu(i) { | ||
8330 | register_fair_sched_group(tg, i); | ||
8331 | register_rt_sched_group(tg, i); | ||
8332 | } | ||
8333 | list_add_rcu(&tg->list, &task_groups); | 8626 | list_add_rcu(&tg->list, &task_groups); |
8334 | 8627 | ||
8335 | WARN_ON(!parent); /* root should already exist */ | 8628 | WARN_ON(!parent); /* root should already exist */ |
@@ -8359,11 +8652,11 @@ void sched_destroy_group(struct task_group *tg) | |||
8359 | unsigned long flags; | 8652 | unsigned long flags; |
8360 | int i; | 8653 | int i; |
8361 | 8654 | ||
8362 | spin_lock_irqsave(&task_group_lock, flags); | 8655 | /* end participation in shares distribution */ |
8363 | for_each_possible_cpu(i) { | 8656 | for_each_possible_cpu(i) |
8364 | unregister_fair_sched_group(tg, i); | 8657 | unregister_fair_sched_group(tg, i); |
8365 | unregister_rt_sched_group(tg, i); | 8658 | |
8366 | } | 8659 | spin_lock_irqsave(&task_group_lock, flags); |
8367 | list_del_rcu(&tg->list); | 8660 | list_del_rcu(&tg->list); |
8368 | list_del_rcu(&tg->siblings); | 8661 | list_del_rcu(&tg->siblings); |
8369 | spin_unlock_irqrestore(&task_group_lock, flags); | 8662 | spin_unlock_irqrestore(&task_group_lock, flags); |
@@ -8386,57 +8679,30 @@ void sched_move_task(struct task_struct *tsk) | |||
8386 | rq = task_rq_lock(tsk, &flags); | 8679 | rq = task_rq_lock(tsk, &flags); |
8387 | 8680 | ||
8388 | running = task_current(rq, tsk); | 8681 | running = task_current(rq, tsk); |
8389 | on_rq = tsk->se.on_rq; | 8682 | on_rq = tsk->on_rq; |
8390 | 8683 | ||
8391 | if (on_rq) | 8684 | if (on_rq) |
8392 | dequeue_task(rq, tsk, 0); | 8685 | dequeue_task(rq, tsk, 0); |
8393 | if (unlikely(running)) | 8686 | if (unlikely(running)) |
8394 | tsk->sched_class->put_prev_task(rq, tsk); | 8687 | tsk->sched_class->put_prev_task(rq, tsk); |
8395 | 8688 | ||
8396 | set_task_rq(tsk, task_cpu(tsk)); | ||
8397 | |||
8398 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8689 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8399 | if (tsk->sched_class->moved_group) | 8690 | if (tsk->sched_class->task_move_group) |
8400 | tsk->sched_class->moved_group(tsk, on_rq); | 8691 | tsk->sched_class->task_move_group(tsk, on_rq); |
8692 | else | ||
8401 | #endif | 8693 | #endif |
8694 | set_task_rq(tsk, task_cpu(tsk)); | ||
8402 | 8695 | ||
8403 | if (unlikely(running)) | 8696 | if (unlikely(running)) |
8404 | tsk->sched_class->set_curr_task(rq); | 8697 | tsk->sched_class->set_curr_task(rq); |
8405 | if (on_rq) | 8698 | if (on_rq) |
8406 | enqueue_task(rq, tsk, 0); | 8699 | enqueue_task(rq, tsk, 0); |
8407 | 8700 | ||
8408 | task_rq_unlock(rq, &flags); | 8701 | task_rq_unlock(rq, tsk, &flags); |
8409 | } | 8702 | } |
8410 | #endif /* CONFIG_CGROUP_SCHED */ | 8703 | #endif /* CONFIG_CGROUP_SCHED */ |
8411 | 8704 | ||
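The rewritten sched_move_task() above follows the usual change-while-dequeued pattern: take the task off its runqueue (and off the CPU if it is current), repoint it at the new group's per-cpu runqueue, then put it back. A rough stand-alone mock of that ordering, with every type and callback reduced to a stub for illustration (none of these are the kernel API):

/* Mock of the dequeue -> regroup -> enqueue ordering used by sched_move_task(). */
#include <stdbool.h>
#include <stdio.h>

struct task { bool on_rq; bool running; int group; };

static void dequeue(struct task *t)  { t->on_rq = false; printf("dequeue\n"); }
static void enqueue(struct task *t)  { t->on_rq = true;  printf("enqueue\n"); }
static void put_prev(struct task *t) { (void)t; printf("put_prev_task\n"); }
static void set_curr(struct task *t) { (void)t; printf("set_curr_task\n"); }

static void move_task(struct task *t, int new_group)
{
	bool on_rq = t->on_rq, running = t->running;

	if (on_rq)
		dequeue(t);
	if (running)
		put_prev(t);		/* task was the CPU's current task */

	t->group = new_group;		/* kernel: task_move_group()/set_task_rq() */

	if (running)
		set_curr(t);
	if (on_rq)
		enqueue(t);
}

int main(void)
{
	struct task t = { .on_rq = true, .running = true, .group = 0 };
	move_task(&t, 1);
	return 0;
}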
8412 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8705 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8413 | static void __set_se_shares(struct sched_entity *se, unsigned long shares) | ||
8414 | { | ||
8415 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
8416 | int on_rq; | ||
8417 | |||
8418 | on_rq = se->on_rq; | ||
8419 | if (on_rq) | ||
8420 | dequeue_entity(cfs_rq, se, 0); | ||
8421 | |||
8422 | se->load.weight = shares; | ||
8423 | se->load.inv_weight = 0; | ||
8424 | |||
8425 | if (on_rq) | ||
8426 | enqueue_entity(cfs_rq, se, 0); | ||
8427 | } | ||
8428 | |||
8429 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | ||
8430 | { | ||
8431 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
8432 | struct rq *rq = cfs_rq->rq; | ||
8433 | unsigned long flags; | ||
8434 | |||
8435 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8436 | __set_se_shares(se, shares); | ||
8437 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8438 | } | ||
8439 | |||
8440 | static DEFINE_MUTEX(shares_mutex); | 8706 | static DEFINE_MUTEX(shares_mutex); |
8441 | 8707 | ||
8442 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | 8708 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) |
@@ -8450,46 +8716,25 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
8450 | if (!tg->se[0]) | 8716 | if (!tg->se[0]) |
8451 | return -EINVAL; | 8717 | return -EINVAL; |
8452 | 8718 | ||
8453 | if (shares < MIN_SHARES) | 8719 | shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); |
8454 | shares = MIN_SHARES; | ||
8455 | else if (shares > MAX_SHARES) | ||
8456 | shares = MAX_SHARES; | ||
8457 | 8720 | ||
8458 | mutex_lock(&shares_mutex); | 8721 | mutex_lock(&shares_mutex); |
8459 | if (tg->shares == shares) | 8722 | if (tg->shares == shares) |
8460 | goto done; | 8723 | goto done; |
8461 | 8724 | ||
8462 | spin_lock_irqsave(&task_group_lock, flags); | ||
8463 | for_each_possible_cpu(i) | ||
8464 | unregister_fair_sched_group(tg, i); | ||
8465 | list_del_rcu(&tg->siblings); | ||
8466 | spin_unlock_irqrestore(&task_group_lock, flags); | ||
8467 | |||
8468 | /* wait for any ongoing reference to this group to finish */ | ||
8469 | synchronize_sched(); | ||
8470 | |||
8471 | /* | ||
8472 | * Now we are free to modify the group's share on each cpu | ||
8473 | * w/o tripping rebalance_share or load_balance_fair. | ||
8474 | */ | ||
8475 | tg->shares = shares; | 8725 | tg->shares = shares; |
8476 | for_each_possible_cpu(i) { | 8726 | for_each_possible_cpu(i) { |
8477 | /* | 8727 | struct rq *rq = cpu_rq(i); |
8478 | * force a rebalance | 8728 | struct sched_entity *se; |
8479 | */ | 8729 | |
8480 | cfs_rq_set_shares(tg->cfs_rq[i], 0); | 8730 | se = tg->se[i]; |
8481 | set_se_shares(tg->se[i], shares); | 8731 | /* Propagate contribution to hierarchy */ |
8732 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8733 | for_each_sched_entity(se) | ||
8734 | update_cfs_shares(group_cfs_rq(se)); | ||
8735 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8482 | } | 8736 | } |
8483 | 8737 | ||
8484 | /* | ||
8485 | * Enable load balance activity on this group, by inserting it back on | ||
8486 | * each cpu's rq->leaf_cfs_rq_list. | ||
8487 | */ | ||
8488 | spin_lock_irqsave(&task_group_lock, flags); | ||
8489 | for_each_possible_cpu(i) | ||
8490 | register_fair_sched_group(tg, i); | ||
8491 | list_add_rcu(&tg->siblings, &tg->parent->children); | ||
8492 | spin_unlock_irqrestore(&task_group_lock, flags); | ||
8493 | done: | 8738 | done: |
8494 | mutex_unlock(&shares_mutex); | 8739 | mutex_unlock(&shares_mutex); |
8495 | return 0; | 8740 | return 0; |
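The new sched_group_set_shares() clamps the requested weight once with clamp() instead of the old open-coded min/max pair, and on configurations with increased load resolution scale_load() shifts the user-visible value up into the kernel's internal fixed-point range before it is stored. A self-contained illustration of that clamping; the resolution shift and the share limits below are assumptions about the configuration, not values quoted from this file:

/* Illustration of clamping user-supplied shares into the scaled
 * [MIN_SHARES, MAX_SHARES] range; all constants here are assumed. */
#include <stdio.h>

#define SHARES_RESOLUTION	10			/* assumed extra resolution bits */
#define scale_load(w)		((w) << SHARES_RESOLUTION)
#define scale_load_down(w)	((w) >> SHARES_RESOLUTION)
#define MIN_SHARES		2UL
#define MAX_SHARES		(1UL << 18)

#define clamp(val, lo, hi)	((val) < (lo) ? (lo) : (val) > (hi) ? (hi) : (val))

int main(void)
{
	unsigned long requested = 1;			/* e.g. echo 1 > cpu.shares */
	unsigned long shares = clamp(scale_load(requested),
				     scale_load(MIN_SHARES),
				     scale_load(MAX_SHARES));

	printf("stored: %lu, reported back: %lu\n",
	       shares, scale_load_down(shares));
	return 0;
}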
@@ -8624,7 +8869,7 @@ static int tg_set_bandwidth(struct task_group *tg, | |||
8624 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | 8869 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
8625 | } | 8870 | } |
8626 | raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); | 8871 | raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); |
8627 | unlock: | 8872 | unlock: |
8628 | read_unlock(&tasklist_lock); | 8873 | read_unlock(&tasklist_lock); |
8629 | mutex_unlock(&rt_constraints_mutex); | 8874 | mutex_unlock(&rt_constraints_mutex); |
8630 | 8875 | ||
@@ -8788,7 +9033,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
8788 | 9033 | ||
8789 | if (!cgrp->parent) { | 9034 | if (!cgrp->parent) { |
8790 | /* This is early initialization for the top cgroup */ | 9035 | /* This is early initialization for the top cgroup */ |
8791 | return &init_task_group.css; | 9036 | return &root_task_group.css; |
8792 | } | 9037 | } |
8793 | 9038 | ||
8794 | parent = cgroup_tg(cgrp->parent); | 9039 | parent = cgroup_tg(cgrp->parent); |
@@ -8821,56 +9066,39 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
8821 | return 0; | 9066 | return 0; |
8822 | } | 9067 | } |
8823 | 9068 | ||
8824 | static int | 9069 | static void |
8825 | cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | 9070 | cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
8826 | struct task_struct *tsk, bool threadgroup) | ||
8827 | { | 9071 | { |
8828 | int retval = cpu_cgroup_can_attach_task(cgrp, tsk); | 9072 | sched_move_task(tsk); |
8829 | if (retval) | ||
8830 | return retval; | ||
8831 | if (threadgroup) { | ||
8832 | struct task_struct *c; | ||
8833 | rcu_read_lock(); | ||
8834 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | ||
8835 | retval = cpu_cgroup_can_attach_task(cgrp, c); | ||
8836 | if (retval) { | ||
8837 | rcu_read_unlock(); | ||
8838 | return retval; | ||
8839 | } | ||
8840 | } | ||
8841 | rcu_read_unlock(); | ||
8842 | } | ||
8843 | return 0; | ||
8844 | } | 9073 | } |
8845 | 9074 | ||
8846 | static void | 9075 | static void |
8847 | cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | 9076 | cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, |
8848 | struct cgroup *old_cont, struct task_struct *tsk, | 9077 | struct cgroup *old_cgrp, struct task_struct *task) |
8849 | bool threadgroup) | ||
8850 | { | 9078 | { |
8851 | sched_move_task(tsk); | 9079 | /* |
8852 | if (threadgroup) { | 9080 | * cgroup_exit() is called in the copy_process() failure path. |
8853 | struct task_struct *c; | 9081 | * Ignore this case since the task hasn't run yet; this avoids |
8854 | rcu_read_lock(); | 9082 | * trying to poke a half freed task state from generic code. |
8855 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | 9083 | */ |
8856 | sched_move_task(c); | 9084 | if (!(task->flags & PF_EXITING)) |
8857 | } | 9085 | return; |
8858 | rcu_read_unlock(); | 9086 | |
8859 | } | 9087 | sched_move_task(task); |
8860 | } | 9088 | } |
8861 | 9089 | ||
8862 | #ifdef CONFIG_FAIR_GROUP_SCHED | 9090 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8863 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, | 9091 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, |
8864 | u64 shareval) | 9092 | u64 shareval) |
8865 | { | 9093 | { |
8866 | return sched_group_set_shares(cgroup_tg(cgrp), shareval); | 9094 | return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); |
8867 | } | 9095 | } |
8868 | 9096 | ||
8869 | static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) | 9097 | static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) |
8870 | { | 9098 | { |
8871 | struct task_group *tg = cgroup_tg(cgrp); | 9099 | struct task_group *tg = cgroup_tg(cgrp); |
8872 | 9100 | ||
8873 | return (u64) tg->shares; | 9101 | return (u64) scale_load_down(tg->shares); |
8874 | } | 9102 | } |
8875 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 9103 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
8876 | 9104 | ||
@@ -8929,8 +9157,9 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
8929 | .name = "cpu", | 9157 | .name = "cpu", |
8930 | .create = cpu_cgroup_create, | 9158 | .create = cpu_cgroup_create, |
8931 | .destroy = cpu_cgroup_destroy, | 9159 | .destroy = cpu_cgroup_destroy, |
8932 | .can_attach = cpu_cgroup_can_attach, | 9160 | .can_attach_task = cpu_cgroup_can_attach_task, |
8933 | .attach = cpu_cgroup_attach, | 9161 | .attach_task = cpu_cgroup_attach_task, |
9162 | .exit = cpu_cgroup_exit, | ||
8934 | .populate = cpu_cgroup_populate, | 9163 | .populate = cpu_cgroup_populate, |
8935 | .subsys_id = cpu_cgroup_subsys_id, | 9164 | .subsys_id = cpu_cgroup_subsys_id, |
8936 | .early_init = 1, | 9165 | .early_init = 1, |
@@ -9215,72 +9444,3 @@ struct cgroup_subsys cpuacct_subsys = { | |||
9215 | }; | 9444 | }; |
9216 | #endif /* CONFIG_CGROUP_CPUACCT */ | 9445 | #endif /* CONFIG_CGROUP_CPUACCT */ |
9217 | 9446 | ||
9218 | #ifndef CONFIG_SMP | ||
9219 | |||
9220 | void synchronize_sched_expedited(void) | ||
9221 | { | ||
9222 | barrier(); | ||
9223 | } | ||
9224 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
9225 | |||
9226 | #else /* #ifndef CONFIG_SMP */ | ||
9227 | |||
9228 | static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0); | ||
9229 | |||
9230 | static int synchronize_sched_expedited_cpu_stop(void *data) | ||
9231 | { | ||
9232 | /* | ||
9233 | * There must be a full memory barrier on each affected CPU | ||
9234 | * between the time that try_stop_cpus() is called and the | ||
9235 | * time that it returns. | ||
9236 | * | ||
9237 | * In the current initial implementation of cpu_stop, the | ||
9238 | * above condition is already met when the control reaches | ||
9239 | * this point and the following smp_mb() is not strictly | ||
9240 | * necessary. Do smp_mb() anyway for documentation and | ||
9241 | * robustness against future implementation changes. | ||
9242 | */ | ||
9243 | smp_mb(); /* See above comment block. */ | ||
9244 | return 0; | ||
9245 | } | ||
9246 | |||
9247 | /* | ||
9248 | * Wait for an rcu-sched grace period to elapse, but use "big hammer" | ||
9249 | * approach to force grace period to end quickly. This consumes | ||
9250 | * significant time on all CPUs, and is thus not recommended for | ||
9251 | * any sort of common-case code. | ||
9252 | * | ||
9253 | * Note that it is illegal to call this function while holding any | ||
9254 | * lock that is acquired by a CPU-hotplug notifier. Failing to | ||
9255 | * observe this restriction will result in deadlock. | ||
9256 | */ | ||
9257 | void synchronize_sched_expedited(void) | ||
9258 | { | ||
9259 | int snap, trycount = 0; | ||
9260 | |||
9261 | smp_mb(); /* ensure prior mod happens before capturing snap. */ | ||
9262 | snap = atomic_read(&synchronize_sched_expedited_count) + 1; | ||
9263 | get_online_cpus(); | ||
9264 | while (try_stop_cpus(cpu_online_mask, | ||
9265 | synchronize_sched_expedited_cpu_stop, | ||
9266 | NULL) == -EAGAIN) { | ||
9267 | put_online_cpus(); | ||
9268 | if (trycount++ < 10) | ||
9269 | udelay(trycount * num_online_cpus()); | ||
9270 | else { | ||
9271 | synchronize_sched(); | ||
9272 | return; | ||
9273 | } | ||
9274 | if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) { | ||
9275 | smp_mb(); /* ensure test happens before caller kfree */ | ||
9276 | return; | ||
9277 | } | ||
9278 | get_online_cpus(); | ||
9279 | } | ||
9280 | atomic_inc(&synchronize_sched_expedited_count); | ||
9281 | smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */ | ||
9282 | put_online_cpus(); | ||
9283 | } | ||
9284 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
9285 | |||
9286 | #endif /* #else #ifndef CONFIG_SMP */ | ||
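The synchronize_sched_expedited() code removed at the end of the file is essentially a counter snapshot plus retry loop: snapshot a completion counter, try to stop all CPUs, and if the stop races with another caller either observe that someone else already completed a grace period or fall back to the normal slow path. A userspace-flavoured sketch of that pattern using C11 atomics; stop_all_cpus() and full_grace_period() are hypothetical stand-ins, not kernel calls, and the hotplug locking and udelay back-off are omitted:

/* Sketch of the snapshot-and-retry pattern behind synchronize_sched_expedited(). */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int expedited_count;

static bool stop_all_cpus(void)     { return true; }	/* stand-in for try_stop_cpus() */
static void full_grace_period(void) { }			/* stand-in for synchronize_sched() */

static void synchronize_expedited(void)
{
	int snap = atomic_load(&expedited_count) + 1;
	int trycount = 0;

	while (!stop_all_cpus()) {
		if (trycount++ >= 10) {
			full_grace_period();	/* give up, use the slow path */
			return;
		}
		if (atomic_load(&expedited_count) - snap > 0)
			return;			/* another caller finished a GP for us */
	}
	atomic_fetch_add(&expedited_count, 1);	/* advertise the completed GP */
}

int main(void)
{
	synchronize_expedited();
	printf("expedited count: %d\n", atomic_load(&expedited_count));
	return 0;
}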