author	Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
committer	Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
commit	c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree	ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /kernel/sched.c
parent	ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent	6a00f206debf8a5c8899055726ad127dbeeed098 (diff)
Merge branch 'mpi-master' into wip-k-fmlp

Conflicts:
	litmus/sched_cedf.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	3437
1 file changed, 1801 insertions(+), 1636 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index c5d775079027..baaca61bc3a3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -32,7 +32,6 @@
32#include <linux/init.h> 32#include <linux/init.h>
33#include <linux/uaccess.h> 33#include <linux/uaccess.h>
34#include <linux/highmem.h> 34#include <linux/highmem.h>
35#include <linux/smp_lock.h>
36#include <asm/mmu_context.h> 35#include <asm/mmu_context.h>
37#include <linux/interrupt.h> 36#include <linux/interrupt.h>
38#include <linux/capability.h> 37#include <linux/capability.h>
@@ -75,9 +74,11 @@
75 74
76#include <asm/tlb.h> 75#include <asm/tlb.h>
77#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
77#include <asm/mutex.h>
78 78
79#include "sched_cpupri.h" 79#include "sched_cpupri.h"
80#include "workqueue_sched.h" 80#include "workqueue_sched.h"
81#include "sched_autogroup.h"
81 82
82#include <litmus/sched_trace.h> 83#include <litmus/sched_trace.h>
83#include <litmus/trace.h> 84#include <litmus/trace.h>
@@ -235,7 +236,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
235#endif 236#endif
236 237
237/* 238/*
238 * sched_domains_mutex serializes calls to arch_init_sched_domains, 239 * sched_domains_mutex serializes calls to init_sched_domains,
239 * detach_destroy_domains and partition_sched_domains. 240 * detach_destroy_domains and partition_sched_domains.
240 */ 241 */
241static DEFINE_MUTEX(sched_domains_mutex); 242static DEFINE_MUTEX(sched_domains_mutex);
@@ -258,6 +259,8 @@ struct task_group {
258 /* runqueue "owned" by this group on each cpu */ 259 /* runqueue "owned" by this group on each cpu */
259 struct cfs_rq **cfs_rq; 260 struct cfs_rq **cfs_rq;
260 unsigned long shares; 261 unsigned long shares;
262
263 atomic_t load_weight;
261#endif 264#endif
262 265
263#ifdef CONFIG_RT_GROUP_SCHED 266#ifdef CONFIG_RT_GROUP_SCHED
@@ -273,25 +276,18 @@ struct task_group {
273 struct task_group *parent; 276 struct task_group *parent;
274 struct list_head siblings; 277 struct list_head siblings;
275 struct list_head children; 278 struct list_head children;
276};
277 279
278#define root_task_group init_task_group 280#ifdef CONFIG_SCHED_AUTOGROUP
281 struct autogroup *autogroup;
282#endif
283};
279 284
280/* task_group_lock serializes add/remove of task groups and also changes to 285/* task_group_lock serializes the addition/removal of task groups */
281 * a task group's cpu shares.
282 */
283static DEFINE_SPINLOCK(task_group_lock); 286static DEFINE_SPINLOCK(task_group_lock);
284 287
285#ifdef CONFIG_FAIR_GROUP_SCHED 288#ifdef CONFIG_FAIR_GROUP_SCHED
286 289
287#ifdef CONFIG_SMP 290# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
288static int root_task_group_empty(void)
289{
290 return list_empty(&root_task_group.children);
291}
292#endif
293
294# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
295 291
296/* 292/*
297 * A weight of 0 or 1 can cause arithmetics problems. 293 * A weight of 0 or 1 can cause arithmetics problems.
@@ -301,16 +297,16 @@ static int root_task_group_empty(void)
301 * (The default weight is 1024 - so there's no practical 297 * (The default weight is 1024 - so there's no practical
302 * limitation from this.) 298 * limitation from this.)
303 */ 299 */
304#define MIN_SHARES 2 300#define MIN_SHARES (1UL << 1)
305#define MAX_SHARES (1UL << 18) 301#define MAX_SHARES (1UL << 18)
306 302
307static int init_task_group_load = INIT_TASK_GROUP_LOAD; 303static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
308#endif 304#endif
309 305
310/* Default task group. 306/* Default task group.
311 * Every task in system belong to this group at bootup. 307 * Every task in system belong to this group at bootup.
312 */ 308 */
313struct task_group init_task_group; 309struct task_group root_task_group;
314 310
315#endif /* CONFIG_CGROUP_SCHED */ 311#endif /* CONFIG_CGROUP_SCHED */
316 312
@@ -321,6 +317,9 @@ struct cfs_rq {
321 317
322 u64 exec_clock; 318 u64 exec_clock;
323 u64 min_vruntime; 319 u64 min_vruntime;
320#ifndef CONFIG_64BIT
321 u64 min_vruntime_copy;
322#endif
324 323
325 struct rb_root tasks_timeline; 324 struct rb_root tasks_timeline;
326 struct rb_node *rb_leftmost; 325 struct rb_node *rb_leftmost;
@@ -332,9 +331,11 @@ struct cfs_rq {
332 * 'curr' points to currently running entity on this cfs_rq. 331 * 'curr' points to currently running entity on this cfs_rq.
333 * It is set to NULL otherwise (i.e when none are currently running). 332 * It is set to NULL otherwise (i.e when none are currently running).
334 */ 333 */
335 struct sched_entity *curr, *next, *last; 334 struct sched_entity *curr, *next, *last, *skip;
336 335
336#ifdef CONFIG_SCHED_DEBUG
337 unsigned int nr_spread_over; 337 unsigned int nr_spread_over;
338#endif
338 339
339#ifdef CONFIG_FAIR_GROUP_SCHED 340#ifdef CONFIG_FAIR_GROUP_SCHED
340 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 341 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -347,6 +348,7 @@ struct cfs_rq {
347 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 348 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
348 * list is used during load balance. 349 * list is used during load balance.
349 */ 350 */
351 int on_list;
350 struct list_head leaf_cfs_rq_list; 352 struct list_head leaf_cfs_rq_list;
351 struct task_group *tg; /* group that "owns" this runqueue */ 353 struct task_group *tg; /* group that "owns" this runqueue */
352 354
@@ -365,14 +367,17 @@ struct cfs_rq {
365 unsigned long h_load; 367 unsigned long h_load;
366 368
367 /* 369 /*
368 * this cpu's part of tg->shares 370 * Maintaining per-cpu shares distribution for group scheduling
371 *
372 * load_stamp is the last time we updated the load average
373 * load_last is the last time we updated the load average and saw load
374 * load_unacc_exec_time is currently unaccounted execution time
369 */ 375 */
370 unsigned long shares; 376 u64 load_avg;
377 u64 load_period;
378 u64 load_stamp, load_last, load_unacc_exec_time;
371 379
372 /* 380 unsigned long load_contribution;
373 * load.weight at the time we set shares
374 */
375 unsigned long rq_weight;
376#endif 381#endif
377#endif 382#endif
378}; 383};
@@ -428,6 +433,7 @@ struct litmus_rq {
428 */ 433 */
429struct root_domain { 434struct root_domain {
430 atomic_t refcount; 435 atomic_t refcount;
436 struct rcu_head rcu;
431 cpumask_var_t span; 437 cpumask_var_t span;
432 cpumask_var_t online; 438 cpumask_var_t online;
433 439
@@ -437,9 +443,7 @@ struct root_domain {
437 */ 443 */
438 cpumask_var_t rto_mask; 444 cpumask_var_t rto_mask;
439 atomic_t rto_count; 445 atomic_t rto_count;
440#ifdef CONFIG_SMP
441 struct cpupri cpupri; 446 struct cpupri cpupri;
442#endif
443}; 447};
444 448
445/* 449/*
@@ -448,7 +452,7 @@ struct root_domain {
448 */ 452 */
449static struct root_domain def_root_domain; 453static struct root_domain def_root_domain;
450 454
451#endif 455#endif /* CONFIG_SMP */
452 456
453/* 457/*
454 * This is the main, per-CPU runqueue data structure. 458 * This is the main, per-CPU runqueue data structure.
@@ -473,7 +477,7 @@ struct rq {
473 u64 nohz_stamp; 477 u64 nohz_stamp;
474 unsigned char nohz_balance_kick; 478 unsigned char nohz_balance_kick;
475#endif 479#endif
476 unsigned int skip_clock_update; 480 int skip_clock_update;
477 481
478 /* capture load from *all* tasks on this cpu: */ 482 /* capture load from *all* tasks on this cpu: */
479 struct load_weight load; 483 struct load_weight load;
@@ -500,11 +504,12 @@ struct rq {
500 */ 504 */
501 unsigned long nr_uninterruptible; 505 unsigned long nr_uninterruptible;
502 506
503 struct task_struct *curr, *idle; 507 struct task_struct *curr, *idle, *stop;
504 unsigned long next_balance; 508 unsigned long next_balance;
505 struct mm_struct *prev_mm; 509 struct mm_struct *prev_mm;
506 510
507 u64 clock; 511 u64 clock;
512 u64 clock_task;
508 513
509 atomic_t nr_iowait; 514 atomic_t nr_iowait;
510 515
@@ -532,6 +537,10 @@ struct rq {
532 u64 avg_idle; 537 u64 avg_idle;
533#endif 538#endif
534 539
540#ifdef CONFIG_IRQ_TIME_ACCOUNTING
541 u64 prev_irq_time;
542#endif
543
535 /* calc_load related fields */ 544 /* calc_load related fields */
536 unsigned long calc_load_update; 545 unsigned long calc_load_update;
537 long calc_load_active; 546 long calc_load_active;
@@ -561,32 +570,17 @@ struct rq {
561 /* try_to_wake_up() stats */ 570 /* try_to_wake_up() stats */
562 unsigned int ttwu_count; 571 unsigned int ttwu_count;
563 unsigned int ttwu_local; 572 unsigned int ttwu_local;
573#endif
564 574
565 /* BKL stats */ 575#ifdef CONFIG_SMP
566 unsigned int bkl_count; 576 struct task_struct *wake_list;
567#endif 577#endif
568}; 578};
569 579
570static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 580static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
571 581
572static inline
573void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
574{
575 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
576 582
577 /* 583static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
578 * A queue event has occurred, and we're going to schedule. In
579 * this case, we can save a useless back to back clock update.
580 */
581 /* LITMUS^RT: turning off the clock update is buggy in Linux 2.6.36;
582 * the scheduler can "forget" to renable the runqueue clock in some
583 * cases. LITMUS^RT amplifies the effects of this problem. Hence, we
584 * turn it off to avoid stalling clocks. */
585 /*
586 if (test_tsk_need_resched(p))
587 rq->skip_clock_update = 1;
588 */
589}
590 584
591static inline int cpu_of(struct rq *rq) 585static inline int cpu_of(struct rq *rq)
592{ 586{
@@ -599,7 +593,7 @@ static inline int cpu_of(struct rq *rq)
599 593
600#define rcu_dereference_check_sched_domain(p) \ 594#define rcu_dereference_check_sched_domain(p) \
601 rcu_dereference_check((p), \ 595 rcu_dereference_check((p), \
602 rcu_read_lock_sched_held() || \ 596 rcu_read_lock_held() || \
603 lockdep_is_held(&sched_domains_mutex)) 597 lockdep_is_held(&sched_domains_mutex))
604 598
605/* 599/*
@@ -623,18 +617,22 @@ static inline int cpu_of(struct rq *rq)
623/* 617/*
624 * Return the group to which this tasks belongs. 618 * Return the group to which this tasks belongs.
625 * 619 *
626 * We use task_subsys_state_check() and extend the RCU verification 620 * We use task_subsys_state_check() and extend the RCU verification with
627 * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() 621 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
628 * holds that lock for each task it moves into the cgroup. Therefore 622 * task it moves into the cgroup. Therefore by holding either of those locks,
629 * by holding that lock, we pin the task to the current cgroup. 623 * we pin the task to the current cgroup.
630 */ 624 */
631static inline struct task_group *task_group(struct task_struct *p) 625static inline struct task_group *task_group(struct task_struct *p)
632{ 626{
627 struct task_group *tg;
633 struct cgroup_subsys_state *css; 628 struct cgroup_subsys_state *css;
634 629
635 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 630 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
631 lockdep_is_held(&p->pi_lock) ||
636 lockdep_is_held(&task_rq(p)->lock)); 632 lockdep_is_held(&task_rq(p)->lock));
637 return container_of(css, struct task_group, css); 633 tg = container_of(css, struct task_group, css);
634
635 return autogroup_task_group(p, tg);
638} 636}
639 637
640/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 638/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -661,10 +659,18 @@ static inline struct task_group *task_group(struct task_struct *p)
661 659
662#endif /* CONFIG_CGROUP_SCHED */ 660#endif /* CONFIG_CGROUP_SCHED */
663 661
664inline void update_rq_clock(struct rq *rq) 662static void update_rq_clock_task(struct rq *rq, s64 delta);
663
664static void update_rq_clock(struct rq *rq)
665{ 665{
666 if (!rq->skip_clock_update) 666 s64 delta;
667 rq->clock = sched_clock_cpu(cpu_of(rq)); 667
668 if (rq->skip_clock_update > 0)
669 return;
670
671 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
672 rq->clock += delta;
673 update_rq_clock_task(rq, delta);
668} 674}
669 675
670/* 676/*
@@ -677,10 +683,9 @@ inline void update_rq_clock(struct rq *rq)
677#endif 683#endif
678 684
679/** 685/**
680 * runqueue_is_locked 686 * runqueue_is_locked - Returns true if the current cpu runqueue is locked
681 * @cpu: the processor in question. 687 * @cpu: the processor in question.
682 * 688 *
683 * Returns true if the current cpu runqueue is locked.
684 * This interface allows printk to be called with the runqueue lock 689 * This interface allows printk to be called with the runqueue lock
685 * held and know whether or not it is OK to wake up the klogd. 690 * held and know whether or not it is OK to wake up the klogd.
686 */ 691 */
@@ -741,7 +746,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
741 size_t cnt, loff_t *ppos) 746 size_t cnt, loff_t *ppos)
742{ 747{
743 char buf[64]; 748 char buf[64];
744 char *cmp = buf; 749 char *cmp;
745 int neg = 0; 750 int neg = 0;
746 int i; 751 int i;
747 752
@@ -752,16 +757,15 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
752 return -EFAULT; 757 return -EFAULT;
753 758
754 buf[cnt] = 0; 759 buf[cnt] = 0;
760 cmp = strstrip(buf);
755 761
756 if (strncmp(buf, "NO_", 3) == 0) { 762 if (strncmp(cmp, "NO_", 3) == 0) {
757 neg = 1; 763 neg = 1;
758 cmp += 3; 764 cmp += 3;
759 } 765 }
760 766
761 for (i = 0; sched_feat_names[i]; i++) { 767 for (i = 0; sched_feat_names[i]; i++) {
762 int len = strlen(sched_feat_names[i]); 768 if (strcmp(cmp, sched_feat_names[i]) == 0) {
763
764 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
765 if (neg) 769 if (neg)
766 sysctl_sched_features &= ~(1UL << i); 770 sysctl_sched_features &= ~(1UL << i);
767 else 771 else
@@ -811,20 +815,6 @@ late_initcall(sched_init_debug);
811const_debug unsigned int sysctl_sched_nr_migrate = 32; 815const_debug unsigned int sysctl_sched_nr_migrate = 32;
812 816
813/* 817/*
814 * ratelimit for updating the group shares.
815 * default: 0.25ms
816 */
817unsigned int sysctl_sched_shares_ratelimit = 250000;
818unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
819
820/*
821 * Inject some fuzzyness into changing the per-cpu group shares
822 * this avoids remote rq-locks at the expense of fairness.
823 * default: 4
824 */
825unsigned int sysctl_sched_shares_thresh = 4;
826
827/*
828 * period over which we average the RT time consumption, measured 818 * period over which we average the RT time consumption, measured
829 * in ms. 819 * in ms.
830 * 820 *
@@ -871,18 +861,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
871 return rq->curr == p; 861 return rq->curr == p;
872} 862}
873 863
874#ifndef __ARCH_WANT_UNLOCKED_CTXSW
875static inline int task_running(struct rq *rq, struct task_struct *p) 864static inline int task_running(struct rq *rq, struct task_struct *p)
876{ 865{
866#ifdef CONFIG_SMP
867 return p->on_cpu;
868#else
877 return task_current(rq, p); 869 return task_current(rq, p);
870#endif
878} 871}
879 872
873#ifndef __ARCH_WANT_UNLOCKED_CTXSW
880static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 874static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
881{ 875{
876#ifdef CONFIG_SMP
877 /*
878 * We can optimise this out completely for !SMP, because the
879 * SMP rebalancing from interrupt is the only thing that cares
880 * here.
881 */
882 next->on_cpu = 1;
883#endif
882} 884}
883 885
884static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 886static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
885{ 887{
888#ifdef CONFIG_SMP
889 /*
890 * After ->on_cpu is cleared, the task can be moved to a different CPU.
891 * We must ensure this doesn't happen until the switch is completely
892 * finished.
893 */
894 smp_wmb();
895 prev->on_cpu = 0;
896#endif
886#ifdef CONFIG_DEBUG_SPINLOCK 897#ifdef CONFIG_DEBUG_SPINLOCK
887 /* this is a valid case when another task releases the spinlock */ 898 /* this is a valid case when another task releases the spinlock */
888 rq->lock.owner = current; 899 rq->lock.owner = current;
@@ -898,15 +909,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
898} 909}
899 910
900#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 911#else /* __ARCH_WANT_UNLOCKED_CTXSW */
901static inline int task_running(struct rq *rq, struct task_struct *p)
902{
903#ifdef CONFIG_SMP
904 return p->oncpu;
905#else
906 return task_current(rq, p);
907#endif
908}
909
910static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 912static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
911{ 913{
912#ifdef CONFIG_SMP 914#ifdef CONFIG_SMP
@@ -915,7 +917,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
915 * SMP rebalancing from interrupt is the only thing that cares 917 * SMP rebalancing from interrupt is the only thing that cares
916 * here. 918 * here.
917 */ 919 */
918 next->oncpu = 1; 920 next->on_cpu = 1;
919#endif 921#endif
920#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 922#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
921 raw_spin_unlock_irq(&rq->lock); 923 raw_spin_unlock_irq(&rq->lock);
@@ -928,12 +930,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
928{ 930{
929#ifdef CONFIG_SMP 931#ifdef CONFIG_SMP
930 /* 932 /*
931 * After ->oncpu is cleared, the task can be moved to a different CPU. 933 * After ->on_cpu is cleared, the task can be moved to a different CPU.
932 * We must ensure this doesn't happen until the switch is completely 934 * We must ensure this doesn't happen until the switch is completely
933 * finished. 935 * finished.
934 */ 936 */
935 smp_wmb(); 937 smp_wmb();
936 prev->oncpu = 0; 938 prev->on_cpu = 0;
937#endif 939#endif
938#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 940#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
939 local_irq_enable(); 941 local_irq_enable();
@@ -942,23 +944,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
942#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 944#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
943 945
944/* 946/*
945 * Check whether the task is waking, we use this to synchronize ->cpus_allowed 947 * __task_rq_lock - lock the rq @p resides on.
946 * against ttwu().
947 */
948static inline int task_is_waking(struct task_struct *p)
949{
950 return unlikely(p->state == TASK_WAKING);
951}
952
953/*
954 * __task_rq_lock - lock the runqueue a given task resides on.
955 * Must be called interrupts disabled.
956 */ 948 */
957static inline struct rq *__task_rq_lock(struct task_struct *p) 949static inline struct rq *__task_rq_lock(struct task_struct *p)
958 __acquires(rq->lock) 950 __acquires(rq->lock)
959{ 951{
960 struct rq *rq; 952 struct rq *rq;
961 953
954 lockdep_assert_held(&p->pi_lock);
955
962 for (;;) { 956 for (;;) {
963 rq = task_rq(p); 957 rq = task_rq(p);
964 raw_spin_lock(&rq->lock); 958 raw_spin_lock(&rq->lock);
@@ -969,22 +963,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
969} 963}
970 964
971/* 965/*
972 * task_rq_lock - lock the runqueue a given task resides on and disable 966 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
973 * interrupts. Note the ordering: we can safely lookup the task_rq without
974 * explicitly disabling preemption.
975 */ 967 */
976static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 968static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
969 __acquires(p->pi_lock)
977 __acquires(rq->lock) 970 __acquires(rq->lock)
978{ 971{
979 struct rq *rq; 972 struct rq *rq;
980 973
981 for (;;) { 974 for (;;) {
982 local_irq_save(*flags); 975 raw_spin_lock_irqsave(&p->pi_lock, *flags);
983 rq = task_rq(p); 976 rq = task_rq(p);
984 raw_spin_lock(&rq->lock); 977 raw_spin_lock(&rq->lock);
985 if (likely(rq == task_rq(p))) 978 if (likely(rq == task_rq(p)))
986 return rq; 979 return rq;
987 raw_spin_unlock_irqrestore(&rq->lock, *flags); 980 raw_spin_unlock(&rq->lock);
981 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
988 } 982 }
989} 983}
990 984
@@ -994,10 +988,13 @@ static void __task_rq_unlock(struct rq *rq)
994 raw_spin_unlock(&rq->lock); 988 raw_spin_unlock(&rq->lock);
995} 989}
996 990
997static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 991static inline void
992task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
998 __releases(rq->lock) 993 __releases(rq->lock)
994 __releases(p->pi_lock)
999{ 995{
1000 raw_spin_unlock_irqrestore(&rq->lock, *flags); 996 raw_spin_unlock(&rq->lock);
997 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
1001} 998}
1002 999
1003/* 1000/*
@@ -1227,11 +1224,17 @@ int get_nohz_timer_target(void)
1227 int i; 1224 int i;
1228 struct sched_domain *sd; 1225 struct sched_domain *sd;
1229 1226
1227 rcu_read_lock();
1230 for_each_domain(cpu, sd) { 1228 for_each_domain(cpu, sd) {
1231 for_each_cpu(i, sched_domain_span(sd)) 1229 for_each_cpu(i, sched_domain_span(sd)) {
1232 if (!idle_cpu(i)) 1230 if (!idle_cpu(i)) {
1233 return i; 1231 cpu = i;
1232 goto unlock;
1233 }
1234 }
1234 } 1235 }
1236unlock:
1237 rcu_read_unlock();
1235 return cpu; 1238 return cpu;
1236} 1239}
1237/* 1240/*
@@ -1341,15 +1344,27 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1341{ 1344{
1342 u64 tmp; 1345 u64 tmp;
1343 1346
1347 /*
1348 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
1349 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
1350 * 2^SCHED_LOAD_RESOLUTION.
1351 */
1352 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
1353 tmp = (u64)delta_exec * scale_load_down(weight);
1354 else
1355 tmp = (u64)delta_exec;
1356
1344 if (!lw->inv_weight) { 1357 if (!lw->inv_weight) {
1345 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) 1358 unsigned long w = scale_load_down(lw->weight);
1359
1360 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
1346 lw->inv_weight = 1; 1361 lw->inv_weight = 1;
1362 else if (unlikely(!w))
1363 lw->inv_weight = WMULT_CONST;
1347 else 1364 else
1348 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) 1365 lw->inv_weight = WMULT_CONST / w;
1349 / (lw->weight+1);
1350 } 1366 }
1351 1367
1352 tmp = (u64)delta_exec * weight;
1353 /* 1368 /*
1354 * Check whether we'd overflow the 64-bit multiplication: 1369 * Check whether we'd overflow the 64-bit multiplication:
1355 */ 1370 */
@@ -1374,6 +1389,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1374 lw->inv_weight = 0; 1389 lw->inv_weight = 0;
1375} 1390}
1376 1391
1392static inline void update_load_set(struct load_weight *lw, unsigned long w)
1393{
1394 lw->weight = w;
1395 lw->inv_weight = 0;
1396}
1397
1377/* 1398/*
1378 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1399 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1379 * of tasks with abnormal "nice" values across CPUs the contribution that 1400 * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1562,101 +1583,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1562 1583
1563#ifdef CONFIG_FAIR_GROUP_SCHED 1584#ifdef CONFIG_FAIR_GROUP_SCHED
1564 1585
1565static __read_mostly unsigned long __percpu *update_shares_data;
1566
1567static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1568
1569/*
1570 * Calculate and set the cpu's group shares.
1571 */
1572static void update_group_shares_cpu(struct task_group *tg, int cpu,
1573 unsigned long sd_shares,
1574 unsigned long sd_rq_weight,
1575 unsigned long *usd_rq_weight)
1576{
1577 unsigned long shares, rq_weight;
1578 int boost = 0;
1579
1580 rq_weight = usd_rq_weight[cpu];
1581 if (!rq_weight) {
1582 boost = 1;
1583 rq_weight = NICE_0_LOAD;
1584 }
1585
1586 /*
1587 * \Sum_j shares_j * rq_weight_i
1588 * shares_i = -----------------------------
1589 * \Sum_j rq_weight_j
1590 */
1591 shares = (sd_shares * rq_weight) / sd_rq_weight;
1592 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1593
1594 if (abs(shares - tg->se[cpu]->load.weight) >
1595 sysctl_sched_shares_thresh) {
1596 struct rq *rq = cpu_rq(cpu);
1597 unsigned long flags;
1598
1599 raw_spin_lock_irqsave(&rq->lock, flags);
1600 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1601 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1602 __set_se_shares(tg->se[cpu], shares);
1603 raw_spin_unlock_irqrestore(&rq->lock, flags);
1604 }
1605}
1606
1607/*
1608 * Re-compute the task group their per cpu shares over the given domain.
1609 * This needs to be done in a bottom-up fashion because the rq weight of a
1610 * parent group depends on the shares of its child groups.
1611 */
1612static int tg_shares_up(struct task_group *tg, void *data)
1613{
1614 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1615 unsigned long *usd_rq_weight;
1616 struct sched_domain *sd = data;
1617 unsigned long flags;
1618 int i;
1619
1620 if (!tg->se[0])
1621 return 0;
1622
1623 local_irq_save(flags);
1624 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1625
1626 for_each_cpu(i, sched_domain_span(sd)) {
1627 weight = tg->cfs_rq[i]->load.weight;
1628 usd_rq_weight[i] = weight;
1629
1630 rq_weight += weight;
1631 /*
1632 * If there are currently no tasks on the cpu pretend there
1633 * is one of average load so that when a new task gets to
1634 * run here it will not get delayed by group starvation.
1635 */
1636 if (!weight)
1637 weight = NICE_0_LOAD;
1638
1639 sum_weight += weight;
1640 shares += tg->cfs_rq[i]->shares;
1641 }
1642
1643 if (!rq_weight)
1644 rq_weight = sum_weight;
1645
1646 if ((!shares && rq_weight) || shares > tg->shares)
1647 shares = tg->shares;
1648
1649 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1650 shares = tg->shares;
1651
1652 for_each_cpu(i, sched_domain_span(sd))
1653 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1654
1655 local_irq_restore(flags);
1656
1657 return 0;
1658}
1659
1660/* 1586/*
1661 * Compute the cpu's hierarchical load factor for each task group. 1587 * Compute the cpu's hierarchical load factor for each task group.
1662 * This needs to be done in a top-down fashion because the load of a child 1588 * This needs to be done in a top-down fashion because the load of a child
@@ -1671,7 +1597,7 @@ static int tg_load_down(struct task_group *tg, void *data)
1671 load = cpu_rq(cpu)->load.weight; 1597 load = cpu_rq(cpu)->load.weight;
1672 } else { 1598 } else {
1673 load = tg->parent->cfs_rq[cpu]->h_load; 1599 load = tg->parent->cfs_rq[cpu]->h_load;
1674 load *= tg->cfs_rq[cpu]->shares; 1600 load *= tg->se[cpu]->load.weight;
1675 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1601 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1676 } 1602 }
1677 1603
@@ -1680,34 +1606,11 @@ static int tg_load_down(struct task_group *tg, void *data)
1680 return 0; 1606 return 0;
1681} 1607}
1682 1608
1683static void update_shares(struct sched_domain *sd)
1684{
1685 s64 elapsed;
1686 u64 now;
1687
1688 if (root_task_group_empty())
1689 return;
1690
1691 now = local_clock();
1692 elapsed = now - sd->last_update;
1693
1694 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1695 sd->last_update = now;
1696 walk_tg_tree(tg_nop, tg_shares_up, sd);
1697 }
1698}
1699
1700static void update_h_load(long cpu) 1609static void update_h_load(long cpu)
1701{ 1610{
1702 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1611 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1703} 1612}
1704 1613
1705#else
1706
1707static inline void update_shares(struct sched_domain *sd)
1708{
1709}
1710
1711#endif 1614#endif
1712 1615
1713#ifdef CONFIG_PREEMPT 1616#ifdef CONFIG_PREEMPT
@@ -1827,15 +1730,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1827 __release(rq2->lock); 1730 __release(rq2->lock);
1828} 1731}
1829 1732
1830#endif 1733#else /* CONFIG_SMP */
1831 1734
1832#ifdef CONFIG_FAIR_GROUP_SCHED 1735/*
1833static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) 1736 * double_rq_lock - safely lock two runqueues
1737 *
1738 * Note this does not disable interrupts like task_rq_lock,
1739 * you need to do so manually before calling.
1740 */
1741static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1742 __acquires(rq1->lock)
1743 __acquires(rq2->lock)
1834{ 1744{
1835#ifdef CONFIG_SMP 1745 BUG_ON(!irqs_disabled());
1836 cfs_rq->shares = shares; 1746 BUG_ON(rq1 != rq2);
1837#endif 1747 raw_spin_lock(&rq1->lock);
1748 __acquire(rq2->lock); /* Fake it out ;) */
1838} 1749}
1750
1751/*
1752 * double_rq_unlock - safely unlock two runqueues
1753 *
1754 * Note this does not restore interrupts like task_rq_unlock,
1755 * you need to do so manually after calling.
1756 */
1757static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1758 __releases(rq1->lock)
1759 __releases(rq2->lock)
1760{
1761 BUG_ON(rq1 != rq2);
1762 raw_spin_unlock(&rq1->lock);
1763 __release(rq2->lock);
1764}
1765
1839#endif 1766#endif
1840 1767
1841static void calc_load_account_idle(struct rq *this_rq); 1768static void calc_load_account_idle(struct rq *this_rq);
@@ -1877,23 +1804,20 @@ static void dec_nr_running(struct rq *rq)
1877 1804
1878static void set_load_weight(struct task_struct *p) 1805static void set_load_weight(struct task_struct *p)
1879{ 1806{
1880 if (task_has_rt_policy(p)) { 1807 int prio = p->static_prio - MAX_RT_PRIO;
1881 p->se.load.weight = 0; 1808 struct load_weight *load = &p->se.load;
1882 p->se.load.inv_weight = WMULT_CONST;
1883 return;
1884 }
1885 1809
1886 /* 1810 /*
1887 * SCHED_IDLE tasks get minimal weight: 1811 * SCHED_IDLE tasks get minimal weight:
1888 */ 1812 */
1889 if (p->policy == SCHED_IDLE) { 1813 if (p->policy == SCHED_IDLE) {
1890 p->se.load.weight = WEIGHT_IDLEPRIO; 1814 load->weight = scale_load(WEIGHT_IDLEPRIO);
1891 p->se.load.inv_weight = WMULT_IDLEPRIO; 1815 load->inv_weight = WMULT_IDLEPRIO;
1892 return; 1816 return;
1893 } 1817 }
1894 1818
1895 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; 1819 load->weight = scale_load(prio_to_weight[prio]);
1896 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1820 load->inv_weight = prio_to_wmult[prio];
1897} 1821}
1898 1822
1899static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 1823static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1901,7 +1825,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1901 update_rq_clock(rq); 1825 update_rq_clock(rq);
1902 sched_info_queued(p); 1826 sched_info_queued(p);
1903 p->sched_class->enqueue_task(rq, p, flags); 1827 p->sched_class->enqueue_task(rq, p, flags);
1904 p->se.on_rq = 1;
1905} 1828}
1906 1829
1907static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 1830static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1909,7 +1832,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1909 update_rq_clock(rq); 1832 update_rq_clock(rq);
1910 sched_info_dequeued(p); 1833 sched_info_dequeued(p);
1911 p->sched_class->dequeue_task(rq, p, flags); 1834 p->sched_class->dequeue_task(rq, p, flags);
1912 p->se.on_rq = 0;
1913} 1835}
1914 1836
1915/* 1837/*
@@ -1936,14 +1858,227 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1936 dec_nr_running(rq); 1858 dec_nr_running(rq);
1937} 1859}
1938 1860
1861#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1862
1863/*
1864 * There are no locks covering percpu hardirq/softirq time.
1865 * They are only modified in account_system_vtime, on corresponding CPU
1866 * with interrupts disabled. So, writes are safe.
1867 * They are read and saved off onto struct rq in update_rq_clock().
1868 * This may result in other CPU reading this CPU's irq time and can
1869 * race with irq/account_system_vtime on this CPU. We would either get old
1870 * or new value with a side effect of accounting a slice of irq time to wrong
1871 * task when irq is in progress while we read rq->clock. That is a worthy
1872 * compromise in place of having locks on each irq in account_system_time.
1873 */
1874static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1875static DEFINE_PER_CPU(u64, cpu_softirq_time);
1876
1877static DEFINE_PER_CPU(u64, irq_start_time);
1878static int sched_clock_irqtime;
1879
1880void enable_sched_clock_irqtime(void)
1881{
1882 sched_clock_irqtime = 1;
1883}
1884
1885void disable_sched_clock_irqtime(void)
1886{
1887 sched_clock_irqtime = 0;
1888}
1889
1890#ifndef CONFIG_64BIT
1891static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
1892
1893static inline void irq_time_write_begin(void)
1894{
1895 __this_cpu_inc(irq_time_seq.sequence);
1896 smp_wmb();
1897}
1898
1899static inline void irq_time_write_end(void)
1900{
1901 smp_wmb();
1902 __this_cpu_inc(irq_time_seq.sequence);
1903}
1904
1905static inline u64 irq_time_read(int cpu)
1906{
1907 u64 irq_time;
1908 unsigned seq;
1909
1910 do {
1911 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1912 irq_time = per_cpu(cpu_softirq_time, cpu) +
1913 per_cpu(cpu_hardirq_time, cpu);
1914 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1915
1916 return irq_time;
1917}
1918#else /* CONFIG_64BIT */
1919static inline void irq_time_write_begin(void)
1920{
1921}
1922
1923static inline void irq_time_write_end(void)
1924{
1925}
1926
1927static inline u64 irq_time_read(int cpu)
1928{
1929 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1930}
1931#endif /* CONFIG_64BIT */
1932
1933/*
1934 * Called before incrementing preempt_count on {soft,}irq_enter
1935 * and before decrementing preempt_count on {soft,}irq_exit.
1936 */
1937void account_system_vtime(struct task_struct *curr)
1938{
1939 unsigned long flags;
1940 s64 delta;
1941 int cpu;
1942
1943 if (!sched_clock_irqtime)
1944 return;
1945
1946 local_irq_save(flags);
1947
1948 cpu = smp_processor_id();
1949 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
1950 __this_cpu_add(irq_start_time, delta);
1951
1952 irq_time_write_begin();
1953 /*
1954 * We do not account for softirq time from ksoftirqd here.
1955 * We want to continue accounting softirq time to ksoftirqd thread
1956 * in that case, so as not to confuse scheduler with a special task
1957 * that do not consume any time, but still wants to run.
1958 */
1959 if (hardirq_count())
1960 __this_cpu_add(cpu_hardirq_time, delta);
1961 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
1962 __this_cpu_add(cpu_softirq_time, delta);
1963
1964 irq_time_write_end();
1965 local_irq_restore(flags);
1966}
1967EXPORT_SYMBOL_GPL(account_system_vtime);
1968
1969static void update_rq_clock_task(struct rq *rq, s64 delta)
1970{
1971 s64 irq_delta;
1972
1973 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
1974
1975 /*
1976 * Since irq_time is only updated on {soft,}irq_exit, we might run into
1977 * this case when a previous update_rq_clock() happened inside a
1978 * {soft,}irq region.
1979 *
1980 * When this happens, we stop ->clock_task and only update the
1981 * prev_irq_time stamp to account for the part that fit, so that a next
1982 * update will consume the rest. This ensures ->clock_task is
1983 * monotonic.
1984 *
1985 * It does however cause some slight miss-attribution of {soft,}irq
1986 * time, a more accurate solution would be to update the irq_time using
1987 * the current rq->clock timestamp, except that would require using
1988 * atomic ops.
1989 */
1990 if (irq_delta > delta)
1991 irq_delta = delta;
1992
1993 rq->prev_irq_time += irq_delta;
1994 delta -= irq_delta;
1995 rq->clock_task += delta;
1996
1997 if (irq_delta && sched_feat(NONIRQ_POWER))
1998 sched_rt_avg_update(rq, irq_delta);
1999}
2000
2001static int irqtime_account_hi_update(void)
2002{
2003 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2004 unsigned long flags;
2005 u64 latest_ns;
2006 int ret = 0;
2007
2008 local_irq_save(flags);
2009 latest_ns = this_cpu_read(cpu_hardirq_time);
2010 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
2011 ret = 1;
2012 local_irq_restore(flags);
2013 return ret;
2014}
2015
2016static int irqtime_account_si_update(void)
2017{
2018 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2019 unsigned long flags;
2020 u64 latest_ns;
2021 int ret = 0;
2022
2023 local_irq_save(flags);
2024 latest_ns = this_cpu_read(cpu_softirq_time);
2025 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
2026 ret = 1;
2027 local_irq_restore(flags);
2028 return ret;
2029}
2030
2031#else /* CONFIG_IRQ_TIME_ACCOUNTING */
2032
2033#define sched_clock_irqtime (0)
2034
2035static void update_rq_clock_task(struct rq *rq, s64 delta)
2036{
2037 rq->clock_task += delta;
2038}
2039
2040#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2041
1939#include "sched_idletask.c" 2042#include "sched_idletask.c"
1940#include "sched_fair.c" 2043#include "sched_fair.c"
1941#include "sched_rt.c" 2044#include "sched_rt.c"
2045#include "sched_autogroup.c"
2046#include "sched_stoptask.c"
1942#include "../litmus/sched_litmus.c" 2047#include "../litmus/sched_litmus.c"
1943#ifdef CONFIG_SCHED_DEBUG 2048#ifdef CONFIG_SCHED_DEBUG
1944# include "sched_debug.c" 2049# include "sched_debug.c"
1945#endif 2050#endif
1946 2051
2052void sched_set_stop_task(int cpu, struct task_struct *stop)
2053{
2054 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
2055 struct task_struct *old_stop = cpu_rq(cpu)->stop;
2056
2057 if (stop) {
2058 /*
2059 * Make it appear like a SCHED_FIFO task, its something
2060 * userspace knows about and won't get confused about.
2061 *
2062 * Also, it will make PI more or less work without too
2063 * much confusion -- but then, stop work should not
2064 * rely on PI working anyway.
2065 */
2066 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
2067
2068 stop->sched_class = &stop_sched_class;
2069 }
2070
2071 cpu_rq(cpu)->stop = stop;
2072
2073 if (old_stop) {
2074 /*
2075 * Reset it back to a normal scheduling class so that
2076 * it can die in pieces.
2077 */
2078 old_stop->sched_class = &rt_sched_class;
2079 }
2080}
2081
1947/* 2082/*
1948 * __normal_prio - return the priority that is based on the static prio 2083 * __normal_prio - return the priority that is based on the static prio
1949 */ 2084 */
@@ -2001,14 +2136,43 @@ inline int task_curr(const struct task_struct *p)
2001 2136
2002static inline void check_class_changed(struct rq *rq, struct task_struct *p, 2137static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2003 const struct sched_class *prev_class, 2138 const struct sched_class *prev_class,
2004 int oldprio, int running) 2139 int oldprio)
2005{ 2140{
2006 if (prev_class != p->sched_class) { 2141 if (prev_class != p->sched_class) {
2007 if (prev_class->switched_from) 2142 if (prev_class->switched_from)
2008 prev_class->switched_from(rq, p, running); 2143 prev_class->switched_from(rq, p);
2009 p->sched_class->switched_to(rq, p, running); 2144 p->sched_class->switched_to(rq, p);
2010 } else 2145 } else if (oldprio != p->prio)
2011 p->sched_class->prio_changed(rq, p, oldprio, running); 2146 p->sched_class->prio_changed(rq, p, oldprio);
2147}
2148
2149static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2150{
2151 const struct sched_class *class;
2152
2153 if (p->sched_class == rq->curr->sched_class) {
2154 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2155 } else {
2156 for_each_class(class) {
2157 if (class == rq->curr->sched_class)
2158 break;
2159 if (class == p->sched_class) {
2160 resched_task(rq->curr);
2161 break;
2162 }
2163 }
2164 }
2165
2166 /*
2167 * A queue event has occurred, and we're going to schedule. In
2168 * this case, we can save a useless back to back clock update.
2169 */
2170 /* LITMUS^RT:
2171 * The "disable-clock-update" approach was buggy in Linux 2.6.36.
2172 * The issue has been solved in 2.6.37.
2173 */
2174 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
2175 rq->skip_clock_update = 1;
2012} 2176}
2013 2177
2014#ifdef CONFIG_SMP 2178#ifdef CONFIG_SMP
@@ -2023,6 +2187,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2023 if (p->sched_class != &fair_sched_class) 2187 if (p->sched_class != &fair_sched_class)
2024 return 0; 2188 return 0;
2025 2189
2190 if (unlikely(p->policy == SCHED_IDLE))
2191 return 0;
2192
2026 /* 2193 /*
2027 * Buddy candidates are cache hot: 2194 * Buddy candidates are cache hot:
2028 */ 2195 */
@@ -2050,6 +2217,21 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2050 */ 2217 */
2051 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 2218 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2052 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2219 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2220
2221#ifdef CONFIG_LOCKDEP
2222 /*
2223 * The caller should hold either p->pi_lock or rq->lock, when changing
2224 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
2225 *
2226 * sched_move_task() holds both and thus holding either pins the cgroup,
2227 * see set_task_rq().
2228 *
2229 * Furthermore, all task_rq users should acquire both locks, see
2230 * task_rq_lock().
2231 */
2232 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2233 lockdep_is_held(&task_rq(p)->lock)));
2234#endif
2053#endif 2235#endif
2054 2236
2055 trace_sched_migrate_task(p, new_cpu); 2237 trace_sched_migrate_task(p, new_cpu);
@@ -2070,21 +2252,6 @@ struct migration_arg {
2070static int migration_cpu_stop(void *data); 2252static int migration_cpu_stop(void *data);
2071 2253
2072/* 2254/*
2073 * The task's runqueue lock must be held.
2074 * Returns true if you have to wait for migration thread.
2075 */
2076static bool migrate_task(struct task_struct *p, int dest_cpu)
2077{
2078 struct rq *rq = task_rq(p);
2079
2080 /*
2081 * If the task is not on a runqueue (and not running), then
2082 * the next wake-up will properly place the task.
2083 */
2084 return p->se.on_rq || task_running(rq, p);
2085}
2086
2087/*
2088 * wait_task_inactive - wait for a thread to unschedule. 2255 * wait_task_inactive - wait for a thread to unschedule.
2089 * 2256 *
2090 * If @match_state is nonzero, it's the @p->state value just checked and 2257 * If @match_state is nonzero, it's the @p->state value just checked and
@@ -2141,11 +2308,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2141 rq = task_rq_lock(p, &flags); 2308 rq = task_rq_lock(p, &flags);
2142 trace_sched_wait_task(p); 2309 trace_sched_wait_task(p);
2143 running = task_running(rq, p); 2310 running = task_running(rq, p);
2144 on_rq = p->se.on_rq; 2311 on_rq = p->on_rq;
2145 ncsw = 0; 2312 ncsw = 0;
2146 if (!match_state || p->state == match_state) 2313 if (!match_state || p->state == match_state)
2147 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2314 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2148 task_rq_unlock(rq, &flags); 2315 task_rq_unlock(rq, p, &flags);
2149 2316
2150 /* 2317 /*
2151 * If it changed from the expected state, bail out now. 2318 * If it changed from the expected state, bail out now.
@@ -2174,7 +2341,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2174 * yield - it could be a while. 2341 * yield - it could be a while.
2175 */ 2342 */
2176 if (unlikely(on_rq)) { 2343 if (unlikely(on_rq)) {
2177 schedule_timeout_uninterruptible(1); 2344 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
2345
2346 set_current_state(TASK_UNINTERRUPTIBLE);
2347 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2178 continue; 2348 continue;
2179 } 2349 }
2180 2350
@@ -2196,7 +2366,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2196 * Cause a process which is running on another CPU to enter 2366 * Cause a process which is running on another CPU to enter
2197 * kernel-mode, without any delay. (to get signals handled.) 2367 * kernel-mode, without any delay. (to get signals handled.)
2198 * 2368 *
2199 * NOTE: this function doesnt have to take the runqueue lock, 2369 * NOTE: this function doesn't have to take the runqueue lock,
2200 * because all it wants to ensure is that the remote task enters 2370 * because all it wants to ensure is that the remote task enters
2201 * the kernel. If the IPI races and the task has been migrated 2371 * the kernel. If the IPI races and the task has been migrated
2202 * to another CPU then no harm is done and the purpose has been 2372 * to another CPU then no harm is done and the purpose has been
@@ -2215,30 +2385,9 @@ void kick_process(struct task_struct *p)
2215EXPORT_SYMBOL_GPL(kick_process); 2385EXPORT_SYMBOL_GPL(kick_process);
2216#endif /* CONFIG_SMP */ 2386#endif /* CONFIG_SMP */
2217 2387
2218/**
2219 * task_oncpu_function_call - call a function on the cpu on which a task runs
2220 * @p: the task to evaluate
2221 * @func: the function to be called
2222 * @info: the function call argument
2223 *
2224 * Calls the function @func when the task is currently running. This might
2225 * be on the current CPU, which just calls the function directly
2226 */
2227void task_oncpu_function_call(struct task_struct *p,
2228 void (*func) (void *info), void *info)
2229{
2230 int cpu;
2231
2232 preempt_disable();
2233 cpu = task_cpu(p);
2234 if (task_curr(p))
2235 smp_call_function_single(cpu, func, info, 1);
2236 preempt_enable();
2237}
2238
2239#ifdef CONFIG_SMP 2388#ifdef CONFIG_SMP
2240/* 2389/*
2241 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. 2390 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
2242 */ 2391 */
2243static int select_fallback_rq(int cpu, struct task_struct *p) 2392static int select_fallback_rq(int cpu, struct task_struct *p)
2244{ 2393{
@@ -2256,30 +2405,27 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2256 return dest_cpu; 2405 return dest_cpu;
2257 2406
2258 /* No more Mr. Nice Guy. */ 2407 /* No more Mr. Nice Guy. */
2259 if (unlikely(dest_cpu >= nr_cpu_ids)) { 2408 dest_cpu = cpuset_cpus_allowed_fallback(p);
2260 dest_cpu = cpuset_cpus_allowed_fallback(p); 2409 /*
2261 /* 2410 * Don't tell them about moving exiting tasks or
2262 * Don't tell them about moving exiting tasks or 2411 * kernel threads (both mm NULL), since they never
2263 * kernel threads (both mm NULL), since they never 2412 * leave kernel.
2264 * leave kernel. 2413 */
2265 */ 2414 if (p->mm && printk_ratelimit()) {
2266 if (p->mm && printk_ratelimit()) { 2415 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
2267 printk(KERN_INFO "process %d (%s) no " 2416 task_pid_nr(p), p->comm, cpu);
2268 "longer affine to cpu%d\n",
2269 task_pid_nr(p), p->comm, cpu);
2270 }
2271 } 2417 }
2272 2418
2273 return dest_cpu; 2419 return dest_cpu;
2274} 2420}
2275 2421
2276/* 2422/*
2277 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. 2423 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
2278 */ 2424 */
2279static inline 2425static inline
2280int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) 2426int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2281{ 2427{
2282 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); 2428 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2283 2429
2284 /* 2430 /*
2285 * In order not to call set_task_cpu() on a blocking task we need 2431 * In order not to call set_task_cpu() on a blocking task we need
@@ -2305,27 +2451,63 @@ static void update_avg(u64 *avg, u64 sample)
2305} 2451}
2306#endif 2452#endif
2307 2453
2308static inline void ttwu_activate(struct task_struct *p, struct rq *rq, 2454static void
2309 bool is_sync, bool is_migrate, bool is_local, 2455ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2310 unsigned long en_flags)
2311{ 2456{
2312 schedstat_inc(p, se.statistics.nr_wakeups); 2457#ifdef CONFIG_SCHEDSTATS
2313 if (is_sync) 2458 struct rq *rq = this_rq();
2314 schedstat_inc(p, se.statistics.nr_wakeups_sync); 2459
2315 if (is_migrate) 2460#ifdef CONFIG_SMP
2316 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 2461 int this_cpu = smp_processor_id();
2317 if (is_local) 2462
2463 if (cpu == this_cpu) {
2464 schedstat_inc(rq, ttwu_local);
2318 schedstat_inc(p, se.statistics.nr_wakeups_local); 2465 schedstat_inc(p, se.statistics.nr_wakeups_local);
2319 else 2466 } else {
2467 struct sched_domain *sd;
2468
2320 schedstat_inc(p, se.statistics.nr_wakeups_remote); 2469 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2470 rcu_read_lock();
2471 for_each_domain(this_cpu, sd) {
2472 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2473 schedstat_inc(sd, ttwu_wake_remote);
2474 break;
2475 }
2476 }
2477 rcu_read_unlock();
2478 }
2479
2480 if (wake_flags & WF_MIGRATED)
2481 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2482
2483#endif /* CONFIG_SMP */
2484
2485 schedstat_inc(rq, ttwu_count);
2486 schedstat_inc(p, se.statistics.nr_wakeups);
2487
2488 if (wake_flags & WF_SYNC)
2489 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2490
2491#endif /* CONFIG_SCHEDSTATS */
2492}
2321 2493
2494static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2495{
2322 activate_task(rq, p, en_flags); 2496 activate_task(rq, p, en_flags);
2497 p->on_rq = 1;
2498
2499 /* if a worker is waking up, notify workqueue */
2500 if (p->flags & PF_WQ_WORKER)
2501 wq_worker_waking_up(p, cpu_of(rq));
2323} 2502}
2324 2503
2325static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, 2504/*
2326 int wake_flags, bool success) 2505 * Mark the task runnable and perform wakeup-preemption.
2506 */
2507static void
2508ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2327{ 2509{
2328 trace_sched_wakeup(p, success); 2510 trace_sched_wakeup(p, true);
2329 check_preempt_curr(rq, p, wake_flags); 2511 check_preempt_curr(rq, p, wake_flags);
2330 2512
2331 p->state = TASK_RUNNING; 2513 p->state = TASK_RUNNING;
@@ -2344,9 +2526,156 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2344 rq->idle_stamp = 0; 2526 rq->idle_stamp = 0;
2345 } 2527 }
2346#endif 2528#endif
2347 /* if a worker is waking up, notify workqueue */ 2529}
2348 if ((p->flags & PF_WQ_WORKER) && success) 2530
2349 wq_worker_waking_up(p, cpu_of(rq)); 2531static void
2532ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
2533{
2534#ifdef CONFIG_SMP
2535 if (p->sched_contributes_to_load)
2536 rq->nr_uninterruptible--;
2537#endif
2538
2539 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
2540 ttwu_do_wakeup(rq, p, wake_flags);
2541}
2542
2543/*
2544 * Called in case the task @p isn't fully descheduled from its runqueue,
2545 * in this case we must do a remote wakeup. Its a 'light' wakeup though,
2546 * since all we need to do is flip p->state to TASK_RUNNING, since
2547 * the task is still ->on_rq.
2548 */
2549static int ttwu_remote(struct task_struct *p, int wake_flags)
2550{
2551 struct rq *rq;
2552 int ret = 0;
2553
2554 rq = __task_rq_lock(p);
2555 if (p->on_rq) {
2556 ttwu_do_wakeup(rq, p, wake_flags);
2557 ret = 1;
2558 }
2559 __task_rq_unlock(rq);
2560
2561 return ret;
2562}
2563
2564#ifdef CONFIG_SMP
2565static void sched_ttwu_do_pending(struct task_struct *list)
2566{
2567 struct rq *rq = this_rq();
2568
2569 raw_spin_lock(&rq->lock);
2570
2571 while (list) {
2572 struct task_struct *p = list;
2573 list = list->wake_entry;
2574 ttwu_do_activate(rq, p, 0);
2575 }
2576
2577 raw_spin_unlock(&rq->lock);
2578}
2579
2580#ifdef CONFIG_HOTPLUG_CPU
2581
2582static void sched_ttwu_pending(void)
2583{
2584 struct rq *rq = this_rq();
2585 struct task_struct *list = xchg(&rq->wake_list, NULL);
2586
2587 if (!list)
2588 return;
2589
2590 sched_ttwu_do_pending(list);
2591}
2592
2593#endif /* CONFIG_HOTPLUG_CPU */
2594
2595void scheduler_ipi(void)
2596{
2597 struct rq *rq = this_rq();
2598 struct task_struct *list = xchg(&rq->wake_list, NULL);
2599
2600 if (!list)
2601 return;
2602
2603 /*
2604 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
2605 * traditionally all their work was done from the interrupt return
2606 * path. Now that we actually do some work, we need to make sure
2607 * we do call them.
2608 *
2609 * Some archs already do call them, luckily irq_enter/exit nest
2610 * properly.
2611 *
2612 * Arguably we should visit all archs and update all handlers,
2613 * however a fair share of IPIs are still resched only so this would
2614 * somewhat pessimize the simple resched case.
2615 */
2616 irq_enter();
2617 sched_ttwu_do_pending(list);
2618 irq_exit();
2619}
2620
2621static void ttwu_queue_remote(struct task_struct *p, int cpu)
2622{
2623 struct rq *rq = cpu_rq(cpu);
2624 struct task_struct *next = rq->wake_list;
2625
2626 for (;;) {
2627 struct task_struct *old = next;
2628
2629 p->wake_entry = next;
2630 next = cmpxchg(&rq->wake_list, old, p);
2631 if (next == old)
2632 break;
2633 }
2634
2635 if (!next)
2636 smp_send_reschedule(cpu);
2637}
2638
2639#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2640static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
2641{
2642 struct rq *rq;
2643 int ret = 0;
2644
2645 rq = __task_rq_lock(p);
2646 if (p->on_cpu) {
2647 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2648 ttwu_do_wakeup(rq, p, wake_flags);
2649 ret = 1;
2650 }
2651 __task_rq_unlock(rq);
2652
2653 return ret;
2654
2655}
2656#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2657#endif /* CONFIG_SMP */
2658
2659static void ttwu_queue(struct task_struct *p, int cpu)
2660{
2661 struct rq *rq = cpu_rq(cpu);
2662
2663#if defined(CONFIG_SMP)
2664 /*
2665 * LITMUS^RT: whether to send an IPI to the remote CPU
2666 * is plugin specific.
2667 */
2668 if (!is_realtime(p) &&
2669 sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
2670 sched_clock_cpu(cpu); /* sync clocks x-cpu */
2671 ttwu_queue_remote(p, cpu);
2672 return;
2673 }
2674#endif
2675
2676 raw_spin_lock(&rq->lock);
2677 ttwu_do_activate(rq, p, 0);
2678 raw_spin_unlock(&rq->lock);
2350} 2679}
2351 2680
2352/** 2681/**
@@ -2364,97 +2693,79 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2364 * Returns %true if @p was woken up, %false if it was already running 2693 * Returns %true if @p was woken up, %false if it was already running
2365 * or @state didn't match @p's state. 2694 * or @state didn't match @p's state.
2366 */ 2695 */
2367static int try_to_wake_up(struct task_struct *p, unsigned int state, 2696static int
2368 int wake_flags) 2697try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2369{ 2698{
2370 int cpu, orig_cpu, this_cpu, success = 0;
2371 unsigned long flags; 2699 unsigned long flags;
2372 unsigned long en_flags = ENQUEUE_WAKEUP; 2700 int cpu, success = 0;
2373 struct rq *rq;
2374 2701
2375 if (is_realtime(p)) 2702 if (is_realtime(p))
2376 TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state); 2703 TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state);
2377 2704
2378 this_cpu = get_cpu();
2379
2380 smp_wmb(); 2705 smp_wmb();
2381 rq = task_rq_lock(p, &flags); 2706 raw_spin_lock_irqsave(&p->pi_lock, flags);
2382 if (!(p->state & state)) 2707 if (!(p->state & state))
2383 goto out; 2708 goto out;
2384 2709
2385 if (p->se.on_rq) 2710 success = 1; /* we're going to change ->state */
2386 goto out_running;
2387
2388 cpu = task_cpu(p); 2711 cpu = task_cpu(p);
2389 orig_cpu = cpu;
2390 2712
2391#ifdef CONFIG_SMP 2713 if (p->on_rq && ttwu_remote(p, wake_flags))
2392 if (unlikely(task_running(rq, p)) || is_realtime(p)) 2714 goto stat;
2393 goto out_activate;
2394 2715
2716#ifdef CONFIG_SMP
2395 /* 2717 /*
2396 * In order to handle concurrent wakeups and release the rq->lock 2718 * If the owning (remote) cpu is still in the middle of schedule() with
2397 * we put the task in TASK_WAKING state. 2719 * this task as prev, wait until its done referencing the task.
2398 *
2399 * First fix up the nr_uninterruptible count:
2400 */ 2720 */
2401 if (task_contributes_to_load(p)) { 2721 while (p->on_cpu) {
2402 if (likely(cpu_online(orig_cpu))) 2722#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2403 rq->nr_uninterruptible--; 2723 /*
2404 else 2724 * In case the architecture enables interrupts in
2405 this_rq()->nr_uninterruptible--; 2725 * context_switch(), we cannot busy wait, since that
2406 } 2726 * would lead to deadlocks when an interrupt hits and
2407 p->state = TASK_WAKING; 2727 * tries to wake up @prev. So bail and do a complete
2408 2728 * remote wakeup.
2409 if (p->sched_class->task_waking) { 2729 */
2410 p->sched_class->task_waking(rq, p); 2730 if (ttwu_activate_remote(p, wake_flags))
2411 en_flags |= ENQUEUE_WAKING; 2731 goto stat;
2732#else
2733 cpu_relax();
2734#endif
2412 } 2735 }
2736 /*
2737 * Pairs with the smp_wmb() in finish_lock_switch().
2738 */
2739 smp_rmb();
2413 2740
2414 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); 2741 /* LITMUS^RT: once the task can be safely referenced by this
2415 	if (cpu != orig_cpu)				 2742	 * CPU, don't mess with the Linux load balancing code.
2416 set_task_cpu(p, cpu); 2743 */
2417 __task_rq_unlock(rq); 2744 if (is_realtime(p))
2745 goto litmus_out_activate;
2418 2746
2419 rq = cpu_rq(cpu); 2747 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2420 raw_spin_lock(&rq->lock); 2748 p->state = TASK_WAKING;
2421 2749
2422 /* 2750 if (p->sched_class->task_waking)
2423 * We migrated the task without holding either rq->lock, however 2751 p->sched_class->task_waking(p);
2424 * since the task is not on the task list itself, nobody else
2425 * will try and migrate the task, hence the rq should match the
2426 * cpu we just moved it to.
2427 */
2428 WARN_ON(task_cpu(p) != cpu);
2429 WARN_ON(p->state != TASK_WAKING);
2430 2752
2431#ifdef CONFIG_SCHEDSTATS 2753 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2432 schedstat_inc(rq, ttwu_count); 2754 if (task_cpu(p) != cpu) {
2433 if (cpu == this_cpu) 2755 wake_flags |= WF_MIGRATED;
2434 schedstat_inc(rq, ttwu_local); 2756 set_task_cpu(p, cpu);
2435 else {
2436 struct sched_domain *sd;
2437 for_each_domain(this_cpu, sd) {
2438 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2439 schedstat_inc(sd, ttwu_wake_remote);
2440 break;
2441 }
2442 }
2443 } 2757 }
2444#endif /* CONFIG_SCHEDSTATS */
2445 2758
2446out_activate: 2759litmus_out_activate:
2447#endif /* CONFIG_SMP */ 2760#endif /* CONFIG_SMP */
2448 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, 2761
2449 cpu == this_cpu, en_flags); 2762 ttwu_queue(p, cpu);
2450 success = 1; 2763stat:
2451out_running: 2764 ttwu_stat(p, cpu, wake_flags);
2452 ttwu_post_activation(p, rq, wake_flags, success);
2453out: 2765out:
2454 if (is_realtime(p)) 2766 if (is_realtime(p))
2455 TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state); 2767 TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state);
2456 task_rq_unlock(rq, &flags); 2768 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2457 put_cpu();
2458 2769
2459 return success; 2770 return success;
2460} 2771}
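
The busy-wait on p->on_cpu above is the crux of the reworked locking: the waker no longer takes the runqueue lock, so it must wait until the CPU that last ran the task has finished with it in schedule() before touching fields such as the task's CPU. A hedged user-space analogue of that handshake, with invented types; the acquire load pairs with the release store the way the kernel's smp_rmb() pairs with the smp_wmb() in finish_lock_switch():

#include <stdatomic.h>
#include <sched.h>

struct task {
	_Atomic int on_cpu;
	int cpu;		/* only touched once on_cpu is observed 0 */
};

static void finish_switch(struct task *prev)
{
	/* ...done using prev; publish that it may be woken/migrated: */
	atomic_store_explicit(&prev->on_cpu, 0, memory_order_release);
}

static void wake_up_task(struct task *p, int new_cpu)
{
	while (atomic_load_explicit(&p->on_cpu, memory_order_acquire))
		sched_yield();	/* kernel: cpu_relax(), or a full remote
				 * wakeup when busy-waiting could deadlock */

	p->cpu = new_cpu;	/* safe: the previous owner's stores are visible */
}
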
@@ -2463,31 +2774,34 @@ out:
2463 * try_to_wake_up_local - try to wake up a local task with rq lock held 2774 * try_to_wake_up_local - try to wake up a local task with rq lock held
2464 * @p: the thread to be awakened 2775 * @p: the thread to be awakened
2465 * 2776 *
2466 * Put @p on the run-queue if it's not alredy there. The caller must 2777 * Put @p on the run-queue if it's not already there. The caller must
2467 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2778 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2468 * the current task. this_rq() stays locked over invocation. 2779 * the current task.
2469 */ 2780 */
2470static void try_to_wake_up_local(struct task_struct *p) 2781static void try_to_wake_up_local(struct task_struct *p)
2471{ 2782{
2472 struct rq *rq = task_rq(p); 2783 struct rq *rq = task_rq(p);
2473 bool success = false;
2474 2784
2475 BUG_ON(rq != this_rq()); 2785 BUG_ON(rq != this_rq());
2476 BUG_ON(p == current); 2786 BUG_ON(p == current);
2477 lockdep_assert_held(&rq->lock); 2787 lockdep_assert_held(&rq->lock);
2478 2788
2789 if (!raw_spin_trylock(&p->pi_lock)) {
2790 raw_spin_unlock(&rq->lock);
2791 raw_spin_lock(&p->pi_lock);
2792 raw_spin_lock(&rq->lock);
2793 }
2794
2479 if (!(p->state & TASK_NORMAL)) 2795 if (!(p->state & TASK_NORMAL))
2480 return; 2796 goto out;
2481 2797
2482 if (!p->se.on_rq) { 2798 if (!p->on_rq)
2483 if (likely(!task_running(rq, p))) { 2799 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2484 schedstat_inc(rq, ttwu_count); 2800
2485 schedstat_inc(rq, ttwu_local); 2801 ttwu_do_wakeup(rq, p, 0);
2486 } 2802 ttwu_stat(p, smp_processor_id(), 0);
2487 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); 2803out:
2488 success = true; 2804 raw_spin_unlock(&p->pi_lock);
2489 }
2490 ttwu_post_activation(p, rq, 0, success);
2491} 2805}
2492 2806
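
try_to_wake_up_local() now needs p->pi_lock, but the documented order is pi_lock before rq->lock and the caller already holds rq->lock; hence the trylock above, falling back to dropping and re-taking both locks in the correct order. A small pthread sketch of the same dance (the mutex names are illustrative):

#include <pthread.h>

static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;

/* Called with rq_lock held; returns with pi_lock and rq_lock both held. */
static void lock_pi_with_rq_held(void)
{
	if (pthread_mutex_trylock(&pi_lock) != 0) {
		pthread_mutex_unlock(&rq_lock);
		pthread_mutex_lock(&pi_lock);	/* honour the pi -> rq order */
		pthread_mutex_lock(&rq_lock);
	}
}
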
2493/** 2807/**
@@ -2520,18 +2834,21 @@ int wake_up_state(struct task_struct *p, unsigned int state)
2520 */ 2834 */
2521static void __sched_fork(struct task_struct *p) 2835static void __sched_fork(struct task_struct *p)
2522{ 2836{
2837 p->on_rq = 0;
2838
2839 p->se.on_rq = 0;
2523 p->se.exec_start = 0; 2840 p->se.exec_start = 0;
2524 p->se.sum_exec_runtime = 0; 2841 p->se.sum_exec_runtime = 0;
2525 p->se.prev_sum_exec_runtime = 0; 2842 p->se.prev_sum_exec_runtime = 0;
2526 p->se.nr_migrations = 0; 2843 p->se.nr_migrations = 0;
2844 p->se.vruntime = 0;
2845 INIT_LIST_HEAD(&p->se.group_node);
2527 2846
2528#ifdef CONFIG_SCHEDSTATS 2847#ifdef CONFIG_SCHEDSTATS
2529 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2848 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2530#endif 2849#endif
2531 2850
2532 INIT_LIST_HEAD(&p->rt.run_list); 2851 INIT_LIST_HEAD(&p->rt.run_list);
2533 p->se.on_rq = 0;
2534 INIT_LIST_HEAD(&p->se.group_node);
2535 2852
2536#ifdef CONFIG_PREEMPT_NOTIFIERS 2853#ifdef CONFIG_PREEMPT_NOTIFIERS
2537 INIT_HLIST_HEAD(&p->preempt_notifiers); 2854 INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2541,8 +2858,9 @@ static void __sched_fork(struct task_struct *p)
2541/* 2858/*
2542 * fork()/clone()-time setup: 2859 * fork()/clone()-time setup:
2543 */ 2860 */
2544void sched_fork(struct task_struct *p, int clone_flags) 2861void sched_fork(struct task_struct *p)
2545{ 2862{
2863 unsigned long flags;
2546 int cpu = get_cpu(); 2864 int cpu = get_cpu();
2547 2865
2548 __sched_fork(p); 2866 __sched_fork(p);
@@ -2594,22 +2912,24 @@ void sched_fork(struct task_struct *p, int clone_flags)
2594 * 2912 *
2595 * Silence PROVE_RCU. 2913 * Silence PROVE_RCU.
2596 */ 2914 */
2597 rcu_read_lock(); 2915 raw_spin_lock_irqsave(&p->pi_lock, flags);
2598 set_task_cpu(p, cpu); 2916 set_task_cpu(p, cpu);
2599 rcu_read_unlock(); 2917 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2600 2918
2601#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2919#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2602 if (likely(sched_info_on())) 2920 if (likely(sched_info_on()))
2603 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2921 memset(&p->sched_info, 0, sizeof(p->sched_info));
2604#endif 2922#endif
2605#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 2923#if defined(CONFIG_SMP)
2606 p->oncpu = 0; 2924 p->on_cpu = 0;
2607#endif 2925#endif
2608#ifdef CONFIG_PREEMPT 2926#ifdef CONFIG_PREEMPT
2609 /* Want to start with kernel preemption disabled. */ 2927 /* Want to start with kernel preemption disabled. */
2610 task_thread_info(p)->preempt_count = 1; 2928 task_thread_info(p)->preempt_count = 1;
2611#endif 2929#endif
2930#ifdef CONFIG_SMP
2612 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2931 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2932#endif
2613 2933
2614 put_cpu(); 2934 put_cpu();
2615} 2935}
@@ -2621,41 +2941,31 @@ void sched_fork(struct task_struct *p, int clone_flags)
2621 * that must be done for every newly created context, then puts the task 2941 * that must be done for every newly created context, then puts the task
2622 * on the runqueue and wakes it. 2942 * on the runqueue and wakes it.
2623 */ 2943 */
2624void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2944void wake_up_new_task(struct task_struct *p)
2625{ 2945{
2626 unsigned long flags; 2946 unsigned long flags;
2627 struct rq *rq; 2947 struct rq *rq;
2628 int cpu __maybe_unused = get_cpu();
2629 2948
2949 raw_spin_lock_irqsave(&p->pi_lock, flags);
2630#ifdef CONFIG_SMP 2950#ifdef CONFIG_SMP
2631 rq = task_rq_lock(p, &flags);
2632 p->state = TASK_WAKING;
2633
2634 /* 2951 /*
2635 * Fork balancing, do it here and not earlier because: 2952 * Fork balancing, do it here and not earlier because:
2636 * - cpus_allowed can change in the fork path 2953 * - cpus_allowed can change in the fork path
2637 * - any previously selected cpu might disappear through hotplug 2954 * - any previously selected cpu might disappear through hotplug
2638 *
2639 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2640 * without people poking at ->cpus_allowed.
2641 */ 2955 */
2642 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); 2956 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
2643 set_task_cpu(p, cpu);
2644
2645 p->state = TASK_RUNNING;
2646 task_rq_unlock(rq, &flags);
2647#endif 2957#endif
2648 2958
2649 rq = task_rq_lock(p, &flags); 2959 rq = __task_rq_lock(p);
2650 activate_task(rq, p, 0); 2960 activate_task(rq, p, 0);
2651 trace_sched_wakeup_new(p, 1); 2961 p->on_rq = 1;
2962 trace_sched_wakeup_new(p, true);
2652 check_preempt_curr(rq, p, WF_FORK); 2963 check_preempt_curr(rq, p, WF_FORK);
2653#ifdef CONFIG_SMP 2964#ifdef CONFIG_SMP
2654 if (p->sched_class->task_woken) 2965 if (p->sched_class->task_woken)
2655 p->sched_class->task_woken(rq, p); 2966 p->sched_class->task_woken(rq, p);
2656#endif 2967#endif
2657 task_rq_unlock(rq, &flags); 2968 task_rq_unlock(rq, p, &flags);
2658 put_cpu();
2659} 2969}
2660 2970
2661#ifdef CONFIG_PREEMPT_NOTIFIERS 2971#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2733,9 +3043,12 @@ static inline void
2733prepare_task_switch(struct rq *rq, struct task_struct *prev, 3043prepare_task_switch(struct rq *rq, struct task_struct *prev,
2734 struct task_struct *next) 3044 struct task_struct *next)
2735{ 3045{
3046 sched_info_switch(prev, next);
3047 perf_event_task_sched_out(prev, next);
2736 fire_sched_out_preempt_notifiers(prev, next); 3048 fire_sched_out_preempt_notifiers(prev, next);
2737 prepare_lock_switch(rq, next); 3049 prepare_lock_switch(rq, next);
2738 prepare_arch_switch(next); 3050 prepare_arch_switch(next);
3051 trace_sched_switch(prev, next);
2739} 3052}
2740 3053
2741/** 3054/**
@@ -2879,7 +3192,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2879 struct mm_struct *mm, *oldmm; 3192 struct mm_struct *mm, *oldmm;
2880 3193
2881 prepare_task_switch(rq, prev, next); 3194 prepare_task_switch(rq, prev, next);
2882 trace_sched_switch(prev, next); 3195
2883 mm = next->mm; 3196 mm = next->mm;
2884 oldmm = prev->active_mm; 3197 oldmm = prev->active_mm;
2885 /* 3198 /*
@@ -2889,14 +3202,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
2889 */ 3202 */
2890 arch_start_context_switch(prev); 3203 arch_start_context_switch(prev);
2891 3204
2892 if (likely(!mm)) { 3205 if (!mm) {
2893 next->active_mm = oldmm; 3206 next->active_mm = oldmm;
2894 atomic_inc(&oldmm->mm_count); 3207 atomic_inc(&oldmm->mm_count);
2895 enter_lazy_tlb(oldmm, next); 3208 enter_lazy_tlb(oldmm, next);
2896 } else 3209 } else
2897 switch_mm(oldmm, mm, next); 3210 switch_mm(oldmm, mm, next);
2898 3211
2899 if (likely(!prev->mm)) { 3212 if (!prev->mm) {
2900 prev->active_mm = NULL; 3213 prev->active_mm = NULL;
2901 rq->prev_mm = oldmm; 3214 rq->prev_mm = oldmm;
2902 } 3215 }
@@ -3011,6 +3324,15 @@ static long calc_load_fold_active(struct rq *this_rq)
3011 return delta; 3324 return delta;
3012} 3325}
3013 3326
3327static unsigned long
3328calc_load(unsigned long load, unsigned long exp, unsigned long active)
3329{
3330 load *= exp;
3331 load += active * (FIXED_1 - exp);
3332 load += 1UL << (FSHIFT - 1);
3333 return load >> FSHIFT;
3334}
3335
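
calc_load() is an exponentially weighted moving average in FSHIFT-bit (2^11) fixed point; the version hoisted above also adds a half-unit rounding term before the shift, which the removed copy lacked. In conventional notation, with decay factor \(\lambda = \mathrm{exp}/\mathrm{FIXED\_1} \in (0,1)\):

\[
\mathrm{load}' \;=\; \left\lfloor \frac{\mathrm{load}\cdot \mathrm{exp} \;+\; \mathrm{active}\cdot(\mathrm{FIXED\_1}-\mathrm{exp}) \;+\; 2^{\mathrm{FSHIFT}-1}}{2^{\mathrm{FSHIFT}}} \right\rfloor \;\approx\; \lambda\,\mathrm{load} + (1-\lambda)\,\mathrm{active}.
\]
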
3014#ifdef CONFIG_NO_HZ 3336#ifdef CONFIG_NO_HZ
3015/* 3337/*
3016 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 3338 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
@@ -3040,6 +3362,128 @@ static long calc_load_fold_idle(void)
3040 3362
3041 return delta; 3363 return delta;
3042} 3364}
3365
3366/**
3367 * fixed_power_int - compute: x^n, in O(log n) time
3368 *
3369 * @x: base of the power
3370 * @frac_bits: fractional bits of @x
3371 * @n: power to raise @x to.
3372 *
3373 * By exploiting the relation between the definition of the natural power
3374 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
3375 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
3376 * (where: n_i \elem {0, 1}, the binary vector representing n),
3377 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
3378 * of course trivially computable in O(log_2 n), the length of our binary
3379 * vector.
3380 */
3381static unsigned long
3382fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
3383{
3384 unsigned long result = 1UL << frac_bits;
3385
3386 if (n) for (;;) {
3387 if (n & 1) {
3388 result *= x;
3389 result += 1UL << (frac_bits - 1);
3390 result >>= frac_bits;
3391 }
3392 n >>= 1;
3393 if (!n)
3394 break;
3395 x *= x;
3396 x += 1UL << (frac_bits - 1);
3397 x >>= frac_bits;
3398 }
3399
3400 return result;
3401}
3402
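
fixed_power_int() is ordinary exponentiation by squaring, with a rounding right-shift after every multiply to stay in FSHIFT-bit fixed point. A stand-alone user-space check of the same arithmetic, using the avenrun constants (FSHIFT, FIXED_1, EXP_1) as defined for the load average; the simplified loop trades the kernel's early break for brevity:

#include <stdio.h>

#define FSHIFT	11
#define FIXED_1	(1UL << FSHIFT)
#define EXP_1	1884		/* fixed-point 1/exp(5sec/1min) */

static unsigned long fixed_pow(unsigned long x, unsigned int frac_bits,
			       unsigned int n)
{
	unsigned long result = 1UL << frac_bits;	/* 1.0 */

	while (n) {
		if (n & 1) {
			result *= x;
			result += 1UL << (frac_bits - 1);	/* round */
			result >>= frac_bits;
		}
		n >>= 1;
		x *= x;
		x += 1UL << (frac_bits - 1);
		x >>= frac_bits;
	}
	return result;
}

int main(void)
{
	unsigned long e4 = fixed_pow(EXP_1, FSHIFT, 4);

	/* decay factor for 4 missed 5-second cycles of the 1-min average */
	printf("EXP_1^4 = %lu (%.3f)\n", e4, e4 / (double)FIXED_1);
	return 0;
}
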
3403/*
3404 * a1 = a0 * e + a * (1 - e)
3405 *
3406 * a2 = a1 * e + a * (1 - e)
3407 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
3408 * = a0 * e^2 + a * (1 - e) * (1 + e)
3409 *
3410 * a3 = a2 * e + a * (1 - e)
3411 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
3412 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
3413 *
3414 * ...
3415 *
3416 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
3417 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
3418 * = a0 * e^n + a * (1 - e^n)
3419 *
3420 * [1] application of the geometric series:
3421 *
3422 * n 1 - x^(n+1)
3423 * S_n := \Sum x^i = -------------
3424 * i=0 1 - x
3425 */
3426static unsigned long
3427calc_load_n(unsigned long load, unsigned long exp,
3428 unsigned long active, unsigned int n)
3429{
3430
3431 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
3432}
3433
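
The derivation in the comment collapses n applications of the single-step average into one, which is what lets calc_global_nohz() below catch up on many missed LOAD_FREQ periods with a single fixed-point power. Restated compactly (here \(e\) is the fixed-point decay factor, e.g. \(\mathrm{EXP\_1}/\mathrm{FIXED\_1}\), not Euler's number):

\[
a_n \;=\; a_0 e^n + a(1-e)\sum_{i=0}^{n-1} e^i \;=\; a_0 e^n + a\,(1 - e^n),
\qquad
\sum_{i=0}^{n} x^i = \frac{1 - x^{n+1}}{1 - x}.
\]
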
3434/*
3435 * NO_HZ can leave us missing all per-cpu ticks calling
3436 * calc_load_account_active(), but since an idle CPU folds its delta into
3437 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
3438 * in the pending idle delta if our idle period crossed a load cycle boundary.
3439 *
3440 * Once we've updated the global active value, we need to apply the exponential
3441 * weights adjusted to the number of cycles missed.
3442 */
3443static void calc_global_nohz(unsigned long ticks)
3444{
3445 long delta, active, n;
3446
3447 if (time_before(jiffies, calc_load_update))
3448 return;
3449
3450 /*
3451 * If we crossed a calc_load_update boundary, make sure to fold
3452 * any pending idle changes, the respective CPUs might have
3453 * missed the tick driven calc_load_account_active() update
3454 * due to NO_HZ.
3455 */
3456 delta = calc_load_fold_idle();
3457 if (delta)
3458 atomic_long_add(delta, &calc_load_tasks);
3459
3460 /*
3461 * If we were idle for multiple load cycles, apply them.
3462 */
3463 if (ticks >= LOAD_FREQ) {
3464 n = ticks / LOAD_FREQ;
3465
3466 active = atomic_long_read(&calc_load_tasks);
3467 active = active > 0 ? active * FIXED_1 : 0;
3468
3469 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
3470 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
3471 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
3472
3473 calc_load_update += n * LOAD_FREQ;
3474 }
3475
3476 /*
3477 * Its possible the remainder of the above division also crosses
3478 * a LOAD_FREQ period, the regular check in calc_global_load()
3479 * which comes after this will take care of that.
3480 *
3481 * Consider us being 11 ticks before a cycle completion, and us
3482 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
3483 * age us 4 cycles, and the test in calc_global_load() will
3484 * pick up the final one.
3485 */
3486}
3043#else 3487#else
3044static void calc_load_account_idle(struct rq *this_rq) 3488static void calc_load_account_idle(struct rq *this_rq)
3045{ 3489{
@@ -3049,6 +3493,10 @@ static inline long calc_load_fold_idle(void)
3049{ 3493{
3050 return 0; 3494 return 0;
3051} 3495}
3496
3497static void calc_global_nohz(unsigned long ticks)
3498{
3499}
3052#endif 3500#endif
3053 3501
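
A worked instance of the catch-up, assuming a 1-minute average of 2.00 (avenrun[0] = 2 * FIXED_1 = 4096) and an idle period spanning n = 4 load cycles (active = 0):

\[
\mathrm{fixed\_power\_int}(\mathrm{EXP\_1}, 11, 4) = 1466 \approx 0.716\cdot\mathrm{FIXED\_1},
\qquad
\mathrm{avenrun}[0] = \left\lfloor\frac{4096\cdot 1466 + 2^{10}}{2^{11}}\right\rfloor = 2932 \approx 1.43.
\]

Four missed cycles thus decay the 1-minute figure from 2.00 to about 1.43 in one step, within fixed-point rounding of four successive calc_load() iterations (which give 2933 rather than 2932).
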
3054/** 3502/**
@@ -3066,24 +3514,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3066 loads[2] = (avenrun[2] + offset) << shift; 3514 loads[2] = (avenrun[2] + offset) << shift;
3067} 3515}
3068 3516
3069static unsigned long
3070calc_load(unsigned long load, unsigned long exp, unsigned long active)
3071{
3072 load *= exp;
3073 load += active * (FIXED_1 - exp);
3074 return load >> FSHIFT;
3075}
3076
3077/* 3517/*
3078 * calc_load - update the avenrun load estimates 10 ticks after the 3518 * calc_load - update the avenrun load estimates 10 ticks after the
3079 * CPUs have updated calc_load_tasks. 3519 * CPUs have updated calc_load_tasks.
3080 */ 3520 */
3081void calc_global_load(void) 3521void calc_global_load(unsigned long ticks)
3082{ 3522{
3083 unsigned long upd = calc_load_update + 10;
3084 long active; 3523 long active;
3085 3524
3086 if (time_before(jiffies, upd)) 3525 calc_global_nohz(ticks);
3526
3527 if (time_before(jiffies, calc_load_update + 10))
3087 return; 3528 return;
3088 3529
3089 active = atomic_long_read(&calc_load_tasks); 3530 active = atomic_long_read(&calc_load_tasks);
@@ -3244,27 +3685,22 @@ void sched_exec(void)
3244{ 3685{
3245 struct task_struct *p = current; 3686 struct task_struct *p = current;
3246 unsigned long flags; 3687 unsigned long flags;
3247 struct rq *rq;
3248 int dest_cpu; 3688 int dest_cpu;
3249 3689
3250 rq = task_rq_lock(p, &flags); 3690 raw_spin_lock_irqsave(&p->pi_lock, flags);
3251 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); 3691 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
3252 if (dest_cpu == smp_processor_id()) 3692 if (dest_cpu == smp_processor_id())
3253 goto unlock; 3693 goto unlock;
3254 3694
3255 /* 3695 if (likely(cpu_active(dest_cpu))) {
3256 * select_task_rq() can race against ->cpus_allowed
3257 */
3258 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3259 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
3260 struct migration_arg arg = { p, dest_cpu }; 3696 struct migration_arg arg = { p, dest_cpu };
3261 3697
3262 task_rq_unlock(rq, &flags); 3698 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3263 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 3699 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3264 return; 3700 return;
3265 } 3701 }
3266unlock: 3702unlock:
3267 task_rq_unlock(rq, &flags); 3703 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3268} 3704}
3269 3705
3270#endif 3706#endif
@@ -3285,7 +3721,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3285 3721
3286 if (task_current(rq, p)) { 3722 if (task_current(rq, p)) {
3287 update_rq_clock(rq); 3723 update_rq_clock(rq);
3288 ns = rq->clock - p->se.exec_start; 3724 ns = rq->clock_task - p->se.exec_start;
3289 if ((s64)ns < 0) 3725 if ((s64)ns < 0)
3290 ns = 0; 3726 ns = 0;
3291 } 3727 }
@@ -3301,7 +3737,7 @@ unsigned long long task_delta_exec(struct task_struct *p)
3301 3737
3302 rq = task_rq_lock(p, &flags); 3738 rq = task_rq_lock(p, &flags);
3303 ns = do_task_delta_exec(p, rq); 3739 ns = do_task_delta_exec(p, rq);
3304 task_rq_unlock(rq, &flags); 3740 task_rq_unlock(rq, p, &flags);
3305 3741
3306 return ns; 3742 return ns;
3307} 3743}
@@ -3319,7 +3755,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3319 3755
3320 rq = task_rq_lock(p, &flags); 3756 rq = task_rq_lock(p, &flags);
3321 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 3757 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3322 task_rq_unlock(rq, &flags); 3758 task_rq_unlock(rq, p, &flags);
3323 3759
3324 return ns; 3760 return ns;
3325} 3761}
@@ -3343,7 +3779,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
3343 rq = task_rq_lock(p, &flags); 3779 rq = task_rq_lock(p, &flags);
3344 thread_group_cputime(p, &totals); 3780 thread_group_cputime(p, &totals);
3345 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 3781 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3346 task_rq_unlock(rq, &flags); 3782 task_rq_unlock(rq, p, &flags);
3347 3783
3348 return ns; 3784 return ns;
3349} 3785}
@@ -3408,6 +3844,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
3408} 3844}
3409 3845
3410/* 3846/*
3847 * Account system cpu time to a process and desired cpustat field
3848 * @p: the process that the cpu time gets accounted to
3849 * @cputime: the cpu time spent in kernel space since the last update
3850 * @cputime_scaled: cputime scaled by cpu frequency
3851 * @target_cputime64: pointer to cpustat field that has to be updated
3852 */
3853static inline
3854void __account_system_time(struct task_struct *p, cputime_t cputime,
3855 cputime_t cputime_scaled, cputime64_t *target_cputime64)
3856{
3857 cputime64_t tmp = cputime_to_cputime64(cputime);
3858
3859 /* Add system time to process. */
3860 p->stime = cputime_add(p->stime, cputime);
3861 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3862 account_group_system_time(p, cputime);
3863
3864 /* Add system time to cpustat. */
3865 *target_cputime64 = cputime64_add(*target_cputime64, tmp);
3866 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3867
3868 /* Account for system time used */
3869 acct_update_integrals(p);
3870}
3871
3872/*
3411 * Account system cpu time to a process. 3873 * Account system cpu time to a process.
3412 * @p: the process that the cpu time gets accounted to 3874 * @p: the process that the cpu time gets accounted to
3413 * @hardirq_offset: the offset to subtract from hardirq_count() 3875 * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3418,36 +3880,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3418 cputime_t cputime, cputime_t cputime_scaled) 3880 cputime_t cputime, cputime_t cputime_scaled)
3419{ 3881{
3420 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3882 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3421 cputime64_t tmp; 3883 cputime64_t *target_cputime64;
3422 3884
3423 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 3885 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3424 account_guest_time(p, cputime, cputime_scaled); 3886 account_guest_time(p, cputime, cputime_scaled);
3425 return; 3887 return;
3426 } 3888 }
3427 3889
3428 /* Add system time to process. */
3429 p->stime = cputime_add(p->stime, cputime);
3430 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3431 account_group_system_time(p, cputime);
3432
3433 /* Add system time to cpustat. */
3434 tmp = cputime_to_cputime64(cputime);
3435 if (hardirq_count() - hardirq_offset) 3890 if (hardirq_count() - hardirq_offset)
3436 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3891 target_cputime64 = &cpustat->irq;
3437 else if (softirq_count()) 3892 else if (in_serving_softirq())
3438 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3893 target_cputime64 = &cpustat->softirq;
3439 else 3894 else
3440 cpustat->system = cputime64_add(cpustat->system, tmp); 3895 target_cputime64 = &cpustat->system;
3441
3442 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3443 3896
3444 /* Account for system time used */ 3897 __account_system_time(p, cputime, cputime_scaled, target_cputime64);
3445 acct_update_integrals(p);
3446} 3898}
3447 3899
3448/* 3900/*
3449 * Account for involuntary wait time. 3901 * Account for involuntary wait time.
3450 * @steal: the cpu time spent in involuntary wait 3902 * @cputime: the cpu time spent in involuntary wait
3451 */ 3903 */
3452void account_steal_time(cputime_t cputime) 3904void account_steal_time(cputime_t cputime)
3453{ 3905{
@@ -3475,6 +3927,73 @@ void account_idle_time(cputime_t cputime)
3475 3927
3476#ifndef CONFIG_VIRT_CPU_ACCOUNTING 3928#ifndef CONFIG_VIRT_CPU_ACCOUNTING
3477 3929
3930#ifdef CONFIG_IRQ_TIME_ACCOUNTING
3931/*
3932 * Account a tick to a process and cpustat
3933 * @p: the process that the cpu time gets accounted to
3934 * @user_tick: is the tick from userspace
3935 * @rq: the pointer to rq
3936 *
3937 * Tick demultiplexing follows the order
3938 * - pending hardirq update
3939 * - pending softirq update
3940 * - user_time
3941 * - idle_time
3942 * - system time
3943 * - check for guest_time
3944 * - else account as system_time
3945 *
3946 * Check for hardirq is done both for system and user time as there is
3947 * no timer going off while we are on hardirq and hence we may never get an
3948 * opportunity to update it solely in system time.
3949 * p->stime and friends are only updated on system time and not on irq
3950 * softirq as those do not count in task exec_runtime any more.
3951 */
3952static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3953 struct rq *rq)
3954{
3955 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3956 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
3957 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3958
3959 if (irqtime_account_hi_update()) {
3960 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3961 } else if (irqtime_account_si_update()) {
3962 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3963 } else if (this_cpu_ksoftirqd() == p) {
3964 /*
3965	 * ksoftirqd time does not get accounted in cpu_softirq_time.
3966 * So, we have to handle it separately here.
3967 * Also, p->stime needs to be updated for ksoftirqd.
3968 */
3969 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3970 &cpustat->softirq);
3971 } else if (user_tick) {
3972 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3973 } else if (p == rq->idle) {
3974 account_idle_time(cputime_one_jiffy);
3975 } else if (p->flags & PF_VCPU) { /* System time or guest time */
3976 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
3977 } else {
3978 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3979 &cpustat->system);
3980 }
3981}
3982
3983static void irqtime_account_idle_ticks(int ticks)
3984{
3985 int i;
3986 struct rq *rq = this_rq();
3987
3988 for (i = 0; i < ticks; i++)
3989 irqtime_account_process_tick(current, 0, rq);
3990}
3991#else /* CONFIG_IRQ_TIME_ACCOUNTING */
3992static void irqtime_account_idle_ticks(int ticks) {}
3993static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3994 struct rq *rq) {}
3995#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
3996
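
The demultiplexing order described in the comment above is easiest to see as a pure decision function. A hedged user-space restatement; the struct, the enum and the boolean flags are invented for the example and stand in for irqtime_account_hi_update(), irqtime_account_si_update(), this_cpu_ksoftirqd(), rq->idle and PF_VCPU:

#include <stdbool.h>

enum tick_bucket { BUCKET_HARDIRQ, BUCKET_SOFTIRQ, BUCKET_USER,
		   BUCKET_IDLE, BUCKET_GUEST, BUCKET_SYSTEM };

struct tick_ctx {
	bool pending_hardirq;	/* irqtime_account_hi_update()	*/
	bool pending_softirq;	/* irqtime_account_si_update()	*/
	bool is_ksoftirqd;	/* this_cpu_ksoftirqd() == p	*/
	bool user_tick;
	bool is_idle;		/* p == rq->idle		*/
	bool is_guest;		/* p->flags & PF_VCPU		*/
};

static enum tick_bucket classify_tick(const struct tick_ctx *c)
{
	if (c->pending_hardirq)	return BUCKET_HARDIRQ;
	if (c->pending_softirq)	return BUCKET_SOFTIRQ;
	if (c->is_ksoftirqd)	return BUCKET_SOFTIRQ;	/* also charged to p->stime */
	if (c->user_tick)	return BUCKET_USER;
	if (c->is_idle)		return BUCKET_IDLE;
	if (c->is_guest)	return BUCKET_GUEST;
	return BUCKET_SYSTEM;
}
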
3478/* 3997/*
3479 * Account a single tick of cpu time. 3998 * Account a single tick of cpu time.
3480 * @p: the process that the cpu time gets accounted to 3999 * @p: the process that the cpu time gets accounted to
@@ -3485,6 +4004,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
3485 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 4004 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3486 struct rq *rq = this_rq(); 4005 struct rq *rq = this_rq();
3487 4006
4007 if (sched_clock_irqtime) {
4008 irqtime_account_process_tick(p, user_tick, rq);
4009 return;
4010 }
4011
3488 if (user_tick) 4012 if (user_tick)
3489 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 4013 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3490 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 4014 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3510,6 +4034,12 @@ void account_steal_ticks(unsigned long ticks)
3510 */ 4034 */
3511void account_idle_ticks(unsigned long ticks) 4035void account_idle_ticks(unsigned long ticks)
3512{ 4036{
4037
4038 if (sched_clock_irqtime) {
4039 irqtime_account_idle_ticks(ticks);
4040 return;
4041 }
4042
3513 account_idle_time(jiffies_to_cputime(ticks)); 4043 account_idle_time(jiffies_to_cputime(ticks));
3514} 4044}
3515 4045
@@ -3603,9 +4133,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3603/* 4133/*
3604 * This function gets called by the timer code, with HZ frequency. 4134 * This function gets called by the timer code, with HZ frequency.
3605 * We call it with interrupts disabled. 4135 * We call it with interrupts disabled.
3606 *
3607 * It also gets called by the fork code, when changing the parent's
3608 * timeslices.
3609 */ 4136 */
3610void scheduler_tick(void) 4137void scheduler_tick(void)
3611{ 4138{
@@ -3627,7 +4154,7 @@ void scheduler_tick(void)
3627 4154
3628 raw_spin_unlock(&rq->lock); 4155 raw_spin_unlock(&rq->lock);
3629 4156
3630 perf_event_task_tick(curr); 4157 perf_event_task_tick();
3631 4158
3632#ifdef CONFIG_SMP 4159#ifdef CONFIG_SMP
3633 rq->idle_at_tick = idle_cpu(cpu); 4160 rq->idle_at_tick = idle_cpu(cpu);
@@ -3733,19 +4260,12 @@ static inline void schedule_debug(struct task_struct *prev)
3733 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4260 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3734 4261
3735 schedstat_inc(this_rq(), sched_count); 4262 schedstat_inc(this_rq(), sched_count);
3736#ifdef CONFIG_SCHEDSTATS
3737 if (unlikely(prev->lock_depth >= 0)) {
3738 schedstat_inc(this_rq(), bkl_count);
3739 schedstat_inc(prev, sched_info.bkl_count);
3740 }
3741#endif
3742} 4263}
3743 4264
3744static void put_prev_task(struct rq *rq, struct task_struct *prev) 4265static void put_prev_task(struct rq *rq, struct task_struct *prev)
3745{ 4266{
3746 if (prev->se.on_rq) 4267 if (prev->on_rq || rq->skip_clock_update < 0)
3747 update_rq_clock(rq); 4268 update_rq_clock(rq);
3748 rq->skip_clock_update = 0;
3749 prev->sched_class->put_prev_task(rq, prev); 4269 prev->sched_class->put_prev_task(rq, prev);
3750} 4270}
3751 4271
@@ -3776,17 +4296,13 @@ pick_next_task(struct rq *rq)
3776 } 4296 }
3777 */ 4297 */
3778 4298
3779 class = sched_class_highest; 4299 for_each_class(class) {
3780 for ( ; ; ) {
3781 p = class->pick_next_task(rq); 4300 p = class->pick_next_task(rq);
3782 if (p) 4301 if (p)
3783 return p; 4302 return p;
3784 /*
3785 * Will never be NULL as the idle class always
3786 * returns a non-NULL p:
3787 */
3788 class = class->next;
3789 } 4303 }
4304
4305 BUG(); /* the idle class will always have a runnable task */
3790} 4306}
3791 4307
3792/* 4308/*
@@ -3807,8 +4323,10 @@ need_resched:
3807 rcu_note_context_switch(cpu); 4323 rcu_note_context_switch(cpu);
3808 prev = rq->curr; 4324 prev = rq->curr;
3809 4325
3810 release_kernel_lock(prev); 4326 /* LITMUS^RT: quickly re-evaluate the scheduling decision
3811need_resched_nonpreemptible:			 4327	 * if the previous one is no longer valid after the context switch.
4328 */
4329litmus_need_resched_nonpreemptible:
3812 TS_SCHED_START; 4330 TS_SCHED_START;
3813 sched_trace_task_switch_away(prev); 4331 sched_trace_task_switch_away(prev);
3814 4332
@@ -3818,18 +4336,19 @@ need_resched_nonpreemptible:
3818 hrtick_clear(rq); 4336 hrtick_clear(rq);
3819 4337
3820 raw_spin_lock_irq(&rq->lock); 4338 raw_spin_lock_irq(&rq->lock);
3821 clear_tsk_need_resched(prev);
3822 4339
3823 switch_count = &prev->nivcsw; 4340 switch_count = &prev->nivcsw;
3824 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 4341 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3825 if (unlikely(signal_pending_state(prev->state, prev))) { 4342 if (unlikely(signal_pending_state(prev->state, prev))) {
3826 prev->state = TASK_RUNNING; 4343 prev->state = TASK_RUNNING;
3827 } else { 4344 } else {
4345 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4346 prev->on_rq = 0;
4347
3828 /* 4348 /*
3829 * If a worker is going to sleep, notify and 4349 * If a worker went to sleep, notify and ask workqueue
3830 * ask workqueue whether it wants to wake up a 4350 * whether it wants to wake up a task to maintain
3831 * task to maintain concurrency. If so, wake 4351 * concurrency.
3832 * up the task.
3833 */ 4352 */
3834 if (prev->flags & PF_WQ_WORKER) { 4353 if (prev->flags & PF_WQ_WORKER) {
3835 struct task_struct *to_wakeup; 4354 struct task_struct *to_wakeup;
@@ -3838,7 +4357,16 @@ need_resched_nonpreemptible:
3838 if (to_wakeup) 4357 if (to_wakeup)
3839 try_to_wake_up_local(to_wakeup); 4358 try_to_wake_up_local(to_wakeup);
3840 } 4359 }
3841 deactivate_task(rq, prev, DEQUEUE_SLEEP); 4360
4361 /*
4362 * If we are going to sleep and we have plugged IO
4363 * queued, make sure to submit it to avoid deadlocks.
4364 */
4365 if (blk_needs_flush_plug(prev)) {
4366 raw_spin_unlock(&rq->lock);
4367 blk_schedule_flush_plug(prev);
4368 raw_spin_lock(&rq->lock);
4369 }
3842 } 4370 }
3843 switch_count = &prev->nvcsw; 4371 switch_count = &prev->nvcsw;
3844 } 4372 }
@@ -3850,11 +4378,10 @@ need_resched_nonpreemptible:
3850 4378
3851 put_prev_task(rq, prev); 4379 put_prev_task(rq, prev);
3852 next = pick_next_task(rq); 4380 next = pick_next_task(rq);
4381 clear_tsk_need_resched(prev);
4382 rq->skip_clock_update = 0;
3853 4383
3854 if (likely(prev != next)) { 4384 if (likely(prev != next)) {
3855 sched_info_switch(prev, next);
3856 perf_event_task_sched_out(prev, next);
3857
3858 rq->nr_switches++; 4385 rq->nr_switches++;
3859 rq->curr = next; 4386 rq->curr = next;
3860 ++*switch_count; 4387 ++*switch_count;
@@ -3880,8 +4407,8 @@ need_resched_nonpreemptible:
3880 4407
3881 post_schedule(rq); 4408 post_schedule(rq);
3882 4409
3883 if (sched_state_validate_switch() || unlikely(reacquire_kernel_lock(prev))) 4410 if (sched_state_validate_switch())
3884 goto need_resched_nonpreemptible; 4411 goto litmus_need_resched_nonpreemptible;
3885 4412
3886 preempt_enable_no_resched(); 4413 preempt_enable_no_resched();
3887 if (need_resched()) 4414 if (need_resched())
@@ -3892,70 +4419,53 @@ need_resched_nonpreemptible:
3892EXPORT_SYMBOL(schedule); 4419EXPORT_SYMBOL(schedule);
3893 4420
3894#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 4421#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4422
4423static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4424{
4425 bool ret = false;
4426
4427 rcu_read_lock();
4428 if (lock->owner != owner)
4429 goto fail;
4430
4431 /*
4432 * Ensure we emit the owner->on_cpu, dereference _after_ checking
4433 * lock->owner still matches owner, if that fails, owner might
4434 * point to free()d memory, if it still matches, the rcu_read_lock()
4435 * ensures the memory stays valid.
4436 */
4437 barrier();
4438
4439 ret = owner->on_cpu;
4440fail:
4441 rcu_read_unlock();
4442
4443 return ret;
4444}
4445
3895/* 4446/*
3896 * Look out! "owner" is an entirely speculative pointer 4447 * Look out! "owner" is an entirely speculative pointer
3897 * access and not reliable. 4448 * access and not reliable.
3898 */ 4449 */
3899int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) 4450int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
3900{ 4451{
3901 unsigned int cpu;
3902 struct rq *rq;
3903
3904 if (!sched_feat(OWNER_SPIN)) 4452 if (!sched_feat(OWNER_SPIN))
3905 return 0; 4453 return 0;
3906 4454
3907#ifdef CONFIG_DEBUG_PAGEALLOC 4455 while (owner_running(lock, owner)) {
3908 /* 4456 if (need_resched())
3909 * Need to access the cpu field knowing that 4457 return 0;
3910 * DEBUG_PAGEALLOC could have unmapped it if
3911 * the mutex owner just released it and exited.
3912 */
3913 if (probe_kernel_address(&owner->cpu, cpu))
3914 return 0;
3915#else
3916 cpu = owner->cpu;
3917#endif
3918 4458
3919 /* 4459 arch_mutex_cpu_relax();
3920 * Even if the access succeeded (likely case), 4460 }
3921 * the cpu field may no longer be valid.
3922 */
3923 if (cpu >= nr_cpumask_bits)
3924 return 0;
3925 4461
3926 /* 4462 /*
3927 * We need to validate that we can do a 4463 * If the owner changed to another task there is likely
3928 * get_cpu() and that we have the percpu area. 4464 * heavy contention, stop spinning.
3929 */ 4465 */
3930 if (!cpu_online(cpu)) 4466 if (lock->owner)
3931 return 0; 4467 return 0;
3932 4468
3933 rq = cpu_rq(cpu);
3934
3935 for (;;) {
3936 /*
3937 * Owner changed, break to re-assess state.
3938 */
3939 if (lock->owner != owner) {
3940 /*
3941 * If the lock has switched to a different owner,
3942 * we likely have heavy contention. Return 0 to quit
3943 * optimistic spinning and not contend further:
3944 */
3945 if (lock->owner)
3946 return 0;
3947 break;
3948 }
3949
3950 /*
3951 * Is that owner really running on that cpu?
3952 */
3953 if (task_thread_info(rq->curr) != owner || need_resched())
3954 return 0;
3955
3956 cpu_relax();
3957 }
3958
3959 return 1; 4469 return 1;
3960} 4470}
3961#endif 4471#endif
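
owner_running() spells out the ordering requirement: re-check lock->owner before dereferencing owner->on_cpu, under rcu_read_lock(), so a freed owner is never followed. A user-space sketch of the resulting spin policy with invented types; it elides the RCU protection and the compiler barrier that make the dereference safe in the kernel:

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <sched.h>

struct owner {
	_Atomic bool on_cpu;
};

struct lock {
	_Atomic(struct owner *) owner;
};

/* Spin while the same owner is still running; return true only if the
 * lock was released, i.e. it is worth another acquisition attempt. */
static bool spin_on_owner(struct lock *l, struct owner *owner,
			  bool (*need_resched)(void))
{
	while (atomic_load(&l->owner) == owner &&
	       atomic_load(&owner->on_cpu)) {
		if (need_resched())
			return false;
		sched_yield();		/* stand-in for arch_mutex_cpu_relax() */
	}

	/* Still owned (by the old or a new owner): heavy contention or a
	 * sleeping owner, so stop spinning and block instead. */
	return atomic_load(&l->owner) == NULL;
}
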
@@ -4085,6 +4595,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
4085{ 4595{
4086 __wake_up_common(q, mode, 1, 0, key); 4596 __wake_up_common(q, mode, 1, 0, key);
4087} 4597}
4598EXPORT_SYMBOL_GPL(__wake_up_locked_key);
4088 4599
4089/** 4600/**
4090 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 4601 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
@@ -4276,7 +4787,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
4276 * This waits for either a completion of a specific task to be signaled or for a 4787 * This waits for either a completion of a specific task to be signaled or for a
4277 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4788 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4278 */ 4789 */
4279unsigned long __sched 4790long __sched
4280wait_for_completion_interruptible_timeout(struct completion *x, 4791wait_for_completion_interruptible_timeout(struct completion *x,
4281 unsigned long timeout) 4792 unsigned long timeout)
4282{ 4793{
@@ -4309,7 +4820,7 @@ EXPORT_SYMBOL(wait_for_completion_killable);
4309 * signaled or for a specified timeout to expire. It can be 4820 * signaled or for a specified timeout to expire. It can be
4310 * interrupted by a kill signal. The timeout is in jiffies. 4821 * interrupted by a kill signal. The timeout is in jiffies.
4311 */ 4822 */
4312unsigned long __sched 4823long __sched
4313wait_for_completion_killable_timeout(struct completion *x, 4824wait_for_completion_killable_timeout(struct completion *x,
4314 unsigned long timeout) 4825 unsigned long timeout)
4315{ 4826{
@@ -4425,18 +4936,18 @@ EXPORT_SYMBOL(sleep_on_timeout);
4425 */ 4936 */
4426void rt_mutex_setprio(struct task_struct *p, int prio) 4937void rt_mutex_setprio(struct task_struct *p, int prio)
4427{ 4938{
4428 unsigned long flags;
4429 int oldprio, on_rq, running; 4939 int oldprio, on_rq, running;
4430 struct rq *rq; 4940 struct rq *rq;
4431 const struct sched_class *prev_class; 4941 const struct sched_class *prev_class;
4432 4942
4433 BUG_ON(prio < 0 || prio > MAX_PRIO); 4943 BUG_ON(prio < 0 || prio > MAX_PRIO);
4434 4944
4435 rq = task_rq_lock(p, &flags); 4945 rq = __task_rq_lock(p);
4436 4946
4947 trace_sched_pi_setprio(p, prio);
4437 oldprio = p->prio; 4948 oldprio = p->prio;
4438 prev_class = p->sched_class; 4949 prev_class = p->sched_class;
4439 on_rq = p->se.on_rq; 4950 on_rq = p->on_rq;
4440 running = task_current(rq, p); 4951 running = task_current(rq, p);
4441 if (on_rq) 4952 if (on_rq)
4442 dequeue_task(rq, p, 0); 4953 dequeue_task(rq, p, 0);
@@ -4452,12 +4963,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4452 4963
4453 if (running) 4964 if (running)
4454 p->sched_class->set_curr_task(rq); 4965 p->sched_class->set_curr_task(rq);
4455 if (on_rq) { 4966 if (on_rq)
4456 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4967 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4457 4968
4458 check_class_changed(rq, p, prev_class, oldprio, running); 4969 check_class_changed(rq, p, prev_class, oldprio);
4459 } 4970 __task_rq_unlock(rq);
4460 task_rq_unlock(rq, &flags);
4461} 4971}
4462 4972
4463#endif 4973#endif
@@ -4485,7 +4995,7 @@ void set_user_nice(struct task_struct *p, long nice)
4485 p->static_prio = NICE_TO_PRIO(nice); 4995 p->static_prio = NICE_TO_PRIO(nice);
4486 goto out_unlock; 4996 goto out_unlock;
4487 } 4997 }
4488 on_rq = p->se.on_rq; 4998 on_rq = p->on_rq;
4489 if (on_rq) 4999 if (on_rq)
4490 dequeue_task(rq, p, 0); 5000 dequeue_task(rq, p, 0);
4491 5001
@@ -4505,7 +5015,7 @@ void set_user_nice(struct task_struct *p, long nice)
4505 resched_task(rq->curr); 5015 resched_task(rq->curr);
4506 } 5016 }
4507out_unlock: 5017out_unlock:
4508 task_rq_unlock(rq, &flags); 5018 task_rq_unlock(rq, p, &flags);
4509} 5019}
4510EXPORT_SYMBOL(set_user_nice); 5020EXPORT_SYMBOL(set_user_nice);
4511 5021
@@ -4619,8 +5129,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
4619static void 5129static void
4620__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 5130__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4621{ 5131{
4622 BUG_ON(p->se.on_rq);
4623
4624 p->policy = policy; 5132 p->policy = policy;
4625 p->rt_priority = prio; 5133 p->rt_priority = prio;
4626 p->normal_prio = normal_prio(p); 5134 p->normal_prio = normal_prio(p);
@@ -4645,14 +5153,17 @@ static bool check_same_owner(struct task_struct *p)
4645 5153
4646 rcu_read_lock(); 5154 rcu_read_lock();
4647 pcred = __task_cred(p); 5155 pcred = __task_cred(p);
4648 match = (cred->euid == pcred->euid || 5156 if (cred->user->user_ns == pcred->user->user_ns)
4649 cred->euid == pcred->uid); 5157 match = (cred->euid == pcred->euid ||
5158 cred->euid == pcred->uid);
5159 else
5160 match = false;
4650 rcu_read_unlock(); 5161 rcu_read_unlock();
4651 return match; 5162 return match;
4652} 5163}
4653 5164
4654static int __sched_setscheduler(struct task_struct *p, int policy, 5165static int __sched_setscheduler(struct task_struct *p, int policy,
4655 struct sched_param *param, bool user) 5166 const struct sched_param *param, bool user)
4656{ 5167{
4657 int retval, oldprio, oldpolicy = -1, on_rq, running; 5168 int retval, oldprio, oldpolicy = -1, on_rq, running;
4658 unsigned long flags; 5169 unsigned long flags;
@@ -4708,12 +5219,15 @@ recheck:
4708 param->sched_priority > rlim_rtprio) 5219 param->sched_priority > rlim_rtprio)
4709 return -EPERM; 5220 return -EPERM;
4710 } 5221 }
5222
4711 /* 5223 /*
4712 * Like positive nice levels, dont allow tasks to 5224 * Treat SCHED_IDLE as nice 20. Only allow a switch to
4713 * move out of SCHED_IDLE either: 5225 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
4714 */ 5226 */
4715 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) 5227 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
4716 return -EPERM; 5228 if (!can_nice(p, TASK_NICE(p)))
5229 return -EPERM;
5230 }
4717 5231
4718 /* can't change other user's priorities */ 5232 /* can't change other user's priorities */
4719 if (!check_same_owner(p)) 5233 if (!check_same_owner(p))
@@ -4725,7 +5239,7 @@ recheck:
4725 } 5239 }
4726 5240
4727 if (user) { 5241 if (user) {
4728 retval = security_task_setscheduler(p, policy, param); 5242 retval = security_task_setscheduler(p);
4729 if (retval) 5243 if (retval)
4730 return retval; 5244 return retval;
4731 } 5245 }
@@ -4739,13 +5253,30 @@ recheck:
4739 /* 5253 /*
4740 * make sure no PI-waiters arrive (or leave) while we are 5254 * make sure no PI-waiters arrive (or leave) while we are
4741 * changing the priority of the task: 5255 * changing the priority of the task:
5256 *
5257 * To be able to change p->policy safely, the appropriate
5258 * runqueue lock must be held.
4742 */ 5259 */
4743 raw_spin_lock_irqsave(&p->pi_lock, flags); 5260 rq = task_rq_lock(p, &flags);
5261
4744 /* 5262 /*
4745	 * To be able to change p->policy safely, the apropriate	 5263	 * Changing the policy of the stop threads is a very bad idea
4746 * runqueue lock must be held.
4747 */ 5264 */
4748 rq = __task_rq_lock(p); 5265 if (p == rq->stop) {
5266 task_rq_unlock(rq, p, &flags);
5267 return -EINVAL;
5268 }
5269
5270 /*
5271 * If not changing anything there's no need to proceed further:
5272 */
5273 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
5274 param->sched_priority == p->rt_priority))) {
5275
5276 __task_rq_unlock(rq);
5277 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5278 return 0;
5279 }
4749 5280
4750#ifdef CONFIG_RT_GROUP_SCHED 5281#ifdef CONFIG_RT_GROUP_SCHED
4751 if (user) { 5282 if (user) {
@@ -4754,9 +5285,9 @@ recheck:
4754 * assigned. 5285 * assigned.
4755 */ 5286 */
4756 if (rt_bandwidth_enabled() && rt_policy(policy) && 5287 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4757 task_group(p)->rt_bandwidth.rt_runtime == 0) { 5288 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4758 __task_rq_unlock(rq); 5289 !task_group_is_autogroup(task_group(p))) {
4759 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5290 task_rq_unlock(rq, p, &flags);
4760 return -EPERM; 5291 return -EPERM;
4761 } 5292 }
4762 } 5293 }
@@ -4765,11 +5296,10 @@ recheck:
4765 /* recheck policy now with rq lock held */ 5296 /* recheck policy now with rq lock held */
4766 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 5297 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4767 policy = oldpolicy = -1; 5298 policy = oldpolicy = -1;
4768 __task_rq_unlock(rq); 5299 task_rq_unlock(rq, p, &flags);
4769 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4770 goto recheck; 5300 goto recheck;
4771 } 5301 }
4772 on_rq = p->se.on_rq; 5302 on_rq = p->on_rq;
4773 running = task_current(rq, p); 5303 running = task_current(rq, p);
4774 if (on_rq) 5304 if (on_rq)
4775 deactivate_task(rq, p, 0); 5305 deactivate_task(rq, p, 0);
@@ -4793,13 +5323,11 @@ recheck:
4793 5323
4794 if (running) 5324 if (running)
4795 p->sched_class->set_curr_task(rq); 5325 p->sched_class->set_curr_task(rq);
4796 if (on_rq) { 5326 if (on_rq)
4797 activate_task(rq, p, 0); 5327 activate_task(rq, p, 0);
4798 5328
4799 check_class_changed(rq, p, prev_class, oldprio, running); 5329 check_class_changed(rq, p, prev_class, oldprio);
4800 } 5330 task_rq_unlock(rq, p, &flags);
4801 __task_rq_unlock(rq);
4802 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4803 5331
4804 rt_mutex_adjust_pi(p); 5332 rt_mutex_adjust_pi(p);
4805 5333
@@ -4815,7 +5343,7 @@ recheck:
4815 * NOTE that the task may be already dead. 5343 * NOTE that the task may be already dead.
4816 */ 5344 */
4817int sched_setscheduler(struct task_struct *p, int policy, 5345int sched_setscheduler(struct task_struct *p, int policy,
4818 struct sched_param *param) 5346 const struct sched_param *param)
4819{ 5347{
4820 return __sched_setscheduler(p, policy, param, true); 5348 return __sched_setscheduler(p, policy, param, true);
4821} 5349}
@@ -4833,7 +5361,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
4833 * but our caller might not have that capability. 5361 * but our caller might not have that capability.
4834 */ 5362 */
4835int sched_setscheduler_nocheck(struct task_struct *p, int policy, 5363int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4836 struct sched_param *param) 5364 const struct sched_param *param)
4837{ 5365{
4838 return __sched_setscheduler(p, policy, param, false); 5366 return __sched_setscheduler(p, policy, param, false);
4839} 5367}
@@ -4980,16 +5508,16 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4980 goto out_free_cpus_allowed; 5508 goto out_free_cpus_allowed;
4981 } 5509 }
4982 retval = -EPERM; 5510 retval = -EPERM;
4983 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 5511 if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
4984 goto out_unlock; 5512 goto out_unlock;
4985 5513
4986 retval = security_task_setscheduler(p, 0, NULL); 5514 retval = security_task_setscheduler(p);
4987 if (retval) 5515 if (retval)
4988 goto out_unlock; 5516 goto out_unlock;
4989 5517
4990 cpuset_cpus_allowed(p, cpus_allowed); 5518 cpuset_cpus_allowed(p, cpus_allowed);
4991 cpumask_and(new_mask, in_mask, cpus_allowed); 5519 cpumask_and(new_mask, in_mask, cpus_allowed);
4992 again: 5520again:
4993 retval = set_cpus_allowed_ptr(p, new_mask); 5521 retval = set_cpus_allowed_ptr(p, new_mask);
4994 5522
4995 if (!retval) { 5523 if (!retval) {
@@ -5051,7 +5579,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5051{ 5579{
5052 struct task_struct *p; 5580 struct task_struct *p;
5053 unsigned long flags; 5581 unsigned long flags;
5054 struct rq *rq;
5055 int retval; 5582 int retval;
5056 5583
5057 get_online_cpus(); 5584 get_online_cpus();
@@ -5066,9 +5593,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5066 if (retval) 5593 if (retval)
5067 goto out_unlock; 5594 goto out_unlock;
5068 5595
5069 rq = task_rq_lock(p, &flags); 5596 raw_spin_lock_irqsave(&p->pi_lock, flags);
5070 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 5597 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5071 task_rq_unlock(rq, &flags); 5598 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5072 5599
5073out_unlock: 5600out_unlock:
5074 rcu_read_unlock(); 5601 rcu_read_unlock();
@@ -5215,6 +5742,67 @@ void __sched yield(void)
5215} 5742}
5216EXPORT_SYMBOL(yield); 5743EXPORT_SYMBOL(yield);
5217 5744
5745/**
5746 * yield_to - yield the current processor to another thread in
5747 * your thread group, or accelerate that thread toward the
5748 * processor it's on.
5749 * @p: target task
5750 * @preempt: whether task preemption is allowed or not
5751 *
5752 * It's the caller's job to ensure that the target task struct
5753 * can't go away on us before we can do any checks.
5754 *
5755 * Returns true if we indeed boosted the target task.
5756 */
5757bool __sched yield_to(struct task_struct *p, bool preempt)
5758{
5759 struct task_struct *curr = current;
5760 struct rq *rq, *p_rq;
5761 unsigned long flags;
5762 bool yielded = 0;
5763
5764 local_irq_save(flags);
5765 rq = this_rq();
5766
5767again:
5768 p_rq = task_rq(p);
5769 double_rq_lock(rq, p_rq);
5770 while (task_rq(p) != p_rq) {
5771 double_rq_unlock(rq, p_rq);
5772 goto again;
5773 }
5774
5775 if (!curr->sched_class->yield_to_task)
5776 goto out;
5777
5778 if (curr->sched_class != p->sched_class)
5779 goto out;
5780
5781 if (task_running(p_rq, p) || p->state)
5782 goto out;
5783
5784 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5785 if (yielded) {
5786 schedstat_inc(rq, yld_count);
5787 /*
5788 * Make p's CPU reschedule; pick_next_entity takes care of
5789 * fairness.
5790 */
5791 if (preempt && rq != p_rq)
5792 resched_task(p_rq->curr);
5793 }
5794
5795out:
5796 double_rq_unlock(rq, p_rq);
5797 local_irq_restore(flags);
5798
5799 if (yielded)
5800 schedule();
5801
5802 return yielded;
5803}
5804EXPORT_SYMBOL_GPL(yield_to);
5805
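
A hypothetical caller of the new primitive, in the spirit of the directed-yield use case: boosting a sibling thread believed to hold a resource the current task is spinning on. The helper name and the candidate array are invented, and each target must already be protected from going away (e.g. via get_task_struct()), as the comment above requires:

#include <linux/sched.h>

static bool boost_one_of(struct task_struct **candidates, int nr)
{
	int i;

	for (i = 0; i < nr; i++) {
		/* preempt == false: nudge the target along without forcing
		 * the remote CPU to reschedule immediately */
		if (yield_to(candidates[i], false))
			return true;	/* boosted somebody; stop here */
	}
	return false;
}
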
5218/* 5806/*
5219 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 5807 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
5220 * that process accounting knows that this is a task in IO wait state. 5808 * that process accounting knows that this is a task in IO wait state.
@@ -5225,6 +5813,7 @@ void __sched io_schedule(void)
5225 5813
5226 delayacct_blkio_start(); 5814 delayacct_blkio_start();
5227 atomic_inc(&rq->nr_iowait); 5815 atomic_inc(&rq->nr_iowait);
5816 blk_flush_plug(current);
5228 current->in_iowait = 1; 5817 current->in_iowait = 1;
5229 schedule(); 5818 schedule();
5230 current->in_iowait = 0; 5819 current->in_iowait = 0;
@@ -5240,6 +5829,7 @@ long __sched io_schedule_timeout(long timeout)
5240 5829
5241 delayacct_blkio_start(); 5830 delayacct_blkio_start();
5242 atomic_inc(&rq->nr_iowait); 5831 atomic_inc(&rq->nr_iowait);
5832 blk_flush_plug(current);
5243 current->in_iowait = 1; 5833 current->in_iowait = 1;
5244 ret = schedule_timeout(timeout); 5834 ret = schedule_timeout(timeout);
5245 current->in_iowait = 0; 5835 current->in_iowait = 0;
@@ -5330,7 +5920,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5330 5920
5331 rq = task_rq_lock(p, &flags); 5921 rq = task_rq_lock(p, &flags);
5332 time_slice = p->sched_class->get_rr_interval(rq, p); 5922 time_slice = p->sched_class->get_rr_interval(rq, p);
5333 task_rq_unlock(rq, &flags); 5923 task_rq_unlock(rq, p, &flags);
5334 5924
5335 rcu_read_unlock(); 5925 rcu_read_unlock();
5336 jiffies_to_timespec(time_slice, &t); 5926 jiffies_to_timespec(time_slice, &t);
@@ -5350,7 +5940,7 @@ void sched_show_task(struct task_struct *p)
5350 unsigned state; 5940 unsigned state;
5351 5941
5352 state = p->state ? __ffs(p->state) + 1 : 0; 5942 state = p->state ? __ffs(p->state) + 1 : 0;
5353 printk(KERN_INFO "%-13.13s %c", p->comm, 5943 printk(KERN_INFO "%-15.15s %c", p->comm,
5354 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5944 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5355#if BITS_PER_LONG == 32 5945#if BITS_PER_LONG == 32
5356 if (state == TASK_RUNNING) 5946 if (state == TASK_RUNNING)
@@ -5388,7 +5978,7 @@ void show_state_filter(unsigned long state_filter)
5388 do_each_thread(g, p) { 5978 do_each_thread(g, p) {
5389 /* 5979 /*
5390 * reset the NMI-timeout, listing all files on a slow 5980 * reset the NMI-timeout, listing all files on a slow
5391 * console might take alot of time: 5981 * console might take a lot of time:
5392 */ 5982 */
5393 touch_nmi_watchdog(); 5983 touch_nmi_watchdog();
5394 if (!state_filter || (p->state & state_filter)) 5984 if (!state_filter || (p->state & state_filter))
@@ -5432,26 +6022,35 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5432 idle->state = TASK_RUNNING; 6022 idle->state = TASK_RUNNING;
5433 idle->se.exec_start = sched_clock(); 6023 idle->se.exec_start = sched_clock();
5434 6024
5435 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 6025 do_set_cpus_allowed(idle, cpumask_of(cpu));
6026 /*
6027 * We're having a chicken and egg problem, even though we are
6028 * holding rq->lock, the cpu isn't yet set to this cpu so the
6029 * lockdep check in task_group() will fail.
6030 *
6031 * Similar case to sched_fork(). / Alternatively we could
6032 * use task_rq_lock() here and obtain the other rq->lock.
6033 *
6034 * Silence PROVE_RCU
6035 */
6036 rcu_read_lock();
5436 __set_task_cpu(idle, cpu); 6037 __set_task_cpu(idle, cpu);
6038 rcu_read_unlock();
5437 6039
5438 rq->curr = rq->idle = idle; 6040 rq->curr = rq->idle = idle;
5439#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 6041#if defined(CONFIG_SMP)
5440 idle->oncpu = 1; 6042 idle->on_cpu = 1;
5441#endif 6043#endif
5442 raw_spin_unlock_irqrestore(&rq->lock, flags); 6044 raw_spin_unlock_irqrestore(&rq->lock, flags);
5443 6045
5444 /* Set the preempt count _outside_ the spinlocks! */ 6046 /* Set the preempt count _outside_ the spinlocks! */
5445#if defined(CONFIG_PREEMPT)
5446 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5447#else
5448 task_thread_info(idle)->preempt_count = 0; 6047 task_thread_info(idle)->preempt_count = 0;
5449#endif 6048
5450 /* 6049 /*
5451 * The idle tasks have their own, simple scheduling class: 6050 * The idle tasks have their own, simple scheduling class:
5452 */ 6051 */
5453 idle->sched_class = &idle_sched_class; 6052 idle->sched_class = &idle_sched_class;
5454 ftrace_graph_init_task(idle); 6053 ftrace_graph_init_idle_task(idle, cpu);
5455} 6054}
5456 6055
5457/* 6056/*
@@ -5502,7 +6101,6 @@ static void update_sysctl(void)
5502 SET_SYSCTL(sched_min_granularity); 6101 SET_SYSCTL(sched_min_granularity);
5503 SET_SYSCTL(sched_latency); 6102 SET_SYSCTL(sched_latency);
5504 SET_SYSCTL(sched_wakeup_granularity); 6103 SET_SYSCTL(sched_wakeup_granularity);
5505 SET_SYSCTL(sched_shares_ratelimit);
5506#undef SET_SYSCTL 6104#undef SET_SYSCTL
5507} 6105}
5508 6106
@@ -5512,6 +6110,16 @@ static inline void sched_init_granularity(void)
5512} 6110}
5513 6111
5514#ifdef CONFIG_SMP 6112#ifdef CONFIG_SMP
6113void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
6114{
6115 if (p->sched_class && p->sched_class->set_cpus_allowed)
6116 p->sched_class->set_cpus_allowed(p, new_mask);
6117 else {
6118 cpumask_copy(&p->cpus_allowed, new_mask);
6119 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
6120 }
6121}
6122
5515/* 6123/*
5516 * This is how migration works: 6124 * This is how migration works:
5517 * 6125 *
@@ -5542,52 +6150,38 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5542 unsigned int dest_cpu; 6150 unsigned int dest_cpu;
5543 int ret = 0; 6151 int ret = 0;
5544 6152
5545 /*
5546 * Serialize against TASK_WAKING so that ttwu() and wunt() can
5547 * drop the rq->lock and still rely on ->cpus_allowed.
5548 */
5549again:
5550 while (task_is_waking(p))
5551 cpu_relax();
5552 rq = task_rq_lock(p, &flags); 6153 rq = task_rq_lock(p, &flags);
5553 if (task_is_waking(p)) { 6154
5554 task_rq_unlock(rq, &flags); 6155 if (cpumask_equal(&p->cpus_allowed, new_mask))
5555 goto again; 6156 goto out;
5556 }
5557 6157
5558 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 6158 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5559 ret = -EINVAL; 6159 ret = -EINVAL;
5560 goto out; 6160 goto out;
5561 } 6161 }
5562 6162
5563 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 6163 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
5564 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5565 ret = -EINVAL; 6164 ret = -EINVAL;
5566 goto out; 6165 goto out;
5567 } 6166 }
5568 6167
5569 if (p->sched_class->set_cpus_allowed) 6168 do_set_cpus_allowed(p, new_mask);
5570 p->sched_class->set_cpus_allowed(p, new_mask);
5571 else {
5572 cpumask_copy(&p->cpus_allowed, new_mask);
5573 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5574 }
5575 6169
5576 /* Can the task run on the task's current CPU? If so, we're done */ 6170 /* Can the task run on the task's current CPU? If so, we're done */
5577 if (cpumask_test_cpu(task_cpu(p), new_mask)) 6171 if (cpumask_test_cpu(task_cpu(p), new_mask))
5578 goto out; 6172 goto out;
5579 6173
5580 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 6174 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5581 if (migrate_task(p, dest_cpu)) { 6175 if (p->on_rq) {
5582 struct migration_arg arg = { p, dest_cpu }; 6176 struct migration_arg arg = { p, dest_cpu };
5583 /* Need help from migration thread: drop lock and wait. */ 6177 /* Need help from migration thread: drop lock and wait. */
5584 task_rq_unlock(rq, &flags); 6178 task_rq_unlock(rq, p, &flags);
5585 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 6179 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5586 tlb_migrate_finish(p->mm); 6180 tlb_migrate_finish(p->mm);
5587 return 0; 6181 return 0;
5588 } 6182 }
5589out: 6183out:
5590 task_rq_unlock(rq, &flags); 6184 task_rq_unlock(rq, p, &flags);
5591 6185
5592 return ret; 6186 return ret;
5593} 6187}
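Editor's note: with the TASK_WAKING spin loop gone, set_cpus_allowed_ptr() reduces to a short decision sequence: lock the task's runqueue, return early if the mask is unchanged, reject masks that miss every active CPU or that try to move a bound kthread, apply the new mask, and only involve the stopper when the task is queued on a now-disallowed CPU. The sketch below compresses that sequence into a user-space model with a 64-bit word as the cpumask; the names are hypothetical and the p == current escape hatch for bound kthreads is deliberately dropped to keep it short.

#include <stdint.h>
#include <stdbool.h>
#include <errno.h>
#include <stdio.h>

struct mtask {
	uint64_t allowed;   /* current affinity mask        */
	int      cpu;       /* CPU the task last ran on     */
	bool     on_rq;     /* queued on a runqueue?        */
	bool     bound;     /* models PF_THREAD_BOUND       */
};

/* Returns 0, -EINVAL, or 1 when a migration would be kicked off. */
static int set_allowed(struct mtask *p, uint64_t new_mask, uint64_t active)
{
	if (p->allowed == new_mask)
		return 0;                       /* nothing to do          */
	if (!(new_mask & active))
		return -EINVAL;                 /* no usable CPU          */
	if (p->bound)
		return -EINVAL;                 /* bound kthreads refuse  */

	p->allowed = new_mask;                  /* do_set_cpus_allowed()  */

	if (new_mask & (1ULL << p->cpu))
		return 0;                       /* current CPU still ok   */

	/* The real code would now hand the task to migration_cpu_stop(). */
	return p->on_rq ? 1 : 0;
}

int main(void)
{
	struct mtask t = { .allowed = 0x1, .cpu = 0, .on_rq = true };

	printf("rc=%d\n", set_allowed(&t, 0x2, 0x3)); /* forces a migration */
	return 0;
}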
@@ -5615,6 +6209,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5615 rq_src = cpu_rq(src_cpu); 6209 rq_src = cpu_rq(src_cpu);
5616 rq_dest = cpu_rq(dest_cpu); 6210 rq_dest = cpu_rq(dest_cpu);
5617 6211
6212 raw_spin_lock(&p->pi_lock);
5618 double_rq_lock(rq_src, rq_dest); 6213 double_rq_lock(rq_src, rq_dest);
5619 /* Already moved. */ 6214 /* Already moved. */
5620 if (task_cpu(p) != src_cpu) 6215 if (task_cpu(p) != src_cpu)
@@ -5627,7 +6222,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5627 * If we're not on a rq, the next wake-up will ensure we're 6222 * If we're not on a rq, the next wake-up will ensure we're
5628 * placed properly. 6223 * placed properly.
5629 */ 6224 */
5630 if (p->se.on_rq) { 6225 if (p->on_rq) {
5631 deactivate_task(rq_src, p, 0); 6226 deactivate_task(rq_src, p, 0);
5632 set_task_cpu(p, dest_cpu); 6227 set_task_cpu(p, dest_cpu);
5633 activate_task(rq_dest, p, 0); 6228 activate_task(rq_dest, p, 0);
@@ -5637,6 +6232,7 @@ done:
5637 ret = 1; 6232 ret = 1;
5638fail: 6233fail:
5639 double_rq_unlock(rq_src, rq_dest); 6234 double_rq_unlock(rq_src, rq_dest);
6235 raw_spin_unlock(&p->pi_lock);
5640 return ret; 6236 return ret;
5641} 6237}
5642 6238
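Editor's note: __migrate_task() now takes p->pi_lock before the two runqueue locks, consistent with the pi_lock-outside-rq->lock ordering this series uses elsewhere, and double_rq_lock() itself avoids deadlock by always acquiring the two rq locks in one fixed order. The pthread sketch below illustrates only that "fixed global order for nested locks" rule; it is not kernel code and the names are invented.

#include <stdint.h>
#include <pthread.h>
#include <stdio.h>

struct runq { pthread_mutex_t lock; int nr; };

/* Like double_rq_lock(): take both locks in a stable order (by address). */
static void double_lock(struct runq *a, struct runq *b)
{
	if (a == b) {
		pthread_mutex_lock(&a->lock);
	} else if ((uintptr_t)a < (uintptr_t)b) {
		pthread_mutex_lock(&a->lock);
		pthread_mutex_lock(&b->lock);
	} else {
		pthread_mutex_lock(&b->lock);
		pthread_mutex_lock(&a->lock);
	}
}

static void double_unlock(struct runq *a, struct runq *b)
{
	pthread_mutex_unlock(&a->lock);
	if (a != b)
		pthread_mutex_unlock(&b->lock);
}

int main(void)
{
	struct runq src = { PTHREAD_MUTEX_INITIALIZER, 1 };
	struct runq dst = { PTHREAD_MUTEX_INITIALIZER, 0 };
	pthread_mutex_t pi = PTHREAD_MUTEX_INITIALIZER; /* outermost, like pi_lock */

	pthread_mutex_lock(&pi);
	double_lock(&src, &dst);
	src.nr--; dst.nr++;              /* the "move the task" step */
	double_unlock(&src, &dst);
	pthread_mutex_unlock(&pi);

	printf("src=%d dst=%d\n", src.nr, dst.nr);
	return 0;
}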
@@ -5660,29 +6256,20 @@ static int migration_cpu_stop(void *data)
5660} 6256}
5661 6257
5662#ifdef CONFIG_HOTPLUG_CPU 6258#ifdef CONFIG_HOTPLUG_CPU
6259
5663/* 6260/*
5664 * Figure out where task on dead CPU should go, use force if necessary. 6261 * Ensures that the idle task is using init_mm right before its cpu goes
6262 * offline.
5665 */ 6263 */
5666void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 6264void idle_task_exit(void)
5667{ 6265{
5668 struct rq *rq = cpu_rq(dead_cpu); 6266 struct mm_struct *mm = current->active_mm;
5669 int needs_cpu, uninitialized_var(dest_cpu);
5670 unsigned long flags;
5671 6267
5672 local_irq_save(flags); 6268 BUG_ON(cpu_online(smp_processor_id()));
5673 6269
5674 raw_spin_lock(&rq->lock); 6270 if (mm != &init_mm)
5675 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); 6271 switch_mm(mm, &init_mm, current);
5676 if (needs_cpu) 6272 mmdrop(mm);
5677 dest_cpu = select_fallback_rq(dead_cpu, p);
5678 raw_spin_unlock(&rq->lock);
5679 /*
5680 * It can only fail if we race with set_cpus_allowed(),
5681 * in the racer should migrate the task anyway.
5682 */
5683 if (needs_cpu)
5684 __migrate_task(p, dead_cpu, dest_cpu);
5685 local_irq_restore(flags);
5686} 6273}
5687 6274
5688/* 6275/*
@@ -5695,128 +6282,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5695static void migrate_nr_uninterruptible(struct rq *rq_src) 6282static void migrate_nr_uninterruptible(struct rq *rq_src)
5696{ 6283{
5697 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 6284 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5698 unsigned long flags;
5699 6285
5700 local_irq_save(flags);
5701 double_rq_lock(rq_src, rq_dest);
5702 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 6286 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5703 rq_src->nr_uninterruptible = 0; 6287 rq_src->nr_uninterruptible = 0;
5704 double_rq_unlock(rq_src, rq_dest);
5705 local_irq_restore(flags);
5706}
5707
5708/* Run through task list and migrate tasks from the dead cpu. */
5709static void migrate_live_tasks(int src_cpu)
5710{
5711 struct task_struct *p, *t;
5712
5713 read_lock(&tasklist_lock);
5714
5715 do_each_thread(t, p) {
5716 if (p == current)
5717 continue;
5718
5719 if (task_cpu(p) == src_cpu)
5720 move_task_off_dead_cpu(src_cpu, p);
5721 } while_each_thread(t, p);
5722
5723 read_unlock(&tasklist_lock);
5724} 6288}
5725 6289
5726/* 6290/*
5727 * Schedules idle task to be the next runnable task on current CPU. 6291 * remove the tasks which were accounted by rq from calc_load_tasks.
5728 * It does so by boosting its priority to highest possible.
5729 * Used by CPU offline code.
5730 */ 6292 */
5731void sched_idle_next(void) 6293static void calc_global_load_remove(struct rq *rq)
5732{ 6294{
5733 int this_cpu = smp_processor_id(); 6295 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5734 struct rq *rq = cpu_rq(this_cpu); 6296 rq->calc_load_active = 0;
5735 struct task_struct *p = rq->idle;
5736 unsigned long flags;
5737
5738 /* cpu has to be offline */
5739 BUG_ON(cpu_online(this_cpu));
5740
5741 /*
5742 * Strictly not necessary since rest of the CPUs are stopped by now
5743 * and interrupts disabled on the current cpu.
5744 */
5745 raw_spin_lock_irqsave(&rq->lock, flags);
5746
5747 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5748
5749 activate_task(rq, p, 0);
5750
5751 raw_spin_unlock_irqrestore(&rq->lock, flags);
5752} 6297}
5753 6298
5754/* 6299/*
5755 * Ensures that the idle task is using init_mm right before its cpu goes 6300 * Migrate all tasks from the rq, sleeping tasks will be migrated by
5756 * offline. 6301 * try_to_wake_up()->select_task_rq().
6302 *
6303 * Called with rq->lock held even though we're in stop_machine() and
6304 * there's no concurrency possible, we hold the required locks anyway
6305 * because of lock validation efforts.
5757 */ 6306 */
5758void idle_task_exit(void) 6307static void migrate_tasks(unsigned int dead_cpu)
5759{
5760 struct mm_struct *mm = current->active_mm;
5761
5762 BUG_ON(cpu_online(smp_processor_id()));
5763
5764 if (mm != &init_mm)
5765 switch_mm(mm, &init_mm, current);
5766 mmdrop(mm);
5767}
5768
5769/* called under rq->lock with disabled interrupts */
5770static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5771{ 6308{
5772 struct rq *rq = cpu_rq(dead_cpu); 6309 struct rq *rq = cpu_rq(dead_cpu);
5773 6310 struct task_struct *next, *stop = rq->stop;
5774 /* Must be exiting, otherwise would be on tasklist. */ 6311 int dest_cpu;
5775 BUG_ON(!p->exit_state);
5776
5777 /* Cannot have done final schedule yet: would have vanished. */
5778 BUG_ON(p->state == TASK_DEAD);
5779
5780 get_task_struct(p);
5781 6312
5782 /* 6313 /*
5783 * Drop lock around migration; if someone else moves it, 6314 * Fudge the rq selection such that the below task selection loop
5784 * that's OK. No task can be added to this CPU, so iteration is 6315 * doesn't get stuck on the currently eligible stop task.
5785 * fine. 6316 *
6317 * We're currently inside stop_machine() and the rq is either stuck
6318 * in the stop_machine_cpu_stop() loop, or we're executing this code,
6319 * either way we should never end up calling schedule() until we're
6320 * done here.
5786 */ 6321 */
5787 raw_spin_unlock_irq(&rq->lock); 6322 rq->stop = NULL;
5788 move_task_off_dead_cpu(dead_cpu, p);
5789 raw_spin_lock_irq(&rq->lock);
5790
5791 put_task_struct(p);
5792}
5793
5794/* release_task() removes task from tasklist, so we won't find dead tasks. */
5795static void migrate_dead_tasks(unsigned int dead_cpu)
5796{
5797 struct rq *rq = cpu_rq(dead_cpu);
5798 struct task_struct *next;
5799 6323
5800 for ( ; ; ) { 6324 for ( ; ; ) {
5801 if (!rq->nr_running) 6325 /*
6326 * There's this thread running, bail when that's the only
6327 * remaining thread.
6328 */
6329 if (rq->nr_running == 1)
5802 break; 6330 break;
6331
5803 next = pick_next_task(rq); 6332 next = pick_next_task(rq);
5804 if (!next) 6333 BUG_ON(!next);
5805 break;
5806 next->sched_class->put_prev_task(rq, next); 6334 next->sched_class->put_prev_task(rq, next);
5807 migrate_dead(dead_cpu, next);
5808 6335
6336 /* Find suitable destination for @next, with force if needed. */
6337 dest_cpu = select_fallback_rq(dead_cpu, next);
6338 raw_spin_unlock(&rq->lock);
6339
6340 __migrate_task(next, dead_cpu, dest_cpu);
6341
6342 raw_spin_lock(&rq->lock);
5809 } 6343 }
5810}
5811 6344
5812/* 6345 rq->stop = stop;
5813 * remove the tasks which were accounted by rq from calc_load_tasks.
5814 */
5815static void calc_global_load_remove(struct rq *rq)
5816{
5817 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5818 rq->calc_load_active = 0;
5819} 6346}
6347
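Editor's note: migrate_tasks() above replaces the old migrate_live_tasks()/migrate_dead_tasks() pair. With rq->stop parked so pick_next_task() cannot keep returning the stop task, it repeatedly picks the next runnable task, finds it a fallback CPU, and moves it, stopping once only the caller itself remains on the runqueue. The toy model below captures just that drain loop over a plain array "runqueue"; nothing in it is kernel API.

#include <stdio.h>

#define NR_TASKS 8

static int queue[NR_TASKS];	/* CPU each task sits on */

/* Pick one task still queued on @cpu, other than @self; -1 if none. */
static int pick_next(int cpu, int self)
{
	for (int i = 0; i < NR_TASKS; i++)
		if (i != self && queue[i] == cpu)
			return i;
	return -1;
}

/* Drain @dead_cpu: everything except @self is pushed to @fallback_cpu. */
static void drain(int dead_cpu, int self, int fallback_cpu)
{
	int t;

	while ((t = pick_next(dead_cpu, self)) >= 0)
		queue[t] = fallback_cpu;	/* the __migrate_task() step */
}

int main(void)
{
	for (int i = 0; i < NR_TASKS; i++)
		queue[i] = i % 2;		/* tasks spread over CPUs 0/1 */

	drain(1, /*self=*/1, /*fallback_cpu=*/0);

	for (int i = 0; i < NR_TASKS; i++)
		printf("task %d -> cpu %d\n", i, queue[i]);
	return 0;
}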
5820#endif /* CONFIG_HOTPLUG_CPU */ 6348#endif /* CONFIG_HOTPLUG_CPU */
5821 6349
5822#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 6350#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -6026,15 +6554,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6026 unsigned long flags; 6554 unsigned long flags;
6027 struct rq *rq = cpu_rq(cpu); 6555 struct rq *rq = cpu_rq(cpu);
6028 6556
6029 switch (action) { 6557 switch (action & ~CPU_TASKS_FROZEN) {
6030 6558
6031 case CPU_UP_PREPARE: 6559 case CPU_UP_PREPARE:
6032 case CPU_UP_PREPARE_FROZEN:
6033 rq->calc_load_update = calc_load_update; 6560 rq->calc_load_update = calc_load_update;
6034 break; 6561 break;
6035 6562
6036 case CPU_ONLINE: 6563 case CPU_ONLINE:
6037 case CPU_ONLINE_FROZEN:
6038 /* Update our root-domain */ 6564 /* Update our root-domain */
6039 raw_spin_lock_irqsave(&rq->lock, flags); 6565 raw_spin_lock_irqsave(&rq->lock, flags);
6040 if (rq->rd) { 6566 if (rq->rd) {
@@ -6046,33 +6572,26 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6046 break; 6572 break;
6047 6573
6048#ifdef CONFIG_HOTPLUG_CPU 6574#ifdef CONFIG_HOTPLUG_CPU
6049 case CPU_DEAD:
6050 case CPU_DEAD_FROZEN:
6051 migrate_live_tasks(cpu);
6052 /* Idle task back to normal (off runqueue, low prio) */
6053 raw_spin_lock_irq(&rq->lock);
6054 deactivate_task(rq, rq->idle, 0);
6055 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
6056 rq->idle->sched_class = &idle_sched_class;
6057 migrate_dead_tasks(cpu);
6058 raw_spin_unlock_irq(&rq->lock);
6059 migrate_nr_uninterruptible(rq);
6060 BUG_ON(rq->nr_running != 0);
6061 calc_global_load_remove(rq);
6062 break;
6063
6064 case CPU_DYING: 6575 case CPU_DYING:
6065 case CPU_DYING_FROZEN: 6576 sched_ttwu_pending();
6066 /* Update our root-domain */ 6577 /* Update our root-domain */
6067 raw_spin_lock_irqsave(&rq->lock, flags); 6578 raw_spin_lock_irqsave(&rq->lock, flags);
6068 if (rq->rd) { 6579 if (rq->rd) {
6069 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6580 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6070 set_rq_offline(rq); 6581 set_rq_offline(rq);
6071 } 6582 }
6583 migrate_tasks(cpu);
6584 BUG_ON(rq->nr_running != 1); /* the migration thread */
6072 raw_spin_unlock_irqrestore(&rq->lock, flags); 6585 raw_spin_unlock_irqrestore(&rq->lock, flags);
6586
6587 migrate_nr_uninterruptible(rq);
6588 calc_global_load_remove(rq);
6073 break; 6589 break;
6074#endif 6590#endif
6075 } 6591 }
6592
6593 update_max_interval();
6594
6076 return NOTIFY_OK; 6595 return NOTIFY_OK;
6077} 6596}
6078 6597
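Editor's note: switching on "action & ~CPU_TASKS_FROZEN" lets a single case label handle both the normal and the _FROZEN (suspend/resume) variant of each hotplug event, which is why the explicit CPU_*_FROZEN labels disappear above. The trick is plain bitmask arithmetic, sketched below with made-up constants rather than the kernel's values.

#include <stdio.h>

/* Illustrative values only; the kernel defines its own. */
#define EV_UP_PREPARE	0x0003
#define EV_ONLINE	0x0002
#define EV_FROZEN	0x0010	/* models CPU_TASKS_FROZEN */

static const char *handle(unsigned long action)
{
	switch (action & ~EV_FROZEN) {	/* folds the FROZEN variants in */
	case EV_UP_PREPARE:
		return "up-prepare";
	case EV_ONLINE:
		return "online";
	default:
		return "ignored";
	}
}

int main(void)
{
	printf("%s\n", handle(EV_ONLINE));
	printf("%s\n", handle(EV_ONLINE | EV_FROZEN)); /* same path */
	return 0;
}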
@@ -6133,6 +6652,8 @@ early_initcall(migration_init);
6133 6652
6134#ifdef CONFIG_SMP 6653#ifdef CONFIG_SMP
6135 6654
6655static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
6656
6136#ifdef CONFIG_SCHED_DEBUG 6657#ifdef CONFIG_SCHED_DEBUG
6137 6658
6138static __read_mostly int sched_domain_debug_enabled; 6659static __read_mostly int sched_domain_debug_enabled;
@@ -6183,7 +6704,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6183 break; 6704 break;
6184 } 6705 }
6185 6706
6186 if (!group->cpu_power) { 6707 if (!group->sgp->power) {
6187 printk(KERN_CONT "\n"); 6708 printk(KERN_CONT "\n");
6188 printk(KERN_ERR "ERROR: domain->cpu_power not " 6709 printk(KERN_ERR "ERROR: domain->cpu_power not "
6189 "set\n"); 6710 "set\n");
@@ -6207,9 +6728,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6207 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 6728 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6208 6729
6209 printk(KERN_CONT " %s", str); 6730 printk(KERN_CONT " %s", str);
6210 if (group->cpu_power != SCHED_LOAD_SCALE) { 6731 if (group->sgp->power != SCHED_POWER_SCALE) {
6211 printk(KERN_CONT " (cpu_power = %d)", 6732 printk(KERN_CONT " (cpu_power = %d)",
6212 group->cpu_power); 6733 group->sgp->power);
6213 } 6734 }
6214 6735
6215 group = group->next; 6736 group = group->next;
@@ -6228,7 +6749,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6228 6749
6229static void sched_domain_debug(struct sched_domain *sd, int cpu) 6750static void sched_domain_debug(struct sched_domain *sd, int cpu)
6230{ 6751{
6231 cpumask_var_t groupmask;
6232 int level = 0; 6752 int level = 0;
6233 6753
6234 if (!sched_domain_debug_enabled) 6754 if (!sched_domain_debug_enabled)
@@ -6241,20 +6761,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6241 6761
6242 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6762 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6243 6763
6244 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6245 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6246 return;
6247 }
6248
6249 for (;;) { 6764 for (;;) {
6250 if (sched_domain_debug_one(sd, cpu, level, groupmask)) 6765 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
6251 break; 6766 break;
6252 level++; 6767 level++;
6253 sd = sd->parent; 6768 sd = sd->parent;
6254 if (!sd) 6769 if (!sd)
6255 break; 6770 break;
6256 } 6771 }
6257 free_cpumask_var(groupmask);
6258} 6772}
6259#else /* !CONFIG_SCHED_DEBUG */ 6773#else /* !CONFIG_SCHED_DEBUG */
6260# define sched_domain_debug(sd, cpu) do { } while (0) 6774# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6311,12 +6825,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6311 return 1; 6825 return 1;
6312} 6826}
6313 6827
6314static void free_rootdomain(struct root_domain *rd) 6828static void free_rootdomain(struct rcu_head *rcu)
6315{ 6829{
6316 synchronize_sched(); 6830 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
6317 6831
6318 cpupri_cleanup(&rd->cpupri); 6832 cpupri_cleanup(&rd->cpupri);
6319
6320 free_cpumask_var(rd->rto_mask); 6833 free_cpumask_var(rd->rto_mask);
6321 free_cpumask_var(rd->online); 6834 free_cpumask_var(rd->online);
6322 free_cpumask_var(rd->span); 6835 free_cpumask_var(rd->span);
@@ -6357,7 +6870,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6357 raw_spin_unlock_irqrestore(&rq->lock, flags); 6870 raw_spin_unlock_irqrestore(&rq->lock, flags);
6358 6871
6359 if (old_rd) 6872 if (old_rd)
6360 free_rootdomain(old_rd); 6873 call_rcu_sched(&old_rd->rcu, free_rootdomain);
6361} 6874}
6362 6875
6363static int init_rootdomain(struct root_domain *rd) 6876static int init_rootdomain(struct root_domain *rd)
@@ -6408,6 +6921,53 @@ static struct root_domain *alloc_rootdomain(void)
6408 return rd; 6921 return rd;
6409} 6922}
6410 6923
6924static void free_sched_groups(struct sched_group *sg, int free_sgp)
6925{
6926 struct sched_group *tmp, *first;
6927
6928 if (!sg)
6929 return;
6930
6931 first = sg;
6932 do {
6933 tmp = sg->next;
6934
6935 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
6936 kfree(sg->sgp);
6937
6938 kfree(sg);
6939 sg = tmp;
6940 } while (sg != first);
6941}
6942
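Editor's note: free_sched_groups() above walks a circular singly linked list: remember the first element, free nodes until the walk comes back around, and release the shared sgp payload only when its refcount drops to zero. Stripped of the refcounting, the do/while-until-first idiom looks like the following plain-C sketch (not kernel code).

#include <stdlib.h>
#include <stdio.h>

struct node { struct node *next; int id; };

/* Build a circular list of @n nodes (n >= 1) and return one element. */
static struct node *ring_alloc(int n)
{
	struct node *first = NULL, *last = NULL;

	for (int i = 0; i < n; i++) {
		struct node *sg = calloc(1, sizeof(*sg));

		sg->id = i;
		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
	}
	last->next = first;		/* close the ring */
	return first;
}

/* Same shape as free_sched_groups(): stop when we are back at 'first'. */
static void ring_free(struct node *sg)
{
	struct node *first = sg, *tmp;

	if (!sg)
		return;
	do {
		tmp = sg->next;
		printf("freeing node %d\n", sg->id);
		free(sg);
		sg = tmp;
	} while (sg != first);
}

int main(void)
{
	ring_free(ring_alloc(4));
	return 0;
}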
6943static void free_sched_domain(struct rcu_head *rcu)
6944{
6945 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
6946
6947 /*
6948 * If it's an overlapping domain it has private groups, iterate and
6949 * nuke them all.
6950 */
6951 if (sd->flags & SD_OVERLAP) {
6952 free_sched_groups(sd->groups, 1);
6953 } else if (atomic_dec_and_test(&sd->groups->ref)) {
6954 kfree(sd->groups->sgp);
6955 kfree(sd->groups);
6956 }
6957 kfree(sd);
6958}
6959
6960static void destroy_sched_domain(struct sched_domain *sd, int cpu)
6961{
6962 call_rcu(&sd->rcu, free_sched_domain);
6963}
6964
6965static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6966{
6967 for (; sd; sd = sd->parent)
6968 destroy_sched_domain(sd, cpu);
6969}
6970
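Editor's note: destroy_sched_domain() no longer frees the domain synchronously; it queues an RCU callback, and free_sched_domain() recovers the enclosing object from the embedded rcu_head with container_of() once readers are done. The container_of-in-a-callback pattern is shown below in plain C, where a simple "pending" list stands in for the RCU grace-period machinery and all names are invented.

#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct cb_head {
	struct cb_head *next;
	void (*func)(struct cb_head *head);
};

struct domain {
	int level;
	struct cb_head rcu;	/* embedded callback head */
};

static struct cb_head *pending;

/* Stand-in for call_rcu(): just queue the callback. */
static void call_later(struct cb_head *head, void (*func)(struct cb_head *))
{
	head->func = func;
	head->next = pending;
	pending = head;
}

/* Stand-in for the grace period ending: run what was queued. */
static void run_pending(void)
{
	while (pending) {
		struct cb_head *head = pending;

		pending = head->next;
		head->func(head);
	}
}

static void free_domain(struct cb_head *head)
{
	struct domain *sd = container_of(head, struct domain, rcu);

	printf("freeing domain level %d\n", sd->level);
	free(sd);
}

int main(void)
{
	struct domain *sd = calloc(1, sizeof(*sd));

	sd->level = 2;
	call_later(&sd->rcu, free_domain);	/* destroy_sched_domain() */
	run_pending();				/* ...after the grace period */
	return 0;
}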
6411/* 6971/*
6412 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6972 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6413 * hold the hotplug lock. 6973 * hold the hotplug lock.
@@ -6418,9 +6978,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6418 struct rq *rq = cpu_rq(cpu); 6978 struct rq *rq = cpu_rq(cpu);
6419 struct sched_domain *tmp; 6979 struct sched_domain *tmp;
6420 6980
6421 for (tmp = sd; tmp; tmp = tmp->parent)
6422 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6423
6424 /* Remove the sched domains which do not contribute to scheduling. */ 6981 /* Remove the sched domains which do not contribute to scheduling. */
6425 for (tmp = sd; tmp; ) { 6982 for (tmp = sd; tmp; ) {
6426 struct sched_domain *parent = tmp->parent; 6983 struct sched_domain *parent = tmp->parent;
@@ -6431,12 +6988,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6431 tmp->parent = parent->parent; 6988 tmp->parent = parent->parent;
6432 if (parent->parent) 6989 if (parent->parent)
6433 parent->parent->child = tmp; 6990 parent->parent->child = tmp;
6991 destroy_sched_domain(parent, cpu);
6434 } else 6992 } else
6435 tmp = tmp->parent; 6993 tmp = tmp->parent;
6436 } 6994 }
6437 6995
6438 if (sd && sd_degenerate(sd)) { 6996 if (sd && sd_degenerate(sd)) {
6997 tmp = sd;
6439 sd = sd->parent; 6998 sd = sd->parent;
6999 destroy_sched_domain(tmp, cpu);
6440 if (sd) 7000 if (sd)
6441 sd->child = NULL; 7001 sd->child = NULL;
6442 } 7002 }
@@ -6444,7 +7004,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6444 sched_domain_debug(sd, cpu); 7004 sched_domain_debug(sd, cpu);
6445 7005
6446 rq_attach_root(rq, rd); 7006 rq_attach_root(rq, rd);
7007 tmp = rq->sd;
6447 rcu_assign_pointer(rq->sd, sd); 7008 rcu_assign_pointer(rq->sd, sd);
7009 destroy_sched_domains(tmp, cpu);
6448} 7010}
6449 7011
6450/* cpus with isolated domains */ 7012/* cpus with isolated domains */
@@ -6460,56 +7022,6 @@ static int __init isolated_cpu_setup(char *str)
6460 7022
6461__setup("isolcpus=", isolated_cpu_setup); 7023__setup("isolcpus=", isolated_cpu_setup);
6462 7024
6463/*
6464 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6465 * to a function which identifies what group(along with sched group) a CPU
6466 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
6467 * (due to the fact that we keep track of groups covered with a struct cpumask).
6468 *
6469 * init_sched_build_groups will build a circular linked list of the groups
6470 * covered by the given span, and will set each group's ->cpumask correctly,
6471 * and ->cpu_power to 0.
6472 */
6473static void
6474init_sched_build_groups(const struct cpumask *span,
6475 const struct cpumask *cpu_map,
6476 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6477 struct sched_group **sg,
6478 struct cpumask *tmpmask),
6479 struct cpumask *covered, struct cpumask *tmpmask)
6480{
6481 struct sched_group *first = NULL, *last = NULL;
6482 int i;
6483
6484 cpumask_clear(covered);
6485
6486 for_each_cpu(i, span) {
6487 struct sched_group *sg;
6488 int group = group_fn(i, cpu_map, &sg, tmpmask);
6489 int j;
6490
6491 if (cpumask_test_cpu(i, covered))
6492 continue;
6493
6494 cpumask_clear(sched_group_cpus(sg));
6495 sg->cpu_power = 0;
6496
6497 for_each_cpu(j, span) {
6498 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6499 continue;
6500
6501 cpumask_set_cpu(j, covered);
6502 cpumask_set_cpu(j, sched_group_cpus(sg));
6503 }
6504 if (!first)
6505 first = sg;
6506 if (last)
6507 last->next = sg;
6508 last = sg;
6509 }
6510 last->next = first;
6511}
6512
6513#define SD_NODES_PER_DOMAIN 16 7025#define SD_NODES_PER_DOMAIN 16
6514 7026
6515#ifdef CONFIG_NUMA 7027#ifdef CONFIG_NUMA
@@ -6526,7 +7038,7 @@ init_sched_build_groups(const struct cpumask *span,
6526 */ 7038 */
6527static int find_next_best_node(int node, nodemask_t *used_nodes) 7039static int find_next_best_node(int node, nodemask_t *used_nodes)
6528{ 7040{
6529 int i, n, val, min_val, best_node = 0; 7041 int i, n, val, min_val, best_node = -1;
6530 7042
6531 min_val = INT_MAX; 7043 min_val = INT_MAX;
6532 7044
@@ -6550,7 +7062,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
6550 } 7062 }
6551 } 7063 }
6552 7064
6553 node_set(best_node, *used_nodes); 7065 if (best_node != -1)
7066 node_set(best_node, *used_nodes);
6554 return best_node; 7067 return best_node;
6555} 7068}
6556 7069
@@ -6576,293 +7089,197 @@ static void sched_domain_node_span(int node, struct cpumask *span)
6576 7089
6577 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 7090 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6578 int next_node = find_next_best_node(node, &used_nodes); 7091 int next_node = find_next_best_node(node, &used_nodes);
6579 7092 if (next_node < 0)
7093 break;
6580 cpumask_or(span, span, cpumask_of_node(next_node)); 7094 cpumask_or(span, span, cpumask_of_node(next_node));
6581 } 7095 }
6582} 7096}
7097
7098static const struct cpumask *cpu_node_mask(int cpu)
7099{
7100 lockdep_assert_held(&sched_domains_mutex);
7101
7102 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
7103
7104 return sched_domains_tmpmask;
7105}
7106
7107static const struct cpumask *cpu_allnodes_mask(int cpu)
7108{
7109 return cpu_possible_mask;
7110}
6583#endif /* CONFIG_NUMA */ 7111#endif /* CONFIG_NUMA */
6584 7112
6585int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 7113static const struct cpumask *cpu_cpu_mask(int cpu)
7114{
7115 return cpumask_of_node(cpu_to_node(cpu));
7116}
6586 7117
6587/* 7118int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6588 * The cpus mask in sched_group and sched_domain hangs off the end.
6589 *
6590 * ( See the the comments in include/linux/sched.h:struct sched_group
6591 * and struct sched_domain. )
6592 */
6593struct static_sched_group {
6594 struct sched_group sg;
6595 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
6596};
6597 7119
6598struct static_sched_domain { 7120struct sd_data {
6599 struct sched_domain sd; 7121 struct sched_domain **__percpu sd;
6600 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 7122 struct sched_group **__percpu sg;
7123 struct sched_group_power **__percpu sgp;
6601}; 7124};
6602 7125
6603struct s_data { 7126struct s_data {
6604#ifdef CONFIG_NUMA 7127 struct sched_domain ** __percpu sd;
6605 int sd_allnodes;
6606 cpumask_var_t domainspan;
6607 cpumask_var_t covered;
6608 cpumask_var_t notcovered;
6609#endif
6610 cpumask_var_t nodemask;
6611 cpumask_var_t this_sibling_map;
6612 cpumask_var_t this_core_map;
6613 cpumask_var_t send_covered;
6614 cpumask_var_t tmpmask;
6615 struct sched_group **sched_group_nodes;
6616 struct root_domain *rd; 7128 struct root_domain *rd;
6617}; 7129};
6618 7130
6619enum s_alloc { 7131enum s_alloc {
6620 sa_sched_groups = 0,
6621 sa_rootdomain, 7132 sa_rootdomain,
6622 sa_tmpmask, 7133 sa_sd,
6623 sa_send_covered, 7134 sa_sd_storage,
6624 sa_this_core_map,
6625 sa_this_sibling_map,
6626 sa_nodemask,
6627 sa_sched_group_nodes,
6628#ifdef CONFIG_NUMA
6629 sa_notcovered,
6630 sa_covered,
6631 sa_domainspan,
6632#endif
6633 sa_none, 7135 sa_none,
6634}; 7136};
6635 7137
6636/* 7138struct sched_domain_topology_level;
6637 * SMT sched-domains:
6638 */
6639#ifdef CONFIG_SCHED_SMT
6640static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
6641static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
6642 7139
6643static int 7140typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
6644cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 7141typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
6645 struct sched_group **sg, struct cpumask *unused)
6646{
6647 if (sg)
6648 *sg = &per_cpu(sched_groups, cpu).sg;
6649 return cpu;
6650}
6651#endif /* CONFIG_SCHED_SMT */
6652 7142
6653/* 7143#define SDTL_OVERLAP 0x01
6654 * multi-core sched-domains:
6655 */
6656#ifdef CONFIG_SCHED_MC
6657static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
6658static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6659#endif /* CONFIG_SCHED_MC */
6660 7144
6661#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 7145struct sched_domain_topology_level {
6662static int 7146 sched_domain_init_f init;
6663cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 7147 sched_domain_mask_f mask;
6664 struct sched_group **sg, struct cpumask *mask) 7148 int flags;
6665{ 7149 struct sd_data data;
6666 int group; 7150};
6667 7151
6668 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6669 group = cpumask_first(mask);
6670 if (sg)
6671 *sg = &per_cpu(sched_group_core, group).sg;
6672 return group;
6673}
6674#elif defined(CONFIG_SCHED_MC)
6675static int 7152static int
6676cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 7153build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6677 struct sched_group **sg, struct cpumask *unused)
6678{ 7154{
6679 if (sg) 7155 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
6680 *sg = &per_cpu(sched_group_core, cpu).sg; 7156 const struct cpumask *span = sched_domain_span(sd);
6681 return cpu; 7157 struct cpumask *covered = sched_domains_tmpmask;
6682} 7158 struct sd_data *sdd = sd->private;
6683#endif 7159 struct sched_domain *child;
7160 int i;
6684 7161
6685static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 7162 cpumask_clear(covered);
6686static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
6687 7163
6688static int 7164 for_each_cpu(i, span) {
6689cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, 7165 struct cpumask *sg_span;
6690 struct sched_group **sg, struct cpumask *mask)
6691{
6692 int group;
6693#ifdef CONFIG_SCHED_MC
6694 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6695 group = cpumask_first(mask);
6696#elif defined(CONFIG_SCHED_SMT)
6697 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6698 group = cpumask_first(mask);
6699#else
6700 group = cpu;
6701#endif
6702 if (sg)
6703 *sg = &per_cpu(sched_group_phys, group).sg;
6704 return group;
6705}
6706 7166
6707#ifdef CONFIG_NUMA 7167 if (cpumask_test_cpu(i, covered))
6708/* 7168 continue;
6709 * The init_sched_build_groups can't handle what we want to do with node
6710 * groups, so roll our own. Now each node has its own list of groups which
6711 * gets dynamically allocated.
6712 */
6713static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
6714static struct sched_group ***sched_group_nodes_bycpu;
6715 7169
6716static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); 7170 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6717static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); 7171 GFP_KERNEL, cpu_to_node(i));
6718 7172
6719static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, 7173 if (!sg)
6720 struct sched_group **sg, 7174 goto fail;
6721 struct cpumask *nodemask)
6722{
6723 int group;
6724 7175
6725 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); 7176 sg_span = sched_group_cpus(sg);
6726 group = cpumask_first(nodemask);
6727 7177
6728 if (sg) 7178 child = *per_cpu_ptr(sdd->sd, i);
6729 *sg = &per_cpu(sched_group_allnodes, group).sg; 7179 if (child->child) {
6730 return group; 7180 child = child->child;
6731} 7181 cpumask_copy(sg_span, sched_domain_span(child));
7182 } else
7183 cpumask_set_cpu(i, sg_span);
6732 7184
6733static void init_numa_sched_groups_power(struct sched_group *group_head) 7185 cpumask_or(covered, covered, sg_span);
6734{
6735 struct sched_group *sg = group_head;
6736 int j;
6737 7186
6738 if (!sg) 7187 sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
6739 return; 7188 atomic_inc(&sg->sgp->ref);
6740 do {
6741 for_each_cpu(j, sched_group_cpus(sg)) {
6742 struct sched_domain *sd;
6743 7189
6744 sd = &per_cpu(phys_domains, j).sd; 7190 if (cpumask_test_cpu(cpu, sg_span))
6745 if (j != group_first_cpu(sd->groups)) { 7191 groups = sg;
6746 /*
6747 * Only add "power" once for each
6748 * physical package.
6749 */
6750 continue;
6751 }
6752 7192
6753 sg->cpu_power += sd->groups->cpu_power; 7193 if (!first)
6754 } 7194 first = sg;
6755 sg = sg->next; 7195 if (last)
6756 } while (sg != group_head); 7196 last->next = sg;
7197 last = sg;
7198 last->next = first;
7199 }
7200 sd->groups = groups;
7201
7202 return 0;
7203
7204fail:
7205 free_sched_groups(first, 0);
7206
7207 return -ENOMEM;
6757} 7208}
6758 7209
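Editor's note: build_overlap_sched_groups() above covers the domain's span by walking the CPUs in order, skipping CPUs that an earlier group already covers, emitting one group per uncovered child span, and linking the groups into a ring. The covering loop itself is simple set arithmetic; here it is over 64-bit masks, with each "child span" reduced to a per-CPU table lookup. The topology and names are made up for illustration, not the kernel's data layout.

#include <stdint.h>
#include <stdio.h>

#define NCPU 8

/*
 * child_span[i]: the CPUs grouped with CPU i one level below
 * (e.g. its sibling pair).  Purely invented topology.
 */
static const uint64_t child_span[NCPU] = {
	0x03, 0x03, 0x0c, 0x0c, 0x30, 0x30, 0xc0, 0xc0
};

int main(void)
{
	uint64_t span = 0xff;		/* CPUs this domain spans */
	uint64_t covered = 0;
	int groups = 0;

	for (int i = 0; i < NCPU; i++) {
		if (!(span & (1ULL << i)))
			continue;
		if (covered & (1ULL << i))
			continue;	/* already in an earlier group */

		uint64_t sg_span = child_span[i] & span;

		covered |= sg_span;
		groups++;
		printf("group %d: mask %#llx\n", groups,
		       (unsigned long long)sg_span);
	}
	/* The real code also links the groups into a circular list. */
	return 0;
}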
6759static int build_numa_sched_groups(struct s_data *d, 7210static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6760 const struct cpumask *cpu_map, int num)
6761{ 7211{
6762 struct sched_domain *sd; 7212 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6763 struct sched_group *sg, *prev; 7213 struct sched_domain *child = sd->child;
6764 int n, j;
6765 7214
6766 cpumask_clear(d->covered); 7215 if (child)
6767 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); 7216 cpu = cpumask_first(sched_domain_span(child));
6768 if (cpumask_empty(d->nodemask)) { 7217
6769 d->sched_group_nodes[num] = NULL; 7218 if (sg) {
6770 goto out; 7219 *sg = *per_cpu_ptr(sdd->sg, cpu);
7220 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
7221 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
6771 } 7222 }
6772 7223
6773 sched_domain_node_span(num, d->domainspan); 7224 return cpu;
6774 cpumask_and(d->domainspan, d->domainspan, cpu_map); 7225}
6775 7226
6776 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 7227/*
6777 GFP_KERNEL, num); 7228 * build_sched_groups will build a circular linked list of the groups
6778 if (!sg) { 7229 * covered by the given span, and will set each group's ->cpumask correctly,
6779 printk(KERN_WARNING "Can not alloc domain group for node %d\n", 7230 * and ->cpu_power to 0.
6780 num); 7231 *
6781 return -ENOMEM; 7232 * Assumes the sched_domain tree is fully constructed
6782 } 7233 */
6783 d->sched_group_nodes[num] = sg; 7234static int
7235build_sched_groups(struct sched_domain *sd, int cpu)
7236{
7237 struct sched_group *first = NULL, *last = NULL;
7238 struct sd_data *sdd = sd->private;
7239 const struct cpumask *span = sched_domain_span(sd);
7240 struct cpumask *covered;
7241 int i;
6784 7242
6785 for_each_cpu(j, d->nodemask) { 7243 get_group(cpu, sdd, &sd->groups);
6786 sd = &per_cpu(node_domains, j).sd; 7244 atomic_inc(&sd->groups->ref);
6787 sd->groups = sg;
6788 }
6789 7245
6790 sg->cpu_power = 0; 7246 if (cpu != cpumask_first(sched_domain_span(sd)))
6791 cpumask_copy(sched_group_cpus(sg), d->nodemask); 7247 return 0;
6792 sg->next = sg;
6793 cpumask_or(d->covered, d->covered, d->nodemask);
6794 7248
6795 prev = sg; 7249 lockdep_assert_held(&sched_domains_mutex);
6796 for (j = 0; j < nr_node_ids; j++) { 7250 covered = sched_domains_tmpmask;
6797 n = (num + j) % nr_node_ids;
6798 cpumask_complement(d->notcovered, d->covered);
6799 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
6800 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
6801 if (cpumask_empty(d->tmpmask))
6802 break;
6803 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
6804 if (cpumask_empty(d->tmpmask))
6805 continue;
6806 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
6807 GFP_KERNEL, num);
6808 if (!sg) {
6809 printk(KERN_WARNING
6810 "Can not alloc domain group for node %d\n", j);
6811 return -ENOMEM;
6812 }
6813 sg->cpu_power = 0;
6814 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
6815 sg->next = prev->next;
6816 cpumask_or(d->covered, d->covered, d->tmpmask);
6817 prev->next = sg;
6818 prev = sg;
6819 }
6820out:
6821 return 0;
6822}
6823#endif /* CONFIG_NUMA */
6824 7251
6825#ifdef CONFIG_NUMA 7252 cpumask_clear(covered);
6826/* Free memory allocated for various sched_group structures */
6827static void free_sched_groups(const struct cpumask *cpu_map,
6828 struct cpumask *nodemask)
6829{
6830 int cpu, i;
6831 7253
6832 for_each_cpu(cpu, cpu_map) { 7254 for_each_cpu(i, span) {
6833 struct sched_group **sched_group_nodes 7255 struct sched_group *sg;
6834 = sched_group_nodes_bycpu[cpu]; 7256 int group = get_group(i, sdd, &sg);
7257 int j;
6835 7258
6836 if (!sched_group_nodes) 7259 if (cpumask_test_cpu(i, covered))
6837 continue; 7260 continue;
6838 7261
6839 for (i = 0; i < nr_node_ids; i++) { 7262 cpumask_clear(sched_group_cpus(sg));
6840 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7263 sg->sgp->power = 0;
6841 7264
6842 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 7265 for_each_cpu(j, span) {
6843 if (cpumask_empty(nodemask)) 7266 if (get_group(j, sdd, NULL) != group)
6844 continue; 7267 continue;
6845 7268
6846 if (sg == NULL) 7269 cpumask_set_cpu(j, covered);
6847 continue; 7270 cpumask_set_cpu(j, sched_group_cpus(sg));
6848 sg = sg->next;
6849next_sg:
6850 oldsg = sg;
6851 sg = sg->next;
6852 kfree(oldsg);
6853 if (oldsg != sched_group_nodes[i])
6854 goto next_sg;
6855 } 7271 }
6856 kfree(sched_group_nodes); 7272
6857 sched_group_nodes_bycpu[cpu] = NULL; 7273 if (!first)
7274 first = sg;
7275 if (last)
7276 last->next = sg;
7277 last = sg;
6858 } 7278 }
7279 last->next = first;
7280
7281 return 0;
6859} 7282}
6860#else /* !CONFIG_NUMA */
6861static void free_sched_groups(const struct cpumask *cpu_map,
6862 struct cpumask *nodemask)
6863{
6864}
6865#endif /* CONFIG_NUMA */
6866 7283
6867/* 7284/*
6868 * Initialize sched groups cpu_power. 7285 * Initialize sched groups cpu_power.
@@ -6876,46 +7293,19 @@ static void free_sched_groups(const struct cpumask *cpu_map,
6876 */ 7293 */
6877static void init_sched_groups_power(int cpu, struct sched_domain *sd) 7294static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6878{ 7295{
6879 struct sched_domain *child; 7296 struct sched_group *sg = sd->groups;
6880 struct sched_group *group;
6881 long power;
6882 int weight;
6883
6884 WARN_ON(!sd || !sd->groups);
6885 7297
6886 if (cpu != group_first_cpu(sd->groups)) 7298 WARN_ON(!sd || !sg);
6887 return;
6888
6889 child = sd->child;
6890 7299
6891 sd->groups->cpu_power = 0; 7300 do {
7301 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
7302 sg = sg->next;
7303 } while (sg != sd->groups);
6892 7304
6893 if (!child) { 7305 if (cpu != group_first_cpu(sg))
6894 power = SCHED_LOAD_SCALE;
6895 weight = cpumask_weight(sched_domain_span(sd));
6896 /*
6897 * SMT siblings share the power of a single core.
6898 * Usually multiple threads get a better yield out of
6899 * that one core than a single thread would have,
6900 * reflect that in sd->smt_gain.
6901 */
6902 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
6903 power *= sd->smt_gain;
6904 power /= weight;
6905 power >>= SCHED_LOAD_SHIFT;
6906 }
6907 sd->groups->cpu_power += power;
6908 return; 7306 return;
6909 }
6910 7307
6911 /* 7308 update_group_power(sd, cpu);
6912 * Add cpu_power of each child group to this groups cpu_power.
6913 */
6914 group = child->groups;
6915 do {
6916 sd->groups->cpu_power += group->cpu_power;
6917 group = group->next;
6918 } while (group != child->groups);
6919} 7309}
6920 7310
6921/* 7311/*
@@ -6929,15 +7319,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6929# define SD_INIT_NAME(sd, type) do { } while (0) 7319# define SD_INIT_NAME(sd, type) do { } while (0)
6930#endif 7320#endif
6931 7321
6932#define SD_INIT(sd, type) sd_init_##type(sd) 7322#define SD_INIT_FUNC(type) \
6933 7323static noinline struct sched_domain * \
6934#define SD_INIT_FUNC(type) \ 7324sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
6935static noinline void sd_init_##type(struct sched_domain *sd) \ 7325{ \
6936{ \ 7326 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
6937 memset(sd, 0, sizeof(*sd)); \ 7327 *sd = SD_##type##_INIT; \
6938 *sd = SD_##type##_INIT; \ 7328 SD_INIT_NAME(sd, type); \
6939 sd->level = SD_LV_##type; \ 7329 sd->private = &tl->data; \
6940 SD_INIT_NAME(sd, type); \ 7330 return sd; \
6941} 7331}
6942 7332
6943SD_INIT_FUNC(CPU) 7333SD_INIT_FUNC(CPU)
@@ -6951,15 +7341,19 @@ SD_INIT_FUNC(CPU)
6951#ifdef CONFIG_SCHED_MC 7341#ifdef CONFIG_SCHED_MC
6952 SD_INIT_FUNC(MC) 7342 SD_INIT_FUNC(MC)
6953#endif 7343#endif
7344#ifdef CONFIG_SCHED_BOOK
7345 SD_INIT_FUNC(BOOK)
7346#endif
6954 7347
6955static int default_relax_domain_level = -1; 7348static int default_relax_domain_level = -1;
7349int sched_domain_level_max;
6956 7350
6957static int __init setup_relax_domain_level(char *str) 7351static int __init setup_relax_domain_level(char *str)
6958{ 7352{
6959 unsigned long val; 7353 unsigned long val;
6960 7354
6961 val = simple_strtoul(str, NULL, 0); 7355 val = simple_strtoul(str, NULL, 0);
6962 if (val < SD_LV_MAX) 7356 if (val < sched_domain_level_max)
6963 default_relax_domain_level = val; 7357 default_relax_domain_level = val;
6964 7358
6965 return 1; 7359 return 1;
@@ -6987,35 +7381,20 @@ static void set_domain_attribute(struct sched_domain *sd,
6987 } 7381 }
6988} 7382}
6989 7383
7384static void __sdt_free(const struct cpumask *cpu_map);
7385static int __sdt_alloc(const struct cpumask *cpu_map);
7386
6990static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 7387static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6991 const struct cpumask *cpu_map) 7388 const struct cpumask *cpu_map)
6992{ 7389{
6993 switch (what) { 7390 switch (what) {
6994 case sa_sched_groups:
6995 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
6996 d->sched_group_nodes = NULL;
6997 case sa_rootdomain: 7391 case sa_rootdomain:
6998 free_rootdomain(d->rd); /* fall through */ 7392 if (!atomic_read(&d->rd->refcount))
6999 case sa_tmpmask: 7393 free_rootdomain(&d->rd->rcu); /* fall through */
7000 free_cpumask_var(d->tmpmask); /* fall through */ 7394 case sa_sd:
7001 case sa_send_covered: 7395 free_percpu(d->sd); /* fall through */
7002 free_cpumask_var(d->send_covered); /* fall through */ 7396 case sa_sd_storage:
7003 case sa_this_core_map: 7397 __sdt_free(cpu_map); /* fall through */
7004 free_cpumask_var(d->this_core_map); /* fall through */
7005 case sa_this_sibling_map:
7006 free_cpumask_var(d->this_sibling_map); /* fall through */
7007 case sa_nodemask:
7008 free_cpumask_var(d->nodemask); /* fall through */
7009 case sa_sched_group_nodes:
7010#ifdef CONFIG_NUMA
7011 kfree(d->sched_group_nodes); /* fall through */
7012 case sa_notcovered:
7013 free_cpumask_var(d->notcovered); /* fall through */
7014 case sa_covered:
7015 free_cpumask_var(d->covered); /* fall through */
7016 case sa_domainspan:
7017 free_cpumask_var(d->domainspan); /* fall through */
7018#endif
7019 case sa_none: 7398 case sa_none:
7020 break; 7399 break;
7021 } 7400 }
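Editor's note: __free_domain_allocs() keeps the "unwind exactly as far as we got" idiom even though the enum s_alloc ladder shrinks to three stages; each switch case deliberately falls through to the ones below it, so passing in the stage reached at failure time frees everything allocated before that point and nothing more. The same pattern in miniature, with stage names invented for the sketch:

#include <stdlib.h>
#include <stdio.h>

enum stage { st_all, st_second, st_first, st_none };

struct ctx { void *first, *second, *third; };

/* Free in reverse order of allocation; each case falls through. */
static void unwind(struct ctx *c, enum stage reached)
{
	switch (reached) {
	case st_all:
		free(c->third);	 /* fall through */
	case st_second:
		free(c->second); /* fall through */
	case st_first:
		free(c->first);	 /* fall through */
	case st_none:
		break;
	}
}

static int setup(struct ctx *c)
{
	enum stage got = st_none;

	if (!(c->first = malloc(16)))
		goto out;
	got = st_first;
	if (!(c->second = malloc(16)))
		goto out;
	got = st_second;
	if (!(c->third = malloc(16)))
		goto out;
	return 0;	/* success: the caller now owns everything */
out:
	unwind(c, got);
	return -1;
}

int main(void)
{
	struct ctx c = { 0 };

	if (setup(&c) == 0) {
		puts("setup ok");
		unwind(&c, st_all);	/* normal teardown */
	} else {
		puts("setup failed, partial state already unwound");
	}
	return 0;
}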
@@ -7024,270 +7403,233 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7024static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 7403static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7025 const struct cpumask *cpu_map) 7404 const struct cpumask *cpu_map)
7026{ 7405{
7027#ifdef CONFIG_NUMA 7406 memset(d, 0, sizeof(*d));
7028 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) 7407
7029 return sa_none; 7408 if (__sdt_alloc(cpu_map))
7030 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) 7409 return sa_sd_storage;
7031 return sa_domainspan; 7410 d->sd = alloc_percpu(struct sched_domain *);
7032 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) 7411 if (!d->sd)
7033 return sa_covered; 7412 return sa_sd_storage;
7034 /* Allocate the per-node list of sched groups */
7035 d->sched_group_nodes = kcalloc(nr_node_ids,
7036 sizeof(struct sched_group *), GFP_KERNEL);
7037 if (!d->sched_group_nodes) {
7038 printk(KERN_WARNING "Can not alloc sched group node list\n");
7039 return sa_notcovered;
7040 }
7041 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
7042#endif
7043 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
7044 return sa_sched_group_nodes;
7045 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
7046 return sa_nodemask;
7047 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
7048 return sa_this_sibling_map;
7049 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7050 return sa_this_core_map;
7051 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
7052 return sa_send_covered;
7053 d->rd = alloc_rootdomain(); 7413 d->rd = alloc_rootdomain();
7054 if (!d->rd) { 7414 if (!d->rd)
7055 printk(KERN_WARNING "Cannot alloc root domain\n"); 7415 return sa_sd;
7056 return sa_tmpmask;
7057 }
7058 return sa_rootdomain; 7416 return sa_rootdomain;
7059} 7417}
7060 7418
7061static struct sched_domain *__build_numa_sched_domains(struct s_data *d, 7419/*
7062 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) 7420 * NULL the sd_data elements we've used to build the sched_domain and
7421 * sched_group structure so that the subsequent __free_domain_allocs()
7422 * will not free the data we're using.
7423 */
7424static void claim_allocations(int cpu, struct sched_domain *sd)
7063{ 7425{
7064 struct sched_domain *sd = NULL; 7426 struct sd_data *sdd = sd->private;
7065#ifdef CONFIG_NUMA
7066 struct sched_domain *parent;
7067
7068 d->sd_allnodes = 0;
7069 if (cpumask_weight(cpu_map) >
7070 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
7071 sd = &per_cpu(allnodes_domains, i).sd;
7072 SD_INIT(sd, ALLNODES);
7073 set_domain_attribute(sd, attr);
7074 cpumask_copy(sched_domain_span(sd), cpu_map);
7075 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
7076 d->sd_allnodes = 1;
7077 }
7078 parent = sd;
7079
7080 sd = &per_cpu(node_domains, i).sd;
7081 SD_INIT(sd, NODE);
7082 set_domain_attribute(sd, attr);
7083 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7084 sd->parent = parent;
7085 if (parent)
7086 parent->child = sd;
7087 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
7088#endif
7089 return sd;
7090}
7091 7427
7092static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, 7428 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
7093 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7429 *per_cpu_ptr(sdd->sd, cpu) = NULL;
7094 struct sched_domain *parent, int i)
7095{
7096 struct sched_domain *sd;
7097 sd = &per_cpu(phys_domains, i).sd;
7098 SD_INIT(sd, CPU);
7099 set_domain_attribute(sd, attr);
7100 cpumask_copy(sched_domain_span(sd), d->nodemask);
7101 sd->parent = parent;
7102 if (parent)
7103 parent->child = sd;
7104 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
7105 return sd;
7106}
7107 7430
7108static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7431 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
7109 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7432 *per_cpu_ptr(sdd->sg, cpu) = NULL;
7110 struct sched_domain *parent, int i) 7433
7111{ 7434 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
7112 struct sched_domain *sd = parent; 7435 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
7113#ifdef CONFIG_SCHED_MC
7114 sd = &per_cpu(core_domains, i).sd;
7115 SD_INIT(sd, MC);
7116 set_domain_attribute(sd, attr);
7117 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
7118 sd->parent = parent;
7119 parent->child = sd;
7120 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
7121#endif
7122 return sd;
7123} 7436}
7124 7437
7125static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
7126 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7127 struct sched_domain *parent, int i)
7128{
7129 struct sched_domain *sd = parent;
7130#ifdef CONFIG_SCHED_SMT 7438#ifdef CONFIG_SCHED_SMT
7131 sd = &per_cpu(cpu_domains, i).sd; 7439static const struct cpumask *cpu_smt_mask(int cpu)
7132 SD_INIT(sd, SIBLING); 7440{
7133 set_domain_attribute(sd, attr); 7441 return topology_thread_cpumask(cpu);
7134 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
7135 sd->parent = parent;
7136 parent->child = sd;
7137 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
7138#endif
7139 return sd;
7140} 7442}
7443#endif
7141 7444
7142static void build_sched_groups(struct s_data *d, enum sched_domain_level l, 7445/*
7143 const struct cpumask *cpu_map, int cpu) 7446 * Topology list, bottom-up.
7144{ 7447 */
7145 switch (l) { 7448static struct sched_domain_topology_level default_topology[] = {
7146#ifdef CONFIG_SCHED_SMT 7449#ifdef CONFIG_SCHED_SMT
7147 case SD_LV_SIBLING: /* set up CPU (sibling) groups */ 7450 { sd_init_SIBLING, cpu_smt_mask, },
7148 cpumask_and(d->this_sibling_map, cpu_map,
7149 topology_thread_cpumask(cpu));
7150 if (cpu == cpumask_first(d->this_sibling_map))
7151 init_sched_build_groups(d->this_sibling_map, cpu_map,
7152 &cpu_to_cpu_group,
7153 d->send_covered, d->tmpmask);
7154 break;
7155#endif 7451#endif
7156#ifdef CONFIG_SCHED_MC 7452#ifdef CONFIG_SCHED_MC
7157 case SD_LV_MC: /* set up multi-core groups */ 7453 { sd_init_MC, cpu_coregroup_mask, },
7158 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
7159 if (cpu == cpumask_first(d->this_core_map))
7160 init_sched_build_groups(d->this_core_map, cpu_map,
7161 &cpu_to_core_group,
7162 d->send_covered, d->tmpmask);
7163 break;
7164#endif 7454#endif
7165 case SD_LV_CPU: /* set up physical groups */ 7455#ifdef CONFIG_SCHED_BOOK
7166 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); 7456 { sd_init_BOOK, cpu_book_mask, },
7167 if (!cpumask_empty(d->nodemask)) 7457#endif
7168 init_sched_build_groups(d->nodemask, cpu_map, 7458 { sd_init_CPU, cpu_cpu_mask, },
7169 &cpu_to_phys_group,
7170 d->send_covered, d->tmpmask);
7171 break;
7172#ifdef CONFIG_NUMA 7459#ifdef CONFIG_NUMA
7173 case SD_LV_ALLNODES: 7460 { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
7174 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, 7461 { sd_init_ALLNODES, cpu_allnodes_mask, },
7175 d->send_covered, d->tmpmask);
7176 break;
7177#endif 7462#endif
7178 default: 7463 { NULL, },
7179 break; 7464};
7465
7466static struct sched_domain_topology_level *sched_domain_topology = default_topology;
7467
7468static int __sdt_alloc(const struct cpumask *cpu_map)
7469{
7470 struct sched_domain_topology_level *tl;
7471 int j;
7472
7473 for (tl = sched_domain_topology; tl->init; tl++) {
7474 struct sd_data *sdd = &tl->data;
7475
7476 sdd->sd = alloc_percpu(struct sched_domain *);
7477 if (!sdd->sd)
7478 return -ENOMEM;
7479
7480 sdd->sg = alloc_percpu(struct sched_group *);
7481 if (!sdd->sg)
7482 return -ENOMEM;
7483
7484 sdd->sgp = alloc_percpu(struct sched_group_power *);
7485 if (!sdd->sgp)
7486 return -ENOMEM;
7487
7488 for_each_cpu(j, cpu_map) {
7489 struct sched_domain *sd;
7490 struct sched_group *sg;
7491 struct sched_group_power *sgp;
7492
7493 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
7494 GFP_KERNEL, cpu_to_node(j));
7495 if (!sd)
7496 return -ENOMEM;
7497
7498 *per_cpu_ptr(sdd->sd, j) = sd;
7499
7500 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7501 GFP_KERNEL, cpu_to_node(j));
7502 if (!sg)
7503 return -ENOMEM;
7504
7505 *per_cpu_ptr(sdd->sg, j) = sg;
7506
7507 sgp = kzalloc_node(sizeof(struct sched_group_power),
7508 GFP_KERNEL, cpu_to_node(j));
7509 if (!sgp)
7510 return -ENOMEM;
7511
7512 *per_cpu_ptr(sdd->sgp, j) = sgp;
7513 }
7514 }
7515
7516 return 0;
7517}
7518
7519static void __sdt_free(const struct cpumask *cpu_map)
7520{
7521 struct sched_domain_topology_level *tl;
7522 int j;
7523
7524 for (tl = sched_domain_topology; tl->init; tl++) {
7525 struct sd_data *sdd = &tl->data;
7526
7527 for_each_cpu(j, cpu_map) {
7528 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
7529 if (sd && (sd->flags & SD_OVERLAP))
7530 free_sched_groups(sd->groups, 0);
7531 kfree(*per_cpu_ptr(sdd->sg, j));
7532 kfree(*per_cpu_ptr(sdd->sgp, j));
7533 }
7534 free_percpu(sdd->sd);
7535 free_percpu(sdd->sg);
7536 free_percpu(sdd->sgp);
7537 }
7538}
7539
7540struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
7541 struct s_data *d, const struct cpumask *cpu_map,
7542 struct sched_domain_attr *attr, struct sched_domain *child,
7543 int cpu)
7544{
7545 struct sched_domain *sd = tl->init(tl, cpu);
7546 if (!sd)
7547 return child;
7548
7549 set_domain_attribute(sd, attr);
7550 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
7551 if (child) {
7552 sd->level = child->level + 1;
7553 sched_domain_level_max = max(sched_domain_level_max, sd->level);
7554 child->parent = sd;
7180 } 7555 }
7556 sd->child = child;
7557
7558 return sd;
7181} 7559}
7182 7560
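Editor's note: build_sched_domain() above creates one domain per topology level, spans it with cpu_map intersected with tl->mask(cpu), and stacks it on top of the previously built level, wiring child->parent as it goes and tracking the deepest level seen. The sketch below models only that bottom-up chaining over a table of per-level masks; the struct and masks are invented for illustration.

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>

struct dom {
	int level;
	uint64_t span;
	struct dom *parent, *child;
};

/* Link a freshly built level above @child, as build_sched_domain() does. */
static struct dom *link_level(struct dom *child, uint64_t span)
{
	struct dom *sd = calloc(1, sizeof(*sd));

	sd->span = span;
	if (child) {
		sd->level = child->level + 1;
		child->parent = sd;
	}
	sd->child = child;
	return sd;
}

int main(void)
{
	/* Masks for one CPU, bottom-up: SMT pair, core cluster, package. */
	const uint64_t level_mask[] = { 0x03, 0x0f, 0xff };
	struct dom *sd = NULL;

	for (unsigned i = 0; i < sizeof(level_mask) / sizeof(level_mask[0]); i++)
		sd = link_level(sd, level_mask[i]);

	for (struct dom *d = sd; d; d = d->child)
		printf("level %d span %#llx\n", d->level,
		       (unsigned long long)d->span);
	return 0;
}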
7183/* 7561/*
7184 * Build sched domains for a given set of cpus and attach the sched domains 7562 * Build sched domains for a given set of cpus and attach the sched domains
7185 * to the individual cpus 7563 * to the individual cpus
7186 */ 7564 */
7187static int __build_sched_domains(const struct cpumask *cpu_map, 7565static int build_sched_domains(const struct cpumask *cpu_map,
7188 struct sched_domain_attr *attr) 7566 struct sched_domain_attr *attr)
7189{ 7567{
7190 enum s_alloc alloc_state = sa_none; 7568 enum s_alloc alloc_state = sa_none;
7191 struct s_data d;
7192 struct sched_domain *sd; 7569 struct sched_domain *sd;
7193 int i; 7570 struct s_data d;
7194#ifdef CONFIG_NUMA 7571 int i, ret = -ENOMEM;
7195 d.sd_allnodes = 0;
7196#endif
7197 7572
7198 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7573 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7199 if (alloc_state != sa_rootdomain) 7574 if (alloc_state != sa_rootdomain)
7200 goto error; 7575 goto error;
7201 alloc_state = sa_sched_groups;
7202
7203 /*
7204 * Set up domains for cpus specified by the cpu_map.
7205 */
7206 for_each_cpu(i, cpu_map) {
7207 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
7208 cpu_map);
7209
7210 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
7211 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
7212 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7213 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7214 }
7215 7576
7577 /* Set up domains for cpus specified by the cpu_map. */
7216 for_each_cpu(i, cpu_map) { 7578 for_each_cpu(i, cpu_map) {
7217 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); 7579 struct sched_domain_topology_level *tl;
7218 build_sched_groups(&d, SD_LV_MC, cpu_map, i); 7580
7219 } 7581 sd = NULL;
7220 7582 for (tl = sched_domain_topology; tl->init; tl++) {
7221 /* Set up physical groups */ 7583 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
7222 for (i = 0; i < nr_node_ids; i++) 7584 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
7223 build_sched_groups(&d, SD_LV_CPU, cpu_map, i); 7585 sd->flags |= SD_OVERLAP;
7224 7586 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
7225#ifdef CONFIG_NUMA 7587 break;
7226 /* Set up node groups */ 7588 }
7227 if (d.sd_allnodes)
7228 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
7229 7589
7230 for (i = 0; i < nr_node_ids; i++) 7590 while (sd->child)
7231 if (build_numa_sched_groups(&d, cpu_map, i)) 7591 sd = sd->child;
7232 goto error;
7233#endif
7234 7592
7235 /* Calculate CPU power for physical packages and nodes */ 7593 *per_cpu_ptr(d.sd, i) = sd;
7236#ifdef CONFIG_SCHED_SMT
7237 for_each_cpu(i, cpu_map) {
7238 sd = &per_cpu(cpu_domains, i).sd;
7239 init_sched_groups_power(i, sd);
7240 }
7241#endif
7242#ifdef CONFIG_SCHED_MC
7243 for_each_cpu(i, cpu_map) {
7244 sd = &per_cpu(core_domains, i).sd;
7245 init_sched_groups_power(i, sd);
7246 } 7594 }
7247#endif
7248 7595
7596 /* Build the groups for the domains */
7249 for_each_cpu(i, cpu_map) { 7597 for_each_cpu(i, cpu_map) {
7250 sd = &per_cpu(phys_domains, i).sd; 7598 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7251 init_sched_groups_power(i, sd); 7599 sd->span_weight = cpumask_weight(sched_domain_span(sd));
7600 if (sd->flags & SD_OVERLAP) {
7601 if (build_overlap_sched_groups(sd, i))
7602 goto error;
7603 } else {
7604 if (build_sched_groups(sd, i))
7605 goto error;
7606 }
7607 }
7252 } 7608 }
7253 7609
7254#ifdef CONFIG_NUMA 7610 /* Calculate CPU power for physical packages and nodes */
7255 for (i = 0; i < nr_node_ids; i++) 7611 for (i = nr_cpumask_bits-1; i >= 0; i--) {
7256 init_numa_sched_groups_power(d.sched_group_nodes[i]); 7612 if (!cpumask_test_cpu(i, cpu_map))
7257 7613 continue;
7258 if (d.sd_allnodes) {
7259 struct sched_group *sg;
7260 7614
7261 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 7615 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7262 d.tmpmask); 7616 claim_allocations(i, sd);
7263 init_numa_sched_groups_power(sg); 7617 init_sched_groups_power(i, sd);
7618 }
7264 } 7619 }
7265#endif
7266 7620
7267 /* Attach the domains */ 7621 /* Attach the domains */
7622 rcu_read_lock();
7268 for_each_cpu(i, cpu_map) { 7623 for_each_cpu(i, cpu_map) {
7269#ifdef CONFIG_SCHED_SMT 7624 sd = *per_cpu_ptr(d.sd, i);
7270 sd = &per_cpu(cpu_domains, i).sd;
7271#elif defined(CONFIG_SCHED_MC)
7272 sd = &per_cpu(core_domains, i).sd;
7273#else
7274 sd = &per_cpu(phys_domains, i).sd;
7275#endif
7276 cpu_attach_domain(sd, d.rd, i); 7625 cpu_attach_domain(sd, d.rd, i);
7277 } 7626 }
7627 rcu_read_unlock();
7278 7628
7279 d.sched_group_nodes = NULL; /* don't free this we still need it */ 7629 ret = 0;
7280 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
7281 return 0;
7282
7283error: 7630error:
7284 __free_domain_allocs(&d, alloc_state, cpu_map); 7631 __free_domain_allocs(&d, alloc_state, cpu_map);
7285 return -ENOMEM; 7632 return ret;
7286}
7287
7288static int build_sched_domains(const struct cpumask *cpu_map)
7289{
7290 return __build_sched_domains(cpu_map, NULL);
7291} 7633}
7292 7634
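Editor's note: claim_allocations(), called from build_sched_domains() above, is the ownership hand-off that makes the shared error path safe: once a per-cpu scratch slot from __sdt_alloc() has been wired into the live topology, the slot is set to NULL so __sdt_free() will skip it, and only still-unclaimed objects get freed on error or teardown. A compact model of that "steal the pointer from the scratch table" idiom follows; all names are invented for the sketch.

#include <stdlib.h>
#include <stdio.h>

#define NSLOT 4

static void *scratch[NSLOT];	/* models the per-cpu sd_data tables */

/* Take ownership of slot @i; the generic free below will now skip it. */
static void *claim(int i)
{
	void *p = scratch[i];

	scratch[i] = NULL;
	return p;
}

/* Models __sdt_free(): frees whatever was never claimed. */
static void scratch_free(void)
{
	for (int i = 0; i < NSLOT; i++) {
		free(scratch[i]);
		scratch[i] = NULL;
	}
}

int main(void)
{
	void *live[NSLOT] = { 0 };

	for (int i = 0; i < NSLOT; i++)
		scratch[i] = malloc(32);

	/* Wire up only the first two slots, as if building succeeded there. */
	live[0] = claim(0);
	live[1] = claim(1);

	scratch_free();		/* teardown path: frees slots 2 and 3 only */

	for (int i = 0; i < 2; i++)
		free(live[i]);	/* claimed objects are freed by their new owner */

	printf("done\n");
	return 0;
}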
7293static cpumask_var_t *doms_cur; /* current sched domains */ 7635static cpumask_var_t *doms_cur; /* current sched domains */
@@ -7342,7 +7684,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7342 * For now this just excludes isolated cpus, but could be used to 7684 * For now this just excludes isolated cpus, but could be used to
7343 * exclude other special cases in the future. 7685 * exclude other special cases in the future.
7344 */ 7686 */
7345static int arch_init_sched_domains(const struct cpumask *cpu_map) 7687static int init_sched_domains(const struct cpumask *cpu_map)
7346{ 7688{
7347 int err; 7689 int err;
7348 7690
@@ -7353,32 +7695,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
7353 doms_cur = &fallback_doms; 7695 doms_cur = &fallback_doms;
7354 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 7696 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7355 dattr_cur = NULL; 7697 dattr_cur = NULL;
7356 err = build_sched_domains(doms_cur[0]); 7698 err = build_sched_domains(doms_cur[0], NULL);
7357 register_sched_domain_sysctl(); 7699 register_sched_domain_sysctl();
7358 7700
7359 return err; 7701 return err;
7360} 7702}
7361 7703
7362static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7363 struct cpumask *tmpmask)
7364{
7365 free_sched_groups(cpu_map, tmpmask);
7366}
7367
7368/* 7704/*
7369 * Detach sched domains from a group of cpus specified in cpu_map 7705 * Detach sched domains from a group of cpus specified in cpu_map
7370 * These cpus will now be attached to the NULL domain 7706 * These cpus will now be attached to the NULL domain
7371 */ 7707 */
7372static void detach_destroy_domains(const struct cpumask *cpu_map) 7708static void detach_destroy_domains(const struct cpumask *cpu_map)
7373{ 7709{
7374 /* Save because hotplug lock held. */
7375 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7376 int i; 7710 int i;
7377 7711
7712 rcu_read_lock();
7378 for_each_cpu(i, cpu_map) 7713 for_each_cpu(i, cpu_map)
7379 cpu_attach_domain(NULL, &def_root_domain, i); 7714 cpu_attach_domain(NULL, &def_root_domain, i);
7380 synchronize_sched(); 7715 rcu_read_unlock();
7381 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7382} 7716}
7383 7717
7384/* handle null as "default" */ 7718/* handle null as "default" */
@@ -7467,8 +7801,7 @@ match1:
7467 goto match2; 7801 goto match2;
7468 } 7802 }
7469 /* no match - add a new doms_new */ 7803 /* no match - add a new doms_new */
7470 __build_sched_domains(doms_new[i], 7804 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
7471 dattr_new ? dattr_new + i : NULL);
7472match2: 7805match2:
7473 ; 7806 ;
7474 } 7807 }
@@ -7487,7 +7820,7 @@ match2:
7487} 7820}
7488 7821
7489#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7822#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7490static void arch_reinit_sched_domains(void) 7823static void reinit_sched_domains(void)
7491{ 7824{
7492 get_online_cpus(); 7825 get_online_cpus();
7493 7826
@@ -7520,7 +7853,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7520 else 7853 else
7521 sched_mc_power_savings = level; 7854 sched_mc_power_savings = level;
7522 7855
7523 arch_reinit_sched_domains(); 7856 reinit_sched_domains();
7524 7857
7525 return count; 7858 return count;
7526} 7859}
@@ -7639,14 +7972,9 @@ void __init sched_init_smp(void)
7639 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7972 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7640 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7973 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7641 7974
7642#if defined(CONFIG_NUMA)
7643 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7644 GFP_KERNEL);
7645 BUG_ON(sched_group_nodes_bycpu == NULL);
7646#endif
7647 get_online_cpus(); 7975 get_online_cpus();
7648 mutex_lock(&sched_domains_mutex); 7976 mutex_lock(&sched_domains_mutex);
7649 arch_init_sched_domains(cpu_active_mask); 7977 init_sched_domains(cpu_active_mask);
7650 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7978 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7651 if (cpumask_empty(non_isolated_cpus)) 7979 if (cpumask_empty(non_isolated_cpus))
7652 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7980 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -7691,8 +8019,15 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7691 INIT_LIST_HEAD(&cfs_rq->tasks); 8019 INIT_LIST_HEAD(&cfs_rq->tasks);
7692#ifdef CONFIG_FAIR_GROUP_SCHED 8020#ifdef CONFIG_FAIR_GROUP_SCHED
7693 cfs_rq->rq = rq; 8021 cfs_rq->rq = rq;
8022 /* allow initial update_cfs_load() to truncate */
8023#ifdef CONFIG_SMP
8024 cfs_rq->load_stamp = 1;
8025#endif
7694#endif 8026#endif
7695 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 8027 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8028#ifndef CONFIG_64BIT
8029 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8030#endif
7696} 8031}
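/*
 * Aside: a rough sketch of how the 32-bit min_vruntime_copy pairing is
 * meant to be used (the writer/reader live in sched_fair.c, not in this
 * hunk, so take this as illustrative rather than verbatim).  The updater
 * stores min_vruntime, issues smp_wmb(), then stores min_vruntime_copy;
 * a lockless reader retries until both 64-bit values agree:
 *
 *	u64 copy, min_vruntime;
 *
 *	do {
 *		copy = cfs_rq->min_vruntime_copy;
 *		smp_rmb();
 *		min_vruntime = cfs_rq->min_vruntime;
 *	} while (min_vruntime != copy);
 */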
7697 8032
7698static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) 8033static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
@@ -7733,18 +8068,16 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7733 8068
7734#ifdef CONFIG_FAIR_GROUP_SCHED 8069#ifdef CONFIG_FAIR_GROUP_SCHED
7735static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 8070static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7736 struct sched_entity *se, int cpu, int add, 8071 struct sched_entity *se, int cpu,
7737 struct sched_entity *parent) 8072 struct sched_entity *parent)
7738{ 8073{
7739 struct rq *rq = cpu_rq(cpu); 8074 struct rq *rq = cpu_rq(cpu);
7740 tg->cfs_rq[cpu] = cfs_rq; 8075 tg->cfs_rq[cpu] = cfs_rq;
7741 init_cfs_rq(cfs_rq, rq); 8076 init_cfs_rq(cfs_rq, rq);
7742 cfs_rq->tg = tg; 8077 cfs_rq->tg = tg;
7743 if (add)
7744 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7745 8078
7746 tg->se[cpu] = se; 8079 tg->se[cpu] = se;
7747 /* se could be NULL for init_task_group */ 8080 /* se could be NULL for root_task_group */
7748 if (!se) 8081 if (!se)
7749 return; 8082 return;
7750 8083
@@ -7754,15 +8087,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7754 se->cfs_rq = parent->my_q; 8087 se->cfs_rq = parent->my_q;
7755 8088
7756 se->my_q = cfs_rq; 8089 se->my_q = cfs_rq;
7757 se->load.weight = tg->shares; 8090 update_load_set(&se->load, 0);
7758 se->load.inv_weight = 0;
7759 se->parent = parent; 8091 se->parent = parent;
7760} 8092}
7761#endif 8093#endif
7762 8094
7763#ifdef CONFIG_RT_GROUP_SCHED 8095#ifdef CONFIG_RT_GROUP_SCHED
7764static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 8096static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7765 struct sched_rt_entity *rt_se, int cpu, int add, 8097 struct sched_rt_entity *rt_se, int cpu,
7766 struct sched_rt_entity *parent) 8098 struct sched_rt_entity *parent)
7767{ 8099{
7768 struct rq *rq = cpu_rq(cpu); 8100 struct rq *rq = cpu_rq(cpu);
@@ -7771,8 +8103,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7771 init_rt_rq(rt_rq, rq); 8103 init_rt_rq(rt_rq, rq);
7772 rt_rq->tg = tg; 8104 rt_rq->tg = tg;
7773 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 8105 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
7774 if (add)
7775 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7776 8106
7777 tg->rt_se[cpu] = rt_se; 8107 tg->rt_se[cpu] = rt_se;
7778 if (!rt_se) 8108 if (!rt_se)
@@ -7807,18 +8137,18 @@ void __init sched_init(void)
7807 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 8137 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7808 8138
7809#ifdef CONFIG_FAIR_GROUP_SCHED 8139#ifdef CONFIG_FAIR_GROUP_SCHED
7810 init_task_group.se = (struct sched_entity **)ptr; 8140 root_task_group.se = (struct sched_entity **)ptr;
7811 ptr += nr_cpu_ids * sizeof(void **); 8141 ptr += nr_cpu_ids * sizeof(void **);
7812 8142
7813 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 8143 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7814 ptr += nr_cpu_ids * sizeof(void **); 8144 ptr += nr_cpu_ids * sizeof(void **);
7815 8145
7816#endif /* CONFIG_FAIR_GROUP_SCHED */ 8146#endif /* CONFIG_FAIR_GROUP_SCHED */
7817#ifdef CONFIG_RT_GROUP_SCHED 8147#ifdef CONFIG_RT_GROUP_SCHED
7818 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 8148 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
7819 ptr += nr_cpu_ids * sizeof(void **); 8149 ptr += nr_cpu_ids * sizeof(void **);
7820 8150
7821 init_task_group.rt_rq = (struct rt_rq **)ptr; 8151 root_task_group.rt_rq = (struct rt_rq **)ptr;
7822 ptr += nr_cpu_ids * sizeof(void **); 8152 ptr += nr_cpu_ids * sizeof(void **);
7823 8153
7824#endif /* CONFIG_RT_GROUP_SCHED */ 8154#endif /* CONFIG_RT_GROUP_SCHED */
@@ -7838,20 +8168,16 @@ void __init sched_init(void)
7838 global_rt_period(), global_rt_runtime()); 8168 global_rt_period(), global_rt_runtime());
7839 8169
7840#ifdef CONFIG_RT_GROUP_SCHED 8170#ifdef CONFIG_RT_GROUP_SCHED
7841 init_rt_bandwidth(&init_task_group.rt_bandwidth, 8171 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7842 global_rt_period(), global_rt_runtime()); 8172 global_rt_period(), global_rt_runtime());
7843#endif /* CONFIG_RT_GROUP_SCHED */ 8173#endif /* CONFIG_RT_GROUP_SCHED */
7844 8174
7845#ifdef CONFIG_CGROUP_SCHED 8175#ifdef CONFIG_CGROUP_SCHED
7846 list_add(&init_task_group.list, &task_groups); 8176 list_add(&root_task_group.list, &task_groups);
7847 INIT_LIST_HEAD(&init_task_group.children); 8177 INIT_LIST_HEAD(&root_task_group.children);
7848 8178 autogroup_init(&init_task);
7849#endif /* CONFIG_CGROUP_SCHED */ 8179#endif /* CONFIG_CGROUP_SCHED */
7850 8180
7851#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
7852 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
7853 __alignof__(unsigned long));
7854#endif
7855 for_each_possible_cpu(i) { 8181 for_each_possible_cpu(i) {
7856 struct rq *rq; 8182 struct rq *rq;
7857 8183
@@ -7863,38 +8189,34 @@ void __init sched_init(void)
7863 init_cfs_rq(&rq->cfs, rq); 8189 init_cfs_rq(&rq->cfs, rq);
7864 init_rt_rq(&rq->rt, rq); 8190 init_rt_rq(&rq->rt, rq);
7865#ifdef CONFIG_FAIR_GROUP_SCHED 8191#ifdef CONFIG_FAIR_GROUP_SCHED
7866 init_task_group.shares = init_task_group_load; 8192 root_task_group.shares = root_task_group_load;
7867 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 8193 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7868#ifdef CONFIG_CGROUP_SCHED
7869 /* 8194 /*
7870 * How much cpu bandwidth does init_task_group get? 8195 * How much cpu bandwidth does root_task_group get?
7871 * 8196 *
7872 * In case of task-groups formed thr' the cgroup filesystem, it 8197 * In case of task-groups formed thr' the cgroup filesystem, it
7873 * gets 100% of the cpu resources in the system. This overall 8198 * gets 100% of the cpu resources in the system. This overall
7874 * system cpu resource is divided among the tasks of 8199 * system cpu resource is divided among the tasks of
7875 * init_task_group and its child task-groups in a fair manner, 8200 * root_task_group and its child task-groups in a fair manner,
7876 * based on each entity's (task or task-group's) weight 8201 * based on each entity's (task or task-group's) weight
7877 * (se->load.weight). 8202 * (se->load.weight).
7878 * 8203 *
7879 * In other words, if init_task_group has 10 tasks of weight 8204 * In other words, if root_task_group has 10 tasks of weight
7880 * 1024) and two child groups A0 and A1 (of weight 1024 each), 8205 * 1024) and two child groups A0 and A1 (of weight 1024 each),
7881 * then A0's share of the cpu resource is: 8206 * then A0's share of the cpu resource is:
7882 * 8207 *
7883 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 8208 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
7884 * 8209 *
7885 * We achieve this by letting init_task_group's tasks sit 8210 * We achieve this by letting root_task_group's tasks sit
7886 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 8211 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
7887 */ 8212 */
7888 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 8213 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
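		/*
		 * Spelling the example out: the total weight on the cpu is
		 * 10*1024 (tasks) + 1024 (A0) + 1024 (A1) = 12288, so A0's
		 * bandwidth is 1024 / 12288 ~= 8.33%, and each of the ten
		 * root tasks likewise receives 1024 / 12288 ~= 8.33%.
		 */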
7889#endif
7890#endif /* CONFIG_FAIR_GROUP_SCHED */ 8214#endif /* CONFIG_FAIR_GROUP_SCHED */
7891 8215
7892 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 8216 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
7893#ifdef CONFIG_RT_GROUP_SCHED 8217#ifdef CONFIG_RT_GROUP_SCHED
7894 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 8218 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7895#ifdef CONFIG_CGROUP_SCHED 8219 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
7896 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
7897#endif
7898#endif 8220#endif
7899 8221
7900 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 8222 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -7905,7 +8227,7 @@ void __init sched_init(void)
7905#ifdef CONFIG_SMP 8227#ifdef CONFIG_SMP
7906 rq->sd = NULL; 8228 rq->sd = NULL;
7907 rq->rd = NULL; 8229 rq->rd = NULL;
7908 rq->cpu_power = SCHED_LOAD_SCALE; 8230 rq->cpu_power = SCHED_POWER_SCALE;
7909 rq->post_schedule = 0; 8231 rq->post_schedule = 0;
7910 rq->active_balance = 0; 8232 rq->active_balance = 0;
7911 rq->next_balance = jiffies; 8233 rq->next_balance = jiffies;
@@ -7962,6 +8284,7 @@ void __init sched_init(void)
7962 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 8284 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
7963 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 8285 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
7964#ifdef CONFIG_SMP 8286#ifdef CONFIG_SMP
8287 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
7965#ifdef CONFIG_NO_HZ 8288#ifdef CONFIG_NO_HZ
7966 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 8289 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
7967 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); 8290 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
@@ -7974,8 +8297,6 @@ void __init sched_init(void)
7974 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 8297 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
7975#endif /* SMP */ 8298#endif /* SMP */
7976 8299
7977 perf_event_init();
7978
7979 scheduler_running = 1; 8300 scheduler_running = 1;
7980} 8301}
7981 8302
@@ -7984,7 +8305,7 @@ static inline int preempt_count_equals(int preempt_offset)
7984{ 8305{
7985 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 8306 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
7986 8307
7987 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 8308 return (nested == preempt_offset);
7988} 8309}
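/*
 * Worked example of the comparison above (contexts assumed for
 * illustration): in plain process context preempt_count() and
 * rcu_preempt_depth() are both 0, so nested == 0 and might_sleep() with a
 * preempt_offset of 0 stays silent; after preempt_disable() the count is 1,
 * nested != preempt_offset, and __might_sleep() emits its warning.
 */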
7989 8310
7990void __might_sleep(const char *file, int line, int preempt_offset) 8311void __might_sleep(const char *file, int line, int preempt_offset)
@@ -8019,9 +8340,11 @@ EXPORT_SYMBOL(__might_sleep);
8019#ifdef CONFIG_MAGIC_SYSRQ 8340#ifdef CONFIG_MAGIC_SYSRQ
8020static void normalize_task(struct rq *rq, struct task_struct *p) 8341static void normalize_task(struct rq *rq, struct task_struct *p)
8021{ 8342{
8343 const struct sched_class *prev_class = p->sched_class;
8344 int old_prio = p->prio;
8022 int on_rq; 8345 int on_rq;
8023 8346
8024 on_rq = p->se.on_rq; 8347 on_rq = p->on_rq;
8025 if (on_rq) 8348 if (on_rq)
8026 deactivate_task(rq, p, 0); 8349 deactivate_task(rq, p, 0);
8027 __setscheduler(rq, p, SCHED_NORMAL, 0); 8350 __setscheduler(rq, p, SCHED_NORMAL, 0);
@@ -8029,6 +8352,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
8029 activate_task(rq, p, 0); 8352 activate_task(rq, p, 0);
8030 resched_task(rq->curr); 8353 resched_task(rq->curr);
8031 } 8354 }
8355
8356 check_class_changed(rq, p, prev_class, old_prio);
8032} 8357}
8033 8358
8034void normalize_rt_tasks(void) 8359void normalize_rt_tasks(void)
@@ -8144,7 +8469,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8144{ 8469{
8145 struct cfs_rq *cfs_rq; 8470 struct cfs_rq *cfs_rq;
8146 struct sched_entity *se; 8471 struct sched_entity *se;
8147 struct rq *rq;
8148 int i; 8472 int i;
8149 8473
8150 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 8474 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8157,8 +8481,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8157 tg->shares = NICE_0_LOAD; 8481 tg->shares = NICE_0_LOAD;
8158 8482
8159 for_each_possible_cpu(i) { 8483 for_each_possible_cpu(i) {
8160 rq = cpu_rq(i);
8161
8162 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8484 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8163 GFP_KERNEL, cpu_to_node(i)); 8485 GFP_KERNEL, cpu_to_node(i));
8164 if (!cfs_rq) 8486 if (!cfs_rq)
@@ -8169,26 +8491,32 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8169 if (!se) 8491 if (!se)
8170 goto err_free_rq; 8492 goto err_free_rq;
8171 8493
8172 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 8494 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8173 } 8495 }
8174 8496
8175 return 1; 8497 return 1;
8176 8498
8177 err_free_rq: 8499err_free_rq:
8178 kfree(cfs_rq); 8500 kfree(cfs_rq);
8179 err: 8501err:
8180 return 0; 8502 return 0;
8181} 8503}
8182 8504
8183static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8184{
8185 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
8186 &cpu_rq(cpu)->leaf_cfs_rq_list);
8187}
8188
8189static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8505static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8190{ 8506{
8191 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8507 struct rq *rq = cpu_rq(cpu);
8508 unsigned long flags;
8509
8510 /*
8511 * Only empty task groups can be destroyed, so we can speculatively
8512 * check on_list without danger of it being re-added.
8513 */
8514 if (!tg->cfs_rq[cpu]->on_list)
8515 return;
8516
8517 raw_spin_lock_irqsave(&rq->lock, flags);
8518 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8519 raw_spin_unlock_irqrestore(&rq->lock, flags);
8192} 8520}
8193#else /* !CONFG_FAIR_GROUP_SCHED */ 8521#else /* !CONFG_FAIR_GROUP_SCHED */
8194static inline void free_fair_sched_group(struct task_group *tg) 8522static inline void free_fair_sched_group(struct task_group *tg)
@@ -8201,10 +8529,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8201 return 1; 8529 return 1;
8202} 8530}
8203 8531
8204static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8205{
8206}
8207
8208static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8532static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8209{ 8533{
8210} 8534}
@@ -8233,7 +8557,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8233{ 8557{
8234 struct rt_rq *rt_rq; 8558 struct rt_rq *rt_rq;
8235 struct sched_rt_entity *rt_se; 8559 struct sched_rt_entity *rt_se;
8236 struct rq *rq;
8237 int i; 8560 int i;
8238 8561
8239 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 8562 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8247,8 +8570,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8247 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 8570 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8248 8571
8249 for_each_possible_cpu(i) { 8572 for_each_possible_cpu(i) {
8250 rq = cpu_rq(i);
8251
8252 rt_rq = kzalloc_node(sizeof(struct rt_rq), 8573 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8253 GFP_KERNEL, cpu_to_node(i)); 8574 GFP_KERNEL, cpu_to_node(i));
8254 if (!rt_rq) 8575 if (!rt_rq)
@@ -8259,27 +8580,16 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8259 if (!rt_se) 8580 if (!rt_se)
8260 goto err_free_rq; 8581 goto err_free_rq;
8261 8582
8262 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 8583 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8263 } 8584 }
8264 8585
8265 return 1; 8586 return 1;
8266 8587
8267 err_free_rq: 8588err_free_rq:
8268 kfree(rt_rq); 8589 kfree(rt_rq);
8269 err: 8590err:
8270 return 0; 8591 return 0;
8271} 8592}
8272
8273static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8274{
8275 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
8276 &cpu_rq(cpu)->leaf_rt_rq_list);
8277}
8278
8279static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8280{
8281 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8282}
8283#else /* !CONFIG_RT_GROUP_SCHED */ 8593#else /* !CONFIG_RT_GROUP_SCHED */
8284static inline void free_rt_sched_group(struct task_group *tg) 8594static inline void free_rt_sched_group(struct task_group *tg)
8285{ 8595{
@@ -8290,14 +8600,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8290{ 8600{
8291 return 1; 8601 return 1;
8292} 8602}
8293
8294static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8295{
8296}
8297
8298static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8299{
8300}
8301#endif /* CONFIG_RT_GROUP_SCHED */ 8603#endif /* CONFIG_RT_GROUP_SCHED */
8302 8604
8303#ifdef CONFIG_CGROUP_SCHED 8605#ifdef CONFIG_CGROUP_SCHED
@@ -8305,6 +8607,7 @@ static void free_sched_group(struct task_group *tg)
8305{ 8607{
8306 free_fair_sched_group(tg); 8608 free_fair_sched_group(tg);
8307 free_rt_sched_group(tg); 8609 free_rt_sched_group(tg);
8610 autogroup_free(tg);
8308 kfree(tg); 8611 kfree(tg);
8309} 8612}
8310 8613
@@ -8313,7 +8616,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8313{ 8616{
8314 struct task_group *tg; 8617 struct task_group *tg;
8315 unsigned long flags; 8618 unsigned long flags;
8316 int i;
8317 8619
8318 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8620 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8319 if (!tg) 8621 if (!tg)
@@ -8326,10 +8628,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8326 goto err; 8628 goto err;
8327 8629
8328 spin_lock_irqsave(&task_group_lock, flags); 8630 spin_lock_irqsave(&task_group_lock, flags);
8329 for_each_possible_cpu(i) {
8330 register_fair_sched_group(tg, i);
8331 register_rt_sched_group(tg, i);
8332 }
8333 list_add_rcu(&tg->list, &task_groups); 8631 list_add_rcu(&tg->list, &task_groups);
8334 8632
8335 WARN_ON(!parent); /* root should already exist */ 8633 WARN_ON(!parent); /* root should already exist */
@@ -8359,11 +8657,11 @@ void sched_destroy_group(struct task_group *tg)
8359 unsigned long flags; 8657 unsigned long flags;
8360 int i; 8658 int i;
8361 8659
8362 spin_lock_irqsave(&task_group_lock, flags); 8660 /* end participation in shares distribution */
8363 for_each_possible_cpu(i) { 8661 for_each_possible_cpu(i)
8364 unregister_fair_sched_group(tg, i); 8662 unregister_fair_sched_group(tg, i);
8365 unregister_rt_sched_group(tg, i); 8663
8366 } 8664 spin_lock_irqsave(&task_group_lock, flags);
8367 list_del_rcu(&tg->list); 8665 list_del_rcu(&tg->list);
8368 list_del_rcu(&tg->siblings); 8666 list_del_rcu(&tg->siblings);
8369 spin_unlock_irqrestore(&task_group_lock, flags); 8667 spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8386,57 +8684,30 @@ void sched_move_task(struct task_struct *tsk)
8386 rq = task_rq_lock(tsk, &flags); 8684 rq = task_rq_lock(tsk, &flags);
8387 8685
8388 running = task_current(rq, tsk); 8686 running = task_current(rq, tsk);
8389 on_rq = tsk->se.on_rq; 8687 on_rq = tsk->on_rq;
8390 8688
8391 if (on_rq) 8689 if (on_rq)
8392 dequeue_task(rq, tsk, 0); 8690 dequeue_task(rq, tsk, 0);
8393 if (unlikely(running)) 8691 if (unlikely(running))
8394 tsk->sched_class->put_prev_task(rq, tsk); 8692 tsk->sched_class->put_prev_task(rq, tsk);
8395 8693
8396 set_task_rq(tsk, task_cpu(tsk));
8397
8398#ifdef CONFIG_FAIR_GROUP_SCHED 8694#ifdef CONFIG_FAIR_GROUP_SCHED
8399 if (tsk->sched_class->moved_group) 8695 if (tsk->sched_class->task_move_group)
8400 tsk->sched_class->moved_group(tsk, on_rq); 8696 tsk->sched_class->task_move_group(tsk, on_rq);
8697 else
8401#endif 8698#endif
8699 set_task_rq(tsk, task_cpu(tsk));
8402 8700
8403 if (unlikely(running)) 8701 if (unlikely(running))
8404 tsk->sched_class->set_curr_task(rq); 8702 tsk->sched_class->set_curr_task(rq);
8405 if (on_rq) 8703 if (on_rq)
8406 enqueue_task(rq, tsk, 0); 8704 enqueue_task(rq, tsk, 0);
8407 8705
8408 task_rq_unlock(rq, &flags); 8706 task_rq_unlock(rq, tsk, &flags);
8409} 8707}
8410#endif /* CONFIG_CGROUP_SCHED */ 8708#endif /* CONFIG_CGROUP_SCHED */
8411 8709
8412#ifdef CONFIG_FAIR_GROUP_SCHED 8710#ifdef CONFIG_FAIR_GROUP_SCHED
8413static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8414{
8415 struct cfs_rq *cfs_rq = se->cfs_rq;
8416 int on_rq;
8417
8418 on_rq = se->on_rq;
8419 if (on_rq)
8420 dequeue_entity(cfs_rq, se, 0);
8421
8422 se->load.weight = shares;
8423 se->load.inv_weight = 0;
8424
8425 if (on_rq)
8426 enqueue_entity(cfs_rq, se, 0);
8427}
8428
8429static void set_se_shares(struct sched_entity *se, unsigned long shares)
8430{
8431 struct cfs_rq *cfs_rq = se->cfs_rq;
8432 struct rq *rq = cfs_rq->rq;
8433 unsigned long flags;
8434
8435 raw_spin_lock_irqsave(&rq->lock, flags);
8436 __set_se_shares(se, shares);
8437 raw_spin_unlock_irqrestore(&rq->lock, flags);
8438}
8439
8440static DEFINE_MUTEX(shares_mutex); 8711static DEFINE_MUTEX(shares_mutex);
8441 8712
8442int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8713int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@ -8450,46 +8721,25 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8450 if (!tg->se[0]) 8721 if (!tg->se[0])
8451 return -EINVAL; 8722 return -EINVAL;
8452 8723
8453 if (shares < MIN_SHARES) 8724 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8454 shares = MIN_SHARES;
8455 else if (shares > MAX_SHARES)
8456 shares = MAX_SHARES;
8457 8725
8458 mutex_lock(&shares_mutex); 8726 mutex_lock(&shares_mutex);
8459 if (tg->shares == shares) 8727 if (tg->shares == shares)
8460 goto done; 8728 goto done;
8461 8729
8462 spin_lock_irqsave(&task_group_lock, flags);
8463 for_each_possible_cpu(i)
8464 unregister_fair_sched_group(tg, i);
8465 list_del_rcu(&tg->siblings);
8466 spin_unlock_irqrestore(&task_group_lock, flags);
8467
8468 /* wait for any ongoing reference to this group to finish */
8469 synchronize_sched();
8470
8471 /*
8472 * Now we are free to modify the group's share on each cpu
8473 * w/o tripping rebalance_share or load_balance_fair.
8474 */
8475 tg->shares = shares; 8730 tg->shares = shares;
8476 for_each_possible_cpu(i) { 8731 for_each_possible_cpu(i) {
8477 /* 8732 struct rq *rq = cpu_rq(i);
8478 * force a rebalance 8733 struct sched_entity *se;
8479 */ 8734
8480 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8735 se = tg->se[i];
8481 set_se_shares(tg->se[i], shares); 8736 /* Propagate contribution to hierarchy */
8737 raw_spin_lock_irqsave(&rq->lock, flags);
8738 for_each_sched_entity(se)
8739 update_cfs_shares(group_cfs_rq(se));
8740 raw_spin_unlock_irqrestore(&rq->lock, flags);
8482 } 8741 }
8483 8742
8484 /*
8485 * Enable load balance activity on this group, by inserting it back on
8486 * each cpu's rq->leaf_cfs_rq_list.
8487 */
8488 spin_lock_irqsave(&task_group_lock, flags);
8489 for_each_possible_cpu(i)
8490 register_fair_sched_group(tg, i);
8491 list_add_rcu(&tg->siblings, &tg->parent->children);
8492 spin_unlock_irqrestore(&task_group_lock, flags);
8493done: 8743done:
8494 mutex_unlock(&shares_mutex); 8744 mutex_unlock(&shares_mutex);
8495 return 0; 8745 return 0;
@@ -8624,7 +8874,7 @@ static int tg_set_bandwidth(struct task_group *tg,
8624 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8874 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8625 } 8875 }
8626 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8876 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8627 unlock: 8877unlock:
8628 read_unlock(&tasklist_lock); 8878 read_unlock(&tasklist_lock);
8629 mutex_unlock(&rt_constraints_mutex); 8879 mutex_unlock(&rt_constraints_mutex);
8630 8880
@@ -8788,7 +9038,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8788 9038
8789 if (!cgrp->parent) { 9039 if (!cgrp->parent) {
8790 /* This is early initialization for the top cgroup */ 9040 /* This is early initialization for the top cgroup */
8791 return &init_task_group.css; 9041 return &root_task_group.css;
8792 } 9042 }
8793 9043
8794 parent = cgroup_tg(cgrp->parent); 9044 parent = cgroup_tg(cgrp->parent);
@@ -8821,56 +9071,39 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8821 return 0; 9071 return 0;
8822} 9072}
8823 9073
8824static int 9074static void
8825cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 9075cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8826 struct task_struct *tsk, bool threadgroup)
8827{ 9076{
8828 int retval = cpu_cgroup_can_attach_task(cgrp, tsk); 9077 sched_move_task(tsk);
8829 if (retval)
8830 return retval;
8831 if (threadgroup) {
8832 struct task_struct *c;
8833 rcu_read_lock();
8834 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
8835 retval = cpu_cgroup_can_attach_task(cgrp, c);
8836 if (retval) {
8837 rcu_read_unlock();
8838 return retval;
8839 }
8840 }
8841 rcu_read_unlock();
8842 }
8843 return 0;
8844} 9078}
8845 9079
8846static void 9080static void
8847cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 9081cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
8848 struct cgroup *old_cont, struct task_struct *tsk, 9082 struct cgroup *old_cgrp, struct task_struct *task)
8849 bool threadgroup)
8850{ 9083{
8851 sched_move_task(tsk); 9084 /*
8852 if (threadgroup) { 9085 * cgroup_exit() is called in the copy_process() failure path.
8853 struct task_struct *c; 9086 * Ignore this case since the task hasn't run yet; this avoids
8854 rcu_read_lock(); 9087 * trying to poke a half-freed task state from generic code.
8855 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 9088 */
8856 sched_move_task(c); 9089 if (!(task->flags & PF_EXITING))
8857 } 9090 return;
8858 rcu_read_unlock(); 9091
8859 } 9092 sched_move_task(task);
8860} 9093}
8861 9094
8862#ifdef CONFIG_FAIR_GROUP_SCHED 9095#ifdef CONFIG_FAIR_GROUP_SCHED
8863static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 9096static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
8864 u64 shareval) 9097 u64 shareval)
8865{ 9098{
8866 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 9099 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
8867} 9100}
8868 9101
8869static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 9102static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8870{ 9103{
8871 struct task_group *tg = cgroup_tg(cgrp); 9104 struct task_group *tg = cgroup_tg(cgrp);
8872 9105
8873 return (u64) tg->shares; 9106 return (u64) scale_load_down(tg->shares);
8874} 9107}
8875#endif /* CONFIG_FAIR_GROUP_SCHED */ 9108#endif /* CONFIG_FAIR_GROUP_SCHED */
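/*
 * Usage sketch (mount point and group name are illustrative assumptions,
 * not taken from this patch): with the v1 cpu controller mounted,
 *
 *	# mkdir /sys/fs/cgroup/cpu/grp
 *	# echo 2048 > /sys/fs/cgroup/cpu/grp/cpu.shares
 *
 * ends up in cpu_shares_write_u64() above and therefore in
 * sched_group_set_shares(tg, scale_load(2048)), which propagates the new
 * weight per cpu via update_cfs_shares() under each rq->lock.
 */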
8876 9109
@@ -8929,8 +9162,9 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8929 .name = "cpu", 9162 .name = "cpu",
8930 .create = cpu_cgroup_create, 9163 .create = cpu_cgroup_create,
8931 .destroy = cpu_cgroup_destroy, 9164 .destroy = cpu_cgroup_destroy,
8932 .can_attach = cpu_cgroup_can_attach, 9165 .can_attach_task = cpu_cgroup_can_attach_task,
8933 .attach = cpu_cgroup_attach, 9166 .attach_task = cpu_cgroup_attach_task,
9167 .exit = cpu_cgroup_exit,
8934 .populate = cpu_cgroup_populate, 9168 .populate = cpu_cgroup_populate,
8935 .subsys_id = cpu_cgroup_subsys_id, 9169 .subsys_id = cpu_cgroup_subsys_id,
8936 .early_init = 1, 9170 .early_init = 1,
@@ -9215,72 +9449,3 @@ struct cgroup_subsys cpuacct_subsys = {
9215}; 9449};
9216#endif /* CONFIG_CGROUP_CPUACCT */ 9450#endif /* CONFIG_CGROUP_CPUACCT */
9217 9451
9218#ifndef CONFIG_SMP
9219
9220void synchronize_sched_expedited(void)
9221{
9222 barrier();
9223}
9224EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9225
9226#else /* #ifndef CONFIG_SMP */
9227
9228static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9229
9230static int synchronize_sched_expedited_cpu_stop(void *data)
9231{
9232 /*
9233 * There must be a full memory barrier on each affected CPU
9234 * between the time that try_stop_cpus() is called and the
9235 * time that it returns.
9236 *
9237 * In the current initial implementation of cpu_stop, the
9238 * above condition is already met when the control reaches
9239 * this point and the following smp_mb() is not strictly
9240 * necessary. Do smp_mb() anyway for documentation and
9241 * robustness against future implementation changes.
9242 */
9243 smp_mb(); /* See above comment block. */
9244 return 0;
9245}
9246
9247/*
9248 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
9249 * approach to force grace period to end quickly. This consumes
9250 * significant time on all CPUs, and is thus not recommended for
9251 * any sort of common-case code.
9252 *
9253 * Note that it is illegal to call this function while holding any
9254 * lock that is acquired by a CPU-hotplug notifier. Failing to
9255 * observe this restriction will result in deadlock.
9256 */
9257void synchronize_sched_expedited(void)
9258{
9259 int snap, trycount = 0;
9260
9261 smp_mb(); /* ensure prior mod happens before capturing snap. */
9262 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9263 get_online_cpus();
9264 while (try_stop_cpus(cpu_online_mask,
9265 synchronize_sched_expedited_cpu_stop,
9266 NULL) == -EAGAIN) {
9267 put_online_cpus();
9268 if (trycount++ < 10)
9269 udelay(trycount * num_online_cpus());
9270 else {
9271 synchronize_sched();
9272 return;
9273 }
9274 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9275 smp_mb(); /* ensure test happens before caller kfree */
9276 return;
9277 }
9278 get_online_cpus();
9279 }
9280 atomic_inc(&synchronize_sched_expedited_count);
9281 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9282 put_online_cpus();
9283}
9284EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9285
9286#endif /* #else #ifndef CONFIG_SMP */