author    Jonathan Herman <hermanjl@cs.unc.edu>  2012-09-29 13:04:40 -0400
committer Jonathan Herman <hermanjl@cs.unc.edu>  2012-09-29 13:04:40 -0400
commit    daf1e620bff2cb6d830ef66725369bba9c858f62 (patch)
tree      1aed8f7cb55371c70d2139b6754d90ea89a26147 /kernel/sched.c
parent    451ed3b075c2a8e322e5a44f177e2470426a821d (diff)
parent    1cb90226816c7af7808be4c0de866c54da17ecc9 (diff)
Merge branch 'wip-color' into wip-mc
Conflicts:
	include/litmus/budget.h
	include/litmus/litmus.h
	include/litmus/rt_param.h
	include/litmus/sched_trace.h
	include/litmus/trace.h
	include/trace/events/litmus.h
	litmus/Makefile
	litmus/budget.c
	litmus/ftdev.c
	litmus/jobs.c
	litmus/litmus.c
	litmus/locking.c
	litmus/preempt.c
	litmus/rt_domain.c
	litmus/sched_gsn_edf.c
	litmus/trace.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	3440
1 file changed, 1804 insertions(+), 1636 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 1db6b746845c..d9d591e70b03 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -32,7 +32,6 @@
32#include <linux/init.h> 32#include <linux/init.h>
33#include <linux/uaccess.h> 33#include <linux/uaccess.h>
34#include <linux/highmem.h> 34#include <linux/highmem.h>
35#include <linux/smp_lock.h>
36#include <asm/mmu_context.h> 35#include <asm/mmu_context.h>
37#include <linux/interrupt.h> 36#include <linux/interrupt.h>
38#include <linux/capability.h> 37#include <linux/capability.h>
@@ -75,9 +74,14 @@
75 74
76#include <asm/tlb.h> 75#include <asm/tlb.h>
77#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
77#include <asm/mutex.h>
78 78
79#include "sched_cpupri.h" 79#include "sched_cpupri.h"
80#include "workqueue_sched.h" 80#include "workqueue_sched.h"
81#include "sched_autogroup.h"
82
83#define CREATE_TRACE_POINTS
84#include <trace/events/sched.h>
81 85
82#define CREATE_TRACE_POINTS 86#define CREATE_TRACE_POINTS
83#include <trace/events/sched.h> 87#include <trace/events/sched.h>
@@ -235,7 +239,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
235#endif 239#endif
236 240
237/* 241/*
238 * sched_domains_mutex serializes calls to arch_init_sched_domains, 242 * sched_domains_mutex serializes calls to init_sched_domains,
239 * detach_destroy_domains and partition_sched_domains. 243 * detach_destroy_domains and partition_sched_domains.
240 */ 244 */
241static DEFINE_MUTEX(sched_domains_mutex); 245static DEFINE_MUTEX(sched_domains_mutex);
@@ -258,6 +262,8 @@ struct task_group {
258 /* runqueue "owned" by this group on each cpu */ 262 /* runqueue "owned" by this group on each cpu */
259 struct cfs_rq **cfs_rq; 263 struct cfs_rq **cfs_rq;
260 unsigned long shares; 264 unsigned long shares;
265
266 atomic_t load_weight;
261#endif 267#endif
262 268
263#ifdef CONFIG_RT_GROUP_SCHED 269#ifdef CONFIG_RT_GROUP_SCHED
@@ -273,25 +279,18 @@ struct task_group {
273 struct task_group *parent; 279 struct task_group *parent;
274 struct list_head siblings; 280 struct list_head siblings;
275 struct list_head children; 281 struct list_head children;
276};
277 282
278#define root_task_group init_task_group 283#ifdef CONFIG_SCHED_AUTOGROUP
284 struct autogroup *autogroup;
285#endif
286};
279 287
280/* task_group_lock serializes add/remove of task groups and also changes to 288/* task_group_lock serializes the addition/removal of task groups */
281 * a task group's cpu shares.
282 */
283static DEFINE_SPINLOCK(task_group_lock); 289static DEFINE_SPINLOCK(task_group_lock);
284 290
285#ifdef CONFIG_FAIR_GROUP_SCHED 291#ifdef CONFIG_FAIR_GROUP_SCHED
286 292
287#ifdef CONFIG_SMP 293# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
288static int root_task_group_empty(void)
289{
290 return list_empty(&root_task_group.children);
291}
292#endif
293
294# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
295 294
296/* 295/*
297 * A weight of 0 or 1 can cause arithmetics problems. 296 * A weight of 0 or 1 can cause arithmetics problems.
@@ -301,16 +300,16 @@ static int root_task_group_empty(void)
301 * (The default weight is 1024 - so there's no practical 300 * (The default weight is 1024 - so there's no practical
302 * limitation from this.) 301 * limitation from this.)
303 */ 302 */
304#define MIN_SHARES 2 303#define MIN_SHARES (1UL << 1)
305#define MAX_SHARES (1UL << 18) 304#define MAX_SHARES (1UL << 18)
306 305
307static int init_task_group_load = INIT_TASK_GROUP_LOAD; 306static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
308#endif 307#endif
309 308
310/* Default task group. 309/* Default task group.
311 * Every task in system belong to this group at bootup. 310 * Every task in system belong to this group at bootup.
312 */ 311 */
313struct task_group init_task_group; 312struct task_group root_task_group;
314 313
315#endif /* CONFIG_CGROUP_SCHED */ 314#endif /* CONFIG_CGROUP_SCHED */
316 315
@@ -321,6 +320,9 @@ struct cfs_rq {
321 320
322 u64 exec_clock; 321 u64 exec_clock;
323 u64 min_vruntime; 322 u64 min_vruntime;
323#ifndef CONFIG_64BIT
324 u64 min_vruntime_copy;
325#endif
324 326
325 struct rb_root tasks_timeline; 327 struct rb_root tasks_timeline;
326 struct rb_node *rb_leftmost; 328 struct rb_node *rb_leftmost;
@@ -332,9 +334,11 @@ struct cfs_rq {
332 * 'curr' points to currently running entity on this cfs_rq. 334 * 'curr' points to currently running entity on this cfs_rq.
333 * It is set to NULL otherwise (i.e when none are currently running). 335 * It is set to NULL otherwise (i.e when none are currently running).
334 */ 336 */
335 struct sched_entity *curr, *next, *last; 337 struct sched_entity *curr, *next, *last, *skip;
336 338
339#ifdef CONFIG_SCHED_DEBUG
337 unsigned int nr_spread_over; 340 unsigned int nr_spread_over;
341#endif
338 342
339#ifdef CONFIG_FAIR_GROUP_SCHED 343#ifdef CONFIG_FAIR_GROUP_SCHED
340 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 344 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -347,6 +351,7 @@ struct cfs_rq {
347 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 351 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
348 * list is used during load balance. 352 * list is used during load balance.
349 */ 353 */
354 int on_list;
350 struct list_head leaf_cfs_rq_list; 355 struct list_head leaf_cfs_rq_list;
351 struct task_group *tg; /* group that "owns" this runqueue */ 356 struct task_group *tg; /* group that "owns" this runqueue */
352 357
@@ -365,14 +370,17 @@ struct cfs_rq {
365 unsigned long h_load; 370 unsigned long h_load;
366 371
367 /* 372 /*
368 * this cpu's part of tg->shares 373 * Maintaining per-cpu shares distribution for group scheduling
374 *
375 * load_stamp is the last time we updated the load average
376 * load_last is the last time we updated the load average and saw load
377 * load_unacc_exec_time is currently unaccounted execution time
369 */ 378 */
370 unsigned long shares; 379 u64 load_avg;
380 u64 load_period;
381 u64 load_stamp, load_last, load_unacc_exec_time;
371 382
372 /* 383 unsigned long load_contribution;
373 * load.weight at the time we set shares
374 */
375 unsigned long rq_weight;
376#endif 384#endif
377#endif 385#endif
378}; 386};
@@ -428,6 +436,7 @@ struct litmus_rq {
428 */ 436 */
429struct root_domain { 437struct root_domain {
430 atomic_t refcount; 438 atomic_t refcount;
439 struct rcu_head rcu;
431 cpumask_var_t span; 440 cpumask_var_t span;
432 cpumask_var_t online; 441 cpumask_var_t online;
433 442
@@ -437,9 +446,7 @@ struct root_domain {
437 */ 446 */
438 cpumask_var_t rto_mask; 447 cpumask_var_t rto_mask;
439 atomic_t rto_count; 448 atomic_t rto_count;
440#ifdef CONFIG_SMP
441 struct cpupri cpupri; 449 struct cpupri cpupri;
442#endif
443}; 450};
444 451
445/* 452/*
@@ -448,7 +455,7 @@ struct root_domain {
448 */ 455 */
449static struct root_domain def_root_domain; 456static struct root_domain def_root_domain;
450 457
451#endif 458#endif /* CONFIG_SMP */
452 459
453/* 460/*
454 * This is the main, per-CPU runqueue data structure. 461 * This is the main, per-CPU runqueue data structure.
@@ -473,7 +480,7 @@ struct rq {
473 u64 nohz_stamp; 480 u64 nohz_stamp;
474 unsigned char nohz_balance_kick; 481 unsigned char nohz_balance_kick;
475#endif 482#endif
476 unsigned int skip_clock_update; 483 int skip_clock_update;
477 484
478 /* capture load from *all* tasks on this cpu: */ 485 /* capture load from *all* tasks on this cpu: */
479 struct load_weight load; 486 struct load_weight load;
@@ -500,11 +507,12 @@ struct rq {
500 */ 507 */
501 unsigned long nr_uninterruptible; 508 unsigned long nr_uninterruptible;
502 509
503 struct task_struct *curr, *idle; 510 struct task_struct *curr, *idle, *stop;
504 unsigned long next_balance; 511 unsigned long next_balance;
505 struct mm_struct *prev_mm; 512 struct mm_struct *prev_mm;
506 513
507 u64 clock; 514 u64 clock;
515 u64 clock_task;
508 516
509 atomic_t nr_iowait; 517 atomic_t nr_iowait;
510 518
@@ -532,6 +540,10 @@ struct rq {
532 u64 avg_idle; 540 u64 avg_idle;
533#endif 541#endif
534 542
543#ifdef CONFIG_IRQ_TIME_ACCOUNTING
544 u64 prev_irq_time;
545#endif
546
535 /* calc_load related fields */ 547 /* calc_load related fields */
536 unsigned long calc_load_update; 548 unsigned long calc_load_update;
537 long calc_load_active; 549 long calc_load_active;
@@ -561,32 +573,17 @@ struct rq {
561 /* try_to_wake_up() stats */ 573 /* try_to_wake_up() stats */
562 unsigned int ttwu_count; 574 unsigned int ttwu_count;
563 unsigned int ttwu_local; 575 unsigned int ttwu_local;
576#endif
564 577
565 /* BKL stats */ 578#ifdef CONFIG_SMP
566 unsigned int bkl_count; 579 struct task_struct *wake_list;
567#endif 580#endif
568}; 581};
569 582
570static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 583static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
571 584
572static inline
573void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
574{
575 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
576 585
577 /* 586static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
578 * A queue event has occurred, and we're going to schedule. In
579 * this case, we can save a useless back to back clock update.
580 */
581 /* LITMUS^RT: turning off the clock update is buggy in Linux 2.6.36;
582 * the scheduler can "forget" to renable the runqueue clock in some
583 * cases. LITMUS^RT amplifies the effects of this problem. Hence, we
584 * turn it off to avoid stalling clocks. */
585 /*
586 if (test_tsk_need_resched(p))
587 rq->skip_clock_update = 1;
588 */
589}
590 587
591static inline int cpu_of(struct rq *rq) 588static inline int cpu_of(struct rq *rq)
592{ 589{
@@ -599,7 +596,7 @@ static inline int cpu_of(struct rq *rq)
599 596
600#define rcu_dereference_check_sched_domain(p) \ 597#define rcu_dereference_check_sched_domain(p) \
601 rcu_dereference_check((p), \ 598 rcu_dereference_check((p), \
602 rcu_read_lock_sched_held() || \ 599 rcu_read_lock_held() || \
603 lockdep_is_held(&sched_domains_mutex)) 600 lockdep_is_held(&sched_domains_mutex))
604 601
605/* 602/*
@@ -623,18 +620,22 @@ static inline int cpu_of(struct rq *rq)
623/* 620/*
624 * Return the group to which this tasks belongs. 621 * Return the group to which this tasks belongs.
625 * 622 *
626 * We use task_subsys_state_check() and extend the RCU verification 623 * We use task_subsys_state_check() and extend the RCU verification with
627 * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() 624 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
628 * holds that lock for each task it moves into the cgroup. Therefore 625 * task it moves into the cgroup. Therefore by holding either of those locks,
629 * by holding that lock, we pin the task to the current cgroup. 626 * we pin the task to the current cgroup.
630 */ 627 */
631static inline struct task_group *task_group(struct task_struct *p) 628static inline struct task_group *task_group(struct task_struct *p)
632{ 629{
630 struct task_group *tg;
633 struct cgroup_subsys_state *css; 631 struct cgroup_subsys_state *css;
634 632
635 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 633 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
634 lockdep_is_held(&p->pi_lock) ||
636 lockdep_is_held(&task_rq(p)->lock)); 635 lockdep_is_held(&task_rq(p)->lock));
637 return container_of(css, struct task_group, css); 636 tg = container_of(css, struct task_group, css);
637
638 return autogroup_task_group(p, tg);
638} 639}
639 640
640/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 641/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -661,10 +662,18 @@ static inline struct task_group *task_group(struct task_struct *p)
661 662
662#endif /* CONFIG_CGROUP_SCHED */ 663#endif /* CONFIG_CGROUP_SCHED */
663 664
664inline void update_rq_clock(struct rq *rq) 665static void update_rq_clock_task(struct rq *rq, s64 delta);
666
667static void update_rq_clock(struct rq *rq)
665{ 668{
666 if (!rq->skip_clock_update) 669 s64 delta;
667 rq->clock = sched_clock_cpu(cpu_of(rq)); 670
671 if (rq->skip_clock_update > 0)
672 return;
673
674 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
675 rq->clock += delta;
676 update_rq_clock_task(rq, delta);
668} 677}
669 678
670/* 679/*
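
The reworked update_rq_clock() above computes a delta against sched_clock_cpu() and hands that same delta to update_rq_clock_task(), which a later hunk defines to subtract time accounted to interrupts so that rq->clock_task stays monotonic and never outruns rq->clock. A rough user-space sketch of that two-clock split (illustrative only; the names and types here are invented, not kernel code):

/*
 * Illustrative sketch: a raw clock delta advances the "wall" clock
 * unconditionally, while the task clock only gets the part not
 * attributed elsewhere (irq time in the kernel case), clamped so it
 * stays monotonic.
 */
#include <stdint.h>
#include <stdio.h>

struct fake_rq {
	int64_t clock;        /* total time observed on this runqueue */
	int64_t clock_task;   /* time actually available to tasks */
};

static void fake_update_clock(struct fake_rq *rq, int64_t delta, int64_t irq_delta)
{
	rq->clock += delta;                 /* wall clock always advances */
	if (irq_delta > delta)              /* never let clock_task outrun clock */
		irq_delta = delta;
	rq->clock_task += delta - irq_delta;
}

int main(void)
{
	struct fake_rq rq = { 0, 0 };

	fake_update_clock(&rq, 1000, 300);  /* 300 of 1000 units went to irqs */
	fake_update_clock(&rq, 500, 800);   /* irq time exceeds delta: clamped */
	printf("clock=%lld clock_task=%lld\n",
	       (long long)rq.clock, (long long)rq.clock_task);
	return 0;
}
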
@@ -677,10 +686,9 @@ inline void update_rq_clock(struct rq *rq)
677#endif 686#endif
678 687
679/** 688/**
680 * runqueue_is_locked 689 * runqueue_is_locked - Returns true if the current cpu runqueue is locked
681 * @cpu: the processor in question. 690 * @cpu: the processor in question.
682 * 691 *
683 * Returns true if the current cpu runqueue is locked.
684 * This interface allows printk to be called with the runqueue lock 692 * This interface allows printk to be called with the runqueue lock
685 * held and know whether or not it is OK to wake up the klogd. 693 * held and know whether or not it is OK to wake up the klogd.
686 */ 694 */
@@ -741,7 +749,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
741 size_t cnt, loff_t *ppos) 749 size_t cnt, loff_t *ppos)
742{ 750{
743 char buf[64]; 751 char buf[64];
744 char *cmp = buf; 752 char *cmp;
745 int neg = 0; 753 int neg = 0;
746 int i; 754 int i;
747 755
@@ -752,16 +760,15 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
752 return -EFAULT; 760 return -EFAULT;
753 761
754 buf[cnt] = 0; 762 buf[cnt] = 0;
763 cmp = strstrip(buf);
755 764
756 if (strncmp(buf, "NO_", 3) == 0) { 765 if (strncmp(cmp, "NO_", 3) == 0) {
757 neg = 1; 766 neg = 1;
758 cmp += 3; 767 cmp += 3;
759 } 768 }
760 769
761 for (i = 0; sched_feat_names[i]; i++) { 770 for (i = 0; sched_feat_names[i]; i++) {
762 int len = strlen(sched_feat_names[i]); 771 if (strcmp(cmp, sched_feat_names[i]) == 0) {
763
764 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
765 if (neg) 772 if (neg)
766 sysctl_sched_features &= ~(1UL << i); 773 sysctl_sched_features &= ~(1UL << i);
767 else 774 else
@@ -811,20 +818,6 @@ late_initcall(sched_init_debug);
811const_debug unsigned int sysctl_sched_nr_migrate = 32; 818const_debug unsigned int sysctl_sched_nr_migrate = 32;
812 819
813/* 820/*
814 * ratelimit for updating the group shares.
815 * default: 0.25ms
816 */
817unsigned int sysctl_sched_shares_ratelimit = 250000;
818unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
819
820/*
821 * Inject some fuzzyness into changing the per-cpu group shares
822 * this avoids remote rq-locks at the expense of fairness.
823 * default: 4
824 */
825unsigned int sysctl_sched_shares_thresh = 4;
826
827/*
828 * period over which we average the RT time consumption, measured 821 * period over which we average the RT time consumption, measured
829 * in ms. 822 * in ms.
830 * 823 *
@@ -871,18 +864,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
871 return rq->curr == p; 864 return rq->curr == p;
872} 865}
873 866
874#ifndef __ARCH_WANT_UNLOCKED_CTXSW
875static inline int task_running(struct rq *rq, struct task_struct *p) 867static inline int task_running(struct rq *rq, struct task_struct *p)
876{ 868{
869#ifdef CONFIG_SMP
870 return p->on_cpu;
871#else
877 return task_current(rq, p); 872 return task_current(rq, p);
873#endif
878} 874}
879 875
876#ifndef __ARCH_WANT_UNLOCKED_CTXSW
880static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 877static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
881{ 878{
879#ifdef CONFIG_SMP
880 /*
881 * We can optimise this out completely for !SMP, because the
882 * SMP rebalancing from interrupt is the only thing that cares
883 * here.
884 */
885 next->on_cpu = 1;
886#endif
882} 887}
883 888
884static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 889static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
885{ 890{
891#ifdef CONFIG_SMP
892 /*
893 * After ->on_cpu is cleared, the task can be moved to a different CPU.
894 * We must ensure this doesn't happen until the switch is completely
895 * finished.
896 */
897 smp_wmb();
898 prev->on_cpu = 0;
899#endif
886#ifdef CONFIG_DEBUG_SPINLOCK 900#ifdef CONFIG_DEBUG_SPINLOCK
887 /* this is a valid case when another task releases the spinlock */ 901 /* this is a valid case when another task releases the spinlock */
888 rq->lock.owner = current; 902 rq->lock.owner = current;
@@ -898,15 +912,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
898} 912}
899 913
900#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 914#else /* __ARCH_WANT_UNLOCKED_CTXSW */
901static inline int task_running(struct rq *rq, struct task_struct *p)
902{
903#ifdef CONFIG_SMP
904 return p->oncpu;
905#else
906 return task_current(rq, p);
907#endif
908}
909
910static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 915static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
911{ 916{
912#ifdef CONFIG_SMP 917#ifdef CONFIG_SMP
@@ -915,7 +920,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
915 * SMP rebalancing from interrupt is the only thing that cares 920 * SMP rebalancing from interrupt is the only thing that cares
916 * here. 921 * here.
917 */ 922 */
918 next->oncpu = 1; 923 next->on_cpu = 1;
919#endif 924#endif
920#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 925#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
921 raw_spin_unlock_irq(&rq->lock); 926 raw_spin_unlock_irq(&rq->lock);
@@ -928,12 +933,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
928{ 933{
929#ifdef CONFIG_SMP 934#ifdef CONFIG_SMP
930 /* 935 /*
931 * After ->oncpu is cleared, the task can be moved to a different CPU. 936 * After ->on_cpu is cleared, the task can be moved to a different CPU.
932 * We must ensure this doesn't happen until the switch is completely 937 * We must ensure this doesn't happen until the switch is completely
933 * finished. 938 * finished.
934 */ 939 */
935 smp_wmb(); 940 smp_wmb();
936 prev->oncpu = 0; 941 prev->on_cpu = 0;
937#endif 942#endif
938#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 943#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
939 local_irq_enable(); 944 local_irq_enable();
@@ -942,23 +947,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
942#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 947#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
943 948
944/* 949/*
945 * Check whether the task is waking, we use this to synchronize ->cpus_allowed 950 * __task_rq_lock - lock the rq @p resides on.
946 * against ttwu().
947 */
948static inline int task_is_waking(struct task_struct *p)
949{
950 return unlikely(p->state == TASK_WAKING);
951}
952
953/*
954 * __task_rq_lock - lock the runqueue a given task resides on.
955 * Must be called interrupts disabled.
956 */ 951 */
957static inline struct rq *__task_rq_lock(struct task_struct *p) 952static inline struct rq *__task_rq_lock(struct task_struct *p)
958 __acquires(rq->lock) 953 __acquires(rq->lock)
959{ 954{
960 struct rq *rq; 955 struct rq *rq;
961 956
957 lockdep_assert_held(&p->pi_lock);
958
962 for (;;) { 959 for (;;) {
963 rq = task_rq(p); 960 rq = task_rq(p);
964 raw_spin_lock(&rq->lock); 961 raw_spin_lock(&rq->lock);
@@ -969,22 +966,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
969} 966}
970 967
971/* 968/*
972 * task_rq_lock - lock the runqueue a given task resides on and disable 969 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
973 * interrupts. Note the ordering: we can safely lookup the task_rq without
974 * explicitly disabling preemption.
975 */ 970 */
976static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 971static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
972 __acquires(p->pi_lock)
977 __acquires(rq->lock) 973 __acquires(rq->lock)
978{ 974{
979 struct rq *rq; 975 struct rq *rq;
980 976
981 for (;;) { 977 for (;;) {
982 local_irq_save(*flags); 978 raw_spin_lock_irqsave(&p->pi_lock, *flags);
983 rq = task_rq(p); 979 rq = task_rq(p);
984 raw_spin_lock(&rq->lock); 980 raw_spin_lock(&rq->lock);
985 if (likely(rq == task_rq(p))) 981 if (likely(rq == task_rq(p)))
986 return rq; 982 return rq;
987 raw_spin_unlock_irqrestore(&rq->lock, *flags); 983 raw_spin_unlock(&rq->lock);
984 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
988 } 985 }
989} 986}
990 987
@@ -994,10 +991,13 @@ static void __task_rq_unlock(struct rq *rq)
994 raw_spin_unlock(&rq->lock); 991 raw_spin_unlock(&rq->lock);
995} 992}
996 993
997static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 994static inline void
995task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
998 __releases(rq->lock) 996 __releases(rq->lock)
997 __releases(p->pi_lock)
999{ 998{
1000 raw_spin_unlock_irqrestore(&rq->lock, *flags); 999 raw_spin_unlock(&rq->lock);
1000 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
1001} 1001}
1002 1002
1003/* 1003/*
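
The hunks above change task_rq_lock() to take p->pi_lock before rq->lock and to retry whenever the task migrates between looking up its runqueue and locking it. A minimal pthread sketch of that lock-then-recheck pattern, using invented runq/task types (this is not the kernel API):

/*
 * Take the task-side lock first, then lock the runqueue the task
 * currently points at, and retry if the task moved in between.
 */
#include <pthread.h>
#include <stddef.h>

struct runq {
	pthread_mutex_t lock;
};

struct task {
	pthread_mutex_t pi_lock;   /* outer lock, like p->pi_lock */
	struct runq *rq;           /* may change while neither lock is held */
};

static struct runq *lock_task_runq(struct task *t)
{
	pthread_mutex_lock(&t->pi_lock);
	for (;;) {
		struct runq *rq = t->rq;

		pthread_mutex_lock(&rq->lock);
		if (rq == t->rq)                    /* still on that runqueue? */
			return rq;                  /* both locks are now held */
		pthread_mutex_unlock(&rq->lock);    /* task moved: try again */
	}
}

static void unlock_task_runq(struct task *t, struct runq *rq)
{
	pthread_mutex_unlock(&rq->lock);            /* release inner lock first */
	pthread_mutex_unlock(&t->pi_lock);
}

int main(void)
{
	struct runq rq = { PTHREAD_MUTEX_INITIALIZER };
	struct task t = { PTHREAD_MUTEX_INITIALIZER, &rq };
	struct runq *locked = lock_task_runq(&t);

	/* ... the task's scheduling state is stable here ... */
	unlock_task_runq(&t, locked);
	return 0;
}
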
@@ -1227,11 +1227,17 @@ int get_nohz_timer_target(void)
1227 int i; 1227 int i;
1228 struct sched_domain *sd; 1228 struct sched_domain *sd;
1229 1229
1230 rcu_read_lock();
1230 for_each_domain(cpu, sd) { 1231 for_each_domain(cpu, sd) {
1231 for_each_cpu(i, sched_domain_span(sd)) 1232 for_each_cpu(i, sched_domain_span(sd)) {
1232 if (!idle_cpu(i)) 1233 if (!idle_cpu(i)) {
1233 return i; 1234 cpu = i;
1235 goto unlock;
1236 }
1237 }
1234 } 1238 }
1239unlock:
1240 rcu_read_unlock();
1235 return cpu; 1241 return cpu;
1236} 1242}
1237/* 1243/*
@@ -1341,15 +1347,27 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1341{ 1347{
1342 u64 tmp; 1348 u64 tmp;
1343 1349
1350 /*
1351 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
1352 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
1353 * 2^SCHED_LOAD_RESOLUTION.
1354 */
1355 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
1356 tmp = (u64)delta_exec * scale_load_down(weight);
1357 else
1358 tmp = (u64)delta_exec;
1359
1344 if (!lw->inv_weight) { 1360 if (!lw->inv_weight) {
1345 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) 1361 unsigned long w = scale_load_down(lw->weight);
1362
1363 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
1346 lw->inv_weight = 1; 1364 lw->inv_weight = 1;
1365 else if (unlikely(!w))
1366 lw->inv_weight = WMULT_CONST;
1347 else 1367 else
1348 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) 1368 lw->inv_weight = WMULT_CONST / w;
1349 / (lw->weight+1);
1350 } 1369 }
1351 1370
1352 tmp = (u64)delta_exec * weight;
1353 /* 1371 /*
1354 * Check whether we'd overflow the 64-bit multiplication: 1372 * Check whether we'd overflow the 64-bit multiplication:
1355 */ 1373 */
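
The reworked calc_delta_mine() above avoids a 64-bit division on every call by dividing through a cached inverse: inv_weight = WMULT_CONST / w once, then multiply and shift. A small stand-alone sketch of that fixed-point trick, with simplified constants and made-up sample numbers:

/*
 * delta * weight / queue_weight computed as
 * (delta * weight * inv_weight) >> 32, where inv_weight = 2^32 / queue_weight
 * is precomputed. The result is a close approximation, not exact.
 */
#include <stdint.h>
#include <stdio.h>

#define WMULT_SHIFT 32
#define WMULT_CONST (1ULL << WMULT_SHIFT)

int main(void)
{
	uint64_t delta_exec = 6000000;   /* ns of runtime to scale */
	uint64_t weight     = 1024;      /* nice 0 weight */
	uint64_t lw_weight  = 3072;      /* total queue weight (3 such tasks) */

	uint64_t inv_weight = WMULT_CONST / lw_weight;            /* cached inverse */
	uint64_t scaled = (delta_exec * weight * inv_weight) >> WMULT_SHIFT;

	printf("exact   : %llu\n",
	       (unsigned long long)(delta_exec * weight / lw_weight));
	printf("via inv : %llu\n", (unsigned long long)scaled);
	return 0;
}
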
@@ -1374,6 +1392,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1374 lw->inv_weight = 0; 1392 lw->inv_weight = 0;
1375} 1393}
1376 1394
1395static inline void update_load_set(struct load_weight *lw, unsigned long w)
1396{
1397 lw->weight = w;
1398 lw->inv_weight = 0;
1399}
1400
1377/* 1401/*
1378 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1402 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1379 * of tasks with abnormal "nice" values across CPUs the contribution that 1403 * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1562,101 +1586,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1562 1586
1563#ifdef CONFIG_FAIR_GROUP_SCHED 1587#ifdef CONFIG_FAIR_GROUP_SCHED
1564 1588
1565static __read_mostly unsigned long __percpu *update_shares_data;
1566
1567static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1568
1569/*
1570 * Calculate and set the cpu's group shares.
1571 */
1572static void update_group_shares_cpu(struct task_group *tg, int cpu,
1573 unsigned long sd_shares,
1574 unsigned long sd_rq_weight,
1575 unsigned long *usd_rq_weight)
1576{
1577 unsigned long shares, rq_weight;
1578 int boost = 0;
1579
1580 rq_weight = usd_rq_weight[cpu];
1581 if (!rq_weight) {
1582 boost = 1;
1583 rq_weight = NICE_0_LOAD;
1584 }
1585
1586 /*
1587 * \Sum_j shares_j * rq_weight_i
1588 * shares_i = -----------------------------
1589 * \Sum_j rq_weight_j
1590 */
1591 shares = (sd_shares * rq_weight) / sd_rq_weight;
1592 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1593
1594 if (abs(shares - tg->se[cpu]->load.weight) >
1595 sysctl_sched_shares_thresh) {
1596 struct rq *rq = cpu_rq(cpu);
1597 unsigned long flags;
1598
1599 raw_spin_lock_irqsave(&rq->lock, flags);
1600 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1601 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1602 __set_se_shares(tg->se[cpu], shares);
1603 raw_spin_unlock_irqrestore(&rq->lock, flags);
1604 }
1605}
1606
1607/*
1608 * Re-compute the task group their per cpu shares over the given domain.
1609 * This needs to be done in a bottom-up fashion because the rq weight of a
1610 * parent group depends on the shares of its child groups.
1611 */
1612static int tg_shares_up(struct task_group *tg, void *data)
1613{
1614 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1615 unsigned long *usd_rq_weight;
1616 struct sched_domain *sd = data;
1617 unsigned long flags;
1618 int i;
1619
1620 if (!tg->se[0])
1621 return 0;
1622
1623 local_irq_save(flags);
1624 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1625
1626 for_each_cpu(i, sched_domain_span(sd)) {
1627 weight = tg->cfs_rq[i]->load.weight;
1628 usd_rq_weight[i] = weight;
1629
1630 rq_weight += weight;
1631 /*
1632 * If there are currently no tasks on the cpu pretend there
1633 * is one of average load so that when a new task gets to
1634 * run here it will not get delayed by group starvation.
1635 */
1636 if (!weight)
1637 weight = NICE_0_LOAD;
1638
1639 sum_weight += weight;
1640 shares += tg->cfs_rq[i]->shares;
1641 }
1642
1643 if (!rq_weight)
1644 rq_weight = sum_weight;
1645
1646 if ((!shares && rq_weight) || shares > tg->shares)
1647 shares = tg->shares;
1648
1649 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1650 shares = tg->shares;
1651
1652 for_each_cpu(i, sched_domain_span(sd))
1653 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1654
1655 local_irq_restore(flags);
1656
1657 return 0;
1658}
1659
1660/* 1589/*
1661 * Compute the cpu's hierarchical load factor for each task group. 1590 * Compute the cpu's hierarchical load factor for each task group.
1662 * This needs to be done in a top-down fashion because the load of a child 1591 * This needs to be done in a top-down fashion because the load of a child
@@ -1671,7 +1600,7 @@ static int tg_load_down(struct task_group *tg, void *data)
1671 load = cpu_rq(cpu)->load.weight; 1600 load = cpu_rq(cpu)->load.weight;
1672 } else { 1601 } else {
1673 load = tg->parent->cfs_rq[cpu]->h_load; 1602 load = tg->parent->cfs_rq[cpu]->h_load;
1674 load *= tg->cfs_rq[cpu]->shares; 1603 load *= tg->se[cpu]->load.weight;
1675 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1604 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1676 } 1605 }
1677 1606
@@ -1680,34 +1609,11 @@ static int tg_load_down(struct task_group *tg, void *data)
1680 return 0; 1609 return 0;
1681} 1610}
1682 1611
1683static void update_shares(struct sched_domain *sd)
1684{
1685 s64 elapsed;
1686 u64 now;
1687
1688 if (root_task_group_empty())
1689 return;
1690
1691 now = local_clock();
1692 elapsed = now - sd->last_update;
1693
1694 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1695 sd->last_update = now;
1696 walk_tg_tree(tg_nop, tg_shares_up, sd);
1697 }
1698}
1699
1700static void update_h_load(long cpu) 1612static void update_h_load(long cpu)
1701{ 1613{
1702 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1614 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1703} 1615}
1704 1616
1705#else
1706
1707static inline void update_shares(struct sched_domain *sd)
1708{
1709}
1710
1711#endif 1617#endif
1712 1618
1713#ifdef CONFIG_PREEMPT 1619#ifdef CONFIG_PREEMPT
@@ -1827,15 +1733,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1827 __release(rq2->lock); 1733 __release(rq2->lock);
1828} 1734}
1829 1735
1830#endif 1736#else /* CONFIG_SMP */
1831 1737
1832#ifdef CONFIG_FAIR_GROUP_SCHED 1738/*
1833static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) 1739 * double_rq_lock - safely lock two runqueues
1740 *
1741 * Note this does not disable interrupts like task_rq_lock,
1742 * you need to do so manually before calling.
1743 */
1744static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1745 __acquires(rq1->lock)
1746 __acquires(rq2->lock)
1834{ 1747{
1835#ifdef CONFIG_SMP 1748 BUG_ON(!irqs_disabled());
1836 cfs_rq->shares = shares; 1749 BUG_ON(rq1 != rq2);
1837#endif 1750 raw_spin_lock(&rq1->lock);
1751 __acquire(rq2->lock); /* Fake it out ;) */
1752}
1753
1754/*
1755 * double_rq_unlock - safely unlock two runqueues
1756 *
1757 * Note this does not restore interrupts like task_rq_unlock,
1758 * you need to do so manually after calling.
1759 */
1760static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1761 __releases(rq1->lock)
1762 __releases(rq2->lock)
1763{
1764 BUG_ON(rq1 != rq2);
1765 raw_spin_unlock(&rq1->lock);
1766 __release(rq2->lock);
1838} 1767}
1768
1839#endif 1769#endif
1840 1770
1841static void calc_load_account_idle(struct rq *this_rq); 1771static void calc_load_account_idle(struct rq *this_rq);
@@ -1877,23 +1807,20 @@ static void dec_nr_running(struct rq *rq)
1877 1807
1878static void set_load_weight(struct task_struct *p) 1808static void set_load_weight(struct task_struct *p)
1879{ 1809{
1880 if (task_has_rt_policy(p)) { 1810 int prio = p->static_prio - MAX_RT_PRIO;
1881 p->se.load.weight = 0; 1811 struct load_weight *load = &p->se.load;
1882 p->se.load.inv_weight = WMULT_CONST;
1883 return;
1884 }
1885 1812
1886 /* 1813 /*
1887 * SCHED_IDLE tasks get minimal weight: 1814 * SCHED_IDLE tasks get minimal weight:
1888 */ 1815 */
1889 if (p->policy == SCHED_IDLE) { 1816 if (p->policy == SCHED_IDLE) {
1890 p->se.load.weight = WEIGHT_IDLEPRIO; 1817 load->weight = scale_load(WEIGHT_IDLEPRIO);
1891 p->se.load.inv_weight = WMULT_IDLEPRIO; 1818 load->inv_weight = WMULT_IDLEPRIO;
1892 return; 1819 return;
1893 } 1820 }
1894 1821
1895 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; 1822 load->weight = scale_load(prio_to_weight[prio]);
1896 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1823 load->inv_weight = prio_to_wmult[prio];
1897} 1824}
1898 1825
1899static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 1826static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1901,7 +1828,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1901 update_rq_clock(rq); 1828 update_rq_clock(rq);
1902 sched_info_queued(p); 1829 sched_info_queued(p);
1903 p->sched_class->enqueue_task(rq, p, flags); 1830 p->sched_class->enqueue_task(rq, p, flags);
1904 p->se.on_rq = 1;
1905} 1831}
1906 1832
1907static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 1833static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1909,7 +1835,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1909 update_rq_clock(rq); 1835 update_rq_clock(rq);
1910 sched_info_dequeued(p); 1836 sched_info_dequeued(p);
1911 p->sched_class->dequeue_task(rq, p, flags); 1837 p->sched_class->dequeue_task(rq, p, flags);
1912 p->se.on_rq = 0;
1913} 1838}
1914 1839
1915/* 1840/*
@@ -1936,14 +1861,227 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1936 dec_nr_running(rq); 1861 dec_nr_running(rq);
1937} 1862}
1938 1863
1864#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1865
1866/*
1867 * There are no locks covering percpu hardirq/softirq time.
1868 * They are only modified in account_system_vtime, on corresponding CPU
1869 * with interrupts disabled. So, writes are safe.
1870 * They are read and saved off onto struct rq in update_rq_clock().
1871 * This may result in other CPU reading this CPU's irq time and can
1872 * race with irq/account_system_vtime on this CPU. We would either get old
1873 * or new value with a side effect of accounting a slice of irq time to wrong
1874 * task when irq is in progress while we read rq->clock. That is a worthy
1875 * compromise in place of having locks on each irq in account_system_time.
1876 */
1877static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1878static DEFINE_PER_CPU(u64, cpu_softirq_time);
1879
1880static DEFINE_PER_CPU(u64, irq_start_time);
1881static int sched_clock_irqtime;
1882
1883void enable_sched_clock_irqtime(void)
1884{
1885 sched_clock_irqtime = 1;
1886}
1887
1888void disable_sched_clock_irqtime(void)
1889{
1890 sched_clock_irqtime = 0;
1891}
1892
1893#ifndef CONFIG_64BIT
1894static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
1895
1896static inline void irq_time_write_begin(void)
1897{
1898 __this_cpu_inc(irq_time_seq.sequence);
1899 smp_wmb();
1900}
1901
1902static inline void irq_time_write_end(void)
1903{
1904 smp_wmb();
1905 __this_cpu_inc(irq_time_seq.sequence);
1906}
1907
1908static inline u64 irq_time_read(int cpu)
1909{
1910 u64 irq_time;
1911 unsigned seq;
1912
1913 do {
1914 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1915 irq_time = per_cpu(cpu_softirq_time, cpu) +
1916 per_cpu(cpu_hardirq_time, cpu);
1917 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1918
1919 return irq_time;
1920}
1921#else /* CONFIG_64BIT */
1922static inline void irq_time_write_begin(void)
1923{
1924}
1925
1926static inline void irq_time_write_end(void)
1927{
1928}
1929
1930static inline u64 irq_time_read(int cpu)
1931{
1932 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1933}
1934#endif /* CONFIG_64BIT */
1935
1936/*
1937 * Called before incrementing preempt_count on {soft,}irq_enter
1938 * and before decrementing preempt_count on {soft,}irq_exit.
1939 */
1940void account_system_vtime(struct task_struct *curr)
1941{
1942 unsigned long flags;
1943 s64 delta;
1944 int cpu;
1945
1946 if (!sched_clock_irqtime)
1947 return;
1948
1949 local_irq_save(flags);
1950
1951 cpu = smp_processor_id();
1952 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
1953 __this_cpu_add(irq_start_time, delta);
1954
1955 irq_time_write_begin();
1956 /*
1957 * We do not account for softirq time from ksoftirqd here.
1958 * We want to continue accounting softirq time to ksoftirqd thread
1959 * in that case, so as not to confuse scheduler with a special task
1960 * that do not consume any time, but still wants to run.
1961 */
1962 if (hardirq_count())
1963 __this_cpu_add(cpu_hardirq_time, delta);
1964 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
1965 __this_cpu_add(cpu_softirq_time, delta);
1966
1967 irq_time_write_end();
1968 local_irq_restore(flags);
1969}
1970EXPORT_SYMBOL_GPL(account_system_vtime);
1971
1972static void update_rq_clock_task(struct rq *rq, s64 delta)
1973{
1974 s64 irq_delta;
1975
1976 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
1977
1978 /*
1979 * Since irq_time is only updated on {soft,}irq_exit, we might run into
1980 * this case when a previous update_rq_clock() happened inside a
1981 * {soft,}irq region.
1982 *
1983 * When this happens, we stop ->clock_task and only update the
1984 * prev_irq_time stamp to account for the part that fit, so that a next
1985 * update will consume the rest. This ensures ->clock_task is
1986 * monotonic.
1987 *
1988 * It does however cause some slight miss-attribution of {soft,}irq
1989 * time, a more accurate solution would be to update the irq_time using
1990 * the current rq->clock timestamp, except that would require using
1991 * atomic ops.
1992 */
1993 if (irq_delta > delta)
1994 irq_delta = delta;
1995
1996 rq->prev_irq_time += irq_delta;
1997 delta -= irq_delta;
1998 rq->clock_task += delta;
1999
2000 if (irq_delta && sched_feat(NONIRQ_POWER))
2001 sched_rt_avg_update(rq, irq_delta);
2002}
2003
2004static int irqtime_account_hi_update(void)
2005{
2006 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2007 unsigned long flags;
2008 u64 latest_ns;
2009 int ret = 0;
2010
2011 local_irq_save(flags);
2012 latest_ns = this_cpu_read(cpu_hardirq_time);
2013 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
2014 ret = 1;
2015 local_irq_restore(flags);
2016 return ret;
2017}
2018
2019static int irqtime_account_si_update(void)
2020{
2021 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2022 unsigned long flags;
2023 u64 latest_ns;
2024 int ret = 0;
2025
2026 local_irq_save(flags);
2027 latest_ns = this_cpu_read(cpu_softirq_time);
2028 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
2029 ret = 1;
2030 local_irq_restore(flags);
2031 return ret;
2032}
2033
2034#else /* CONFIG_IRQ_TIME_ACCOUNTING */
2035
2036#define sched_clock_irqtime (0)
2037
2038static void update_rq_clock_task(struct rq *rq, s64 delta)
2039{
2040 rq->clock_task += delta;
2041}
2042
2043#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2044
1939#include "sched_idletask.c" 2045#include "sched_idletask.c"
1940#include "sched_fair.c" 2046#include "sched_fair.c"
1941#include "sched_rt.c" 2047#include "sched_rt.c"
2048#include "sched_autogroup.c"
2049#include "sched_stoptask.c"
1942#include "../litmus/sched_litmus.c" 2050#include "../litmus/sched_litmus.c"
1943#ifdef CONFIG_SCHED_DEBUG 2051#ifdef CONFIG_SCHED_DEBUG
1944# include "sched_debug.c" 2052# include "sched_debug.c"
1945#endif 2053#endif
1946 2054
2055void sched_set_stop_task(int cpu, struct task_struct *stop)
2056{
2057 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
2058 struct task_struct *old_stop = cpu_rq(cpu)->stop;
2059
2060 if (stop) {
2061 /*
2062 * Make it appear like a SCHED_FIFO task, its something
2063 * userspace knows about and won't get confused about.
2064 *
2065 * Also, it will make PI more or less work without too
2066 * much confusion -- but then, stop work should not
2067 * rely on PI working anyway.
2068 */
2069 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
2070
2071 stop->sched_class = &stop_sched_class;
2072 }
2073
2074 cpu_rq(cpu)->stop = stop;
2075
2076 if (old_stop) {
2077 /*
2078 * Reset it back to a normal scheduling class so that
2079 * it can die in pieces.
2080 */
2081 old_stop->sched_class = &rt_sched_class;
2082 }
2083}
2084
1947/* 2085/*
1948 * __normal_prio - return the priority that is based on the static prio 2086 * __normal_prio - return the priority that is based on the static prio
1949 */ 2087 */
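
On !CONFIG_64BIT the hunk above guards the per-cpu irq time with a seqcount: irq_time_write_begin()/irq_time_write_end() bump a sequence counter around the update, and irq_time_read() retries until it observes a stable, even value. A user-space sketch of that retry scheme using C11 atomics (names invented; the memory ordering is simplified compared to the kernel's smp_wmb()-based version):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static atomic_uint seq;
static atomic_uint_least64_t hardirq_time, softirq_time;

/* single writer (the local CPU in the kernel case; one writer here too) */
static void irq_time_add(uint64_t hi, uint64_t si)
{
	atomic_fetch_add(&seq, 1);      /* sequence becomes odd: update in flight */
	atomic_fetch_add_explicit(&hardirq_time, hi, memory_order_relaxed);
	atomic_fetch_add_explicit(&softirq_time, si, memory_order_relaxed);
	atomic_fetch_add(&seq, 1);      /* even again: update visible */
}

/* readers retry until they see a stable, even sequence */
static uint64_t irq_time_sum(void)
{
	unsigned s;
	uint64_t sum;

	do {
		s = atomic_load(&seq);
		sum = atomic_load_explicit(&hardirq_time, memory_order_relaxed) +
		      atomic_load_explicit(&softirq_time, memory_order_relaxed);
		atomic_thread_fence(memory_order_acquire);
	} while ((s & 1) || atomic_load(&seq) != s);

	return sum;
}

int main(void)
{
	irq_time_add(120, 30);
	irq_time_add(0, 45);
	printf("accumulated irq time: %llu\n", (unsigned long long)irq_time_sum());
	return 0;
}
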
@@ -2001,14 +2139,43 @@ inline int task_curr(const struct task_struct *p)
2001 2139
2002static inline void check_class_changed(struct rq *rq, struct task_struct *p, 2140static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2003 const struct sched_class *prev_class, 2141 const struct sched_class *prev_class,
2004 int oldprio, int running) 2142 int oldprio)
2005{ 2143{
2006 if (prev_class != p->sched_class) { 2144 if (prev_class != p->sched_class) {
2007 if (prev_class->switched_from) 2145 if (prev_class->switched_from)
2008 prev_class->switched_from(rq, p, running); 2146 prev_class->switched_from(rq, p);
2009 p->sched_class->switched_to(rq, p, running); 2147 p->sched_class->switched_to(rq, p);
2010 } else 2148 } else if (oldprio != p->prio)
2011 p->sched_class->prio_changed(rq, p, oldprio, running); 2149 p->sched_class->prio_changed(rq, p, oldprio);
2150}
2151
2152static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2153{
2154 const struct sched_class *class;
2155
2156 if (p->sched_class == rq->curr->sched_class) {
2157 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2158 } else {
2159 for_each_class(class) {
2160 if (class == rq->curr->sched_class)
2161 break;
2162 if (class == p->sched_class) {
2163 resched_task(rq->curr);
2164 break;
2165 }
2166 }
2167 }
2168
2169 /*
2170 * A queue event has occurred, and we're going to schedule. In
2171 * this case, we can save a useless back to back clock update.
2172 */
2173 /* LITMUS^RT:
2174 * The "disable-clock-update" approach was buggy in Linux 2.6.36.
2175 * The issue has been solved in 2.6.37.
2176 */
2177 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
2178 rq->skip_clock_update = 1;
2012} 2179}
2013 2180
2014#ifdef CONFIG_SMP 2181#ifdef CONFIG_SMP
@@ -2023,6 +2190,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2023 if (p->sched_class != &fair_sched_class) 2190 if (p->sched_class != &fair_sched_class)
2024 return 0; 2191 return 0;
2025 2192
2193 if (unlikely(p->policy == SCHED_IDLE))
2194 return 0;
2195
2026 /* 2196 /*
2027 * Buddy candidates are cache hot: 2197 * Buddy candidates are cache hot:
2028 */ 2198 */
@@ -2050,6 +2220,21 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2050 */ 2220 */
2051 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 2221 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2052 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2222 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2223
2224#ifdef CONFIG_LOCKDEP
2225 /*
2226 * The caller should hold either p->pi_lock or rq->lock, when changing
2227 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
2228 *
2229 * sched_move_task() holds both and thus holding either pins the cgroup,
2230 * see set_task_rq().
2231 *
2232 * Furthermore, all task_rq users should acquire both locks, see
2233 * task_rq_lock().
2234 */
2235 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2236 lockdep_is_held(&task_rq(p)->lock)));
2237#endif
2053#endif 2238#endif
2054 2239
2055 trace_sched_migrate_task(p, new_cpu); 2240 trace_sched_migrate_task(p, new_cpu);
@@ -2070,21 +2255,6 @@ struct migration_arg {
2070static int migration_cpu_stop(void *data); 2255static int migration_cpu_stop(void *data);
2071 2256
2072/* 2257/*
2073 * The task's runqueue lock must be held.
2074 * Returns true if you have to wait for migration thread.
2075 */
2076static bool migrate_task(struct task_struct *p, int dest_cpu)
2077{
2078 struct rq *rq = task_rq(p);
2079
2080 /*
2081 * If the task is not on a runqueue (and not running), then
2082 * the next wake-up will properly place the task.
2083 */
2084 return p->se.on_rq || task_running(rq, p);
2085}
2086
2087/*
2088 * wait_task_inactive - wait for a thread to unschedule. 2258 * wait_task_inactive - wait for a thread to unschedule.
2089 * 2259 *
2090 * If @match_state is nonzero, it's the @p->state value just checked and 2260 * If @match_state is nonzero, it's the @p->state value just checked and
@@ -2141,11 +2311,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2141 rq = task_rq_lock(p, &flags); 2311 rq = task_rq_lock(p, &flags);
2142 trace_sched_wait_task(p); 2312 trace_sched_wait_task(p);
2143 running = task_running(rq, p); 2313 running = task_running(rq, p);
2144 on_rq = p->se.on_rq; 2314 on_rq = p->on_rq;
2145 ncsw = 0; 2315 ncsw = 0;
2146 if (!match_state || p->state == match_state) 2316 if (!match_state || p->state == match_state)
2147 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2317 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2148 task_rq_unlock(rq, &flags); 2318 task_rq_unlock(rq, p, &flags);
2149 2319
2150 /* 2320 /*
2151 * If it changed from the expected state, bail out now. 2321 * If it changed from the expected state, bail out now.
@@ -2174,7 +2344,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2174 * yield - it could be a while. 2344 * yield - it could be a while.
2175 */ 2345 */
2176 if (unlikely(on_rq)) { 2346 if (unlikely(on_rq)) {
2177 schedule_timeout_uninterruptible(1); 2347 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
2348
2349 set_current_state(TASK_UNINTERRUPTIBLE);
2350 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2178 continue; 2351 continue;
2179 } 2352 }
2180 2353
@@ -2196,7 +2369,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2196 * Cause a process which is running on another CPU to enter 2369 * Cause a process which is running on another CPU to enter
2197 * kernel-mode, without any delay. (to get signals handled.) 2370 * kernel-mode, without any delay. (to get signals handled.)
2198 * 2371 *
2199 * NOTE: this function doesnt have to take the runqueue lock, 2372 * NOTE: this function doesn't have to take the runqueue lock,
2200 * because all it wants to ensure is that the remote task enters 2373 * because all it wants to ensure is that the remote task enters
2201 * the kernel. If the IPI races and the task has been migrated 2374 * the kernel. If the IPI races and the task has been migrated
2202 * to another CPU then no harm is done and the purpose has been 2375 * to another CPU then no harm is done and the purpose has been
@@ -2215,30 +2388,9 @@ void kick_process(struct task_struct *p)
2215EXPORT_SYMBOL_GPL(kick_process); 2388EXPORT_SYMBOL_GPL(kick_process);
2216#endif /* CONFIG_SMP */ 2389#endif /* CONFIG_SMP */
2217 2390
2218/**
2219 * task_oncpu_function_call - call a function on the cpu on which a task runs
2220 * @p: the task to evaluate
2221 * @func: the function to be called
2222 * @info: the function call argument
2223 *
2224 * Calls the function @func when the task is currently running. This might
2225 * be on the current CPU, which just calls the function directly
2226 */
2227void task_oncpu_function_call(struct task_struct *p,
2228 void (*func) (void *info), void *info)
2229{
2230 int cpu;
2231
2232 preempt_disable();
2233 cpu = task_cpu(p);
2234 if (task_curr(p))
2235 smp_call_function_single(cpu, func, info, 1);
2236 preempt_enable();
2237}
2238
2239#ifdef CONFIG_SMP 2391#ifdef CONFIG_SMP
2240/* 2392/*
2241 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. 2393 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
2242 */ 2394 */
2243static int select_fallback_rq(int cpu, struct task_struct *p) 2395static int select_fallback_rq(int cpu, struct task_struct *p)
2244{ 2396{
@@ -2256,30 +2408,27 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2256 return dest_cpu; 2408 return dest_cpu;
2257 2409
2258 /* No more Mr. Nice Guy. */ 2410 /* No more Mr. Nice Guy. */
2259 if (unlikely(dest_cpu >= nr_cpu_ids)) { 2411 dest_cpu = cpuset_cpus_allowed_fallback(p);
2260 dest_cpu = cpuset_cpus_allowed_fallback(p); 2412 /*
2261 /* 2413 * Don't tell them about moving exiting tasks or
2262 * Don't tell them about moving exiting tasks or 2414 * kernel threads (both mm NULL), since they never
2263 * kernel threads (both mm NULL), since they never 2415 * leave kernel.
2264 * leave kernel. 2416 */
2265 */ 2417 if (p->mm && printk_ratelimit()) {
2266 if (p->mm && printk_ratelimit()) { 2418 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
2267 printk(KERN_INFO "process %d (%s) no " 2419 task_pid_nr(p), p->comm, cpu);
2268 "longer affine to cpu%d\n",
2269 task_pid_nr(p), p->comm, cpu);
2270 }
2271 } 2420 }
2272 2421
2273 return dest_cpu; 2422 return dest_cpu;
2274} 2423}
2275 2424
2276/* 2425/*
2277 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. 2426 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
2278 */ 2427 */
2279static inline 2428static inline
2280int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) 2429int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2281{ 2430{
2282 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); 2431 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2283 2432
2284 /* 2433 /*
2285 * In order not to call set_task_cpu() on a blocking task we need 2434 * In order not to call set_task_cpu() on a blocking task we need
@@ -2305,27 +2454,63 @@ static void update_avg(u64 *avg, u64 sample)
2305} 2454}
2306#endif 2455#endif
2307 2456
2308static inline void ttwu_activate(struct task_struct *p, struct rq *rq, 2457static void
2309 bool is_sync, bool is_migrate, bool is_local, 2458ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2310 unsigned long en_flags)
2311{ 2459{
2312 schedstat_inc(p, se.statistics.nr_wakeups); 2460#ifdef CONFIG_SCHEDSTATS
2313 if (is_sync) 2461 struct rq *rq = this_rq();
2314 schedstat_inc(p, se.statistics.nr_wakeups_sync); 2462
2315 if (is_migrate) 2463#ifdef CONFIG_SMP
2316 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 2464 int this_cpu = smp_processor_id();
2317 if (is_local) 2465
2466 if (cpu == this_cpu) {
2467 schedstat_inc(rq, ttwu_local);
2318 schedstat_inc(p, se.statistics.nr_wakeups_local); 2468 schedstat_inc(p, se.statistics.nr_wakeups_local);
2319 else 2469 } else {
2470 struct sched_domain *sd;
2471
2320 schedstat_inc(p, se.statistics.nr_wakeups_remote); 2472 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2473 rcu_read_lock();
2474 for_each_domain(this_cpu, sd) {
2475 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2476 schedstat_inc(sd, ttwu_wake_remote);
2477 break;
2478 }
2479 }
2480 rcu_read_unlock();
2481 }
2482
2483 if (wake_flags & WF_MIGRATED)
2484 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2485
2486#endif /* CONFIG_SMP */
2487
2488 schedstat_inc(rq, ttwu_count);
2489 schedstat_inc(p, se.statistics.nr_wakeups);
2321 2490
2491 if (wake_flags & WF_SYNC)
2492 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2493
2494#endif /* CONFIG_SCHEDSTATS */
2495}
2496
2497static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2498{
2322 activate_task(rq, p, en_flags); 2499 activate_task(rq, p, en_flags);
2500 p->on_rq = 1;
2501
2502 /* if a worker is waking up, notify workqueue */
2503 if (p->flags & PF_WQ_WORKER)
2504 wq_worker_waking_up(p, cpu_of(rq));
2323} 2505}
2324 2506
2325static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, 2507/*
2326 int wake_flags, bool success) 2508 * Mark the task runnable and perform wakeup-preemption.
2509 */
2510static void
2511ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2327{ 2512{
2328 trace_sched_wakeup(p, success); 2513 trace_sched_wakeup(p, true);
2329 check_preempt_curr(rq, p, wake_flags); 2514 check_preempt_curr(rq, p, wake_flags);
2330 2515
2331 p->state = TASK_RUNNING; 2516 p->state = TASK_RUNNING;
@@ -2344,9 +2529,156 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2344 rq->idle_stamp = 0; 2529 rq->idle_stamp = 0;
2345 } 2530 }
2346#endif 2531#endif
2347 /* if a worker is waking up, notify workqueue */ 2532}
2348 if ((p->flags & PF_WQ_WORKER) && success) 2533
2349 wq_worker_waking_up(p, cpu_of(rq)); 2534static void
2535ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
2536{
2537#ifdef CONFIG_SMP
2538 if (p->sched_contributes_to_load)
2539 rq->nr_uninterruptible--;
2540#endif
2541
2542 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
2543 ttwu_do_wakeup(rq, p, wake_flags);
2544}
2545
2546/*
2547 * Called in case the task @p isn't fully descheduled from its runqueue,
2548 * in this case we must do a remote wakeup. Its a 'light' wakeup though,
2549 * since all we need to do is flip p->state to TASK_RUNNING, since
2550 * the task is still ->on_rq.
2551 */
2552static int ttwu_remote(struct task_struct *p, int wake_flags)
2553{
2554 struct rq *rq;
2555 int ret = 0;
2556
2557 rq = __task_rq_lock(p);
2558 if (p->on_rq) {
2559 ttwu_do_wakeup(rq, p, wake_flags);
2560 ret = 1;
2561 }
2562 __task_rq_unlock(rq);
2563
2564 return ret;
2565}
2566
2567#ifdef CONFIG_SMP
2568static void sched_ttwu_do_pending(struct task_struct *list)
2569{
2570 struct rq *rq = this_rq();
2571
2572 raw_spin_lock(&rq->lock);
2573
2574 while (list) {
2575 struct task_struct *p = list;
2576 list = list->wake_entry;
2577 ttwu_do_activate(rq, p, 0);
2578 }
2579
2580 raw_spin_unlock(&rq->lock);
2581}
2582
2583#ifdef CONFIG_HOTPLUG_CPU
2584
2585static void sched_ttwu_pending(void)
2586{
2587 struct rq *rq = this_rq();
2588 struct task_struct *list = xchg(&rq->wake_list, NULL);
2589
2590 if (!list)
2591 return;
2592
2593 sched_ttwu_do_pending(list);
2594}
2595
2596#endif /* CONFIG_HOTPLUG_CPU */
2597
2598void scheduler_ipi(void)
2599{
2600 struct rq *rq = this_rq();
2601 struct task_struct *list = xchg(&rq->wake_list, NULL);
2602
2603 if (!list)
2604 return;
2605
2606 /*
2607 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
2608 * traditionally all their work was done from the interrupt return
2609 * path. Now that we actually do some work, we need to make sure
2610 * we do call them.
2611 *
2612 * Some archs already do call them, luckily irq_enter/exit nest
2613 * properly.
2614 *
2615 * Arguably we should visit all archs and update all handlers,
2616 * however a fair share of IPIs are still resched only so this would
2617 * somewhat pessimize the simple resched case.
2618 */
2619 irq_enter();
2620 sched_ttwu_do_pending(list);
2621 irq_exit();
2622}
2623
2624static void ttwu_queue_remote(struct task_struct *p, int cpu)
2625{
2626 struct rq *rq = cpu_rq(cpu);
2627 struct task_struct *next = rq->wake_list;
2628
2629 for (;;) {
2630 struct task_struct *old = next;
2631
2632 p->wake_entry = next;
2633 next = cmpxchg(&rq->wake_list, old, p);
2634 if (next == old)
2635 break;
2636 }
2637
2638 if (!next)
2639 smp_send_reschedule(cpu);
2640}
2641
2642#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2643static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
2644{
2645 struct rq *rq;
2646 int ret = 0;
2647
2648 rq = __task_rq_lock(p);
2649 if (p->on_cpu) {
2650 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2651 ttwu_do_wakeup(rq, p, wake_flags);
2652 ret = 1;
2653 }
2654 __task_rq_unlock(rq);
2655
2656 return ret;
2657
2658}
2659#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2660#endif /* CONFIG_SMP */
2661
2662static void ttwu_queue(struct task_struct *p, int cpu)
2663{
2664 struct rq *rq = cpu_rq(cpu);
2665
2666#if defined(CONFIG_SMP)
2667 /*
2668 * LITMUS^RT: whether to send an IPI to the remote CPU
2669 * is plugin specific.
2670 */
2671 if (!is_realtime(p) &&
2672 sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
2673 sched_clock_cpu(cpu); /* sync clocks x-cpu */
2674 ttwu_queue_remote(p, cpu);
2675 return;
2676 }
2677#endif
2678
2679 raw_spin_lock(&rq->lock);
2680 ttwu_do_activate(rq, p, 0);
2681 raw_spin_unlock(&rq->lock);
2350} 2682}
2351 2683
2352/** 2684/**
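
ttwu_queue_remote() above pushes the task onto the remote CPU's wake_list with cmpxchg() and only sends a reschedule IPI when the list was previously empty; scheduler_ipi() later drains the whole list with a single xchg(). A stand-alone C11 sketch of that push/drain pattern (invented node type, user-space atomics rather than the kernel primitives):

#include <stdatomic.h>
#include <stdio.h>

struct node {
	int id;
	struct node *next;
};

static _Atomic(struct node *) wake_list;

/* producer side: push one node, report whether the list was empty before */
static int push_wake(struct node *n)
{
	struct node *old = atomic_load(&wake_list);

	do {
		n->next = old;                    /* link to the current head */
	} while (!atomic_compare_exchange_weak(&wake_list, &old, n));

	return old == NULL;                       /* empty before: send the IPI */
}

/* consumer side (the IPI handler): grab the whole list in one exchange */
static struct node *drain_wake(void)
{
	return atomic_exchange(&wake_list, (struct node *)NULL);
}

int main(void)
{
	struct node a = { 1, NULL }, b = { 2, NULL };

	if (push_wake(&a))
		printf("would send reschedule IPI\n");
	push_wake(&b);

	/* drained in LIFO order, like the kernel's wake_list */
	for (struct node *n = drain_wake(); n; n = n->next)
		printf("waking task %d\n", n->id);
	return 0;
}
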
@@ -2364,97 +2696,79 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2364 * Returns %true if @p was woken up, %false if it was already running 2696 * Returns %true if @p was woken up, %false if it was already running
2365 * or @state didn't match @p's state. 2697 * or @state didn't match @p's state.
2366 */ 2698 */
2367static int try_to_wake_up(struct task_struct *p, unsigned int state, 2699static int
2368 int wake_flags) 2700try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2369{ 2701{
2370 int cpu, orig_cpu, this_cpu, success = 0;
2371 unsigned long flags; 2702 unsigned long flags;
2372 unsigned long en_flags = ENQUEUE_WAKEUP; 2703 int cpu, success = 0;
2373 struct rq *rq;
2374 2704
2375 if (is_realtime(p)) 2705 if (is_realtime(p))
2376 TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state); 2706 TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state);
2377 2707
2378 this_cpu = get_cpu();
2379
2380 smp_wmb(); 2708 smp_wmb();
2381 rq = task_rq_lock(p, &flags); 2709 raw_spin_lock_irqsave(&p->pi_lock, flags);
2382 if (!(p->state & state)) 2710 if (!(p->state & state))
2383 goto out; 2711 goto out;
2384 2712
2385 if (p->se.on_rq) 2713 success = 1; /* we're going to change ->state */
2386 goto out_running;
2387
2388 cpu = task_cpu(p); 2714 cpu = task_cpu(p);
2389 orig_cpu = cpu;
2390 2715
2391#ifdef CONFIG_SMP 2716 if (p->on_rq && ttwu_remote(p, wake_flags))
2392 if (unlikely(task_running(rq, p)) || is_realtime(p)) 2717 goto stat;
2393 goto out_activate;
2394 2718
2719#ifdef CONFIG_SMP
2395 /* 2720 /*
2396 * In order to handle concurrent wakeups and release the rq->lock 2721 * If the owning (remote) cpu is still in the middle of schedule() with
2397 * we put the task in TASK_WAKING state. 2722 * this task as prev, wait until its done referencing the task.
2398 *
2399 * First fix up the nr_uninterruptible count:
2400 */ 2723 */
2401 if (task_contributes_to_load(p)) { 2724 while (p->on_cpu) {
2402 if (likely(cpu_online(orig_cpu))) 2725#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2403 rq->nr_uninterruptible--; 2726 /*
2404 else 2727 * In case the architecture enables interrupts in
2405 this_rq()->nr_uninterruptible--; 2728 * context_switch(), we cannot busy wait, since that
2406 } 2729 * would lead to deadlocks when an interrupt hits and
2407 p->state = TASK_WAKING; 2730 * tries to wake up @prev. So bail and do a complete
2408 2731 * remote wakeup.
2409 if (p->sched_class->task_waking) { 2732 */
2410 p->sched_class->task_waking(rq, p); 2733 if (ttwu_activate_remote(p, wake_flags))
2411 en_flags |= ENQUEUE_WAKING; 2734 goto stat;
2735#else
2736 cpu_relax();
2737#endif
2412 } 2738 }
2739 /*
2740 * Pairs with the smp_wmb() in finish_lock_switch().
2741 */
2742 smp_rmb();
2413 2743
2414 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); 2744 /* LITMUS^RT: once the task can be safely referenced by this
2415 if (cpu != orig_cpu) 2745 * CPU, don't mess with Linux load balancing stuff.
2416 set_task_cpu(p, cpu); 2746 */
2417 __task_rq_unlock(rq); 2747 if (is_realtime(p))
2748 goto litmus_out_activate;
2418 2749
2419 rq = cpu_rq(cpu); 2750 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2420 raw_spin_lock(&rq->lock); 2751 p->state = TASK_WAKING;
2421 2752
2422 /* 2753 if (p->sched_class->task_waking)
2423 * We migrated the task without holding either rq->lock, however 2754 p->sched_class->task_waking(p);
2424 * since the task is not on the task list itself, nobody else
2425 * will try and migrate the task, hence the rq should match the
2426 * cpu we just moved it to.
2427 */
2428 WARN_ON(task_cpu(p) != cpu);
2429 WARN_ON(p->state != TASK_WAKING);
2430 2755
2431#ifdef CONFIG_SCHEDSTATS 2756 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2432 schedstat_inc(rq, ttwu_count); 2757 if (task_cpu(p) != cpu) {
2433 if (cpu == this_cpu) 2758 wake_flags |= WF_MIGRATED;
2434 schedstat_inc(rq, ttwu_local); 2759 set_task_cpu(p, cpu);
2435 else {
2436 struct sched_domain *sd;
2437 for_each_domain(this_cpu, sd) {
2438 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2439 schedstat_inc(sd, ttwu_wake_remote);
2440 break;
2441 }
2442 }
2443 } 2760 }
2444#endif /* CONFIG_SCHEDSTATS */
2445 2761
2446out_activate: 2762litmus_out_activate:
2447#endif /* CONFIG_SMP */ 2763#endif /* CONFIG_SMP */
2448 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, 2764
2449 cpu == this_cpu, en_flags); 2765 ttwu_queue(p, cpu);
2450 success = 1; 2766stat:
2451out_running: 2767 ttwu_stat(p, cpu, wake_flags);
2452 ttwu_post_activation(p, rq, wake_flags, success);
2453out: 2768out:
2454 if (is_realtime(p)) 2769 if (is_realtime(p))
2455 TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state); 2770 TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state);
2456 task_rq_unlock(rq, &flags); 2771 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2457 put_cpu();
2458 2772
2459 return success; 2773 return success;
2460} 2774}
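The while (p->on_cpu) loop in the rewritten try_to_wake_up() waits until the CPU that is still switching @p out has finished with it; the smp_rmb() pairs with the smp_wmb() in finish_lock_switch() so that everything written before ->on_cpu was cleared is visible to the waker. A self-contained C11 sketch of that release/acquire handshake (illustrative only, not the kernel's primitives):

#include <stdatomic.h>
#include <stdbool.h>

struct switch_ctx {
	int		state;		/* written by the CPU switching the task out */
	atomic_bool	on_cpu;		/* cleared last, with release semantics */
};

/* Old CPU: publish the final context, then clear on_cpu. */
static void finish_switch(struct switch_ctx *c, int new_state)
{
	c->state = new_state;
	atomic_store_explicit(&c->on_cpu, false, memory_order_release);
}

/* Waking CPU: wait for the old CPU, then read the published context. */
static int wait_for_switch(struct switch_ctx *c)
{
	while (atomic_load_explicit(&c->on_cpu, memory_order_acquire))
		;	/* busy-wait; the kernel uses cpu_relax() here */
	return c->state;	/* ordered after the acquiring load */
}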
@@ -2463,31 +2777,34 @@ out:
2463 * try_to_wake_up_local - try to wake up a local task with rq lock held 2777 * try_to_wake_up_local - try to wake up a local task with rq lock held
2464 * @p: the thread to be awakened 2778 * @p: the thread to be awakened
2465 * 2779 *
2466 * Put @p on the run-queue if it's not alredy there. The caller must 2780 * Put @p on the run-queue if it's not already there. The caller must
2467 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2781 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2468 * the current task. this_rq() stays locked over invocation. 2782 * the current task.
2469 */ 2783 */
2470static void try_to_wake_up_local(struct task_struct *p) 2784static void try_to_wake_up_local(struct task_struct *p)
2471{ 2785{
2472 struct rq *rq = task_rq(p); 2786 struct rq *rq = task_rq(p);
2473 bool success = false;
2474 2787
2475 BUG_ON(rq != this_rq()); 2788 BUG_ON(rq != this_rq());
2476 BUG_ON(p == current); 2789 BUG_ON(p == current);
2477 lockdep_assert_held(&rq->lock); 2790 lockdep_assert_held(&rq->lock);
2478 2791
2792 if (!raw_spin_trylock(&p->pi_lock)) {
2793 raw_spin_unlock(&rq->lock);
2794 raw_spin_lock(&p->pi_lock);
2795 raw_spin_lock(&rq->lock);
2796 }
2797
2479 if (!(p->state & TASK_NORMAL)) 2798 if (!(p->state & TASK_NORMAL))
2480 return; 2799 goto out;
2481 2800
2482 if (!p->se.on_rq) { 2801 if (!p->on_rq)
2483 if (likely(!task_running(rq, p))) { 2802 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2484 schedstat_inc(rq, ttwu_count); 2803
2485 schedstat_inc(rq, ttwu_local); 2804 ttwu_do_wakeup(rq, p, 0);
2486 } 2805 ttwu_stat(p, smp_processor_id(), 0);
2487 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); 2806out:
2488 success = true; 2807 raw_spin_unlock(&p->pi_lock);
2489 }
2490 ttwu_post_activation(p, rq, 0, success);
2491} 2808}
2492 2809
2493/** 2810/**
@@ -2520,18 +2837,21 @@ int wake_up_state(struct task_struct *p, unsigned int state)
2520 */ 2837 */
2521static void __sched_fork(struct task_struct *p) 2838static void __sched_fork(struct task_struct *p)
2522{ 2839{
2840 p->on_rq = 0;
2841
2842 p->se.on_rq = 0;
2523 p->se.exec_start = 0; 2843 p->se.exec_start = 0;
2524 p->se.sum_exec_runtime = 0; 2844 p->se.sum_exec_runtime = 0;
2525 p->se.prev_sum_exec_runtime = 0; 2845 p->se.prev_sum_exec_runtime = 0;
2526 p->se.nr_migrations = 0; 2846 p->se.nr_migrations = 0;
2847 p->se.vruntime = 0;
2848 INIT_LIST_HEAD(&p->se.group_node);
2527 2849
2528#ifdef CONFIG_SCHEDSTATS 2850#ifdef CONFIG_SCHEDSTATS
2529 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2851 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2530#endif 2852#endif
2531 2853
2532 INIT_LIST_HEAD(&p->rt.run_list); 2854 INIT_LIST_HEAD(&p->rt.run_list);
2533 p->se.on_rq = 0;
2534 INIT_LIST_HEAD(&p->se.group_node);
2535 2855
2536#ifdef CONFIG_PREEMPT_NOTIFIERS 2856#ifdef CONFIG_PREEMPT_NOTIFIERS
2537 INIT_HLIST_HEAD(&p->preempt_notifiers); 2857 INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2541,8 +2861,9 @@ static void __sched_fork(struct task_struct *p)
2541/* 2861/*
2542 * fork()/clone()-time setup: 2862 * fork()/clone()-time setup:
2543 */ 2863 */
2544void sched_fork(struct task_struct *p, int clone_flags) 2864void sched_fork(struct task_struct *p)
2545{ 2865{
2866 unsigned long flags;
2546 int cpu = get_cpu(); 2867 int cpu = get_cpu();
2547 2868
2548 __sched_fork(p); 2869 __sched_fork(p);
@@ -2594,22 +2915,24 @@ void sched_fork(struct task_struct *p, int clone_flags)
2594 * 2915 *
2595 * Silence PROVE_RCU. 2916 * Silence PROVE_RCU.
2596 */ 2917 */
2597 rcu_read_lock(); 2918 raw_spin_lock_irqsave(&p->pi_lock, flags);
2598 set_task_cpu(p, cpu); 2919 set_task_cpu(p, cpu);
2599 rcu_read_unlock(); 2920 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2600 2921
2601#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2922#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2602 if (likely(sched_info_on())) 2923 if (likely(sched_info_on()))
2603 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2924 memset(&p->sched_info, 0, sizeof(p->sched_info));
2604#endif 2925#endif
2605#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 2926#if defined(CONFIG_SMP)
2606 p->oncpu = 0; 2927 p->on_cpu = 0;
2607#endif 2928#endif
2608#ifdef CONFIG_PREEMPT 2929#ifdef CONFIG_PREEMPT
2609 /* Want to start with kernel preemption disabled. */ 2930 /* Want to start with kernel preemption disabled. */
2610 task_thread_info(p)->preempt_count = 1; 2931 task_thread_info(p)->preempt_count = 1;
2611#endif 2932#endif
2933#ifdef CONFIG_SMP
2612 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2934 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2935#endif
2613 2936
2614 put_cpu(); 2937 put_cpu();
2615} 2938}
@@ -2621,41 +2944,31 @@ void sched_fork(struct task_struct *p, int clone_flags)
2621 * that must be done for every newly created context, then puts the task 2944 * that must be done for every newly created context, then puts the task
2622 * on the runqueue and wakes it. 2945 * on the runqueue and wakes it.
2623 */ 2946 */
2624void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2947void wake_up_new_task(struct task_struct *p)
2625{ 2948{
2626 unsigned long flags; 2949 unsigned long flags;
2627 struct rq *rq; 2950 struct rq *rq;
2628 int cpu __maybe_unused = get_cpu();
2629 2951
2952 raw_spin_lock_irqsave(&p->pi_lock, flags);
2630#ifdef CONFIG_SMP 2953#ifdef CONFIG_SMP
2631 rq = task_rq_lock(p, &flags);
2632 p->state = TASK_WAKING;
2633
2634 /* 2954 /*
2635 * Fork balancing, do it here and not earlier because: 2955 * Fork balancing, do it here and not earlier because:
2636 * - cpus_allowed can change in the fork path 2956 * - cpus_allowed can change in the fork path
2637 * - any previously selected cpu might disappear through hotplug 2957 * - any previously selected cpu might disappear through hotplug
2638 *
2639 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2640 * without people poking at ->cpus_allowed.
2641 */ 2958 */
2642 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); 2959 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
2643 set_task_cpu(p, cpu);
2644
2645 p->state = TASK_RUNNING;
2646 task_rq_unlock(rq, &flags);
2647#endif 2960#endif
2648 2961
2649 rq = task_rq_lock(p, &flags); 2962 rq = __task_rq_lock(p);
2650 activate_task(rq, p, 0); 2963 activate_task(rq, p, 0);
2651 trace_sched_wakeup_new(p, 1); 2964 p->on_rq = 1;
2965 trace_sched_wakeup_new(p, true);
2652 check_preempt_curr(rq, p, WF_FORK); 2966 check_preempt_curr(rq, p, WF_FORK);
2653#ifdef CONFIG_SMP 2967#ifdef CONFIG_SMP
2654 if (p->sched_class->task_woken) 2968 if (p->sched_class->task_woken)
2655 p->sched_class->task_woken(rq, p); 2969 p->sched_class->task_woken(rq, p);
2656#endif 2970#endif
2657 task_rq_unlock(rq, &flags); 2971 task_rq_unlock(rq, p, &flags);
2658 put_cpu();
2659} 2972}
2660 2973
2661#ifdef CONFIG_PREEMPT_NOTIFIERS 2974#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2733,9 +3046,12 @@ static inline void
2733prepare_task_switch(struct rq *rq, struct task_struct *prev, 3046prepare_task_switch(struct rq *rq, struct task_struct *prev,
2734 struct task_struct *next) 3047 struct task_struct *next)
2735{ 3048{
3049 sched_info_switch(prev, next);
3050 perf_event_task_sched_out(prev, next);
2736 fire_sched_out_preempt_notifiers(prev, next); 3051 fire_sched_out_preempt_notifiers(prev, next);
2737 prepare_lock_switch(rq, next); 3052 prepare_lock_switch(rq, next);
2738 prepare_arch_switch(next); 3053 prepare_arch_switch(next);
3054 trace_sched_switch(prev, next);
2739} 3055}
2740 3056
2741/** 3057/**
@@ -2879,7 +3195,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2879 struct mm_struct *mm, *oldmm; 3195 struct mm_struct *mm, *oldmm;
2880 3196
2881 prepare_task_switch(rq, prev, next); 3197 prepare_task_switch(rq, prev, next);
2882 trace_sched_switch(prev, next); 3198
2883 mm = next->mm; 3199 mm = next->mm;
2884 oldmm = prev->active_mm; 3200 oldmm = prev->active_mm;
2885 /* 3201 /*
@@ -2889,14 +3205,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
2889 */ 3205 */
2890 arch_start_context_switch(prev); 3206 arch_start_context_switch(prev);
2891 3207
2892 if (likely(!mm)) { 3208 if (!mm) {
2893 next->active_mm = oldmm; 3209 next->active_mm = oldmm;
2894 atomic_inc(&oldmm->mm_count); 3210 atomic_inc(&oldmm->mm_count);
2895 enter_lazy_tlb(oldmm, next); 3211 enter_lazy_tlb(oldmm, next);
2896 } else 3212 } else
2897 switch_mm(oldmm, mm, next); 3213 switch_mm(oldmm, mm, next);
2898 3214
2899 if (likely(!prev->mm)) { 3215 if (!prev->mm) {
2900 prev->active_mm = NULL; 3216 prev->active_mm = NULL;
2901 rq->prev_mm = oldmm; 3217 rq->prev_mm = oldmm;
2902 } 3218 }
@@ -3011,6 +3327,15 @@ static long calc_load_fold_active(struct rq *this_rq)
3011 return delta; 3327 return delta;
3012} 3328}
3013 3329
3330static unsigned long
3331calc_load(unsigned long load, unsigned long exp, unsigned long active)
3332{
3333 load *= exp;
3334 load += active * (FIXED_1 - exp);
3335 load += 1UL << (FSHIFT - 1);
3336 return load >> FSHIFT;
3337}
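calc_load() is a fixed-point exponential moving average, new = old * e + active * (1 - e), with e scaled by FIXED_1 and a half-ULP rounding term. A stand-alone sketch of one minute of updates, assuming the usual loadavg constants (FSHIFT = 11, so FIXED_1 = 2048, and EXP_1 = 1884 for the 1-minute average); illustrative only:

#include <stdio.h>

#define FSHIFT	11
#define FIXED_1	(1UL << FSHIFT)
#define EXP_1	1884UL		/* ~exp(-5s/1min) in FSHIFT fixed point */

static unsigned long ema_step(unsigned long load, unsigned long exp,
			      unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);	/* round to nearest */
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long load = 0, active = 3 * FIXED_1;	/* 3 runnable tasks */
	int i;

	for (i = 0; i < 12; i++)	/* one minute of 5 s updates */
		load = ema_step(load, EXP_1, active);

	printf("loadavg ~ %lu.%02lu\n", load >> FSHIFT,
	       (load & (FIXED_1 - 1)) * 100 / FIXED_1);
	return 0;
}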
3338
3014#ifdef CONFIG_NO_HZ 3339#ifdef CONFIG_NO_HZ
3015/* 3340/*
3016 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 3341 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
@@ -3040,6 +3365,128 @@ static long calc_load_fold_idle(void)
3040 3365
3041 return delta; 3366 return delta;
3042} 3367}
3368
3369/**
3370 * fixed_power_int - compute: x^n, in O(log n) time
3371 *
3372 * @x: base of the power
3373 * @frac_bits: fractional bits of @x
3374 * @n: power to raise @x to.
3375 *
3376 * By exploiting the relation between the definition of the natural power
3377 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
3378 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
3379 * (where: n_i \elem {0, 1}, the binary vector representing n),
3380 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
3381 * of course trivially computable in O(log_2 n), the length of our binary
3382 * vector.
3383 */
3384static unsigned long
3385fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
3386{
3387 unsigned long result = 1UL << frac_bits;
3388
3389 if (n) for (;;) {
3390 if (n & 1) {
3391 result *= x;
3392 result += 1UL << (frac_bits - 1);
3393 result >>= frac_bits;
3394 }
3395 n >>= 1;
3396 if (!n)
3397 break;
3398 x *= x;
3399 x += 1UL << (frac_bits - 1);
3400 x >>= frac_bits;
3401 }
3402
3403 return result;
3404}
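Stripped of the fixed-point scaling and rounding, fixed_power_int() is plain exponentiation by squaring: each set bit of n contributes one multiplication by the running square of x. A bare-integer version of the same O(log n) skeleton (overflow ignored, illustrative only):

static unsigned long ipow(unsigned long x, unsigned int n)
{
	unsigned long result = 1;

	while (n) {
		if (n & 1)		/* this bit of n contributes x^(2^i) */
			result *= x;
		n >>= 1;
		x *= x;			/* square for the next bit */
	}
	return result;
}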
3405
3406/*
3407 * a1 = a0 * e + a * (1 - e)
3408 *
3409 * a2 = a1 * e + a * (1 - e)
3410 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
3411 * = a0 * e^2 + a * (1 - e) * (1 + e)
3412 *
3413 * a3 = a2 * e + a * (1 - e)
3414 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
3415 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
3416 *
3417 * ...
3418 *
3419 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
3420 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
3421 * = a0 * e^n + a * (1 - e^n)
3422 *
3423 * [1] application of the geometric series:
3424 *
3425 * n 1 - x^(n+1)
3426 * S_n := \Sum x^i = -------------
3427 * i=0 1 - x
3428 */
3429static unsigned long
3430calc_load_n(unsigned long load, unsigned long exp,
3431 unsigned long active, unsigned int n)
3432{
3433
3434 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
3435}
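calc_load_n() relies on the closed form derived above, a_n = a0 * e^n + a * (1 - e^n), so that n missed load-average updates collapse into a single call. A quick numeric check of that identity against n explicit single steps, in double precision for clarity (the kernel computes e^n in fixed point via fixed_power_int(); link with -lm); illustrative only:

#include <math.h>
#include <stdio.h>

int main(void)
{
	double e = 1884.0 / 2048.0;	/* EXP_1 / FIXED_1 */
	double a0 = 0.5, a = 3.0, iter = a0;
	int k, n = 7;

	for (k = 0; k < n; k++)
		iter = iter * e + a * (1.0 - e);	/* n single steps */

	/* closed form derived above: a_n = a0 * e^n + a * (1 - e^n) */
	printf("iterated = %f, closed form = %f\n",
	       iter, a0 * pow(e, n) + a * (1.0 - pow(e, n)));
	return 0;
}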
3436
3437/*
3438 * NO_HZ can leave us missing all per-cpu ticks calling
3439 * calc_load_account_active(), but since an idle CPU folds its delta into
3440 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
3441 * in the pending idle delta if our idle period crossed a load cycle boundary.
3442 *
3443 * Once we've updated the global active value, we need to apply the exponential
3444 * weights adjusted to the number of cycles missed.
3445 */
3446static void calc_global_nohz(unsigned long ticks)
3447{
3448 long delta, active, n;
3449
3450 if (time_before(jiffies, calc_load_update))
3451 return;
3452
3453 /*
3454 * If we crossed a calc_load_update boundary, make sure to fold
3455 * any pending idle changes; the respective CPUs might have
3456 * missed the tick-driven calc_load_account_active() update
3457 * due to NO_HZ.
3458 */
3459 delta = calc_load_fold_idle();
3460 if (delta)
3461 atomic_long_add(delta, &calc_load_tasks);
3462
3463 /*
3464 * If we were idle for multiple load cycles, apply them.
3465 */
3466 if (ticks >= LOAD_FREQ) {
3467 n = ticks / LOAD_FREQ;
3468
3469 active = atomic_long_read(&calc_load_tasks);
3470 active = active > 0 ? active * FIXED_1 : 0;
3471
3472 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
3473 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
3474 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
3475
3476 calc_load_update += n * LOAD_FREQ;
3477 }
3478
3479 /*
3480 * It's possible the remainder of the above division also crosses
3481 * a LOAD_FREQ period; the regular check in calc_global_load(),
3482 * which comes after this, will take care of that.
3483 *
3484 * Consider us being 11 ticks before a cycle completion, and us
3485 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
3486 * age us 4 cycles, and the test in calc_global_load() will
3487 * pick up the final one.
3488 */
3489}
3043#else 3490#else
3044static void calc_load_account_idle(struct rq *this_rq) 3491static void calc_load_account_idle(struct rq *this_rq)
3045{ 3492{
@@ -3049,6 +3496,10 @@ static inline long calc_load_fold_idle(void)
3049{ 3496{
3050 return 0; 3497 return 0;
3051} 3498}
3499
3500static void calc_global_nohz(unsigned long ticks)
3501{
3502}
3052#endif 3503#endif
3053 3504
3054/** 3505/**
@@ -3066,24 +3517,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3066 loads[2] = (avenrun[2] + offset) << shift; 3517 loads[2] = (avenrun[2] + offset) << shift;
3067} 3518}
3068 3519
3069static unsigned long
3070calc_load(unsigned long load, unsigned long exp, unsigned long active)
3071{
3072 load *= exp;
3073 load += active * (FIXED_1 - exp);
3074 return load >> FSHIFT;
3075}
3076
3077/* 3520/*
3078 * calc_load - update the avenrun load estimates 10 ticks after the 3521 * calc_load - update the avenrun load estimates 10 ticks after the
3079 * CPUs have updated calc_load_tasks. 3522 * CPUs have updated calc_load_tasks.
3080 */ 3523 */
3081void calc_global_load(void) 3524void calc_global_load(unsigned long ticks)
3082{ 3525{
3083 unsigned long upd = calc_load_update + 10;
3084 long active; 3526 long active;
3085 3527
3086 if (time_before(jiffies, upd)) 3528 calc_global_nohz(ticks);
3529
3530 if (time_before(jiffies, calc_load_update + 10))
3087 return; 3531 return;
3088 3532
3089 active = atomic_long_read(&calc_load_tasks); 3533 active = atomic_long_read(&calc_load_tasks);
@@ -3244,27 +3688,22 @@ void sched_exec(void)
3244{ 3688{
3245 struct task_struct *p = current; 3689 struct task_struct *p = current;
3246 unsigned long flags; 3690 unsigned long flags;
3247 struct rq *rq;
3248 int dest_cpu; 3691 int dest_cpu;
3249 3692
3250 rq = task_rq_lock(p, &flags); 3693 raw_spin_lock_irqsave(&p->pi_lock, flags);
3251 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); 3694 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
3252 if (dest_cpu == smp_processor_id()) 3695 if (dest_cpu == smp_processor_id())
3253 goto unlock; 3696 goto unlock;
3254 3697
3255 /* 3698 if (likely(cpu_active(dest_cpu))) {
3256 * select_task_rq() can race against ->cpus_allowed
3257 */
3258 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3259 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
3260 struct migration_arg arg = { p, dest_cpu }; 3699 struct migration_arg arg = { p, dest_cpu };
3261 3700
3262 task_rq_unlock(rq, &flags); 3701 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3263 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 3702 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3264 return; 3703 return;
3265 } 3704 }
3266unlock: 3705unlock:
3267 task_rq_unlock(rq, &flags); 3706 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3268} 3707}
3269 3708
3270#endif 3709#endif
@@ -3285,7 +3724,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3285 3724
3286 if (task_current(rq, p)) { 3725 if (task_current(rq, p)) {
3287 update_rq_clock(rq); 3726 update_rq_clock(rq);
3288 ns = rq->clock - p->se.exec_start; 3727 ns = rq->clock_task - p->se.exec_start;
3289 if ((s64)ns < 0) 3728 if ((s64)ns < 0)
3290 ns = 0; 3729 ns = 0;
3291 } 3730 }
@@ -3301,7 +3740,7 @@ unsigned long long task_delta_exec(struct task_struct *p)
3301 3740
3302 rq = task_rq_lock(p, &flags); 3741 rq = task_rq_lock(p, &flags);
3303 ns = do_task_delta_exec(p, rq); 3742 ns = do_task_delta_exec(p, rq);
3304 task_rq_unlock(rq, &flags); 3743 task_rq_unlock(rq, p, &flags);
3305 3744
3306 return ns; 3745 return ns;
3307} 3746}
@@ -3319,7 +3758,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3319 3758
3320 rq = task_rq_lock(p, &flags); 3759 rq = task_rq_lock(p, &flags);
3321 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 3760 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3322 task_rq_unlock(rq, &flags); 3761 task_rq_unlock(rq, p, &flags);
3323 3762
3324 return ns; 3763 return ns;
3325} 3764}
@@ -3343,7 +3782,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
3343 rq = task_rq_lock(p, &flags); 3782 rq = task_rq_lock(p, &flags);
3344 thread_group_cputime(p, &totals); 3783 thread_group_cputime(p, &totals);
3345 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 3784 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3346 task_rq_unlock(rq, &flags); 3785 task_rq_unlock(rq, p, &flags);
3347 3786
3348 return ns; 3787 return ns;
3349} 3788}
@@ -3408,6 +3847,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
3408} 3847}
3409 3848
3410/* 3849/*
3850 * Account system cpu time to a process and desired cpustat field
3851 * @p: the process that the cpu time gets accounted to
3852 * @cputime: the cpu time spent in kernel space since the last update
3853 * @cputime_scaled: cputime scaled by cpu frequency
3854 * @target_cputime64: pointer to cpustat field that has to be updated
3855 */
3856static inline
3857void __account_system_time(struct task_struct *p, cputime_t cputime,
3858 cputime_t cputime_scaled, cputime64_t *target_cputime64)
3859{
3860 cputime64_t tmp = cputime_to_cputime64(cputime);
3861
3862 /* Add system time to process. */
3863 p->stime = cputime_add(p->stime, cputime);
3864 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3865 account_group_system_time(p, cputime);
3866
3867 /* Add system time to cpustat. */
3868 *target_cputime64 = cputime64_add(*target_cputime64, tmp);
3869 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3870
3871 /* Account for system time used */
3872 acct_update_integrals(p);
3873}
3874
3875/*
3411 * Account system cpu time to a process. 3876 * Account system cpu time to a process.
3412 * @p: the process that the cpu time gets accounted to 3877 * @p: the process that the cpu time gets accounted to
3413 * @hardirq_offset: the offset to subtract from hardirq_count() 3878 * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3418,36 +3883,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3418 cputime_t cputime, cputime_t cputime_scaled) 3883 cputime_t cputime, cputime_t cputime_scaled)
3419{ 3884{
3420 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3885 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3421 cputime64_t tmp; 3886 cputime64_t *target_cputime64;
3422 3887
3423 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 3888 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3424 account_guest_time(p, cputime, cputime_scaled); 3889 account_guest_time(p, cputime, cputime_scaled);
3425 return; 3890 return;
3426 } 3891 }
3427 3892
3428 /* Add system time to process. */
3429 p->stime = cputime_add(p->stime, cputime);
3430 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3431 account_group_system_time(p, cputime);
3432
3433 /* Add system time to cpustat. */
3434 tmp = cputime_to_cputime64(cputime);
3435 if (hardirq_count() - hardirq_offset) 3893 if (hardirq_count() - hardirq_offset)
3436 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3894 target_cputime64 = &cpustat->irq;
3437 else if (softirq_count()) 3895 else if (in_serving_softirq())
3438 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3896 target_cputime64 = &cpustat->softirq;
3439 else 3897 else
3440 cpustat->system = cputime64_add(cpustat->system, tmp); 3898 target_cputime64 = &cpustat->system;
3441
3442 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3443 3899
3444 /* Account for system time used */ 3900 __account_system_time(p, cputime, cputime_scaled, target_cputime64);
3445 acct_update_integrals(p);
3446} 3901}
3447 3902
3448/* 3903/*
3449 * Account for involuntary wait time. 3904 * Account for involuntary wait time.
3450 * @steal: the cpu time spent in involuntary wait 3905 * @cputime: the cpu time spent in involuntary wait
3451 */ 3906 */
3452void account_steal_time(cputime_t cputime) 3907void account_steal_time(cputime_t cputime)
3453{ 3908{
@@ -3475,6 +3930,73 @@ void account_idle_time(cputime_t cputime)
3475 3930
3476#ifndef CONFIG_VIRT_CPU_ACCOUNTING 3931#ifndef CONFIG_VIRT_CPU_ACCOUNTING
3477 3932
3933#ifdef CONFIG_IRQ_TIME_ACCOUNTING
3934/*
3935 * Account a tick to a process and cpustat
3936 * @p: the process that the cpu time gets accounted to
3937 * @user_tick: is the tick from userspace
3938 * @rq: the pointer to rq
3939 *
3940 * Tick demultiplexing follows the order
3941 * - pending hardirq update
3942 * - pending softirq update
3943 * - user_time
3944 * - idle_time
3945 * - system time
3946 * - check for guest_time
3947 * - else account as system_time
3948 *
3949 * Check for hardirq is done both for system and user time as there is
3950 * no timer going off while we are on hardirq and hence we may never get an
3951 * opportunity to update it solely in system time.
3952 * p->stime and friends are only updated on system time and not on irq
3953 * softirq as those do not count in task exec_runtime any more.
3954 */
3955static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3956 struct rq *rq)
3957{
3958 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3959 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
3960 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3961
3962 if (irqtime_account_hi_update()) {
3963 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3964 } else if (irqtime_account_si_update()) {
3965 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3966 } else if (this_cpu_ksoftirqd() == p) {
3967 /*
3968 * ksoftirqd time does not get accounted in cpu_softirq_time.
3969 * So, we have to handle it separately here.
3970 * Also, p->stime needs to be updated for ksoftirqd.
3971 */
3972 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3973 &cpustat->softirq);
3974 } else if (user_tick) {
3975 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3976 } else if (p == rq->idle) {
3977 account_idle_time(cputime_one_jiffy);
3978 } else if (p->flags & PF_VCPU) { /* System time or guest time */
3979 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
3980 } else {
3981 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3982 &cpustat->system);
3983 }
3984}
3985
3986static void irqtime_account_idle_ticks(int ticks)
3987{
3988 int i;
3989 struct rq *rq = this_rq();
3990
3991 for (i = 0; i < ticks; i++)
3992 irqtime_account_process_tick(current, 0, rq);
3993}
3994#else /* CONFIG_IRQ_TIME_ACCOUNTING */
3995static void irqtime_account_idle_ticks(int ticks) {}
3996static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3997 struct rq *rq) {}
3998#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
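The demultiplexing order documented above amounts to a priority-ordered decision table. A stand-alone sketch of that table; the predicate arguments are stand-ins for irqtime_account_hi_update(), irqtime_account_si_update(), the this_cpu_ksoftirqd() comparison and so on (illustrative only, not part of the patch):

enum tick_bucket { B_HARDIRQ, B_SOFTIRQ, B_USER, B_IDLE, B_GUEST, B_SYSTEM };

static enum tick_bucket classify_tick(int pending_hardirq, int pending_softirq,
				      int is_ksoftirqd, int user_tick,
				      int is_idle, int is_vcpu)
{
	if (pending_hardirq)
		return B_HARDIRQ;	/* pending hardirq update first */
	if (pending_softirq || is_ksoftirqd)
		return B_SOFTIRQ;	/* then pending softirq / ksoftirqd */
	if (user_tick)
		return B_USER;
	if (is_idle)
		return B_IDLE;
	if (is_vcpu)
		return B_GUEST;		/* guest time */
	return B_SYSTEM;		/* everything else is system time */
}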
3999
3478/* 4000/*
3479 * Account a single tick of cpu time. 4001 * Account a single tick of cpu time.
3480 * @p: the process that the cpu time gets accounted to 4002 * @p: the process that the cpu time gets accounted to
@@ -3485,6 +4007,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
3485 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 4007 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3486 struct rq *rq = this_rq(); 4008 struct rq *rq = this_rq();
3487 4009
4010 if (sched_clock_irqtime) {
4011 irqtime_account_process_tick(p, user_tick, rq);
4012 return;
4013 }
4014
3488 if (user_tick) 4015 if (user_tick)
3489 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 4016 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3490 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 4017 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3510,6 +4037,12 @@ void account_steal_ticks(unsigned long ticks)
3510 */ 4037 */
3511void account_idle_ticks(unsigned long ticks) 4038void account_idle_ticks(unsigned long ticks)
3512{ 4039{
4040
4041 if (sched_clock_irqtime) {
4042 irqtime_account_idle_ticks(ticks);
4043 return;
4044 }
4045
3513 account_idle_time(jiffies_to_cputime(ticks)); 4046 account_idle_time(jiffies_to_cputime(ticks));
3514} 4047}
3515 4048
@@ -3603,9 +4136,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3603/* 4136/*
3604 * This function gets called by the timer code, with HZ frequency. 4137 * This function gets called by the timer code, with HZ frequency.
3605 * We call it with interrupts disabled. 4138 * We call it with interrupts disabled.
3606 *
3607 * It also gets called by the fork code, when changing the parent's
3608 * timeslices.
3609 */ 4139 */
3610void scheduler_tick(void) 4140void scheduler_tick(void)
3611{ 4141{
@@ -3627,7 +4157,7 @@ void scheduler_tick(void)
3627 4157
3628 raw_spin_unlock(&rq->lock); 4158 raw_spin_unlock(&rq->lock);
3629 4159
3630 perf_event_task_tick(curr); 4160 perf_event_task_tick();
3631 4161
3632#ifdef CONFIG_SMP 4162#ifdef CONFIG_SMP
3633 rq->idle_at_tick = idle_cpu(cpu); 4163 rq->idle_at_tick = idle_cpu(cpu);
@@ -3733,19 +4263,12 @@ static inline void schedule_debug(struct task_struct *prev)
3733 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4263 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3734 4264
3735 schedstat_inc(this_rq(), sched_count); 4265 schedstat_inc(this_rq(), sched_count);
3736#ifdef CONFIG_SCHEDSTATS
3737 if (unlikely(prev->lock_depth >= 0)) {
3738 schedstat_inc(this_rq(), bkl_count);
3739 schedstat_inc(prev, sched_info.bkl_count);
3740 }
3741#endif
3742} 4266}
3743 4267
3744static void put_prev_task(struct rq *rq, struct task_struct *prev) 4268static void put_prev_task(struct rq *rq, struct task_struct *prev)
3745{ 4269{
3746 if (prev->se.on_rq) 4270 if (prev->on_rq || rq->skip_clock_update < 0)
3747 update_rq_clock(rq); 4271 update_rq_clock(rq);
3748 rq->skip_clock_update = 0;
3749 prev->sched_class->put_prev_task(rq, prev); 4272 prev->sched_class->put_prev_task(rq, prev);
3750} 4273}
3751 4274
@@ -3776,17 +4299,13 @@ pick_next_task(struct rq *rq)
3776 } 4299 }
3777 */ 4300 */
3778 4301
3779 class = sched_class_highest; 4302 for_each_class(class) {
3780 for ( ; ; ) {
3781 p = class->pick_next_task(rq); 4303 p = class->pick_next_task(rq);
3782 if (p) 4304 if (p)
3783 return p; 4305 return p;
3784 /*
3785 * Will never be NULL as the idle class always
3786 * returns a non-NULL p:
3787 */
3788 class = class->next;
3789 } 4306 }
4307
4308 BUG(); /* the idle class will always have a runnable task */
3790} 4309}
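The for_each_class() loop that replaces the hand-rolled class->next walk visits the scheduling classes from highest to lowest priority and returns the first class that has a runnable task; the BUG() is unreachable because the idle class always produces one. A minimal sketch of that dispatch pattern over a priority-ordered ops table (hypothetical types, illustrative only):

#include <stddef.h>

struct rq;				/* opaque for this sketch */
struct task;

struct class_ops {
	struct task *(*pick_next)(struct rq *rq);
};

/* Walk the classes from highest to lowest priority; take the first pick. */
static struct task *pick_first(struct rq *rq, const struct class_ops *classes,
			       unsigned int nr_classes)
{
	unsigned int i;
	struct task *t;

	for (i = 0; i < nr_classes; i++) {
		t = classes[i].pick_next(rq);
		if (t)
			return t;
	}
	return NULL;	/* unreachable if the last entry is an "idle" class */
}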
3791 4310
3792/* 4311/*
@@ -3807,8 +4326,10 @@ need_resched:
3807 rcu_note_context_switch(cpu); 4326 rcu_note_context_switch(cpu);
3808 prev = rq->curr; 4327 prev = rq->curr;
3809 4328
3810 release_kernel_lock(prev); 4329 /* LITMUS^RT: quickly re-evaluate the scheduling decision
3811need_resched_nonpreemptible: 4330 * if the previous one is no longer valid after the context switch.
4331 */
4332litmus_need_resched_nonpreemptible:
3812 TS_SCHED_START; 4333 TS_SCHED_START;
3813 TS_LVLA_SCHED_START; 4334 TS_LVLA_SCHED_START;
3814 TS_LVLB_SCHED_START; 4335 TS_LVLB_SCHED_START;
@@ -3821,18 +4342,19 @@ need_resched_nonpreemptible:
3821 hrtick_clear(rq); 4342 hrtick_clear(rq);
3822 4343
3823 raw_spin_lock_irq(&rq->lock); 4344 raw_spin_lock_irq(&rq->lock);
3824 clear_tsk_need_resched(prev);
3825 4345
3826 switch_count = &prev->nivcsw; 4346 switch_count = &prev->nivcsw;
3827 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 4347 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3828 if (unlikely(signal_pending_state(prev->state, prev))) { 4348 if (unlikely(signal_pending_state(prev->state, prev))) {
3829 prev->state = TASK_RUNNING; 4349 prev->state = TASK_RUNNING;
3830 } else { 4350 } else {
4351 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4352 prev->on_rq = 0;
4353
3831 /* 4354 /*
3832 * If a worker is going to sleep, notify and 4355 * If a worker went to sleep, notify and ask workqueue
3833 * ask workqueue whether it wants to wake up a 4356 * whether it wants to wake up a task to maintain
3834 * task to maintain concurrency. If so, wake 4357 * concurrency.
3835 * up the task.
3836 */ 4358 */
3837 if (prev->flags & PF_WQ_WORKER) { 4359 if (prev->flags & PF_WQ_WORKER) {
3838 struct task_struct *to_wakeup; 4360 struct task_struct *to_wakeup;
@@ -3841,7 +4363,16 @@ need_resched_nonpreemptible:
3841 if (to_wakeup) 4363 if (to_wakeup)
3842 try_to_wake_up_local(to_wakeup); 4364 try_to_wake_up_local(to_wakeup);
3843 } 4365 }
3844 deactivate_task(rq, prev, DEQUEUE_SLEEP); 4366
4367 /*
4368 * If we are going to sleep and we have plugged IO
4369 * queued, make sure to submit it to avoid deadlocks.
4370 */
4371 if (blk_needs_flush_plug(prev)) {
4372 raw_spin_unlock(&rq->lock);
4373 blk_schedule_flush_plug(prev);
4374 raw_spin_lock(&rq->lock);
4375 }
3845 } 4376 }
3846 switch_count = &prev->nvcsw; 4377 switch_count = &prev->nvcsw;
3847 } 4378 }
@@ -3853,11 +4384,10 @@ need_resched_nonpreemptible:
3853 4384
3854 put_prev_task(rq, prev); 4385 put_prev_task(rq, prev);
3855 next = pick_next_task(rq); 4386 next = pick_next_task(rq);
4387 clear_tsk_need_resched(prev);
4388 rq->skip_clock_update = 0;
3856 4389
3857 if (likely(prev != next)) { 4390 if (likely(prev != next)) {
3858 sched_info_switch(prev, next);
3859 perf_event_task_sched_out(prev, next);
3860
3861 rq->nr_switches++; 4391 rq->nr_switches++;
3862 rq->curr = next; 4392 rq->curr = next;
3863 ++*switch_count; 4393 ++*switch_count;
@@ -3886,8 +4416,8 @@ need_resched_nonpreemptible:
3886 4416
3887 post_schedule(rq); 4417 post_schedule(rq);
3888 4418
3889 if (sched_state_validate_switch() || unlikely(reacquire_kernel_lock(prev))) 4419 if (sched_state_validate_switch())
3890 goto need_resched_nonpreemptible; 4420 goto litmus_need_resched_nonpreemptible;
3891 4421
3892 preempt_enable_no_resched(); 4422 preempt_enable_no_resched();
3893 if (need_resched()) 4423 if (need_resched())
@@ -3898,70 +4428,53 @@ need_resched_nonpreemptible:
3898EXPORT_SYMBOL(schedule); 4428EXPORT_SYMBOL(schedule);
3899 4429
3900#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 4430#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4431
4432static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4433{
4434 bool ret = false;
4435
4436 rcu_read_lock();
4437 if (lock->owner != owner)
4438 goto fail;
4439
4440 /*
4441 * Ensure we emit the owner->on_cpu, dereference _after_ checking
4442 * lock->owner still matches owner, if that fails, owner might
4443 * point to free()d memory, if it still matches, the rcu_read_lock()
4444 * ensures the memory stays valid.
4445 */
4446 barrier();
4447
4448 ret = owner->on_cpu;
4449fail:
4450 rcu_read_unlock();
4451
4452 return ret;
4453}
4454
3901/* 4455/*
3902 * Look out! "owner" is an entirely speculative pointer 4456 * Look out! "owner" is an entirely speculative pointer
3903 * access and not reliable. 4457 * access and not reliable.
3904 */ 4458 */
3905int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) 4459int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
3906{ 4460{
3907 unsigned int cpu;
3908 struct rq *rq;
3909
3910 if (!sched_feat(OWNER_SPIN)) 4461 if (!sched_feat(OWNER_SPIN))
3911 return 0; 4462 return 0;
3912 4463
3913#ifdef CONFIG_DEBUG_PAGEALLOC 4464 while (owner_running(lock, owner)) {
3914 /* 4465 if (need_resched())
3915 * Need to access the cpu field knowing that 4466 return 0;
3916 * DEBUG_PAGEALLOC could have unmapped it if
3917 * the mutex owner just released it and exited.
3918 */
3919 if (probe_kernel_address(&owner->cpu, cpu))
3920 return 0;
3921#else
3922 cpu = owner->cpu;
3923#endif
3924 4467
3925 /* 4468 arch_mutex_cpu_relax();
3926 * Even if the access succeeded (likely case), 4469 }
3927 * the cpu field may no longer be valid.
3928 */
3929 if (cpu >= nr_cpumask_bits)
3930 return 0;
3931 4470
3932 /* 4471 /*
3933 * We need to validate that we can do a 4472 * If the owner changed to another task there is likely
3934 * get_cpu() and that we have the percpu area. 4473 * heavy contention, stop spinning.
3935 */ 4474 */
3936 if (!cpu_online(cpu)) 4475 if (lock->owner)
3937 return 0; 4476 return 0;
3938 4477
3939 rq = cpu_rq(cpu);
3940
3941 for (;;) {
3942 /*
3943 * Owner changed, break to re-assess state.
3944 */
3945 if (lock->owner != owner) {
3946 /*
3947 * If the lock has switched to a different owner,
3948 * we likely have heavy contention. Return 0 to quit
3949 * optimistic spinning and not contend further:
3950 */
3951 if (lock->owner)
3952 return 0;
3953 break;
3954 }
3955
3956 /*
3957 * Is that owner really running on that cpu?
3958 */
3959 if (task_thread_info(rq->curr) != owner || need_resched())
3960 return 0;
3961
3962 cpu_relax();
3963 }
3964
3965 return 1; 4478 return 1;
3966} 4479}
3967#endif 4480#endif
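The rewritten spin loop polls only while two cheap conditions hold: lock->owner is still the task we started with, and that task is on a CPU. It bails out on need_resched() and, once the owner changes, keeps spinning only if the lock has actually been released. A user-space C11 sketch of that structure (illustrative only, not the kernel implementation):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct spec_lock {
	_Atomic(void *) owner;		/* NULL when the lock is free */
};

/*
 * Spin only while @owner still holds the lock and we have not been asked
 * to reschedule; once the owner changes, report whether further spinning
 * is worthwhile (a new owner means heavy contention, so it is not).
 */
static bool spin_on_owner(struct spec_lock *l, void *owner,
			  bool (*need_resched)(void))
{
	while (atomic_load_explicit(&l->owner, memory_order_acquire) == owner) {
		if (need_resched())
			return false;
		/* busy-wait; the kernel inserts arch_mutex_cpu_relax() here */
	}
	return atomic_load_explicit(&l->owner, memory_order_acquire) == NULL;
}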
@@ -4091,6 +4604,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
4091{ 4604{
4092 __wake_up_common(q, mode, 1, 0, key); 4605 __wake_up_common(q, mode, 1, 0, key);
4093} 4606}
4607EXPORT_SYMBOL_GPL(__wake_up_locked_key);
4094 4608
4095/** 4609/**
4096 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 4610 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
@@ -4282,7 +4796,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
4282 * This waits for either a completion of a specific task to be signaled or for a 4796 * This waits for either a completion of a specific task to be signaled or for a
4283 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4797 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4284 */ 4798 */
4285unsigned long __sched 4799long __sched
4286wait_for_completion_interruptible_timeout(struct completion *x, 4800wait_for_completion_interruptible_timeout(struct completion *x,
4287 unsigned long timeout) 4801 unsigned long timeout)
4288{ 4802{
@@ -4315,7 +4829,7 @@ EXPORT_SYMBOL(wait_for_completion_killable);
4315 * signaled or for a specified timeout to expire. It can be 4829 * signaled or for a specified timeout to expire. It can be
4316 * interrupted by a kill signal. The timeout is in jiffies. 4830 * interrupted by a kill signal. The timeout is in jiffies.
4317 */ 4831 */
4318unsigned long __sched 4832long __sched
4319wait_for_completion_killable_timeout(struct completion *x, 4833wait_for_completion_killable_timeout(struct completion *x,
4320 unsigned long timeout) 4834 unsigned long timeout)
4321{ 4835{
@@ -4431,18 +4945,18 @@ EXPORT_SYMBOL(sleep_on_timeout);
4431 */ 4945 */
4432void rt_mutex_setprio(struct task_struct *p, int prio) 4946void rt_mutex_setprio(struct task_struct *p, int prio)
4433{ 4947{
4434 unsigned long flags;
4435 int oldprio, on_rq, running; 4948 int oldprio, on_rq, running;
4436 struct rq *rq; 4949 struct rq *rq;
4437 const struct sched_class *prev_class; 4950 const struct sched_class *prev_class;
4438 4951
4439 BUG_ON(prio < 0 || prio > MAX_PRIO); 4952 BUG_ON(prio < 0 || prio > MAX_PRIO);
4440 4953
4441 rq = task_rq_lock(p, &flags); 4954 rq = __task_rq_lock(p);
4442 4955
4956 trace_sched_pi_setprio(p, prio);
4443 oldprio = p->prio; 4957 oldprio = p->prio;
4444 prev_class = p->sched_class; 4958 prev_class = p->sched_class;
4445 on_rq = p->se.on_rq; 4959 on_rq = p->on_rq;
4446 running = task_current(rq, p); 4960 running = task_current(rq, p);
4447 if (on_rq) 4961 if (on_rq)
4448 dequeue_task(rq, p, 0); 4962 dequeue_task(rq, p, 0);
@@ -4458,12 +4972,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4458 4972
4459 if (running) 4973 if (running)
4460 p->sched_class->set_curr_task(rq); 4974 p->sched_class->set_curr_task(rq);
4461 if (on_rq) { 4975 if (on_rq)
4462 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4976 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4463 4977
4464 check_class_changed(rq, p, prev_class, oldprio, running); 4978 check_class_changed(rq, p, prev_class, oldprio);
4465 } 4979 __task_rq_unlock(rq);
4466 task_rq_unlock(rq, &flags);
4467} 4980}
4468 4981
4469#endif 4982#endif
@@ -4491,7 +5004,7 @@ void set_user_nice(struct task_struct *p, long nice)
4491 p->static_prio = NICE_TO_PRIO(nice); 5004 p->static_prio = NICE_TO_PRIO(nice);
4492 goto out_unlock; 5005 goto out_unlock;
4493 } 5006 }
4494 on_rq = p->se.on_rq; 5007 on_rq = p->on_rq;
4495 if (on_rq) 5008 if (on_rq)
4496 dequeue_task(rq, p, 0); 5009 dequeue_task(rq, p, 0);
4497 5010
@@ -4511,7 +5024,7 @@ void set_user_nice(struct task_struct *p, long nice)
4511 resched_task(rq->curr); 5024 resched_task(rq->curr);
4512 } 5025 }
4513out_unlock: 5026out_unlock:
4514 task_rq_unlock(rq, &flags); 5027 task_rq_unlock(rq, p, &flags);
4515} 5028}
4516EXPORT_SYMBOL(set_user_nice); 5029EXPORT_SYMBOL(set_user_nice);
4517 5030
@@ -4625,8 +5138,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
4625static void 5138static void
4626__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 5139__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4627{ 5140{
4628 BUG_ON(p->se.on_rq);
4629
4630 p->policy = policy; 5141 p->policy = policy;
4631 p->rt_priority = prio; 5142 p->rt_priority = prio;
4632 p->normal_prio = normal_prio(p); 5143 p->normal_prio = normal_prio(p);
@@ -4651,14 +5162,17 @@ static bool check_same_owner(struct task_struct *p)
4651 5162
4652 rcu_read_lock(); 5163 rcu_read_lock();
4653 pcred = __task_cred(p); 5164 pcred = __task_cred(p);
4654 match = (cred->euid == pcred->euid || 5165 if (cred->user->user_ns == pcred->user->user_ns)
4655 cred->euid == pcred->uid); 5166 match = (cred->euid == pcred->euid ||
5167 cred->euid == pcred->uid);
5168 else
5169 match = false;
4656 rcu_read_unlock(); 5170 rcu_read_unlock();
4657 return match; 5171 return match;
4658} 5172}
4659 5173
4660static int __sched_setscheduler(struct task_struct *p, int policy, 5174static int __sched_setscheduler(struct task_struct *p, int policy,
4661 struct sched_param *param, bool user) 5175 const struct sched_param *param, bool user)
4662{ 5176{
4663 int retval, oldprio, oldpolicy = -1, on_rq, running; 5177 int retval, oldprio, oldpolicy = -1, on_rq, running;
4664 unsigned long flags; 5178 unsigned long flags;
@@ -4714,12 +5228,15 @@ recheck:
4714 param->sched_priority > rlim_rtprio) 5228 param->sched_priority > rlim_rtprio)
4715 return -EPERM; 5229 return -EPERM;
4716 } 5230 }
5231
4717 /* 5232 /*
4718 * Like positive nice levels, dont allow tasks to 5233 * Treat SCHED_IDLE as nice 20. Only allow a switch to
4719 * move out of SCHED_IDLE either: 5234 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
4720 */ 5235 */
4721 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) 5236 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
4722 return -EPERM; 5237 if (!can_nice(p, TASK_NICE(p)))
5238 return -EPERM;
5239 }
4723 5240
4724 /* can't change other user's priorities */ 5241 /* can't change other user's priorities */
4725 if (!check_same_owner(p)) 5242 if (!check_same_owner(p))
@@ -4731,7 +5248,7 @@ recheck:
4731 } 5248 }
4732 5249
4733 if (user) { 5250 if (user) {
4734 retval = security_task_setscheduler(p, policy, param); 5251 retval = security_task_setscheduler(p);
4735 if (retval) 5252 if (retval)
4736 return retval; 5253 return retval;
4737 } 5254 }
@@ -4745,13 +5262,30 @@ recheck:
4745 /* 5262 /*
4746 * make sure no PI-waiters arrive (or leave) while we are 5263 * make sure no PI-waiters arrive (or leave) while we are
4747 * changing the priority of the task: 5264 * changing the priority of the task:
5265 *
5266 * To be able to change p->policy safely, the appropriate
5267 * runqueue lock must be held.
4748 */ 5268 */
4749 raw_spin_lock_irqsave(&p->pi_lock, flags); 5269 rq = task_rq_lock(p, &flags);
5270
4750 /* 5271 /*
4751 * To be able to change p->policy safely, the apropriate 5272 * Changing the policy of the stop threads is a very bad idea
4752 * runqueue lock must be held.
4753 */ 5273 */
4754 rq = __task_rq_lock(p); 5274 if (p == rq->stop) {
5275 task_rq_unlock(rq, p, &flags);
5276 return -EINVAL;
5277 }
5278
5279 /*
5280 * If not changing anything there's no need to proceed further:
5281 */
5282 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
5283 param->sched_priority == p->rt_priority))) {
5284
5285 __task_rq_unlock(rq);
5286 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5287 return 0;
5288 }
4755 5289
4756#ifdef CONFIG_RT_GROUP_SCHED 5290#ifdef CONFIG_RT_GROUP_SCHED
4757 if (user) { 5291 if (user) {
@@ -4760,9 +5294,9 @@ recheck:
4760 * assigned. 5294 * assigned.
4761 */ 5295 */
4762 if (rt_bandwidth_enabled() && rt_policy(policy) && 5296 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4763 task_group(p)->rt_bandwidth.rt_runtime == 0) { 5297 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4764 __task_rq_unlock(rq); 5298 !task_group_is_autogroup(task_group(p))) {
4765 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5299 task_rq_unlock(rq, p, &flags);
4766 return -EPERM; 5300 return -EPERM;
4767 } 5301 }
4768 } 5302 }
@@ -4771,11 +5305,10 @@ recheck:
4771 /* recheck policy now with rq lock held */ 5305 /* recheck policy now with rq lock held */
4772 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 5306 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4773 policy = oldpolicy = -1; 5307 policy = oldpolicy = -1;
4774 __task_rq_unlock(rq); 5308 task_rq_unlock(rq, p, &flags);
4775 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4776 goto recheck; 5309 goto recheck;
4777 } 5310 }
4778 on_rq = p->se.on_rq; 5311 on_rq = p->on_rq;
4779 running = task_current(rq, p); 5312 running = task_current(rq, p);
4780 if (on_rq) 5313 if (on_rq)
4781 deactivate_task(rq, p, 0); 5314 deactivate_task(rq, p, 0);
@@ -4799,13 +5332,11 @@ recheck:
4799 5332
4800 if (running) 5333 if (running)
4801 p->sched_class->set_curr_task(rq); 5334 p->sched_class->set_curr_task(rq);
4802 if (on_rq) { 5335 if (on_rq)
4803 activate_task(rq, p, 0); 5336 activate_task(rq, p, 0);
4804 5337
4805 check_class_changed(rq, p, prev_class, oldprio, running); 5338 check_class_changed(rq, p, prev_class, oldprio);
4806 } 5339 task_rq_unlock(rq, p, &flags);
4807 __task_rq_unlock(rq);
4808 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4809 5340
4810 rt_mutex_adjust_pi(p); 5341 rt_mutex_adjust_pi(p);
4811 5342
@@ -4821,7 +5352,7 @@ recheck:
4821 * NOTE that the task may be already dead. 5352 * NOTE that the task may be already dead.
4822 */ 5353 */
4823int sched_setscheduler(struct task_struct *p, int policy, 5354int sched_setscheduler(struct task_struct *p, int policy,
4824 struct sched_param *param) 5355 const struct sched_param *param)
4825{ 5356{
4826 return __sched_setscheduler(p, policy, param, true); 5357 return __sched_setscheduler(p, policy, param, true);
4827} 5358}
@@ -4839,7 +5370,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
4839 * but our caller might not have that capability. 5370 * but our caller might not have that capability.
4840 */ 5371 */
4841int sched_setscheduler_nocheck(struct task_struct *p, int policy, 5372int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4842 struct sched_param *param) 5373 const struct sched_param *param)
4843{ 5374{
4844 return __sched_setscheduler(p, policy, param, false); 5375 return __sched_setscheduler(p, policy, param, false);
4845} 5376}
@@ -4986,16 +5517,16 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4986 goto out_free_cpus_allowed; 5517 goto out_free_cpus_allowed;
4987 } 5518 }
4988 retval = -EPERM; 5519 retval = -EPERM;
4989 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 5520 if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
4990 goto out_unlock; 5521 goto out_unlock;
4991 5522
4992 retval = security_task_setscheduler(p, 0, NULL); 5523 retval = security_task_setscheduler(p);
4993 if (retval) 5524 if (retval)
4994 goto out_unlock; 5525 goto out_unlock;
4995 5526
4996 cpuset_cpus_allowed(p, cpus_allowed); 5527 cpuset_cpus_allowed(p, cpus_allowed);
4997 cpumask_and(new_mask, in_mask, cpus_allowed); 5528 cpumask_and(new_mask, in_mask, cpus_allowed);
4998 again: 5529again:
4999 retval = set_cpus_allowed_ptr(p, new_mask); 5530 retval = set_cpus_allowed_ptr(p, new_mask);
5000 5531
5001 if (!retval) { 5532 if (!retval) {
@@ -5057,7 +5588,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5057{ 5588{
5058 struct task_struct *p; 5589 struct task_struct *p;
5059 unsigned long flags; 5590 unsigned long flags;
5060 struct rq *rq;
5061 int retval; 5591 int retval;
5062 5592
5063 get_online_cpus(); 5593 get_online_cpus();
@@ -5072,9 +5602,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5072 if (retval) 5602 if (retval)
5073 goto out_unlock; 5603 goto out_unlock;
5074 5604
5075 rq = task_rq_lock(p, &flags); 5605 raw_spin_lock_irqsave(&p->pi_lock, flags);
5076 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 5606 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5077 task_rq_unlock(rq, &flags); 5607 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5078 5608
5079out_unlock: 5609out_unlock:
5080 rcu_read_unlock(); 5610 rcu_read_unlock();
@@ -5221,6 +5751,67 @@ void __sched yield(void)
5221} 5751}
5222EXPORT_SYMBOL(yield); 5752EXPORT_SYMBOL(yield);
5223 5753
5754/**
5755 * yield_to - yield the current processor to another thread in
5756 * your thread group, or accelerate that thread toward the
5757 * processor it's on.
5758 * @p: target task
5759 * @preempt: whether task preemption is allowed or not
5760 *
5761 * It's the caller's job to ensure that the target task struct
5762 * can't go away on us before we can do any checks.
5763 *
5764 * Returns true if we indeed boosted the target task.
5765 */
5766bool __sched yield_to(struct task_struct *p, bool preempt)
5767{
5768 struct task_struct *curr = current;
5769 struct rq *rq, *p_rq;
5770 unsigned long flags;
5771 bool yielded = 0;
5772
5773 local_irq_save(flags);
5774 rq = this_rq();
5775
5776again:
5777 p_rq = task_rq(p);
5778 double_rq_lock(rq, p_rq);
5779 while (task_rq(p) != p_rq) {
5780 double_rq_unlock(rq, p_rq);
5781 goto again;
5782 }
5783
5784 if (!curr->sched_class->yield_to_task)
5785 goto out;
5786
5787 if (curr->sched_class != p->sched_class)
5788 goto out;
5789
5790 if (task_running(p_rq, p) || p->state)
5791 goto out;
5792
5793 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5794 if (yielded) {
5795 schedstat_inc(rq, yld_count);
5796 /*
5797 * Make p's CPU reschedule; pick_next_entity takes care of
5798 * fairness.
5799 */
5800 if (preempt && rq != p_rq)
5801 resched_task(p_rq->curr);
5802 }
5803
5804out:
5805 double_rq_unlock(rq, p_rq);
5806 local_irq_restore(flags);
5807
5808 if (yielded)
5809 schedule();
5810
5811 return yielded;
5812}
5813EXPORT_SYMBOL_GPL(yield_to);
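The again: loop in yield_to() is the usual lock-then-revalidate pattern: task_rq(p) is sampled without any lock, so after both runqueue locks are taken the association must be rechecked and the locking redone if @p migrated in the meantime. A generic pthread sketch of that pattern, locking in address order the way double_rq_lock() does (hypothetical types, illustrative only):

#include <pthread.h>

struct bucket {
	pthread_mutex_t lock;
};

struct item {
	struct bucket *home;	/* protected by home->lock; may change */
};

/* Lock @local and the bucket currently holding @it, coping with @it
 * migrating between the unlocked read of ->home and the lock acquisition. */
static struct bucket *lock_local_and_home(struct bucket *local, struct item *it)
{
	struct bucket *home, *first, *second;

	for (;;) {
		home = it->home;	/* racy read; READ_ONCE()/atomics in real code */
		/* acquire in address order to avoid ABBA deadlock */
		first  = local <= home ? local : home;
		second = local <= home ? home : local;
		pthread_mutex_lock(&first->lock);
		if (second != first)
			pthread_mutex_lock(&second->lock);
		if (it->home == home)
			return home;	/* association still valid, both locks held */
		/* @it moved while we were taking the locks: drop and retry */
		if (second != first)
			pthread_mutex_unlock(&second->lock);
		pthread_mutex_unlock(&first->lock);
	}
}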
5814
5224/* 5815/*
5225 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 5816 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
5226 * that process accounting knows that this is a task in IO wait state. 5817 * that process accounting knows that this is a task in IO wait state.
@@ -5231,6 +5822,7 @@ void __sched io_schedule(void)
5231 5822
5232 delayacct_blkio_start(); 5823 delayacct_blkio_start();
5233 atomic_inc(&rq->nr_iowait); 5824 atomic_inc(&rq->nr_iowait);
5825 blk_flush_plug(current);
5234 current->in_iowait = 1; 5826 current->in_iowait = 1;
5235 schedule(); 5827 schedule();
5236 current->in_iowait = 0; 5828 current->in_iowait = 0;
@@ -5246,6 +5838,7 @@ long __sched io_schedule_timeout(long timeout)
5246 5838
5247 delayacct_blkio_start(); 5839 delayacct_blkio_start();
5248 atomic_inc(&rq->nr_iowait); 5840 atomic_inc(&rq->nr_iowait);
5841 blk_flush_plug(current);
5249 current->in_iowait = 1; 5842 current->in_iowait = 1;
5250 ret = schedule_timeout(timeout); 5843 ret = schedule_timeout(timeout);
5251 current->in_iowait = 0; 5844 current->in_iowait = 0;
@@ -5336,7 +5929,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5336 5929
5337 rq = task_rq_lock(p, &flags); 5930 rq = task_rq_lock(p, &flags);
5338 time_slice = p->sched_class->get_rr_interval(rq, p); 5931 time_slice = p->sched_class->get_rr_interval(rq, p);
5339 task_rq_unlock(rq, &flags); 5932 task_rq_unlock(rq, p, &flags);
5340 5933
5341 rcu_read_unlock(); 5934 rcu_read_unlock();
5342 jiffies_to_timespec(time_slice, &t); 5935 jiffies_to_timespec(time_slice, &t);
@@ -5356,7 +5949,7 @@ void sched_show_task(struct task_struct *p)
5356 unsigned state; 5949 unsigned state;
5357 5950
5358 state = p->state ? __ffs(p->state) + 1 : 0; 5951 state = p->state ? __ffs(p->state) + 1 : 0;
5359 printk(KERN_INFO "%-13.13s %c", p->comm, 5952 printk(KERN_INFO "%-15.15s %c", p->comm,
5360 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5953 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5361#if BITS_PER_LONG == 32 5954#if BITS_PER_LONG == 32
5362 if (state == TASK_RUNNING) 5955 if (state == TASK_RUNNING)
@@ -5394,7 +5987,7 @@ void show_state_filter(unsigned long state_filter)
5394 do_each_thread(g, p) { 5987 do_each_thread(g, p) {
5395 /* 5988 /*
5396 * reset the NMI-timeout, listing all files on a slow 5989 * reset the NMI-timeout, listing all files on a slow
5397 * console might take alot of time: 5990 * console might take a lot of time:
5398 */ 5991 */
5399 touch_nmi_watchdog(); 5992 touch_nmi_watchdog();
5400 if (!state_filter || (p->state & state_filter)) 5993 if (!state_filter || (p->state & state_filter))
@@ -5438,26 +6031,35 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5438 idle->state = TASK_RUNNING; 6031 idle->state = TASK_RUNNING;
5439 idle->se.exec_start = sched_clock(); 6032 idle->se.exec_start = sched_clock();
5440 6033
5441 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 6034 do_set_cpus_allowed(idle, cpumask_of(cpu));
6035 /*
 6036 * We're having a chicken-and-egg problem: even though we are
 6037 * holding rq->lock, the cpu isn't yet set to this cpu, so the
 6038 * lockdep check in task_group() will fail.
6039 *
6040 * Similar case to sched_fork(). / Alternatively we could
6041 * use task_rq_lock() here and obtain the other rq->lock.
6042 *
6043 * Silence PROVE_RCU
6044 */
6045 rcu_read_lock();
5442 __set_task_cpu(idle, cpu); 6046 __set_task_cpu(idle, cpu);
6047 rcu_read_unlock();
5443 6048
5444 rq->curr = rq->idle = idle; 6049 rq->curr = rq->idle = idle;
5445#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 6050#if defined(CONFIG_SMP)
5446 idle->oncpu = 1; 6051 idle->on_cpu = 1;
5447#endif 6052#endif
5448 raw_spin_unlock_irqrestore(&rq->lock, flags); 6053 raw_spin_unlock_irqrestore(&rq->lock, flags);
5449 6054
5450 /* Set the preempt count _outside_ the spinlocks! */ 6055 /* Set the preempt count _outside_ the spinlocks! */
5451#if defined(CONFIG_PREEMPT)
5452 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5453#else
5454 task_thread_info(idle)->preempt_count = 0; 6056 task_thread_info(idle)->preempt_count = 0;
5455#endif 6057
5456 /* 6058 /*
5457 * The idle tasks have their own, simple scheduling class: 6059 * The idle tasks have their own, simple scheduling class:
5458 */ 6060 */
5459 idle->sched_class = &idle_sched_class; 6061 idle->sched_class = &idle_sched_class;
5460 ftrace_graph_init_task(idle); 6062 ftrace_graph_init_idle_task(idle, cpu);
5461} 6063}
5462 6064
5463/* 6065/*
@@ -5508,7 +6110,6 @@ static void update_sysctl(void)
5508 SET_SYSCTL(sched_min_granularity); 6110 SET_SYSCTL(sched_min_granularity);
5509 SET_SYSCTL(sched_latency); 6111 SET_SYSCTL(sched_latency);
5510 SET_SYSCTL(sched_wakeup_granularity); 6112 SET_SYSCTL(sched_wakeup_granularity);
5511 SET_SYSCTL(sched_shares_ratelimit);
5512#undef SET_SYSCTL 6113#undef SET_SYSCTL
5513} 6114}
5514 6115
@@ -5518,6 +6119,16 @@ static inline void sched_init_granularity(void)
5518} 6119}
5519 6120
5520#ifdef CONFIG_SMP 6121#ifdef CONFIG_SMP
6122void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
6123{
6124 if (p->sched_class && p->sched_class->set_cpus_allowed)
6125 p->sched_class->set_cpus_allowed(p, new_mask);
6126 else {
6127 cpumask_copy(&p->cpus_allowed, new_mask);
6128 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
6129 }
6130}
6131
5521/* 6132/*
5522 * This is how migration works: 6133 * This is how migration works:
5523 * 6134 *
@@ -5548,52 +6159,38 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5548 unsigned int dest_cpu; 6159 unsigned int dest_cpu;
5549 int ret = 0; 6160 int ret = 0;
5550 6161
5551 /*
5552 * Serialize against TASK_WAKING so that ttwu() and wunt() can
5553 * drop the rq->lock and still rely on ->cpus_allowed.
5554 */
5555again:
5556 while (task_is_waking(p))
5557 cpu_relax();
5558 rq = task_rq_lock(p, &flags); 6162 rq = task_rq_lock(p, &flags);
5559 if (task_is_waking(p)) { 6163
5560 task_rq_unlock(rq, &flags); 6164 if (cpumask_equal(&p->cpus_allowed, new_mask))
5561 goto again; 6165 goto out;
5562 }
5563 6166
5564 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 6167 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5565 ret = -EINVAL; 6168 ret = -EINVAL;
5566 goto out; 6169 goto out;
5567 } 6170 }
5568 6171
5569 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 6172 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
5570 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5571 ret = -EINVAL; 6173 ret = -EINVAL;
5572 goto out; 6174 goto out;
5573 } 6175 }
5574 6176
5575 if (p->sched_class->set_cpus_allowed) 6177 do_set_cpus_allowed(p, new_mask);
5576 p->sched_class->set_cpus_allowed(p, new_mask);
5577 else {
5578 cpumask_copy(&p->cpus_allowed, new_mask);
5579 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5580 }
5581 6178
5582 /* Can the task run on the task's current CPU? If so, we're done */ 6179 /* Can the task run on the task's current CPU? If so, we're done */
5583 if (cpumask_test_cpu(task_cpu(p), new_mask)) 6180 if (cpumask_test_cpu(task_cpu(p), new_mask))
5584 goto out; 6181 goto out;
5585 6182
5586 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 6183 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5587 if (migrate_task(p, dest_cpu)) { 6184 if (p->on_rq) {
5588 struct migration_arg arg = { p, dest_cpu }; 6185 struct migration_arg arg = { p, dest_cpu };
5589 /* Need help from migration thread: drop lock and wait. */ 6186 /* Need help from migration thread: drop lock and wait. */
5590 task_rq_unlock(rq, &flags); 6187 task_rq_unlock(rq, p, &flags);
5591 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 6188 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5592 tlb_migrate_finish(p->mm); 6189 tlb_migrate_finish(p->mm);
5593 return 0; 6190 return 0;
5594 } 6191 }
5595out: 6192out:
5596 task_rq_unlock(rq, &flags); 6193 task_rq_unlock(rq, p, &flags);
5597 6194
5598 return ret; 6195 return ret;
5599} 6196}
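The slimmed-down set_cpus_allowed_ptr() above is essentially a short decision chain: return early if the mask is unchanged, reject masks with no active CPU or changes to a thread-bound kthread, otherwise apply the mask and migrate only if the task's current CPU is no longer allowed. A hedged userspace sketch of that control flow; the toy_* names and bitmask cpumasks are invented for illustration and all locking is omitted:

#include <errno.h>
#include <stdio.h>

struct toy_task {
	int cpu;                          /* CPU the task last ran on   */
	int thread_bound;                 /* analog of PF_THREAD_BOUND  */
	unsigned long long cpus_allowed;
};

/* Sketch of the decision chain in set_cpus_allowed_ptr(); locking omitted. */
static int toy_set_cpus_allowed(struct toy_task *p, unsigned long long new_mask,
				unsigned long long active_mask)
{
	if (p->cpus_allowed == new_mask)
		return 0;                       /* nothing to do          */
	if (!(new_mask & active_mask))
		return -EINVAL;                 /* no online CPU in mask  */
	if (p->thread_bound)
		return -EINVAL;                 /* kthread pinned by core */

	p->cpus_allowed = new_mask;
	if (!(new_mask & (1ULL << p->cpu)))	/* current CPU now forbidden */
		p->cpu = __builtin_ctzll(new_mask & active_mask); /* "migrate" */
	return 0;
}

int main(void)
{
	struct toy_task t = { .cpu = 5, .thread_bound = 0, .cpus_allowed = ~0ULL };
	int ret = toy_set_cpus_allowed(&t, 0x3ULL, 0xffULL);

	printf("ret=%d, now on cpu %d\n", ret, t.cpu);
	return 0;
}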
@@ -5621,6 +6218,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5621 rq_src = cpu_rq(src_cpu); 6218 rq_src = cpu_rq(src_cpu);
5622 rq_dest = cpu_rq(dest_cpu); 6219 rq_dest = cpu_rq(dest_cpu);
5623 6220
6221 raw_spin_lock(&p->pi_lock);
5624 double_rq_lock(rq_src, rq_dest); 6222 double_rq_lock(rq_src, rq_dest);
5625 /* Already moved. */ 6223 /* Already moved. */
5626 if (task_cpu(p) != src_cpu) 6224 if (task_cpu(p) != src_cpu)
@@ -5633,7 +6231,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5633 * If we're not on a rq, the next wake-up will ensure we're 6231 * If we're not on a rq, the next wake-up will ensure we're
5634 * placed properly. 6232 * placed properly.
5635 */ 6233 */
5636 if (p->se.on_rq) { 6234 if (p->on_rq) {
5637 deactivate_task(rq_src, p, 0); 6235 deactivate_task(rq_src, p, 0);
5638 set_task_cpu(p, dest_cpu); 6236 set_task_cpu(p, dest_cpu);
5639 activate_task(rq_dest, p, 0); 6237 activate_task(rq_dest, p, 0);
@@ -5643,6 +6241,7 @@ done:
5643 ret = 1; 6241 ret = 1;
5644fail: 6242fail:
5645 double_rq_unlock(rq_src, rq_dest); 6243 double_rq_unlock(rq_src, rq_dest);
6244 raw_spin_unlock(&p->pi_lock);
5646 return ret; 6245 return ret;
5647} 6246}
5648 6247
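__migrate_task() now nests p->pi_lock outside double_rq_lock(), and double-locking two runqueues only stays deadlock-free because the pair of locks is always taken in a fixed order. A small pthread sketch of that ordering rule, using the runqueue's address as the tie-breaker; this mirrors the idea only, not the kernel's raw spinlock implementation:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct toy_rq {
	pthread_mutex_t lock;
	int nr_running;
};

/* Lock two runqueues in a fixed (address) order so that two CPUs doing
 * this concurrently for the same pair can never deadlock. */
static void toy_double_rq_lock(struct toy_rq *a, struct toy_rq *b)
{
	if (a == b) {
		pthread_mutex_lock(&a->lock);
	} else if ((uintptr_t)a < (uintptr_t)b) {
		pthread_mutex_lock(&a->lock);
		pthread_mutex_lock(&b->lock);
	} else {
		pthread_mutex_lock(&b->lock);
		pthread_mutex_lock(&a->lock);
	}
}

static void toy_double_rq_unlock(struct toy_rq *a, struct toy_rq *b)
{
	pthread_mutex_unlock(&a->lock);
	if (a != b)
		pthread_mutex_unlock(&b->lock);
}

int main(void)
{
	struct toy_rq src = { PTHREAD_MUTEX_INITIALIZER, 2 };
	struct toy_rq dst = { PTHREAD_MUTEX_INITIALIZER, 0 };

	toy_double_rq_lock(&src, &dst);
	src.nr_running--;            /* "deactivate" on the source ...    */
	dst.nr_running++;            /* ... "activate" on the destination */
	toy_double_rq_unlock(&src, &dst);

	printf("src=%d dst=%d\n", src.nr_running, dst.nr_running);
	return 0;
}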
@@ -5666,29 +6265,20 @@ static int migration_cpu_stop(void *data)
5666} 6265}
5667 6266
5668#ifdef CONFIG_HOTPLUG_CPU 6267#ifdef CONFIG_HOTPLUG_CPU
6268
5669/* 6269/*
5670 * Figure out where task on dead CPU should go, use force if necessary. 6270 * Ensures that the idle task is using init_mm right before its cpu goes
6271 * offline.
5671 */ 6272 */
5672void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 6273void idle_task_exit(void)
5673{ 6274{
5674 struct rq *rq = cpu_rq(dead_cpu); 6275 struct mm_struct *mm = current->active_mm;
5675 int needs_cpu, uninitialized_var(dest_cpu);
5676 unsigned long flags;
5677 6276
5678 local_irq_save(flags); 6277 BUG_ON(cpu_online(smp_processor_id()));
5679 6278
5680 raw_spin_lock(&rq->lock); 6279 if (mm != &init_mm)
5681 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); 6280 switch_mm(mm, &init_mm, current);
5682 if (needs_cpu) 6281 mmdrop(mm);
5683 dest_cpu = select_fallback_rq(dead_cpu, p);
5684 raw_spin_unlock(&rq->lock);
5685 /*
5686 * It can only fail if we race with set_cpus_allowed(),
5687 * in which case the racer should migrate the task anyway.
5688 */
5689 if (needs_cpu)
5690 __migrate_task(p, dead_cpu, dest_cpu);
5691 local_irq_restore(flags);
5692} 6282}
5693 6283
5694/* 6284/*
@@ -5701,128 +6291,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5701static void migrate_nr_uninterruptible(struct rq *rq_src) 6291static void migrate_nr_uninterruptible(struct rq *rq_src)
5702{ 6292{
5703 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 6293 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5704 unsigned long flags;
5705 6294
5706 local_irq_save(flags);
5707 double_rq_lock(rq_src, rq_dest);
5708 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 6295 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5709 rq_src->nr_uninterruptible = 0; 6296 rq_src->nr_uninterruptible = 0;
5710 double_rq_unlock(rq_src, rq_dest);
5711 local_irq_restore(flags);
5712}
5713
5714/* Run through task list and migrate tasks from the dead cpu. */
5715static void migrate_live_tasks(int src_cpu)
5716{
5717 struct task_struct *p, *t;
5718
5719 read_lock(&tasklist_lock);
5720
5721 do_each_thread(t, p) {
5722 if (p == current)
5723 continue;
5724
5725 if (task_cpu(p) == src_cpu)
5726 move_task_off_dead_cpu(src_cpu, p);
5727 } while_each_thread(t, p);
5728
5729 read_unlock(&tasklist_lock);
5730} 6297}
5731 6298
5732/* 6299/*
5733 * Schedules idle task to be the next runnable task on current CPU. 6300 * remove the tasks which were accounted by rq from calc_load_tasks.
5734 * It does so by boosting its priority to highest possible.
5735 * Used by CPU offline code.
5736 */ 6301 */
5737void sched_idle_next(void) 6302static void calc_global_load_remove(struct rq *rq)
5738{ 6303{
5739 int this_cpu = smp_processor_id(); 6304 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5740 struct rq *rq = cpu_rq(this_cpu); 6305 rq->calc_load_active = 0;
5741 struct task_struct *p = rq->idle;
5742 unsigned long flags;
5743
5744 /* cpu has to be offline */
5745 BUG_ON(cpu_online(this_cpu));
5746
5747 /*
5748 * Strictly not necessary since rest of the CPUs are stopped by now
5749 * and interrupts disabled on the current cpu.
5750 */
5751 raw_spin_lock_irqsave(&rq->lock, flags);
5752
5753 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5754
5755 activate_task(rq, p, 0);
5756
5757 raw_spin_unlock_irqrestore(&rq->lock, flags);
5758} 6306}
5759 6307
5760/* 6308/*
5761 * Ensures that the idle task is using init_mm right before its cpu goes 6309 * Migrate all tasks from the rq, sleeping tasks will be migrated by
5762 * offline. 6310 * try_to_wake_up()->select_task_rq().
6311 *
 6312 * Called with rq->lock held even though we're in stop_machine() and
6313 * there's no concurrency possible, we hold the required locks anyway
6314 * because of lock validation efforts.
5763 */ 6315 */
5764void idle_task_exit(void) 6316static void migrate_tasks(unsigned int dead_cpu)
5765{
5766 struct mm_struct *mm = current->active_mm;
5767
5768 BUG_ON(cpu_online(smp_processor_id()));
5769
5770 if (mm != &init_mm)
5771 switch_mm(mm, &init_mm, current);
5772 mmdrop(mm);
5773}
5774
5775/* called under rq->lock with disabled interrupts */
5776static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5777{ 6317{
5778 struct rq *rq = cpu_rq(dead_cpu); 6318 struct rq *rq = cpu_rq(dead_cpu);
5779 6319 struct task_struct *next, *stop = rq->stop;
5780 /* Must be exiting, otherwise would be on tasklist. */ 6320 int dest_cpu;
5781 BUG_ON(!p->exit_state);
5782
5783 /* Cannot have done final schedule yet: would have vanished. */
5784 BUG_ON(p->state == TASK_DEAD);
5785
5786 get_task_struct(p);
5787 6321
5788 /* 6322 /*
5789 * Drop lock around migration; if someone else moves it, 6323 * Fudge the rq selection such that the below task selection loop
5790 * that's OK. No task can be added to this CPU, so iteration is 6324 * doesn't get stuck on the currently eligible stop task.
5791 * fine. 6325 *
6326 * We're currently inside stop_machine() and the rq is either stuck
6327 * in the stop_machine_cpu_stop() loop, or we're executing this code,
6328 * either way we should never end up calling schedule() until we're
6329 * done here.
5792 */ 6330 */
5793 raw_spin_unlock_irq(&rq->lock); 6331 rq->stop = NULL;
5794 move_task_off_dead_cpu(dead_cpu, p);
5795 raw_spin_lock_irq(&rq->lock);
5796
5797 put_task_struct(p);
5798}
5799
5800/* release_task() removes task from tasklist, so we won't find dead tasks. */
5801static void migrate_dead_tasks(unsigned int dead_cpu)
5802{
5803 struct rq *rq = cpu_rq(dead_cpu);
5804 struct task_struct *next;
5805 6332
5806 for ( ; ; ) { 6333 for ( ; ; ) {
5807 if (!rq->nr_running) 6334 /*
6335 * There's this thread running, bail when that's the only
6336 * remaining thread.
6337 */
6338 if (rq->nr_running == 1)
5808 break; 6339 break;
6340
5809 next = pick_next_task(rq); 6341 next = pick_next_task(rq);
5810 if (!next) 6342 BUG_ON(!next);
5811 break;
5812 next->sched_class->put_prev_task(rq, next); 6343 next->sched_class->put_prev_task(rq, next);
5813 migrate_dead(dead_cpu, next);
5814 6344
6345 /* Find suitable destination for @next, with force if needed. */
6346 dest_cpu = select_fallback_rq(dead_cpu, next);
6347 raw_spin_unlock(&rq->lock);
6348
6349 __migrate_task(next, dead_cpu, dest_cpu);
6350
6351 raw_spin_lock(&rq->lock);
5815 } 6352 }
5816}
5817 6353
5818/* 6354 rq->stop = stop;
5819 * remove the tasks which were accounted by rq from calc_load_tasks.
5820 */
5821static void calc_global_load_remove(struct rq *rq)
5822{
5823 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5824 rq->calc_load_active = 0;
5825} 6355}
6356
5826#endif /* CONFIG_HOTPLUG_CPU */ 6357#endif /* CONFIG_HOTPLUG_CPU */
5827 6358
5828#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 6359#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
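migrate_tasks() above folds the old migrate_live_tasks()/migrate_dead_tasks() pair into one loop: park the stop task, repeatedly pick the next runnable task, choose a fallback CPU for it, and keep going until only the current thread is left on the dying runqueue. A simplified userspace sketch of that drain pattern over an array-based toy runqueue (all names invented, locking left out):

#include <stdio.h>

#define QLEN 8

struct toy_rq {
	int task[QLEN];     /* pids of runnable tasks; 0 = empty slot */
	int nr_running;
};

static int toy_pick_next(struct toy_rq *rq, int skip_pid)
{
	for (int i = 0; i < QLEN; i++)
		if (rq->task[i] && rq->task[i] != skip_pid)
			return i;
	return -1;
}

/* Drain a dying CPU's queue: keep migrating until only 'self' is left. */
static void toy_migrate_tasks(struct toy_rq *dead, struct toy_rq *dest, int self)
{
	while (dead->nr_running > 1) {
		int slot = toy_pick_next(dead, self);

		if (slot < 0)
			break;                     /* nothing but ourselves */

		/* In the kernel the rq lock is dropped here while
		 * __migrate_task() moves the task; omitted in this sketch. */
		for (int i = 0; i < QLEN; i++) {
			if (!dest->task[i]) {
				dest->task[i] = dead->task[slot];
				break;
			}
		}
		dead->task[slot] = 0;
		dead->nr_running--;
		dest->nr_running++;
	}
}

int main(void)
{
	struct toy_rq dying = { { 42, 43, 44, 99 }, 4 };   /* 99 = "current" */
	struct toy_rq fallback = { { 0 }, 0 };

	toy_migrate_tasks(&dying, &fallback, 99);
	printf("left on dying cpu: %d, moved: %d\n",
	       dying.nr_running, fallback.nr_running);
	return 0;
}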
@@ -6032,15 +6563,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6032 unsigned long flags; 6563 unsigned long flags;
6033 struct rq *rq = cpu_rq(cpu); 6564 struct rq *rq = cpu_rq(cpu);
6034 6565
6035 switch (action) { 6566 switch (action & ~CPU_TASKS_FROZEN) {
6036 6567
6037 case CPU_UP_PREPARE: 6568 case CPU_UP_PREPARE:
6038 case CPU_UP_PREPARE_FROZEN:
6039 rq->calc_load_update = calc_load_update; 6569 rq->calc_load_update = calc_load_update;
6040 break; 6570 break;
6041 6571
6042 case CPU_ONLINE: 6572 case CPU_ONLINE:
6043 case CPU_ONLINE_FROZEN:
6044 /* Update our root-domain */ 6573 /* Update our root-domain */
6045 raw_spin_lock_irqsave(&rq->lock, flags); 6574 raw_spin_lock_irqsave(&rq->lock, flags);
6046 if (rq->rd) { 6575 if (rq->rd) {
@@ -6052,33 +6581,26 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6052 break; 6581 break;
6053 6582
6054#ifdef CONFIG_HOTPLUG_CPU 6583#ifdef CONFIG_HOTPLUG_CPU
6055 case CPU_DEAD:
6056 case CPU_DEAD_FROZEN:
6057 migrate_live_tasks(cpu);
6058 /* Idle task back to normal (off runqueue, low prio) */
6059 raw_spin_lock_irq(&rq->lock);
6060 deactivate_task(rq, rq->idle, 0);
6061 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
6062 rq->idle->sched_class = &idle_sched_class;
6063 migrate_dead_tasks(cpu);
6064 raw_spin_unlock_irq(&rq->lock);
6065 migrate_nr_uninterruptible(rq);
6066 BUG_ON(rq->nr_running != 0);
6067 calc_global_load_remove(rq);
6068 break;
6069
6070 case CPU_DYING: 6584 case CPU_DYING:
6071 case CPU_DYING_FROZEN: 6585 sched_ttwu_pending();
6072 /* Update our root-domain */ 6586 /* Update our root-domain */
6073 raw_spin_lock_irqsave(&rq->lock, flags); 6587 raw_spin_lock_irqsave(&rq->lock, flags);
6074 if (rq->rd) { 6588 if (rq->rd) {
6075 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6589 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6076 set_rq_offline(rq); 6590 set_rq_offline(rq);
6077 } 6591 }
6592 migrate_tasks(cpu);
6593 BUG_ON(rq->nr_running != 1); /* the migration thread */
6078 raw_spin_unlock_irqrestore(&rq->lock, flags); 6594 raw_spin_unlock_irqrestore(&rq->lock, flags);
6595
6596 migrate_nr_uninterruptible(rq);
6597 calc_global_load_remove(rq);
6079 break; 6598 break;
6080#endif 6599#endif
6081 } 6600 }
6601
6602 update_max_interval();
6603
6082 return NOTIFY_OK; 6604 return NOTIFY_OK;
6083} 6605}
6084 6606
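The notifier above now switches on action & ~CPU_TASKS_FROZEN, so a single case label covers both the normal and the suspend-time *_FROZEN variant of each hotplug event. A tiny standalone illustration of that masking trick; the TOY_* constants merely mimic the flag-bit encoding and are not the real cpu.h values:

#include <stdio.h>

/* Invented stand-ins for the notifier constants: the FROZEN variants
 * are the base event with an extra flag bit OR-ed in. */
#define TOY_CPU_ONLINE        0x0002
#define TOY_CPU_DYING         0x0008
#define TOY_TASKS_FROZEN      0x0010
#define TOY_CPU_ONLINE_FROZEN (TOY_CPU_ONLINE | TOY_TASKS_FROZEN)

static const char *toy_notifier(unsigned long action)
{
	switch (action & ~TOY_TASKS_FROZEN) {   /* one case covers both variants */
	case TOY_CPU_ONLINE:
		return "online";
	case TOY_CPU_DYING:
		return "dying";
	default:
		return "ignored";
	}
}

int main(void)
{
	printf("%s\n", toy_notifier(TOY_CPU_ONLINE));         /* online */
	printf("%s\n", toy_notifier(TOY_CPU_ONLINE_FROZEN));  /* online */
	printf("%s\n", toy_notifier(TOY_CPU_DYING));          /* dying  */
	return 0;
}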
@@ -6139,6 +6661,8 @@ early_initcall(migration_init);
6139 6661
6140#ifdef CONFIG_SMP 6662#ifdef CONFIG_SMP
6141 6663
6664static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
6665
6142#ifdef CONFIG_SCHED_DEBUG 6666#ifdef CONFIG_SCHED_DEBUG
6143 6667
6144static __read_mostly int sched_domain_debug_enabled; 6668static __read_mostly int sched_domain_debug_enabled;
@@ -6189,7 +6713,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6189 break; 6713 break;
6190 } 6714 }
6191 6715
6192 if (!group->cpu_power) { 6716 if (!group->sgp->power) {
6193 printk(KERN_CONT "\n"); 6717 printk(KERN_CONT "\n");
6194 printk(KERN_ERR "ERROR: domain->cpu_power not " 6718 printk(KERN_ERR "ERROR: domain->cpu_power not "
6195 "set\n"); 6719 "set\n");
@@ -6213,9 +6737,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6213 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 6737 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6214 6738
6215 printk(KERN_CONT " %s", str); 6739 printk(KERN_CONT " %s", str);
6216 if (group->cpu_power != SCHED_LOAD_SCALE) { 6740 if (group->sgp->power != SCHED_POWER_SCALE) {
6217 printk(KERN_CONT " (cpu_power = %d)", 6741 printk(KERN_CONT " (cpu_power = %d)",
6218 group->cpu_power); 6742 group->sgp->power);
6219 } 6743 }
6220 6744
6221 group = group->next; 6745 group = group->next;
@@ -6234,7 +6758,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6234 6758
6235static void sched_domain_debug(struct sched_domain *sd, int cpu) 6759static void sched_domain_debug(struct sched_domain *sd, int cpu)
6236{ 6760{
6237 cpumask_var_t groupmask;
6238 int level = 0; 6761 int level = 0;
6239 6762
6240 if (!sched_domain_debug_enabled) 6763 if (!sched_domain_debug_enabled)
@@ -6247,20 +6770,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6247 6770
6248 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6771 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6249 6772
6250 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6251 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6252 return;
6253 }
6254
6255 for (;;) { 6773 for (;;) {
6256 if (sched_domain_debug_one(sd, cpu, level, groupmask)) 6774 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
6257 break; 6775 break;
6258 level++; 6776 level++;
6259 sd = sd->parent; 6777 sd = sd->parent;
6260 if (!sd) 6778 if (!sd)
6261 break; 6779 break;
6262 } 6780 }
6263 free_cpumask_var(groupmask);
6264} 6781}
6265#else /* !CONFIG_SCHED_DEBUG */ 6782#else /* !CONFIG_SCHED_DEBUG */
6266# define sched_domain_debug(sd, cpu) do { } while (0) 6783# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6317,12 +6834,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6317 return 1; 6834 return 1;
6318} 6835}
6319 6836
6320static void free_rootdomain(struct root_domain *rd) 6837static void free_rootdomain(struct rcu_head *rcu)
6321{ 6838{
6322 synchronize_sched(); 6839 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
6323 6840
6324 cpupri_cleanup(&rd->cpupri); 6841 cpupri_cleanup(&rd->cpupri);
6325
6326 free_cpumask_var(rd->rto_mask); 6842 free_cpumask_var(rd->rto_mask);
6327 free_cpumask_var(rd->online); 6843 free_cpumask_var(rd->online);
6328 free_cpumask_var(rd->span); 6844 free_cpumask_var(rd->span);
@@ -6363,7 +6879,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6363 raw_spin_unlock_irqrestore(&rq->lock, flags); 6879 raw_spin_unlock_irqrestore(&rq->lock, flags);
6364 6880
6365 if (old_rd) 6881 if (old_rd)
6366 free_rootdomain(old_rd); 6882 call_rcu_sched(&old_rd->rcu, free_rootdomain);
6367} 6883}
6368 6884
6369static int init_rootdomain(struct root_domain *rd) 6885static int init_rootdomain(struct root_domain *rd)
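free_rootdomain() above becomes an RCU callback: the rcu_head lives inside struct root_domain and the callback uses container_of() to get back to the object it must free. A userspace sketch of that embed-and-recover pattern; the deferral is faked by invoking the callback immediately, which is where the real code hands it to call_rcu_sched():

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

/* Minimal container_of(), as used to go from the rcu_head back to the
 * structure that embeds it. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct toy_rcu_head {
	void (*func)(struct toy_rcu_head *head);
};

struct toy_root_domain {
	int refcount;
	struct toy_rcu_head rcu;   /* embedded callback head */
};

static void toy_free_rootdomain(struct toy_rcu_head *rcu)
{
	struct toy_root_domain *rd = container_of(rcu, struct toy_root_domain, rcu);

	printf("freeing root domain with refcount %d\n", rd->refcount);
	free(rd);
}

/* Stand-in for call_rcu_sched(): the real thing defers 'func' until all
 * pre-existing readers are done; here we just invoke it immediately. */
static void toy_call_rcu(struct toy_rcu_head *head,
			 void (*func)(struct toy_rcu_head *))
{
	head->func = func;
	head->func(head);
}

int main(void)
{
	struct toy_root_domain *rd = calloc(1, sizeof(*rd));

	if (!rd)
		return 1;
	toy_call_rcu(&rd->rcu, toy_free_rootdomain);
	return 0;
}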
@@ -6414,6 +6930,53 @@ static struct root_domain *alloc_rootdomain(void)
6414 return rd; 6930 return rd;
6415} 6931}
6416 6932
6933static void free_sched_groups(struct sched_group *sg, int free_sgp)
6934{
6935 struct sched_group *tmp, *first;
6936
6937 if (!sg)
6938 return;
6939
6940 first = sg;
6941 do {
6942 tmp = sg->next;
6943
6944 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
6945 kfree(sg->sgp);
6946
6947 kfree(sg);
6948 sg = tmp;
6949 } while (sg != first);
6950}
6951
6952static void free_sched_domain(struct rcu_head *rcu)
6953{
6954 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
6955
6956 /*
 6957 * If it's an overlapping domain it has private groups, iterate and
6958 * nuke them all.
6959 */
6960 if (sd->flags & SD_OVERLAP) {
6961 free_sched_groups(sd->groups, 1);
6962 } else if (atomic_dec_and_test(&sd->groups->ref)) {
6963 kfree(sd->groups->sgp);
6964 kfree(sd->groups);
6965 }
6966 kfree(sd);
6967}
6968
6969static void destroy_sched_domain(struct sched_domain *sd, int cpu)
6970{
6971 call_rcu(&sd->rcu, free_sched_domain);
6972}
6973
6974static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6975{
6976 for (; sd; sd = sd->parent)
6977 destroy_sched_domain(sd, cpu);
6978}
6979
6417/* 6980/*
6418 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6981 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6419 * hold the hotplug lock. 6982 * hold the hotplug lock.
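In the group-freeing code added above, several sched_groups can share one sched_group_power, and the shared object is only released when atomic_dec_and_test() drops the final reference. A compact C11 sketch of that last-put-frees rule with invented toy types standing in for the kernel structures:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_sgp {
	atomic_int ref;
	unsigned int power;
};

/* Drop one reference; whoever hits zero frees the shared object. */
static void toy_sgp_put(struct toy_sgp *sgp)
{
	if (atomic_fetch_sub(&sgp->ref, 1) == 1) {
		printf("last user gone, freeing sgp (power=%u)\n", sgp->power);
		free(sgp);
	}
}

int main(void)
{
	struct toy_sgp *sgp = malloc(sizeof(*sgp));

	if (!sgp)
		return 1;
	atomic_init(&sgp->ref, 3);     /* three groups share this power struct */
	sgp->power = 1024;

	toy_sgp_put(sgp);              /* group 1 released: still alive */
	toy_sgp_put(sgp);              /* group 2 released: still alive */
	toy_sgp_put(sgp);              /* group 3 released: freed here  */
	return 0;
}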
@@ -6424,9 +6987,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6424 struct rq *rq = cpu_rq(cpu); 6987 struct rq *rq = cpu_rq(cpu);
6425 struct sched_domain *tmp; 6988 struct sched_domain *tmp;
6426 6989
6427 for (tmp = sd; tmp; tmp = tmp->parent)
6428 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6429
6430 /* Remove the sched domains which do not contribute to scheduling. */ 6990 /* Remove the sched domains which do not contribute to scheduling. */
6431 for (tmp = sd; tmp; ) { 6991 for (tmp = sd; tmp; ) {
6432 struct sched_domain *parent = tmp->parent; 6992 struct sched_domain *parent = tmp->parent;
@@ -6437,12 +6997,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6437 tmp->parent = parent->parent; 6997 tmp->parent = parent->parent;
6438 if (parent->parent) 6998 if (parent->parent)
6439 parent->parent->child = tmp; 6999 parent->parent->child = tmp;
7000 destroy_sched_domain(parent, cpu);
6440 } else 7001 } else
6441 tmp = tmp->parent; 7002 tmp = tmp->parent;
6442 } 7003 }
6443 7004
6444 if (sd && sd_degenerate(sd)) { 7005 if (sd && sd_degenerate(sd)) {
7006 tmp = sd;
6445 sd = sd->parent; 7007 sd = sd->parent;
7008 destroy_sched_domain(tmp, cpu);
6446 if (sd) 7009 if (sd)
6447 sd->child = NULL; 7010 sd->child = NULL;
6448 } 7011 }
@@ -6450,7 +7013,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6450 sched_domain_debug(sd, cpu); 7013 sched_domain_debug(sd, cpu);
6451 7014
6452 rq_attach_root(rq, rd); 7015 rq_attach_root(rq, rd);
7016 tmp = rq->sd;
6453 rcu_assign_pointer(rq->sd, sd); 7017 rcu_assign_pointer(rq->sd, sd);
7018 destroy_sched_domains(tmp, cpu);
6454} 7019}
6455 7020
6456/* cpus with isolated domains */ 7021/* cpus with isolated domains */
@@ -6466,56 +7031,6 @@ static int __init isolated_cpu_setup(char *str)
6466 7031
6467__setup("isolcpus=", isolated_cpu_setup); 7032__setup("isolcpus=", isolated_cpu_setup);
6468 7033
6469/*
6470 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6471 * to a function which identifies what group (along with sched group) a CPU
6472 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
6473 * (due to the fact that we keep track of groups covered with a struct cpumask).
6474 *
6475 * init_sched_build_groups will build a circular linked list of the groups
6476 * covered by the given span, and will set each group's ->cpumask correctly,
6477 * and ->cpu_power to 0.
6478 */
6479static void
6480init_sched_build_groups(const struct cpumask *span,
6481 const struct cpumask *cpu_map,
6482 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6483 struct sched_group **sg,
6484 struct cpumask *tmpmask),
6485 struct cpumask *covered, struct cpumask *tmpmask)
6486{
6487 struct sched_group *first = NULL, *last = NULL;
6488 int i;
6489
6490 cpumask_clear(covered);
6491
6492 for_each_cpu(i, span) {
6493 struct sched_group *sg;
6494 int group = group_fn(i, cpu_map, &sg, tmpmask);
6495 int j;
6496
6497 if (cpumask_test_cpu(i, covered))
6498 continue;
6499
6500 cpumask_clear(sched_group_cpus(sg));
6501 sg->cpu_power = 0;
6502
6503 for_each_cpu(j, span) {
6504 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6505 continue;
6506
6507 cpumask_set_cpu(j, covered);
6508 cpumask_set_cpu(j, sched_group_cpus(sg));
6509 }
6510 if (!first)
6511 first = sg;
6512 if (last)
6513 last->next = sg;
6514 last = sg;
6515 }
6516 last->next = first;
6517}
6518
6519#define SD_NODES_PER_DOMAIN 16 7034#define SD_NODES_PER_DOMAIN 16
6520 7035
6521#ifdef CONFIG_NUMA 7036#ifdef CONFIG_NUMA
@@ -6532,7 +7047,7 @@ init_sched_build_groups(const struct cpumask *span,
6532 */ 7047 */
6533static int find_next_best_node(int node, nodemask_t *used_nodes) 7048static int find_next_best_node(int node, nodemask_t *used_nodes)
6534{ 7049{
6535 int i, n, val, min_val, best_node = 0; 7050 int i, n, val, min_val, best_node = -1;
6536 7051
6537 min_val = INT_MAX; 7052 min_val = INT_MAX;
6538 7053
@@ -6556,7 +7071,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
6556 } 7071 }
6557 } 7072 }
6558 7073
6559 node_set(best_node, *used_nodes); 7074 if (best_node != -1)
7075 node_set(best_node, *used_nodes);
6560 return best_node; 7076 return best_node;
6561} 7077}
6562 7078
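find_next_best_node() now starts from best_node = -1 and only marks a node used when one was actually found, and sched_domain_node_span() stops as soon as it gets a negative answer. A standalone sketch of that greedy nearest-first selection with a -1 sentinel, assuming a small made-up node distance table:

#include <limits.h>
#include <stdio.h>

#define TOY_NR_NODES 4

static const int toy_dist[TOY_NR_NODES][TOY_NR_NODES] = {
	{ 10, 20, 30, 20 },
	{ 20, 10, 20, 30 },
	{ 30, 20, 10, 20 },
	{ 20, 30, 20, 10 },
};

/* Return the closest node to 'node' not yet in 'used', or -1 if every
 * node has been consumed (the sentinel the patch introduces). */
static int toy_find_next_best_node(int node, unsigned int *used)
{
	int best = -1, min = INT_MAX;

	for (int i = 0; i < TOY_NR_NODES; i++) {
		if (*used & (1u << i))
			continue;
		if (toy_dist[node][i] < min) {
			min = toy_dist[node][i];
			best = i;
		}
	}
	if (best != -1)
		*used |= 1u << best;
	return best;
}

int main(void)
{
	unsigned int used = 1u << 0;   /* the home node is already covered */
	int n;

	while ((n = toy_find_next_best_node(0, &used)) >= 0)
		printf("next best node: %d\n", n);
	return 0;
}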
@@ -6582,293 +7098,197 @@ static void sched_domain_node_span(int node, struct cpumask *span)
6582 7098
6583 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 7099 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6584 int next_node = find_next_best_node(node, &used_nodes); 7100 int next_node = find_next_best_node(node, &used_nodes);
6585 7101 if (next_node < 0)
7102 break;
6586 cpumask_or(span, span, cpumask_of_node(next_node)); 7103 cpumask_or(span, span, cpumask_of_node(next_node));
6587 } 7104 }
6588} 7105}
7106
7107static const struct cpumask *cpu_node_mask(int cpu)
7108{
7109 lockdep_assert_held(&sched_domains_mutex);
7110
7111 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
7112
7113 return sched_domains_tmpmask;
7114}
7115
7116static const struct cpumask *cpu_allnodes_mask(int cpu)
7117{
7118 return cpu_possible_mask;
7119}
6589#endif /* CONFIG_NUMA */ 7120#endif /* CONFIG_NUMA */
6590 7121
6591int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 7122static const struct cpumask *cpu_cpu_mask(int cpu)
7123{
7124 return cpumask_of_node(cpu_to_node(cpu));
7125}
6592 7126
6593/* 7127int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6594 * The cpus mask in sched_group and sched_domain hangs off the end.
6595 *
6596 * ( See the comments in include/linux/sched.h:struct sched_group
6597 * and struct sched_domain. )
6598 */
6599struct static_sched_group {
6600 struct sched_group sg;
6601 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
6602};
6603 7128
6604struct static_sched_domain { 7129struct sd_data {
6605 struct sched_domain sd; 7130 struct sched_domain **__percpu sd;
6606 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 7131 struct sched_group **__percpu sg;
7132 struct sched_group_power **__percpu sgp;
6607}; 7133};
6608 7134
6609struct s_data { 7135struct s_data {
6610#ifdef CONFIG_NUMA 7136 struct sched_domain ** __percpu sd;
6611 int sd_allnodes;
6612 cpumask_var_t domainspan;
6613 cpumask_var_t covered;
6614 cpumask_var_t notcovered;
6615#endif
6616 cpumask_var_t nodemask;
6617 cpumask_var_t this_sibling_map;
6618 cpumask_var_t this_core_map;
6619 cpumask_var_t send_covered;
6620 cpumask_var_t tmpmask;
6621 struct sched_group **sched_group_nodes;
6622 struct root_domain *rd; 7137 struct root_domain *rd;
6623}; 7138};
6624 7139
6625enum s_alloc { 7140enum s_alloc {
6626 sa_sched_groups = 0,
6627 sa_rootdomain, 7141 sa_rootdomain,
6628 sa_tmpmask, 7142 sa_sd,
6629 sa_send_covered, 7143 sa_sd_storage,
6630 sa_this_core_map,
6631 sa_this_sibling_map,
6632 sa_nodemask,
6633 sa_sched_group_nodes,
6634#ifdef CONFIG_NUMA
6635 sa_notcovered,
6636 sa_covered,
6637 sa_domainspan,
6638#endif
6639 sa_none, 7144 sa_none,
6640}; 7145};
6641 7146
6642/* 7147struct sched_domain_topology_level;
6643 * SMT sched-domains:
6644 */
6645#ifdef CONFIG_SCHED_SMT
6646static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
6647static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
6648 7148
6649static int 7149typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
6650cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 7150typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
6651 struct sched_group **sg, struct cpumask *unused)
6652{
6653 if (sg)
6654 *sg = &per_cpu(sched_groups, cpu).sg;
6655 return cpu;
6656}
6657#endif /* CONFIG_SCHED_SMT */
6658 7151
6659/* 7152#define SDTL_OVERLAP 0x01
6660 * multi-core sched-domains:
6661 */
6662#ifdef CONFIG_SCHED_MC
6663static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
6664static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6665#endif /* CONFIG_SCHED_MC */
6666 7153
6667#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 7154struct sched_domain_topology_level {
6668static int 7155 sched_domain_init_f init;
6669cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 7156 sched_domain_mask_f mask;
6670 struct sched_group **sg, struct cpumask *mask) 7157 int flags;
6671{ 7158 struct sd_data data;
6672 int group; 7159};
6673 7160
6674 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6675 group = cpumask_first(mask);
6676 if (sg)
6677 *sg = &per_cpu(sched_group_core, group).sg;
6678 return group;
6679}
6680#elif defined(CONFIG_SCHED_MC)
6681static int 7161static int
6682cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 7162build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6683 struct sched_group **sg, struct cpumask *unused)
6684{ 7163{
6685 if (sg) 7164 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
6686 *sg = &per_cpu(sched_group_core, cpu).sg; 7165 const struct cpumask *span = sched_domain_span(sd);
6687 return cpu; 7166 struct cpumask *covered = sched_domains_tmpmask;
6688} 7167 struct sd_data *sdd = sd->private;
6689#endif 7168 struct sched_domain *child;
7169 int i;
6690 7170
6691static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 7171 cpumask_clear(covered);
6692static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
6693 7172
6694static int 7173 for_each_cpu(i, span) {
6695cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, 7174 struct cpumask *sg_span;
6696 struct sched_group **sg, struct cpumask *mask)
6697{
6698 int group;
6699#ifdef CONFIG_SCHED_MC
6700 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6701 group = cpumask_first(mask);
6702#elif defined(CONFIG_SCHED_SMT)
6703 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6704 group = cpumask_first(mask);
6705#else
6706 group = cpu;
6707#endif
6708 if (sg)
6709 *sg = &per_cpu(sched_group_phys, group).sg;
6710 return group;
6711}
6712 7175
6713#ifdef CONFIG_NUMA 7176 if (cpumask_test_cpu(i, covered))
6714/* 7177 continue;
6715 * The init_sched_build_groups can't handle what we want to do with node
6716 * groups, so roll our own. Now each node has its own list of groups which
6717 * gets dynamically allocated.
6718 */
6719static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
6720static struct sched_group ***sched_group_nodes_bycpu;
6721 7178
6722static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); 7179 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6723static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); 7180 GFP_KERNEL, cpu_to_node(i));
6724 7181
6725static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, 7182 if (!sg)
6726 struct sched_group **sg, 7183 goto fail;
6727 struct cpumask *nodemask)
6728{
6729 int group;
6730 7184
6731 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); 7185 sg_span = sched_group_cpus(sg);
6732 group = cpumask_first(nodemask);
6733 7186
6734 if (sg) 7187 child = *per_cpu_ptr(sdd->sd, i);
6735 *sg = &per_cpu(sched_group_allnodes, group).sg; 7188 if (child->child) {
6736 return group; 7189 child = child->child;
6737} 7190 cpumask_copy(sg_span, sched_domain_span(child));
7191 } else
7192 cpumask_set_cpu(i, sg_span);
6738 7193
6739static void init_numa_sched_groups_power(struct sched_group *group_head) 7194 cpumask_or(covered, covered, sg_span);
6740{
6741 struct sched_group *sg = group_head;
6742 int j;
6743 7195
6744 if (!sg) 7196 sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
6745 return; 7197 atomic_inc(&sg->sgp->ref);
6746 do {
6747 for_each_cpu(j, sched_group_cpus(sg)) {
6748 struct sched_domain *sd;
6749 7198
6750 sd = &per_cpu(phys_domains, j).sd; 7199 if (cpumask_test_cpu(cpu, sg_span))
6751 if (j != group_first_cpu(sd->groups)) { 7200 groups = sg;
6752 /*
6753 * Only add "power" once for each
6754 * physical package.
6755 */
6756 continue;
6757 }
6758 7201
6759 sg->cpu_power += sd->groups->cpu_power; 7202 if (!first)
6760 } 7203 first = sg;
6761 sg = sg->next; 7204 if (last)
6762 } while (sg != group_head); 7205 last->next = sg;
7206 last = sg;
7207 last->next = first;
7208 }
7209 sd->groups = groups;
7210
7211 return 0;
7212
7213fail:
7214 free_sched_groups(first, 0);
7215
7216 return -ENOMEM;
6763} 7217}
6764 7218
6765static int build_numa_sched_groups(struct s_data *d, 7219static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6766 const struct cpumask *cpu_map, int num)
6767{ 7220{
6768 struct sched_domain *sd; 7221 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6769 struct sched_group *sg, *prev; 7222 struct sched_domain *child = sd->child;
6770 int n, j;
6771 7223
6772 cpumask_clear(d->covered); 7224 if (child)
6773 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); 7225 cpu = cpumask_first(sched_domain_span(child));
6774 if (cpumask_empty(d->nodemask)) { 7226
6775 d->sched_group_nodes[num] = NULL; 7227 if (sg) {
6776 goto out; 7228 *sg = *per_cpu_ptr(sdd->sg, cpu);
7229 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
7230 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
6777 } 7231 }
6778 7232
6779 sched_domain_node_span(num, d->domainspan); 7233 return cpu;
6780 cpumask_and(d->domainspan, d->domainspan, cpu_map); 7234}
6781 7235
6782 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 7236/*
6783 GFP_KERNEL, num); 7237 * build_sched_groups will build a circular linked list of the groups
6784 if (!sg) { 7238 * covered by the given span, and will set each group's ->cpumask correctly,
6785 printk(KERN_WARNING "Can not alloc domain group for node %d\n", 7239 * and ->cpu_power to 0.
6786 num); 7240 *
6787 return -ENOMEM; 7241 * Assumes the sched_domain tree is fully constructed
6788 } 7242 */
6789 d->sched_group_nodes[num] = sg; 7243static int
7244build_sched_groups(struct sched_domain *sd, int cpu)
7245{
7246 struct sched_group *first = NULL, *last = NULL;
7247 struct sd_data *sdd = sd->private;
7248 const struct cpumask *span = sched_domain_span(sd);
7249 struct cpumask *covered;
7250 int i;
6790 7251
6791 for_each_cpu(j, d->nodemask) { 7252 get_group(cpu, sdd, &sd->groups);
6792 sd = &per_cpu(node_domains, j).sd; 7253 atomic_inc(&sd->groups->ref);
6793 sd->groups = sg;
6794 }
6795 7254
6796 sg->cpu_power = 0; 7255 if (cpu != cpumask_first(sched_domain_span(sd)))
6797 cpumask_copy(sched_group_cpus(sg), d->nodemask); 7256 return 0;
6798 sg->next = sg;
6799 cpumask_or(d->covered, d->covered, d->nodemask);
6800 7257
6801 prev = sg; 7258 lockdep_assert_held(&sched_domains_mutex);
6802 for (j = 0; j < nr_node_ids; j++) { 7259 covered = sched_domains_tmpmask;
6803 n = (num + j) % nr_node_ids;
6804 cpumask_complement(d->notcovered, d->covered);
6805 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
6806 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
6807 if (cpumask_empty(d->tmpmask))
6808 break;
6809 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
6810 if (cpumask_empty(d->tmpmask))
6811 continue;
6812 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
6813 GFP_KERNEL, num);
6814 if (!sg) {
6815 printk(KERN_WARNING
6816 "Can not alloc domain group for node %d\n", j);
6817 return -ENOMEM;
6818 }
6819 sg->cpu_power = 0;
6820 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
6821 sg->next = prev->next;
6822 cpumask_or(d->covered, d->covered, d->tmpmask);
6823 prev->next = sg;
6824 prev = sg;
6825 }
6826out:
6827 return 0;
6828}
6829#endif /* CONFIG_NUMA */
6830 7260
6831#ifdef CONFIG_NUMA 7261 cpumask_clear(covered);
6832/* Free memory allocated for various sched_group structures */
6833static void free_sched_groups(const struct cpumask *cpu_map,
6834 struct cpumask *nodemask)
6835{
6836 int cpu, i;
6837 7262
6838 for_each_cpu(cpu, cpu_map) { 7263 for_each_cpu(i, span) {
6839 struct sched_group **sched_group_nodes 7264 struct sched_group *sg;
6840 = sched_group_nodes_bycpu[cpu]; 7265 int group = get_group(i, sdd, &sg);
7266 int j;
6841 7267
6842 if (!sched_group_nodes) 7268 if (cpumask_test_cpu(i, covered))
6843 continue; 7269 continue;
6844 7270
6845 for (i = 0; i < nr_node_ids; i++) { 7271 cpumask_clear(sched_group_cpus(sg));
6846 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7272 sg->sgp->power = 0;
6847 7273
6848 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 7274 for_each_cpu(j, span) {
6849 if (cpumask_empty(nodemask)) 7275 if (get_group(j, sdd, NULL) != group)
6850 continue; 7276 continue;
6851 7277
6852 if (sg == NULL) 7278 cpumask_set_cpu(j, covered);
6853 continue; 7279 cpumask_set_cpu(j, sched_group_cpus(sg));
6854 sg = sg->next;
6855next_sg:
6856 oldsg = sg;
6857 sg = sg->next;
6858 kfree(oldsg);
6859 if (oldsg != sched_group_nodes[i])
6860 goto next_sg;
6861 } 7280 }
6862 kfree(sched_group_nodes); 7281
6863 sched_group_nodes_bycpu[cpu] = NULL; 7282 if (!first)
7283 first = sg;
7284 if (last)
7285 last->next = sg;
7286 last = sg;
6864 } 7287 }
7288 last->next = first;
7289
7290 return 0;
6865} 7291}
6866#else /* !CONFIG_NUMA */
6867static void free_sched_groups(const struct cpumask *cpu_map,
6868 struct cpumask *nodemask)
6869{
6870}
6871#endif /* CONFIG_NUMA */
6872 7292
6873/* 7293/*
6874 * Initialize sched groups cpu_power. 7294 * Initialize sched groups cpu_power.
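build_sched_groups() above walks the domain span, creates one group per representative CPU, records which CPUs are already covered, and finally closes the list with last->next = first so the balancer can rotate through the groups. A userspace sketch of just that circular-list construction; the toy sibling rule and bitmask "cpumasks" are invented for illustration:

#include <stdio.h>
#include <stdlib.h>

#define TOY_NR_CPUS 8

struct toy_group {
	unsigned int cpus;        /* member CPUs of this group */
	struct toy_group *next;   /* circular list             */
};

/* Build one group per pair of sibling CPUs (cpu, cpu^1) over 'span' and
 * link the groups into a circle, mimicking build_sched_groups(). */
static struct toy_group *toy_build_groups(unsigned int span)
{
	struct toy_group *first = NULL, *last = NULL;
	unsigned int covered = 0;

	for (int cpu = 0; cpu < TOY_NR_CPUS; cpu++) {
		struct toy_group *sg;

		if (!(span & (1u << cpu)) || (covered & (1u << cpu)))
			continue;

		sg = calloc(1, sizeof(*sg));
		if (!sg)
			exit(1);
		sg->cpus = ((1u << cpu) | (1u << (cpu ^ 1))) & span;
		covered |= sg->cpus;

		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
	}
	if (last)
		last->next = first;   /* close the circle */
	return first;
}

int main(void)
{
	struct toy_group *g = toy_build_groups(0x3f);   /* CPUs 0-5 */
	struct toy_group *it = g;

	do {
		printf("group cpus: 0x%x\n", it->cpus);
		it = it->next;
	} while (it != g);
	return 0;
}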
@@ -6882,46 +7302,19 @@ static void free_sched_groups(const struct cpumask *cpu_map,
6882 */ 7302 */
6883static void init_sched_groups_power(int cpu, struct sched_domain *sd) 7303static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6884{ 7304{
6885 struct sched_domain *child; 7305 struct sched_group *sg = sd->groups;
6886 struct sched_group *group;
6887 long power;
6888 int weight;
6889
6890 WARN_ON(!sd || !sd->groups);
6891
6892 if (cpu != group_first_cpu(sd->groups))
6893 return;
6894 7306
6895 child = sd->child; 7307 WARN_ON(!sd || !sg);
6896 7308
6897 sd->groups->cpu_power = 0; 7309 do {
7310 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
7311 sg = sg->next;
7312 } while (sg != sd->groups);
6898 7313
6899 if (!child) { 7314 if (cpu != group_first_cpu(sg))
6900 power = SCHED_LOAD_SCALE;
6901 weight = cpumask_weight(sched_domain_span(sd));
6902 /*
6903 * SMT siblings share the power of a single core.
6904 * Usually multiple threads get a better yield out of
6905 * that one core than a single thread would have,
6906 * reflect that in sd->smt_gain.
6907 */
6908 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
6909 power *= sd->smt_gain;
6910 power /= weight;
6911 power >>= SCHED_LOAD_SHIFT;
6912 }
6913 sd->groups->cpu_power += power;
6914 return; 7315 return;
6915 }
6916 7316
6917 /* 7317 update_group_power(sd, cpu);
6918 * Add cpu_power of each child group to this groups cpu_power.
6919 */
6920 group = child->groups;
6921 do {
6922 sd->groups->cpu_power += group->cpu_power;
6923 group = group->next;
6924 } while (group != child->groups);
6925} 7318}
6926 7319
6927/* 7320/*
@@ -6935,15 +7328,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6935# define SD_INIT_NAME(sd, type) do { } while (0) 7328# define SD_INIT_NAME(sd, type) do { } while (0)
6936#endif 7329#endif
6937 7330
6938#define SD_INIT(sd, type) sd_init_##type(sd) 7331#define SD_INIT_FUNC(type) \
6939 7332static noinline struct sched_domain * \
6940#define SD_INIT_FUNC(type) \ 7333sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
6941static noinline void sd_init_##type(struct sched_domain *sd) \ 7334{ \
6942{ \ 7335 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
6943 memset(sd, 0, sizeof(*sd)); \ 7336 *sd = SD_##type##_INIT; \
6944 *sd = SD_##type##_INIT; \ 7337 SD_INIT_NAME(sd, type); \
6945 sd->level = SD_LV_##type; \ 7338 sd->private = &tl->data; \
6946 SD_INIT_NAME(sd, type); \ 7339 return sd; \
6947} 7340}
6948 7341
6949SD_INIT_FUNC(CPU) 7342SD_INIT_FUNC(CPU)
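SD_INIT_FUNC() now stamps out one sd_init_<type>() per topology level via token pasting, each returning the per-CPU sched_domain it initialized. The same generate-a-function-per-type technique in a self-contained example with invented types and initializers:

#include <stdio.h>

struct toy_domain {
	const char *name;
	int flags;
};

/* Token-pasting generator: one init function per topology "type",
 * mirroring how SD_INIT_FUNC(SIBLING|MC|CPU|...) is used. */
#define TOY_INIT_FUNC(type, fl)						\
static struct toy_domain toy_init_##type(void)				\
{									\
	struct toy_domain d = { .name = #type, .flags = (fl) };		\
	return d;							\
}

TOY_INIT_FUNC(SIBLING, 0x1)
TOY_INIT_FUNC(MC,      0x2)
TOY_INIT_FUNC(CPU,     0x4)

int main(void)
{
	struct toy_domain levels[] = {
		toy_init_SIBLING(), toy_init_MC(), toy_init_CPU(),
	};

	for (unsigned int i = 0; i < sizeof(levels) / sizeof(levels[0]); i++)
		printf("%s: flags=0x%x\n", levels[i].name, levels[i].flags);
	return 0;
}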
@@ -6957,15 +7350,19 @@ SD_INIT_FUNC(CPU)
6957#ifdef CONFIG_SCHED_MC 7350#ifdef CONFIG_SCHED_MC
6958 SD_INIT_FUNC(MC) 7351 SD_INIT_FUNC(MC)
6959#endif 7352#endif
7353#ifdef CONFIG_SCHED_BOOK
7354 SD_INIT_FUNC(BOOK)
7355#endif
6960 7356
6961static int default_relax_domain_level = -1; 7357static int default_relax_domain_level = -1;
7358int sched_domain_level_max;
6962 7359
6963static int __init setup_relax_domain_level(char *str) 7360static int __init setup_relax_domain_level(char *str)
6964{ 7361{
6965 unsigned long val; 7362 unsigned long val;
6966 7363
6967 val = simple_strtoul(str, NULL, 0); 7364 val = simple_strtoul(str, NULL, 0);
6968 if (val < SD_LV_MAX) 7365 if (val < sched_domain_level_max)
6969 default_relax_domain_level = val; 7366 default_relax_domain_level = val;
6970 7367
6971 return 1; 7368 return 1;
@@ -6993,35 +7390,20 @@ static void set_domain_attribute(struct sched_domain *sd,
6993 } 7390 }
6994} 7391}
6995 7392
7393static void __sdt_free(const struct cpumask *cpu_map);
7394static int __sdt_alloc(const struct cpumask *cpu_map);
7395
6996static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 7396static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6997 const struct cpumask *cpu_map) 7397 const struct cpumask *cpu_map)
6998{ 7398{
6999 switch (what) { 7399 switch (what) {
7000 case sa_sched_groups:
7001 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
7002 d->sched_group_nodes = NULL;
7003 case sa_rootdomain: 7400 case sa_rootdomain:
7004 free_rootdomain(d->rd); /* fall through */ 7401 if (!atomic_read(&d->rd->refcount))
7005 case sa_tmpmask: 7402 free_rootdomain(&d->rd->rcu); /* fall through */
7006 free_cpumask_var(d->tmpmask); /* fall through */ 7403 case sa_sd:
7007 case sa_send_covered: 7404 free_percpu(d->sd); /* fall through */
7008 free_cpumask_var(d->send_covered); /* fall through */ 7405 case sa_sd_storage:
7009 case sa_this_core_map: 7406 __sdt_free(cpu_map); /* fall through */
7010 free_cpumask_var(d->this_core_map); /* fall through */
7011 case sa_this_sibling_map:
7012 free_cpumask_var(d->this_sibling_map); /* fall through */
7013 case sa_nodemask:
7014 free_cpumask_var(d->nodemask); /* fall through */
7015 case sa_sched_group_nodes:
7016#ifdef CONFIG_NUMA
7017 kfree(d->sched_group_nodes); /* fall through */
7018 case sa_notcovered:
7019 free_cpumask_var(d->notcovered); /* fall through */
7020 case sa_covered:
7021 free_cpumask_var(d->covered); /* fall through */
7022 case sa_domainspan:
7023 free_cpumask_var(d->domainspan); /* fall through */
7024#endif
7025 case sa_none: 7407 case sa_none:
7026 break; 7408 break;
7027 } 7409 }
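__free_domain_allocs() keeps the "how far did allocation get" enum and relies on deliberate switch fall-through: passing the deepest stage reached unwinds everything allocated before the failure and nothing else. A self-contained sketch of that staged-unwind idiom with an invented three-stage allocator; the stage names and semantics are simplified, not the kernel's:

#include <stdio.h>
#include <stdlib.h>

enum toy_alloc { TOY_NONE, TOY_BUF_A, TOY_BUF_B, TOY_ALL };

struct toy_ctx {
	void *a;
	void *b;
};

/* Unwind from the deepest reached stage downwards: each case frees its
 * own allocation and falls through to the earlier stages. */
static void toy_free(struct toy_ctx *c, enum toy_alloc reached)
{
	switch (reached) {
	case TOY_ALL:
	case TOY_BUF_B:
		free(c->b);          /* fall through */
	case TOY_BUF_A:
		free(c->a);          /* fall through */
	case TOY_NONE:
		break;
	}
}

static enum toy_alloc toy_alloc_all(struct toy_ctx *c, int fail_b)
{
	c->a = malloc(32);
	if (!c->a)
		return TOY_NONE;
	c->b = fail_b ? NULL : malloc(64);
	if (!c->b)
		return TOY_BUF_A;    /* report how far we got */
	return TOY_ALL;
}

int main(void)
{
	struct toy_ctx c = { 0 };
	enum toy_alloc got = toy_alloc_all(&c, 1 /* force the second stage to fail */);

	if (got != TOY_ALL)
		toy_free(&c, got);   /* frees only what was actually allocated */
	printf("reached stage %d, cleaned up\n", got);
	return 0;
}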
@@ -7030,270 +7412,233 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7030static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 7412static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7031 const struct cpumask *cpu_map) 7413 const struct cpumask *cpu_map)
7032{ 7414{
7033#ifdef CONFIG_NUMA 7415 memset(d, 0, sizeof(*d));
7034 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) 7416
7035 return sa_none; 7417 if (__sdt_alloc(cpu_map))
7036 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) 7418 return sa_sd_storage;
7037 return sa_domainspan; 7419 d->sd = alloc_percpu(struct sched_domain *);
7038 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) 7420 if (!d->sd)
7039 return sa_covered; 7421 return sa_sd_storage;
7040 /* Allocate the per-node list of sched groups */
7041 d->sched_group_nodes = kcalloc(nr_node_ids,
7042 sizeof(struct sched_group *), GFP_KERNEL);
7043 if (!d->sched_group_nodes) {
7044 printk(KERN_WARNING "Can not alloc sched group node list\n");
7045 return sa_notcovered;
7046 }
7047 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
7048#endif
7049 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
7050 return sa_sched_group_nodes;
7051 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
7052 return sa_nodemask;
7053 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
7054 return sa_this_sibling_map;
7055 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7056 return sa_this_core_map;
7057 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
7058 return sa_send_covered;
7059 d->rd = alloc_rootdomain(); 7422 d->rd = alloc_rootdomain();
7060 if (!d->rd) { 7423 if (!d->rd)
7061 printk(KERN_WARNING "Cannot alloc root domain\n"); 7424 return sa_sd;
7062 return sa_tmpmask;
7063 }
7064 return sa_rootdomain; 7425 return sa_rootdomain;
7065} 7426}
7066 7427
7067static struct sched_domain *__build_numa_sched_domains(struct s_data *d, 7428/*
7068 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) 7429 * NULL the sd_data elements we've used to build the sched_domain and
7430 * sched_group structure so that the subsequent __free_domain_allocs()
7431 * will not free the data we're using.
7432 */
7433static void claim_allocations(int cpu, struct sched_domain *sd)
7069{ 7434{
7070 struct sched_domain *sd = NULL; 7435 struct sd_data *sdd = sd->private;
7071#ifdef CONFIG_NUMA
7072 struct sched_domain *parent;
7073
7074 d->sd_allnodes = 0;
7075 if (cpumask_weight(cpu_map) >
7076 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
7077 sd = &per_cpu(allnodes_domains, i).sd;
7078 SD_INIT(sd, ALLNODES);
7079 set_domain_attribute(sd, attr);
7080 cpumask_copy(sched_domain_span(sd), cpu_map);
7081 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
7082 d->sd_allnodes = 1;
7083 }
7084 parent = sd;
7085
7086 sd = &per_cpu(node_domains, i).sd;
7087 SD_INIT(sd, NODE);
7088 set_domain_attribute(sd, attr);
7089 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7090 sd->parent = parent;
7091 if (parent)
7092 parent->child = sd;
7093 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
7094#endif
7095 return sd;
7096}
7097 7436
7098static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, 7437 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
7099 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7438 *per_cpu_ptr(sdd->sd, cpu) = NULL;
7100 struct sched_domain *parent, int i)
7101{
7102 struct sched_domain *sd;
7103 sd = &per_cpu(phys_domains, i).sd;
7104 SD_INIT(sd, CPU);
7105 set_domain_attribute(sd, attr);
7106 cpumask_copy(sched_domain_span(sd), d->nodemask);
7107 sd->parent = parent;
7108 if (parent)
7109 parent->child = sd;
7110 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
7111 return sd;
7112}
7113 7439
7114static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7440 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
7115 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7441 *per_cpu_ptr(sdd->sg, cpu) = NULL;
7116 struct sched_domain *parent, int i) 7442
7117{ 7443 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
7118 struct sched_domain *sd = parent; 7444 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
7119#ifdef CONFIG_SCHED_MC
7120 sd = &per_cpu(core_domains, i).sd;
7121 SD_INIT(sd, MC);
7122 set_domain_attribute(sd, attr);
7123 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
7124 sd->parent = parent;
7125 parent->child = sd;
7126 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
7127#endif
7128 return sd;
7129} 7445}
7130 7446
7131static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
7132 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7133 struct sched_domain *parent, int i)
7134{
7135 struct sched_domain *sd = parent;
7136#ifdef CONFIG_SCHED_SMT 7447#ifdef CONFIG_SCHED_SMT
7137 sd = &per_cpu(cpu_domains, i).sd; 7448static const struct cpumask *cpu_smt_mask(int cpu)
7138 SD_INIT(sd, SIBLING); 7449{
7139 set_domain_attribute(sd, attr); 7450 return topology_thread_cpumask(cpu);
7140 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
7141 sd->parent = parent;
7142 parent->child = sd;
7143 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
7144#endif
7145 return sd;
7146} 7451}
7452#endif
7147 7453
7148static void build_sched_groups(struct s_data *d, enum sched_domain_level l, 7454/*
7149 const struct cpumask *cpu_map, int cpu) 7455 * Topology list, bottom-up.
7150{ 7456 */
7151 switch (l) { 7457static struct sched_domain_topology_level default_topology[] = {
7152#ifdef CONFIG_SCHED_SMT 7458#ifdef CONFIG_SCHED_SMT
7153 case SD_LV_SIBLING: /* set up CPU (sibling) groups */ 7459 { sd_init_SIBLING, cpu_smt_mask, },
7154 cpumask_and(d->this_sibling_map, cpu_map,
7155 topology_thread_cpumask(cpu));
7156 if (cpu == cpumask_first(d->this_sibling_map))
7157 init_sched_build_groups(d->this_sibling_map, cpu_map,
7158 &cpu_to_cpu_group,
7159 d->send_covered, d->tmpmask);
7160 break;
7161#endif 7460#endif
7162#ifdef CONFIG_SCHED_MC 7461#ifdef CONFIG_SCHED_MC
7163 case SD_LV_MC: /* set up multi-core groups */ 7462 { sd_init_MC, cpu_coregroup_mask, },
7164 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
7165 if (cpu == cpumask_first(d->this_core_map))
7166 init_sched_build_groups(d->this_core_map, cpu_map,
7167 &cpu_to_core_group,
7168 d->send_covered, d->tmpmask);
7169 break;
7170#endif 7463#endif
7171 case SD_LV_CPU: /* set up physical groups */ 7464#ifdef CONFIG_SCHED_BOOK
7172 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); 7465 { sd_init_BOOK, cpu_book_mask, },
7173 if (!cpumask_empty(d->nodemask)) 7466#endif
7174 init_sched_build_groups(d->nodemask, cpu_map, 7467 { sd_init_CPU, cpu_cpu_mask, },
7175 &cpu_to_phys_group,
7176 d->send_covered, d->tmpmask);
7177 break;
7178#ifdef CONFIG_NUMA 7468#ifdef CONFIG_NUMA
7179 case SD_LV_ALLNODES: 7469 { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
7180 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, 7470 { sd_init_ALLNODES, cpu_allnodes_mask, },
7181 d->send_covered, d->tmpmask);
7182 break;
7183#endif 7471#endif
7184 default: 7472 { NULL, },
7185 break; 7473};
7474
7475static struct sched_domain_topology_level *sched_domain_topology = default_topology;
7476
7477static int __sdt_alloc(const struct cpumask *cpu_map)
7478{
7479 struct sched_domain_topology_level *tl;
7480 int j;
7481
7482 for (tl = sched_domain_topology; tl->init; tl++) {
7483 struct sd_data *sdd = &tl->data;
7484
7485 sdd->sd = alloc_percpu(struct sched_domain *);
7486 if (!sdd->sd)
7487 return -ENOMEM;
7488
7489 sdd->sg = alloc_percpu(struct sched_group *);
7490 if (!sdd->sg)
7491 return -ENOMEM;
7492
7493 sdd->sgp = alloc_percpu(struct sched_group_power *);
7494 if (!sdd->sgp)
7495 return -ENOMEM;
7496
7497 for_each_cpu(j, cpu_map) {
7498 struct sched_domain *sd;
7499 struct sched_group *sg;
7500 struct sched_group_power *sgp;
7501
7502 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
7503 GFP_KERNEL, cpu_to_node(j));
7504 if (!sd)
7505 return -ENOMEM;
7506
7507 *per_cpu_ptr(sdd->sd, j) = sd;
7508
7509 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7510 GFP_KERNEL, cpu_to_node(j));
7511 if (!sg)
7512 return -ENOMEM;
7513
7514 *per_cpu_ptr(sdd->sg, j) = sg;
7515
7516 sgp = kzalloc_node(sizeof(struct sched_group_power),
7517 GFP_KERNEL, cpu_to_node(j));
7518 if (!sgp)
7519 return -ENOMEM;
7520
7521 *per_cpu_ptr(sdd->sgp, j) = sgp;
7522 }
7523 }
7524
7525 return 0;
7526}
7527
7528static void __sdt_free(const struct cpumask *cpu_map)
7529{
7530 struct sched_domain_topology_level *tl;
7531 int j;
7532
7533 for (tl = sched_domain_topology; tl->init; tl++) {
7534 struct sd_data *sdd = &tl->data;
7535
7536 for_each_cpu(j, cpu_map) {
7537 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
7538 if (sd && (sd->flags & SD_OVERLAP))
7539 free_sched_groups(sd->groups, 0);
7540 kfree(*per_cpu_ptr(sdd->sg, j));
7541 kfree(*per_cpu_ptr(sdd->sgp, j));
7542 }
7543 free_percpu(sdd->sd);
7544 free_percpu(sdd->sg);
7545 free_percpu(sdd->sgp);
7546 }
7547}
7548
7549struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
7550 struct s_data *d, const struct cpumask *cpu_map,
7551 struct sched_domain_attr *attr, struct sched_domain *child,
7552 int cpu)
7553{
7554 struct sched_domain *sd = tl->init(tl, cpu);
7555 if (!sd)
7556 return child;
7557
7558 set_domain_attribute(sd, attr);
7559 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
7560 if (child) {
7561 sd->level = child->level + 1;
7562 sched_domain_level_max = max(sched_domain_level_max, sd->level);
7563 child->parent = sd;
7186 } 7564 }
7565 sd->child = child;
7566
7567 return sd;
7187} 7568}
7188 7569
7189/* 7570/*
7190 * Build sched domains for a given set of cpus and attach the sched domains 7571 * Build sched domains for a given set of cpus and attach the sched domains
7191 * to the individual cpus 7572 * to the individual cpus
7192 */ 7573 */
7193static int __build_sched_domains(const struct cpumask *cpu_map, 7574static int build_sched_domains(const struct cpumask *cpu_map,
7194 struct sched_domain_attr *attr) 7575 struct sched_domain_attr *attr)
7195{ 7576{
7196 enum s_alloc alloc_state = sa_none; 7577 enum s_alloc alloc_state = sa_none;
7197 struct s_data d;
7198 struct sched_domain *sd; 7578 struct sched_domain *sd;
7199 int i; 7579 struct s_data d;
7200#ifdef CONFIG_NUMA 7580 int i, ret = -ENOMEM;
7201 d.sd_allnodes = 0;
7202#endif
7203 7581
7204 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7582 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7205 if (alloc_state != sa_rootdomain) 7583 if (alloc_state != sa_rootdomain)
7206 goto error; 7584 goto error;
7207 alloc_state = sa_sched_groups;
7208
7209 /*
7210 * Set up domains for cpus specified by the cpu_map.
7211 */
7212 for_each_cpu(i, cpu_map) {
7213 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
7214 cpu_map);
7215
7216 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
7217 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
7218 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7219 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7220 }
7221 7585
7586 /* Set up domains for cpus specified by the cpu_map. */
7222 for_each_cpu(i, cpu_map) { 7587 for_each_cpu(i, cpu_map) {
7223 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); 7588 struct sched_domain_topology_level *tl;
7224 build_sched_groups(&d, SD_LV_MC, cpu_map, i); 7589
7225 } 7590 sd = NULL;
7226 7591 for (tl = sched_domain_topology; tl->init; tl++) {
7227 /* Set up physical groups */ 7592 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
7228 for (i = 0; i < nr_node_ids; i++) 7593 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
7229 build_sched_groups(&d, SD_LV_CPU, cpu_map, i); 7594 sd->flags |= SD_OVERLAP;
7230 7595 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
7231#ifdef CONFIG_NUMA 7596 break;
7232 /* Set up node groups */ 7597 }
7233 if (d.sd_allnodes)
7234 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
7235 7598
7236 for (i = 0; i < nr_node_ids; i++) 7599 while (sd->child)
7237 if (build_numa_sched_groups(&d, cpu_map, i)) 7600 sd = sd->child;
7238 goto error;
7239#endif
7240 7601
7241 /* Calculate CPU power for physical packages and nodes */ 7602 *per_cpu_ptr(d.sd, i) = sd;
7242#ifdef CONFIG_SCHED_SMT
7243 for_each_cpu(i, cpu_map) {
7244 sd = &per_cpu(cpu_domains, i).sd;
7245 init_sched_groups_power(i, sd);
7246 }
7247#endif
7248#ifdef CONFIG_SCHED_MC
7249 for_each_cpu(i, cpu_map) {
7250 sd = &per_cpu(core_domains, i).sd;
7251 init_sched_groups_power(i, sd);
7252 } 7603 }
7253#endif
7254 7604
7605 /* Build the groups for the domains */
7255 for_each_cpu(i, cpu_map) { 7606 for_each_cpu(i, cpu_map) {
7256 sd = &per_cpu(phys_domains, i).sd; 7607 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7257 init_sched_groups_power(i, sd); 7608 sd->span_weight = cpumask_weight(sched_domain_span(sd));
7609 if (sd->flags & SD_OVERLAP) {
7610 if (build_overlap_sched_groups(sd, i))
7611 goto error;
7612 } else {
7613 if (build_sched_groups(sd, i))
7614 goto error;
7615 }
7616 }
7258 } 7617 }
7259 7618
7260#ifdef CONFIG_NUMA 7619 /* Calculate CPU power for physical packages and nodes */
7261 for (i = 0; i < nr_node_ids; i++) 7620 for (i = nr_cpumask_bits-1; i >= 0; i--) {
7262 init_numa_sched_groups_power(d.sched_group_nodes[i]); 7621 if (!cpumask_test_cpu(i, cpu_map))
7263 7622 continue;
7264 if (d.sd_allnodes) {
7265 struct sched_group *sg;
7266 7623
7267 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 7624 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7268 d.tmpmask); 7625 claim_allocations(i, sd);
7269 init_numa_sched_groups_power(sg); 7626 init_sched_groups_power(i, sd);
7627 }
7270 } 7628 }
7271#endif
7272 7629
7273 /* Attach the domains */ 7630 /* Attach the domains */
7631 rcu_read_lock();
7274 for_each_cpu(i, cpu_map) { 7632 for_each_cpu(i, cpu_map) {
7275#ifdef CONFIG_SCHED_SMT 7633 sd = *per_cpu_ptr(d.sd, i);
7276 sd = &per_cpu(cpu_domains, i).sd;
7277#elif defined(CONFIG_SCHED_MC)
7278 sd = &per_cpu(core_domains, i).sd;
7279#else
7280 sd = &per_cpu(phys_domains, i).sd;
7281#endif
7282 cpu_attach_domain(sd, d.rd, i); 7634 cpu_attach_domain(sd, d.rd, i);
7283 } 7635 }
7636 rcu_read_unlock();
7284 7637
7285 d.sched_group_nodes = NULL; /* don't free this we still need it */ 7638 ret = 0;
7286 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
7287 return 0;
7288
7289error: 7639error:
7290 __free_domain_allocs(&d, alloc_state, cpu_map); 7640 __free_domain_allocs(&d, alloc_state, cpu_map);
7291 return -ENOMEM; 7641 return ret;
7292}
7293
7294static int build_sched_domains(const struct cpumask *cpu_map)
7295{
7296 return __build_sched_domains(cpu_map, NULL);
7297} 7642}
7298 7643
7299static cpumask_var_t *doms_cur; /* current sched domains */ 7644static cpumask_var_t *doms_cur; /* current sched domains */
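The rewritten build_sched_domains() above is table-driven: it walks the NULL-terminated topology array bottom-up, builds one domain per level, and chains each new level to its child before attaching the result per CPU. A hedged userspace sketch of that construction loop, with invented level names and none of the cpumask, group, or power handling:

#include <stdio.h>
#include <stdlib.h>

struct toy_domain {
	const char *name;
	struct toy_domain *parent, *child;
};

/* One entry per topology level, bottom-up, terminated by a NULL name --
 * a stand-in for sched_domain_topology_level / default_topology. */
struct toy_level {
	const char *name;
};

static const struct toy_level toy_topology[] = {
	{ "SMT" }, { "MC" }, { "CPU" }, { "NODE" }, { NULL },
};

static struct toy_domain *toy_build_domains(void)
{
	struct toy_domain *sd = NULL;

	for (const struct toy_level *tl = toy_topology; tl->name; tl++) {
		struct toy_domain *d = calloc(1, sizeof(*d));

		if (!d)
			exit(1);
		d->name = tl->name;
		d->child = sd;            /* previous (lower) level */
		if (sd)
			sd->parent = d;
		sd = d;
	}

	while (sd && sd->child)
		sd = sd->child;           /* hand back the lowest level, like d.sd[i] */
	return sd;
}

int main(void)
{
	for (struct toy_domain *d = toy_build_domains(); d; d = d->parent)
		printf("level: %s\n", d->name);
	return 0;
}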
@@ -7348,7 +7693,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7348 * For now this just excludes isolated cpus, but could be used to 7693 * For now this just excludes isolated cpus, but could be used to
7349 * exclude other special cases in the future. 7694 * exclude other special cases in the future.
7350 */ 7695 */
7351static int arch_init_sched_domains(const struct cpumask *cpu_map) 7696static int init_sched_domains(const struct cpumask *cpu_map)
7352{ 7697{
7353 int err; 7698 int err;
7354 7699
@@ -7359,32 +7704,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
7359 doms_cur = &fallback_doms; 7704 doms_cur = &fallback_doms;
7360 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 7705 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7361 dattr_cur = NULL; 7706 dattr_cur = NULL;
7362 err = build_sched_domains(doms_cur[0]); 7707 err = build_sched_domains(doms_cur[0], NULL);
7363 register_sched_domain_sysctl(); 7708 register_sched_domain_sysctl();
7364 7709
7365 return err; 7710 return err;
7366} 7711}
7367 7712
7368static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7369 struct cpumask *tmpmask)
7370{
7371 free_sched_groups(cpu_map, tmpmask);
7372}
7373
7374/* 7713/*
7375 * Detach sched domains from a group of cpus specified in cpu_map 7714 * Detach sched domains from a group of cpus specified in cpu_map
7376 * These cpus will now be attached to the NULL domain 7715 * These cpus will now be attached to the NULL domain
7377 */ 7716 */
7378static void detach_destroy_domains(const struct cpumask *cpu_map) 7717static void detach_destroy_domains(const struct cpumask *cpu_map)
7379{ 7718{
7380 /* Save because hotplug lock held. */
7381 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7382 int i; 7719 int i;
7383 7720
7721 rcu_read_lock();
7384 for_each_cpu(i, cpu_map) 7722 for_each_cpu(i, cpu_map)
7385 cpu_attach_domain(NULL, &def_root_domain, i); 7723 cpu_attach_domain(NULL, &def_root_domain, i);
7386 synchronize_sched(); 7724 rcu_read_unlock();
7387 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7388} 7725}
7389 7726
7390/* handle null as "default" */ 7727/* handle null as "default" */
@@ -7473,8 +7810,7 @@ match1:
7473 goto match2; 7810 goto match2;
7474 } 7811 }
7475 /* no match - add a new doms_new */ 7812 /* no match - add a new doms_new */
7476 __build_sched_domains(doms_new[i], 7813 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
7477 dattr_new ? dattr_new + i : NULL);
7478match2: 7814match2:
7479 ; 7815 ;
7480 } 7816 }
@@ -7493,7 +7829,7 @@ match2:
7493} 7829}
7494 7830
7495#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7831#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7496static void arch_reinit_sched_domains(void) 7832static void reinit_sched_domains(void)
7497{ 7833{
7498 get_online_cpus(); 7834 get_online_cpus();
7499 7835
@@ -7526,7 +7862,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7526 else 7862 else
7527 sched_mc_power_savings = level; 7863 sched_mc_power_savings = level;
7528 7864
7529 arch_reinit_sched_domains(); 7865 reinit_sched_domains();
7530 7866
7531 return count; 7867 return count;
7532} 7868}
@@ -7645,14 +7981,9 @@ void __init sched_init_smp(void)
7645 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7981 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7646 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7982 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7647 7983
7648#if defined(CONFIG_NUMA)
7649 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7650 GFP_KERNEL);
7651 BUG_ON(sched_group_nodes_bycpu == NULL);
7652#endif
7653 get_online_cpus(); 7984 get_online_cpus();
7654 mutex_lock(&sched_domains_mutex); 7985 mutex_lock(&sched_domains_mutex);
7655 arch_init_sched_domains(cpu_active_mask); 7986 init_sched_domains(cpu_active_mask);
7656 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7987 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7657 if (cpumask_empty(non_isolated_cpus)) 7988 if (cpumask_empty(non_isolated_cpus))
7658 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7989 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
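sched_init_smp() keeps deriving the CPUs eligible for load balancing by subtracting cpu_isolated_map from cpu_possible_mask, falling back to the current CPU if nothing is left. A toy bitmask version of that andnot-plus-fallback step (plain C, not the kernel cpumask API):

#include <stdio.h>

int main(void)
{
	unsigned long possible = 0xf;        /* CPUs 0-3 exist */
	unsigned long isolated = 0x6;        /* CPUs 1-2 kept out of balancing */
	unsigned long self     = 1ul << 0;   /* smp_processor_id() stand-in */

	unsigned long non_isolated = possible & ~isolated;  /* cpumask_andnot() */
	if (!non_isolated)                   /* cpumask_empty() fallback */
		non_isolated = self;

	printf("non-isolated mask: 0x%lx\n", non_isolated);
	return 0;
}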
@@ -7697,8 +8028,15 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7697 INIT_LIST_HEAD(&cfs_rq->tasks); 8028 INIT_LIST_HEAD(&cfs_rq->tasks);
7698#ifdef CONFIG_FAIR_GROUP_SCHED 8029#ifdef CONFIG_FAIR_GROUP_SCHED
7699 cfs_rq->rq = rq; 8030 cfs_rq->rq = rq;
8031 /* allow initial update_cfs_load() to truncate */
8032#ifdef CONFIG_SMP
8033 cfs_rq->load_stamp = 1;
8034#endif
7700#endif 8035#endif
7701 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 8036 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8037#ifndef CONFIG_64BIT
8038 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8039#endif
7702} 8040}
7703 8041
7704static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) 8042static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
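init_cfs_rq() now also seeds min_vruntime_copy when !CONFIG_64BIT. The extra copy presumably lets 32-bit readers detect a torn 64-bit update: the writer stores the value and then the copy, and a reader retries until both loads agree. A rough user-space model of that discipline (the barrier and retry shape is an assumption here, not taken from this diff):

/* Toy model of a two-copy 64-bit read on a 32-bit machine. */
#include <stdint.h>
#include <stdio.h>

struct toy_cfs_rq {
	uint64_t min_vruntime;
	uint64_t min_vruntime_copy;   /* written after min_vruntime */
};

static void writer_update(struct toy_cfs_rq *rq, uint64_t v)
{
	rq->min_vruntime = v;
	__sync_synchronize();         /* smp_wmb() stand-in */
	rq->min_vruntime_copy = v;
}

static uint64_t reader_load(const struct toy_cfs_rq *rq)
{
	uint64_t copy, val;

	do {                          /* retry until both halves agree */
		copy = rq->min_vruntime_copy;
		__sync_synchronize(); /* smp_rmb() stand-in */
		val = rq->min_vruntime;
	} while (val != copy);
	return val;
}

int main(void)
{
	struct toy_cfs_rq rq = { 0, 0 };

	writer_update(&rq, 12345);
	printf("read back %llu\n", (unsigned long long)reader_load(&rq));
	return 0;
}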
@@ -7739,18 +8077,16 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7739 8077
7740#ifdef CONFIG_FAIR_GROUP_SCHED 8078#ifdef CONFIG_FAIR_GROUP_SCHED
7741static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 8079static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7742 struct sched_entity *se, int cpu, int add, 8080 struct sched_entity *se, int cpu,
7743 struct sched_entity *parent) 8081 struct sched_entity *parent)
7744{ 8082{
7745 struct rq *rq = cpu_rq(cpu); 8083 struct rq *rq = cpu_rq(cpu);
7746 tg->cfs_rq[cpu] = cfs_rq; 8084 tg->cfs_rq[cpu] = cfs_rq;
7747 init_cfs_rq(cfs_rq, rq); 8085 init_cfs_rq(cfs_rq, rq);
7748 cfs_rq->tg = tg; 8086 cfs_rq->tg = tg;
7749 if (add)
7750 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7751 8087
7752 tg->se[cpu] = se; 8088 tg->se[cpu] = se;
7753 /* se could be NULL for init_task_group */ 8089 /* se could be NULL for root_task_group */
7754 if (!se) 8090 if (!se)
7755 return; 8091 return;
7756 8092
@@ -7760,15 +8096,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7760 se->cfs_rq = parent->my_q; 8096 se->cfs_rq = parent->my_q;
7761 8097
7762 se->my_q = cfs_rq; 8098 se->my_q = cfs_rq;
7763 se->load.weight = tg->shares; 8099 update_load_set(&se->load, 0);
7764 se->load.inv_weight = 0;
7765 se->parent = parent; 8100 se->parent = parent;
7766} 8101}
7767#endif 8102#endif
7768 8103
7769#ifdef CONFIG_RT_GROUP_SCHED 8104#ifdef CONFIG_RT_GROUP_SCHED
7770static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 8105static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7771 struct sched_rt_entity *rt_se, int cpu, int add, 8106 struct sched_rt_entity *rt_se, int cpu,
7772 struct sched_rt_entity *parent) 8107 struct sched_rt_entity *parent)
7773{ 8108{
7774 struct rq *rq = cpu_rq(cpu); 8109 struct rq *rq = cpu_rq(cpu);
@@ -7777,8 +8112,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7777 init_rt_rq(rt_rq, rq); 8112 init_rt_rq(rt_rq, rq);
7778 rt_rq->tg = tg; 8113 rt_rq->tg = tg;
7779 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 8114 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
7780 if (add)
7781 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7782 8115
7783 tg->rt_se[cpu] = rt_se; 8116 tg->rt_se[cpu] = rt_se;
7784 if (!rt_se) 8117 if (!rt_se)
@@ -7813,18 +8146,18 @@ void __init sched_init(void)
7813 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 8146 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7814 8147
7815#ifdef CONFIG_FAIR_GROUP_SCHED 8148#ifdef CONFIG_FAIR_GROUP_SCHED
7816 init_task_group.se = (struct sched_entity **)ptr; 8149 root_task_group.se = (struct sched_entity **)ptr;
7817 ptr += nr_cpu_ids * sizeof(void **); 8150 ptr += nr_cpu_ids * sizeof(void **);
7818 8151
7819 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 8152 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7820 ptr += nr_cpu_ids * sizeof(void **); 8153 ptr += nr_cpu_ids * sizeof(void **);
7821 8154
7822#endif /* CONFIG_FAIR_GROUP_SCHED */ 8155#endif /* CONFIG_FAIR_GROUP_SCHED */
7823#ifdef CONFIG_RT_GROUP_SCHED 8156#ifdef CONFIG_RT_GROUP_SCHED
7824 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 8157 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
7825 ptr += nr_cpu_ids * sizeof(void **); 8158 ptr += nr_cpu_ids * sizeof(void **);
7826 8159
7827 init_task_group.rt_rq = (struct rt_rq **)ptr; 8160 root_task_group.rt_rq = (struct rt_rq **)ptr;
7828 ptr += nr_cpu_ids * sizeof(void **); 8161 ptr += nr_cpu_ids * sizeof(void **);
7829 8162
7830#endif /* CONFIG_RT_GROUP_SCHED */ 8163#endif /* CONFIG_RT_GROUP_SCHED */
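sched_init() hands out the root_task_group.se/cfs_rq (and rt_se/rt_rq) arrays by slicing one kzalloc'ed block with repeated ptr += nr_cpu_ids * sizeof(void **) steps. A small sketch of that one-allocation, many-arrays pattern in plain C (the sizes and names are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	int nr_cpu_ids = 4;
	size_t alloc_size = 2 * nr_cpu_ids * sizeof(void **);  /* two arrays, one block */
	uintptr_t ptr = (uintptr_t)calloc(1, alloc_size);

	if (!ptr)
		return 1;

	void **se = (void **)ptr;              /* root_task_group.se stand-in */
	ptr += nr_cpu_ids * sizeof(void **);
	void **cfs_rq = (void **)ptr;          /* root_task_group.cfs_rq stand-in */
	ptr += nr_cpu_ids * sizeof(void **);

	printf("se=%p cfs_rq=%p (adjacent slices of one allocation)\n",
	       (void *)se, (void *)cfs_rq);
	free(se);
	return 0;
}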
@@ -7844,20 +8177,16 @@ void __init sched_init(void)
7844 global_rt_period(), global_rt_runtime()); 8177 global_rt_period(), global_rt_runtime());
7845 8178
7846#ifdef CONFIG_RT_GROUP_SCHED 8179#ifdef CONFIG_RT_GROUP_SCHED
7847 init_rt_bandwidth(&init_task_group.rt_bandwidth, 8180 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7848 global_rt_period(), global_rt_runtime()); 8181 global_rt_period(), global_rt_runtime());
7849#endif /* CONFIG_RT_GROUP_SCHED */ 8182#endif /* CONFIG_RT_GROUP_SCHED */
7850 8183
7851#ifdef CONFIG_CGROUP_SCHED 8184#ifdef CONFIG_CGROUP_SCHED
7852 list_add(&init_task_group.list, &task_groups); 8185 list_add(&root_task_group.list, &task_groups);
7853 INIT_LIST_HEAD(&init_task_group.children); 8186 INIT_LIST_HEAD(&root_task_group.children);
7854 8187 autogroup_init(&init_task);
7855#endif /* CONFIG_CGROUP_SCHED */ 8188#endif /* CONFIG_CGROUP_SCHED */
7856 8189
7857#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
7858 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
7859 __alignof__(unsigned long));
7860#endif
7861 for_each_possible_cpu(i) { 8190 for_each_possible_cpu(i) {
7862 struct rq *rq; 8191 struct rq *rq;
7863 8192
@@ -7869,38 +8198,34 @@ void __init sched_init(void)
7869 init_cfs_rq(&rq->cfs, rq); 8198 init_cfs_rq(&rq->cfs, rq);
7870 init_rt_rq(&rq->rt, rq); 8199 init_rt_rq(&rq->rt, rq);
7871#ifdef CONFIG_FAIR_GROUP_SCHED 8200#ifdef CONFIG_FAIR_GROUP_SCHED
7872 init_task_group.shares = init_task_group_load; 8201 root_task_group.shares = root_task_group_load;
7873 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 8202 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7874#ifdef CONFIG_CGROUP_SCHED
7875 /* 8203 /*
7876 * How much cpu bandwidth does init_task_group get? 8204 * How much cpu bandwidth does root_task_group get?
7877 * 8205 *
7878 * In case of task-groups formed through the cgroup filesystem, it 8206 * In case of task-groups formed through the cgroup filesystem, it
7879 * gets 100% of the cpu resources in the system. This overall 8207 * gets 100% of the cpu resources in the system. This overall
7880 * system cpu resource is divided among the tasks of 8208 * system cpu resource is divided among the tasks of
7881 * init_task_group and its child task-groups in a fair manner, 8209 * root_task_group and its child task-groups in a fair manner,
7882 * based on each entity's (task or task-group's) weight 8210 * based on each entity's (task or task-group's) weight
7883 * (se->load.weight). 8211 * (se->load.weight).
7884 * 8212 *
7885 * In other words, if init_task_group has 10 tasks of weight 8213 * In other words, if root_task_group has 10 tasks of weight
7886 * 1024 and two child groups A0 and A1 (of weight 1024 each), 8214 * 1024 and two child groups A0 and A1 (of weight 1024 each),
7887 * then A0's share of the cpu resource is: 8215 * then A0's share of the cpu resource is:
7888 * 8216 *
7889 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 8217 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
7890 * 8218 *
7891 * We achieve this by letting init_task_group's tasks sit 8219 * We achieve this by letting root_task_group's tasks sit
7892 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 8220 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
7893 */ 8221 */
7894 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 8222 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
7895#endif
7896#endif /* CONFIG_FAIR_GROUP_SCHED */ 8223#endif /* CONFIG_FAIR_GROUP_SCHED */
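Checking the arithmetic in the comment above as a throwaway program; the weights are the example values from the comment, nothing is read from the scheduler:

#include <stdio.h>

int main(void)
{
	double task_weight = 1024.0, group_weight = 1024.0;
	int nr_root_tasks = 10, nr_groups = 2;

	double total = nr_root_tasks * task_weight + nr_groups * group_weight;
	printf("A0's share = %.2f%%\n", 100.0 * group_weight / total);  /* 8.33% */
	return 0;
}

Each child group competes as a single entity of weight 1024 against the ten root tasks, hence 1024 / 12288.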
7897 8224
7898 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 8225 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
7899#ifdef CONFIG_RT_GROUP_SCHED 8226#ifdef CONFIG_RT_GROUP_SCHED
7900 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 8227 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7901#ifdef CONFIG_CGROUP_SCHED 8228 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
7902 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
7903#endif
7904#endif 8229#endif
7905 8230
7906 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 8231 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -7911,7 +8236,7 @@ void __init sched_init(void)
7911#ifdef CONFIG_SMP 8236#ifdef CONFIG_SMP
7912 rq->sd = NULL; 8237 rq->sd = NULL;
7913 rq->rd = NULL; 8238 rq->rd = NULL;
7914 rq->cpu_power = SCHED_LOAD_SCALE; 8239 rq->cpu_power = SCHED_POWER_SCALE;
7915 rq->post_schedule = 0; 8240 rq->post_schedule = 0;
7916 rq->active_balance = 0; 8241 rq->active_balance = 0;
7917 rq->next_balance = jiffies; 8242 rq->next_balance = jiffies;
@@ -7968,6 +8293,7 @@ void __init sched_init(void)
7968 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 8293 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
7969 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 8294 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
7970#ifdef CONFIG_SMP 8295#ifdef CONFIG_SMP
8296 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
7971#ifdef CONFIG_NO_HZ 8297#ifdef CONFIG_NO_HZ
7972 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 8298 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
7973 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); 8299 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
@@ -7980,8 +8306,6 @@ void __init sched_init(void)
7980 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 8306 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
7981#endif /* SMP */ 8307#endif /* SMP */
7982 8308
7983 perf_event_init();
7984
7985 scheduler_running = 1; 8309 scheduler_running = 1;
7986} 8310}
7987 8311
@@ -7990,7 +8314,7 @@ static inline int preempt_count_equals(int preempt_offset)
7990{ 8314{
7991 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 8315 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
7992 8316
7993 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 8317 return (nested == preempt_offset);
7994} 8318}
7995 8319
7996void __might_sleep(const char *file, int line, int preempt_offset) 8320void __might_sleep(const char *file, int line, int preempt_offset)
@@ -8025,9 +8349,11 @@ EXPORT_SYMBOL(__might_sleep);
8025#ifdef CONFIG_MAGIC_SYSRQ 8349#ifdef CONFIG_MAGIC_SYSRQ
8026static void normalize_task(struct rq *rq, struct task_struct *p) 8350static void normalize_task(struct rq *rq, struct task_struct *p)
8027{ 8351{
8352 const struct sched_class *prev_class = p->sched_class;
8353 int old_prio = p->prio;
8028 int on_rq; 8354 int on_rq;
8029 8355
8030 on_rq = p->se.on_rq; 8356 on_rq = p->on_rq;
8031 if (on_rq) 8357 if (on_rq)
8032 deactivate_task(rq, p, 0); 8358 deactivate_task(rq, p, 0);
8033 __setscheduler(rq, p, SCHED_NORMAL, 0); 8359 __setscheduler(rq, p, SCHED_NORMAL, 0);
@@ -8035,6 +8361,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
8035 activate_task(rq, p, 0); 8361 activate_task(rq, p, 0);
8036 resched_task(rq->curr); 8362 resched_task(rq->curr);
8037 } 8363 }
8364
8365 check_class_changed(rq, p, prev_class, old_prio);
8038} 8366}
8039 8367
8040void normalize_rt_tasks(void) 8368void normalize_rt_tasks(void)
@@ -8150,7 +8478,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8150{ 8478{
8151 struct cfs_rq *cfs_rq; 8479 struct cfs_rq *cfs_rq;
8152 struct sched_entity *se; 8480 struct sched_entity *se;
8153 struct rq *rq;
8154 int i; 8481 int i;
8155 8482
8156 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 8483 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8163,8 +8490,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8163 tg->shares = NICE_0_LOAD; 8490 tg->shares = NICE_0_LOAD;
8164 8491
8165 for_each_possible_cpu(i) { 8492 for_each_possible_cpu(i) {
8166 rq = cpu_rq(i);
8167
8168 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8493 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8169 GFP_KERNEL, cpu_to_node(i)); 8494 GFP_KERNEL, cpu_to_node(i));
8170 if (!cfs_rq) 8495 if (!cfs_rq)
@@ -8175,26 +8500,32 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8175 if (!se) 8500 if (!se)
8176 goto err_free_rq; 8501 goto err_free_rq;
8177 8502
8178 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 8503 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8179 } 8504 }
8180 8505
8181 return 1; 8506 return 1;
8182 8507
8183 err_free_rq: 8508err_free_rq:
8184 kfree(cfs_rq); 8509 kfree(cfs_rq);
8185 err: 8510err:
8186 return 0; 8511 return 0;
8187} 8512}
8188 8513
8189static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8190{
8191 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
8192 &cpu_rq(cpu)->leaf_cfs_rq_list);
8193}
8194
8195static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8514static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8196{ 8515{
8197 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8516 struct rq *rq = cpu_rq(cpu);
8517 unsigned long flags;
8518
8519 /*
8520 * Only empty task groups can be destroyed; so we can speculatively
8521 * check on_list without danger of it being re-added.
8522 */
8523 if (!tg->cfs_rq[cpu]->on_list)
8524 return;
8525
8526 raw_spin_lock_irqsave(&rq->lock, flags);
8527 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8528 raw_spin_unlock_irqrestore(&rq->lock, flags);
8198} 8529}
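The new unregister_fair_sched_group() relies on the invariant stated in its comment: a group being torn down is empty, so on_list can only go from set to clear, which makes the unlocked check safe before rq->lock is taken for the actual list removal. A small pthread sketch of that check-outside, delete-under-the-lock shape (toy types, not the kernel's):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;  /* rq->lock stand-in */
static int rq_entries = 1;                                   /* leaf list length */

struct toy_group {
	int on_list;             /* only ever cleared once the group is empty */
};

static void toy_unregister(struct toy_group *tg)
{
	if (!tg->on_list)        /* speculative unlocked check: cannot be re-set */
		return;

	pthread_mutex_lock(&rq_lock);
	tg->on_list = 0;         /* list_del_leaf_cfs_rq() stand-in */
	rq_entries--;
	pthread_mutex_unlock(&rq_lock);
}

int main(void)
{
	struct toy_group tg = { .on_list = 1 };

	toy_unregister(&tg);
	toy_unregister(&tg);     /* second call takes the cheap early-out */
	printf("leaf entries left: %d\n", rq_entries);
	return 0;
}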
8199#else /* !CONFIG_FAIR_GROUP_SCHED */ 8530
8200static inline void free_fair_sched_group(struct task_group *tg) 8531static inline void free_fair_sched_group(struct task_group *tg)
@@ -8207,10 +8538,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8207 return 1; 8538 return 1;
8208} 8539}
8209 8540
8210static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8211{
8212}
8213
8214static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8541static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8215{ 8542{
8216} 8543}
@@ -8239,7 +8566,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8239{ 8566{
8240 struct rt_rq *rt_rq; 8567 struct rt_rq *rt_rq;
8241 struct sched_rt_entity *rt_se; 8568 struct sched_rt_entity *rt_se;
8242 struct rq *rq;
8243 int i; 8569 int i;
8244 8570
8245 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 8571 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8253,8 +8579,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8253 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 8579 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8254 8580
8255 for_each_possible_cpu(i) { 8581 for_each_possible_cpu(i) {
8256 rq = cpu_rq(i);
8257
8258 rt_rq = kzalloc_node(sizeof(struct rt_rq), 8582 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8259 GFP_KERNEL, cpu_to_node(i)); 8583 GFP_KERNEL, cpu_to_node(i));
8260 if (!rt_rq) 8584 if (!rt_rq)
@@ -8265,27 +8589,16 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8265 if (!rt_se) 8589 if (!rt_se)
8266 goto err_free_rq; 8590 goto err_free_rq;
8267 8591
8268 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 8592 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8269 } 8593 }
8270 8594
8271 return 1; 8595 return 1;
8272 8596
8273 err_free_rq: 8597err_free_rq:
8274 kfree(rt_rq); 8598 kfree(rt_rq);
8275 err: 8599err:
8276 return 0; 8600 return 0;
8277} 8601}
8278
8279static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8280{
8281 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
8282 &cpu_rq(cpu)->leaf_rt_rq_list);
8283}
8284
8285static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8286{
8287 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8288}
8289#else /* !CONFIG_RT_GROUP_SCHED */ 8602#else /* !CONFIG_RT_GROUP_SCHED */
8290static inline void free_rt_sched_group(struct task_group *tg) 8603static inline void free_rt_sched_group(struct task_group *tg)
8291{ 8604{
@@ -8296,14 +8609,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8296{ 8609{
8297 return 1; 8610 return 1;
8298} 8611}
8299
8300static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8301{
8302}
8303
8304static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8305{
8306}
8307#endif /* CONFIG_RT_GROUP_SCHED */ 8612#endif /* CONFIG_RT_GROUP_SCHED */
8308 8613
8309#ifdef CONFIG_CGROUP_SCHED 8614#ifdef CONFIG_CGROUP_SCHED
@@ -8311,6 +8616,7 @@ static void free_sched_group(struct task_group *tg)
8311{ 8616{
8312 free_fair_sched_group(tg); 8617 free_fair_sched_group(tg);
8313 free_rt_sched_group(tg); 8618 free_rt_sched_group(tg);
8619 autogroup_free(tg);
8314 kfree(tg); 8620 kfree(tg);
8315} 8621}
8316 8622
@@ -8319,7 +8625,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8319{ 8625{
8320 struct task_group *tg; 8626 struct task_group *tg;
8321 unsigned long flags; 8627 unsigned long flags;
8322 int i;
8323 8628
8324 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8629 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8325 if (!tg) 8630 if (!tg)
@@ -8332,10 +8637,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8332 goto err; 8637 goto err;
8333 8638
8334 spin_lock_irqsave(&task_group_lock, flags); 8639 spin_lock_irqsave(&task_group_lock, flags);
8335 for_each_possible_cpu(i) {
8336 register_fair_sched_group(tg, i);
8337 register_rt_sched_group(tg, i);
8338 }
8339 list_add_rcu(&tg->list, &task_groups); 8640 list_add_rcu(&tg->list, &task_groups);
8340 8641
8341 WARN_ON(!parent); /* root should already exist */ 8642 WARN_ON(!parent); /* root should already exist */
@@ -8365,11 +8666,11 @@ void sched_destroy_group(struct task_group *tg)
8365 unsigned long flags; 8666 unsigned long flags;
8366 int i; 8667 int i;
8367 8668
8368 spin_lock_irqsave(&task_group_lock, flags); 8669 /* end participation in shares distribution */
8369 for_each_possible_cpu(i) { 8670 for_each_possible_cpu(i)
8370 unregister_fair_sched_group(tg, i); 8671 unregister_fair_sched_group(tg, i);
8371 unregister_rt_sched_group(tg, i); 8672
8372 } 8673 spin_lock_irqsave(&task_group_lock, flags);
8373 list_del_rcu(&tg->list); 8674 list_del_rcu(&tg->list);
8374 list_del_rcu(&tg->siblings); 8675 list_del_rcu(&tg->siblings);
8375 spin_unlock_irqrestore(&task_group_lock, flags); 8676 spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8392,57 +8693,30 @@ void sched_move_task(struct task_struct *tsk)
8392 rq = task_rq_lock(tsk, &flags); 8693 rq = task_rq_lock(tsk, &flags);
8393 8694
8394 running = task_current(rq, tsk); 8695 running = task_current(rq, tsk);
8395 on_rq = tsk->se.on_rq; 8696 on_rq = tsk->on_rq;
8396 8697
8397 if (on_rq) 8698 if (on_rq)
8398 dequeue_task(rq, tsk, 0); 8699 dequeue_task(rq, tsk, 0);
8399 if (unlikely(running)) 8700 if (unlikely(running))
8400 tsk->sched_class->put_prev_task(rq, tsk); 8701 tsk->sched_class->put_prev_task(rq, tsk);
8401 8702
8402 set_task_rq(tsk, task_cpu(tsk));
8403
8404#ifdef CONFIG_FAIR_GROUP_SCHED 8703#ifdef CONFIG_FAIR_GROUP_SCHED
8405 if (tsk->sched_class->moved_group) 8704 if (tsk->sched_class->task_move_group)
8406 tsk->sched_class->moved_group(tsk, on_rq); 8705 tsk->sched_class->task_move_group(tsk, on_rq);
8706 else
8407#endif 8707#endif
8708 set_task_rq(tsk, task_cpu(tsk));
8408 8709
8409 if (unlikely(running)) 8710 if (unlikely(running))
8410 tsk->sched_class->set_curr_task(rq); 8711 tsk->sched_class->set_curr_task(rq);
8411 if (on_rq) 8712 if (on_rq)
8412 enqueue_task(rq, tsk, 0); 8713 enqueue_task(rq, tsk, 0);
8413 8714
8414 task_rq_unlock(rq, &flags); 8715 task_rq_unlock(rq, tsk, &flags);
8415} 8716}
8416#endif /* CONFIG_CGROUP_SCHED */ 8717#endif /* CONFIG_CGROUP_SCHED */
8417 8718
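sched_move_task() keeps the usual bracketing: take the task off the runqueue (and off the CPU, logically) if needed, switch its group via task_move_group()/set_task_rq(), then undo the bracketing in reverse order. A schematic of that pattern with invented toy types:

#include <stdbool.h>
#include <stdio.h>

struct toy_task {
	bool on_rq;
	bool running;
	int group;
};

/* Stand-ins for dequeue_task/put_prev_task/set_curr_task/enqueue_task. */
static void toy_dequeue(struct toy_task *p) { p->on_rq = false; }
static void toy_enqueue(struct toy_task *p) { p->on_rq = true; }

static void toy_move_task(struct toy_task *p, int new_group)
{
	bool was_queued = p->on_rq;
	bool was_running = p->running;

	if (was_queued)
		toy_dequeue(p);
	if (was_running)
		p->running = false;     /* put_prev_task() stand-in */

	p->group = new_group;           /* task_move_group()/set_task_rq() */

	if (was_running)
		p->running = true;      /* set_curr_task() stand-in */
	if (was_queued)
		toy_enqueue(p);
}

int main(void)
{
	struct toy_task p = { .on_rq = true, .running = true, .group = 0 };

	toy_move_task(&p, 1);
	printf("group=%d on_rq=%d running=%d\n", p.group, p.on_rq, p.running);
	return 0;
}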
8418#ifdef CONFIG_FAIR_GROUP_SCHED 8719#ifdef CONFIG_FAIR_GROUP_SCHED
8419static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8420{
8421 struct cfs_rq *cfs_rq = se->cfs_rq;
8422 int on_rq;
8423
8424 on_rq = se->on_rq;
8425 if (on_rq)
8426 dequeue_entity(cfs_rq, se, 0);
8427
8428 se->load.weight = shares;
8429 se->load.inv_weight = 0;
8430
8431 if (on_rq)
8432 enqueue_entity(cfs_rq, se, 0);
8433}
8434
8435static void set_se_shares(struct sched_entity *se, unsigned long shares)
8436{
8437 struct cfs_rq *cfs_rq = se->cfs_rq;
8438 struct rq *rq = cfs_rq->rq;
8439 unsigned long flags;
8440
8441 raw_spin_lock_irqsave(&rq->lock, flags);
8442 __set_se_shares(se, shares);
8443 raw_spin_unlock_irqrestore(&rq->lock, flags);
8444}
8445
8446static DEFINE_MUTEX(shares_mutex); 8720static DEFINE_MUTEX(shares_mutex);
8447 8721
8448int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8722int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@ -8456,46 +8730,25 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8456 if (!tg->se[0]) 8730 if (!tg->se[0])
8457 return -EINVAL; 8731 return -EINVAL;
8458 8732
8459 if (shares < MIN_SHARES) 8733 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8460 shares = MIN_SHARES;
8461 else if (shares > MAX_SHARES)
8462 shares = MAX_SHARES;
8463 8734
8464 mutex_lock(&shares_mutex); 8735 mutex_lock(&shares_mutex);
8465 if (tg->shares == shares) 8736 if (tg->shares == shares)
8466 goto done; 8737 goto done;
8467 8738
8468 spin_lock_irqsave(&task_group_lock, flags);
8469 for_each_possible_cpu(i)
8470 unregister_fair_sched_group(tg, i);
8471 list_del_rcu(&tg->siblings);
8472 spin_unlock_irqrestore(&task_group_lock, flags);
8473
8474 /* wait for any ongoing reference to this group to finish */
8475 synchronize_sched();
8476
8477 /*
8478 * Now we are free to modify the group's share on each cpu
8479 * w/o tripping rebalance_share or load_balance_fair.
8480 */
8481 tg->shares = shares; 8739 tg->shares = shares;
8482 for_each_possible_cpu(i) { 8740 for_each_possible_cpu(i) {
8483 /* 8741 struct rq *rq = cpu_rq(i);
8484 * force a rebalance 8742 struct sched_entity *se;
8485 */ 8743
8486 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8744 se = tg->se[i];
8487 set_se_shares(tg->se[i], shares); 8745 /* Propagate contribution to hierarchy */
8746 raw_spin_lock_irqsave(&rq->lock, flags);
8747 for_each_sched_entity(se)
8748 update_cfs_shares(group_cfs_rq(se));
8749 raw_spin_unlock_irqrestore(&rq->lock, flags);
8488 } 8750 }
8489 8751
8490 /*
8491 * Enable load balance activity on this group, by inserting it back on
8492 * each cpu's rq->leaf_cfs_rq_list.
8493 */
8494 spin_lock_irqsave(&task_group_lock, flags);
8495 for_each_possible_cpu(i)
8496 register_fair_sched_group(tg, i);
8497 list_add_rcu(&tg->siblings, &tg->parent->children);
8498 spin_unlock_irqrestore(&task_group_lock, flags);
8499done: 8752done:
8500 mutex_unlock(&shares_mutex); 8753 mutex_unlock(&shares_mutex);
8501 return 0; 8754 return 0;
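sched_group_set_shares() now clamps the requested value once and, for each CPU, walks the entity's parent chain under rq->lock so every level refreshes its share, instead of unregistering and re-registering the group around the update. A toy model of the clamp-then-walk-up step (the MIN/MAX values and the per-level "refresh" marker are illustrative):

#include <stdio.h>

#define MIN_SHARES 2L
#define MAX_SHARES (1L << 18)

struct toy_entity {
	struct toy_entity *parent;   /* NULL at the root */
	long shares;
	int needs_update;            /* "recompute my share" marker */
};

static long clamp_shares(long s)
{
	if (s < MIN_SHARES)
		return MIN_SHARES;
	if (s > MAX_SHARES)
		return MAX_SHARES;
	return s;
}

static void propagate(struct toy_entity *se)
{
	/* for_each_sched_entity()-style walk: refresh this level, then its parents */
	for (; se; se = se->parent)
		se->needs_update = 1;    /* update_cfs_shares() stand-in */
}

int main(void)
{
	struct toy_entity root  = { .parent = NULL,  .shares = 1024 };
	struct toy_entity child = { .parent = &root, .shares = 1024 };

	child.shares = clamp_shares(1 << 20);   /* out-of-range write clamps to MAX_SHARES */
	propagate(&child);
	printf("child shares=%ld, root marked for update=%d\n",
	       child.shares, root.needs_update);
	return 0;
}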
@@ -8630,7 +8883,7 @@ static int tg_set_bandwidth(struct task_group *tg,
8630 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8883 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8631 } 8884 }
8632 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8885 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8633 unlock: 8886unlock:
8634 read_unlock(&tasklist_lock); 8887 read_unlock(&tasklist_lock);
8635 mutex_unlock(&rt_constraints_mutex); 8888 mutex_unlock(&rt_constraints_mutex);
8636 8889
@@ -8794,7 +9047,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8794 9047
8795 if (!cgrp->parent) { 9048 if (!cgrp->parent) {
8796 /* This is early initialization for the top cgroup */ 9049 /* This is early initialization for the top cgroup */
8797 return &init_task_group.css; 9050 return &root_task_group.css;
8798 } 9051 }
8799 9052
8800 parent = cgroup_tg(cgrp->parent); 9053 parent = cgroup_tg(cgrp->parent);
@@ -8827,56 +9080,39 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8827 return 0; 9080 return 0;
8828} 9081}
8829 9082
8830static int 9083static void
8831cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 9084cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8832 struct task_struct *tsk, bool threadgroup)
8833{ 9085{
8834 int retval = cpu_cgroup_can_attach_task(cgrp, tsk); 9086 sched_move_task(tsk);
8835 if (retval)
8836 return retval;
8837 if (threadgroup) {
8838 struct task_struct *c;
8839 rcu_read_lock();
8840 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
8841 retval = cpu_cgroup_can_attach_task(cgrp, c);
8842 if (retval) {
8843 rcu_read_unlock();
8844 return retval;
8845 }
8846 }
8847 rcu_read_unlock();
8848 }
8849 return 0;
8850} 9087}
8851 9088
8852static void 9089static void
8853cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 9090cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
8854 struct cgroup *old_cont, struct task_struct *tsk, 9091 struct cgroup *old_cgrp, struct task_struct *task)
8855 bool threadgroup)
8856{ 9092{
8857 sched_move_task(tsk); 9093 /*
8858 if (threadgroup) { 9094 * cgroup_exit() is called in the copy_process() failure path.
8859 struct task_struct *c; 9095 * Ignore this case since the task hasn't run yet; this avoids
8860 rcu_read_lock(); 9096 * trying to poke a half-freed task state from generic code.
8861 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 9097 */
8862 sched_move_task(c); 9098 if (!(task->flags & PF_EXITING))
8863 } 9099 return;
8864 rcu_read_unlock(); 9100
8865 } 9101 sched_move_task(task);
8866} 9102}
8867 9103
8868#ifdef CONFIG_FAIR_GROUP_SCHED 9104#ifdef CONFIG_FAIR_GROUP_SCHED
8869static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 9105static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
8870 u64 shareval) 9106 u64 shareval)
8871{ 9107{
8872 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 9108 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
8873} 9109}
8874 9110
8875static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 9111static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8876{ 9112{
8877 struct task_group *tg = cgroup_tg(cgrp); 9113 struct task_group *tg = cgroup_tg(cgrp);
8878 9114
8879 return (u64) tg->shares; 9115 return (u64) scale_load_down(tg->shares);
8880} 9116}
8881#endif /* CONFIG_FAIR_GROUP_SCHED */ 9117#endif /* CONFIG_FAIR_GROUP_SCHED */
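cpu.shares is now run through scale_load() on write and scale_load_down() on read, so userspace keeps seeing the familiar 1024-style units while the group weight can carry extra fixed-point resolution internally. A worked round-trip under an assumed power-of-two resolution shift (the constant below is illustrative, not taken from this tree):

#include <stdio.h>

#define RESOLUTION_SHIFT 10   /* illustrative; 0 would make both calls a no-op */

static unsigned long scale_load(unsigned long w)      { return w << RESOLUTION_SHIFT; }
static unsigned long scale_load_down(unsigned long w) { return w >> RESOLUTION_SHIFT; }

int main(void)
{
	unsigned long user_shares = 1024;                 /* written to cpu.shares */
	unsigned long internal = scale_load(user_shares); /* what the group stores */

	printf("internal weight: %lu, read back: %lu\n",
	       internal, scale_load_down(internal));      /* 1048576, 1024 */
	return 0;
}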
8882 9118
@@ -8935,8 +9171,9 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8935 .name = "cpu", 9171 .name = "cpu",
8936 .create = cpu_cgroup_create, 9172 .create = cpu_cgroup_create,
8937 .destroy = cpu_cgroup_destroy, 9173 .destroy = cpu_cgroup_destroy,
8938 .can_attach = cpu_cgroup_can_attach, 9174 .can_attach_task = cpu_cgroup_can_attach_task,
8939 .attach = cpu_cgroup_attach, 9175 .attach_task = cpu_cgroup_attach_task,
9176 .exit = cpu_cgroup_exit,
8940 .populate = cpu_cgroup_populate, 9177 .populate = cpu_cgroup_populate,
8941 .subsys_id = cpu_cgroup_subsys_id, 9178 .subsys_id = cpu_cgroup_subsys_id,
8942 .early_init = 1, 9179 .early_init = 1,
@@ -9221,72 +9458,3 @@ struct cgroup_subsys cpuacct_subsys = {
9221}; 9458};
9222#endif /* CONFIG_CGROUP_CPUACCT */ 9459#endif /* CONFIG_CGROUP_CPUACCT */
9223 9460
9224#ifndef CONFIG_SMP
9225
9226void synchronize_sched_expedited(void)
9227{
9228 barrier();
9229}
9230EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9231
9232#else /* #ifndef CONFIG_SMP */
9233
9234static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9235
9236static int synchronize_sched_expedited_cpu_stop(void *data)
9237{
9238 /*
9239 * There must be a full memory barrier on each affected CPU
9240 * between the time that try_stop_cpus() is called and the
9241 * time that it returns.
9242 *
9243 * In the current initial implementation of cpu_stop, the
9244 * above condition is already met when the control reaches
9245 * this point and the following smp_mb() is not strictly
9246 * necessary. Do smp_mb() anyway for documentation and
9247 * robustness against future implementation changes.
9248 */
9249 smp_mb(); /* See above comment block. */
9250 return 0;
9251}
9252
9253/*
9254 * Wait for an rcu-sched grace period to elapse, but use a "big hammer"
9255 * approach to force the grace period to end quickly. This consumes
9256 * significant time on all CPUs, and is thus not recommended for
9257 * any sort of common-case code.
9258 *
9259 * Note that it is illegal to call this function while holding any
9260 * lock that is acquired by a CPU-hotplug notifier. Failing to
9261 * observe this restriction will result in deadlock.
9262 */
9263void synchronize_sched_expedited(void)
9264{
9265 int snap, trycount = 0;
9266
9267 smp_mb(); /* ensure prior mod happens before capturing snap. */
9268 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9269 get_online_cpus();
9270 while (try_stop_cpus(cpu_online_mask,
9271 synchronize_sched_expedited_cpu_stop,
9272 NULL) == -EAGAIN) {
9273 put_online_cpus();
9274 if (trycount++ < 10)
9275 udelay(trycount * num_online_cpus());
9276 else {
9277 synchronize_sched();
9278 return;
9279 }
9280 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9281 smp_mb(); /* ensure test happens before caller kfree */
9282 return;
9283 }
9284 get_online_cpus();
9285 }
9286 atomic_inc(&synchronize_sched_expedited_count);
9287 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9288 put_online_cpus();
9289}
9290EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9291
9292#endif /* #else #ifndef CONFIG_SMP */
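The synchronize_sched_expedited() code removed from sched.c by this merge (presumably relocated rather than dropped) is organized around a snapshot-and-retry loop: snapshot a completion counter, attempt the expensive stop-all-CPUs path, back off with growing delays, bail out early if another caller's expedited pass completed in the meantime, and fall back to a normal grace period after too many tries. A user-space caricature of that control flow, with try_stop_cpus() replaced by a stub and every name invented:

#include <stdio.h>
#include <unistd.h>

static int completed;                 /* expedited-count stand-in */

static int try_expedite(int attempt)  /* try_stop_cpus() stand-in: fail twice */
{
	return attempt >= 2 ? 0 : -1;
}

static void slow_grace_period(void)   /* synchronize_sched() stand-in */
{
	puts("fell back to the slow path");
}

static void toy_expedited(void)
{
	int snap = completed + 1;         /* snapshot before trying */
	int trycount = 0;

	while (try_expedite(trycount) != 0) {
		if (completed - snap > 0) {   /* someone else's pass covered us */
			puts("another expedited pass completed, done");
			return;
		}
		if (trycount++ < 10)
			usleep(trycount * 10);    /* modest backoff, as in the original */
		else {
			slow_grace_period();      /* give up on the big hammer */
			return;
		}
	}
	completed++;
	puts("expedited pass completed");
}

int main(void)
{
	toy_expedited();
	return 0;
}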