path: root/kernel/sched.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  1350
1 file changed, 666 insertions(+), 684 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 5e3c509e0efe..6777dc7942a0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,9 +55,9 @@
55#include <linux/cpu.h> 55#include <linux/cpu.h>
56#include <linux/cpuset.h> 56#include <linux/cpuset.h>
57#include <linux/percpu.h> 57#include <linux/percpu.h>
58#include <linux/kthread.h>
59#include <linux/proc_fs.h> 58#include <linux/proc_fs.h>
60#include <linux/seq_file.h> 59#include <linux/seq_file.h>
60#include <linux/stop_machine.h>
61#include <linux/sysctl.h> 61#include <linux/sysctl.h>
62#include <linux/syscalls.h> 62#include <linux/syscalls.h>
63#include <linux/times.h> 63#include <linux/times.h>
@@ -77,6 +77,7 @@
77#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
78 78
79#include "sched_cpupri.h" 79#include "sched_cpupri.h"
80#include "workqueue_sched.h"
80 81
81#include <litmus/sched_trace.h> 82#include <litmus/sched_trace.h>
82#include <litmus/trace.h> 83#include <litmus/trace.h>
@@ -309,52 +310,6 @@ static int init_task_group_load = INIT_TASK_GROUP_LOAD;
309 */ 310 */
310struct task_group init_task_group; 311struct task_group init_task_group;
311 312
312/* return group to which a task belongs */
313static inline struct task_group *task_group(struct task_struct *p)
314{
315 struct task_group *tg;
316
317#ifdef CONFIG_CGROUP_SCHED
318 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
319 struct task_group, css);
320#else
321 tg = &init_task_group;
322#endif
323 return tg;
324}
325
326/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
327static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
328{
329 /*
330 * Strictly speaking this rcu_read_lock() is not needed since the
331 * task_group is tied to the cgroup, which in turn can never go away
332 * as long as there are tasks attached to it.
333 *
334 * However since task_group() uses task_subsys_state() which is an
335 * rcu_dereference() user, this quiets CONFIG_PROVE_RCU.
336 */
337 rcu_read_lock();
338#ifdef CONFIG_FAIR_GROUP_SCHED
339 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
340 p->se.parent = task_group(p)->se[cpu];
341#endif
342
343#ifdef CONFIG_RT_GROUP_SCHED
344 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
345 p->rt.parent = task_group(p)->rt_se[cpu];
346#endif
347 rcu_read_unlock();
348}
349
350#else
351
352static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
353static inline struct task_group *task_group(struct task_struct *p)
354{
355 return NULL;
356}
357
358#endif /* CONFIG_CGROUP_SCHED */ 313#endif /* CONFIG_CGROUP_SCHED */
359 314
360/* CFS-related fields in a runqueue */ 315/* CFS-related fields in a runqueue */
@@ -511,9 +466,13 @@ struct rq {
511 unsigned long nr_running; 466 unsigned long nr_running;
512 #define CPU_LOAD_IDX_MAX 5 467 #define CPU_LOAD_IDX_MAX 5
513 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 468 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
469 unsigned long last_load_update_tick;
514#ifdef CONFIG_NO_HZ 470#ifdef CONFIG_NO_HZ
515 unsigned char in_nohz_recently; 471 u64 nohz_stamp;
472 unsigned char nohz_balance_kick;
516#endif 473#endif
474 unsigned int skip_clock_update;
475
517 /* capture load from *all* tasks on this cpu: */ 476 /* capture load from *all* tasks on this cpu: */
518 struct load_weight load; 477 struct load_weight load;
519 unsigned long nr_load_updates; 478 unsigned long nr_load_updates;
@@ -551,20 +510,20 @@ struct rq {
551 struct root_domain *rd; 510 struct root_domain *rd;
552 struct sched_domain *sd; 511 struct sched_domain *sd;
553 512
513 unsigned long cpu_power;
514
554 unsigned char idle_at_tick; 515 unsigned char idle_at_tick;
555 /* For active balancing */ 516 /* For active balancing */
556 int post_schedule; 517 int post_schedule;
557 int active_balance; 518 int active_balance;
558 int push_cpu; 519 int push_cpu;
520 struct cpu_stop_work active_balance_work;
559 /* cpu of this runqueue: */ 521 /* cpu of this runqueue: */
560 int cpu; 522 int cpu;
561 int online; 523 int online;
562 524
563 unsigned long avg_load_per_task; 525 unsigned long avg_load_per_task;
564 526
565 struct task_struct *migration_thread;
566 struct list_head migration_queue;
567
568 u64 rt_avg; 527 u64 rt_avg;
569 u64 age_stamp; 528 u64 age_stamp;
570 u64 idle_stamp; 529 u64 idle_stamp;
@@ -612,6 +571,13 @@ static inline
612void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 571void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
613{ 572{
614 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 573 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
574
575 /*
576 * A queue event has occurred, and we're going to schedule. In
577 * this case, we can save a useless back to back clock update.
578 */
579 if (test_tsk_need_resched(p))
580 rq->skip_clock_update = 1;
615} 581}
616 582
617static inline int cpu_of(struct rq *rq) 583static inline int cpu_of(struct rq *rq)
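The comment above only shows where the flag gets set; as a reading aid (not part of the patch), here is a condensed sketch of how the three pieces added by this diff fit together, paraphrasing the hunks that follow:

/* set: a pending reschedule means the next clock update would be redundant */
if (test_tsk_need_resched(p))
	rq->skip_clock_update = 1;

/* honour: update_rq_clock() (hunk below) becomes conditional */
if (!rq->skip_clock_update)
	rq->clock = sched_clock_cpu(cpu_of(rq));

/* clear: put_prev_task() (schedule() path, further below) refreshes the
 * clock once for the outgoing task and resets the flag */
if (prev->se.on_rq)
	update_rq_clock(rq);
rq->skip_clock_update = 0;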
@@ -644,9 +610,53 @@ static inline int cpu_of(struct rq *rq)
644#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 610#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
645#define raw_rq() (&__raw_get_cpu_var(runqueues)) 611#define raw_rq() (&__raw_get_cpu_var(runqueues))
646 612
613#ifdef CONFIG_CGROUP_SCHED
614
615/*
616 * Return the group to which this task belongs.
617 *
618 * We use task_subsys_state_check() and extend the RCU verification
619 * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
620 * holds that lock for each task it moves into the cgroup. Therefore
621 * by holding that lock, we pin the task to the current cgroup.
622 */
623static inline struct task_group *task_group(struct task_struct *p)
624{
625 struct cgroup_subsys_state *css;
626
627 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
628 lockdep_is_held(&task_rq(p)->lock));
629 return container_of(css, struct task_group, css);
630}
631
632/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
633static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
634{
635#ifdef CONFIG_FAIR_GROUP_SCHED
636 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
637 p->se.parent = task_group(p)->se[cpu];
638#endif
639
640#ifdef CONFIG_RT_GROUP_SCHED
641 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
642 p->rt.parent = task_group(p)->rt_se[cpu];
643#endif
644}
645
646#else /* CONFIG_CGROUP_SCHED */
647
648static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
649static inline struct task_group *task_group(struct task_struct *p)
650{
651 return NULL;
652}
653
654#endif /* CONFIG_CGROUP_SCHED */
655
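The lockdep_is_held() argument above works because the rcu_dereference_check() family takes an arbitrary extra verification condition. A minimal, hypothetical sketch of the same pattern (foo_ptr and foo_lock are made up for illustration, not part of this patch):

struct foo {
	int val;
};

static struct foo *foo_ptr;	/* protected by RCU, or by foo_lock */
static DEFINE_SPINLOCK(foo_lock);

static int foo_read(void)
{
	struct foo *f;

	/*
	 * CONFIG_PROVE_RCU is satisfied if we are either inside an RCU
	 * read-side critical section or hold foo_lock, mirroring how
	 * task_group() above extends the check with task_rq(p)->lock.
	 */
	f = rcu_dereference_check(foo_ptr,
				  rcu_read_lock_held() ||
				  lockdep_is_held(&foo_lock));
	return f ? f->val : 0;
}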
647inline void update_rq_clock(struct rq *rq) 656inline void update_rq_clock(struct rq *rq)
648{ 657{
649 rq->clock = sched_clock_cpu(cpu_of(rq)); 658 if (!rq->skip_clock_update)
659 rq->clock = sched_clock_cpu(cpu_of(rq));
650} 660}
651 661
652/* 662/*
@@ -924,16 +934,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
924#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 934#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
925 935
926/* 936/*
927 * Check whether the task is waking, we use this to synchronize against 937 * Check whether the task is waking, we use this to synchronize ->cpus_allowed
928 * ttwu() so that task_cpu() reports a stable number. 938 * against ttwu().
929 *
930 * We need to make an exception for PF_STARTING tasks because the fork
931 * path might require task_rq_lock() to work, eg. it can call
932 * set_cpus_allowed_ptr() from the cpuset clone_ns code.
933 */ 939 */
934static inline int task_is_waking(struct task_struct *p) 940static inline int task_is_waking(struct task_struct *p)
935{ 941{
936 return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING)); 942 return unlikely(p->state == TASK_WAKING);
937} 943}
938 944
939/* 945/*
@@ -946,11 +952,9 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
946 struct rq *rq; 952 struct rq *rq;
947 953
948 for (;;) { 954 for (;;) {
949 while (task_is_waking(p))
950 cpu_relax();
951 rq = task_rq(p); 955 rq = task_rq(p);
952 raw_spin_lock(&rq->lock); 956 raw_spin_lock(&rq->lock);
953 if (likely(rq == task_rq(p) && !task_is_waking(p))) 957 if (likely(rq == task_rq(p)))
954 return rq; 958 return rq;
955 raw_spin_unlock(&rq->lock); 959 raw_spin_unlock(&rq->lock);
956 } 960 }
@@ -967,25 +971,15 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
967 struct rq *rq; 971 struct rq *rq;
968 972
969 for (;;) { 973 for (;;) {
970 while (task_is_waking(p))
971 cpu_relax();
972 local_irq_save(*flags); 974 local_irq_save(*flags);
973 rq = task_rq(p); 975 rq = task_rq(p);
974 raw_spin_lock(&rq->lock); 976 raw_spin_lock(&rq->lock);
975 if (likely(rq == task_rq(p) && !task_is_waking(p))) 977 if (likely(rq == task_rq(p)))
976 return rq; 978 return rq;
977 raw_spin_unlock_irqrestore(&rq->lock, *flags); 979 raw_spin_unlock_irqrestore(&rq->lock, *flags);
978 } 980 }
979} 981}
980 982
981void task_rq_unlock_wait(struct task_struct *p)
982{
983 struct rq *rq = task_rq(p);
984
985 smp_mb(); /* spin-unlock-wait is not a full memory barrier */
986 raw_spin_unlock_wait(&rq->lock);
987}
988
989static void __task_rq_unlock(struct rq *rq) 983static void __task_rq_unlock(struct rq *rq)
990 __releases(rq->lock) 984 __releases(rq->lock)
991{ 985{
@@ -1211,6 +1205,27 @@ static void resched_cpu(int cpu)
1211 1205
1212#ifdef CONFIG_NO_HZ 1206#ifdef CONFIG_NO_HZ
1213/* 1207/*
1208 * In the semi idle case, use the nearest busy cpu for migrating timers
1209 * from an idle cpu. This is good for power-savings.
1210 *
1211 * We don't do similar optimization for completely idle system, as
1212 * selecting an idle cpu will add more delays to the timers than intended
1213 * (as that cpu's timer base may not be uptodate wrt jiffies etc).
1214 */
1215int get_nohz_timer_target(void)
1216{
1217 int cpu = smp_processor_id();
1218 int i;
1219 struct sched_domain *sd;
1220
1221 for_each_domain(cpu, sd) {
1222 for_each_cpu(i, sched_domain_span(sd))
1223 if (!idle_cpu(i))
1224 return i;
1225 }
1226 return cpu;
1227}
1228/*
1214 * When add_timer_on() enqueues a timer into the timer wheel of an 1229 * When add_timer_on() enqueues a timer into the timer wheel of an
1215 * idle CPU then this timer might expire before the next timer event 1230 * idle CPU then this timer might expire before the next timer event
1216 * which is scheduled to wake up that CPU. In case of a completely 1231 * which is scheduled to wake up that CPU. In case of a completely
@@ -1249,6 +1264,7 @@ void wake_up_idle_cpu(int cpu)
1249 if (!tsk_is_polling(rq->idle)) 1264 if (!tsk_is_polling(rq->idle))
1250 smp_send_reschedule(cpu); 1265 smp_send_reschedule(cpu);
1251} 1266}
1267
1252#endif /* CONFIG_NO_HZ */ 1268#endif /* CONFIG_NO_HZ */
1253 1269
1254static u64 sched_avg_period(void) 1270static u64 sched_avg_period(void)
@@ -1261,6 +1277,12 @@ static void sched_avg_update(struct rq *rq)
1261 s64 period = sched_avg_period(); 1277 s64 period = sched_avg_period();
1262 1278
1263 while ((s64)(rq->clock - rq->age_stamp) > period) { 1279 while ((s64)(rq->clock - rq->age_stamp) > period) {
1280 /*
1281 * Inline assembly required to prevent the compiler
1282 * optimising this loop into a divmod call.
1283 * See __iter_div_u64_rem() for another example of this.
1284 */
1285 asm("" : "+rm" (rq->age_stamp));
1264 rq->age_stamp += period; 1286 rq->age_stamp += period;
1265 rq->rt_avg /= 2; 1287 rq->rt_avg /= 2;
1266 } 1288 }
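The comment above points at __iter_div_u64_rem() as the reference for this trick; roughly, that helper looks like the sketch below. The empty asm marks the variable as (potentially) modified, so GCC cannot strength-reduce the loop into a 64-bit division, which would drag in a libgcc call on 32-bit targets:

static inline u32 iter_div_u64_rem_sketch(u64 dividend, u32 divisor,
					  u64 *remainder)
{
	u32 ret = 0;

	while (dividend >= divisor) {
		/* Opaque to the optimizer; emits no instructions. */
		asm("" : "+rm" (dividend));

		dividend -= divisor;
		ret++;
	}

	*remainder = dividend;
	return ret;
}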
@@ -1282,6 +1304,10 @@ static void resched_task(struct task_struct *p)
1282static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1304static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1283{ 1305{
1284} 1306}
1307
1308static void sched_avg_update(struct rq *rq)
1309{
1310}
1285#endif /* CONFIG_SMP */ 1311#endif /* CONFIG_SMP */
1286 1312
1287#if BITS_PER_LONG == 32 1313#if BITS_PER_LONG == 32
@@ -1505,24 +1531,9 @@ static unsigned long target_load(int cpu, int type)
1505 return max(rq->cpu_load[type-1], total); 1531 return max(rq->cpu_load[type-1], total);
1506} 1532}
1507 1533
1508static struct sched_group *group_of(int cpu)
1509{
1510 struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
1511
1512 if (!sd)
1513 return NULL;
1514
1515 return sd->groups;
1516}
1517
1518static unsigned long power_of(int cpu) 1534static unsigned long power_of(int cpu)
1519{ 1535{
1520 struct sched_group *group = group_of(cpu); 1536 return cpu_rq(cpu)->cpu_power;
1521
1522 if (!group)
1523 return SCHED_LOAD_SCALE;
1524
1525 return group->cpu_power;
1526} 1537}
1527 1538
1528static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1539static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
@@ -1668,7 +1679,7 @@ static void update_shares(struct sched_domain *sd)
1668 if (root_task_group_empty()) 1679 if (root_task_group_empty())
1669 return; 1680 return;
1670 1681
1671 now = cpu_clock(raw_smp_processor_id()); 1682 now = local_clock();
1672 elapsed = now - sd->last_update; 1683 elapsed = now - sd->last_update;
1673 1684
1674 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1685 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
@@ -1679,9 +1690,6 @@ static void update_shares(struct sched_domain *sd)
1679 1690
1680static void update_h_load(long cpu) 1691static void update_h_load(long cpu)
1681{ 1692{
1682 if (root_task_group_empty())
1683 return;
1684
1685 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1693 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1686} 1694}
1687 1695
@@ -1791,8 +1799,6 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1791 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 1799 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1792 } 1800 }
1793 } 1801 }
1794 update_rq_clock(rq1);
1795 update_rq_clock(rq2);
1796} 1802}
1797 1803
1798/* 1804/*
@@ -1823,9 +1829,10 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1823} 1829}
1824#endif 1830#endif
1825 1831
1826static void calc_load_account_active(struct rq *this_rq); 1832static void calc_load_account_idle(struct rq *this_rq);
1827static void update_sysctl(void); 1833static void update_sysctl(void);
1828static int get_update_sysctl_factor(void); 1834static int get_update_sysctl_factor(void);
1835static void update_cpu_load(struct rq *this_rq);
1829 1836
1830static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1837static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1831{ 1838{
@@ -1862,8 +1869,8 @@ static void dec_nr_running(struct rq *rq)
1862static void set_load_weight(struct task_struct *p) 1869static void set_load_weight(struct task_struct *p)
1863{ 1870{
1864 if (task_has_rt_policy(p)) { 1871 if (task_has_rt_policy(p)) {
1865 p->se.load.weight = prio_to_weight[0] * 2; 1872 p->se.load.weight = 0;
1866 p->se.load.inv_weight = prio_to_wmult[0] >> 1; 1873 p->se.load.inv_weight = WMULT_CONST;
1867 return; 1874 return;
1868 } 1875 }
1869 1876
@@ -1880,62 +1887,43 @@ static void set_load_weight(struct task_struct *p)
1880 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1887 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1881} 1888}
1882 1889
1883static void update_avg(u64 *avg, u64 sample) 1890static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1884{
1885 s64 diff = sample - *avg;
1886 *avg += diff >> 3;
1887}
1888
1889static void
1890enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1891{ 1891{
1892 if (wakeup) 1892 update_rq_clock(rq);
1893 p->se.start_runtime = p->se.sum_exec_runtime;
1894
1895 sched_info_queued(p); 1893 sched_info_queued(p);
1896 p->sched_class->enqueue_task(rq, p, wakeup, head); 1894 p->sched_class->enqueue_task(rq, p, flags);
1897 p->se.on_rq = 1; 1895 p->se.on_rq = 1;
1898} 1896}
1899 1897
1900static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1898static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1901{ 1899{
1902 if (sleep) { 1900 update_rq_clock(rq);
1903 if (p->se.last_wakeup) {
1904 update_avg(&p->se.avg_overlap,
1905 p->se.sum_exec_runtime - p->se.last_wakeup);
1906 p->se.last_wakeup = 0;
1907 } else {
1908 update_avg(&p->se.avg_wakeup,
1909 sysctl_sched_wakeup_granularity);
1910 }
1911 }
1912
1913 sched_info_dequeued(p); 1901 sched_info_dequeued(p);
1914 p->sched_class->dequeue_task(rq, p, sleep); 1902 p->sched_class->dequeue_task(rq, p, flags);
1915 p->se.on_rq = 0; 1903 p->se.on_rq = 0;
1916} 1904}
1917 1905
1918/* 1906/*
1919 * activate_task - move a task to the runqueue. 1907 * activate_task - move a task to the runqueue.
1920 */ 1908 */
1921static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) 1909static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1922{ 1910{
1923 if (task_contributes_to_load(p)) 1911 if (task_contributes_to_load(p))
1924 rq->nr_uninterruptible--; 1912 rq->nr_uninterruptible--;
1925 1913
1926 enqueue_task(rq, p, wakeup, false); 1914 enqueue_task(rq, p, flags);
1927 inc_nr_running(rq); 1915 inc_nr_running(rq);
1928} 1916}
1929 1917
1930/* 1918/*
1931 * deactivate_task - remove a task from the runqueue. 1919 * deactivate_task - remove a task from the runqueue.
1932 */ 1920 */
1933static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) 1921static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1934{ 1922{
1935 if (task_contributes_to_load(p)) 1923 if (task_contributes_to_load(p))
1936 rq->nr_uninterruptible++; 1924 rq->nr_uninterruptible++;
1937 1925
1938 dequeue_task(rq, p, sleep); 1926 dequeue_task(rq, p, flags);
1939 dec_nr_running(rq); 1927 dec_nr_running(rq);
1940} 1928}
1941 1929
@@ -2065,21 +2053,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2065 __set_task_cpu(p, new_cpu); 2053 __set_task_cpu(p, new_cpu);
2066} 2054}
2067 2055
2068struct migration_req { 2056struct migration_arg {
2069 struct list_head list;
2070
2071 struct task_struct *task; 2057 struct task_struct *task;
2072 int dest_cpu; 2058 int dest_cpu;
2073
2074 struct completion done;
2075}; 2059};
2076 2060
2061static int migration_cpu_stop(void *data);
2062
2077/* 2063/*
2078 * The task's runqueue lock must be held. 2064 * The task's runqueue lock must be held.
2079 * Returns true if you have to wait for migration thread. 2065 * Returns true if you have to wait for migration thread.
2080 */ 2066 */
2081static int 2067static bool migrate_task(struct task_struct *p, int dest_cpu)
2082migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2083{ 2068{
2084 struct rq *rq = task_rq(p); 2069 struct rq *rq = task_rq(p);
2085 2070
@@ -2087,58 +2072,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2087 * If the task is not on a runqueue (and not running), then 2072 * If the task is not on a runqueue (and not running), then
2088 * the next wake-up will properly place the task. 2073 * the next wake-up will properly place the task.
2089 */ 2074 */
2090 if (!p->se.on_rq && !task_running(rq, p)) 2075 return p->se.on_rq || task_running(rq, p);
2091 return 0;
2092
2093 init_completion(&req->done);
2094 req->task = p;
2095 req->dest_cpu = dest_cpu;
2096 list_add(&req->list, &rq->migration_queue);
2097
2098 return 1;
2099}
2100
2101/*
2102 * wait_task_context_switch - wait for a thread to complete at least one
2103 * context switch.
2104 *
2105 * @p must not be current.
2106 */
2107void wait_task_context_switch(struct task_struct *p)
2108{
2109 unsigned long nvcsw, nivcsw, flags;
2110 int running;
2111 struct rq *rq;
2112
2113 nvcsw = p->nvcsw;
2114 nivcsw = p->nivcsw;
2115 for (;;) {
2116 /*
2117 * The runqueue is assigned before the actual context
2118 * switch. We need to take the runqueue lock.
2119 *
2120 * We could check initially without the lock but it is
2121 * very likely that we need to take the lock in every
2122 * iteration.
2123 */
2124 rq = task_rq_lock(p, &flags);
2125 running = task_running(rq, p);
2126 task_rq_unlock(rq, &flags);
2127
2128 if (likely(!running))
2129 break;
2130 /*
2131 * The switch count is incremented before the actual
2132 * context switch. We thus wait for two switches to be
2133 * sure at least one completed.
2134 */
2135 if ((p->nvcsw - nvcsw) > 1)
2136 break;
2137 if ((p->nivcsw - nivcsw) > 1)
2138 break;
2139
2140 cpu_relax();
2141 }
2142} 2076}
2143 2077
2144/* 2078/*
@@ -2196,7 +2130,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2196 * just go back and repeat. 2130 * just go back and repeat.
2197 */ 2131 */
2198 rq = task_rq_lock(p, &flags); 2132 rq = task_rq_lock(p, &flags);
2199 trace_sched_wait_task(rq, p); 2133 trace_sched_wait_task(p);
2200 running = task_running(rq, p); 2134 running = task_running(rq, p);
2201 on_rq = p->se.on_rq; 2135 on_rq = p->se.on_rq;
2202 ncsw = 0; 2136 ncsw = 0;
@@ -2294,6 +2228,9 @@ void task_oncpu_function_call(struct task_struct *p,
2294} 2228}
2295 2229
2296#ifdef CONFIG_SMP 2230#ifdef CONFIG_SMP
2231/*
2232 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
2233 */
2297static int select_fallback_rq(int cpu, struct task_struct *p) 2234static int select_fallback_rq(int cpu, struct task_struct *p)
2298{ 2235{
2299 int dest_cpu; 2236 int dest_cpu;
@@ -2310,12 +2247,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2310 return dest_cpu; 2247 return dest_cpu;
2311 2248
2312 /* No more Mr. Nice Guy. */ 2249 /* No more Mr. Nice Guy. */
2313 if (dest_cpu >= nr_cpu_ids) { 2250 if (unlikely(dest_cpu >= nr_cpu_ids)) {
2314 rcu_read_lock(); 2251 dest_cpu = cpuset_cpus_allowed_fallback(p);
2315 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
2316 rcu_read_unlock();
2317 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
2318
2319 /* 2252 /*
2320 * Don't tell them about moving exiting tasks or 2253 * Don't tell them about moving exiting tasks or
2321 * kernel threads (both mm NULL), since they never 2254 * kernel threads (both mm NULL), since they never
@@ -2332,17 +2265,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2332} 2265}
2333 2266
2334/* 2267/*
2335 * Gets called from 3 sites (exec, fork, wakeup), since it is called without 2268 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
2336 * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
2337 * by:
2338 *
2339 * exec: is unstable, retry loop
2340 * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
2341 */ 2269 */
2342static inline 2270static inline
2343int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 2271int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
2344{ 2272{
2345 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 2273 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
2346 2274
2347 /* 2275 /*
2348 * In order not to call set_task_cpu() on a blocking task we need 2276 * In order not to call set_task_cpu() on a blocking task we need
@@ -2360,13 +2288,63 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2360 2288
2361 return cpu; 2289 return cpu;
2362} 2290}
2291
2292static void update_avg(u64 *avg, u64 sample)
2293{
2294 s64 diff = sample - *avg;
2295 *avg += diff >> 3;
2296}
2363#endif 2297#endif
2364 2298
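update_avg(), moved just above, is a fixed-point exponential moving average with weight 1/8; a short worked example with made-up numbers:

/*
 *	avg' = avg + (sample - avg)/8  =  7/8 * avg + 1/8 * sample
 *
 * e.g. avg = 800, sample = 1600:
 *	diff = 1600 - 800 = 800
 *	avg += 800 >> 3		->	avg = 900
 *
 * Old samples decay geometrically: after 8 identical new samples the
 * average has moved about 1 - (7/8)^8 ~= 66% of the way to them.
 */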
2365/*** 2299static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
2300 bool is_sync, bool is_migrate, bool is_local,
2301 unsigned long en_flags)
2302{
2303 schedstat_inc(p, se.statistics.nr_wakeups);
2304 if (is_sync)
2305 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2306 if (is_migrate)
2307 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2308 if (is_local)
2309 schedstat_inc(p, se.statistics.nr_wakeups_local);
2310 else
2311 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2312
2313 activate_task(rq, p, en_flags);
2314}
2315
2316static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2317 int wake_flags, bool success)
2318{
2319 trace_sched_wakeup(p, success);
2320 check_preempt_curr(rq, p, wake_flags);
2321
2322 p->state = TASK_RUNNING;
2323#ifdef CONFIG_SMP
2324 if (p->sched_class->task_woken)
2325 p->sched_class->task_woken(rq, p);
2326
2327 if (unlikely(rq->idle_stamp)) {
2328 u64 delta = rq->clock - rq->idle_stamp;
2329 u64 max = 2*sysctl_sched_migration_cost;
2330
2331 if (delta > max)
2332 rq->avg_idle = max;
2333 else
2334 update_avg(&rq->avg_idle, delta);
2335 rq->idle_stamp = 0;
2336 }
2337#endif
2338 /* if a worker is waking up, notify workqueue */
2339 if ((p->flags & PF_WQ_WORKER) && success)
2340 wq_worker_waking_up(p, cpu_of(rq));
2341}
2342
2343/**
2366 * try_to_wake_up - wake up a thread 2344 * try_to_wake_up - wake up a thread
2367 * @p: the to-be-woken-up thread 2345 * @p: the thread to be awakened
2368 * @state: the mask of task states that can be woken 2346 * @state: the mask of task states that can be woken
2369 * @sync: do a synchronous wakeup? 2347 * @wake_flags: wake modifier flags (WF_*)
2370 * 2348 *
2371 * Put it on the run-queue if it's not already there. The "current" 2349 * Put it on the run-queue if it's not already there. The "current"
2372 * thread is always on the run-queue (except when the actual 2350 * thread is always on the run-queue (except when the actual
@@ -2374,26 +2352,24 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2374 * the simpler "current->state = TASK_RUNNING" to mark yourself 2352 * the simpler "current->state = TASK_RUNNING" to mark yourself
2375 * runnable without the overhead of this. 2353 * runnable without the overhead of this.
2376 * 2354 *
2377 * returns failure only if the task is already active. 2355 * Returns %true if @p was woken up, %false if it was already running
2356 * or @state didn't match @p's state.
2378 */ 2357 */
2379static int try_to_wake_up(struct task_struct *p, unsigned int state, 2358static int try_to_wake_up(struct task_struct *p, unsigned int state,
2380 int wake_flags) 2359 int wake_flags)
2381{ 2360{
2382 int cpu, orig_cpu, this_cpu, success = 0; 2361 int cpu, orig_cpu, this_cpu, success = 0;
2383 unsigned long flags; 2362 unsigned long flags;
2363 unsigned long en_flags = ENQUEUE_WAKEUP;
2384 struct rq *rq; 2364 struct rq *rq;
2385 2365
2386 if (is_realtime(p)) 2366 if (is_realtime(p))
2387 TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state); 2367 TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state);
2388 2368
2389 if (!sched_feat(SYNC_WAKEUPS))
2390 wake_flags &= ~WF_SYNC;
2391
2392 this_cpu = get_cpu(); 2369 this_cpu = get_cpu();
2393 2370
2394 smp_wmb(); 2371 smp_wmb();
2395 rq = task_rq_lock(p, &flags); 2372 rq = task_rq_lock(p, &flags);
2396 update_rq_clock(rq);
2397 if (!(p->state & state)) 2373 if (!(p->state & state))
2398 goto out; 2374 goto out;
2399 2375
@@ -2413,28 +2389,26 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2413 * 2389 *
2414 * First fix up the nr_uninterruptible count: 2390 * First fix up the nr_uninterruptible count:
2415 */ 2391 */
2416 if (task_contributes_to_load(p)) 2392 if (task_contributes_to_load(p)) {
2417 rq->nr_uninterruptible--; 2393 if (likely(cpu_online(orig_cpu)))
2394 rq->nr_uninterruptible--;
2395 else
2396 this_rq()->nr_uninterruptible--;
2397 }
2418 p->state = TASK_WAKING; 2398 p->state = TASK_WAKING;
2419 2399
2420 if (p->sched_class->task_waking) 2400 if (p->sched_class->task_waking) {
2421 p->sched_class->task_waking(rq, p); 2401 p->sched_class->task_waking(rq, p);
2402 en_flags |= ENQUEUE_WAKING;
2403 }
2422 2404
2423 __task_rq_unlock(rq); 2405 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
2424 2406 if (cpu != orig_cpu)
2425 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2426 if (cpu != orig_cpu) {
2427 /*
2428 * Since we migrate the task without holding any rq->lock,
2429 * we need to be careful with task_rq_lock(), since that
2430 * might end up locking an invalid rq.
2431 */
2432 set_task_cpu(p, cpu); 2407 set_task_cpu(p, cpu);
2433 } 2408 __task_rq_unlock(rq);
2434 2409
2435 rq = cpu_rq(cpu); 2410 rq = cpu_rq(cpu);
2436 raw_spin_lock(&rq->lock); 2411 raw_spin_lock(&rq->lock);
2437 update_rq_clock(rq);
2438 2412
2439 /* 2413 /*
2440 * We migrated the task without holding either rq->lock, however 2414 * We migrated the task without holding either rq->lock, however
@@ -2462,54 +2436,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2462 2436
2463out_activate: 2437out_activate:
2464#endif /* CONFIG_SMP */ 2438#endif /* CONFIG_SMP */
2465 schedstat_inc(p, se.nr_wakeups); 2439 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
2466 if (wake_flags & WF_SYNC) 2440 cpu == this_cpu, en_flags);
2467 schedstat_inc(p, se.nr_wakeups_sync);
2468 if (orig_cpu != cpu)
2469 schedstat_inc(p, se.nr_wakeups_migrate);
2470 if (cpu == this_cpu)
2471 schedstat_inc(p, se.nr_wakeups_local);
2472 else
2473 schedstat_inc(p, se.nr_wakeups_remote);
2474 activate_task(rq, p, 1);
2475 success = 1; 2441 success = 1;
2476
2477 /*
2478 * Only attribute actual wakeups done by this task.
2479 */
2480 if (!in_interrupt()) {
2481 struct sched_entity *se = &current->se;
2482 u64 sample = se->sum_exec_runtime;
2483
2484 if (se->last_wakeup)
2485 sample -= se->last_wakeup;
2486 else
2487 sample -= se->start_runtime;
2488 update_avg(&se->avg_wakeup, sample);
2489
2490 se->last_wakeup = se->sum_exec_runtime;
2491 }
2492
2493out_running: 2442out_running:
2494 trace_sched_wakeup(rq, p, success); 2443 ttwu_post_activation(p, rq, wake_flags, success);
2495 check_preempt_curr(rq, p, wake_flags);
2496
2497 p->state = TASK_RUNNING;
2498#ifdef CONFIG_SMP
2499 if (p->sched_class->task_woken)
2500 p->sched_class->task_woken(rq, p);
2501
2502 if (unlikely(rq->idle_stamp)) {
2503 u64 delta = rq->clock - rq->idle_stamp;
2504 u64 max = 2*sysctl_sched_migration_cost;
2505
2506 if (delta > max)
2507 rq->avg_idle = max;
2508 else
2509 update_avg(&rq->avg_idle, delta);
2510 rq->idle_stamp = 0;
2511 }
2512#endif
2513out: 2444out:
2514 if (is_realtime(p)) 2445 if (is_realtime(p))
2515 TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state); 2446 TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state);
@@ -2520,6 +2451,37 @@ out:
2520} 2451}
2521 2452
2522/** 2453/**
2454 * try_to_wake_up_local - try to wake up a local task with rq lock held
2455 * @p: the thread to be awakened
2456 *
2457 * Put @p on the run-queue if it's not already there. The caller must
2458 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2459 * the current task. this_rq() stays locked over invocation.
2460 */
2461static void try_to_wake_up_local(struct task_struct *p)
2462{
2463 struct rq *rq = task_rq(p);
2464 bool success = false;
2465
2466 BUG_ON(rq != this_rq());
2467 BUG_ON(p == current);
2468 lockdep_assert_held(&rq->lock);
2469
2470 if (!(p->state & TASK_NORMAL))
2471 return;
2472
2473 if (!p->se.on_rq) {
2474 if (likely(!task_running(rq, p))) {
2475 schedstat_inc(rq, ttwu_count);
2476 schedstat_inc(rq, ttwu_local);
2477 }
2478 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
2479 success = true;
2480 }
2481 ttwu_post_activation(p, rq, 0, success);
2482}
2483
2484/**
2523 * wake_up_process - Wake up a specific process 2485 * wake_up_process - Wake up a specific process
2524 * @p: The process to be woken up. 2486 * @p: The process to be woken up.
2525 * 2487 *
@@ -2553,42 +2515,9 @@ static void __sched_fork(struct task_struct *p)
2553 p->se.sum_exec_runtime = 0; 2515 p->se.sum_exec_runtime = 0;
2554 p->se.prev_sum_exec_runtime = 0; 2516 p->se.prev_sum_exec_runtime = 0;
2555 p->se.nr_migrations = 0; 2517 p->se.nr_migrations = 0;
2556 p->se.last_wakeup = 0;
2557 p->se.avg_overlap = 0;
2558 p->se.start_runtime = 0;
2559 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2560 2518
2561#ifdef CONFIG_SCHEDSTATS 2519#ifdef CONFIG_SCHEDSTATS
2562 p->se.wait_start = 0; 2520 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2563 p->se.wait_max = 0;
2564 p->se.wait_count = 0;
2565 p->se.wait_sum = 0;
2566
2567 p->se.sleep_start = 0;
2568 p->se.sleep_max = 0;
2569 p->se.sum_sleep_runtime = 0;
2570
2571 p->se.block_start = 0;
2572 p->se.block_max = 0;
2573 p->se.exec_max = 0;
2574 p->se.slice_max = 0;
2575
2576 p->se.nr_migrations_cold = 0;
2577 p->se.nr_failed_migrations_affine = 0;
2578 p->se.nr_failed_migrations_running = 0;
2579 p->se.nr_failed_migrations_hot = 0;
2580 p->se.nr_forced_migrations = 0;
2581
2582 p->se.nr_wakeups = 0;
2583 p->se.nr_wakeups_sync = 0;
2584 p->se.nr_wakeups_migrate = 0;
2585 p->se.nr_wakeups_local = 0;
2586 p->se.nr_wakeups_remote = 0;
2587 p->se.nr_wakeups_affine = 0;
2588 p->se.nr_wakeups_affine_attempts = 0;
2589 p->se.nr_wakeups_passive = 0;
2590 p->se.nr_wakeups_idle = 0;
2591
2592#endif 2521#endif
2593 2522
2594 INIT_LIST_HEAD(&p->rt.run_list); 2523 INIT_LIST_HEAD(&p->rt.run_list);
@@ -2609,11 +2538,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
2609 2538
2610 __sched_fork(p); 2539 __sched_fork(p);
2611 /* 2540 /*
2612 * We mark the process as waking here. This guarantees that 2541 * We mark the process as running here. This guarantees that
2613 * nobody will actually run it, and a signal or other external 2542 * nobody will actually run it, and a signal or other external
2614 * event cannot wake it up and insert it on the runqueue either. 2543 * event cannot wake it up and insert it on the runqueue either.
2615 */ 2544 */
2616 p->state = TASK_WAKING; 2545 p->state = TASK_RUNNING;
2617 2546
2618 /* 2547 /*
2619 * Revert to default priority/policy on fork if requested. 2548 * Revert to default priority/policy on fork if requested.
@@ -2648,7 +2577,16 @@ void sched_fork(struct task_struct *p, int clone_flags)
2648 if (p->sched_class->task_fork) 2577 if (p->sched_class->task_fork)
2649 p->sched_class->task_fork(p); 2578 p->sched_class->task_fork(p);
2650 2579
2580 /*
2581 * The child is not yet in the pid-hash so no cgroup attach races,
2582 * and the cgroup is pinned to this child due to cgroup_fork()
2583 * is ran before sched_fork().
2584 *
2585 * Silence PROVE_RCU.
2586 */
2587 rcu_read_lock();
2651 set_task_cpu(p, cpu); 2588 set_task_cpu(p, cpu);
2589 rcu_read_unlock();
2652 2590
2653#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2591#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2654 if (likely(sched_info_on())) 2592 if (likely(sched_info_on()))
@@ -2680,31 +2618,27 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2680 int cpu __maybe_unused = get_cpu(); 2618 int cpu __maybe_unused = get_cpu();
2681 2619
2682#ifdef CONFIG_SMP 2620#ifdef CONFIG_SMP
2621 rq = task_rq_lock(p, &flags);
2622 p->state = TASK_WAKING;
2623
2683 /* 2624 /*
2684 * Fork balancing, do it here and not earlier because: 2625 * Fork balancing, do it here and not earlier because:
2685 * - cpus_allowed can change in the fork path 2626 * - cpus_allowed can change in the fork path
2686 * - any previously selected cpu might disappear through hotplug 2627 * - any previously selected cpu might disappear through hotplug
2687 * 2628 *
2688 * We still have TASK_WAKING but PF_STARTING is gone now, meaning 2629 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2689 * ->cpus_allowed is stable, we have preemption disabled, meaning 2630 * without people poking at ->cpus_allowed.
2690 * cpu_online_mask is stable.
2691 */ 2631 */
2692 cpu = select_task_rq(p, SD_BALANCE_FORK, 0); 2632 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
2693 set_task_cpu(p, cpu); 2633 set_task_cpu(p, cpu);
2694#endif
2695 2634
2696 /*
2697 * Since the task is not on the rq and we still have TASK_WAKING set
2698 * nobody else will migrate this task.
2699 */
2700 rq = cpu_rq(cpu);
2701 raw_spin_lock_irqsave(&rq->lock, flags);
2702
2703 BUG_ON(p->state != TASK_WAKING);
2704 p->state = TASK_RUNNING; 2635 p->state = TASK_RUNNING;
2705 update_rq_clock(rq); 2636 task_rq_unlock(rq, &flags);
2637#endif
2638
2639 rq = task_rq_lock(p, &flags);
2706 activate_task(rq, p, 0); 2640 activate_task(rq, p, 0);
2707 trace_sched_wakeup_new(rq, p, 1); 2641 trace_sched_wakeup_new(p, 1);
2708 check_preempt_curr(rq, p, WF_FORK); 2642 check_preempt_curr(rq, p, WF_FORK);
2709#ifdef CONFIG_SMP 2643#ifdef CONFIG_SMP
2710 if (p->sched_class->task_woken) 2644 if (p->sched_class->task_woken)
@@ -2935,7 +2869,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2935 struct mm_struct *mm, *oldmm; 2869 struct mm_struct *mm, *oldmm;
2936 2870
2937 prepare_task_switch(rq, prev, next); 2871 prepare_task_switch(rq, prev, next);
2938 trace_sched_switch(rq, prev, next); 2872 trace_sched_switch(prev, next);
2939 mm = next->mm; 2873 mm = next->mm;
2940 oldmm = prev->active_mm; 2874 oldmm = prev->active_mm;
2941 /* 2875 /*
@@ -3033,9 +2967,9 @@ unsigned long nr_iowait(void)
3033 return sum; 2967 return sum;
3034} 2968}
3035 2969
3036unsigned long nr_iowait_cpu(void) 2970unsigned long nr_iowait_cpu(int cpu)
3037{ 2971{
3038 struct rq *this = this_rq(); 2972 struct rq *this = cpu_rq(cpu);
3039 return atomic_read(&this->nr_iowait); 2973 return atomic_read(&this->nr_iowait);
3040} 2974}
3041 2975
@@ -3052,6 +2986,61 @@ static unsigned long calc_load_update;
3052unsigned long avenrun[3]; 2986unsigned long avenrun[3];
3053EXPORT_SYMBOL(avenrun); 2987EXPORT_SYMBOL(avenrun);
3054 2988
2989static long calc_load_fold_active(struct rq *this_rq)
2990{
2991 long nr_active, delta = 0;
2992
2993 nr_active = this_rq->nr_running;
2994 nr_active += (long) this_rq->nr_uninterruptible;
2995
2996 if (nr_active != this_rq->calc_load_active) {
2997 delta = nr_active - this_rq->calc_load_active;
2998 this_rq->calc_load_active = nr_active;
2999 }
3000
3001 return delta;
3002}
3003
3004#ifdef CONFIG_NO_HZ
3005/*
3006 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
3007 *
3008 * When making the ILB scale, we should try to pull this in as well.
3009 */
3010static atomic_long_t calc_load_tasks_idle;
3011
3012static void calc_load_account_idle(struct rq *this_rq)
3013{
3014 long delta;
3015
3016 delta = calc_load_fold_active(this_rq);
3017 if (delta)
3018 atomic_long_add(delta, &calc_load_tasks_idle);
3019}
3020
3021static long calc_load_fold_idle(void)
3022{
3023 long delta = 0;
3024
3025 /*
3026 * Its got a race, we don't care...
3027 */
3028 if (atomic_long_read(&calc_load_tasks_idle))
3029 delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
3030
3031 return delta;
3032}
3033#else
3034static void calc_load_account_idle(struct rq *this_rq)
3035{
3036}
3037
3038static inline long calc_load_fold_idle(void)
3039{
3040 return 0;
3041}
3042#endif
3043
3055/** 3044/**
3056 * get_avenrun - get the load average array 3045 * get_avenrun - get the load average array
3057 * @loads: pointer to dest load array 3046 * @loads: pointer to dest load array
@@ -3098,40 +3087,121 @@ void calc_global_load(void)
3098} 3087}
3099 3088
3100/* 3089/*
3101 * Either called from update_cpu_load() or from a cpu going idle 3090 * Called from update_cpu_load() to periodically update this CPU's
3091 * active count.
3102 */ 3092 */
3103static void calc_load_account_active(struct rq *this_rq) 3093static void calc_load_account_active(struct rq *this_rq)
3104{ 3094{
3105 long nr_active, delta; 3095 long delta;
3106 3096
3107 nr_active = this_rq->nr_running; 3097 if (time_before(jiffies, this_rq->calc_load_update))
3108 nr_active += (long) this_rq->nr_uninterruptible; 3098 return;
3109 3099
3110 if (nr_active != this_rq->calc_load_active) { 3100 delta = calc_load_fold_active(this_rq);
3111 delta = nr_active - this_rq->calc_load_active; 3101 delta += calc_load_fold_idle();
3112 this_rq->calc_load_active = nr_active; 3102 if (delta)
3113 atomic_long_add(delta, &calc_load_tasks); 3103 atomic_long_add(delta, &calc_load_tasks);
3104
3105 this_rq->calc_load_update += LOAD_FREQ;
3106}
3107
3108/*
3109 * The exact cpuload at various idx values, calculated at every tick would be
3110 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
3111 *
3112 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
3113 * on nth tick when cpu may be busy, then we have:
3114 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3115 * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
3116 *
3117 * decay_load_missed() below does efficient calculation of
3118 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3119 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
3120 *
3121 * The calculation is approximated on a 128 point scale.
3122 * degrade_zero_ticks is the number of ticks after which load at any
3123 * particular idx is approximated to be zero.
3124 * degrade_factor is a precomputed table, a row for each load idx.
3125 * Each column corresponds to degradation factor for a power of two ticks,
3126 * based on 128 point scale.
3127 * Example:
3128 * row 2, col 3 (=12) says that the degradation at load idx 2 after
3129 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
3130 *
3131 * With this power of 2 load factors, we can degrade the load n times
3132 * by looking at 1 bits in n and doing as many mult/shift instead of
3133 * n mult/shifts needed by the exact degradation.
3134 */
3135#define DEGRADE_SHIFT 7
3136static const unsigned char
3137 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
3138static const unsigned char
3139 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
3140 {0, 0, 0, 0, 0, 0, 0, 0},
3141 {64, 32, 8, 0, 0, 0, 0, 0},
3142 {96, 72, 40, 12, 1, 0, 0},
3143 {112, 98, 75, 43, 15, 1, 0},
3144 {120, 112, 98, 76, 45, 16, 2} };
3145
3146/*
3147 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
3148 * would be when CPU is idle and so we just decay the old load without
3149 * adding any new load.
3150 */
3151static unsigned long
3152decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3153{
3154 int j = 0;
3155
3156 if (!missed_updates)
3157 return load;
3158
3159 if (missed_updates >= degrade_zero_ticks[idx])
3160 return 0;
3161
3162 if (idx == 1)
3163 return load >> missed_updates;
3164
3165 while (missed_updates) {
3166 if (missed_updates % 2)
3167 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
3168
3169 missed_updates >>= 1;
3170 j++;
3114 } 3171 }
3172 return load;
3115} 3173}
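To make the table above concrete, here is a worked check of the entry the comment cites, plus how decay_load_missed() combines entries for a gap that is not a power of two:

/*
 * Row 2, col 3: idx 2 decays by 3/4 per tick and col 3 covers 2^3 = 8
 * ticks, so the factor is
 *	(3/4)^8 = 6561/65536 ~= 0.100	->	0.100 * 128 ~= 12.8	->	12
 *
 * For missed_updates = 5 (binary 101) at idx 2, the loop multiplies by
 * the 1-tick factor (col 0 = 96) and the 4-tick factor (col 2 = 40):
 *	load * 96/128 * 40/128 ~= 0.234 * load
 * versus the exact (3/4)^5 = 243/1024 ~= 0.237 * load,
 * i.e. two mult/shift pairs instead of five.
 */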
3116 3174
3117/* 3175/*
3118 * Update rq->cpu_load[] statistics. This function is usually called every 3176 * Update rq->cpu_load[] statistics. This function is usually called every
3119 * scheduler tick (TICK_NSEC). 3177 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3178 * every tick. We fix it up based on jiffies.
3120 */ 3179 */
3121static void update_cpu_load(struct rq *this_rq) 3180static void update_cpu_load(struct rq *this_rq)
3122{ 3181{
3123 unsigned long this_load = this_rq->load.weight; 3182 unsigned long this_load = this_rq->load.weight;
3183 unsigned long curr_jiffies = jiffies;
3184 unsigned long pending_updates;
3124 int i, scale; 3185 int i, scale;
3125 3186
3126 this_rq->nr_load_updates++; 3187 this_rq->nr_load_updates++;
3127 3188
3189 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
3190 if (curr_jiffies == this_rq->last_load_update_tick)
3191 return;
3192
3193 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
3194 this_rq->last_load_update_tick = curr_jiffies;
3195
3128 /* Update our load: */ 3196 /* Update our load: */
3129 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 3197 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
3198 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3130 unsigned long old_load, new_load; 3199 unsigned long old_load, new_load;
3131 3200
3132 /* scale is effectively 1 << i now, and >> i divides by scale */ 3201 /* scale is effectively 1 << i now, and >> i divides by scale */
3133 3202
3134 old_load = this_rq->cpu_load[i]; 3203 old_load = this_rq->cpu_load[i];
3204 old_load = decay_load_missed(old_load, pending_updates - 1, i);
3135 new_load = this_load; 3205 new_load = this_load;
3136 /* 3206 /*
3137 * Round up the averaging division if load is increasing. This 3207 * Round up the averaging division if load is increasing. This
@@ -3139,14 +3209,19 @@ static void update_cpu_load(struct rq *this_rq)
3139 * example. 3209 * example.
3140 */ 3210 */
3141 if (new_load > old_load) 3211 if (new_load > old_load)
3142 new_load += scale-1; 3212 new_load += scale - 1;
3143 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3144 }
3145 3213
3146 if (time_after_eq(jiffies, this_rq->calc_load_update)) { 3214 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
3147 this_rq->calc_load_update += LOAD_FREQ;
3148 calc_load_account_active(this_rq);
3149 } 3215 }
3216
3217 sched_avg_update(this_rq);
3218}
3219
3220static void update_cpu_load_active(struct rq *this_rq)
3221{
3222 update_cpu_load(this_rq);
3223
3224 calc_load_account_active(this_rq);
3150} 3225}
3151 3226
3152#ifdef CONFIG_SMP 3227#ifdef CONFIG_SMP
@@ -3158,44 +3233,27 @@ static void update_cpu_load(struct rq *this_rq)
3158void sched_exec(void) 3233void sched_exec(void)
3159{ 3234{
3160 struct task_struct *p = current; 3235 struct task_struct *p = current;
3161 struct migration_req req;
3162 int dest_cpu, this_cpu;
3163 unsigned long flags; 3236 unsigned long flags;
3164 struct rq *rq; 3237 struct rq *rq;
3165 3238 int dest_cpu;
3166again:
3167 this_cpu = get_cpu();
3168 dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
3169 if (dest_cpu == this_cpu) {
3170 put_cpu();
3171 return;
3172 }
3173 3239
3174 rq = task_rq_lock(p, &flags); 3240 rq = task_rq_lock(p, &flags);
3175 put_cpu(); 3241 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
3242 if (dest_cpu == smp_processor_id())
3243 goto unlock;
3176 3244
3177 /* 3245 /*
3178 * select_task_rq() can race against ->cpus_allowed 3246 * select_task_rq() can race against ->cpus_allowed
3179 */ 3247 */
3180 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) 3248 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3181 || unlikely(!cpu_active(dest_cpu))) { 3249 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
3182 task_rq_unlock(rq, &flags); 3250 struct migration_arg arg = { p, dest_cpu };
3183 goto again;
3184 }
3185
3186 /* force the process onto the specified CPU */
3187 if (migrate_task(p, dest_cpu, &req)) {
3188 /* Need to wait for migration thread (might exit: take ref). */
3189 struct task_struct *mt = rq->migration_thread;
3190 3251
3191 get_task_struct(mt);
3192 task_rq_unlock(rq, &flags); 3252 task_rq_unlock(rq, &flags);
3193 wake_up_process(mt); 3253 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
3194 put_task_struct(mt);
3195 wait_for_completion(&req.done);
3196
3197 return; 3254 return;
3198 } 3255 }
3256unlock:
3199 task_rq_unlock(rq, &flags); 3257 task_rq_unlock(rq, &flags);
3200} 3258}
3201 3259
@@ -3482,9 +3540,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3482 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 3540 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3483 3541
3484 if (total) { 3542 if (total) {
3485 u64 temp; 3543 u64 temp = rtime;
3486 3544
3487 temp = (u64)(rtime * utime); 3545 temp *= utime;
3488 do_div(temp, total); 3546 do_div(temp, total);
3489 utime = (cputime_t)temp; 3547 utime = (cputime_t)temp;
3490 } else 3548 } else
@@ -3515,9 +3573,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3515 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 3573 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3516 3574
3517 if (total) { 3575 if (total) {
3518 u64 temp; 3576 u64 temp = rtime;
3519 3577
3520 temp = (u64)(rtime * cputime.utime); 3578 temp *= cputime.utime;
3521 do_div(temp, total); 3579 do_div(temp, total);
3522 utime = (cputime_t)temp; 3580 utime = (cputime_t)temp;
3523 } else 3581 } else
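The two hunks above reorder the widening of the product; a brief sketch of why that matters when cputime_t is 32 bits (the numbers are made up):

/*
 * Old:	temp = (u64)(rtime * utime);
 *	The multiply happens in cputime_t and wraps before the cast,
 *	e.g. rtime = 100000, utime = 50000 jiffies:
 *	100000 * 50000 = 5e9 > 2^32, so temp holds a truncated product.
 *
 * New:	u64 temp = rtime;
 *	temp *= utime;
 *	rtime is promoted to 64 bits first, so the full product reaches
 *	do_div(temp, total) intact.
 */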
@@ -3551,7 +3609,7 @@ void scheduler_tick(void)
3551 3609
3552 raw_spin_lock(&rq->lock); 3610 raw_spin_lock(&rq->lock);
3553 update_rq_clock(rq); 3611 update_rq_clock(rq);
3554 update_cpu_load(rq); 3612 update_cpu_load_active(rq);
3555 curr->sched_class->task_tick(rq, curr, 0); 3613 curr->sched_class->task_tick(rq, curr, 0);
3556 3614
3557 /* litmus_tick may force current to resched */ 3615 /* litmus_tick may force current to resched */
@@ -3675,23 +3733,9 @@ static inline void schedule_debug(struct task_struct *prev)
3675 3733
3676static void put_prev_task(struct rq *rq, struct task_struct *prev) 3734static void put_prev_task(struct rq *rq, struct task_struct *prev)
3677{ 3735{
3678 if (prev->state == TASK_RUNNING) { 3736 if (prev->se.on_rq)
3679 u64 runtime = prev->se.sum_exec_runtime; 3737 update_rq_clock(rq);
3680 3738 rq->skip_clock_update = 0;
3681 runtime -= prev->se.prev_sum_exec_runtime;
3682 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
3683
3684 /*
3685 * In order to avoid avg_overlap growing stale when we are
3686 * indeed overlapping and hence not getting put to sleep, grow
3687 * the avg_overlap on preemption.
3688 *
3689 * We use the average preemption runtime because that
3690 * correlates to the amount of cache footprint a task can
3691 * build up.
3692 */
3693 update_avg(&prev->se.avg_overlap, runtime);
3694 }
3695 prev->sched_class->put_prev_task(rq, prev); 3739 prev->sched_class->put_prev_task(rq, prev);
3696} 3740}
3697 3741
@@ -3749,9 +3793,8 @@ need_resched:
3749 preempt_disable(); 3793 preempt_disable();
3750 cpu = smp_processor_id(); 3794 cpu = smp_processor_id();
3751 rq = cpu_rq(cpu); 3795 rq = cpu_rq(cpu);
3752 rcu_sched_qs(cpu); 3796 rcu_note_context_switch(cpu);
3753 prev = rq->curr; 3797 prev = rq->curr;
3754 switch_count = &prev->nivcsw;
3755 3798
3756 release_kernel_lock(prev); 3799 release_kernel_lock(prev);
3757need_resched_nonpreemptible: 3800need_resched_nonpreemptible:
@@ -3764,14 +3807,28 @@ need_resched_nonpreemptible:
3764 hrtick_clear(rq); 3807 hrtick_clear(rq);
3765 3808
3766 raw_spin_lock_irq(&rq->lock); 3809 raw_spin_lock_irq(&rq->lock);
3767 update_rq_clock(rq);
3768 clear_tsk_need_resched(prev); 3810 clear_tsk_need_resched(prev);
3769 3811
3812 switch_count = &prev->nivcsw;
3770 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3813 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3771 if (unlikely(signal_pending_state(prev->state, prev))) 3814 if (unlikely(signal_pending_state(prev->state, prev))) {
3772 prev->state = TASK_RUNNING; 3815 prev->state = TASK_RUNNING;
3773 else 3816 } else {
3774 deactivate_task(rq, prev, 1); 3817 /*
3818 * If a worker is going to sleep, notify and
3819 * ask workqueue whether it wants to wake up a
3820 * task to maintain concurrency. If so, wake
3821 * up the task.
3822 */
3823 if (prev->flags & PF_WQ_WORKER) {
3824 struct task_struct *to_wakeup;
3825
3826 to_wakeup = wq_worker_sleeping(prev, cpu);
3827 if (to_wakeup)
3828 try_to_wake_up_local(to_wakeup);
3829 }
3830 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3831 }
3775 switch_count = &prev->nvcsw; 3832 switch_count = &prev->nvcsw;
3776 } 3833 }
3777 3834
@@ -3796,8 +3853,10 @@ need_resched_nonpreemptible:
3796 context_switch(rq, prev, next); /* unlocks the rq */ 3853 context_switch(rq, prev, next); /* unlocks the rq */
3797 TS_CXS_END(current); 3854 TS_CXS_END(current);
3798 /* 3855 /*
3799 * the context switch might have flipped the stack from under 3856 * The context switch has flipped the stack from under us
3800 * us, hence refresh the local variables. 3857 * and restored the local variables which were saved when
3858 * this task called schedule() in the past. prev == current
3859 * is still correct, but it can be moved to another cpu/rq.
3801 */ 3860 */
3802 cpu = smp_processor_id(); 3861 cpu = smp_processor_id();
3803 rq = cpu_rq(cpu); 3862 rq = cpu_rq(cpu);
@@ -3810,11 +3869,8 @@ need_resched_nonpreemptible:
3810 3869
3811 post_schedule(rq); 3870 post_schedule(rq);
3812 3871
3813 if (unlikely(reacquire_kernel_lock(current) < 0)) { 3872 if (unlikely(reacquire_kernel_lock(prev)))
3814 prev = rq->curr;
3815 switch_count = &prev->nivcsw;
3816 goto need_resched_nonpreemptible; 3873 goto need_resched_nonpreemptible;
3817 }
3818 3874
3819 preempt_enable_no_resched(); 3875 preempt_enable_no_resched();
3820 if (need_resched()) 3876 if (need_resched())
@@ -3870,8 +3926,16 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3870 /* 3926 /*
3871 * Owner changed, break to re-assess state. 3927 * Owner changed, break to re-assess state.
3872 */ 3928 */
3873 if (lock->owner != owner) 3929 if (lock->owner != owner) {
3930 /*
3931 * If the lock has switched to a different owner,
3932 * we likely have heavy contention. Return 0 to quit
3933 * optimistic spinning and not contend further:
3934 */
3935 if (lock->owner)
3936 return 0;
3874 break; 3937 break;
3938 }
3875 3939
3876 /* 3940 /*
3877 * Is that owner really running on that cpu? 3941 * Is that owner really running on that cpu?
@@ -3892,7 +3956,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3892 * off of preempt_enable. Kernel preemptions off return from interrupt 3956 * off of preempt_enable. Kernel preemptions off return from interrupt
3893 * occur there and call schedule directly. 3957 * occur there and call schedule directly.
3894 */ 3958 */
3895asmlinkage void __sched preempt_schedule(void) 3959asmlinkage void __sched notrace preempt_schedule(void)
3896{ 3960{
3897 struct thread_info *ti = current_thread_info(); 3961 struct thread_info *ti = current_thread_info();
3898 3962
@@ -3904,9 +3968,9 @@ asmlinkage void __sched preempt_schedule(void)
3904 return; 3968 return;
3905 3969
3906 do { 3970 do {
3907 add_preempt_count(PREEMPT_ACTIVE); 3971 add_preempt_count_notrace(PREEMPT_ACTIVE);
3908 schedule(); 3972 schedule();
3909 sub_preempt_count(PREEMPT_ACTIVE); 3973 sub_preempt_count_notrace(PREEMPT_ACTIVE);
3910 3974
3911 /* 3975 /*
3912 * Check again in case we missed a preemption opportunity 3976 * Check again in case we missed a preemption opportunity
@@ -4005,6 +4069,7 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4005{ 4069{
4006 __wake_up_common(q, mode, 1, 0, NULL); 4070 __wake_up_common(q, mode, 1, 0, NULL);
4007} 4071}
4072EXPORT_SYMBOL_GPL(__wake_up_locked);
4008 4073
4009void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 4074void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
4010{ 4075{
@@ -4115,8 +4180,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
4115 if (!x->done) { 4180 if (!x->done) {
4116 DECLARE_WAITQUEUE(wait, current); 4181 DECLARE_WAITQUEUE(wait, current);
4117 4182
4118 wait.flags |= WQ_FLAG_EXCLUSIVE; 4183 __add_wait_queue_tail_exclusive(&x->wait, &wait);
4119 __add_wait_queue_tail(&x->wait, &wait);
4120 do { 4184 do {
4121 if (signal_pending_state(state, current)) { 4185 if (signal_pending_state(state, current)) {
4122 timeout = -ERESTARTSYS; 4186 timeout = -ERESTARTSYS;
@@ -4227,6 +4291,23 @@ int __sched wait_for_completion_killable(struct completion *x)
4227EXPORT_SYMBOL(wait_for_completion_killable); 4291EXPORT_SYMBOL(wait_for_completion_killable);
4228 4292
4229/** 4293/**
4294 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
4295 * @x: holds the state of this particular completion
4296 * @timeout: timeout value in jiffies
4297 *
4298 * This waits for either a completion of a specific task to be
4299 * signaled or for a specified timeout to expire. It can be
4300 * interrupted by a kill signal. The timeout is in jiffies.
4301 */
4302unsigned long __sched
4303wait_for_completion_killable_timeout(struct completion *x,
4304 unsigned long timeout)
4305{
4306 return wait_for_common(x, timeout, TASK_KILLABLE);
4307}
4308EXPORT_SYMBOL(wait_for_completion_killable_timeout);
4309
4310/**
4230 * try_wait_for_completion - try to decrement a completion without blocking 4311 * try_wait_for_completion - try to decrement a completion without blocking
4231 * @x: completion structure 4312 * @x: completion structure
4232 * 4313 *
@@ -4342,7 +4423,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4342 BUG_ON(prio < 0 || prio > MAX_PRIO); 4423 BUG_ON(prio < 0 || prio > MAX_PRIO);
4343 4424
4344 rq = task_rq_lock(p, &flags); 4425 rq = task_rq_lock(p, &flags);
4345 update_rq_clock(rq);
4346 4426
4347 oldprio = p->prio; 4427 oldprio = p->prio;
4348 prev_class = p->sched_class; 4428 prev_class = p->sched_class;
@@ -4363,7 +4443,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4363 if (running) 4443 if (running)
4364 p->sched_class->set_curr_task(rq); 4444 p->sched_class->set_curr_task(rq);
4365 if (on_rq) { 4445 if (on_rq) {
4366 enqueue_task(rq, p, 0, oldprio < prio); 4446 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4367 4447
4368 check_class_changed(rq, p, prev_class, oldprio, running); 4448 check_class_changed(rq, p, prev_class, oldprio, running);
4369 } 4449 }
@@ -4385,7 +4465,6 @@ void set_user_nice(struct task_struct *p, long nice)
4385 * the task might be in the middle of scheduling on another CPU. 4465 * the task might be in the middle of scheduling on another CPU.
4386 */ 4466 */
4387 rq = task_rq_lock(p, &flags); 4467 rq = task_rq_lock(p, &flags);
4388 update_rq_clock(rq);
4389 /* 4468 /*
4390 * The RT priorities are set via sched_setscheduler(), but we still 4469 * The RT priorities are set via sched_setscheduler(), but we still
4391 * allow the 'normal' nice value to be set - but as expected 4470 * allow the 'normal' nice value to be set - but as expected
@@ -4407,7 +4486,7 @@ void set_user_nice(struct task_struct *p, long nice)
4407 delta = p->prio - old_prio; 4486 delta = p->prio - old_prio;
4408 4487
4409 if (on_rq) { 4488 if (on_rq) {
4410 enqueue_task(rq, p, 0, false); 4489 enqueue_task(rq, p, 0);
4411 /* 4490 /*
4412 * If the task increased its priority or is running and 4491 * If the task increased its priority or is running and
4413 * lowered its priority, then reschedule its CPU: 4492 * lowered its priority, then reschedule its CPU:
@@ -4607,12 +4686,8 @@ recheck:
4607 */ 4686 */
4608 if (user && !capable(CAP_SYS_NICE)) { 4687 if (user && !capable(CAP_SYS_NICE)) {
4609 if (rt_policy(policy)) { 4688 if (rt_policy(policy)) {
4610 unsigned long rlim_rtprio; 4689 unsigned long rlim_rtprio =
4611 4690 task_rlimit(p, RLIMIT_RTPRIO);
4612 if (!lock_task_sighand(p, &flags))
4613 return -ESRCH;
4614 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
4615 unlock_task_sighand(p, &flags);
4616 4691
4617 /* can't set/change the rt policy */ 4692 /* can't set/change the rt policy */
4618 if (policy != p->policy && !rlim_rtprio) 4693 if (policy != p->policy && !rlim_rtprio)
@@ -4640,16 +4715,6 @@ recheck:
4640 } 4715 }
4641 4716
4642 if (user) { 4717 if (user) {
4643#ifdef CONFIG_RT_GROUP_SCHED
4644 /*
4645 * Do not allow realtime tasks into groups that have no runtime
4646 * assigned.
4647 */
4648 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4649 task_group(p)->rt_bandwidth.rt_runtime == 0)
4650 return -EPERM;
4651#endif
4652
4653 retval = security_task_setscheduler(p, policy, param); 4718 retval = security_task_setscheduler(p, policy, param);
4654 if (retval) 4719 if (retval)
4655 return retval; 4720 return retval;
@@ -4671,6 +4736,22 @@ recheck:
4671 * runqueue lock must be held. 4736 * runqueue lock must be held.
4672 */ 4737 */
4673 rq = __task_rq_lock(p); 4738 rq = __task_rq_lock(p);
4739
4740#ifdef CONFIG_RT_GROUP_SCHED
4741 if (user) {
4742 /*
4743 * Do not allow realtime tasks into groups that have no runtime
4744 * assigned.
4745 */
4746 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4747 task_group(p)->rt_bandwidth.rt_runtime == 0) {
4748 __task_rq_unlock(rq);
4749 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4750 return -EPERM;
4751 }
4752 }
4753#endif
4754
4674 /* recheck policy now with rq lock held */ 4755 /* recheck policy now with rq lock held */
4675 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4756 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4676 policy = oldpolicy = -1; 4757 policy = oldpolicy = -1;
@@ -4678,7 +4759,6 @@ recheck:
4678 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4759 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4679 goto recheck; 4760 goto recheck;
4680 } 4761 }
4681 update_rq_clock(rq);
4682 on_rq = p->se.on_rq; 4762 on_rq = p->se.on_rq;
4683 running = task_current(rq, p); 4763 running = task_current(rq, p);
4684 if (on_rq) 4764 if (on_rq)
@@ -5425,17 +5505,15 @@ static inline void sched_init_granularity(void)
5425/* 5505/*
5426 * This is how migration works: 5506 * This is how migration works:
5427 * 5507 *
5428 * 1) we queue a struct migration_req structure in the source CPU's 5508 * 1) we invoke migration_cpu_stop() on the target CPU using
5429 * runqueue and wake up that CPU's migration thread. 5509 * stop_one_cpu().
5430 * 2) we down() the locked semaphore => thread blocks. 5510 * 2) stopper starts to run (implicitly forcing the migrated thread
5431 * 3) migration thread wakes up (implicitly it forces the migrated 5511 * off the CPU)
5432 * thread off the CPU) 5512 * 3) it checks whether the migrated task is still in the wrong runqueue.
5433 * 4) it gets the migration request and checks whether the migrated 5513 * 4) if it's in the wrong runqueue then the migration thread removes
5434 * task is still in the wrong runqueue. 5513 * 4) if it's in the wrong runqueue then the stopper removes
5435 * 5) if it's in the wrong runqueue then the migration thread removes
5436 * it and puts it into the right queue. 5514 * it and puts it into the right queue.
5437 * 6) migration thread up()s the semaphore. 5515 * 5) stopper completes and stop_one_cpu() returns and the migration
5438 * 7) we wake up and the migration is done. 5516 * is done.
5439 */ 5517 */
5440 5518
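
Editor's sketch of the flow described in the comment above, modelled on the set_cpus_allowed_ptr() hunk further down; the wrapper name is hypothetical, while struct migration_arg, migration_cpu_stop() and stop_one_cpu() are the real pieces introduced by this patch:

	/* Ask the stopper thread of p's current CPU to run migration_cpu_stop();
	 * the stopper preempts p, and the callback re-queues p on dest_cpu if it
	 * is still on the wrong runqueue.  stop_one_cpu() blocks until the
	 * callback has finished, so the migration is complete on return. */
	static void example_force_migration(struct task_struct *p, int dest_cpu)
	{
		struct migration_arg arg = { p, dest_cpu };

		stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
	}
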
5441/* 5519/*
@@ -5449,12 +5527,23 @@ static inline void sched_init_granularity(void)
5449 */ 5527 */
5450int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 5528int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5451{ 5529{
5452 struct migration_req req;
5453 unsigned long flags; 5530 unsigned long flags;
5454 struct rq *rq; 5531 struct rq *rq;
5532 unsigned int dest_cpu;
5455 int ret = 0; 5533 int ret = 0;
5456 5534
5535 /*
5536 * Serialize against TASK_WAKING so that ttwu() and wake_up_new_task() can
5537 * drop the rq->lock and still rely on ->cpus_allowed.
5538 */
5539again:
5540 while (task_is_waking(p))
5541 cpu_relax();
5457 rq = task_rq_lock(p, &flags); 5542 rq = task_rq_lock(p, &flags);
5543 if (task_is_waking(p)) {
5544 task_rq_unlock(rq, &flags);
5545 goto again;
5546 }
5458 5547
5459 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5548 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5460 ret = -EINVAL; 5549 ret = -EINVAL;
@@ -5478,15 +5567,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5478 if (cpumask_test_cpu(task_cpu(p), new_mask)) 5567 if (cpumask_test_cpu(task_cpu(p), new_mask))
5479 goto out; 5568 goto out;
5480 5569
5481 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { 5570 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5571 if (migrate_task(p, dest_cpu)) {
5572 struct migration_arg arg = { p, dest_cpu };
5482 /* Need help from migration thread: drop lock and wait. */ 5573 /* Need help from migration thread: drop lock and wait. */
5483 struct task_struct *mt = rq->migration_thread;
5484
5485 get_task_struct(mt);
5486 task_rq_unlock(rq, &flags); 5574 task_rq_unlock(rq, &flags);
5487 wake_up_process(mt); 5575 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5488 put_task_struct(mt);
5489 wait_for_completion(&req.done);
5490 tlb_migrate_finish(p->mm); 5576 tlb_migrate_finish(p->mm);
5491 return 0; 5577 return 0;
5492 } 5578 }
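
For reference, a hedged caller sketch of set_cpus_allowed_ptr() as reworked above (the wrapper and target CPU are hypothetical; the API and its -EINVAL behaviour for inactive CPUs are from this function):

	/* Pin a task to CPU 2; fails with -EINVAL if CPU 2 is not active. */
	static int example_pin_to_cpu2(struct task_struct *tsk)
	{
		return set_cpus_allowed_ptr(tsk, cpumask_of(2));
	}
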
@@ -5544,98 +5630,49 @@ fail:
5544 return ret; 5630 return ret;
5545} 5631}
5546 5632
5547#define RCU_MIGRATION_IDLE 0
5548#define RCU_MIGRATION_NEED_QS 1
5549#define RCU_MIGRATION_GOT_QS 2
5550#define RCU_MIGRATION_MUST_SYNC 3
5551
5552/* 5633/*
5553 * migration_thread - this is a highprio system thread that performs 5634 * migration_cpu_stop - this will be executed by a highprio stopper thread
5554 * thread migration by bumping thread off CPU then 'pushing' onto 5635 * and performs thread migration by bumping thread off CPU then
5555 * another runqueue. 5636 * 'pushing' onto another runqueue.
5556 */ 5637 */
5557static int migration_thread(void *data) 5638static int migration_cpu_stop(void *data)
5558{
5559 int badcpu;
5560 int cpu = (long)data;
5561 struct rq *rq;
5562
5563 rq = cpu_rq(cpu);
5564 BUG_ON(rq->migration_thread != current);
5565
5566 set_current_state(TASK_INTERRUPTIBLE);
5567 while (!kthread_should_stop()) {
5568 struct migration_req *req;
5569 struct list_head *head;
5570
5571 raw_spin_lock_irq(&rq->lock);
5572
5573 if (cpu_is_offline(cpu)) {
5574 raw_spin_unlock_irq(&rq->lock);
5575 break;
5576 }
5577
5578 if (rq->active_balance) {
5579 active_load_balance(rq, cpu);
5580 rq->active_balance = 0;
5581 }
5582
5583 head = &rq->migration_queue;
5584
5585 if (list_empty(head)) {
5586 raw_spin_unlock_irq(&rq->lock);
5587 schedule();
5588 set_current_state(TASK_INTERRUPTIBLE);
5589 continue;
5590 }
5591 req = list_entry(head->next, struct migration_req, list);
5592 list_del_init(head->next);
5593
5594 if (req->task != NULL) {
5595 raw_spin_unlock(&rq->lock);
5596 __migrate_task(req->task, cpu, req->dest_cpu);
5597 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
5598 req->dest_cpu = RCU_MIGRATION_GOT_QS;
5599 raw_spin_unlock(&rq->lock);
5600 } else {
5601 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
5602 raw_spin_unlock(&rq->lock);
5603 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
5604 }
5605 local_irq_enable();
5606
5607 complete(&req->done);
5608 }
5609 __set_current_state(TASK_RUNNING);
5610
5611 return 0;
5612}
5613
5614#ifdef CONFIG_HOTPLUG_CPU
5615
5616static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
5617{ 5639{
5618 int ret; 5640 struct migration_arg *arg = data;
5619 5641
5642 /*
5643 * The original target cpu might have gone down and we might
5644 * be on another cpu but it doesn't matter.
5645 */
5620 local_irq_disable(); 5646 local_irq_disable();
5621 ret = __migrate_task(p, src_cpu, dest_cpu); 5647 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
5622 local_irq_enable(); 5648 local_irq_enable();
5623 return ret; 5649 return 0;
5624} 5650}
5625 5651
5652#ifdef CONFIG_HOTPLUG_CPU
5626/* 5653/*
5627 * Figure out where task on dead CPU should go, use force if necessary. 5654 * Figure out where task on dead CPU should go, use force if necessary.
5628 */ 5655 */
5629static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5656void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5630{ 5657{
5631 int dest_cpu; 5658 struct rq *rq = cpu_rq(dead_cpu);
5659 int needs_cpu, uninitialized_var(dest_cpu);
5660 unsigned long flags;
5632 5661
5633again: 5662 local_irq_save(flags);
5634 dest_cpu = select_fallback_rq(dead_cpu, p);
5635 5663
5636 /* It can have affinity changed while we were choosing. */ 5664 raw_spin_lock(&rq->lock);
5637 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) 5665 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
5638 goto again; 5666 if (needs_cpu)
5667 dest_cpu = select_fallback_rq(dead_cpu, p);
5668 raw_spin_unlock(&rq->lock);
5669 /*
5670 * It can only fail if we race with set_cpus_allowed(),
5671 * in which case the racer should migrate the task anyway.
5672 */
5673 if (needs_cpu)
5674 __migrate_task(p, dead_cpu, dest_cpu);
5675 local_irq_restore(flags);
5639} 5676}
5640 5677
5641/* 5678/*
@@ -5699,7 +5736,6 @@ void sched_idle_next(void)
5699 5736
5700 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5737 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5701 5738
5702 update_rq_clock(rq);
5703 activate_task(rq, p, 0); 5739 activate_task(rq, p, 0);
5704 5740
5705 raw_spin_unlock_irqrestore(&rq->lock, flags); 5741 raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -5754,7 +5790,6 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
5754 for ( ; ; ) { 5790 for ( ; ; ) {
5755 if (!rq->nr_running) 5791 if (!rq->nr_running)
5756 break; 5792 break;
5757 update_rq_clock(rq);
5758 next = pick_next_task(rq); 5793 next = pick_next_task(rq);
5759 if (!next) 5794 if (!next)
5760 break; 5795 break;
@@ -5977,35 +6012,20 @@ static void set_rq_offline(struct rq *rq)
5977static int __cpuinit 6012static int __cpuinit
5978migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 6013migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5979{ 6014{
5980 struct task_struct *p;
5981 int cpu = (long)hcpu; 6015 int cpu = (long)hcpu;
5982 unsigned long flags; 6016 unsigned long flags;
5983 struct rq *rq; 6017 struct rq *rq = cpu_rq(cpu);
5984 6018
5985 switch (action) { 6019 switch (action) {
5986 6020
5987 case CPU_UP_PREPARE: 6021 case CPU_UP_PREPARE:
5988 case CPU_UP_PREPARE_FROZEN: 6022 case CPU_UP_PREPARE_FROZEN:
5989 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5990 if (IS_ERR(p))
5991 return NOTIFY_BAD;
5992 kthread_bind(p, cpu);
5993 /* Must be high prio: stop_machine expects to yield to it. */
5994 rq = task_rq_lock(p, &flags);
5995 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5996 task_rq_unlock(rq, &flags);
5997 get_task_struct(p);
5998 cpu_rq(cpu)->migration_thread = p;
5999 rq->calc_load_update = calc_load_update; 6023 rq->calc_load_update = calc_load_update;
6000 break; 6024 break;
6001 6025
6002 case CPU_ONLINE: 6026 case CPU_ONLINE:
6003 case CPU_ONLINE_FROZEN: 6027 case CPU_ONLINE_FROZEN:
6004 /* Strictly unnecessary, as first user will wake it. */
6005 wake_up_process(cpu_rq(cpu)->migration_thread);
6006
6007 /* Update our root-domain */ 6028 /* Update our root-domain */
6008 rq = cpu_rq(cpu);
6009 raw_spin_lock_irqsave(&rq->lock, flags); 6029 raw_spin_lock_irqsave(&rq->lock, flags);
6010 if (rq->rd) { 6030 if (rq->rd) {
6011 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6031 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -6016,61 +6036,24 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6016 break; 6036 break;
6017 6037
6018#ifdef CONFIG_HOTPLUG_CPU 6038#ifdef CONFIG_HOTPLUG_CPU
6019 case CPU_UP_CANCELED:
6020 case CPU_UP_CANCELED_FROZEN:
6021 if (!cpu_rq(cpu)->migration_thread)
6022 break;
6023 /* Unbind it from offline cpu so it can run. Fall thru. */
6024 kthread_bind(cpu_rq(cpu)->migration_thread,
6025 cpumask_any(cpu_online_mask));
6026 kthread_stop(cpu_rq(cpu)->migration_thread);
6027 put_task_struct(cpu_rq(cpu)->migration_thread);
6028 cpu_rq(cpu)->migration_thread = NULL;
6029 break;
6030
6031 case CPU_DEAD: 6039 case CPU_DEAD:
6032 case CPU_DEAD_FROZEN: 6040 case CPU_DEAD_FROZEN:
6033 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
6034 migrate_live_tasks(cpu); 6041 migrate_live_tasks(cpu);
6035 rq = cpu_rq(cpu);
6036 kthread_stop(rq->migration_thread);
6037 put_task_struct(rq->migration_thread);
6038 rq->migration_thread = NULL;
6039 /* Idle task back to normal (off runqueue, low prio) */ 6042 /* Idle task back to normal (off runqueue, low prio) */
6040 raw_spin_lock_irq(&rq->lock); 6043 raw_spin_lock_irq(&rq->lock);
6041 update_rq_clock(rq);
6042 deactivate_task(rq, rq->idle, 0); 6044 deactivate_task(rq, rq->idle, 0);
6043 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 6045 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
6044 rq->idle->sched_class = &idle_sched_class; 6046 rq->idle->sched_class = &idle_sched_class;
6045 migrate_dead_tasks(cpu); 6047 migrate_dead_tasks(cpu);
6046 raw_spin_unlock_irq(&rq->lock); 6048 raw_spin_unlock_irq(&rq->lock);
6047 cpuset_unlock();
6048 migrate_nr_uninterruptible(rq); 6049 migrate_nr_uninterruptible(rq);
6049 BUG_ON(rq->nr_running != 0); 6050 BUG_ON(rq->nr_running != 0);
6050 calc_global_load_remove(rq); 6051 calc_global_load_remove(rq);
6051 /*
6052 * No need to migrate the tasks: it was best-effort if
6053 * they didn't take sched_hotcpu_mutex. Just wake up
6054 * the requestors.
6055 */
6056 raw_spin_lock_irq(&rq->lock);
6057 while (!list_empty(&rq->migration_queue)) {
6058 struct migration_req *req;
6059
6060 req = list_entry(rq->migration_queue.next,
6061 struct migration_req, list);
6062 list_del_init(&req->list);
6063 raw_spin_unlock_irq(&rq->lock);
6064 complete(&req->done);
6065 raw_spin_lock_irq(&rq->lock);
6066 }
6067 raw_spin_unlock_irq(&rq->lock);
6068 break; 6052 break;
6069 6053
6070 case CPU_DYING: 6054 case CPU_DYING:
6071 case CPU_DYING_FROZEN: 6055 case CPU_DYING_FROZEN:
6072 /* Update our root-domain */ 6056 /* Update our root-domain */
6073 rq = cpu_rq(cpu);
6074 raw_spin_lock_irqsave(&rq->lock, flags); 6057 raw_spin_lock_irqsave(&rq->lock, flags);
6075 if (rq->rd) { 6058 if (rq->rd) {
6076 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6059 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -6090,20 +6073,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6090 */ 6073 */
6091static struct notifier_block __cpuinitdata migration_notifier = { 6074static struct notifier_block __cpuinitdata migration_notifier = {
6092 .notifier_call = migration_call, 6075 .notifier_call = migration_call,
6093 .priority = 10 6076 .priority = CPU_PRI_MIGRATION,
6094}; 6077};
6095 6078
6079static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
6080 unsigned long action, void *hcpu)
6081{
6082 switch (action & ~CPU_TASKS_FROZEN) {
6083 case CPU_ONLINE:
6084 case CPU_DOWN_FAILED:
6085 set_cpu_active((long)hcpu, true);
6086 return NOTIFY_OK;
6087 default:
6088 return NOTIFY_DONE;
6089 }
6090}
6091
6092static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
6093 unsigned long action, void *hcpu)
6094{
6095 switch (action & ~CPU_TASKS_FROZEN) {
6096 case CPU_DOWN_PREPARE:
6097 set_cpu_active((long)hcpu, false);
6098 return NOTIFY_OK;
6099 default:
6100 return NOTIFY_DONE;
6101 }
6102}
6103
6096static int __init migration_init(void) 6104static int __init migration_init(void)
6097{ 6105{
6098 void *cpu = (void *)(long)smp_processor_id(); 6106 void *cpu = (void *)(long)smp_processor_id();
6099 int err; 6107 int err;
6100 6108
6101 /* Start one for the boot CPU: */ 6109 /* Initialize migration for the boot CPU */
6102 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 6110 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
6103 BUG_ON(err == NOTIFY_BAD); 6111 BUG_ON(err == NOTIFY_BAD);
6104 migration_call(&migration_notifier, CPU_ONLINE, cpu); 6112 migration_call(&migration_notifier, CPU_ONLINE, cpu);
6105 register_cpu_notifier(&migration_notifier); 6113 register_cpu_notifier(&migration_notifier);
6106 6114
6115 /* Register cpu active notifiers */
6116 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
6117 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
6118
6107 return 0; 6119 return 0;
6108} 6120}
6109early_initcall(migration_init); 6121early_initcall(migration_init);
@@ -6338,23 +6350,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6338 free_rootdomain(old_rd); 6350 free_rootdomain(old_rd);
6339} 6351}
6340 6352
6341static int init_rootdomain(struct root_domain *rd, bool bootmem) 6353static int init_rootdomain(struct root_domain *rd)
6342{ 6354{
6343 gfp_t gfp = GFP_KERNEL;
6344
6345 memset(rd, 0, sizeof(*rd)); 6355 memset(rd, 0, sizeof(*rd));
6346 6356
6347 if (bootmem) 6357 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
6348 gfp = GFP_NOWAIT;
6349
6350 if (!alloc_cpumask_var(&rd->span, gfp))
6351 goto out; 6358 goto out;
6352 if (!alloc_cpumask_var(&rd->online, gfp)) 6359 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
6353 goto free_span; 6360 goto free_span;
6354 if (!alloc_cpumask_var(&rd->rto_mask, gfp)) 6361 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
6355 goto free_online; 6362 goto free_online;
6356 6363
6357 if (cpupri_init(&rd->cpupri, bootmem) != 0) 6364 if (cpupri_init(&rd->cpupri) != 0)
6358 goto free_rto_mask; 6365 goto free_rto_mask;
6359 return 0; 6366 return 0;
6360 6367
@@ -6370,7 +6377,7 @@ out:
6370 6377
6371static void init_defrootdomain(void) 6378static void init_defrootdomain(void)
6372{ 6379{
6373 init_rootdomain(&def_root_domain, true); 6380 init_rootdomain(&def_root_domain);
6374 6381
6375 atomic_set(&def_root_domain.refcount, 1); 6382 atomic_set(&def_root_domain.refcount, 1);
6376} 6383}
@@ -6383,7 +6390,7 @@ static struct root_domain *alloc_rootdomain(void)
6383 if (!rd) 6390 if (!rd)
6384 return NULL; 6391 return NULL;
6385 6392
6386 if (init_rootdomain(rd, false) != 0) { 6393 if (init_rootdomain(rd) != 0) {
6387 kfree(rd); 6394 kfree(rd);
6388 return NULL; 6395 return NULL;
6389 } 6396 }
@@ -6401,6 +6408,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6401 struct rq *rq = cpu_rq(cpu); 6408 struct rq *rq = cpu_rq(cpu);
6402 struct sched_domain *tmp; 6409 struct sched_domain *tmp;
6403 6410
6411 for (tmp = sd; tmp; tmp = tmp->parent)
6412 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6413
6404 /* Remove the sched domains which do not contribute to scheduling. */ 6414 /* Remove the sched domains which do not contribute to scheduling. */
6405 for (tmp = sd; tmp; ) { 6415 for (tmp = sd; tmp; ) {
6406 struct sched_domain *parent = tmp->parent; 6416 struct sched_domain *parent = tmp->parent;
@@ -7559,29 +7569,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7559} 7569}
7560#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 7570#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7561 7571
7562#ifndef CONFIG_CPUSETS
7563/* 7572/*
7564 * Add online and remove offline CPUs from the scheduler domains. 7573 * Update cpusets according to cpu_active mask. If cpusets are
7565 * When cpusets are enabled they take over this function. 7574 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7575 * around partition_sched_domains().
7566 */ 7576 */
7567static int update_sched_domains(struct notifier_block *nfb, 7577static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7568 unsigned long action, void *hcpu) 7578 void *hcpu)
7569{ 7579{
7570 switch (action) { 7580 switch (action & ~CPU_TASKS_FROZEN) {
7571 case CPU_ONLINE: 7581 case CPU_ONLINE:
7572 case CPU_ONLINE_FROZEN:
7573 case CPU_DOWN_PREPARE:
7574 case CPU_DOWN_PREPARE_FROZEN:
7575 case CPU_DOWN_FAILED: 7582 case CPU_DOWN_FAILED:
7576 case CPU_DOWN_FAILED_FROZEN: 7583 cpuset_update_active_cpus();
7577 partition_sched_domains(1, NULL, NULL);
7578 return NOTIFY_OK; 7584 return NOTIFY_OK;
7585 default:
7586 return NOTIFY_DONE;
7587 }
7588}
7579 7589
7590static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7591 void *hcpu)
7592{
7593 switch (action & ~CPU_TASKS_FROZEN) {
7594 case CPU_DOWN_PREPARE:
7595 cpuset_update_active_cpus();
7596 return NOTIFY_OK;
7580 default: 7597 default:
7581 return NOTIFY_DONE; 7598 return NOTIFY_DONE;
7582 } 7599 }
7583} 7600}
7584#endif
7585 7601
7586static int update_runtime(struct notifier_block *nfb, 7602static int update_runtime(struct notifier_block *nfb,
7587 unsigned long action, void *hcpu) 7603 unsigned long action, void *hcpu)
@@ -7627,10 +7643,8 @@ void __init sched_init_smp(void)
7627 mutex_unlock(&sched_domains_mutex); 7643 mutex_unlock(&sched_domains_mutex);
7628 put_online_cpus(); 7644 put_online_cpus();
7629 7645
7630#ifndef CONFIG_CPUSETS 7646 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
7631 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7647 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
7632 hotcpu_notifier(update_sched_domains, 0);
7633#endif
7634 7648
7635 /* RT runtime code needs to handle some hotplug events */ 7649 /* RT runtime code needs to handle some hotplug events */
7636 hotcpu_notifier(update_runtime, 0); 7650 hotcpu_notifier(update_runtime, 0);
@@ -7875,20 +7889,26 @@ void __init sched_init(void)
7875 7889
7876 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7890 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7877 rq->cpu_load[j] = 0; 7891 rq->cpu_load[j] = 0;
7892
7893 rq->last_load_update_tick = jiffies;
7894
7878#ifdef CONFIG_SMP 7895#ifdef CONFIG_SMP
7879 rq->sd = NULL; 7896 rq->sd = NULL;
7880 rq->rd = NULL; 7897 rq->rd = NULL;
7898 rq->cpu_power = SCHED_LOAD_SCALE;
7881 rq->post_schedule = 0; 7899 rq->post_schedule = 0;
7882 rq->active_balance = 0; 7900 rq->active_balance = 0;
7883 rq->next_balance = jiffies; 7901 rq->next_balance = jiffies;
7884 rq->push_cpu = 0; 7902 rq->push_cpu = 0;
7885 rq->cpu = i; 7903 rq->cpu = i;
7886 rq->online = 0; 7904 rq->online = 0;
7887 rq->migration_thread = NULL;
7888 rq->idle_stamp = 0; 7905 rq->idle_stamp = 0;
7889 rq->avg_idle = 2*sysctl_sched_migration_cost; 7906 rq->avg_idle = 2*sysctl_sched_migration_cost;
7890 INIT_LIST_HEAD(&rq->migration_queue);
7891 rq_attach_root(rq, &def_root_domain); 7907 rq_attach_root(rq, &def_root_domain);
7908#ifdef CONFIG_NO_HZ
7909 rq->nohz_balance_kick = 0;
7910 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
7911#endif
7892#endif 7912#endif
7893 init_rq_hrtick(rq); 7913 init_rq_hrtick(rq);
7894 atomic_set(&rq->nr_iowait, 0); 7914 atomic_set(&rq->nr_iowait, 0);
@@ -7933,8 +7953,11 @@ void __init sched_init(void)
7933 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 7953 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
7934#ifdef CONFIG_SMP 7954#ifdef CONFIG_SMP
7935#ifdef CONFIG_NO_HZ 7955#ifdef CONFIG_NO_HZ
7936 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 7956 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
7937 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 7957 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
7958 atomic_set(&nohz.load_balancer, nr_cpu_ids);
7959 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
7960 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
7938#endif 7961#endif
7939 /* May be allocated at isolcpus cmdline parse time */ 7962 /* May be allocated at isolcpus cmdline parse time */
7940 if (cpu_isolated_map == NULL) 7963 if (cpu_isolated_map == NULL)
@@ -7988,7 +8011,6 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
7988{ 8011{
7989 int on_rq; 8012 int on_rq;
7990 8013
7991 update_rq_clock(rq);
7992 on_rq = p->se.on_rq; 8014 on_rq = p->se.on_rq;
7993 if (on_rq) 8015 if (on_rq)
7994 deactivate_task(rq, p, 0); 8016 deactivate_task(rq, p, 0);
@@ -8015,9 +8037,9 @@ void normalize_rt_tasks(void)
8015 8037
8016 p->se.exec_start = 0; 8038 p->se.exec_start = 0;
8017#ifdef CONFIG_SCHEDSTATS 8039#ifdef CONFIG_SCHEDSTATS
8018 p->se.wait_start = 0; 8040 p->se.statistics.wait_start = 0;
8019 p->se.sleep_start = 0; 8041 p->se.statistics.sleep_start = 0;
8020 p->se.block_start = 0; 8042 p->se.statistics.block_start = 0;
8021#endif 8043#endif
8022 8044
8023 if (!rt_task(p)) { 8045 if (!rt_task(p)) {
@@ -8044,9 +8066,9 @@ void normalize_rt_tasks(void)
8044 8066
8045#endif /* CONFIG_MAGIC_SYSRQ */ 8067#endif /* CONFIG_MAGIC_SYSRQ */
8046 8068
8047#ifdef CONFIG_IA64 8069#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
8048/* 8070/*
8049 * These functions are only useful for the IA64 MCA handling. 8071 * These functions are only useful for the IA64 MCA handling, or kdb.
8050 * 8072 *
8051 * They can only be called when the whole system has been 8073 * They can only be called when the whole system has been
8052 * stopped - every CPU needs to be quiescent, and no scheduling 8074 * stopped - every CPU needs to be quiescent, and no scheduling
@@ -8066,6 +8088,9 @@ struct task_struct *curr_task(int cpu)
8066 return cpu_curr(cpu); 8088 return cpu_curr(cpu);
8067} 8089}
8068 8090
8091#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
8092
8093#ifdef CONFIG_IA64
8069/** 8094/**
8070 * set_curr_task - set the current task for a given cpu. 8095 * set_curr_task - set the current task for a given cpu.
8071 * @cpu: the processor in question. 8096 * @cpu: the processor in question.
@@ -8350,8 +8375,6 @@ void sched_move_task(struct task_struct *tsk)
8350 8375
8351 rq = task_rq_lock(tsk, &flags); 8376 rq = task_rq_lock(tsk, &flags);
8352 8377
8353 update_rq_clock(rq);
8354
8355 running = task_current(rq, tsk); 8378 running = task_current(rq, tsk);
8356 on_rq = tsk->se.on_rq; 8379 on_rq = tsk->se.on_rq;
8357 8380
@@ -8370,7 +8393,7 @@ void sched_move_task(struct task_struct *tsk)
8370 if (unlikely(running)) 8393 if (unlikely(running))
8371 tsk->sched_class->set_curr_task(rq); 8394 tsk->sched_class->set_curr_task(rq);
8372 if (on_rq) 8395 if (on_rq)
8373 enqueue_task(rq, tsk, 0, false); 8396 enqueue_task(rq, tsk, 0);
8374 8397
8375 task_rq_unlock(rq, &flags); 8398 task_rq_unlock(rq, &flags);
8376} 8399}
@@ -9184,43 +9207,32 @@ struct cgroup_subsys cpuacct_subsys = {
9184 9207
9185#ifndef CONFIG_SMP 9208#ifndef CONFIG_SMP
9186 9209
9187int rcu_expedited_torture_stats(char *page)
9188{
9189 return 0;
9190}
9191EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9192
9193void synchronize_sched_expedited(void) 9210void synchronize_sched_expedited(void)
9194{ 9211{
9212 barrier();
9195} 9213}
9196EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 9214EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9197 9215
9198#else /* #ifndef CONFIG_SMP */ 9216#else /* #ifndef CONFIG_SMP */
9199 9217
9200static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); 9218static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9201static DEFINE_MUTEX(rcu_sched_expedited_mutex);
9202
9203#define RCU_EXPEDITED_STATE_POST -2
9204#define RCU_EXPEDITED_STATE_IDLE -1
9205 9219
9206static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 9220static int synchronize_sched_expedited_cpu_stop(void *data)
9207
9208int rcu_expedited_torture_stats(char *page)
9209{ 9221{
9210 int cnt = 0; 9222 /*
9211 int cpu; 9223 * There must be a full memory barrier on each affected CPU
9212 9224 * between the time that try_stop_cpus() is called and the
9213 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); 9225 * time that it returns.
9214 for_each_online_cpu(cpu) { 9226 *
9215 cnt += sprintf(&page[cnt], " %d:%d", 9227 * In the current initial implementation of cpu_stop, the
9216 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); 9228 * above condition is already met when control reaches
9217 } 9229 * this point and the following smp_mb() is not strictly
9218 cnt += sprintf(&page[cnt], "\n"); 9230 * necessary. Do smp_mb() anyway for documentation and
9219 return cnt; 9231 * robustness against future implementation changes.
9232 */
9233 smp_mb(); /* See above comment block. */
9234 return 0;
9220} 9235}
9221EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9222
9223static long synchronize_sched_expedited_count;
9224 9236
9225/* 9237/*
9226 * Wait for an rcu-sched grace period to elapse, but use "big hammer" 9238 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
@@ -9234,18 +9246,14 @@ static long synchronize_sched_expedited_count;
9234 */ 9246 */
9235void synchronize_sched_expedited(void) 9247void synchronize_sched_expedited(void)
9236{ 9248{
9237 int cpu; 9249 int snap, trycount = 0;
9238 unsigned long flags;
9239 bool need_full_sync = 0;
9240 struct rq *rq;
9241 struct migration_req *req;
9242 long snap;
9243 int trycount = 0;
9244 9250
9245 smp_mb(); /* ensure prior mod happens before capturing snap. */ 9251 smp_mb(); /* ensure prior mod happens before capturing snap. */
9246 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; 9252 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9247 get_online_cpus(); 9253 get_online_cpus();
9248 while (!mutex_trylock(&rcu_sched_expedited_mutex)) { 9254 while (try_stop_cpus(cpu_online_mask,
9255 synchronize_sched_expedited_cpu_stop,
9256 NULL) == -EAGAIN) {
9249 put_online_cpus(); 9257 put_online_cpus();
9250 if (trycount++ < 10) 9258 if (trycount++ < 10)
9251 udelay(trycount * num_online_cpus()); 9259 udelay(trycount * num_online_cpus());
@@ -9253,41 +9261,15 @@ void synchronize_sched_expedited(void)
9253 synchronize_sched(); 9261 synchronize_sched();
9254 return; 9262 return;
9255 } 9263 }
9256 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { 9264 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9257 smp_mb(); /* ensure test happens before caller kfree */ 9265 smp_mb(); /* ensure test happens before caller kfree */
9258 return; 9266 return;
9259 } 9267 }
9260 get_online_cpus(); 9268 get_online_cpus();
9261 } 9269 }
9262 rcu_expedited_state = RCU_EXPEDITED_STATE_POST; 9270 atomic_inc(&synchronize_sched_expedited_count);
9263 for_each_online_cpu(cpu) { 9271 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9264 rq = cpu_rq(cpu);
9265 req = &per_cpu(rcu_migration_req, cpu);
9266 init_completion(&req->done);
9267 req->task = NULL;
9268 req->dest_cpu = RCU_MIGRATION_NEED_QS;
9269 raw_spin_lock_irqsave(&rq->lock, flags);
9270 list_add(&req->list, &rq->migration_queue);
9271 raw_spin_unlock_irqrestore(&rq->lock, flags);
9272 wake_up_process(rq->migration_thread);
9273 }
9274 for_each_online_cpu(cpu) {
9275 rcu_expedited_state = cpu;
9276 req = &per_cpu(rcu_migration_req, cpu);
9277 rq = cpu_rq(cpu);
9278 wait_for_completion(&req->done);
9279 raw_spin_lock_irqsave(&rq->lock, flags);
9280 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
9281 need_full_sync = 1;
9282 req->dest_cpu = RCU_MIGRATION_IDLE;
9283 raw_spin_unlock_irqrestore(&rq->lock, flags);
9284 }
9285 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
9286 synchronize_sched_expedited_count++;
9287 mutex_unlock(&rcu_sched_expedited_mutex);
9288 put_online_cpus(); 9272 put_online_cpus();
9289 if (need_full_sync)
9290 synchronize_sched();
9291} 9273}
9292EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 9274EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9293 9275
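
A hedged usage sketch for synchronize_sched_expedited(): the foo structure, list and lock are hypothetical, introduced only for illustration; the expedited grace-period call itself is the interface reimplemented above:

	struct foo {
		struct list_head list;
		/* ... payload ... */
	};

	/* Hypothetical updater: unlink an element from an RCU-protected list and
	 * wait for an expedited sched-RCU grace period before freeing it, so all
	 * pre-existing rcu_read_lock_sched() readers are guaranteed to be done. */
	static void example_remove(struct foo *f, spinlock_t *lock)
	{
		spin_lock(lock);
		list_del_rcu(&f->list);
		spin_unlock(lock);

		synchronize_sched_expedited();
		kfree(f);
	}
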