Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  1326
1 file changed, 647 insertions(+), 679 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 3c2a54f70ffe..41541d79e3c8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,9 +55,9 @@
55#include <linux/cpu.h> 55#include <linux/cpu.h>
56#include <linux/cpuset.h> 56#include <linux/cpuset.h>
57#include <linux/percpu.h> 57#include <linux/percpu.h>
58#include <linux/kthread.h>
59#include <linux/proc_fs.h> 58#include <linux/proc_fs.h>
60#include <linux/seq_file.h> 59#include <linux/seq_file.h>
60#include <linux/stop_machine.h>
61#include <linux/sysctl.h> 61#include <linux/sysctl.h>
62#include <linux/syscalls.h> 62#include <linux/syscalls.h>
63#include <linux/times.h> 63#include <linux/times.h>
@@ -77,6 +77,7 @@
77#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
78 78
79#include "sched_cpupri.h" 79#include "sched_cpupri.h"
80#include "workqueue_sched.h"
80 81
81#define CREATE_TRACE_POINTS 82#define CREATE_TRACE_POINTS
82#include <trace/events/sched.h> 83#include <trace/events/sched.h>
@@ -306,52 +307,6 @@ static int init_task_group_load = INIT_TASK_GROUP_LOAD;
306 */ 307 */
307struct task_group init_task_group; 308struct task_group init_task_group;
308 309
309/* return group to which a task belongs */
310static inline struct task_group *task_group(struct task_struct *p)
311{
312 struct task_group *tg;
313
314#ifdef CONFIG_CGROUP_SCHED
315 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
316 struct task_group, css);
317#else
318 tg = &init_task_group;
319#endif
320 return tg;
321}
322
323/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
324static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
325{
326 /*
327 * Strictly speaking this rcu_read_lock() is not needed since the
328 * task_group is tied to the cgroup, which in turn can never go away
329 * as long as there are tasks attached to it.
330 *
331 * However since task_group() uses task_subsys_state() which is an
332 * rcu_dereference() user, this quiets CONFIG_PROVE_RCU.
333 */
334 rcu_read_lock();
335#ifdef CONFIG_FAIR_GROUP_SCHED
336 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
337 p->se.parent = task_group(p)->se[cpu];
338#endif
339
340#ifdef CONFIG_RT_GROUP_SCHED
341 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
342 p->rt.parent = task_group(p)->rt_se[cpu];
343#endif
344 rcu_read_unlock();
345}
346
347#else
348
349static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
350static inline struct task_group *task_group(struct task_struct *p)
351{
352 return NULL;
353}
354
355#endif /* CONFIG_CGROUP_SCHED */ 310#endif /* CONFIG_CGROUP_SCHED */
356 311
357/* CFS-related fields in a runqueue */ 312/* CFS-related fields in a runqueue */
@@ -502,9 +457,13 @@ struct rq {
502 unsigned long nr_running; 457 unsigned long nr_running;
503 #define CPU_LOAD_IDX_MAX 5 458 #define CPU_LOAD_IDX_MAX 5
504 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 459 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
460 unsigned long last_load_update_tick;
505#ifdef CONFIG_NO_HZ 461#ifdef CONFIG_NO_HZ
506 unsigned char in_nohz_recently; 462 u64 nohz_stamp;
463 unsigned char nohz_balance_kick;
507#endif 464#endif
465 unsigned int skip_clock_update;
466
508 /* capture load from *all* tasks on this cpu: */ 467 /* capture load from *all* tasks on this cpu: */
509 struct load_weight load; 468 struct load_weight load;
510 unsigned long nr_load_updates; 469 unsigned long nr_load_updates;
@@ -541,20 +500,20 @@ struct rq {
541 struct root_domain *rd; 500 struct root_domain *rd;
542 struct sched_domain *sd; 501 struct sched_domain *sd;
543 502
503 unsigned long cpu_power;
504
544 unsigned char idle_at_tick; 505 unsigned char idle_at_tick;
545 /* For active balancing */ 506 /* For active balancing */
546 int post_schedule; 507 int post_schedule;
547 int active_balance; 508 int active_balance;
548 int push_cpu; 509 int push_cpu;
510 struct cpu_stop_work active_balance_work;
549 /* cpu of this runqueue: */ 511 /* cpu of this runqueue: */
550 int cpu; 512 int cpu;
551 int online; 513 int online;
552 514
553 unsigned long avg_load_per_task; 515 unsigned long avg_load_per_task;
554 516
555 struct task_struct *migration_thread;
556 struct list_head migration_queue;
557
558 u64 rt_avg; 517 u64 rt_avg;
559 u64 age_stamp; 518 u64 age_stamp;
560 u64 idle_stamp; 519 u64 idle_stamp;
@@ -602,6 +561,13 @@ static inline
602void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 561void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
603{ 562{
604 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 563 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
564
565 /*
566 * A queue event has occurred, and we're going to schedule. In
567 * this case, we can save a useless back to back clock update.
568 */
569 if (test_tsk_need_resched(p))
570 rq->skip_clock_update = 1;
605} 571}
606 572
607static inline int cpu_of(struct rq *rq) 573static inline int cpu_of(struct rq *rq)
@@ -634,9 +600,53 @@ static inline int cpu_of(struct rq *rq)
634#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 600#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
635#define raw_rq() (&__raw_get_cpu_var(runqueues)) 601#define raw_rq() (&__raw_get_cpu_var(runqueues))
636 602
603#ifdef CONFIG_CGROUP_SCHED
604
605/*
606 * Return the group to which this task belongs.
607 *
608 * We use task_subsys_state_check() and extend the RCU verification
609 * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
610 * holds that lock for each task it moves into the cgroup. Therefore
611 * by holding that lock, we pin the task to the current cgroup.
612 */
613static inline struct task_group *task_group(struct task_struct *p)
614{
615 struct cgroup_subsys_state *css;
616
617 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
618 lockdep_is_held(&task_rq(p)->lock));
619 return container_of(css, struct task_group, css);
620}
621
622/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
623static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
624{
625#ifdef CONFIG_FAIR_GROUP_SCHED
626 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
627 p->se.parent = task_group(p)->se[cpu];
628#endif
629
630#ifdef CONFIG_RT_GROUP_SCHED
631 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
632 p->rt.parent = task_group(p)->rt_se[cpu];
633#endif
634}
635
636#else /* CONFIG_CGROUP_SCHED */
637
638static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
639static inline struct task_group *task_group(struct task_struct *p)
640{
641 return NULL;
642}
643
644#endif /* CONFIG_CGROUP_SCHED */
645
637inline void update_rq_clock(struct rq *rq) 646inline void update_rq_clock(struct rq *rq)
638{ 647{
639 rq->clock = sched_clock_cpu(cpu_of(rq)); 648 if (!rq->skip_clock_update)
649 rq->clock = sched_clock_cpu(cpu_of(rq));
640} 650}
641 651
642/* 652/*
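A stand-alone illustration of the skip_clock_update handshake introduced by the two hunks above (invented toy_* names, plain user-space C, not part of the patch): check_preempt_curr() flags the clock as fresh enough when a reschedule is already pending, update_rq_clock() then skips one redundant clock read, and the flag is cleared again when the next task is picked (see the put_prev_task() hunk further down in this diff).

    #include <stdio.h>

    struct toy_rq {
            unsigned long long clock;       /* last sampled time */
            unsigned int skip_clock_update;
    };

    /* stand-in for sched_clock_cpu(): time advances on every read */
    static unsigned long long toy_clock(void)
    {
            static unsigned long long t;
            return t += 1000;
    }

    static void toy_update_rq_clock(struct toy_rq *rq)
    {
            if (!rq->skip_clock_update)
                    rq->clock = toy_clock();
    }

    /*
     * A queue event just updated the clock and a reschedule is already
     * pending, so the update done on the way into schedule() would be
     * back to back with the one we just did: mark it skippable.
     */
    static void toy_check_preempt(struct toy_rq *rq, int need_resched)
    {
            if (need_resched)
                    rq->skip_clock_update = 1;
    }

    static void toy_schedule(struct toy_rq *rq)
    {
            toy_update_rq_clock(rq);        /* possibly elided */
            rq->skip_clock_update = 0;      /* one-shot flag */
    }

    int main(void)
    {
            struct toy_rq rq = { 0, 0 };

            toy_update_rq_clock(&rq);       /* enqueue path samples the clock */
            toy_check_preempt(&rq, 1);
            toy_schedule(&rq);              /* second read is skipped */
            printf("clock=%llu\n", rq.clock);       /* prints 1000, not 2000 */
            return 0;
    }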
@@ -914,16 +924,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
914#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 924#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
915 925
916/* 926/*
917 * Check whether the task is waking, we use this to synchronize against 927 * Check whether the task is waking, we use this to synchronize ->cpus_allowed
918 * ttwu() so that task_cpu() reports a stable number. 928 * against ttwu().
919 *
920 * We need to make an exception for PF_STARTING tasks because the fork
921 * path might require task_rq_lock() to work, eg. it can call
922 * set_cpus_allowed_ptr() from the cpuset clone_ns code.
923 */ 929 */
924static inline int task_is_waking(struct task_struct *p) 930static inline int task_is_waking(struct task_struct *p)
925{ 931{
926 return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING)); 932 return unlikely(p->state == TASK_WAKING);
927} 933}
928 934
929/* 935/*
@@ -936,11 +942,9 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
936 struct rq *rq; 942 struct rq *rq;
937 943
938 for (;;) { 944 for (;;) {
939 while (task_is_waking(p))
940 cpu_relax();
941 rq = task_rq(p); 945 rq = task_rq(p);
942 raw_spin_lock(&rq->lock); 946 raw_spin_lock(&rq->lock);
943 if (likely(rq == task_rq(p) && !task_is_waking(p))) 947 if (likely(rq == task_rq(p)))
944 return rq; 948 return rq;
945 raw_spin_unlock(&rq->lock); 949 raw_spin_unlock(&rq->lock);
946 } 950 }
@@ -957,25 +961,15 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
957 struct rq *rq; 961 struct rq *rq;
958 962
959 for (;;) { 963 for (;;) {
960 while (task_is_waking(p))
961 cpu_relax();
962 local_irq_save(*flags); 964 local_irq_save(*flags);
963 rq = task_rq(p); 965 rq = task_rq(p);
964 raw_spin_lock(&rq->lock); 966 raw_spin_lock(&rq->lock);
965 if (likely(rq == task_rq(p) && !task_is_waking(p))) 967 if (likely(rq == task_rq(p)))
966 return rq; 968 return rq;
967 raw_spin_unlock_irqrestore(&rq->lock, *flags); 969 raw_spin_unlock_irqrestore(&rq->lock, *flags);
968 } 970 }
969} 971}
970 972
971void task_rq_unlock_wait(struct task_struct *p)
972{
973 struct rq *rq = task_rq(p);
974
975 smp_mb(); /* spin-unlock-wait is not a full memory barrier */
976 raw_spin_unlock_wait(&rq->lock);
977}
978
979static void __task_rq_unlock(struct rq *rq) 973static void __task_rq_unlock(struct rq *rq)
980 __releases(rq->lock) 974 __releases(rq->lock)
981{ 975{
@@ -1201,6 +1195,27 @@ static void resched_cpu(int cpu)
1201 1195
1202#ifdef CONFIG_NO_HZ 1196#ifdef CONFIG_NO_HZ
1203/* 1197/*
1198 * In the semi idle case, use the nearest busy cpu for migrating timers
1199 * from an idle cpu. This is good for power-savings.
1200 *
1201 * We don't do a similar optimization for a completely idle system, as
1202 * selecting an idle cpu will add more delays to the timers than intended
1203 * (as that cpu's timer base may not be uptodate wrt jiffies etc).
1204 */
1205int get_nohz_timer_target(void)
1206{
1207 int cpu = smp_processor_id();
1208 int i;
1209 struct sched_domain *sd;
1210
1211 for_each_domain(cpu, sd) {
1212 for_each_cpu(i, sched_domain_span(sd))
1213 if (!idle_cpu(i))
1214 return i;
1215 }
1216 return cpu;
1217}
1218/*
1204 * When add_timer_on() enqueues a timer into the timer wheel of an 1219 * When add_timer_on() enqueues a timer into the timer wheel of an
1205 * idle CPU then this timer might expire before the next timer event 1220 * idle CPU then this timer might expire before the next timer event
1206 * which is scheduled to wake up that CPU. In case of a completely 1221 * which is scheduled to wake up that CPU. In case of a completely
@@ -1239,6 +1254,7 @@ void wake_up_idle_cpu(int cpu)
1239 if (!tsk_is_polling(rq->idle)) 1254 if (!tsk_is_polling(rq->idle))
1240 smp_send_reschedule(cpu); 1255 smp_send_reschedule(cpu);
1241} 1256}
1257
1242#endif /* CONFIG_NO_HZ */ 1258#endif /* CONFIG_NO_HZ */
1243 1259
1244static u64 sched_avg_period(void) 1260static u64 sched_avg_period(void)
@@ -1251,6 +1267,12 @@ static void sched_avg_update(struct rq *rq)
1251 s64 period = sched_avg_period(); 1267 s64 period = sched_avg_period();
1252 1268
1253 while ((s64)(rq->clock - rq->age_stamp) > period) { 1269 while ((s64)(rq->clock - rq->age_stamp) > period) {
1270 /*
1271 * Inline assembly required to prevent the compiler
1272 * optimising this loop into a divmod call.
1273 * See __iter_div_u64_rem() for another example of this.
1274 */
1275 asm("" : "+rm" (rq->age_stamp));
1254 rq->age_stamp += period; 1276 rq->age_stamp += period;
1255 rq->rt_avg /= 2; 1277 rq->rt_avg /= 2;
1256 } 1278 }
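The empty asm() added above is easy to miss; here is the same optimisation-barrier pattern in isolation (a hedged stand-alone sketch, not from the patch). Declaring the variable as a read/write operand of an opaque asm statement ("+rm") keeps GCC from strength-reducing the subtract-in-a-loop into a division, which would be a pessimisation here because the loop is expected to iterate at most once or twice.

    #include <stdint.h>
    #include <stdio.h>

    /* count elapsed whole periods by repeated subtraction, kernel-style */
    static uint64_t catch_up(uint64_t now, uint64_t *stamp, uint64_t period)
    {
            uint64_t periods = 0;

            while ((int64_t)(now - *stamp) > (int64_t)period) {
                    /* opaque to the optimiser: the loop stays a loop */
                    asm("" : "+rm" (*stamp));
                    *stamp += period;
                    periods++;
            }
            return periods;
    }

    int main(void)
    {
            uint64_t stamp = 0;

            /* prints 3: 3500 is three whole periods past the stamp */
            printf("%llu\n", (unsigned long long)catch_up(3500, &stamp, 1000));
            return 0;
    }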
@@ -1495,24 +1517,9 @@ static unsigned long target_load(int cpu, int type)
1495 return max(rq->cpu_load[type-1], total); 1517 return max(rq->cpu_load[type-1], total);
1496} 1518}
1497 1519
1498static struct sched_group *group_of(int cpu)
1499{
1500 struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
1501
1502 if (!sd)
1503 return NULL;
1504
1505 return sd->groups;
1506}
1507
1508static unsigned long power_of(int cpu) 1520static unsigned long power_of(int cpu)
1509{ 1521{
1510 struct sched_group *group = group_of(cpu); 1522 return cpu_rq(cpu)->cpu_power;
1511
1512 if (!group)
1513 return SCHED_LOAD_SCALE;
1514
1515 return group->cpu_power;
1516} 1523}
1517 1524
1518static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1525static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
@@ -1658,7 +1665,7 @@ static void update_shares(struct sched_domain *sd)
1658 if (root_task_group_empty()) 1665 if (root_task_group_empty())
1659 return; 1666 return;
1660 1667
1661 now = cpu_clock(raw_smp_processor_id()); 1668 now = local_clock();
1662 elapsed = now - sd->last_update; 1669 elapsed = now - sd->last_update;
1663 1670
1664 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1671 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
@@ -1669,9 +1676,6 @@ static void update_shares(struct sched_domain *sd)
1669 1676
1670static void update_h_load(long cpu) 1677static void update_h_load(long cpu)
1671{ 1678{
1672 if (root_task_group_empty())
1673 return;
1674
1675 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1679 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1676} 1680}
1677 1681
@@ -1781,8 +1785,6 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1781 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 1785 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1782 } 1786 }
1783 } 1787 }
1784 update_rq_clock(rq1);
1785 update_rq_clock(rq2);
1786} 1788}
1787 1789
1788/* 1790/*
@@ -1813,9 +1815,10 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1813} 1815}
1814#endif 1816#endif
1815 1817
1816static void calc_load_account_active(struct rq *this_rq); 1818static void calc_load_account_idle(struct rq *this_rq);
1817static void update_sysctl(void); 1819static void update_sysctl(void);
1818static int get_update_sysctl_factor(void); 1820static int get_update_sysctl_factor(void);
1821static void update_cpu_load(struct rq *this_rq);
1819 1822
1820static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1823static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1821{ 1824{
@@ -1852,8 +1855,8 @@ static void dec_nr_running(struct rq *rq)
1852static void set_load_weight(struct task_struct *p) 1855static void set_load_weight(struct task_struct *p)
1853{ 1856{
1854 if (task_has_rt_policy(p)) { 1857 if (task_has_rt_policy(p)) {
1855 p->se.load.weight = prio_to_weight[0] * 2; 1858 p->se.load.weight = 0;
1856 p->se.load.inv_weight = prio_to_wmult[0] >> 1; 1859 p->se.load.inv_weight = WMULT_CONST;
1857 return; 1860 return;
1858 } 1861 }
1859 1862
@@ -1870,62 +1873,43 @@ static void set_load_weight(struct task_struct *p)
1870 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1873 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1871} 1874}
1872 1875
1873static void update_avg(u64 *avg, u64 sample) 1876static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1874{
1875 s64 diff = sample - *avg;
1876 *avg += diff >> 3;
1877}
1878
1879static void
1880enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1881{ 1877{
1882 if (wakeup) 1878 update_rq_clock(rq);
1883 p->se.start_runtime = p->se.sum_exec_runtime;
1884
1885 sched_info_queued(p); 1879 sched_info_queued(p);
1886 p->sched_class->enqueue_task(rq, p, wakeup, head); 1880 p->sched_class->enqueue_task(rq, p, flags);
1887 p->se.on_rq = 1; 1881 p->se.on_rq = 1;
1888} 1882}
1889 1883
1890static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1884static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1891{ 1885{
1892 if (sleep) { 1886 update_rq_clock(rq);
1893 if (p->se.last_wakeup) {
1894 update_avg(&p->se.avg_overlap,
1895 p->se.sum_exec_runtime - p->se.last_wakeup);
1896 p->se.last_wakeup = 0;
1897 } else {
1898 update_avg(&p->se.avg_wakeup,
1899 sysctl_sched_wakeup_granularity);
1900 }
1901 }
1902
1903 sched_info_dequeued(p); 1887 sched_info_dequeued(p);
1904 p->sched_class->dequeue_task(rq, p, sleep); 1888 p->sched_class->dequeue_task(rq, p, flags);
1905 p->se.on_rq = 0; 1889 p->se.on_rq = 0;
1906} 1890}
1907 1891
1908/* 1892/*
1909 * activate_task - move a task to the runqueue. 1893 * activate_task - move a task to the runqueue.
1910 */ 1894 */
1911static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) 1895static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1912{ 1896{
1913 if (task_contributes_to_load(p)) 1897 if (task_contributes_to_load(p))
1914 rq->nr_uninterruptible--; 1898 rq->nr_uninterruptible--;
1915 1899
1916 enqueue_task(rq, p, wakeup, false); 1900 enqueue_task(rq, p, flags);
1917 inc_nr_running(rq); 1901 inc_nr_running(rq);
1918} 1902}
1919 1903
1920/* 1904/*
1921 * deactivate_task - remove a task from the runqueue. 1905 * deactivate_task - remove a task from the runqueue.
1922 */ 1906 */
1923static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) 1907static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1924{ 1908{
1925 if (task_contributes_to_load(p)) 1909 if (task_contributes_to_load(p))
1926 rq->nr_uninterruptible++; 1910 rq->nr_uninterruptible++;
1927 1911
1928 dequeue_task(rq, p, sleep); 1912 dequeue_task(rq, p, flags);
1929 dec_nr_running(rq); 1913 dec_nr_running(rq);
1930} 1914}
1931 1915
@@ -2054,21 +2038,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2054 __set_task_cpu(p, new_cpu); 2038 __set_task_cpu(p, new_cpu);
2055} 2039}
2056 2040
2057struct migration_req { 2041struct migration_arg {
2058 struct list_head list;
2059
2060 struct task_struct *task; 2042 struct task_struct *task;
2061 int dest_cpu; 2043 int dest_cpu;
2062
2063 struct completion done;
2064}; 2044};
2065 2045
2046static int migration_cpu_stop(void *data);
2047
2066/* 2048/*
2067 * The task's runqueue lock must be held. 2049 * The task's runqueue lock must be held.
2068 * Returns true if you have to wait for migration thread. 2050 * Returns true if you have to wait for migration thread.
2069 */ 2051 */
2070static int 2052static bool migrate_task(struct task_struct *p, int dest_cpu)
2071migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2072{ 2053{
2073 struct rq *rq = task_rq(p); 2054 struct rq *rq = task_rq(p);
2074 2055
@@ -2076,58 +2057,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2076 * If the task is not on a runqueue (and not running), then 2057 * If the task is not on a runqueue (and not running), then
2077 * the next wake-up will properly place the task. 2058 * the next wake-up will properly place the task.
2078 */ 2059 */
2079 if (!p->se.on_rq && !task_running(rq, p)) 2060 return p->se.on_rq || task_running(rq, p);
2080 return 0;
2081
2082 init_completion(&req->done);
2083 req->task = p;
2084 req->dest_cpu = dest_cpu;
2085 list_add(&req->list, &rq->migration_queue);
2086
2087 return 1;
2088}
2089
2090/*
2091 * wait_task_context_switch - wait for a thread to complete at least one
2092 * context switch.
2093 *
2094 * @p must not be current.
2095 */
2096void wait_task_context_switch(struct task_struct *p)
2097{
2098 unsigned long nvcsw, nivcsw, flags;
2099 int running;
2100 struct rq *rq;
2101
2102 nvcsw = p->nvcsw;
2103 nivcsw = p->nivcsw;
2104 for (;;) {
2105 /*
2106 * The runqueue is assigned before the actual context
2107 * switch. We need to take the runqueue lock.
2108 *
2109 * We could check initially without the lock but it is
2110 * very likely that we need to take the lock in every
2111 * iteration.
2112 */
2113 rq = task_rq_lock(p, &flags);
2114 running = task_running(rq, p);
2115 task_rq_unlock(rq, &flags);
2116
2117 if (likely(!running))
2118 break;
2119 /*
2120 * The switch count is incremented before the actual
2121 * context switch. We thus wait for two switches to be
2122 * sure at least one completed.
2123 */
2124 if ((p->nvcsw - nvcsw) > 1)
2125 break;
2126 if ((p->nivcsw - nivcsw) > 1)
2127 break;
2128
2129 cpu_relax();
2130 }
2131} 2061}
2132 2062
2133/* 2063/*
@@ -2185,7 +2115,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2185 * just go back and repeat. 2115 * just go back and repeat.
2186 */ 2116 */
2187 rq = task_rq_lock(p, &flags); 2117 rq = task_rq_lock(p, &flags);
2188 trace_sched_wait_task(rq, p); 2118 trace_sched_wait_task(p);
2189 running = task_running(rq, p); 2119 running = task_running(rq, p);
2190 on_rq = p->se.on_rq; 2120 on_rq = p->se.on_rq;
2191 ncsw = 0; 2121 ncsw = 0;
@@ -2283,6 +2213,9 @@ void task_oncpu_function_call(struct task_struct *p,
2283} 2213}
2284 2214
2285#ifdef CONFIG_SMP 2215#ifdef CONFIG_SMP
2216/*
2217 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
2218 */
2286static int select_fallback_rq(int cpu, struct task_struct *p) 2219static int select_fallback_rq(int cpu, struct task_struct *p)
2287{ 2220{
2288 int dest_cpu; 2221 int dest_cpu;
@@ -2299,12 +2232,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2299 return dest_cpu; 2232 return dest_cpu;
2300 2233
2301 /* No more Mr. Nice Guy. */ 2234 /* No more Mr. Nice Guy. */
2302 if (dest_cpu >= nr_cpu_ids) { 2235 if (unlikely(dest_cpu >= nr_cpu_ids)) {
2303 rcu_read_lock(); 2236 dest_cpu = cpuset_cpus_allowed_fallback(p);
2304 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
2305 rcu_read_unlock();
2306 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
2307
2308 /* 2237 /*
2309 * Don't tell them about moving exiting tasks or 2238 * Don't tell them about moving exiting tasks or
2310 * kernel threads (both mm NULL), since they never 2239 * kernel threads (both mm NULL), since they never
@@ -2321,17 +2250,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2321} 2250}
2322 2251
2323/* 2252/*
2324 * Gets called from 3 sites (exec, fork, wakeup), since it is called without 2253 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
2325 * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
2326 * by:
2327 *
2328 * exec: is unstable, retry loop
2329 * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
2330 */ 2254 */
2331static inline 2255static inline
2332int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 2256int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
2333{ 2257{
2334 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 2258 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
2335 2259
2336 /* 2260 /*
2337 * In order not to call set_task_cpu() on a blocking task we need 2261 * In order not to call set_task_cpu() on a blocking task we need
@@ -2349,13 +2273,63 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2349 2273
2350 return cpu; 2274 return cpu;
2351} 2275}
2276
2277static void update_avg(u64 *avg, u64 sample)
2278{
2279 s64 diff = sample - *avg;
2280 *avg += diff >> 3;
2281}
2352#endif 2282#endif
2353 2283
2354/*** 2284static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
2285 bool is_sync, bool is_migrate, bool is_local,
2286 unsigned long en_flags)
2287{
2288 schedstat_inc(p, se.statistics.nr_wakeups);
2289 if (is_sync)
2290 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2291 if (is_migrate)
2292 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2293 if (is_local)
2294 schedstat_inc(p, se.statistics.nr_wakeups_local);
2295 else
2296 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2297
2298 activate_task(rq, p, en_flags);
2299}
2300
2301static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2302 int wake_flags, bool success)
2303{
2304 trace_sched_wakeup(p, success);
2305 check_preempt_curr(rq, p, wake_flags);
2306
2307 p->state = TASK_RUNNING;
2308#ifdef CONFIG_SMP
2309 if (p->sched_class->task_woken)
2310 p->sched_class->task_woken(rq, p);
2311
2312 if (unlikely(rq->idle_stamp)) {
2313 u64 delta = rq->clock - rq->idle_stamp;
2314 u64 max = 2*sysctl_sched_migration_cost;
2315
2316 if (delta > max)
2317 rq->avg_idle = max;
2318 else
2319 update_avg(&rq->avg_idle, delta);
2320 rq->idle_stamp = 0;
2321 }
2322#endif
2323 /* if a worker is waking up, notify workqueue */
2324 if ((p->flags & PF_WQ_WORKER) && success)
2325 wq_worker_waking_up(p, cpu_of(rq));
2326}
2327
2328/**
2355 * try_to_wake_up - wake up a thread 2329 * try_to_wake_up - wake up a thread
2356 * @p: the to-be-woken-up thread 2330 * @p: the thread to be awakened
2357 * @state: the mask of task states that can be woken 2331 * @state: the mask of task states that can be woken
2358 * @sync: do a synchronous wakeup? 2332 * @wake_flags: wake modifier flags (WF_*)
2359 * 2333 *
2360 * Put it on the run-queue if it's not already there. The "current" 2334 * Put it on the run-queue if it's not already there. The "current"
2361 * thread is always on the run-queue (except when the actual 2335 * thread is always on the run-queue (except when the actual
@@ -2363,23 +2337,21 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2363 * the simpler "current->state = TASK_RUNNING" to mark yourself 2337 * the simpler "current->state = TASK_RUNNING" to mark yourself
2364 * runnable without the overhead of this. 2338 * runnable without the overhead of this.
2365 * 2339 *
2366 * returns failure only if the task is already active. 2340 * Returns %true if @p was woken up, %false if it was already running
2341 * or @state didn't match @p's state.
2367 */ 2342 */
2368static int try_to_wake_up(struct task_struct *p, unsigned int state, 2343static int try_to_wake_up(struct task_struct *p, unsigned int state,
2369 int wake_flags) 2344 int wake_flags)
2370{ 2345{
2371 int cpu, orig_cpu, this_cpu, success = 0; 2346 int cpu, orig_cpu, this_cpu, success = 0;
2372 unsigned long flags; 2347 unsigned long flags;
2348 unsigned long en_flags = ENQUEUE_WAKEUP;
2373 struct rq *rq; 2349 struct rq *rq;
2374 2350
2375 if (!sched_feat(SYNC_WAKEUPS))
2376 wake_flags &= ~WF_SYNC;
2377
2378 this_cpu = get_cpu(); 2351 this_cpu = get_cpu();
2379 2352
2380 smp_wmb(); 2353 smp_wmb();
2381 rq = task_rq_lock(p, &flags); 2354 rq = task_rq_lock(p, &flags);
2382 update_rq_clock(rq);
2383 if (!(p->state & state)) 2355 if (!(p->state & state))
2384 goto out; 2356 goto out;
2385 2357
@@ -2399,28 +2371,26 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2399 * 2371 *
2400 * First fix up the nr_uninterruptible count: 2372 * First fix up the nr_uninterruptible count:
2401 */ 2373 */
2402 if (task_contributes_to_load(p)) 2374 if (task_contributes_to_load(p)) {
2403 rq->nr_uninterruptible--; 2375 if (likely(cpu_online(orig_cpu)))
2376 rq->nr_uninterruptible--;
2377 else
2378 this_rq()->nr_uninterruptible--;
2379 }
2404 p->state = TASK_WAKING; 2380 p->state = TASK_WAKING;
2405 2381
2406 if (p->sched_class->task_waking) 2382 if (p->sched_class->task_waking) {
2407 p->sched_class->task_waking(rq, p); 2383 p->sched_class->task_waking(rq, p);
2384 en_flags |= ENQUEUE_WAKING;
2385 }
2408 2386
2409 __task_rq_unlock(rq); 2387 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
2410 2388 if (cpu != orig_cpu)
2411 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2412 if (cpu != orig_cpu) {
2413 /*
2414 * Since we migrate the task without holding any rq->lock,
2415 * we need to be careful with task_rq_lock(), since that
2416 * might end up locking an invalid rq.
2417 */
2418 set_task_cpu(p, cpu); 2389 set_task_cpu(p, cpu);
2419 } 2390 __task_rq_unlock(rq);
2420 2391
2421 rq = cpu_rq(cpu); 2392 rq = cpu_rq(cpu);
2422 raw_spin_lock(&rq->lock); 2393 raw_spin_lock(&rq->lock);
2423 update_rq_clock(rq);
2424 2394
2425 /* 2395 /*
2426 * We migrated the task without holding either rq->lock, however 2396 * We migrated the task without holding either rq->lock, however
@@ -2448,54 +2418,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2448 2418
2449out_activate: 2419out_activate:
2450#endif /* CONFIG_SMP */ 2420#endif /* CONFIG_SMP */
2451 schedstat_inc(p, se.nr_wakeups); 2421 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
2452 if (wake_flags & WF_SYNC) 2422 cpu == this_cpu, en_flags);
2453 schedstat_inc(p, se.nr_wakeups_sync);
2454 if (orig_cpu != cpu)
2455 schedstat_inc(p, se.nr_wakeups_migrate);
2456 if (cpu == this_cpu)
2457 schedstat_inc(p, se.nr_wakeups_local);
2458 else
2459 schedstat_inc(p, se.nr_wakeups_remote);
2460 activate_task(rq, p, 1);
2461 success = 1; 2423 success = 1;
2462
2463 /*
2464 * Only attribute actual wakeups done by this task.
2465 */
2466 if (!in_interrupt()) {
2467 struct sched_entity *se = &current->se;
2468 u64 sample = se->sum_exec_runtime;
2469
2470 if (se->last_wakeup)
2471 sample -= se->last_wakeup;
2472 else
2473 sample -= se->start_runtime;
2474 update_avg(&se->avg_wakeup, sample);
2475
2476 se->last_wakeup = se->sum_exec_runtime;
2477 }
2478
2479out_running: 2424out_running:
2480 trace_sched_wakeup(rq, p, success); 2425 ttwu_post_activation(p, rq, wake_flags, success);
2481 check_preempt_curr(rq, p, wake_flags);
2482
2483 p->state = TASK_RUNNING;
2484#ifdef CONFIG_SMP
2485 if (p->sched_class->task_woken)
2486 p->sched_class->task_woken(rq, p);
2487
2488 if (unlikely(rq->idle_stamp)) {
2489 u64 delta = rq->clock - rq->idle_stamp;
2490 u64 max = 2*sysctl_sched_migration_cost;
2491
2492 if (delta > max)
2493 rq->avg_idle = max;
2494 else
2495 update_avg(&rq->avg_idle, delta);
2496 rq->idle_stamp = 0;
2497 }
2498#endif
2499out: 2426out:
2500 task_rq_unlock(rq, &flags); 2427 task_rq_unlock(rq, &flags);
2501 put_cpu(); 2428 put_cpu();
@@ -2504,6 +2431,37 @@ out:
2504} 2431}
2505 2432
2506/** 2433/**
2434 * try_to_wake_up_local - try to wake up a local task with rq lock held
2435 * @p: the thread to be awakened
2436 *
2437 * Put @p on the run-queue if it's not already there. The caller must
2438 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2439 * the current task. this_rq() stays locked over invocation.
2440 */
2441static void try_to_wake_up_local(struct task_struct *p)
2442{
2443 struct rq *rq = task_rq(p);
2444 bool success = false;
2445
2446 BUG_ON(rq != this_rq());
2447 BUG_ON(p == current);
2448 lockdep_assert_held(&rq->lock);
2449
2450 if (!(p->state & TASK_NORMAL))
2451 return;
2452
2453 if (!p->se.on_rq) {
2454 if (likely(!task_running(rq, p))) {
2455 schedstat_inc(rq, ttwu_count);
2456 schedstat_inc(rq, ttwu_local);
2457 }
2458 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
2459 success = true;
2460 }
2461 ttwu_post_activation(p, rq, 0, success);
2462}
2463
2464/**
2507 * wake_up_process - Wake up a specific process 2465 * wake_up_process - Wake up a specific process
2508 * @p: The process to be woken up. 2466 * @p: The process to be woken up.
2509 * 2467 *
@@ -2537,42 +2495,9 @@ static void __sched_fork(struct task_struct *p)
2537 p->se.sum_exec_runtime = 0; 2495 p->se.sum_exec_runtime = 0;
2538 p->se.prev_sum_exec_runtime = 0; 2496 p->se.prev_sum_exec_runtime = 0;
2539 p->se.nr_migrations = 0; 2497 p->se.nr_migrations = 0;
2540 p->se.last_wakeup = 0;
2541 p->se.avg_overlap = 0;
2542 p->se.start_runtime = 0;
2543 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2544 2498
2545#ifdef CONFIG_SCHEDSTATS 2499#ifdef CONFIG_SCHEDSTATS
2546 p->se.wait_start = 0; 2500 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2547 p->se.wait_max = 0;
2548 p->se.wait_count = 0;
2549 p->se.wait_sum = 0;
2550
2551 p->se.sleep_start = 0;
2552 p->se.sleep_max = 0;
2553 p->se.sum_sleep_runtime = 0;
2554
2555 p->se.block_start = 0;
2556 p->se.block_max = 0;
2557 p->se.exec_max = 0;
2558 p->se.slice_max = 0;
2559
2560 p->se.nr_migrations_cold = 0;
2561 p->se.nr_failed_migrations_affine = 0;
2562 p->se.nr_failed_migrations_running = 0;
2563 p->se.nr_failed_migrations_hot = 0;
2564 p->se.nr_forced_migrations = 0;
2565
2566 p->se.nr_wakeups = 0;
2567 p->se.nr_wakeups_sync = 0;
2568 p->se.nr_wakeups_migrate = 0;
2569 p->se.nr_wakeups_local = 0;
2570 p->se.nr_wakeups_remote = 0;
2571 p->se.nr_wakeups_affine = 0;
2572 p->se.nr_wakeups_affine_attempts = 0;
2573 p->se.nr_wakeups_passive = 0;
2574 p->se.nr_wakeups_idle = 0;
2575
2576#endif 2501#endif
2577 2502
2578 INIT_LIST_HEAD(&p->rt.run_list); 2503 INIT_LIST_HEAD(&p->rt.run_list);
@@ -2593,11 +2518,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
2593 2518
2594 __sched_fork(p); 2519 __sched_fork(p);
2595 /* 2520 /*
2596 * We mark the process as waking here. This guarantees that 2521 * We mark the process as running here. This guarantees that
2597 * nobody will actually run it, and a signal or other external 2522 * nobody will actually run it, and a signal or other external
2598 * event cannot wake it up and insert it on the runqueue either. 2523 * event cannot wake it up and insert it on the runqueue either.
2599 */ 2524 */
2600 p->state = TASK_WAKING; 2525 p->state = TASK_RUNNING;
2601 2526
2602 /* 2527 /*
2603 * Revert to default priority/policy on fork if requested. 2528 * Revert to default priority/policy on fork if requested.
@@ -2632,7 +2557,16 @@ void sched_fork(struct task_struct *p, int clone_flags)
2632 if (p->sched_class->task_fork) 2557 if (p->sched_class->task_fork)
2633 p->sched_class->task_fork(p); 2558 p->sched_class->task_fork(p);
2634 2559
2560 /*
2561 * The child is not yet in the pid-hash so no cgroup attach races,
2562 * and the cgroup is pinned to this child due to cgroup_fork()
2563 * is ran before sched_fork().
2564 *
2565 * Silence PROVE_RCU.
2566 */
2567 rcu_read_lock();
2635 set_task_cpu(p, cpu); 2568 set_task_cpu(p, cpu);
2569 rcu_read_unlock();
2636 2570
2637#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2571#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2638 if (likely(sched_info_on())) 2572 if (likely(sched_info_on()))
@@ -2664,31 +2598,27 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2664 int cpu __maybe_unused = get_cpu(); 2598 int cpu __maybe_unused = get_cpu();
2665 2599
2666#ifdef CONFIG_SMP 2600#ifdef CONFIG_SMP
2601 rq = task_rq_lock(p, &flags);
2602 p->state = TASK_WAKING;
2603
2667 /* 2604 /*
2668 * Fork balancing, do it here and not earlier because: 2605 * Fork balancing, do it here and not earlier because:
2669 * - cpus_allowed can change in the fork path 2606 * - cpus_allowed can change in the fork path
2670 * - any previously selected cpu might disappear through hotplug 2607 * - any previously selected cpu might disappear through hotplug
2671 * 2608 *
2672 * We still have TASK_WAKING but PF_STARTING is gone now, meaning 2609 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2673 * ->cpus_allowed is stable, we have preemption disabled, meaning 2610 * without people poking at ->cpus_allowed.
2674 * cpu_online_mask is stable.
2675 */ 2611 */
2676 cpu = select_task_rq(p, SD_BALANCE_FORK, 0); 2612 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
2677 set_task_cpu(p, cpu); 2613 set_task_cpu(p, cpu);
2678#endif
2679 2614
2680 /*
2681 * Since the task is not on the rq and we still have TASK_WAKING set
2682 * nobody else will migrate this task.
2683 */
2684 rq = cpu_rq(cpu);
2685 raw_spin_lock_irqsave(&rq->lock, flags);
2686
2687 BUG_ON(p->state != TASK_WAKING);
2688 p->state = TASK_RUNNING; 2615 p->state = TASK_RUNNING;
2689 update_rq_clock(rq); 2616 task_rq_unlock(rq, &flags);
2617#endif
2618
2619 rq = task_rq_lock(p, &flags);
2690 activate_task(rq, p, 0); 2620 activate_task(rq, p, 0);
2691 trace_sched_wakeup_new(rq, p, 1); 2621 trace_sched_wakeup_new(p, 1);
2692 check_preempt_curr(rq, p, WF_FORK); 2622 check_preempt_curr(rq, p, WF_FORK);
2693#ifdef CONFIG_SMP 2623#ifdef CONFIG_SMP
2694 if (p->sched_class->task_woken) 2624 if (p->sched_class->task_woken)
@@ -2908,7 +2838,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2908 struct mm_struct *mm, *oldmm; 2838 struct mm_struct *mm, *oldmm;
2909 2839
2910 prepare_task_switch(rq, prev, next); 2840 prepare_task_switch(rq, prev, next);
2911 trace_sched_switch(rq, prev, next); 2841 trace_sched_switch(prev, next);
2912 mm = next->mm; 2842 mm = next->mm;
2913 oldmm = prev->active_mm; 2843 oldmm = prev->active_mm;
2914 /* 2844 /*
@@ -3006,9 +2936,9 @@ unsigned long nr_iowait(void)
3006 return sum; 2936 return sum;
3007} 2937}
3008 2938
3009unsigned long nr_iowait_cpu(void) 2939unsigned long nr_iowait_cpu(int cpu)
3010{ 2940{
3011 struct rq *this = this_rq(); 2941 struct rq *this = cpu_rq(cpu);
3012 return atomic_read(&this->nr_iowait); 2942 return atomic_read(&this->nr_iowait);
3013} 2943}
3014 2944
@@ -3025,6 +2955,61 @@ static unsigned long calc_load_update;
3025unsigned long avenrun[3]; 2955unsigned long avenrun[3];
3026EXPORT_SYMBOL(avenrun); 2956EXPORT_SYMBOL(avenrun);
3027 2957
2958static long calc_load_fold_active(struct rq *this_rq)
2959{
2960 long nr_active, delta = 0;
2961
2962 nr_active = this_rq->nr_running;
2963 nr_active += (long) this_rq->nr_uninterruptible;
2964
2965 if (nr_active != this_rq->calc_load_active) {
2966 delta = nr_active - this_rq->calc_load_active;
2967 this_rq->calc_load_active = nr_active;
2968 }
2969
2970 return delta;
2971}
2972
2973#ifdef CONFIG_NO_HZ
2974/*
2975 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
2976 *
2977 * When making the ILB scale, we should try to pull this in as well.
2978 */
2979static atomic_long_t calc_load_tasks_idle;
2980
2981static void calc_load_account_idle(struct rq *this_rq)
2982{
2983 long delta;
2984
2985 delta = calc_load_fold_active(this_rq);
2986 if (delta)
2987 atomic_long_add(delta, &calc_load_tasks_idle);
2988}
2989
2990static long calc_load_fold_idle(void)
2991{
2992 long delta = 0;
2993
2994 /*
2995 * It's got a race, we don't care...
2996 */
2997 if (atomic_long_read(&calc_load_tasks_idle))
2998 delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
2999
3000 return delta;
3001}
3002#else
3003static void calc_load_account_idle(struct rq *this_rq)
3004{
3005}
3006
3007static inline long calc_load_fold_idle(void)
3008{
3009 return 0;
3010}
3011#endif
3012
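A small user-space model of the fold-to-idle scheme above (invented names, C11 atomics standing in for atomic_long_t, not kernel code): a CPU entering NO_HZ idle parks its delta in calc_load_tasks_idle instead of touching the global count on the idle path, and whichever CPU performs the next LOAD_FREQ update drains that counter with an exchange and folds it in.

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_long calc_load_tasks;
    static atomic_long calc_load_tasks_idle;

    struct toy_rq { long nr_active; long calc_load_active; };

    static long fold_active(struct toy_rq *rq)
    {
            long delta = rq->nr_active - rq->calc_load_active;

            rq->calc_load_active = rq->nr_active;
            return delta;
    }

    /* CPU goes tickless-idle: defer its delta */
    static void account_idle(struct toy_rq *rq)
    {
            long delta = fold_active(rq);

            if (delta)
                    atomic_fetch_add(&calc_load_tasks_idle, delta);
    }

    /* periodic LOAD_FREQ update on a busy CPU: fold own delta plus
     * whatever idle CPUs left behind since the last round */
    static void account_active(struct toy_rq *rq)
    {
            long delta = fold_active(rq);

            delta += atomic_exchange(&calc_load_tasks_idle, 0);
            if (delta)
                    atomic_fetch_add(&calc_load_tasks, delta);
    }

    int main(void)
    {
            struct toy_rq a = { .nr_active = 3 }, b = { .nr_active = 1 };

            account_idle(&b);       /* CPU b parks +1 */
            account_active(&a);     /* CPU a folds its +3 and b's +1 */
            printf("calc_load_tasks=%ld\n", atomic_load(&calc_load_tasks));
            return 0;
    }

The unconditional exchange here skips the racy read-then-xchg shortcut the kernel code uses to avoid dirtying the cacheline when the counter is zero.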
3028/** 3013/**
3029 * get_avenrun - get the load average array 3014 * get_avenrun - get the load average array
3030 * @loads: pointer to dest load array 3015 * @loads: pointer to dest load array
@@ -3071,40 +3056,121 @@ void calc_global_load(void)
3071} 3056}
3072 3057
3073/* 3058/*
3074 * Either called from update_cpu_load() or from a cpu going idle 3059 * Called from update_cpu_load() to periodically update this CPU's
3060 * active count.
3075 */ 3061 */
3076static void calc_load_account_active(struct rq *this_rq) 3062static void calc_load_account_active(struct rq *this_rq)
3077{ 3063{
3078 long nr_active, delta; 3064 long delta;
3079 3065
3080 nr_active = this_rq->nr_running; 3066 if (time_before(jiffies, this_rq->calc_load_update))
3081 nr_active += (long) this_rq->nr_uninterruptible; 3067 return;
3082 3068
3083 if (nr_active != this_rq->calc_load_active) { 3069 delta = calc_load_fold_active(this_rq);
3084 delta = nr_active - this_rq->calc_load_active; 3070 delta += calc_load_fold_idle();
3085 this_rq->calc_load_active = nr_active; 3071 if (delta)
3086 atomic_long_add(delta, &calc_load_tasks); 3072 atomic_long_add(delta, &calc_load_tasks);
3073
3074 this_rq->calc_load_update += LOAD_FREQ;
3075}
3076
3077/*
3078 * The exact cpuload at various idx values, calculated at every tick would be
3079 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
3080 *
3081 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
3082 * on nth tick when cpu may be busy, then we have:
3083 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3084 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
3085 *
3086 * decay_load_missed() below does efficient calculation of
3087 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3088 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
3089 *
3090 * The calculation is approximated on a 128 point scale.
3091 * degrade_zero_ticks is the number of ticks after which load at any
3092 * particular idx is approximated to be zero.
3093 * degrade_factor is a precomputed table, a row for each load idx.
3094 * Each column corresponds to degradation factor for a power of two ticks,
3095 * based on 128 point scale.
3096 * Example:
3097 * row 2, col 3 (=12) says that the degradation at load idx 2 after
3098 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
3099 *
3100 * With this power of 2 load factors, we can degrade the load n times
3101 * by looking at 1 bits in n and doing as many mult/shift instead of
3102 * n mult/shifts needed by the exact degradation.
3103 */
3104#define DEGRADE_SHIFT 7
3105static const unsigned char
3106 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
3107static const unsigned char
3108 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
3109 {0, 0, 0, 0, 0, 0, 0, 0},
3110 {64, 32, 8, 0, 0, 0, 0, 0},
3111 {96, 72, 40, 12, 1, 0, 0},
3112 {112, 98, 75, 43, 15, 1, 0},
3113 {120, 112, 98, 76, 45, 16, 2} };
3114
3115/*
3116 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
3117 * would be when CPU is idle and so we just decay the old load without
3118 * adding any new load.
3119 */
3120static unsigned long
3121decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3122{
3123 int j = 0;
3124
3125 if (!missed_updates)
3126 return load;
3127
3128 if (missed_updates >= degrade_zero_ticks[idx])
3129 return 0;
3130
3131 if (idx == 1)
3132 return load >> missed_updates;
3133
3134 while (missed_updates) {
3135 if (missed_updates % 2)
3136 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
3137
3138 missed_updates >>= 1;
3139 j++;
3087 } 3140 }
3141 return load;
3088} 3142}
3089 3143
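The degrade_factor table and the "row 2, col 3 (=12)" example in the comment above can be sanity-checked numerically: at load index idx the per-tick retention factor is (2^idx - 1)/2^idx, so after 2^col missed ticks the remaining fraction on the 128-point scale is 128 * ((2^idx - 1)/2^idx)^(2^col); for idx 2, col 3 that is 128 * (3/4)^8, about 12.8, which the table stores as 12. decay_load_missed() then applies one table column per set bit of the missed-tick count instead of looping tick by tick. A throwaway check (not part of the patch; link with -lm, and the committed table's exact rounding may differ slightly):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
            int idx, col;

            for (idx = 1; idx <= 4; idx++) {
                    for (col = 0; col <= 7; col++) {
                            double f = (double)((1 << idx) - 1) / (1 << idx);
                            double left = 128.0 * pow(f, (double)(1 << col));

                            printf("idx=%d after 2^%d ticks: %6.2f / 128\n",
                                   idx, col, left);
                    }
            }
            return 0;
    }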
3090/* 3144/*
3091 * Update rq->cpu_load[] statistics. This function is usually called every 3145 * Update rq->cpu_load[] statistics. This function is usually called every
3092 * scheduler tick (TICK_NSEC). 3146 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3147 * every tick. We fix it up based on jiffies.
3093 */ 3148 */
3094static void update_cpu_load(struct rq *this_rq) 3149static void update_cpu_load(struct rq *this_rq)
3095{ 3150{
3096 unsigned long this_load = this_rq->load.weight; 3151 unsigned long this_load = this_rq->load.weight;
3152 unsigned long curr_jiffies = jiffies;
3153 unsigned long pending_updates;
3097 int i, scale; 3154 int i, scale;
3098 3155
3099 this_rq->nr_load_updates++; 3156 this_rq->nr_load_updates++;
3100 3157
3158 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
3159 if (curr_jiffies == this_rq->last_load_update_tick)
3160 return;
3161
3162 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
3163 this_rq->last_load_update_tick = curr_jiffies;
3164
3101 /* Update our load: */ 3165 /* Update our load: */
3102 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 3166 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
3167 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3103 unsigned long old_load, new_load; 3168 unsigned long old_load, new_load;
3104 3169
3105 /* scale is effectively 1 << i now, and >> i divides by scale */ 3170 /* scale is effectively 1 << i now, and >> i divides by scale */
3106 3171
3107 old_load = this_rq->cpu_load[i]; 3172 old_load = this_rq->cpu_load[i];
3173 old_load = decay_load_missed(old_load, pending_updates - 1, i);
3108 new_load = this_load; 3174 new_load = this_load;
3109 /* 3175 /*
3110 * Round up the averaging division if load is increasing. This 3176 * Round up the averaging division if load is increasing. This
@@ -3112,16 +3178,19 @@ static void update_cpu_load(struct rq *this_rq)
3112 * example. 3178 * example.
3113 */ 3179 */
3114 if (new_load > old_load) 3180 if (new_load > old_load)
3115 new_load += scale-1; 3181 new_load += scale - 1;
3116 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3117 }
3118 3182
3119 if (time_after_eq(jiffies, this_rq->calc_load_update)) { 3183 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
3120 this_rq->calc_load_update += LOAD_FREQ;
3121 calc_load_account_active(this_rq);
3122 } 3184 }
3123} 3185}
3124 3186
3187static void update_cpu_load_active(struct rq *this_rq)
3188{
3189 update_cpu_load(this_rq);
3190
3191 calc_load_account_active(this_rq);
3192}
3193
3125#ifdef CONFIG_SMP 3194#ifdef CONFIG_SMP
3126 3195
3127/* 3196/*
@@ -3131,44 +3200,27 @@ static void update_cpu_load(struct rq *this_rq)
3131void sched_exec(void) 3200void sched_exec(void)
3132{ 3201{
3133 struct task_struct *p = current; 3202 struct task_struct *p = current;
3134 struct migration_req req;
3135 int dest_cpu, this_cpu;
3136 unsigned long flags; 3203 unsigned long flags;
3137 struct rq *rq; 3204 struct rq *rq;
3138 3205 int dest_cpu;
3139again:
3140 this_cpu = get_cpu();
3141 dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
3142 if (dest_cpu == this_cpu) {
3143 put_cpu();
3144 return;
3145 }
3146 3206
3147 rq = task_rq_lock(p, &flags); 3207 rq = task_rq_lock(p, &flags);
3148 put_cpu(); 3208 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
3209 if (dest_cpu == smp_processor_id())
3210 goto unlock;
3149 3211
3150 /* 3212 /*
3151 * select_task_rq() can race against ->cpus_allowed 3213 * select_task_rq() can race against ->cpus_allowed
3152 */ 3214 */
3153 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) 3215 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3154 || unlikely(!cpu_active(dest_cpu))) { 3216 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
3155 task_rq_unlock(rq, &flags); 3217 struct migration_arg arg = { p, dest_cpu };
3156 goto again;
3157 }
3158
3159 /* force the process onto the specified CPU */
3160 if (migrate_task(p, dest_cpu, &req)) {
3161 /* Need to wait for migration thread (might exit: take ref). */
3162 struct task_struct *mt = rq->migration_thread;
3163 3218
3164 get_task_struct(mt);
3165 task_rq_unlock(rq, &flags); 3219 task_rq_unlock(rq, &flags);
3166 wake_up_process(mt); 3220 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
3167 put_task_struct(mt);
3168 wait_for_completion(&req.done);
3169
3170 return; 3221 return;
3171 } 3222 }
3223unlock:
3172 task_rq_unlock(rq, &flags); 3224 task_rq_unlock(rq, &flags);
3173} 3225}
3174 3226
@@ -3522,7 +3574,7 @@ void scheduler_tick(void)
3522 3574
3523 raw_spin_lock(&rq->lock); 3575 raw_spin_lock(&rq->lock);
3524 update_rq_clock(rq); 3576 update_rq_clock(rq);
3525 update_cpu_load(rq); 3577 update_cpu_load_active(rq);
3526 curr->sched_class->task_tick(rq, curr, 0); 3578 curr->sched_class->task_tick(rq, curr, 0);
3527 raw_spin_unlock(&rq->lock); 3579 raw_spin_unlock(&rq->lock);
3528 3580
@@ -3640,23 +3692,9 @@ static inline void schedule_debug(struct task_struct *prev)
3640 3692
3641static void put_prev_task(struct rq *rq, struct task_struct *prev) 3693static void put_prev_task(struct rq *rq, struct task_struct *prev)
3642{ 3694{
3643 if (prev->state == TASK_RUNNING) { 3695 if (prev->se.on_rq)
3644 u64 runtime = prev->se.sum_exec_runtime; 3696 update_rq_clock(rq);
3645 3697 rq->skip_clock_update = 0;
3646 runtime -= prev->se.prev_sum_exec_runtime;
3647 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
3648
3649 /*
3650 * In order to avoid avg_overlap growing stale when we are
3651 * indeed overlapping and hence not getting put to sleep, grow
3652 * the avg_overlap on preemption.
3653 *
3654 * We use the average preemption runtime because that
3655 * correlates to the amount of cache footprint a task can
3656 * build up.
3657 */
3658 update_avg(&prev->se.avg_overlap, runtime);
3659 }
3660 prev->sched_class->put_prev_task(rq, prev); 3698 prev->sched_class->put_prev_task(rq, prev);
3661} 3699}
3662 3700
@@ -3706,9 +3744,8 @@ need_resched:
3706 preempt_disable(); 3744 preempt_disable();
3707 cpu = smp_processor_id(); 3745 cpu = smp_processor_id();
3708 rq = cpu_rq(cpu); 3746 rq = cpu_rq(cpu);
3709 rcu_sched_qs(cpu); 3747 rcu_note_context_switch(cpu);
3710 prev = rq->curr; 3748 prev = rq->curr;
3711 switch_count = &prev->nivcsw;
3712 3749
3713 release_kernel_lock(prev); 3750 release_kernel_lock(prev);
3714need_resched_nonpreemptible: 3751need_resched_nonpreemptible:
@@ -3719,14 +3756,28 @@ need_resched_nonpreemptible:
3719 hrtick_clear(rq); 3756 hrtick_clear(rq);
3720 3757
3721 raw_spin_lock_irq(&rq->lock); 3758 raw_spin_lock_irq(&rq->lock);
3722 update_rq_clock(rq);
3723 clear_tsk_need_resched(prev); 3759 clear_tsk_need_resched(prev);
3724 3760
3761 switch_count = &prev->nivcsw;
3725 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3762 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3726 if (unlikely(signal_pending_state(prev->state, prev))) 3763 if (unlikely(signal_pending_state(prev->state, prev))) {
3727 prev->state = TASK_RUNNING; 3764 prev->state = TASK_RUNNING;
3728 else 3765 } else {
3729 deactivate_task(rq, prev, 1); 3766 /*
3767 * If a worker is going to sleep, notify and
3768 * ask workqueue whether it wants to wake up a
3769 * task to maintain concurrency. If so, wake
3770 * up the task.
3771 */
3772 if (prev->flags & PF_WQ_WORKER) {
3773 struct task_struct *to_wakeup;
3774
3775 to_wakeup = wq_worker_sleeping(prev, cpu);
3776 if (to_wakeup)
3777 try_to_wake_up_local(to_wakeup);
3778 }
3779 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3780 }
3730 switch_count = &prev->nvcsw; 3781 switch_count = &prev->nvcsw;
3731 } 3782 }
3732 3783
@@ -3748,8 +3799,10 @@ need_resched_nonpreemptible:
3748 3799
3749 context_switch(rq, prev, next); /* unlocks the rq */ 3800 context_switch(rq, prev, next); /* unlocks the rq */
3750 /* 3801 /*
3751 * the context switch might have flipped the stack from under 3802 * The context switch has flipped the stack from under us
3752 * us, hence refresh the local variables. 3803 * and restored the local variables which were saved when
3804 * this task called schedule() in the past. prev == current
3805 * is still correct, but it can be moved to another cpu/rq.
3753 */ 3806 */
3754 cpu = smp_processor_id(); 3807 cpu = smp_processor_id();
3755 rq = cpu_rq(cpu); 3808 rq = cpu_rq(cpu);
@@ -3758,11 +3811,8 @@ need_resched_nonpreemptible:
3758 3811
3759 post_schedule(rq); 3812 post_schedule(rq);
3760 3813
3761 if (unlikely(reacquire_kernel_lock(current) < 0)) { 3814 if (unlikely(reacquire_kernel_lock(prev)))
3762 prev = rq->curr;
3763 switch_count = &prev->nivcsw;
3764 goto need_resched_nonpreemptible; 3815 goto need_resched_nonpreemptible;
3765 }
3766 3816
3767 preempt_enable_no_resched(); 3817 preempt_enable_no_resched();
3768 if (need_resched()) 3818 if (need_resched())
@@ -3837,7 +3887,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3837 * off of preempt_enable. Kernel preemptions off return from interrupt 3887 * off of preempt_enable. Kernel preemptions off return from interrupt
3838 * occur there and call schedule directly. 3888 * occur there and call schedule directly.
3839 */ 3889 */
3840asmlinkage void __sched preempt_schedule(void) 3890asmlinkage void __sched notrace preempt_schedule(void)
3841{ 3891{
3842 struct thread_info *ti = current_thread_info(); 3892 struct thread_info *ti = current_thread_info();
3843 3893
@@ -3849,9 +3899,9 @@ asmlinkage void __sched preempt_schedule(void)
3849 return; 3899 return;
3850 3900
3851 do { 3901 do {
3852 add_preempt_count(PREEMPT_ACTIVE); 3902 add_preempt_count_notrace(PREEMPT_ACTIVE);
3853 schedule(); 3903 schedule();
3854 sub_preempt_count(PREEMPT_ACTIVE); 3904 sub_preempt_count_notrace(PREEMPT_ACTIVE);
3855 3905
3856 /* 3906 /*
3857 * Check again in case we missed a preemption opportunity 3907 * Check again in case we missed a preemption opportunity
@@ -3950,6 +4000,7 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3950{ 4000{
3951 __wake_up_common(q, mode, 1, 0, NULL); 4001 __wake_up_common(q, mode, 1, 0, NULL);
3952} 4002}
4003EXPORT_SYMBOL_GPL(__wake_up_locked);
3953 4004
3954void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 4005void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
3955{ 4006{
@@ -4049,8 +4100,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
4049 if (!x->done) { 4100 if (!x->done) {
4050 DECLARE_WAITQUEUE(wait, current); 4101 DECLARE_WAITQUEUE(wait, current);
4051 4102
4052 wait.flags |= WQ_FLAG_EXCLUSIVE; 4103 __add_wait_queue_tail_exclusive(&x->wait, &wait);
4053 __add_wait_queue_tail(&x->wait, &wait);
4054 do { 4104 do {
4055 if (signal_pending_state(state, current)) { 4105 if (signal_pending_state(state, current)) {
4056 timeout = -ERESTARTSYS; 4106 timeout = -ERESTARTSYS;
@@ -4161,6 +4211,23 @@ int __sched wait_for_completion_killable(struct completion *x)
4161EXPORT_SYMBOL(wait_for_completion_killable); 4211EXPORT_SYMBOL(wait_for_completion_killable);
4162 4212
4163/** 4213/**
4214 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
4215 * @x: holds the state of this particular completion
4216 * @timeout: timeout value in jiffies
4217 *
4218 * This waits for either a completion of a specific task to be
4219 * signaled or for a specified timeout to expire. It can be
4220 * interrupted by a kill signal. The timeout is in jiffies.
4221 */
4222unsigned long __sched
4223wait_for_completion_killable_timeout(struct completion *x,
4224 unsigned long timeout)
4225{
4226 return wait_for_common(x, timeout, TASK_KILLABLE);
4227}
4228EXPORT_SYMBOL(wait_for_completion_killable_timeout);
4229
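A hedged usage sketch for the new helper, written as a trivial module (not part of the patch; mirroring the other *_timeout completion variants, the return value is taken to be 0 on timeout, -ERESTARTSYS when a fatal signal arrives, and the remaining jiffies otherwise):

    #include <linux/completion.h>
    #include <linux/jiffies.h>
    #include <linux/kernel.h>
    #include <linux/module.h>

    static DECLARE_COMPLETION(toy_done);

    static int __init toy_init(void)
    {
            long ret;

            /* nobody calls complete(&toy_done), so this times out */
            ret = wait_for_completion_killable_timeout(&toy_done,
                                                       msecs_to_jiffies(100));
            if (ret == 0)
                    pr_info("toy: timed out\n");
            else if (ret == -ERESTARTSYS)
                    pr_info("toy: interrupted by a fatal signal\n");
            else
                    pr_info("toy: completed with %ld jiffies to spare\n", ret);

            return 0;
    }

    static void __exit toy_exit(void)
    {
    }

    module_init(toy_init);
    module_exit(toy_exit);
    MODULE_LICENSE("GPL");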
4230/**
4164 * try_wait_for_completion - try to decrement a completion without blocking 4231 * try_wait_for_completion - try to decrement a completion without blocking
4165 * @x: completion structure 4232 * @x: completion structure
4166 * 4233 *
@@ -4276,7 +4343,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4276 BUG_ON(prio < 0 || prio > MAX_PRIO); 4343 BUG_ON(prio < 0 || prio > MAX_PRIO);
4277 4344
4278 rq = task_rq_lock(p, &flags); 4345 rq = task_rq_lock(p, &flags);
4279 update_rq_clock(rq);
4280 4346
4281 oldprio = p->prio; 4347 oldprio = p->prio;
4282 prev_class = p->sched_class; 4348 prev_class = p->sched_class;
@@ -4297,7 +4363,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4297 if (running) 4363 if (running)
4298 p->sched_class->set_curr_task(rq); 4364 p->sched_class->set_curr_task(rq);
4299 if (on_rq) { 4365 if (on_rq) {
4300 enqueue_task(rq, p, 0, oldprio < prio); 4366 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4301 4367
4302 check_class_changed(rq, p, prev_class, oldprio, running); 4368 check_class_changed(rq, p, prev_class, oldprio, running);
4303 } 4369 }
@@ -4319,7 +4385,6 @@ void set_user_nice(struct task_struct *p, long nice)
4319 * the task might be in the middle of scheduling on another CPU. 4385 * the task might be in the middle of scheduling on another CPU.
4320 */ 4386 */
4321 rq = task_rq_lock(p, &flags); 4387 rq = task_rq_lock(p, &flags);
4322 update_rq_clock(rq);
4323 /* 4388 /*
4324 * The RT priorities are set via sched_setscheduler(), but we still 4389 * The RT priorities are set via sched_setscheduler(), but we still
4325 * allow the 'normal' nice value to be set - but as expected 4390 * allow the 'normal' nice value to be set - but as expected
@@ -4341,7 +4406,7 @@ void set_user_nice(struct task_struct *p, long nice)
4341 delta = p->prio - old_prio; 4406 delta = p->prio - old_prio;
4342 4407
4343 if (on_rq) { 4408 if (on_rq) {
4344 enqueue_task(rq, p, 0, false); 4409 enqueue_task(rq, p, 0);
4345 /* 4410 /*
4346 * If the task increased its priority or is running and 4411 * If the task increased its priority or is running and
4347 * lowered its priority, then reschedule its CPU: 4412 * lowered its priority, then reschedule its CPU:
@@ -4537,12 +4602,8 @@ recheck:
4537 */ 4602 */
4538 if (user && !capable(CAP_SYS_NICE)) { 4603 if (user && !capable(CAP_SYS_NICE)) {
4539 if (rt_policy(policy)) { 4604 if (rt_policy(policy)) {
4540 unsigned long rlim_rtprio; 4605 unsigned long rlim_rtprio =
4541 4606 task_rlimit(p, RLIMIT_RTPRIO);
4542 if (!lock_task_sighand(p, &flags))
4543 return -ESRCH;
4544 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
4545 unlock_task_sighand(p, &flags);
4546 4607
4547 /* can't set/change the rt policy */ 4608 /* can't set/change the rt policy */
4548 if (policy != p->policy && !rlim_rtprio) 4609 if (policy != p->policy && !rlim_rtprio)
@@ -4570,16 +4631,6 @@ recheck:
4570 } 4631 }
4571 4632
4572 if (user) { 4633 if (user) {
4573#ifdef CONFIG_RT_GROUP_SCHED
4574 /*
4575 * Do not allow realtime tasks into groups that have no runtime
4576 * assigned.
4577 */
4578 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4579 task_group(p)->rt_bandwidth.rt_runtime == 0)
4580 return -EPERM;
4581#endif
4582
4583 retval = security_task_setscheduler(p, policy, param); 4634 retval = security_task_setscheduler(p, policy, param);
4584 if (retval) 4635 if (retval)
4585 return retval; 4636 return retval;
@@ -4595,6 +4646,22 @@ recheck:
4595 * runqueue lock must be held. 4646 * runqueue lock must be held.
4596 */ 4647 */
4597 rq = __task_rq_lock(p); 4648 rq = __task_rq_lock(p);
4649
4650#ifdef CONFIG_RT_GROUP_SCHED
4651 if (user) {
4652 /*
4653 * Do not allow realtime tasks into groups that have no runtime
4654 * assigned.
4655 */
4656 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4657 task_group(p)->rt_bandwidth.rt_runtime == 0) {
4658 __task_rq_unlock(rq);
4659 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4660 return -EPERM;
4661 }
4662 }
4663#endif
4664
4598 /* recheck policy now with rq lock held */ 4665 /* recheck policy now with rq lock held */
4599 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4666 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4600 policy = oldpolicy = -1; 4667 policy = oldpolicy = -1;
@@ -4602,7 +4669,6 @@ recheck:
4602 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4669 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4603 goto recheck; 4670 goto recheck;
4604 } 4671 }
4605 update_rq_clock(rq);
4606 on_rq = p->se.on_rq; 4672 on_rq = p->se.on_rq;
4607 running = task_current(rq, p); 4673 running = task_current(rq, p);
4608 if (on_rq) 4674 if (on_rq)
@@ -5339,17 +5405,15 @@ static inline void sched_init_granularity(void)
5339/* 5405/*
5340 * This is how migration works: 5406 * This is how migration works:
5341 * 5407 *
5342 * 1) we queue a struct migration_req structure in the source CPU's 5408 * 1) we invoke migration_cpu_stop() on the target CPU using
5343 * runqueue and wake up that CPU's migration thread. 5409 * stop_one_cpu().
5344 * 2) we down() the locked semaphore => thread blocks. 5410 * 2) stopper starts to run (implicitly forcing the migrated thread
5345 * 3) migration thread wakes up (implicitly it forces the migrated 5411 * off the CPU)
5346 * thread off the CPU) 5412 * 3) it checks whether the migrated task is still in the wrong runqueue.
5347 * 4) it gets the migration request and checks whether the migrated 5413 * 4) if it's in the wrong runqueue then the stopper removes
5348 * task is still in the wrong runqueue.
5349 * 5) if it's in the wrong runqueue then the migration thread removes
5350 * it and puts it into the right queue. 5414 * it and puts it into the right queue.
5351 * 6) migration thread up()s the semaphore. 5415 * 5) stopper completes and stop_one_cpu() returns and the migration
5352 * 7) we wake up and the migration is done. 5416 * is done.
5353 */ 5417 */
5354 5418
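To make the new flow concrete, here is a minimal sketch (not part of the patch) of the stop_one_cpu() pattern it relies on. migration_arg and migration_cpu_stop() mirror the definitions added further down in this diff; the caller name kick_migration() is made up, and all locking and error handling are omitted.

#include <linux/sched.h>
#include <linux/stop_machine.h>

struct migration_arg {
	struct task_struct *task;
	int dest_cpu;
};

/* Runs in the stopper thread of the source CPU; defined later in this patch. */
static int migration_cpu_stop(void *data);

static void kick_migration(struct task_struct *p, int src_cpu, int dest_cpu)
{
	struct migration_arg arg = { p, dest_cpu };

	/*
	 * stop_one_cpu() queues the callback on src_cpu's per-cpu stopper
	 * (the highest-priority task on that CPU) and sleeps until it has
	 * run, so the caller must not hold rq->lock here.
	 */
	stop_one_cpu(src_cpu, migration_cpu_stop, &arg);
}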
5355/* 5419/*
@@ -5363,12 +5427,23 @@ static inline void sched_init_granularity(void)
5363 */ 5427 */
5364int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 5428int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5365{ 5429{
5366 struct migration_req req;
5367 unsigned long flags; 5430 unsigned long flags;
5368 struct rq *rq; 5431 struct rq *rq;
5432 unsigned int dest_cpu;
5369 int ret = 0; 5433 int ret = 0;
5370 5434
5435 /*
5436 * Serialize against TASK_WAKING so that ttwu() and wake_up_new_task() can
5437 * drop the rq->lock and still rely on ->cpus_allowed.
5438 */
5439again:
5440 while (task_is_waking(p))
5441 cpu_relax();
5371 rq = task_rq_lock(p, &flags); 5442 rq = task_rq_lock(p, &flags);
5443 if (task_is_waking(p)) {
5444 task_rq_unlock(rq, &flags);
5445 goto again;
5446 }
5372 5447
5373 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5448 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5374 ret = -EINVAL; 5449 ret = -EINVAL;
@@ -5392,15 +5467,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5392 if (cpumask_test_cpu(task_cpu(p), new_mask)) 5467 if (cpumask_test_cpu(task_cpu(p), new_mask))
5393 goto out; 5468 goto out;
5394 5469
5395 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { 5470 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5471 if (migrate_task(p, dest_cpu)) {
5472 struct migration_arg arg = { p, dest_cpu };
5396 /* Need help from migration thread: drop lock and wait. */ 5473 /* Need help from migration thread: drop lock and wait. */
5397 struct task_struct *mt = rq->migration_thread;
5398
5399 get_task_struct(mt);
5400 task_rq_unlock(rq, &flags); 5474 task_rq_unlock(rq, &flags);
5401 wake_up_process(mt); 5475 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5402 put_task_struct(mt);
5403 wait_for_completion(&req.done);
5404 tlb_migrate_finish(p->mm); 5476 tlb_migrate_finish(p->mm);
5405 return 0; 5477 return 0;
5406 } 5478 }
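For reference, a typical caller of set_cpus_allowed_ptr() looks roughly like the sketch below (hypothetical helper, not from this patch); the call may sleep if the stopper has to migrate the task.

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/sched.h>

static int pin_task_to_cpu2(struct task_struct *p)
{
	cpumask_var_t mask;
	int ret;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(mask);
	cpumask_set_cpu(2, mask);
	ret = set_cpus_allowed_ptr(p, mask);	/* sleeps if the stopper must move p */
	free_cpumask_var(mask);
	return ret;
}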
@@ -5458,98 +5530,49 @@ fail:
5458 return ret; 5530 return ret;
5459} 5531}
5460 5532
5461#define RCU_MIGRATION_IDLE 0
5462#define RCU_MIGRATION_NEED_QS 1
5463#define RCU_MIGRATION_GOT_QS 2
5464#define RCU_MIGRATION_MUST_SYNC 3
5465
5466/* 5533/*
5467 * migration_thread - this is a highprio system thread that performs 5534 * migration_cpu_stop - this will be executed by a highprio stopper thread
5468 * thread migration by bumping thread off CPU then 'pushing' onto 5535 * and performs thread migration by bumping thread off CPU then
5469 * another runqueue. 5536 * 'pushing' onto another runqueue.
5470 */ 5537 */
5471static int migration_thread(void *data) 5538static int migration_cpu_stop(void *data)
5472{ 5539{
5473 int badcpu; 5540 struct migration_arg *arg = data;
5474 int cpu = (long)data;
5475 struct rq *rq;
5476
5477 rq = cpu_rq(cpu);
5478 BUG_ON(rq->migration_thread != current);
5479
5480 set_current_state(TASK_INTERRUPTIBLE);
5481 while (!kthread_should_stop()) {
5482 struct migration_req *req;
5483 struct list_head *head;
5484
5485 raw_spin_lock_irq(&rq->lock);
5486
5487 if (cpu_is_offline(cpu)) {
5488 raw_spin_unlock_irq(&rq->lock);
5489 break;
5490 }
5491
5492 if (rq->active_balance) {
5493 active_load_balance(rq, cpu);
5494 rq->active_balance = 0;
5495 }
5496
5497 head = &rq->migration_queue;
5498
5499 if (list_empty(head)) {
5500 raw_spin_unlock_irq(&rq->lock);
5501 schedule();
5502 set_current_state(TASK_INTERRUPTIBLE);
5503 continue;
5504 }
5505 req = list_entry(head->next, struct migration_req, list);
5506 list_del_init(head->next);
5507
5508 if (req->task != NULL) {
5509 raw_spin_unlock(&rq->lock);
5510 __migrate_task(req->task, cpu, req->dest_cpu);
5511 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
5512 req->dest_cpu = RCU_MIGRATION_GOT_QS;
5513 raw_spin_unlock(&rq->lock);
5514 } else {
5515 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
5516 raw_spin_unlock(&rq->lock);
5517 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
5518 }
5519 local_irq_enable();
5520
5521 complete(&req->done);
5522 }
5523 __set_current_state(TASK_RUNNING);
5524
5525 return 0;
5526}
5527
5528#ifdef CONFIG_HOTPLUG_CPU
5529
5530static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
5531{
5532 int ret;
5533 5541
5542 /*
5543 * The original target cpu might have gone down and we might
5544 * be on another cpu but it doesn't matter.
5545 */
5534 local_irq_disable(); 5546 local_irq_disable();
5535 ret = __migrate_task(p, src_cpu, dest_cpu); 5547 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
5536 local_irq_enable(); 5548 local_irq_enable();
5537 return ret; 5549 return 0;
5538} 5550}
5539 5551
5552#ifdef CONFIG_HOTPLUG_CPU
5540/* 5553/*
5541 * Figure out where task on dead CPU should go, use force if necessary. 5554 * Figure out where task on dead CPU should go, use force if necessary.
5542 */ 5555 */
5543static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5556void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5544{ 5557{
5545 int dest_cpu; 5558 struct rq *rq = cpu_rq(dead_cpu);
5559 int needs_cpu, uninitialized_var(dest_cpu);
5560 unsigned long flags;
5546 5561
5547again: 5562 local_irq_save(flags);
5548 dest_cpu = select_fallback_rq(dead_cpu, p);
5549 5563
5550 /* It can have affinity changed while we were choosing. */ 5564 raw_spin_lock(&rq->lock);
5551 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) 5565 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
5552 goto again; 5566 if (needs_cpu)
5567 dest_cpu = select_fallback_rq(dead_cpu, p);
5568 raw_spin_unlock(&rq->lock);
5569 /*
5570 * It can only fail if we race with set_cpus_allowed(),
5571 * in which case the racer should migrate the task anyway.
5572 */
5573 if (needs_cpu)
5574 __migrate_task(p, dead_cpu, dest_cpu);
5575 local_irq_restore(flags);
5553} 5576}
5554 5577
5555/* 5578/*
@@ -5613,7 +5636,6 @@ void sched_idle_next(void)
5613 5636
5614 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5637 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5615 5638
5616 update_rq_clock(rq);
5617 activate_task(rq, p, 0); 5639 activate_task(rq, p, 0);
5618 5640
5619 raw_spin_unlock_irqrestore(&rq->lock, flags); 5641 raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -5668,7 +5690,6 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
5668 for ( ; ; ) { 5690 for ( ; ; ) {
5669 if (!rq->nr_running) 5691 if (!rq->nr_running)
5670 break; 5692 break;
5671 update_rq_clock(rq);
5672 next = pick_next_task(rq); 5693 next = pick_next_task(rq);
5673 if (!next) 5694 if (!next)
5674 break; 5695 break;
@@ -5891,35 +5912,20 @@ static void set_rq_offline(struct rq *rq)
5891static int __cpuinit 5912static int __cpuinit
5892migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5913migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5893{ 5914{
5894 struct task_struct *p;
5895 int cpu = (long)hcpu; 5915 int cpu = (long)hcpu;
5896 unsigned long flags; 5916 unsigned long flags;
5897 struct rq *rq; 5917 struct rq *rq = cpu_rq(cpu);
5898 5918
5899 switch (action) { 5919 switch (action) {
5900 5920
5901 case CPU_UP_PREPARE: 5921 case CPU_UP_PREPARE:
5902 case CPU_UP_PREPARE_FROZEN: 5922 case CPU_UP_PREPARE_FROZEN:
5903 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5904 if (IS_ERR(p))
5905 return NOTIFY_BAD;
5906 kthread_bind(p, cpu);
5907 /* Must be high prio: stop_machine expects to yield to it. */
5908 rq = task_rq_lock(p, &flags);
5909 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5910 task_rq_unlock(rq, &flags);
5911 get_task_struct(p);
5912 cpu_rq(cpu)->migration_thread = p;
5913 rq->calc_load_update = calc_load_update; 5923 rq->calc_load_update = calc_load_update;
5914 break; 5924 break;
5915 5925
5916 case CPU_ONLINE: 5926 case CPU_ONLINE:
5917 case CPU_ONLINE_FROZEN: 5927 case CPU_ONLINE_FROZEN:
5918 /* Strictly unnecessary, as first user will wake it. */
5919 wake_up_process(cpu_rq(cpu)->migration_thread);
5920
5921 /* Update our root-domain */ 5928 /* Update our root-domain */
5922 rq = cpu_rq(cpu);
5923 raw_spin_lock_irqsave(&rq->lock, flags); 5929 raw_spin_lock_irqsave(&rq->lock, flags);
5924 if (rq->rd) { 5930 if (rq->rd) {
5925 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5931 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -5930,61 +5936,24 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5930 break; 5936 break;
5931 5937
5932#ifdef CONFIG_HOTPLUG_CPU 5938#ifdef CONFIG_HOTPLUG_CPU
5933 case CPU_UP_CANCELED:
5934 case CPU_UP_CANCELED_FROZEN:
5935 if (!cpu_rq(cpu)->migration_thread)
5936 break;
5937 /* Unbind it from offline cpu so it can run. Fall thru. */
5938 kthread_bind(cpu_rq(cpu)->migration_thread,
5939 cpumask_any(cpu_online_mask));
5940 kthread_stop(cpu_rq(cpu)->migration_thread);
5941 put_task_struct(cpu_rq(cpu)->migration_thread);
5942 cpu_rq(cpu)->migration_thread = NULL;
5943 break;
5944
5945 case CPU_DEAD: 5939 case CPU_DEAD:
5946 case CPU_DEAD_FROZEN: 5940 case CPU_DEAD_FROZEN:
5947 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
5948 migrate_live_tasks(cpu); 5941 migrate_live_tasks(cpu);
5949 rq = cpu_rq(cpu);
5950 kthread_stop(rq->migration_thread);
5951 put_task_struct(rq->migration_thread);
5952 rq->migration_thread = NULL;
5953 /* Idle task back to normal (off runqueue, low prio) */ 5942 /* Idle task back to normal (off runqueue, low prio) */
5954 raw_spin_lock_irq(&rq->lock); 5943 raw_spin_lock_irq(&rq->lock);
5955 update_rq_clock(rq);
5956 deactivate_task(rq, rq->idle, 0); 5944 deactivate_task(rq, rq->idle, 0);
5957 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 5945 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5958 rq->idle->sched_class = &idle_sched_class; 5946 rq->idle->sched_class = &idle_sched_class;
5959 migrate_dead_tasks(cpu); 5947 migrate_dead_tasks(cpu);
5960 raw_spin_unlock_irq(&rq->lock); 5948 raw_spin_unlock_irq(&rq->lock);
5961 cpuset_unlock();
5962 migrate_nr_uninterruptible(rq); 5949 migrate_nr_uninterruptible(rq);
5963 BUG_ON(rq->nr_running != 0); 5950 BUG_ON(rq->nr_running != 0);
5964 calc_global_load_remove(rq); 5951 calc_global_load_remove(rq);
5965 /*
5966 * No need to migrate the tasks: it was best-effort if
5967 * they didn't take sched_hotcpu_mutex. Just wake up
5968 * the requestors.
5969 */
5970 raw_spin_lock_irq(&rq->lock);
5971 while (!list_empty(&rq->migration_queue)) {
5972 struct migration_req *req;
5973
5974 req = list_entry(rq->migration_queue.next,
5975 struct migration_req, list);
5976 list_del_init(&req->list);
5977 raw_spin_unlock_irq(&rq->lock);
5978 complete(&req->done);
5979 raw_spin_lock_irq(&rq->lock);
5980 }
5981 raw_spin_unlock_irq(&rq->lock);
5982 break; 5952 break;
5983 5953
5984 case CPU_DYING: 5954 case CPU_DYING:
5985 case CPU_DYING_FROZEN: 5955 case CPU_DYING_FROZEN:
5986 /* Update our root-domain */ 5956 /* Update our root-domain */
5987 rq = cpu_rq(cpu);
5988 raw_spin_lock_irqsave(&rq->lock, flags); 5957 raw_spin_lock_irqsave(&rq->lock, flags);
5989 if (rq->rd) { 5958 if (rq->rd) {
5990 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5959 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -6004,20 +5973,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6004 */ 5973 */
6005static struct notifier_block __cpuinitdata migration_notifier = { 5974static struct notifier_block __cpuinitdata migration_notifier = {
6006 .notifier_call = migration_call, 5975 .notifier_call = migration_call,
6007 .priority = 10 5976 .priority = CPU_PRI_MIGRATION,
6008}; 5977};
6009 5978
5979static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
5980 unsigned long action, void *hcpu)
5981{
5982 switch (action & ~CPU_TASKS_FROZEN) {
5983 case CPU_ONLINE:
5984 case CPU_DOWN_FAILED:
5985 set_cpu_active((long)hcpu, true);
5986 return NOTIFY_OK;
5987 default:
5988 return NOTIFY_DONE;
5989 }
5990}
5991
5992static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
5993 unsigned long action, void *hcpu)
5994{
5995 switch (action & ~CPU_TASKS_FROZEN) {
5996 case CPU_DOWN_PREPARE:
5997 set_cpu_active((long)hcpu, false);
5998 return NOTIFY_OK;
5999 default:
6000 return NOTIFY_DONE;
6001 }
6002}
6003
6010static int __init migration_init(void) 6004static int __init migration_init(void)
6011{ 6005{
6012 void *cpu = (void *)(long)smp_processor_id(); 6006 void *cpu = (void *)(long)smp_processor_id();
6013 int err; 6007 int err;
6014 6008
6015 /* Start one for the boot CPU: */ 6009 /* Initialize migration for the boot CPU */
6016 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 6010 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
6017 BUG_ON(err == NOTIFY_BAD); 6011 BUG_ON(err == NOTIFY_BAD);
6018 migration_call(&migration_notifier, CPU_ONLINE, cpu); 6012 migration_call(&migration_notifier, CPU_ONLINE, cpu);
6019 register_cpu_notifier(&migration_notifier); 6013 register_cpu_notifier(&migration_notifier);
6020 6014
6015 /* Register cpu active notifiers */
6016 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
6017 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
6018
6021 return 0; 6019 return 0;
6022} 6020}
6023early_initcall(migration_init); 6021early_initcall(migration_init);
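The sched_cpu_active()/sched_cpu_inactive() notifiers added above follow the standard hotplug-notifier pattern: mask off CPU_TASKS_FROZEN, handle the events of interest, and return NOTIFY_DONE for the rest. A generic, hypothetical sketch of that pattern (not from this patch):

#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/notifier.h>

static int my_cpu_callback(struct notifier_block *nfb,
			   unsigned long action, void *hcpu)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
		/* bring up per-cpu state for cpu (long)hcpu */
		return NOTIFY_OK;
	case CPU_DOWN_PREPARE:
		/* quiesce per-cpu state for cpu (long)hcpu */
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}

static int __init my_notifier_init(void)
{
	/*
	 * Priority 0 is the common default; the CPU_PRI_* constants used
	 * above exist to order the scheduler's own notifiers relative to
	 * everything else.
	 */
	hotcpu_notifier(my_cpu_callback, 0);
	return 0;
}
core_initcall(my_notifier_init);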
@@ -6252,23 +6250,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6252 free_rootdomain(old_rd); 6250 free_rootdomain(old_rd);
6253} 6251}
6254 6252
6255static int init_rootdomain(struct root_domain *rd, bool bootmem) 6253static int init_rootdomain(struct root_domain *rd)
6256{ 6254{
6257 gfp_t gfp = GFP_KERNEL;
6258
6259 memset(rd, 0, sizeof(*rd)); 6255 memset(rd, 0, sizeof(*rd));
6260 6256
6261 if (bootmem) 6257 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
6262 gfp = GFP_NOWAIT;
6263
6264 if (!alloc_cpumask_var(&rd->span, gfp))
6265 goto out; 6258 goto out;
6266 if (!alloc_cpumask_var(&rd->online, gfp)) 6259 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
6267 goto free_span; 6260 goto free_span;
6268 if (!alloc_cpumask_var(&rd->rto_mask, gfp)) 6261 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
6269 goto free_online; 6262 goto free_online;
6270 6263
6271 if (cpupri_init(&rd->cpupri, bootmem) != 0) 6264 if (cpupri_init(&rd->cpupri) != 0)
6272 goto free_rto_mask; 6265 goto free_rto_mask;
6273 return 0; 6266 return 0;
6274 6267
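The gotos in init_rootdomain() target cleanup labels that fall outside this hunk; the idiom is the usual reverse-order unwind, roughly as in this generic sketch (assumed shape, not quoted from the file):

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>

static int alloc_three_masks(cpumask_var_t *a, cpumask_var_t *b, cpumask_var_t *c)
{
	if (!alloc_cpumask_var(a, GFP_KERNEL))
		goto out;
	if (!alloc_cpumask_var(b, GFP_KERNEL))
		goto free_a;
	if (!alloc_cpumask_var(c, GFP_KERNEL))
		goto free_b;
	return 0;

free_b:
	free_cpumask_var(*b);
free_a:
	free_cpumask_var(*a);
out:
	return -ENOMEM;
}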
@@ -6284,7 +6277,7 @@ out:
6284 6277
6285static void init_defrootdomain(void) 6278static void init_defrootdomain(void)
6286{ 6279{
6287 init_rootdomain(&def_root_domain, true); 6280 init_rootdomain(&def_root_domain);
6288 6281
6289 atomic_set(&def_root_domain.refcount, 1); 6282 atomic_set(&def_root_domain.refcount, 1);
6290} 6283}
@@ -6297,7 +6290,7 @@ static struct root_domain *alloc_rootdomain(void)
6297 if (!rd) 6290 if (!rd)
6298 return NULL; 6291 return NULL;
6299 6292
6300 if (init_rootdomain(rd, false) != 0) { 6293 if (init_rootdomain(rd) != 0) {
6301 kfree(rd); 6294 kfree(rd);
6302 return NULL; 6295 return NULL;
6303 } 6296 }
@@ -6315,6 +6308,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6315 struct rq *rq = cpu_rq(cpu); 6308 struct rq *rq = cpu_rq(cpu);
6316 struct sched_domain *tmp; 6309 struct sched_domain *tmp;
6317 6310
6311 for (tmp = sd; tmp; tmp = tmp->parent)
6312 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6313
6318 /* Remove the sched domains which do not contribute to scheduling. */ 6314 /* Remove the sched domains which do not contribute to scheduling. */
6319 for (tmp = sd; tmp; ) { 6315 for (tmp = sd; tmp; ) {
6320 struct sched_domain *parent = tmp->parent; 6316 struct sched_domain *parent = tmp->parent;
@@ -7473,29 +7469,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7473} 7469}
7474#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 7470#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7475 7471
7476#ifndef CONFIG_CPUSETS
7477/* 7472/*
7478 * Add online and remove offline CPUs from the scheduler domains. 7473 * Update cpusets according to cpu_active mask. If cpusets are
7479 * When cpusets are enabled they take over this function. 7474 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7475 * around partition_sched_domains().
7480 */ 7476 */
7481static int update_sched_domains(struct notifier_block *nfb, 7477static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7482 unsigned long action, void *hcpu) 7478 void *hcpu)
7483{ 7479{
7484 switch (action) { 7480 switch (action & ~CPU_TASKS_FROZEN) {
7485 case CPU_ONLINE: 7481 case CPU_ONLINE:
7486 case CPU_ONLINE_FROZEN:
7487 case CPU_DOWN_PREPARE:
7488 case CPU_DOWN_PREPARE_FROZEN:
7489 case CPU_DOWN_FAILED: 7482 case CPU_DOWN_FAILED:
7490 case CPU_DOWN_FAILED_FROZEN: 7483 cpuset_update_active_cpus();
7491 partition_sched_domains(1, NULL, NULL);
7492 return NOTIFY_OK; 7484 return NOTIFY_OK;
7485 default:
7486 return NOTIFY_DONE;
7487 }
7488}
7493 7489
7490static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7491 void *hcpu)
7492{
7493 switch (action & ~CPU_TASKS_FROZEN) {
7494 case CPU_DOWN_PREPARE:
7495 cpuset_update_active_cpus();
7496 return NOTIFY_OK;
7494 default: 7497 default:
7495 return NOTIFY_DONE; 7498 return NOTIFY_DONE;
7496 } 7499 }
7497} 7500}
7498#endif
7499 7501
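The comment above refers to the !CONFIG_CPUSETS fallback of cpuset_update_active_cpus(); presumably it is a trivial inline along these lines (sketch only; the real definition lives in include/linux/cpuset.h, not in this hunk):

#ifndef CONFIG_CPUSETS
/* With cpusets disabled, just rebuild the single default sched domain set. */
static inline void cpuset_update_active_cpus(void)
{
	partition_sched_domains(1, NULL, NULL);
}
#endif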
7500static int update_runtime(struct notifier_block *nfb, 7502static int update_runtime(struct notifier_block *nfb,
7501 unsigned long action, void *hcpu) 7503 unsigned long action, void *hcpu)
@@ -7541,10 +7543,8 @@ void __init sched_init_smp(void)
7541 mutex_unlock(&sched_domains_mutex); 7543 mutex_unlock(&sched_domains_mutex);
7542 put_online_cpus(); 7544 put_online_cpus();
7543 7545
7544#ifndef CONFIG_CPUSETS 7546 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
7545 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7547 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
7546 hotcpu_notifier(update_sched_domains, 0);
7547#endif
7548 7548
7549 /* RT runtime code needs to handle some hotplug events */ 7549 /* RT runtime code needs to handle some hotplug events */
7550 hotcpu_notifier(update_runtime, 0); 7550 hotcpu_notifier(update_runtime, 0);
@@ -7789,20 +7789,26 @@ void __init sched_init(void)
7789 7789
7790 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7790 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7791 rq->cpu_load[j] = 0; 7791 rq->cpu_load[j] = 0;
7792
7793 rq->last_load_update_tick = jiffies;
7794
7792#ifdef CONFIG_SMP 7795#ifdef CONFIG_SMP
7793 rq->sd = NULL; 7796 rq->sd = NULL;
7794 rq->rd = NULL; 7797 rq->rd = NULL;
7798 rq->cpu_power = SCHED_LOAD_SCALE;
7795 rq->post_schedule = 0; 7799 rq->post_schedule = 0;
7796 rq->active_balance = 0; 7800 rq->active_balance = 0;
7797 rq->next_balance = jiffies; 7801 rq->next_balance = jiffies;
7798 rq->push_cpu = 0; 7802 rq->push_cpu = 0;
7799 rq->cpu = i; 7803 rq->cpu = i;
7800 rq->online = 0; 7804 rq->online = 0;
7801 rq->migration_thread = NULL;
7802 rq->idle_stamp = 0; 7805 rq->idle_stamp = 0;
7803 rq->avg_idle = 2*sysctl_sched_migration_cost; 7806 rq->avg_idle = 2*sysctl_sched_migration_cost;
7804 INIT_LIST_HEAD(&rq->migration_queue);
7805 rq_attach_root(rq, &def_root_domain); 7807 rq_attach_root(rq, &def_root_domain);
7808#ifdef CONFIG_NO_HZ
7809 rq->nohz_balance_kick = 0;
7810 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
7811#endif
7806#endif 7812#endif
7807 init_rq_hrtick(rq); 7813 init_rq_hrtick(rq);
7808 atomic_set(&rq->nr_iowait, 0); 7814 atomic_set(&rq->nr_iowait, 0);
@@ -7847,8 +7853,11 @@ void __init sched_init(void)
7847 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 7853 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
7848#ifdef CONFIG_SMP 7854#ifdef CONFIG_SMP
7849#ifdef CONFIG_NO_HZ 7855#ifdef CONFIG_NO_HZ
7850 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 7856 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
7851 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 7857 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
7858 atomic_set(&nohz.load_balancer, nr_cpu_ids);
7859 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
7860 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
7852#endif 7861#endif
7853 /* May be allocated at isolcpus cmdline parse time */ 7862 /* May be allocated at isolcpus cmdline parse time */
7854 if (cpu_isolated_map == NULL) 7863 if (cpu_isolated_map == NULL)
@@ -7902,7 +7911,6 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
7902{ 7911{
7903 int on_rq; 7912 int on_rq;
7904 7913
7905 update_rq_clock(rq);
7906 on_rq = p->se.on_rq; 7914 on_rq = p->se.on_rq;
7907 if (on_rq) 7915 if (on_rq)
7908 deactivate_task(rq, p, 0); 7916 deactivate_task(rq, p, 0);
@@ -7929,9 +7937,9 @@ void normalize_rt_tasks(void)
7929 7937
7930 p->se.exec_start = 0; 7938 p->se.exec_start = 0;
7931#ifdef CONFIG_SCHEDSTATS 7939#ifdef CONFIG_SCHEDSTATS
7932 p->se.wait_start = 0; 7940 p->se.statistics.wait_start = 0;
7933 p->se.sleep_start = 0; 7941 p->se.statistics.sleep_start = 0;
7934 p->se.block_start = 0; 7942 p->se.statistics.block_start = 0;
7935#endif 7943#endif
7936 7944
7937 if (!rt_task(p)) { 7945 if (!rt_task(p)) {
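The p->se.statistics.* renames above reflect the schedstat fields having been grouped into their own struct; roughly along these lines (sketch, see include/linux/sched.h for the real layout):

struct sched_statistics {
	u64	wait_start;
	u64	sleep_start;
	u64	block_start;
	/* ... further schedstat fields elided ... */
};

struct sched_entity {
	/* ... */
#ifdef CONFIG_SCHEDSTATS
	struct sched_statistics statistics;
#endif
	/* ... */
};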
@@ -7958,9 +7966,9 @@ void normalize_rt_tasks(void)
7958 7966
7959#endif /* CONFIG_MAGIC_SYSRQ */ 7967#endif /* CONFIG_MAGIC_SYSRQ */
7960 7968
7961#ifdef CONFIG_IA64 7969#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
7962/* 7970/*
7963 * These functions are only useful for the IA64 MCA handling. 7971 * These functions are only useful for the IA64 MCA handling, or kdb.
7964 * 7972 *
7965 * They can only be called when the whole system has been 7973 * They can only be called when the whole system has been
7966 * stopped - every CPU needs to be quiescent, and no scheduling 7974 * stopped - every CPU needs to be quiescent, and no scheduling
@@ -7980,6 +7988,9 @@ struct task_struct *curr_task(int cpu)
7980 return cpu_curr(cpu); 7988 return cpu_curr(cpu);
7981} 7989}
7982 7990
7991#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
7992
7993#ifdef CONFIG_IA64
7983/** 7994/**
7984 * set_curr_task - set the current task for a given cpu. 7995 * set_curr_task - set the current task for a given cpu.
7985 * @cpu: the processor in question. 7996 * @cpu: the processor in question.
@@ -8264,8 +8275,6 @@ void sched_move_task(struct task_struct *tsk)
8264 8275
8265 rq = task_rq_lock(tsk, &flags); 8276 rq = task_rq_lock(tsk, &flags);
8266 8277
8267 update_rq_clock(rq);
8268
8269 running = task_current(rq, tsk); 8278 running = task_current(rq, tsk);
8270 on_rq = tsk->se.on_rq; 8279 on_rq = tsk->se.on_rq;
8271 8280
@@ -8284,7 +8293,7 @@ void sched_move_task(struct task_struct *tsk)
8284 if (unlikely(running)) 8293 if (unlikely(running))
8285 tsk->sched_class->set_curr_task(rq); 8294 tsk->sched_class->set_curr_task(rq);
8286 if (on_rq) 8295 if (on_rq)
8287 enqueue_task(rq, tsk, 0, false); 8296 enqueue_task(rq, tsk, 0);
8288 8297
8289 task_rq_unlock(rq, &flags); 8298 task_rq_unlock(rq, &flags);
8290} 8299}
@@ -9098,43 +9107,32 @@ struct cgroup_subsys cpuacct_subsys = {
9098 9107
9099#ifndef CONFIG_SMP 9108#ifndef CONFIG_SMP
9100 9109
9101int rcu_expedited_torture_stats(char *page)
9102{
9103 return 0;
9104}
9105EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9106
9107void synchronize_sched_expedited(void) 9110void synchronize_sched_expedited(void)
9108{ 9111{
9112 barrier();
9109} 9113}
9110EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 9114EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9111 9115
9112#else /* #ifndef CONFIG_SMP */ 9116#else /* #ifndef CONFIG_SMP */
9113 9117
9114static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); 9118static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9115static DEFINE_MUTEX(rcu_sched_expedited_mutex);
9116
9117#define RCU_EXPEDITED_STATE_POST -2
9118#define RCU_EXPEDITED_STATE_IDLE -1
9119 9119
9120static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 9120static int synchronize_sched_expedited_cpu_stop(void *data)
9121
9122int rcu_expedited_torture_stats(char *page)
9123{ 9121{
9124 int cnt = 0; 9122 /*
9125 int cpu; 9123 * There must be a full memory barrier on each affected CPU
9126 9124 * between the time that try_stop_cpus() is called and the
9127 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); 9125 * time that it returns.
9128 for_each_online_cpu(cpu) { 9126 *
9129 cnt += sprintf(&page[cnt], " %d:%d", 9127 * In the current initial implementation of cpu_stop, the
9130 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); 9128 * above condition is already met when the control reaches
9131 } 9129 * this point and the following smp_mb() is not strictly
9132 cnt += sprintf(&page[cnt], "\n"); 9130 * necessary. Do smp_mb() anyway for documentation and
9133 return cnt; 9131 * robustness against future implementation changes.
9132 */
9133 smp_mb(); /* See above comment block. */
9134 return 0;
9134} 9135}
9135EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9136
9137static long synchronize_sched_expedited_count;
9138 9136
9139/* 9137/*
9140 * Wait for an rcu-sched grace period to elapse, but use "big hammer" 9138 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
@@ -9148,18 +9146,14 @@ static long synchronize_sched_expedited_count;
9148 */ 9146 */
9149void synchronize_sched_expedited(void) 9147void synchronize_sched_expedited(void)
9150{ 9148{
9151 int cpu; 9149 int snap, trycount = 0;
9152 unsigned long flags;
9153 bool need_full_sync = 0;
9154 struct rq *rq;
9155 struct migration_req *req;
9156 long snap;
9157 int trycount = 0;
9158 9150
9159 smp_mb(); /* ensure prior mod happens before capturing snap. */ 9151 smp_mb(); /* ensure prior mod happens before capturing snap. */
9160 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; 9152 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9161 get_online_cpus(); 9153 get_online_cpus();
9162 while (!mutex_trylock(&rcu_sched_expedited_mutex)) { 9154 while (try_stop_cpus(cpu_online_mask,
9155 synchronize_sched_expedited_cpu_stop,
9156 NULL) == -EAGAIN) {
9163 put_online_cpus(); 9157 put_online_cpus();
9164 if (trycount++ < 10) 9158 if (trycount++ < 10)
9165 udelay(trycount * num_online_cpus()); 9159 udelay(trycount * num_online_cpus());
@@ -9167,41 +9161,15 @@ void synchronize_sched_expedited(void)
9167 synchronize_sched(); 9161 synchronize_sched();
9168 return; 9162 return;
9169 } 9163 }
9170 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { 9164 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9171 smp_mb(); /* ensure test happens before caller kfree */ 9165 smp_mb(); /* ensure test happens before caller kfree */
9172 return; 9166 return;
9173 } 9167 }
9174 get_online_cpus(); 9168 get_online_cpus();
9175 } 9169 }
9176 rcu_expedited_state = RCU_EXPEDITED_STATE_POST; 9170 atomic_inc(&synchronize_sched_expedited_count);
9177 for_each_online_cpu(cpu) { 9171 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9178 rq = cpu_rq(cpu);
9179 req = &per_cpu(rcu_migration_req, cpu);
9180 init_completion(&req->done);
9181 req->task = NULL;
9182 req->dest_cpu = RCU_MIGRATION_NEED_QS;
9183 raw_spin_lock_irqsave(&rq->lock, flags);
9184 list_add(&req->list, &rq->migration_queue);
9185 raw_spin_unlock_irqrestore(&rq->lock, flags);
9186 wake_up_process(rq->migration_thread);
9187 }
9188 for_each_online_cpu(cpu) {
9189 rcu_expedited_state = cpu;
9190 req = &per_cpu(rcu_migration_req, cpu);
9191 rq = cpu_rq(cpu);
9192 wait_for_completion(&req->done);
9193 raw_spin_lock_irqsave(&rq->lock, flags);
9194 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
9195 need_full_sync = 1;
9196 req->dest_cpu = RCU_MIGRATION_IDLE;
9197 raw_spin_unlock_irqrestore(&rq->lock, flags);
9198 }
9199 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
9200 synchronize_sched_expedited_count++;
9201 mutex_unlock(&rcu_sched_expedited_mutex);
9202 put_online_cpus(); 9172 put_online_cpus();
9203 if (need_full_sync)
9204 synchronize_sched();
9205} 9173}
9206EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 9174EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
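For reference, a caller typically uses synchronize_sched_expedited() as a faster drop-in for synchronize_sched() in an update-then-reclaim sequence; struct foo, global_foo and update_foo() below are illustrative only:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int value;
};

static struct foo *global_foo;	/* readers access it under rcu_read_lock_sched() */

static void update_foo(struct foo *newp)
{
	struct foo *oldp = global_foo;

	rcu_assign_pointer(global_foo, newp);
	/* Force every online CPU through a quiescent state, quickly. */
	synchronize_sched_expedited();
	kfree(oldp);	/* no rcu-sched reader can still hold a reference */
}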
9207 9175