Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  686
1 files changed, 551 insertions, 135 deletions

diff --git a/kernel/sched.c b/kernel/sched.c
index da19c1e05a5a..21c1cf2e27aa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4,6 +4,7 @@
  * Kernel scheduler and related syscalls
  *
  * Copyright (C) 1991-2002  Linus Torvalds
+ * Copyright (C) 2004 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  *
  * 1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
  *             make semaphores SMP safe
@@ -16,6 +17,7 @@
  *             by Davide Libenzi, preemptible kernel bits by Robert Love.
  * 2003-09-03  Interactivity tuning by Con Kolivas.
  * 2004-04-02  Scheduler domains code by Nick Piggin
+ * 2004-10-13  Real-Time Preemption support by Ingo Molnar
  * 2007-04-15  Work begun on replacing all interactivity tuning with a
  *             fair scheduling design by Con Kolivas.
  * 2007-05-05  Load balancing (smp-nice) and other improvements
@@ -61,6 +63,7 @@
 #include <linux/sysctl.h>
 #include <linux/syscalls.h>
 #include <linux/times.h>
+#include <linux/kallsyms.h>
 #include <linux/tsacct_kern.h>
 #include <linux/kprobes.h>
 #include <linux/delayacct.h>
@@ -106,6 +109,20 @@
 #define NICE_0_LOAD SCHED_LOAD_SCALE
 #define NICE_0_SHIFT SCHED_LOAD_SHIFT
 
+#if (BITS_PER_LONG < 64)
+#define JIFFIES_TO_NS64(TIME) \
+        ((unsigned long long)(TIME) * ((unsigned long) (1000000000 / HZ)))
+
+#define NS64_TO_JIFFIES(TIME) \
+        ((((unsigned long long)((TIME)) >> BITS_PER_LONG) * \
+        (1 + NS_TO_JIFFIES(~0UL))) + NS_TO_JIFFIES((unsigned long)(TIME)))
+#else /* BITS_PER_LONG < 64 */
+
+#define NS64_TO_JIFFIES(TIME) NS_TO_JIFFIES(TIME)
+#define JIFFIES_TO_NS64(TIME) JIFFIES_TO_NS(TIME)
+
+#endif /* BITS_PER_LONG < 64 */
+
 /*
  * These are the 'tuning knobs' of the scheduler:
  *
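The JIFFIES_TO_NS64()/NS64_TO_JIFFIES() helpers added above exist so that 64-bit nanosecond values can be converted on 32-bit builds without overflowing the plain unsigned-long NS_TO_JIFFIES() helper: the high and low 32-bit halves are converted separately and summed. A small user-space sketch of the same arithmetic (HZ, the 32-bit helper and the demo values are assumptions for illustration, not part of the patch):

    #include <stdio.h>
    #include <stdint.h>

    #define HZ 1000u                          /* assumed tick rate for the demo */
    #define NSEC_PER_JIFFY (1000000000u / HZ)

    /* stand-in for NS_TO_JIFFIES() with a 32-bit unsigned long */
    static uint32_t ns_to_jiffies32(uint32_t ns)
    {
        return ns / NSEC_PER_JIFFY;
    }

    /* mirrors NS64_TO_JIFFIES() for BITS_PER_LONG == 32: convert the high
     * and low halves separately so the 32-bit helper never overflows */
    static uint64_t ns64_to_jiffies(uint64_t ns)
    {
        return (ns >> 32) * (1 + ns_to_jiffies32(UINT32_MAX))
               + ns_to_jiffies32((uint32_t)ns);
    }

    int main(void)
    {
        uint64_t ns = (uint64_t)5000 * NSEC_PER_JIFFY;  /* JIFFIES_TO_NS64(5000) */

        printf("%llu ns ~= %llu jiffies\n",
               (unsigned long long)ns, (unsigned long long)ns64_to_jiffies(ns));
        return 0;
    }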
@@ -131,6 +148,9 @@ static inline int task_has_rt_policy(struct task_struct *p)
         return rt_policy(p->policy);
 }
 
+#define TASK_PREEMPTS_CURR(p, rq) \
+        ((p)->prio < (rq)->curr->prio)
+
 /*
  * This is the priority-queue data structure of the RT scheduling class:
  */
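TASK_PREEMPTS_CURR() reads naturally once you recall that the kernel's ->prio scale is inverted: a numerically lower value means a more important task (0..99 for real-time priorities, 100..139 for normal tasks, with nice 0 mapping to 120). A trivial stand-alone illustration of the comparison (the struct and values are made up for the demo):

    #include <stdio.h>

    struct demo_task { int prio; };

    /* same test as TASK_PREEMPTS_CURR(): the lower ->prio value wins the CPU */
    static int task_preempts_curr(const struct demo_task *p,
                                  const struct demo_task *curr)
    {
        return p->prio < curr->prio;
    }

    int main(void)
    {
        struct demo_task rt_task = { .prio = 10 };   /* real-time priority   */
        struct demo_task nice0   = { .prio = 120 };  /* ordinary nice-0 task */

        printf("RT task preempts nice-0 task: %d\n",
               task_preempts_curr(&rt_task, &nice0));
        return 0;
    }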
@@ -182,6 +202,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
 
         hrtimer_init(&rt_b->rt_period_timer,
                         CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+        rt_b->rt_period_timer.irqsafe = 1;
         rt_b->rt_period_timer.function = sched_rt_period_timer;
 }
 
@@ -389,6 +410,7 @@ static inline struct task_group *task_group(struct task_struct *p)
 struct cfs_rq {
         struct load_weight load;
         unsigned long nr_running;
+        unsigned long nr_enqueued;
 
         u64 exec_clock;
         u64 min_vruntime;
@@ -466,6 +488,7 @@ struct rt_rq {
         int overloaded;
         struct plist_head pushable_tasks;
 #endif
+        unsigned long rt_nr_uninterruptible;
         int rt_throttled;
         u64 rt_time;
         u64 rt_runtime;
@@ -561,6 +584,8 @@ struct rq {
          */
         unsigned long nr_uninterruptible;
 
+        unsigned long switch_timestamp;
+        unsigned long slice_avg;
         struct task_struct *curr, *idle;
         unsigned long next_balance;
         struct mm_struct *prev_mm;
@@ -625,9 +650,21 @@ struct rq {
 
         /* BKL stats */
         unsigned int bkl_count;
+
+        /* RT-overload stats: */
+        unsigned long rto_schedule;
+        unsigned long rto_schedule_tail;
+        unsigned long rto_wakeup;
+        unsigned long rto_pulled;
+        unsigned long rto_pushed;
 #endif
 };
 
+struct task_struct *rq_curr(struct rq *rq)
+{
+        return rq->curr;
+}
+
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
 static inline
@@ -666,6 +703,13 @@ inline void update_rq_clock(struct rq *rq)
         rq->clock = sched_clock_cpu(cpu_of(rq));
 }
 
+#ifndef CONFIG_SMP
+int task_is_current(struct task_struct *task)
+{
+        return task_rq(task)->curr == task;
+}
+#endif
+
 /*
  * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
  */
@@ -807,7 +851,11 @@ late_initcall(sched_init_debug);
  * Number of tasks to iterate in a single balance run.
  * Limited because this is done with IRQs disabled.
  */
+#ifndef CONFIG_PREEMPT
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
+#else
+const_debug unsigned int sysctl_sched_nr_migrate = 8;
+#endif
 
 /*
  * ratelimit for updating the group shares.
@@ -858,11 +906,25 @@ static inline u64 global_rt_runtime(void)
         return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 }
 
+/*
+ * We really dont want to do anything complex within switch_to()
+ * on PREEMPT_RT - this check enforces this.
+ */
+#ifdef prepare_arch_switch
+# ifdef CONFIG_PREEMPT_RT
+# error FIXME
+# else
+# define _finish_arch_switch finish_arch_switch
+# endif
+#endif
+
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next) do { } while (0)
 #endif
 #ifndef finish_arch_switch
-# define finish_arch_switch(prev) do { } while (0)
+# define _finish_arch_switch(prev) do { } while (0)
+#else
+# define _finish_arch_switch finish_arch_switch
 #endif
 
 static inline int task_current(struct rq *rq, struct task_struct *p)
@@ -870,18 +932,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
         return rq->curr == p;
 }
 
-#ifndef __ARCH_WANT_UNLOCKED_CTXSW
 static inline int task_running(struct rq *rq, struct task_struct *p)
 {
+#ifdef CONFIG_SMP
+        return p->oncpu;
+#else
         return task_current(rq, p);
+#endif
 }
 
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 {
+#ifdef CONFIG_SMP
+        /*
+         * We can optimise this out completely for !SMP, because the
+         * SMP rebalancing from interrupt is the only thing that cares
+         * here.
+         */
+        next->oncpu = 1;
+#endif
 }
 
 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 {
+#ifdef CONFIG_SMP
+        /*
+         * After ->oncpu is cleared, the task can be moved to a different CPU.
+         * We must ensure this doesn't happen until the switch is completely
+         * finished.
+         */
+        smp_wmb();
+        prev->oncpu = 0;
+#endif
 #ifdef CONFIG_DEBUG_SPINLOCK
         /* this is a valid case when another task releases the spinlock */
         rq->lock.owner = current;
@@ -893,18 +976,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
          */
         spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
 
-        raw_spin_unlock_irq(&rq->lock);
+        raw_spin_unlock(&rq->lock);
 }
 
 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
-static inline int task_running(struct rq *rq, struct task_struct *p)
-{
-#ifdef CONFIG_SMP
-        return p->oncpu;
-#else
-        return task_current(rq, p);
-#endif
-}
 
 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 {
@@ -934,23 +1009,40 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
         smp_wmb();
         prev->oncpu = 0;
 #endif
-#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-        local_irq_enable();
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+        local_irq_disable();
 #endif
 }
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
 /*
+ * Check whether the task is waking, we use this to synchronize against
+ * ttwu() so that task_cpu() reports a stable number.
+ *
+ * We need to make an exception for PF_STARTING tasks because the fork
+ * path might require task_rq_lock() to work, eg. it can call
+ * set_cpus_allowed_ptr() from the cpuset clone_ns code.
+ */
+static inline int task_is_waking(struct task_struct *p)
+{
+        return unlikely((p->state & TASK_WAKING) && !(p->flags & PF_STARTING));
+}
+
+/*
  * __task_rq_lock - lock the runqueue a given task resides on.
  * Must be called interrupts disabled.
  */
 static inline struct rq *__task_rq_lock(struct task_struct *p)
         __acquires(rq->lock)
 {
+        struct rq *rq;
+
         for (;;) {
-                struct rq *rq = task_rq(p);
+                while (task_is_waking(p))
+                        cpu_relax();
+                rq = task_rq(p);
                 raw_spin_lock(&rq->lock);
-                if (likely(rq == task_rq(p)))
+                if (likely(rq == task_rq(p) && !task_is_waking(p)))
                         return rq;
                 raw_spin_unlock(&rq->lock);
         }
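The new __task_rq_lock() above follows a classic lock-and-revalidate pattern: spin while the task is marked waking, lock the runqueue the task currently appears to be on, then re-check that neither the runqueue nor the waking state changed before the lock was taken. A user-space sketch of just that control flow, with hypothetical names standing in for the kernel types (an illustration of the pattern, not kernel code):

    #include <pthread.h>
    #include <sched.h>
    #include <stdatomic.h>

    struct runqueue {
        pthread_mutex_t lock;
    };

    struct task {
        _Atomic int waking;              /* analogue of the TASK_WAKING bit */
        struct runqueue *_Atomic rq;     /* analogue of task_rq(p)          */
    };

    static struct runqueue *task_rq_lock_sketch(struct task *t)
    {
        struct runqueue *rq;

        for (;;) {
            while (atomic_load(&t->waking))
                sched_yield();                   /* cpu_relax() stand-in */
            rq = atomic_load(&t->rq);
            pthread_mutex_lock(&rq->lock);
            if (rq == atomic_load(&t->rq) && !atomic_load(&t->waking))
                return rq;                       /* stable: lock is held */
            pthread_mutex_unlock(&rq->lock);     /* raced: retry         */
        }
    }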
@@ -967,10 +1059,12 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
         struct rq *rq;
 
         for (;;) {
+                while (task_is_waking(p))
+                        cpu_relax();
                 local_irq_save(*flags);
                 rq = task_rq(p);
                 raw_spin_lock(&rq->lock);
-                if (likely(rq == task_rq(p)))
+                if (likely(rq == task_rq(p) && !task_is_waking(p)))
                         return rq;
                 raw_spin_unlock_irqrestore(&rq->lock, *flags);
         }
@@ -1147,6 +1241,7 @@ static void init_rq_hrtick(struct rq *rq)
 
         hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         rq->hrtick_timer.function = hrtick;
+        rq->hrtick_timer.irqsafe = 1;
 }
 #else /* CONFIG_SCHED_HRTICK */
 static inline void hrtick_clear(struct rq *rq)
@@ -1222,7 +1317,7 @@ void wake_up_idle_cpu(int cpu)
 {
         struct rq *rq = cpu_rq(cpu);
 
-        if (cpu == smp_processor_id())
+        if (cpu == raw_smp_processor_id())
                 return;
 
         /*
@@ -1390,7 +1485,8 @@ static const u32 prio_to_wmult[40] = {
  /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
 
-static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
+static void activate_task(struct rq *rq, struct task_struct *p, int wakeup,
+                          bool head);
 
 /*
  * runqueue iterator, to support SMP load-balancing between different
@@ -1883,13 +1979,14 @@ static void update_avg(u64 *avg, u64 sample)
         *avg += diff >> 3;
 }
 
-static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
+static void
+enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
 {
         if (wakeup)
                 p->se.start_runtime = p->se.sum_exec_runtime;
 
         sched_info_queued(p);
-        p->sched_class->enqueue_task(rq, p, wakeup);
+        p->sched_class->enqueue_task(rq, p, wakeup, head);
         p->se.on_rq = 1;
 }
 
@@ -1934,6 +2031,8 @@ static inline int normal_prio(struct task_struct *p)
                 prio = MAX_RT_PRIO-1 - p->rt_priority;
         else
                 prio = __normal_prio(p);
+
+//      trace_special_pid(p->pid, PRIO(p), __PRIO(prio));
         return prio;
 }
 
@@ -1960,12 +2059,13 @@ static int effective_prio(struct task_struct *p)
 /*
  * activate_task - move a task to the runqueue.
  */
-static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
+static void
+activate_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
 {
         if (task_contributes_to_load(p))
                 rq->nr_uninterruptible--;
 
-        enqueue_task(rq, p, wakeup);
+        enqueue_task(rq, p, wakeup, head);
         inc_nr_running(rq);
 }
 
@@ -2034,13 +2134,20 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
-#ifdef CONFIG_SCHED_DEBUG
+#if defined(CONFIG_SCHED_DEBUG)
         /*
          * We should never call set_task_cpu() on a blocked task,
          * ttwu() will sort out the placement.
          */
-        WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
-                        !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+        if (p->state != TASK_RUNNING &&
+            !(p->state & TASK_WAKING) &&
+            !(p->state & TASK_RUNNING_MUTEX) &&
+            !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)) {
+                printk(KERN_ERR "%d %s %lx %lx\n", p->pid, p->comm,
+                        (unsigned long) p->state,
+                        (unsigned long) preempt_count());
+                WARN_ON(1);
+        }
 #endif
 
         trace_sched_migrate_task(p, new_cpu);
@@ -2219,7 +2326,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
                  * yield - it could be a while.
                  */
                 if (unlikely(on_rq)) {
-                        schedule_timeout_uninterruptible(1);
+                        ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
+
+                        set_current_state(TASK_UNINTERRUPTIBLE);
+                        schedule_hrtimeout(&to, HRTIMER_MODE_REL);
                         continue;
                 }
 
@@ -2365,7 +2475,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
  * returns failure only if the task is already active.
  */
 static int try_to_wake_up(struct task_struct *p, unsigned int state,
-                          int wake_flags)
+                          int wake_flags, int mutex)
 {
         int cpu, orig_cpu, this_cpu, success = 0;
         unsigned long flags;
@@ -2395,12 +2505,8 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
         /*
          * In order to handle concurrent wakeups and release the rq->lock
          * we put the task in TASK_WAKING state.
-         *
-         * First fix up the nr_uninterruptible count:
          */
-        if (task_contributes_to_load(p))
-                rq->nr_uninterruptible--;
-        p->state = TASK_WAKING;
+        p->state |= TASK_WAKING;
 
         if (p->sched_class->task_waking)
                 p->sched_class->task_waking(rq, p);
@@ -2408,14 +2514,27 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
         __task_rq_unlock(rq);
 
         cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
-        if (cpu != orig_cpu)
+        if (cpu != orig_cpu) {
+                /*
+                 * Since we migrate the task without holding any rq->lock,
+                 * we need to be careful with task_rq_lock(), since that
+                 * might end up locking an invalid rq.
+                 */
                 set_task_cpu(p, cpu);
+        }
 
-        rq = __task_rq_lock(p);
+        rq = cpu_rq(cpu);
+        raw_spin_lock(&rq->lock);
         update_rq_clock(rq);
 
-        WARN_ON(p->state != TASK_WAKING);
-        cpu = task_cpu(p);
+        /*
+         * We migrated the task without holding either rq->lock, however
+         * since the task is not on the task list itself, nobody else
+         * will try and migrate the task, hence the rq should match the
+         * cpu we just moved it to.
+         */
+        WARN_ON(task_cpu(p) != cpu);
+        WARN_ON(!(p->state & TASK_WAKING));
 
 #ifdef CONFIG_SCHEDSTATS
         schedstat_inc(rq, ttwu_count);
@@ -2443,7 +2562,7 @@ out_activate:
                 schedstat_inc(p, se.nr_wakeups_local);
         else
                 schedstat_inc(p, se.nr_wakeups_remote);
-        activate_task(rq, p, 1);
+        activate_task(rq, p, 1, false);
         success = 1;
 
         /*
@@ -2466,7 +2585,20 @@ out_running:
         trace_sched_wakeup(rq, p, success);
         check_preempt_curr(rq, p, wake_flags);
 
-        p->state = TASK_RUNNING;
+        /*
+         * For a mutex wakeup we or TASK_RUNNING_MUTEX to the task
+         * state to preserve the original state, so a real wakeup
+         * still can see the (UN)INTERRUPTIBLE bits in the state check
+         * above. We dont have to worry about the | TASK_RUNNING_MUTEX
+         * here. The waiter is serialized by the mutex lock and nobody
+         * else can fiddle with p->state as we hold rq lock.
+         */
+        p->state &= ~TASK_WAKING;
+        if (mutex)
+                p->state |= TASK_RUNNING_MUTEX;
+        else
+                p->state = TASK_RUNNING;
+
 #ifdef CONFIG_SMP
         if (p->sched_class->task_woken)
                 p->sched_class->task_woken(rq, p);
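The comment added above is the heart of the mutex-wakeup change: a mutex wakeup ORs TASK_RUNNING_MUTEX into ->state so the original sleep bits stay visible, while a normal wakeup still assigns TASK_RUNNING and clears everything. A tiny demo of that difference (the bit values here are arbitrary stand-ins, not the kernel's):

    #include <stdio.h>

    #define TASK_INTERRUPTIBLE   0x01
    #define TASK_WAKING          0x80
    #define TASK_RUNNING_MUTEX   0x40
    #define TASK_RUNNING         0x00

    int main(void)
    {
        unsigned int state = TASK_INTERRUPTIBLE | TASK_WAKING;

        state &= ~TASK_WAKING;           /* wakeup is past the waking phase  */
        state |= TASK_RUNNING_MUTEX;     /* mutex wakeup keeps INTERRUPTIBLE */
        printf("mutex wakeup:  0x%02x (sleep bit still visible)\n", state);

        state = TASK_RUNNING;            /* normal wakeup clobbers the state */
        printf("normal wakeup: 0x%02x\n", state);
        return 0;
    }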
@@ -2502,13 +2634,31 @@ out:
  */
 int wake_up_process(struct task_struct *p)
 {
-        return try_to_wake_up(p, TASK_ALL, 0);
+        return try_to_wake_up(p, TASK_ALL, 0, 0);
 }
 EXPORT_SYMBOL(wake_up_process);
 
+int wake_up_process_sync(struct task_struct * p)
+{
+        return try_to_wake_up(p, TASK_ALL, 1, 0);
+}
+EXPORT_SYMBOL(wake_up_process_sync);
+
+int wake_up_process_mutex(struct task_struct * p)
+{
+        return try_to_wake_up(p, TASK_ALL, 0, 1);
+}
+EXPORT_SYMBOL(wake_up_process_mutex);
+
+int wake_up_process_mutex_sync(struct task_struct * p)
+{
+        return try_to_wake_up(p, TASK_ALL, 1, 1);
+}
+EXPORT_SYMBOL(wake_up_process_mutex_sync);
+
 int wake_up_state(struct task_struct *p, unsigned int state)
 {
-        return try_to_wake_up(p, state, 0);
+        return try_to_wake_up(p, state, 0, 0);
 }
 
 /*
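Read together, the new wrappers simply spell out the two extra try_to_wake_up() arguments (wake_flags, mutex) added in this patch:

    wake_up_process(p)            -> try_to_wake_up(p, TASK_ALL, 0, 0)
    wake_up_process_sync(p)       -> try_to_wake_up(p, TASK_ALL, 1, 0)
    wake_up_process_mutex(p)      -> try_to_wake_up(p, TASK_ALL, 0, 1)
    wake_up_process_mutex_sync(p) -> try_to_wake_up(p, TASK_ALL, 1, 1)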
@@ -2575,7 +2725,7 @@ static void __sched_fork(struct task_struct *p)
  */
 void sched_fork(struct task_struct *p, int clone_flags)
 {
-        int cpu = get_cpu();
+        int cpu;
 
         __sched_fork(p);
         /*
@@ -2615,16 +2765,24 @@ void sched_fork(struct task_struct *p, int clone_flags)
         if (!rt_prio(p->prio))
                 p->sched_class = &fair_sched_class;
 
+        /*
+         * task_fork() and set_task_cpu() must be called with
+         * preemption disabled
+         */
+        cpu = get_cpu();
+
         if (p->sched_class->task_fork)
                 p->sched_class->task_fork(p);
 
         set_task_cpu(p, cpu);
 
+        put_cpu();
+
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
         if (likely(sched_info_on()))
                 memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+#if defined(CONFIG_SMP)
         p->oncpu = 0;
 #endif
 #ifdef CONFIG_PREEMPT
@@ -2632,8 +2790,6 @@ void sched_fork(struct task_struct *p, int clone_flags)
         task_thread_info(p)->preempt_count = 1;
 #endif
         plist_node_init(&p->pushable_tasks, MAX_PRIO);
-
-        put_cpu();
 }
 
 /*
@@ -2663,11 +2819,17 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
         set_task_cpu(p, cpu);
 #endif
 
-        rq = task_rq_lock(p, &flags);
+        /*
+         * Since the task is not on the rq and we still have TASK_WAKING set
+         * nobody else will migrate this task.
+         */
+        rq = cpu_rq(cpu);
+        raw_spin_lock_irqsave(&rq->lock, flags);
+
         BUG_ON(p->state != TASK_WAKING);
         p->state = TASK_RUNNING;
         update_rq_clock(rq);
-        activate_task(rq, p, 0);
+        activate_task(rq, p, 0, false);
         trace_sched_wakeup_new(rq, p, 1);
         check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
@@ -2707,8 +2869,17 @@ static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
         struct preempt_notifier *notifier;
         struct hlist_node *node;
 
+        if (hlist_empty(&curr->preempt_notifiers))
+                return;
+
+        /*
+         * The KVM sched in notifier expects to be called with
+         * interrupts enabled.
+         */
+        local_irq_enable();
         hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
                 notifier->ops->sched_in(notifier, raw_smp_processor_id());
+        local_irq_disable();
 }
 
 static void
@@ -2793,13 +2964,17 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
          * Manfred Spraul <manfred@colorfullife.com>
          */
         prev_state = prev->state;
-        finish_arch_switch(prev);
+        _finish_arch_switch(prev);
         perf_event_task_sched_in(current, cpu_of(rq));
         finish_lock_switch(rq, prev);
 
         fire_sched_in_preempt_notifiers(current);
+        /*
+         * Delay the final freeing of the mm or task, so that we dont have
+         * to do complex work from within the scheduler:
+         */
         if (mm)
-                mmdrop(mm);
+                mmdrop_delayed(mm);
         if (unlikely(prev_state == TASK_DEAD)) {
                 /*
                  * Remove function-return probe instances associated with this
@@ -2853,8 +3028,10 @@ static inline void post_schedule(struct rq *rq)
 asmlinkage void schedule_tail(struct task_struct *prev)
         __releases(rq->lock)
 {
-        struct rq *rq = this_rq();
+        struct rq *rq;
 
+        preempt_disable();
+        rq = this_rq();
         finish_task_switch(rq, prev);
 
         /*
@@ -2863,9 +3040,14 @@ asmlinkage void schedule_tail(struct task_struct *prev)
          */
         post_schedule(rq);
 
+        __preempt_enable_no_resched();
+        local_irq_enable();
+
 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
         /* In this case, finish_task_switch does not reenable preemption */
         preempt_enable();
+#else
+        preempt_check_resched();
 #endif
         if (current->set_child_tid)
                 put_user(task_pid_vnr(current), current->set_child_tid);
@@ -2913,6 +3095,11 @@ context_switch(struct rq *rq, struct task_struct *prev,
         spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
 #endif
 
+#ifdef CURRENT_PTR
+        barrier();
+        *current_ptr = next;
+        *current_ti_ptr = next->thread_info;
+#endif
         /* Here we just switch the register state and the stack. */
         switch_to(prev, next, prev);
 
@@ -2959,6 +3146,11 @@ unsigned long nr_uninterruptible(void)
         return sum;
 }
 
+unsigned long nr_uninterruptible_cpu(int cpu)
+{
+        return cpu_rq(cpu)->nr_uninterruptible;
+}
+
 unsigned long long nr_context_switches(void)
 {
         int i;
@@ -2977,6 +3169,13 @@ unsigned long nr_iowait(void)
         for_each_possible_cpu(i)
                 sum += atomic_read(&cpu_rq(i)->nr_iowait);
 
+        /*
+         * Since we read the counters lockless, it might be slightly
+         * inaccurate. Do not allow it to go below zero though:
+         */
+        if (unlikely((long)sum < 0))
+                sum = 0;
+
         return sum;
 }
 
@@ -3199,7 +3398,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
 {
         deactivate_task(src_rq, p, 0);
         set_task_cpu(p, this_cpu);
-        activate_task(this_rq, p, 0);
+        activate_task(this_rq, p, 0, false);
         check_preempt_curr(this_rq, p, 0);
 }
 
@@ -3295,6 +3494,10 @@ next:
          */
         if (idle == CPU_NEWLY_IDLE)
                 goto out;
+
+        if (raw_spin_is_contended(&this_rq->lock) ||
+            raw_spin_is_contended(&busiest->lock))
+                goto out;
 #endif
 
         /*
@@ -3351,6 +3554,10 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
                  */
                 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
                         break;
+
+                if (raw_spin_is_contended(&this_rq->lock) ||
+                    raw_spin_is_contended(&busiest->lock))
+                        break;
 #endif
         } while (class && max_load_move > total_load_moved);
 
@@ -4867,7 +5074,7 @@ out:
  */
 static void run_rebalance_domains(struct softirq_action *h)
 {
-        int this_cpu = smp_processor_id();
+        int this_cpu = raw_smp_processor_id();
         struct rq *this_rq = cpu_rq(this_cpu);
         enum cpu_idle_type idle = this_rq->idle_at_tick ?
                                                 CPU_IDLE : CPU_NOT_IDLE;
@@ -5141,9 +5348,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 
         /* Add system time to cpustat. */
         tmp = cputime_to_cputime64(cputime);
-        if (hardirq_count() - hardirq_offset)
+        if ((hardirq_count() - hardirq_offset) ||
+            (p->extra_flags & PFE_HARDIRQ))
                 cpustat->irq = cputime64_add(cpustat->irq, tmp);
-        else if (softirq_count())
+        else if (softirq_count() || (p->extra_flags & PFE_SOFTIRQ))
                 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
         else
                 cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -5324,10 +5532,13 @@ void scheduler_tick(void)
 
         sched_clock_tick();
 
+        BUG_ON(!irqs_disabled());
+
         raw_spin_lock(&rq->lock);
         update_rq_clock(rq);
         update_cpu_load(rq);
-        curr->sched_class->task_tick(rq, curr, 0);
+        if (curr != rq->idle && curr->se.on_rq)
+                curr->sched_class->task_tick(rq, curr, 0);
         raw_spin_unlock(&rq->lock);
 
         perf_event_task_tick(curr, cpu);
@@ -5348,6 +5559,19 @@ notrace unsigned long get_parent_ip(unsigned long addr)
                 return addr;
 }
 
+#ifdef CONFIG_DEBUG_PREEMPT
+void notrace preempt_enable_no_resched(void)
+{
+        barrier();
+        dec_preempt_count();
+
+        WARN_ONCE(!preempt_count(),
+                KERN_ERR "BUG: %s:%d task might have lost a preemption check!\n",
+                current->comm, current->pid);
+}
+EXPORT_SYMBOL(preempt_enable_no_resched);
+#endif
+
 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
                                 defined(CONFIG_PREEMPT_TRACER))
 
@@ -5404,8 +5628,8 @@ static noinline void __schedule_bug(struct task_struct *prev)
 {
         struct pt_regs *regs = get_irq_regs();
 
-        printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
-                prev->comm, prev->pid, preempt_count());
+        printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d, CPU#%d\n",
+                prev->comm, preempt_count(), prev->pid, smp_processor_id());
 
         debug_show_held_locks(prev);
         print_modules();
@@ -5423,12 +5647,14 @@ static noinline void __schedule_bug(struct task_struct *prev)
  */
 static inline void schedule_debug(struct task_struct *prev)
 {
+//      WARN_ON(system_state == SYSTEM_BOOTING);
+
         /*
          * Test if we are atomic. Since do_exit() needs to call into
          * schedule() atomically, we ignore that path for now.
          * Otherwise, whine if we are scheduling when we should not be.
         */
-        if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
+        if (unlikely(in_atomic() && !prev->exit_state))
                 __schedule_bug(prev);
 
         profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -5499,15 +5725,13 @@ pick_next_task(struct rq *rq)
 /*
  * schedule() is the main scheduler function.
  */
-asmlinkage void __sched schedule(void)
+asmlinkage void __sched __schedule(void)
 {
         struct task_struct *prev, *next;
         unsigned long *switch_count;
         struct rq *rq;
         int cpu;
 
-need_resched:
-        preempt_disable();
         cpu = smp_processor_id();
         rq = cpu_rq(cpu);
         rcu_sched_qs(cpu);
@@ -5515,10 +5739,11 @@ need_resched:
         switch_count = &prev->nivcsw;
 
         release_kernel_lock(prev);
-need_resched_nonpreemptible:
 
         schedule_debug(prev);
 
+        preempt_disable();
+
         if (sched_feat(HRTICK))
                 hrtick_clear(rq);
 
@@ -5526,7 +5751,8 @@ need_resched_nonpreemptible:
         update_rq_clock(rq);
         clear_tsk_need_resched(prev);
 
-        if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+        if (!(prev->state & TASK_RUNNING_MUTEX) && prev->state &&
+            !(preempt_count() & PREEMPT_ACTIVE)) {
                 if (unlikely(signal_pending_state(prev->state, prev)))
                         prev->state = TASK_RUNNING;
                 else
@@ -5557,24 +5783,29 @@
                  */
                 cpu = smp_processor_id();
                 rq = cpu_rq(cpu);
-        } else
-                raw_spin_unlock_irq(&rq->lock);
+                __preempt_enable_no_resched();
+        } else {
+                __preempt_enable_no_resched();
+                raw_spin_unlock(&rq->lock);
+        }
 
         post_schedule(rq);
 
-        if (unlikely(reacquire_kernel_lock(current) < 0)) {
-                prev = rq->curr;
-                switch_count = &prev->nivcsw;
-                goto need_resched_nonpreemptible;
-        }
+        reacquire_kernel_lock(current);
+}
 
-        preempt_enable_no_resched();
+asmlinkage void __sched schedule(void)
+{
+need_resched:
+        local_irq_disable();
+        __schedule();
+        local_irq_enable();
         if (need_resched())
                 goto need_resched;
 }
 EXPORT_SYMBOL(schedule);
 
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+#if defined(CONFIG_MUTEX_SPIN_ON_OWNER) && !defined(CONFIG_PREEMPT_RT)
 /*
  * Look out! "owner" is an entirely speculative pointer
  * access and not reliable.
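The restructuring above splits the old schedule() into __schedule(), which now runs with interrupts disabled throughout, and a thin schedule() wrapper that disables interrupts, calls the core, re-enables them and loops while a reschedule is still pending. In miniature, the wrapper's shape is (names here are local stand-ins, not kernel API):

    static int resched_pending_flag;

    static void irq_disable(void) { /* local_irq_disable() would go here */ }
    static void irq_enable(void)  { /* local_irq_enable() would go here  */ }

    static void core_schedule(void)       /* plays the role of __schedule() */
    {
        /* pick the next task and context-switch ... */
        resched_pending_flag = 0;
    }

    static void schedule_sketch(void)     /* plays the role of schedule() */
    {
        do {
            irq_disable();
            core_schedule();
            irq_enable();
        } while (resched_pending_flag);
    }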
@@ -5636,6 +5867,35 @@ out:
 #endif
 
 #ifdef CONFIG_PREEMPT
+
+/*
+ * Global flag to turn preemption off on a CONFIG_PREEMPT kernel:
+ */
+int kernel_preemption = 1;
+
+static int __init preempt_setup (char *str)
+{
+        if (!strncmp(str, "off", 3)) {
+                if (kernel_preemption) {
+                        printk(KERN_INFO "turning off kernel preemption!\n");
+                        kernel_preemption = 0;
+                }
+                return 1;
+        }
+        if (!strncmp(str, "on", 2)) {
+                if (!kernel_preemption) {
+                        printk(KERN_INFO "turning on kernel preemption!\n");
+                        kernel_preemption = 1;
+                }
+                return 1;
+        }
+        get_option(&str, &kernel_preemption);
+
+        return 1;
+}
+
+__setup("preempt=", preempt_setup);
+
 /*
  * this is the entry point to schedule() from in-kernel preemption
  * off of preempt_enable. Kernel preemptions off return from interrupt
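The preempt_setup() parser registered above gives CONFIG_PREEMPT kernels a boot-time switch; on the kernel command line it would be used as, for example:

    preempt=off    # boot with in-kernel preemption disabled
    preempt=on     # keep the default CONFIG_PREEMPT behaviour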
@@ -5644,7 +5904,11 @@ out: | |||
5644 | asmlinkage void __sched preempt_schedule(void) | 5904 | asmlinkage void __sched preempt_schedule(void) |
5645 | { | 5905 | { |
5646 | struct thread_info *ti = current_thread_info(); | 5906 | struct thread_info *ti = current_thread_info(); |
5907 | struct task_struct *task = current; | ||
5908 | int saved_lock_depth; | ||
5647 | 5909 | ||
5910 | if (!kernel_preemption) | ||
5911 | return; | ||
5648 | /* | 5912 | /* |
5649 | * If there is a non-zero preempt_count or interrupts are disabled, | 5913 | * If there is a non-zero preempt_count or interrupts are disabled, |
5650 | * we do not want to preempt the current task. Just return.. | 5914 | * we do not want to preempt the current task. Just return.. |
@@ -5653,10 +5917,23 @@ asmlinkage void __sched preempt_schedule(void) | |||
5653 | return; | 5917 | return; |
5654 | 5918 | ||
5655 | do { | 5919 | do { |
5920 | local_irq_disable(); | ||
5656 | add_preempt_count(PREEMPT_ACTIVE); | 5921 | add_preempt_count(PREEMPT_ACTIVE); |
5657 | schedule(); | 5922 | |
5923 | /* | ||
5924 | * We keep the big kernel semaphore locked, but we | ||
5925 | * clear ->lock_depth so that schedule() doesnt | ||
5926 | * auto-release the semaphore: | ||
5927 | */ | ||
5928 | saved_lock_depth = task->lock_depth; | ||
5929 | task->lock_depth = -1; | ||
5930 | __schedule(); | ||
5931 | task->lock_depth = saved_lock_depth; | ||
5932 | |||
5658 | sub_preempt_count(PREEMPT_ACTIVE); | 5933 | sub_preempt_count(PREEMPT_ACTIVE); |
5659 | 5934 | ||
5935 | local_irq_enable(); | ||
5936 | |||
5660 | /* | 5937 | /* |
5661 | * Check again in case we missed a preemption opportunity | 5938 | * Check again in case we missed a preemption opportunity |
5662 | * between schedule and now. | 5939 | * between schedule and now. |
@@ -5667,23 +5944,40 @@ asmlinkage void __sched preempt_schedule(void) | |||
5667 | EXPORT_SYMBOL(preempt_schedule); | 5944 | EXPORT_SYMBOL(preempt_schedule); |
5668 | 5945 | ||
5669 | /* | 5946 | /* |
5670 | * this is the entry point to schedule() from kernel preemption | 5947 | * this is is the entry point for the IRQ return path. Called with |
5671 | * off of irq context. | 5948 | * interrupts disabled. To avoid infinite irq-entry recursion problems |
5672 | * Note, that this is called and return with irqs disabled. This will | 5949 | * with fast-paced IRQ sources we do all of this carefully to never |
5673 | * protect us against recursive calling from irq. | 5950 | * enable interrupts again. |
5674 | */ | 5951 | */ |
5675 | asmlinkage void __sched preempt_schedule_irq(void) | 5952 | asmlinkage void __sched preempt_schedule_irq(void) |
5676 | { | 5953 | { |
5677 | struct thread_info *ti = current_thread_info(); | 5954 | struct thread_info *ti = current_thread_info(); |
5955 | struct task_struct *task = current; | ||
5956 | int saved_lock_depth; | ||
5678 | 5957 | ||
5679 | /* Catch callers which need to be fixed */ | 5958 | if (!kernel_preemption) |
5680 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 5959 | return; |
5960 | /* | ||
5961 | * If there is a non-zero preempt_count then just return. | ||
5962 | * (interrupts are disabled) | ||
5963 | */ | ||
5964 | if (unlikely(ti->preempt_count)) | ||
5965 | return; | ||
5681 | 5966 | ||
5682 | do { | 5967 | do { |
5683 | add_preempt_count(PREEMPT_ACTIVE); | ||
5684 | local_irq_enable(); | ||
5685 | schedule(); | ||
5686 | local_irq_disable(); | 5968 | local_irq_disable(); |
5969 | add_preempt_count(PREEMPT_ACTIVE); | ||
5970 | |||
5971 | /* | ||
5972 | * We keep the big kernel semaphore locked, but we | ||
5973 | * clear ->lock_depth so that schedule() doesnt | ||
5974 | * auto-release the semaphore: | ||
5975 | */ | ||
5976 | saved_lock_depth = task->lock_depth; | ||
5977 | task->lock_depth = -1; | ||
5978 | __schedule(); | ||
5979 | |||
5980 | task->lock_depth = saved_lock_depth; | ||
5687 | sub_preempt_count(PREEMPT_ACTIVE); | 5981 | sub_preempt_count(PREEMPT_ACTIVE); |
5688 | 5982 | ||
5689 | /* | 5983 | /* |
@@ -5699,7 +5993,7 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
5699 | int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, | 5993 | int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, |
5700 | void *key) | 5994 | void *key) |
5701 | { | 5995 | { |
5702 | return try_to_wake_up(curr->private, mode, wake_flags); | 5996 | return try_to_wake_up(curr->private, mode, wake_flags, 0); |
5703 | } | 5997 | } |
5704 | EXPORT_SYMBOL(default_wake_function); | 5998 | EXPORT_SYMBOL(default_wake_function); |
5705 | 5999 | ||
@@ -5742,7 +6036,7 @@ void __wake_up(wait_queue_head_t *q, unsigned int mode, | |||
5742 | unsigned long flags; | 6036 | unsigned long flags; |
5743 | 6037 | ||
5744 | spin_lock_irqsave(&q->lock, flags); | 6038 | spin_lock_irqsave(&q->lock, flags); |
5745 | __wake_up_common(q, mode, nr_exclusive, 0, key); | 6039 | __wake_up_common(q, mode, nr_exclusive, 1, key); |
5746 | spin_unlock_irqrestore(&q->lock, flags); | 6040 | spin_unlock_irqrestore(&q->lock, flags); |
5747 | } | 6041 | } |
5748 | EXPORT_SYMBOL(__wake_up); | 6042 | EXPORT_SYMBOL(__wake_up); |
@@ -5822,7 +6116,7 @@ void complete(struct completion *x) | |||
5822 | 6116 | ||
5823 | spin_lock_irqsave(&x->wait.lock, flags); | 6117 | spin_lock_irqsave(&x->wait.lock, flags); |
5824 | x->done++; | 6118 | x->done++; |
5825 | __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); | 6119 | __wake_up_common(&x->wait, TASK_NORMAL, 1, 1, NULL); |
5826 | spin_unlock_irqrestore(&x->wait.lock, flags); | 6120 | spin_unlock_irqrestore(&x->wait.lock, flags); |
5827 | } | 6121 | } |
5828 | EXPORT_SYMBOL(complete); | 6122 | EXPORT_SYMBOL(complete); |
@@ -5842,7 +6136,7 @@ void complete_all(struct completion *x) | |||
5842 | 6136 | ||
5843 | spin_lock_irqsave(&x->wait.lock, flags); | 6137 | spin_lock_irqsave(&x->wait.lock, flags); |
5844 | x->done += UINT_MAX/2; | 6138 | x->done += UINT_MAX/2; |
5845 | __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); | 6139 | __wake_up_common(&x->wait, TASK_NORMAL, 0, 1, NULL); |
5846 | spin_unlock_irqrestore(&x->wait.lock, flags); | 6140 | spin_unlock_irqrestore(&x->wait.lock, flags); |
5847 | } | 6141 | } |
5848 | EXPORT_SYMBOL(complete_all); | 6142 | EXPORT_SYMBOL(complete_all); |
@@ -6058,19 +6352,19 @@ long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) | |||
6058 | } | 6352 | } |
6059 | EXPORT_SYMBOL(sleep_on_timeout); | 6353 | EXPORT_SYMBOL(sleep_on_timeout); |
6060 | 6354 | ||
6061 | #ifdef CONFIG_RT_MUTEXES | ||
6062 | |||
6063 | /* | 6355 | /* |
6064 | * rt_mutex_setprio - set the current priority of a task | 6356 | * task_setprio - set the current priority of a task |
6065 | * @p: task | 6357 | * @p: task |
6066 | * @prio: prio value (kernel-internal form) | 6358 | * @prio: prio value (kernel-internal form) |
6067 | * | 6359 | * |
6068 | * This function changes the 'effective' priority of a task. It does | 6360 | * This function changes the 'effective' priority of a task. It does |
6069 | * not touch ->normal_prio like __setscheduler(). | 6361 | * not touch ->normal_prio like __setscheduler(). |
6070 | * | 6362 | * |
6071 | * Used by the rt_mutex code to implement priority inheritance logic. | 6363 | * Used by the rt_mutex code to implement priority inheritance logic |
6364 | * and by rcupreempt-boost to boost priorities of tasks sleeping | ||
6365 | * with rcu locks. | ||
6072 | */ | 6366 | */ |
6073 | void rt_mutex_setprio(struct task_struct *p, int prio) | 6367 | void task_setprio(struct task_struct *p, int prio) |
6074 | { | 6368 | { |
6075 | unsigned long flags; | 6369 | unsigned long flags; |
6076 | int oldprio, on_rq, running; | 6370 | int oldprio, on_rq, running; |
@@ -6080,6 +6374,25 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
6080 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 6374 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
6081 | 6375 | ||
6082 | rq = task_rq_lock(p, &flags); | 6376 | rq = task_rq_lock(p, &flags); |
6377 | |||
6378 | /* | ||
6379 | * Idle task boosting is a nono in general. There is one | ||
6380 | * exception, when NOHZ is active: | ||
6381 | * | ||
6382 | * The idle task calls get_next_timer_interrupt() and holds | ||
6383 | * the timer wheel base->lock on the CPU and another CPU wants | ||
6384 | * to access the timer (probably to cancel it). We can safely | ||
6385 | * ignore the boosting request, as the idle CPU runs this code | ||
6386 | * with interrupts disabled and will complete the lock | ||
6387 | * protected section without being interrupted. So there is no | ||
6388 | * real need to boost. | ||
6389 | */ | ||
6390 | if (unlikely(p == rq->idle)) { | ||
6391 | WARN_ON(p != rq->curr); | ||
6392 | WARN_ON(p->pi_blocked_on); | ||
6393 | goto out_unlock; | ||
6394 | } | ||
6395 | |||
6083 | update_rq_clock(rq); | 6396 | update_rq_clock(rq); |
6084 | 6397 | ||
6085 | oldprio = p->prio; | 6398 | oldprio = p->prio; |
@@ -6098,18 +6411,20 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
6098 | 6411 | ||
6099 | p->prio = prio; | 6412 | p->prio = prio; |
6100 | 6413 | ||
6414 | trace_sched_task_setprio(rq, p, oldprio); | ||
6415 | |||
6101 | if (running) | 6416 | if (running) |
6102 | p->sched_class->set_curr_task(rq); | 6417 | p->sched_class->set_curr_task(rq); |
6103 | if (on_rq) { | 6418 | if (on_rq) { |
6104 | enqueue_task(rq, p, 0); | 6419 | enqueue_task(rq, p, 0, oldprio < prio); |
6105 | 6420 | ||
6106 | check_class_changed(rq, p, prev_class, oldprio, running); | 6421 | check_class_changed(rq, p, prev_class, oldprio, running); |
6107 | } | 6422 | } |
6423 | |||
6424 | out_unlock: | ||
6108 | task_rq_unlock(rq, &flags); | 6425 | task_rq_unlock(rq, &flags); |
6109 | } | 6426 | } |
6110 | 6427 | ||
6111 | #endif | ||
6112 | |||
6113 | void set_user_nice(struct task_struct *p, long nice) | 6428 | void set_user_nice(struct task_struct *p, long nice) |
6114 | { | 6429 | { |
6115 | int old_prio, delta, on_rq; | 6430 | int old_prio, delta, on_rq; |
@@ -6145,7 +6460,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
6145 | delta = p->prio - old_prio; | 6460 | delta = p->prio - old_prio; |
6146 | 6461 | ||
6147 | if (on_rq) { | 6462 | if (on_rq) { |
6148 | enqueue_task(rq, p, 0); | 6463 | enqueue_task(rq, p, 0, false); |
6149 | /* | 6464 | /* |
6150 | * If the task increased its priority or is running and | 6465 | * If the task increased its priority or is running and |
6151 | * lowered its priority, then reschedule its CPU: | 6466 | * lowered its priority, then reschedule its CPU: |
@@ -6423,7 +6738,25 @@ recheck: | |||
6423 | if (running) | 6738 | if (running) |
6424 | p->sched_class->set_curr_task(rq); | 6739 | p->sched_class->set_curr_task(rq); |
6425 | if (on_rq) { | 6740 | if (on_rq) { |
6426 | activate_task(rq, p, 0); | 6741 | /* |
6742 | * Workaround to make prio ceiling work as expected: | ||
6743 | * | ||
6744 | * Queue task to head when task is running and task is | ||
6745 | * lowering its priority. This works around the non- | ||
6746 | * availability of a sched_setprio syscall which was | ||
6747 | * tinkered into the posix spec to make prio ceiling | ||
6748 | * work correctly. | ||
6749 | * | ||
6750 | * This workaround violates the posix scheduling | ||
6751 | * semantics of tail queueing in the case that the | ||
6752 | * priority was changed by anything else than | ||
6753 | * sched_setprio, but there is no other breakage | ||
6754 | * lurking than some specification fetishists going | ||
6755 | * berserk on me. | ||
6756 | * | ||
6757 | * Fixing this in mainline needs more thoughts. | ||
6758 | */ | ||
6759 | activate_task(rq, p, 0, running && oldprio < p->prio); | ||
6427 | 6760 | ||
6428 | check_class_changed(rq, p, prev_class, oldprio, running); | 6761 | check_class_changed(rq, p, prev_class, oldprio, running); |
6429 | } | 6762 | } |
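
The workaround comment above refers to the POSIX priority-ceiling protocol (PTHREAD_PRIO_PROTECT), where unlocking a ceiling mutex drops a running task back to its base priority - exactly the "running task lowering its priority" case that gets head-queued. A hedged user-space example of that protocol, standard pthreads only and not code from this patch (needs CAP_SYS_NICE to actually run SCHED_FIFO; compile with -pthread):

        #include <pthread.h>
        #include <sched.h>
        #include <stdio.h>

        int main(void)
        {
                pthread_mutexattr_t attr;
                pthread_mutex_t ceiling_mutex;
                struct sched_param sp = { .sched_priority = 10 };

                /* Run at a modest real-time priority. */
                if (sched_setscheduler(0, SCHED_FIFO, &sp))
                        perror("sched_setscheduler");

                pthread_mutexattr_init(&attr);
                pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_PROTECT);
                pthread_mutexattr_setprioceiling(&attr, 30);   /* ceiling above base prio */
                pthread_mutex_init(&ceiling_mutex, &attr);

                pthread_mutex_lock(&ceiling_mutex);
                /* While the lock is held, the thread runs at priority 30. */
                pthread_mutex_unlock(&ceiling_mutex);
                /* On unlock the priority drops back to 10: a running task
                 * lowering its priority, which the scheduler hunk above
                 * queues to the head of its priority list. */

                pthread_mutex_destroy(&ceiling_mutex);
                pthread_mutexattr_destroy(&attr);
                return 0;
        }
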
@@ -6759,9 +7092,9 @@ SYSCALL_DEFINE0(sched_yield) | |||
6759 | __release(rq->lock); | 7092 | __release(rq->lock); |
6760 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | 7093 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); |
6761 | do_raw_spin_unlock(&rq->lock); | 7094 | do_raw_spin_unlock(&rq->lock); |
6762 | preempt_enable_no_resched(); | 7095 | local_irq_enable(); |
6763 | 7096 | ||
6764 | schedule(); | 7097 | preempt_enable_and_schedule(); |
6765 | 7098 | ||
6766 | return 0; | 7099 | return 0; |
6767 | } | 7100 | } |
@@ -6773,9 +7106,18 @@ static inline int should_resched(void) | |||
6773 | 7106 | ||
6774 | static void __cond_resched(void) | 7107 | static void __cond_resched(void) |
6775 | { | 7108 | { |
6776 | add_preempt_count(PREEMPT_ACTIVE); | 7109 | do { |
6777 | schedule(); | 7110 | add_preempt_count(PREEMPT_ACTIVE); |
6778 | sub_preempt_count(PREEMPT_ACTIVE); | 7111 | schedule(); |
7112 | sub_preempt_count(PREEMPT_ACTIVE); | ||
7113 | |||
7114 | /* | ||
7115 | * Check again in case we missed a preemption opportunity | ||
7116 | * between schedule and now. | ||
7117 | */ | ||
7118 | barrier(); | ||
7119 | |||
7120 | } while (need_resched()); | ||
6779 | } | 7121 | } |
6780 | 7122 | ||
6781 | int __sched _cond_resched(void) | 7123 | int __sched _cond_resched(void) |
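
The new loop keeps rescheduling while need_resched() is still set, with a compiler barrier so the flag really is re-read after schedule() returns. A minimal, user-space-style sketch of that re-check pattern, with invented names standing in for the kernel primitives:

        #include <stdbool.h>
        #include <stdio.h>

        #define barrier() __asm__ __volatile__("" ::: "memory")

        static bool resched_pending = true;        /* stands in for need_resched() */

        static void do_reschedule(void)
        {
                /* stand-in for schedule(); services and clears the request */
                resched_pending = false;
                puts("rescheduled");
        }

        static void cond_resched_like(void)
        {
                do {
                        do_reschedule();
                        /*
                         * Without the barrier the compiler could keep the first
                         * read of resched_pending and miss a request raised
                         * while we were "in" do_reschedule().
                         */
                        barrier();
                } while (resched_pending);
        }

        int main(void)
        {
                cond_resched_like();
                return 0;
        }
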
@@ -6816,10 +7158,16 @@ int __cond_resched_lock(spinlock_t *lock) | |||
6816 | } | 7158 | } |
6817 | EXPORT_SYMBOL(__cond_resched_lock); | 7159 | EXPORT_SYMBOL(__cond_resched_lock); |
6818 | 7160 | ||
7161 | /* | ||
7162 | * Voluntarily preempt a process context that has softirqs disabled: | ||
7163 | */ | ||
6819 | int __sched __cond_resched_softirq(void) | 7164 | int __sched __cond_resched_softirq(void) |
6820 | { | 7165 | { |
6821 | BUG_ON(!in_softirq()); | 7166 | #ifndef CONFIG_PREEMPT_SOFTIRQS |
6822 | 7167 | WARN_ON_ONCE(!in_softirq()); | |
7168 | if (!in_softirq()) | ||
7169 | return 0; | ||
7170 | #endif | ||
6823 | if (should_resched()) { | 7171 | if (should_resched()) { |
6824 | local_bh_enable(); | 7172 | local_bh_enable(); |
6825 | __cond_resched(); | 7173 | __cond_resched(); |
@@ -6830,17 +7178,75 @@ int __sched __cond_resched_softirq(void) | |||
6830 | } | 7178 | } |
6831 | EXPORT_SYMBOL(__cond_resched_softirq); | 7179 | EXPORT_SYMBOL(__cond_resched_softirq); |
6832 | 7180 | ||
7181 | /* | ||
7182 | * Voluntarily preempt a softirq context (possible with softirq threading): | ||
7183 | */ | ||
7184 | int __sched cond_resched_softirq_context(void) | ||
7185 | { | ||
7186 | WARN_ON_ONCE(!in_softirq() && !(current->extra_flags & PFE_SOFTIRQ)); | ||
7187 | |||
7188 | if (softirq_need_resched() && system_state == SYSTEM_RUNNING) { | ||
7189 | raw_local_irq_disable(); | ||
7190 | _local_bh_enable(); | ||
7191 | raw_local_irq_enable(); | ||
7192 | __cond_resched(); | ||
7193 | local_bh_disable(); | ||
7194 | return 1; | ||
7195 | } | ||
7196 | return 0; | ||
7197 | } | ||
7198 | EXPORT_SYMBOL(cond_resched_softirq_context); | ||
7199 | |||
7200 | #ifdef CONFIG_PREEMPT_VOLUNTARY | ||
7201 | int voluntary_preemption = 1; | ||
7202 | EXPORT_SYMBOL(voluntary_preemption); | ||
7203 | |||
7204 | static int __init voluntary_preempt_setup (char *str) | ||
7205 | { | ||
7206 | if (!strncmp(str, "off", 3)) | ||
7207 | voluntary_preemption = 0; | ||
7208 | else | ||
7209 | get_option(&str, &voluntary_preemption); | ||
7210 | if (!voluntary_preemption) | ||
7211 | printk("turning off voluntary preemption!\n"); | ||
7212 | |||
7213 | return 1; | ||
7214 | } | ||
7215 | |||
7216 | __setup("voluntary-preempt=", voluntary_preempt_setup); | ||
7217 | |||
7218 | #endif | ||
7219 | |||
6833 | /** | 7220 | /** |
6834 | * yield - yield the current processor to other threads. | 7221 | * yield - yield the current processor to other threads. |
6835 | * | 7222 | * |
6836 | * This is a shortcut for kernel-space yielding - it marks the | 7223 | * This is a shortcut for kernel-space yielding - it marks the |
6837 | * thread runnable and calls sys_sched_yield(). | 7224 | * thread runnable and calls sys_sched_yield(). |
6838 | */ | 7225 | */ |
6839 | void __sched yield(void) | 7226 | void __sched __yield(void) |
6840 | { | 7227 | { |
6841 | set_current_state(TASK_RUNNING); | 7228 | set_current_state(TASK_RUNNING); |
6842 | sys_sched_yield(); | 7229 | sys_sched_yield(); |
6843 | } | 7230 | } |
7231 | |||
7232 | void __sched yield(void) | ||
7233 | { | ||
7234 | static int once = 1; | ||
7235 | |||
7236 | /* | ||
7237 | * it's a bug to rely on yield() with RT priorities. We print | ||
7238 | * the first occurrence after bootup ... this will still give | ||
7239 | * us an idea about the scope of the problem, without spamming | ||
7240 | * the syslog: | ||
7241 | */ | ||
7242 | if (once && rt_task(current)) { | ||
7243 | once = 0; | ||
7244 | printk(KERN_ERR "BUG: %s:%d RT task yield()-ing!\n", | ||
7245 | current->comm, current->pid); | ||
7246 | dump_stack(); | ||
7247 | } | ||
7248 | __yield(); | ||
7249 | } | ||
6844 | EXPORT_SYMBOL(yield); | 7250 | EXPORT_SYMBOL(yield); |
6845 | 7251 | ||
6846 | /* | 7252 | /* |
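
The yield() wrapper above now warns once when a real-time task calls it: under SCHED_FIFO/SCHED_RR, yield() only rotates among runnable tasks of the same priority and never lets lower-priority work run, so using it as a wait primitive is a bug. A hedged illustration of the anti-pattern being flagged (not code from this patch; SCHED_FIFO needs CAP_SYS_NICE):

        #include <sched.h>
        #include <stdbool.h>
        #include <stdio.h>

        static volatile bool done;     /* supposedly set by someone else, eventually */

        int main(void)
        {
                struct sched_param sp = { .sched_priority = 50 };
                unsigned long spins = 0;

                if (sched_setscheduler(0, SCHED_FIFO, &sp)) {
                        perror("sched_setscheduler");
                        return 1;
                }

                /*
                 * Anti-pattern: a "polite" busy wait.  sched_yield() never
                 * yields to lower-priority tasks, so whoever is meant to set
                 * 'done' may never run.  A condition variable, futex or pipe
                 * read would block properly.  Bounded here only so the demo
                 * terminates.
                 */
                while (!done && spins < 1000000UL) {
                        sched_yield();
                        spins++;
                }
                printf("gave up after %lu yields\n", spins);
                return 0;
        }
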
@@ -7004,6 +7410,7 @@ void sched_show_task(struct task_struct *p) | |||
7004 | void show_state_filter(unsigned long state_filter) | 7410 | void show_state_filter(unsigned long state_filter) |
7005 | { | 7411 | { |
7006 | struct task_struct *g, *p; | 7412 | struct task_struct *g, *p; |
7413 | int do_unlock = 1; | ||
7007 | 7414 | ||
7008 | #if BITS_PER_LONG == 32 | 7415 | #if BITS_PER_LONG == 32 |
7009 | printk(KERN_INFO | 7416 | printk(KERN_INFO |
@@ -7012,7 +7419,16 @@ void show_state_filter(unsigned long state_filter) | |||
7012 | printk(KERN_INFO | 7419 | printk(KERN_INFO |
7013 | " task PC stack pid father\n"); | 7420 | " task PC stack pid father\n"); |
7014 | #endif | 7421 | #endif |
7422 | #ifdef CONFIG_PREEMPT_RT | ||
7423 | if (!read_trylock(&tasklist_lock)) { | ||
7424 | printk("hm, tasklist_lock write-locked.\n"); | ||
7425 | printk("ignoring ...\n"); | ||
7426 | do_unlock = 0; | ||
7427 | } | ||
7428 | #else | ||
7015 | read_lock(&tasklist_lock); | 7429 | read_lock(&tasklist_lock); |
7430 | #endif | ||
7431 | |||
7016 | do_each_thread(g, p) { | 7432 | do_each_thread(g, p) { |
7017 | /* | 7433 | /* |
7018 | * reset the NMI-timeout, listing all files on a slow | 7434 | * reset the NMI-timeout, listing all files on a slow |
@@ -7028,7 +7444,8 @@ void show_state_filter(unsigned long state_filter) | |||
7028 | #ifdef CONFIG_SCHED_DEBUG | 7444 | #ifdef CONFIG_SCHED_DEBUG |
7029 | sysrq_sched_debug_show(); | 7445 | sysrq_sched_debug_show(); |
7030 | #endif | 7446 | #endif |
7031 | read_unlock(&tasklist_lock); | 7447 | if (do_unlock) |
7448 | read_unlock(&tasklist_lock); | ||
7032 | /* | 7449 | /* |
7033 | * Only show locks if all tasks are dumped: | 7450 | * Only show locks if all tasks are dumped: |
7034 | */ | 7451 | */ |
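
Under CONFIG_PREEMPT_RT the dump path above uses read_trylock() so a SysRq task listing cannot block on a write-held tasklist_lock; if the trylock fails it dumps anyway and skips the unlock. The same trylock-and-degrade pattern, sketched with a POSIX rwlock purely for illustration (compile with -pthread):

        #include <pthread.h>
        #include <stdbool.h>
        #include <stdio.h>

        static pthread_rwlock_t task_list_lock = PTHREAD_RWLOCK_INITIALIZER;

        static void dump_tasks(void)
        {
                bool locked = (pthread_rwlock_tryrdlock(&task_list_lock) == 0);

                if (!locked)
                        puts("hm, list is write-locked - dumping without the lock");

                /* ... walk and print the list here (may race if !locked) ... */
                puts("dumping tasks");

                if (locked)
                        pthread_rwlock_unlock(&task_list_lock);
        }

        int main(void)
        {
                dump_tasks();
                return 0;
        }
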
@@ -7064,17 +7481,14 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
7064 | __set_task_cpu(idle, cpu); | 7481 | __set_task_cpu(idle, cpu); |
7065 | 7482 | ||
7066 | rq->curr = rq->idle = idle; | 7483 | rq->curr = rq->idle = idle; |
7067 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 7484 | #if defined(CONFIG_SMP) |
7068 | idle->oncpu = 1; | 7485 | idle->oncpu = 1; |
7069 | #endif | 7486 | #endif |
7070 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 7487 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
7071 | 7488 | ||
7072 | /* Set the preempt count _outside_ the spinlocks! */ | 7489 | /* Set the preempt count _outside_ the spinlocks! */ |
7073 | #if defined(CONFIG_PREEMPT) | ||
7074 | task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); | ||
7075 | #else | ||
7076 | task_thread_info(idle)->preempt_count = 0; | 7490 | task_thread_info(idle)->preempt_count = 0; |
7077 | #endif | 7491 | |
7078 | /* | 7492 | /* |
7079 | * The idle tasks have their own, simple scheduling class: | 7493 | * The idle tasks have their own, simple scheduling class: |
7080 | */ | 7494 | */ |
@@ -7172,27 +7586,8 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
7172 | struct rq *rq; | 7586 | struct rq *rq; |
7173 | int ret = 0; | 7587 | int ret = 0; |
7174 | 7588 | ||
7175 | /* | ||
7176 | * Since we rely on wake-ups to migrate sleeping tasks, don't change | ||
7177 | * the ->cpus_allowed mask from under waking tasks, which would be | ||
7178 | * possible when we change rq->lock in ttwu(), so synchronize against | ||
7179 | * TASK_WAKING to avoid that. | ||
7180 | * | ||
7181 | * Make an exception for freshly cloned tasks, since cpuset namespaces | ||
7182 | * might move the task about, we have to validate the target in | ||
7183 | * wake_up_new_task() anyway since the cpu might have gone away. | ||
7184 | */ | ||
7185 | again: | ||
7186 | while (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) | ||
7187 | cpu_relax(); | ||
7188 | |||
7189 | rq = task_rq_lock(p, &flags); | 7589 | rq = task_rq_lock(p, &flags); |
7190 | 7590 | ||
7191 | if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) { | ||
7192 | task_rq_unlock(rq, &flags); | ||
7193 | goto again; | ||
7194 | } | ||
7195 | |||
7196 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { | 7591 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { |
7197 | ret = -EINVAL; | 7592 | ret = -EINVAL; |
7198 | goto out; | 7593 | goto out; |
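
Even with the TASK_WAKING dance removed, set_cpus_allowed_ptr() above still rejects masks that do not intersect cpu_active_mask. Its user-space counterpart, sched_setaffinity(), behaves the same way for a mask with no usable CPU; a hedged example:

        /* User-space counterpart of the affinity check kept in the hunk above:
         * sched_setaffinity() rejects a mask with no usable CPU (EINVAL), much
         * as set_cpus_allowed_ptr() returns -EINVAL when the mask misses
         * cpu_active_mask.  Illustration only. */
        #define _GNU_SOURCE
        #include <sched.h>
        #include <stdio.h>

        int main(void)
        {
                cpu_set_t mask;

                CPU_ZERO(&mask);
                CPU_SET(0, &mask);                 /* pin this process to CPU 0 */
                if (sched_setaffinity(0, sizeof(mask), &mask)) {
                        perror("sched_setaffinity");
                        return 1;
                }

                CPU_ZERO(&mask);                   /* empty mask: no runnable CPU */
                if (sched_setaffinity(0, sizeof(mask), &mask))
                        perror("empty mask rejected as expected");

                return 0;
        }
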
@@ -7248,11 +7643,18 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); | |||
7248 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | 7643 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) |
7249 | { | 7644 | { |
7250 | struct rq *rq_dest, *rq_src; | 7645 | struct rq *rq_dest, *rq_src; |
7646 | unsigned long flags; | ||
7251 | int ret = 0; | 7647 | int ret = 0; |
7252 | 7648 | ||
7253 | if (unlikely(!cpu_active(dest_cpu))) | 7649 | if (unlikely(!cpu_active(dest_cpu))) |
7254 | return ret; | 7650 | return ret; |
7255 | 7651 | ||
7652 | /* | ||
7653 | * PREEMPT_RT: this relies on write_lock_irq(&tasklist_lock) | ||
7654 | * disabling interrupts - which it does not do on PREEMPT_RT: | ||
7655 | */ | ||
7656 | local_irq_save(flags); | ||
7657 | |||
7256 | rq_src = cpu_rq(src_cpu); | 7658 | rq_src = cpu_rq(src_cpu); |
7257 | rq_dest = cpu_rq(dest_cpu); | 7659 | rq_dest = cpu_rq(dest_cpu); |
7258 | 7660 | ||
@@ -7271,13 +7673,15 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
7271 | if (p->se.on_rq) { | 7673 | if (p->se.on_rq) { |
7272 | deactivate_task(rq_src, p, 0); | 7674 | deactivate_task(rq_src, p, 0); |
7273 | set_task_cpu(p, dest_cpu); | 7675 | set_task_cpu(p, dest_cpu); |
7274 | activate_task(rq_dest, p, 0); | 7676 | activate_task(rq_dest, p, 0, false); |
7275 | check_preempt_curr(rq_dest, p, 0); | 7677 | check_preempt_curr(rq_dest, p, 0); |
7276 | } | 7678 | } |
7277 | done: | 7679 | done: |
7278 | ret = 1; | 7680 | ret = 1; |
7279 | fail: | 7681 | fail: |
7280 | double_rq_unlock(rq_src, rq_dest); | 7682 | double_rq_unlock(rq_src, rq_dest); |
7683 | local_irq_restore(flags); | ||
7684 | |||
7281 | return ret; | 7685 | return ret; |
7282 | } | 7686 | } |
7283 | 7687 | ||
@@ -7437,7 +7841,7 @@ void sched_idle_next(void) | |||
7437 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); | 7841 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); |
7438 | 7842 | ||
7439 | update_rq_clock(rq); | 7843 | update_rq_clock(rq); |
7440 | activate_task(rq, p, 0); | 7844 | activate_task(rq, p, 0, false); |
7441 | 7845 | ||
7442 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 7846 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
7443 | } | 7847 | } |
@@ -7454,7 +7858,11 @@ void idle_task_exit(void) | |||
7454 | 7858 | ||
7455 | if (mm != &init_mm) | 7859 | if (mm != &init_mm) |
7456 | switch_mm(mm, &init_mm, current); | 7860 | switch_mm(mm, &init_mm, current); |
7861 | #ifdef CONFIG_PREEMPT_RT | ||
7862 | mmdrop_delayed(mm); | ||
7863 | #else | ||
7457 | mmdrop(mm); | 7864 | mmdrop(mm); |
7865 | #endif | ||
7458 | } | 7866 | } |
7459 | 7867 | ||
7460 | /* called under rq->lock with disabled interrupts */ | 7868 | /* called under rq->lock with disabled interrupts */ |
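
On PREEMPT_RT the idle-exit path above calls mmdrop_delayed() instead of mmdrop(); its implementation is not part of this hunk, but the presumed idea is to hand the mm to a context that may block instead of releasing it right here. A rough user-space sketch of that deferred-release pattern, with all names invented (compile with -pthread):

        #include <pthread.h>
        #include <stdio.h>
        #include <stdlib.h>

        static void *deferred;
        static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
        static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

        static void drop_delayed(void *obj)        /* cheap, non-blocking caller side */
        {
                pthread_mutex_lock(&lock);
                deferred = obj;
                pthread_cond_signal(&cond);
                pthread_mutex_unlock(&lock);
        }

        static void *reaper(void *unused)          /* schedulable context does the work */
        {
                (void)unused;
                pthread_mutex_lock(&lock);
                while (!deferred)
                        pthread_cond_wait(&cond, &lock);
                free(deferred);                    /* the potentially expensive part */
                pthread_mutex_unlock(&lock);
                puts("deferred object released");
                return NULL;
        }

        int main(void)
        {
                pthread_t t;

                pthread_create(&t, NULL, reaper, NULL);
                drop_delayed(malloc(4096));
                pthread_join(t, NULL);
                return 0;
        }
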
@@ -9699,6 +10107,9 @@ void __init sched_init(void) | |||
9699 | atomic_inc(&init_mm.mm_count); | 10107 | atomic_inc(&init_mm.mm_count); |
9700 | enter_lazy_tlb(&init_mm, current); | 10108 | enter_lazy_tlb(&init_mm, current); |
9701 | 10109 | ||
10110 | #ifdef CONFIG_PREEMPT_RT | ||
10111 | printk("Real-Time Preemption Support (C) 2004-2007 Ingo Molnar\n"); | ||
10112 | #endif | ||
9702 | /* | 10113 | /* |
9703 | * Make us the idle thread. Technically, schedule() should not be | 10114 | * Make us the idle thread. Technically, schedule() should not be |
9704 | * called from this thread, however somewhere below it might be, | 10115 | * called from this thread, however somewhere below it might be, |
@@ -9731,10 +10142,14 @@ void __init sched_init(void) | |||
9731 | scheduler_running = 1; | 10142 | scheduler_running = 1; |
9732 | } | 10143 | } |
9733 | 10144 | ||
9734 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 10145 | #if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT) |
9735 | static inline int preempt_count_equals(int preempt_offset) | 10146 | static inline int preempt_count_equals(int preempt_offset) |
9736 | { | 10147 | { |
9737 | int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); | 10148 | int nested = (preempt_count() & ~PREEMPT_ACTIVE); |
10149 | |||
10150 | #ifndef CONFIG_PREEMPT_RT | ||
10151 | nested += rcu_preempt_depth(); | ||
10152 | #endif | ||
9738 | 10153 | ||
9739 | return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); | 10154 | return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); |
9740 | } | 10155 | } |
@@ -9755,7 +10170,8 @@ void __might_sleep(char *file, int line, int preempt_offset) | |||
9755 | "BUG: sleeping function called from invalid context at %s:%d\n", | 10170 | "BUG: sleeping function called from invalid context at %s:%d\n", |
9756 | file, line); | 10171 | file, line); |
9757 | printk(KERN_ERR | 10172 | printk(KERN_ERR |
9758 | "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", | 10173 | "pcnt: %x %d in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", |
10174 | preempt_count(), preempt_offset, | ||
9759 | in_atomic(), irqs_disabled(), | 10175 | in_atomic(), irqs_disabled(), |
9760 | current->pid, current->comm); | 10176 | current->pid, current->comm); |
9761 | 10177 | ||
@@ -9779,7 +10195,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p) | |||
9779 | deactivate_task(rq, p, 0); | 10195 | deactivate_task(rq, p, 0); |
9780 | __setscheduler(rq, p, SCHED_NORMAL, 0); | 10196 | __setscheduler(rq, p, SCHED_NORMAL, 0); |
9781 | if (on_rq) { | 10197 | if (on_rq) { |
9782 | activate_task(rq, p, 0); | 10198 | activate_task(rq, p, 0, false); |
9783 | resched_task(rq->curr); | 10199 | resched_task(rq->curr); |
9784 | } | 10200 | } |
9785 | } | 10201 | } |
@@ -10155,7 +10571,7 @@ void sched_move_task(struct task_struct *tsk) | |||
10155 | if (unlikely(running)) | 10571 | if (unlikely(running)) |
10156 | tsk->sched_class->set_curr_task(rq); | 10572 | tsk->sched_class->set_curr_task(rq); |
10157 | if (on_rq) | 10573 | if (on_rq) |
10158 | enqueue_task(rq, tsk, 0); | 10574 | enqueue_task(rq, tsk, 0, false); |
10159 | 10575 | ||
10160 | task_rq_unlock(rq, &flags); | 10576 | task_rq_unlock(rq, &flags); |
10161 | } | 10577 | } |