author    Linus Torvalds <torvalds@linux-foundation.org>  2008-05-05 20:31:14 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2008-05-05 20:31:14 -0400
commit    bb896afe2089575ca1bb1fbf3f07b934e1ba999b (patch)
tree      7300f9b4e3e267fe97d898440c72ff2c4c327f23
parent    2e83fc4df5f27dfc1b53044c4f142b2f9d1db08c (diff)
parent    aac6abca858386438d9a7233c3471d2ecfa2f704 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched-fixes
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched-fixes:
  sched: default to n for GROUP_SCHED and FAIR_GROUP_SCHED
  sched: add optional support for CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
  sched, x86: add HAVE_UNSTABLE_SCHED_CLOCK
  sched: fix cpu clock
  sched: fair-group: fix a Div0 error of the fair group scheduler
  sched: fix missing locking in sched_domains code
  sched: make clock sync tunable by architecture code
  sched: fix debugging
  sched: fix sched_info_switch not being called according to documentation
  sched: fix hrtick_start_fair and CPU-Hotplug
  sched: fix SCHED_FAIR wake-idle logic error
  sched: fix RT task-wakeup logic
  sched: add statics, don't return void expressions
  sched: add debug checks to idle functions
  sched: remove old sched doc
  sched: make rt_sched_class, idle_sched_class static
  sched: optimize calc_delta_mine()
  sched: fix normalized sleeper
-rw-r--r--  Documentation/scheduler/sched-design.txt  | 165
-rw-r--r--  arch/x86/Kconfig                          |   1
-rw-r--r--  include/linux/sched.h                     |  38
-rw-r--r--  init/Kconfig                              |  11
-rw-r--r--  init/main.c                               |   1
-rw-r--r--  kernel/Makefile                           |   2
-rw-r--r--  kernel/sched.c                            | 323
-rw-r--r--  kernel/sched_clock.c                      | 236
-rw-r--r--  kernel/sched_debug.c                      |   7
-rw-r--r--  kernel/sched_fair.c                       |  39
-rw-r--r--  kernel/sched_idletask.c                   |   2
-rw-r--r--  kernel/sched_rt.c                         |   9
12 files changed, 428 insertions(+), 406 deletions(-)
diff --git a/Documentation/scheduler/sched-design.txt b/Documentation/scheduler/sched-design.txt
deleted file mode 100644
index 1605bf0cba8b..000000000000
--- a/Documentation/scheduler/sched-design.txt
+++ /dev/null
@@ -1,165 +0,0 @@
1 Goals, Design and Implementation of the
2 new ultra-scalable O(1) scheduler
3
4
5 This is an edited version of an email Ingo Molnar sent to
6 lkml on 4 Jan 2002. It describes the goals, design, and
7 implementation of Ingo's new ultra-scalable O(1) scheduler.
8 Last Updated: 18 April 2002.
9
10
11Goal
12====
13
14The main goal of the new scheduler is to keep all the good things we know
15and love about the current Linux scheduler:
16
17 - good interactive performance even during high load: if the user
18 types or clicks then the system must react instantly and must execute
19 the user tasks smoothly, even during considerable background load.
20
21 - good scheduling/wakeup performance with 1-2 runnable processes.
22
23 - fairness: no process should stay without any timeslice for any
24 unreasonable amount of time. No process should get an unjustly high
25 amount of CPU time.
26
27 - priorities: less important tasks can be started with lower priority,
28 more important tasks with higher priority.
29
30 - SMP efficiency: no CPU should stay idle if there is work to do.
31
32 - SMP affinity: processes which run on one CPU should stay affine to
33 that CPU. Processes should not bounce between CPUs too frequently.
34
35 - plus additional scheduler features: RT scheduling, CPU binding.
36
37and the goal is also to add a few new things:
38
39 - fully O(1) scheduling. Are you tired of the recalculation loop
40 blowing the L1 cache away every now and then? Do you think the goodness
41 loop is taking a bit too long to finish if there are lots of runnable
42 processes? This new scheduler takes no prisoners: wakeup(), schedule(),
43 the timer interrupt are all O(1) algorithms. There is no recalculation
44 loop. There is no goodness loop either.
45
46 - 'perfect' SMP scalability. With the new scheduler there is no 'big'
47 runqueue_lock anymore - it's all per-CPU runqueues and locks - two
48 tasks on two separate CPUs can wake up, schedule and context-switch
49 completely in parallel, without any interlocking. All
50 scheduling-relevant data is structured for maximum scalability.
51
52 - better SMP affinity. The old scheduler has a particular weakness that
53 causes the random bouncing of tasks between CPUs if/when higher
54 priority/interactive tasks, this was observed and reported by many
55 people. The reason is that the timeslice recalculation loop first needs
56 every currently running task to consume its timeslice. But when this
57 happens on eg. an 8-way system, then this property starves an
58 increasing number of CPUs from executing any process. Once the last
59 task that has a timeslice left has finished using up that timeslice,
60 the recalculation loop is triggered and other CPUs can start executing
61 tasks again - after having idled around for a number of timer ticks.
62 The more CPUs, the worse this effect.
63
64 Furthermore, this same effect causes the bouncing effect as well:
65 whenever there is such a 'timeslice squeeze' of the global runqueue,
66 idle processors start executing tasks which are not affine to that CPU.
67 (because the affine tasks have finished off their timeslices already.)
68
69 The new scheduler solves this problem by distributing timeslices on a
70 per-CPU basis, without having any global synchronization or
71 recalculation.
72
73 - batch scheduling. A significant proportion of computing-intensive tasks
74 benefit from batch-scheduling, where timeslices are long and processes
75 are roundrobin scheduled. The new scheduler does such batch-scheduling
76 of the lowest priority tasks - so nice +19 jobs will get
77 'batch-scheduled' automatically. With this scheduler, nice +19 jobs are
78 in essence SCHED_IDLE, from an interactiveness point of view.
79
80 - handle extreme loads more smoothly, without breakdown and scheduling
81 storms.
82
83 - O(1) RT scheduling. For those RT folks who are paranoid about the
84 O(nr_running) property of the goodness loop and the recalculation loop.
85
86 - run fork()ed children before the parent. Andrea has pointed out the
87 advantages of this a few months ago, but patches for this feature
88 do not work with the old scheduler as well as they should,
89 because idle processes often steal the new child before the fork()ing
90 CPU gets to execute it.
91
92
93Design
94======
95
96The core of the new scheduler contains the following mechanisms:
97
98 - *two* priority-ordered 'priority arrays' per CPU. There is an 'active'
99 array and an 'expired' array. The active array contains all tasks that
100 are affine to this CPU and have timeslices left. The expired array
101 contains all tasks which have used up their timeslices - but this array
102 is kept sorted as well. The active and expired array is not accessed
103 directly, it's accessed through two pointers in the per-CPU runqueue
104 structure. If all active tasks are used up then we 'switch' the two
105 pointers and from now on the ready-to-go (former-) expired array is the
106 active array - and the empty active array serves as the new collector
107 for expired tasks.
108
109 - there is a 64-bit bitmap cache for array indices. Finding the highest
110 priority task is thus a matter of two x86 BSFL bit-search instructions.
111
112the split-array solution enables us to have an arbitrary number of active
113and expired tasks, and the recalculation of timeslices can be done
114immediately when the timeslice expires. Because the arrays are always
115access through the pointers in the runqueue, switching the two arrays can
116be done very quickly.
117
118this is a hybride priority-list approach coupled with roundrobin
119scheduling and the array-switch method of distributing timeslices.
120
121 - there is a per-task 'load estimator'.
122
123one of the toughest things to get right is good interactive feel during
124heavy system load. While playing with various scheduler variants i found
125that the best interactive feel is achieved not by 'boosting' interactive
126tasks, but by 'punishing' tasks that want to use more CPU time than there
127is available. This method is also much easier to do in an O(1) fashion.
128
129to establish the actual 'load' the task contributes to the system, a
130complex-looking but pretty accurate method is used: there is a 4-entry
131'history' ringbuffer of the task's activities during the last 4 seconds.
132This ringbuffer is operated without much overhead. The entries tell the
133scheduler a pretty accurate load-history of the task: has it used up more
134CPU time or less during the past N seconds. [the size '4' and the interval
135of 4x 1 seconds was found by lots of experimentation - this part is
136flexible and can be changed in both directions.]
137
138the penalty a task gets for generating more load than the CPU can handle
139is a priority decrease - there is a maximum amount to this penalty
140relative to their static priority, so even fully CPU-bound tasks will
141observe each other's priorities, and will share the CPU accordingly.
142
143the SMP load-balancer can be extended/switched with additional parallel
144computing and cache hierarchy concepts: NUMA scheduling, multi-core CPUs
145can be supported easily by changing the load-balancer. Right now it's
146tuned for my SMP systems.
147
148i skipped the prev->mm == next->mm advantage - no workload i know of shows
149any sensitivity to this. It can be added back by sacrificing O(1)
150schedule() [the current and one-lower priority list can be searched for a
151that->mm == current->mm condition], but costs a fair number of cycles
152during a number of important workloads, so i wanted to avoid this as much
153as possible.
154
155- the SMP idle-task startup code was still racy and the new scheduler
156triggered this. So i streamlined the idle-setup code a bit. We do not call
157into schedule() before all processors have started up fully and all idle
158threads are in place.
159
160- the patch also cleans up a number of aspects of sched.c - moves code
161into other areas of the kernel where it's appropriate, and simplifies
162certain code paths and data constructs. As a result, the new scheduler's
163code is smaller than the old one.
164
165 Ingo
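The removed document above centres on the active/expired priority arrays and the
64-bit bitmap lookup. Purely as a rough, standalone illustration of that idea
(simplified types and priority range, not the kernel's actual runqueue code), the
two-array switch and the O(1) bit search could be sketched like this:

/*
 * Userspace sketch of the two-array + bitmap design described above.
 * NOT the kernel's code: the task lists are replaced by simple counters.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAX_PRIO 64                    /* one bit per priority level */

struct prio_array {
	uint64_t bitmap;               /* bit set => queue[prio] is non-empty */
	int nr_tasks[MAX_PRIO];        /* stand-in for per-priority run lists */
};

struct runqueue {
	struct prio_array arrays[2];
	struct prio_array *active, *expired;
};

static void rq_init(struct runqueue *rq)
{
	memset(rq, 0, sizeof(*rq));
	rq->active = &rq->arrays[0];
	rq->expired = &rq->arrays[1];
}

static void enqueue(struct prio_array *a, int prio)
{
	a->nr_tasks[prio]++;
	a->bitmap |= 1ULL << prio;
}

/* O(1): find the highest-priority non-empty queue with one bit search */
static int pick_next_prio(struct runqueue *rq)
{
	if (!rq->active->bitmap) {
		/* all active tasks used their timeslices: swap the arrays */
		struct prio_array *tmp = rq->active;
		rq->active = rq->expired;
		rq->expired = tmp;
	}
	if (!rq->active->bitmap)
		return -1;                              /* nothing runnable */
	/* lowest set bit = highest priority (lower number = more important) */
	return __builtin_ctzll(rq->active->bitmap);
}

int main(void)
{
	struct runqueue rq;

	rq_init(&rq);
	enqueue(rq.active, 20);
	enqueue(rq.active, 5);
	enqueue(rq.expired, 3);   /* already used its timeslice */

	printf("next prio: %d\n", pick_next_prio(&rq)); /* prints 5 */
	return 0;
}

The point of the sketch is that picking the next task never walks a task list:
a single bit-search instruction finds the highest-priority non-empty queue, and
exhausting the active array costs only a pointer swap.
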
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 845ea2b2d487..bbcafaa160c0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -18,6 +18,7 @@ config X86_64
18### Arch settings 18### Arch settings
19config X86 19config X86
20 def_bool y 20 def_bool y
21 select HAVE_UNSTABLE_SCHED_CLOCK
21 select HAVE_IDE 22 select HAVE_IDE
22 select HAVE_OPROFILE 23 select HAVE_OPROFILE
23 select HAVE_KPROBES 24 select HAVE_KPROBES
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 03c238088aee..0c35b0343a76 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -158,6 +158,8 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
158} 158}
159#endif 159#endif
160 160
161extern unsigned long long time_sync_thresh;
162
161/* 163/*
162 * Task state bitmask. NOTE! These bits are also 164 * Task state bitmask. NOTE! These bits are also
163 * encoded in fs/proc/array.c: get_task_state(). 165 * encoded in fs/proc/array.c: get_task_state().
@@ -1551,6 +1553,35 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
1551 1553
1552extern unsigned long long sched_clock(void); 1554extern unsigned long long sched_clock(void);
1553 1555
1556#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
1557static inline void sched_clock_init(void)
1558{
1559}
1560
1561static inline u64 sched_clock_cpu(int cpu)
1562{
1563 return sched_clock();
1564}
1565
1566static inline void sched_clock_tick(void)
1567{
1568}
1569
1570static inline void sched_clock_idle_sleep_event(void)
1571{
1572}
1573
1574static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
1575{
1576}
1577#else
1578extern void sched_clock_init(void);
1579extern u64 sched_clock_cpu(int cpu);
1580extern void sched_clock_tick(void);
1581extern void sched_clock_idle_sleep_event(void);
1582extern void sched_clock_idle_wakeup_event(u64 delta_ns);
1583#endif
1584
1554/* 1585/*
1555 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu 1586 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
1556 * clock constructed from sched_clock(): 1587 * clock constructed from sched_clock():
@@ -1977,6 +2008,11 @@ static inline void clear_tsk_need_resched(struct task_struct *tsk)
1977 clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); 2008 clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
1978} 2009}
1979 2010
2011static inline int test_tsk_need_resched(struct task_struct *tsk)
2012{
2013 return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
2014}
2015
1980static inline int signal_pending(struct task_struct *p) 2016static inline int signal_pending(struct task_struct *p)
1981{ 2017{
1982 return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); 2018 return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
@@ -1991,7 +2027,7 @@ static inline int fatal_signal_pending(struct task_struct *p)
1991 2027
1992static inline int need_resched(void) 2028static inline int need_resched(void)
1993{ 2029{
1994 return unlikely(test_thread_flag(TIF_NEED_RESCHED)); 2030 return unlikely(test_tsk_need_resched(current));
1995} 2031}
1996 2032
1997/* 2033/*
diff --git a/init/Kconfig b/init/Kconfig
index f0e62e5ce0dc..4c33316743f5 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -316,9 +316,16 @@ config CPUSETS
316 316
317 Say N if unsure. 317 Say N if unsure.
318 318
319#
320# Architectures with an unreliable sched_clock() should select this:
321#
322config HAVE_UNSTABLE_SCHED_CLOCK
323 bool
324
319config GROUP_SCHED 325config GROUP_SCHED
320 bool "Group CPU scheduler" 326 bool "Group CPU scheduler"
321 default y 327 depends on EXPERIMENTAL
328 default n
322 help 329 help
323 This feature lets CPU scheduler recognize task groups and control CPU 330 This feature lets CPU scheduler recognize task groups and control CPU
324 bandwidth allocation to such task groups. 331 bandwidth allocation to such task groups.
@@ -326,7 +333,7 @@ config GROUP_SCHED
326config FAIR_GROUP_SCHED 333config FAIR_GROUP_SCHED
327 bool "Group scheduling for SCHED_OTHER" 334 bool "Group scheduling for SCHED_OTHER"
328 depends on GROUP_SCHED 335 depends on GROUP_SCHED
329 default y 336 default GROUP_SCHED
330 337
331config RT_GROUP_SCHED 338config RT_GROUP_SCHED
332 bool "Group scheduling for SCHED_RR/FIFO" 339 bool "Group scheduling for SCHED_RR/FIFO"
diff --git a/init/main.c b/init/main.c
index a87d4ca5c36c..ddada7acf363 100644
--- a/init/main.c
+++ b/init/main.c
@@ -602,6 +602,7 @@ asmlinkage void __init start_kernel(void)
602 softirq_init(); 602 softirq_init();
603 timekeeping_init(); 603 timekeeping_init();
604 time_init(); 604 time_init();
605 sched_clock_init();
605 profile_init(); 606 profile_init();
606 if (!irqs_disabled()) 607 if (!irqs_disabled())
607 printk("start_kernel(): bug: interrupts were enabled early\n"); 608 printk("start_kernel(): bug: interrupts were enabled early\n");
diff --git a/kernel/Makefile b/kernel/Makefile
index 188c43223f52..1c9938addb9d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o
13 13
14obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o 14obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
15obj-$(CONFIG_STACKTRACE) += stacktrace.o 15obj-$(CONFIG_STACKTRACE) += stacktrace.o
diff --git a/kernel/sched.c b/kernel/sched.c
index 34bcc5bc120e..58fb8af15776 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,16 +75,6 @@
75#include <asm/irq_regs.h> 75#include <asm/irq_regs.h>
76 76
77/* 77/*
78 * Scheduler clock - returns current time in nanosec units.
79 * This is default implementation.
80 * Architectures and sub-architectures can override this.
81 */
82unsigned long long __attribute__((weak)) sched_clock(void)
83{
84 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
85}
86
87/*
88 * Convert user-nice values [ -20 ... 0 ... 19 ] 78 * Convert user-nice values [ -20 ... 0 ... 19 ]
89 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 79 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
90 * and back. 80 * and back.
@@ -242,6 +232,12 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
242} 232}
243#endif 233#endif
244 234
235/*
236 * sched_domains_mutex serializes calls to arch_init_sched_domains,
237 * detach_destroy_domains and partition_sched_domains.
238 */
239static DEFINE_MUTEX(sched_domains_mutex);
240
245#ifdef CONFIG_GROUP_SCHED 241#ifdef CONFIG_GROUP_SCHED
246 242
247#include <linux/cgroup.h> 243#include <linux/cgroup.h>
@@ -308,9 +304,6 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
308 */ 304 */
309static DEFINE_SPINLOCK(task_group_lock); 305static DEFINE_SPINLOCK(task_group_lock);
310 306
311/* doms_cur_mutex serializes access to doms_cur[] array */
312static DEFINE_MUTEX(doms_cur_mutex);
313
314#ifdef CONFIG_FAIR_GROUP_SCHED 307#ifdef CONFIG_FAIR_GROUP_SCHED
315#ifdef CONFIG_USER_SCHED 308#ifdef CONFIG_USER_SCHED
316# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 309# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
@@ -318,7 +311,13 @@ static DEFINE_MUTEX(doms_cur_mutex);
318# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 311# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
319#endif 312#endif
320 313
314/*
315 * A weight of 0, 1 or ULONG_MAX can cause arithmetics problems.
316 * (The default weight is 1024 - so there's no practical
317 * limitation from this.)
318 */
321#define MIN_SHARES 2 319#define MIN_SHARES 2
320#define MAX_SHARES (ULONG_MAX - 1)
322 321
323static int init_task_group_load = INIT_TASK_GROUP_LOAD; 322static int init_task_group_load = INIT_TASK_GROUP_LOAD;
324#endif 323#endif
@@ -358,21 +357,9 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
358#endif 357#endif
359} 358}
360 359
361static inline void lock_doms_cur(void)
362{
363 mutex_lock(&doms_cur_mutex);
364}
365
366static inline void unlock_doms_cur(void)
367{
368 mutex_unlock(&doms_cur_mutex);
369}
370
371#else 360#else
372 361
373static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 362static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
374static inline void lock_doms_cur(void) { }
375static inline void unlock_doms_cur(void) { }
376 363
377#endif /* CONFIG_GROUP_SCHED */ 364#endif /* CONFIG_GROUP_SCHED */
378 365
@@ -560,13 +547,7 @@ struct rq {
560 unsigned long next_balance; 547 unsigned long next_balance;
561 struct mm_struct *prev_mm; 548 struct mm_struct *prev_mm;
562 549
563 u64 clock, prev_clock_raw; 550 u64 clock;
564 s64 clock_max_delta;
565
566 unsigned int clock_warps, clock_overflows, clock_underflows;
567 u64 idle_clock;
568 unsigned int clock_deep_idle_events;
569 u64 tick_timestamp;
570 551
571 atomic_t nr_iowait; 552 atomic_t nr_iowait;
572 553
@@ -631,82 +612,6 @@ static inline int cpu_of(struct rq *rq)
631#endif 612#endif
632} 613}
633 614
634#ifdef CONFIG_NO_HZ
635static inline bool nohz_on(int cpu)
636{
637 return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE;
638}
639
640static inline u64 max_skipped_ticks(struct rq *rq)
641{
642 return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1;
643}
644
645static inline void update_last_tick_seen(struct rq *rq)
646{
647 rq->last_tick_seen = jiffies;
648}
649#else
650static inline u64 max_skipped_ticks(struct rq *rq)
651{
652 return 1;
653}
654
655static inline void update_last_tick_seen(struct rq *rq)
656{
657}
658#endif
659
660/*
661 * Update the per-runqueue clock, as finegrained as the platform can give
662 * us, but without assuming monotonicity, etc.:
663 */
664static void __update_rq_clock(struct rq *rq)
665{
666 u64 prev_raw = rq->prev_clock_raw;
667 u64 now = sched_clock();
668 s64 delta = now - prev_raw;
669 u64 clock = rq->clock;
670
671#ifdef CONFIG_SCHED_DEBUG
672 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
673#endif
674 /*
675 * Protect against sched_clock() occasionally going backwards:
676 */
677 if (unlikely(delta < 0)) {
678 clock++;
679 rq->clock_warps++;
680 } else {
681 /*
682 * Catch too large forward jumps too:
683 */
684 u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC;
685 u64 max_time = rq->tick_timestamp + max_jump;
686
687 if (unlikely(clock + delta > max_time)) {
688 if (clock < max_time)
689 clock = max_time;
690 else
691 clock++;
692 rq->clock_overflows++;
693 } else {
694 if (unlikely(delta > rq->clock_max_delta))
695 rq->clock_max_delta = delta;
696 clock += delta;
697 }
698 }
699
700 rq->prev_clock_raw = now;
701 rq->clock = clock;
702}
703
704static void update_rq_clock(struct rq *rq)
705{
706 if (likely(smp_processor_id() == cpu_of(rq)))
707 __update_rq_clock(rq);
708}
709
710/* 615/*
711 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 616 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
712 * See detach_destroy_domains: synchronize_sched for details. 617 * See detach_destroy_domains: synchronize_sched for details.
@@ -722,6 +627,11 @@ static void update_rq_clock(struct rq *rq)
722#define task_rq(p) cpu_rq(task_cpu(p)) 627#define task_rq(p) cpu_rq(task_cpu(p))
723#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 628#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
724 629
630static inline void update_rq_clock(struct rq *rq)
631{
632 rq->clock = sched_clock_cpu(cpu_of(rq));
633}
634
725/* 635/*
726 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 636 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
727 */ 637 */
@@ -757,14 +667,14 @@ const_debug unsigned int sysctl_sched_features =
757#define SCHED_FEAT(name, enabled) \ 667#define SCHED_FEAT(name, enabled) \
758 #name , 668 #name ,
759 669
760__read_mostly char *sched_feat_names[] = { 670static __read_mostly char *sched_feat_names[] = {
761#include "sched_features.h" 671#include "sched_features.h"
762 NULL 672 NULL
763}; 673};
764 674
765#undef SCHED_FEAT 675#undef SCHED_FEAT
766 676
767int sched_feat_open(struct inode *inode, struct file *filp) 677static int sched_feat_open(struct inode *inode, struct file *filp)
768{ 678{
769 filp->private_data = inode->i_private; 679 filp->private_data = inode->i_private;
770 return 0; 680 return 0;
@@ -899,7 +809,7 @@ static inline u64 global_rt_runtime(void)
899 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 809 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
900} 810}
901 811
902static const unsigned long long time_sync_thresh = 100000; 812unsigned long long time_sync_thresh = 100000;
903 813
904static DEFINE_PER_CPU(unsigned long long, time_offset); 814static DEFINE_PER_CPU(unsigned long long, time_offset);
905static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); 815static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
@@ -913,11 +823,14 @@ static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
913static DEFINE_SPINLOCK(time_sync_lock); 823static DEFINE_SPINLOCK(time_sync_lock);
914static unsigned long long prev_global_time; 824static unsigned long long prev_global_time;
915 825
916static unsigned long long __sync_cpu_clock(cycles_t time, int cpu) 826static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
917{ 827{
918 unsigned long flags; 828 /*
919 829 * We want this inlined, to not get tracer function calls
920 spin_lock_irqsave(&time_sync_lock, flags); 830 * in this critical section:
831 */
832 spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
833 __raw_spin_lock(&time_sync_lock.raw_lock);
921 834
922 if (time < prev_global_time) { 835 if (time < prev_global_time) {
923 per_cpu(time_offset, cpu) += prev_global_time - time; 836 per_cpu(time_offset, cpu) += prev_global_time - time;
@@ -926,7 +839,8 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
926 prev_global_time = time; 839 prev_global_time = time;
927 } 840 }
928 841
929 spin_unlock_irqrestore(&time_sync_lock, flags); 842 __raw_spin_unlock(&time_sync_lock.raw_lock);
843 spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
930 844
931 return time; 845 return time;
932} 846}
@@ -934,8 +848,6 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
934static unsigned long long __cpu_clock(int cpu) 848static unsigned long long __cpu_clock(int cpu)
935{ 849{
936 unsigned long long now; 850 unsigned long long now;
937 unsigned long flags;
938 struct rq *rq;
939 851
940 /* 852 /*
941 * Only call sched_clock() if the scheduler has already been 853 * Only call sched_clock() if the scheduler has already been
@@ -944,11 +856,7 @@ static unsigned long long __cpu_clock(int cpu)
944 if (unlikely(!scheduler_running)) 856 if (unlikely(!scheduler_running))
945 return 0; 857 return 0;
946 858
947 local_irq_save(flags); 859 now = sched_clock_cpu(cpu);
948 rq = cpu_rq(cpu);
949 update_rq_clock(rq);
950 now = rq->clock;
951 local_irq_restore(flags);
952 860
953 return now; 861 return now;
954} 862}
@@ -960,13 +868,18 @@ static unsigned long long __cpu_clock(int cpu)
960unsigned long long cpu_clock(int cpu) 868unsigned long long cpu_clock(int cpu)
961{ 869{
962 unsigned long long prev_cpu_time, time, delta_time; 870 unsigned long long prev_cpu_time, time, delta_time;
871 unsigned long flags;
963 872
873 local_irq_save(flags);
964 prev_cpu_time = per_cpu(prev_cpu_time, cpu); 874 prev_cpu_time = per_cpu(prev_cpu_time, cpu);
965 time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); 875 time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
966 delta_time = time-prev_cpu_time; 876 delta_time = time-prev_cpu_time;
967 877
968 if (unlikely(delta_time > time_sync_thresh)) 878 if (unlikely(delta_time > time_sync_thresh)) {
969 time = __sync_cpu_clock(time, cpu); 879 time = __sync_cpu_clock(time, cpu);
880 per_cpu(prev_cpu_time, cpu) = time;
881 }
882 local_irq_restore(flags);
970 883
971 return time; 884 return time;
972} 885}
@@ -1117,43 +1030,6 @@ static struct rq *this_rq_lock(void)
1117 return rq; 1030 return rq;
1118} 1031}
1119 1032
1120/*
1121 * We are going deep-idle (irqs are disabled):
1122 */
1123void sched_clock_idle_sleep_event(void)
1124{
1125 struct rq *rq = cpu_rq(smp_processor_id());
1126
1127 spin_lock(&rq->lock);
1128 __update_rq_clock(rq);
1129 spin_unlock(&rq->lock);
1130 rq->clock_deep_idle_events++;
1131}
1132EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
1133
1134/*
1135 * We just idled delta nanoseconds (called with irqs disabled):
1136 */
1137void sched_clock_idle_wakeup_event(u64 delta_ns)
1138{
1139 struct rq *rq = cpu_rq(smp_processor_id());
1140 u64 now = sched_clock();
1141
1142 rq->idle_clock += delta_ns;
1143 /*
1144 * Override the previous timestamp and ignore all
1145 * sched_clock() deltas that occured while we idled,
1146 * and use the PM-provided delta_ns to advance the
1147 * rq clock:
1148 */
1149 spin_lock(&rq->lock);
1150 rq->prev_clock_raw = now;
1151 rq->clock += delta_ns;
1152 spin_unlock(&rq->lock);
1153 touch_softlockup_watchdog();
1154}
1155EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
1156
1157static void __resched_task(struct task_struct *p, int tif_bit); 1033static void __resched_task(struct task_struct *p, int tif_bit);
1158 1034
1159static inline void resched_task(struct task_struct *p) 1035static inline void resched_task(struct task_struct *p)
@@ -1189,6 +1065,7 @@ static inline void resched_rq(struct rq *rq)
1189enum { 1065enum {
1190 HRTICK_SET, /* re-programm hrtick_timer */ 1066 HRTICK_SET, /* re-programm hrtick_timer */
1191 HRTICK_RESET, /* not a new slice */ 1067 HRTICK_RESET, /* not a new slice */
1068 HRTICK_BLOCK, /* stop hrtick operations */
1192}; 1069};
1193 1070
1194/* 1071/*
@@ -1200,6 +1077,8 @@ static inline int hrtick_enabled(struct rq *rq)
1200{ 1077{
1201 if (!sched_feat(HRTICK)) 1078 if (!sched_feat(HRTICK))
1202 return 0; 1079 return 0;
1080 if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags)))
1081 return 0;
1203 return hrtimer_is_hres_active(&rq->hrtick_timer); 1082 return hrtimer_is_hres_active(&rq->hrtick_timer);
1204} 1083}
1205 1084
@@ -1275,14 +1154,70 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
1275 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 1154 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1276 1155
1277 spin_lock(&rq->lock); 1156 spin_lock(&rq->lock);
1278 __update_rq_clock(rq); 1157 update_rq_clock(rq);
1279 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 1158 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1280 spin_unlock(&rq->lock); 1159 spin_unlock(&rq->lock);
1281 1160
1282 return HRTIMER_NORESTART; 1161 return HRTIMER_NORESTART;
1283} 1162}
1284 1163
1285static inline void init_rq_hrtick(struct rq *rq) 1164static void hotplug_hrtick_disable(int cpu)
1165{
1166 struct rq *rq = cpu_rq(cpu);
1167 unsigned long flags;
1168
1169 spin_lock_irqsave(&rq->lock, flags);
1170 rq->hrtick_flags = 0;
1171 __set_bit(HRTICK_BLOCK, &rq->hrtick_flags);
1172 spin_unlock_irqrestore(&rq->lock, flags);
1173
1174 hrtick_clear(rq);
1175}
1176
1177static void hotplug_hrtick_enable(int cpu)
1178{
1179 struct rq *rq = cpu_rq(cpu);
1180 unsigned long flags;
1181
1182 spin_lock_irqsave(&rq->lock, flags);
1183 __clear_bit(HRTICK_BLOCK, &rq->hrtick_flags);
1184 spin_unlock_irqrestore(&rq->lock, flags);
1185}
1186
1187static int
1188hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1189{
1190 int cpu = (int)(long)hcpu;
1191
1192 switch (action) {
1193 case CPU_UP_CANCELED:
1194 case CPU_UP_CANCELED_FROZEN:
1195 case CPU_DOWN_PREPARE:
1196 case CPU_DOWN_PREPARE_FROZEN:
1197 case CPU_DEAD:
1198 case CPU_DEAD_FROZEN:
1199 hotplug_hrtick_disable(cpu);
1200 return NOTIFY_OK;
1201
1202 case CPU_UP_PREPARE:
1203 case CPU_UP_PREPARE_FROZEN:
1204 case CPU_DOWN_FAILED:
1205 case CPU_DOWN_FAILED_FROZEN:
1206 case CPU_ONLINE:
1207 case CPU_ONLINE_FROZEN:
1208 hotplug_hrtick_enable(cpu);
1209 return NOTIFY_OK;
1210 }
1211
1212 return NOTIFY_DONE;
1213}
1214
1215static void init_hrtick(void)
1216{
1217 hotcpu_notifier(hotplug_hrtick, 0);
1218}
1219
1220static void init_rq_hrtick(struct rq *rq)
1286{ 1221{
1287 rq->hrtick_flags = 0; 1222 rq->hrtick_flags = 0;
1288 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1223 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -1319,6 +1254,10 @@ static inline void init_rq_hrtick(struct rq *rq)
1319void hrtick_resched(void) 1254void hrtick_resched(void)
1320{ 1255{
1321} 1256}
1257
1258static inline void init_hrtick(void)
1259{
1260}
1322#endif 1261#endif
1323 1262
1324/* 1263/*
@@ -1438,8 +1377,8 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1438{ 1377{
1439 u64 tmp; 1378 u64 tmp;
1440 1379
1441 if (unlikely(!lw->inv_weight)) 1380 if (!lw->inv_weight)
1442 lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1); 1381 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)/(lw->weight+1);
1443 1382
1444 tmp = (u64)delta_exec * weight; 1383 tmp = (u64)delta_exec * weight;
1445 /* 1384 /*
@@ -1748,6 +1687,8 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
1748 1687
1749 if (shares < MIN_SHARES) 1688 if (shares < MIN_SHARES)
1750 shares = MIN_SHARES; 1689 shares = MIN_SHARES;
1690 else if (shares > MAX_SHARES)
1691 shares = MAX_SHARES;
1751 1692
1752 __set_se_shares(tg->se[tcpu], shares); 1693 __set_se_shares(tg->se[tcpu], shares);
1753} 1694}
@@ -4339,8 +4280,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
4339 struct rq *rq = this_rq(); 4280 struct rq *rq = this_rq();
4340 cputime64_t tmp; 4281 cputime64_t tmp;
4341 4282
4342 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) 4283 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
4343 return account_guest_time(p, cputime); 4284 account_guest_time(p, cputime);
4285 return;
4286 }
4344 4287
4345 p->stime = cputime_add(p->stime, cputime); 4288 p->stime = cputime_add(p->stime, cputime);
4346 4289
@@ -4404,19 +4347,11 @@ void scheduler_tick(void)
4404 int cpu = smp_processor_id(); 4347 int cpu = smp_processor_id();
4405 struct rq *rq = cpu_rq(cpu); 4348 struct rq *rq = cpu_rq(cpu);
4406 struct task_struct *curr = rq->curr; 4349 struct task_struct *curr = rq->curr;
4407 u64 next_tick = rq->tick_timestamp + TICK_NSEC; 4350
4351 sched_clock_tick();
4408 4352
4409 spin_lock(&rq->lock); 4353 spin_lock(&rq->lock);
4410 __update_rq_clock(rq); 4354 update_rq_clock(rq);
4411 /*
4412 * Let rq->clock advance by at least TICK_NSEC:
4413 */
4414 if (unlikely(rq->clock < next_tick)) {
4415 rq->clock = next_tick;
4416 rq->clock_underflows++;
4417 }
4418 rq->tick_timestamp = rq->clock;
4419 update_last_tick_seen(rq);
4420 update_cpu_load(rq); 4355 update_cpu_load(rq);
4421 curr->sched_class->task_tick(rq, curr, 0); 4356 curr->sched_class->task_tick(rq, curr, 0);
4422 spin_unlock(&rq->lock); 4357 spin_unlock(&rq->lock);
@@ -4570,7 +4505,7 @@ need_resched_nonpreemptible:
4570 * Do the rq-clock update outside the rq lock: 4505 * Do the rq-clock update outside the rq lock:
4571 */ 4506 */
4572 local_irq_disable(); 4507 local_irq_disable();
4573 __update_rq_clock(rq); 4508 update_rq_clock(rq);
4574 spin_lock(&rq->lock); 4509 spin_lock(&rq->lock);
4575 clear_tsk_need_resched(prev); 4510 clear_tsk_need_resched(prev);
4576 4511
@@ -4595,9 +4530,9 @@ need_resched_nonpreemptible:
4595 prev->sched_class->put_prev_task(rq, prev); 4530 prev->sched_class->put_prev_task(rq, prev);
4596 next = pick_next_task(rq, prev); 4531 next = pick_next_task(rq, prev);
4597 4532
4598 sched_info_switch(prev, next);
4599
4600 if (likely(prev != next)) { 4533 if (likely(prev != next)) {
4534 sched_info_switch(prev, next);
4535
4601 rq->nr_switches++; 4536 rq->nr_switches++;
4602 rq->curr = next; 4537 rq->curr = next;
4603 ++*switch_count; 4538 ++*switch_count;
@@ -7755,7 +7690,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7755{ 7690{
7756 int i, j; 7691 int i, j;
7757 7692
7758 lock_doms_cur(); 7693 mutex_lock(&sched_domains_mutex);
7759 7694
7760 /* always unregister in case we don't destroy any domains */ 7695 /* always unregister in case we don't destroy any domains */
7761 unregister_sched_domain_sysctl(); 7696 unregister_sched_domain_sysctl();
@@ -7804,7 +7739,7 @@ match2:
7804 7739
7805 register_sched_domain_sysctl(); 7740 register_sched_domain_sysctl();
7806 7741
7807 unlock_doms_cur(); 7742 mutex_unlock(&sched_domains_mutex);
7808} 7743}
7809 7744
7810#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7745#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -7813,8 +7748,10 @@ int arch_reinit_sched_domains(void)
7813 int err; 7748 int err;
7814 7749
7815 get_online_cpus(); 7750 get_online_cpus();
7751 mutex_lock(&sched_domains_mutex);
7816 detach_destroy_domains(&cpu_online_map); 7752 detach_destroy_domains(&cpu_online_map);
7817 err = arch_init_sched_domains(&cpu_online_map); 7753 err = arch_init_sched_domains(&cpu_online_map);
7754 mutex_unlock(&sched_domains_mutex);
7818 put_online_cpus(); 7755 put_online_cpus();
7819 7756
7820 return err; 7757 return err;
@@ -7932,13 +7869,16 @@ void __init sched_init_smp(void)
7932 BUG_ON(sched_group_nodes_bycpu == NULL); 7869 BUG_ON(sched_group_nodes_bycpu == NULL);
7933#endif 7870#endif
7934 get_online_cpus(); 7871 get_online_cpus();
7872 mutex_lock(&sched_domains_mutex);
7935 arch_init_sched_domains(&cpu_online_map); 7873 arch_init_sched_domains(&cpu_online_map);
7936 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 7874 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
7937 if (cpus_empty(non_isolated_cpus)) 7875 if (cpus_empty(non_isolated_cpus))
7938 cpu_set(smp_processor_id(), non_isolated_cpus); 7876 cpu_set(smp_processor_id(), non_isolated_cpus);
7877 mutex_unlock(&sched_domains_mutex);
7939 put_online_cpus(); 7878 put_online_cpus();
7940 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7879 /* XXX: Theoretical race here - CPU may be hotplugged now */
7941 hotcpu_notifier(update_sched_domains, 0); 7880 hotcpu_notifier(update_sched_domains, 0);
7881 init_hrtick();
7942 7882
7943 /* Move init over to a non-isolated CPU */ 7883 /* Move init over to a non-isolated CPU */
7944 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) 7884 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
@@ -8025,7 +7965,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8025 7965
8026 se->my_q = cfs_rq; 7966 se->my_q = cfs_rq;
8027 se->load.weight = tg->shares; 7967 se->load.weight = tg->shares;
8028 se->load.inv_weight = div64_u64(1ULL<<32, se->load.weight); 7968 se->load.inv_weight = 0;
8029 se->parent = parent; 7969 se->parent = parent;
8030} 7970}
8031#endif 7971#endif
@@ -8149,8 +8089,6 @@ void __init sched_init(void)
8149 spin_lock_init(&rq->lock); 8089 spin_lock_init(&rq->lock);
8150 lockdep_set_class(&rq->lock, &rq->rq_lock_key); 8090 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
8151 rq->nr_running = 0; 8091 rq->nr_running = 0;
8152 rq->clock = 1;
8153 update_last_tick_seen(rq);
8154 init_cfs_rq(&rq->cfs, rq); 8092 init_cfs_rq(&rq->cfs, rq);
8155 init_rt_rq(&rq->rt, rq); 8093 init_rt_rq(&rq->rt, rq);
8156#ifdef CONFIG_FAIR_GROUP_SCHED 8094#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8294,6 +8232,7 @@ EXPORT_SYMBOL(__might_sleep);
8294static void normalize_task(struct rq *rq, struct task_struct *p) 8232static void normalize_task(struct rq *rq, struct task_struct *p)
8295{ 8233{
8296 int on_rq; 8234 int on_rq;
8235
8297 update_rq_clock(rq); 8236 update_rq_clock(rq);
8298 on_rq = p->se.on_rq; 8237 on_rq = p->se.on_rq;
8299 if (on_rq) 8238 if (on_rq)
@@ -8325,7 +8264,6 @@ void normalize_rt_tasks(void)
8325 p->se.sleep_start = 0; 8264 p->se.sleep_start = 0;
8326 p->se.block_start = 0; 8265 p->se.block_start = 0;
8327#endif 8266#endif
8328 task_rq(p)->clock = 0;
8329 8267
8330 if (!rt_task(p)) { 8268 if (!rt_task(p)) {
8331 /* 8269 /*
@@ -8692,7 +8630,7 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8692 dequeue_entity(cfs_rq, se, 0); 8630 dequeue_entity(cfs_rq, se, 0);
8693 8631
8694 se->load.weight = shares; 8632 se->load.weight = shares;
8695 se->load.inv_weight = div64_u64((1ULL<<32), shares); 8633 se->load.inv_weight = 0;
8696 8634
8697 if (on_rq) 8635 if (on_rq)
8698 enqueue_entity(cfs_rq, se, 0); 8636 enqueue_entity(cfs_rq, se, 0);
@@ -8722,13 +8660,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8722 if (!tg->se[0]) 8660 if (!tg->se[0])
8723 return -EINVAL; 8661 return -EINVAL;
8724 8662
8725 /*
8726 * A weight of 0 or 1 can cause arithmetics problems.
8727 * (The default weight is 1024 - so there's no practical
8728 * limitation from this.)
8729 */
8730 if (shares < MIN_SHARES) 8663 if (shares < MIN_SHARES)
8731 shares = MIN_SHARES; 8664 shares = MIN_SHARES;
8665 else if (shares > MAX_SHARES)
8666 shares = MAX_SHARES;
8732 8667
8733 mutex_lock(&shares_mutex); 8668 mutex_lock(&shares_mutex);
8734 if (tg->shares == shares) 8669 if (tg->shares == shares)
@@ -8753,7 +8688,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8753 * force a rebalance 8688 * force a rebalance
8754 */ 8689 */
8755 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8690 cfs_rq_set_shares(tg->cfs_rq[i], 0);
8756 set_se_shares(tg->se[i], shares/nr_cpu_ids); 8691 set_se_shares(tg->se[i], shares);
8757 } 8692 }
8758 8693
8759 /* 8694 /*
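One small change buried in the sched.c hunk above is the calc_delta_mine() seed
for lw->inv_weight, which lets delta_exec * weight / lw->weight be approximated
by a multiply and a 32-bit shift instead of a divide. Below is a standalone
sketch of that reciprocal-multiply idea; 2^32 is used as the scaling constant
purely for illustration, and the kernel's own WMULT_* constants and overflow
handling differ:

/*
 * Reciprocal-multiply sketch: approximate (delta * weight) / total
 * as (delta * weight * inv) >> 32, where inv ~= 2^32 / total.
 * Illustration only - calc_delta_mine() in the kernel also guards
 * against 64-bit overflow and uses its own constants.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t make_inv(uint32_t total)
{
	/* mirrors the patched formula: 1 + (2^32 - total/2) / (total + 1) */
	return (uint32_t)(1 + (0x100000000ULL - total / 2) / (total + 1));
}

static uint64_t calc_delta(uint64_t delta, uint32_t weight, uint32_t total)
{
	uint32_t inv = make_inv(total);

	return (delta * weight * (uint64_t)inv) >> 32;
}

int main(void)
{
	/* e.g. 1,000,000 ns of runtime, entity weight 1024, queue weight 3072 */
	printf("approx: %llu\n",
	       (unsigned long long)calc_delta(1000000, 1024, 3072));
	printf("exact:  %llu\n",
	       (unsigned long long)(1000000ULL * 1024 / 3072));
	return 0;
}

Running it shows the shifted approximation landing within a fraction of a
percent of the exact division for typical CFS weights.
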
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
new file mode 100644
index 000000000000..9c597e37f7de
--- /dev/null
+++ b/kernel/sched_clock.c
@@ -0,0 +1,236 @@
1/*
2 * sched_clock for unstable cpu clocks
3 *
4 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
5 *
6 * Based on code by:
7 * Ingo Molnar <mingo@redhat.com>
8 * Guillaume Chazarain <guichaz@gmail.com>
9 *
10 * Create a semi stable clock from a mixture of other events, including:
11 * - gtod
12 * - jiffies
13 * - sched_clock()
14 * - explicit idle events
15 *
16 * We use gtod as base and the unstable clock deltas. The deltas are filtered,
17 * making it monotonic and keeping it within an expected window. This window
18 * is set up using jiffies.
19 *
20 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
21 * that is otherwise invisible (TSC gets stopped).
22 *
23 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
24 * consistent between cpus (never more than 1 jiffies difference).
25 */
26#include <linux/sched.h>
27#include <linux/percpu.h>
28#include <linux/spinlock.h>
29#include <linux/ktime.h>
30#include <linux/module.h>
31
32
33#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
34
35struct sched_clock_data {
36 /*
37 * Raw spinlock - this is a special case: this might be called
38 * from within instrumentation code so we dont want to do any
39 * instrumentation ourselves.
40 */
41 raw_spinlock_t lock;
42
43 unsigned long prev_jiffies;
44 u64 prev_raw;
45 u64 tick_raw;
46 u64 tick_gtod;
47 u64 clock;
48};
49
50static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
51
52static inline struct sched_clock_data *this_scd(void)
53{
54 return &__get_cpu_var(sched_clock_data);
55}
56
57static inline struct sched_clock_data *cpu_sdc(int cpu)
58{
59 return &per_cpu(sched_clock_data, cpu);
60}
61
62void sched_clock_init(void)
63{
64 u64 ktime_now = ktime_to_ns(ktime_get());
65 u64 now = 0;
66 int cpu;
67
68 for_each_possible_cpu(cpu) {
69 struct sched_clock_data *scd = cpu_sdc(cpu);
70
71 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
72 scd->prev_jiffies = jiffies;
73 scd->prev_raw = now;
74 scd->tick_raw = now;
75 scd->tick_gtod = ktime_now;
76 scd->clock = ktime_now;
77 }
78}
79
80/*
81 * update the percpu scd from the raw @now value
82 *
83 * - filter out backward motion
84 * - use jiffies to generate a min,max window to clip the raw values
85 */
86static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
87{
88 unsigned long now_jiffies = jiffies;
89 long delta_jiffies = now_jiffies - scd->prev_jiffies;
90 u64 clock = scd->clock;
91 u64 min_clock, max_clock;
92 s64 delta = now - scd->prev_raw;
93
94 WARN_ON_ONCE(!irqs_disabled());
95 min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;
96
97 if (unlikely(delta < 0)) {
98 clock++;
99 goto out;
100 }
101
102 max_clock = min_clock + TICK_NSEC;
103
104 if (unlikely(clock + delta > max_clock)) {
105 if (clock < max_clock)
106 clock = max_clock;
107 else
108 clock++;
109 } else {
110 clock += delta;
111 }
112
113 out:
114 if (unlikely(clock < min_clock))
115 clock = min_clock;
116
117 scd->prev_raw = now;
118 scd->prev_jiffies = now_jiffies;
119 scd->clock = clock;
120}
121
122static void lock_double_clock(struct sched_clock_data *data1,
123 struct sched_clock_data *data2)
124{
125 if (data1 < data2) {
126 __raw_spin_lock(&data1->lock);
127 __raw_spin_lock(&data2->lock);
128 } else {
129 __raw_spin_lock(&data2->lock);
130 __raw_spin_lock(&data1->lock);
131 }
132}
133
134u64 sched_clock_cpu(int cpu)
135{
136 struct sched_clock_data *scd = cpu_sdc(cpu);
137 u64 now, clock;
138
139 WARN_ON_ONCE(!irqs_disabled());
140 now = sched_clock();
141
142 if (cpu != raw_smp_processor_id()) {
143 /*
144 * in order to update a remote cpu's clock based on our
145 * unstable raw time rebase it against:
146 * tick_raw (offset between raw counters)
147 * tick_gotd (tick offset between cpus)
148 */
149 struct sched_clock_data *my_scd = this_scd();
150
151 lock_double_clock(scd, my_scd);
152
153 now -= my_scd->tick_raw;
154 now += scd->tick_raw;
155
156 now -= my_scd->tick_gtod;
157 now += scd->tick_gtod;
158
159 __raw_spin_unlock(&my_scd->lock);
160 } else {
161 __raw_spin_lock(&scd->lock);
162 }
163
164 __update_sched_clock(scd, now);
165 clock = scd->clock;
166
167 __raw_spin_unlock(&scd->lock);
168
169 return clock;
170}
171
172void sched_clock_tick(void)
173{
174 struct sched_clock_data *scd = this_scd();
175 u64 now, now_gtod;
176
177 WARN_ON_ONCE(!irqs_disabled());
178
179 now = sched_clock();
180 now_gtod = ktime_to_ns(ktime_get());
181
182 __raw_spin_lock(&scd->lock);
183 __update_sched_clock(scd, now);
184 /*
185 * update tick_gtod after __update_sched_clock() because that will
186 * already observe 1 new jiffy; adding a new tick_gtod to that would
187 * increase the clock 2 jiffies.
188 */
189 scd->tick_raw = now;
190 scd->tick_gtod = now_gtod;
191 __raw_spin_unlock(&scd->lock);
192}
193
194/*
195 * We are going deep-idle (irqs are disabled):
196 */
197void sched_clock_idle_sleep_event(void)
198{
199 sched_clock_cpu(smp_processor_id());
200}
201EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
202
203/*
204 * We just idled delta nanoseconds (called with irqs disabled):
205 */
206void sched_clock_idle_wakeup_event(u64 delta_ns)
207{
208 struct sched_clock_data *scd = this_scd();
209 u64 now = sched_clock();
210
211 /*
212 * Override the previous timestamp and ignore all
213 * sched_clock() deltas that occured while we idled,
214 * and use the PM-provided delta_ns to advance the
215 * rq clock:
216 */
217 __raw_spin_lock(&scd->lock);
218 scd->prev_raw = now;
219 scd->clock += delta_ns;
220 __raw_spin_unlock(&scd->lock);
221
222 touch_softlockup_watchdog();
223}
224EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
225
226#endif
227
228/*
229 * Scheduler clock - returns current time in nanosec units.
230 * This is default implementation.
231 * Architectures and sub-architectures can override this.
232 */
233unsigned long long __attribute__((weak)) sched_clock(void)
234{
235 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
236}
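The new kernel/sched_clock.c above builds a filtered per-cpu clock by clipping
raw sched_clock() deltas to a window derived from jiffies and gtod. As an
illustration of that clipping step only (userspace stand-ins for the types and
an assumed 1000 Hz tick; nothing here is the kernel's code), the core of the
filter could be sketched as:

/*
 * Sketch of the delta filter in __update_sched_clock(): a raw, possibly
 * unstable delta is folded into the clock but clipped to a
 * [min_clock, max_clock] window anchored at the last tick, so the result
 * stays monotonic and never runs more than about one tick ahead.
 */
#include <stdint.h>
#include <stdio.h>

#define TICK_NSEC 1000000ULL  /* assume a 1000 Hz tick for the example */

struct clock_state {
	uint64_t clock;            /* filtered clock */
	uint64_t prev_raw;         /* last raw sched_clock() sample */
	uint64_t tick_gtod;        /* gtod time at the last tick */
	unsigned long prev_jiffies;
};

static uint64_t update_clock(struct clock_state *s,
			     uint64_t raw_now, unsigned long jiffies_now)
{
	int64_t delta = (int64_t)(raw_now - s->prev_raw);
	uint64_t min_clock = s->tick_gtod +
			     (jiffies_now - s->prev_jiffies) * TICK_NSEC;
	uint64_t max_clock = min_clock + TICK_NSEC;
	uint64_t clock = s->clock;

	if (delta < 0)
		clock++;                       /* raw clock went backwards */
	else if (clock + delta > max_clock)
		clock = clock < max_clock ? max_clock : clock + 1;
	else
		clock += delta;

	if (clock < min_clock)                 /* never fall behind the tick */
		clock = min_clock;

	s->prev_raw = raw_now;
	s->prev_jiffies = jiffies_now;
	s->clock = clock;
	return clock;
}

int main(void)
{
	struct clock_state s = { .clock = 0, .prev_raw = 0,
				 .tick_gtod = 0, .prev_jiffies = 0 };

	/* a sane delta passes through, a huge TSC jump gets clipped */
	printf("%llu\n", (unsigned long long)update_clock(&s, 200000, 0));
	printf("%llu\n", (unsigned long long)update_clock(&s, 900000000, 0));
	return 0;
}

The second call in main() simulates a large TSC jump; the filter clamps the
clock to the end of the expected window instead of letting it leap forward.
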
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 6b4a12558e88..5f06118fbc31 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -204,13 +204,6 @@ static void print_cpu(struct seq_file *m, int cpu)
204 PN(next_balance); 204 PN(next_balance);
205 P(curr->pid); 205 P(curr->pid);
206 PN(clock); 206 PN(clock);
207 PN(idle_clock);
208 PN(prev_clock_raw);
209 P(clock_warps);
210 P(clock_overflows);
211 P(clock_underflows);
212 P(clock_deep_idle_events);
213 PN(clock_max_delta);
214 P(cpu_load[0]); 207 P(cpu_load[0]);
215 P(cpu_load[1]); 208 P(cpu_load[1]);
216 P(cpu_load[2]); 209 P(cpu_load[2]);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 89fa32b4edf2..c863663d204d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -682,6 +682,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
682 * Update run-time statistics of the 'current'. 682 * Update run-time statistics of the 'current'.
683 */ 683 */
684 update_curr(cfs_rq); 684 update_curr(cfs_rq);
685 account_entity_enqueue(cfs_rq, se);
685 686
686 if (wakeup) { 687 if (wakeup) {
687 place_entity(cfs_rq, se, 0); 688 place_entity(cfs_rq, se, 0);
@@ -692,7 +693,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
692 check_spread(cfs_rq, se); 693 check_spread(cfs_rq, se);
693 if (se != cfs_rq->curr) 694 if (se != cfs_rq->curr)
694 __enqueue_entity(cfs_rq, se); 695 __enqueue_entity(cfs_rq, se);
695 account_entity_enqueue(cfs_rq, se);
696} 696}
697 697
698static void update_avg(u64 *avg, u64 sample) 698static void update_avg(u64 *avg, u64 sample)
@@ -841,8 +841,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
841 * queued ticks are scheduled to match the slice, so don't bother 841 * queued ticks are scheduled to match the slice, so don't bother
842 * validating it and just reschedule. 842 * validating it and just reschedule.
843 */ 843 */
844 if (queued) 844 if (queued) {
845 return resched_task(rq_of(cfs_rq)->curr); 845 resched_task(rq_of(cfs_rq)->curr);
846 return;
847 }
846 /* 848 /*
847 * don't let the period tick interfere with the hrtick preemption 849 * don't let the period tick interfere with the hrtick preemption
848 */ 850 */
@@ -957,7 +959,7 @@ static void yield_task_fair(struct rq *rq)
957 return; 959 return;
958 960
959 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { 961 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
960 __update_rq_clock(rq); 962 update_rq_clock(rq);
961 /* 963 /*
962 * Update run-time statistics of the 'current'. 964 * Update run-time statistics of the 'current'.
963 */ 965 */
@@ -1007,7 +1009,7 @@ static int wake_idle(int cpu, struct task_struct *p)
1007 * sibling runqueue info. This will avoid the checks and cache miss 1009 * sibling runqueue info. This will avoid the checks and cache miss
1008 * penalities associated with that. 1010 * penalities associated with that.
1009 */ 1011 */
1010 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) 1012 if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
1011 return cpu; 1013 return cpu;
1012 1014
1013 for_each_domain(cpu, sd) { 1015 for_each_domain(cpu, sd) {
@@ -1611,30 +1613,6 @@ static const struct sched_class fair_sched_class = {
1611}; 1613};
1612 1614
1613#ifdef CONFIG_SCHED_DEBUG 1615#ifdef CONFIG_SCHED_DEBUG
1614static void
1615print_cfs_rq_tasks(struct seq_file *m, struct cfs_rq *cfs_rq, int depth)
1616{
1617 struct sched_entity *se;
1618
1619 if (!cfs_rq)
1620 return;
1621
1622 list_for_each_entry_rcu(se, &cfs_rq->tasks, group_node) {
1623 int i;
1624
1625 for (i = depth; i; i--)
1626 seq_puts(m, " ");
1627
1628 seq_printf(m, "%lu %s %lu\n",
1629 se->load.weight,
1630 entity_is_task(se) ? "T" : "G",
1631 calc_delta_weight(SCHED_LOAD_SCALE, se)
1632 );
1633 if (!entity_is_task(se))
1634 print_cfs_rq_tasks(m, group_cfs_rq(se), depth + 1);
1635 }
1636}
1637
1638static void print_cfs_stats(struct seq_file *m, int cpu) 1616static void print_cfs_stats(struct seq_file *m, int cpu)
1639{ 1617{
1640 struct cfs_rq *cfs_rq; 1618 struct cfs_rq *cfs_rq;
@@ -1642,9 +1620,6 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
1642 rcu_read_lock(); 1620 rcu_read_lock();
1643 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) 1621 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
1644 print_cfs_rq(m, cpu, cfs_rq); 1622 print_cfs_rq(m, cpu, cfs_rq);
1645
1646 seq_printf(m, "\nWeight tree:\n");
1647 print_cfs_rq_tasks(m, &cpu_rq(cpu)->cfs, 1);
1648 rcu_read_unlock(); 1623 rcu_read_unlock();
1649} 1624}
1650#endif 1625#endif
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 2bcafa375633..3a4f92dbbe66 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -99,7 +99,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
99/* 99/*
100 * Simple, special scheduling class for the per-CPU idle tasks: 100 * Simple, special scheduling class for the per-CPU idle tasks:
101 */ 101 */
102const struct sched_class idle_sched_class = { 102static const struct sched_class idle_sched_class = {
103 /* .next is NULL */ 103 /* .next is NULL */
104 /* no enqueue/yield_task for idle tasks */ 104 /* no enqueue/yield_task for idle tasks */
105 105
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index c2730a5a4f05..060e87b0cb1c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1098,11 +1098,14 @@ static void post_schedule_rt(struct rq *rq)
1098 } 1098 }
1099} 1099}
1100 1100
1101 1101/*
1102 * If we are not running and we are not going to reschedule soon, we should
1103 * try to push tasks away now
1104 */
1102static void task_wake_up_rt(struct rq *rq, struct task_struct *p) 1105static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
1103{ 1106{
1104 if (!task_running(rq, p) && 1107 if (!task_running(rq, p) &&
1105 (p->prio >= rq->rt.highest_prio) && 1108 !test_tsk_need_resched(rq->curr) &&
1106 rq->rt.overloaded) 1109 rq->rt.overloaded)
1107 push_rt_tasks(rq); 1110 push_rt_tasks(rq);
1108} 1111}
@@ -1309,7 +1312,7 @@ static void set_curr_task_rt(struct rq *rq)
1309 p->se.exec_start = rq->clock; 1312 p->se.exec_start = rq->clock;
1310} 1313}
1311 1314
1312const struct sched_class rt_sched_class = { 1315static const struct sched_class rt_sched_class = {
1313 .next = &fair_sched_class, 1316 .next = &fair_sched_class,
1314 .enqueue_task = enqueue_task_rt, 1317 .enqueue_task = enqueue_task_rt,
1315 .dequeue_task = dequeue_task_rt, 1318 .dequeue_task = dequeue_task_rt,