From ca94c442535a44d508c99a77e54f21a59f4fc462 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 15 Jun 2009 17:17:47 +0200 Subject: sched: Introduce SCHED_RESET_ON_FORK scheduling policy flag This patch introduces a new flag SCHED_RESET_ON_FORK which can be passed to the kernel via sched_setscheduler(), ORed in the policy parameter. If set this will make sure that when the process forks a) the scheduling priority is reset to DEFAULT_PRIO if it was higher and b) the scheduling policy is reset to SCHED_NORMAL if it was either SCHED_FIFO or SCHED_RR. Why have this? Currently, if a process is real-time scheduled this will 'leak' to all its child processes. For security reasons it is often (always?) a good idea to make sure that if a process acquires RT scheduling this is confined to this process and only this process. More specifically this makes the per-process resource limit RLIMIT_RTTIME useful for security purposes, because it makes it impossible to use a fork bomb to circumvent the per-process RLIMIT_RTTIME accounting. This feature is also useful for tools like 'renice' which can then change the nice level of a process without having this spill to all its child processes. Why expose this via sched_setscheduler() and not other syscalls such as prctl() or sched_setparam()? prctl() does not take a pid parameter. Due to that it would be impossible to modify this flag for other processes than the current one. The struct passed to sched_setparam() can unfortunately not be extended without breaking compatibility, since sched_setparam() lacks a size parameter. How to use this from userspace? In your RT program simply replace this: sched_setscheduler(pid, SCHED_FIFO, ¶m); by this: sched_setscheduler(pid, SCHED_FIFO|SCHED_RESET_ON_FORK, ¶m); Signed-off-by: Lennart Poettering Acked-by: Peter Zijlstra LKML-Reference: <20090615152714.GA29092@tango.0pointer.de> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4896fdfec913..d4a2c6662f7d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -38,6 +38,8 @@ #define SCHED_BATCH 3 /* SCHED_ISO: reserved but not implemented yet */ #define SCHED_IDLE 5 +/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ +#define SCHED_RESET_ON_FORK 0x40000000 #ifdef __KERNEL__ @@ -1209,6 +1211,10 @@ struct task_struct { unsigned did_exec:1; unsigned in_execve:1; /* Tell the LSMs that the process is doing an * execve */ + + /* Revert to default priority/policy when forking */ + unsigned sched_reset_on_fork:1; + pid_t pid; pid_t tgid; -- cgit v1.2.2 From e09758fae8ccde97e026c704319eaa18d488dc86 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: sched: Cover the CONFIG_DEBUG_SPINLOCK_SLEEP off-case for __might_sleep() Cover the off case for __might_sleep(), so that we avoid #ifdefs in files that make use of it. Especially, this prepares for the __might_sleep() pull up on cond_resched(). Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d6320a3e8def..b804f694ae6a 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -139,6 +139,7 @@ extern int _cond_resched(void); # define might_sleep() \ do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) #else + static inline void __might_sleep(char *file, int line) { } # define might_sleep() do { might_resched(); } while (0) #endif -- cgit v1.2.2 From e4aafea2d4bde8b44d6500c4ee7195bbfc51269e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: sched: Add a preempt count base offset to __might_sleep() Add a preempt count base offset to compare against the current preempt level count. It prepares to pull up the might_sleep check from cond_resched() to cond_resched_lock() and cond_resched_bh(). For these two helpers, we need to respectively ensure that once we'll unlock the given spinlock / reenable local softirqs, we will reach a sleepable state. Signed-off-by: Frederic Weisbecker [ Move and rename preempt_count_equals() ] Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index b804f694ae6a..2b5b1e0899a8 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -125,7 +125,7 @@ extern int _cond_resched(void); #endif #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP - void __might_sleep(char *file, int line); + void __might_sleep(char *file, int line, int preempt_offset); /** * might_sleep - annotation for functions that can sleep * @@ -137,9 +137,9 @@ extern int _cond_resched(void); * supposed to. */ # define might_sleep() \ - do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) + do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) #else - static inline void __might_sleep(char *file, int line) { } + static inline void __might_sleep(char *file, int line, int preempt_offset) { } # define might_sleep() do { might_resched(); } while (0) #endif -- cgit v1.2.2 From 6f80bd985fe242c2e6a8b6209ed20b0495d3d63b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: sched: Remove the CONFIG_PREEMPT_BKL case definition of cond_resched() CONFIG_PREEMPT_BKL doesn't exist anymore. So remove this config-on case definition of cond_resched(). Reported-by: Peter Zijlstra Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-5-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9bada20e2b23..e2bdf18e05c4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2285,17 +2285,12 @@ static inline int need_resched(void) * cond_resched_softirq() will enable bhs before scheduling. */ extern int _cond_resched(void); -#ifdef CONFIG_PREEMPT_BKL -static inline int cond_resched(void) -{ - return 0; -} -#else + static inline int cond_resched(void) { return _cond_resched(); } -#endif + extern int cond_resched_lock(spinlock_t * lock); extern int cond_resched_softirq(void); static inline int cond_resched_bkl(void) -- cgit v1.2.2 From 613afbf83298efaead05ebcac23d2285609d7160 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: sched: Pull up the might_sleep() check into cond_resched() might_sleep() is called late-ish in cond_resched(), after the need_resched()/preempt enabled/system running tests are checked. It's better to check the sleeps while atomic earlier and not depend on some environment datas that reduce the chances to detect a problem. Also define cond_resched_*() helpers as macros, so that the FILE/LINE reported in the sleeping while atomic warning displays the real origin and not sched.h Changes in v2: - Call __might_sleep() directly instead of might_sleep() which may call cond_resched() - Turn cond_resched() into a macro so that the file:line couple reported refers to the caller of cond_resched() and not __cond_resched() itself. Changes in v3: - Also propagate this __might_sleep() pull up to cond_resched_lock() and cond_resched_softirq() Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-6-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index e2bdf18e05c4..c41d424db887 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2286,17 +2286,26 @@ static inline int need_resched(void) */ extern int _cond_resched(void); -static inline int cond_resched(void) -{ - return _cond_resched(); -} +#define cond_resched() ({ \ + __might_sleep(__FILE__, __LINE__, 0); \ + _cond_resched(); \ +}) -extern int cond_resched_lock(spinlock_t * lock); -extern int cond_resched_softirq(void); -static inline int cond_resched_bkl(void) -{ - return _cond_resched(); -} +extern int __cond_resched_lock(spinlock_t *lock); + +#define cond_resched_lock(lock) ({ \ + __might_sleep(__FILE__, __LINE__, PREEMPT_OFFSET); \ + __cond_resched_lock(lock); \ +}) + +extern int __cond_resched_softirq(void); + +#define cond_resched_softirq() ({ \ + __might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET); \ + __cond_resched_softirq(); \ +}) + +#define cond_resched_bkl() cond_resched() /* * Does a critical section need to be broken due to another -- cgit v1.2.2 From def01bc53d03881acfc393bd10a5c7575187e008 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: sched: Convert the only user of cond_resched_bkl to use cond_resched() fs/locks.c:flock_lock_file() is the only user of cond_resched_bkl() This helper doesn't do anything more than cond_resched(). The latter naming is enough to explain that we are rescheduling if needed. The bkl suffix suggests another semantics but it's actually a synonym of cond_resched(). Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-7-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index c41d424db887..cbbfca69aa4a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2305,8 +2305,6 @@ extern int __cond_resched_softirq(void); __cond_resched_softirq(); \ }) -#define cond_resched_bkl() cond_resched() - /* * Does a critical section need to be broken due to another * task waiting?: (technically does not depend on CONFIG_PREEMPT, -- cgit v1.2.2 From 716a42348cdaf04534b15fbdc9c83e25baebfed5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 24 Jul 2009 20:05:23 +0200 Subject: sched: Fix cond_resched_lock() in !CONFIG_PREEMPT The might_sleep() test inside cond_resched_lock() assumes the spinlock is held and then preemption is disabled. This is true with CONFIG_PREEMPT but the preempt_count() doesn't change otherwise. Check by starting from the appropriate preempt offset depending on the config. Reported-by: Li Zefan Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1248458723-12146-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index cbbfca69aa4a..c472414953bf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2293,8 +2293,14 @@ extern int _cond_resched(void); extern int __cond_resched_lock(spinlock_t *lock); +#ifdef CONFIG_PREEMPT +#define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET +#else +#define PREEMPT_LOCK_OFFSET 0 +#endif + #define cond_resched_lock(lock) ({ \ - __might_sleep(__FILE__, __LINE__, PREEMPT_OFFSET); \ + __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ __cond_resched_lock(lock); \ }) -- cgit v1.2.2 From 3f029d3c6d62068d59301d90c18dbde8ee402107 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Wed, 29 Jul 2009 11:08:47 -0400 Subject: sched: Enhance the pre/post scheduling logic We currently have an explicit "needs_post" vtable method which returns a stack variable for whether we should later run post-schedule. This leads to an awkward exchange of the variable as it bubbles back up out of the context switch. Peter Zijlstra observed that this information could be stored in the run-queue itself instead of handled on the stack. Therefore, we revert to the method of having context_switch return void, and update an internal rq->post_schedule variable when we require further processing. In addition, we fix a race condition where we try to access current->sched_class without holding the rq->lock. This is technically racy, as the sched-class could change out from under us. Instead, we reference the per-rq post_schedule variable with the runqueue unlocked, but with preemption disabled to see if we need to reacquire the rq->lock. Finally, we clean the code up slightly by removing the #ifdef CONFIG_SMP conditionals from the schedule() call, and implement some inline helper functions instead. This patch passes checkpatch, and rt-migrate. Signed-off-by: Gregory Haskins Signed-off-by: Peter Zijlstra LKML-Reference: <20090729150422.17691.55590.stgit@dev.haskins.net> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2c35bc29d2a9..195d72d5c102 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1047,7 +1047,6 @@ struct sched_class { struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); - int (*needs_post_schedule) (struct rq *this_rq); void (*post_schedule) (struct rq *this_rq); void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); -- cgit v1.2.2 From 8e5b59a2d728e6963b35dba8bb36e0b70267462e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 6 Aug 2009 16:02:50 -0700 Subject: sched: Add default defines for PREEMPT_ACTIVE The PREEMPT_ACTIVE setting doesn't actually need to be arch-specific, so set up a sane default for all arches to (hopefully) migrate to. > if we look at linux/hardirq.h, it makes this claim: > * - bit 28 is the PREEMPT_ACTIVE flag > if that's true, then why are we letting any arch set this define ? a > quick survey shows that half the arches (11) are using 0x10000000 (bit > 28) while the other half (10) are using 0x4000000 (bit 26). and then > there is the ia64 oddity which uses bit 30. the exact value here > shouldnt really matter across arches though should it ? actually alpha, arm and avr32 also use bit 30 (0x40000000), there are only five (or eight, depending on how you count) architectures (blackfin, h8300, m68k, s390 and sparc) using bit 26. Signed-off-by: Arnd Bergmann Signed-off-by: Mike Frysinger Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/hardirq.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 8246c697863d..0d885fd75111 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -64,6 +64,12 @@ #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) #define NMI_OFFSET (1UL << NMI_SHIFT) +#ifndef PREEMPT_ACTIVE +#define PREEMPT_ACTIVE_BITS 1 +#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) +#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT) +#endif + #if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS)) #error PREEMPT_ACTIVE is too low! #endif -- cgit v1.2.2 From 1314562a9ae5f39f6f595656023c1baf970831ef Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 18 Aug 2009 15:06:02 +0900 Subject: sched, task_struct: stack_canary is not needed without CC_STACKPROTECTOR The field stack_canary is only used with CC_STACKPROTECTOR. This patch reduces task_struct size without CC_STACKPROTECTOR. Signed-off-by: Hiroshi Shimamoto LKML-Reference: <4A8A44CA.2020701@ct.jp.nec.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 195d72d5c102..7bc2d9290837 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1237,8 +1237,10 @@ struct task_struct { pid_t pid; pid_t tgid; +#ifdef CONFIG_CC_STACKPROTECTOR /* Canary value for the -fstack-protector gcc feature */ unsigned long stack_canary; +#endif /* * pointers to (original) parent process, youngest child, younger sibling, -- cgit v1.2.2 From 8f0dfc34e9b323a028c2ec41abb7e9de477b7a94 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 20 Jul 2009 11:26:58 -0700 Subject: sched: Provide iowait counters For counting how long an application has been waiting for (disk) IO, there currently is only the HZ sample driven information available, while for all other counters in this class, a high resolution version is available via CONFIG_SCHEDSTATS. In order to make an improved bootchart tool possible, we also need a higher resolution version of the iowait time. This patch below adds this scheduler statistic to the kernel. Signed-off-by: Arjan van de Ven Signed-off-by: Peter Zijlstra LKML-Reference: <4A64B813.1080506@linux.intel.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index e209ae0e7a8a..9c96ef2f7e68 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1111,6 +1111,8 @@ struct sched_entity { u64 wait_max; u64 wait_count; u64 wait_sum; + u64 iowait_count; + u64 iowait_sum; u64 sleep_start; u64 sleep_max; @@ -1231,6 +1233,8 @@ struct task_struct { unsigned did_exec:1; unsigned in_execve:1; /* Tell the LSMs that the process is doing an * execve */ + unsigned in_iowait:1; + /* Revert to default priority/policy when forking */ unsigned sched_reset_on_fork:1; -- cgit v1.2.2 From 768d0c27226e6587cad2fcf543f9711da3f3774e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 23 Jul 2009 20:13:26 +0200 Subject: sched: Add wait, sleep and iowait accounting tracepoints Add 3 schedstat tracepoints to help account for wait-time, sleep-time and iowait-time. They can also be used as a perf-counter source to profile tasks on these clocks. Signed-off-by: Peter Zijlstra Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Arjan van de Ven LKML-Reference: [ build fix for the !CONFIG_SCHEDSTATS case ] Signed-off-by: Ingo Molnar --- include/trace/events/sched.h | 95 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) (limited to 'include') diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 8949bb7eb082..a4c369ec328f 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -340,6 +340,101 @@ TRACE_EVENT(sched_signal_send, __entry->sig, __entry->comm, __entry->pid) ); +/* + * XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE + * adding sched_stat support to SCHED_FIFO/RR would be welcome. + */ + +/* + * Tracepoint for accounting wait time (time the task is runnable + * but not actually running due to scheduler contention). + */ +TRACE_EVENT(sched_stat_wait, + + TP_PROTO(struct task_struct *tsk, u64 delay), + + TP_ARGS(tsk, delay), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( u64, delay ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->delay = delay; + ) + TP_perf_assign( + __perf_count(delay); + ), + + TP_printk("task: %s:%d wait: %Lu [ns]", + __entry->comm, __entry->pid, + (unsigned long long)__entry->delay) +); + +/* + * Tracepoint for accounting sleep time (time the task is not runnable, + * including iowait, see below). + */ +TRACE_EVENT(sched_stat_sleep, + + TP_PROTO(struct task_struct *tsk, u64 delay), + + TP_ARGS(tsk, delay), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( u64, delay ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->delay = delay; + ) + TP_perf_assign( + __perf_count(delay); + ), + + TP_printk("task: %s:%d sleep: %Lu [ns]", + __entry->comm, __entry->pid, + (unsigned long long)__entry->delay) +); + +/* + * Tracepoint for accounting iowait time (time the task is not runnable + * due to waiting on IO to complete). + */ +TRACE_EVENT(sched_stat_iowait, + + TP_PROTO(struct task_struct *tsk, u64 delay), + + TP_ARGS(tsk, delay), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( u64, delay ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->delay = delay; + ) + TP_perf_assign( + __perf_count(delay); + ), + + TP_printk("task: %s:%d iowait: %Lu [ns]", + __entry->comm, __entry->pid, + (unsigned long long)__entry->delay) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ -- cgit v1.2.2 From b5d978e0c7e79a7ff842e895c85a86b38c71f1cd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Sep 2009 10:34:33 +0200 Subject: sched: Add SD_PREFER_SIBLING Do the placement thing using SD flags. Signed-off-by: Peter Zijlstra Tested-by: Andreas Herrmann Acked-by: Andreas Herrmann Acked-by: Gautham R Shenoy Cc: Balbir Singh LKML-Reference: <20090901083825.897028974@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c96ef2f7e68..651dded25720 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -798,18 +798,19 @@ enum cpu_idle_type { #define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE #ifdef CONFIG_SMP -#define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ -#define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */ -#define SD_BALANCE_EXEC 4 /* Balance on exec */ -#define SD_BALANCE_FORK 8 /* Balance on fork, clone */ -#define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */ -#define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */ -#define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */ -#define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */ -#define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */ -#define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */ -#define SD_SERIALIZE 1024 /* Only a single load balancing instance */ -#define SD_WAKE_IDLE_FAR 2048 /* Gain latency sacrificing cache hit */ +#define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */ +#define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */ +#define SD_BALANCE_EXEC 0x0004 /* Balance on exec */ +#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ +#define SD_WAKE_IDLE 0x0010 /* Wake to idle CPU on task wakeup */ +#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ +#define SD_WAKE_BALANCE 0x0040 /* Perform balancing at task wakeup */ +#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ +#define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */ +#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ +#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ +#define SD_WAKE_IDLE_FAR 0x0800 /* Gain latency sacrificing cache hit */ +#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ enum powersavings_balance_level { POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */ @@ -829,7 +830,7 @@ static inline int sd_balance_for_mc_power(void) if (sched_smt_power_savings) return SD_POWERSAVINGS_BALANCE; - return 0; + return SD_PREFER_SIBLING; } static inline int sd_balance_for_package_power(void) @@ -837,7 +838,7 @@ static inline int sd_balance_for_package_power(void) if (sched_mc_power_savings | sched_smt_power_savings) return SD_POWERSAVINGS_BALANCE; - return 0; + return SD_PREFER_SIBLING; } /* -- cgit v1.2.2 From a52bfd73589eaf88d9c95ad2c1de0b38a6b27972 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Sep 2009 10:34:35 +0200 Subject: sched: Add smt_gain The idea is that multi-threading a core yields more work capacity than a single thread, provide a way to express a static gain for threads. Signed-off-by: Peter Zijlstra Tested-by: Andreas Herrmann Acked-by: Andreas Herrmann Acked-by: Gautham R Shenoy Cc: Balbir Singh LKML-Reference: <20090901083826.073345955@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + include/linux/topology.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 651dded25720..9c81c921acb3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -921,6 +921,7 @@ struct sched_domain { unsigned int newidle_idx; unsigned int wake_idx; unsigned int forkexec_idx; + unsigned int smt_gain; int flags; /* See SD_* */ enum sched_domain_level level; diff --git a/include/linux/topology.h b/include/linux/topology.h index 7402c1a27c4f..6203ae5067ce 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -99,6 +99,7 @@ int arch_update_cpu_topology(void); | SD_SHARE_CPUPOWER, \ .last_balance = jiffies, \ .balance_interval = 1, \ + .smt_gain = 1178, /* 15% */ \ } #endif #endif /* CONFIG_SCHED_SMT */ -- cgit v1.2.2 From e9e9250bc78e7f6342517214c0178a529807964b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Sep 2009 10:34:37 +0200 Subject: sched: Scale down cpu_power due to RT tasks Keep an average on the amount of time spend on RT tasks and use that fraction to scale down the cpu_power for regular tasks. Signed-off-by: Peter Zijlstra Tested-by: Andreas Herrmann Acked-by: Andreas Herrmann Acked-by: Gautham R Shenoy Cc: Balbir Singh LKML-Reference: <20090901083826.287778431@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c81c921acb3..c67ddf309c84 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1831,6 +1831,7 @@ extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; +extern unsigned int sysctl_sched_time_avg; extern unsigned int sysctl_timer_migration; int sched_nr_latency_handler(struct ctl_table *table, int write, -- cgit v1.2.2 From 18a3885fc1ffa92c2212ff0afdf033403d5b0fa0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Sep 2009 10:34:39 +0200 Subject: sched: Remove reciprocal for cpu_power Its a source of fail, also, now that cpu_power is dynamical, its a waste of time. before: -0 [000] 132.877936: find_busiest_group: avg_load: 0 group_load: 8241 power: 1 after: bash-1689 [001] 137.862151: find_busiest_group: avg_load: 10636288 group_load: 10387 power: 1 [ v2: build fix from From: Andreas Herrmann ] Signed-off-by: Peter Zijlstra Tested-by: Andreas Herrmann Acked-by: Andreas Herrmann Acked-by: Gautham R Shenoy Cc: Balbir Singh LKML-Reference: <20090901083826.425896304@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index c67ddf309c84..3b7f43e3b736 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -860,15 +860,9 @@ struct sched_group { /* * CPU power of this group, SCHED_LOAD_SCALE being max power for a - * single CPU. This is read only (except for setup, hotplug CPU). - * Note : Never change cpu_power without recompute its reciprocal + * single CPU. */ - unsigned int __cpu_power; - /* - * reciprocal value of cpu_power to avoid expensive divides - * (see include/linux/reciprocal_div.h) - */ - u32 reciprocal_cpu_power; + unsigned int cpu_power; /* * The CPUs this group covers. -- cgit v1.2.2 From 47734f89be0614b5acbd6a532390f9c72f019648 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 4 Sep 2009 11:21:24 +0200 Subject: sched: Clean up topology.h Re-organize the flag settings so that it's visible at a glance which sched-domains flags are set and which not. With the new balancer code we'll need to re-tune these details anyway, so make it cleaner to make fewer mistakes down the road ;-) Cc: Peter Zijlstra Cc: Andreas Herrmann Cc: Andreas Herrmann Cc: Gautham R Shenoy Cc: Balbir Singh LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/topology.h | 169 ++++++++++++++++++++++++++++------------------- 1 file changed, 101 insertions(+), 68 deletions(-) (limited to 'include') diff --git a/include/linux/topology.h b/include/linux/topology.h index 6203ae5067ce..fe2c0329f82f 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -85,21 +85,29 @@ int arch_update_cpu_topology(void); #define ARCH_HAS_SCHED_WAKE_IDLE /* Common values for SMT siblings */ #ifndef SD_SIBLING_INIT -#define SD_SIBLING_INIT (struct sched_domain) { \ - .min_interval = 1, \ - .max_interval = 2, \ - .busy_factor = 64, \ - .imbalance_pct = 110, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_NEWIDLE \ - | SD_BALANCE_FORK \ - | SD_BALANCE_EXEC \ - | SD_WAKE_AFFINE \ - | SD_WAKE_BALANCE \ - | SD_SHARE_CPUPOWER, \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .smt_gain = 1178, /* 15% */ \ +#define SD_SIBLING_INIT (struct sched_domain) { \ + .min_interval = 1, \ + .max_interval = 2, \ + .busy_factor = 64, \ + .imbalance_pct = 110, \ + \ + .flags = 1*SD_LOAD_BALANCE \ + | 1*SD_BALANCE_NEWIDLE \ + | 1*SD_BALANCE_EXEC \ + | 1*SD_BALANCE_FORK \ + | 0*SD_WAKE_IDLE \ + | 1*SD_WAKE_AFFINE \ + | 1*SD_WAKE_BALANCE \ + | 1*SD_SHARE_CPUPOWER \ + | 0*SD_POWERSAVINGS_BALANCE \ + | 0*SD_SHARE_PKG_RESOURCES \ + | 0*SD_SERIALIZE \ + | 0*SD_WAKE_IDLE_FAR \ + | 0*SD_PREFER_SIBLING \ + , \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .smt_gain = 1178, /* 15% */ \ } #endif #endif /* CONFIG_SCHED_SMT */ @@ -107,69 +115,94 @@ int arch_update_cpu_topology(void); #ifdef CONFIG_SCHED_MC /* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */ #ifndef SD_MC_INIT -#define SD_MC_INIT (struct sched_domain) { \ - .min_interval = 1, \ - .max_interval = 4, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 1, \ - .busy_idx = 2, \ - .wake_idx = 1, \ - .forkexec_idx = 1, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_FORK \ - | SD_BALANCE_EXEC \ - | SD_WAKE_AFFINE \ - | SD_WAKE_BALANCE \ - | SD_SHARE_PKG_RESOURCES\ - | sd_balance_for_mc_power()\ - | sd_power_saving_flags(),\ - .last_balance = jiffies, \ - .balance_interval = 1, \ +#define SD_MC_INIT (struct sched_domain) { \ + .min_interval = 1, \ + .max_interval = 4, \ + .busy_factor = 64, \ + .imbalance_pct = 125, \ + .cache_nice_tries = 1, \ + .busy_idx = 2, \ + .wake_idx = 1, \ + .forkexec_idx = 1, \ + \ + .flags = 1*SD_LOAD_BALANCE \ + | 0*SD_BALANCE_NEWIDLE \ + | 1*SD_BALANCE_EXEC \ + | 1*SD_BALANCE_FORK \ + | 0*SD_WAKE_IDLE \ + | 1*SD_WAKE_AFFINE \ + | 1*SD_WAKE_BALANCE \ + | 0*SD_SHARE_CPUPOWER \ + | 1*SD_SHARE_PKG_RESOURCES \ + | 0*SD_SERIALIZE \ + | 0*SD_WAKE_IDLE_FAR \ + | sd_balance_for_mc_power() \ + | sd_power_saving_flags() \ + , \ + .last_balance = jiffies, \ + .balance_interval = 1, \ } #endif #endif /* CONFIG_SCHED_MC */ /* Common values for CPUs */ #ifndef SD_CPU_INIT -#define SD_CPU_INIT (struct sched_domain) { \ - .min_interval = 1, \ - .max_interval = 4, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 1, \ - .busy_idx = 2, \ - .idle_idx = 1, \ - .newidle_idx = 2, \ - .wake_idx = 1, \ - .forkexec_idx = 1, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_EXEC \ - | SD_BALANCE_FORK \ - | SD_WAKE_AFFINE \ - | SD_WAKE_BALANCE \ - | sd_balance_for_package_power()\ - | sd_power_saving_flags(),\ - .last_balance = jiffies, \ - .balance_interval = 1, \ +#define SD_CPU_INIT (struct sched_domain) { \ + .min_interval = 1, \ + .max_interval = 4, \ + .busy_factor = 64, \ + .imbalance_pct = 125, \ + .cache_nice_tries = 1, \ + .busy_idx = 2, \ + .idle_idx = 1, \ + .newidle_idx = 2, \ + .wake_idx = 1, \ + .forkexec_idx = 1, \ + \ + .flags = 1*SD_LOAD_BALANCE \ + | 0*SD_BALANCE_NEWIDLE \ + | 1*SD_BALANCE_EXEC \ + | 1*SD_BALANCE_FORK \ + | 0*SD_WAKE_IDLE \ + | 0*SD_WAKE_AFFINE \ + | 1*SD_WAKE_BALANCE \ + | 0*SD_SHARE_CPUPOWER \ + | 0*SD_SHARE_PKG_RESOURCES \ + | 0*SD_SERIALIZE \ + | 0*SD_WAKE_IDLE_FAR \ + | sd_balance_for_package_power() \ + | sd_power_saving_flags() \ + , \ + .last_balance = jiffies, \ + .balance_interval = 1, \ } #endif /* sched_domains SD_ALLNODES_INIT for NUMA machines */ -#define SD_ALLNODES_INIT (struct sched_domain) { \ - .min_interval = 64, \ - .max_interval = 64*num_online_cpus(), \ - .busy_factor = 128, \ - .imbalance_pct = 133, \ - .cache_nice_tries = 1, \ - .busy_idx = 3, \ - .idle_idx = 3, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_NEWIDLE \ - | SD_WAKE_AFFINE \ - | SD_SERIALIZE, \ - .last_balance = jiffies, \ - .balance_interval = 64, \ +#define SD_ALLNODES_INIT (struct sched_domain) { \ + .min_interval = 64, \ + .max_interval = 64*num_online_cpus(), \ + .busy_factor = 128, \ + .imbalance_pct = 133, \ + .cache_nice_tries = 1, \ + .busy_idx = 3, \ + .idle_idx = 3, \ + .flags = 1*SD_LOAD_BALANCE \ + | 1*SD_BALANCE_NEWIDLE \ + | 0*SD_BALANCE_EXEC \ + | 0*SD_BALANCE_FORK \ + | 0*SD_WAKE_IDLE \ + | 1*SD_WAKE_AFFINE \ + | 0*SD_WAKE_BALANCE \ + | 0*SD_SHARE_CPUPOWER \ + | 0*SD_POWERSAVINGS_BALANCE \ + | 0*SD_SHARE_PKG_RESOURCES \ + | 1*SD_SERIALIZE \ + | 0*SD_WAKE_IDLE_FAR \ + | 0*SD_PREFER_SIBLING \ + , \ + .last_balance = jiffies, \ + .balance_interval = 64, \ } #ifdef CONFIG_NUMA -- cgit v1.2.2 From 840a0653100dbde599ae8ddf83fa214dfa5fd1aa Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 4 Sep 2009 11:32:54 +0200 Subject: sched: Turn on SD_BALANCE_NEWIDLE Start the re-tuning of the balancer by turning on newidle. It improves hackbench performance and parallelism on a 4x4 box. The "perf stat --repeat 10" measurements give us: domain0 domain1 ....................................... -SD_BALANCE_NEWIDLE -SD_BALANCE_NEWIDLE: 2041.273208 task-clock-msecs # 9.354 CPUs ( +- 0.363% ) +SD_BALANCE_NEWIDLE -SD_BALANCE_NEWIDLE: 2086.326925 task-clock-msecs # 11.934 CPUs ( +- 0.301% ) +SD_BALANCE_NEWIDLE +SD_BALANCE_NEWIDLE: 2115.289791 task-clock-msecs # 12.158 CPUs ( +- 0.263% ) Acked-by: Peter Zijlstra Cc: Andreas Herrmann Cc: Andreas Herrmann Cc: Gautham R Shenoy Cc: Balbir Singh LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/topology.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/topology.h b/include/linux/topology.h index fe2c0329f82f..66774fddec9b 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -126,7 +126,7 @@ int arch_update_cpu_topology(void); .forkexec_idx = 1, \ \ .flags = 1*SD_LOAD_BALANCE \ - | 0*SD_BALANCE_NEWIDLE \ + | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ | 0*SD_WAKE_IDLE \ @@ -160,7 +160,7 @@ int arch_update_cpu_topology(void); .forkexec_idx = 1, \ \ .flags = 1*SD_LOAD_BALANCE \ - | 0*SD_BALANCE_NEWIDLE \ + | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ | 0*SD_WAKE_IDLE \ -- cgit v1.2.2 From a8fae3ec5f118dc92517dcbed3ecf69ddb641d0f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Sep 2009 18:32:32 +0200 Subject: sched: enable SD_WAKE_IDLE Now that SD_WAKE_IDLE doesn't make pipe-test suck anymore, enable it by default for MC, CPU and NUMA domains. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/topology.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/topology.h b/include/linux/topology.h index 66774fddec9b..85e8cf7d393c 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -129,7 +129,7 @@ int arch_update_cpu_topology(void); | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ - | 0*SD_WAKE_IDLE \ + | 1*SD_WAKE_IDLE \ | 1*SD_WAKE_AFFINE \ | 1*SD_WAKE_BALANCE \ | 0*SD_SHARE_CPUPOWER \ @@ -163,7 +163,7 @@ int arch_update_cpu_topology(void); | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ - | 0*SD_WAKE_IDLE \ + | 1*SD_WAKE_IDLE \ | 0*SD_WAKE_AFFINE \ | 1*SD_WAKE_BALANCE \ | 0*SD_SHARE_CPUPOWER \ @@ -198,7 +198,7 @@ int arch_update_cpu_topology(void); | 0*SD_POWERSAVINGS_BALANCE \ | 0*SD_SHARE_PKG_RESOURCES \ | 1*SD_SERIALIZE \ - | 0*SD_WAKE_IDLE_FAR \ + | 1*SD_WAKE_IDLE_FAR \ | 0*SD_PREFER_SIBLING \ , \ .last_balance = jiffies, \ -- cgit v1.2.2 From 2bba22c50b06abe9fd0d23933b1e64d35b419262 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 9 Sep 2009 15:41:37 +0200 Subject: sched: Turn off child_runs_first Set child_runs_first default to off. It hurts 'optimal' make -j workloads as make jobs get preempted by child tasks, reducing parallelism. Note, this patch might make existing races in user applications more prominent than before - so breakages might be bisected to this commit. Child-runs-first is broken on SMP to begin with, and we already had it off briefly in v2.6.23 so most of the offenders ought to be fixed. Would be nice not to revert this commit but fix those apps finally ... Signed-off-by: Mike Galbraith Acked-by: Peter Zijlstra LKML-Reference: <1252486344.28645.18.camel@marge.simson.net> [ made the sysctl independent of CONFIG_SCHED_DEBUG, in case people want to work around broken apps. ] Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 3b7f43e3b736..3a50e8222498 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1820,8 +1820,8 @@ extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_shares_ratelimit; extern unsigned int sysctl_sched_shares_thresh; -#ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_child_runs_first; +#ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; -- cgit v1.2.2