aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/kernel-parameters.txt2
-rw-r--r--arch/Kconfig3
-rw-r--r--arch/s390/Kconfig1
-rw-r--r--arch/s390/include/asm/mutex.h2
-rw-r--r--fs/proc/base.c79
-rw-r--r--include/linux/completion.h8
-rw-r--r--include/linux/init_task.h9
-rw-r--r--include/linux/mutex.h4
-rw-r--r--include/linux/sched.h36
-rw-r--r--init/Kconfig13
-rw-r--r--kernel/cpu.c18
-rw-r--r--kernel/fork.c7
-rw-r--r--kernel/irq/manage.c4
-rw-r--r--kernel/kthread.c2
-rw-r--r--kernel/mutex.c2
-rw-r--r--kernel/printk.c8
-rw-r--r--kernel/sched.c569
-rw-r--r--kernel/sched_autogroup.c238
-rw-r--r--kernel/sched_autogroup.h32
-rw-r--r--kernel/sched_clock.c2
-rw-r--r--kernel/sched_debug.c91
-rw-r--r--kernel/sched_fair.c322
-rw-r--r--kernel/sched_features.h2
-rw-r--r--kernel/sched_rt.c24
-rw-r--r--kernel/softirq.c4
-rw-r--r--kernel/sys.c4
-rw-r--r--kernel/sysctl.c37
-rw-r--r--kernel/trace/trace_selftest.c2
-rw-r--r--kernel/watchdog.c2
29 files changed, 930 insertions, 597 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 992cda68fa63..d6496fde6180 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1614,6 +1614,8 @@ and is between 256 and 4096 characters. It is defined in the file
1614 noapic [SMP,APIC] Tells the kernel to not make use of any 1614 noapic [SMP,APIC] Tells the kernel to not make use of any
1615 IOAPICs that may be present in the system. 1615 IOAPICs that may be present in the system.
1616 1616
1617 noautogroup Disable scheduler automatic task group creation.
1618
1617 nobats [PPC] Do not use BATs for mapping kernel lowmem 1619 nobats [PPC] Do not use BATs for mapping kernel lowmem
1618 on "Classic" PPC cores. 1620 on "Classic" PPC cores.
1619 1621
diff --git a/arch/Kconfig b/arch/Kconfig
index 8bf0fa652eb6..f78c2be4242b 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -175,4 +175,7 @@ config HAVE_PERF_EVENTS_NMI
175config HAVE_ARCH_JUMP_LABEL 175config HAVE_ARCH_JUMP_LABEL
176 bool 176 bool
177 177
178config HAVE_ARCH_MUTEX_CPU_RELAX
179 bool
180
178source "kernel/gcov/Kconfig" 181source "kernel/gcov/Kconfig"
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index e0b98e71ff47..6c6d7b339aae 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -99,6 +99,7 @@ config S390
99 select HAVE_KERNEL_LZMA 99 select HAVE_KERNEL_LZMA
100 select HAVE_KERNEL_LZO 100 select HAVE_KERNEL_LZO
101 select HAVE_GET_USER_PAGES_FAST 101 select HAVE_GET_USER_PAGES_FAST
102 select HAVE_ARCH_MUTEX_CPU_RELAX
102 select ARCH_INLINE_SPIN_TRYLOCK 103 select ARCH_INLINE_SPIN_TRYLOCK
103 select ARCH_INLINE_SPIN_TRYLOCK_BH 104 select ARCH_INLINE_SPIN_TRYLOCK_BH
104 select ARCH_INLINE_SPIN_LOCK 105 select ARCH_INLINE_SPIN_LOCK
diff --git a/arch/s390/include/asm/mutex.h b/arch/s390/include/asm/mutex.h
index 458c1f7fbc18..688271f5f2e4 100644
--- a/arch/s390/include/asm/mutex.h
+++ b/arch/s390/include/asm/mutex.h
@@ -7,3 +7,5 @@
7 */ 7 */
8 8
9#include <asm-generic/mutex-dec.h> 9#include <asm-generic/mutex-dec.h>
10
11#define arch_mutex_cpu_relax() barrier()
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 182845147fe4..08cba2c3b612 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1407,6 +1407,82 @@ static const struct file_operations proc_pid_sched_operations = {
1407 1407
1408#endif 1408#endif
1409 1409
1410#ifdef CONFIG_SCHED_AUTOGROUP
1411/*
1412 * Print out autogroup related information:
1413 */
1414static int sched_autogroup_show(struct seq_file *m, void *v)
1415{
1416 struct inode *inode = m->private;
1417 struct task_struct *p;
1418
1419 p = get_proc_task(inode);
1420 if (!p)
1421 return -ESRCH;
1422 proc_sched_autogroup_show_task(p, m);
1423
1424 put_task_struct(p);
1425
1426 return 0;
1427}
1428
1429static ssize_t
1430sched_autogroup_write(struct file *file, const char __user *buf,
1431 size_t count, loff_t *offset)
1432{
1433 struct inode *inode = file->f_path.dentry->d_inode;
1434 struct task_struct *p;
1435 char buffer[PROC_NUMBUF];
1436 long nice;
1437 int err;
1438
1439 memset(buffer, 0, sizeof(buffer));
1440 if (count > sizeof(buffer) - 1)
1441 count = sizeof(buffer) - 1;
1442 if (copy_from_user(buffer, buf, count))
1443 return -EFAULT;
1444
1445 err = strict_strtol(strstrip(buffer), 0, &nice);
1446 if (err)
1447 return -EINVAL;
1448
1449 p = get_proc_task(inode);
1450 if (!p)
1451 return -ESRCH;
1452
1453 err = nice;
1454 err = proc_sched_autogroup_set_nice(p, &err);
1455 if (err)
1456 count = err;
1457
1458 put_task_struct(p);
1459
1460 return count;
1461}
1462
1463static int sched_autogroup_open(struct inode *inode, struct file *filp)
1464{
1465 int ret;
1466
1467 ret = single_open(filp, sched_autogroup_show, NULL);
1468 if (!ret) {
1469 struct seq_file *m = filp->private_data;
1470
1471 m->private = inode;
1472 }
1473 return ret;
1474}
1475
1476static const struct file_operations proc_pid_sched_autogroup_operations = {
1477 .open = sched_autogroup_open,
1478 .read = seq_read,
1479 .write = sched_autogroup_write,
1480 .llseek = seq_lseek,
1481 .release = single_release,
1482};
1483
1484#endif /* CONFIG_SCHED_AUTOGROUP */
1485
1410static ssize_t comm_write(struct file *file, const char __user *buf, 1486static ssize_t comm_write(struct file *file, const char __user *buf,
1411 size_t count, loff_t *offset) 1487 size_t count, loff_t *offset)
1412{ 1488{
@@ -2733,6 +2809,9 @@ static const struct pid_entry tgid_base_stuff[] = {
2733#ifdef CONFIG_SCHED_DEBUG 2809#ifdef CONFIG_SCHED_DEBUG
2734 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2810 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2735#endif 2811#endif
2812#ifdef CONFIG_SCHED_AUTOGROUP
2813 REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
2814#endif
2736 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 2815 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2737#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2816#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2738 INF("syscall", S_IRUSR, proc_pid_syscall), 2817 INF("syscall", S_IRUSR, proc_pid_syscall),
diff --git a/include/linux/completion.h b/include/linux/completion.h
index 36d57f74cd01..51494e6b5548 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -81,10 +81,10 @@ extern int wait_for_completion_interruptible(struct completion *x);
81extern int wait_for_completion_killable(struct completion *x); 81extern int wait_for_completion_killable(struct completion *x);
82extern unsigned long wait_for_completion_timeout(struct completion *x, 82extern unsigned long wait_for_completion_timeout(struct completion *x,
83 unsigned long timeout); 83 unsigned long timeout);
84extern unsigned long wait_for_completion_interruptible_timeout( 84extern long wait_for_completion_interruptible_timeout(
85 struct completion *x, unsigned long timeout); 85 struct completion *x, unsigned long timeout);
86extern unsigned long wait_for_completion_killable_timeout( 86extern long wait_for_completion_killable_timeout(
87 struct completion *x, unsigned long timeout); 87 struct completion *x, unsigned long timeout);
88extern bool try_wait_for_completion(struct completion *x); 88extern bool try_wait_for_completion(struct completion *x);
89extern bool completion_done(struct completion *x); 89extern bool completion_done(struct completion *x);
90 90
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 6b281fae114a..caa151fbebb7 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -12,6 +12,13 @@
12#include <linux/securebits.h> 12#include <linux/securebits.h>
13#include <net/net_namespace.h> 13#include <net/net_namespace.h>
14 14
15#ifdef CONFIG_SMP
16# define INIT_PUSHABLE_TASKS(tsk) \
17 .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO),
18#else
19# define INIT_PUSHABLE_TASKS(tsk)
20#endif
21
15extern struct files_struct init_files; 22extern struct files_struct init_files;
16extern struct fs_struct init_fs; 23extern struct fs_struct init_fs;
17 24
@@ -144,7 +151,7 @@ extern struct cred init_cred;
144 .nr_cpus_allowed = NR_CPUS, \ 151 .nr_cpus_allowed = NR_CPUS, \
145 }, \ 152 }, \
146 .tasks = LIST_HEAD_INIT(tsk.tasks), \ 153 .tasks = LIST_HEAD_INIT(tsk.tasks), \
147 .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \ 154 INIT_PUSHABLE_TASKS(tsk) \
148 .ptraced = LIST_HEAD_INIT(tsk.ptraced), \ 155 .ptraced = LIST_HEAD_INIT(tsk.ptraced), \
149 .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \ 156 .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \
150 .real_parent = &tsk, \ 157 .real_parent = &tsk, \
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index f363bc8fdc74..94b48bd40dd7 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -160,4 +160,8 @@ extern int mutex_trylock(struct mutex *lock);
160extern void mutex_unlock(struct mutex *lock); 160extern void mutex_unlock(struct mutex *lock);
161extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); 161extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
162 162
163#ifndef CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX
164#define arch_mutex_cpu_relax() cpu_relax()
165#endif
166
163#endif 167#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 48c409c279d3..777cd01e240e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -513,6 +513,8 @@ struct thread_group_cputimer {
513 spinlock_t lock; 513 spinlock_t lock;
514}; 514};
515 515
516struct autogroup;
517
516/* 518/*
517 * NOTE! "signal_struct" does not have it's own 519 * NOTE! "signal_struct" does not have it's own
518 * locking, because a shared signal_struct always 520 * locking, because a shared signal_struct always
@@ -580,6 +582,9 @@ struct signal_struct {
580 582
581 struct tty_struct *tty; /* NULL if no tty */ 583 struct tty_struct *tty; /* NULL if no tty */
582 584
585#ifdef CONFIG_SCHED_AUTOGROUP
586 struct autogroup *autogroup;
587#endif
583 /* 588 /*
584 * Cumulative resource counters for dead threads in the group, 589 * Cumulative resource counters for dead threads in the group,
585 * and for reaped dead child processes forked by this group. 590 * and for reaped dead child processes forked by this group.
@@ -1242,7 +1247,9 @@ struct task_struct {
1242#endif 1247#endif
1243 1248
1244 struct list_head tasks; 1249 struct list_head tasks;
1250#ifdef CONFIG_SMP
1245 struct plist_node pushable_tasks; 1251 struct plist_node pushable_tasks;
1252#endif
1246 1253
1247 struct mm_struct *mm, *active_mm; 1254 struct mm_struct *mm, *active_mm;
1248#if defined(SPLIT_RSS_COUNTING) 1255#if defined(SPLIT_RSS_COUNTING)
@@ -1883,14 +1890,11 @@ extern void sched_clock_idle_sleep_event(void);
1883extern void sched_clock_idle_wakeup_event(u64 delta_ns); 1890extern void sched_clock_idle_wakeup_event(u64 delta_ns);
1884 1891
1885#ifdef CONFIG_HOTPLUG_CPU 1892#ifdef CONFIG_HOTPLUG_CPU
1886extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p);
1887extern void idle_task_exit(void); 1893extern void idle_task_exit(void);
1888#else 1894#else
1889static inline void idle_task_exit(void) {} 1895static inline void idle_task_exit(void) {}
1890#endif 1896#endif
1891 1897
1892extern void sched_idle_next(void);
1893
1894#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 1898#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
1895extern void wake_up_idle_cpu(int cpu); 1899extern void wake_up_idle_cpu(int cpu);
1896#else 1900#else
@@ -1900,8 +1904,6 @@ static inline void wake_up_idle_cpu(int cpu) { }
1900extern unsigned int sysctl_sched_latency; 1904extern unsigned int sysctl_sched_latency;
1901extern unsigned int sysctl_sched_min_granularity; 1905extern unsigned int sysctl_sched_min_granularity;
1902extern unsigned int sysctl_sched_wakeup_granularity; 1906extern unsigned int sysctl_sched_wakeup_granularity;
1903extern unsigned int sysctl_sched_shares_ratelimit;
1904extern unsigned int sysctl_sched_shares_thresh;
1905extern unsigned int sysctl_sched_child_runs_first; 1907extern unsigned int sysctl_sched_child_runs_first;
1906 1908
1907enum sched_tunable_scaling { 1909enum sched_tunable_scaling {
@@ -1917,6 +1919,7 @@ extern unsigned int sysctl_sched_migration_cost;
1917extern unsigned int sysctl_sched_nr_migrate; 1919extern unsigned int sysctl_sched_nr_migrate;
1918extern unsigned int sysctl_sched_time_avg; 1920extern unsigned int sysctl_sched_time_avg;
1919extern unsigned int sysctl_timer_migration; 1921extern unsigned int sysctl_timer_migration;
1922extern unsigned int sysctl_sched_shares_window;
1920 1923
1921int sched_proc_update_handler(struct ctl_table *table, int write, 1924int sched_proc_update_handler(struct ctl_table *table, int write,
1922 void __user *buffer, size_t *length, 1925 void __user *buffer, size_t *length,
@@ -1942,6 +1945,24 @@ int sched_rt_handler(struct ctl_table *table, int write,
1942 1945
1943extern unsigned int sysctl_sched_compat_yield; 1946extern unsigned int sysctl_sched_compat_yield;
1944 1947
1948#ifdef CONFIG_SCHED_AUTOGROUP
1949extern unsigned int sysctl_sched_autogroup_enabled;
1950
1951extern void sched_autogroup_create_attach(struct task_struct *p);
1952extern void sched_autogroup_detach(struct task_struct *p);
1953extern void sched_autogroup_fork(struct signal_struct *sig);
1954extern void sched_autogroup_exit(struct signal_struct *sig);
1955#ifdef CONFIG_PROC_FS
1956extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m);
1957extern int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice);
1958#endif
1959#else
1960static inline void sched_autogroup_create_attach(struct task_struct *p) { }
1961static inline void sched_autogroup_detach(struct task_struct *p) { }
1962static inline void sched_autogroup_fork(struct signal_struct *sig) { }
1963static inline void sched_autogroup_exit(struct signal_struct *sig) { }
1964#endif
1965
1945#ifdef CONFIG_RT_MUTEXES 1966#ifdef CONFIG_RT_MUTEXES
1946extern int rt_mutex_getprio(struct task_struct *p); 1967extern int rt_mutex_getprio(struct task_struct *p);
1947extern void rt_mutex_setprio(struct task_struct *p, int prio); 1968extern void rt_mutex_setprio(struct task_struct *p, int prio);
@@ -1960,9 +1981,10 @@ extern int task_nice(const struct task_struct *p);
1960extern int can_nice(const struct task_struct *p, const int nice); 1981extern int can_nice(const struct task_struct *p, const int nice);
1961extern int task_curr(const struct task_struct *p); 1982extern int task_curr(const struct task_struct *p);
1962extern int idle_cpu(int cpu); 1983extern int idle_cpu(int cpu);
1963extern int sched_setscheduler(struct task_struct *, int, struct sched_param *); 1984extern int sched_setscheduler(struct task_struct *, int,
1985 const struct sched_param *);
1964extern int sched_setscheduler_nocheck(struct task_struct *, int, 1986extern int sched_setscheduler_nocheck(struct task_struct *, int,
1965 struct sched_param *); 1987 const struct sched_param *);
1966extern struct task_struct *idle_task(int cpu); 1988extern struct task_struct *idle_task(int cpu);
1967extern struct task_struct *curr_task(int cpu); 1989extern struct task_struct *curr_task(int cpu);
1968extern void set_curr_task(int cpu, struct task_struct *p); 1990extern void set_curr_task(int cpu, struct task_struct *p);
diff --git a/init/Kconfig b/init/Kconfig
index 526ec1c7456a..8dfd094e6875 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -794,6 +794,19 @@ config NET_NS
794 794
795endif # NAMESPACES 795endif # NAMESPACES
796 796
797config SCHED_AUTOGROUP
798 bool "Automatic process group scheduling"
799 select EVENTFD
800 select CGROUPS
801 select CGROUP_SCHED
802 select FAIR_GROUP_SCHED
803 help
804 This option optimizes the scheduler for common desktop workloads by
805 automatically creating and populating task groups. This separation
806 of workloads isolates aggressive CPU burners (like build jobs) from
807 desktop applications. Task group autogeneration is currently based
808 upon task session.
809
797config MM_OWNER 810config MM_OWNER
798 bool 811 bool
799 812
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f6e726f18491..cb7a1efa9c2b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu)
189} 189}
190 190
191struct take_cpu_down_param { 191struct take_cpu_down_param {
192 struct task_struct *caller;
193 unsigned long mod; 192 unsigned long mod;
194 void *hcpu; 193 void *hcpu;
195}; 194};
@@ -198,7 +197,6 @@ struct take_cpu_down_param {
198static int __ref take_cpu_down(void *_param) 197static int __ref take_cpu_down(void *_param)
199{ 198{
200 struct take_cpu_down_param *param = _param; 199 struct take_cpu_down_param *param = _param;
201 unsigned int cpu = (unsigned long)param->hcpu;
202 int err; 200 int err;
203 201
204 /* Ensure this CPU doesn't handle any more interrupts. */ 202 /* Ensure this CPU doesn't handle any more interrupts. */
@@ -208,11 +206,6 @@ static int __ref take_cpu_down(void *_param)
208 206
209 cpu_notify(CPU_DYING | param->mod, param->hcpu); 207 cpu_notify(CPU_DYING | param->mod, param->hcpu);
210 208
211 if (task_cpu(param->caller) == cpu)
212 move_task_off_dead_cpu(cpu, param->caller);
213 /* Force idle task to run as soon as we yield: it should
214 immediately notice cpu is offline and die quickly. */
215 sched_idle_next();
216 return 0; 209 return 0;
217} 210}
218 211
@@ -223,7 +216,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
223 void *hcpu = (void *)(long)cpu; 216 void *hcpu = (void *)(long)cpu;
224 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 217 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
225 struct take_cpu_down_param tcd_param = { 218 struct take_cpu_down_param tcd_param = {
226 .caller = current,
227 .mod = mod, 219 .mod = mod,
228 .hcpu = hcpu, 220 .hcpu = hcpu,
229 }; 221 };
@@ -253,9 +245,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
253 } 245 }
254 BUG_ON(cpu_online(cpu)); 246 BUG_ON(cpu_online(cpu));
255 247
256 /* Wait for it to sleep (leaving idle task). */ 248 /*
249 * The migration_call() CPU_DYING callback will have removed all
250 * runnable tasks from the cpu, there's only the idle task left now
251 * that the migration thread is done doing the stop_machine thing.
252 *
253 * Wait for the stop thread to go away.
254 */
257 while (!idle_cpu(cpu)) 255 while (!idle_cpu(cpu))
258 yield(); 256 cpu_relax();
259 257
260 /* This actually kills the CPU. */ 258 /* This actually kills the CPU. */
261 __cpu_die(cpu); 259 __cpu_die(cpu);
diff --git a/kernel/fork.c b/kernel/fork.c
index 5447dc7defa9..7d164e25b0f0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -174,8 +174,10 @@ static inline void free_signal_struct(struct signal_struct *sig)
174 174
175static inline void put_signal_struct(struct signal_struct *sig) 175static inline void put_signal_struct(struct signal_struct *sig)
176{ 176{
177 if (atomic_dec_and_test(&sig->sigcnt)) 177 if (atomic_dec_and_test(&sig->sigcnt)) {
178 sched_autogroup_exit(sig);
178 free_signal_struct(sig); 179 free_signal_struct(sig);
180 }
179} 181}
180 182
181void __put_task_struct(struct task_struct *tsk) 183void __put_task_struct(struct task_struct *tsk)
@@ -905,6 +907,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
905 posix_cpu_timers_init_group(sig); 907 posix_cpu_timers_init_group(sig);
906 908
907 tty_audit_fork(sig); 909 tty_audit_fork(sig);
910 sched_autogroup_fork(sig);
908 911
909 sig->oom_adj = current->signal->oom_adj; 912 sig->oom_adj = current->signal->oom_adj;
910 sig->oom_score_adj = current->signal->oom_score_adj; 913 sig->oom_score_adj = current->signal->oom_score_adj;
@@ -1315,7 +1318,7 @@ bad_fork_cleanup_mm:
1315 } 1318 }
1316bad_fork_cleanup_signal: 1319bad_fork_cleanup_signal:
1317 if (!(clone_flags & CLONE_THREAD)) 1320 if (!(clone_flags & CLONE_THREAD))
1318 free_signal_struct(p->signal); 1321 put_signal_struct(p->signal);
1319bad_fork_cleanup_sighand: 1322bad_fork_cleanup_sighand:
1320 __cleanup_sighand(p->sighand); 1323 __cleanup_sighand(p->sighand);
1321bad_fork_cleanup_fs: 1324bad_fork_cleanup_fs:
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5f92acc5f952..91a5fa25054e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -577,7 +577,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
577 */ 577 */
578static int irq_thread(void *data) 578static int irq_thread(void *data)
579{ 579{
580 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; 580 static struct sched_param param = {
581 .sched_priority = MAX_USER_RT_PRIO/2,
582 };
581 struct irqaction *action = data; 583 struct irqaction *action = data;
582 struct irq_desc *desc = irq_to_desc(action->irq); 584 struct irq_desc *desc = irq_to_desc(action->irq);
583 int wake, oneshot = desc->status & IRQ_ONESHOT; 585 int wake, oneshot = desc->status & IRQ_ONESHOT;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ca61bbdd44b2..5355cfd44a3f 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
148 wait_for_completion(&create.done); 148 wait_for_completion(&create.done);
149 149
150 if (!IS_ERR(create.result)) { 150 if (!IS_ERR(create.result)) {
151 struct sched_param param = { .sched_priority = 0 }; 151 static struct sched_param param = { .sched_priority = 0 };
152 va_list args; 152 va_list args;
153 153
154 va_start(args, namefmt); 154 va_start(args, namefmt);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 200407c1502f..a5889fb28ecf 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -199,7 +199,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
199 * memory barriers as we'll eventually observe the right 199 * memory barriers as we'll eventually observe the right
200 * values at the cost of a few extra spins. 200 * values at the cost of a few extra spins.
201 */ 201 */
202 cpu_relax(); 202 arch_mutex_cpu_relax();
203 } 203 }
204#endif 204#endif
205 spin_lock_mutex(&lock->wait_lock, flags); 205 spin_lock_mutex(&lock->wait_lock, flags);
diff --git a/kernel/printk.c b/kernel/printk.c
index a23315dc4498..ab3ffc5b3b64 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1074,17 +1074,17 @@ static DEFINE_PER_CPU(int, printk_pending);
1074 1074
1075void printk_tick(void) 1075void printk_tick(void)
1076{ 1076{
1077 if (__get_cpu_var(printk_pending)) { 1077 if (__this_cpu_read(printk_pending)) {
1078 __get_cpu_var(printk_pending) = 0; 1078 __this_cpu_write(printk_pending, 0);
1079 wake_up_interruptible(&log_wait); 1079 wake_up_interruptible(&log_wait);
1080 } 1080 }
1081} 1081}
1082 1082
1083int printk_needs_cpu(int cpu) 1083int printk_needs_cpu(int cpu)
1084{ 1084{
1085 if (unlikely(cpu_is_offline(cpu))) 1085 if (cpu_is_offline(cpu))
1086 printk_tick(); 1086 printk_tick();
1087 return per_cpu(printk_pending, cpu); 1087 return __this_cpu_read(printk_pending);
1088} 1088}
1089 1089
1090void wake_up_klogd(void) 1090void wake_up_klogd(void)
diff --git a/kernel/sched.c b/kernel/sched.c
index 260132961a99..04949089e760 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,9 +75,11 @@
75 75
76#include <asm/tlb.h> 76#include <asm/tlb.h>
77#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
78#include <asm/mutex.h>
78 79
79#include "sched_cpupri.h" 80#include "sched_cpupri.h"
80#include "workqueue_sched.h" 81#include "workqueue_sched.h"
82#include "sched_autogroup.h"
81 83
82#define CREATE_TRACE_POINTS 84#define CREATE_TRACE_POINTS
83#include <trace/events/sched.h> 85#include <trace/events/sched.h>
@@ -253,6 +255,8 @@ struct task_group {
253 /* runqueue "owned" by this group on each cpu */ 255 /* runqueue "owned" by this group on each cpu */
254 struct cfs_rq **cfs_rq; 256 struct cfs_rq **cfs_rq;
255 unsigned long shares; 257 unsigned long shares;
258
259 atomic_t load_weight;
256#endif 260#endif
257 261
258#ifdef CONFIG_RT_GROUP_SCHED 262#ifdef CONFIG_RT_GROUP_SCHED
@@ -268,24 +272,19 @@ struct task_group {
268 struct task_group *parent; 272 struct task_group *parent;
269 struct list_head siblings; 273 struct list_head siblings;
270 struct list_head children; 274 struct list_head children;
275
276#ifdef CONFIG_SCHED_AUTOGROUP
277 struct autogroup *autogroup;
278#endif
271}; 279};
272 280
273#define root_task_group init_task_group 281#define root_task_group init_task_group
274 282
275/* task_group_lock serializes add/remove of task groups and also changes to 283/* task_group_lock serializes the addition/removal of task groups */
276 * a task group's cpu shares.
277 */
278static DEFINE_SPINLOCK(task_group_lock); 284static DEFINE_SPINLOCK(task_group_lock);
279 285
280#ifdef CONFIG_FAIR_GROUP_SCHED 286#ifdef CONFIG_FAIR_GROUP_SCHED
281 287
282#ifdef CONFIG_SMP
283static int root_task_group_empty(void)
284{
285 return list_empty(&root_task_group.children);
286}
287#endif
288
289# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 288# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
290 289
291/* 290/*
@@ -342,6 +341,7 @@ struct cfs_rq {
342 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 341 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
343 * list is used during load balance. 342 * list is used during load balance.
344 */ 343 */
344 int on_list;
345 struct list_head leaf_cfs_rq_list; 345 struct list_head leaf_cfs_rq_list;
346 struct task_group *tg; /* group that "owns" this runqueue */ 346 struct task_group *tg; /* group that "owns" this runqueue */
347 347
@@ -360,14 +360,17 @@ struct cfs_rq {
360 unsigned long h_load; 360 unsigned long h_load;
361 361
362 /* 362 /*
363 * this cpu's part of tg->shares 363 * Maintaining per-cpu shares distribution for group scheduling
364 *
365 * load_stamp is the last time we updated the load average
366 * load_last is the last time we updated the load average and saw load
367 * load_unacc_exec_time is currently unaccounted execution time
364 */ 368 */
365 unsigned long shares; 369 u64 load_avg;
370 u64 load_period;
371 u64 load_stamp, load_last, load_unacc_exec_time;
366 372
367 /* 373 unsigned long load_contribution;
368 * load.weight at the time we set shares
369 */
370 unsigned long rq_weight;
371#endif 374#endif
372#endif 375#endif
373}; 376};
@@ -605,11 +608,14 @@ static inline int cpu_of(struct rq *rq)
605 */ 608 */
606static inline struct task_group *task_group(struct task_struct *p) 609static inline struct task_group *task_group(struct task_struct *p)
607{ 610{
611 struct task_group *tg;
608 struct cgroup_subsys_state *css; 612 struct cgroup_subsys_state *css;
609 613
610 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 614 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
611 lockdep_is_held(&task_rq(p)->lock)); 615 lockdep_is_held(&task_rq(p)->lock));
612 return container_of(css, struct task_group, css); 616 tg = container_of(css, struct task_group, css);
617
618 return autogroup_task_group(p, tg);
613} 619}
614 620
615/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 621/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -793,20 +799,6 @@ late_initcall(sched_init_debug);
793const_debug unsigned int sysctl_sched_nr_migrate = 32; 799const_debug unsigned int sysctl_sched_nr_migrate = 32;
794 800
795/* 801/*
796 * ratelimit for updating the group shares.
797 * default: 0.25ms
798 */
799unsigned int sysctl_sched_shares_ratelimit = 250000;
800unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
801
802/*
803 * Inject some fuzzyness into changing the per-cpu group shares
804 * this avoids remote rq-locks at the expense of fairness.
805 * default: 4
806 */
807unsigned int sysctl_sched_shares_thresh = 4;
808
809/*
810 * period over which we average the RT time consumption, measured 802 * period over which we average the RT time consumption, measured
811 * in ms. 803 * in ms.
812 * 804 *
@@ -1355,6 +1347,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1355 lw->inv_weight = 0; 1347 lw->inv_weight = 0;
1356} 1348}
1357 1349
1350static inline void update_load_set(struct load_weight *lw, unsigned long w)
1351{
1352 lw->weight = w;
1353 lw->inv_weight = 0;
1354}
1355
1358/* 1356/*
1359 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1357 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1360 * of tasks with abnormal "nice" values across CPUs the contribution that 1358 * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1543,101 +1541,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1543 1541
1544#ifdef CONFIG_FAIR_GROUP_SCHED 1542#ifdef CONFIG_FAIR_GROUP_SCHED
1545 1543
1546static __read_mostly unsigned long __percpu *update_shares_data;
1547
1548static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1549
1550/*
1551 * Calculate and set the cpu's group shares.
1552 */
1553static void update_group_shares_cpu(struct task_group *tg, int cpu,
1554 unsigned long sd_shares,
1555 unsigned long sd_rq_weight,
1556 unsigned long *usd_rq_weight)
1557{
1558 unsigned long shares, rq_weight;
1559 int boost = 0;
1560
1561 rq_weight = usd_rq_weight[cpu];
1562 if (!rq_weight) {
1563 boost = 1;
1564 rq_weight = NICE_0_LOAD;
1565 }
1566
1567 /*
1568 * \Sum_j shares_j * rq_weight_i
1569 * shares_i = -----------------------------
1570 * \Sum_j rq_weight_j
1571 */
1572 shares = (sd_shares * rq_weight) / sd_rq_weight;
1573 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1574
1575 if (abs(shares - tg->se[cpu]->load.weight) >
1576 sysctl_sched_shares_thresh) {
1577 struct rq *rq = cpu_rq(cpu);
1578 unsigned long flags;
1579
1580 raw_spin_lock_irqsave(&rq->lock, flags);
1581 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1582 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1583 __set_se_shares(tg->se[cpu], shares);
1584 raw_spin_unlock_irqrestore(&rq->lock, flags);
1585 }
1586}
1587
1588/*
1589 * Re-compute the task group their per cpu shares over the given domain.
1590 * This needs to be done in a bottom-up fashion because the rq weight of a
1591 * parent group depends on the shares of its child groups.
1592 */
1593static int tg_shares_up(struct task_group *tg, void *data)
1594{
1595 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1596 unsigned long *usd_rq_weight;
1597 struct sched_domain *sd = data;
1598 unsigned long flags;
1599 int i;
1600
1601 if (!tg->se[0])
1602 return 0;
1603
1604 local_irq_save(flags);
1605 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1606
1607 for_each_cpu(i, sched_domain_span(sd)) {
1608 weight = tg->cfs_rq[i]->load.weight;
1609 usd_rq_weight[i] = weight;
1610
1611 rq_weight += weight;
1612 /*
1613 * If there are currently no tasks on the cpu pretend there
1614 * is one of average load so that when a new task gets to
1615 * run here it will not get delayed by group starvation.
1616 */
1617 if (!weight)
1618 weight = NICE_0_LOAD;
1619
1620 sum_weight += weight;
1621 shares += tg->cfs_rq[i]->shares;
1622 }
1623
1624 if (!rq_weight)
1625 rq_weight = sum_weight;
1626
1627 if ((!shares && rq_weight) || shares > tg->shares)
1628 shares = tg->shares;
1629
1630 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1631 shares = tg->shares;
1632
1633 for_each_cpu(i, sched_domain_span(sd))
1634 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1635
1636 local_irq_restore(flags);
1637
1638 return 0;
1639}
1640
1641/* 1544/*
1642 * Compute the cpu's hierarchical load factor for each task group. 1545 * Compute the cpu's hierarchical load factor for each task group.
1643 * This needs to be done in a top-down fashion because the load of a child 1546 * This needs to be done in a top-down fashion because the load of a child
@@ -1652,7 +1555,7 @@ static int tg_load_down(struct task_group *tg, void *data)
1652 load = cpu_rq(cpu)->load.weight; 1555 load = cpu_rq(cpu)->load.weight;
1653 } else { 1556 } else {
1654 load = tg->parent->cfs_rq[cpu]->h_load; 1557 load = tg->parent->cfs_rq[cpu]->h_load;
1655 load *= tg->cfs_rq[cpu]->shares; 1558 load *= tg->se[cpu]->load.weight;
1656 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1559 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1657 } 1560 }
1658 1561
@@ -1661,34 +1564,11 @@ static int tg_load_down(struct task_group *tg, void *data)
1661 return 0; 1564 return 0;
1662} 1565}
1663 1566
1664static void update_shares(struct sched_domain *sd)
1665{
1666 s64 elapsed;
1667 u64 now;
1668
1669 if (root_task_group_empty())
1670 return;
1671
1672 now = local_clock();
1673 elapsed = now - sd->last_update;
1674
1675 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1676 sd->last_update = now;
1677 walk_tg_tree(tg_nop, tg_shares_up, sd);
1678 }
1679}
1680
1681static void update_h_load(long cpu) 1567static void update_h_load(long cpu)
1682{ 1568{
1683 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1569 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1684} 1570}
1685 1571
1686#else
1687
1688static inline void update_shares(struct sched_domain *sd)
1689{
1690}
1691
1692#endif 1572#endif
1693 1573
1694#ifdef CONFIG_PREEMPT 1574#ifdef CONFIG_PREEMPT
@@ -1810,15 +1690,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1810 1690
1811#endif 1691#endif
1812 1692
1813#ifdef CONFIG_FAIR_GROUP_SCHED
1814static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1815{
1816#ifdef CONFIG_SMP
1817 cfs_rq->shares = shares;
1818#endif
1819}
1820#endif
1821
1822static void calc_load_account_idle(struct rq *this_rq); 1693static void calc_load_account_idle(struct rq *this_rq);
1823static void update_sysctl(void); 1694static void update_sysctl(void);
1824static int get_update_sysctl_factor(void); 1695static int get_update_sysctl_factor(void);
@@ -2063,6 +1934,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
2063#include "sched_idletask.c" 1934#include "sched_idletask.c"
2064#include "sched_fair.c" 1935#include "sched_fair.c"
2065#include "sched_rt.c" 1936#include "sched_rt.c"
1937#include "sched_autogroup.c"
2066#include "sched_stoptask.c" 1938#include "sched_stoptask.c"
2067#ifdef CONFIG_SCHED_DEBUG 1939#ifdef CONFIG_SCHED_DEBUG
2068# include "sched_debug.c" 1940# include "sched_debug.c"
@@ -2255,10 +2127,8 @@ static int migration_cpu_stop(void *data);
2255 * The task's runqueue lock must be held. 2127 * The task's runqueue lock must be held.
2256 * Returns true if you have to wait for migration thread. 2128 * Returns true if you have to wait for migration thread.
2257 */ 2129 */
2258static bool migrate_task(struct task_struct *p, int dest_cpu) 2130static bool migrate_task(struct task_struct *p, struct rq *rq)
2259{ 2131{
2260 struct rq *rq = task_rq(p);
2261
2262 /* 2132 /*
2263 * If the task is not on a runqueue (and not running), then 2133 * If the task is not on a runqueue (and not running), then
2264 * the next wake-up will properly place the task. 2134 * the next wake-up will properly place the task.
@@ -2438,18 +2308,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2438 return dest_cpu; 2308 return dest_cpu;
2439 2309
2440 /* No more Mr. Nice Guy. */ 2310 /* No more Mr. Nice Guy. */
2441 if (unlikely(dest_cpu >= nr_cpu_ids)) { 2311 dest_cpu = cpuset_cpus_allowed_fallback(p);
2442 dest_cpu = cpuset_cpus_allowed_fallback(p); 2312 /*
2443 /* 2313 * Don't tell them about moving exiting tasks or
2444 * Don't tell them about moving exiting tasks or 2314 * kernel threads (both mm NULL), since they never
2445 * kernel threads (both mm NULL), since they never 2315 * leave kernel.
2446 * leave kernel. 2316 */
2447 */ 2317 if (p->mm && printk_ratelimit()) {
2448 if (p->mm && printk_ratelimit()) { 2318 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
2449 printk(KERN_INFO "process %d (%s) no " 2319 task_pid_nr(p), p->comm, cpu);
2450 "longer affine to cpu%d\n",
2451 task_pid_nr(p), p->comm, cpu);
2452 }
2453 } 2320 }
2454 2321
2455 return dest_cpu; 2322 return dest_cpu;
@@ -2785,7 +2652,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
2785 /* Want to start with kernel preemption disabled. */ 2652 /* Want to start with kernel preemption disabled. */
2786 task_thread_info(p)->preempt_count = 1; 2653 task_thread_info(p)->preempt_count = 1;
2787#endif 2654#endif
2655#ifdef CONFIG_SMP
2788 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2656 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2657#endif
2789 2658
2790 put_cpu(); 2659 put_cpu();
2791} 2660}
@@ -3549,7 +3418,7 @@ void sched_exec(void)
3549 * select_task_rq() can race against ->cpus_allowed 3418 * select_task_rq() can race against ->cpus_allowed
3550 */ 3419 */
3551 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && 3420 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3552 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { 3421 likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
3553 struct migration_arg arg = { p, dest_cpu }; 3422 struct migration_arg arg = { p, dest_cpu };
3554 3423
3555 task_rq_unlock(rq, &flags); 3424 task_rq_unlock(rq, &flags);
@@ -4214,7 +4083,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4214 if (task_thread_info(rq->curr) != owner || need_resched()) 4083 if (task_thread_info(rq->curr) != owner || need_resched())
4215 return 0; 4084 return 0;
4216 4085
4217 cpu_relax(); 4086 arch_mutex_cpu_relax();
4218 } 4087 }
4219 4088
4220 return 1; 4089 return 1;
@@ -4526,7 +4395,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
4526 * This waits for either a completion of a specific task to be signaled or for a 4395 * This waits for either a completion of a specific task to be signaled or for a
4527 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4396 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4528 */ 4397 */
4529unsigned long __sched 4398long __sched
4530wait_for_completion_interruptible_timeout(struct completion *x, 4399wait_for_completion_interruptible_timeout(struct completion *x,
4531 unsigned long timeout) 4400 unsigned long timeout)
4532{ 4401{
@@ -4559,7 +4428,7 @@ EXPORT_SYMBOL(wait_for_completion_killable);
4559 * signaled or for a specified timeout to expire. It can be 4428 * signaled or for a specified timeout to expire. It can be
4560 * interrupted by a kill signal. The timeout is in jiffies. 4429 * interrupted by a kill signal. The timeout is in jiffies.
4561 */ 4430 */
4562unsigned long __sched 4431long __sched
4563wait_for_completion_killable_timeout(struct completion *x, 4432wait_for_completion_killable_timeout(struct completion *x,
4564 unsigned long timeout) 4433 unsigned long timeout)
4565{ 4434{
@@ -4901,7 +4770,7 @@ static bool check_same_owner(struct task_struct *p)
4901} 4770}
4902 4771
4903static int __sched_setscheduler(struct task_struct *p, int policy, 4772static int __sched_setscheduler(struct task_struct *p, int policy,
4904 struct sched_param *param, bool user) 4773 const struct sched_param *param, bool user)
4905{ 4774{
4906 int retval, oldprio, oldpolicy = -1, on_rq, running; 4775 int retval, oldprio, oldpolicy = -1, on_rq, running;
4907 unsigned long flags; 4776 unsigned long flags;
@@ -5056,7 +4925,7 @@ recheck:
5056 * NOTE that the task may be already dead. 4925 * NOTE that the task may be already dead.
5057 */ 4926 */
5058int sched_setscheduler(struct task_struct *p, int policy, 4927int sched_setscheduler(struct task_struct *p, int policy,
5059 struct sched_param *param) 4928 const struct sched_param *param)
5060{ 4929{
5061 return __sched_setscheduler(p, policy, param, true); 4930 return __sched_setscheduler(p, policy, param, true);
5062} 4931}
@@ -5074,7 +4943,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
5074 * but our caller might not have that capability. 4943 * but our caller might not have that capability.
5075 */ 4944 */
5076int sched_setscheduler_nocheck(struct task_struct *p, int policy, 4945int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5077 struct sched_param *param) 4946 const struct sched_param *param)
5078{ 4947{
5079 return __sched_setscheduler(p, policy, param, false); 4948 return __sched_setscheduler(p, policy, param, false);
5080} 4949}
@@ -5590,7 +5459,7 @@ void sched_show_task(struct task_struct *p)
5590 unsigned state; 5459 unsigned state;
5591 5460
5592 state = p->state ? __ffs(p->state) + 1 : 0; 5461 state = p->state ? __ffs(p->state) + 1 : 0;
5593 printk(KERN_INFO "%-13.13s %c", p->comm, 5462 printk(KERN_INFO "%-15.15s %c", p->comm,
5594 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5463 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5595#if BITS_PER_LONG == 32 5464#if BITS_PER_LONG == 32
5596 if (state == TASK_RUNNING) 5465 if (state == TASK_RUNNING)
@@ -5754,7 +5623,6 @@ static void update_sysctl(void)
5754 SET_SYSCTL(sched_min_granularity); 5623 SET_SYSCTL(sched_min_granularity);
5755 SET_SYSCTL(sched_latency); 5624 SET_SYSCTL(sched_latency);
5756 SET_SYSCTL(sched_wakeup_granularity); 5625 SET_SYSCTL(sched_wakeup_granularity);
5757 SET_SYSCTL(sched_shares_ratelimit);
5758#undef SET_SYSCTL 5626#undef SET_SYSCTL
5759} 5627}
5760 5628
@@ -5830,7 +5698,7 @@ again:
5830 goto out; 5698 goto out;
5831 5699
5832 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5700 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5833 if (migrate_task(p, dest_cpu)) { 5701 if (migrate_task(p, rq)) {
5834 struct migration_arg arg = { p, dest_cpu }; 5702 struct migration_arg arg = { p, dest_cpu };
5835 /* Need help from migration thread: drop lock and wait. */ 5703 /* Need help from migration thread: drop lock and wait. */
5836 task_rq_unlock(rq, &flags); 5704 task_rq_unlock(rq, &flags);
@@ -5912,29 +5780,20 @@ static int migration_cpu_stop(void *data)
5912} 5780}
5913 5781
5914#ifdef CONFIG_HOTPLUG_CPU 5782#ifdef CONFIG_HOTPLUG_CPU
5783
5915/* 5784/*
5916 * Figure out where task on dead CPU should go, use force if necessary. 5785 * Ensures that the idle task is using init_mm right before its cpu goes
5786 * offline.
5917 */ 5787 */
5918void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5788void idle_task_exit(void)
5919{ 5789{
5920 struct rq *rq = cpu_rq(dead_cpu); 5790 struct mm_struct *mm = current->active_mm;
5921 int needs_cpu, uninitialized_var(dest_cpu);
5922 unsigned long flags;
5923 5791
5924 local_irq_save(flags); 5792 BUG_ON(cpu_online(smp_processor_id()));
5925 5793
5926 raw_spin_lock(&rq->lock); 5794 if (mm != &init_mm)
5927 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); 5795 switch_mm(mm, &init_mm, current);
5928 if (needs_cpu) 5796 mmdrop(mm);
5929 dest_cpu = select_fallback_rq(dead_cpu, p);
5930 raw_spin_unlock(&rq->lock);
5931 /*
5932 * It can only fail if we race with set_cpus_allowed(),
5933 * in the racer should migrate the task anyway.
5934 */
5935 if (needs_cpu)
5936 __migrate_task(p, dead_cpu, dest_cpu);
5937 local_irq_restore(flags);
5938} 5797}
5939 5798
5940/* 5799/*
@@ -5947,128 +5806,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5947static void migrate_nr_uninterruptible(struct rq *rq_src) 5806static void migrate_nr_uninterruptible(struct rq *rq_src)
5948{ 5807{
5949 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 5808 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5950 unsigned long flags;
5951 5809
5952 local_irq_save(flags);
5953 double_rq_lock(rq_src, rq_dest);
5954 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 5810 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5955 rq_src->nr_uninterruptible = 0; 5811 rq_src->nr_uninterruptible = 0;
5956 double_rq_unlock(rq_src, rq_dest);
5957 local_irq_restore(flags);
5958}
5959
5960/* Run through task list and migrate tasks from the dead cpu. */
5961static void migrate_live_tasks(int src_cpu)
5962{
5963 struct task_struct *p, *t;
5964
5965 read_lock(&tasklist_lock);
5966
5967 do_each_thread(t, p) {
5968 if (p == current)
5969 continue;
5970
5971 if (task_cpu(p) == src_cpu)
5972 move_task_off_dead_cpu(src_cpu, p);
5973 } while_each_thread(t, p);
5974
5975 read_unlock(&tasklist_lock);
5976} 5812}
5977 5813
5978/* 5814/*
5979 * Schedules idle task to be the next runnable task on current CPU. 5815 * remove the tasks which were accounted by rq from calc_load_tasks.
5980 * It does so by boosting its priority to highest possible.
5981 * Used by CPU offline code.
5982 */ 5816 */
5983void sched_idle_next(void) 5817static void calc_global_load_remove(struct rq *rq)
5984{ 5818{
5985 int this_cpu = smp_processor_id(); 5819 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5986 struct rq *rq = cpu_rq(this_cpu); 5820 rq->calc_load_active = 0;
5987 struct task_struct *p = rq->idle;
5988 unsigned long flags;
5989
5990 /* cpu has to be offline */
5991 BUG_ON(cpu_online(this_cpu));
5992
5993 /*
5994 * Strictly not necessary since rest of the CPUs are stopped by now
5995 * and interrupts disabled on the current cpu.
5996 */
5997 raw_spin_lock_irqsave(&rq->lock, flags);
5998
5999 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
6000
6001 activate_task(rq, p, 0);
6002
6003 raw_spin_unlock_irqrestore(&rq->lock, flags);
6004} 5821}
6005 5822
6006/* 5823/*
6007 * Ensures that the idle task is using init_mm right before its cpu goes 5824 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6008 * offline. 5825 * try_to_wake_up()->select_task_rq().
5826 *
5827 * Called with rq->lock held even though we'er in stop_machine() and
5828 * there's no concurrency possible, we hold the required locks anyway
5829 * because of lock validation efforts.
6009 */ 5830 */
6010void idle_task_exit(void) 5831static void migrate_tasks(unsigned int dead_cpu)
6011{
6012 struct mm_struct *mm = current->active_mm;
6013
6014 BUG_ON(cpu_online(smp_processor_id()));
6015
6016 if (mm != &init_mm)
6017 switch_mm(mm, &init_mm, current);
6018 mmdrop(mm);
6019}
6020
6021/* called under rq->lock with disabled interrupts */
6022static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
6023{ 5832{
6024 struct rq *rq = cpu_rq(dead_cpu); 5833 struct rq *rq = cpu_rq(dead_cpu);
6025 5834 struct task_struct *next, *stop = rq->stop;
6026 /* Must be exiting, otherwise would be on tasklist. */ 5835 int dest_cpu;
6027 BUG_ON(!p->exit_state);
6028
6029 /* Cannot have done final schedule yet: would have vanished. */
6030 BUG_ON(p->state == TASK_DEAD);
6031
6032 get_task_struct(p);
6033 5836
6034 /* 5837 /*
6035 * Drop lock around migration; if someone else moves it, 5838 * Fudge the rq selection such that the below task selection loop
6036 * that's OK. No task can be added to this CPU, so iteration is 5839 * doesn't get stuck on the currently eligible stop task.
6037 * fine. 5840 *
5841 * We're currently inside stop_machine() and the rq is either stuck
5842 * in the stop_machine_cpu_stop() loop, or we're executing this code,
5843 * either way we should never end up calling schedule() until we're
5844 * done here.
6038 */ 5845 */
6039 raw_spin_unlock_irq(&rq->lock); 5846 rq->stop = NULL;
6040 move_task_off_dead_cpu(dead_cpu, p);
6041 raw_spin_lock_irq(&rq->lock);
6042
6043 put_task_struct(p);
6044}
6045
6046/* release_task() removes task from tasklist, so we won't find dead tasks. */
6047static void migrate_dead_tasks(unsigned int dead_cpu)
6048{
6049 struct rq *rq = cpu_rq(dead_cpu);
6050 struct task_struct *next;
6051 5847
6052 for ( ; ; ) { 5848 for ( ; ; ) {
6053 if (!rq->nr_running) 5849 /*
5850 * There's this thread running, bail when that's the only
5851 * remaining thread.
5852 */
5853 if (rq->nr_running == 1)
6054 break; 5854 break;
5855
6055 next = pick_next_task(rq); 5856 next = pick_next_task(rq);
6056 if (!next) 5857 BUG_ON(!next);
6057 break;
6058 next->sched_class->put_prev_task(rq, next); 5858 next->sched_class->put_prev_task(rq, next);
6059 migrate_dead(dead_cpu, next);
6060 5859
5860 /* Find suitable destination for @next, with force if needed. */
5861 dest_cpu = select_fallback_rq(dead_cpu, next);
5862 raw_spin_unlock(&rq->lock);
5863
5864 __migrate_task(next, dead_cpu, dest_cpu);
5865
5866 raw_spin_lock(&rq->lock);
6061 } 5867 }
6062}
6063 5868
6064/* 5869 rq->stop = stop;
6065 * remove the tasks which were accounted by rq from calc_load_tasks.
6066 */
6067static void calc_global_load_remove(struct rq *rq)
6068{
6069 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
6070 rq->calc_load_active = 0;
6071} 5870}
5871
6072#endif /* CONFIG_HOTPLUG_CPU */ 5872#endif /* CONFIG_HOTPLUG_CPU */
6073 5873
6074#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5874#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -6278,15 +6078,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6278 unsigned long flags; 6078 unsigned long flags;
6279 struct rq *rq = cpu_rq(cpu); 6079 struct rq *rq = cpu_rq(cpu);
6280 6080
6281 switch (action) { 6081 switch (action & ~CPU_TASKS_FROZEN) {
6282 6082
6283 case CPU_UP_PREPARE: 6083 case CPU_UP_PREPARE:
6284 case CPU_UP_PREPARE_FROZEN:
6285 rq->calc_load_update = calc_load_update; 6084 rq->calc_load_update = calc_load_update;
6286 break; 6085 break;
6287 6086
6288 case CPU_ONLINE: 6087 case CPU_ONLINE:
6289 case CPU_ONLINE_FROZEN:
6290 /* Update our root-domain */ 6088 /* Update our root-domain */
6291 raw_spin_lock_irqsave(&rq->lock, flags); 6089 raw_spin_lock_irqsave(&rq->lock, flags);
6292 if (rq->rd) { 6090 if (rq->rd) {
@@ -6298,30 +6096,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6298 break; 6096 break;
6299 6097
6300#ifdef CONFIG_HOTPLUG_CPU 6098#ifdef CONFIG_HOTPLUG_CPU
6301 case CPU_DEAD:
6302 case CPU_DEAD_FROZEN:
6303 migrate_live_tasks(cpu);
6304 /* Idle task back to normal (off runqueue, low prio) */
6305 raw_spin_lock_irq(&rq->lock);
6306 deactivate_task(rq, rq->idle, 0);
6307 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
6308 rq->idle->sched_class = &idle_sched_class;
6309 migrate_dead_tasks(cpu);
6310 raw_spin_unlock_irq(&rq->lock);
6311 migrate_nr_uninterruptible(rq);
6312 BUG_ON(rq->nr_running != 0);
6313 calc_global_load_remove(rq);
6314 break;
6315
6316 case CPU_DYING: 6099 case CPU_DYING:
6317 case CPU_DYING_FROZEN:
6318 /* Update our root-domain */ 6100 /* Update our root-domain */
6319 raw_spin_lock_irqsave(&rq->lock, flags); 6101 raw_spin_lock_irqsave(&rq->lock, flags);
6320 if (rq->rd) { 6102 if (rq->rd) {
6321 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6103 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6322 set_rq_offline(rq); 6104 set_rq_offline(rq);
6323 } 6105 }
6106 migrate_tasks(cpu);
6107 BUG_ON(rq->nr_running != 1); /* the migration thread */
6324 raw_spin_unlock_irqrestore(&rq->lock, flags); 6108 raw_spin_unlock_irqrestore(&rq->lock, flags);
6109
6110 migrate_nr_uninterruptible(rq);
6111 calc_global_load_remove(rq);
6325 break; 6112 break;
6326#endif 6113#endif
6327 } 6114 }
@@ -8052,15 +7839,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8052 7839
8053#ifdef CONFIG_FAIR_GROUP_SCHED 7840#ifdef CONFIG_FAIR_GROUP_SCHED
8054static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 7841static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8055 struct sched_entity *se, int cpu, int add, 7842 struct sched_entity *se, int cpu,
8056 struct sched_entity *parent) 7843 struct sched_entity *parent)
8057{ 7844{
8058 struct rq *rq = cpu_rq(cpu); 7845 struct rq *rq = cpu_rq(cpu);
8059 tg->cfs_rq[cpu] = cfs_rq; 7846 tg->cfs_rq[cpu] = cfs_rq;
8060 init_cfs_rq(cfs_rq, rq); 7847 init_cfs_rq(cfs_rq, rq);
8061 cfs_rq->tg = tg; 7848 cfs_rq->tg = tg;
8062 if (add)
8063 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
8064 7849
8065 tg->se[cpu] = se; 7850 tg->se[cpu] = se;
8066 /* se could be NULL for init_task_group */ 7851 /* se could be NULL for init_task_group */
@@ -8073,15 +7858,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8073 se->cfs_rq = parent->my_q; 7858 se->cfs_rq = parent->my_q;
8074 7859
8075 se->my_q = cfs_rq; 7860 se->my_q = cfs_rq;
8076 se->load.weight = tg->shares; 7861 update_load_set(&se->load, 0);
8077 se->load.inv_weight = 0;
8078 se->parent = parent; 7862 se->parent = parent;
8079} 7863}
8080#endif 7864#endif
8081 7865
8082#ifdef CONFIG_RT_GROUP_SCHED 7866#ifdef CONFIG_RT_GROUP_SCHED
8083static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 7867static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8084 struct sched_rt_entity *rt_se, int cpu, int add, 7868 struct sched_rt_entity *rt_se, int cpu,
8085 struct sched_rt_entity *parent) 7869 struct sched_rt_entity *parent)
8086{ 7870{
8087 struct rq *rq = cpu_rq(cpu); 7871 struct rq *rq = cpu_rq(cpu);
@@ -8090,8 +7874,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8090 init_rt_rq(rt_rq, rq); 7874 init_rt_rq(rt_rq, rq);
8091 rt_rq->tg = tg; 7875 rt_rq->tg = tg;
8092 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7876 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8093 if (add)
8094 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
8095 7877
8096 tg->rt_se[cpu] = rt_se; 7878 tg->rt_se[cpu] = rt_se;
8097 if (!rt_se) 7879 if (!rt_se)
@@ -8164,13 +7946,9 @@ void __init sched_init(void)
8164#ifdef CONFIG_CGROUP_SCHED 7946#ifdef CONFIG_CGROUP_SCHED
8165 list_add(&init_task_group.list, &task_groups); 7947 list_add(&init_task_group.list, &task_groups);
8166 INIT_LIST_HEAD(&init_task_group.children); 7948 INIT_LIST_HEAD(&init_task_group.children);
8167 7949 autogroup_init(&init_task);
8168#endif /* CONFIG_CGROUP_SCHED */ 7950#endif /* CONFIG_CGROUP_SCHED */
8169 7951
8170#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
8171 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
8172 __alignof__(unsigned long));
8173#endif
8174 for_each_possible_cpu(i) { 7952 for_each_possible_cpu(i) {
8175 struct rq *rq; 7953 struct rq *rq;
8176 7954
@@ -8184,7 +7962,6 @@ void __init sched_init(void)
8184#ifdef CONFIG_FAIR_GROUP_SCHED 7962#ifdef CONFIG_FAIR_GROUP_SCHED
8185 init_task_group.shares = init_task_group_load; 7963 init_task_group.shares = init_task_group_load;
8186 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7964 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8187#ifdef CONFIG_CGROUP_SCHED
8188 /* 7965 /*
8189 * How much cpu bandwidth does init_task_group get? 7966 * How much cpu bandwidth does init_task_group get?
8190 * 7967 *
@@ -8204,16 +7981,13 @@ void __init sched_init(void)
8204 * We achieve this by letting init_task_group's tasks sit 7981 * We achieve this by letting init_task_group's tasks sit
8205 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7982 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
8206 */ 7983 */
8207 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7984 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL);
8208#endif
8209#endif /* CONFIG_FAIR_GROUP_SCHED */ 7985#endif /* CONFIG_FAIR_GROUP_SCHED */
8210 7986
8211 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 7987 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
8212#ifdef CONFIG_RT_GROUP_SCHED 7988#ifdef CONFIG_RT_GROUP_SCHED
8213 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7989 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
8214#ifdef CONFIG_CGROUP_SCHED 7990 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL);
8215 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
8216#endif
8217#endif 7991#endif
8218 7992
8219 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7993 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -8486,7 +8260,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8486 if (!se) 8260 if (!se)
8487 goto err_free_rq; 8261 goto err_free_rq;
8488 8262
8489 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 8263 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8490 } 8264 }
8491 8265
8492 return 1; 8266 return 1;
@@ -8497,15 +8271,21 @@ err:
8497 return 0; 8271 return 0;
8498} 8272}
8499 8273
8500static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8501{
8502 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
8503 &cpu_rq(cpu)->leaf_cfs_rq_list);
8504}
8505
8506static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8274static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8507{ 8275{
8508 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8276 struct rq *rq = cpu_rq(cpu);
8277 unsigned long flags;
8278
8279 /*
8280 * Only empty task groups can be destroyed; so we can speculatively
8281 * check on_list without danger of it being re-added.
8282 */
8283 if (!tg->cfs_rq[cpu]->on_list)
8284 return;
8285
8286 raw_spin_lock_irqsave(&rq->lock, flags);
8287 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8288 raw_spin_unlock_irqrestore(&rq->lock, flags);
8509} 8289}
8510#else /* !CONFG_FAIR_GROUP_SCHED */ 8290#else /* !CONFG_FAIR_GROUP_SCHED */
8511static inline void free_fair_sched_group(struct task_group *tg) 8291static inline void free_fair_sched_group(struct task_group *tg)
@@ -8518,10 +8298,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8518 return 1; 8298 return 1;
8519} 8299}
8520 8300
8521static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8522{
8523}
8524
8525static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8301static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8526{ 8302{
8527} 8303}
@@ -8576,7 +8352,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8576 if (!rt_se) 8352 if (!rt_se)
8577 goto err_free_rq; 8353 goto err_free_rq;
8578 8354
8579 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 8355 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8580 } 8356 }
8581 8357
8582 return 1; 8358 return 1;
@@ -8586,17 +8362,6 @@ err_free_rq:
8586err: 8362err:
8587 return 0; 8363 return 0;
8588} 8364}
8589
8590static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8591{
8592 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
8593 &cpu_rq(cpu)->leaf_rt_rq_list);
8594}
8595
8596static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8597{
8598 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8599}
8600#else /* !CONFIG_RT_GROUP_SCHED */ 8365#else /* !CONFIG_RT_GROUP_SCHED */
8601static inline void free_rt_sched_group(struct task_group *tg) 8366static inline void free_rt_sched_group(struct task_group *tg)
8602{ 8367{
@@ -8607,14 +8372,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8607{ 8372{
8608 return 1; 8373 return 1;
8609} 8374}
8610
8611static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8612{
8613}
8614
8615static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8616{
8617}
8618#endif /* CONFIG_RT_GROUP_SCHED */ 8375#endif /* CONFIG_RT_GROUP_SCHED */
8619 8376
8620#ifdef CONFIG_CGROUP_SCHED 8377#ifdef CONFIG_CGROUP_SCHED
@@ -8630,7 +8387,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8630{ 8387{
8631 struct task_group *tg; 8388 struct task_group *tg;
8632 unsigned long flags; 8389 unsigned long flags;
8633 int i;
8634 8390
8635 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8391 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8636 if (!tg) 8392 if (!tg)
@@ -8643,10 +8399,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8643 goto err; 8399 goto err;
8644 8400
8645 spin_lock_irqsave(&task_group_lock, flags); 8401 spin_lock_irqsave(&task_group_lock, flags);
8646 for_each_possible_cpu(i) {
8647 register_fair_sched_group(tg, i);
8648 register_rt_sched_group(tg, i);
8649 }
8650 list_add_rcu(&tg->list, &task_groups); 8402 list_add_rcu(&tg->list, &task_groups);
8651 8403
8652 WARN_ON(!parent); /* root should already exist */ 8404 WARN_ON(!parent); /* root should already exist */
@@ -8676,11 +8428,11 @@ void sched_destroy_group(struct task_group *tg)
8676 unsigned long flags; 8428 unsigned long flags;
8677 int i; 8429 int i;
8678 8430
8679 spin_lock_irqsave(&task_group_lock, flags); 8431 /* end participation in shares distribution */
8680 for_each_possible_cpu(i) { 8432 for_each_possible_cpu(i)
8681 unregister_fair_sched_group(tg, i); 8433 unregister_fair_sched_group(tg, i);
8682 unregister_rt_sched_group(tg, i); 8434
8683 } 8435 spin_lock_irqsave(&task_group_lock, flags);
8684 list_del_rcu(&tg->list); 8436 list_del_rcu(&tg->list);
8685 list_del_rcu(&tg->siblings); 8437 list_del_rcu(&tg->siblings);
8686 spin_unlock_irqrestore(&task_group_lock, flags); 8438 spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8727,33 +8479,6 @@ void sched_move_task(struct task_struct *tsk)
8727#endif /* CONFIG_CGROUP_SCHED */ 8479#endif /* CONFIG_CGROUP_SCHED */
8728 8480
8729#ifdef CONFIG_FAIR_GROUP_SCHED 8481#ifdef CONFIG_FAIR_GROUP_SCHED
8730static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8731{
8732 struct cfs_rq *cfs_rq = se->cfs_rq;
8733 int on_rq;
8734
8735 on_rq = se->on_rq;
8736 if (on_rq)
8737 dequeue_entity(cfs_rq, se, 0);
8738
8739 se->load.weight = shares;
8740 se->load.inv_weight = 0;
8741
8742 if (on_rq)
8743 enqueue_entity(cfs_rq, se, 0);
8744}
8745
8746static void set_se_shares(struct sched_entity *se, unsigned long shares)
8747{
8748 struct cfs_rq *cfs_rq = se->cfs_rq;
8749 struct rq *rq = cfs_rq->rq;
8750 unsigned long flags;
8751
8752 raw_spin_lock_irqsave(&rq->lock, flags);
8753 __set_se_shares(se, shares);
8754 raw_spin_unlock_irqrestore(&rq->lock, flags);
8755}
8756
8757static DEFINE_MUTEX(shares_mutex); 8482static DEFINE_MUTEX(shares_mutex);
8758 8483
8759int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8484int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@ -8776,37 +8501,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8776 if (tg->shares == shares) 8501 if (tg->shares == shares)
8777 goto done; 8502 goto done;
8778 8503
8779 spin_lock_irqsave(&task_group_lock, flags);
8780 for_each_possible_cpu(i)
8781 unregister_fair_sched_group(tg, i);
8782 list_del_rcu(&tg->siblings);
8783 spin_unlock_irqrestore(&task_group_lock, flags);
8784
8785 /* wait for any ongoing reference to this group to finish */
8786 synchronize_sched();
8787
8788 /*
8789 * Now we are free to modify the group's share on each cpu
8790 * w/o tripping rebalance_share or load_balance_fair.
8791 */
8792 tg->shares = shares; 8504 tg->shares = shares;
8793 for_each_possible_cpu(i) { 8505 for_each_possible_cpu(i) {
8794 /* 8506 struct rq *rq = cpu_rq(i);
8795 * force a rebalance 8507 struct sched_entity *se;
8796 */ 8508
8797 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8509 se = tg->se[i];
8798 set_se_shares(tg->se[i], shares); 8510 /* Propagate contribution to hierarchy */
8511 raw_spin_lock_irqsave(&rq->lock, flags);
8512 for_each_sched_entity(se)
8513 update_cfs_shares(group_cfs_rq(se), 0);
8514 raw_spin_unlock_irqrestore(&rq->lock, flags);
8799 } 8515 }
8800 8516
8801 /*
8802 * Enable load balance activity on this group, by inserting it back on
8803 * each cpu's rq->leaf_cfs_rq_list.
8804 */
8805 spin_lock_irqsave(&task_group_lock, flags);
8806 for_each_possible_cpu(i)
8807 register_fair_sched_group(tg, i);
8808 list_add_rcu(&tg->siblings, &tg->parent->children);
8809 spin_unlock_irqrestore(&task_group_lock, flags);
8810done: 8517done:
8811 mutex_unlock(&shares_mutex); 8518 mutex_unlock(&shares_mutex);
8812 return 0; 8519 return 0;
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
new file mode 100644
index 000000000000..c80fedcd476b
--- /dev/null
+++ b/kernel/sched_autogroup.c
@@ -0,0 +1,238 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3#include <linux/proc_fs.h>
4#include <linux/seq_file.h>
5#include <linux/kallsyms.h>
6#include <linux/utsname.h>
7
8unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
9static struct autogroup autogroup_default;
10static atomic_t autogroup_seq_nr;
11
12static void autogroup_init(struct task_struct *init_task)
13{
14 autogroup_default.tg = &init_task_group;
15 init_task_group.autogroup = &autogroup_default;
16 kref_init(&autogroup_default.kref);
17 init_rwsem(&autogroup_default.lock);
18 init_task->signal->autogroup = &autogroup_default;
19}
20
21static inline void autogroup_free(struct task_group *tg)
22{
23 kfree(tg->autogroup);
24}
25
26static inline void autogroup_destroy(struct kref *kref)
27{
28 struct autogroup *ag = container_of(kref, struct autogroup, kref);
29
30 sched_destroy_group(ag->tg);
31}
32
33static inline void autogroup_kref_put(struct autogroup *ag)
34{
35 kref_put(&ag->kref, autogroup_destroy);
36}
37
38static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
39{
40 kref_get(&ag->kref);
41 return ag;
42}
43
44static inline struct autogroup *autogroup_task_get(struct task_struct *p)
45{
46 struct autogroup *ag;
47 unsigned long flags;
48
49 if (!lock_task_sighand(p, &flags))
50 return autogroup_kref_get(&autogroup_default);
51
52 ag = autogroup_kref_get(p->signal->autogroup);
53 unlock_task_sighand(p, &flags);
54
55 return ag;
56}
57
58static inline struct autogroup *autogroup_create(void)
59{
60 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
61 struct task_group *tg;
62
63 if (!ag)
64 goto out_fail;
65
66 tg = sched_create_group(&init_task_group);
67
68 if (IS_ERR(tg))
69 goto out_free;
70
71 kref_init(&ag->kref);
72 init_rwsem(&ag->lock);
73 ag->id = atomic_inc_return(&autogroup_seq_nr);
74 ag->tg = tg;
75 tg->autogroup = ag;
76
77 return ag;
78
79out_free:
80 kfree(ag);
81out_fail:
82 if (printk_ratelimit()) {
83 printk(KERN_WARNING "autogroup_create: %s failure.\n",
84 ag ? "sched_create_group()" : "kmalloc()");
85 }
86
87 return autogroup_kref_get(&autogroup_default);
88}
89
90static inline bool
91task_wants_autogroup(struct task_struct *p, struct task_group *tg)
92{
93 if (tg != &root_task_group)
94 return false;
95
96 if (p->sched_class != &fair_sched_class)
97 return false;
98
99 /*
100 * We can only assume the task group can't go away on us if
101 * autogroup_move_group() can see us on ->thread_group list.
102 */
103 if (p->flags & PF_EXITING)
104 return false;
105
106 return true;
107}
108
109static inline struct task_group *
110autogroup_task_group(struct task_struct *p, struct task_group *tg)
111{
112 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
113
114 if (enabled && task_wants_autogroup(p, tg))
115 return p->signal->autogroup->tg;
116
117 return tg;
118}
119
120static void
121autogroup_move_group(struct task_struct *p, struct autogroup *ag)
122{
123 struct autogroup *prev;
124 struct task_struct *t;
125 unsigned long flags;
126
127 BUG_ON(!lock_task_sighand(p, &flags));
128
129 prev = p->signal->autogroup;
130 if (prev == ag) {
131 unlock_task_sighand(p, &flags);
132 return;
133 }
134
135 p->signal->autogroup = autogroup_kref_get(ag);
136
137 t = p;
138 do {
139 sched_move_task(t);
140 } while_each_thread(p, t);
141
142 unlock_task_sighand(p, &flags);
143 autogroup_kref_put(prev);
144}
145
/*
 * Create a fresh autogroup and move the thread group of @p into it.
 * Allocates GFP_KERNEL, cannot be called under any spinlock.
 */
void sched_autogroup_create_attach(struct task_struct *p)
{
	struct autogroup *fresh = autogroup_create();

	autogroup_move_group(p, fresh);

	/* drop the extra reference handed to us by autogroup_create() */
	autogroup_kref_put(fresh);
}
EXPORT_SYMBOL(sched_autogroup_create_attach);
156
157/* Cannot be called under siglock. Currently has no users */
158void sched_autogroup_detach(struct task_struct *p)
159{
160 autogroup_move_group(p, &autogroup_default);
161}
162EXPORT_SYMBOL(sched_autogroup_detach);
163
164void sched_autogroup_fork(struct signal_struct *sig)
165{
166 sig->autogroup = autogroup_task_get(current);
167}
168
169void sched_autogroup_exit(struct signal_struct *sig)
170{
171 autogroup_kref_put(sig->autogroup);
172}
173
174static int __init setup_autogroup(char *str)
175{
176 sysctl_sched_autogroup_enabled = 0;
177
178 return 1;
179}
180
181__setup("noautogroup", setup_autogroup);
182
183#ifdef CONFIG_PROC_FS
184
185int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
186{
187 static unsigned long next = INITIAL_JIFFIES;
188 struct autogroup *ag;
189 int err;
190
191 if (*nice < -20 || *nice > 19)
192 return -EINVAL;
193
194 err = security_task_setnice(current, *nice);
195 if (err)
196 return err;
197
198 if (*nice < 0 && !can_nice(current, *nice))
199 return -EPERM;
200
201 /* this is a heavy operation taking global locks.. */
202 if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
203 return -EAGAIN;
204
205 next = HZ / 10 + jiffies;
206 ag = autogroup_task_get(p);
207
208 down_write(&ag->lock);
209 err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]);
210 if (!err)
211 ag->nice = *nice;
212 up_write(&ag->lock);
213
214 autogroup_kref_put(ag);
215
216 return err;
217}
218
219void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
220{
221 struct autogroup *ag = autogroup_task_get(p);
222
223 down_read(&ag->lock);
224 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
225 up_read(&ag->lock);
226
227 autogroup_kref_put(ag);
228}
229#endif /* CONFIG_PROC_FS */
230
231#ifdef CONFIG_SCHED_DEBUG
232static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
233{
234 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
235}
236#endif /* CONFIG_SCHED_DEBUG */
237
238#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
new file mode 100644
index 000000000000..5358e241cb20
--- /dev/null
+++ b/kernel/sched_autogroup.h
@@ -0,0 +1,32 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
/* Descriptor tying a task_group to the signal structs that reference it. */
3struct autogroup {
4 struct kref kref; /* reference count; autogroup_destroy() is its release callback */
5 struct task_group *tg; /* task group backing this autogroup */
6 struct rw_semaphore lock; /* write-held while nice is changed, read-held while shown */
7 unsigned long id; /* taken from autogroup_seq_nr in autogroup_create() */
8 int nice; /* last value stored by proc_sched_autogroup_set_nice() */
9};
10
11static inline struct task_group *
12autogroup_task_group(struct task_struct *p, struct task_group *tg);
13
14#else /* !CONFIG_SCHED_AUTOGROUP */
15
/* !CONFIG_SCHED_AUTOGROUP: initialisation and freeing are no-ops. */
16static inline void autogroup_init(struct task_struct *init_task) { }
17static inline void autogroup_free(struct task_group *tg) { }
18
/* !CONFIG_SCHED_AUTOGROUP: a task always stays in the group it was given. */
19static inline struct task_group *
20autogroup_task_group(struct task_struct *p, struct task_group *tg)
21{
22 return tg;
23}
24
25#ifdef CONFIG_SCHED_DEBUG
/* !CONFIG_SCHED_AUTOGROUP: leaves @buf untouched and returns 0. */
26static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
27{
28 return 0;
29}
30#endif
31
32#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 52f1a149bfb1..9d8af0b3fb64 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -79,7 +79,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
79} 79}
80EXPORT_SYMBOL_GPL(sched_clock); 80EXPORT_SYMBOL_GPL(sched_clock);
81 81
82static __read_mostly int sched_clock_running; 82__read_mostly int sched_clock_running;
83 83
84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
85__read_mostly int sched_clock_stable; 85__read_mostly int sched_clock_stable;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 2e1b0d17dd9b..1dfae3d014b5 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -54,8 +54,7 @@ static unsigned long nsec_low(unsigned long long nsec)
54#define SPLIT_NS(x) nsec_high(x), nsec_low(x) 54#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
55 55
56#ifdef CONFIG_FAIR_GROUP_SCHED 56#ifdef CONFIG_FAIR_GROUP_SCHED
57static void print_cfs_group_stats(struct seq_file *m, int cpu, 57static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
58 struct task_group *tg)
59{ 58{
60 struct sched_entity *se = tg->se[cpu]; 59 struct sched_entity *se = tg->se[cpu];
61 if (!se) 60 if (!se)
@@ -110,16 +109,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 109 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
111#endif 110#endif
112 111
113#ifdef CONFIG_CGROUP_SCHED
114 {
115 char path[64];
116
117 rcu_read_lock();
118 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
119 rcu_read_unlock();
120 SEQ_printf(m, " %s", path);
121 }
122#endif
123 SEQ_printf(m, "\n"); 112 SEQ_printf(m, "\n");
124} 113}
125 114
@@ -147,19 +136,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
147 read_unlock_irqrestore(&tasklist_lock, flags); 136 read_unlock_irqrestore(&tasklist_lock, flags);
148} 137}
149 138
150#if defined(CONFIG_CGROUP_SCHED) && \
151 (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
152static void task_group_path(struct task_group *tg, char *buf, int buflen)
153{
154 /* may be NULL if the underlying cgroup isn't fully-created yet */
155 if (!tg->css.cgroup) {
156 buf[0] = '\0';
157 return;
158 }
159 cgroup_path(tg->css.cgroup, buf, buflen);
160}
161#endif
162
163void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 139void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
164{ 140{
165 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, 141 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
@@ -168,16 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
168 struct sched_entity *last; 144 struct sched_entity *last;
169 unsigned long flags; 145 unsigned long flags;
170 146
171#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
172 char path[128];
173 struct task_group *tg = cfs_rq->tg;
174
175 task_group_path(tg, path, sizeof(path));
176
177 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
178#else
179 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 147 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
180#endif
181 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 148 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
182 SPLIT_NS(cfs_rq->exec_clock)); 149 SPLIT_NS(cfs_rq->exec_clock));
183 150
@@ -202,32 +169,29 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
202 spread0 = min_vruntime - rq0_min_vruntime; 169 spread0 = min_vruntime - rq0_min_vruntime;
203 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", 170 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
204 SPLIT_NS(spread0)); 171 SPLIT_NS(spread0));
205 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207
208 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", 172 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
209 cfs_rq->nr_spread_over); 173 cfs_rq->nr_spread_over);
174 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
175 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
210#ifdef CONFIG_FAIR_GROUP_SCHED 176#ifdef CONFIG_FAIR_GROUP_SCHED
211#ifdef CONFIG_SMP 177#ifdef CONFIG_SMP
212 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); 178 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
179 SPLIT_NS(cfs_rq->load_avg));
180 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
181 SPLIT_NS(cfs_rq->load_period));
182 SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
183 cfs_rq->load_contribution);
184 SEQ_printf(m, " .%-30s: %d\n", "load_tg",
185 atomic_read(&cfs_rq->tg->load_weight));
213#endif 186#endif
187
214 print_cfs_group_stats(m, cpu, cfs_rq->tg); 188 print_cfs_group_stats(m, cpu, cfs_rq->tg);
215#endif 189#endif
216} 190}
217 191
218void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) 192void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
219{ 193{
220#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
221 char path[128];
222 struct task_group *tg = rt_rq->tg;
223
224 task_group_path(tg, path, sizeof(path));
225
226 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
227#else
228 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); 194 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
229#endif
230
231 195
232#define P(x) \ 196#define P(x) \
233 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) 197 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
@@ -243,6 +207,8 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
243#undef P 207#undef P
244} 208}
245 209
210extern __read_mostly int sched_clock_running;
211
246static void print_cpu(struct seq_file *m, int cpu) 212static void print_cpu(struct seq_file *m, int cpu)
247{ 213{
248 struct rq *rq = cpu_rq(cpu); 214 struct rq *rq = cpu_rq(cpu);
@@ -314,21 +280,42 @@ static const char *sched_tunable_scaling_names[] = {
314 280
315static int sched_debug_show(struct seq_file *m, void *v) 281static int sched_debug_show(struct seq_file *m, void *v)
316{ 282{
317 u64 now = ktime_to_ns(ktime_get()); 283 u64 ktime, sched_clk, cpu_clk;
284 unsigned long flags;
318 int cpu; 285 int cpu;
319 286
320 SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", 287 local_irq_save(flags);
288 ktime = ktime_to_ns(ktime_get());
289 sched_clk = sched_clock();
290 cpu_clk = local_clock();
291 local_irq_restore(flags);
292
293 SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
321 init_utsname()->release, 294 init_utsname()->release,
322 (int)strcspn(init_utsname()->version, " "), 295 (int)strcspn(init_utsname()->version, " "),
323 init_utsname()->version); 296 init_utsname()->version);
324 297
325 SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); 298#define P(x) \
299 SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
300#define PN(x) \
301 SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
302 PN(ktime);
303 PN(sched_clk);
304 PN(cpu_clk);
305 P(jiffies);
306#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
307 P(sched_clock_stable);
308#endif
309#undef PN
310#undef P
311
312 SEQ_printf(m, "\n");
313 SEQ_printf(m, "sysctl_sched\n");
326 314
327#define P(x) \ 315#define P(x) \
328 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) 316 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
329#define PN(x) \ 317#define PN(x) \
330 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) 318 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
331 P(jiffies);
332 PN(sysctl_sched_latency); 319 PN(sysctl_sched_latency);
333 PN(sysctl_sched_min_granularity); 320 PN(sysctl_sched_min_granularity);
334 PN(sysctl_sched_wakeup_granularity); 321 PN(sysctl_sched_wakeup_granularity);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 00ebd7686676..c62ebae65cf0 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
89 89
90const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 90const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
91 91
92/*
93 * The exponential sliding window over which load is averaged for shares
94 * distribution.
95 * (default: 10msec)
96 */
97unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
98
92static const struct sched_class fair_sched_class; 99static const struct sched_class fair_sched_class;
93 100
94/************************************************************** 101/**************************************************************
@@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
143 return cfs_rq->tg->cfs_rq[this_cpu]; 150 return cfs_rq->tg->cfs_rq[this_cpu];
144} 151}
145 152
153static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
154{
155 if (!cfs_rq->on_list) {
156 /*
157 * Ensure we either appear before our parent (if already
158 * enqueued) or force our parent to appear after us when it is
159 * enqueued. The fact that we always enqueue bottom-up
160 * reduces this to two cases.
161 */
162 if (cfs_rq->tg->parent &&
163 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
164 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
165 &rq_of(cfs_rq)->leaf_cfs_rq_list);
166 } else {
167 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
168 &rq_of(cfs_rq)->leaf_cfs_rq_list);
169 }
170
171 cfs_rq->on_list = 1;
172 }
173}
174
175static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
176{
177 if (cfs_rq->on_list) {
178 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
179 cfs_rq->on_list = 0;
180 }
181}
182
146/* Iterate thr' all leaf cfs_rq's on a runqueue */ 183/* Iterate thr' all leaf cfs_rq's on a runqueue */
147#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 184#define for_each_leaf_cfs_rq(rq, cfs_rq) \
148 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 185 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
@@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
246 return &cpu_rq(this_cpu)->cfs; 283 return &cpu_rq(this_cpu)->cfs;
247} 284}
248 285
/* Empty variant of list_add_leaf_cfs_rq() for this configuration branch. */
286static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
287{
288}
289
/* Empty variant of list_del_leaf_cfs_rq() for this configuration branch. */
290static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
291{
292}
293
249#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 294#define for_each_leaf_cfs_rq(rq, cfs_rq) \
250 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 295 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
251 296
@@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
417 WRT_SYSCTL(sched_min_granularity); 462 WRT_SYSCTL(sched_min_granularity);
418 WRT_SYSCTL(sched_latency); 463 WRT_SYSCTL(sched_latency);
419 WRT_SYSCTL(sched_wakeup_granularity); 464 WRT_SYSCTL(sched_wakeup_granularity);
420 WRT_SYSCTL(sched_shares_ratelimit);
421#undef WRT_SYSCTL 465#undef WRT_SYSCTL
422 466
423 return 0; 467 return 0;
@@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
495 return calc_delta_fair(sched_slice(cfs_rq, se), se); 539 return calc_delta_fair(sched_slice(cfs_rq, se), se);
496} 540}
497 541
542static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
543static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
544
498/* 545/*
499 * Update the current task's runtime statistics. Skip current tasks that 546 * Update the current task's runtime statistics. Skip current tasks that
500 * are not in our scheduling class. 547 * are not in our scheduling class.
@@ -514,6 +561,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
514 561
515 curr->vruntime += delta_exec_weighted; 562 curr->vruntime += delta_exec_weighted;
516 update_min_vruntime(cfs_rq); 563 update_min_vruntime(cfs_rq);
564
565#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
566 cfs_rq->load_unacc_exec_time += delta_exec;
567#endif
517} 568}
518 569
519static void update_curr(struct cfs_rq *cfs_rq) 570static void update_curr(struct cfs_rq *cfs_rq)
@@ -633,7 +684,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
633 list_add(&se->group_node, &cfs_rq->tasks); 684 list_add(&se->group_node, &cfs_rq->tasks);
634 } 685 }
635 cfs_rq->nr_running++; 686 cfs_rq->nr_running++;
636 se->on_rq = 1;
637} 687}
638 688
639static void 689static void
@@ -647,9 +697,140 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
647 list_del_init(&se->group_node); 697 list_del_init(&se->group_node);
648 } 698 }
649 cfs_rq->nr_running--; 699 cfs_rq->nr_running--;
650 se->on_rq = 0;
651} 700}
652 701
702#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
703static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
704 int global_update)
705{
706 struct task_group *tg = cfs_rq->tg;
707 long load_avg;
708
709 load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
710 load_avg -= cfs_rq->load_contribution;
711
712 if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
713 atomic_add(load_avg, &tg->load_weight);
714 cfs_rq->load_contribution += load_avg;
715 }
716}
717
718static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
719{
720 u64 period = sysctl_sched_shares_window;
721 u64 now, delta;
722 unsigned long load = cfs_rq->load.weight;
723
724 if (!cfs_rq)
725 return;
726
727 now = rq_of(cfs_rq)->clock;
728 delta = now - cfs_rq->load_stamp;
729
730 /* truncate load history at 4 idle periods */
731 if (cfs_rq->load_stamp > cfs_rq->load_last &&
732 now - cfs_rq->load_last > 4 * period) {
733 cfs_rq->load_period = 0;
734 cfs_rq->load_avg = 0;
735 }
736
737 cfs_rq->load_stamp = now;
738 cfs_rq->load_unacc_exec_time = 0;
739 cfs_rq->load_period += delta;
740 if (load) {
741 cfs_rq->load_last = now;
742 cfs_rq->load_avg += delta * load;
743 }
744
745 /* consider updating load contribution on each fold or truncate */
746 if (global_update || cfs_rq->load_period > period
747 || !cfs_rq->load_period)
748 update_cfs_rq_load_contribution(cfs_rq, global_update);
749
750 while (cfs_rq->load_period > period) {
751 /*
752 * Inline assembly required to prevent the compiler
753 * optimising this loop into a divmod call.
754 * See __iter_div_u64_rem() for another example of this.
755 */
756 asm("" : "+rm" (cfs_rq->load_period));
757 cfs_rq->load_period /= 2;
758 cfs_rq->load_avg /= 2;
759 }
760
761 if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
762 list_del_leaf_cfs_rq(cfs_rq);
763}
764
765static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
766 unsigned long weight)
767{
768 if (se->on_rq) {
769 /* commit outstanding execution time */
770 if (cfs_rq->curr == se)
771 update_curr(cfs_rq);
772 account_entity_dequeue(cfs_rq, se);
773 }
774
775 update_load_set(&se->load, weight);
776
777 if (se->on_rq)
778 account_entity_enqueue(cfs_rq, se);
779}
780
781static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
782{
783 struct task_group *tg;
784 struct sched_entity *se;
785 long load_weight, load, shares;
786
787 if (!cfs_rq)
788 return;
789
790 tg = cfs_rq->tg;
791 se = tg->se[cpu_of(rq_of(cfs_rq))];
792 if (!se)
793 return;
794
795 load = cfs_rq->load.weight + weight_delta;
796
797 load_weight = atomic_read(&tg->load_weight);
798 load_weight -= cfs_rq->load_contribution;
799 load_weight += load;
800
801 shares = (tg->shares * load);
802 if (load_weight)
803 shares /= load_weight;
804
805 if (shares < MIN_SHARES)
806 shares = MIN_SHARES;
807 if (shares > tg->shares)
808 shares = tg->shares;
809
810 reweight_entity(cfs_rq_of(se), se, shares);
811}
812
813static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
814{
815 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
816 update_cfs_load(cfs_rq, 0);
817 update_cfs_shares(cfs_rq, 0);
818 }
819}
820#else /* CONFIG_FAIR_GROUP_SCHED */
/* Empty variant of update_cfs_load() for this configuration branch. */
821static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
822{
823}
824
/* Empty variant of update_cfs_shares() for this configuration branch. */
825static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
826{
827}
828
/* Empty variant of update_entity_shares_tick() for this configuration branch. */
829static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
830{
831}
832#endif /* CONFIG_FAIR_GROUP_SCHED */
833
653static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 834static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
654{ 835{
655#ifdef CONFIG_SCHEDSTATS 836#ifdef CONFIG_SCHEDSTATS
@@ -771,6 +952,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
771 * Update run-time statistics of the 'current'. 952 * Update run-time statistics of the 'current'.
772 */ 953 */
773 update_curr(cfs_rq); 954 update_curr(cfs_rq);
955 update_cfs_load(cfs_rq, 0);
956 update_cfs_shares(cfs_rq, se->load.weight);
774 account_entity_enqueue(cfs_rq, se); 957 account_entity_enqueue(cfs_rq, se);
775 958
776 if (flags & ENQUEUE_WAKEUP) { 959 if (flags & ENQUEUE_WAKEUP) {
@@ -782,6 +965,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
782 check_spread(cfs_rq, se); 965 check_spread(cfs_rq, se);
783 if (se != cfs_rq->curr) 966 if (se != cfs_rq->curr)
784 __enqueue_entity(cfs_rq, se); 967 __enqueue_entity(cfs_rq, se);
968 se->on_rq = 1;
969
970 if (cfs_rq->nr_running == 1)
971 list_add_leaf_cfs_rq(cfs_rq);
785} 972}
786 973
787static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 974static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +1012,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
825 1012
826 if (se != cfs_rq->curr) 1013 if (se != cfs_rq->curr)
827 __dequeue_entity(cfs_rq, se); 1014 __dequeue_entity(cfs_rq, se);
1015 se->on_rq = 0;
1016 update_cfs_load(cfs_rq, 0);
828 account_entity_dequeue(cfs_rq, se); 1017 account_entity_dequeue(cfs_rq, se);
829 update_min_vruntime(cfs_rq); 1018 update_min_vruntime(cfs_rq);
1019 update_cfs_shares(cfs_rq, 0);
830 1020
831 /* 1021 /*
832 * Normalize the entity after updating the min_vruntime because the 1022 * Normalize the entity after updating the min_vruntime because the
@@ -955,6 +1145,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
955 */ 1145 */
956 update_curr(cfs_rq); 1146 update_curr(cfs_rq);
957 1147
1148 /*
1149 * Update share accounting for long-running entities.
1150 */
1151 update_entity_shares_tick(cfs_rq);
1152
958#ifdef CONFIG_SCHED_HRTICK 1153#ifdef CONFIG_SCHED_HRTICK
959 /* 1154 /*
960 * queued ticks are scheduled to match the slice, so don't bother 1155 * queued ticks are scheduled to match the slice, so don't bother
@@ -1055,6 +1250,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1055 flags = ENQUEUE_WAKEUP; 1250 flags = ENQUEUE_WAKEUP;
1056 } 1251 }
1057 1252
1253 for_each_sched_entity(se) {
1254 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1255
1256 update_cfs_load(cfs_rq, 0);
1257 update_cfs_shares(cfs_rq, 0);
1258 }
1259
1058 hrtick_update(rq); 1260 hrtick_update(rq);
1059} 1261}
1060 1262
@@ -1071,12 +1273,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1071 for_each_sched_entity(se) { 1273 for_each_sched_entity(se) {
1072 cfs_rq = cfs_rq_of(se); 1274 cfs_rq = cfs_rq_of(se);
1073 dequeue_entity(cfs_rq, se, flags); 1275 dequeue_entity(cfs_rq, se, flags);
1276
1074 /* Don't dequeue parent if it has other entities besides us */ 1277 /* Don't dequeue parent if it has other entities besides us */
1075 if (cfs_rq->load.weight) 1278 if (cfs_rq->load.weight)
1076 break; 1279 break;
1077 flags |= DEQUEUE_SLEEP; 1280 flags |= DEQUEUE_SLEEP;
1078 } 1281 }
1079 1282
1283 for_each_sched_entity(se) {
1284 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1285
1286 update_cfs_load(cfs_rq, 0);
1287 update_cfs_shares(cfs_rq, 0);
1288 }
1289
1080 hrtick_update(rq); 1290 hrtick_update(rq);
1081} 1291}
1082 1292
@@ -1143,51 +1353,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
1143 * Adding load to a group doesn't make a group heavier, but can cause movement 1353 * Adding load to a group doesn't make a group heavier, but can cause movement
1144 * of group shares between cpus. Assuming the shares were perfectly aligned one 1354 * of group shares between cpus. Assuming the shares were perfectly aligned one
1145 * can calculate the shift in shares. 1355 * can calculate the shift in shares.
1146 *
1147 * The problem is that perfectly aligning the shares is rather expensive, hence
1148 * we try to avoid doing that too often - see update_shares(), which ratelimits
1149 * this change.
1150 *
1151 * We compensate this by not only taking the current delta into account, but
1152 * also considering the delta between when the shares were last adjusted and
1153 * now.
1154 *
1155 * We still saw a performance dip, some tracing learned us that between
1156 * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
1157 * significantly. Therefore try to bias the error in direction of failing
1158 * the affine wakeup.
1159 *
1160 */ 1356 */
1161static long effective_load(struct task_group *tg, int cpu, 1357static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1162 long wl, long wg)
1163{ 1358{
1164 struct sched_entity *se = tg->se[cpu]; 1359 struct sched_entity *se = tg->se[cpu];
1165 1360
1166 if (!tg->parent) 1361 if (!tg->parent)
1167 return wl; 1362 return wl;
1168 1363
1169 /*
1170 * By not taking the decrease of shares on the other cpu into
1171 * account our error leans towards reducing the affine wakeups.
1172 */
1173 if (!wl && sched_feat(ASYM_EFF_LOAD))
1174 return wl;
1175
1176 for_each_sched_entity(se) { 1364 for_each_sched_entity(se) {
1177 long S, rw, s, a, b; 1365 long S, rw, s, a, b;
1178 long more_w;
1179
1180 /*
1181 * Instead of using this increment, also add the difference
1182 * between when the shares were last updated and now.
1183 */
1184 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1185 wl += more_w;
1186 wg += more_w;
1187 1366
1188 S = se->my_q->tg->shares; 1367 S = se->my_q->tg->shares;
1189 s = se->my_q->shares; 1368 s = se->load.weight;
1190 rw = se->my_q->rq_weight; 1369 rw = se->my_q->load.weight;
1191 1370
1192 a = S*(rw + wl); 1371 a = S*(rw + wl);
1193 b = S*rw + s*wg; 1372 b = S*rw + s*wg;
@@ -1508,23 +1687,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1508 sd = tmp; 1687 sd = tmp;
1509 } 1688 }
1510 1689
1511#ifdef CONFIG_FAIR_GROUP_SCHED
1512 if (sched_feat(LB_SHARES_UPDATE)) {
1513 /*
1514 * Pick the largest domain to update shares over
1515 */
1516 tmp = sd;
1517 if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
1518 tmp = affine_sd;
1519
1520 if (tmp) {
1521 raw_spin_unlock(&rq->lock);
1522 update_shares(tmp);
1523 raw_spin_lock(&rq->lock);
1524 }
1525 }
1526#endif
1527
1528 if (affine_sd) { 1690 if (affine_sd) {
1529 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) 1691 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1530 return select_idle_sibling(p, cpu); 1692 return select_idle_sibling(p, cpu);
@@ -1909,6 +2071,48 @@ out:
1909} 2071}
1910 2072
1911#ifdef CONFIG_FAIR_GROUP_SCHED 2073#ifdef CONFIG_FAIR_GROUP_SCHED
2074/*
2075 * update tg->load_weight by folding this cpu's load_avg
2076 */
2077static int update_shares_cpu(struct task_group *tg, int cpu)
2078{
2079 struct cfs_rq *cfs_rq;
2080 unsigned long flags;
2081 struct rq *rq;
2082
2083 if (!tg->se[cpu])
2084 return 0;
2085
2086 rq = cpu_rq(cpu);
2087 cfs_rq = tg->cfs_rq[cpu];
2088
2089 raw_spin_lock_irqsave(&rq->lock, flags);
2090
2091 update_rq_clock(rq);
2092 update_cfs_load(cfs_rq, 1);
2093
2094 /*
2095 * We need to update shares after updating tg->load_weight in
2096 * order to adjust the weight of groups with long running tasks.
2097 */
2098 update_cfs_shares(cfs_rq, 0);
2099
2100 raw_spin_unlock_irqrestore(&rq->lock, flags);
2101
2102 return 0;
2103}
2104
2105static void update_shares(int cpu)
2106{
2107 struct cfs_rq *cfs_rq;
2108 struct rq *rq = cpu_rq(cpu);
2109
2110 rcu_read_lock();
2111 for_each_leaf_cfs_rq(rq, cfs_rq)
2112 update_shares_cpu(cfs_rq->tg, cpu);
2113 rcu_read_unlock();
2114}
2115
1912static unsigned long 2116static unsigned long
1913load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2117load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1914 unsigned long max_load_move, 2118 unsigned long max_load_move,
@@ -1956,6 +2160,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1956 return max_load_move - rem_load_move; 2160 return max_load_move - rem_load_move;
1957} 2161}
1958#else 2162#else
/* !CONFIG_FAIR_GROUP_SCHED: there are no group shares to update. */
2163static inline void update_shares(int cpu)
2164{
2165}
2166
1959static unsigned long 2167static unsigned long
1960load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2168load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1961 unsigned long max_load_move, 2169 unsigned long max_load_move,
@@ -3032,7 +3240,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3032 schedstat_inc(sd, lb_count[idle]); 3240 schedstat_inc(sd, lb_count[idle]);
3033 3241
3034redo: 3242redo:
3035 update_shares(sd);
3036 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3243 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3037 cpus, balance); 3244 cpus, balance);
3038 3245
@@ -3174,8 +3381,6 @@ out_one_pinned:
3174 else 3381 else
3175 ld_moved = 0; 3382 ld_moved = 0;
3176out: 3383out:
3177 if (ld_moved)
3178 update_shares(sd);
3179 return ld_moved; 3384 return ld_moved;
3180} 3385}
3181 3386
@@ -3199,6 +3404,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3199 */ 3404 */
3200 raw_spin_unlock(&this_rq->lock); 3405 raw_spin_unlock(&this_rq->lock);
3201 3406
3407 update_shares(this_cpu);
3202 for_each_domain(this_cpu, sd) { 3408 for_each_domain(this_cpu, sd) {
3203 unsigned long interval; 3409 unsigned long interval;
3204 int balance = 1; 3410 int balance = 1;
@@ -3569,6 +3775,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3569 int update_next_balance = 0; 3775 int update_next_balance = 0;
3570 int need_serialize; 3776 int need_serialize;
3571 3777
3778 update_shares(cpu);
3779
3572 for_each_domain(cpu, sd) { 3780 for_each_domain(cpu, sd) {
3573 if (!(sd->flags & SD_LOAD_BALANCE)) 3781 if (!(sd->flags & SD_LOAD_BALANCE))
3574 continue; 3782 continue;
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 185f920ec1a2..68e69acc29b9 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0)
52SCHED_FEAT(HRTICK, 0) 52SCHED_FEAT(HRTICK, 0)
53SCHED_FEAT(DOUBLE_TICK, 0) 53SCHED_FEAT(DOUBLE_TICK, 0)
54SCHED_FEAT(LB_BIAS, 1) 54SCHED_FEAT(LB_BIAS, 1)
55SCHED_FEAT(LB_SHARES_UPDATE, 1)
56SCHED_FEAT(ASYM_EFF_LOAD, 1)
57 55
58/* 56/*
59 * Spin-wait on mutex acquisition when the mutex owner is running on 57 * Spin-wait on mutex acquisition when the mutex owner is running on
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index bea7d79f7e9c..c914ec747ca6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -183,6 +183,17 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); 183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
184} 184}
185 185
186static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
187{
188 list_add_rcu(&rt_rq->leaf_rt_rq_list,
189 &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
190}
191
192static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
193{
194 list_del_rcu(&rt_rq->leaf_rt_rq_list);
195}
196
186#define for_each_leaf_rt_rq(rt_rq, rq) \ 197#define for_each_leaf_rt_rq(rt_rq, rq) \
187 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) 198 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
188 199
@@ -276,6 +287,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
276 return ktime_to_ns(def_rt_bandwidth.rt_period); 287 return ktime_to_ns(def_rt_bandwidth.rt_period);
277} 288}
278 289
290static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
291{
292}
293
294static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
295{
296}
297
279#define for_each_leaf_rt_rq(rt_rq, rq) \ 298#define for_each_leaf_rt_rq(rt_rq, rq) \
280 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) 299 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
281 300
@@ -825,6 +844,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
825 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 844 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
826 return; 845 return;
827 846
847 if (!rt_rq->rt_nr_running)
848 list_add_leaf_rt_rq(rt_rq);
849
828 if (head) 850 if (head)
829 list_add(&rt_se->run_list, queue); 851 list_add(&rt_se->run_list, queue);
830 else 852 else
@@ -844,6 +866,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
844 __clear_bit(rt_se_prio(rt_se), array->bitmap); 866 __clear_bit(rt_se_prio(rt_se), array->bitmap);
845 867
846 dec_rt_tasks(rt_se, rt_rq); 868 dec_rt_tasks(rt_se, rt_rq);
869 if (!rt_rq->rt_nr_running)
870 list_del_leaf_rt_rq(rt_rq);
847} 871}
848 872
849/* 873/*
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 18f4be0d5fe0..d4d918a91881 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -853,7 +853,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
853 cpumask_any(cpu_online_mask)); 853 cpumask_any(cpu_online_mask));
854 case CPU_DEAD: 854 case CPU_DEAD:
855 case CPU_DEAD_FROZEN: { 855 case CPU_DEAD_FROZEN: {
856 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 856 static struct sched_param param = {
857 .sched_priority = MAX_RT_PRIO-1
858 };
857 859
858 p = per_cpu(ksoftirqd, hotcpu); 860 p = per_cpu(ksoftirqd, hotcpu);
859 per_cpu(ksoftirqd, hotcpu) = NULL; 861 per_cpu(ksoftirqd, hotcpu) = NULL;
diff --git a/kernel/sys.c b/kernel/sys.c
index 7f5a0cd296a9..2745dcdb6c6c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1080,8 +1080,10 @@ SYSCALL_DEFINE0(setsid)
1080 err = session; 1080 err = session;
1081out: 1081out:
1082 write_unlock_irq(&tasklist_lock); 1082 write_unlock_irq(&tasklist_lock);
1083 if (err > 0) 1083 if (err > 0) {
1084 proc_sid_connector(group_leader); 1084 proc_sid_connector(group_leader);
1085 sched_autogroup_create_attach(group_leader);
1086 }
1085 return err; 1087 return err;
1086} 1088}
1087 1089
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 46404414d8a7..ae5cbb1e3ced 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -259,8 +259,6 @@ static int min_wakeup_granularity_ns; /* 0 usecs */
259static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 259static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
260static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; 260static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
261static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; 261static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
262static int min_sched_shares_ratelimit = 100000; /* 100 usec */
263static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
264#endif 262#endif
265 263
266#ifdef CONFIG_COMPACTION 264#ifdef CONFIG_COMPACTION
@@ -305,15 +303,6 @@ static struct ctl_table kern_table[] = {
305 .extra2 = &max_wakeup_granularity_ns, 303 .extra2 = &max_wakeup_granularity_ns,
306 }, 304 },
307 { 305 {
308 .procname = "sched_shares_ratelimit",
309 .data = &sysctl_sched_shares_ratelimit,
310 .maxlen = sizeof(unsigned int),
311 .mode = 0644,
312 .proc_handler = sched_proc_update_handler,
313 .extra1 = &min_sched_shares_ratelimit,
314 .extra2 = &max_sched_shares_ratelimit,
315 },
316 {
317 .procname = "sched_tunable_scaling", 306 .procname = "sched_tunable_scaling",
318 .data = &sysctl_sched_tunable_scaling, 307 .data = &sysctl_sched_tunable_scaling,
319 .maxlen = sizeof(enum sched_tunable_scaling), 308 .maxlen = sizeof(enum sched_tunable_scaling),
@@ -323,14 +312,6 @@ static struct ctl_table kern_table[] = {
323 .extra2 = &max_sched_tunable_scaling, 312 .extra2 = &max_sched_tunable_scaling,
324 }, 313 },
325 { 314 {
326 .procname = "sched_shares_thresh",
327 .data = &sysctl_sched_shares_thresh,
328 .maxlen = sizeof(unsigned int),
329 .mode = 0644,
330 .proc_handler = proc_dointvec_minmax,
331 .extra1 = &zero,
332 },
333 {
334 .procname = "sched_migration_cost", 315 .procname = "sched_migration_cost",
335 .data = &sysctl_sched_migration_cost, 316 .data = &sysctl_sched_migration_cost,
336 .maxlen = sizeof(unsigned int), 317 .maxlen = sizeof(unsigned int),
@@ -352,6 +333,13 @@ static struct ctl_table kern_table[] = {
352 .proc_handler = proc_dointvec, 333 .proc_handler = proc_dointvec,
353 }, 334 },
354 { 335 {
336 .procname = "sched_shares_window",
337 .data = &sysctl_sched_shares_window,
338 .maxlen = sizeof(unsigned int),
339 .mode = 0644,
340 .proc_handler = proc_dointvec,
341 },
342 {
355 .procname = "timer_migration", 343 .procname = "timer_migration",
356 .data = &sysctl_timer_migration, 344 .data = &sysctl_timer_migration,
357 .maxlen = sizeof(unsigned int), 345 .maxlen = sizeof(unsigned int),
@@ -382,6 +370,17 @@ static struct ctl_table kern_table[] = {
382 .mode = 0644, 370 .mode = 0644,
383 .proc_handler = proc_dointvec, 371 .proc_handler = proc_dointvec,
384 }, 372 },
373#ifdef CONFIG_SCHED_AUTOGROUP
374 {
375 .procname = "sched_autogroup_enabled",
376 .data = &sysctl_sched_autogroup_enabled,
377 .maxlen = sizeof(unsigned int),
378 .mode = 0644,
379 .proc_handler = proc_dointvec,
380 .extra1 = &zero,
381 .extra2 = &one,
382 },
383#endif
385#ifdef CONFIG_PROVE_LOCKING 384#ifdef CONFIG_PROVE_LOCKING
386 { 385 {
387 .procname = "prove_locking", 386 .procname = "prove_locking",
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 155a415b3209..562c56e048fd 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
558static int trace_wakeup_test_thread(void *data) 558static int trace_wakeup_test_thread(void *data)
559{ 559{
560 /* Make this a RT thread, doesn't need to be too high */ 560 /* Make this a RT thread, doesn't need to be too high */
561 struct sched_param param = { .sched_priority = 5 }; 561 static struct sched_param param = { .sched_priority = 5 };
562 struct completion *x = data; 562 struct completion *x = data;
563 563
564 sched_setscheduler(current, SCHED_FIFO, &param); 564 sched_setscheduler(current, SCHED_FIFO, &param);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index aaa8dae08236..6e7b575ac33c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -309,7 +309,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
309 */ 309 */
310static int watchdog(void *unused) 310static int watchdog(void *unused)
311{ 311{
312 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 312 static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
313 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 313 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
314 314
315 sched_setscheduler(current, SCHED_FIFO, &param); 315 sched_setscheduler(current, SCHED_FIFO, &param);