about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- Documentation/kernel-parameters.txt 2
-rw-r--r-- arch/Kconfig 3
-rw-r--r-- arch/s390/Kconfig 1
-rw-r--r-- arch/s390/include/asm/mutex.h 2
-rw-r--r-- fs/proc/base.c 79
-rw-r--r-- include/linux/init_task.h 9
-rw-r--r-- include/linux/mutex.h 4
-rw-r--r-- include/linux/sched.h 36
-rw-r--r-- init/Kconfig 13
-rw-r--r-- kernel/cpu.c 18
-rw-r--r-- kernel/fork.c 7
-rw-r--r-- kernel/irq/manage.c 4
-rw-r--r-- kernel/kthread.c 2
-rw-r--r-- kernel/mutex.c 2
-rw-r--r-- kernel/printk.c 8
-rw-r--r-- kernel/sched.c 565
-rw-r--r-- kernel/sched_autogroup.c 238
-rw-r--r-- kernel/sched_autogroup.h 32
-rw-r--r-- kernel/sched_clock.c 2
-rw-r--r-- kernel/sched_debug.c 91
-rw-r--r-- kernel/sched_fair.c 322
-rw-r--r-- kernel/sched_features.h 2
-rw-r--r-- kernel/sched_rt.c 24
-rw-r--r-- kernel/softirq.c 4
-rw-r--r-- kernel/sys.c 4
-rw-r--r-- kernel/sysctl.c 37
-rw-r--r-- kernel/trace/trace_selftest.c 2
-rw-r--r-- kernel/watchdog.c 2
28 files changed, 924 insertions, 591 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 01ece1b9213e..1031923f5254 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1622,6 +1622,8 @@ and is between 256 and 4096 characters. It is defined in the file
1622 noapic [SMP,APIC] Tells the kernel to not make use of any 1622 noapic [SMP,APIC] Tells the kernel to not make use of any
1623 IOAPICs that may be present in the system. 1623 IOAPICs that may be present in the system.
1624 1624
1625 noautogroup Disable scheduler automatic task group creation.
1626
1625 nobats [PPC] Do not use BATs for mapping kernel lowmem 1627 nobats [PPC] Do not use BATs for mapping kernel lowmem
1626 on "Classic" PPC cores. 1628 on "Classic" PPC cores.
1627 1629
diff --git a/arch/Kconfig b/arch/Kconfig
index 8bf0fa652eb6..f78c2be4242b 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -175,4 +175,7 @@ config HAVE_PERF_EVENTS_NMI
175config HAVE_ARCH_JUMP_LABEL 175config HAVE_ARCH_JUMP_LABEL
176 bool 176 bool
177 177
178config HAVE_ARCH_MUTEX_CPU_RELAX
179 bool
180
178source "kernel/gcov/Kconfig" 181source "kernel/gcov/Kconfig"
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index e0b98e71ff47..6c6d7b339aae 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -99,6 +99,7 @@ config S390
99 select HAVE_KERNEL_LZMA 99 select HAVE_KERNEL_LZMA
100 select HAVE_KERNEL_LZO 100 select HAVE_KERNEL_LZO
101 select HAVE_GET_USER_PAGES_FAST 101 select HAVE_GET_USER_PAGES_FAST
102 select HAVE_ARCH_MUTEX_CPU_RELAX
102 select ARCH_INLINE_SPIN_TRYLOCK 103 select ARCH_INLINE_SPIN_TRYLOCK
103 select ARCH_INLINE_SPIN_TRYLOCK_BH 104 select ARCH_INLINE_SPIN_TRYLOCK_BH
104 select ARCH_INLINE_SPIN_LOCK 105 select ARCH_INLINE_SPIN_LOCK
diff --git a/arch/s390/include/asm/mutex.h b/arch/s390/include/asm/mutex.h
index 458c1f7fbc18..688271f5f2e4 100644
--- a/arch/s390/include/asm/mutex.h
+++ b/arch/s390/include/asm/mutex.h
@@ -7,3 +7,5 @@
7 */ 7 */
8 8
9#include <asm-generic/mutex-dec.h> 9#include <asm-generic/mutex-dec.h>
10
11#define arch_mutex_cpu_relax() barrier()
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 182845147fe4..08cba2c3b612 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1407,6 +1407,82 @@ static const struct file_operations proc_pid_sched_operations = {
1407 1407
1408#endif 1408#endif
1409 1409
1410#ifdef CONFIG_SCHED_AUTOGROUP
1411/*
1412 * Print out autogroup related information:
1413 */
1414static int sched_autogroup_show(struct seq_file *m, void *v)
1415{
1416 struct inode *inode = m->private;
1417 struct task_struct *p;
1418
1419 p = get_proc_task(inode);
1420 if (!p)
1421 return -ESRCH;
1422 proc_sched_autogroup_show_task(p, m);
1423
1424 put_task_struct(p);
1425
1426 return 0;
1427}
1428
1429static ssize_t
1430sched_autogroup_write(struct file *file, const char __user *buf,
1431 size_t count, loff_t *offset)
1432{
1433 struct inode *inode = file->f_path.dentry->d_inode;
1434 struct task_struct *p;
1435 char buffer[PROC_NUMBUF];
1436 long nice;
1437 int err;
1438
1439 memset(buffer, 0, sizeof(buffer));
1440 if (count > sizeof(buffer) - 1)
1441 count = sizeof(buffer) - 1;
1442 if (copy_from_user(buffer, buf, count))
1443 return -EFAULT;
1444
1445 err = strict_strtol(strstrip(buffer), 0, &nice);
1446 if (err)
1447 return -EINVAL;
1448
1449 p = get_proc_task(inode);
1450 if (!p)
1451 return -ESRCH;
1452
1453 err = nice;
1454 err = proc_sched_autogroup_set_nice(p, &err);
1455 if (err)
1456 count = err;
1457
1458 put_task_struct(p);
1459
1460 return count;
1461}
1462
1463static int sched_autogroup_open(struct inode *inode, struct file *filp)
1464{
1465 int ret;
1466
1467 ret = single_open(filp, sched_autogroup_show, NULL);
1468 if (!ret) {
1469 struct seq_file *m = filp->private_data;
1470
1471 m->private = inode;
1472 }
1473 return ret;
1474}
1475
1476static const struct file_operations proc_pid_sched_autogroup_operations = {
1477 .open = sched_autogroup_open,
1478 .read = seq_read,
1479 .write = sched_autogroup_write,
1480 .llseek = seq_lseek,
1481 .release = single_release,
1482};
1483
1484#endif /* CONFIG_SCHED_AUTOGROUP */
1485
1410static ssize_t comm_write(struct file *file, const char __user *buf, 1486static ssize_t comm_write(struct file *file, const char __user *buf,
1411 size_t count, loff_t *offset) 1487 size_t count, loff_t *offset)
1412{ 1488{
@@ -2733,6 +2809,9 @@ static const struct pid_entry tgid_base_stuff[] = {
2733#ifdef CONFIG_SCHED_DEBUG 2809#ifdef CONFIG_SCHED_DEBUG
2734 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2810 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2735#endif 2811#endif
2812#ifdef CONFIG_SCHED_AUTOGROUP
2813 REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
2814#endif
2736 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 2815 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2737#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2816#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2738 INF("syscall", S_IRUSR, proc_pid_syscall), 2817 INF("syscall", S_IRUSR, proc_pid_syscall),
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 1f8c06ce0fa6..6ed8812bfe2d 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -12,6 +12,13 @@
12#include <linux/securebits.h> 12#include <linux/securebits.h>
13#include <net/net_namespace.h> 13#include <net/net_namespace.h>
14 14
15#ifdef CONFIG_SMP
16# define INIT_PUSHABLE_TASKS(tsk) \
17 .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO),
18#else
19# define INIT_PUSHABLE_TASKS(tsk)
20#endif
21
15extern struct files_struct init_files; 22extern struct files_struct init_files;
16extern struct fs_struct init_fs; 23extern struct fs_struct init_fs;
17 24
@@ -137,7 +144,7 @@ extern struct cred init_cred;
137 .nr_cpus_allowed = NR_CPUS, \ 144 .nr_cpus_allowed = NR_CPUS, \
138 }, \ 145 }, \
139 .tasks = LIST_HEAD_INIT(tsk.tasks), \ 146 .tasks = LIST_HEAD_INIT(tsk.tasks), \
140 .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \ 147 INIT_PUSHABLE_TASKS(tsk) \
141 .ptraced = LIST_HEAD_INIT(tsk.ptraced), \ 148 .ptraced = LIST_HEAD_INIT(tsk.ptraced), \
142 .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \ 149 .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \
143 .real_parent = &tsk, \ 150 .real_parent = &tsk, \
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index f363bc8fdc74..94b48bd40dd7 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -160,4 +160,8 @@ extern int mutex_trylock(struct mutex *lock);
160extern void mutex_unlock(struct mutex *lock); 160extern void mutex_unlock(struct mutex *lock);
161extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); 161extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
162 162
163#ifndef CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX
164#define arch_mutex_cpu_relax() cpu_relax()
165#endif
166
163#endif 167#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 223874538b33..883ad10eadbf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -509,6 +509,8 @@ struct thread_group_cputimer {
509 spinlock_t lock; 509 spinlock_t lock;
510}; 510};
511 511
512struct autogroup;
513
512/* 514/*
513 * NOTE! "signal_struct" does not have its own 515 * NOTE! "signal_struct" does not have its own
514 * locking, because a shared signal_struct always 516 * locking, because a shared signal_struct always
@@ -576,6 +578,9 @@ struct signal_struct {
576 578
577 struct tty_struct *tty; /* NULL if no tty */ 579 struct tty_struct *tty; /* NULL if no tty */
578 580
581#ifdef CONFIG_SCHED_AUTOGROUP
582 struct autogroup *autogroup;
583#endif
579 /* 584 /*
580 * Cumulative resource counters for dead threads in the group, 585 * Cumulative resource counters for dead threads in the group,
581 * and for reaped dead child processes forked by this group. 586 * and for reaped dead child processes forked by this group.
@@ -1235,7 +1240,9 @@ struct task_struct {
1235#endif 1240#endif
1236 1241
1237 struct list_head tasks; 1242 struct list_head tasks;
1243#ifdef CONFIG_SMP
1238 struct plist_node pushable_tasks; 1244 struct plist_node pushable_tasks;
1245#endif
1239 1246
1240 struct mm_struct *mm, *active_mm; 1247 struct mm_struct *mm, *active_mm;
1241#if defined(SPLIT_RSS_COUNTING) 1248#if defined(SPLIT_RSS_COUNTING)
@@ -1872,14 +1879,11 @@ extern void sched_clock_idle_sleep_event(void);
1872extern void sched_clock_idle_wakeup_event(u64 delta_ns); 1879extern void sched_clock_idle_wakeup_event(u64 delta_ns);
1873 1880
1874#ifdef CONFIG_HOTPLUG_CPU 1881#ifdef CONFIG_HOTPLUG_CPU
1875extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p);
1876extern void idle_task_exit(void); 1882extern void idle_task_exit(void);
1877#else 1883#else
1878static inline void idle_task_exit(void) {} 1884static inline void idle_task_exit(void) {}
1879#endif 1885#endif
1880 1886
1881extern void sched_idle_next(void);
1882
1883#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 1887#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
1884extern void wake_up_idle_cpu(int cpu); 1888extern void wake_up_idle_cpu(int cpu);
1885#else 1889#else
@@ -1889,8 +1893,6 @@ static inline void wake_up_idle_cpu(int cpu) { }
1889extern unsigned int sysctl_sched_latency; 1893extern unsigned int sysctl_sched_latency;
1890extern unsigned int sysctl_sched_min_granularity; 1894extern unsigned int sysctl_sched_min_granularity;
1891extern unsigned int sysctl_sched_wakeup_granularity; 1895extern unsigned int sysctl_sched_wakeup_granularity;
1892extern unsigned int sysctl_sched_shares_ratelimit;
1893extern unsigned int sysctl_sched_shares_thresh;
1894extern unsigned int sysctl_sched_child_runs_first; 1896extern unsigned int sysctl_sched_child_runs_first;
1895 1897
1896enum sched_tunable_scaling { 1898enum sched_tunable_scaling {
@@ -1906,6 +1908,7 @@ extern unsigned int sysctl_sched_migration_cost;
1906extern unsigned int sysctl_sched_nr_migrate; 1908extern unsigned int sysctl_sched_nr_migrate;
1907extern unsigned int sysctl_sched_time_avg; 1909extern unsigned int sysctl_sched_time_avg;
1908extern unsigned int sysctl_timer_migration; 1910extern unsigned int sysctl_timer_migration;
1911extern unsigned int sysctl_sched_shares_window;
1909 1912
1910int sched_proc_update_handler(struct ctl_table *table, int write, 1913int sched_proc_update_handler(struct ctl_table *table, int write,
1911 void __user *buffer, size_t *length, 1914 void __user *buffer, size_t *length,
@@ -1931,6 +1934,24 @@ int sched_rt_handler(struct ctl_table *table, int write,
1931 1934
1932extern unsigned int sysctl_sched_compat_yield; 1935extern unsigned int sysctl_sched_compat_yield;
1933 1936
1937#ifdef CONFIG_SCHED_AUTOGROUP
1938extern unsigned int sysctl_sched_autogroup_enabled;
1939
1940extern void sched_autogroup_create_attach(struct task_struct *p);
1941extern void sched_autogroup_detach(struct task_struct *p);
1942extern void sched_autogroup_fork(struct signal_struct *sig);
1943extern void sched_autogroup_exit(struct signal_struct *sig);
1944#ifdef CONFIG_PROC_FS
1945extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m);
1946extern int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice);
1947#endif
1948#else
1949static inline void sched_autogroup_create_attach(struct task_struct *p) { }
1950static inline void sched_autogroup_detach(struct task_struct *p) { }
1951static inline void sched_autogroup_fork(struct signal_struct *sig) { }
1952static inline void sched_autogroup_exit(struct signal_struct *sig) { }
1953#endif
1954
1934#ifdef CONFIG_RT_MUTEXES 1955#ifdef CONFIG_RT_MUTEXES
1935extern int rt_mutex_getprio(struct task_struct *p); 1956extern int rt_mutex_getprio(struct task_struct *p);
1936extern void rt_mutex_setprio(struct task_struct *p, int prio); 1957extern void rt_mutex_setprio(struct task_struct *p, int prio);
@@ -1949,9 +1970,10 @@ extern int task_nice(const struct task_struct *p);
1949extern int can_nice(const struct task_struct *p, const int nice); 1970extern int can_nice(const struct task_struct *p, const int nice);
1950extern int task_curr(const struct task_struct *p); 1971extern int task_curr(const struct task_struct *p);
1951extern int idle_cpu(int cpu); 1972extern int idle_cpu(int cpu);
1952extern int sched_setscheduler(struct task_struct *, int, struct sched_param *); 1973extern int sched_setscheduler(struct task_struct *, int,
1974 const struct sched_param *);
1953extern int sched_setscheduler_nocheck(struct task_struct *, int, 1975extern int sched_setscheduler_nocheck(struct task_struct *, int,
1954 struct sched_param *); 1976 const struct sched_param *);
1955extern struct task_struct *idle_task(int cpu); 1977extern struct task_struct *idle_task(int cpu);
1956extern struct task_struct *curr_task(int cpu); 1978extern struct task_struct *curr_task(int cpu);
1957extern void set_curr_task(int cpu, struct task_struct *p); 1979extern void set_curr_task(int cpu, struct task_struct *p);
diff --git a/init/Kconfig b/init/Kconfig
index c9728992a776..62b71491dde5 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -741,6 +741,19 @@ config NET_NS
741 741
742endif # NAMESPACES 742endif # NAMESPACES
743 743
744config SCHED_AUTOGROUP
745 bool "Automatic process group scheduling"
746 select EVENTFD
747 select CGROUPS
748 select CGROUP_SCHED
749 select FAIR_GROUP_SCHED
750 help
751 This option optimizes the scheduler for common desktop workloads by
752 automatically creating and populating task groups. This separation
753 of workloads isolates aggressive CPU burners (like build jobs) from
754 desktop applications. Task group autogeneration is currently based
755 upon task session.
756
744config MM_OWNER 757config MM_OWNER
745 bool 758 bool
746 759
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f6e726f18491..cb7a1efa9c2b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu)
189} 189}
190 190
191struct take_cpu_down_param { 191struct take_cpu_down_param {
192 struct task_struct *caller;
193 unsigned long mod; 192 unsigned long mod;
194 void *hcpu; 193 void *hcpu;
195}; 194};
@@ -198,7 +197,6 @@ struct take_cpu_down_param {
198static int __ref take_cpu_down(void *_param) 197static int __ref take_cpu_down(void *_param)
199{ 198{
200 struct take_cpu_down_param *param = _param; 199 struct take_cpu_down_param *param = _param;
201 unsigned int cpu = (unsigned long)param->hcpu;
202 int err; 200 int err;
203 201
204 /* Ensure this CPU doesn't handle any more interrupts. */ 202 /* Ensure this CPU doesn't handle any more interrupts. */
@@ -208,11 +206,6 @@ static int __ref take_cpu_down(void *_param)
208 206
209 cpu_notify(CPU_DYING | param->mod, param->hcpu); 207 cpu_notify(CPU_DYING | param->mod, param->hcpu);
210 208
211 if (task_cpu(param->caller) == cpu)
212 move_task_off_dead_cpu(cpu, param->caller);
213 /* Force idle task to run as soon as we yield: it should
214 immediately notice cpu is offline and die quickly. */
215 sched_idle_next();
216 return 0; 209 return 0;
217} 210}
218 211
@@ -223,7 +216,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
223 void *hcpu = (void *)(long)cpu; 216 void *hcpu = (void *)(long)cpu;
224 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 217 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
225 struct take_cpu_down_param tcd_param = { 218 struct take_cpu_down_param tcd_param = {
226 .caller = current,
227 .mod = mod, 219 .mod = mod,
228 .hcpu = hcpu, 220 .hcpu = hcpu,
229 }; 221 };
@@ -253,9 +245,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
253 } 245 }
254 BUG_ON(cpu_online(cpu)); 246 BUG_ON(cpu_online(cpu));
255 247
256 /* Wait for it to sleep (leaving idle task). */ 248 /*
249 * The migration_call() CPU_DYING callback will have removed all
250 * runnable tasks from the cpu, there's only the idle task left now
251 * that the migration thread is done doing the stop_machine thing.
252 *
253 * Wait for the stop thread to go away.
254 */
257 while (!idle_cpu(cpu)) 255 while (!idle_cpu(cpu))
258 yield(); 256 cpu_relax();
259 257
260 /* This actually kills the CPU. */ 258 /* This actually kills the CPU. */
261 __cpu_die(cpu); 259 __cpu_die(cpu);
diff --git a/kernel/fork.c b/kernel/fork.c
index 5447dc7defa9..7d164e25b0f0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -174,8 +174,10 @@ static inline void free_signal_struct(struct signal_struct *sig)
174 174
175static inline void put_signal_struct(struct signal_struct *sig) 175static inline void put_signal_struct(struct signal_struct *sig)
176{ 176{
177 if (atomic_dec_and_test(&sig->sigcnt)) 177 if (atomic_dec_and_test(&sig->sigcnt)) {
178 sched_autogroup_exit(sig);
178 free_signal_struct(sig); 179 free_signal_struct(sig);
180 }
179} 181}
180 182
181void __put_task_struct(struct task_struct *tsk) 183void __put_task_struct(struct task_struct *tsk)
@@ -905,6 +907,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
905 posix_cpu_timers_init_group(sig); 907 posix_cpu_timers_init_group(sig);
906 908
907 tty_audit_fork(sig); 909 tty_audit_fork(sig);
910 sched_autogroup_fork(sig);
908 911
909 sig->oom_adj = current->signal->oom_adj; 912 sig->oom_adj = current->signal->oom_adj;
910 sig->oom_score_adj = current->signal->oom_score_adj; 913 sig->oom_score_adj = current->signal->oom_score_adj;
@@ -1315,7 +1318,7 @@ bad_fork_cleanup_mm:
1315 } 1318 }
1316bad_fork_cleanup_signal: 1319bad_fork_cleanup_signal:
1317 if (!(clone_flags & CLONE_THREAD)) 1320 if (!(clone_flags & CLONE_THREAD))
1318 free_signal_struct(p->signal); 1321 put_signal_struct(p->signal);
1319bad_fork_cleanup_sighand: 1322bad_fork_cleanup_sighand:
1320 __cleanup_sighand(p->sighand); 1323 __cleanup_sighand(p->sighand);
1321bad_fork_cleanup_fs: 1324bad_fork_cleanup_fs:
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5f92acc5f952..91a5fa25054e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -577,7 +577,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
577 */ 577 */
578static int irq_thread(void *data) 578static int irq_thread(void *data)
579{ 579{
580 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; 580 static struct sched_param param = {
581 .sched_priority = MAX_USER_RT_PRIO/2,
582 };
581 struct irqaction *action = data; 583 struct irqaction *action = data;
582 struct irq_desc *desc = irq_to_desc(action->irq); 584 struct irq_desc *desc = irq_to_desc(action->irq);
583 int wake, oneshot = desc->status & IRQ_ONESHOT; 585 int wake, oneshot = desc->status & IRQ_ONESHOT;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ca61bbdd44b2..5355cfd44a3f 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
148 wait_for_completion(&create.done); 148 wait_for_completion(&create.done);
149 149
150 if (!IS_ERR(create.result)) { 150 if (!IS_ERR(create.result)) {
151 struct sched_param param = { .sched_priority = 0 }; 151 static struct sched_param param = { .sched_priority = 0 };
152 va_list args; 152 va_list args;
153 153
154 va_start(args, namefmt); 154 va_start(args, namefmt);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 200407c1502f..a5889fb28ecf 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -199,7 +199,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
199 * memory barriers as we'll eventually observe the right 199 * memory barriers as we'll eventually observe the right
200 * values at the cost of a few extra spins. 200 * values at the cost of a few extra spins.
201 */ 201 */
202 cpu_relax(); 202 arch_mutex_cpu_relax();
203 } 203 }
204#endif 204#endif
205 spin_lock_mutex(&lock->wait_lock, flags); 205 spin_lock_mutex(&lock->wait_lock, flags);
diff --git a/kernel/printk.c b/kernel/printk.c
index a23315dc4498..ab3ffc5b3b64 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1074,17 +1074,17 @@ static DEFINE_PER_CPU(int, printk_pending);
1074 1074
1075void printk_tick(void) 1075void printk_tick(void)
1076{ 1076{
1077 if (__get_cpu_var(printk_pending)) { 1077 if (__this_cpu_read(printk_pending)) {
1078 __get_cpu_var(printk_pending) = 0; 1078 __this_cpu_write(printk_pending, 0);
1079 wake_up_interruptible(&log_wait); 1079 wake_up_interruptible(&log_wait);
1080 } 1080 }
1081} 1081}
1082 1082
1083int printk_needs_cpu(int cpu) 1083int printk_needs_cpu(int cpu)
1084{ 1084{
1085 if (unlikely(cpu_is_offline(cpu))) 1085 if (cpu_is_offline(cpu))
1086 printk_tick(); 1086 printk_tick();
1087 return per_cpu(printk_pending, cpu); 1087 return __this_cpu_read(printk_pending);
1088} 1088}
1089 1089
1090void wake_up_klogd(void) 1090void wake_up_klogd(void)
diff --git a/kernel/sched.c b/kernel/sched.c
index 297d1a0eedb0..f2f914e0c47c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,9 +75,11 @@
75 75
76#include <asm/tlb.h> 76#include <asm/tlb.h>
77#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
78#include <asm/mutex.h>
78 79
79#include "sched_cpupri.h" 80#include "sched_cpupri.h"
80#include "workqueue_sched.h" 81#include "workqueue_sched.h"
82#include "sched_autogroup.h"
81 83
82#define CREATE_TRACE_POINTS 84#define CREATE_TRACE_POINTS
83#include <trace/events/sched.h> 85#include <trace/events/sched.h>
@@ -253,6 +255,8 @@ struct task_group {
253 /* runqueue "owned" by this group on each cpu */ 255 /* runqueue "owned" by this group on each cpu */
254 struct cfs_rq **cfs_rq; 256 struct cfs_rq **cfs_rq;
255 unsigned long shares; 257 unsigned long shares;
258
259 atomic_t load_weight;
256#endif 260#endif
257 261
258#ifdef CONFIG_RT_GROUP_SCHED 262#ifdef CONFIG_RT_GROUP_SCHED
@@ -268,24 +272,19 @@ struct task_group {
268 struct task_group *parent; 272 struct task_group *parent;
269 struct list_head siblings; 273 struct list_head siblings;
270 struct list_head children; 274 struct list_head children;
275
276#ifdef CONFIG_SCHED_AUTOGROUP
277 struct autogroup *autogroup;
278#endif
271}; 279};
272 280
273#define root_task_group init_task_group 281#define root_task_group init_task_group
274 282
275/* task_group_lock serializes add/remove of task groups and also changes to 283/* task_group_lock serializes the addition/removal of task groups */
276 * a task group's cpu shares.
277 */
278static DEFINE_SPINLOCK(task_group_lock); 284static DEFINE_SPINLOCK(task_group_lock);
279 285
280#ifdef CONFIG_FAIR_GROUP_SCHED 286#ifdef CONFIG_FAIR_GROUP_SCHED
281 287
282#ifdef CONFIG_SMP
283static int root_task_group_empty(void)
284{
285 return list_empty(&root_task_group.children);
286}
287#endif
288
289# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 288# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
290 289
291/* 290/*
@@ -342,6 +341,7 @@ struct cfs_rq {
342 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 341 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
343 * list is used during load balance. 342 * list is used during load balance.
344 */ 343 */
344 int on_list;
345 struct list_head leaf_cfs_rq_list; 345 struct list_head leaf_cfs_rq_list;
346 struct task_group *tg; /* group that "owns" this runqueue */ 346 struct task_group *tg; /* group that "owns" this runqueue */
347 347
@@ -360,14 +360,17 @@ struct cfs_rq {
360 unsigned long h_load; 360 unsigned long h_load;
361 361
362 /* 362 /*
363 * this cpu's part of tg->shares 363 * Maintaining per-cpu shares distribution for group scheduling
364 *
365 * load_stamp is the last time we updated the load average
366 * load_last is the last time we updated the load average and saw load
367 * load_unacc_exec_time is currently unaccounted execution time
364 */ 368 */
365 unsigned long shares; 369 u64 load_avg;
370 u64 load_period;
371 u64 load_stamp, load_last, load_unacc_exec_time;
366 372
367 /* 373 unsigned long load_contribution;
368 * load.weight at the time we set shares
369 */
370 unsigned long rq_weight;
371#endif 374#endif
372#endif 375#endif
373}; 376};
@@ -605,11 +608,14 @@ static inline int cpu_of(struct rq *rq)
605 */ 608 */
606static inline struct task_group *task_group(struct task_struct *p) 609static inline struct task_group *task_group(struct task_struct *p)
607{ 610{
611 struct task_group *tg;
608 struct cgroup_subsys_state *css; 612 struct cgroup_subsys_state *css;
609 613
610 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 614 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
611 lockdep_is_held(&task_rq(p)->lock)); 615 lockdep_is_held(&task_rq(p)->lock));
612 return container_of(css, struct task_group, css); 616 tg = container_of(css, struct task_group, css);
617
618 return autogroup_task_group(p, tg);
613} 619}
614 620
615/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 621/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -793,20 +799,6 @@ late_initcall(sched_init_debug);
793const_debug unsigned int sysctl_sched_nr_migrate = 32; 799const_debug unsigned int sysctl_sched_nr_migrate = 32;
794 800
795/* 801/*
796 * ratelimit for updating the group shares.
797 * default: 0.25ms
798 */
799unsigned int sysctl_sched_shares_ratelimit = 250000;
800unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
801
802/*
803 * Inject some fuzzyness into changing the per-cpu group shares
804 * this avoids remote rq-locks at the expense of fairness.
805 * default: 4
806 */
807unsigned int sysctl_sched_shares_thresh = 4;
808
809/*
810 * period over which we average the RT time consumption, measured 802 * period over which we average the RT time consumption, measured
811 * in ms. 803 * in ms.
812 * 804 *
@@ -1355,6 +1347,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1355 lw->inv_weight = 0; 1347 lw->inv_weight = 0;
1356} 1348}
1357 1349
1350static inline void update_load_set(struct load_weight *lw, unsigned long w)
1351{
1352 lw->weight = w;
1353 lw->inv_weight = 0;
1354}
1355
1358/* 1356/*
1359 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1357 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1360 * of tasks with abnormal "nice" values across CPUs the contribution that 1358 * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1543,101 +1541,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1543 1541
1544#ifdef CONFIG_FAIR_GROUP_SCHED 1542#ifdef CONFIG_FAIR_GROUP_SCHED
1545 1543
1546static __read_mostly unsigned long __percpu *update_shares_data;
1547
1548static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1549
1550/*
1551 * Calculate and set the cpu's group shares.
1552 */
1553static void update_group_shares_cpu(struct task_group *tg, int cpu,
1554 unsigned long sd_shares,
1555 unsigned long sd_rq_weight,
1556 unsigned long *usd_rq_weight)
1557{
1558 unsigned long shares, rq_weight;
1559 int boost = 0;
1560
1561 rq_weight = usd_rq_weight[cpu];
1562 if (!rq_weight) {
1563 boost = 1;
1564 rq_weight = NICE_0_LOAD;
1565 }
1566
1567 /*
1568 * \Sum_j shares_j * rq_weight_i
1569 * shares_i = -----------------------------
1570 * \Sum_j rq_weight_j
1571 */
1572 shares = (sd_shares * rq_weight) / sd_rq_weight;
1573 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1574
1575 if (abs(shares - tg->se[cpu]->load.weight) >
1576 sysctl_sched_shares_thresh) {
1577 struct rq *rq = cpu_rq(cpu);
1578 unsigned long flags;
1579
1580 raw_spin_lock_irqsave(&rq->lock, flags);
1581 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1582 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1583 __set_se_shares(tg->se[cpu], shares);
1584 raw_spin_unlock_irqrestore(&rq->lock, flags);
1585 }
1586}
1587
1588/*
1589 * Re-compute the task group their per cpu shares over the given domain.
1590 * This needs to be done in a bottom-up fashion because the rq weight of a
1591 * parent group depends on the shares of its child groups.
1592 */
1593static int tg_shares_up(struct task_group *tg, void *data)
1594{
1595 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1596 unsigned long *usd_rq_weight;
1597 struct sched_domain *sd = data;
1598 unsigned long flags;
1599 int i;
1600
1601 if (!tg->se[0])
1602 return 0;
1603
1604 local_irq_save(flags);
1605 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1606
1607 for_each_cpu(i, sched_domain_span(sd)) {
1608 weight = tg->cfs_rq[i]->load.weight;
1609 usd_rq_weight[i] = weight;
1610
1611 rq_weight += weight;
1612 /*
1613 * If there are currently no tasks on the cpu pretend there
1614 * is one of average load so that when a new task gets to
1615 * run here it will not get delayed by group starvation.
1616 */
1617 if (!weight)
1618 weight = NICE_0_LOAD;
1619
1620 sum_weight += weight;
1621 shares += tg->cfs_rq[i]->shares;
1622 }
1623
1624 if (!rq_weight)
1625 rq_weight = sum_weight;
1626
1627 if ((!shares && rq_weight) || shares > tg->shares)
1628 shares = tg->shares;
1629
1630 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1631 shares = tg->shares;
1632
1633 for_each_cpu(i, sched_domain_span(sd))
1634 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1635
1636 local_irq_restore(flags);
1637
1638 return 0;
1639}
1640
1641/* 1544/*
1642 * Compute the cpu's hierarchical load factor for each task group. 1545 * Compute the cpu's hierarchical load factor for each task group.
1643 * This needs to be done in a top-down fashion because the load of a child 1546 * This needs to be done in a top-down fashion because the load of a child
@@ -1652,7 +1555,7 @@ static int tg_load_down(struct task_group *tg, void *data)
1652 load = cpu_rq(cpu)->load.weight; 1555 load = cpu_rq(cpu)->load.weight;
1653 } else { 1556 } else {
1654 load = tg->parent->cfs_rq[cpu]->h_load; 1557 load = tg->parent->cfs_rq[cpu]->h_load;
1655 load *= tg->cfs_rq[cpu]->shares; 1558 load *= tg->se[cpu]->load.weight;
1656 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1559 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1657 } 1560 }
1658 1561
@@ -1661,34 +1564,11 @@ static int tg_load_down(struct task_group *tg, void *data)
1661 return 0; 1564 return 0;
1662} 1565}
1663 1566
1664static void update_shares(struct sched_domain *sd)
1665{
1666 s64 elapsed;
1667 u64 now;
1668
1669 if (root_task_group_empty())
1670 return;
1671
1672 now = local_clock();
1673 elapsed = now - sd->last_update;
1674
1675 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1676 sd->last_update = now;
1677 walk_tg_tree(tg_nop, tg_shares_up, sd);
1678 }
1679}
1680
1681static void update_h_load(long cpu) 1567static void update_h_load(long cpu)
1682{ 1568{
1683 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1569 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1684} 1570}
1685 1571
1686#else
1687
1688static inline void update_shares(struct sched_domain *sd)
1689{
1690}
1691
1692#endif 1572#endif
1693 1573
1694#ifdef CONFIG_PREEMPT 1574#ifdef CONFIG_PREEMPT
@@ -1810,15 +1690,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1810 1690
1811#endif 1691#endif
1812 1692
1813#ifdef CONFIG_FAIR_GROUP_SCHED
1814static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1815{
1816#ifdef CONFIG_SMP
1817 cfs_rq->shares = shares;
1818#endif
1819}
1820#endif
1821
1822static void calc_load_account_idle(struct rq *this_rq); 1693static void calc_load_account_idle(struct rq *this_rq);
1823static void update_sysctl(void); 1694static void update_sysctl(void);
1824static int get_update_sysctl_factor(void); 1695static int get_update_sysctl_factor(void);
@@ -2063,6 +1934,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
2063#include "sched_idletask.c" 1934#include "sched_idletask.c"
2064#include "sched_fair.c" 1935#include "sched_fair.c"
2065#include "sched_rt.c" 1936#include "sched_rt.c"
1937#include "sched_autogroup.c"
2066#include "sched_stoptask.c" 1938#include "sched_stoptask.c"
2067#ifdef CONFIG_SCHED_DEBUG 1939#ifdef CONFIG_SCHED_DEBUG
2068# include "sched_debug.c" 1940# include "sched_debug.c"
@@ -2255,10 +2127,8 @@ static int migration_cpu_stop(void *data);
2255 * The task's runqueue lock must be held. 2127 * The task's runqueue lock must be held.
2256 * Returns true if you have to wait for migration thread. 2128 * Returns true if you have to wait for migration thread.
2257 */ 2129 */
2258static bool migrate_task(struct task_struct *p, int dest_cpu) 2130static bool migrate_task(struct task_struct *p, struct rq *rq)
2259{ 2131{
2260 struct rq *rq = task_rq(p);
2261
2262 /* 2132 /*
2263 * If the task is not on a runqueue (and not running), then 2133 * If the task is not on a runqueue (and not running), then
2264 * the next wake-up will properly place the task. 2134 * the next wake-up will properly place the task.
@@ -2438,18 +2308,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2438 return dest_cpu; 2308 return dest_cpu;
2439 2309
2440 /* No more Mr. Nice Guy. */ 2310 /* No more Mr. Nice Guy. */
2441 if (unlikely(dest_cpu >= nr_cpu_ids)) { 2311 dest_cpu = cpuset_cpus_allowed_fallback(p);
2442 dest_cpu = cpuset_cpus_allowed_fallback(p); 2312 /*
2443 /* 2313 * Don't tell them about moving exiting tasks or
2444 * Don't tell them about moving exiting tasks or 2314 * kernel threads (both mm NULL), since they never
2445 * kernel threads (both mm NULL), since they never 2315 * leave kernel.
2446 * leave kernel. 2316 */
2447 */ 2317 if (p->mm && printk_ratelimit()) {
2448 if (p->mm && printk_ratelimit()) { 2318 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
2449 printk(KERN_INFO "process %d (%s) no " 2319 task_pid_nr(p), p->comm, cpu);
2450 "longer affine to cpu%d\n",
2451 task_pid_nr(p), p->comm, cpu);
2452 }
2453 } 2320 }
2454 2321
2455 return dest_cpu; 2322 return dest_cpu;
@@ -2785,7 +2652,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
2785 /* Want to start with kernel preemption disabled. */ 2652 /* Want to start with kernel preemption disabled. */
2786 task_thread_info(p)->preempt_count = 1; 2653 task_thread_info(p)->preempt_count = 1;
2787#endif 2654#endif
2655#ifdef CONFIG_SMP
2788 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2656 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2657#endif
2789 2658
2790 put_cpu(); 2659 put_cpu();
2791} 2660}
@@ -3549,7 +3418,7 @@ void sched_exec(void)
3549 * select_task_rq() can race against ->cpus_allowed 3418 * select_task_rq() can race against ->cpus_allowed
3550 */ 3419 */
3551 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && 3420 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3552 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { 3421 likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
3553 struct migration_arg arg = { p, dest_cpu }; 3422 struct migration_arg arg = { p, dest_cpu };
3554 3423
3555 task_rq_unlock(rq, &flags); 3424 task_rq_unlock(rq, &flags);
@@ -4214,7 +4083,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4214 if (task_thread_info(rq->curr) != owner || need_resched()) 4083 if (task_thread_info(rq->curr) != owner || need_resched())
4215 return 0; 4084 return 0;
4216 4085
4217 cpu_relax(); 4086 arch_mutex_cpu_relax();
4218 } 4087 }
4219 4088
4220 return 1; 4089 return 1;
@@ -4901,7 +4770,7 @@ static bool check_same_owner(struct task_struct *p)
4901} 4770}
4902 4771
4903static int __sched_setscheduler(struct task_struct *p, int policy, 4772static int __sched_setscheduler(struct task_struct *p, int policy,
4904 struct sched_param *param, bool user) 4773 const struct sched_param *param, bool user)
4905{ 4774{
4906 int retval, oldprio, oldpolicy = -1, on_rq, running; 4775 int retval, oldprio, oldpolicy = -1, on_rq, running;
4907 unsigned long flags; 4776 unsigned long flags;
@@ -5056,7 +4925,7 @@ recheck:
5056 * NOTE that the task may be already dead. 4925 * NOTE that the task may be already dead.
5057 */ 4926 */
5058int sched_setscheduler(struct task_struct *p, int policy, 4927int sched_setscheduler(struct task_struct *p, int policy,
5059 struct sched_param *param) 4928 const struct sched_param *param)
5060{ 4929{
5061 return __sched_setscheduler(p, policy, param, true); 4930 return __sched_setscheduler(p, policy, param, true);
5062} 4931}
@@ -5074,7 +4943,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
5074 * but our caller might not have that capability. 4943 * but our caller might not have that capability.
5075 */ 4944 */
5076int sched_setscheduler_nocheck(struct task_struct *p, int policy, 4945int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5077 struct sched_param *param) 4946 const struct sched_param *param)
5078{ 4947{
5079 return __sched_setscheduler(p, policy, param, false); 4948 return __sched_setscheduler(p, policy, param, false);
5080} 4949}
@@ -5590,7 +5459,7 @@ void sched_show_task(struct task_struct *p)
5590 unsigned state; 5459 unsigned state;
5591 5460
5592 state = p->state ? __ffs(p->state) + 1 : 0; 5461 state = p->state ? __ffs(p->state) + 1 : 0;
5593 printk(KERN_INFO "%-13.13s %c", p->comm, 5462 printk(KERN_INFO "%-15.15s %c", p->comm,
5594 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5463 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5595#if BITS_PER_LONG == 32 5464#if BITS_PER_LONG == 32
5596 if (state == TASK_RUNNING) 5465 if (state == TASK_RUNNING)
@@ -5754,7 +5623,6 @@ static void update_sysctl(void)
5754 SET_SYSCTL(sched_min_granularity); 5623 SET_SYSCTL(sched_min_granularity);
5755 SET_SYSCTL(sched_latency); 5624 SET_SYSCTL(sched_latency);
5756 SET_SYSCTL(sched_wakeup_granularity); 5625 SET_SYSCTL(sched_wakeup_granularity);
5757 SET_SYSCTL(sched_shares_ratelimit);
5758#undef SET_SYSCTL 5626#undef SET_SYSCTL
5759} 5627}
5760 5628
@@ -5830,7 +5698,7 @@ again:
5830 goto out; 5698 goto out;
5831 5699
5832 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5700 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5833 if (migrate_task(p, dest_cpu)) { 5701 if (migrate_task(p, rq)) {
5834 struct migration_arg arg = { p, dest_cpu }; 5702 struct migration_arg arg = { p, dest_cpu };
5835 /* Need help from migration thread: drop lock and wait. */ 5703 /* Need help from migration thread: drop lock and wait. */
5836 task_rq_unlock(rq, &flags); 5704 task_rq_unlock(rq, &flags);
@@ -5912,29 +5780,20 @@ static int migration_cpu_stop(void *data)
5912} 5780}
5913 5781
5914#ifdef CONFIG_HOTPLUG_CPU 5782#ifdef CONFIG_HOTPLUG_CPU
5783
5915/* 5784/*
5916 * Figure out where task on dead CPU should go, use force if necessary. 5785 * Ensures that the idle task is using init_mm right before its cpu goes
5786 * offline.
5917 */ 5787 */
5918void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5788void idle_task_exit(void)
5919{ 5789{
5920 struct rq *rq = cpu_rq(dead_cpu); 5790 struct mm_struct *mm = current->active_mm;
5921 int needs_cpu, uninitialized_var(dest_cpu);
5922 unsigned long flags;
5923 5791
5924 local_irq_save(flags); 5792 BUG_ON(cpu_online(smp_processor_id()));
5925 5793
5926 raw_spin_lock(&rq->lock); 5794 if (mm != &init_mm)
5927 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); 5795 switch_mm(mm, &init_mm, current);
5928 if (needs_cpu) 5796 mmdrop(mm);
5929 dest_cpu = select_fallback_rq(dead_cpu, p);
5930 raw_spin_unlock(&rq->lock);
5931 /*
5932 * It can only fail if we race with set_cpus_allowed(),
5933 * in the racer should migrate the task anyway.
5934 */
5935 if (needs_cpu)
5936 __migrate_task(p, dead_cpu, dest_cpu);
5937 local_irq_restore(flags);
5938} 5797}
5939 5798
5940/* 5799/*
@@ -5947,128 +5806,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5947static void migrate_nr_uninterruptible(struct rq *rq_src) 5806static void migrate_nr_uninterruptible(struct rq *rq_src)
5948{ 5807{
5949 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 5808 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5950 unsigned long flags;
5951 5809
5952 local_irq_save(flags);
5953 double_rq_lock(rq_src, rq_dest);
5954 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 5810 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5955 rq_src->nr_uninterruptible = 0; 5811 rq_src->nr_uninterruptible = 0;
5956 double_rq_unlock(rq_src, rq_dest);
5957 local_irq_restore(flags);
5958}
5959
5960/* Run through task list and migrate tasks from the dead cpu. */
5961static void migrate_live_tasks(int src_cpu)
5962{
5963 struct task_struct *p, *t;
5964
5965 read_lock(&tasklist_lock);
5966
5967 do_each_thread(t, p) {
5968 if (p == current)
5969 continue;
5970
5971 if (task_cpu(p) == src_cpu)
5972 move_task_off_dead_cpu(src_cpu, p);
5973 } while_each_thread(t, p);
5974
5975 read_unlock(&tasklist_lock);
5976} 5812}
5977 5813
5978/* 5814/*
5979 * Schedules idle task to be the next runnable task on current CPU. 5815 * remove the tasks which were accounted by rq from calc_load_tasks.
5980 * It does so by boosting its priority to highest possible.
5981 * Used by CPU offline code.
5982 */ 5816 */
5983void sched_idle_next(void) 5817static void calc_global_load_remove(struct rq *rq)
5984{ 5818{
5985 int this_cpu = smp_processor_id(); 5819 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5986 struct rq *rq = cpu_rq(this_cpu); 5820 rq->calc_load_active = 0;
5987 struct task_struct *p = rq->idle;
5988 unsigned long flags;
5989
5990 /* cpu has to be offline */
5991 BUG_ON(cpu_online(this_cpu));
5992
5993 /*
5994 * Strictly not necessary since rest of the CPUs are stopped by now
5995 * and interrupts disabled on the current cpu.
5996 */
5997 raw_spin_lock_irqsave(&rq->lock, flags);
5998
5999 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
6000
6001 activate_task(rq, p, 0);
6002
6003 raw_spin_unlock_irqrestore(&rq->lock, flags);
6004} 5821}
6005 5822
6006/* 5823/*
6007 * Ensures that the idle task is using init_mm right before its cpu goes 5824 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6008 * offline. 5825 * try_to_wake_up()->select_task_rq().
5826 *
5827 * Called with rq->lock held even though we're in stop_machine() and
5828 * there's no concurrency possible, we hold the required locks anyway
5829 * because of lock validation efforts.
6009 */ 5830 */
6010void idle_task_exit(void) 5831static void migrate_tasks(unsigned int dead_cpu)
6011{
6012 struct mm_struct *mm = current->active_mm;
6013
6014 BUG_ON(cpu_online(smp_processor_id()));
6015
6016 if (mm != &init_mm)
6017 switch_mm(mm, &init_mm, current);
6018 mmdrop(mm);
6019}
6020
6021/* called under rq->lock with disabled interrupts */
6022static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
6023{ 5832{
6024 struct rq *rq = cpu_rq(dead_cpu); 5833 struct rq *rq = cpu_rq(dead_cpu);
6025 5834 struct task_struct *next, *stop = rq->stop;
6026 /* Must be exiting, otherwise would be on tasklist. */ 5835 int dest_cpu;
6027 BUG_ON(!p->exit_state);
6028
6029 /* Cannot have done final schedule yet: would have vanished. */
6030 BUG_ON(p->state == TASK_DEAD);
6031
6032 get_task_struct(p);
6033 5836
6034 /* 5837 /*
6035 * Drop lock around migration; if someone else moves it, 5838 * Fudge the rq selection such that the below task selection loop
6036 * that's OK. No task can be added to this CPU, so iteration is 5839 * doesn't get stuck on the currently eligible stop task.
6037 * fine. 5840 *
5841 * We're currently inside stop_machine() and the rq is either stuck
5842 * in the stop_machine_cpu_stop() loop, or we're executing this code,
5843 * either way we should never end up calling schedule() until we're
5844 * done here.
6038 */ 5845 */
6039 raw_spin_unlock_irq(&rq->lock); 5846 rq->stop = NULL;
6040 move_task_off_dead_cpu(dead_cpu, p);
6041 raw_spin_lock_irq(&rq->lock);
6042
6043 put_task_struct(p);
6044}
6045
6046/* release_task() removes task from tasklist, so we won't find dead tasks. */
6047static void migrate_dead_tasks(unsigned int dead_cpu)
6048{
6049 struct rq *rq = cpu_rq(dead_cpu);
6050 struct task_struct *next;
6051 5847
6052 for ( ; ; ) { 5848 for ( ; ; ) {
6053 if (!rq->nr_running) 5849 /*
5850 * There's this thread running, bail when that's the only
5851 * remaining thread.
5852 */
5853 if (rq->nr_running == 1)
6054 break; 5854 break;
5855
6055 next = pick_next_task(rq); 5856 next = pick_next_task(rq);
6056 if (!next) 5857 BUG_ON(!next);
6057 break;
6058 next->sched_class->put_prev_task(rq, next); 5858 next->sched_class->put_prev_task(rq, next);
6059 migrate_dead(dead_cpu, next);
6060 5859
5860 /* Find suitable destination for @next, with force if needed. */
5861 dest_cpu = select_fallback_rq(dead_cpu, next);
5862 raw_spin_unlock(&rq->lock);
5863
5864 __migrate_task(next, dead_cpu, dest_cpu);
5865
5866 raw_spin_lock(&rq->lock);
6061 } 5867 }
6062}
6063 5868
6064/* 5869 rq->stop = stop;
6065 * remove the tasks which were accounted by rq from calc_load_tasks.
6066 */
6067static void calc_global_load_remove(struct rq *rq)
6068{
6069 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
6070 rq->calc_load_active = 0;
6071} 5870}
5871
6072#endif /* CONFIG_HOTPLUG_CPU */ 5872#endif /* CONFIG_HOTPLUG_CPU */
6073 5873
6074#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5874#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -6278,15 +6078,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6278 unsigned long flags; 6078 unsigned long flags;
6279 struct rq *rq = cpu_rq(cpu); 6079 struct rq *rq = cpu_rq(cpu);
6280 6080
6281 switch (action) { 6081 switch (action & ~CPU_TASKS_FROZEN) {
6282 6082
6283 case CPU_UP_PREPARE: 6083 case CPU_UP_PREPARE:
6284 case CPU_UP_PREPARE_FROZEN:
6285 rq->calc_load_update = calc_load_update; 6084 rq->calc_load_update = calc_load_update;
6286 break; 6085 break;
6287 6086
6288 case CPU_ONLINE: 6087 case CPU_ONLINE:
6289 case CPU_ONLINE_FROZEN:
6290 /* Update our root-domain */ 6088 /* Update our root-domain */
6291 raw_spin_lock_irqsave(&rq->lock, flags); 6089 raw_spin_lock_irqsave(&rq->lock, flags);
6292 if (rq->rd) { 6090 if (rq->rd) {
@@ -6298,30 +6096,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6298 break; 6096 break;
6299 6097
6300#ifdef CONFIG_HOTPLUG_CPU 6098#ifdef CONFIG_HOTPLUG_CPU
6301 case CPU_DEAD:
6302 case CPU_DEAD_FROZEN:
6303 migrate_live_tasks(cpu);
6304 /* Idle task back to normal (off runqueue, low prio) */
6305 raw_spin_lock_irq(&rq->lock);
6306 deactivate_task(rq, rq->idle, 0);
6307 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
6308 rq->idle->sched_class = &idle_sched_class;
6309 migrate_dead_tasks(cpu);
6310 raw_spin_unlock_irq(&rq->lock);
6311 migrate_nr_uninterruptible(rq);
6312 BUG_ON(rq->nr_running != 0);
6313 calc_global_load_remove(rq);
6314 break;
6315
6316 case CPU_DYING: 6099 case CPU_DYING:
6317 case CPU_DYING_FROZEN:
6318 /* Update our root-domain */ 6100 /* Update our root-domain */
6319 raw_spin_lock_irqsave(&rq->lock, flags); 6101 raw_spin_lock_irqsave(&rq->lock, flags);
6320 if (rq->rd) { 6102 if (rq->rd) {
6321 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6103 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6322 set_rq_offline(rq); 6104 set_rq_offline(rq);
6323 } 6105 }
6106 migrate_tasks(cpu);
6107 BUG_ON(rq->nr_running != 1); /* the migration thread */
6324 raw_spin_unlock_irqrestore(&rq->lock, flags); 6108 raw_spin_unlock_irqrestore(&rq->lock, flags);
6109
6110 migrate_nr_uninterruptible(rq);
6111 calc_global_load_remove(rq);
6325 break; 6112 break;
6326#endif 6113#endif
6327 } 6114 }
@@ -8052,15 +7839,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8052 7839
8053#ifdef CONFIG_FAIR_GROUP_SCHED 7840#ifdef CONFIG_FAIR_GROUP_SCHED
8054static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 7841static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8055 struct sched_entity *se, int cpu, int add, 7842 struct sched_entity *se, int cpu,
8056 struct sched_entity *parent) 7843 struct sched_entity *parent)
8057{ 7844{
8058 struct rq *rq = cpu_rq(cpu); 7845 struct rq *rq = cpu_rq(cpu);
8059 tg->cfs_rq[cpu] = cfs_rq; 7846 tg->cfs_rq[cpu] = cfs_rq;
8060 init_cfs_rq(cfs_rq, rq); 7847 init_cfs_rq(cfs_rq, rq);
8061 cfs_rq->tg = tg; 7848 cfs_rq->tg = tg;
8062 if (add)
8063 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
8064 7849
8065 tg->se[cpu] = se; 7850 tg->se[cpu] = se;
8066 /* se could be NULL for init_task_group */ 7851 /* se could be NULL for init_task_group */
@@ -8073,15 +7858,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8073 se->cfs_rq = parent->my_q; 7858 se->cfs_rq = parent->my_q;
8074 7859
8075 se->my_q = cfs_rq; 7860 se->my_q = cfs_rq;
8076 se->load.weight = tg->shares; 7861 update_load_set(&se->load, 0);
8077 se->load.inv_weight = 0;
8078 se->parent = parent; 7862 se->parent = parent;
8079} 7863}
8080#endif 7864#endif
8081 7865
8082#ifdef CONFIG_RT_GROUP_SCHED 7866#ifdef CONFIG_RT_GROUP_SCHED
8083static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 7867static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8084 struct sched_rt_entity *rt_se, int cpu, int add, 7868 struct sched_rt_entity *rt_se, int cpu,
8085 struct sched_rt_entity *parent) 7869 struct sched_rt_entity *parent)
8086{ 7870{
8087 struct rq *rq = cpu_rq(cpu); 7871 struct rq *rq = cpu_rq(cpu);
@@ -8090,8 +7874,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8090 init_rt_rq(rt_rq, rq); 7874 init_rt_rq(rt_rq, rq);
8091 rt_rq->tg = tg; 7875 rt_rq->tg = tg;
8092 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7876 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8093 if (add)
8094 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
8095 7877
8096 tg->rt_se[cpu] = rt_se; 7878 tg->rt_se[cpu] = rt_se;
8097 if (!rt_se) 7879 if (!rt_se)
@@ -8164,13 +7946,9 @@ void __init sched_init(void)
8164#ifdef CONFIG_CGROUP_SCHED 7946#ifdef CONFIG_CGROUP_SCHED
8165 list_add(&init_task_group.list, &task_groups); 7947 list_add(&init_task_group.list, &task_groups);
8166 INIT_LIST_HEAD(&init_task_group.children); 7948 INIT_LIST_HEAD(&init_task_group.children);
8167 7949 autogroup_init(&init_task);
8168#endif /* CONFIG_CGROUP_SCHED */ 7950#endif /* CONFIG_CGROUP_SCHED */
8169 7951
8170#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
8171 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
8172 __alignof__(unsigned long));
8173#endif
8174 for_each_possible_cpu(i) { 7952 for_each_possible_cpu(i) {
8175 struct rq *rq; 7953 struct rq *rq;
8176 7954
@@ -8184,7 +7962,6 @@ void __init sched_init(void)
8184#ifdef CONFIG_FAIR_GROUP_SCHED 7962#ifdef CONFIG_FAIR_GROUP_SCHED
8185 init_task_group.shares = init_task_group_load; 7963 init_task_group.shares = init_task_group_load;
8186 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7964 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8187#ifdef CONFIG_CGROUP_SCHED
8188 /* 7965 /*
8189 * How much cpu bandwidth does init_task_group get? 7966 * How much cpu bandwidth does init_task_group get?
8190 * 7967 *
@@ -8204,16 +7981,13 @@ void __init sched_init(void)
8204 * We achieve this by letting init_task_group's tasks sit 7981 * We achieve this by letting init_task_group's tasks sit
8205 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7982 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
8206 */ 7983 */
8207 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7984 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL);
8208#endif
8209#endif /* CONFIG_FAIR_GROUP_SCHED */ 7985#endif /* CONFIG_FAIR_GROUP_SCHED */
8210 7986
8211 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 7987 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
8212#ifdef CONFIG_RT_GROUP_SCHED 7988#ifdef CONFIG_RT_GROUP_SCHED
8213 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7989 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
8214#ifdef CONFIG_CGROUP_SCHED 7990 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL);
8215 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
8216#endif
8217#endif 7991#endif
8218 7992
8219 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7993 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -8488,7 +8262,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8488 if (!se) 8262 if (!se)
8489 goto err_free_rq; 8263 goto err_free_rq;
8490 8264
8491 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 8265 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8492 } 8266 }
8493 8267
8494 return 1; 8268 return 1;
@@ -8499,15 +8273,21 @@ err:
8499 return 0; 8273 return 0;
8500} 8274}
8501 8275
8502static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8503{
8504 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
8505 &cpu_rq(cpu)->leaf_cfs_rq_list);
8506}
8507
8508static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8276static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8509{ 8277{
8510 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8278 struct rq *rq = cpu_rq(cpu);
8279 unsigned long flags;
8280
8281 /*
8282 * Only empty task groups can be destroyed; so we can speculatively
8283 * check on_list without danger of it being re-added.
8284 */
8285 if (!tg->cfs_rq[cpu]->on_list)
8286 return;
8287
8288 raw_spin_lock_irqsave(&rq->lock, flags);
8289 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8290 raw_spin_unlock_irqrestore(&rq->lock, flags);
8511} 8291}
8512#else /* !CONFIG_FAIR_GROUP_SCHED */ 8292#else /* !CONFIG_FAIR_GROUP_SCHED */
8513static inline void free_fair_sched_group(struct task_group *tg) 8293static inline void free_fair_sched_group(struct task_group *tg)
@@ -8520,10 +8300,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8520 return 1; 8300 return 1;
8521} 8301}
8522 8302
8523static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8524{
8525}
8526
8527static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8303static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8528{ 8304{
8529} 8305}
@@ -8578,7 +8354,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8578 if (!rt_se) 8354 if (!rt_se)
8579 goto err_free_rq; 8355 goto err_free_rq;
8580 8356
8581 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 8357 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8582 } 8358 }
8583 8359
8584 return 1; 8360 return 1;
@@ -8588,17 +8364,6 @@ err_free_rq:
8588err: 8364err:
8589 return 0; 8365 return 0;
8590} 8366}
8591
8592static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8593{
8594 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
8595 &cpu_rq(cpu)->leaf_rt_rq_list);
8596}
8597
8598static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8599{
8600 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8601}
8602#else /* !CONFIG_RT_GROUP_SCHED */ 8367#else /* !CONFIG_RT_GROUP_SCHED */
8603static inline void free_rt_sched_group(struct task_group *tg) 8368static inline void free_rt_sched_group(struct task_group *tg)
8604{ 8369{
@@ -8609,14 +8374,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8609{ 8374{
8610 return 1; 8375 return 1;
8611} 8376}
8612
8613static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8614{
8615}
8616
8617static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8618{
8619}
8620#endif /* CONFIG_RT_GROUP_SCHED */ 8377#endif /* CONFIG_RT_GROUP_SCHED */
8621 8378
8622#ifdef CONFIG_CGROUP_SCHED 8379#ifdef CONFIG_CGROUP_SCHED
@@ -8632,7 +8389,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8632{ 8389{
8633 struct task_group *tg; 8390 struct task_group *tg;
8634 unsigned long flags; 8391 unsigned long flags;
8635 int i;
8636 8392
8637 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8393 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8638 if (!tg) 8394 if (!tg)
@@ -8645,10 +8401,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8645 goto err; 8401 goto err;
8646 8402
8647 spin_lock_irqsave(&task_group_lock, flags); 8403 spin_lock_irqsave(&task_group_lock, flags);
8648 for_each_possible_cpu(i) {
8649 register_fair_sched_group(tg, i);
8650 register_rt_sched_group(tg, i);
8651 }
8652 list_add_rcu(&tg->list, &task_groups); 8404 list_add_rcu(&tg->list, &task_groups);
8653 8405
8654 WARN_ON(!parent); /* root should already exist */ 8406 WARN_ON(!parent); /* root should already exist */
@@ -8678,11 +8430,11 @@ void sched_destroy_group(struct task_group *tg)
8678 unsigned long flags; 8430 unsigned long flags;
8679 int i; 8431 int i;
8680 8432
8681 spin_lock_irqsave(&task_group_lock, flags); 8433 /* end participation in shares distribution */
8682 for_each_possible_cpu(i) { 8434 for_each_possible_cpu(i)
8683 unregister_fair_sched_group(tg, i); 8435 unregister_fair_sched_group(tg, i);
8684 unregister_rt_sched_group(tg, i); 8436
8685 } 8437 spin_lock_irqsave(&task_group_lock, flags);
8686 list_del_rcu(&tg->list); 8438 list_del_rcu(&tg->list);
8687 list_del_rcu(&tg->siblings); 8439 list_del_rcu(&tg->siblings);
8688 spin_unlock_irqrestore(&task_group_lock, flags); 8440 spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8729,33 +8481,6 @@ void sched_move_task(struct task_struct *tsk)
8729#endif /* CONFIG_CGROUP_SCHED */ 8481#endif /* CONFIG_CGROUP_SCHED */
8730 8482
8731#ifdef CONFIG_FAIR_GROUP_SCHED 8483#ifdef CONFIG_FAIR_GROUP_SCHED
8732static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8733{
8734 struct cfs_rq *cfs_rq = se->cfs_rq;
8735 int on_rq;
8736
8737 on_rq = se->on_rq;
8738 if (on_rq)
8739 dequeue_entity(cfs_rq, se, 0);
8740
8741 se->load.weight = shares;
8742 se->load.inv_weight = 0;
8743
8744 if (on_rq)
8745 enqueue_entity(cfs_rq, se, 0);
8746}
8747
8748static void set_se_shares(struct sched_entity *se, unsigned long shares)
8749{
8750 struct cfs_rq *cfs_rq = se->cfs_rq;
8751 struct rq *rq = cfs_rq->rq;
8752 unsigned long flags;
8753
8754 raw_spin_lock_irqsave(&rq->lock, flags);
8755 __set_se_shares(se, shares);
8756 raw_spin_unlock_irqrestore(&rq->lock, flags);
8757}
8758
8759static DEFINE_MUTEX(shares_mutex); 8484static DEFINE_MUTEX(shares_mutex);
8760 8485
8761int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8486int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@ -8778,37 +8503,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8778 if (tg->shares == shares) 8503 if (tg->shares == shares)
8779 goto done; 8504 goto done;
8780 8505
8781 spin_lock_irqsave(&task_group_lock, flags);
8782 for_each_possible_cpu(i)
8783 unregister_fair_sched_group(tg, i);
8784 list_del_rcu(&tg->siblings);
8785 spin_unlock_irqrestore(&task_group_lock, flags);
8786
8787 /* wait for any ongoing reference to this group to finish */
8788 synchronize_sched();
8789
8790 /*
8791 * Now we are free to modify the group's share on each cpu
8792 * w/o tripping rebalance_share or load_balance_fair.
8793 */
8794 tg->shares = shares; 8506 tg->shares = shares;
8795 for_each_possible_cpu(i) { 8507 for_each_possible_cpu(i) {
8796 /* 8508 struct rq *rq = cpu_rq(i);
8797 * force a rebalance 8509 struct sched_entity *se;
8798 */ 8510
8799 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8511 se = tg->se[i];
8800 set_se_shares(tg->se[i], shares); 8512 /* Propagate contribution to hierarchy */
8513 raw_spin_lock_irqsave(&rq->lock, flags);
8514 for_each_sched_entity(se)
8515 update_cfs_shares(group_cfs_rq(se), 0);
8516 raw_spin_unlock_irqrestore(&rq->lock, flags);
8801 } 8517 }
8802 8518
8803 /*
8804 * Enable load balance activity on this group, by inserting it back on
8805 * each cpu's rq->leaf_cfs_rq_list.
8806 */
8807 spin_lock_irqsave(&task_group_lock, flags);
8808 for_each_possible_cpu(i)
8809 register_fair_sched_group(tg, i);
8810 list_add_rcu(&tg->siblings, &tg->parent->children);
8811 spin_unlock_irqrestore(&task_group_lock, flags);
8812done: 8519done:
8813 mutex_unlock(&shares_mutex); 8520 mutex_unlock(&shares_mutex);
8814 return 0; 8521 return 0;
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
new file mode 100644
index 000000000000..c80fedcd476b
--- /dev/null
+++ b/kernel/sched_autogroup.c
@@ -0,0 +1,238 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3#include <linux/proc_fs.h>
4#include <linux/seq_file.h>
5#include <linux/kallsyms.h>
6#include <linux/utsname.h>
7
8unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
9static struct autogroup autogroup_default;
10static atomic_t autogroup_seq_nr;
11
12static void autogroup_init(struct task_struct *init_task)
13{
14 autogroup_default.tg = &init_task_group;
15 init_task_group.autogroup = &autogroup_default;
16 kref_init(&autogroup_default.kref);
17 init_rwsem(&autogroup_default.lock);
18 init_task->signal->autogroup = &autogroup_default;
19}
20
21static inline void autogroup_free(struct task_group *tg)
22{
23 kfree(tg->autogroup);
24}
25
26static inline void autogroup_destroy(struct kref *kref)
27{
28 struct autogroup *ag = container_of(kref, struct autogroup, kref);
29
30 sched_destroy_group(ag->tg);
31}
32
33static inline void autogroup_kref_put(struct autogroup *ag)
34{
35 kref_put(&ag->kref, autogroup_destroy);
36}
37
38static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
39{
40 kref_get(&ag->kref);
41 return ag;
42}
43
44static inline struct autogroup *autogroup_task_get(struct task_struct *p)
45{
46 struct autogroup *ag;
47 unsigned long flags;
48
49 if (!lock_task_sighand(p, &flags))
50 return autogroup_kref_get(&autogroup_default);
51
52 ag = autogroup_kref_get(p->signal->autogroup);
53 unlock_task_sighand(p, &flags);
54
55 return ag;
56}
57
58static inline struct autogroup *autogroup_create(void)
59{
60 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
61 struct task_group *tg;
62
63 if (!ag)
64 goto out_fail;
65
66 tg = sched_create_group(&init_task_group);
67
68 if (IS_ERR(tg))
69 goto out_free;
70
71 kref_init(&ag->kref);
72 init_rwsem(&ag->lock);
73 ag->id = atomic_inc_return(&autogroup_seq_nr);
74 ag->tg = tg;
75 tg->autogroup = ag;
76
77 return ag;
78
79out_free:
80 kfree(ag);
81out_fail:
82 if (printk_ratelimit()) {
83 printk(KERN_WARNING "autogroup_create: %s failure.\n",
84 ag ? "sched_create_group()" : "kmalloc()");
85 }
86
87 return autogroup_kref_get(&autogroup_default);
88}
89
90static inline bool
91task_wants_autogroup(struct task_struct *p, struct task_group *tg)
92{
93 if (tg != &root_task_group)
94 return false;
95
96 if (p->sched_class != &fair_sched_class)
97 return false;
98
99 /*
100 * We can only assume the task group can't go away on us if
101 * autogroup_move_group() can see us on ->thread_group list.
102 */
103 if (p->flags & PF_EXITING)
104 return false;
105
106 return true;
107}
108
109static inline struct task_group *
110autogroup_task_group(struct task_struct *p, struct task_group *tg)
111{
112 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
113
114 if (enabled && task_wants_autogroup(p, tg))
115 return p->signal->autogroup->tg;
116
117 return tg;
118}
119
120static void
121autogroup_move_group(struct task_struct *p, struct autogroup *ag)
122{
123 struct autogroup *prev;
124 struct task_struct *t;
125 unsigned long flags;
126
127 BUG_ON(!lock_task_sighand(p, &flags));
128
129 prev = p->signal->autogroup;
130 if (prev == ag) {
131 unlock_task_sighand(p, &flags);
132 return;
133 }
134
135 p->signal->autogroup = autogroup_kref_get(ag);
136
137 t = p;
138 do {
139 sched_move_task(t);
140 } while_each_thread(p, t);
141
142 unlock_task_sighand(p, &flags);
143 autogroup_kref_put(prev);
144}
145
/*
 * Create a new autogroup and move the thread group of @p into it.
 *
 * Allocates GFP_KERNEL, cannot be called under any spinlock.
 */
void sched_autogroup_create_attach(struct task_struct *p)
{
	struct autogroup *fresh = autogroup_create();

	autogroup_move_group(p, fresh);

	/* drop extra reference added by autogroup_create() */
	autogroup_kref_put(fresh);
}
155EXPORT_SYMBOL(sched_autogroup_create_attach);
156
157/* Cannot be called under siglock. Currently has no users */
158void sched_autogroup_detach(struct task_struct *p)
159{
160 autogroup_move_group(p, &autogroup_default);
161}
162EXPORT_SYMBOL(sched_autogroup_detach);
163
164void sched_autogroup_fork(struct signal_struct *sig)
165{
166 sig->autogroup = autogroup_task_get(current);
167}
168
169void sched_autogroup_exit(struct signal_struct *sig)
170{
171 autogroup_kref_put(sig->autogroup);
172}
173
174static int __init setup_autogroup(char *str)
175{
176 sysctl_sched_autogroup_enabled = 0;
177
178 return 1;
179}
180
181__setup("noautogroup", setup_autogroup);
182
183#ifdef CONFIG_PROC_FS
184
185int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
186{
187 static unsigned long next = INITIAL_JIFFIES;
188 struct autogroup *ag;
189 int err;
190
191 if (*nice < -20 || *nice > 19)
192 return -EINVAL;
193
194 err = security_task_setnice(current, *nice);
195 if (err)
196 return err;
197
198 if (*nice < 0 && !can_nice(current, *nice))
199 return -EPERM;
200
201 /* this is a heavy operation taking global locks.. */
202 if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
203 return -EAGAIN;
204
205 next = HZ / 10 + jiffies;
206 ag = autogroup_task_get(p);
207
208 down_write(&ag->lock);
209 err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]);
210 if (!err)
211 ag->nice = *nice;
212 up_write(&ag->lock);
213
214 autogroup_kref_put(ag);
215
216 return err;
217}
218
219void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
220{
221 struct autogroup *ag = autogroup_task_get(p);
222
223 down_read(&ag->lock);
224 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
225 up_read(&ag->lock);
226
227 autogroup_kref_put(ag);
228}
229#endif /* CONFIG_PROC_FS */
230
231#ifdef CONFIG_SCHED_DEBUG
232static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
233{
234 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
235}
236#endif /* CONFIG_SCHED_DEBUG */
237
238#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
new file mode 100644
index 000000000000..5358e241cb20
--- /dev/null
+++ b/kernel/sched_autogroup.h
@@ -0,0 +1,32 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3struct autogroup {
4 struct kref kref;
5 struct task_group *tg;
6 struct rw_semaphore lock;
7 unsigned long id;
8 int nice;
9};
10
11static inline struct task_group *
12autogroup_task_group(struct task_struct *p, struct task_group *tg);
13
14#else /* !CONFIG_SCHED_AUTOGROUP */
15
/* !CONFIG_SCHED_AUTOGROUP: nothing to initialise. */
static inline void autogroup_init(struct task_struct *init_task)
{
}
/* !CONFIG_SCHED_AUTOGROUP: nothing to free. */
static inline void autogroup_free(struct task_group *tg)
{
}
18
/* !CONFIG_SCHED_AUTOGROUP: no redirection, @tg is returned as is. */
static inline struct task_group *
autogroup_task_group(struct task_struct *p, struct task_group *tg)
{
	struct task_group *unchanged = tg;

	return unchanged;
}
24
25#ifdef CONFIG_SCHED_DEBUG
/*
 * !CONFIG_SCHED_AUTOGROUP: there is no autogroup name to format.
 * @buf is left untouched and 0 is returned.
 */
static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
{
	return 0;
}
30#endif
31
32#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 52f1a149bfb1..9d8af0b3fb64 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -79,7 +79,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
79} 79}
80EXPORT_SYMBOL_GPL(sched_clock); 80EXPORT_SYMBOL_GPL(sched_clock);
81 81
82static __read_mostly int sched_clock_running; 82__read_mostly int sched_clock_running;
83 83
84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
85__read_mostly int sched_clock_stable; 85__read_mostly int sched_clock_stable;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 2e1b0d17dd9b..1dfae3d014b5 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -54,8 +54,7 @@ static unsigned long nsec_low(unsigned long long nsec)
54#define SPLIT_NS(x) nsec_high(x), nsec_low(x) 54#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
55 55
56#ifdef CONFIG_FAIR_GROUP_SCHED 56#ifdef CONFIG_FAIR_GROUP_SCHED
57static void print_cfs_group_stats(struct seq_file *m, int cpu, 57static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
58 struct task_group *tg)
59{ 58{
60 struct sched_entity *se = tg->se[cpu]; 59 struct sched_entity *se = tg->se[cpu];
61 if (!se) 60 if (!se)
@@ -110,16 +109,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 109 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
111#endif 110#endif
112 111
113#ifdef CONFIG_CGROUP_SCHED
114 {
115 char path[64];
116
117 rcu_read_lock();
118 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
119 rcu_read_unlock();
120 SEQ_printf(m, " %s", path);
121 }
122#endif
123 SEQ_printf(m, "\n"); 112 SEQ_printf(m, "\n");
124} 113}
125 114
@@ -147,19 +136,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
147 read_unlock_irqrestore(&tasklist_lock, flags); 136 read_unlock_irqrestore(&tasklist_lock, flags);
148} 137}
149 138
150#if defined(CONFIG_CGROUP_SCHED) && \
151 (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
152static void task_group_path(struct task_group *tg, char *buf, int buflen)
153{
154 /* may be NULL if the underlying cgroup isn't fully-created yet */
155 if (!tg->css.cgroup) {
156 buf[0] = '\0';
157 return;
158 }
159 cgroup_path(tg->css.cgroup, buf, buflen);
160}
161#endif
162
163void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 139void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
164{ 140{
165 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, 141 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
@@ -168,16 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
168 struct sched_entity *last; 144 struct sched_entity *last;
169 unsigned long flags; 145 unsigned long flags;
170 146
171#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
172 char path[128];
173 struct task_group *tg = cfs_rq->tg;
174
175 task_group_path(tg, path, sizeof(path));
176
177 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
178#else
179 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 147 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
180#endif
181 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 148 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
182 SPLIT_NS(cfs_rq->exec_clock)); 149 SPLIT_NS(cfs_rq->exec_clock));
183 150
@@ -202,32 +169,29 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
202 spread0 = min_vruntime - rq0_min_vruntime; 169 spread0 = min_vruntime - rq0_min_vruntime;
203 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", 170 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
204 SPLIT_NS(spread0)); 171 SPLIT_NS(spread0));
205 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207
208 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", 172 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
209 cfs_rq->nr_spread_over); 173 cfs_rq->nr_spread_over);
174 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
175 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
210#ifdef CONFIG_FAIR_GROUP_SCHED 176#ifdef CONFIG_FAIR_GROUP_SCHED
211#ifdef CONFIG_SMP 177#ifdef CONFIG_SMP
212 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); 178 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
179 SPLIT_NS(cfs_rq->load_avg));
180 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
181 SPLIT_NS(cfs_rq->load_period));
182 SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
183 cfs_rq->load_contribution);
184 SEQ_printf(m, " .%-30s: %d\n", "load_tg",
185 atomic_read(&cfs_rq->tg->load_weight));
213#endif 186#endif
187
214 print_cfs_group_stats(m, cpu, cfs_rq->tg); 188 print_cfs_group_stats(m, cpu, cfs_rq->tg);
215#endif 189#endif
216} 190}
217 191
218void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) 192void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
219{ 193{
220#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
221 char path[128];
222 struct task_group *tg = rt_rq->tg;
223
224 task_group_path(tg, path, sizeof(path));
225
226 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
227#else
228 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); 194 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
229#endif
230
231 195
232#define P(x) \ 196#define P(x) \
233 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) 197 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
@@ -243,6 +207,8 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
243#undef P 207#undef P
244} 208}
245 209
210extern __read_mostly int sched_clock_running;
211
246static void print_cpu(struct seq_file *m, int cpu) 212static void print_cpu(struct seq_file *m, int cpu)
247{ 213{
248 struct rq *rq = cpu_rq(cpu); 214 struct rq *rq = cpu_rq(cpu);
@@ -314,21 +280,42 @@ static const char *sched_tunable_scaling_names[] = {
314 280
315static int sched_debug_show(struct seq_file *m, void *v) 281static int sched_debug_show(struct seq_file *m, void *v)
316{ 282{
317 u64 now = ktime_to_ns(ktime_get()); 283 u64 ktime, sched_clk, cpu_clk;
284 unsigned long flags;
318 int cpu; 285 int cpu;
319 286
320 SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", 287 local_irq_save(flags);
288 ktime = ktime_to_ns(ktime_get());
289 sched_clk = sched_clock();
290 cpu_clk = local_clock();
291 local_irq_restore(flags);
292
293 SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
321 init_utsname()->release, 294 init_utsname()->release,
322 (int)strcspn(init_utsname()->version, " "), 295 (int)strcspn(init_utsname()->version, " "),
323 init_utsname()->version); 296 init_utsname()->version);
324 297
325 SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); 298#define P(x) \
299 SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
300#define PN(x) \
301 SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
302 PN(ktime);
303 PN(sched_clk);
304 PN(cpu_clk);
305 P(jiffies);
306#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
307 P(sched_clock_stable);
308#endif
309#undef PN
310#undef P
311
312 SEQ_printf(m, "\n");
313 SEQ_printf(m, "sysctl_sched\n");
326 314
327#define P(x) \ 315#define P(x) \
328 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) 316 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
329#define PN(x) \ 317#define PN(x) \
330 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) 318 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
331 P(jiffies);
332 PN(sysctl_sched_latency); 319 PN(sysctl_sched_latency);
333 PN(sysctl_sched_min_granularity); 320 PN(sysctl_sched_min_granularity);
334 PN(sysctl_sched_wakeup_granularity); 321 PN(sysctl_sched_wakeup_granularity);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 00ebd7686676..c62ebae65cf0 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
89 89
90const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 90const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
91 91
92/*
93 * The exponential sliding window over which load is averaged for shares
94 * distribution.
95 * (default: 10msec)
96 */
97unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
98
92static const struct sched_class fair_sched_class; 99static const struct sched_class fair_sched_class;
93 100
94/************************************************************** 101/**************************************************************
@@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
143 return cfs_rq->tg->cfs_rq[this_cpu]; 150 return cfs_rq->tg->cfs_rq[this_cpu];
144} 151}
145 152
153static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
154{
155 if (!cfs_rq->on_list) {
156 /*
157 * Ensure we either appear before our parent (if already
158 * enqueued) or force our parent to appear after us when it is
159 * enqueued. The fact that we always enqueue bottom-up
160 * reduces this to two cases.
161 */
162 if (cfs_rq->tg->parent &&
163 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
164 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
165 &rq_of(cfs_rq)->leaf_cfs_rq_list);
166 } else {
167 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
168 &rq_of(cfs_rq)->leaf_cfs_rq_list);
169 }
170
171 cfs_rq->on_list = 1;
172 }
173}
174
175static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
176{
177 if (cfs_rq->on_list) {
178 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
179 cfs_rq->on_list = 0;
180 }
181}
182
146/* Iterate thr' all leaf cfs_rq's on a runqueue */ 183/* Iterate thr' all leaf cfs_rq's on a runqueue */
147#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 184#define for_each_leaf_cfs_rq(rq, cfs_rq) \
148 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 185 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
@@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
246 return &cpu_rq(this_cpu)->cfs; 283 return &cpu_rq(this_cpu)->cfs;
247} 284}
248 285
/*
 * Empty variant of list_add_leaf_cfs_rq().
 * NOTE(review): presumably the !CONFIG_FAIR_GROUP_SCHED branch; the
 * enclosing #if/#else is outside this hunk, confirm against the file.
 */
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
289
/*
 * Empty variant of list_del_leaf_cfs_rq().
 * NOTE(review): presumably the !CONFIG_FAIR_GROUP_SCHED branch; the
 * enclosing #if/#else is outside this hunk, confirm against the file.
 */
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
293
249#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 294#define for_each_leaf_cfs_rq(rq, cfs_rq) \
250 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 295 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
251 296
@@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
417 WRT_SYSCTL(sched_min_granularity); 462 WRT_SYSCTL(sched_min_granularity);
418 WRT_SYSCTL(sched_latency); 463 WRT_SYSCTL(sched_latency);
419 WRT_SYSCTL(sched_wakeup_granularity); 464 WRT_SYSCTL(sched_wakeup_granularity);
420 WRT_SYSCTL(sched_shares_ratelimit);
421#undef WRT_SYSCTL 465#undef WRT_SYSCTL
422 466
423 return 0; 467 return 0;
@@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
495 return calc_delta_fair(sched_slice(cfs_rq, se), se); 539 return calc_delta_fair(sched_slice(cfs_rq, se), se);
496} 540}
497 541
542static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
543static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
544
498/* 545/*
499 * Update the current task's runtime statistics. Skip current tasks that 546 * Update the current task's runtime statistics. Skip current tasks that
500 * are not in our scheduling class. 547 * are not in our scheduling class.
@@ -514,6 +561,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
514 561
515 curr->vruntime += delta_exec_weighted; 562 curr->vruntime += delta_exec_weighted;
516 update_min_vruntime(cfs_rq); 563 update_min_vruntime(cfs_rq);
564
565#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
566 cfs_rq->load_unacc_exec_time += delta_exec;
567#endif
517} 568}
518 569
519static void update_curr(struct cfs_rq *cfs_rq) 570static void update_curr(struct cfs_rq *cfs_rq)
@@ -633,7 +684,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
633 list_add(&se->group_node, &cfs_rq->tasks); 684 list_add(&se->group_node, &cfs_rq->tasks);
634 } 685 }
635 cfs_rq->nr_running++; 686 cfs_rq->nr_running++;
636 se->on_rq = 1;
637} 687}
638 688
639static void 689static void
@@ -647,9 +697,140 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
647 list_del_init(&se->group_node); 697 list_del_init(&se->group_node);
648 } 698 }
649 cfs_rq->nr_running--; 699 cfs_rq->nr_running--;
650 se->on_rq = 0;
651} 700}
652 701
702#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
703static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
704 int global_update)
705{
706 struct task_group *tg = cfs_rq->tg;
707 long load_avg;
708
709 load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
710 load_avg -= cfs_rq->load_contribution;
711
712 if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
713 atomic_add(load_avg, &tg->load_weight);
714 cfs_rq->load_contribution += load_avg;
715 }
716}
717
718static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
719{
720 u64 period = sysctl_sched_shares_window;
721 u64 now, delta;
722 unsigned long load = cfs_rq->load.weight;
723
724 if (!cfs_rq)
725 return;
726
727 now = rq_of(cfs_rq)->clock;
728 delta = now - cfs_rq->load_stamp;
729
730 /* truncate load history at 4 idle periods */
731 if (cfs_rq->load_stamp > cfs_rq->load_last &&
732 now - cfs_rq->load_last > 4 * period) {
733 cfs_rq->load_period = 0;
734 cfs_rq->load_avg = 0;
735 }
736
737 cfs_rq->load_stamp = now;
738 cfs_rq->load_unacc_exec_time = 0;
739 cfs_rq->load_period += delta;
740 if (load) {
741 cfs_rq->load_last = now;
742 cfs_rq->load_avg += delta * load;
743 }
744
745 /* consider updating load contribution on each fold or truncate */
746 if (global_update || cfs_rq->load_period > period
747 || !cfs_rq->load_period)
748 update_cfs_rq_load_contribution(cfs_rq, global_update);
749
750 while (cfs_rq->load_period > period) {
751 /*
752 * Inline assembly required to prevent the compiler
753 * optimising this loop into a divmod call.
754 * See __iter_div_u64_rem() for another example of this.
755 */
756 asm("" : "+rm" (cfs_rq->load_period));
757 cfs_rq->load_period /= 2;
758 cfs_rq->load_avg /= 2;
759 }
760
761 if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
762 list_del_leaf_cfs_rq(cfs_rq);
763}
764
765static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
766 unsigned long weight)
767{
768 if (se->on_rq) {
769 /* commit outstanding execution time */
770 if (cfs_rq->curr == se)
771 update_curr(cfs_rq);
772 account_entity_dequeue(cfs_rq, se);
773 }
774
775 update_load_set(&se->load, weight);
776
777 if (se->on_rq)
778 account_entity_enqueue(cfs_rq, se);
779}
780
781static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
782{
783 struct task_group *tg;
784 struct sched_entity *se;
785 long load_weight, load, shares;
786
787 if (!cfs_rq)
788 return;
789
790 tg = cfs_rq->tg;
791 se = tg->se[cpu_of(rq_of(cfs_rq))];
792 if (!se)
793 return;
794
795 load = cfs_rq->load.weight + weight_delta;
796
797 load_weight = atomic_read(&tg->load_weight);
798 load_weight -= cfs_rq->load_contribution;
799 load_weight += load;
800
801 shares = (tg->shares * load);
802 if (load_weight)
803 shares /= load_weight;
804
805 if (shares < MIN_SHARES)
806 shares = MIN_SHARES;
807 if (shares > tg->shares)
808 shares = tg->shares;
809
810 reweight_entity(cfs_rq_of(se), se, shares);
811}
812
813static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
814{
815 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
816 update_cfs_load(cfs_rq, 0);
817 update_cfs_shares(cfs_rq, 0);
818 }
819}
820#else /* CONFIG_FAIR_GROUP_SCHED */
/* No-op variant used when the SMP group-scheduling code is compiled out. */
static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
{
}
824
/* No-op variant used when the SMP group-scheduling code is compiled out. */
static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
{
}
828
/* No-op variant used when the SMP group-scheduling code is compiled out. */
static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
{
}
832#endif /* CONFIG_FAIR_GROUP_SCHED */
833
653static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 834static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
654{ 835{
655#ifdef CONFIG_SCHEDSTATS 836#ifdef CONFIG_SCHEDSTATS
@@ -771,6 +952,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
771 * Update run-time statistics of the 'current'. 952 * Update run-time statistics of the 'current'.
772 */ 953 */
773 update_curr(cfs_rq); 954 update_curr(cfs_rq);
955 update_cfs_load(cfs_rq, 0);
956 update_cfs_shares(cfs_rq, se->load.weight);
774 account_entity_enqueue(cfs_rq, se); 957 account_entity_enqueue(cfs_rq, se);
775 958
776 if (flags & ENQUEUE_WAKEUP) { 959 if (flags & ENQUEUE_WAKEUP) {
@@ -782,6 +965,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
782 check_spread(cfs_rq, se); 965 check_spread(cfs_rq, se);
783 if (se != cfs_rq->curr) 966 if (se != cfs_rq->curr)
784 __enqueue_entity(cfs_rq, se); 967 __enqueue_entity(cfs_rq, se);
968 se->on_rq = 1;
969
970 if (cfs_rq->nr_running == 1)
971 list_add_leaf_cfs_rq(cfs_rq);
785} 972}
786 973
787static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 974static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +1012,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
825 1012
826 if (se != cfs_rq->curr) 1013 if (se != cfs_rq->curr)
827 __dequeue_entity(cfs_rq, se); 1014 __dequeue_entity(cfs_rq, se);
1015 se->on_rq = 0;
1016 update_cfs_load(cfs_rq, 0);
828 account_entity_dequeue(cfs_rq, se); 1017 account_entity_dequeue(cfs_rq, se);
829 update_min_vruntime(cfs_rq); 1018 update_min_vruntime(cfs_rq);
1019 update_cfs_shares(cfs_rq, 0);
830 1020
831 /* 1021 /*
832 * Normalize the entity after updating the min_vruntime because the 1022 * Normalize the entity after updating the min_vruntime because the
@@ -955,6 +1145,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
955 */ 1145 */
956 update_curr(cfs_rq); 1146 update_curr(cfs_rq);
957 1147
1148 /*
1149 * Update share accounting for long-running entities.
1150 */
1151 update_entity_shares_tick(cfs_rq);
1152
958#ifdef CONFIG_SCHED_HRTICK 1153#ifdef CONFIG_SCHED_HRTICK
959 /* 1154 /*
960 * queued ticks are scheduled to match the slice, so don't bother 1155 * queued ticks are scheduled to match the slice, so don't bother
@@ -1055,6 +1250,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1055 flags = ENQUEUE_WAKEUP; 1250 flags = ENQUEUE_WAKEUP;
1056 } 1251 }
1057 1252
1253 for_each_sched_entity(se) {
1254 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1255
1256 update_cfs_load(cfs_rq, 0);
1257 update_cfs_shares(cfs_rq, 0);
1258 }
1259
1058 hrtick_update(rq); 1260 hrtick_update(rq);
1059} 1261}
1060 1262
@@ -1071,12 +1273,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1071 for_each_sched_entity(se) { 1273 for_each_sched_entity(se) {
1072 cfs_rq = cfs_rq_of(se); 1274 cfs_rq = cfs_rq_of(se);
1073 dequeue_entity(cfs_rq, se, flags); 1275 dequeue_entity(cfs_rq, se, flags);
1276
1074 /* Don't dequeue parent if it has other entities besides us */ 1277 /* Don't dequeue parent if it has other entities besides us */
1075 if (cfs_rq->load.weight) 1278 if (cfs_rq->load.weight)
1076 break; 1279 break;
1077 flags |= DEQUEUE_SLEEP; 1280 flags |= DEQUEUE_SLEEP;
1078 } 1281 }
1079 1282
1283 for_each_sched_entity(se) {
1284 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1285
1286 update_cfs_load(cfs_rq, 0);
1287 update_cfs_shares(cfs_rq, 0);
1288 }
1289
1080 hrtick_update(rq); 1290 hrtick_update(rq);
1081} 1291}
1082 1292
@@ -1143,51 +1353,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
1143 * Adding load to a group doesn't make a group heavier, but can cause movement 1353 * Adding load to a group doesn't make a group heavier, but can cause movement
1144 * of group shares between cpus. Assuming the shares were perfectly aligned one 1354 * of group shares between cpus. Assuming the shares were perfectly aligned one
1145 * can calculate the shift in shares. 1355 * can calculate the shift in shares.
1146 *
1147 * The problem is that perfectly aligning the shares is rather expensive, hence
1148 * we try to avoid doing that too often - see update_shares(), which ratelimits
1149 * this change.
1150 *
1151 * We compensate this by not only taking the current delta into account, but
1152 * also considering the delta between when the shares were last adjusted and
1153 * now.
1154 *
1155 * We still saw a performance dip, some tracing learned us that between
1156 * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
1157 * significantly. Therefore try to bias the error in direction of failing
1158 * the affine wakeup.
1159 *
1160 */ 1356 */
1161static long effective_load(struct task_group *tg, int cpu, 1357static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1162 long wl, long wg)
1163{ 1358{
1164 struct sched_entity *se = tg->se[cpu]; 1359 struct sched_entity *se = tg->se[cpu];
1165 1360
1166 if (!tg->parent) 1361 if (!tg->parent)
1167 return wl; 1362 return wl;
1168 1363
1169 /*
1170 * By not taking the decrease of shares on the other cpu into
1171 * account our error leans towards reducing the affine wakeups.
1172 */
1173 if (!wl && sched_feat(ASYM_EFF_LOAD))
1174 return wl;
1175
1176 for_each_sched_entity(se) { 1364 for_each_sched_entity(se) {
1177 long S, rw, s, a, b; 1365 long S, rw, s, a, b;
1178 long more_w;
1179
1180 /*
1181 * Instead of using this increment, also add the difference
1182 * between when the shares were last updated and now.
1183 */
1184 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1185 wl += more_w;
1186 wg += more_w;
1187 1366
1188 S = se->my_q->tg->shares; 1367 S = se->my_q->tg->shares;
1189 s = se->my_q->shares; 1368 s = se->load.weight;
1190 rw = se->my_q->rq_weight; 1369 rw = se->my_q->load.weight;
1191 1370
1192 a = S*(rw + wl); 1371 a = S*(rw + wl);
1193 b = S*rw + s*wg; 1372 b = S*rw + s*wg;
@@ -1508,23 +1687,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1508 sd = tmp; 1687 sd = tmp;
1509 } 1688 }
1510 1689
1511#ifdef CONFIG_FAIR_GROUP_SCHED
1512 if (sched_feat(LB_SHARES_UPDATE)) {
1513 /*
1514 * Pick the largest domain to update shares over
1515 */
1516 tmp = sd;
1517 if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
1518 tmp = affine_sd;
1519
1520 if (tmp) {
1521 raw_spin_unlock(&rq->lock);
1522 update_shares(tmp);
1523 raw_spin_lock(&rq->lock);
1524 }
1525 }
1526#endif
1527
1528 if (affine_sd) { 1690 if (affine_sd) {
1529 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) 1691 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1530 return select_idle_sibling(p, cpu); 1692 return select_idle_sibling(p, cpu);
@@ -1909,6 +2071,48 @@ out:
1909} 2071}
1910 2072
1911#ifdef CONFIG_FAIR_GROUP_SCHED 2073#ifdef CONFIG_FAIR_GROUP_SCHED
2074/*
2075 * update tg->load_weight by folding this cpu's load_avg
2076 */
2077static int update_shares_cpu(struct task_group *tg, int cpu)
2078{
2079 struct cfs_rq *cfs_rq;
2080 unsigned long flags;
2081 struct rq *rq;
2082
2083 if (!tg->se[cpu])
2084 return 0;
2085
2086 rq = cpu_rq(cpu);
2087 cfs_rq = tg->cfs_rq[cpu];
2088
2089 raw_spin_lock_irqsave(&rq->lock, flags);
2090
2091 update_rq_clock(rq);
2092 update_cfs_load(cfs_rq, 1);
2093
2094 /*
2095 * We need to update shares after updating tg->load_weight in
2096 * order to adjust the weight of groups with long running tasks.
2097 */
2098 update_cfs_shares(cfs_rq, 0);
2099
2100 raw_spin_unlock_irqrestore(&rq->lock, flags);
2101
2102 return 0;
2103}
2104
2105static void update_shares(int cpu)
2106{
2107 struct cfs_rq *cfs_rq;
2108 struct rq *rq = cpu_rq(cpu);
2109
2110 rcu_read_lock();
2111 for_each_leaf_cfs_rq(rq, cfs_rq)
2112 update_shares_cpu(cfs_rq->tg, cpu);
2113 rcu_read_unlock();
2114}
2115
1912static unsigned long 2116static unsigned long
1913load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2117load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1914 unsigned long max_load_move, 2118 unsigned long max_load_move,
@@ -1956,6 +2160,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1956 return max_load_move - rem_load_move; 2160 return max_load_move - rem_load_move;
1957} 2161}
1958#else 2162#else
/* !CONFIG_FAIR_GROUP_SCHED: there are no group shares to update. */
static inline void update_shares(int cpu)
{
}
2166
1959static unsigned long 2167static unsigned long
1960load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2168load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1961 unsigned long max_load_move, 2169 unsigned long max_load_move,
@@ -3032,7 +3240,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3032 schedstat_inc(sd, lb_count[idle]); 3240 schedstat_inc(sd, lb_count[idle]);
3033 3241
3034redo: 3242redo:
3035 update_shares(sd);
3036 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3243 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3037 cpus, balance); 3244 cpus, balance);
3038 3245
@@ -3174,8 +3381,6 @@ out_one_pinned:
3174 else 3381 else
3175 ld_moved = 0; 3382 ld_moved = 0;
3176out: 3383out:
3177 if (ld_moved)
3178 update_shares(sd);
3179 return ld_moved; 3384 return ld_moved;
3180} 3385}
3181 3386
@@ -3199,6 +3404,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3199 */ 3404 */
3200 raw_spin_unlock(&this_rq->lock); 3405 raw_spin_unlock(&this_rq->lock);
3201 3406
3407 update_shares(this_cpu);
3202 for_each_domain(this_cpu, sd) { 3408 for_each_domain(this_cpu, sd) {
3203 unsigned long interval; 3409 unsigned long interval;
3204 int balance = 1; 3410 int balance = 1;
@@ -3569,6 +3775,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3569 int update_next_balance = 0; 3775 int update_next_balance = 0;
3570 int need_serialize; 3776 int need_serialize;
3571 3777
3778 update_shares(cpu);
3779
3572 for_each_domain(cpu, sd) { 3780 for_each_domain(cpu, sd) {
3573 if (!(sd->flags & SD_LOAD_BALANCE)) 3781 if (!(sd->flags & SD_LOAD_BALANCE))
3574 continue; 3782 continue;
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 185f920ec1a2..68e69acc29b9 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0)
52SCHED_FEAT(HRTICK, 0) 52SCHED_FEAT(HRTICK, 0)
53SCHED_FEAT(DOUBLE_TICK, 0) 53SCHED_FEAT(DOUBLE_TICK, 0)
54SCHED_FEAT(LB_BIAS, 1) 54SCHED_FEAT(LB_BIAS, 1)
55SCHED_FEAT(LB_SHARES_UPDATE, 1)
56SCHED_FEAT(ASYM_EFF_LOAD, 1)
57 55
58/* 56/*
59 * Spin-wait on mutex acquisition when the mutex owner is running on 57 * Spin-wait on mutex acquisition when the mutex owner is running on
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index bea7d79f7e9c..c914ec747ca6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -183,6 +183,17 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); 183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
184} 184}
185 185
186static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
187{
188 list_add_rcu(&rt_rq->leaf_rt_rq_list,
189 &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
190}
191
192static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
193{
194 list_del_rcu(&rt_rq->leaf_rt_rq_list);
195}
196
186#define for_each_leaf_rt_rq(rt_rq, rq) \ 197#define for_each_leaf_rt_rq(rt_rq, rq) \
187 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) 198 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
188 199
@@ -276,6 +287,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
276 return ktime_to_ns(def_rt_bandwidth.rt_period); 287 return ktime_to_ns(def_rt_bandwidth.rt_period);
277} 288}
278 289
290static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
291{
292}
293
294static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
295{
296}
297
279#define for_each_leaf_rt_rq(rt_rq, rq) \ 298#define for_each_leaf_rt_rq(rt_rq, rq) \
280 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) 299 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
281 300
@@ -825,6 +844,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
825 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 844 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
826 return; 845 return;
827 846
847 if (!rt_rq->rt_nr_running)
848 list_add_leaf_rt_rq(rt_rq);
849
828 if (head) 850 if (head)
829 list_add(&rt_se->run_list, queue); 851 list_add(&rt_se->run_list, queue);
830 else 852 else
@@ -844,6 +866,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
844 __clear_bit(rt_se_prio(rt_se), array->bitmap); 866 __clear_bit(rt_se_prio(rt_se), array->bitmap);
845 867
846 dec_rt_tasks(rt_se, rt_rq); 868 dec_rt_tasks(rt_se, rt_rq);
869 if (!rt_rq->rt_nr_running)
870 list_del_leaf_rt_rq(rt_rq);
847} 871}
848 872
849/* 873/*
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 18f4be0d5fe0..d4d918a91881 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -853,7 +853,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
853 cpumask_any(cpu_online_mask)); 853 cpumask_any(cpu_online_mask));
854 case CPU_DEAD: 854 case CPU_DEAD:
855 case CPU_DEAD_FROZEN: { 855 case CPU_DEAD_FROZEN: {
856 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 856 static struct sched_param param = {
857 .sched_priority = MAX_RT_PRIO-1
858 };
857 859
858 p = per_cpu(ksoftirqd, hotcpu); 860 p = per_cpu(ksoftirqd, hotcpu);
859 per_cpu(ksoftirqd, hotcpu) = NULL; 861 per_cpu(ksoftirqd, hotcpu) = NULL;
diff --git a/kernel/sys.c b/kernel/sys.c
index 7f5a0cd296a9..2745dcdb6c6c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1080,8 +1080,10 @@ SYSCALL_DEFINE0(setsid)
1080 err = session; 1080 err = session;
1081out: 1081out:
1082 write_unlock_irq(&tasklist_lock); 1082 write_unlock_irq(&tasklist_lock);
1083 if (err > 0) 1083 if (err > 0) {
1084 proc_sid_connector(group_leader); 1084 proc_sid_connector(group_leader);
1085 sched_autogroup_create_attach(group_leader);
1086 }
1085 return err; 1087 return err;
1086} 1088}
1087 1089
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5abfa1518554..121e4fff03d1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -259,8 +259,6 @@ static int min_wakeup_granularity_ns; /* 0 usecs */
259static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 259static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
260static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; 260static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
261static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; 261static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
262static int min_sched_shares_ratelimit = 100000; /* 100 usec */
263static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
264#endif 262#endif
265 263
266#ifdef CONFIG_COMPACTION 264#ifdef CONFIG_COMPACTION
@@ -305,15 +303,6 @@ static struct ctl_table kern_table[] = {
305 .extra2 = &max_wakeup_granularity_ns, 303 .extra2 = &max_wakeup_granularity_ns,
306 }, 304 },
307 { 305 {
308 .procname = "sched_shares_ratelimit",
309 .data = &sysctl_sched_shares_ratelimit,
310 .maxlen = sizeof(unsigned int),
311 .mode = 0644,
312 .proc_handler = sched_proc_update_handler,
313 .extra1 = &min_sched_shares_ratelimit,
314 .extra2 = &max_sched_shares_ratelimit,
315 },
316 {
317 .procname = "sched_tunable_scaling", 306 .procname = "sched_tunable_scaling",
318 .data = &sysctl_sched_tunable_scaling, 307 .data = &sysctl_sched_tunable_scaling,
319 .maxlen = sizeof(enum sched_tunable_scaling), 308 .maxlen = sizeof(enum sched_tunable_scaling),
@@ -323,14 +312,6 @@ static struct ctl_table kern_table[] = {
323 .extra2 = &max_sched_tunable_scaling, 312 .extra2 = &max_sched_tunable_scaling,
324 }, 313 },
325 { 314 {
326 .procname = "sched_shares_thresh",
327 .data = &sysctl_sched_shares_thresh,
328 .maxlen = sizeof(unsigned int),
329 .mode = 0644,
330 .proc_handler = proc_dointvec_minmax,
331 .extra1 = &zero,
332 },
333 {
334 .procname = "sched_migration_cost", 315 .procname = "sched_migration_cost",
335 .data = &sysctl_sched_migration_cost, 316 .data = &sysctl_sched_migration_cost,
336 .maxlen = sizeof(unsigned int), 317 .maxlen = sizeof(unsigned int),
@@ -352,6 +333,13 @@ static struct ctl_table kern_table[] = {
352 .proc_handler = proc_dointvec, 333 .proc_handler = proc_dointvec,
353 }, 334 },
354 { 335 {
336 .procname = "sched_shares_window",
337 .data = &sysctl_sched_shares_window,
338 .maxlen = sizeof(unsigned int),
339 .mode = 0644,
340 .proc_handler = proc_dointvec,
341 },
342 {
355 .procname = "timer_migration", 343 .procname = "timer_migration",
356 .data = &sysctl_timer_migration, 344 .data = &sysctl_timer_migration,
357 .maxlen = sizeof(unsigned int), 345 .maxlen = sizeof(unsigned int),
@@ -382,6 +370,17 @@ static struct ctl_table kern_table[] = {
382 .mode = 0644, 370 .mode = 0644,
383 .proc_handler = proc_dointvec, 371 .proc_handler = proc_dointvec,
384 }, 372 },
373#ifdef CONFIG_SCHED_AUTOGROUP
374 {
375 .procname = "sched_autogroup_enabled",
376 .data = &sysctl_sched_autogroup_enabled,
377 .maxlen = sizeof(unsigned int),
378 .mode = 0644,
379 .proc_handler = proc_dointvec,
380 .extra1 = &zero,
381 .extra2 = &one,
382 },
383#endif
385#ifdef CONFIG_PROVE_LOCKING 384#ifdef CONFIG_PROVE_LOCKING
386 { 385 {
387 .procname = "prove_locking", 386 .procname = "prove_locking",
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 155a415b3209..562c56e048fd 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
558static int trace_wakeup_test_thread(void *data) 558static int trace_wakeup_test_thread(void *data)
559{ 559{
560 /* Make this a RT thread, doesn't need to be too high */ 560 /* Make this a RT thread, doesn't need to be too high */
561 struct sched_param param = { .sched_priority = 5 }; 561 static struct sched_param param = { .sched_priority = 5 };
562 struct completion *x = data; 562 struct completion *x = data;
563 563
564 sched_setscheduler(current, SCHED_FIFO, &param); 564 sched_setscheduler(current, SCHED_FIFO, &param);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 5b082156cd21..c812c4927cab 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -307,7 +307,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
307 */ 307 */
308static int watchdog(void *unused) 308static int watchdog(void *unused)
309{ 309{
310 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 310 static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
311 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 311 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
312 312
313 sched_setscheduler(current, SCHED_FIFO, &param); 313 sched_setscheduler(current, SCHED_FIFO, &param);