 Documentation/controllers/cpuacct.txt  |  32
 Documentation/scheduler/sched-arch.txt |   4
 arch/ia64/Kconfig                      |   2
 arch/m32r/Kconfig                      |   2
 arch/mips/Kconfig                      |   2
 arch/powerpc/Kconfig                   |   2
 arch/s390/kernel/topology.c            |   5
 arch/x86/Kconfig                       |   4
 fs/proc/base.c                         |   4
 include/asm-m32r/system.h              |   2
 include/linux/sched.h                  |   6
 include/linux/topology.h               |   2
 kernel/Makefile                        |   3
 kernel/delayacct.c                     |   2
 kernel/sched.c                         | 367
 kernel/sched_debug.c                   |  57
 kernel/sched_fair.c                    |   9
 kernel/sched_rt.c                      |   9
 kernel/sched_stats.h                   |   5
 kernel/user.c                          |   2
 20 files changed, 299 insertions(+), 222 deletions(-)
diff --git a/Documentation/controllers/cpuacct.txt b/Documentation/controllers/cpuacct.txt
new file mode 100644
index 000000000000..bb775fbe43d7
--- /dev/null
+++ b/Documentation/controllers/cpuacct.txt
@@ -0,0 +1,32 @@
+CPU Accounting Controller
+-------------------------
+
+The CPU accounting controller is used to group tasks using cgroups and
+account the CPU usage of these groups of tasks.
+
+The CPU accounting controller supports multi-hierarchy groups. An accounting
+group accumulates the CPU usage of all of its child groups and the tasks
+directly present in its group.
+
+Accounting groups can be created by first mounting the cgroup filesystem.
+
+# mkdir /cgroups
+# mount -t cgroup -ocpuacct none /cgroups
+
+With the above step, the initial or the parent accounting group
+becomes visible at /cgroups. At bootup, this group includes all the
+tasks in the system. /cgroups/tasks lists the tasks in this cgroup.
+/cgroups/cpuacct.usage gives the CPU time (in nanoseconds) obtained by
+this group which is essentially the CPU time obtained by all the tasks
+in the system.
+
+New accounting groups can be created under the parent group /cgroups.
+
+# cd /cgroups
+# mkdir g1
+# echo $$ > g1
+
+The above steps create a new group g1 and move the current shell
+process (bash) into it. CPU time consumed by this bash and its children
+can be obtained from g1/cpuacct.usage and the same is accumulated in
+/cgroups/cpuacct.usage also.
diff --git a/Documentation/scheduler/sched-arch.txt b/Documentation/scheduler/sched-arch.txt
index 941615a9769b..d43dbcbd163b 100644
--- a/Documentation/scheduler/sched-arch.txt
+++ b/Documentation/scheduler/sched-arch.txt
@@ -8,7 +8,7 @@ Context switch
 By default, the switch_to arch function is called with the runqueue
 locked. This is usually not a problem unless switch_to may need to
 take the runqueue lock. This is usually due to a wake up operation in
-the context switch. See include/asm-ia64/system.h for an example.
+the context switch. See arch/ia64/include/asm/system.h for an example.
 
 To request the scheduler call switch_to with the runqueue unlocked,
 you must `#define __ARCH_WANT_UNLOCKED_CTXSW` in a header file
@@ -23,7 +23,7 @@ disabled. Interrupts may be enabled over the call if it is likely to
 introduce a significant interrupt latency by adding the line
 `#define __ARCH_WANT_INTERRUPTS_ON_CTXSW` in the same place as for
 unlocked context switches. This define also implies
-`__ARCH_WANT_UNLOCKED_CTXSW`. See include/asm-arm/system.h for an
+`__ARCH_WANT_UNLOCKED_CTXSW`. See arch/arm/include/asm/system.h for an
 example.
 
 
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 6bd91ed7cd03..7fa8f615ba6e 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -99,7 +99,7 @@ config GENERIC_IOMAP
 	bool
 	default y
 
-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
 	bool
 	default y
 
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index dbaed4a63815..29047d5c259a 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -273,7 +273,7 @@ config GENERIC_CALIBRATE_DELAY
 	bool
 	default y
 
-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
 	bool
 	default y
 
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index f4af967a6b30..a5255e7c79e0 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -653,7 +653,7 @@ config GENERIC_CMOS_UPDATE
 	bool
 	default y
 
-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
 	bool
 	default y
 
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 525c13a4de93..adb23ea1c1ef 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -141,7 +141,7 @@ config GENERIC_NVRAM
 	bool
 	default y if PPC32
 
-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
 	bool
 	default y
 
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index a947899dcba1..bf96f1b5c6ec 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -212,7 +212,7 @@ static void update_cpu_core_map(void)
 		cpu_core_map[cpu] = cpu_coregroup_map(cpu);
 }
 
-void arch_update_cpu_topology(void)
+int arch_update_cpu_topology(void)
 {
 	struct tl_info *info = tl_info;
 	struct sys_device *sysdev;
@@ -221,7 +221,7 @@ void arch_update_cpu_topology(void)
 	if (!machine_has_topology) {
 		update_cpu_core_map();
 		topology_update_polarization_simple();
-		return;
+		return 0;
 	}
 	stsi(info, 15, 1, 2);
 	tl_to_cores(info);
@@ -230,6 +230,7 @@ void arch_update_cpu_topology(void)
 		sysdev = get_cpu_sysdev(cpu);
 		kobject_uevent(&sysdev->kobj, KOBJ_CHANGE);
 	}
+	return 1;
 }
 
 static void topology_work_fn(struct work_struct *work)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ac22bb7719f7..7b7d2764a215 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -367,10 +367,10 @@ config X86_RDC321X
 	  as R-8610-(G).
 	  If you don't have one of these chips, you should say N here.
 
-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
 	def_bool y
 	prompt "Single-depth WCHAN output"
-	depends on X86_32
+	depends on X86
 	help
 	  Calculate simpler /proc/<PID>/wchan values. If this option
 	  is disabled then wchan values will recurse back to the
diff --git a/fs/proc/base.c b/fs/proc/base.c
index d4677603c889..a48200c71127 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -347,8 +347,8 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
 static int proc_pid_schedstat(struct task_struct *task, char *buffer)
 {
 	return sprintf(buffer, "%llu %llu %lu\n",
-			task->sched_info.cpu_time,
-			task->sched_info.run_delay,
+			(unsigned long long)task->se.sum_exec_runtime,
+			(unsigned long long)task->sched_info.run_delay,
 			task->sched_info.pcount);
 }
 #endif
diff --git a/include/asm-m32r/system.h b/include/asm-m32r/system.h
index 70a57c8c002b..c980f5ba8de7 100644
--- a/include/asm-m32r/system.h
+++ b/include/asm-m32r/system.h
@@ -23,7 +23,7 @@
  */
 
 #if defined(CONFIG_FRAME_POINTER) || \
-	!defined(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER)
+	!defined(CONFIG_SCHED_OMIT_FRAME_POINTER)
 #define M32R_PUSH_FP "	push fp\n"
 #define M32R_POP_FP  "	pop fp\n"
 #else
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 55e30d114477..2d1e840ddd35 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -259,8 +259,6 @@ static inline int select_nohz_load_balancer(int cpu)
 }
 #endif
 
-extern unsigned long rt_needs_cpu(int cpu);
-
 /*
  * Only dump TASK_* tasks. (0 for all tasks)
  */
@@ -672,8 +670,7 @@ struct reclaim_state;
 struct sched_info {
 	/* cumulative counters */
 	unsigned long pcount;	/* # of times run on this cpu */
-	unsigned long long cpu_time,	/* time spent on the cpu */
-			   run_delay;	/* time spent waiting on a runqueue */
+	unsigned long long run_delay;	/* time spent waiting on a runqueue */
 
 	/* timestamps */
 	unsigned long long last_arrival,/* when we last ran on a cpu */
@@ -2224,6 +2221,7 @@ extern void normalize_rt_tasks(void);
 extern struct task_group init_task_group;
 #ifdef CONFIG_USER_SCHED
 extern struct task_group root_task_group;
+extern void set_tg_uid(struct user_struct *user);
 #endif
 
 extern struct task_group *sched_create_group(struct task_group *parent);
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 117f1b7405cf..0c5b5ac36d8e 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -49,7 +49,7 @@
 	for_each_online_node(node)			\
 		if (nr_cpus_node(node))
 
-void arch_update_cpu_topology(void);
+int arch_update_cpu_topology(void);
 
 /* Conform to ACPI 2.0 SLIT distance definitions */
 #define LOCAL_DISTANCE		10
diff --git a/kernel/Makefile b/kernel/Makefile
index 19fad003b19d..6a212b842d86 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -19,7 +19,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
 CFLAGS_REMOVE_rtmutex-debug.o = -pg
 CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
-CFLAGS_REMOVE_sched.o = -pg
 endif
 
 obj-$(CONFIG_FREEZER) += freezer.o
@@ -90,7 +89,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 
-ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
+ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
 # needed for x86 only. Why this used to be enabled for all architectures is beyond
 # me. I suspect most platforms don't need this, but until we know that for sure
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index b3179dad71be..abb6e17505e2 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -127,7 +127,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	 */
 	t1 = tsk->sched_info.pcount;
 	t2 = tsk->sched_info.run_delay;
-	t3 = tsk->sched_info.cpu_time;
+	t3 = tsk->se.sum_exec_runtime;
 
 	d->cpu_count += t1;
 
diff --git a/kernel/sched.c b/kernel/sched.c
index e4bb1dd7b308..fd835fc320b8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -261,6 +261,10 @@ struct task_group {
 	struct cgroup_subsys_state css;
 #endif
 
+#ifdef CONFIG_USER_SCHED
+	uid_t uid;
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* schedulable entities of this group on each cpu */
 	struct sched_entity **se;
@@ -286,6 +290,12 @@ struct task_group {
 
 #ifdef CONFIG_USER_SCHED
 
+/* Helper function to pass uid information to create_sched_user() */
+void set_tg_uid(struct user_struct *user)
+{
+	user->tg->uid = user->uid;
+}
+
 /*
  * Root task group.
  *	Every UID task group (including init_task_group aka UID-0) will
@@ -586,6 +596,8 @@ struct rq {
 #ifdef CONFIG_SCHEDSTATS
 	/* latency stats */
 	struct sched_info rq_sched_info;
+	unsigned long long rq_cpu_time;
+	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
 
 	/* sys_sched_yield() stats */
 	unsigned int yld_exp_empty;
@@ -703,45 +715,18 @@ static __read_mostly char *sched_feat_names[] = {
 
 #undef SCHED_FEAT
 
-static int sched_feat_open(struct inode *inode, struct file *filp)
-{
-	filp->private_data = inode->i_private;
-	return 0;
-}
-
-static ssize_t
-sched_feat_read(struct file *filp, char __user *ubuf,
-		size_t cnt, loff_t *ppos)
+static int sched_feat_show(struct seq_file *m, void *v)
 {
-	char *buf;
-	int r = 0;
-	int len = 0;
 	int i;
 
 	for (i = 0; sched_feat_names[i]; i++) {
-		len += strlen(sched_feat_names[i]);
-		len += 4;
-	}
-
-	buf = kmalloc(len + 2, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-
-	for (i = 0; sched_feat_names[i]; i++) {
-		if (sysctl_sched_features & (1UL << i))
-			r += sprintf(buf + r, "%s ", sched_feat_names[i]);
-		else
-			r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
+		if (!(sysctl_sched_features & (1UL << i)))
+			seq_puts(m, "NO_");
+		seq_printf(m, "%s ", sched_feat_names[i]);
 	}
+	seq_puts(m, "\n");
 
-	r += sprintf(buf + r, "\n");
-	WARN_ON(r >= len + 2);
-
-	r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-
-	kfree(buf);
-
-	return r;
+	return 0;
 }
 
 static ssize_t
@@ -786,10 +771,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
 	return cnt;
 }
 
+static int sched_feat_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, sched_feat_show, NULL);
+}
+
 static struct file_operations sched_feat_fops = {
 	.open		= sched_feat_open,
-	.read		= sched_feat_read,
-	.write		= sched_feat_write,
+	.write		= sched_feat_write,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
 };
 
 static __init int sched_init_debug(void)
@@ -1474,27 +1466,13 @@ static void
 update_group_shares_cpu(struct task_group *tg, int cpu,
 			unsigned long sd_shares, unsigned long sd_rq_weight)
 {
-	int boost = 0;
 	unsigned long shares;
 	unsigned long rq_weight;
 
 	if (!tg->se[cpu])
 		return;
 
-	rq_weight = tg->cfs_rq[cpu]->load.weight;
-
-	/*
-	 * If there are currently no tasks on the cpu pretend there is one of
-	 * average load so that when a new task gets to run here it will not
-	 * get delayed by group starvation.
-	 */
-	if (!rq_weight) {
-		boost = 1;
-		rq_weight = NICE_0_LOAD;
-	}
-
-	if (unlikely(rq_weight > sd_rq_weight))
-		rq_weight = sd_rq_weight;
+	rq_weight = tg->cfs_rq[cpu]->rq_weight;
 
 	/*
 	 * \Sum shares * rq_weight
@@ -1502,7 +1480,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
 	 *	\Sum rq_weight
 	 *
 	 */
-	shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+	shares = (sd_shares * rq_weight) / sd_rq_weight;
 	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
 
 	if (abs(shares - tg->se[cpu]->load.weight) >
@@ -1511,11 +1489,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
 		unsigned long flags;
 
 		spin_lock_irqsave(&rq->lock, flags);
-		/*
-		 * record the actual number of shares, not the boosted amount.
-		 */
-		tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-		tg->cfs_rq[cpu]->rq_weight = rq_weight;
+		tg->cfs_rq[cpu]->shares = shares;
 
 		__set_se_shares(tg->se[cpu], shares);
 		spin_unlock_irqrestore(&rq->lock, flags);
@@ -1529,13 +1503,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-	unsigned long rq_weight = 0;
+	unsigned long weight, rq_weight = 0;
 	unsigned long shares = 0;
 	struct sched_domain *sd = data;
 	int i;
 
 	for_each_cpu_mask(i, sd->span) {
-		rq_weight += tg->cfs_rq[i]->load.weight;
+		/*
+		 * If there are currently no tasks on the cpu pretend there
+		 * is one of average load so that when a new task gets to
+		 * run here it will not get delayed by group starvation.
+		 */
+		weight = tg->cfs_rq[i]->load.weight;
+		if (!weight)
+			weight = NICE_0_LOAD;
+
+		tg->cfs_rq[i]->rq_weight = weight;
+		rq_weight += weight;
 		shares += tg->cfs_rq[i]->shares;
 	}
 
@@ -1545,9 +1529,6 @@ static int tg_shares_up(struct task_group *tg, void *data)
 	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
 		shares = tg->shares;
 
-	if (!rq_weight)
-		rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
-
 	for_each_cpu_mask(i, sd->span)
 		update_group_shares_cpu(tg, i, shares, rq_weight);
 
@@ -1612,6 +1593,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 #endif
 
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+	__releases(this_rq->lock)
+	__acquires(busiest->lock)
+	__acquires(this_rq->lock)
+{
+	int ret = 0;
+
+	if (unlikely(!irqs_disabled())) {
+		/* printk() doesn't work good under rq->lock */
+		spin_unlock(&this_rq->lock);
+		BUG_ON(1);
+	}
+	if (unlikely(!spin_trylock(&busiest->lock))) {
+		if (busiest < this_rq) {
+			spin_unlock(&this_rq->lock);
+			spin_lock(&busiest->lock);
+			spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
+			ret = 1;
+		} else
+			spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
+	}
+	return ret;
+}
+
+static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+	__releases(busiest->lock)
+{
+	spin_unlock(&busiest->lock);
+	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+}
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -2254,6 +2268,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 
 	smp_wmb();
 	rq = task_rq_lock(p, &flags);
+	update_rq_clock(rq);
 	old_state = p->state;
 	if (!(old_state & state))
 		goto out;
@@ -2311,7 +2326,6 @@ out_activate:
 		schedstat_inc(p, se.nr_wakeups_local);
 	else
 		schedstat_inc(p, se.nr_wakeups_remote);
-	update_rq_clock(rq);
 	activate_task(rq, p, 1);
 	success = 1;
 
@@ -2812,40 +2826,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 }
 
 /*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
- */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
-	__releases(this_rq->lock)
-	__acquires(busiest->lock)
-	__acquires(this_rq->lock)
-{
-	int ret = 0;
-
-	if (unlikely(!irqs_disabled())) {
-		/* printk() doesn't work good under rq->lock */
-		spin_unlock(&this_rq->lock);
-		BUG_ON(1);
-	}
-	if (unlikely(!spin_trylock(&busiest->lock))) {
-		if (busiest < this_rq) {
-			spin_unlock(&this_rq->lock);
-			spin_lock(&busiest->lock);
-			spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
-			ret = 1;
-		} else
-			spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
-	}
-	return ret;
-}
-
-static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
-	__releases(busiest->lock)
-{
-	spin_unlock(&busiest->lock);
-	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
-}
-
-/*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
  * allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@ -3707,7 +3687,7 @@ out_balanced:
 static void idle_balance(int this_cpu, struct rq *this_rq)
 {
 	struct sched_domain *sd;
-	int pulled_task = -1;
+	int pulled_task = 0;
 	unsigned long next_balance = jiffies + HZ;
 	cpumask_t tmpmask;
 
@@ -6126,7 +6106,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
 
 /*
  * Figure out where task on dead CPU should go, use force if necessary.
- * NOTE: interrupts should be disabled by the caller
  */
 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 {
@@ -6638,28 +6617,6 @@ early_initcall(migration_init);
 
 #ifdef CONFIG_SCHED_DEBUG
 
-static inline const char *sd_level_to_string(enum sched_domain_level lvl)
-{
-	switch (lvl) {
-	case SD_LV_NONE:
-		return "NONE";
-	case SD_LV_SIBLING:
-		return "SIBLING";
-	case SD_LV_MC:
-		return "MC";
-	case SD_LV_CPU:
-		return "CPU";
-	case SD_LV_NODE:
-		return "NODE";
-	case SD_LV_ALLNODES:
-		return "ALLNODES";
-	case SD_LV_MAX:
-		return "MAX";
-
-	}
-	return "MAX";
-}
-
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				  cpumask_t *groupmask)
 {
@@ -6679,8 +6636,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 		return -1;
 	}
 
-	printk(KERN_CONT "span %s level %s\n",
-		str, sd_level_to_string(sd->level));
+	printk(KERN_CONT "span %s level %s\n", str, sd->name);
 
 	if (!cpu_isset(cpu, sd->span)) {
 		printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6816,6 +6772,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 				SD_BALANCE_EXEC |
 				SD_SHARE_CPUPOWER |
 				SD_SHARE_PKG_RESOURCES);
+		if (nr_node_ids == 1)
+			pflags &= ~SD_SERIALIZE;
 	}
 	if (~cflags & pflags)
 		return 0;
@@ -7336,13 +7294,21 @@ struct allmasks {
 };
 
 #if NR_CPUS > 128
-#define SCHED_CPUMASK_ALLOC		1
-#define SCHED_CPUMASK_FREE(v)		kfree(v)
-#define SCHED_CPUMASK_DECLARE(v)	struct allmasks *v
+#define SCHED_CPUMASK_DECLARE(v)	struct allmasks *v
+static inline void sched_cpumask_alloc(struct allmasks **masks)
+{
+	*masks = kmalloc(sizeof(**masks), GFP_KERNEL);
+}
+static inline void sched_cpumask_free(struct allmasks *masks)
+{
+	kfree(masks);
+}
 #else
-#define SCHED_CPUMASK_ALLOC		0
-#define SCHED_CPUMASK_FREE(v)
-#define SCHED_CPUMASK_DECLARE(v)	struct allmasks _v, *v = &_v
+#define SCHED_CPUMASK_DECLARE(v)	struct allmasks _v, *v = &_v
+static inline void sched_cpumask_alloc(struct allmasks **masks)
+{ }
+static inline void sched_cpumask_free(struct allmasks *masks)
+{ }
 #endif
 
 #define SCHED_CPUMASK_VAR(v, a)	cpumask_t *v = (cpumask_t *) \
@@ -7418,9 +7384,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		return -ENOMEM;
 	}
 
-#if SCHED_CPUMASK_ALLOC
 	/* get space for all scratch cpumask variables */
-	allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
+	sched_cpumask_alloc(&allmasks);
 	if (!allmasks) {
 		printk(KERN_WARNING "Cannot alloc cpumask array\n");
 		kfree(rd);
@@ -7429,7 +7394,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #endif
 		return -ENOMEM;
 	}
-#endif
+
 	tmpmask = (cpumask_t *)allmasks;
 
 
@@ -7683,13 +7648,13 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		cpu_attach_domain(sd, rd, i);
 	}
 
-	SCHED_CPUMASK_FREE((void *)allmasks);
+	sched_cpumask_free(allmasks);
 	return 0;
 
 #ifdef CONFIG_NUMA
 error:
 	free_sched_groups(cpu_map, tmpmask);
-	SCHED_CPUMASK_FREE((void *)allmasks);
+	sched_cpumask_free(allmasks);
 	kfree(rd);
 	return -ENOMEM;
 #endif
@@ -7712,8 +7677,14 @@ static struct sched_domain_attr *dattr_cur;
  */
 static cpumask_t fallback_doms;
 
-void __attribute__((weak)) arch_update_cpu_topology(void)
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * cpu core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+int __attribute__((weak)) arch_update_cpu_topology(void)
 {
+	return 0;
 }
 
 /*
@@ -7753,8 +7724,6 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
 	cpumask_t tmpmask;
 	int i;
 
-	unregister_sched_domain_sysctl();
-
 	for_each_cpu_mask_nr(i, *cpu_map)
 		cpu_attach_domain(NULL, &def_root_domain, i);
 	synchronize_sched();
@@ -7807,17 +7776,21 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 			     struct sched_domain_attr *dattr_new)
 {
 	int i, j, n;
+	int new_topology;
 
 	mutex_lock(&sched_domains_mutex);
 
 	/* always unregister in case we don't destroy any domains */
 	unregister_sched_domain_sysctl();
 
+	/* Let architecture update cpu core mappings. */
+	new_topology = arch_update_cpu_topology();
+
 	n = doms_new ? ndoms_new : 0;
 
 	/* Destroy deleted domains */
 	for (i = 0; i < ndoms_cur; i++) {
-		for (j = 0; j < n; j++) {
+		for (j = 0; j < n && !new_topology; j++) {
 			if (cpus_equal(doms_cur[i], doms_new[j])
 			    && dattrs_equal(dattr_cur, i, dattr_new, j))
 				goto match1;
@@ -7832,12 +7805,12 @@ match1:
 		ndoms_cur = 0;
 		doms_new = &fallback_doms;
 		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-		dattr_new = NULL;
+		WARN_ON_ONCE(dattr_new);
 	}
 
 	/* Build new domains */
 	for (i = 0; i < ndoms_new; i++) {
-		for (j = 0; j < ndoms_cur; j++) {
+		for (j = 0; j < ndoms_cur && !new_topology; j++) {
 			if (cpus_equal(doms_new[i], doms_cur[j])
 			    && dattrs_equal(dattr_new, i, dattr_cur, j))
 				goto match2;
@@ -8492,7 +8465,7 @@ static
 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	struct cfs_rq *cfs_rq;
-	struct sched_entity *se, *parent_se;
+	struct sched_entity *se;
 	struct rq *rq;
 	int i;
 
@@ -8508,18 +8481,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
 
-		cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
-				      GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
+				      GFP_KERNEL, cpu_to_node(i));
 		if (!cfs_rq)
 			goto err;
 
-		se = kmalloc_node(sizeof(struct sched_entity),
-				  GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		se = kzalloc_node(sizeof(struct sched_entity),
+				  GFP_KERNEL, cpu_to_node(i));
 		if (!se)
 			goto err;
 
-		parent_se = parent ? parent->se[i] : NULL;
-		init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
+		init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
 	}
 
 	return 1;
@@ -8580,7 +8552,7 @@ static
 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	struct rt_rq *rt_rq;
-	struct sched_rt_entity *rt_se, *parent_se;
+	struct sched_rt_entity *rt_se;
 	struct rq *rq;
 	int i;
 
@@ -8597,18 +8569,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
 
-		rt_rq = kmalloc_node(sizeof(struct rt_rq),
-				     GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		rt_rq = kzalloc_node(sizeof(struct rt_rq),
+				     GFP_KERNEL, cpu_to_node(i));
 		if (!rt_rq)
 			goto err;
 
-		rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
-				     GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
+				     GFP_KERNEL, cpu_to_node(i));
 		if (!rt_se)
 			goto err;
 
-		parent_se = parent ? parent->rt_se[i] : NULL;
-		init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
+		init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
 	}
 
 	return 1;
@@ -9251,11 +9222,12 @@ struct cgroup_subsys cpu_cgroup_subsys = {
  * (balbir@in.ibm.com).
  */
 
-/* track cpu usage of a group of tasks */
+/* track cpu usage of a group of tasks and its child groups */
 struct cpuacct {
 	struct cgroup_subsys_state css;
 	/* cpuusage holds pointer to a u64-type object on every cpu */
 	u64 *cpuusage;
+	struct cpuacct *parent;
 };
 
 struct cgroup_subsys cpuacct_subsys;
@@ -9289,6 +9261,9 @@ static struct cgroup_subsys_state *cpuacct_create(
 		return ERR_PTR(-ENOMEM);
 	}
 
+	if (cgrp->parent)
+		ca->parent = cgroup_ca(cgrp->parent);
+
 	return &ca->css;
 }
 
@@ -9302,6 +9277,41 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 	kfree(ca);
 }
 
+static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+{
+	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+	u64 data;
+
+#ifndef CONFIG_64BIT
+	/*
+	 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
+	 */
+	spin_lock_irq(&cpu_rq(cpu)->lock);
+	data = *cpuusage;
+	spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+	data = *cpuusage;
+#endif
+
+	return data;
+}
+
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+{
+	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+
+#ifndef CONFIG_64BIT
+	/*
+	 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
+	 */
+	spin_lock_irq(&cpu_rq(cpu)->lock);
+	*cpuusage = val;
+	spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+	*cpuusage = val;
+#endif
+}
+
 /* return total cpu usage (in nanoseconds) of a group */
 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
 {
@@ -9309,17 +9319,8 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
 	u64 totalcpuusage = 0;
 	int i;
 
-	for_each_possible_cpu(i) {
-		u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
-
-		/*
-		 * Take rq->lock to make 64-bit addition safe on 32-bit
-		 * platforms.
-		 */
-		spin_lock_irq(&cpu_rq(i)->lock);
-		totalcpuusage += *cpuusage;
-		spin_unlock_irq(&cpu_rq(i)->lock);
-	}
+	for_each_present_cpu(i)
+		totalcpuusage += cpuacct_cpuusage_read(ca, i);
 
 	return totalcpuusage;
 }
@@ -9336,23 +9337,39 @@ static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
 		goto out;
 	}
 
-	for_each_possible_cpu(i) {
-		u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
+	for_each_present_cpu(i)
+		cpuacct_cpuusage_write(ca, i, 0);
 
-		spin_lock_irq(&cpu_rq(i)->lock);
-		*cpuusage = 0;
-		spin_unlock_irq(&cpu_rq(i)->lock);
-	}
 out:
 	return err;
 }
 
+static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
+				   struct seq_file *m)
+{
+	struct cpuacct *ca = cgroup_ca(cgroup);
+	u64 percpu;
+	int i;
+
+	for_each_present_cpu(i) {
+		percpu = cpuacct_cpuusage_read(ca, i);
+		seq_printf(m, "%llu ", (unsigned long long) percpu);
+	}
+	seq_printf(m, "\n");
+	return 0;
+}
+
 static struct cftype files[] = {
 	{
 		.name = "usage",
 		.read_u64 = cpuusage_read,
 		.write_u64 = cpuusage_write,
 	},
+	{
+		.name = "usage_percpu",
+		.read_seq_string = cpuacct_percpu_seq_read,
+	},
+
 };
 
 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -9368,14 +9385,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
 static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 {
 	struct cpuacct *ca;
+	int cpu;
 
 	if (!cpuacct_subsys.active)
 		return;
 
+	cpu = task_cpu(tsk);
 	ca = task_ca(tsk);
-	if (ca) {
-		u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
 
+	for (; ca; ca = ca->parent) {
+		u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
 		*cpuusage += cputime;
 	}
 }
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 26ed8e3d1c15..4293cfa9681d 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -53,6 +53,40 @@ static unsigned long nsec_low(unsigned long long nsec)
 
 #define SPLIT_NS(x) nsec_high(x), nsec_low(x)
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void print_cfs_group_stats(struct seq_file *m, int cpu,
+		struct task_group *tg)
+{
+	struct sched_entity *se = tg->se[cpu];
+	if (!se)
+		return;
+
+#define P(F) \
+	SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)F)
+#define PN(F) \
+	SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
+
+	PN(se->exec_start);
+	PN(se->vruntime);
+	PN(se->sum_exec_runtime);
+#ifdef CONFIG_SCHEDSTATS
+	PN(se->wait_start);
+	PN(se->sleep_start);
+	PN(se->block_start);
+	PN(se->sleep_max);
+	PN(se->block_max);
+	PN(se->exec_max);
+	PN(se->slice_max);
+	PN(se->wait_max);
+	PN(se->wait_sum);
+	P(se->wait_count);
+#endif
+	P(se->load.weight);
+#undef PN
+#undef P
+}
+#endif
+
 static void
 print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 {
@@ -121,20 +155,19 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 
 #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
 	char path[128] = "";
-	struct cgroup *cgroup = NULL;
 	struct task_group *tg = cfs_rq->tg;
 
-	if (tg)
-		cgroup = tg->css.cgroup;
-
-	if (cgroup)
-		cgroup_path(cgroup, path, sizeof(path));
+	cgroup_path(tg->css.cgroup, path, sizeof(path));
 
 	SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
+#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
+	{
+		uid_t uid = cfs_rq->tg->uid;
+		SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
+	}
 #else
 	SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
 #endif
-
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "exec_clock",
 			SPLIT_NS(cfs_rq->exec_clock));
 
@@ -168,6 +201,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #ifdef CONFIG_SMP
 	SEQ_printf(m, "  .%-30s: %lu\n", "shares", cfs_rq->shares);
 #endif
+	print_cfs_group_stats(m, cpu, cfs_rq->tg);
 #endif
 }
 
@@ -175,14 +209,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 {
 #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
 	char path[128] = "";
-	struct cgroup *cgroup = NULL;
 	struct task_group *tg = rt_rq->tg;
 
-	if (tg)
-		cgroup = tg->css.cgroup;
-
-	if (cgroup)
-		cgroup_path(cgroup, path, sizeof(path));
+	cgroup_path(tg->css.cgroup, path, sizeof(path));
 
 	SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
 #else
@@ -272,7 +301,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
 	u64 now = ktime_to_ns(ktime_get());
 	int cpu;
 
-	SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n",
+	SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n",
 		init_utsname()->release,
 		(int)strcspn(init_utsname()->version, " "),
 		init_utsname()->version);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 98345e45b059..5ad4440f0fc4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -492,6 +492,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
 	 * overflow on 32 bits):
 	 */
 	delta_exec = (unsigned long)(now - curr->exec_start);
+	if (!delta_exec)
+		return;
 
 	__update_curr(cfs_rq, curr, delta_exec);
 	curr->exec_start = now;
@@ -1345,12 +1347,11 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 {
 	struct task_struct *curr = rq->curr;
 	struct sched_entity *se = &curr->se, *pse = &p->se;
+	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 
-	if (unlikely(rt_prio(p->prio))) {
-		struct cfs_rq *cfs_rq = task_cfs_rq(curr);
-
-		update_rq_clock(rq);
-		update_curr(cfs_rq);
+	update_curr(cfs_rq);
+
+	if (unlikely(rt_prio(p->prio))) {
 		resched_task(curr);
 		return;
 	}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d9ba9d5f99d6..51d2af3e6191 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -77,7 +77,7 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 }
 
 #define for_each_leaf_rt_rq(rt_rq, rq) \
-	list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
+	list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
 
 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 {
@@ -537,13 +537,13 @@ static void update_curr_rt(struct rq *rq)
 	for_each_sched_rt_entity(rt_se) {
 		rt_rq = rt_rq_of_se(rt_se);
 
-		spin_lock(&rt_rq->rt_runtime_lock);
 		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
+			spin_lock(&rt_rq->rt_runtime_lock);
 			rt_rq->rt_time += delta_exec;
 			if (sched_rt_runtime_exceeded(rt_rq))
 				resched_task(curr);
+			spin_unlock(&rt_rq->rt_runtime_lock);
 		}
-		spin_unlock(&rt_rq->rt_runtime_lock);
 	}
 }
 
@@ -909,9 +909,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 /* Only try algorithms three times */
 #define RT_MAX_TRIES 3
 
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
-static void double_unlock_balance(struct rq *this_rq, struct rq *busiest);
-
 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
 
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 7dbf72a2b02c..3b01098164c8 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -31,7 +31,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
 		    rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
 		    rq->sched_switch, rq->sched_count, rq->sched_goidle,
 		    rq->ttwu_count, rq->ttwu_local,
-		    rq->rq_sched_info.cpu_time,
+		    rq->rq_cpu_time,
 		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
 
 		seq_printf(seq, "\n");
@@ -123,7 +123,7 @@ static inline void
 rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 {
 	if (rq)
-		rq->rq_sched_info.cpu_time += delta;
+		rq->rq_cpu_time += delta;
 }
 
 static inline void
@@ -236,7 +236,6 @@ static inline void sched_info_depart(struct task_struct *t)
 	unsigned long long delta = task_rq(t)->clock -
 		t->sched_info.last_arrival;
 
-	t->sched_info.cpu_time += delta;
 	rq_sched_info_depart(task_rq(t), delta);
 
 	if (t->state == TASK_RUNNING)
diff --git a/kernel/user.c b/kernel/user.c
index 39d6159fae43..cec2224bc9f5 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -101,6 +101,8 @@ static int sched_create_user(struct user_struct *up)
 	if (IS_ERR(up->tg))
 		rc = -ENOMEM;
 
+	set_tg_uid(up);
+
 	return rc;
 }
 