author		Linus Torvalds <torvalds@linux-foundation.org>	2008-12-28 15:27:58 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-12-28 15:27:58 -0500
commit		a39b863342b8aba52390092be95db58f6ed56061 (patch)
tree		a952625e9815c0a4d7fe9f85c33908068513429a
parent		b0f4b285d7ed174804658539129a834270f4829a (diff)
parent		4e202284e6ac1695df3eb4a0e549ea78addfb663 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (31 commits)
  sched: fix warning in fs/proc/base.c
  schedstat: consolidate per-task cpu runtime stats
  sched: use RCU variant of list traversal in for_each_leaf_rt_rq()
  sched, cpuacct: export percpu cpuacct cgroup stats
  sched, cpuacct: refactoring cpuusage_read / cpuusage_write
  sched: optimize update_curr()
  sched: fix wakeup preemption clock
  sched: add missing arch_update_cpu_topology() call
  sched: let arch_update_cpu_topology indicate if topology changed
  sched: idle_balance() does not call load_balance_newidle()
  sched: fix sd_parent_degenerate on non-numa smp machine
  sched: add uid information to sched_debug for CONFIG_USER_SCHED
  sched: move double_unlock_balance() higher
  sched: update comment for move_task_off_dead_cpu
  sched: fix inconsistency when redistribute per-cpu tg->cfs_rq shares
  sched/rt: removed unneeded defintion
  sched: add hierarchical accounting to cpu accounting controller
  sched: include group statistics in /proc/sched_debug
  sched: rename SCHED_NO_NO_OMIT_FRAME_POINTER => SCHED_OMIT_FRAME_POINTER
  sched: clean up SCHED_CPUMASK_ALLOC
  ...
-rw-r--r--	Documentation/controllers/cpuacct.txt	 32
-rw-r--r--	Documentation/scheduler/sched-arch.txt	  4
-rw-r--r--	arch/ia64/Kconfig	  2
-rw-r--r--	arch/m32r/Kconfig	  2
-rw-r--r--	arch/mips/Kconfig	  2
-rw-r--r--	arch/powerpc/Kconfig	  2
-rw-r--r--	arch/s390/kernel/topology.c	  5
-rw-r--r--	arch/x86/Kconfig	  4
-rw-r--r--	fs/proc/base.c	  4
-rw-r--r--	include/asm-m32r/system.h	  2
-rw-r--r--	include/linux/sched.h	  6
-rw-r--r--	include/linux/topology.h	  2
-rw-r--r--	kernel/Makefile	  3
-rw-r--r--	kernel/delayacct.c	  2
-rw-r--r--	kernel/sched.c	367
-rw-r--r--	kernel/sched_debug.c	 57
-rw-r--r--	kernel/sched_fair.c	  9
-rw-r--r--	kernel/sched_rt.c	  9
-rw-r--r--	kernel/sched_stats.h	  5
-rw-r--r--	kernel/user.c	  2
20 files changed, 299 insertions, 222 deletions
diff --git a/Documentation/controllers/cpuacct.txt b/Documentation/controllers/cpuacct.txt
new file mode 100644
index 000000000000..bb775fbe43d7
--- /dev/null
+++ b/Documentation/controllers/cpuacct.txt
@@ -0,0 +1,32 @@
+CPU Accounting Controller
+-------------------------
+
+The CPU accounting controller is used to group tasks using cgroups and
+account the CPU usage of these groups of tasks.
+
+The CPU accounting controller supports multi-hierarchy groups. An accounting
+group accumulates the CPU usage of all of its child groups and the tasks
+directly present in its group.
+
+Accounting groups can be created by first mounting the cgroup filesystem.
+
+# mkdir /cgroups
+# mount -t cgroup -ocpuacct none /cgroups
+
+With the above step, the initial or the parent accounting group
+becomes visible at /cgroups. At bootup, this group includes all the
+tasks in the system. /cgroups/tasks lists the tasks in this cgroup.
+/cgroups/cpuacct.usage gives the CPU time (in nanoseconds) obtained by
+this group which is essentially the CPU time obtained by all the tasks
+in the system.
+
+New accounting groups can be created under the parent group /cgroups.
+
+# cd /cgroups
+# mkdir g1
+# echo $$ > g1
+
+The above steps create a new group g1 and move the current shell
+process (bash) into it. CPU time consumed by this bash and its children
+can be obtained from g1/cpuacct.usage and the same is accumulated in
+/cgroups/cpuacct.usage also.
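
For quick reference, a minimal usage sketch following the steps documented
above (this is an illustrative recap, not part of the patch; it assumes the
/cgroups mount point from the example, attaches the shell via the standard
cgroup "tasks" file, and reads cpuacct.usage_percpu, the per-CPU statistics
file added elsewhere in this merge):

# mount -t cgroup -o cpuacct none /cgroups
# mkdir /cgroups/g1
# echo $$ > /cgroups/g1/tasks
# cat /cgroups/g1/cpuacct.usage           (total CPU time of g1, in nanoseconds)
# cat /cgroups/g1/cpuacct.usage_percpu    (the same usage, broken out per CPU)
# cat /cgroups/cpuacct.usage              (the parent group accumulates g1's usage)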
diff --git a/Documentation/scheduler/sched-arch.txt b/Documentation/scheduler/sched-arch.txt
index 941615a9769b..d43dbcbd163b 100644
--- a/Documentation/scheduler/sched-arch.txt
+++ b/Documentation/scheduler/sched-arch.txt
@@ -8,7 +8,7 @@ Context switch
 By default, the switch_to arch function is called with the runqueue
 locked. This is usually not a problem unless switch_to may need to
 take the runqueue lock. This is usually due to a wake up operation in
-the context switch. See include/asm-ia64/system.h for an example.
+the context switch. See arch/ia64/include/asm/system.h for an example.
 
 To request the scheduler call switch_to with the runqueue unlocked,
 you must `#define __ARCH_WANT_UNLOCKED_CTXSW` in a header file
@@ -23,7 +23,7 @@ disabled. Interrupts may be enabled over the call if it is likely to
 introduce a significant interrupt latency by adding the line
 `#define __ARCH_WANT_INTERRUPTS_ON_CTXSW` in the same place as for
 unlocked context switches. This define also implies
-`__ARCH_WANT_UNLOCKED_CTXSW`. See arch/arm/include/asm/system.h for an
+`__ARCH_WANT_UNLOCKED_CTXSW`. See arch/arm/include/asm/system.h for an
 example.
 
 
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 6bd91ed7cd03..7fa8f615ba6e 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -99,7 +99,7 @@ config GENERIC_IOMAP
 	bool
 	default y
 
-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
 	bool
 	default y
 
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index dbaed4a63815..29047d5c259a 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -273,7 +273,7 @@ config GENERIC_CALIBRATE_DELAY
 	bool
 	default y
 
-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
 	bool
 	default y
 
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index f4af967a6b30..a5255e7c79e0 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -653,7 +653,7 @@ config GENERIC_CMOS_UPDATE
 	bool
 	default y
 
-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
 	bool
 	default y
 
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 525c13a4de93..adb23ea1c1ef 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -141,7 +141,7 @@ config GENERIC_NVRAM
 	bool
 	default y if PPC32
 
-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
 	bool
 	default y
 
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index a947899dcba1..bf96f1b5c6ec 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -212,7 +212,7 @@ static void update_cpu_core_map(void)
 		cpu_core_map[cpu] = cpu_coregroup_map(cpu);
 }
 
-void arch_update_cpu_topology(void)
+int arch_update_cpu_topology(void)
 {
 	struct tl_info *info = tl_info;
 	struct sys_device *sysdev;
@@ -221,7 +221,7 @@ void arch_update_cpu_topology(void)
 	if (!machine_has_topology) {
 		update_cpu_core_map();
 		topology_update_polarization_simple();
-		return;
+		return 0;
 	}
 	stsi(info, 15, 1, 2);
 	tl_to_cores(info);
@@ -230,6 +230,7 @@ void arch_update_cpu_topology(void)
 		sysdev = get_cpu_sysdev(cpu);
 		kobject_uevent(&sysdev->kobj, KOBJ_CHANGE);
 	}
+	return 1;
 }
 
 static void topology_work_fn(struct work_struct *work)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7b17f9d72ba1..98a0ed52b5c3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -368,10 +368,10 @@ config X86_RDC321X
 	  as R-8610-(G).
 	  If you don't have one of these chips, you should say N here.
 
-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
 	def_bool y
 	prompt "Single-depth WCHAN output"
-	depends on X86_32
+	depends on X86
 	help
 	  Calculate simpler /proc/<PID>/wchan values. If this option
 	  is disabled then wchan values will recurse back to the
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 0a8a5f880349..cad92c1ac2b3 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -347,8 +347,8 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
 static int proc_pid_schedstat(struct task_struct *task, char *buffer)
 {
 	return sprintf(buffer, "%llu %llu %lu\n",
-			task->sched_info.cpu_time,
-			task->sched_info.run_delay,
+			(unsigned long long)task->se.sum_exec_runtime,
+			(unsigned long long)task->sched_info.run_delay,
 			task->sched_info.pcount);
 }
 #endif
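
As a brief, illustrative aside on the user-visible effect of this hunk: the
three fields printed by the sprintf() above are, after this change, the task's
time spent on the CPU (se.sum_exec_runtime, in nanoseconds), its time spent
waiting on a runqueue (sched_info.run_delay, also in nanoseconds), and the
number of times it has run on a CPU (sched_info.pcount). They can be inspected
for the current shell with, for example:

# cat /proc/$$/schedstat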
diff --git a/include/asm-m32r/system.h b/include/asm-m32r/system.h
index 70a57c8c002b..c980f5ba8de7 100644
--- a/include/asm-m32r/system.h
+++ b/include/asm-m32r/system.h
@@ -23,7 +23,7 @@
  */
 
 #if defined(CONFIG_FRAME_POINTER) || \
-	!defined(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER)
+	!defined(CONFIG_SCHED_OMIT_FRAME_POINTER)
 #define M32R_PUSH_FP "	push fp\n"
 #define M32R_POP_FP  "	pop  fp\n"
 #else
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0a1094d84b77..8395e715809d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -260,8 +260,6 @@ static inline int select_nohz_load_balancer(int cpu)
 }
 #endif
 
-extern unsigned long rt_needs_cpu(int cpu);
-
 /*
  * Only dump TASK_* tasks. (0 for all tasks)
  */
@@ -669,8 +667,7 @@ struct reclaim_state;
 struct sched_info {
 	/* cumulative counters */
 	unsigned long pcount;	      /* # of times run on this cpu */
-	unsigned long long cpu_time,  /* time spent on the cpu */
-			   run_delay; /* time spent waiting on a runqueue */
+	unsigned long long run_delay; /* time spent waiting on a runqueue */
 
 	/* timestamps */
 	unsigned long long last_arrival,/* when we last ran on a cpu */
@@ -2210,6 +2207,7 @@ extern void normalize_rt_tasks(void);
 extern struct task_group init_task_group;
 #ifdef CONFIG_USER_SCHED
 extern struct task_group root_task_group;
+extern void set_tg_uid(struct user_struct *user);
 #endif
 
 extern struct task_group *sched_create_group(struct task_group *parent);
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 117f1b7405cf..0c5b5ac36d8e 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -49,7 +49,7 @@
 	for_each_online_node(node)			\
 		if (nr_cpus_node(node))
 
-void arch_update_cpu_topology(void);
+int arch_update_cpu_topology(void);
 
 /* Conform to ACPI 2.0 SLIT distance definitions */
 #define LOCAL_DISTANCE		10
diff --git a/kernel/Makefile b/kernel/Makefile
index b1e6b6625ea2..027edda63511 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -19,7 +19,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
 CFLAGS_REMOVE_rtmutex-debug.o = -pg
 CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
-CFLAGS_REMOVE_sched.o = -pg
 endif
 
 obj-$(CONFIG_FREEZER) += freezer.o
@@ -90,7 +89,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 
-ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
+ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
 # needed for x86 only. Why this used to be enabled for all architectures is beyond
 # me. I suspect most platforms don't need this, but until we know that for sure
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index b3179dad71be..abb6e17505e2 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -127,7 +127,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	 */
 	t1 = tsk->sched_info.pcount;
 	t2 = tsk->sched_info.run_delay;
-	t3 = tsk->sched_info.cpu_time;
+	t3 = tsk->se.sum_exec_runtime;
 
 	d->cpu_count += t1;
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 3798b954e6e8..748ff924a290 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -267,6 +267,10 @@ struct task_group {
 	struct cgroup_subsys_state css;
 #endif
 
+#ifdef CONFIG_USER_SCHED
+	uid_t uid;
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* schedulable entities of this group on each cpu */
 	struct sched_entity **se;
@@ -292,6 +296,12 @@ struct task_group {
 
 #ifdef CONFIG_USER_SCHED
 
+/* Helper function to pass uid information to create_sched_user() */
+void set_tg_uid(struct user_struct *user)
+{
+	user->tg->uid = user->uid;
+}
+
 /*
  * Root task group.
  *	Every UID task group (including init_task_group aka UID-0) will
@@ -594,6 +604,8 @@ struct rq {
 #ifdef CONFIG_SCHEDSTATS
 	/* latency stats */
 	struct sched_info rq_sched_info;
+	unsigned long long rq_cpu_time;
+	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
 
 	/* sys_sched_yield() stats */
 	unsigned int yld_exp_empty;
@@ -711,45 +723,18 @@ static __read_mostly char *sched_feat_names[] = {
 
 #undef SCHED_FEAT
 
-static int sched_feat_open(struct inode *inode, struct file *filp)
-{
-	filp->private_data = inode->i_private;
-	return 0;
-}
-
-static ssize_t
-sched_feat_read(struct file *filp, char __user *ubuf,
-		size_t cnt, loff_t *ppos)
+static int sched_feat_show(struct seq_file *m, void *v)
 {
-	char *buf;
-	int r = 0;
-	int len = 0;
 	int i;
 
 	for (i = 0; sched_feat_names[i]; i++) {
-		len += strlen(sched_feat_names[i]);
-		len += 4;
-	}
-
-	buf = kmalloc(len + 2, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-
-	for (i = 0; sched_feat_names[i]; i++) {
-		if (sysctl_sched_features & (1UL << i))
-			r += sprintf(buf + r, "%s ", sched_feat_names[i]);
-		else
-			r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
+		if (!(sysctl_sched_features & (1UL << i)))
+			seq_puts(m, "NO_");
+		seq_printf(m, "%s ", sched_feat_names[i]);
 	}
+	seq_puts(m, "\n");
 
-	r += sprintf(buf + r, "\n");
-	WARN_ON(r >= len + 2);
-
-	r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-
-	kfree(buf);
-
-	return r;
+	return 0;
 }
 
 static ssize_t
@@ -794,10 +779,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
 	return cnt;
 }
 
+static int sched_feat_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, sched_feat_show, NULL);
+}
+
 static struct file_operations sched_feat_fops = {
 	.open		= sched_feat_open,
-	.read		= sched_feat_read,
-	.write		= sched_feat_write,
+	.write		= sched_feat_write,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
 };
 
 static __init int sched_init_debug(void)
@@ -1482,27 +1474,13 @@ static void
 update_group_shares_cpu(struct task_group *tg, int cpu,
 			unsigned long sd_shares, unsigned long sd_rq_weight)
 {
-	int boost = 0;
 	unsigned long shares;
 	unsigned long rq_weight;
 
 	if (!tg->se[cpu])
 		return;
 
-	rq_weight = tg->cfs_rq[cpu]->load.weight;
-
-	/*
-	 * If there are currently no tasks on the cpu pretend there is one of
-	 * average load so that when a new task gets to run here it will not
-	 * get delayed by group starvation.
-	 */
-	if (!rq_weight) {
-		boost = 1;
-		rq_weight = NICE_0_LOAD;
-	}
-
-	if (unlikely(rq_weight > sd_rq_weight))
-		rq_weight = sd_rq_weight;
+	rq_weight = tg->cfs_rq[cpu]->rq_weight;
 
 	/*
 	 * \Sum shares * rq_weight
@@ -1510,7 +1488,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
 	 *		\Sum rq_weight
 	 *
 	 */
-	shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+	shares = (sd_shares * rq_weight) / sd_rq_weight;
 	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
 
 	if (abs(shares - tg->se[cpu]->load.weight) >
@@ -1519,11 +1497,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
 		unsigned long flags;
 
 		spin_lock_irqsave(&rq->lock, flags);
-		/*
-		 * record the actual number of shares, not the boosted amount.
-		 */
-		tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-		tg->cfs_rq[cpu]->rq_weight = rq_weight;
+		tg->cfs_rq[cpu]->shares = shares;
 
 		__set_se_shares(tg->se[cpu], shares);
 		spin_unlock_irqrestore(&rq->lock, flags);
@@ -1537,13 +1511,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-	unsigned long rq_weight = 0;
+	unsigned long weight, rq_weight = 0;
 	unsigned long shares = 0;
 	struct sched_domain *sd = data;
 	int i;
 
 	for_each_cpu_mask(i, sd->span) {
-		rq_weight += tg->cfs_rq[i]->load.weight;
+		/*
+		 * If there are currently no tasks on the cpu pretend there
+		 * is one of average load so that when a new task gets to
+		 * run here it will not get delayed by group starvation.
+		 */
+		weight = tg->cfs_rq[i]->load.weight;
+		if (!weight)
+			weight = NICE_0_LOAD;
+
+		tg->cfs_rq[i]->rq_weight = weight;
+		rq_weight += weight;
 		shares += tg->cfs_rq[i]->shares;
 	}
 
@@ -1553,9 +1537,6 @@ static int tg_shares_up(struct task_group *tg, void *data)
 	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
 		shares = tg->shares;
 
-	if (!rq_weight)
-		rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
-
 	for_each_cpu_mask(i, sd->span)
 		update_group_shares_cpu(tg, i, shares, rq_weight);
 
@@ -1620,6 +1601,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 #endif
 
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+	__releases(this_rq->lock)
+	__acquires(busiest->lock)
+	__acquires(this_rq->lock)
+{
+	int ret = 0;
+
+	if (unlikely(!irqs_disabled())) {
+		/* printk() doesn't work good under rq->lock */
+		spin_unlock(&this_rq->lock);
+		BUG_ON(1);
+	}
+	if (unlikely(!spin_trylock(&busiest->lock))) {
+		if (busiest < this_rq) {
+			spin_unlock(&this_rq->lock);
+			spin_lock(&busiest->lock);
+			spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
+			ret = 1;
+		} else
+			spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
+	}
+	return ret;
+}
+
+static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+	__releases(busiest->lock)
+{
+	spin_unlock(&busiest->lock);
+	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+}
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -2264,6 +2278,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 
 	smp_wmb();
 	rq = task_rq_lock(p, &flags);
+	update_rq_clock(rq);
 	old_state = p->state;
 	if (!(old_state & state))
 		goto out;
@@ -2321,7 +2336,6 @@ out_activate:
 		schedstat_inc(p, se.nr_wakeups_local);
 	else
 		schedstat_inc(p, se.nr_wakeups_remote);
-	update_rq_clock(rq);
 	activate_task(rq, p, 1);
 	success = 1;
 
@@ -2822,40 +2836,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 }
 
 /*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
- */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
-	__releases(this_rq->lock)
-	__acquires(busiest->lock)
-	__acquires(this_rq->lock)
-{
-	int ret = 0;
-
-	if (unlikely(!irqs_disabled())) {
-		/* printk() doesn't work good under rq->lock */
-		spin_unlock(&this_rq->lock);
-		BUG_ON(1);
-	}
-	if (unlikely(!spin_trylock(&busiest->lock))) {
-		if (busiest < this_rq) {
-			spin_unlock(&this_rq->lock);
-			spin_lock(&busiest->lock);
-			spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
-			ret = 1;
-		} else
-			spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
-	}
-	return ret;
-}
-
-static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
-	__releases(busiest->lock)
-{
-	spin_unlock(&busiest->lock);
-	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
-}
-
-/*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
  * allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@ -3716,7 +3696,7 @@ out_balanced:
 static void idle_balance(int this_cpu, struct rq *this_rq)
 {
 	struct sched_domain *sd;
-	int pulled_task = -1;
+	int pulled_task = 0;
 	unsigned long next_balance = jiffies + HZ;
 	cpumask_t tmpmask;
 
@@ -6150,7 +6130,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
 
 /*
  * Figure out where task on dead CPU should go, use force if necessary.
- * NOTE: interrupts should be disabled by the caller
  */
 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 {
@@ -6662,28 +6641,6 @@ early_initcall(migration_init);
 
 #ifdef CONFIG_SCHED_DEBUG
 
-static inline const char *sd_level_to_string(enum sched_domain_level lvl)
-{
-	switch (lvl) {
-	case SD_LV_NONE:
-			return "NONE";
-	case SD_LV_SIBLING:
-			return "SIBLING";
-	case SD_LV_MC:
-			return "MC";
-	case SD_LV_CPU:
-			return "CPU";
-	case SD_LV_NODE:
-			return "NODE";
-	case SD_LV_ALLNODES:
-			return "ALLNODES";
-	case SD_LV_MAX:
-			return "MAX";
-
-	}
-	return "MAX";
-}
-
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				  cpumask_t *groupmask)
 {
@@ -6703,8 +6660,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 		return -1;
 	}
 
-	printk(KERN_CONT "span %s level %s\n",
-		str, sd_level_to_string(sd->level));
+	printk(KERN_CONT "span %s level %s\n", str, sd->name);
 
 	if (!cpu_isset(cpu, sd->span)) {
 		printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6840,6 +6796,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 				SD_BALANCE_EXEC |
 				SD_SHARE_CPUPOWER |
 				SD_SHARE_PKG_RESOURCES);
+		if (nr_node_ids == 1)
+			pflags &= ~SD_SERIALIZE;
 	}
 	if (~cflags & pflags)
 		return 0;
@@ -7360,13 +7318,21 @@ struct allmasks {
 };
 
 #if NR_CPUS > 128
-#define SCHED_CPUMASK_ALLOC		1
-#define SCHED_CPUMASK_FREE(v)		kfree(v)
-#define SCHED_CPUMASK_DECLARE(v)	struct allmasks *v
+#define SCHED_CPUMASK_DECLARE(v)	struct allmasks *v
+static inline void sched_cpumask_alloc(struct allmasks **masks)
+{
+	*masks = kmalloc(sizeof(**masks), GFP_KERNEL);
+}
+static inline void sched_cpumask_free(struct allmasks *masks)
+{
+	kfree(masks);
+}
 #else
-#define SCHED_CPUMASK_ALLOC		0
-#define SCHED_CPUMASK_FREE(v)
-#define SCHED_CPUMASK_DECLARE(v)	struct allmasks _v, *v = &_v
+#define SCHED_CPUMASK_DECLARE(v)	struct allmasks _v, *v = &_v
+static inline void sched_cpumask_alloc(struct allmasks **masks)
+{ }
+static inline void sched_cpumask_free(struct allmasks *masks)
+{ }
 #endif
 
 #define SCHED_CPUMASK_VAR(v, a)	cpumask_t *v = (cpumask_t *) \
@@ -7442,9 +7408,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		return -ENOMEM;
 	}
 
-#if SCHED_CPUMASK_ALLOC
 	/* get space for all scratch cpumask variables */
-	allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
+	sched_cpumask_alloc(&allmasks);
 	if (!allmasks) {
 		printk(KERN_WARNING "Cannot alloc cpumask array\n");
 		kfree(rd);
@@ -7453,7 +7418,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #endif
 		return -ENOMEM;
 	}
-#endif
+
 	tmpmask = (cpumask_t *)allmasks;
 
 
@@ -7707,13 +7672,13 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		cpu_attach_domain(sd, rd, i);
 	}
 
-	SCHED_CPUMASK_FREE((void *)allmasks);
+	sched_cpumask_free(allmasks);
 	return 0;
 
 #ifdef CONFIG_NUMA
 error:
 	free_sched_groups(cpu_map, tmpmask);
-	SCHED_CPUMASK_FREE((void *)allmasks);
+	sched_cpumask_free(allmasks);
 	kfree(rd);
 	return -ENOMEM;
 #endif
@@ -7736,8 +7701,14 @@ static struct sched_domain_attr *dattr_cur;
  */
 static cpumask_t fallback_doms;
 
-void __attribute__((weak)) arch_update_cpu_topology(void)
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * cpu core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+int __attribute__((weak)) arch_update_cpu_topology(void)
 {
+	return 0;
 }
 
 /*
@@ -7777,8 +7748,6 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
 	cpumask_t tmpmask;
 	int i;
 
-	unregister_sched_domain_sysctl();
-
 	for_each_cpu_mask_nr(i, *cpu_map)
 		cpu_attach_domain(NULL, &def_root_domain, i);
 	synchronize_sched();
@@ -7831,17 +7800,21 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 			     struct sched_domain_attr *dattr_new)
 {
 	int i, j, n;
+	int new_topology;
 
 	mutex_lock(&sched_domains_mutex);
 
 	/* always unregister in case we don't destroy any domains */
 	unregister_sched_domain_sysctl();
 
+	/* Let architecture update cpu core mappings. */
+	new_topology = arch_update_cpu_topology();
+
 	n = doms_new ? ndoms_new : 0;
 
 	/* Destroy deleted domains */
 	for (i = 0; i < ndoms_cur; i++) {
-		for (j = 0; j < n; j++) {
+		for (j = 0; j < n && !new_topology; j++) {
 			if (cpus_equal(doms_cur[i], doms_new[j])
 			    && dattrs_equal(dattr_cur, i, dattr_new, j))
 				goto match1;
@@ -7856,12 +7829,12 @@ match1:
 		ndoms_cur = 0;
 		doms_new = &fallback_doms;
 		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-		dattr_new = NULL;
+		WARN_ON_ONCE(dattr_new);
 	}
 
 	/* Build new domains */
 	for (i = 0; i < ndoms_new; i++) {
-		for (j = 0; j < ndoms_cur; j++) {
+		for (j = 0; j < ndoms_cur && !new_topology; j++) {
 			if (cpus_equal(doms_new[i], doms_cur[j])
 			    && dattrs_equal(dattr_new, i, dattr_cur, j))
 				goto match2;
@@ -8516,7 +8489,7 @@ static
 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	struct cfs_rq *cfs_rq;
-	struct sched_entity *se, *parent_se;
+	struct sched_entity *se;
 	struct rq *rq;
 	int i;
 
@@ -8532,18 +8505,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
 
-		cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
-				      GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
+				      GFP_KERNEL, cpu_to_node(i));
 		if (!cfs_rq)
 			goto err;
 
-		se = kmalloc_node(sizeof(struct sched_entity),
-				  GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		se = kzalloc_node(sizeof(struct sched_entity),
+				  GFP_KERNEL, cpu_to_node(i));
 		if (!se)
 			goto err;
 
-		parent_se = parent ? parent->se[i] : NULL;
-		init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
+		init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
 	}
 
 	return 1;
@@ -8604,7 +8576,7 @@ static
 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	struct rt_rq *rt_rq;
-	struct sched_rt_entity *rt_se, *parent_se;
+	struct sched_rt_entity *rt_se;
 	struct rq *rq;
 	int i;
 
@@ -8621,18 +8593,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
 
-		rt_rq = kmalloc_node(sizeof(struct rt_rq),
-				GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		rt_rq = kzalloc_node(sizeof(struct rt_rq),
+				GFP_KERNEL, cpu_to_node(i));
 		if (!rt_rq)
 			goto err;
 
-		rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
-				GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
+				GFP_KERNEL, cpu_to_node(i));
 		if (!rt_se)
 			goto err;
 
-		parent_se = parent ? parent->rt_se[i] : NULL;
-		init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
+		init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
 	}
 
 	return 1;
@@ -9275,11 +9246,12 @@ struct cgroup_subsys cpu_cgroup_subsys = {
  * (balbir@in.ibm.com).
  */
 
-/* track cpu usage of a group of tasks */
+/* track cpu usage of a group of tasks and its child groups */
 struct cpuacct {
 	struct cgroup_subsys_state css;
 	/* cpuusage holds pointer to a u64-type object on every cpu */
 	u64 *cpuusage;
+	struct cpuacct *parent;
 };
 
 struct cgroup_subsys cpuacct_subsys;
@@ -9313,6 +9285,9 @@ static struct cgroup_subsys_state *cpuacct_create(
 		return ERR_PTR(-ENOMEM);
 	}
 
+	if (cgrp->parent)
+		ca->parent = cgroup_ca(cgrp->parent);
+
 	return &ca->css;
 }
 
@@ -9326,6 +9301,41 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 	kfree(ca);
 }
 
+static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+{
+	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+	u64 data;
+
+#ifndef CONFIG_64BIT
+	/*
+	 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
+	 */
+	spin_lock_irq(&cpu_rq(cpu)->lock);
+	data = *cpuusage;
+	spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+	data = *cpuusage;
+#endif
+
+	return data;
+}
+
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+{
+	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+
+#ifndef CONFIG_64BIT
+	/*
+	 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
+	 */
+	spin_lock_irq(&cpu_rq(cpu)->lock);
+	*cpuusage = val;
+	spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+	*cpuusage = val;
+#endif
+}
+
 /* return total cpu usage (in nanoseconds) of a group */
 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
 {
@@ -9333,17 +9343,8 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
 	u64 totalcpuusage = 0;
 	int i;
 
-	for_each_possible_cpu(i) {
-		u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
-
-		/*
-		 * Take rq->lock to make 64-bit addition safe on 32-bit
-		 * platforms.
-		 */
-		spin_lock_irq(&cpu_rq(i)->lock);
-		totalcpuusage += *cpuusage;
-		spin_unlock_irq(&cpu_rq(i)->lock);
-	}
+	for_each_present_cpu(i)
+		totalcpuusage += cpuacct_cpuusage_read(ca, i);
 
 	return totalcpuusage;
 }
@@ -9360,23 +9361,39 @@ static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
 		goto out;
 	}
 
-	for_each_possible_cpu(i) {
-		u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
+	for_each_present_cpu(i)
+		cpuacct_cpuusage_write(ca, i, 0);
 
-		spin_lock_irq(&cpu_rq(i)->lock);
-		*cpuusage = 0;
-		spin_unlock_irq(&cpu_rq(i)->lock);
-	}
 out:
 	return err;
 }
 
+static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
+				   struct seq_file *m)
+{
+	struct cpuacct *ca = cgroup_ca(cgroup);
+	u64 percpu;
+	int i;
+
+	for_each_present_cpu(i) {
+		percpu = cpuacct_cpuusage_read(ca, i);
+		seq_printf(m, "%llu ", (unsigned long long) percpu);
+	}
+	seq_printf(m, "\n");
+	return 0;
+}
+
 static struct cftype files[] = {
 	{
 		.name = "usage",
 		.read_u64 = cpuusage_read,
 		.write_u64 = cpuusage_write,
 	},
+	{
+		.name = "usage_percpu",
+		.read_seq_string = cpuacct_percpu_seq_read,
+	},
+
 };
 
 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -9392,14 +9409,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
 static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 {
 	struct cpuacct *ca;
+	int cpu;
 
 	if (!cpuacct_subsys.active)
 		return;
 
+	cpu = task_cpu(tsk);
 	ca = task_ca(tsk);
-	if (ca) {
-		u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
 
+	for (; ca; ca = ca->parent) {
+		u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
 		*cpuusage += cputime;
 	}
 }
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 26ed8e3d1c15..4293cfa9681d 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -53,6 +53,40 @@ static unsigned long nsec_low(unsigned long long nsec)
 
 #define SPLIT_NS(x) nsec_high(x), nsec_low(x)
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void print_cfs_group_stats(struct seq_file *m, int cpu,
+		struct task_group *tg)
+{
+	struct sched_entity *se = tg->se[cpu];
+	if (!se)
+		return;
+
+#define P(F) \
+	SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)F)
+#define PN(F) \
+	SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
+
+	PN(se->exec_start);
+	PN(se->vruntime);
+	PN(se->sum_exec_runtime);
+#ifdef CONFIG_SCHEDSTATS
+	PN(se->wait_start);
+	PN(se->sleep_start);
+	PN(se->block_start);
+	PN(se->sleep_max);
+	PN(se->block_max);
+	PN(se->exec_max);
+	PN(se->slice_max);
+	PN(se->wait_max);
+	PN(se->wait_sum);
+	P(se->wait_count);
+#endif
+	P(se->load.weight);
+#undef PN
+#undef P
+}
+#endif
+
 static void
 print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 {
@@ -121,20 +155,19 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 
 #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
 	char path[128] = "";
-	struct cgroup *cgroup = NULL;
 	struct task_group *tg = cfs_rq->tg;
 
-	if (tg)
-		cgroup = tg->css.cgroup;
-
-	if (cgroup)
-		cgroup_path(cgroup, path, sizeof(path));
+	cgroup_path(tg->css.cgroup, path, sizeof(path));
 
 	SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
+#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
+	{
+		uid_t uid = cfs_rq->tg->uid;
+		SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
+	}
 #else
 	SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
 #endif
-
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "exec_clock",
 			SPLIT_NS(cfs_rq->exec_clock));
 
@@ -168,6 +201,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #ifdef CONFIG_SMP
 	SEQ_printf(m, "  .%-30s: %lu\n", "shares", cfs_rq->shares);
 #endif
+	print_cfs_group_stats(m, cpu, cfs_rq->tg);
 #endif
 }
 
@@ -175,14 +209,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 {
 #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
 	char path[128] = "";
-	struct cgroup *cgroup = NULL;
 	struct task_group *tg = rt_rq->tg;
 
-	if (tg)
-		cgroup = tg->css.cgroup;
-
-	if (cgroup)
-		cgroup_path(cgroup, path, sizeof(path));
+	cgroup_path(tg->css.cgroup, path, sizeof(path));
 
 	SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
 #else
@@ -272,7 +301,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
 	u64 now = ktime_to_ns(ktime_get());
 	int cpu;
 
-	SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n",
+	SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n",
 		init_utsname()->release,
 		(int)strcspn(init_utsname()->version, " "),
 		init_utsname()->version);
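
The sched_debug.c changes above add per-group scheduling-entity statistics to
each cfs_rq section of /proc/sched_debug (and, under CONFIG_USER_SCHED, label
each section with the owning UID), which is why the version string is bumped
to v0.08. One illustrative way to glance at the new per-group output on a
kernel with CONFIG_SCHED_DEBUG enabled:

# grep -A 3 "cfs_rq" /proc/sched_debug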
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 98345e45b059..5ad4440f0fc4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -492,6 +492,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
 	 * overflow on 32 bits):
 	 */
 	delta_exec = (unsigned long)(now - curr->exec_start);
+	if (!delta_exec)
+		return;
 
 	__update_curr(cfs_rq, curr, delta_exec);
 	curr->exec_start = now;
@@ -1345,12 +1347,11 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 {
 	struct task_struct *curr = rq->curr;
 	struct sched_entity *se = &curr->se, *pse = &p->se;
+	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 
-	if (unlikely(rt_prio(p->prio))) {
-		struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+	update_curr(cfs_rq);
 
-		update_rq_clock(rq);
-		update_curr(cfs_rq);
+	if (unlikely(rt_prio(p->prio))) {
 		resched_task(curr);
 		return;
 	}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d9ba9d5f99d6..51d2af3e6191 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -77,7 +77,7 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 }
 
 #define for_each_leaf_rt_rq(rt_rq, rq) \
-	list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
+	list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
 
 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 {
@@ -537,13 +537,13 @@ static void update_curr_rt(struct rq *rq)
 	for_each_sched_rt_entity(rt_se) {
 		rt_rq = rt_rq_of_se(rt_se);
 
-		spin_lock(&rt_rq->rt_runtime_lock);
 		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
+			spin_lock(&rt_rq->rt_runtime_lock);
 			rt_rq->rt_time += delta_exec;
 			if (sched_rt_runtime_exceeded(rt_rq))
 				resched_task(curr);
+			spin_unlock(&rt_rq->rt_runtime_lock);
 		}
-		spin_unlock(&rt_rq->rt_runtime_lock);
 	}
 }
 
@@ -909,9 +909,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 /* Only try algorithms three times */
 #define RT_MAX_TRIES 3
 
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
-static void double_unlock_balance(struct rq *this_rq, struct rq *busiest);
-
 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
 
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 7dbf72a2b02c..3b01098164c8 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -31,7 +31,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
 		    rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
 		    rq->sched_switch, rq->sched_count, rq->sched_goidle,
 		    rq->ttwu_count, rq->ttwu_local,
-		    rq->rq_sched_info.cpu_time,
+		    rq->rq_cpu_time,
 		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
 
 		seq_printf(seq, "\n");
@@ -123,7 +123,7 @@ static inline void
 rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 {
 	if (rq)
-		rq->rq_sched_info.cpu_time += delta;
+		rq->rq_cpu_time += delta;
 }
 
 static inline void
@@ -236,7 +236,6 @@ static inline void sched_info_depart(struct task_struct *t)
 	unsigned long long delta = task_rq(t)->clock -
 					t->sched_info.last_arrival;
 
-	t->sched_info.cpu_time += delta;
 	rq_sched_info_depart(task_rq(t), delta);
 
 	if (t->state == TASK_RUNNING)
diff --git a/kernel/user.c b/kernel/user.c
index 6608a3d8ca61..477b6660f447 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -104,6 +104,8 @@ static int sched_create_user(struct user_struct *up)
 	if (IS_ERR(up->tg))
 		rc = -ENOMEM;
 
+	set_tg_uid(up);
+
 	return rc;
 }
 