Diffstat (limited to 'kernel/sched.c')
 kernel/sched.c | 304 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 210 insertions(+), 94 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 98dcdf272db3..c6e551de795b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -209,9 +209,8 @@ static inline struct task_group *task_group(struct task_struct *p)
 	tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
 			  struct task_group, css);
 #else
 	tg = &init_task_group;
 #endif
-
 	return tg;
 }
 
@@ -249,15 +248,16 @@ struct cfs_rq {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
 
-	/* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
+	/*
+	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
 	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
 	 * (like users, containers etc.)
 	 *
 	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
 	 * list is used during load balance.
 	 */
-	struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
+	struct list_head leaf_cfs_rq_list;
 	struct task_group *tg;	/* group that "owns" this runqueue */
 #endif
 };
 
@@ -300,7 +300,7 @@ struct rq {
 	/* list of leaf cfs_rq on this cpu: */
 	struct list_head leaf_cfs_rq_list;
 #endif
-	struct rt_rq rt;
+	struct rt_rq rt;
 
 	/*
 	 * This is part of a global counter where only the total sum
@@ -457,8 +457,8 @@ enum {
 	SCHED_FEAT_NEW_FAIR_SLEEPERS	= 1,
 	SCHED_FEAT_WAKEUP_PREEMPT	= 2,
 	SCHED_FEAT_START_DEBIT		= 4,
-	SCHED_FEAT_TREE_AVG		= 8,
-	SCHED_FEAT_APPROX_AVG		= 16,
+	SCHED_FEAT_TREE_AVG		= 8,
+	SCHED_FEAT_APPROX_AVG		= 16,
 };
 
 const_debug unsigned int sysctl_sched_features =
@@ -488,7 +488,12 @@ unsigned long long cpu_clock(int cpu)
 
 	local_irq_save(flags);
 	rq = cpu_rq(cpu);
-	update_rq_clock(rq);
+	/*
+	 * Only call sched_clock() if the scheduler has already been
+	 * initialized (some code might call cpu_clock() very early):
+	 */
+	if (rq->idle)
+		update_rq_clock(rq);
 	now = rq->clock;
 	local_irq_restore(flags);
 
@@ -591,7 +596,7 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
 
 /*
  * task_rq_lock - lock the runqueue a given task resides on and disable
- * interrupts.  Note the ordering: we can safely lookup the task_rq without
+ * interrupts. Note the ordering: we can safely lookup the task_rq without
  * explicitly disabling preemption.
  */
 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
@@ -779,7 +784,7 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
  * To aid in avoiding the subversion of "niceness" due to uneven distribution
  * of tasks with abnormal "nice" values across CPUs the contribution that
  * each task makes to its run queue's load is weighted according to its
- * scheduling class and "nice" value.  For SCHED_NORMAL tasks this is just a
+ * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
  * scaled version of the new time slice allocation that they receive on time
  * slice expiry etc.
  */
@@ -854,6 +859,12 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		   struct rq_iterator *iterator);
 #endif
 
+#ifdef CONFIG_CGROUP_CPUACCT
+static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+#else
+static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+#endif
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -1848,7 +1859,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
  * and do any other architecture-specific cleanup actions.
  *
  * Note that we may have delayed dropping an mm in context_switch(). If
- * so, we finish that here outside of the runqueue lock.  (Doing it
+ * so, we finish that here outside of the runqueue lock. (Doing it
  * with the lock held can cause deadlocks; see schedule() for
  * details.)
  */
@@ -2130,7 +2141,7 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
 /*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
- * allow dest_cpu, which will force the cpu onto dest_cpu.  Then
+ * allow dest_cpu, which will force the cpu onto dest_cpu. Then
  * the cpu_allowed mask is restored.
  */
 static void sched_migrate_task(struct task_struct *p, int dest_cpu)
@@ -2575,7 +2586,7 @@ group_next:
 	 * tasks around. Thus we look for the minimum possible imbalance.
 	 * Negative imbalances (*we* are more loaded than anyone else) will
 	 * be counted as no imbalance for these purposes -- we can't fix that
-	 * by pulling tasks to us.  Be careful of negative numbers as they'll
+	 * by pulling tasks to us. Be careful of negative numbers as they'll
 	 * appear as very large values with unsigned longs.
 	 */
 	if (max_load <= busiest_load_per_task)
@@ -3010,7 +3021,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 
 	/*
 	 * This condition is "impossible", if it occurs
-	 * we need to fix it.  Originally reported by
+	 * we need to fix it. Originally reported by
 	 * Bjorn Helgaas on a 128-cpu setup.
 	 */
 	BUG_ON(busiest_rq == target_rq);
@@ -3042,7 +3053,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 #ifdef CONFIG_NO_HZ
 static struct {
 	atomic_t load_balancer;
-	cpumask_t cpu_mask;
+	cpumask_t cpu_mask;
 } nohz ____cacheline_aligned = {
 	.load_balancer = ATOMIC_INIT(-1),
 	.cpu_mask = CPU_MASK_NONE,
@@ -3546,7 +3557,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
 static inline void schedule_debug(struct task_struct *prev)
 {
 	/*
-	 * Test if we are atomic.  Since do_exit() needs to call into
+	 * Test if we are atomic. Since do_exit() needs to call into
 	 * schedule() atomically, we ignore that path for now.
 	 * Otherwise, whine if we are scheduling when we should not be.
 	 */
@@ -3668,7 +3679,7 @@ EXPORT_SYMBOL(schedule);
 #ifdef CONFIG_PREEMPT
 /*
  * this is the entry point to schedule() from in-kernel preemption
- * off of preempt_enable.  Kernel preemptions off return from interrupt
+ * off of preempt_enable. Kernel preemptions off return from interrupt
  * occur there and call schedule directly.
  */
 asmlinkage void __sched preempt_schedule(void)
@@ -3680,7 +3691,7 @@ asmlinkage void __sched preempt_schedule(void)
 #endif
 	/*
 	 * If there is a non-zero preempt_count or interrupts are disabled,
-	 * we do not want to preempt the current task.  Just return..
+	 * we do not want to preempt the current task. Just return..
 	 */
 	if (likely(ti->preempt_count || irqs_disabled()))
 		return;
@@ -3766,12 +3777,12 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
 EXPORT_SYMBOL(default_wake_function);
 
 /*
- * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
- * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
+ * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
+ * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
  * number) then we wake all the non-exclusive tasks and one exclusive task.
  *
  * There are circumstances in which we can try to wake a task which has already
- * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
+ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
@@ -4384,8 +4395,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
  * @policy: new policy.
  * @param: structure containing the new RT priority.
  */
-asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
-				       struct sched_param __user *param)
+asmlinkage long
+sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 {
 	/* negative values for policy are not valid */
 	if (policy < 0)
@@ -4485,7 +4496,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
 
 	/*
 	 * It is not safe to call set_cpus_allowed with the
-	 * tasklist_lock held.  We will bump the task_struct's
+	 * tasklist_lock held. We will bump the task_struct's
 	 * usage count and then drop tasklist_lock.
 	 */
 	get_task_struct(p);
@@ -4681,7 +4692,7 @@ EXPORT_SYMBOL(cond_resched);
  * cond_resched_lock() - if a reschedule is pending, drop the given lock,
  * call schedule, and on return reacquire the lock.
  *
- * This works OK both with and without CONFIG_PREEMPT.  We do strange low-level
+ * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
  * operations here to prevent schedule() from being called twice (once via
  * spin_unlock(), once by hand).
  */
@@ -4735,7 +4746,7 @@ void __sched yield(void)
 EXPORT_SYMBOL(yield);
 
 /*
- * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so
  * that process accounting knows that this is a task in IO wait state.
  *
  * But don't do that if it is a deliberate, throttling IO wait (this task
@@ -4844,17 +4855,21 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
 	if (retval)
 		goto out_unlock;
 
-	if (p->policy == SCHED_FIFO)
-		time_slice = 0;
-	else if (p->policy == SCHED_RR)
+	/*
+	 * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER
+	 * tasks that are on an otherwise idle runqueue:
+	 */
+	time_slice = 0;
+	if (p->policy == SCHED_RR) {
 		time_slice = DEF_TIMESLICE;
-	else {
+	} else {
 		struct sched_entity *se = &p->se;
 		unsigned long flags;
 		struct rq *rq;
 
 		rq = task_rq_lock(p, &flags);
-		time_slice = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
+		if (rq->cfs.load.weight)
+			time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
 		task_rq_unlock(rq, &flags);
 	}
 	read_unlock(&tasklist_lock);
@@ -5040,7 +5055,7 @@ static inline void sched_init_granularity(void)
  * is removed from the allowed bitmask.
  *
  * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely.  The
+ * task must not exit() & deallocate itself prematurely. The
  * call is not atomic; no spinlocks may be held.
  */
 int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
@@ -5077,7 +5092,7 @@ out:
 EXPORT_SYMBOL_GPL(set_cpus_allowed);
 
 /*
- * Move (not current) task off this cpu, onto dest cpu.  We're doing
+ * Move (not current) task off this cpu, onto dest cpu. We're doing
  * this because either it can't run here any more (set_cpus_allowed()
  * away from this CPU, or CPU going down), or because we're
  * attempting to rebalance this task on exec (sched_exec).
@@ -5222,7 +5237,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 			 * Try to stay on the same cpuset, where the
 			 * current cpuset may be a subset of all cpus.
 			 * The cpuset_cpus_allowed_locked() variant of
-			 * cpuset_cpus_allowed() will not block.  It must be
+			 * cpuset_cpus_allowed() will not block. It must be
 			 * called within calls to cpuset_lock/cpuset_unlock.
 			 */
 			rq = task_rq_lock(p, &flags);
@@ -5235,10 +5250,11 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 			 * kernel threads (both mm NULL), since they never
 			 * leave kernel.
 			 */
-			if (p->mm && printk_ratelimit())
+			if (p->mm && printk_ratelimit()) {
 				printk(KERN_INFO "process %d (%s) no "
 				       "longer affine to cpu%d\n",
 				       task_pid_nr(p), p->comm, dead_cpu);
+			}
 		}
 	} while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
 }
@@ -5340,7 +5356,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
 
 	/*
 	 * Drop lock around migration; if someone else moves it,
-	 * that's OK.  No task can be added to this CPU, so iteration is
+	 * that's OK. No task can be added to this CPU, so iteration is
 	 * fine.
 	 */
 	spin_unlock_irq(&rq->lock);
@@ -5404,7 +5420,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
 	/*
 	 * In the intermediate directories, both the child directory and
 	 * procname are dynamically allocated and could fail but the mode
-	 * will always be set.  In the lowest directory the names are
+	 * will always be set. In the lowest directory the names are
 	 * static strings and all have proc handlers.
 	 */
 	for (entry = *tablep; entry->mode; entry++) {
@@ -5575,7 +5591,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	case CPU_UP_CANCELED_FROZEN:
 		if (!cpu_rq(cpu)->migration_thread)
 			break;
-		/* Unbind it from offline cpu so it can run.  Fall thru. */
+		/* Unbind it from offline cpu so it can run. Fall thru. */
 		kthread_bind(cpu_rq(cpu)->migration_thread,
 			     any_online_cpu(cpu_online_map));
 		kthread_stop(cpu_rq(cpu)->migration_thread);
@@ -5602,9 +5618,11 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		migrate_nr_uninterruptible(rq);
 		BUG_ON(rq->nr_running != 0);
 
-		/* No need to migrate the tasks: it was best-effort if
-		 * they didn't take sched_hotcpu_mutex. Just wake up
-		 * the requestors. */
+		/*
+		 * No need to migrate the tasks: it was best-effort if
+		 * they didn't take sched_hotcpu_mutex. Just wake up
+		 * the requestors.
+		 */
 		spin_lock_irq(&rq->lock);
 		while (!list_empty(&rq->migration_queue)) {
 			struct migration_req *req;
@@ -5912,7 +5930,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
  * @node: node whose sched_domain we're building
  * @used_nodes: nodes already in the sched_domain
  *
- * Find the next node to include in a given scheduling domain.  Simply
+ * Find the next node to include in a given scheduling domain. Simply
  * finds the closest node not already in the @used_nodes map.
  *
  * Should use nodemask_t.
@@ -5952,7 +5970,7 @@ static int find_next_best_node(int node, unsigned long *used_nodes)
  * @node: node whose cpumask we're constructing
  * @size: number of nodes to include in this span
  *
- * Given a node, construct a good cpumask for its sched_domain to span.  It
+ * Given a node, construct a good cpumask for its sched_domain to span. It
  * should be one that prevents unnecessary balancing, but also spreads tasks
  * out optimally.
  */
@@ -5989,8 +6007,8 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
 static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
 
-static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
-			    struct sched_group **sg)
+static int
+cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
 {
 	if (sg)
 		*sg = &per_cpu(sched_group_cpus, cpu);
@@ -6007,8 +6025,8 @@ static DEFINE_PER_CPU(struct sched_group, sched_group_core);
 #endif
 
 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
-static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
-			     struct sched_group **sg)
+static int
+cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
 {
 	int group;
 	cpumask_t mask = per_cpu(cpu_sibling_map, cpu);
@@ -6019,8 +6037,8 @@ static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
 	return group;
 }
 #elif defined(CONFIG_SCHED_MC)
-static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
-			     struct sched_group **sg)
+static int
+cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
 {
 	if (sg)
 		*sg = &per_cpu(sched_group_core, cpu);
@@ -6031,8 +6049,8 @@ static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
 static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
 
-static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
-			     struct sched_group **sg)
+static int
+cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
 {
 	int group;
 #ifdef CONFIG_SCHED_MC
@@ -6212,7 +6230,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 	 * Allocate the per-node list of sched groups
 	 */
 	sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *),
-					   GFP_KERNEL);
+				    GFP_KERNEL);
 	if (!sched_group_nodes) {
 		printk(KERN_WARNING "Can not alloc sched group node list\n");
 		return -ENOMEM;
@@ -6459,7 +6477,7 @@ static int ndoms_cur; /* number of sched domains in 'doms_cur' */
 static cpumask_t fallback_doms;
 
 /*
- * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
+ * Set up scheduler domains and groups. Callers must hold the hotplug lock.
  * For now this just excludes isolated cpus, but could be used to
  * exclude other special cases in the future.
  */
@@ -6501,19 +6519,19 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
 
 /*
  * Partition sched domains as specified by the 'ndoms_new'
- * cpumasks in the array doms_new[] of cpumasks.  This compares
+ * cpumasks in the array doms_new[] of cpumasks. This compares
  * doms_new[] to the current sched domain partitioning, doms_cur[].
  * It destroys each deleted domain and builds each new domain.
  *
  * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
- * The masks don't intersect (don't overlap.)  We should setup one
- * sched domain for each mask.  CPUs not in any of the cpumasks will
- * not be load balanced.  If the same cpumask appears both in the
+ * The masks don't intersect (don't overlap.) We should setup one
+ * sched domain for each mask. CPUs not in any of the cpumasks will
+ * not be load balanced. If the same cpumask appears both in the
  * current 'doms_cur' domains and in the new 'doms_new', we can leave
  * it as it is.
  *
- * The passed in 'doms_new' should be kmalloc'd.  This routine takes
- * ownership of it and will kfree it when done with it.  If the caller
+ * The passed in 'doms_new' should be kmalloc'd. This routine takes
+ * ownership of it and will kfree it when done with it. If the caller
  * failed the kmalloc call, then it can pass in doms_new == NULL,
  * and partition_sched_domains() will fallback to the single partition
  * 'fallback_doms'.
@@ -6643,7 +6661,7 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
 #endif
 
 /*
- * Force a reinitialization of the sched domains hierarchy.  The domains
+ * Force a reinitialization of the sched domains hierarchy. The domains
  * and groups cannot be updated in place without racing with the balancing
  * code, so we temporarily attach all running cpus to the NULL domain
  * which will prevent rebalancing while the sched domains are recalculated.
@@ -6933,8 +6951,8 @@ struct task_struct *curr_task(int cpu)
  * @p: the task pointer to set.
  *
  * Description: This function must only be used when non-maskable interrupts
- * are serviced on a separate stack.  It allows the architecture to switch the
- * notion of the current task on a cpu in a non-blocking manner.  This function
+ * are serviced on a separate stack. It allows the architecture to switch the
+ * notion of the current task on a cpu in a non-blocking manner. This function
  * must be called with all CPU's synchronized, and interrupts disabled, the
  * and caller must save the original value of the current task (see
  * curr_task() above) and restore that value before reenabling interrupts and
@@ -7183,16 +7201,17 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
 	return &tg->css;
 }
 
-static void cpu_cgroup_destroy(struct cgroup_subsys *ss,
-			       struct cgroup *cgrp)
+static void
+cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
 	struct task_group *tg = cgroup_tg(cgrp);
 
 	sched_destroy_group(tg);
 }
 
-static int cpu_cgroup_can_attach(struct cgroup_subsys *ss,
-		struct cgroup *cgrp, struct task_struct *tsk)
+static int
+cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+		      struct task_struct *tsk)
 {
 	/* We don't support RT-tasks being in separate groups */
 	if (tsk->sched_class != &fair_sched_class)
@@ -7221,38 +7240,12 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
 	return (u64) tg->shares;
 }
 
-static u64 cpu_usage_read(struct cgroup *cgrp, struct cftype *cft)
-{
-	struct task_group *tg = cgroup_tg(cgrp);
-	unsigned long flags;
-	u64 res = 0;
-	int i;
-
-	for_each_possible_cpu(i) {
-		/*
-		 * Lock to prevent races with updating 64-bit counters
-		 * on 32-bit arches.
-		 */
-		spin_lock_irqsave(&cpu_rq(i)->lock, flags);
-		res += tg->se[i]->sum_exec_runtime;
-		spin_unlock_irqrestore(&cpu_rq(i)->lock, flags);
-	}
-	/* Convert from ns to ms */
-	do_div(res, NSEC_PER_MSEC);
-
-	return res;
-}
-
 static struct cftype cpu_files[] = {
 	{
 		.name = "shares",
 		.read_uint = cpu_shares_read_uint,
 		.write_uint = cpu_shares_write_uint,
 	},
-	{
-		.name = "usage",
-		.read_uint = cpu_usage_read,
-	},
 };
 
 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -7272,3 +7265,126 @@ struct cgroup_subsys cpu_cgroup_subsys = {
 };
 
 #endif	/* CONFIG_FAIR_CGROUP_SCHED */
+
+#ifdef CONFIG_CGROUP_CPUACCT
+
+/*
+ * CPU accounting code for task groups.
+ *
+ * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
+ * (balbir@in.ibm.com).
+ */
+
+/* track cpu usage of a group of tasks */
+struct cpuacct {
+	struct cgroup_subsys_state css;
+	/* cpuusage holds pointer to a u64-type object on every cpu */
+	u64 *cpuusage;
+};
+
+struct cgroup_subsys cpuacct_subsys;
+
+/* return cpu accounting group corresponding to this container */
+static inline struct cpuacct *cgroup_ca(struct cgroup *cont)
+{
+	return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id),
+			    struct cpuacct, css);
+}
+
+/* return cpu accounting group to which this task belongs */
+static inline struct cpuacct *task_ca(struct task_struct *tsk)
+{
+	return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
+			    struct cpuacct, css);
+}
+
+/* create a new cpu accounting group */
+static struct cgroup_subsys_state *cpuacct_create(
+	struct cgroup_subsys *ss, struct cgroup *cont)
+{
+	struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+
+	if (!ca)
+		return ERR_PTR(-ENOMEM);
+
+	ca->cpuusage = alloc_percpu(u64);
+	if (!ca->cpuusage) {
+		kfree(ca);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return &ca->css;
+}
+
+/* destroy an existing cpu accounting group */
+static void
+cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+	struct cpuacct *ca = cgroup_ca(cont);
+
+	free_percpu(ca->cpuusage);
+	kfree(ca);
+}
+
+/* return total cpu usage (in nanoseconds) of a group */
+static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft)
+{
+	struct cpuacct *ca = cgroup_ca(cont);
+	u64 totalcpuusage = 0;
+	int i;
+
+	for_each_possible_cpu(i) {
+		u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
+
+		/*
+		 * Take rq->lock to make 64-bit addition safe on 32-bit
+		 * platforms.
+		 */
+		spin_lock_irq(&cpu_rq(i)->lock);
+		totalcpuusage += *cpuusage;
+		spin_unlock_irq(&cpu_rq(i)->lock);
+	}
+
+	return totalcpuusage;
+}
+
+static struct cftype files[] = {
+	{
+		.name = "usage",
+		.read_uint = cpuusage_read,
+	},
+};
+
+static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+	return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
+}
+
+/*
+ * charge this task's execution time to its accounting group.
+ *
+ * called with rq->lock held.
+ */
+static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
+{
+	struct cpuacct *ca;
+
+	if (!cpuacct_subsys.active)
+		return;
+
+	ca = task_ca(tsk);
+	if (ca) {
+		u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
+
+		*cpuusage += cputime;
+	}
+}
+
+struct cgroup_subsys cpuacct_subsys = {
+	.name = "cpuacct",
+	.create = cpuacct_create,
+	.destroy = cpuacct_destroy,
+	.populate = cpuacct_populate,
+	.subsys_id = cpuacct_subsys_id,
+};
+#endif	/* CONFIG_CGROUP_CPUACCT */