Diffstat (limited to 'kernel/sched.c')
-rw-r--r--   kernel/sched.c | 304
1 files changed, 210 insertions, 94 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 98dcdf272db3..c6e551de795b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -209,9 +209,8 @@ static inline struct task_group *task_group(struct task_struct *p)
         tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
                           struct task_group, css);
 #else
         tg = &init_task_group;
 #endif
-
         return tg;
 }
 
@@ -249,15 +248,16 @@ struct cfs_rq {
 #ifdef CONFIG_FAIR_GROUP_SCHED
         struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */
 
-        /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
+        /*
+         * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
          * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
          * (like users, containers etc.)
          *
          * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
          * list is used during load balance.
          */
-        struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
+        struct list_head leaf_cfs_rq_list;
         struct task_group *tg; /* group that "owns" this runqueue */
 #endif
 };
 
@@ -300,7 +300,7 @@ struct rq {
         /* list of leaf cfs_rq on this cpu: */
         struct list_head leaf_cfs_rq_list;
 #endif
-        struct rt_rq    rt;
+        struct rt_rq rt;
 
         /*
          * This is part of a global counter where only the total sum
@@ -457,8 +457,8 @@ enum {
         SCHED_FEAT_NEW_FAIR_SLEEPERS    = 1,
         SCHED_FEAT_WAKEUP_PREEMPT       = 2,
         SCHED_FEAT_START_DEBIT          = 4,
-        SCHED_FEAT_TREE_AVG             = 8,
-        SCHED_FEAT_APPROX_AVG           = 16,
+        SCHED_FEAT_TREE_AVG     = 8,
+        SCHED_FEAT_APPROX_AVG   = 16,
 };
 
 const_debug unsigned int sysctl_sched_features =
@@ -488,7 +488,12 @@ unsigned long long cpu_clock(int cpu)
 
         local_irq_save(flags);
         rq = cpu_rq(cpu);
-        update_rq_clock(rq);
+        /*
+         * Only call sched_clock() if the scheduler has already been
+         * initialized (some code might call cpu_clock() very early):
+         */
+        if (rq->idle)
+                update_rq_clock(rq);
         now = rq->clock;
         local_irq_restore(flags);
 
@@ -591,7 +596,7 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
 
 /*
  * task_rq_lock - lock the runqueue a given task resides on and disable
- * interrupts.  Note the ordering: we can safely lookup the task_rq without
+ * interrupts. Note the ordering: we can safely lookup the task_rq without
  * explicitly disabling preemption.
  */
 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
@@ -779,7 +784,7 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
  * To aid in avoiding the subversion of "niceness" due to uneven distribution
  * of tasks with abnormal "nice" values across CPUs the contribution that
  * each task makes to its run queue's load is weighted according to its
- * scheduling class and "nice" value.  For SCHED_NORMAL tasks this is just a
+ * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
  * scaled version of the new time slice allocation that they receive on time
  * slice expiry etc.
  */
@@ -854,6 +859,12 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
                    struct rq_iterator *iterator);
 #endif
 
+#ifdef CONFIG_CGROUP_CPUACCT
+static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+#else
+static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+#endif
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -1848,7 +1859,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
  * and do any other architecture-specific cleanup actions.
  *
  * Note that we may have delayed dropping an mm in context_switch(). If
- * so, we finish that here outside of the runqueue lock.  (Doing it
+ * so, we finish that here outside of the runqueue lock. (Doing it
  * with the lock held can cause deadlocks; see schedule() for
  * details.)
  */
@@ -2130,7 +2141,7 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
 /*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
- * allow dest_cpu, which will force the cpu onto dest_cpu.  Then
+ * allow dest_cpu, which will force the cpu onto dest_cpu. Then
  * the cpu_allowed mask is restored.
  */
 static void sched_migrate_task(struct task_struct *p, int dest_cpu)
@@ -2575,7 +2586,7 @@ group_next:
          * tasks around. Thus we look for the minimum possible imbalance.
          * Negative imbalances (*we* are more loaded than anyone else) will
          * be counted as no imbalance for these purposes -- we can't fix that
-         * by pulling tasks to us.  Be careful of negative numbers as they'll
+         * by pulling tasks to us. Be careful of negative numbers as they'll
          * appear as very large values with unsigned longs.
          */
         if (max_load <= busiest_load_per_task)
@@ -3010,7 +3021,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 
         /*
          * This condition is "impossible", if it occurs
-         * we need to fix it.  Originally reported by
+         * we need to fix it. Originally reported by
          * Bjorn Helgaas on a 128-cpu setup.
          */
         BUG_ON(busiest_rq == target_rq);
@@ -3042,7 +3053,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 #ifdef CONFIG_NO_HZ
 static struct {
         atomic_t load_balancer;
-        cpumask_t  cpu_mask;
+        cpumask_t cpu_mask;
 } nohz ____cacheline_aligned = {
         .load_balancer = ATOMIC_INIT(-1),
         .cpu_mask = CPU_MASK_NONE,
@@ -3546,7 +3557,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
 static inline void schedule_debug(struct task_struct *prev)
 {
         /*
-         * Test if we are atomic.  Since do_exit() needs to call into
+         * Test if we are atomic. Since do_exit() needs to call into
          * schedule() atomically, we ignore that path for now.
          * Otherwise, whine if we are scheduling when we should not be.
          */
@@ -3668,7 +3679,7 @@ EXPORT_SYMBOL(schedule);
 #ifdef CONFIG_PREEMPT
 /*
  * this is the entry point to schedule() from in-kernel preemption
- * off of preempt_enable.  Kernel preemptions off return from interrupt
+ * off of preempt_enable. Kernel preemptions off return from interrupt
  * occur there and call schedule directly.
  */
 asmlinkage void __sched preempt_schedule(void)
@@ -3680,7 +3691,7 @@ asmlinkage void __sched preempt_schedule(void)
 #endif
         /*
          * If there is a non-zero preempt_count or interrupts are disabled,
-         * we do not want to preempt the current task.  Just return..
+         * we do not want to preempt the current task. Just return..
          */
         if (likely(ti->preempt_count || irqs_disabled()))
                 return;
@@ -3766,12 +3777,12 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
 EXPORT_SYMBOL(default_wake_function);
 
 /*
- * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
- * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
+ * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
+ * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
  * number) then we wake all the non-exclusive tasks and one exclusive task.
  *
  * There are circumstances in which we can try to wake a task which has already
- * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
+ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
@@ -4384,8 +4395,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
  * @policy: new policy.
  * @param: structure containing the new RT priority.
  */
-asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
-                                       struct sched_param __user *param)
+asmlinkage long
+sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 {
         /* negative values for policy are not valid */
         if (policy < 0)
@@ -4485,7 +4496,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
 
         /*
          * It is not safe to call set_cpus_allowed with the
-         * tasklist_lock held.  We will bump the task_struct's
+         * tasklist_lock held. We will bump the task_struct's
          * usage count and then drop tasklist_lock.
          */
         get_task_struct(p);
@@ -4681,7 +4692,7 @@ EXPORT_SYMBOL(cond_resched);
  * cond_resched_lock() - if a reschedule is pending, drop the given lock,
  * call schedule, and on return reacquire the lock.
  *
- * This works OK both with and without CONFIG_PREEMPT.  We do strange low-level
+ * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
  * operations here to prevent schedule() from being called twice (once via
  * spin_unlock(), once by hand).
  */
@@ -4735,7 +4746,7 @@ void __sched yield(void)
 EXPORT_SYMBOL(yield);
 
 /*
- * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so
  * that process accounting knows that this is a task in IO wait state.
  *
  * But don't do that if it is a deliberate, throttling IO wait (this task
@@ -4844,17 +4855,21 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
         if (retval)
                 goto out_unlock;
 
-        if (p->policy == SCHED_FIFO)
-                time_slice = 0;
-        else if (p->policy == SCHED_RR)
+        /*
+         * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER
+         * tasks that are on an otherwise idle runqueue:
+         */
+        time_slice = 0;
+        if (p->policy == SCHED_RR) {
                 time_slice = DEF_TIMESLICE;
-        else {
+        } else {
                 struct sched_entity *se = &p->se;
                 unsigned long flags;
                 struct rq *rq;
 
                 rq = task_rq_lock(p, &flags);
-                time_slice = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
+                if (rq->cfs.load.weight)
+                        time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
                 task_rq_unlock(rq, &flags);
         }
         read_unlock(&tasklist_lock);
@@ -5040,7 +5055,7 @@ static inline void sched_init_granularity(void)
  * is removed from the allowed bitmask.
  *
  * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely.  The
+ * task must not exit() & deallocate itself prematurely. The
  * call is not atomic; no spinlocks may be held.
  */
 int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
@@ -5077,7 +5092,7 @@ out:
 EXPORT_SYMBOL_GPL(set_cpus_allowed);
 
 /*
- * Move (not current) task off this cpu, onto dest cpu.  We're doing
+ * Move (not current) task off this cpu, onto dest cpu. We're doing
  * this because either it can't run here any more (set_cpus_allowed()
  * away from this CPU, or CPU going down), or because we're
  * attempting to rebalance this task on exec (sched_exec).
@@ -5222,7 +5237,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
                          * Try to stay on the same cpuset, where the
                          * current cpuset may be a subset of all cpus.
                          * The cpuset_cpus_allowed_locked() variant of
-                         * cpuset_cpus_allowed() will not block.  It must be
+                         * cpuset_cpus_allowed() will not block. It must be
                          * called within calls to cpuset_lock/cpuset_unlock.
                          */
                         rq = task_rq_lock(p, &flags);
@@ -5235,10 +5250,11 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
                          * kernel threads (both mm NULL), since they never
                          * leave kernel.
                          */
-                        if (p->mm && printk_ratelimit())
+                        if (p->mm && printk_ratelimit()) {
                                 printk(KERN_INFO "process %d (%s) no "
                                         "longer affine to cpu%d\n",
                                         task_pid_nr(p), p->comm, dead_cpu);
+                        }
                 }
         } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
 }
@@ -5340,7 +5356,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
 
         /*
          * Drop lock around migration; if someone else moves it,
-         * that's OK.  No task can be added to this CPU, so iteration is
+         * that's OK. No task can be added to this CPU, so iteration is
          * fine.
          */
         spin_unlock_irq(&rq->lock);
@@ -5404,7 +5420,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
         /*
          * In the intermediate directories, both the child directory and
          * procname are dynamically allocated and could fail but the mode
-         * will always be set.  In the lowest directory the names are
+         * will always be set. In the lowest directory the names are
          * static strings and all have proc handlers.
          */
         for (entry = *tablep; entry->mode; entry++) {
@@ -5575,7 +5591,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
         case CPU_UP_CANCELED_FROZEN:
                 if (!cpu_rq(cpu)->migration_thread)
                         break;
-                /* Unbind it from offline cpu so it can run.  Fall thru. */
+                /* Unbind it from offline cpu so it can run. Fall thru. */
                 kthread_bind(cpu_rq(cpu)->migration_thread,
                              any_online_cpu(cpu_online_map));
                 kthread_stop(cpu_rq(cpu)->migration_thread);
@@ -5602,9 +5618,11 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 migrate_nr_uninterruptible(rq);
                 BUG_ON(rq->nr_running != 0);
 
-                /* No need to migrate the tasks: it was best-effort if
-                 * they didn't take sched_hotcpu_mutex.  Just wake up
-                 * the requestors. */
+                /*
+                 * No need to migrate the tasks: it was best-effort if
+                 * they didn't take sched_hotcpu_mutex. Just wake up
+                 * the requestors.
+                 */
                 spin_lock_irq(&rq->lock);
                 while (!list_empty(&rq->migration_queue)) {
                         struct migration_req *req;
@@ -5912,7 +5930,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
  * @node: node whose sched_domain we're building
  * @used_nodes: nodes already in the sched_domain
  *
- * Find the next node to include in a given scheduling domain.  Simply
+ * Find the next node to include in a given scheduling domain. Simply
  * finds the closest node not already in the @used_nodes map.
  *
  * Should use nodemask_t.
@@ -5952,7 +5970,7 @@ static int find_next_best_node(int node, unsigned long *used_nodes)
  * @node: node whose cpumask we're constructing
  * @size: number of nodes to include in this span
 *
- * Given a node, construct a good cpumask for its sched_domain to span.  It
+ * Given a node, construct a good cpumask for its sched_domain to span. It
  * should be one that prevents unnecessary balancing, but also spreads tasks
  * out optimally.
  */
@@ -5989,8 +6007,8 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
 static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
 
-static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
-                            struct sched_group **sg)
+static int
+cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
 {
         if (sg)
                 *sg = &per_cpu(sched_group_cpus, cpu);
@@ -6007,8 +6025,8 @@ static DEFINE_PER_CPU(struct sched_group, sched_group_core);
 #endif
 
 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
-static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
-                             struct sched_group **sg)
+static int
+cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
 {
         int group;
         cpumask_t mask = per_cpu(cpu_sibling_map, cpu);
@@ -6019,8 +6037,8 @@ static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
         return group;
 }
 #elif defined(CONFIG_SCHED_MC)
-static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
-                             struct sched_group **sg)
+static int
+cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
 {
         if (sg)
                 *sg = &per_cpu(sched_group_core, cpu);
@@ -6031,8 +6049,8 @@ static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
 static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
 
-static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
-                             struct sched_group **sg)
+static int
+cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
 {
         int group;
 #ifdef CONFIG_SCHED_MC
@@ -6212,7 +6230,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
          * Allocate the per-node list of sched groups
          */
         sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *),
-                                           GFP_KERNEL);
+                                    GFP_KERNEL);
         if (!sched_group_nodes) {
                 printk(KERN_WARNING "Can not alloc sched group node list\n");
                 return -ENOMEM;
@@ -6459,7 +6477,7 @@ static int ndoms_cur; /* number of sched domains in 'doms_cur' */
 static cpumask_t fallback_doms;
 
 /*
- * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
+ * Set up scheduler domains and groups. Callers must hold the hotplug lock.
  * For now this just excludes isolated cpus, but could be used to
  * exclude other special cases in the future.
  */
@@ -6501,19 +6519,19 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
 
 /*
  * Partition sched domains as specified by the 'ndoms_new'
- * cpumasks in the array doms_new[] of cpumasks.  This compares
+ * cpumasks in the array doms_new[] of cpumasks. This compares
  * doms_new[] to the current sched domain partitioning, doms_cur[].
  * It destroys each deleted domain and builds each new domain.
  *
  * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
- * The masks don't intersect (don't overlap.)  We should setup one
- * sched domain for each mask.  CPUs not in any of the cpumasks will
- * not be load balanced.  If the same cpumask appears both in the
+ * The masks don't intersect (don't overlap.) We should setup one
+ * sched domain for each mask. CPUs not in any of the cpumasks will
+ * not be load balanced. If the same cpumask appears both in the
  * current 'doms_cur' domains and in the new 'doms_new', we can leave
  * it as it is.
  *
- * The passed in 'doms_new' should be kmalloc'd.  This routine takes
- * ownership of it and will kfree it when done with it.  If the caller
+ * The passed in 'doms_new' should be kmalloc'd. This routine takes
+ * ownership of it and will kfree it when done with it. If the caller
  * failed the kmalloc call, then it can pass in doms_new == NULL,
  * and partition_sched_domains() will fallback to the single partition
  * 'fallback_doms'.
@@ -6643,7 +6661,7 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
 #endif
 
 /*
- * Force a reinitialization of the sched domains hierarchy.  The domains
+ * Force a reinitialization of the sched domains hierarchy. The domains
  * and groups cannot be updated in place without racing with the balancing
  * code, so we temporarily attach all running cpus to the NULL domain
  * which will prevent rebalancing while the sched domains are recalculated.
@@ -6933,8 +6951,8 @@ struct task_struct *curr_task(int cpu)
  * @p: the task pointer to set.
  *
  * Description: This function must only be used when non-maskable interrupts
- * are serviced on a separate stack.  It allows the architecture to switch the
- * notion of the current task on a cpu in a non-blocking manner.  This function
+ * are serviced on a separate stack. It allows the architecture to switch the
+ * notion of the current task on a cpu in a non-blocking manner. This function
  * must be called with all CPU's synchronized, and interrupts disabled, the
  * and caller must save the original value of the current task (see
  * curr_task() above) and restore that value before reenabling interrupts and
@@ -7183,16 +7201,17 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
         return &tg->css;
 }
 
-static void cpu_cgroup_destroy(struct cgroup_subsys *ss,
-                               struct cgroup *cgrp)
+static void
+cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
         struct task_group *tg = cgroup_tg(cgrp);
 
         sched_destroy_group(tg);
 }
 
-static int cpu_cgroup_can_attach(struct cgroup_subsys *ss,
-                                 struct cgroup *cgrp, struct task_struct *tsk)
+static int
+cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+                      struct task_struct *tsk)
 {
         /* We don't support RT-tasks being in separate groups */
         if (tsk->sched_class != &fair_sched_class)
@@ -7221,38 +7240,12 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
         return (u64) tg->shares;
 }
 
-static u64 cpu_usage_read(struct cgroup *cgrp, struct cftype *cft)
-{
-        struct task_group *tg = cgroup_tg(cgrp);
-        unsigned long flags;
-        u64 res = 0;
-        int i;
-
-        for_each_possible_cpu(i) {
-                /*
-                 * Lock to prevent races with updating 64-bit counters
-                 * on 32-bit arches.
-                 */
-                spin_lock_irqsave(&cpu_rq(i)->lock, flags);
-                res += tg->se[i]->sum_exec_runtime;
-                spin_unlock_irqrestore(&cpu_rq(i)->lock, flags);
-        }
-        /* Convert from ns to ms */
-        do_div(res, NSEC_PER_MSEC);
-
-        return res;
-}
-
 static struct cftype cpu_files[] = {
         {
                 .name = "shares",
                 .read_uint = cpu_shares_read_uint,
                 .write_uint = cpu_shares_write_uint,
         },
-        {
-                .name = "usage",
-                .read_uint = cpu_usage_read,
-        },
 };
 
 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -7272,3 +7265,126 @@ struct cgroup_subsys cpu_cgroup_subsys = {
 };
 
 #endif  /* CONFIG_FAIR_CGROUP_SCHED */
+
+#ifdef CONFIG_CGROUP_CPUACCT
+
+/*
+ * CPU accounting code for task groups.
+ *
+ * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
+ * (balbir@in.ibm.com).
+ */
+
+/* track cpu usage of a group of tasks */
+struct cpuacct {
+        struct cgroup_subsys_state css;
+        /* cpuusage holds pointer to a u64-type object on every cpu */
+        u64 *cpuusage;
+};
+
+struct cgroup_subsys cpuacct_subsys;
+
+/* return cpu accounting group corresponding to this container */
+static inline struct cpuacct *cgroup_ca(struct cgroup *cont)
+{
+        return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id),
+                            struct cpuacct, css);
+}
+
+/* return cpu accounting group to which this task belongs */
+static inline struct cpuacct *task_ca(struct task_struct *tsk)
+{
+        return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
+                            struct cpuacct, css);
+}
+
+/* create a new cpu accounting group */
+static struct cgroup_subsys_state *cpuacct_create(
+        struct cgroup_subsys *ss, struct cgroup *cont)
+{
+        struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+
+        if (!ca)
+                return ERR_PTR(-ENOMEM);
+
+        ca->cpuusage = alloc_percpu(u64);
+        if (!ca->cpuusage) {
+                kfree(ca);
+                return ERR_PTR(-ENOMEM);
+        }
+
+        return &ca->css;
+}
+
+/* destroy an existing cpu accounting group */
+static void
+cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+        struct cpuacct *ca = cgroup_ca(cont);
+
+        free_percpu(ca->cpuusage);
+        kfree(ca);
+}
+
+/* return total cpu usage (in nanoseconds) of a group */
+static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft)
+{
+        struct cpuacct *ca = cgroup_ca(cont);
+        u64 totalcpuusage = 0;
+        int i;
+
+        for_each_possible_cpu(i) {
+                u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
+
+                /*
+                 * Take rq->lock to make 64-bit addition safe on 32-bit
+                 * platforms.
+                 */
+                spin_lock_irq(&cpu_rq(i)->lock);
+                totalcpuusage += *cpuusage;
+                spin_unlock_irq(&cpu_rq(i)->lock);
+        }
+
+        return totalcpuusage;
+}
+
+static struct cftype files[] = {
+        {
+                .name = "usage",
+                .read_uint = cpuusage_read,
+        },
+};
+
+static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+        return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
+}
+
+/*
+ * charge this task's execution time to its accounting group.
+ *
+ * called with rq->lock held.
+ */
+static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
+{
+        struct cpuacct *ca;
+
+        if (!cpuacct_subsys.active)
+                return;
+
+        ca = task_ca(tsk);
+        if (ca) {
+                u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
+
+                *cpuusage += cputime;
+        }
+}
+
+struct cgroup_subsys cpuacct_subsys = {
+        .name = "cpuacct",
+        .create = cpuacct_create,
+        .destroy = cpuacct_destroy,
+        .populate = cpuacct_populate,
+        .subsys_id = cpuacct_subsys_id,
+};
+#endif  /* CONFIG_CGROUP_CPUACCT */