about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2010-02-28 13:31:01 -0500
committer Linus Torvalds <torvalds@linux-foundation.org> 2010-02-28 13:31:01 -0500
commit f66ffdedbf0fc059a92219bb08c1dbcac88f074b (patch)
tree 9db4ad51764455123130e82fb7acf4f0a0be58ce
parent 2531216f236cb2a1f39ffa12a4a9339541e52191 (diff)
parent dd5feea14a7de4edbd9f36db1a2db785de91b88d (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (25 commits)
  sched: Fix SCHED_MC regression caused by change in sched cpu_power
  sched: Don't use possibly stale sched_class
  kthread, sched: Remove reference to kthread_create_on_cpu
  sched: cpuacct: Use bigger percpu counter batch values for stats counters
  percpu_counter: Make __percpu_counter_add an inline function on UP
  sched: Remove member rt_se from struct rt_rq
  sched: Change usage of rt_rq->rt_se to rt_rq->tg->rt_se[cpu]
  sched: Remove unused update_shares_locked()
  sched: Use for_each_bit
  sched: Queue a deboosted task to the head of the RT prio queue
  sched: Implement head queueing for sched_rt
  sched: Extend enqueue_task to allow head queueing
  sched: Remove USER_SCHED
  sched: Fix the place where group powers are updated
  sched: Assume *balance is valid
  sched: Remove load_balance_newidle()
  sched: Unify load_balance{,_newidle}()
  sched: Add a lock break for PREEMPT=y
  sched: Remove from fwd decls
  sched: Remove rq_iterator from move_one_task
  ...

Fix up trivial conflicts in kernel/sched.c
-rw-r--r-- Documentation/feature-removal-schedule.txt 15
-rw-r--r-- include/linux/kernel.h 5
-rw-r--r-- include/linux/percpu_counter.h 9
-rw-r--r-- include/linux/sched.h 25
-rw-r--r-- init/Kconfig 81
-rw-r--r-- kernel/ksysfs.c 8
-rw-r--r-- kernel/kthread.c 2
-rw-r--r-- kernel/sched.c 2125
-rw-r--r-- kernel/sched_cpupri.c 4
-rw-r--r-- kernel/sched_fair.c 1699
-rw-r--r-- kernel/sched_idletask.c 23
-rw-r--r-- kernel/sched_rt.c 54
-rw-r--r-- kernel/sys.c 5
-rw-r--r-- kernel/user.c 305
14 files changed, 1827 insertions, 2533 deletions
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index b9eba900e0f0..ea401495528d 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -6,21 +6,6 @@ be removed from this file.
6 6
7--------------------------- 7---------------------------
8 8
9What: USER_SCHED
10When: 2.6.34
11
12Why: USER_SCHED was implemented as a proof of concept for group scheduling.
13 The effect of USER_SCHED can already be achieved from userspace with
14 the help of libcgroup. The removal of USER_SCHED will also simplify
15 the scheduler code with the removal of one major ifdef. There are also
16 issues USER_SCHED has with USER_NS. A decision was taken not to fix
17 those and instead remove USER_SCHED. Also new group scheduling
18 features will not be implemented for USER_SCHED.
19
20Who: Dhaval Giani <dhaval@linux.vnet.ibm.com>
21
22---------------------------
23
24What: PRISM54 9What: PRISM54
25When: 2.6.34 10When: 2.6.34
26 11
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 328bca609b9b..1221d2331a6d 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -124,7 +124,7 @@ extern int _cond_resched(void);
124#endif 124#endif
125 125
126#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 126#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
127 void __might_sleep(char *file, int line, int preempt_offset); 127 void __might_sleep(const char *file, int line, int preempt_offset);
128/** 128/**
129 * might_sleep - annotation for functions that can sleep 129 * might_sleep - annotation for functions that can sleep
130 * 130 *
@@ -138,7 +138,8 @@ extern int _cond_resched(void);
138# define might_sleep() \ 138# define might_sleep() \
139 do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) 139 do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
140#else 140#else
141 static inline void __might_sleep(char *file, int line, int preempt_offset) { } 141 static inline void __might_sleep(const char *file, int line,
142 int preempt_offset) { }
142# define might_sleep() do { might_resched(); } while (0) 143# define might_sleep() do { might_resched(); } while (0)
143#endif 144#endif
144 145
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index a7684a513994..794662b2be5d 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -98,9 +98,6 @@ static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
98 fbc->count = amount; 98 fbc->count = amount;
99} 99}
100 100
101#define __percpu_counter_add(fbc, amount, batch) \
102 percpu_counter_add(fbc, amount)
103
104static inline void 101static inline void
105percpu_counter_add(struct percpu_counter *fbc, s64 amount) 102percpu_counter_add(struct percpu_counter *fbc, s64 amount)
106{ 103{
@@ -109,6 +106,12 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount)
109 preempt_enable(); 106 preempt_enable();
110} 107}
111 108
109static inline void
110__percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
111{
112 percpu_counter_add(fbc, amount);
113}
114
112static inline s64 percpu_counter_read(struct percpu_counter *fbc) 115static inline s64 percpu_counter_read(struct percpu_counter *fbc)
113{ 116{
114 return fbc->count; 117 return fbc->count;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1f5fa53b46b1..0eef87b58ea5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -740,14 +740,6 @@ struct user_struct {
740 uid_t uid; 740 uid_t uid;
741 struct user_namespace *user_ns; 741 struct user_namespace *user_ns;
742 742
743#ifdef CONFIG_USER_SCHED
744 struct task_group *tg;
745#ifdef CONFIG_SYSFS
746 struct kobject kobj;
747 struct delayed_work work;
748#endif
749#endif
750
751#ifdef CONFIG_PERF_EVENTS 743#ifdef CONFIG_PERF_EVENTS
752 atomic_long_t locked_vm; 744 atomic_long_t locked_vm;
753#endif 745#endif
@@ -1087,7 +1079,8 @@ struct sched_domain;
1087struct sched_class { 1079struct sched_class {
1088 const struct sched_class *next; 1080 const struct sched_class *next;
1089 1081
1090 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); 1082 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup,
1083 bool head);
1091 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); 1084 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
1092 void (*yield_task) (struct rq *rq); 1085 void (*yield_task) (struct rq *rq);
1093 1086
@@ -1099,14 +1092,6 @@ struct sched_class {
1099#ifdef CONFIG_SMP 1092#ifdef CONFIG_SMP
1100 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); 1093 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
1101 1094
1102 unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
1103 struct rq *busiest, unsigned long max_load_move,
1104 struct sched_domain *sd, enum cpu_idle_type idle,
1105 int *all_pinned, int *this_best_prio);
1106
1107 int (*move_one_task) (struct rq *this_rq, int this_cpu,
1108 struct rq *busiest, struct sched_domain *sd,
1109 enum cpu_idle_type idle);
1110 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); 1095 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
1111 void (*post_schedule) (struct rq *this_rq); 1096 void (*post_schedule) (struct rq *this_rq);
1112 void (*task_waking) (struct rq *this_rq, struct task_struct *task); 1097 void (*task_waking) (struct rq *this_rq, struct task_struct *task);
@@ -2520,13 +2505,9 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
2520 2505
2521extern void normalize_rt_tasks(void); 2506extern void normalize_rt_tasks(void);
2522 2507
2523#ifdef CONFIG_GROUP_SCHED 2508#ifdef CONFIG_CGROUP_SCHED
2524 2509
2525extern struct task_group init_task_group; 2510extern struct task_group init_task_group;
2526#ifdef CONFIG_USER_SCHED
2527extern struct task_group root_task_group;
2528extern void set_tg_uid(struct user_struct *user);
2529#endif
2530 2511
2531extern struct task_group *sched_create_group(struct task_group *parent); 2512extern struct task_group *sched_create_group(struct task_group *parent);
2532extern void sched_destroy_group(struct task_group *tg); 2513extern void sched_destroy_group(struct task_group *tg);
diff --git a/init/Kconfig b/init/Kconfig
index c6d95f8ea055..089a230e5652 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -461,57 +461,6 @@ config LOG_BUF_SHIFT
461config HAVE_UNSTABLE_SCHED_CLOCK 461config HAVE_UNSTABLE_SCHED_CLOCK
462 bool 462 bool
463 463
464config GROUP_SCHED
465 bool "Group CPU scheduler"
466 depends on EXPERIMENTAL
467 default n
468 help
469 This feature lets CPU scheduler recognize task groups and control CPU
470 bandwidth allocation to such task groups.
471 In order to create a group from arbitrary set of processes, use
472 CONFIG_CGROUPS. (See Control Group support.)
473
474config FAIR_GROUP_SCHED
475 bool "Group scheduling for SCHED_OTHER"
476 depends on GROUP_SCHED
477 default GROUP_SCHED
478
479config RT_GROUP_SCHED
480 bool "Group scheduling for SCHED_RR/FIFO"
481 depends on EXPERIMENTAL
482 depends on GROUP_SCHED
483 default n
484 help
485 This feature lets you explicitly allocate real CPU bandwidth
486 to users or control groups (depending on the "Basis for grouping tasks"
487 setting below. If enabled, it will also make it impossible to
488 schedule realtime tasks for non-root users until you allocate
489 realtime bandwidth for them.
490 See Documentation/scheduler/sched-rt-group.txt for more information.
491
492choice
493 depends on GROUP_SCHED
494 prompt "Basis for grouping tasks"
495 default USER_SCHED
496
497config USER_SCHED
498 bool "user id"
499 help
500 This option will choose userid as the basis for grouping
501 tasks, thus providing equal CPU bandwidth to each user.
502
503config CGROUP_SCHED
504 bool "Control groups"
505 depends on CGROUPS
506 help
507 This option allows you to create arbitrary task groups
508 using the "cgroup" pseudo filesystem and control
509 the cpu bandwidth allocated to each such task group.
510 Refer to Documentation/cgroups/cgroups.txt for more
511 information on "cgroup" pseudo filesystem.
512
513endchoice
514
515menuconfig CGROUPS 464menuconfig CGROUPS
516 boolean "Control Group support" 465 boolean "Control Group support"
517 help 466 help
@@ -632,6 +581,36 @@ config CGROUP_MEM_RES_CTLR_SWAP
632 Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page 581 Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
633 size is 4096bytes, 512k per 1Gbytes of swap. 582 size is 4096bytes, 512k per 1Gbytes of swap.
634 583
584menuconfig CGROUP_SCHED
585 bool "Group CPU scheduler"
586 depends on EXPERIMENTAL && CGROUPS
587 default n
588 help
589 This feature lets the CPU scheduler recognize task groups and control CPU
590 bandwidth allocation to such task groups. It uses cgroups to group
591 tasks.
592
593if CGROUP_SCHED
594config FAIR_GROUP_SCHED
595 bool "Group scheduling for SCHED_OTHER"
596 depends on CGROUP_SCHED
597 default CGROUP_SCHED
598
599config RT_GROUP_SCHED
600 bool "Group scheduling for SCHED_RR/FIFO"
601 depends on EXPERIMENTAL
602 depends on CGROUP_SCHED
603 default n
604 help
605 This feature lets you explicitly allocate real CPU bandwidth
606 to users or control groups (depending on the "Basis for grouping tasks"
607 setting below. If enabled, it will also make it impossible to
608 schedule realtime tasks for non-root users until you allocate
609 realtime bandwidth for them.
610 See Documentation/scheduler/sched-rt-group.txt for more information.
611
612endif #CGROUP_SCHED
613
635endif # CGROUPS 614endif # CGROUPS
636 615
637config MM_OWNER 616config MM_OWNER
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 3feaf5a74514..6b1ccc3f0205 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -197,16 +197,8 @@ static int __init ksysfs_init(void)
197 goto group_exit; 197 goto group_exit;
198 } 198 }
199 199
200 /* create the /sys/kernel/uids/ directory */
201 error = uids_sysfs_init();
202 if (error)
203 goto notes_exit;
204
205 return 0; 200 return 0;
206 201
207notes_exit:
208 if (notes_size > 0)
209 sysfs_remove_bin_file(kernel_kobj, &notes_attr);
210group_exit: 202group_exit:
211 sysfs_remove_group(kernel_kobj, &kernel_attr_group); 203 sysfs_remove_group(kernel_kobj, &kernel_attr_group);
212kset_exit: 204kset_exit:
diff --git a/kernel/kthread.c b/kernel/kthread.c
index fbb6222fe7e0..82ed0ea15194 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -101,7 +101,7 @@ static void create_kthread(struct kthread_create_info *create)
101 * 101 *
102 * Description: This helper function creates and names a kernel 102 * Description: This helper function creates and names a kernel
103 * thread. The thread will be stopped: use wake_up_process() to start 103 * thread. The thread will be stopped: use wake_up_process() to start
104 * it. See also kthread_run(), kthread_create_on_cpu(). 104 * it. See also kthread_run().
105 * 105 *
106 * When woken, the thread will run @threadfn() with @data as its 106 * When woken, the thread will run @threadfn() with @data as its
107 * argument. @threadfn() can either call do_exit() directly if it is a 107 * argument. @threadfn() can either call do_exit() directly if it is a
diff --git a/kernel/sched.c b/kernel/sched.c
index caf54e1eef6e..6a212c97f523 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -233,7 +233,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
233 */ 233 */
234static DEFINE_MUTEX(sched_domains_mutex); 234static DEFINE_MUTEX(sched_domains_mutex);
235 235
236#ifdef CONFIG_GROUP_SCHED 236#ifdef CONFIG_CGROUP_SCHED
237 237
238#include <linux/cgroup.h> 238#include <linux/cgroup.h>
239 239
@@ -243,13 +243,7 @@ static LIST_HEAD(task_groups);
243 243
244/* task group related information */ 244/* task group related information */
245struct task_group { 245struct task_group {
246#ifdef CONFIG_CGROUP_SCHED
247 struct cgroup_subsys_state css; 246 struct cgroup_subsys_state css;
248#endif
249
250#ifdef CONFIG_USER_SCHED
251 uid_t uid;
252#endif
253 247
254#ifdef CONFIG_FAIR_GROUP_SCHED 248#ifdef CONFIG_FAIR_GROUP_SCHED
255 /* schedulable entities of this group on each cpu */ 249 /* schedulable entities of this group on each cpu */
@@ -274,35 +268,7 @@ struct task_group {
274 struct list_head children; 268 struct list_head children;
275}; 269};
276 270
277#ifdef CONFIG_USER_SCHED
278
279/* Helper function to pass uid information to create_sched_user() */
280void set_tg_uid(struct user_struct *user)
281{
282 user->tg->uid = user->uid;
283}
284
285/*
286 * Root task group.
287 * Every UID task group (including init_task_group aka UID-0) will
288 * be a child to this group.
289 */
290struct task_group root_task_group;
291
292#ifdef CONFIG_FAIR_GROUP_SCHED
293/* Default task group's sched entity on each cpu */
294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
295/* Default task group's cfs_rq on each cpu */
296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
297#endif /* CONFIG_FAIR_GROUP_SCHED */
298
299#ifdef CONFIG_RT_GROUP_SCHED
300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
302#endif /* CONFIG_RT_GROUP_SCHED */
303#else /* !CONFIG_USER_SCHED */
304#define root_task_group init_task_group 271#define root_task_group init_task_group
305#endif /* CONFIG_USER_SCHED */
306 272
307/* task_group_lock serializes add/remove of task groups and also changes to 273/* task_group_lock serializes add/remove of task groups and also changes to
308 * a task group's cpu shares. 274 * a task group's cpu shares.
@@ -318,11 +284,7 @@ static int root_task_group_empty(void)
318} 284}
319#endif 285#endif
320 286
321#ifdef CONFIG_USER_SCHED
322# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
323#else /* !CONFIG_USER_SCHED */
324# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 287# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
325#endif /* CONFIG_USER_SCHED */
326 288
327/* 289/*
328 * A weight of 0 or 1 can cause arithmetics problems. 290 * A weight of 0 or 1 can cause arithmetics problems.
@@ -348,11 +310,7 @@ static inline struct task_group *task_group(struct task_struct *p)
348{ 310{
349 struct task_group *tg; 311 struct task_group *tg;
350 312
351#ifdef CONFIG_USER_SCHED 313#ifdef CONFIG_CGROUP_SCHED
352 rcu_read_lock();
353 tg = __task_cred(p)->user->tg;
354 rcu_read_unlock();
355#elif defined(CONFIG_CGROUP_SCHED)
356 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 314 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
357 struct task_group, css); 315 struct task_group, css);
358#else 316#else
@@ -383,7 +341,7 @@ static inline struct task_group *task_group(struct task_struct *p)
383 return NULL; 341 return NULL;
384} 342}
385 343
386#endif /* CONFIG_GROUP_SCHED */ 344#endif /* CONFIG_CGROUP_SCHED */
387 345
388/* CFS-related fields in a runqueue */ 346/* CFS-related fields in a runqueue */
389struct cfs_rq { 347struct cfs_rq {
@@ -478,7 +436,6 @@ struct rt_rq {
478 struct rq *rq; 436 struct rq *rq;
479 struct list_head leaf_rt_rq_list; 437 struct list_head leaf_rt_rq_list;
480 struct task_group *tg; 438 struct task_group *tg;
481 struct sched_rt_entity *rt_se;
482#endif 439#endif
483}; 440};
484 441
@@ -1414,32 +1371,6 @@ static const u32 prio_to_wmult[40] = {
1414 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1371 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1415}; 1372};
1416 1373
1417static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1418
1419/*
1420 * runqueue iterator, to support SMP load-balancing between different
1421 * scheduling classes, without having to expose their internal data
1422 * structures to the load-balancing proper:
1423 */
1424struct rq_iterator {
1425 void *arg;
1426 struct task_struct *(*start)(void *);
1427 struct task_struct *(*next)(void *);
1428};
1429
1430#ifdef CONFIG_SMP
1431static unsigned long
1432balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1433 unsigned long max_load_move, struct sched_domain *sd,
1434 enum cpu_idle_type idle, int *all_pinned,
1435 int *this_best_prio, struct rq_iterator *iterator);
1436
1437static int
1438iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1439 struct sched_domain *sd, enum cpu_idle_type idle,
1440 struct rq_iterator *iterator);
1441#endif
1442
1443/* Time spent by the tasks of the cpu accounting group executing in ... */ 1374/* Time spent by the tasks of the cpu accounting group executing in ... */
1444enum cpuacct_stat_index { 1375enum cpuacct_stat_index {
1445 CPUACCT_STAT_USER, /* ... user mode */ 1376 CPUACCT_STAT_USER, /* ... user mode */
@@ -1725,16 +1656,6 @@ static void update_shares(struct sched_domain *sd)
1725 } 1656 }
1726} 1657}
1727 1658
1728static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1729{
1730 if (root_task_group_empty())
1731 return;
1732
1733 raw_spin_unlock(&rq->lock);
1734 update_shares(sd);
1735 raw_spin_lock(&rq->lock);
1736}
1737
1738static void update_h_load(long cpu) 1659static void update_h_load(long cpu)
1739{ 1660{
1740 if (root_task_group_empty()) 1661 if (root_task_group_empty())
@@ -1749,10 +1670,6 @@ static inline void update_shares(struct sched_domain *sd)
1749{ 1670{
1750} 1671}
1751 1672
1752static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1753{
1754}
1755
1756#endif 1673#endif
1757 1674
1758#ifdef CONFIG_PREEMPT 1675#ifdef CONFIG_PREEMPT
@@ -1829,6 +1746,51 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1829 raw_spin_unlock(&busiest->lock); 1746 raw_spin_unlock(&busiest->lock);
1830 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1747 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1831} 1748}
1749
1750/*
1751 * double_rq_lock - safely lock two runqueues
1752 *
1753 * Note this does not disable interrupts like task_rq_lock,
1754 * you need to do so manually before calling.
1755 */
1756static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1757 __acquires(rq1->lock)
1758 __acquires(rq2->lock)
1759{
1760 BUG_ON(!irqs_disabled());
1761 if (rq1 == rq2) {
1762 raw_spin_lock(&rq1->lock);
1763 __acquire(rq2->lock); /* Fake it out ;) */
1764 } else {
1765 if (rq1 < rq2) {
1766 raw_spin_lock(&rq1->lock);
1767 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1768 } else {
1769 raw_spin_lock(&rq2->lock);
1770 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1771 }
1772 }
1773 update_rq_clock(rq1);
1774 update_rq_clock(rq2);
1775}
1776
1777/*
1778 * double_rq_unlock - safely unlock two runqueues
1779 *
1780 * Note this does not restore interrupts like task_rq_unlock,
1781 * you need to do so manually after calling.
1782 */
1783static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1784 __releases(rq1->lock)
1785 __releases(rq2->lock)
1786{
1787 raw_spin_unlock(&rq1->lock);
1788 if (rq1 != rq2)
1789 raw_spin_unlock(&rq2->lock);
1790 else
1791 __release(rq2->lock);
1792}
1793
1832#endif 1794#endif
1833 1795
1834#ifdef CONFIG_FAIR_GROUP_SCHED 1796#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1858,18 +1820,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1858#endif 1820#endif
1859} 1821}
1860 1822
1861#include "sched_stats.h" 1823static const struct sched_class rt_sched_class;
1862#include "sched_idletask.c"
1863#include "sched_fair.c"
1864#include "sched_rt.c"
1865#ifdef CONFIG_SCHED_DEBUG
1866# include "sched_debug.c"
1867#endif
1868 1824
1869#define sched_class_highest (&rt_sched_class) 1825#define sched_class_highest (&rt_sched_class)
1870#define for_each_class(class) \ 1826#define for_each_class(class) \
1871 for (class = sched_class_highest; class; class = class->next) 1827 for (class = sched_class_highest; class; class = class->next)
1872 1828
1829#include "sched_stats.h"
1830
1873static void inc_nr_running(struct rq *rq) 1831static void inc_nr_running(struct rq *rq)
1874{ 1832{
1875 rq->nr_running++; 1833 rq->nr_running++;
@@ -1907,13 +1865,14 @@ static void update_avg(u64 *avg, u64 sample)
1907 *avg += diff >> 3; 1865 *avg += diff >> 3;
1908} 1866}
1909 1867
1910static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1868static void
1869enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1911{ 1870{
1912 if (wakeup) 1871 if (wakeup)
1913 p->se.start_runtime = p->se.sum_exec_runtime; 1872 p->se.start_runtime = p->se.sum_exec_runtime;
1914 1873
1915 sched_info_queued(p); 1874 sched_info_queued(p);
1916 p->sched_class->enqueue_task(rq, p, wakeup); 1875 p->sched_class->enqueue_task(rq, p, wakeup, head);
1917 p->se.on_rq = 1; 1876 p->se.on_rq = 1;
1918} 1877}
1919 1878
@@ -1936,6 +1895,37 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1936} 1895}
1937 1896
1938/* 1897/*
1898 * activate_task - move a task to the runqueue.
1899 */
1900static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1901{
1902 if (task_contributes_to_load(p))
1903 rq->nr_uninterruptible--;
1904
1905 enqueue_task(rq, p, wakeup, false);
1906 inc_nr_running(rq);
1907}
1908
1909/*
1910 * deactivate_task - remove a task from the runqueue.
1911 */
1912static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1913{
1914 if (task_contributes_to_load(p))
1915 rq->nr_uninterruptible++;
1916
1917 dequeue_task(rq, p, sleep);
1918 dec_nr_running(rq);
1919}
1920
1921#include "sched_idletask.c"
1922#include "sched_fair.c"
1923#include "sched_rt.c"
1924#ifdef CONFIG_SCHED_DEBUG
1925# include "sched_debug.c"
1926#endif
1927
1928/*
1939 * __normal_prio - return the priority that is based on the static prio 1929 * __normal_prio - return the priority that is based on the static prio
1940 */ 1930 */
1941static inline int __normal_prio(struct task_struct *p) 1931static inline int __normal_prio(struct task_struct *p)
@@ -1981,30 +1971,6 @@ static int effective_prio(struct task_struct *p)
1981 return p->prio; 1971 return p->prio;
1982} 1972}
1983 1973
1984/*
1985 * activate_task - move a task to the runqueue.
1986 */
1987static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1988{
1989 if (task_contributes_to_load(p))
1990 rq->nr_uninterruptible--;
1991
1992 enqueue_task(rq, p, wakeup);
1993 inc_nr_running(rq);
1994}
1995
1996/*
1997 * deactivate_task - remove a task from the runqueue.
1998 */
1999static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
2000{
2001 if (task_contributes_to_load(p))
2002 rq->nr_uninterruptible++;
2003
2004 dequeue_task(rq, p, sleep);
2005 dec_nr_running(rq);
2006}
2007
2008/** 1974/**
2009 * task_curr - is this task currently executing on a CPU? 1975 * task_curr - is this task currently executing on a CPU?
2010 * @p: the task in question. 1976 * @p: the task in question.
@@ -3148,50 +3114,6 @@ static void update_cpu_load(struct rq *this_rq)
3148#ifdef CONFIG_SMP 3114#ifdef CONFIG_SMP
3149 3115
3150/* 3116/*
3151 * double_rq_lock - safely lock two runqueues
3152 *
3153 * Note this does not disable interrupts like task_rq_lock,
3154 * you need to do so manually before calling.
3155 */
3156static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3157 __acquires(rq1->lock)
3158 __acquires(rq2->lock)
3159{
3160 BUG_ON(!irqs_disabled());
3161 if (rq1 == rq2) {
3162 raw_spin_lock(&rq1->lock);
3163 __acquire(rq2->lock); /* Fake it out ;) */
3164 } else {
3165 if (rq1 < rq2) {
3166 raw_spin_lock(&rq1->lock);
3167 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3168 } else {
3169 raw_spin_lock(&rq2->lock);
3170 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3171 }
3172 }
3173 update_rq_clock(rq1);
3174 update_rq_clock(rq2);
3175}
3176
3177/*
3178 * double_rq_unlock - safely unlock two runqueues
3179 *
3180 * Note this does not restore interrupts like task_rq_unlock,
3181 * you need to do so manually after calling.
3182 */
3183static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3184 __releases(rq1->lock)
3185 __releases(rq2->lock)
3186{
3187 raw_spin_unlock(&rq1->lock);
3188 if (rq1 != rq2)
3189 raw_spin_unlock(&rq2->lock);
3190 else
3191 __release(rq2->lock);
3192}
3193
3194/*
3195 * sched_exec - execve() is a valuable balancing opportunity, because at 3117 * sched_exec - execve() is a valuable balancing opportunity, because at
3196 * this point the task has the smallest effective memory and cache footprint. 3118 * this point the task has the smallest effective memory and cache footprint.
3197 */ 3119 */
@@ -3239,1782 +3161,6 @@ again:
3239 task_rq_unlock(rq, &flags); 3161 task_rq_unlock(rq, &flags);
3240} 3162}
3241 3163
/*
 * pull_task - migrate task p from src_rq onto this_rq (owned by this_cpu).
 *
 * The caller must hold the locks of both runqueues.
 */
static void pull_task(struct rq *src_rq, struct task_struct *p,
		      struct rq *this_rq, int this_cpu)
{
	/* Take the task off its old runqueue ... */
	deactivate_task(src_rq, p, 0);

	/* ... retarget it ... */
	set_task_cpu(p, this_cpu);

	/* ... queue it locally and see whether it should preempt current. */
	activate_task(this_rq, p, 0);
	check_preempt_curr(this_rq, p, 0);
}
3254
3255/*
3256 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3257 */
3258static
3259int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3260 struct sched_domain *sd, enum cpu_idle_type idle,
3261 int *all_pinned)
3262{
3263 int tsk_cache_hot = 0;
3264 /*
3265 * We do not migrate tasks that are:
3266 * 1) running (obviously), or
3267 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3268 * 3) are cache-hot on their current CPU.
3269 */
3270 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3271 schedstat_inc(p, se.nr_failed_migrations_affine);
3272 return 0;
3273 }
3274 *all_pinned = 0;
3275
3276 if (task_running(rq, p)) {
3277 schedstat_inc(p, se.nr_failed_migrations_running);
3278 return 0;
3279 }
3280
3281 /*
3282 * Aggressive migration if:
3283 * 1) task is cache cold, or
3284 * 2) too many balance attempts have failed.
3285 */
3286
3287 tsk_cache_hot = task_hot(p, rq->clock, sd);
3288 if (!tsk_cache_hot ||
3289 sd->nr_balance_failed > sd->cache_nice_tries) {
3290#ifdef CONFIG_SCHEDSTATS
3291 if (tsk_cache_hot) {
3292 schedstat_inc(sd, lb_hot_gained[idle]);
3293 schedstat_inc(p, se.nr_forced_migrations);
3294 }
3295#endif
3296 return 1;
3297 }
3298
3299 if (tsk_cache_hot) {
3300 schedstat_inc(p, se.nr_failed_migrations_hot);
3301 return 0;
3302 }
3303 return 1;
3304}
3305
3306static unsigned long
3307balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3308 unsigned long max_load_move, struct sched_domain *sd,
3309 enum cpu_idle_type idle, int *all_pinned,
3310 int *this_best_prio, struct rq_iterator *iterator)
3311{
3312 int loops = 0, pulled = 0, pinned = 0;
3313 struct task_struct *p;
3314 long rem_load_move = max_load_move;
3315
3316 if (max_load_move == 0)
3317 goto out;
3318
3319 pinned = 1;
3320
3321 /*
3322 * Start the load-balancing iterator:
3323 */
3324 p = iterator->start(iterator->arg);
3325next:
3326 if (!p || loops++ > sysctl_sched_nr_migrate)
3327 goto out;
3328
3329 if ((p->se.load.weight >> 1) > rem_load_move ||
3330 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3331 p = iterator->next(iterator->arg);
3332 goto next;
3333 }
3334
3335 pull_task(busiest, p, this_rq, this_cpu);
3336 pulled++;
3337 rem_load_move -= p->se.load.weight;
3338
3339#ifdef CONFIG_PREEMPT
3340 /*
3341 * NEWIDLE balancing is a source of latency, so preemptible kernels
3342 * will stop after the first task is pulled to minimize the critical
3343 * section.
3344 */
3345 if (idle == CPU_NEWLY_IDLE)
3346 goto out;
3347#endif
3348
3349 /*
3350 * We only want to steal up to the prescribed amount of weighted load.
3351 */
3352 if (rem_load_move > 0) {
3353 if (p->prio < *this_best_prio)
3354 *this_best_prio = p->prio;
3355 p = iterator->next(iterator->arg);
3356 goto next;
3357 }
3358out:
3359 /*
3360 * Right now, this is one of only two places pull_task() is called,
3361 * so we can safely collect pull_task() stats here rather than
3362 * inside pull_task().
3363 */
3364 schedstat_add(sd, lb_gained[idle], pulled);
3365
3366 if (all_pinned)
3367 *all_pinned = pinned;
3368
3369 return max_load_move - rem_load_move;
3370}
3371
3372/*
3373 * move_tasks tries to move up to max_load_move weighted load from busiest to
3374 * this_rq, as part of a balancing operation within domain "sd".
3375 * Returns 1 if successful and 0 otherwise.
3376 *
3377 * Called with both runqueues locked.
3378 */
3379static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3380 unsigned long max_load_move,
3381 struct sched_domain *sd, enum cpu_idle_type idle,
3382 int *all_pinned)
3383{
3384 const struct sched_class *class = sched_class_highest;
3385 unsigned long total_load_moved = 0;
3386 int this_best_prio = this_rq->curr->prio;
3387
3388 do {
3389 total_load_moved +=
3390 class->load_balance(this_rq, this_cpu, busiest,
3391 max_load_move - total_load_moved,
3392 sd, idle, all_pinned, &this_best_prio);
3393 class = class->next;
3394
3395#ifdef CONFIG_PREEMPT
3396 /*
3397 * NEWIDLE balancing is a source of latency, so preemptible
3398 * kernels will stop after the first task is pulled to minimize
3399 * the critical section.
3400 */
3401 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3402 break;
3403#endif
3404 } while (class && max_load_move > total_load_moved);
3405
3406 return total_load_moved > 0;
3407}
3408
3409static int
3410iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3411 struct sched_domain *sd, enum cpu_idle_type idle,
3412 struct rq_iterator *iterator)
3413{
3414 struct task_struct *p = iterator->start(iterator->arg);
3415 int pinned = 0;
3416
3417 while (p) {
3418 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3419 pull_task(busiest, p, this_rq, this_cpu);
3420 /*
3421 * Right now, this is only the second place pull_task()
3422 * is called, so we can safely collect pull_task()
3423 * stats here rather than inside pull_task().
3424 */
3425 schedstat_inc(sd, lb_gained[idle]);
3426
3427 return 1;
3428 }
3429 p = iterator->next(iterator->arg);
3430 }
3431
3432 return 0;
3433}
3434
3435/*
3436 * move_one_task tries to move exactly one task from busiest to this_rq, as
3437 * part of active balancing operations within "domain".
3438 * Returns 1 if successful and 0 otherwise.
3439 *
3440 * Called with both runqueues locked.
3441 */
3442static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3443 struct sched_domain *sd, enum cpu_idle_type idle)
3444{
3445 const struct sched_class *class;
3446
3447 for_each_class(class) {
3448 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3449 return 1;
3450 }
3451
3452 return 0;
3453}
3454/********** Helpers for find_busiest_group ************************/
/*
 * sd_lb_stats - statistics gathered over one sched_domain while load
 *		 balancing.
 */
struct sd_lb_stats {
	struct sched_group *busiest;	/* busiest group in this sd */
	struct sched_group *this;	/* local group in this sd */
	unsigned long total_load;	/* summed load of all groups in sd */
	unsigned long total_pwr;	/* summed power of all groups in sd */
	unsigned long avg_load;		/* average load across all groups in sd */

	/* Statistics of the local group */
	unsigned long this_load;
	unsigned long this_load_per_task;
	unsigned long this_nr_running;

	/* Statistics of the busiest group */
	unsigned long max_load;
	unsigned long busiest_load_per_task;
	unsigned long busiest_nr_running;

	int group_imb;			/* is there imbalance in this sd? */
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
	int power_savings_balance;	/* is powersave balance needed for this sd? */
	struct sched_group *group_min;	/* least loaded group in sd */
	struct sched_group *group_leader; /* group which relieves group_min */
	unsigned long min_load_per_task; /* load_per_task in group_min */
	unsigned long leader_nr_running; /* nr running of group_leader */
	unsigned long min_nr_running;	/* nr running of group_min */
#endif
};
3486
/*
 * sg_lb_stats - per-sched_group statistics needed for load balancing
 */
struct sg_lb_stats {
	unsigned long avg_load;		/* average load across the group's CPUs */
	unsigned long group_load;	/* total load over the group's CPUs */
	unsigned long sum_nr_running;	/* number of tasks running in the group */
	unsigned long sum_weighted_load; /* weighted load of the group's tasks */
	unsigned long group_capacity;	/* group cpu_power in SCHED_LOAD_SCALE units */
	int group_imb;			/* is there an imbalance in the group? */
};
3498
/**
 * group_first_cpu - first cpu set in the cpumask of a sched_group.
 * @group: The sched_group to look at.
 *
 * Returns whatever cpumask_first() yields for the group's cpumask.
 */
static inline unsigned int group_first_cpu(struct sched_group *group)
{
	const struct cpumask *span = sched_group_cpus(group);

	return cpumask_first(span);
}
3507
3508/**
3509 * get_sd_load_idx - Obtain the load index for a given sched domain.
3510 * @sd: The sched_domain whose load_idx is to be obtained.
3511 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
3512 */
3513static inline int get_sd_load_idx(struct sched_domain *sd,
3514 enum cpu_idle_type idle)
3515{
3516 int load_idx;
3517
3518 switch (idle) {
3519 case CPU_NOT_IDLE:
3520 load_idx = sd->busy_idx;
3521 break;
3522
3523 case CPU_NEWLY_IDLE:
3524 load_idx = sd->newidle_idx;
3525 break;
3526 default:
3527 load_idx = sd->idle_idx;
3528 break;
3529 }
3530
3531 return load_idx;
3532}
3533
3534
3535#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3536/**
3537 * init_sd_power_savings_stats - Initialize power savings statistics for
3538 * the given sched_domain, during load balancing.
3539 *
3540 * @sd: Sched domain whose power-savings statistics are to be initialized.
3541 * @sds: Variable containing the statistics for sd.
3542 * @idle: Idle status of the CPU at which we're performing load-balancing.
3543 */
3544static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3545 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3546{
3547 /*
3548 * Busy processors will not participate in power savings
3549 * balance.
3550 */
3551 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3552 sds->power_savings_balance = 0;
3553 else {
3554 sds->power_savings_balance = 1;
3555 sds->min_nr_running = ULONG_MAX;
3556 sds->leader_nr_running = 0;
3557 }
3558}
3559
3560/**
3561 * update_sd_power_savings_stats - Update the power saving stats for a
3562 * sched_domain while performing load balancing.
3563 *
3564 * @group: sched_group belonging to the sched_domain under consideration.
3565 * @sds: Variable containing the statistics of the sched_domain
3566 * @local_group: Does group contain the CPU for which we're performing
3567 * load balancing ?
3568 * @sgs: Variable containing the statistics of the group.
3569 */
3570static inline void update_sd_power_savings_stats(struct sched_group *group,
3571 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3572{
3573
3574 if (!sds->power_savings_balance)
3575 return;
3576
3577 /*
3578 * If the local group is idle or completely loaded
3579 * no need to do power savings balance at this domain
3580 */
3581 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3582 !sds->this_nr_running))
3583 sds->power_savings_balance = 0;
3584
3585 /*
3586 * If a group is already running at full capacity or idle,
3587 * don't include that group in power savings calculations
3588 */
3589 if (!sds->power_savings_balance ||
3590 sgs->sum_nr_running >= sgs->group_capacity ||
3591 !sgs->sum_nr_running)
3592 return;
3593
3594 /*
3595 * Calculate the group which has the least non-idle load.
3596 * This is the group from where we need to pick up the load
3597 * for saving power
3598 */
3599 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3600 (sgs->sum_nr_running == sds->min_nr_running &&
3601 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3602 sds->group_min = group;
3603 sds->min_nr_running = sgs->sum_nr_running;
3604 sds->min_load_per_task = sgs->sum_weighted_load /
3605 sgs->sum_nr_running;
3606 }
3607
3608 /*
3609 * Calculate the group which is almost near its
3610 * capacity but still has some space to pick up some load
3611 * from other group and save more power
3612 */
3613 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3614 return;
3615
3616 if (sgs->sum_nr_running > sds->leader_nr_running ||
3617 (sgs->sum_nr_running == sds->leader_nr_running &&
3618 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3619 sds->group_leader = group;
3620 sds->leader_nr_running = sgs->sum_nr_running;
3621 }
3622}
3623
3624/**
3625 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3626 * @sds: Variable containing the statistics of the sched_domain
3627 * under consideration.
3628 * @this_cpu: Cpu at which we're currently performing load-balancing.
3629 * @imbalance: Variable to store the imbalance.
3630 *
3631 * Description:
3632 * Check if we have potential to perform some power-savings balance.
3633 * If yes, set the busiest group to be the least loaded group in the
3634 * sched_domain, so that it's CPUs can be put to idle.
3635 *
3636 * Returns 1 if there is potential to perform power-savings balance.
3637 * Else returns 0.
3638 */
3639static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3640 int this_cpu, unsigned long *imbalance)
3641{
3642 if (!sds->power_savings_balance)
3643 return 0;
3644
3645 if (sds->this != sds->group_leader ||
3646 sds->group_leader == sds->group_min)
3647 return 0;
3648
3649 *imbalance = sds->min_load_per_task;
3650 sds->busiest = sds->group_min;
3651
3652 return 1;
3653
3654}
3655#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3656static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3657 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3658{
3659 return;
3660}
3661
/* Stub: without SCHED_MC/SCHED_SMT there are no power savings stats to update. */
static inline void update_sd_power_savings_stats(struct sched_group *group,
	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
{
}
3667
/* Stub: without SCHED_MC/SCHED_SMT a power-savings balance is never possible. */
static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
					int this_cpu, unsigned long *imbalance)
{
	return 0;
}
3673#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3674
3675
3676unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3677{
3678 return SCHED_LOAD_SCALE;
3679}
3680
3681unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3682{
3683 return default_scale_freq_power(sd, cpu);
3684}
3685
3686unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3687{
3688 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3689 unsigned long smt_gain = sd->smt_gain;
3690
3691 smt_gain /= weight;
3692
3693 return smt_gain;
3694}
3695
3696unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3697{
3698 return default_scale_smt_power(sd, cpu);
3699}
3700
3701unsigned long scale_rt_power(int cpu)
3702{
3703 struct rq *rq = cpu_rq(cpu);
3704 u64 total, available;
3705
3706 sched_avg_update(rq);
3707
3708 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3709 available = total - rq->rt_avg;
3710
3711 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3712 total = SCHED_LOAD_SCALE;
3713
3714 total >>= SCHED_LOAD_SHIFT;
3715
3716 return div_u64(available, total);
3717}
3718
3719static void update_cpu_power(struct sched_domain *sd, int cpu)
3720{
3721 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3722 unsigned long power = SCHED_LOAD_SCALE;
3723 struct sched_group *sdg = sd->groups;
3724
3725 if (sched_feat(ARCH_POWER))
3726 power *= arch_scale_freq_power(sd, cpu);
3727 else
3728 power *= default_scale_freq_power(sd, cpu);
3729
3730 power >>= SCHED_LOAD_SHIFT;
3731
3732 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3733 if (sched_feat(ARCH_POWER))
3734 power *= arch_scale_smt_power(sd, cpu);
3735 else
3736 power *= default_scale_smt_power(sd, cpu);
3737
3738 power >>= SCHED_LOAD_SHIFT;
3739 }
3740
3741 power *= scale_rt_power(cpu);
3742 power >>= SCHED_LOAD_SHIFT;
3743
3744 if (!power)
3745 power = 1;
3746
3747 sdg->cpu_power = power;
3748}
3749
3750static void update_group_power(struct sched_domain *sd, int cpu)
3751{
3752 struct sched_domain *child = sd->child;
3753 struct sched_group *group, *sdg = sd->groups;
3754 unsigned long power;
3755
3756 if (!child) {
3757 update_cpu_power(sd, cpu);
3758 return;
3759 }
3760
3761 power = 0;
3762
3763 group = child->groups;
3764 do {
3765 power += group->cpu_power;
3766 group = group->next;
3767 } while (group != child->groups);
3768
3769 sdg->cpu_power = power;
3770}
3771
3772/**
3773 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3774 * @sd: The sched_domain whose statistics are to be updated.
3775 * @group: sched_group whose statistics are to be updated.
3776 * @this_cpu: Cpu for which load balance is currently performed.
3777 * @idle: Idle status of this_cpu
3778 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3779 * @sd_idle: Idle status of the sched_domain containing group.
3780 * @local_group: Does group contain this_cpu.
3781 * @cpus: Set of cpus considered for load balancing.
3782 * @balance: Should we balance.
3783 * @sgs: variable to hold the statistics for this group.
3784 */
3785static inline void update_sg_lb_stats(struct sched_domain *sd,
3786 struct sched_group *group, int this_cpu,
3787 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3788 int local_group, const struct cpumask *cpus,
3789 int *balance, struct sg_lb_stats *sgs)
3790{
3791 unsigned long load, max_cpu_load, min_cpu_load;
3792 int i;
3793 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3794 unsigned long sum_avg_load_per_task;
3795 unsigned long avg_load_per_task;
3796
3797 if (local_group) {
3798 balance_cpu = group_first_cpu(group);
3799 if (balance_cpu == this_cpu)
3800 update_group_power(sd, this_cpu);
3801 }
3802
3803 /* Tally up the load of all CPUs in the group */
3804 sum_avg_load_per_task = avg_load_per_task = 0;
3805 max_cpu_load = 0;
3806 min_cpu_load = ~0UL;
3807
3808 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3809 struct rq *rq = cpu_rq(i);
3810
3811 if (*sd_idle && rq->nr_running)
3812 *sd_idle = 0;
3813
3814 /* Bias balancing toward cpus of our domain */
3815 if (local_group) {
3816 if (idle_cpu(i) && !first_idle_cpu) {
3817 first_idle_cpu = 1;
3818 balance_cpu = i;
3819 }
3820
3821 load = target_load(i, load_idx);
3822 } else {
3823 load = source_load(i, load_idx);
3824 if (load > max_cpu_load)
3825 max_cpu_load = load;
3826 if (min_cpu_load > load)
3827 min_cpu_load = load;
3828 }
3829
3830 sgs->group_load += load;
3831 sgs->sum_nr_running += rq->nr_running;
3832 sgs->sum_weighted_load += weighted_cpuload(i);
3833
3834 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3835 }
3836
3837 /*
3838 * First idle cpu or the first cpu(busiest) in this sched group
3839 * is eligible for doing load balancing at this and above
3840 * domains. In the newly idle case, we will allow all the cpu's
3841 * to do the newly idle load balance.
3842 */
3843 if (idle != CPU_NEWLY_IDLE && local_group &&
3844 balance_cpu != this_cpu && balance) {
3845 *balance = 0;
3846 return;
3847 }
3848
3849 /* Adjust by relative CPU power of the group */
3850 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3851
3852
3853 /*
3854 * Consider the group unbalanced when the imbalance is larger
3855 * than the average weight of two tasks.
3856 *
3857 * APZ: with cgroup the avg task weight can vary wildly and
3858 * might not be a suitable number - should we keep a
3859 * normalized nr_running number somewhere that negates
3860 * the hierarchy?
3861 */
3862 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3863 group->cpu_power;
3864
3865 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3866 sgs->group_imb = 1;
3867
3868 sgs->group_capacity =
3869 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3870}
3871
3872/**
3873 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
3874 * @sd: sched_domain whose statistics are to be updated.
3875 * @this_cpu: Cpu for which load balance is currently performed.
3876 * @idle: Idle status of this_cpu
3877 * @sd_idle: Idle status of the sched_domain containing group.
3878 * @cpus: Set of cpus considered for load balancing.
3879 * @balance: Should we balance.
3880 * @sds: variable to hold the statistics for this sched_domain.
3881 */
3882static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3883 enum cpu_idle_type idle, int *sd_idle,
3884 const struct cpumask *cpus, int *balance,
3885 struct sd_lb_stats *sds)
3886{
3887 struct sched_domain *child = sd->child;
3888 struct sched_group *group = sd->groups;
3889 struct sg_lb_stats sgs;
3890 int load_idx, prefer_sibling = 0;
3891
3892 if (child && child->flags & SD_PREFER_SIBLING)
3893 prefer_sibling = 1;
3894
3895 init_sd_power_savings_stats(sd, sds, idle);
3896 load_idx = get_sd_load_idx(sd, idle);
3897
3898 do {
3899 int local_group;
3900
3901 local_group = cpumask_test_cpu(this_cpu,
3902 sched_group_cpus(group));
3903 memset(&sgs, 0, sizeof(sgs));
3904 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3905 local_group, cpus, balance, &sgs);
3906
3907 if (local_group && balance && !(*balance))
3908 return;
3909
3910 sds->total_load += sgs.group_load;
3911 sds->total_pwr += group->cpu_power;
3912
3913 /*
3914 * In case the child domain prefers tasks go to siblings
3915 * first, lower the group capacity to one so that we'll try
3916 * and move all the excess tasks away.
3917 */
3918 if (prefer_sibling)
3919 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3920
3921 if (local_group) {
3922 sds->this_load = sgs.avg_load;
3923 sds->this = group;
3924 sds->this_nr_running = sgs.sum_nr_running;
3925 sds->this_load_per_task = sgs.sum_weighted_load;
3926 } else if (sgs.avg_load > sds->max_load &&
3927 (sgs.sum_nr_running > sgs.group_capacity ||
3928 sgs.group_imb)) {
3929 sds->max_load = sgs.avg_load;
3930 sds->busiest = group;
3931 sds->busiest_nr_running = sgs.sum_nr_running;
3932 sds->busiest_load_per_task = sgs.sum_weighted_load;
3933 sds->group_imb = sgs.group_imb;
3934 }
3935
3936 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3937 group = group->next;
3938 } while (group != sd->groups);
3939}
3940
3941/**
3942 * fix_small_imbalance - Calculate the minor imbalance that exists
3943 * amongst the groups of a sched_domain, during
3944 * load balancing.
3945 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3946 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3947 * @imbalance: Variable to store the imbalance.
3948 */
3949static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3950 int this_cpu, unsigned long *imbalance)
3951{
3952 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3953 unsigned int imbn = 2;
3954
3955 if (sds->this_nr_running) {
3956 sds->this_load_per_task /= sds->this_nr_running;
3957 if (sds->busiest_load_per_task >
3958 sds->this_load_per_task)
3959 imbn = 1;
3960 } else
3961 sds->this_load_per_task =
3962 cpu_avg_load_per_task(this_cpu);
3963
3964 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3965 sds->busiest_load_per_task * imbn) {
3966 *imbalance = sds->busiest_load_per_task;
3967 return;
3968 }
3969
3970 /*
3971 * OK, we don't have enough imbalance to justify moving tasks,
3972 * however we may be able to increase total CPU power used by
3973 * moving them.
3974 */
3975
3976 pwr_now += sds->busiest->cpu_power *
3977 min(sds->busiest_load_per_task, sds->max_load);
3978 pwr_now += sds->this->cpu_power *
3979 min(sds->this_load_per_task, sds->this_load);
3980 pwr_now /= SCHED_LOAD_SCALE;
3981
3982 /* Amount of load we'd subtract */
3983 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3984 sds->busiest->cpu_power;
3985 if (sds->max_load > tmp)
3986 pwr_move += sds->busiest->cpu_power *
3987 min(sds->busiest_load_per_task, sds->max_load - tmp);
3988
3989 /* Amount of load we'd add */
3990 if (sds->max_load * sds->busiest->cpu_power <
3991 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3992 tmp = (sds->max_load * sds->busiest->cpu_power) /
3993 sds->this->cpu_power;
3994 else
3995 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3996 sds->this->cpu_power;
3997 pwr_move += sds->this->cpu_power *
3998 min(sds->this_load_per_task, sds->this_load + tmp);
3999 pwr_move /= SCHED_LOAD_SCALE;
4000
4001 /* Move if we gain throughput */
4002 if (pwr_move > pwr_now)
4003 *imbalance = sds->busiest_load_per_task;
4004}
4005
4006/**
4007 * calculate_imbalance - Calculate the amount of imbalance present within the
4008 * groups of a given sched_domain during load balance.
4009 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
4010 * @this_cpu: Cpu for which currently load balance is being performed.
4011 * @imbalance: The variable to store the imbalance.
4012 */
4013static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4014 unsigned long *imbalance)
4015{
4016 unsigned long max_pull;
4017 /*
4018 * In the presence of smp nice balancing, certain scenarios can have
4019 * max load less than avg load(as we skip the groups at or below
4020 * its cpu_power, while calculating max_load..)
4021 */
4022 if (sds->max_load < sds->avg_load) {
4023 *imbalance = 0;
4024 return fix_small_imbalance(sds, this_cpu, imbalance);
4025 }
4026
4027 /* Don't want to pull so many tasks that a group would go idle */
4028 max_pull = min(sds->max_load - sds->avg_load,
4029 sds->max_load - sds->busiest_load_per_task);
4030
4031 /* How much load to actually move to equalise the imbalance */
4032 *imbalance = min(max_pull * sds->busiest->cpu_power,
4033 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
4034 / SCHED_LOAD_SCALE;
4035
4036 /*
4037 * if *imbalance is less than the average load per runnable task
4038 * there is no gaurantee that any tasks will be moved so we'll have
4039 * a think about bumping its value to force at least one task to be
4040 * moved
4041 */
4042 if (*imbalance < sds->busiest_load_per_task)
4043 return fix_small_imbalance(sds, this_cpu, imbalance);
4044
4045}
4046/******* find_busiest_group() helpers end here *********************/
4047
4048/**
4049 * find_busiest_group - Returns the busiest group within the sched_domain
4050 * if there is an imbalance. If there isn't an imbalance, and
4051 * the user has opted for power-savings, it returns a group whose
4052 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
4053 * such a group exists.
4054 *
4055 * Also calculates the amount of weighted load which should be moved
4056 * to restore balance.
4057 *
4058 * @sd: The sched_domain whose busiest group is to be returned.
4059 * @this_cpu: The cpu for which load balancing is currently being performed.
4060 * @imbalance: Variable which stores amount of weighted load which should
4061 * be moved to restore balance/put a group to idle.
4062 * @idle: The idle status of this_cpu.
4063 * @sd_idle: The idleness of sd
4064 * @cpus: The set of CPUs under consideration for load-balancing.
4065 * @balance: Pointer to a variable indicating if this_cpu
4066 * is the appropriate cpu to perform load balancing at this_level.
4067 *
4068 * Returns: - the busiest group if imbalance exists.
4069 * - If no imbalance and user has opted for power-savings balance,
4070 * return the least loaded group whose CPUs can be
4071 * put to idle by rebalancing its tasks onto our group.
4072 */
4073static struct sched_group *
4074find_busiest_group(struct sched_domain *sd, int this_cpu,
4075 unsigned long *imbalance, enum cpu_idle_type idle,
4076 int *sd_idle, const struct cpumask *cpus, int *balance)
4077{
4078 struct sd_lb_stats sds;
4079
4080 memset(&sds, 0, sizeof(sds));
4081
4082 /*
4083 * Compute the various statistics relavent for load balancing at
4084 * this level.
4085 */
4086 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
4087 balance, &sds);
4088
4089 /* Cases where imbalance does not exist from POV of this_cpu */
4090 /* 1) this_cpu is not the appropriate cpu to perform load balancing
4091 * at this level.
4092 * 2) There is no busy sibling group to pull from.
4093 * 3) This group is the busiest group.
4094 * 4) This group is more busy than the avg busieness at this
4095 * sched_domain.
4096 * 5) The imbalance is within the specified limit.
4097 * 6) Any rebalance would lead to ping-pong
4098 */
4099 if (balance && !(*balance))
4100 goto ret;
4101
4102 if (!sds.busiest || sds.busiest_nr_running == 0)
4103 goto out_balanced;
4104
4105 if (sds.this_load >= sds.max_load)
4106 goto out_balanced;
4107
4108 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
4109
4110 if (sds.this_load >= sds.avg_load)
4111 goto out_balanced;
4112
4113 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
4114 goto out_balanced;
4115
4116 sds.busiest_load_per_task /= sds.busiest_nr_running;
4117 if (sds.group_imb)
4118 sds.busiest_load_per_task =
4119 min(sds.busiest_load_per_task, sds.avg_load);
4120
4121 /*
4122 * We're trying to get all the cpus to the average_load, so we don't
4123 * want to push ourselves above the average load, nor do we wish to
4124 * reduce the max loaded cpu below the average load, as either of these
4125 * actions would just result in more rebalancing later, and ping-pong
4126 * tasks around. Thus we look for the minimum possible imbalance.
4127 * Negative imbalances (*we* are more loaded than anyone else) will
4128 * be counted as no imbalance for these purposes -- we can't fix that
4129 * by pulling tasks to us. Be careful of negative numbers as they'll
4130 * appear as very large values with unsigned longs.
4131 */
4132 if (sds.max_load <= sds.busiest_load_per_task)
4133 goto out_balanced;
4134
4135 /* Looks like there is an imbalance. Compute it */
4136 calculate_imbalance(&sds, this_cpu, imbalance);
4137 return sds.busiest;
4138
4139out_balanced:
4140 /*
4141 * There is no obvious imbalance. But check if we can do some balancing
4142 * to save power.
4143 */
4144 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4145 return sds.busiest;
4146ret:
4147 *imbalance = 0;
4148 return NULL;
4149}
4150
4151/*
4152 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4153 */
4154static struct rq *
4155find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
4156 unsigned long imbalance, const struct cpumask *cpus)
4157{
4158 struct rq *busiest = NULL, *rq;
4159 unsigned long max_load = 0;
4160 int i;
4161
4162 for_each_cpu(i, sched_group_cpus(group)) {
4163 unsigned long power = power_of(i);
4164 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
4165 unsigned long wl;
4166
4167 if (!cpumask_test_cpu(i, cpus))
4168 continue;
4169
4170 rq = cpu_rq(i);
4171 wl = weighted_cpuload(i);
4172
4173 /*
4174 * When comparing with imbalance, use weighted_cpuload()
4175 * which is not scaled with the cpu power.
4176 */
4177 if (capacity && rq->nr_running == 1 && wl > imbalance)
4178 continue;
4179
4180 /*
4181 * For the load comparisons with the other cpu's, consider
4182 * the weighted_cpuload() scaled with the cpu power, so that
4183 * the load can be moved away from the cpu that is potentially
4184 * running at a lower capacity.
4185 */
4186 wl = (wl * SCHED_LOAD_SCALE) / power;
4187
4188 if (wl > max_load) {
4189 max_load = wl;
4190 busiest = rq;
4191 }
4192 }
4193
4194 return busiest;
4195}
4196
4197/*
4198 * Maximum backoff applied when pinned tasks are encountered. The value is
4199 * fairly arbitrary; it only has to be large enough.
4200 */
4201#define MAX_PINNED_INTERVAL 512
4202
4203/* Per-cpu scratch cpumask used by load_balance and load_balance_newidle. */
4204static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4205
4206/*
4207 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4208 * tasks if there is an imbalance.
4209 */
4210static int load_balance(int this_cpu, struct rq *this_rq,
4211 struct sched_domain *sd, enum cpu_idle_type idle,
4212 int *balance)
4213{
4214 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
4215 struct sched_group *group;
4216 unsigned long imbalance;
4217 struct rq *busiest;
4218 unsigned long flags;
4219 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4220
4221 cpumask_copy(cpus, cpu_active_mask);
4222
4223 /*
4224 * When power savings policy is enabled for the parent domain, idle
4225 * sibling can pick up load irrespective of busy siblings. In this case,
4226 * let the state of idle sibling percolate up as CPU_IDLE, instead of
4227 * portraying it as CPU_NOT_IDLE.
4228 */
4229 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
4230 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4231 sd_idle = 1;
4232
4233 schedstat_inc(sd, lb_count[idle]);
4234
4235redo:
4236 update_shares(sd);
4237 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
4238 cpus, balance);
4239
4240 if (*balance == 0)
4241 goto out_balanced;
4242
4243 if (!group) {
4244 schedstat_inc(sd, lb_nobusyg[idle]);
4245 goto out_balanced;
4246 }
4247
4248 busiest = find_busiest_queue(group, idle, imbalance, cpus);
4249 if (!busiest) {
4250 schedstat_inc(sd, lb_nobusyq[idle]);
4251 goto out_balanced;
4252 }
4253
4254 BUG_ON(busiest == this_rq);
4255
4256 schedstat_add(sd, lb_imbalance[idle], imbalance);
4257
4258 ld_moved = 0;
4259 if (busiest->nr_running > 1) {
4260 /*
4261 * Attempt to move tasks. If find_busiest_group has found
4262 * an imbalance but busiest->nr_running <= 1, the group is
4263 * still unbalanced. ld_moved simply stays zero, so it is
4264 * correctly treated as an imbalance.
4265 */
4266 local_irq_save(flags);
4267 double_rq_lock(this_rq, busiest);
4268 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4269 imbalance, sd, idle, &all_pinned);
4270 double_rq_unlock(this_rq, busiest);
4271 local_irq_restore(flags);
4272
4273 /*
4274 * some other cpu did the load balance for us.
4275 */
4276 if (ld_moved && this_cpu != smp_processor_id())
4277 resched_cpu(this_cpu);
4278
4279 /* All tasks on this runqueue were pinned by CPU affinity */
4280 if (unlikely(all_pinned)) {
4281 cpumask_clear_cpu(cpu_of(busiest), cpus);
4282 if (!cpumask_empty(cpus))
4283 goto redo;
4284 goto out_balanced;
4285 }
4286 }
4287
4288 if (!ld_moved) {
4289 schedstat_inc(sd, lb_failed[idle]);
4290 sd->nr_balance_failed++;
4291
4292 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4293
4294 raw_spin_lock_irqsave(&busiest->lock, flags);
4295
4296 /* don't kick the migration_thread, if the curr
4297 * task on busiest cpu can't be moved to this_cpu
4298 */
4299 if (!cpumask_test_cpu(this_cpu,
4300 &busiest->curr->cpus_allowed)) {
4301 raw_spin_unlock_irqrestore(&busiest->lock,
4302 flags);
4303 all_pinned = 1;
4304 goto out_one_pinned;
4305 }
4306
4307 if (!busiest->active_balance) {
4308 busiest->active_balance = 1;
4309 busiest->push_cpu = this_cpu;
4310 active_balance = 1;
4311 }
4312 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4313 if (active_balance)
4314 wake_up_process(busiest->migration_thread);
4315
4316 /*
4317 * We've kicked active balancing, reset the failure
4318 * counter.
4319 */
4320 sd->nr_balance_failed = sd->cache_nice_tries+1;
4321 }
4322 } else
4323 sd->nr_balance_failed = 0;
4324
4325 if (likely(!active_balance)) {
4326 /* We were unbalanced, so reset the balancing interval */
4327 sd->balance_interval = sd->min_interval;
4328 } else {
4329 /*
4330 * If we've begun active balancing, start to back off. This
4331 * case may not be covered by the all_pinned logic if there
4332 * is only 1 task on the busy runqueue (because we don't call
4333 * move_tasks).
4334 */
4335 if (sd->balance_interval < sd->max_interval)
4336 sd->balance_interval *= 2;
4337 }
4338
4339 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4340 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4341 ld_moved = -1;
4342
4343 goto out;
4344
4345out_balanced:
4346 schedstat_inc(sd, lb_balanced[idle]);
4347
4348 sd->nr_balance_failed = 0;
4349
4350out_one_pinned:
4351 /* tune up the balancing interval */
4352 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4353 (sd->balance_interval < sd->max_interval))
4354 sd->balance_interval *= 2;
4355
4356 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4357 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4358 ld_moved = -1;
4359 else
4360 ld_moved = 0;
4361out:
4362 if (ld_moved)
4363 update_shares(sd);
4364 return ld_moved;
4365}
4366
4367/*
4368 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4369 * tasks if there is an imbalance.
4370 *
4371 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4372 * this_rq is locked.
4373 */
4374static int
4375load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4376{
4377 struct sched_group *group;
4378 struct rq *busiest = NULL;
4379 unsigned long imbalance;
4380 int ld_moved = 0;
4381 int sd_idle = 0;
4382 int all_pinned = 0;
4383 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4384
4385 cpumask_copy(cpus, cpu_active_mask);
4386
4387 /*
4388 * When power savings policy is enabled for the parent domain, idle
4389 * sibling can pick up load irrespective of busy siblings. In this case,
4390 * let the state of idle sibling percolate up as IDLE, instead of
4391 * portraying it as CPU_NOT_IDLE.
4392 */
4393 if (sd->flags & SD_SHARE_CPUPOWER &&
4394 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4395 sd_idle = 1;
4396
4397 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4398redo:
4399 update_shares_locked(this_rq, sd);
4400 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4401 &sd_idle, cpus, NULL);
4402 if (!group) {
4403 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4404 goto out_balanced;
4405 }
4406
4407 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4408 if (!busiest) {
4409 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4410 goto out_balanced;
4411 }
4412
4413 BUG_ON(busiest == this_rq);
4414
4415 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4416
4417 ld_moved = 0;
4418 if (busiest->nr_running > 1) {
4419 /* Attempt to move tasks */
4420 double_lock_balance(this_rq, busiest);
4421 /* this_rq->clock is already updated */
4422 update_rq_clock(busiest);
4423 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4424 imbalance, sd, CPU_NEWLY_IDLE,
4425 &all_pinned);
4426 double_unlock_balance(this_rq, busiest);
4427
4428 if (unlikely(all_pinned)) {
4429 cpumask_clear_cpu(cpu_of(busiest), cpus);
4430 if (!cpumask_empty(cpus))
4431 goto redo;
4432 }
4433 }
4434
4435 if (!ld_moved) {
4436 int active_balance = 0;
4437
4438 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4439 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4440 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4441 return -1;
4442
4443 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4444 return -1;
4445
4446 if (sd->nr_balance_failed++ < 2)
4447 return -1;
4448
4449 /*
4450 * The only task running in a non-idle cpu can be moved to this
4451 * cpu in an attempt to completely freeup the other CPU
4452 * package. The same method used to move task in load_balance()
4453 * have been extended for load_balance_newidle() to speedup
4454 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
4455 *
4456 * The package power saving logic comes from
4457 * find_busiest_group(). If there are no imbalance, then
4458 * f_b_g() will return NULL. However when sched_mc={1,2} then
4459 * f_b_g() will select a group from which a running task may be
4460 * pulled to this cpu in order to make the other package idle.
4461 * If there is no opportunity to make a package idle and if
4462 * there are no imbalance, then f_b_g() will return NULL and no
4463 * action will be taken in load_balance_newidle().
4464 *
4465 * Under normal task pull operation due to imbalance, there
4466 * will be more than one task in the source run queue and
4467 * move_tasks() will succeed. ld_moved will be true and this
4468 * active balance code will not be triggered.
4469 */
4470
4471 /* Lock busiest in correct order while this_rq is held */
4472 double_lock_balance(this_rq, busiest);
4473
4474 /*
4475 * don't kick the migration_thread, if the curr
4476 * task on busiest cpu can't be moved to this_cpu
4477 */
4478 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4479 double_unlock_balance(this_rq, busiest);
4480 all_pinned = 1;
4481 return ld_moved;
4482 }
4483
4484 if (!busiest->active_balance) {
4485 busiest->active_balance = 1;
4486 busiest->push_cpu = this_cpu;
4487 active_balance = 1;
4488 }
4489
4490 double_unlock_balance(this_rq, busiest);
4491 /*
4492 * Should not call ttwu while holding a rq->lock
4493 */
4494 raw_spin_unlock(&this_rq->lock);
4495 if (active_balance)
4496 wake_up_process(busiest->migration_thread);
4497 raw_spin_lock(&this_rq->lock);
4498
4499 } else
4500 sd->nr_balance_failed = 0;
4501
4502 update_shares_locked(this_rq, sd);
4503 return ld_moved;
4504
4505out_balanced:
4506 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4507 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4508 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4509 return -1;
4510 sd->nr_balance_failed = 0;
4511
4512 return 0;
4513}
4514
4515/*
4516 * idle_balance is called by schedule() if this_cpu is about to become
4517 * idle. Attempts to pull tasks from other CPUs.
4518 */
4519static void idle_balance(int this_cpu, struct rq *this_rq)
4520{
4521 struct sched_domain *sd;
4522 int pulled_task = 0;
4523 unsigned long next_balance = jiffies + HZ;
4524
4525 this_rq->idle_stamp = this_rq->clock;
4526
4527 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4528 return;
4529
4530 for_each_domain(this_cpu, sd) {
4531 unsigned long interval;
4532
4533 if (!(sd->flags & SD_LOAD_BALANCE))
4534 continue;
4535
4536 if (sd->flags & SD_BALANCE_NEWIDLE)
4537 /* If we've pulled tasks over stop searching: */
4538 pulled_task = load_balance_newidle(this_cpu, this_rq,
4539 sd);
4540
4541 interval = msecs_to_jiffies(sd->balance_interval);
4542 if (time_after(next_balance, sd->last_balance + interval))
4543 next_balance = sd->last_balance + interval;
4544 if (pulled_task) {
4545 this_rq->idle_stamp = 0;
4546 break;
4547 }
4548 }
4549 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4550 /*
4551 * We are going idle. next_balance may be set based on
4552 * a busy processor. So reset next_balance.
4553 */
4554 this_rq->next_balance = next_balance;
4555 }
4556}
4557
4558/*
4559 * active_load_balance is run by migration threads. It pushes running tasks
4560 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4561 * running on each physical CPU where possible, and avoids physical /
4562 * logical imbalances.
4563 *
4564 * Called with busiest_rq locked.
4565 */
4566static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4567{
4568 int target_cpu = busiest_rq->push_cpu;
4569 struct sched_domain *sd;
4570 struct rq *target_rq;
4571
4572 /* Is there any task to move? */
4573 if (busiest_rq->nr_running <= 1)
4574 return;
4575
4576 target_rq = cpu_rq(target_cpu);
4577
4578 /*
4579 * This condition is "impossible", if it occurs
4580 * we need to fix it. Originally reported by
4581 * Bjorn Helgaas on a 128-cpu setup.
4582 */
4583 BUG_ON(busiest_rq == target_rq);
4584
4585 /* move a task from busiest_rq to target_rq */
4586 double_lock_balance(busiest_rq, target_rq);
4587 update_rq_clock(busiest_rq);
4588 update_rq_clock(target_rq);
4589
4590 /* Search for an sd spanning us and the target CPU. */
4591 for_each_domain(target_cpu, sd) {
4592 if ((sd->flags & SD_LOAD_BALANCE) &&
4593 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4594 break;
4595 }
4596
4597 if (likely(sd)) {
4598 schedstat_inc(sd, alb_count);
4599
4600 if (move_one_task(target_rq, target_cpu, busiest_rq,
4601 sd, CPU_IDLE))
4602 schedstat_inc(sd, alb_pushed);
4603 else
4604 schedstat_inc(sd, alb_failed);
4605 }
4606 double_unlock_balance(busiest_rq, target_rq);
4607}
4608
4609#ifdef CONFIG_NO_HZ
4610static struct {
4611 atomic_t load_balancer;
4612 cpumask_var_t cpu_mask;
4613 cpumask_var_t ilb_grp_nohz_mask;
4614} nohz ____cacheline_aligned = {
4615 .load_balancer = ATOMIC_INIT(-1),
4616};
4617
4618int get_nohz_load_balancer(void)
4619{
4620 return atomic_read(&nohz.load_balancer);
4621}
4622
4623#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4624/**
4625 * lowest_flag_domain - Return lowest sched_domain containing flag.
4626 * @cpu: The cpu whose lowest level of sched domain is to
4627 * be returned.
4628 * @flag: The flag to check for the lowest sched_domain
4629 * for the given cpu.
4630 *
4631 * Returns the lowest sched_domain of a cpu which contains the given flag.
4632 */
4633static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4634{
4635 struct sched_domain *sd;
4636
4637 for_each_domain(cpu, sd)
4638 if (sd && (sd->flags & flag))
4639 break;
4640
4641 return sd;
4642}
4643
/**
 * for_each_flag_domain - Iterates over sched_domains containing the flag.
 * @cpu:	The cpu whose domains we're iterating over.
 * @sd:		Iterator variable; holds the current sched_domain of @cpu.
 * @flag:	The flag to filter the sched_domains to be iterated.
 *
 * Starts at lowest_flag_domain(@cpu, @flag) and follows ->parent upwards,
 * stopping at the first domain that does not have @flag set (or at the
 * top of the hierarchy).
 *
 * The macro arguments are parenthesized in the expansion so that an
 * expression argument (e.g. a combination of flags) is evaluated as a unit.
 */
#define for_each_flag_domain(cpu, sd, flag) \
	for ((sd) = lowest_flag_domain((cpu), (flag)); \
	     ((sd) && ((sd)->flags & (flag))); (sd) = (sd)->parent)
4657
4658/**
4659 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4660 * @ilb_group: group to be checked for semi-idleness
4661 *
4662 * Returns: 1 if the group is semi-idle. 0 otherwise.
4663 *
4664 * We define a sched_group to be semi idle if it has atleast one idle-CPU
4665 * and atleast one non-idle CPU. This helper function checks if the given
4666 * sched_group is semi-idle or not.
4667 */
4668static inline int is_semi_idle_group(struct sched_group *ilb_group)
4669{
4670 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4671 sched_group_cpus(ilb_group));
4672
4673 /*
4674 * A sched_group is semi-idle when it has atleast one busy cpu
4675 * and atleast one idle cpu.
4676 */
4677 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4678 return 0;
4679
4680 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4681 return 0;
4682
4683 return 1;
4684}
4685/**
4686 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4687 * @cpu: The cpu which is nominating a new idle_load_balancer.
4688 *
4689 * Returns: Returns the id of the idle load balancer if it exists,
4690 * Else, returns >= nr_cpu_ids.
4691 *
4692 * This algorithm picks the idle load balancer such that it belongs to a
4693 * semi-idle powersavings sched_domain. The idea is to try and avoid
4694 * completely idle packages/cores just for the purpose of idle load balancing
4695 * when there are other idle cpu's which are better suited for that job.
4696 */
4697static int find_new_ilb(int cpu)
4698{
4699 struct sched_domain *sd;
4700 struct sched_group *ilb_group;
4701
4702 /*
4703 * Have idle load balancer selection from semi-idle packages only
4704 * when power-aware load balancing is enabled
4705 */
4706 if (!(sched_smt_power_savings || sched_mc_power_savings))
4707 goto out_done;
4708
4709 /*
4710 * Optimize for the case when we have no idle CPUs or only one
4711 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4712 */
4713 if (cpumask_weight(nohz.cpu_mask) < 2)
4714 goto out_done;
4715
4716 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4717 ilb_group = sd->groups;
4718
4719 do {
4720 if (is_semi_idle_group(ilb_group))
4721 return cpumask_first(nohz.ilb_grp_nohz_mask);
4722
4723 ilb_group = ilb_group->next;
4724
4725 } while (ilb_group != sd->groups);
4726 }
4727
4728out_done:
4729 return cpumask_first(nohz.cpu_mask);
4730}
4731#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4732static inline int find_new_ilb(int call_cpu)
4733{
4734 return cpumask_first(nohz.cpu_mask);
4735}
4736#endif
4737
4738/*
4739 * This routine will try to nominate the ilb (idle load balancing)
4740 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
4741 * load balancing on behalf of all those cpus. If all the cpus in the system
4742 * go into this tickless mode, then there will be no ilb owner (as there is
4743 * no need for one) and all the cpus will sleep till the next wakeup event
4744 * arrives...
4745 *
4746 * For the ilb owner, tick is not stopped. And this tick will be used
4747 * for idle load balancing. ilb owner will still be part of
4748 * nohz.cpu_mask..
4749 *
4750 * While stopping the tick, this cpu will become the ilb owner if there
4751 * is no other owner. And will be the owner till that cpu becomes busy
4752 * or if all cpus in the system stop their ticks at which point
4753 * there is no need for ilb owner.
4754 *
4755 * When the ilb owner becomes busy, it nominates another owner, during the
4756 * next busy scheduler_tick()
4757 */
4758int select_nohz_load_balancer(int stop_tick)
4759{
4760 int cpu = smp_processor_id();
4761
4762 if (stop_tick) {
4763 cpu_rq(cpu)->in_nohz_recently = 1;
4764
4765 if (!cpu_active(cpu)) {
4766 if (atomic_read(&nohz.load_balancer) != cpu)
4767 return 0;
4768
4769 /*
4770 * If we are going offline and still the leader,
4771 * give up!
4772 */
4773 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4774 BUG();
4775
4776 return 0;
4777 }
4778
4779 cpumask_set_cpu(cpu, nohz.cpu_mask);
4780
4781 /* time for ilb owner also to sleep */
4782 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4783 if (atomic_read(&nohz.load_balancer) == cpu)
4784 atomic_set(&nohz.load_balancer, -1);
4785 return 0;
4786 }
4787
4788 if (atomic_read(&nohz.load_balancer) == -1) {
4789 /* make me the ilb owner */
4790 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4791 return 1;
4792 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4793 int new_ilb;
4794
4795 if (!(sched_smt_power_savings ||
4796 sched_mc_power_savings))
4797 return 1;
4798 /*
4799 * Check to see if there is a more power-efficient
4800 * ilb.
4801 */
4802 new_ilb = find_new_ilb(cpu);
4803 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4804 atomic_set(&nohz.load_balancer, -1);
4805 resched_cpu(new_ilb);
4806 return 0;
4807 }
4808 return 1;
4809 }
4810 } else {
4811 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4812 return 0;
4813
4814 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4815
4816 if (atomic_read(&nohz.load_balancer) == cpu)
4817 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4818 BUG();
4819 }
4820 return 0;
4821}
4822#endif
4823
4824static DEFINE_SPINLOCK(balancing);
4825
4826/*
4827 * It checks each scheduling domain to see if it is due to be balanced,
4828 * and initiates a balancing operation if so.
4829 *
4830 * Balancing parameters are set up in arch_init_sched_domains.
4831 */
4832static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4833{
4834 int balance = 1;
4835 struct rq *rq = cpu_rq(cpu);
4836 unsigned long interval;
4837 struct sched_domain *sd;
4838 /* Earliest time when we have to do rebalance again */
4839 unsigned long next_balance = jiffies + 60*HZ;
4840 int update_next_balance = 0;
4841 int need_serialize;
4842
4843 for_each_domain(cpu, sd) {
4844 if (!(sd->flags & SD_LOAD_BALANCE))
4845 continue;
4846
4847 interval = sd->balance_interval;
4848 if (idle != CPU_IDLE)
4849 interval *= sd->busy_factor;
4850
4851 /* scale ms to jiffies */
4852 interval = msecs_to_jiffies(interval);
4853 if (unlikely(!interval))
4854 interval = 1;
4855 if (interval > HZ*NR_CPUS/10)
4856 interval = HZ*NR_CPUS/10;
4857
4858 need_serialize = sd->flags & SD_SERIALIZE;
4859
4860 if (need_serialize) {
4861 if (!spin_trylock(&balancing))
4862 goto out;
4863 }
4864
4865 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4866 if (load_balance(cpu, rq, sd, idle, &balance)) {
4867 /*
4868 * We've pulled tasks over so either we're no
4869 * longer idle, or one of our SMT siblings is
4870 * not idle.
4871 */
4872 idle = CPU_NOT_IDLE;
4873 }
4874 sd->last_balance = jiffies;
4875 }
4876 if (need_serialize)
4877 spin_unlock(&balancing);
4878out:
4879 if (time_after(next_balance, sd->last_balance + interval)) {
4880 next_balance = sd->last_balance + interval;
4881 update_next_balance = 1;
4882 }
4883
4884 /*
4885 * Stop the load balance at this level. There is another
4886 * CPU in our sched group which is doing load balancing more
4887 * actively.
4888 */
4889 if (!balance)
4890 break;
4891 }
4892
4893 /*
4894 * next_balance will be updated only when there is a need.
4895 * When the cpu is attached to null domain for ex, it will not be
4896 * updated.
4897 */
4898 if (likely(update_next_balance))
4899 rq->next_balance = next_balance;
4900}
4901
4902/*
4903 * run_rebalance_domains is triggered when needed from the scheduler tick.
4904 * In CONFIG_NO_HZ case, the idle load balance owner will do the
4905 * rebalancing for all the cpus for whom scheduler ticks are stopped.
4906 */
4907static void run_rebalance_domains(struct softirq_action *h)
4908{
4909 int this_cpu = smp_processor_id();
4910 struct rq *this_rq = cpu_rq(this_cpu);
4911 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4912 CPU_IDLE : CPU_NOT_IDLE;
4913
4914 rebalance_domains(this_cpu, idle);
4915
4916#ifdef CONFIG_NO_HZ
4917 /*
4918 * If this cpu is the owner for idle load balancing, then do the
4919 * balancing on behalf of the other idle cpus whose ticks are
4920 * stopped.
4921 */
4922 if (this_rq->idle_at_tick &&
4923 atomic_read(&nohz.load_balancer) == this_cpu) {
4924 struct rq *rq;
4925 int balance_cpu;
4926
4927 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4928 if (balance_cpu == this_cpu)
4929 continue;
4930
4931 /*
4932 * If this cpu gets work to do, stop the load balancing
4933 * work being done for other cpus. Next load
4934 * balancing owner will pick it up.
4935 */
4936 if (need_resched())
4937 break;
4938
4939 rebalance_domains(balance_cpu, CPU_IDLE);
4940
4941 rq = cpu_rq(balance_cpu);
4942 if (time_after(this_rq->next_balance, rq->next_balance))
4943 this_rq->next_balance = rq->next_balance;
4944 }
4945 }
4946#endif
4947}
4948
4949static inline int on_null_domain(int cpu)
4950{
4951 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
4952}
4953
4954/*
4955 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4956 *
4957 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
4958 * idle load balancing owner or decide to stop the periodic load balancing,
4959 * if the whole system is idle.
4960 */
4961static inline void trigger_load_balance(struct rq *rq, int cpu)
4962{
4963#ifdef CONFIG_NO_HZ
4964 /*
4965 * If we were in the nohz mode recently and busy at the current
4966 * scheduler tick, then check if we need to nominate new idle
4967 * load balancer.
4968 */
4969 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4970 rq->in_nohz_recently = 0;
4971
4972 if (atomic_read(&nohz.load_balancer) == cpu) {
4973 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4974 atomic_set(&nohz.load_balancer, -1);
4975 }
4976
4977 if (atomic_read(&nohz.load_balancer) == -1) {
4978 int ilb = find_new_ilb(cpu);
4979
4980 if (ilb < nr_cpu_ids)
4981 resched_cpu(ilb);
4982 }
4983 }
4984
4985 /*
4986 * If this cpu is idle and doing idle load balancing for all the
4987 * cpus with ticks stopped, is it time for that to stop?
4988 */
4989 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4990 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4991 resched_cpu(cpu);
4992 return;
4993 }
4994
4995 /*
4996 * If this cpu is idle and the idle load balancing is done by
4997 * someone else, then no need raise the SCHED_SOFTIRQ
4998 */
4999 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
5000 cpumask_test_cpu(cpu, nohz.cpu_mask))
5001 return;
5002#endif
5003 /* Don't need to rebalance while attached to NULL domain */
5004 if (time_after_eq(jiffies, rq->next_balance) &&
5005 likely(!on_null_domain(cpu)))
5006 raise_softirq(SCHED_SOFTIRQ);
5007}
5008
5009#else /* CONFIG_SMP */
5010
/*
 * Uniprocessor build: there are no other CPUs to pull tasks from, so
 * idle balancing is a no-op.
 */
static inline void idle_balance(int cpu, struct rq *rq)
{
	/* nothing to do on UP */
}
5017
5018#endif 3164#endif
5019 3165
5020DEFINE_PER_CPU(struct kernel_stat, kstat); 3166DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -6114,7 +4260,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6114 unsigned long flags; 4260 unsigned long flags;
6115 int oldprio, on_rq, running; 4261 int oldprio, on_rq, running;
6116 struct rq *rq; 4262 struct rq *rq;
6117 const struct sched_class *prev_class = p->sched_class; 4263 const struct sched_class *prev_class;
6118 4264
6119 BUG_ON(prio < 0 || prio > MAX_PRIO); 4265 BUG_ON(prio < 0 || prio > MAX_PRIO);
6120 4266
@@ -6122,6 +4268,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6122 update_rq_clock(rq); 4268 update_rq_clock(rq);
6123 4269
6124 oldprio = p->prio; 4270 oldprio = p->prio;
4271 prev_class = p->sched_class;
6125 on_rq = p->se.on_rq; 4272 on_rq = p->se.on_rq;
6126 running = task_current(rq, p); 4273 running = task_current(rq, p);
6127 if (on_rq) 4274 if (on_rq)
@@ -6139,7 +4286,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6139 if (running) 4286 if (running)
6140 p->sched_class->set_curr_task(rq); 4287 p->sched_class->set_curr_task(rq);
6141 if (on_rq) { 4288 if (on_rq) {
6142 enqueue_task(rq, p, 0); 4289 enqueue_task(rq, p, 0, oldprio < prio);
6143 4290
6144 check_class_changed(rq, p, prev_class, oldprio, running); 4291 check_class_changed(rq, p, prev_class, oldprio, running);
6145 } 4292 }
@@ -6183,7 +4330,7 @@ void set_user_nice(struct task_struct *p, long nice)
6183 delta = p->prio - old_prio; 4330 delta = p->prio - old_prio;
6184 4331
6185 if (on_rq) { 4332 if (on_rq) {
6186 enqueue_task(rq, p, 0); 4333 enqueue_task(rq, p, 0, false);
6187 /* 4334 /*
6188 * If the task increased its priority or is running and 4335 * If the task increased its priority or is running and
6189 * lowered its priority, then reschedule its CPU: 4336 * lowered its priority, then reschedule its CPU:
@@ -6341,7 +4488,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6341{ 4488{
6342 int retval, oldprio, oldpolicy = -1, on_rq, running; 4489 int retval, oldprio, oldpolicy = -1, on_rq, running;
6343 unsigned long flags; 4490 unsigned long flags;
6344 const struct sched_class *prev_class = p->sched_class; 4491 const struct sched_class *prev_class;
6345 struct rq *rq; 4492 struct rq *rq;
6346 int reset_on_fork; 4493 int reset_on_fork;
6347 4494
@@ -6455,6 +4602,7 @@ recheck:
6455 p->sched_reset_on_fork = reset_on_fork; 4602 p->sched_reset_on_fork = reset_on_fork;
6456 4603
6457 oldprio = p->prio; 4604 oldprio = p->prio;
4605 prev_class = p->sched_class;
6458 __setscheduler(rq, p, policy, param->sched_priority); 4606 __setscheduler(rq, p, policy, param->sched_priority);
6459 4607
6460 if (running) 4608 if (running)
@@ -9493,7 +7641,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
9493 tg->rt_rq[cpu] = rt_rq; 7641 tg->rt_rq[cpu] = rt_rq;
9494 init_rt_rq(rt_rq, rq); 7642 init_rt_rq(rt_rq, rq);
9495 rt_rq->tg = tg; 7643 rt_rq->tg = tg;
9496 rt_rq->rt_se = rt_se;
9497 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7644 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
9498 if (add) 7645 if (add)
9499 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7646 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
@@ -9524,9 +7671,6 @@ void __init sched_init(void)
9524#ifdef CONFIG_RT_GROUP_SCHED 7671#ifdef CONFIG_RT_GROUP_SCHED
9525 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7672 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
9526#endif 7673#endif
9527#ifdef CONFIG_USER_SCHED
9528 alloc_size *= 2;
9529#endif
9530#ifdef CONFIG_CPUMASK_OFFSTACK 7674#ifdef CONFIG_CPUMASK_OFFSTACK
9531 alloc_size += num_possible_cpus() * cpumask_size(); 7675 alloc_size += num_possible_cpus() * cpumask_size();
9532#endif 7676#endif
@@ -9540,13 +7684,6 @@ void __init sched_init(void)
9540 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7684 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
9541 ptr += nr_cpu_ids * sizeof(void **); 7685 ptr += nr_cpu_ids * sizeof(void **);
9542 7686
9543#ifdef CONFIG_USER_SCHED
9544 root_task_group.se = (struct sched_entity **)ptr;
9545 ptr += nr_cpu_ids * sizeof(void **);
9546
9547 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
9548 ptr += nr_cpu_ids * sizeof(void **);
9549#endif /* CONFIG_USER_SCHED */
9550#endif /* CONFIG_FAIR_GROUP_SCHED */ 7687#endif /* CONFIG_FAIR_GROUP_SCHED */
9551#ifdef CONFIG_RT_GROUP_SCHED 7688#ifdef CONFIG_RT_GROUP_SCHED
9552 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7689 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -9555,13 +7692,6 @@ void __init sched_init(void)
9555 init_task_group.rt_rq = (struct rt_rq **)ptr; 7692 init_task_group.rt_rq = (struct rt_rq **)ptr;
9556 ptr += nr_cpu_ids * sizeof(void **); 7693 ptr += nr_cpu_ids * sizeof(void **);
9557 7694
9558#ifdef CONFIG_USER_SCHED
9559 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9560 ptr += nr_cpu_ids * sizeof(void **);
9561
9562 root_task_group.rt_rq = (struct rt_rq **)ptr;
9563 ptr += nr_cpu_ids * sizeof(void **);
9564#endif /* CONFIG_USER_SCHED */
9565#endif /* CONFIG_RT_GROUP_SCHED */ 7695#endif /* CONFIG_RT_GROUP_SCHED */
9566#ifdef CONFIG_CPUMASK_OFFSTACK 7696#ifdef CONFIG_CPUMASK_OFFSTACK
9567 for_each_possible_cpu(i) { 7697 for_each_possible_cpu(i) {
@@ -9581,22 +7711,13 @@ void __init sched_init(void)
9581#ifdef CONFIG_RT_GROUP_SCHED 7711#ifdef CONFIG_RT_GROUP_SCHED
9582 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7712 init_rt_bandwidth(&init_task_group.rt_bandwidth,
9583 global_rt_period(), global_rt_runtime()); 7713 global_rt_period(), global_rt_runtime());
9584#ifdef CONFIG_USER_SCHED
9585 init_rt_bandwidth(&root_task_group.rt_bandwidth,
9586 global_rt_period(), RUNTIME_INF);
9587#endif /* CONFIG_USER_SCHED */
9588#endif /* CONFIG_RT_GROUP_SCHED */ 7714#endif /* CONFIG_RT_GROUP_SCHED */
9589 7715
9590#ifdef CONFIG_GROUP_SCHED 7716#ifdef CONFIG_CGROUP_SCHED
9591 list_add(&init_task_group.list, &task_groups); 7717 list_add(&init_task_group.list, &task_groups);
9592 INIT_LIST_HEAD(&init_task_group.children); 7718 INIT_LIST_HEAD(&init_task_group.children);
9593 7719
9594#ifdef CONFIG_USER_SCHED 7720#endif /* CONFIG_CGROUP_SCHED */
9595 INIT_LIST_HEAD(&root_task_group.children);
9596 init_task_group.parent = &root_task_group;
9597 list_add(&init_task_group.siblings, &root_task_group.children);
9598#endif /* CONFIG_USER_SCHED */
9599#endif /* CONFIG_GROUP_SCHED */
9600 7721
9601#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP 7722#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9602 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), 7723 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
@@ -9636,25 +7757,6 @@ void __init sched_init(void)
9636 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7757 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
9637 */ 7758 */
9638 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7759 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
9639#elif defined CONFIG_USER_SCHED
9640 root_task_group.shares = NICE_0_LOAD;
9641 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
9642 /*
9643 * In case of task-groups formed thr' the user id of tasks,
9644 * init_task_group represents tasks belonging to root user.
9645 * Hence it forms a sibling of all subsequent groups formed.
9646 * In this case, init_task_group gets only a fraction of overall
9647 * system cpu resource, based on the weight assigned to root
9648 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9649 * by letting tasks of init_task_group sit in a separate cfs_rq
9650 * (init_tg_cfs_rq) and having one entity represent this group of
9651 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9652 */
9653 init_tg_cfs_entry(&init_task_group,
9654 &per_cpu(init_tg_cfs_rq, i),
9655 &per_cpu(init_sched_entity, i), i, 1,
9656 root_task_group.se[i]);
9657
9658#endif 7760#endif
9659#endif /* CONFIG_FAIR_GROUP_SCHED */ 7761#endif /* CONFIG_FAIR_GROUP_SCHED */
9660 7762
@@ -9663,12 +7765,6 @@ void __init sched_init(void)
9663 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7765 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
9664#ifdef CONFIG_CGROUP_SCHED 7766#ifdef CONFIG_CGROUP_SCHED
9665 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 7767 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
9666#elif defined CONFIG_USER_SCHED
9667 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9668 init_tg_rt_entry(&init_task_group,
9669 &per_cpu(init_rt_rq_var, i),
9670 &per_cpu(init_sched_rt_entity, i), i, 1,
9671 root_task_group.rt_se[i]);
9672#endif 7768#endif
9673#endif 7769#endif
9674 7770
@@ -9753,7 +7849,7 @@ static inline int preempt_count_equals(int preempt_offset)
9753 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 7849 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9754} 7850}
9755 7851
9756void __might_sleep(char *file, int line, int preempt_offset) 7852void __might_sleep(const char *file, int line, int preempt_offset)
9757{ 7853{
9758#ifdef in_atomic 7854#ifdef in_atomic
9759 static unsigned long prev_jiffy; /* ratelimiting */ 7855 static unsigned long prev_jiffy; /* ratelimiting */
@@ -10064,7 +8160,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
10064} 8160}
10065#endif /* CONFIG_RT_GROUP_SCHED */ 8161#endif /* CONFIG_RT_GROUP_SCHED */
10066 8162
10067#ifdef CONFIG_GROUP_SCHED 8163#ifdef CONFIG_CGROUP_SCHED
10068static void free_sched_group(struct task_group *tg) 8164static void free_sched_group(struct task_group *tg)
10069{ 8165{
10070 free_fair_sched_group(tg); 8166 free_fair_sched_group(tg);
@@ -10169,11 +8265,11 @@ void sched_move_task(struct task_struct *tsk)
10169 if (unlikely(running)) 8265 if (unlikely(running))
10170 tsk->sched_class->set_curr_task(rq); 8266 tsk->sched_class->set_curr_task(rq);
10171 if (on_rq) 8267 if (on_rq)
10172 enqueue_task(rq, tsk, 0); 8268 enqueue_task(rq, tsk, 0, false);
10173 8269
10174 task_rq_unlock(rq, &flags); 8270 task_rq_unlock(rq, &flags);
10175} 8271}
10176#endif /* CONFIG_GROUP_SCHED */ 8272#endif /* CONFIG_CGROUP_SCHED */
10177 8273
10178#ifdef CONFIG_FAIR_GROUP_SCHED 8274#ifdef CONFIG_FAIR_GROUP_SCHED
10179static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8275static void __set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -10315,13 +8411,6 @@ static int tg_schedulable(struct task_group *tg, void *data)
10315 runtime = d->rt_runtime; 8411 runtime = d->rt_runtime;
10316 } 8412 }
10317 8413
10318#ifdef CONFIG_USER_SCHED
10319 if (tg == &root_task_group) {
10320 period = global_rt_period();
10321 runtime = global_rt_runtime();
10322 }
10323#endif
10324
10325 /* 8414 /*
10326 * Cannot have more runtime than the period. 8415 * Cannot have more runtime than the period.
10327 */ 8416 */
@@ -10941,12 +9030,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10941} 9030}
10942 9031
10943/* 9032/*
9033 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9034 * in cputime_t units. As a result, cpuacct_update_stats calls
9035 * percpu_counter_add with values large enough to always overflow the
9036 * per cpu batch limit causing bad SMP scalability.
9037 *
9038 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9039 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9040 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9041 */
9042#ifdef CONFIG_SMP
9043#define CPUACCT_BATCH \
9044 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9045#else
9046#define CPUACCT_BATCH 0
9047#endif
9048
9049/*
10944 * Charge the system/user time to the task's accounting group. 9050 * Charge the system/user time to the task's accounting group.
10945 */ 9051 */
10946static void cpuacct_update_stats(struct task_struct *tsk, 9052static void cpuacct_update_stats(struct task_struct *tsk,
10947 enum cpuacct_stat_index idx, cputime_t val) 9053 enum cpuacct_stat_index idx, cputime_t val)
10948{ 9054{
10949 struct cpuacct *ca; 9055 struct cpuacct *ca;
9056 int batch = CPUACCT_BATCH;
10950 9057
10951 if (unlikely(!cpuacct_subsys.active)) 9058 if (unlikely(!cpuacct_subsys.active))
10952 return; 9059 return;
@@ -10955,7 +9062,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,
10955 ca = task_ca(tsk); 9062 ca = task_ca(tsk);
10956 9063
10957 do { 9064 do {
10958 percpu_counter_add(&ca->cpustat[idx], val); 9065 __percpu_counter_add(&ca->cpustat[idx], val, batch);
10959 ca = ca->parent; 9066 ca = ca->parent;
10960 } while (ca); 9067 } while (ca);
10961 rcu_read_unlock(); 9068 rcu_read_unlock();
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 597b33099dfa..eeb3506c4834 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -47,9 +47,7 @@ static int convert_prio(int prio)
47} 47}
48 48
49#define for_each_cpupri_active(array, idx) \ 49#define for_each_cpupri_active(array, idx) \
50 for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ 50 for_each_bit(idx, array, CPUPRI_NR_PRIORITIES)
51 idx < CPUPRI_NR_PRIORITIES; \
52 idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
53 51
54/** 52/**
55 * cpupri_find - find the best (lowest-pri) CPU in the system 53 * cpupri_find - find the best (lowest-pri) CPU in the system
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 8fe7ee81c552..3e1fd96c6cf9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1053,7 +1053,8 @@ static inline void hrtick_update(struct rq *rq)
1053 * increased. Here we update the fair scheduling stats and 1053 * increased. Here we update the fair scheduling stats and
1054 * then put the task into the rbtree: 1054 * then put the task into the rbtree:
1055 */ 1055 */
1056static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) 1056static void
1057enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1057{ 1058{
1058 struct cfs_rq *cfs_rq; 1059 struct cfs_rq *cfs_rq;
1059 struct sched_entity *se = &p->se; 1060 struct sched_entity *se = &p->se;
@@ -1815,57 +1816,164 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1815 */ 1816 */
1816 1817
1817/* 1818/*
1818 * Load-balancing iterator. Note: while the runqueue stays locked 1819 * pull_task - move a task from a remote runqueue to the local runqueue.
1819 * during the whole iteration, the current task might be 1820 * Both runqueues must be locked.
1820 * dequeued so the iterator has to be dequeue-safe. Here we
1821 * achieve that by always pre-iterating before returning
1822 * the current task:
1823 */ 1821 */
1824static struct task_struct * 1822static void pull_task(struct rq *src_rq, struct task_struct *p,
1825__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) 1823 struct rq *this_rq, int this_cpu)
1826{ 1824{
1827 struct task_struct *p = NULL; 1825 deactivate_task(src_rq, p, 0);
1828 struct sched_entity *se; 1826 set_task_cpu(p, this_cpu);
1827 activate_task(this_rq, p, 0);
1828 check_preempt_curr(this_rq, p, 0);
1829}
1829 1830
1830 if (next == &cfs_rq->tasks) 1831/*
1831 return NULL; 1832 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1833 */
1834static
1835int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1836 struct sched_domain *sd, enum cpu_idle_type idle,
1837 int *all_pinned)
1838{
1839 int tsk_cache_hot = 0;
1840 /*
1841 * We do not migrate tasks that are:
1842 * 1) running (obviously), or
1843 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1844 * 3) are cache-hot on their current CPU.
1845 */
1846 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
1847 schedstat_inc(p, se.nr_failed_migrations_affine);
1848 return 0;
1849 }
1850 *all_pinned = 0;
1832 1851
1833 se = list_entry(next, struct sched_entity, group_node); 1852 if (task_running(rq, p)) {
1834 p = task_of(se); 1853 schedstat_inc(p, se.nr_failed_migrations_running);
1835 cfs_rq->balance_iterator = next->next; 1854 return 0;
1855 }
1836 1856
1837 return p; 1857 /*
1838} 1858 * Aggressive migration if:
1859 * 1) task is cache cold, or
1860 * 2) too many balance attempts have failed.
1861 */
1839 1862
1840static struct task_struct *load_balance_start_fair(void *arg) 1863 tsk_cache_hot = task_hot(p, rq->clock, sd);
1841{ 1864 if (!tsk_cache_hot ||
1842 struct cfs_rq *cfs_rq = arg; 1865 sd->nr_balance_failed > sd->cache_nice_tries) {
1866#ifdef CONFIG_SCHEDSTATS
1867 if (tsk_cache_hot) {
1868 schedstat_inc(sd, lb_hot_gained[idle]);
1869 schedstat_inc(p, se.nr_forced_migrations);
1870 }
1871#endif
1872 return 1;
1873 }
1843 1874
1844 return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); 1875 if (tsk_cache_hot) {
1876 schedstat_inc(p, se.nr_failed_migrations_hot);
1877 return 0;
1878 }
1879 return 1;
1845} 1880}
1846 1881
1847static struct task_struct *load_balance_next_fair(void *arg) 1882/*
1883 * move_one_task tries to move exactly one task from busiest to this_rq, as
1884 * part of active balancing operations within "domain".
1885 * Returns 1 if successful and 0 otherwise.
1886 *
1887 * Called with both runqueues locked.
1888 */
1889static int
1890move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1891 struct sched_domain *sd, enum cpu_idle_type idle)
1848{ 1892{
1849 struct cfs_rq *cfs_rq = arg; 1893 struct task_struct *p, *n;
1894 struct cfs_rq *cfs_rq;
1895 int pinned = 0;
1896
1897 for_each_leaf_cfs_rq(busiest, cfs_rq) {
1898 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
1899
1900 if (!can_migrate_task(p, busiest, this_cpu,
1901 sd, idle, &pinned))
1902 continue;
1850 1903
1851 return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); 1904 pull_task(busiest, p, this_rq, this_cpu);
1905 /*
1906 * Right now, this is only the second place pull_task()
1907 * is called, so we can safely collect pull_task()
1908 * stats here rather than inside pull_task().
1909 */
1910 schedstat_inc(sd, lb_gained[idle]);
1911 return 1;
1912 }
1913 }
1914
1915 return 0;
1852} 1916}
1853 1917
1854static unsigned long 1918static unsigned long
1855__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1919balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1856 unsigned long max_load_move, struct sched_domain *sd, 1920 unsigned long max_load_move, struct sched_domain *sd,
1857 enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, 1921 enum cpu_idle_type idle, int *all_pinned,
1858 struct cfs_rq *cfs_rq) 1922 int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
1859{ 1923{
1860 struct rq_iterator cfs_rq_iterator; 1924 int loops = 0, pulled = 0, pinned = 0;
1925 long rem_load_move = max_load_move;
1926 struct task_struct *p, *n;
1861 1927
1862 cfs_rq_iterator.start = load_balance_start_fair; 1928 if (max_load_move == 0)
1863 cfs_rq_iterator.next = load_balance_next_fair; 1929 goto out;
1864 cfs_rq_iterator.arg = cfs_rq;
1865 1930
1866 return balance_tasks(this_rq, this_cpu, busiest, 1931 pinned = 1;
1867 max_load_move, sd, idle, all_pinned, 1932
1868 this_best_prio, &cfs_rq_iterator); 1933 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
1934 if (loops++ > sysctl_sched_nr_migrate)
1935 break;
1936
1937 if ((p->se.load.weight >> 1) > rem_load_move ||
1938 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
1939 continue;
1940
1941 pull_task(busiest, p, this_rq, this_cpu);
1942 pulled++;
1943 rem_load_move -= p->se.load.weight;
1944
1945#ifdef CONFIG_PREEMPT
1946 /*
1947 * NEWIDLE balancing is a source of latency, so preemptible
1948 * kernels will stop after the first task is pulled to minimize
1949 * the critical section.
1950 */
1951 if (idle == CPU_NEWLY_IDLE)
1952 break;
1953#endif
1954
1955 /*
1956 * We only want to steal up to the prescribed amount of
1957 * weighted load.
1958 */
1959 if (rem_load_move <= 0)
1960 break;
1961
1962 if (p->prio < *this_best_prio)
1963 *this_best_prio = p->prio;
1964 }
1965out:
1966 /*
1967 * Right now, this is one of only two places pull_task() is called,
1968 * so we can safely collect pull_task() stats here rather than
1969 * inside pull_task().
1970 */
1971 schedstat_add(sd, lb_gained[idle], pulled);
1972
1973 if (all_pinned)
1974 *all_pinned = pinned;
1975
1976 return max_load_move - rem_load_move;
1869} 1977}
1870 1978
1871#ifdef CONFIG_FAIR_GROUP_SCHED 1979#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1897,9 +2005,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1897 rem_load = (u64)rem_load_move * busiest_weight; 2005 rem_load = (u64)rem_load_move * busiest_weight;
1898 rem_load = div_u64(rem_load, busiest_h_load + 1); 2006 rem_load = div_u64(rem_load, busiest_h_load + 1);
1899 2007
1900 moved_load = __load_balance_fair(this_rq, this_cpu, busiest, 2008 moved_load = balance_tasks(this_rq, this_cpu, busiest,
1901 rem_load, sd, idle, all_pinned, this_best_prio, 2009 rem_load, sd, idle, all_pinned, this_best_prio,
1902 tg->cfs_rq[busiest_cpu]); 2010 busiest_cfs_rq);
1903 2011
1904 if (!moved_load) 2012 if (!moved_load)
1905 continue; 2013 continue;
@@ -1922,35 +2030,1509 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1922 struct sched_domain *sd, enum cpu_idle_type idle, 2030 struct sched_domain *sd, enum cpu_idle_type idle,
1923 int *all_pinned, int *this_best_prio) 2031 int *all_pinned, int *this_best_prio)
1924{ 2032{
1925 return __load_balance_fair(this_rq, this_cpu, busiest, 2033 return balance_tasks(this_rq, this_cpu, busiest,
1926 max_load_move, sd, idle, all_pinned, 2034 max_load_move, sd, idle, all_pinned,
1927 this_best_prio, &busiest->cfs); 2035 this_best_prio, &busiest->cfs);
1928} 2036}
1929#endif 2037#endif
1930 2038
1931static int 2039/*
1932move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2040 * move_tasks tries to move up to max_load_move weighted load from busiest to
1933 struct sched_domain *sd, enum cpu_idle_type idle) 2041 * this_rq, as part of a balancing operation within domain "sd".
2042 * Returns 1 if successful and 0 otherwise.
2043 *
2044 * Called with both runqueues locked.
2045 */
2046static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2047 unsigned long max_load_move,
2048 struct sched_domain *sd, enum cpu_idle_type idle,
2049 int *all_pinned)
1934{ 2050{
1935 struct cfs_rq *busy_cfs_rq; 2051 unsigned long total_load_moved = 0, load_moved;
1936 struct rq_iterator cfs_rq_iterator; 2052 int this_best_prio = this_rq->curr->prio;
1937 2053
1938 cfs_rq_iterator.start = load_balance_start_fair; 2054 do {
1939 cfs_rq_iterator.next = load_balance_next_fair; 2055 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
2056 max_load_move - total_load_moved,
2057 sd, idle, all_pinned, &this_best_prio);
1940 2058
1941 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 2059 total_load_moved += load_moved;
2060
2061#ifdef CONFIG_PREEMPT
1942 /* 2062 /*
1943 * pass busy_cfs_rq argument into 2063 * NEWIDLE balancing is a source of latency, so preemptible
1944 * load_balance_[start|next]_fair iterators 2064 * kernels will stop after the first task is pulled to minimize
2065 * the critical section.
1945 */ 2066 */
1946 cfs_rq_iterator.arg = busy_cfs_rq; 2067 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
1947 if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, 2068 break;
1948 &cfs_rq_iterator)) 2069
1949 return 1; 2070 if (raw_spin_is_contended(&this_rq->lock) ||
2071 raw_spin_is_contended(&busiest->lock))
2072 break;
2073#endif
2074 } while (load_moved && max_load_move > total_load_moved);
2075
2076 return total_load_moved > 0;
2077}
2078
2079/********** Helpers for find_busiest_group ************************/
2080/*
2081 * sd_lb_stats - Structure to store the statistics of a sched_domain
2082 * during load balancing.
2083 */
2084struct sd_lb_stats {
2085 struct sched_group *busiest; /* Busiest group in this sd */
2086 struct sched_group *this; /* Local group in this sd */
2087 unsigned long total_load; /* Total load of all groups in sd */
2088 unsigned long total_pwr; /* Total power of all groups in sd */
2089 unsigned long avg_load; /* Average load across all groups in sd */
2090
2091 /** Statistics of this group */
2092 unsigned long this_load;
2093 unsigned long this_load_per_task;
2094 unsigned long this_nr_running;
2095
2096 /* Statistics of the busiest group */
2097 unsigned long max_load;
2098 unsigned long busiest_load_per_task;
2099 unsigned long busiest_nr_running;
2100 unsigned long busiest_group_capacity;
2101
2102 int group_imb; /* Is there imbalance in this sd */
2103#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2104 int power_savings_balance; /* Is powersave balance needed for this sd */
2105 struct sched_group *group_min; /* Least loaded group in sd */
2106 struct sched_group *group_leader; /* Group which relieves group_min */
2107 unsigned long min_load_per_task; /* load_per_task in group_min */
2108 unsigned long leader_nr_running; /* Nr running of group_leader */
2109 unsigned long min_nr_running; /* Nr running of group_min */
2110#endif
2111};
2112
2113/*
2114 * sg_lb_stats - stats of a sched_group required for load_balancing
2115 */
2116struct sg_lb_stats {
2117 unsigned long avg_load; /*Avg load across the CPUs of the group */
2118 unsigned long group_load; /* Total load over the CPUs of the group */
2119 unsigned long sum_nr_running; /* Nr tasks running in the group */
2120 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2121 unsigned long group_capacity;
2122 int group_imb; /* Is there an imbalance in the group ? */
2123};
2124
2125/**
2126 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
2127 * @group: The group whose first cpu is to be returned.
2128 */
2129static inline unsigned int group_first_cpu(struct sched_group *group)
2130{
2131 return cpumask_first(sched_group_cpus(group));
2132}
2133
2134/**
2135 * get_sd_load_idx - Obtain the load index for a given sched domain.
2136 * @sd: The sched_domain whose load_idx is to be obtained.
2137 * @idle: The Idle status of the CPU for whose sd load_idx is obtained.
2138 */
2139static inline int get_sd_load_idx(struct sched_domain *sd,
2140 enum cpu_idle_type idle)
2141{
2142 int load_idx;
2143
2144 switch (idle) {
2145 case CPU_NOT_IDLE:
2146 load_idx = sd->busy_idx;
2147 break;
2148
2149 case CPU_NEWLY_IDLE:
2150 load_idx = sd->newidle_idx;
2151 break;
2152 default:
2153 load_idx = sd->idle_idx;
2154 break;
1950 } 2155 }
1951 2156
2157 return load_idx;
2158}
2159
2160
2161#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2162/**
2163 * init_sd_power_savings_stats - Initialize power savings statistics for
2164 * the given sched_domain, during load balancing.
2165 *
2166 * @sd: Sched domain whose power-savings statistics are to be initialized.
2167 * @sds: Variable containing the statistics for sd.
2168 * @idle: Idle status of the CPU at which we're performing load-balancing.
2169 */
2170static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2171 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2172{
2173 /*
2174 * Busy processors will not participate in power savings
2175 * balance.
2176 */
2177 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2178 sds->power_savings_balance = 0;
2179 else {
2180 sds->power_savings_balance = 1;
2181 sds->min_nr_running = ULONG_MAX;
2182 sds->leader_nr_running = 0;
2183 }
2184}
2185
2186/**
2187 * update_sd_power_savings_stats - Update the power saving stats for a
2188 * sched_domain while performing load balancing.
2189 *
2190 * @group: sched_group belonging to the sched_domain under consideration.
2191 * @sds: Variable containing the statistics of the sched_domain
2192 * @local_group: Does group contain the CPU for which we're performing
2193 * load balancing ?
2194 * @sgs: Variable containing the statistics of the group.
2195 */
2196static inline void update_sd_power_savings_stats(struct sched_group *group,
2197 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2198{
2199
2200 if (!sds->power_savings_balance)
2201 return;
2202
2203 /*
2204 * If the local group is idle or completely loaded
2205 * no need to do power savings balance at this domain
2206 */
2207 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
2208 !sds->this_nr_running))
2209 sds->power_savings_balance = 0;
2210
2211 /*
2212 * If a group is already running at full capacity or idle,
2213 * don't include that group in power savings calculations
2214 */
2215 if (!sds->power_savings_balance ||
2216 sgs->sum_nr_running >= sgs->group_capacity ||
2217 !sgs->sum_nr_running)
2218 return;
2219
2220 /*
2221 * Calculate the group which has the least non-idle load.
2222 * This is the group from where we need to pick up the load
2223 * for saving power
2224 */
2225 if ((sgs->sum_nr_running < sds->min_nr_running) ||
2226 (sgs->sum_nr_running == sds->min_nr_running &&
2227 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
2228 sds->group_min = group;
2229 sds->min_nr_running = sgs->sum_nr_running;
2230 sds->min_load_per_task = sgs->sum_weighted_load /
2231 sgs->sum_nr_running;
2232 }
2233
2234 /*
2235 * Calculate the group which is almost near its
2236 * capacity but still has some space to pick up some load
2237 * from other group and save more power
2238 */
2239 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
2240 return;
2241
2242 if (sgs->sum_nr_running > sds->leader_nr_running ||
2243 (sgs->sum_nr_running == sds->leader_nr_running &&
2244 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
2245 sds->group_leader = group;
2246 sds->leader_nr_running = sgs->sum_nr_running;
2247 }
2248}
2249
2250/**
2251 * check_power_save_busiest_group - see if there is potential for some power-savings balance
2252 * @sds: Variable containing the statistics of the sched_domain
2253 * under consideration.
2254 * @this_cpu: Cpu at which we're currently performing load-balancing.
2255 * @imbalance: Variable to store the imbalance.
2256 *
2257 * Description:
2258 * Check if we have potential to perform some power-savings balance.
2259 * If yes, set the busiest group to be the least loaded group in the
2260 * sched_domain, so that its CPUs can be put to idle.
2261 *
2262 * Returns 1 if there is potential to perform power-savings balance.
2263 * Else returns 0.
2264 */
2265static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2266 int this_cpu, unsigned long *imbalance)
2267{
2268 if (!sds->power_savings_balance)
2269 return 0;
2270
2271 if (sds->this != sds->group_leader ||
2272 sds->group_leader == sds->group_min)
2273 return 0;
2274
2275 *imbalance = sds->min_load_per_task;
2276 sds->busiest = sds->group_min;
2277
2278 return 1;
2279
2280}
2281#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2282static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2283 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2284{
2285 return;
2286}
2287
2288static inline void update_sd_power_savings_stats(struct sched_group *group,
2289 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2290{
2291 return;
2292}
2293
2294static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2295 int this_cpu, unsigned long *imbalance)
2296{
1952 return 0; 2297 return 0;
1953} 2298}
2299#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2300
2301
2302unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
2303{
2304 return SCHED_LOAD_SCALE;
2305}
2306
2307unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
2308{
2309 return default_scale_freq_power(sd, cpu);
2310}
2311
2312unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
2313{
2314 unsigned long weight = cpumask_weight(sched_domain_span(sd));
2315 unsigned long smt_gain = sd->smt_gain;
2316
2317 smt_gain /= weight;
2318
2319 return smt_gain;
2320}
2321
2322unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
2323{
2324 return default_scale_smt_power(sd, cpu);
2325}
2326
2327unsigned long scale_rt_power(int cpu)
2328{
2329 struct rq *rq = cpu_rq(cpu);
2330 u64 total, available;
2331
2332 sched_avg_update(rq);
2333
2334 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2335 available = total - rq->rt_avg;
2336
2337 if (unlikely((s64)total < SCHED_LOAD_SCALE))
2338 total = SCHED_LOAD_SCALE;
2339
2340 total >>= SCHED_LOAD_SHIFT;
2341
2342 return div_u64(available, total);
2343}
2344
2345static void update_cpu_power(struct sched_domain *sd, int cpu)
2346{
2347 unsigned long weight = cpumask_weight(sched_domain_span(sd));
2348 unsigned long power = SCHED_LOAD_SCALE;
2349 struct sched_group *sdg = sd->groups;
2350
2351 if (sched_feat(ARCH_POWER))
2352 power *= arch_scale_freq_power(sd, cpu);
2353 else
2354 power *= default_scale_freq_power(sd, cpu);
2355
2356 power >>= SCHED_LOAD_SHIFT;
2357
2358 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
2359 if (sched_feat(ARCH_POWER))
2360 power *= arch_scale_smt_power(sd, cpu);
2361 else
2362 power *= default_scale_smt_power(sd, cpu);
2363
2364 power >>= SCHED_LOAD_SHIFT;
2365 }
2366
2367 power *= scale_rt_power(cpu);
2368 power >>= SCHED_LOAD_SHIFT;
2369
2370 if (!power)
2371 power = 1;
2372
2373 sdg->cpu_power = power;
2374}
2375
2376static void update_group_power(struct sched_domain *sd, int cpu)
2377{
2378 struct sched_domain *child = sd->child;
2379 struct sched_group *group, *sdg = sd->groups;
2380 unsigned long power;
2381
2382 if (!child) {
2383 update_cpu_power(sd, cpu);
2384 return;
2385 }
2386
2387 power = 0;
2388
2389 group = child->groups;
2390 do {
2391 power += group->cpu_power;
2392 group = group->next;
2393 } while (group != child->groups);
2394
2395 sdg->cpu_power = power;
2396}
2397
2398/**
2399 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
2400 * @sd: The sched_domain whose statistics are to be updated.
2401 * @group: sched_group whose statistics are to be updated.
2402 * @this_cpu: Cpu for which load balance is currently performed.
2403 * @idle: Idle status of this_cpu
2404 * @load_idx: Load index of sched_domain of this_cpu for load calc.
2405 * @sd_idle: Idle status of the sched_domain containing group.
2406 * @local_group: Does group contain this_cpu.
2407 * @cpus: Set of cpus considered for load balancing.
2408 * @balance: Should we balance.
2409 * @sgs: variable to hold the statistics for this group.
2410 */
2411static inline void update_sg_lb_stats(struct sched_domain *sd,
2412 struct sched_group *group, int this_cpu,
2413 enum cpu_idle_type idle, int load_idx, int *sd_idle,
2414 int local_group, const struct cpumask *cpus,
2415 int *balance, struct sg_lb_stats *sgs)
2416{
2417 unsigned long load, max_cpu_load, min_cpu_load;
2418 int i;
2419 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2420 unsigned long avg_load_per_task = 0;
2421
2422 if (local_group)
2423 balance_cpu = group_first_cpu(group);
2424
2425 /* Tally up the load of all CPUs in the group */
2426 max_cpu_load = 0;
2427 min_cpu_load = ~0UL;
2428
2429 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2430 struct rq *rq = cpu_rq(i);
2431
2432 if (*sd_idle && rq->nr_running)
2433 *sd_idle = 0;
2434
2435 /* Bias balancing toward cpus of our domain */
2436 if (local_group) {
2437 if (idle_cpu(i) && !first_idle_cpu) {
2438 first_idle_cpu = 1;
2439 balance_cpu = i;
2440 }
2441
2442 load = target_load(i, load_idx);
2443 } else {
2444 load = source_load(i, load_idx);
2445 if (load > max_cpu_load)
2446 max_cpu_load = load;
2447 if (min_cpu_load > load)
2448 min_cpu_load = load;
2449 }
2450
2451 sgs->group_load += load;
2452 sgs->sum_nr_running += rq->nr_running;
2453 sgs->sum_weighted_load += weighted_cpuload(i);
2454
2455 }
2456
2457 /*
2458 * First idle cpu or the first cpu(busiest) in this sched group
2459 * is eligible for doing load balancing at this and above
2460 * domains. In the newly idle case, we will allow all the cpu's
2461 * to do the newly idle load balance.
2462 */
2463 if (idle != CPU_NEWLY_IDLE && local_group &&
2464 balance_cpu != this_cpu) {
2465 *balance = 0;
2466 return;
2467 }
2468
2469 update_group_power(sd, this_cpu);
2470
2471 /* Adjust by relative CPU power of the group */
2472 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
2473
2474 /*
2475 * Consider the group unbalanced when the imbalance is larger
2476 * than the average weight of two tasks.
2477 *
2478 * APZ: with cgroup the avg task weight can vary wildly and
2479 * might not be a suitable number - should we keep a
2480 * normalized nr_running number somewhere that negates
2481 * the hierarchy?
2482 */
2483 if (sgs->sum_nr_running)
2484 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2485
2486 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
2487 sgs->group_imb = 1;
2488
2489 sgs->group_capacity =
2490 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2491}
2492
2493/**
2494 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
2495 * @sd: sched_domain whose statistics are to be updated.
2496 * @this_cpu: Cpu for which load balance is currently performed.
2497 * @idle: Idle status of this_cpu
2498 * @sd_idle: Idle status of the sched_domain containing group.
2499 * @cpus: Set of cpus considered for load balancing.
2500 * @balance: Should we balance.
2501 * @sds: variable to hold the statistics for this sched_domain.
2502 */
2503static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2504 enum cpu_idle_type idle, int *sd_idle,
2505 const struct cpumask *cpus, int *balance,
2506 struct sd_lb_stats *sds)
2507{
2508 struct sched_domain *child = sd->child;
2509 struct sched_group *group = sd->groups;
2510 struct sg_lb_stats sgs;
2511 int load_idx, prefer_sibling = 0;
2512
2513 if (child && child->flags & SD_PREFER_SIBLING)
2514 prefer_sibling = 1;
2515
2516 init_sd_power_savings_stats(sd, sds, idle);
2517 load_idx = get_sd_load_idx(sd, idle);
2518
2519 do {
2520 int local_group;
2521
2522 local_group = cpumask_test_cpu(this_cpu,
2523 sched_group_cpus(group));
2524 memset(&sgs, 0, sizeof(sgs));
2525 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
2526 local_group, cpus, balance, &sgs);
2527
2528 if (local_group && !(*balance))
2529 return;
2530
2531 sds->total_load += sgs.group_load;
2532 sds->total_pwr += group->cpu_power;
2533
2534 /*
2535 * In case the child domain prefers tasks go to siblings
2536 * first, lower the group capacity to one so that we'll try
2537 * and move all the excess tasks away.
2538 */
2539 if (prefer_sibling)
2540 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2541
2542 if (local_group) {
2543 sds->this_load = sgs.avg_load;
2544 sds->this = group;
2545 sds->this_nr_running = sgs.sum_nr_running;
2546 sds->this_load_per_task = sgs.sum_weighted_load;
2547 } else if (sgs.avg_load > sds->max_load &&
2548 (sgs.sum_nr_running > sgs.group_capacity ||
2549 sgs.group_imb)) {
2550 sds->max_load = sgs.avg_load;
2551 sds->busiest = group;
2552 sds->busiest_nr_running = sgs.sum_nr_running;
2553 sds->busiest_group_capacity = sgs.group_capacity;
2554 sds->busiest_load_per_task = sgs.sum_weighted_load;
2555 sds->group_imb = sgs.group_imb;
2556 }
2557
2558 update_sd_power_savings_stats(group, sds, local_group, &sgs);
2559 group = group->next;
2560 } while (group != sd->groups);
2561}
2562
2563/**
2564 * fix_small_imbalance - Calculate the minor imbalance that exists
2565 * amongst the groups of a sched_domain, during
2566 * load balancing.
2567 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
2568 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
2569 * @imbalance: Variable to store the imbalance.
2570 */
2571static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2572 int this_cpu, unsigned long *imbalance)
2573{
2574 unsigned long tmp, pwr_now = 0, pwr_move = 0;
2575 unsigned int imbn = 2;
2576 unsigned long scaled_busy_load_per_task;
2577
2578 if (sds->this_nr_running) {
2579 sds->this_load_per_task /= sds->this_nr_running;
2580 if (sds->busiest_load_per_task >
2581 sds->this_load_per_task)
2582 imbn = 1;
2583 } else
2584 sds->this_load_per_task =
2585 cpu_avg_load_per_task(this_cpu);
2586
2587 scaled_busy_load_per_task = sds->busiest_load_per_task
2588 * SCHED_LOAD_SCALE;
2589 scaled_busy_load_per_task /= sds->busiest->cpu_power;
2590
2591 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
2592 (scaled_busy_load_per_task * imbn)) {
2593 *imbalance = sds->busiest_load_per_task;
2594 return;
2595 }
2596
2597 /*
2598 * OK, we don't have enough imbalance to justify moving tasks,
2599 * however we may be able to increase total CPU power used by
2600 * moving them.
2601 */
2602
2603 pwr_now += sds->busiest->cpu_power *
2604 min(sds->busiest_load_per_task, sds->max_load);
2605 pwr_now += sds->this->cpu_power *
2606 min(sds->this_load_per_task, sds->this_load);
2607 pwr_now /= SCHED_LOAD_SCALE;
2608
2609 /* Amount of load we'd subtract */
2610 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2611 sds->busiest->cpu_power;
2612 if (sds->max_load > tmp)
2613 pwr_move += sds->busiest->cpu_power *
2614 min(sds->busiest_load_per_task, sds->max_load - tmp);
2615
2616 /* Amount of load we'd add */
2617 if (sds->max_load * sds->busiest->cpu_power <
2618 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
2619 tmp = (sds->max_load * sds->busiest->cpu_power) /
2620 sds->this->cpu_power;
2621 else
2622 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2623 sds->this->cpu_power;
2624 pwr_move += sds->this->cpu_power *
2625 min(sds->this_load_per_task, sds->this_load + tmp);
2626 pwr_move /= SCHED_LOAD_SCALE;
2627
2628 /* Move if we gain throughput */
2629 if (pwr_move > pwr_now)
2630 *imbalance = sds->busiest_load_per_task;
2631}
2632
2633/**
2634 * calculate_imbalance - Calculate the amount of imbalance present within the
2635 * groups of a given sched_domain during load balance.
2636 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
2637 * @this_cpu: Cpu for which currently load balance is being performed.
2638 * @imbalance: The variable to store the imbalance.
2639 */
2640static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2641 unsigned long *imbalance)
2642{
2643 unsigned long max_pull, load_above_capacity = ~0UL;
2644
2645 sds->busiest_load_per_task /= sds->busiest_nr_running;
2646 if (sds->group_imb) {
2647 sds->busiest_load_per_task =
2648 min(sds->busiest_load_per_task, sds->avg_load);
2649 }
2650
2651 /*
2652 * In the presence of smp nice balancing, certain scenarios can have
2653 * max load less than avg load(as we skip the groups at or below
2654 * its cpu_power, while calculating max_load..)
2655 */
2656 if (sds->max_load < sds->avg_load) {
2657 *imbalance = 0;
2658 return fix_small_imbalance(sds, this_cpu, imbalance);
2659 }
2660
2661 if (!sds->group_imb) {
2662 /*
2663 * Don't want to pull so many tasks that a group would go idle.
2664 */
2665 load_above_capacity = (sds->busiest_nr_running -
2666 sds->busiest_group_capacity);
2667
2668 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
2669
2670 load_above_capacity /= sds->busiest->cpu_power;
2671 }
2672
2673 /*
2674 * We're trying to get all the cpus to the average_load, so we don't
2675 * want to push ourselves above the average load, nor do we wish to
2676 * reduce the max loaded cpu below the average load. At the same time,
2677 * we also don't want to reduce the group load below the group capacity
2678 * (so that we can implement power-savings policies etc). Thus we look
2679 * for the minimum possible imbalance.
2680 * Be careful of negative numbers as they'll appear as very large values
2681 * with unsigned longs.
2682 */
2683 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
2684
2685 /* How much load to actually move to equalise the imbalance */
2686 *imbalance = min(max_pull * sds->busiest->cpu_power,
2687 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
2688 / SCHED_LOAD_SCALE;
2689
2690 /*
2691 * if *imbalance is less than the average load per runnable task
2692 * there is no gaurantee that any tasks will be moved so we'll have
2693 * a think about bumping its value to force at least one task to be
2694 * moved
2695 */
2696 if (*imbalance < sds->busiest_load_per_task)
2697 return fix_small_imbalance(sds, this_cpu, imbalance);
2698
2699}
2700/******* find_busiest_group() helpers end here *********************/
2701
2702/**
2703 * find_busiest_group - Returns the busiest group within the sched_domain
2704 * if there is an imbalance. If there isn't an imbalance, and
2705 * the user has opted for power-savings, it returns a group whose
2706 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
2707 * such a group exists.
2708 *
2709 * Also calculates the amount of weighted load which should be moved
2710 * to restore balance.
2711 *
2712 * @sd: The sched_domain whose busiest group is to be returned.
2713 * @this_cpu: The cpu for which load balancing is currently being performed.
2714 * @imbalance: Variable which stores amount of weighted load which should
2715 * be moved to restore balance/put a group to idle.
2716 * @idle: The idle status of this_cpu.
2717 * @sd_idle: The idleness of sd
2718 * @cpus: The set of CPUs under consideration for load-balancing.
2719 * @balance: Pointer to a variable indicating if this_cpu
2720 * is the appropriate cpu to perform load balancing at this_level.
2721 *
2722 * Returns: - the busiest group if imbalance exists.
2723 * - If no imbalance and user has opted for power-savings balance,
2724 * return the least loaded group whose CPUs can be
2725 * put to idle by rebalancing its tasks onto our group.
2726 */
2727static struct sched_group *
2728find_busiest_group(struct sched_domain *sd, int this_cpu,
2729 unsigned long *imbalance, enum cpu_idle_type idle,
2730 int *sd_idle, const struct cpumask *cpus, int *balance)
2731{
2732 struct sd_lb_stats sds;
2733
2734 memset(&sds, 0, sizeof(sds));
2735
2736 /*
2737 * Compute the various statistics relavent for load balancing at
2738 * this level.
2739 */
2740 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
2741 balance, &sds);
2742
2743 /* Cases where imbalance does not exist from POV of this_cpu */
2744 /* 1) this_cpu is not the appropriate cpu to perform load balancing
2745 * at this level.
2746 * 2) There is no busy sibling group to pull from.
2747 * 3) This group is the busiest group.
2748 * 4) This group is more busy than the avg busieness at this
2749 * sched_domain.
2750 * 5) The imbalance is within the specified limit.
2751 */
2752 if (!(*balance))
2753 goto ret;
2754
2755 if (!sds.busiest || sds.busiest_nr_running == 0)
2756 goto out_balanced;
2757
2758 if (sds.this_load >= sds.max_load)
2759 goto out_balanced;
2760
2761 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
2762
2763 if (sds.this_load >= sds.avg_load)
2764 goto out_balanced;
2765
2766 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
2767 goto out_balanced;
2768
2769 /* Looks like there is an imbalance. Compute it */
2770 calculate_imbalance(&sds, this_cpu, imbalance);
2771 return sds.busiest;
2772
2773out_balanced:
2774 /*
2775 * There is no obvious imbalance. But check if we can do some balancing
2776 * to save power.
2777 */
2778 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
2779 return sds.busiest;
2780ret:
2781 *imbalance = 0;
2782 return NULL;
2783}
2784
2785/*
2786 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2787 */
2788static struct rq *
2789find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2790 unsigned long imbalance, const struct cpumask *cpus)
2791{
2792 struct rq *busiest = NULL, *rq;
2793 unsigned long max_load = 0;
2794 int i;
2795
2796 for_each_cpu(i, sched_group_cpus(group)) {
2797 unsigned long power = power_of(i);
2798 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
2799 unsigned long wl;
2800
2801 if (!cpumask_test_cpu(i, cpus))
2802 continue;
2803
2804 rq = cpu_rq(i);
2805 wl = weighted_cpuload(i);
2806
2807 /*
2808 * When comparing with imbalance, use weighted_cpuload()
2809 * which is not scaled with the cpu power.
2810 */
2811 if (capacity && rq->nr_running == 1 && wl > imbalance)
2812 continue;
2813
2814 /*
2815 * For the load comparisons with the other cpu's, consider
2816 * the weighted_cpuload() scaled with the cpu power, so that
2817 * the load can be moved away from the cpu that is potentially
2818 * running at a lower capacity.
2819 */
2820 wl = (wl * SCHED_LOAD_SCALE) / power;
2821
2822 if (wl > max_load) {
2823 max_load = wl;
2824 busiest = rq;
2825 }
2826 }
2827
2828 return busiest;
2829}
2830
2831/*
2832 * Max backoff if we encounter pinned tasks. The value is fairly arbitrary;
2833 * it only needs to be large enough.
2834 */
2835#define MAX_PINNED_INTERVAL 512
2836
2837/* Working cpumask for load_balance(), including its CPU_NEWLY_IDLE calls. */
2838static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2839
2840static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
2841{
2842 if (idle == CPU_NEWLY_IDLE) {
2843 /*
2844 * The only task running in a non-idle cpu can be moved to this
2845 * cpu in an attempt to completely freeup the other CPU
2846 * package.
2847 *
2848 * The package power saving logic comes from
2849 * find_busiest_group(). If there are no imbalance, then
2850 * f_b_g() will return NULL. However when sched_mc={1,2} then
2851 * f_b_g() will select a group from which a running task may be
2852 * pulled to this cpu in order to make the other package idle.
2853 * If there is no opportunity to make a package idle and if
2854 * there are no imbalance, then f_b_g() will return NULL and no
2855 * action will be taken in load_balance_newidle().
2856 *
2857 * Under normal task pull operation due to imbalance, there
2858 * will be more than one task in the source run queue and
2859 * move_tasks() will succeed. ld_moved will be true and this
2860 * active balance code will not be triggered.
2861 */
2862 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2863 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2864 return 0;
2865
2866 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
2867 return 0;
2868 }
2869
2870 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
2871}
2872
2873/*
2874 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2875 * tasks if there is an imbalance.
2876 */
2877static int load_balance(int this_cpu, struct rq *this_rq,
2878 struct sched_domain *sd, enum cpu_idle_type idle,
2879 int *balance)
2880{
2881 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2882 struct sched_group *group;
2883 unsigned long imbalance;
2884 struct rq *busiest;
2885 unsigned long flags;
2886 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
2887
2888 cpumask_copy(cpus, cpu_active_mask);
2889
2890 /*
2891 * When power savings policy is enabled for the parent domain, idle
2892 * sibling can pick up load irrespective of busy siblings. In this case,
2893 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2894 * portraying it as CPU_NOT_IDLE.
2895 */
2896 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2897 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2898 sd_idle = 1;
2899
2900 schedstat_inc(sd, lb_count[idle]);
2901
2902redo:
2903 update_shares(sd);
2904 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2905 cpus, balance);
2906
2907 if (*balance == 0)
2908 goto out_balanced;
2909
2910 if (!group) {
2911 schedstat_inc(sd, lb_nobusyg[idle]);
2912 goto out_balanced;
2913 }
2914
2915 busiest = find_busiest_queue(group, idle, imbalance, cpus);
2916 if (!busiest) {
2917 schedstat_inc(sd, lb_nobusyq[idle]);
2918 goto out_balanced;
2919 }
2920
2921 BUG_ON(busiest == this_rq);
2922
2923 schedstat_add(sd, lb_imbalance[idle], imbalance);
2924
2925 ld_moved = 0;
2926 if (busiest->nr_running > 1) {
2927 /*
2928 * Attempt to move tasks. If find_busiest_group has found
2929 * an imbalance but busiest->nr_running <= 1, the group is
2930 * still unbalanced. ld_moved simply stays zero, so it is
2931 * correctly treated as an imbalance.
2932 */
2933 local_irq_save(flags);
2934 double_rq_lock(this_rq, busiest);
2935 ld_moved = move_tasks(this_rq, this_cpu, busiest,
2936 imbalance, sd, idle, &all_pinned);
2937 double_rq_unlock(this_rq, busiest);
2938 local_irq_restore(flags);
2939
2940 /*
2941 * some other cpu did the load balance for us.
2942 */
2943 if (ld_moved && this_cpu != smp_processor_id())
2944 resched_cpu(this_cpu);
2945
2946 /* All tasks on this runqueue were pinned by CPU affinity */
2947 if (unlikely(all_pinned)) {
2948 cpumask_clear_cpu(cpu_of(busiest), cpus);
2949 if (!cpumask_empty(cpus))
2950 goto redo;
2951 goto out_balanced;
2952 }
2953 }
2954
2955 if (!ld_moved) {
2956 schedstat_inc(sd, lb_failed[idle]);
2957 sd->nr_balance_failed++;
2958
2959 if (need_active_balance(sd, sd_idle, idle)) {
2960 raw_spin_lock_irqsave(&busiest->lock, flags);
2961
2962 /* don't kick the migration_thread, if the curr
2963 * task on busiest cpu can't be moved to this_cpu
2964 */
2965 if (!cpumask_test_cpu(this_cpu,
2966 &busiest->curr->cpus_allowed)) {
2967 raw_spin_unlock_irqrestore(&busiest->lock,
2968 flags);
2969 all_pinned = 1;
2970 goto out_one_pinned;
2971 }
2972
2973 if (!busiest->active_balance) {
2974 busiest->active_balance = 1;
2975 busiest->push_cpu = this_cpu;
2976 active_balance = 1;
2977 }
2978 raw_spin_unlock_irqrestore(&busiest->lock, flags);
2979 if (active_balance)
2980 wake_up_process(busiest->migration_thread);
2981
2982 /*
2983 * We've kicked active balancing, reset the failure
2984 * counter.
2985 */
2986 sd->nr_balance_failed = sd->cache_nice_tries+1;
2987 }
2988 } else
2989 sd->nr_balance_failed = 0;
2990
2991 if (likely(!active_balance)) {
2992 /* We were unbalanced, so reset the balancing interval */
2993 sd->balance_interval = sd->min_interval;
2994 } else {
2995 /*
2996 * If we've begun active balancing, start to back off. This
2997 * case may not be covered by the all_pinned logic if there
2998 * is only 1 task on the busy runqueue (because we don't call
2999 * move_tasks).
3000 */
3001 if (sd->balance_interval < sd->max_interval)
3002 sd->balance_interval *= 2;
3003 }
3004
3005 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3006 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3007 ld_moved = -1;
3008
3009 goto out;
3010
3011out_balanced:
3012 schedstat_inc(sd, lb_balanced[idle]);
3013
3014 sd->nr_balance_failed = 0;
3015
3016out_one_pinned:
3017 /* tune up the balancing interval */
3018 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
3019 (sd->balance_interval < sd->max_interval))
3020 sd->balance_interval *= 2;
3021
3022 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3023 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3024 ld_moved = -1;
3025 else
3026 ld_moved = 0;
3027out:
3028 if (ld_moved)
3029 update_shares(sd);
3030 return ld_moved;
3031}
3032
3033/*
3034 * idle_balance is called by schedule() if this_cpu is about to become
3035 * idle. Attempts to pull tasks from other CPUs.
3036 */
3037static void idle_balance(int this_cpu, struct rq *this_rq)
3038{
3039 struct sched_domain *sd;
3040 int pulled_task = 0;
3041 unsigned long next_balance = jiffies + HZ;
3042
3043 this_rq->idle_stamp = this_rq->clock;
3044
3045 if (this_rq->avg_idle < sysctl_sched_migration_cost)
3046 return;
3047
3048 /*
3049 * Drop the rq->lock, but keep IRQ/preempt disabled.
3050 */
3051 raw_spin_unlock(&this_rq->lock);
3052
3053 for_each_domain(this_cpu, sd) {
3054 unsigned long interval;
3055 int balance = 1;
3056
3057 if (!(sd->flags & SD_LOAD_BALANCE))
3058 continue;
3059
3060 if (sd->flags & SD_BALANCE_NEWIDLE) {
3061 /* If we've pulled tasks over stop searching: */
3062 pulled_task = load_balance(this_cpu, this_rq,
3063 sd, CPU_NEWLY_IDLE, &balance);
3064 }
3065
3066 interval = msecs_to_jiffies(sd->balance_interval);
3067 if (time_after(next_balance, sd->last_balance + interval))
3068 next_balance = sd->last_balance + interval;
3069 if (pulled_task) {
3070 this_rq->idle_stamp = 0;
3071 break;
3072 }
3073 }
3074
3075 raw_spin_lock(&this_rq->lock);
3076
3077 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
3078 /*
3079 * We are going idle. next_balance may be set based on
3080 * a busy processor. So reset next_balance.
3081 */
3082 this_rq->next_balance = next_balance;
3083 }
3084}
3085
3086/*
3087 * active_load_balance is run by migration threads. It pushes running tasks
3088 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
3089 * running on each physical CPU where possible, and avoids physical /
3090 * logical imbalances.
3091 *
3092 * Called with busiest_rq locked.
3093 */
3094static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3095{
3096 int target_cpu = busiest_rq->push_cpu;
3097 struct sched_domain *sd;
3098 struct rq *target_rq;
3099
3100 /* Is there any task to move? */
3101 if (busiest_rq->nr_running <= 1)
3102 return;
3103
3104 target_rq = cpu_rq(target_cpu);
3105
3106 /*
3107 * This condition is "impossible", if it occurs
3108 * we need to fix it. Originally reported by
3109 * Bjorn Helgaas on a 128-cpu setup.
3110 */
3111 BUG_ON(busiest_rq == target_rq);
3112
3113 /* move a task from busiest_rq to target_rq */
3114 double_lock_balance(busiest_rq, target_rq);
3115 update_rq_clock(busiest_rq);
3116 update_rq_clock(target_rq);
3117
3118 /* Search for an sd spanning us and the target CPU. */
3119 for_each_domain(target_cpu, sd) {
3120 if ((sd->flags & SD_LOAD_BALANCE) &&
3121 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
3122 break;
3123 }
3124
3125 if (likely(sd)) {
3126 schedstat_inc(sd, alb_count);
3127
3128 if (move_one_task(target_rq, target_cpu, busiest_rq,
3129 sd, CPU_IDLE))
3130 schedstat_inc(sd, alb_pushed);
3131 else
3132 schedstat_inc(sd, alb_failed);
3133 }
3134 double_unlock_balance(busiest_rq, target_rq);
3135}
3136
3137#ifdef CONFIG_NO_HZ
3138static struct {
3139 atomic_t load_balancer;
3140 cpumask_var_t cpu_mask;
3141 cpumask_var_t ilb_grp_nohz_mask;
3142} nohz ____cacheline_aligned = {
3143 .load_balancer = ATOMIC_INIT(-1),
3144};
3145
3146int get_nohz_load_balancer(void)
3147{
3148 return atomic_read(&nohz.load_balancer);
3149}
3150
3151#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3152/**
3153 * lowest_flag_domain - Return lowest sched_domain containing flag.
3154 * @cpu: The cpu whose lowest level of sched domain is to
3155 * be returned.
3156 * @flag: The flag to check for the lowest sched_domain
3157 * for the given cpu.
3158 *
3159 * Returns the lowest sched_domain of a cpu which contains the given flag.
3160 */
3161static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3162{
3163 struct sched_domain *sd;
3164
3165 for_each_domain(cpu, sd)
3166 if (sd && (sd->flags & flag))
3167 break;
3168
3169 return sd;
3170}
3171
/**
 * for_each_flag_domain - Iterates over sched_domains containing the flag.
 * @cpu:	The cpu whose domains we're iterating over.
 * @sd:		variable holding the value of the power_savings_sd
 *		for cpu.
 * @flag:	The flag to filter the sched_domains to be iterated.
 *
 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
 * set, starting from the lowest sched_domain to the highest. Iteration stops
 * at the first parent that does not have the flag set.
 *
 * Every macro argument is parenthesised so that compound expressions such as
 * (A | B) are tested as a whole; unparenthesised, "sd->flags & A | B" would
 * parse as "(sd->flags & A) | B". Note that @sd and @flag are evaluated more
 * than once.
 */
#define for_each_flag_domain(cpu, sd, flag) \
	for ((sd) = lowest_flag_domain((cpu), (flag)); \
		((sd) && ((sd)->flags & (flag))); (sd) = (sd)->parent)
3185
3186/**
3187 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
3188 * @ilb_group: group to be checked for semi-idleness
3189 *
3190 * Returns: 1 if the group is semi-idle. 0 otherwise.
3191 *
3192 * We define a sched_group to be semi idle if it has atleast one idle-CPU
3193 * and atleast one non-idle CPU. This helper function checks if the given
3194 * sched_group is semi-idle or not.
3195 */
3196static inline int is_semi_idle_group(struct sched_group *ilb_group)
3197{
3198 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
3199 sched_group_cpus(ilb_group));
3200
3201 /*
3202 * A sched_group is semi-idle when it has atleast one busy cpu
3203 * and atleast one idle cpu.
3204 */
3205 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
3206 return 0;
3207
3208 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
3209 return 0;
3210
3211 return 1;
3212}
3213/**
3214 * find_new_ilb - Finds the optimum idle load balancer for nomination.
3215 * @cpu: The cpu which is nominating a new idle_load_balancer.
3216 *
3217 * Returns: Returns the id of the idle load balancer if it exists,
3218 * Else, returns >= nr_cpu_ids.
3219 *
3220 * This algorithm picks the idle load balancer such that it belongs to a
3221 * semi-idle powersavings sched_domain. The idea is to try and avoid
3222 * completely idle packages/cores just for the purpose of idle load balancing
3223 * when there are other idle cpu's which are better suited for that job.
3224 */
3225static int find_new_ilb(int cpu)
3226{
3227 struct sched_domain *sd;
3228 struct sched_group *ilb_group;
3229
3230 /*
3231 * Have idle load balancer selection from semi-idle packages only
3232 * when power-aware load balancing is enabled
3233 */
3234 if (!(sched_smt_power_savings || sched_mc_power_savings))
3235 goto out_done;
3236
3237 /*
3238 * Optimize for the case when we have no idle CPUs or only one
3239 * idle CPU. Don't walk the sched_domain hierarchy in such cases
3240 */
3241 if (cpumask_weight(nohz.cpu_mask) < 2)
3242 goto out_done;
3243
3244 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3245 ilb_group = sd->groups;
3246
3247 do {
3248 if (is_semi_idle_group(ilb_group))
3249 return cpumask_first(nohz.ilb_grp_nohz_mask);
3250
3251 ilb_group = ilb_group->next;
3252
3253 } while (ilb_group != sd->groups);
3254 }
3255
3256out_done:
3257 return cpumask_first(nohz.cpu_mask);
3258}
3259#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3260static inline int find_new_ilb(int call_cpu)
3261{
3262 return cpumask_first(nohz.cpu_mask);
3263}
3264#endif
3265
3266/*
3267 * This routine will try to nominate the ilb (idle load balancing)
3268 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
3269 * load balancing on behalf of all those cpus. If all the cpus in the system
3270 * go into this tickless mode, then there will be no ilb owner (as there is
3271 * no need for one) and all the cpus will sleep till the next wakeup event
3272 * arrives...
3273 *
3274 * For the ilb owner, tick is not stopped. And this tick will be used
3275 * for idle load balancing. ilb owner will still be part of
3276 * nohz.cpu_mask..
3277 *
3278 * While stopping the tick, this cpu will become the ilb owner if there
3279 * is no other owner. And will be the owner till that cpu becomes busy
3280 * or if all cpus in the system stop their ticks at which point
3281 * there is no need for ilb owner.
3282 *
3283 * When the ilb owner becomes busy, it nominates another owner, during the
3284 * next busy scheduler_tick()
3285 */
3286int select_nohz_load_balancer(int stop_tick)
3287{
3288 int cpu = smp_processor_id();
3289
3290 if (stop_tick) {
3291 cpu_rq(cpu)->in_nohz_recently = 1;
3292
3293 if (!cpu_active(cpu)) {
3294 if (atomic_read(&nohz.load_balancer) != cpu)
3295 return 0;
3296
3297 /*
3298 * If we are going offline and still the leader,
3299 * give up!
3300 */
3301 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3302 BUG();
3303
3304 return 0;
3305 }
3306
3307 cpumask_set_cpu(cpu, nohz.cpu_mask);
3308
3309 /* time for ilb owner also to sleep */
3310 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
3311 if (atomic_read(&nohz.load_balancer) == cpu)
3312 atomic_set(&nohz.load_balancer, -1);
3313 return 0;
3314 }
3315
3316 if (atomic_read(&nohz.load_balancer) == -1) {
3317 /* make me the ilb owner */
3318 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3319 return 1;
3320 } else if (atomic_read(&nohz.load_balancer) == cpu) {
3321 int new_ilb;
3322
3323 if (!(sched_smt_power_savings ||
3324 sched_mc_power_savings))
3325 return 1;
3326 /*
3327 * Check to see if there is a more power-efficient
3328 * ilb.
3329 */
3330 new_ilb = find_new_ilb(cpu);
3331 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3332 atomic_set(&nohz.load_balancer, -1);
3333 resched_cpu(new_ilb);
3334 return 0;
3335 }
3336 return 1;
3337 }
3338 } else {
3339 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
3340 return 0;
3341
3342 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3343
3344 if (atomic_read(&nohz.load_balancer) == cpu)
3345 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3346 BUG();
3347 }
3348 return 0;
3349}
3350#endif
3351
3352static DEFINE_SPINLOCK(balancing);
3353
3354/*
3355 * It checks each scheduling domain to see if it is due to be balanced,
3356 * and initiates a balancing operation if so.
3357 *
3358 * Balancing parameters are set up in arch_init_sched_domains.
3359 */
3360static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3361{
3362 int balance = 1;
3363 struct rq *rq = cpu_rq(cpu);
3364 unsigned long interval;
3365 struct sched_domain *sd;
3366 /* Earliest time when we have to do rebalance again */
3367 unsigned long next_balance = jiffies + 60*HZ;
3368 int update_next_balance = 0;
3369 int need_serialize;
3370
3371 for_each_domain(cpu, sd) {
3372 if (!(sd->flags & SD_LOAD_BALANCE))
3373 continue;
3374
3375 interval = sd->balance_interval;
3376 if (idle != CPU_IDLE)
3377 interval *= sd->busy_factor;
3378
3379 /* scale ms to jiffies */
3380 interval = msecs_to_jiffies(interval);
3381 if (unlikely(!interval))
3382 interval = 1;
3383 if (interval > HZ*NR_CPUS/10)
3384 interval = HZ*NR_CPUS/10;
3385
3386 need_serialize = sd->flags & SD_SERIALIZE;
3387
3388 if (need_serialize) {
3389 if (!spin_trylock(&balancing))
3390 goto out;
3391 }
3392
3393 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3394 if (load_balance(cpu, rq, sd, idle, &balance)) {
3395 /*
3396 * We've pulled tasks over so either we're no
3397 * longer idle, or one of our SMT siblings is
3398 * not idle.
3399 */
3400 idle = CPU_NOT_IDLE;
3401 }
3402 sd->last_balance = jiffies;
3403 }
3404 if (need_serialize)
3405 spin_unlock(&balancing);
3406out:
3407 if (time_after(next_balance, sd->last_balance + interval)) {
3408 next_balance = sd->last_balance + interval;
3409 update_next_balance = 1;
3410 }
3411
3412 /*
3413 * Stop the load balance at this level. There is another
3414 * CPU in our sched group which is doing load balancing more
3415 * actively.
3416 */
3417 if (!balance)
3418 break;
3419 }
3420
3421 /*
3422 * next_balance will be updated only when there is a need.
3423 * When the cpu is attached to null domain for ex, it will not be
3424 * updated.
3425 */
3426 if (likely(update_next_balance))
3427 rq->next_balance = next_balance;
3428}
3429
3430/*
3431 * run_rebalance_domains is triggered when needed from the scheduler tick.
3432 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3433 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3434 */
3435static void run_rebalance_domains(struct softirq_action *h)
3436{
3437 int this_cpu = smp_processor_id();
3438 struct rq *this_rq = cpu_rq(this_cpu);
3439 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3440 CPU_IDLE : CPU_NOT_IDLE;
3441
3442 rebalance_domains(this_cpu, idle);
3443
3444#ifdef CONFIG_NO_HZ
3445 /*
3446 * If this cpu is the owner for idle load balancing, then do the
3447 * balancing on behalf of the other idle cpus whose ticks are
3448 * stopped.
3449 */
3450 if (this_rq->idle_at_tick &&
3451 atomic_read(&nohz.load_balancer) == this_cpu) {
3452 struct rq *rq;
3453 int balance_cpu;
3454
3455 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3456 if (balance_cpu == this_cpu)
3457 continue;
3458
3459 /*
3460 * If this cpu gets work to do, stop the load balancing
3461 * work being done for other cpus. Next load
3462 * balancing owner will pick it up.
3463 */
3464 if (need_resched())
3465 break;
3466
3467 rebalance_domains(balance_cpu, CPU_IDLE);
3468
3469 rq = cpu_rq(balance_cpu);
3470 if (time_after(this_rq->next_balance, rq->next_balance))
3471 this_rq->next_balance = rq->next_balance;
3472 }
3473 }
3474#endif
3475}
3476
3477static inline int on_null_domain(int cpu)
3478{
3479 return !rcu_dereference(cpu_rq(cpu)->sd);
3480}
3481
3482/*
3483 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3484 *
3485 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3486 * idle load balancing owner or decide to stop the periodic load balancing,
3487 * if the whole system is idle.
3488 */
3489static inline void trigger_load_balance(struct rq *rq, int cpu)
3490{
3491#ifdef CONFIG_NO_HZ
3492 /*
3493 * If we were in the nohz mode recently and busy at the current
3494 * scheduler tick, then check if we need to nominate new idle
3495 * load balancer.
3496 */
3497 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3498 rq->in_nohz_recently = 0;
3499
3500 if (atomic_read(&nohz.load_balancer) == cpu) {
3501 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3502 atomic_set(&nohz.load_balancer, -1);
3503 }
3504
3505 if (atomic_read(&nohz.load_balancer) == -1) {
3506 int ilb = find_new_ilb(cpu);
3507
3508 if (ilb < nr_cpu_ids)
3509 resched_cpu(ilb);
3510 }
3511 }
3512
3513 /*
3514 * If this cpu is idle and doing idle load balancing for all the
3515 * cpus with ticks stopped, is it time for that to stop?
3516 */
3517 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3518 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3519 resched_cpu(cpu);
3520 return;
3521 }
3522
3523 /*
3524 * If this cpu is idle and the idle load balancing is done by
3525 * someone else, then no need raise the SCHED_SOFTIRQ
3526 */
3527 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3528 cpumask_test_cpu(cpu, nohz.cpu_mask))
3529 return;
3530#endif
3531 /* Don't need to rebalance while attached to NULL domain */
3532 if (time_after_eq(jiffies, rq->next_balance) &&
3533 likely(!on_null_domain(cpu)))
3534 raise_softirq(SCHED_SOFTIRQ);
3535}
1954 3536
1955static void rq_online_fair(struct rq *rq) 3537static void rq_online_fair(struct rq *rq)
1956{ 3538{
@@ -1962,6 +3544,15 @@ static void rq_offline_fair(struct rq *rq)
1962 update_sysctl(); 3544 update_sysctl();
1963} 3545}
1964 3546
3547#else /* CONFIG_SMP */
3548
3549/*
3550 * on UP we do not need to balance between CPUs:
3551 */
3552static inline void idle_balance(int cpu, struct rq *rq)
3553{
3554}
3555
1965#endif /* CONFIG_SMP */ 3556#endif /* CONFIG_SMP */
1966 3557
1967/* 3558/*
@@ -2076,7 +3667,7 @@ static void moved_group_fair(struct task_struct *p, int on_rq)
2076} 3667}
2077#endif 3668#endif
2078 3669
2079unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 3670static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
2080{ 3671{
2081 struct sched_entity *se = &task->se; 3672 struct sched_entity *se = &task->se;
2082 unsigned int rr_interval = 0; 3673 unsigned int rr_interval = 0;
@@ -2108,8 +3699,6 @@ static const struct sched_class fair_sched_class = {
2108#ifdef CONFIG_SMP 3699#ifdef CONFIG_SMP
2109 .select_task_rq = select_task_rq_fair, 3700 .select_task_rq = select_task_rq_fair,
2110 3701
2111 .load_balance = load_balance_fair,
2112 .move_one_task = move_one_task_fair,
2113 .rq_online = rq_online_fair, 3702 .rq_online = rq_online_fair,
2114 .rq_offline = rq_offline_fair, 3703 .rq_offline = rq_offline_fair,
2115 3704
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 5f93b570d383..a8a6d8a50947 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -44,24 +44,6 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
44{ 44{
45} 45}
46 46
47#ifdef CONFIG_SMP
48static unsigned long
49load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
50 unsigned long max_load_move,
51 struct sched_domain *sd, enum cpu_idle_type idle,
52 int *all_pinned, int *this_best_prio)
53{
54 return 0;
55}
56
57static int
58move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
59 struct sched_domain *sd, enum cpu_idle_type idle)
60{
61 return 0;
62}
63#endif
64
65static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 47static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
66{ 48{
67} 49}
@@ -97,7 +79,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 79 check_preempt_curr(rq, p, 0);
98} 80}
99 81
100unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) 82static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
101{ 83{
102 return 0; 84 return 0;
103} 85}
@@ -119,9 +101,6 @@ static const struct sched_class idle_sched_class = {
119 101
120#ifdef CONFIG_SMP 102#ifdef CONFIG_SMP
121 .select_task_rq = select_task_rq_idle, 103 .select_task_rq = select_task_rq_idle,
122
123 .load_balance = load_balance_idle,
124 .move_one_task = move_one_task_idle,
125#endif 104#endif
126 105
127 .set_curr_task = set_curr_task_idle, 106 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f48328ac216f..bf3e38fdbe6d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -194,17 +194,20 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
194 return rt_se->my_q; 194 return rt_se->my_q;
195} 195}
196 196
197static void enqueue_rt_entity(struct sched_rt_entity *rt_se); 197static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
198static void dequeue_rt_entity(struct sched_rt_entity *rt_se); 198static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
199 199
200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
201{ 201{
202 int this_cpu = smp_processor_id();
202 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 203 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
203 struct sched_rt_entity *rt_se = rt_rq->rt_se; 204 struct sched_rt_entity *rt_se;
205
206 rt_se = rt_rq->tg->rt_se[this_cpu];
204 207
205 if (rt_rq->rt_nr_running) { 208 if (rt_rq->rt_nr_running) {
206 if (rt_se && !on_rt_rq(rt_se)) 209 if (rt_se && !on_rt_rq(rt_se))
207 enqueue_rt_entity(rt_se); 210 enqueue_rt_entity(rt_se, false);
208 if (rt_rq->highest_prio.curr < curr->prio) 211 if (rt_rq->highest_prio.curr < curr->prio)
209 resched_task(curr); 212 resched_task(curr);
210 } 213 }
@@ -212,7 +215,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
212 215
213static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 216static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
214{ 217{
215 struct sched_rt_entity *rt_se = rt_rq->rt_se; 218 int this_cpu = smp_processor_id();
219 struct sched_rt_entity *rt_se;
220
221 rt_se = rt_rq->tg->rt_se[this_cpu];
216 222
217 if (rt_se && on_rt_rq(rt_se)) 223 if (rt_se && on_rt_rq(rt_se))
218 dequeue_rt_entity(rt_se); 224 dequeue_rt_entity(rt_se);
@@ -803,7 +809,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
803 dec_rt_group(rt_se, rt_rq); 809 dec_rt_group(rt_se, rt_rq);
804} 810}
805 811
806static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) 812static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
807{ 813{
808 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 814 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
809 struct rt_prio_array *array = &rt_rq->active; 815 struct rt_prio_array *array = &rt_rq->active;
@@ -819,7 +825,10 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
819 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 825 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
820 return; 826 return;
821 827
822 list_add_tail(&rt_se->run_list, queue); 828 if (head)
829 list_add(&rt_se->run_list, queue);
830 else
831 list_add_tail(&rt_se->run_list, queue);
823 __set_bit(rt_se_prio(rt_se), array->bitmap); 832 __set_bit(rt_se_prio(rt_se), array->bitmap);
824 833
825 inc_rt_tasks(rt_se, rt_rq); 834 inc_rt_tasks(rt_se, rt_rq);
@@ -856,11 +865,11 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
856 } 865 }
857} 866}
858 867
859static void enqueue_rt_entity(struct sched_rt_entity *rt_se) 868static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
860{ 869{
861 dequeue_rt_stack(rt_se); 870 dequeue_rt_stack(rt_se);
862 for_each_sched_rt_entity(rt_se) 871 for_each_sched_rt_entity(rt_se)
863 __enqueue_rt_entity(rt_se); 872 __enqueue_rt_entity(rt_se, head);
864} 873}
865 874
866static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 875static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
@@ -871,21 +880,22 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
871 struct rt_rq *rt_rq = group_rt_rq(rt_se); 880 struct rt_rq *rt_rq = group_rt_rq(rt_se);
872 881
873 if (rt_rq && rt_rq->rt_nr_running) 882 if (rt_rq && rt_rq->rt_nr_running)
874 __enqueue_rt_entity(rt_se); 883 __enqueue_rt_entity(rt_se, false);
875 } 884 }
876} 885}
877 886
878/* 887/*
879 * Adding/removing a task to/from a priority array: 888 * Adding/removing a task to/from a priority array:
880 */ 889 */
881static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) 890static void
891enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head)
882{ 892{
883 struct sched_rt_entity *rt_se = &p->rt; 893 struct sched_rt_entity *rt_se = &p->rt;
884 894
885 if (wakeup) 895 if (wakeup)
886 rt_se->timeout = 0; 896 rt_se->timeout = 0;
887 897
888 enqueue_rt_entity(rt_se); 898 enqueue_rt_entity(rt_se, head);
889 899
890 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
891 enqueue_pushable_task(rq, p); 901 enqueue_pushable_task(rq, p);
@@ -1481,24 +1491,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1481 push_rt_tasks(rq); 1491 push_rt_tasks(rq);
1482} 1492}
1483 1493
1484static unsigned long
1485load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1486 unsigned long max_load_move,
1487 struct sched_domain *sd, enum cpu_idle_type idle,
1488 int *all_pinned, int *this_best_prio)
1489{
1490 /* don't touch RT tasks */
1491 return 0;
1492}
1493
1494static int
1495move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1496 struct sched_domain *sd, enum cpu_idle_type idle)
1497{
1498 /* don't touch RT tasks */
1499 return 0;
1500}
1501
1502static void set_cpus_allowed_rt(struct task_struct *p, 1494static void set_cpus_allowed_rt(struct task_struct *p,
1503 const struct cpumask *new_mask) 1495 const struct cpumask *new_mask)
1504{ 1496{
@@ -1721,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq)
1721 dequeue_pushable_task(rq, p); 1713 dequeue_pushable_task(rq, p);
1722} 1714}
1723 1715
1724unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) 1716static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1725{ 1717{
1726 /* 1718 /*
1727 * Time slice is 0 for SCHED_FIFO tasks 1719 * Time slice is 0 for SCHED_FIFO tasks
@@ -1746,8 +1738,6 @@ static const struct sched_class rt_sched_class = {
1746#ifdef CONFIG_SMP 1738#ifdef CONFIG_SMP
1747 .select_task_rq = select_task_rq_rt, 1739 .select_task_rq = select_task_rq_rt,
1748 1740
1749 .load_balance = load_balance_rt,
1750 .move_one_task = move_one_task_rt,
1751 .set_cpus_allowed = set_cpus_allowed_rt, 1741 .set_cpus_allowed = set_cpus_allowed_rt,
1752 .rq_online = rq_online_rt, 1742 .rq_online = rq_online_rt,
1753 .rq_offline = rq_offline_rt, 1743 .rq_offline = rq_offline_rt,
diff --git a/kernel/sys.c b/kernel/sys.c
index 18bde979f346..877fe4f8e05e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -571,11 +571,6 @@ static int set_user(struct cred *new)
571 if (!new_user) 571 if (!new_user)
572 return -EAGAIN; 572 return -EAGAIN;
573 573
574 if (!task_can_switch_user(new_user, current)) {
575 free_uid(new_user);
576 return -EINVAL;
577 }
578
579 if (atomic_read(&new_user->processes) >= 574 if (atomic_read(&new_user->processes) >=
580 current->signal->rlim[RLIMIT_NPROC].rlim_cur && 575 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
581 new_user != INIT_USER) { 576 new_user != INIT_USER) {
diff --git a/kernel/user.c b/kernel/user.c
index 46d0165ca70c..766467b3bcb7 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -56,9 +56,6 @@ struct user_struct root_user = {
56 .sigpending = ATOMIC_INIT(0), 56 .sigpending = ATOMIC_INIT(0),
57 .locked_shm = 0, 57 .locked_shm = 0,
58 .user_ns = &init_user_ns, 58 .user_ns = &init_user_ns,
59#ifdef CONFIG_USER_SCHED
60 .tg = &init_task_group,
61#endif
62}; 59};
63 60
64/* 61/*
@@ -75,268 +72,6 @@ static void uid_hash_remove(struct user_struct *up)
75 put_user_ns(up->user_ns); 72 put_user_ns(up->user_ns);
76} 73}
77 74
78#ifdef CONFIG_USER_SCHED
79
80static void sched_destroy_user(struct user_struct *up)
81{
82 sched_destroy_group(up->tg);
83}
84
85static int sched_create_user(struct user_struct *up)
86{
87 int rc = 0;
88
89 up->tg = sched_create_group(&root_task_group);
90 if (IS_ERR(up->tg))
91 rc = -ENOMEM;
92
93 set_tg_uid(up);
94
95 return rc;
96}
97
98#else /* CONFIG_USER_SCHED */
99
100static void sched_destroy_user(struct user_struct *up) { }
101static int sched_create_user(struct user_struct *up) { return 0; }
102
103#endif /* CONFIG_USER_SCHED */
104
105#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
106
107static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
108{
109 struct user_struct *user;
110 struct hlist_node *h;
111
112 hlist_for_each_entry(user, h, hashent, uidhash_node) {
113 if (user->uid == uid) {
114 /* possibly resurrect an "almost deleted" object */
115 if (atomic_inc_return(&user->__count) == 1)
116 cancel_delayed_work(&user->work);
117 return user;
118 }
119 }
120
121 return NULL;
122}
123
124static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
125static DEFINE_MUTEX(uids_mutex);
126
127static inline void uids_mutex_lock(void)
128{
129 mutex_lock(&uids_mutex);
130}
131
132static inline void uids_mutex_unlock(void)
133{
134 mutex_unlock(&uids_mutex);
135}
136
137/* uid directory attributes */
138#ifdef CONFIG_FAIR_GROUP_SCHED
139static ssize_t cpu_shares_show(struct kobject *kobj,
140 struct kobj_attribute *attr,
141 char *buf)
142{
143 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
144
145 return sprintf(buf, "%lu\n", sched_group_shares(up->tg));
146}
147
148static ssize_t cpu_shares_store(struct kobject *kobj,
149 struct kobj_attribute *attr,
150 const char *buf, size_t size)
151{
152 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
153 unsigned long shares;
154 int rc;
155
156 sscanf(buf, "%lu", &shares);
157
158 rc = sched_group_set_shares(up->tg, shares);
159
160 return (rc ? rc : size);
161}
162
163static struct kobj_attribute cpu_share_attr =
164 __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
165#endif
166
167#ifdef CONFIG_RT_GROUP_SCHED
168static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
169 struct kobj_attribute *attr,
170 char *buf)
171{
172 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
173
174 return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
175}
176
177static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
178 struct kobj_attribute *attr,
179 const char *buf, size_t size)
180{
181 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
182 unsigned long rt_runtime;
183 int rc;
184
185 sscanf(buf, "%ld", &rt_runtime);
186
187 rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
188
189 return (rc ? rc : size);
190}
191
192static struct kobj_attribute cpu_rt_runtime_attr =
193 __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
194
195static ssize_t cpu_rt_period_show(struct kobject *kobj,
196 struct kobj_attribute *attr,
197 char *buf)
198{
199 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
200
201 return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg));
202}
203
204static ssize_t cpu_rt_period_store(struct kobject *kobj,
205 struct kobj_attribute *attr,
206 const char *buf, size_t size)
207{
208 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
209 unsigned long rt_period;
210 int rc;
211
212 sscanf(buf, "%lu", &rt_period);
213
214 rc = sched_group_set_rt_period(up->tg, rt_period);
215
216 return (rc ? rc : size);
217}
218
219static struct kobj_attribute cpu_rt_period_attr =
220 __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store);
221#endif
222
223/* default attributes per uid directory */
224static struct attribute *uids_attributes[] = {
225#ifdef CONFIG_FAIR_GROUP_SCHED
226 &cpu_share_attr.attr,
227#endif
228#ifdef CONFIG_RT_GROUP_SCHED
229 &cpu_rt_runtime_attr.attr,
230 &cpu_rt_period_attr.attr,
231#endif
232 NULL
233};
234
235/* the lifetime of user_struct is not managed by the core (now) */
236static void uids_release(struct kobject *kobj)
237{
238 return;
239}
240
241static struct kobj_type uids_ktype = {
242 .sysfs_ops = &kobj_sysfs_ops,
243 .default_attrs = uids_attributes,
244 .release = uids_release,
245};
246
247/*
248 * Create /sys/kernel/uids/<uid>/cpu_share file for this user
249 * We do not create this file for users in a user namespace (until
250 * sysfs tagging is implemented).
251 *
252 * See Documentation/scheduler/sched-design-CFS.txt for ramifications.
253 */
254static int uids_user_create(struct user_struct *up)
255{
256 struct kobject *kobj = &up->kobj;
257 int error;
258
259 memset(kobj, 0, sizeof(struct kobject));
260 if (up->user_ns != &init_user_ns)
261 return 0;
262 kobj->kset = uids_kset;
263 error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
264 if (error) {
265 kobject_put(kobj);
266 goto done;
267 }
268
269 kobject_uevent(kobj, KOBJ_ADD);
270done:
271 return error;
272}
273
274/* create these entries in sysfs:
275 * "/sys/kernel/uids" directory
276 * "/sys/kernel/uids/0" directory (for root user)
277 * "/sys/kernel/uids/0/cpu_share" file (for root user)
278 */
279int __init uids_sysfs_init(void)
280{
281 uids_kset = kset_create_and_add("uids", NULL, kernel_kobj);
282 if (!uids_kset)
283 return -ENOMEM;
284
285 return uids_user_create(&root_user);
286}
287
288/* delayed work function to remove sysfs directory for a user and free up
289 * corresponding structures.
290 */
291static void cleanup_user_struct(struct work_struct *w)
292{
293 struct user_struct *up = container_of(w, struct user_struct, work.work);
294 unsigned long flags;
295 int remove_user = 0;
296
297 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
298 * atomic.
299 */
300 uids_mutex_lock();
301
302 spin_lock_irqsave(&uidhash_lock, flags);
303 if (atomic_read(&up->__count) == 0) {
304 uid_hash_remove(up);
305 remove_user = 1;
306 }
307 spin_unlock_irqrestore(&uidhash_lock, flags);
308
309 if (!remove_user)
310 goto done;
311
312 if (up->user_ns == &init_user_ns) {
313 kobject_uevent(&up->kobj, KOBJ_REMOVE);
314 kobject_del(&up->kobj);
315 kobject_put(&up->kobj);
316 }
317
318 sched_destroy_user(up);
319 key_put(up->uid_keyring);
320 key_put(up->session_keyring);
321 kmem_cache_free(uid_cachep, up);
322
323done:
324 uids_mutex_unlock();
325}
326
327/* IRQs are disabled and uidhash_lock is held upon function entry.
328 * IRQ state (as stored in flags) is restored and uidhash_lock released
329 * upon function exit.
330 */
331static void free_user(struct user_struct *up, unsigned long flags)
332{
333 INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
334 schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
335 spin_unlock_irqrestore(&uidhash_lock, flags);
336}
337
338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
339
340static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) 75static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
341{ 76{
342 struct user_struct *user; 77 struct user_struct *user;
@@ -352,11 +87,6 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
352 return NULL; 87 return NULL;
353} 88}
354 89
355int uids_sysfs_init(void) { return 0; }
356static inline int uids_user_create(struct user_struct *up) { return 0; }
357static inline void uids_mutex_lock(void) { }
358static inline void uids_mutex_unlock(void) { }
359
360/* IRQs are disabled and uidhash_lock is held upon function entry. 90/* IRQs are disabled and uidhash_lock is held upon function entry.
361 * IRQ state (as stored in flags) is restored and uidhash_lock released 91 * IRQ state (as stored in flags) is restored and uidhash_lock released
362 * upon function exit. 92 * upon function exit.
@@ -365,32 +95,11 @@ static void free_user(struct user_struct *up, unsigned long flags)
365{ 95{
366 uid_hash_remove(up); 96 uid_hash_remove(up);
367 spin_unlock_irqrestore(&uidhash_lock, flags); 97 spin_unlock_irqrestore(&uidhash_lock, flags);
368 sched_destroy_user(up);
369 key_put(up->uid_keyring); 98 key_put(up->uid_keyring);
370 key_put(up->session_keyring); 99 key_put(up->session_keyring);
371 kmem_cache_free(uid_cachep, up); 100 kmem_cache_free(uid_cachep, up);
372} 101}
373 102
374#endif
375
376#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
377/*
378 * We need to check if a setuid can take place. This function should be called
379 * before successfully completing the setuid.
380 */
381int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
382{
383
384 return sched_rt_can_attach(up->tg, tsk);
385
386}
387#else
388int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
389{
390 return 1;
391}
392#endif
393
394/* 103/*
395 * Locate the user_struct for the passed UID. If found, take a ref on it. The 104 * Locate the user_struct for the passed UID. If found, take a ref on it. The
396 * caller must undo that ref with free_uid(). 105 * caller must undo that ref with free_uid().
@@ -431,8 +140,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
431 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() 140 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
432 * atomic. 141 * atomic.
433 */ 142 */
434 uids_mutex_lock();
435
436 spin_lock_irq(&uidhash_lock); 143 spin_lock_irq(&uidhash_lock);
437 up = uid_hash_find(uid, hashent); 144 up = uid_hash_find(uid, hashent);
438 spin_unlock_irq(&uidhash_lock); 145 spin_unlock_irq(&uidhash_lock);
@@ -445,14 +152,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
445 new->uid = uid; 152 new->uid = uid;
446 atomic_set(&new->__count, 1); 153 atomic_set(&new->__count, 1);
447 154
448 if (sched_create_user(new) < 0)
449 goto out_free_user;
450
451 new->user_ns = get_user_ns(ns); 155 new->user_ns = get_user_ns(ns);
452 156
453 if (uids_user_create(new))
454 goto out_destoy_sched;
455
456 /* 157 /*
457 * Before adding this, check whether we raced 158 * Before adding this, check whether we raced
458 * on adding the same user already.. 159 * on adding the same user already..
@@ -475,17 +176,11 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
475 spin_unlock_irq(&uidhash_lock); 176 spin_unlock_irq(&uidhash_lock);
476 } 177 }
477 178
478 uids_mutex_unlock();
479
480 return up; 179 return up;
481 180
482out_destoy_sched:
483 sched_destroy_user(new);
484 put_user_ns(new->user_ns); 181 put_user_ns(new->user_ns);
485out_free_user:
486 kmem_cache_free(uid_cachep, new); 182 kmem_cache_free(uid_cachep, new);
487out_unlock: 183out_unlock:
488 uids_mutex_unlock();
489 return NULL; 184 return NULL;
490} 185}
491 186