-rw-r--r--  Documentation/feature-removal-schedule.txt |   15
-rw-r--r--  include/linux/kernel.h                     |    5
-rw-r--r--  include/linux/percpu_counter.h             |    9
-rw-r--r--  include/linux/sched.h                      |   25
-rw-r--r--  init/Kconfig                               |   81
-rw-r--r--  kernel/ksysfs.c                            |    8
-rw-r--r--  kernel/kthread.c                           |    2
-rw-r--r--  kernel/sched.c                             | 2119
-rw-r--r--  kernel/sched_cpupri.c                      |    4
-rw-r--r--  kernel/sched_fair.c                        | 1689
-rw-r--r--  kernel/sched_idletask.c                    |   23
-rw-r--r--  kernel/sched_rt.c                          |   54
-rw-r--r--  kernel/sys.c                               |    5
-rw-r--r--  kernel/user.c                              |  305
14 files changed, 1813 insertions(+), 2531 deletions(-)
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 0a46833c1b76..dbc12067872d 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -6,21 +6,6 @@ be removed from this file.
 
 ---------------------------
 
-What:	USER_SCHED
-When:	2.6.34
-
-Why:	USER_SCHED was implemented as a proof of concept for group scheduling.
-	The effect of USER_SCHED can already be achieved from userspace with
-	the help of libcgroup. The removal of USER_SCHED will also simplify
-	the scheduler code with the removal of one major ifdef. There are also
-	issues USER_SCHED has with USER_NS. A decision was taken not to fix
-	those and instead remove USER_SCHED. Also new group scheduling
-	features will not be implemented for USER_SCHED.
-
-Who:	Dhaval Giani <dhaval@linux.vnet.ibm.com>
-
----------------------------
-
 What:	PRISM54
 When:	2.6.34
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 328bca609b9b..1221d2331a6d 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -124,7 +124,7 @@ extern int _cond_resched(void);
 #endif
 
 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-  void __might_sleep(char *file, int line, int preempt_offset);
+  void __might_sleep(const char *file, int line, int preempt_offset);
 /**
  * might_sleep - annotation for functions that can sleep
  *
@@ -138,7 +138,8 @@ extern int _cond_resched(void);
 # define might_sleep() \
 	do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
 #else
-  static inline void __might_sleep(char *file, int line, int preempt_offset) { }
+  static inline void __might_sleep(const char *file, int line,
+				   int preempt_offset) { }
 # define might_sleep() do { might_resched(); } while (0)
 #endif
 
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index a7684a513994..794662b2be5d 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -98,9 +98,6 @@ static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
 	fbc->count = amount;
 }
 
-#define __percpu_counter_add(fbc, amount, batch) \
-	percpu_counter_add(fbc, amount)
-
 static inline void
 percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 {
@@ -109,6 +106,12 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 	preempt_enable();
 }
 
+static inline void
+__percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
+{
+	percpu_counter_add(fbc, amount);
+}
+
 static inline s64 percpu_counter_read(struct percpu_counter *fbc)
 {
 	return fbc->count;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1f5fa53b46b1..0eef87b58ea5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -740,14 +740,6 @@ struct user_struct {
 	uid_t uid;
 	struct user_namespace *user_ns;
 
-#ifdef CONFIG_USER_SCHED
-	struct task_group *tg;
-#ifdef CONFIG_SYSFS
-	struct kobject kobj;
-	struct delayed_work work;
-#endif
-#endif
-
 #ifdef CONFIG_PERF_EVENTS
 	atomic_long_t locked_vm;
 #endif
@@ -1087,7 +1079,8 @@ struct sched_domain;
 struct sched_class {
 	const struct sched_class *next;
 
-	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
+	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup,
+			      bool head);
 	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
 	void (*yield_task) (struct rq *rq);
 
@@ -1099,14 +1092,6 @@ struct sched_class {
 #ifdef CONFIG_SMP
 	int  (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
 
-	unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
-			struct rq *busiest, unsigned long max_load_move,
-			struct sched_domain *sd, enum cpu_idle_type idle,
-			int *all_pinned, int *this_best_prio);
-
-	int (*move_one_task) (struct rq *this_rq, int this_cpu,
-			      struct rq *busiest, struct sched_domain *sd,
-			      enum cpu_idle_type idle);
 	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
 	void (*post_schedule) (struct rq *this_rq);
 	void (*task_waking) (struct rq *this_rq, struct task_struct *task);
@@ -2520,13 +2505,9 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 
 extern void normalize_rt_tasks(void);
 
-#ifdef CONFIG_GROUP_SCHED
+#ifdef CONFIG_CGROUP_SCHED
 
 extern struct task_group init_task_group;
-#ifdef CONFIG_USER_SCHED
-extern struct task_group root_task_group;
-extern void set_tg_uid(struct user_struct *user);
-#endif
 
 extern struct task_group *sched_create_group(struct task_group *parent);
 extern void sched_destroy_group(struct task_group *tg);
diff --git a/init/Kconfig b/init/Kconfig
index d95ca7cd5d45..ed9c19e02f93 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -445,57 +445,6 @@ config LOG_BUF_SHIFT
 config HAVE_UNSTABLE_SCHED_CLOCK
 	bool
 
-config GROUP_SCHED
-	bool "Group CPU scheduler"
-	depends on EXPERIMENTAL
-	default n
-	help
-	  This feature lets CPU scheduler recognize task groups and control CPU
-	  bandwidth allocation to such task groups.
-	  In order to create a group from arbitrary set of processes, use
-	  CONFIG_CGROUPS. (See Control Group support.)
-
-config FAIR_GROUP_SCHED
-	bool "Group scheduling for SCHED_OTHER"
-	depends on GROUP_SCHED
-	default GROUP_SCHED
-
-config RT_GROUP_SCHED
-	bool "Group scheduling for SCHED_RR/FIFO"
-	depends on EXPERIMENTAL
-	depends on GROUP_SCHED
-	default n
-	help
-	  This feature lets you explicitly allocate real CPU bandwidth
-	  to users or control groups (depending on the "Basis for grouping tasks"
-	  setting below. If enabled, it will also make it impossible to
-	  schedule realtime tasks for non-root users until you allocate
-	  realtime bandwidth for them.
-	  See Documentation/scheduler/sched-rt-group.txt for more information.
-
-choice
-	depends on GROUP_SCHED
-	prompt "Basis for grouping tasks"
-	default USER_SCHED
-
-config USER_SCHED
-	bool "user id"
-	help
-	  This option will choose userid as the basis for grouping
-	  tasks, thus providing equal CPU bandwidth to each user.
-
-config CGROUP_SCHED
-	bool "Control groups"
-	depends on CGROUPS
-	help
-	  This option allows you to create arbitrary task groups
-	  using the "cgroup" pseudo filesystem and control
-	  the cpu bandwidth allocated to each such task group.
-	  Refer to Documentation/cgroups/cgroups.txt for more
-	  information on "cgroup" pseudo filesystem.
-
-endchoice
-
 menuconfig CGROUPS
 	boolean "Control Group support"
 	help
@@ -616,6 +565,36 @@ config CGROUP_MEM_RES_CTLR_SWAP
 	  Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
 	  size is 4096bytes, 512k per 1Gbytes of swap.
 
+menuconfig CGROUP_SCHED
+	bool "Group CPU scheduler"
+	depends on EXPERIMENTAL && CGROUPS
+	default n
+	help
+	  This feature lets CPU scheduler recognize task groups and control CPU
+	  bandwidth allocation to such task groups. It uses cgroups to group
+	  tasks.
+
+if CGROUP_SCHED
+config FAIR_GROUP_SCHED
+	bool "Group scheduling for SCHED_OTHER"
+	depends on CGROUP_SCHED
+	default CGROUP_SCHED
+
+config RT_GROUP_SCHED
+	bool "Group scheduling for SCHED_RR/FIFO"
+	depends on EXPERIMENTAL
+	depends on CGROUP_SCHED
+	default n
+	help
+	  This feature lets you explicitly allocate real CPU bandwidth
+	  to users or control groups (depending on the "Basis for grouping tasks"
+	  setting below. If enabled, it will also make it impossible to
+	  schedule realtime tasks for non-root users until you allocate
+	  realtime bandwidth for them.
+	  See Documentation/scheduler/sched-rt-group.txt for more information.
+
+endif #CGROUP_SCHED
+
 endif # CGROUPS
 
 config MM_OWNER
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 3feaf5a74514..6b1ccc3f0205 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -197,16 +197,8 @@ static int __init ksysfs_init(void)
 		goto group_exit;
 	}
 
-	/* create the /sys/kernel/uids/ directory */
-	error = uids_sysfs_init();
-	if (error)
-		goto notes_exit;
-
 	return 0;
 
-notes_exit:
-	if (notes_size > 0)
-		sysfs_remove_bin_file(kernel_kobj, &notes_attr);
 group_exit:
 	sysfs_remove_group(kernel_kobj, &kernel_attr_group);
 kset_exit:
diff --git a/kernel/kthread.c b/kernel/kthread.c
index fbb6222fe7e0..82ed0ea15194 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -101,7 +101,7 @@ static void create_kthread(struct kthread_create_info *create)
  *
  * Description: This helper function creates and names a kernel
  * thread. The thread will be stopped: use wake_up_process() to start
- * it. See also kthread_run(), kthread_create_on_cpu().
+ * it. See also kthread_run().
  *
  * When woken, the thread will run @threadfn() with @data as its
  * argument. @threadfn() can either call do_exit() directly if it is a
diff --git a/kernel/sched.c b/kernel/sched.c
index 404e2017c0cf..af5fa239804d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -233,7 +233,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
  */
 static DEFINE_MUTEX(sched_domains_mutex);
 
-#ifdef CONFIG_GROUP_SCHED
+#ifdef CONFIG_CGROUP_SCHED
 
 #include <linux/cgroup.h>
 
@@ -243,13 +243,7 @@ static LIST_HEAD(task_groups);
 
 /* task group related information */
 struct task_group {
-#ifdef CONFIG_CGROUP_SCHED
 	struct cgroup_subsys_state css;
-#endif
-
-#ifdef CONFIG_USER_SCHED
-	uid_t uid;
-#endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* schedulable entities of this group on each cpu */
@@ -274,35 +268,7 @@ struct task_group {
 	struct list_head children;
 };
 
-#ifdef CONFIG_USER_SCHED
-
-/* Helper function to pass uid information to create_sched_user() */
-void set_tg_uid(struct user_struct *user)
-{
-	user->tg->uid = user->uid;
-}
-
-/*
- * Root task group.
- * Every UID task group (including init_task_group aka UID-0) will
- * be a child to this group.
- */
-struct task_group root_task_group;
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/* Default task group's sched entity on each cpu */
-static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
-/* Default task group's cfs_rq on each cpu */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-
-#ifdef CONFIG_RT_GROUP_SCHED
-static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
-#endif /* CONFIG_RT_GROUP_SCHED */
-#else /* !CONFIG_USER_SCHED */
 #define root_task_group init_task_group
-#endif /* CONFIG_USER_SCHED */
 
 /* task_group_lock serializes add/remove of task groups and also changes to
  * a task group's cpu shares.
@@ -318,11 +284,7 @@ static int root_task_group_empty(void)
 }
 #endif
 
-#ifdef CONFIG_USER_SCHED
-# define INIT_TASK_GROUP_LOAD	(2*NICE_0_LOAD)
-#else /* !CONFIG_USER_SCHED */
 # define INIT_TASK_GROUP_LOAD	NICE_0_LOAD
-#endif /* CONFIG_USER_SCHED */
 
 /*
  * A weight of 0 or 1 can cause arithmetics problems.
@@ -348,11 +310,7 @@ static inline struct task_group *task_group(struct task_struct *p)
 {
 	struct task_group *tg;
 
-#ifdef CONFIG_USER_SCHED
-	rcu_read_lock();
-	tg = __task_cred(p)->user->tg;
-	rcu_read_unlock();
-#elif defined(CONFIG_CGROUP_SCHED)
+#ifdef CONFIG_CGROUP_SCHED
 	tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
 			  struct task_group, css);
 #else
@@ -383,7 +341,7 @@ static inline struct task_group *task_group(struct task_struct *p)
 	return NULL;
 }
 
-#endif	/* CONFIG_GROUP_SCHED */
+#endif	/* CONFIG_CGROUP_SCHED */
 
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
@@ -478,7 +436,6 @@ struct rt_rq {
 	struct rq *rq;
 	struct list_head leaf_rt_rq_list;
 	struct task_group *tg;
-	struct sched_rt_entity *rt_se;
 #endif
 };
 
@@ -1409,32 +1366,6 @@ static const u32 prio_to_wmult[40] = {
  /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
 
-static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
-
-/*
- * runqueue iterator, to support SMP load-balancing between different
- * scheduling classes, without having to expose their internal data
- * structures to the load-balancing proper:
- */
-struct rq_iterator {
-	void *arg;
-	struct task_struct *(*start)(void *);
-	struct task_struct *(*next)(void *);
-};
-
-#ifdef CONFIG_SMP
-static unsigned long
-balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-	      unsigned long max_load_move, struct sched_domain *sd,
-	      enum cpu_idle_type idle, int *all_pinned,
-	      int *this_best_prio, struct rq_iterator *iterator);
-
-static int
-iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		   struct sched_domain *sd, enum cpu_idle_type idle,
-		   struct rq_iterator *iterator);
-#endif
-
 /* Time spent by the tasks of the cpu accounting group executing in ... */
 enum cpuacct_stat_index {
 	CPUACCT_STAT_USER,	/* ... user mode */
@@ -1720,16 +1651,6 @@ static void update_shares(struct sched_domain *sd)
 	}
 }
 
-static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
-{
-	if (root_task_group_empty())
-		return;
-
-	raw_spin_unlock(&rq->lock);
-	update_shares(sd);
-	raw_spin_lock(&rq->lock);
-}
-
 static void update_h_load(long cpu)
 {
 	if (root_task_group_empty())
@@ -1744,10 +1665,6 @@ static inline void update_shares(struct sched_domain *sd)
 {
 }
 
-static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
-{
-}
-
 #endif
 
 #ifdef CONFIG_PREEMPT
@@ -1824,6 +1741,51 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
 	raw_spin_unlock(&busiest->lock);
 	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
 }
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static void double_rq_lock(struct rq *rq1, struct rq *rq2)
+	__acquires(rq1->lock)
+	__acquires(rq2->lock)
+{
+	BUG_ON(!irqs_disabled());
+	if (rq1 == rq2) {
+		raw_spin_lock(&rq1->lock);
+		__acquire(rq2->lock);	/* Fake it out ;) */
+	} else {
+		if (rq1 < rq2) {
+			raw_spin_lock(&rq1->lock);
+			raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
+		} else {
+			raw_spin_lock(&rq2->lock);
+			raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
+		}
+	}
+	update_rq_clock(rq1);
+	update_rq_clock(rq2);
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+	__releases(rq1->lock)
+	__releases(rq2->lock)
+{
+	raw_spin_unlock(&rq1->lock);
+	if (rq1 != rq2)
+		raw_spin_unlock(&rq2->lock);
+	else
+		__release(rq2->lock);
+}
+
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1853,18 +1815,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 #endif
 }
 
-#include "sched_stats.h"
-#include "sched_idletask.c"
-#include "sched_fair.c"
-#include "sched_rt.c"
-#ifdef CONFIG_SCHED_DEBUG
-# include "sched_debug.c"
-#endif
+static const struct sched_class rt_sched_class;
 
 #define sched_class_highest (&rt_sched_class)
 #define for_each_class(class) \
    for (class = sched_class_highest; class; class = class->next)
 
+#include "sched_stats.h"
+
 static void inc_nr_running(struct rq *rq)
 {
 	rq->nr_running++;
@@ -1902,13 +1860,14 @@ static void update_avg(u64 *avg, u64 sample)
 	*avg += diff >> 3;
 }
 
-static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
+static void
+enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
 {
 	if (wakeup)
 		p->se.start_runtime = p->se.sum_exec_runtime;
 
 	sched_info_queued(p);
-	p->sched_class->enqueue_task(rq, p, wakeup);
+	p->sched_class->enqueue_task(rq, p, wakeup, head);
 	p->se.on_rq = 1;
 }
 
@@ -1931,6 +1890,37 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
 }
 
 /*
+ * activate_task - move a task to the runqueue.
+ */
+static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
+{
+	if (task_contributes_to_load(p))
+		rq->nr_uninterruptible--;
+
+	enqueue_task(rq, p, wakeup, false);
+	inc_nr_running(rq);
+}
+
+/*
+ * deactivate_task - remove a task from the runqueue.
+ */
+static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
+{
+	if (task_contributes_to_load(p))
+		rq->nr_uninterruptible++;
+
+	dequeue_task(rq, p, sleep);
+	dec_nr_running(rq);
+}
+
+#include "sched_idletask.c"
+#include "sched_fair.c"
+#include "sched_rt.c"
+#ifdef CONFIG_SCHED_DEBUG
+# include "sched_debug.c"
+#endif
+
+/*
  * __normal_prio - return the priority that is based on the static prio
  */
 static inline int __normal_prio(struct task_struct *p)
@@ -1976,30 +1966,6 @@ static int effective_prio(struct task_struct *p)
 	return p->prio;
 }
 
-/*
- * activate_task - move a task to the runqueue.
- */
-static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
-{
-	if (task_contributes_to_load(p))
-		rq->nr_uninterruptible--;
-
-	enqueue_task(rq, p, wakeup);
-	inc_nr_running(rq);
-}
-
-/*
- * deactivate_task - remove a task from the runqueue.
- */
-static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
-{
-	if (task_contributes_to_load(p))
-		rq->nr_uninterruptible++;
-
-	dequeue_task(rq, p, sleep);
-	dec_nr_running(rq);
-}
-
 /**
  * task_curr - is this task currently executing on a CPU?
  * @p: the task in question.
@@ -3137,50 +3103,6 @@ static void update_cpu_load(struct rq *this_rq)
 #ifdef CONFIG_SMP
 
 /*
- * double_rq_lock - safely lock two runqueues
- *
- * Note this does not disable interrupts like task_rq_lock,
- * you need to do so manually before calling.
- */
-static void double_rq_lock(struct rq *rq1, struct rq *rq2)
-	__acquires(rq1->lock)
-	__acquires(rq2->lock)
-{
-	BUG_ON(!irqs_disabled());
-	if (rq1 == rq2) {
-		raw_spin_lock(&rq1->lock);
-		__acquire(rq2->lock);	/* Fake it out ;) */
-	} else {
-		if (rq1 < rq2) {
-			raw_spin_lock(&rq1->lock);
-			raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
-		} else {
-			raw_spin_lock(&rq2->lock);
-			raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
-		}
-	}
-	update_rq_clock(rq1);
-	update_rq_clock(rq2);
-}
-
-/*
- * double_rq_unlock - safely unlock two runqueues
- *
- * Note this does not restore interrupts like task_rq_unlock,
- * you need to do so manually after calling.
- */
-static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
-	__releases(rq1->lock)
-	__releases(rq2->lock)
-{
-	raw_spin_unlock(&rq1->lock);
-	if (rq1 != rq2)
-		raw_spin_unlock(&rq2->lock);
-	else
-		__release(rq2->lock);
-}
-
-/*
  * sched_exec - execve() is a valuable balancing opportunity, because at
  * this point the task has the smallest effective memory and cache footprint.
  */
@@ -3228,1782 +3150,6 @@ again:
 	task_rq_unlock(rq, &flags);
 }
 
3231/*
3232 * pull_task - move a task from a remote runqueue to the local runqueue.
3233 * Both runqueues must be locked.
3234 */
3235static void pull_task(struct rq *src_rq, struct task_struct *p,
3236 struct rq *this_rq, int this_cpu)
3237{
3238 deactivate_task(src_rq, p, 0);
3239 set_task_cpu(p, this_cpu);
3240 activate_task(this_rq, p, 0);
3241 check_preempt_curr(this_rq, p, 0);
3242}
3243
3244/*
3245 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3246 */
3247static
3248int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3249 struct sched_domain *sd, enum cpu_idle_type idle,
3250 int *all_pinned)
3251{
3252 int tsk_cache_hot = 0;
3253 /*
3254 * We do not migrate tasks that are:
3255 * 1) running (obviously), or
3256 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3257 * 3) are cache-hot on their current CPU.
3258 */
3259 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3260 schedstat_inc(p, se.nr_failed_migrations_affine);
3261 return 0;
3262 }
3263 *all_pinned = 0;
3264
3265 if (task_running(rq, p)) {
3266 schedstat_inc(p, se.nr_failed_migrations_running);
3267 return 0;
3268 }
3269
3270 /*
3271 * Aggressive migration if:
3272 * 1) task is cache cold, or
3273 * 2) too many balance attempts have failed.
3274 */
3275
3276 tsk_cache_hot = task_hot(p, rq->clock, sd);
3277 if (!tsk_cache_hot ||
3278 sd->nr_balance_failed > sd->cache_nice_tries) {
3279#ifdef CONFIG_SCHEDSTATS
3280 if (tsk_cache_hot) {
3281 schedstat_inc(sd, lb_hot_gained[idle]);
3282 schedstat_inc(p, se.nr_forced_migrations);
3283 }
3284#endif
3285 return 1;
3286 }
3287
3288 if (tsk_cache_hot) {
3289 schedstat_inc(p, se.nr_failed_migrations_hot);
3290 return 0;
3291 }
3292 return 1;
3293}
3294
3295static unsigned long
3296balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3297 unsigned long max_load_move, struct sched_domain *sd,
3298 enum cpu_idle_type idle, int *all_pinned,
3299 int *this_best_prio, struct rq_iterator *iterator)
3300{
3301 int loops = 0, pulled = 0, pinned = 0;
3302 struct task_struct *p;
3303 long rem_load_move = max_load_move;
3304
3305 if (max_load_move == 0)
3306 goto out;
3307
3308 pinned = 1;
3309
3310 /*
3311 * Start the load-balancing iterator:
3312 */
3313 p = iterator->start(iterator->arg);
3314next:
3315 if (!p || loops++ > sysctl_sched_nr_migrate)
3316 goto out;
3317
3318 if ((p->se.load.weight >> 1) > rem_load_move ||
3319 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3320 p = iterator->next(iterator->arg);
3321 goto next;
3322 }
3323
3324 pull_task(busiest, p, this_rq, this_cpu);
3325 pulled++;
3326 rem_load_move -= p->se.load.weight;
3327
3328#ifdef CONFIG_PREEMPT
3329 /*
3330 * NEWIDLE balancing is a source of latency, so preemptible kernels
3331 * will stop after the first task is pulled to minimize the critical
3332 * section.
3333 */
3334 if (idle == CPU_NEWLY_IDLE)
3335 goto out;
3336#endif
3337
3338 /*
3339 * We only want to steal up to the prescribed amount of weighted load.
3340 */
3341 if (rem_load_move > 0) {
3342 if (p->prio < *this_best_prio)
3343 *this_best_prio = p->prio;
3344 p = iterator->next(iterator->arg);
3345 goto next;
3346 }
3347out:
3348 /*
3349 * Right now, this is one of only two places pull_task() is called,
3350 * so we can safely collect pull_task() stats here rather than
3351 * inside pull_task().
3352 */
3353 schedstat_add(sd, lb_gained[idle], pulled);
3354
3355 if (all_pinned)
3356 *all_pinned = pinned;
3357
3358 return max_load_move - rem_load_move;
3359}
3360
3361/*
3362 * move_tasks tries to move up to max_load_move weighted load from busiest to
3363 * this_rq, as part of a balancing operation within domain "sd".
3364 * Returns 1 if successful and 0 otherwise.
3365 *
3366 * Called with both runqueues locked.
3367 */
3368static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3369 unsigned long max_load_move,
3370 struct sched_domain *sd, enum cpu_idle_type idle,
3371 int *all_pinned)
3372{
3373 const struct sched_class *class = sched_class_highest;
3374 unsigned long total_load_moved = 0;
3375 int this_best_prio = this_rq->curr->prio;
3376
3377 do {
3378 total_load_moved +=
3379 class->load_balance(this_rq, this_cpu, busiest,
3380 max_load_move - total_load_moved,
3381 sd, idle, all_pinned, &this_best_prio);
3382 class = class->next;
3383
3384#ifdef CONFIG_PREEMPT
3385 /*
3386 * NEWIDLE balancing is a source of latency, so preemptible
3387 * kernels will stop after the first task is pulled to minimize
3388 * the critical section.
3389 */
3390 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3391 break;
3392#endif
3393 } while (class && max_load_move > total_load_moved);
3394
3395 return total_load_moved > 0;
3396}
3397
3398static int
3399iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3400 struct sched_domain *sd, enum cpu_idle_type idle,
3401 struct rq_iterator *iterator)
3402{
3403 struct task_struct *p = iterator->start(iterator->arg);
3404 int pinned = 0;
3405
3406 while (p) {
3407 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3408 pull_task(busiest, p, this_rq, this_cpu);
3409 /*
3410 * Right now, this is only the second place pull_task()
3411 * is called, so we can safely collect pull_task()
3412 * stats here rather than inside pull_task().
3413 */
3414 schedstat_inc(sd, lb_gained[idle]);
3415
3416 return 1;
3417 }
3418 p = iterator->next(iterator->arg);
3419 }
3420
3421 return 0;
3422}
3423
3424/*
3425 * move_one_task tries to move exactly one task from busiest to this_rq, as
3426 * part of active balancing operations within "domain".
3427 * Returns 1 if successful and 0 otherwise.
3428 *
3429 * Called with both runqueues locked.
3430 */
3431static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3432 struct sched_domain *sd, enum cpu_idle_type idle)
3433{
3434 const struct sched_class *class;
3435
3436 for_each_class(class) {
3437 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3438 return 1;
3439 }
3440
3441 return 0;
3442}
3443/********** Helpers for find_busiest_group ************************/
3444/*
3445 * sd_lb_stats - Structure to store the statistics of a sched_domain
3446 * during load balancing.
3447 */
3448struct sd_lb_stats {
3449 struct sched_group *busiest; /* Busiest group in this sd */
3450 struct sched_group *this; /* Local group in this sd */
3451 unsigned long total_load; /* Total load of all groups in sd */
3452 unsigned long total_pwr; /* Total power of all groups in sd */
3453 unsigned long avg_load; /* Average load across all groups in sd */
3454
3455 /** Statistics of this group */
3456 unsigned long this_load;
3457 unsigned long this_load_per_task;
3458 unsigned long this_nr_running;
3459
3460 /* Statistics of the busiest group */
3461 unsigned long max_load;
3462 unsigned long busiest_load_per_task;
3463 unsigned long busiest_nr_running;
3464
3465 int group_imb; /* Is there imbalance in this sd */
3466#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3467 int power_savings_balance; /* Is powersave balance needed for this sd */
3468 struct sched_group *group_min; /* Least loaded group in sd */
3469 struct sched_group *group_leader; /* Group which relieves group_min */
3470 unsigned long min_load_per_task; /* load_per_task in group_min */
3471 unsigned long leader_nr_running; /* Nr running of group_leader */
3472 unsigned long min_nr_running; /* Nr running of group_min */
3473#endif
3474};
3475
3476/*
3477 * sg_lb_stats - stats of a sched_group required for load_balancing
3478 */
3479struct sg_lb_stats {
3480 unsigned long avg_load; /*Avg load across the CPUs of the group */
3481 unsigned long group_load; /* Total load over the CPUs of the group */
3482 unsigned long sum_nr_running; /* Nr tasks running in the group */
3483 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3484 unsigned long group_capacity;
3485 int group_imb; /* Is there an imbalance in the group ? */
3486};
3487
3488/**
3489 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3490 * @group: The group whose first cpu is to be returned.
3491 */
3492static inline unsigned int group_first_cpu(struct sched_group *group)
3493{
3494 return cpumask_first(sched_group_cpus(group));
3495}
3496
3497/**
3498 * get_sd_load_idx - Obtain the load index for a given sched domain.
3499 * @sd: The sched_domain whose load_idx is to be obtained.
3500 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
3501 */
3502static inline int get_sd_load_idx(struct sched_domain *sd,
3503 enum cpu_idle_type idle)
3504{
3505 int load_idx;
3506
3507 switch (idle) {
3508 case CPU_NOT_IDLE:
3509 load_idx = sd->busy_idx;
3510 break;
3511
3512 case CPU_NEWLY_IDLE:
3513 load_idx = sd->newidle_idx;
3514 break;
3515 default:
3516 load_idx = sd->idle_idx;
3517 break;
3518 }
3519
3520 return load_idx;
3521}
3522
3523
3524#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3525/**
3526 * init_sd_power_savings_stats - Initialize power savings statistics for
3527 * the given sched_domain, during load balancing.
3528 *
3529 * @sd: Sched domain whose power-savings statistics are to be initialized.
3530 * @sds: Variable containing the statistics for sd.
3531 * @idle: Idle status of the CPU at which we're performing load-balancing.
3532 */
3533static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3534 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3535{
3536 /*
3537 * Busy processors will not participate in power savings
3538 * balance.
3539 */
3540 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3541 sds->power_savings_balance = 0;
3542 else {
3543 sds->power_savings_balance = 1;
3544 sds->min_nr_running = ULONG_MAX;
3545 sds->leader_nr_running = 0;
3546 }
3547}
3548
3549/**
3550 * update_sd_power_savings_stats - Update the power saving stats for a
3551 * sched_domain while performing load balancing.
3552 *
3553 * @group: sched_group belonging to the sched_domain under consideration.
3554 * @sds: Variable containing the statistics of the sched_domain
3555 * @local_group: Does group contain the CPU for which we're performing
3556 * load balancing ?
3557 * @sgs: Variable containing the statistics of the group.
3558 */
3559static inline void update_sd_power_savings_stats(struct sched_group *group,
3560 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3561{
3562
3563 if (!sds->power_savings_balance)
3564 return;
3565
3566 /*
3567 * If the local group is idle or completely loaded
3568 * no need to do power savings balance at this domain
3569 */
3570 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3571 !sds->this_nr_running))
3572 sds->power_savings_balance = 0;
3573
3574 /*
3575 * If a group is already running at full capacity or idle,
3576 * don't include that group in power savings calculations
3577 */
3578 if (!sds->power_savings_balance ||
3579 sgs->sum_nr_running >= sgs->group_capacity ||
3580 !sgs->sum_nr_running)
3581 return;
3582
3583 /*
3584 * Calculate the group which has the least non-idle load.
3585 * This is the group from where we need to pick up the load
3586 * for saving power
3587 */
3588 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3589 (sgs->sum_nr_running == sds->min_nr_running &&
3590 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3591 sds->group_min = group;
3592 sds->min_nr_running = sgs->sum_nr_running;
3593 sds->min_load_per_task = sgs->sum_weighted_load /
3594 sgs->sum_nr_running;
3595 }
3596
3597 /*
3598 * Calculate the group which is almost near its
3599 * capacity but still has some space to pick up some load
3600 * from other group and save more power
3601 */
3602 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3603 return;
3604
3605 if (sgs->sum_nr_running > sds->leader_nr_running ||
3606 (sgs->sum_nr_running == sds->leader_nr_running &&
3607 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3608 sds->group_leader = group;
3609 sds->leader_nr_running = sgs->sum_nr_running;
3610 }
3611}
3612
3613/**
3614 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3615 * @sds: Variable containing the statistics of the sched_domain
3616 * under consideration.
3617 * @this_cpu: Cpu at which we're currently performing load-balancing.
3618 * @imbalance: Variable to store the imbalance.
3619 *
3620 * Description:
3621 * Check if we have potential to perform some power-savings balance.
3622 * If yes, set the busiest group to be the least loaded group in the
3623 * sched_domain, so that it's CPUs can be put to idle.
3624 *
3625 * Returns 1 if there is potential to perform power-savings balance.
3626 * Else returns 0.
3627 */
3628static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3629 int this_cpu, unsigned long *imbalance)
3630{
3631 if (!sds->power_savings_balance)
3632 return 0;
3633
3634 if (sds->this != sds->group_leader ||
3635 sds->group_leader == sds->group_min)
3636 return 0;
3637
3638 *imbalance = sds->min_load_per_task;
3639 sds->busiest = sds->group_min;
3640
3641 return 1;
3642
3643}
3644#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3645static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3646 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3647{
3648 return;
3649}
3650
3651static inline void update_sd_power_savings_stats(struct sched_group *group,
3652 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3653{
3654 return;
3655}
3656
3657static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3658 int this_cpu, unsigned long *imbalance)
3659{
3660 return 0;
3661}
3662#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3663
3664
3665unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3666{
3667 return SCHED_LOAD_SCALE;
3668}
3669
3670unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3671{
3672 return default_scale_freq_power(sd, cpu);
3673}
3674
3675unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3676{
3677 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3678 unsigned long smt_gain = sd->smt_gain;
3679
3680 smt_gain /= weight;
3681
3682 return smt_gain;
3683}
3684
3685unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3686{
3687 return default_scale_smt_power(sd, cpu);
3688}
3689
3690unsigned long scale_rt_power(int cpu)
3691{
3692 struct rq *rq = cpu_rq(cpu);
3693 u64 total, available;
3694
3695 sched_avg_update(rq);
3696
3697 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3698 available = total - rq->rt_avg;
3699
3700 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3701 total = SCHED_LOAD_SCALE;
3702
3703 total >>= SCHED_LOAD_SHIFT;
3704
3705 return div_u64(available, total);
3706}
3707
3708static void update_cpu_power(struct sched_domain *sd, int cpu)
3709{
3710 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3711 unsigned long power = SCHED_LOAD_SCALE;
3712 struct sched_group *sdg = sd->groups;
3713
3714 if (sched_feat(ARCH_POWER))
3715 power *= arch_scale_freq_power(sd, cpu);
3716 else
3717 power *= default_scale_freq_power(sd, cpu);
3718
3719 power >>= SCHED_LOAD_SHIFT;
3720
3721 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3722 if (sched_feat(ARCH_POWER))
3723 power *= arch_scale_smt_power(sd, cpu);
3724 else
3725 power *= default_scale_smt_power(sd, cpu);
3726
3727 power >>= SCHED_LOAD_SHIFT;
3728 }
3729
3730 power *= scale_rt_power(cpu);
3731 power >>= SCHED_LOAD_SHIFT;
3732
3733 if (!power)
3734 power = 1;
3735
3736 sdg->cpu_power = power;
3737}
3738
3739static void update_group_power(struct sched_domain *sd, int cpu)
3740{
3741 struct sched_domain *child = sd->child;
3742 struct sched_group *group, *sdg = sd->groups;
3743 unsigned long power;
3744
3745 if (!child) {
3746 update_cpu_power(sd, cpu);
3747 return;
3748 }
3749
3750 power = 0;
3751
3752 group = child->groups;
3753 do {
3754 power += group->cpu_power;
3755 group = group->next;
3756 } while (group != child->groups);
3757
3758 sdg->cpu_power = power;
3759}
3760
3761/**
3762 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3763 * @sd: The sched_domain whose statistics are to be updated.
3764 * @group: sched_group whose statistics are to be updated.
3765 * @this_cpu: Cpu for which load balance is currently performed.
3766 * @idle: Idle status of this_cpu
3767 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3768 * @sd_idle: Idle status of the sched_domain containing group.
3769 * @local_group: Does group contain this_cpu.
3770 * @cpus: Set of cpus considered for load balancing.
3771 * @balance: Should we balance.
3772 * @sgs: variable to hold the statistics for this group.
3773 */
3774static inline void update_sg_lb_stats(struct sched_domain *sd,
3775 struct sched_group *group, int this_cpu,
3776 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3777 int local_group, const struct cpumask *cpus,
3778 int *balance, struct sg_lb_stats *sgs)
3779{
3780 unsigned long load, max_cpu_load, min_cpu_load;
3781 int i;
3782 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3783 unsigned long sum_avg_load_per_task;
3784 unsigned long avg_load_per_task;
3785
3786 if (local_group) {
3787 balance_cpu = group_first_cpu(group);
3788 if (balance_cpu == this_cpu)
3789 update_group_power(sd, this_cpu);
3790 }
3791
3792 /* Tally up the load of all CPUs in the group */
3793 sum_avg_load_per_task = avg_load_per_task = 0;
3794 max_cpu_load = 0;
3795 min_cpu_load = ~0UL;
3796
3797 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3798 struct rq *rq = cpu_rq(i);
3799
3800 if (*sd_idle && rq->nr_running)
3801 *sd_idle = 0;
3802
3803 /* Bias balancing toward cpus of our domain */
3804 if (local_group) {
3805 if (idle_cpu(i) && !first_idle_cpu) {
3806 first_idle_cpu = 1;
3807 balance_cpu = i;
3808 }
3809
3810 load = target_load(i, load_idx);
3811 } else {
3812 load = source_load(i, load_idx);
3813 if (load > max_cpu_load)
3814 max_cpu_load = load;
3815 if (min_cpu_load > load)
3816 min_cpu_load = load;
3817 }
3818
3819 sgs->group_load += load;
3820 sgs->sum_nr_running += rq->nr_running;
3821 sgs->sum_weighted_load += weighted_cpuload(i);
3822
3823 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3824 }
3825
3826 /*
3827 * First idle cpu or the first cpu(busiest) in this sched group
3828 * is eligible for doing load balancing at this and above
3829 * domains. In the newly idle case, we will allow all the cpu's
3830 * to do the newly idle load balance.
3831 */
3832 if (idle != CPU_NEWLY_IDLE && local_group &&
3833 balance_cpu != this_cpu && balance) {
3834 *balance = 0;
3835 return;
3836 }
3837
3838 /* Adjust by relative CPU power of the group */
3839 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3840
3841
3842 /*
3843 * Consider the group unbalanced when the imbalance is larger
3844 * than the average weight of two tasks.
3845 *
3846 * APZ: with cgroup the avg task weight can vary wildly and
3847 * might not be a suitable number - should we keep a
3848 * normalized nr_running number somewhere that negates
3849 * the hierarchy?
3850 */
3851 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3852 group->cpu_power;
3853
3854 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3855 sgs->group_imb = 1;
3856
3857 sgs->group_capacity =
3858 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3859}
3860
3861/**
3862 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
3863 * @sd: sched_domain whose statistics are to be updated.
3864 * @this_cpu: Cpu for which load balance is currently performed.
3865 * @idle: Idle status of this_cpu
3866 * @sd_idle: Idle status of the sched_domain containing group.
3867 * @cpus: Set of cpus considered for load balancing.
3868 * @balance: Should we balance.
3869 * @sds: variable to hold the statistics for this sched_domain.
3870 */
3871static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3872 enum cpu_idle_type idle, int *sd_idle,
3873 const struct cpumask *cpus, int *balance,
3874 struct sd_lb_stats *sds)
3875{
3876 struct sched_domain *child = sd->child;
3877 struct sched_group *group = sd->groups;
3878 struct sg_lb_stats sgs;
3879 int load_idx, prefer_sibling = 0;
3880
3881 if (child && child->flags & SD_PREFER_SIBLING)
3882 prefer_sibling = 1;
3883
3884 init_sd_power_savings_stats(sd, sds, idle);
3885 load_idx = get_sd_load_idx(sd, idle);
3886
3887 do {
3888 int local_group;
3889
3890 local_group = cpumask_test_cpu(this_cpu,
3891 sched_group_cpus(group));
3892 memset(&sgs, 0, sizeof(sgs));
3893 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3894 local_group, cpus, balance, &sgs);
3895
3896 if (local_group && balance && !(*balance))
3897 return;
3898
3899 sds->total_load += sgs.group_load;
3900 sds->total_pwr += group->cpu_power;
3901
3902 /*
3903 * In case the child domain prefers tasks go to siblings
3904 * first, lower the group capacity to one so that we'll try
3905 * and move all the excess tasks away.
3906 */
3907 if (prefer_sibling)
3908 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3909
3910 if (local_group) {
3911 sds->this_load = sgs.avg_load;
3912 sds->this = group;
3913 sds->this_nr_running = sgs.sum_nr_running;
3914 sds->this_load_per_task = sgs.sum_weighted_load;
3915 } else if (sgs.avg_load > sds->max_load &&
3916 (sgs.sum_nr_running > sgs.group_capacity ||
3917 sgs.group_imb)) {
3918 sds->max_load = sgs.avg_load;
3919 sds->busiest = group;
3920 sds->busiest_nr_running = sgs.sum_nr_running;
3921 sds->busiest_load_per_task = sgs.sum_weighted_load;
3922 sds->group_imb = sgs.group_imb;
3923 }
3924
3925 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3926 group = group->next;
3927 } while (group != sd->groups);
3928}
3929
3930/**
3931 * fix_small_imbalance - Calculate the minor imbalance that exists
3932 * amongst the groups of a sched_domain, during
3933 * load balancing.
3934 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3935 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3936 * @imbalance: Variable to store the imbalance.
3937 */
3938static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3939 int this_cpu, unsigned long *imbalance)
3940{
3941 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3942 unsigned int imbn = 2;
3943
3944 if (sds->this_nr_running) {
3945 sds->this_load_per_task /= sds->this_nr_running;
3946 if (sds->busiest_load_per_task >
3947 sds->this_load_per_task)
3948 imbn = 1;
3949 } else
3950 sds->this_load_per_task =
3951 cpu_avg_load_per_task(this_cpu);
3952
3953 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3954 sds->busiest_load_per_task * imbn) {
3955 *imbalance = sds->busiest_load_per_task;
3956 return;
3957 }
3958
3959 /*
3960 * OK, we don't have enough imbalance to justify moving tasks,
3961 * however we may be able to increase total CPU power used by
3962 * moving them.
3963 */
3964
3965 pwr_now += sds->busiest->cpu_power *
3966 min(sds->busiest_load_per_task, sds->max_load);
3967 pwr_now += sds->this->cpu_power *
3968 min(sds->this_load_per_task, sds->this_load);
3969 pwr_now /= SCHED_LOAD_SCALE;
3970
3971 /* Amount of load we'd subtract */
3972 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3973 sds->busiest->cpu_power;
3974 if (sds->max_load > tmp)
3975 pwr_move += sds->busiest->cpu_power *
3976 min(sds->busiest_load_per_task, sds->max_load - tmp);
3977
3978 /* Amount of load we'd add */
3979 if (sds->max_load * sds->busiest->cpu_power <
3980 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3981 tmp = (sds->max_load * sds->busiest->cpu_power) /
3982 sds->this->cpu_power;
3983 else
3984 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3985 sds->this->cpu_power;
3986 pwr_move += sds->this->cpu_power *
3987 min(sds->this_load_per_task, sds->this_load + tmp);
3988 pwr_move /= SCHED_LOAD_SCALE;
3989
3990 /* Move if we gain throughput */
3991 if (pwr_move > pwr_now)
3992 *imbalance = sds->busiest_load_per_task;
3993}
3994
3995/**
3996 * calculate_imbalance - Calculate the amount of imbalance present within the
3997 * groups of a given sched_domain during load balance.
3998 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3999 * @this_cpu: Cpu for which currently load balance is being performed.
4000 * @imbalance: The variable to store the imbalance.
4001 */
4002static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4003 unsigned long *imbalance)
4004{
4005 unsigned long max_pull;
4006 /*
4007 * In the presence of smp nice balancing, certain scenarios can have
4008 * max load less than avg load(as we skip the groups at or below
4009 * its cpu_power, while calculating max_load..)
4010 */
4011 if (sds->max_load < sds->avg_load) {
4012 *imbalance = 0;
4013 return fix_small_imbalance(sds, this_cpu, imbalance);
4014 }
4015
4016 /* Don't want to pull so many tasks that a group would go idle */
4017 max_pull = min(sds->max_load - sds->avg_load,
4018 sds->max_load - sds->busiest_load_per_task);
4019
4020 /* How much load to actually move to equalise the imbalance */
4021 *imbalance = min(max_pull * sds->busiest->cpu_power,
4022 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
4023 / SCHED_LOAD_SCALE;
4024
4025 /*
4026 * if *imbalance is less than the average load per runnable task
4027 * there is no gaurantee that any tasks will be moved so we'll have
4028 * a think about bumping its value to force at least one task to be
4029 * moved
4030 */
4031 if (*imbalance < sds->busiest_load_per_task)
4032 return fix_small_imbalance(sds, this_cpu, imbalance);
4033
4034}
4035/******* find_busiest_group() helpers end here *********************/
4036
4037/**
4038 * find_busiest_group - Returns the busiest group within the sched_domain
4039 * if there is an imbalance. If there isn't an imbalance, and
4040 * the user has opted for power-savings, it returns a group whose
4041 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
4042 * such a group exists.
4043 *
4044 * Also calculates the amount of weighted load which should be moved
4045 * to restore balance.
4046 *
4047 * @sd: The sched_domain whose busiest group is to be returned.
4048 * @this_cpu: The cpu for which load balancing is currently being performed.
4049 * @imbalance: Variable which stores amount of weighted load which should
4050 * be moved to restore balance/put a group to idle.
4051 * @idle: The idle status of this_cpu.
4052 * @sd_idle: The idleness of sd
4053 * @cpus: The set of CPUs under consideration for load-balancing.
4054 * @balance: Pointer to a variable indicating if this_cpu
4055 * is the appropriate cpu to perform load balancing at this_level.
4056 *
4057 * Returns: - the busiest group if imbalance exists.
4058 * - If no imbalance and user has opted for power-savings balance,
4059 * return the least loaded group whose CPUs can be
4060 * put to idle by rebalancing its tasks onto our group.
4061 */
4062static struct sched_group *
4063find_busiest_group(struct sched_domain *sd, int this_cpu,
4064 unsigned long *imbalance, enum cpu_idle_type idle,
4065 int *sd_idle, const struct cpumask *cpus, int *balance)
4066{
4067 struct sd_lb_stats sds;
4068
4069 memset(&sds, 0, sizeof(sds));
4070
4071 /*
4072 * Compute the various statistics relavent for load balancing at
4073 * this level.
4074 */
4075 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
4076 balance, &sds);
4077
4078 /* Cases where imbalance does not exist from POV of this_cpu */
4079 /* 1) this_cpu is not the appropriate cpu to perform load balancing
4080 * at this level.
4081 * 2) There is no busy sibling group to pull from.
4082 * 3) This group is the busiest group.
4083 * 4) This group is more busy than the avg busieness at this
4084 * sched_domain.
4085 * 5) The imbalance is within the specified limit.
4086 * 6) Any rebalance would lead to ping-pong
4087 */
4088 if (balance && !(*balance))
4089 goto ret;
4090
4091 if (!sds.busiest || sds.busiest_nr_running == 0)
4092 goto out_balanced;
4093
4094 if (sds.this_load >= sds.max_load)
4095 goto out_balanced;
4096
4097 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
4098
4099 if (sds.this_load >= sds.avg_load)
4100 goto out_balanced;
4101
4102 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
4103 goto out_balanced;
4104
4105 sds.busiest_load_per_task /= sds.busiest_nr_running;
4106 if (sds.group_imb)
4107 sds.busiest_load_per_task =
4108 min(sds.busiest_load_per_task, sds.avg_load);
4109
4110 /*
4111 * We're trying to get all the cpus to the average_load, so we don't
4112 * want to push ourselves above the average load, nor do we wish to
4113 * reduce the max loaded cpu below the average load, as either of these
4114 * actions would just result in more rebalancing later, and ping-pong
4115 * tasks around. Thus we look for the minimum possible imbalance.
4116 * Negative imbalances (*we* are more loaded than anyone else) will
4117 * be counted as no imbalance for these purposes -- we can't fix that
4118 * by pulling tasks to us. Be careful of negative numbers as they'll
4119 * appear as very large values with unsigned longs.
4120 */
4121 if (sds.max_load <= sds.busiest_load_per_task)
4122 goto out_balanced;
4123
4124 /* Looks like there is an imbalance. Compute it */
4125 calculate_imbalance(&sds, this_cpu, imbalance);
4126 return sds.busiest;
4127
4128out_balanced:
4129 /*
4130 * There is no obvious imbalance. But check if we can do some balancing
4131 * to save power.
4132 */
4133 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4134 return sds.busiest;
4135ret:
4136 *imbalance = 0;
4137 return NULL;
4138}
4139
4140/*
4141 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4142 */
4143static struct rq *
4144find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
4145 unsigned long imbalance, const struct cpumask *cpus)
4146{
4147 struct rq *busiest = NULL, *rq;
4148 unsigned long max_load = 0;
4149 int i;
4150
4151 for_each_cpu(i, sched_group_cpus(group)) {
4152 unsigned long power = power_of(i);
4153 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
4154 unsigned long wl;
4155
4156 if (!cpumask_test_cpu(i, cpus))
4157 continue;
4158
4159 rq = cpu_rq(i);
4160 wl = weighted_cpuload(i);
4161
4162 /*
4163 * When comparing with imbalance, use weighted_cpuload()
4164 * which is not scaled with the cpu power.
4165 */
4166 if (capacity && rq->nr_running == 1 && wl > imbalance)
4167 continue;
4168
4169 /*
4170 * For the load comparisons with the other cpu's, consider
4171 * the weighted_cpuload() scaled with the cpu power, so that
4172 * the load can be moved away from the cpu that is potentially
4173 * running at a lower capacity.
4174 */
4175 wl = (wl * SCHED_LOAD_SCALE) / power;
4176
4177 if (wl > max_load) {
4178 max_load = wl;
4179 busiest = rq;
4180 }
4181 }
4182
4183 return busiest;
4184}
4185
4186/*
4187 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
4188 * so long as it is large enough.
4189 */
4190#define MAX_PINNED_INTERVAL 512
4191
4192/* Working cpumask for load_balance and load_balance_newidle. */
4193static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4194
4195/*
4196 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4197 * tasks if there is an imbalance.
4198 */
4199static int load_balance(int this_cpu, struct rq *this_rq,
4200 struct sched_domain *sd, enum cpu_idle_type idle,
4201 int *balance)
4202{
4203 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
4204 struct sched_group *group;
4205 unsigned long imbalance;
4206 struct rq *busiest;
4207 unsigned long flags;
4208 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4209
4210 cpumask_copy(cpus, cpu_active_mask);
4211
4212 /*
4213 * When power savings policy is enabled for the parent domain, idle
4214 * sibling can pick up load irrespective of busy siblings. In this case,
4215 * let the state of idle sibling percolate up as CPU_IDLE, instead of
4216 * portraying it as CPU_NOT_IDLE.
4217 */
4218 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
4219 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4220 sd_idle = 1;
4221
4222 schedstat_inc(sd, lb_count[idle]);
4223
4224redo:
4225 update_shares(sd);
4226 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
4227 cpus, balance);
4228
4229 if (*balance == 0)
4230 goto out_balanced;
4231
4232 if (!group) {
4233 schedstat_inc(sd, lb_nobusyg[idle]);
4234 goto out_balanced;
4235 }
4236
4237 busiest = find_busiest_queue(group, idle, imbalance, cpus);
4238 if (!busiest) {
4239 schedstat_inc(sd, lb_nobusyq[idle]);
4240 goto out_balanced;
4241 }
4242
4243 BUG_ON(busiest == this_rq);
4244
4245 schedstat_add(sd, lb_imbalance[idle], imbalance);
4246
4247 ld_moved = 0;
4248 if (busiest->nr_running > 1) {
4249 /*
4250 * Attempt to move tasks. If find_busiest_group has found
4251 * an imbalance but busiest->nr_running <= 1, the group is
4252 * still unbalanced. ld_moved simply stays zero, so it is
4253 * correctly treated as an imbalance.
4254 */
4255 local_irq_save(flags);
4256 double_rq_lock(this_rq, busiest);
4257 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4258 imbalance, sd, idle, &all_pinned);
4259 double_rq_unlock(this_rq, busiest);
4260 local_irq_restore(flags);
4261
4262 /*
4263 * some other cpu did the load balance for us.
4264 */
4265 if (ld_moved && this_cpu != smp_processor_id())
4266 resched_cpu(this_cpu);
4267
4268 /* All tasks on this runqueue were pinned by CPU affinity */
4269 if (unlikely(all_pinned)) {
4270 cpumask_clear_cpu(cpu_of(busiest), cpus);
4271 if (!cpumask_empty(cpus))
4272 goto redo;
4273 goto out_balanced;
4274 }
4275 }
4276
4277 if (!ld_moved) {
4278 schedstat_inc(sd, lb_failed[idle]);
4279 sd->nr_balance_failed++;
4280
4281 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4282
4283 raw_spin_lock_irqsave(&busiest->lock, flags);
4284
4285			/* don't kick the migration_thread if the curr
4286			 * task on the busiest cpu can't be moved to this_cpu
4287 */
4288 if (!cpumask_test_cpu(this_cpu,
4289 &busiest->curr->cpus_allowed)) {
4290 raw_spin_unlock_irqrestore(&busiest->lock,
4291 flags);
4292 all_pinned = 1;
4293 goto out_one_pinned;
4294 }
4295
4296 if (!busiest->active_balance) {
4297 busiest->active_balance = 1;
4298 busiest->push_cpu = this_cpu;
4299 active_balance = 1;
4300 }
4301 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4302 if (active_balance)
4303 wake_up_process(busiest->migration_thread);
4304
4305 /*
4306 * We've kicked active balancing, reset the failure
4307 * counter.
4308 */
4309 sd->nr_balance_failed = sd->cache_nice_tries+1;
4310 }
4311 } else
4312 sd->nr_balance_failed = 0;
4313
4314 if (likely(!active_balance)) {
4315 /* We were unbalanced, so reset the balancing interval */
4316 sd->balance_interval = sd->min_interval;
4317 } else {
4318 /*
4319 * If we've begun active balancing, start to back off. This
4320 * case may not be covered by the all_pinned logic if there
4321 * is only 1 task on the busy runqueue (because we don't call
4322 * move_tasks).
4323 */
4324 if (sd->balance_interval < sd->max_interval)
4325 sd->balance_interval *= 2;
4326 }
4327
4328 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4329 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4330 ld_moved = -1;
4331
4332 goto out;
4333
4334out_balanced:
4335 schedstat_inc(sd, lb_balanced[idle]);
4336
4337 sd->nr_balance_failed = 0;
4338
4339out_one_pinned:
4340 /* tune up the balancing interval */
4341 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4342 (sd->balance_interval < sd->max_interval))
4343 sd->balance_interval *= 2;
4344
4345 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4346 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4347 ld_moved = -1;
4348 else
4349 ld_moved = 0;
4350out:
4351 if (ld_moved)
4352 update_shares(sd);
4353 return ld_moved;
4354}
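
The tail of load_balance() doubles sd->balance_interval whenever nothing could be moved, using the same condition as the out_one_pinned path above. A small sketch of just that backoff arithmetic, with assumed interval values:

#include <stdio.h>

#define MAX_PINNED_INTERVAL 512

/* mirrors the out_one_pinned condition above */
static unsigned long back_off(unsigned long interval, unsigned long max_interval,
			      int all_pinned)
{
	if ((all_pinned && interval < MAX_PINNED_INTERVAL) ||
	    (interval < max_interval))
		interval *= 2;
	return interval;
}

int main(void)
{
	unsigned long interval = 8;	/* sd->balance_interval in ms, assumed */
	int i;

	for (i = 0; i < 8; i++) {
		interval = back_off(interval, 128, 0);
		printf("failed attempt %d: interval now %lu ms\n", i, interval);
	}
	return 0;
}
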
4355
4356/*
4357 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4358 * tasks if there is an imbalance.
4359 *
4360 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4361 * this_rq is locked.
4362 */
4363static int
4364load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4365{
4366 struct sched_group *group;
4367 struct rq *busiest = NULL;
4368 unsigned long imbalance;
4369 int ld_moved = 0;
4370 int sd_idle = 0;
4371 int all_pinned = 0;
4372 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4373
4374 cpumask_copy(cpus, cpu_active_mask);
4375
4376 /*
4377 * When power savings policy is enabled for the parent domain, idle
4378 * sibling can pick up load irrespective of busy siblings. In this case,
4379 * let the state of idle sibling percolate up as IDLE, instead of
4380 * portraying it as CPU_NOT_IDLE.
4381 */
4382 if (sd->flags & SD_SHARE_CPUPOWER &&
4383 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4384 sd_idle = 1;
4385
4386 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4387redo:
4388 update_shares_locked(this_rq, sd);
4389 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4390 &sd_idle, cpus, NULL);
4391 if (!group) {
4392 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4393 goto out_balanced;
4394 }
4395
4396 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4397 if (!busiest) {
4398 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4399 goto out_balanced;
4400 }
4401
4402 BUG_ON(busiest == this_rq);
4403
4404 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4405
4406 ld_moved = 0;
4407 if (busiest->nr_running > 1) {
4408 /* Attempt to move tasks */
4409 double_lock_balance(this_rq, busiest);
4410 /* this_rq->clock is already updated */
4411 update_rq_clock(busiest);
4412 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4413 imbalance, sd, CPU_NEWLY_IDLE,
4414 &all_pinned);
4415 double_unlock_balance(this_rq, busiest);
4416
4417 if (unlikely(all_pinned)) {
4418 cpumask_clear_cpu(cpu_of(busiest), cpus);
4419 if (!cpumask_empty(cpus))
4420 goto redo;
4421 }
4422 }
4423
4424 if (!ld_moved) {
4425 int active_balance = 0;
4426
4427 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4428 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4429 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4430 return -1;
4431
4432 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4433 return -1;
4434
4435 if (sd->nr_balance_failed++ < 2)
4436 return -1;
4437
4438		 * The only task running on a non-idle cpu can be moved to this
4439		 * cpu in an attempt to completely free up the other CPU
4440		 * package. The same method used to move a task in load_balance()
4441		 * has been extended for load_balance_newidle() to speed up
4442		 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2).
4443 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
4444 *
4445 * The package power saving logic comes from
4446		 * find_busiest_group(). If there is no imbalance, then
4447		 * f_b_g() will return NULL. However, when sched_mc={1,2},
4448		 * f_b_g() will select a group from which a running task may be
4449		 * pulled to this cpu in order to make the other package idle.
4450		 * If there is no opportunity to make a package idle and if
4451		 * there is no imbalance, then f_b_g() will return NULL and no
4452 * action will be taken in load_balance_newidle().
4453 *
4454 * Under normal task pull operation due to imbalance, there
4455 * will be more than one task in the source run queue and
4456 * move_tasks() will succeed. ld_moved will be true and this
4457 * active balance code will not be triggered.
4458 */
4459
4460 /* Lock busiest in correct order while this_rq is held */
4461 double_lock_balance(this_rq, busiest);
4462
4463 /*
4464		 * don't kick the migration_thread if the curr
4465		 * task on the busiest cpu can't be moved to this_cpu
4466 */
4467 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4468 double_unlock_balance(this_rq, busiest);
4469 all_pinned = 1;
4470 return ld_moved;
4471 }
4472
4473 if (!busiest->active_balance) {
4474 busiest->active_balance = 1;
4475 busiest->push_cpu = this_cpu;
4476 active_balance = 1;
4477 }
4478
4479 double_unlock_balance(this_rq, busiest);
4480 /*
4481 * Should not call ttwu while holding a rq->lock
4482 */
4483 raw_spin_unlock(&this_rq->lock);
4484 if (active_balance)
4485 wake_up_process(busiest->migration_thread);
4486 raw_spin_lock(&this_rq->lock);
4487
4488 } else
4489 sd->nr_balance_failed = 0;
4490
4491 update_shares_locked(this_rq, sd);
4492 return ld_moved;
4493
4494out_balanced:
4495 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4496 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4497 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4498 return -1;
4499 sd->nr_balance_failed = 0;
4500
4501 return 0;
4502}
4503
4504/*
4505 * idle_balance is called by schedule() if this_cpu is about to become
4506 * idle. Attempts to pull tasks from other CPUs.
4507 */
4508static void idle_balance(int this_cpu, struct rq *this_rq)
4509{
4510 struct sched_domain *sd;
4511 int pulled_task = 0;
4512 unsigned long next_balance = jiffies + HZ;
4513
4514 this_rq->idle_stamp = this_rq->clock;
4515
4516 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4517 return;
4518
4519 for_each_domain(this_cpu, sd) {
4520 unsigned long interval;
4521
4522 if (!(sd->flags & SD_LOAD_BALANCE))
4523 continue;
4524
4525 if (sd->flags & SD_BALANCE_NEWIDLE)
4526			/* If we've pulled tasks over, stop searching: */
4527 pulled_task = load_balance_newidle(this_cpu, this_rq,
4528 sd);
4529
4530 interval = msecs_to_jiffies(sd->balance_interval);
4531 if (time_after(next_balance, sd->last_balance + interval))
4532 next_balance = sd->last_balance + interval;
4533 if (pulled_task) {
4534 this_rq->idle_stamp = 0;
4535 break;
4536 }
4537 }
4538 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4539 /*
4540 * We are going idle. next_balance may be set based on
4541 * a busy processor. So reset next_balance.
4542 */
4543 this_rq->next_balance = next_balance;
4544 }
4545}
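
The early return above is the point of rq->avg_idle: if the cpu usually stays idle for less time than a task migration costs, newly-idle balancing is skipped entirely. A minimal illustration, with invented idle averages and the 0.5 ms default value of sysctl_sched_migration_cost:

#include <stdio.h>

int main(void)
{
	unsigned long long avg_idle_ns[] = { 100000ULL, 2000000ULL };	/* assumed */
	unsigned long long sched_migration_cost = 500000ULL;		/* 0.5 ms default */
	int i;

	for (i = 0; i < 2; i++) {
		if (avg_idle_ns[i] < sched_migration_cost)
			printf("avg_idle %lluns: stay idle, pull nothing\n",
			       avg_idle_ns[i]);
		else
			printf("avg_idle %lluns: try load_balance_newidle()\n",
			       avg_idle_ns[i]);
	}
	return 0;
}
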
4546
4547/*
4548 * active_load_balance is run by migration threads. It pushes running tasks
4549 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4550 * running on each physical CPU where possible, and avoids physical /
4551 * logical imbalances.
4552 *
4553 * Called with busiest_rq locked.
4554 */
4555static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4556{
4557 int target_cpu = busiest_rq->push_cpu;
4558 struct sched_domain *sd;
4559 struct rq *target_rq;
4560
4561 /* Is there any task to move? */
4562 if (busiest_rq->nr_running <= 1)
4563 return;
4564
4565 target_rq = cpu_rq(target_cpu);
4566
4567 /*
4568 * This condition is "impossible", if it occurs
4569 * we need to fix it. Originally reported by
4570 * Bjorn Helgaas on a 128-cpu setup.
4571 */
4572 BUG_ON(busiest_rq == target_rq);
4573
4574 /* move a task from busiest_rq to target_rq */
4575 double_lock_balance(busiest_rq, target_rq);
4576 update_rq_clock(busiest_rq);
4577 update_rq_clock(target_rq);
4578
4579 /* Search for an sd spanning us and the target CPU. */
4580 for_each_domain(target_cpu, sd) {
4581 if ((sd->flags & SD_LOAD_BALANCE) &&
4582 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4583 break;
4584 }
4585
4586 if (likely(sd)) {
4587 schedstat_inc(sd, alb_count);
4588
4589 if (move_one_task(target_rq, target_cpu, busiest_rq,
4590 sd, CPU_IDLE))
4591 schedstat_inc(sd, alb_pushed);
4592 else
4593 schedstat_inc(sd, alb_failed);
4594 }
4595 double_unlock_balance(busiest_rq, target_rq);
4596}
4597
4598#ifdef CONFIG_NO_HZ
4599static struct {
4600 atomic_t load_balancer;
4601 cpumask_var_t cpu_mask;
4602 cpumask_var_t ilb_grp_nohz_mask;
4603} nohz ____cacheline_aligned = {
4604 .load_balancer = ATOMIC_INIT(-1),
4605};
4606
4607int get_nohz_load_balancer(void)
4608{
4609 return atomic_read(&nohz.load_balancer);
4610}
4611
4612#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4613/**
4614 * lowest_flag_domain - Return lowest sched_domain containing flag.
4615 * @cpu: The cpu whose lowest level of sched domain is to
4616 * be returned.
4617 * @flag: The flag to check for the lowest sched_domain
4618 * for the given cpu.
4619 *
4620 * Returns the lowest sched_domain of a cpu which contains the given flag.
4621 */
4622static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4623{
4624 struct sched_domain *sd;
4625
4626 for_each_domain(cpu, sd)
4627 if (sd && (sd->flags & flag))
4628 break;
4629
4630 return sd;
4631}
4632
4633/**
4634 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4635 * @cpu: The cpu whose domains we're iterating over.
4636 * @sd: variable holding the value of the power_savings_sd
4637 * for cpu.
4638 * @flag: The flag to filter the sched_domains to be iterated.
4639 *
4640 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
4641 * set, starting from the lowest sched_domain to the highest.
4642 */
4643#define for_each_flag_domain(cpu, sd, flag) \
4644 for (sd = lowest_flag_domain(cpu, flag); \
4645 (sd && (sd->flags & flag)); sd = sd->parent)
4646
4647/**
4648 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4649 * @ilb_group: group to be checked for semi-idleness
4650 *
4651 * Returns: 1 if the group is semi-idle. 0 otherwise.
4652 *
4653 * We define a sched_group to be semi-idle if it has at least one idle CPU
4654 * and at least one non-idle CPU. This helper function checks if the given
4655 * sched_group is semi-idle or not.
4656 */
4657static inline int is_semi_idle_group(struct sched_group *ilb_group)
4658{
4659 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4660 sched_group_cpus(ilb_group));
4661
4662 /*
4663	 * A sched_group is semi-idle when it has at least one busy cpu
4664	 * and at least one idle cpu.
4665 */
4666 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4667 return 0;
4668
4669 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4670 return 0;
4671
4672 return 1;
4673}
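
The semi-idle test is just two cpumask comparisons: the intersection of the group with nohz.cpu_mask must be neither empty nor the whole group. The same idea with plain 8-bit masks standing in for cpumasks (a bit set means the cpu is nohz-idle); purely illustrative, not kernel code.

#include <stdio.h>

static int is_semi_idle(unsigned char nohz_mask, unsigned char group_mask)
{
	unsigned char idle_in_group = nohz_mask & group_mask;

	if (idle_in_group == 0)			/* no idle cpu in the group */
		return 0;
	if (idle_in_group == group_mask)	/* every cpu in the group is idle */
		return 0;
	return 1;				/* at least one idle and one busy cpu */
}

int main(void)
{
	printf("%d\n", is_semi_idle(0x0f, 0x03));	/* whole group idle -> 0 */
	printf("%d\n", is_semi_idle(0x00, 0x03));	/* no cpu idle -> 0 */
	printf("%d\n", is_semi_idle(0x01, 0x03));	/* mixed -> 1 */
	return 0;
}
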
4674/**
4675 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4676 * @cpu: The cpu which is nominating a new idle_load_balancer.
4677 *
4678 * Returns: the id of the idle load balancer if one exists;
4679 * else, returns >= nr_cpu_ids.
4680 *
4681 * This algorithm picks the idle load balancer such that it belongs to a
4682 * semi-idle powersavings sched_domain. The idea is to try and avoid
4683 * completely idle packages/cores just for the purpose of idle load balancing
4684 * when there are other idle cpus which are better suited for that job.
4685 */
4686static int find_new_ilb(int cpu)
4687{
4688 struct sched_domain *sd;
4689 struct sched_group *ilb_group;
4690
4691 /*
4692 * Have idle load balancer selection from semi-idle packages only
4693 * when power-aware load balancing is enabled
4694 */
4695 if (!(sched_smt_power_savings || sched_mc_power_savings))
4696 goto out_done;
4697
4698 /*
4699 * Optimize for the case when we have no idle CPUs or only one
4700 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4701 */
4702 if (cpumask_weight(nohz.cpu_mask) < 2)
4703 goto out_done;
4704
4705 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4706 ilb_group = sd->groups;
4707
4708 do {
4709 if (is_semi_idle_group(ilb_group))
4710 return cpumask_first(nohz.ilb_grp_nohz_mask);
4711
4712 ilb_group = ilb_group->next;
4713
4714 } while (ilb_group != sd->groups);
4715 }
4716
4717out_done:
4718 return cpumask_first(nohz.cpu_mask);
4719}
4720#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4721static inline int find_new_ilb(int call_cpu)
4722{
4723 return cpumask_first(nohz.cpu_mask);
4724}
4725#endif
4726
4727/*
4728 * This routine tries to nominate the ilb (idle load balancing)
4729 * owner among the cpus whose ticks are stopped. The ilb owner does the idle
4730 * load balancing on behalf of all those cpus. If all the cpus in the system
4731 * go into this tickless mode, then there will be no ilb owner (as there is
4732 * no need for one) and all the cpus will sleep until the next wakeup event
4733 * arrives.
4734 *
4735 * The ilb owner's tick is not stopped, and that tick is used
4736 * for idle load balancing. The ilb owner remains part of
4737 * nohz.cpu_mask.
4738 *
4739 * While stopping the tick, this cpu becomes the ilb owner if there
4740 * is no other owner, and it remains the owner until it becomes busy
4741 * or until all cpus in the system stop their ticks, at which point
4742 * there is no need for an ilb owner.
4743 *
4744 * When the ilb owner becomes busy, it nominates another owner during the
4745 * next busy scheduler_tick().
4746 */
4747int select_nohz_load_balancer(int stop_tick)
4748{
4749 int cpu = smp_processor_id();
4750
4751 if (stop_tick) {
4752 cpu_rq(cpu)->in_nohz_recently = 1;
4753
4754 if (!cpu_active(cpu)) {
4755 if (atomic_read(&nohz.load_balancer) != cpu)
4756 return 0;
4757
4758 /*
4759 * If we are going offline and still the leader,
4760 * give up!
4761 */
4762 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4763 BUG();
4764
4765 return 0;
4766 }
4767
4768 cpumask_set_cpu(cpu, nohz.cpu_mask);
4769
4770		/* time for the ilb owner to sleep as well */
4771 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4772 if (atomic_read(&nohz.load_balancer) == cpu)
4773 atomic_set(&nohz.load_balancer, -1);
4774 return 0;
4775 }
4776
4777 if (atomic_read(&nohz.load_balancer) == -1) {
4778 /* make me the ilb owner */
4779 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4780 return 1;
4781 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4782 int new_ilb;
4783
4784 if (!(sched_smt_power_savings ||
4785 sched_mc_power_savings))
4786 return 1;
4787 /*
4788 * Check to see if there is a more power-efficient
4789 * ilb.
4790 */
4791 new_ilb = find_new_ilb(cpu);
4792 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4793 atomic_set(&nohz.load_balancer, -1);
4794 resched_cpu(new_ilb);
4795 return 0;
4796 }
4797 return 1;
4798 }
4799 } else {
4800 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4801 return 0;
4802
4803 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4804
4805 if (atomic_read(&nohz.load_balancer) == cpu)
4806 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4807 BUG();
4808 }
4809 return 0;
4810}
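
Ownership of idle load balancing is handed around with compare-and-swap on nohz.load_balancer, where -1 means "no owner". A user-space sketch of the claim step using GCC/Clang atomic builtins; try_claim_ilb() is a made-up helper, not a kernel interface.

#include <stdio.h>

static int load_balancer = -1;		/* -1: nobody owns idle load balancing */

static int try_claim_ilb(int cpu)
{
	int expected = -1;

	/* succeeds only if there is no owner yet, like atomic_cmpxchg() above */
	return __atomic_compare_exchange_n(&load_balancer, &expected, cpu,
					   0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}

int main(void)
{
	printf("cpu2 claims: %d\n", try_claim_ilb(2));	/* 1: became the owner */
	printf("cpu3 claims: %d\n", try_claim_ilb(3));	/* 0: cpu2 already owns it */
	printf("owner: %d\n", load_balancer);
	return 0;
}
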
4811#endif
4812
4813static DEFINE_SPINLOCK(balancing);
4814
4815/*
4816 * It checks each scheduling domain to see if it is due to be balanced,
4817 * and initiates a balancing operation if so.
4818 *
4819 * Balancing parameters are set up in arch_init_sched_domains.
4820 */
4821static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4822{
4823 int balance = 1;
4824 struct rq *rq = cpu_rq(cpu);
4825 unsigned long interval;
4826 struct sched_domain *sd;
4827 /* Earliest time when we have to do rebalance again */
4828 unsigned long next_balance = jiffies + 60*HZ;
4829 int update_next_balance = 0;
4830 int need_serialize;
4831
4832 for_each_domain(cpu, sd) {
4833 if (!(sd->flags & SD_LOAD_BALANCE))
4834 continue;
4835
4836 interval = sd->balance_interval;
4837 if (idle != CPU_IDLE)
4838 interval *= sd->busy_factor;
4839
4840 /* scale ms to jiffies */
4841 interval = msecs_to_jiffies(interval);
4842 if (unlikely(!interval))
4843 interval = 1;
4844 if (interval > HZ*NR_CPUS/10)
4845 interval = HZ*NR_CPUS/10;
4846
4847 need_serialize = sd->flags & SD_SERIALIZE;
4848
4849 if (need_serialize) {
4850 if (!spin_trylock(&balancing))
4851 goto out;
4852 }
4853
4854 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4855 if (load_balance(cpu, rq, sd, idle, &balance)) {
4856 /*
4857 * We've pulled tasks over so either we're no
4858 * longer idle, or one of our SMT siblings is
4859 * not idle.
4860 */
4861 idle = CPU_NOT_IDLE;
4862 }
4863 sd->last_balance = jiffies;
4864 }
4865 if (need_serialize)
4866 spin_unlock(&balancing);
4867out:
4868 if (time_after(next_balance, sd->last_balance + interval)) {
4869 next_balance = sd->last_balance + interval;
4870 update_next_balance = 1;
4871 }
4872
4873 /*
4874 * Stop the load balance at this level. There is another
4875 * CPU in our sched group which is doing load balancing more
4876 * actively.
4877 */
4878 if (!balance)
4879 break;
4880 }
4881
4882 /*
4883 * next_balance will be updated only when there is a need.
4884 * When the cpu is attached to a null domain, for example, it will not be
4885 * updated.
4886 */
4887 if (likely(update_next_balance))
4888 rq->next_balance = next_balance;
4889}
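
rebalance_domains() stretches each domain's base interval by sd->busy_factor on a busy cpu, converts it to jiffies and clamps it. A numeric model of that computation, with assumed HZ, NR_CPUS and domain parameters:

#include <stdio.h>

#define HZ	1000
#define NR_CPUS	8

static unsigned long msecs_to_jiffies(unsigned long ms)
{
	return ms * HZ / 1000;	/* exact only because HZ is 1000 here */
}

int main(void)
{
	unsigned long interval = 64;	/* sd->balance_interval in ms, assumed */
	unsigned int busy_factor = 32;	/* typical sd->busy_factor */
	int cpu_is_idle = 0;

	if (!cpu_is_idle)
		interval *= busy_factor;

	interval = msecs_to_jiffies(interval);
	if (!interval)
		interval = 1;
	if (interval > HZ * NR_CPUS / 10)	/* same clamp as above */
		interval = HZ * NR_CPUS / 10;

	printf("rebalance this domain every %lu ticks\n", interval);
	return 0;
}
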
4890
4891/*
4892 * run_rebalance_domains is triggered when needed from the scheduler tick.
4893 * In the CONFIG_NO_HZ case, the idle load balance owner will do the
4894 * rebalancing for all the cpus whose scheduler ticks are stopped.
4895 */
4896static void run_rebalance_domains(struct softirq_action *h)
4897{
4898 int this_cpu = smp_processor_id();
4899 struct rq *this_rq = cpu_rq(this_cpu);
4900 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4901 CPU_IDLE : CPU_NOT_IDLE;
4902
4903 rebalance_domains(this_cpu, idle);
4904
4905#ifdef CONFIG_NO_HZ
4906 /*
4907 * If this cpu is the owner for idle load balancing, then do the
4908 * balancing on behalf of the other idle cpus whose ticks are
4909 * stopped.
4910 */
4911 if (this_rq->idle_at_tick &&
4912 atomic_read(&nohz.load_balancer) == this_cpu) {
4913 struct rq *rq;
4914 int balance_cpu;
4915
4916 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4917 if (balance_cpu == this_cpu)
4918 continue;
4919
4920 /*
4921 * If this cpu gets work to do, stop the load balancing
4922 * work being done for other cpus. Next load
4923 * balancing owner will pick it up.
4924 */
4925 if (need_resched())
4926 break;
4927
4928 rebalance_domains(balance_cpu, CPU_IDLE);
4929
4930 rq = cpu_rq(balance_cpu);
4931 if (time_after(this_rq->next_balance, rq->next_balance))
4932 this_rq->next_balance = rq->next_balance;
4933 }
4934 }
4935#endif
4936}
4937
4938static inline int on_null_domain(int cpu)
4939{
4940 return !rcu_dereference(cpu_rq(cpu)->sd);
4941}
4942
4943/*
4944 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4945 *
4946 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
4947 * idle load balancing owner or decide to stop the periodic load balancing,
4948 * if the whole system is idle.
4949 */
4950static inline void trigger_load_balance(struct rq *rq, int cpu)
4951{
4952#ifdef CONFIG_NO_HZ
4953 /*
4954 * If we were in the nohz mode recently and busy at the current
4955 * scheduler tick, then check if we need to nominate new idle
4956 * load balancer.
4957 */
4958 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4959 rq->in_nohz_recently = 0;
4960
4961 if (atomic_read(&nohz.load_balancer) == cpu) {
4962 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4963 atomic_set(&nohz.load_balancer, -1);
4964 }
4965
4966 if (atomic_read(&nohz.load_balancer) == -1) {
4967 int ilb = find_new_ilb(cpu);
4968
4969 if (ilb < nr_cpu_ids)
4970 resched_cpu(ilb);
4971 }
4972 }
4973
4974 /*
4975 * If this cpu is idle and doing idle load balancing for all the
4976 * cpus with ticks stopped, is it time for that to stop?
4977 */
4978 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4979 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4980 resched_cpu(cpu);
4981 return;
4982 }
4983
4984 /*
4985 * If this cpu is idle and the idle load balancing is done by
4986	 * someone else, then there is no need to raise the SCHED_SOFTIRQ
4987 */
4988 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4989 cpumask_test_cpu(cpu, nohz.cpu_mask))
4990 return;
4991#endif
4992 /* Don't need to rebalance while attached to NULL domain */
4993 if (time_after_eq(jiffies, rq->next_balance) &&
4994 likely(!on_null_domain(cpu)))
4995 raise_softirq(SCHED_SOFTIRQ);
4996}
4997
4998#else /* CONFIG_SMP */
4999
5000/*
5001 * on UP we do not need to balance between CPUs:
5002 */
5003static inline void idle_balance(int cpu, struct rq *rq)
5004{
5005}
5006
5007#endif 3153#endif
5008 3154
5009DEFINE_PER_CPU(struct kernel_stat, kstat); 3155DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -6128,7 +4274,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6128 if (running) 4274 if (running)
6129 p->sched_class->set_curr_task(rq); 4275 p->sched_class->set_curr_task(rq);
6130 if (on_rq) { 4276 if (on_rq) {
6131 enqueue_task(rq, p, 0); 4277 enqueue_task(rq, p, 0, oldprio < prio);
6132 4278
6133 check_class_changed(rq, p, prev_class, oldprio, running); 4279 check_class_changed(rq, p, prev_class, oldprio, running);
6134 } 4280 }
@@ -6172,7 +4318,7 @@ void set_user_nice(struct task_struct *p, long nice)
6172 delta = p->prio - old_prio; 4318 delta = p->prio - old_prio;
6173 4319
6174 if (on_rq) { 4320 if (on_rq) {
6175 enqueue_task(rq, p, 0); 4321 enqueue_task(rq, p, 0, false);
6176 /* 4322 /*
6177 * If the task increased its priority or is running and 4323 * If the task increased its priority or is running and
6178 * lowered its priority, then reschedule its CPU: 4324 * lowered its priority, then reschedule its CPU:
@@ -9482,7 +7628,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
9482 tg->rt_rq[cpu] = rt_rq; 7628 tg->rt_rq[cpu] = rt_rq;
9483 init_rt_rq(rt_rq, rq); 7629 init_rt_rq(rt_rq, rq);
9484 rt_rq->tg = tg; 7630 rt_rq->tg = tg;
9485 rt_rq->rt_se = rt_se;
9486 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7631 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
9487 if (add) 7632 if (add)
9488 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7633 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
@@ -9513,9 +7658,6 @@ void __init sched_init(void)
9513#ifdef CONFIG_RT_GROUP_SCHED 7658#ifdef CONFIG_RT_GROUP_SCHED
9514 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7659 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
9515#endif 7660#endif
9516#ifdef CONFIG_USER_SCHED
9517 alloc_size *= 2;
9518#endif
9519#ifdef CONFIG_CPUMASK_OFFSTACK 7661#ifdef CONFIG_CPUMASK_OFFSTACK
9520 alloc_size += num_possible_cpus() * cpumask_size(); 7662 alloc_size += num_possible_cpus() * cpumask_size();
9521#endif 7663#endif
@@ -9529,13 +7671,6 @@ void __init sched_init(void)
9529 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7671 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
9530 ptr += nr_cpu_ids * sizeof(void **); 7672 ptr += nr_cpu_ids * sizeof(void **);
9531 7673
9532#ifdef CONFIG_USER_SCHED
9533 root_task_group.se = (struct sched_entity **)ptr;
9534 ptr += nr_cpu_ids * sizeof(void **);
9535
9536 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
9537 ptr += nr_cpu_ids * sizeof(void **);
9538#endif /* CONFIG_USER_SCHED */
9539#endif /* CONFIG_FAIR_GROUP_SCHED */ 7674#endif /* CONFIG_FAIR_GROUP_SCHED */
9540#ifdef CONFIG_RT_GROUP_SCHED 7675#ifdef CONFIG_RT_GROUP_SCHED
9541 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7676 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -9544,13 +7679,6 @@ void __init sched_init(void)
9544 init_task_group.rt_rq = (struct rt_rq **)ptr; 7679 init_task_group.rt_rq = (struct rt_rq **)ptr;
9545 ptr += nr_cpu_ids * sizeof(void **); 7680 ptr += nr_cpu_ids * sizeof(void **);
9546 7681
9547#ifdef CONFIG_USER_SCHED
9548 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9549 ptr += nr_cpu_ids * sizeof(void **);
9550
9551 root_task_group.rt_rq = (struct rt_rq **)ptr;
9552 ptr += nr_cpu_ids * sizeof(void **);
9553#endif /* CONFIG_USER_SCHED */
9554#endif /* CONFIG_RT_GROUP_SCHED */ 7682#endif /* CONFIG_RT_GROUP_SCHED */
9555#ifdef CONFIG_CPUMASK_OFFSTACK 7683#ifdef CONFIG_CPUMASK_OFFSTACK
9556 for_each_possible_cpu(i) { 7684 for_each_possible_cpu(i) {
@@ -9570,22 +7698,13 @@ void __init sched_init(void)
9570#ifdef CONFIG_RT_GROUP_SCHED 7698#ifdef CONFIG_RT_GROUP_SCHED
9571 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7699 init_rt_bandwidth(&init_task_group.rt_bandwidth,
9572 global_rt_period(), global_rt_runtime()); 7700 global_rt_period(), global_rt_runtime());
9573#ifdef CONFIG_USER_SCHED
9574 init_rt_bandwidth(&root_task_group.rt_bandwidth,
9575 global_rt_period(), RUNTIME_INF);
9576#endif /* CONFIG_USER_SCHED */
9577#endif /* CONFIG_RT_GROUP_SCHED */ 7701#endif /* CONFIG_RT_GROUP_SCHED */
9578 7702
9579#ifdef CONFIG_GROUP_SCHED 7703#ifdef CONFIG_CGROUP_SCHED
9580 list_add(&init_task_group.list, &task_groups); 7704 list_add(&init_task_group.list, &task_groups);
9581 INIT_LIST_HEAD(&init_task_group.children); 7705 INIT_LIST_HEAD(&init_task_group.children);
9582 7706
9583#ifdef CONFIG_USER_SCHED 7707#endif /* CONFIG_CGROUP_SCHED */
9584 INIT_LIST_HEAD(&root_task_group.children);
9585 init_task_group.parent = &root_task_group;
9586 list_add(&init_task_group.siblings, &root_task_group.children);
9587#endif /* CONFIG_USER_SCHED */
9588#endif /* CONFIG_GROUP_SCHED */
9589 7708
9590#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP 7709#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9591 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), 7710 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
@@ -9625,25 +7744,6 @@ void __init sched_init(void)
9625 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7744 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
9626 */ 7745 */
9627 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7746 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
9628#elif defined CONFIG_USER_SCHED
9629 root_task_group.shares = NICE_0_LOAD;
9630 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
9631 /*
9632 * In case of task-groups formed thr' the user id of tasks,
9633 * init_task_group represents tasks belonging to root user.
9634 * Hence it forms a sibling of all subsequent groups formed.
9635 * In this case, init_task_group gets only a fraction of overall
9636 * system cpu resource, based on the weight assigned to root
9637 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9638 * by letting tasks of init_task_group sit in a separate cfs_rq
9639 * (init_tg_cfs_rq) and having one entity represent this group of
9640 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9641 */
9642 init_tg_cfs_entry(&init_task_group,
9643 &per_cpu(init_tg_cfs_rq, i),
9644 &per_cpu(init_sched_entity, i), i, 1,
9645 root_task_group.se[i]);
9646
9647#endif 7747#endif
9648#endif /* CONFIG_FAIR_GROUP_SCHED */ 7748#endif /* CONFIG_FAIR_GROUP_SCHED */
9649 7749
@@ -9652,12 +7752,6 @@ void __init sched_init(void)
9652 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7752 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
9653#ifdef CONFIG_CGROUP_SCHED 7753#ifdef CONFIG_CGROUP_SCHED
9654 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 7754 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
9655#elif defined CONFIG_USER_SCHED
9656 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9657 init_tg_rt_entry(&init_task_group,
9658 &per_cpu(init_rt_rq_var, i),
9659 &per_cpu(init_sched_rt_entity, i), i, 1,
9660 root_task_group.rt_se[i]);
9661#endif 7755#endif
9662#endif 7756#endif
9663 7757
@@ -9742,7 +7836,7 @@ static inline int preempt_count_equals(int preempt_offset)
9742 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 7836 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9743} 7837}
9744 7838
9745void __might_sleep(char *file, int line, int preempt_offset) 7839void __might_sleep(const char *file, int line, int preempt_offset)
9746{ 7840{
9747#ifdef in_atomic 7841#ifdef in_atomic
9748 static unsigned long prev_jiffy; /* ratelimiting */ 7842 static unsigned long prev_jiffy; /* ratelimiting */
@@ -10053,7 +8147,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
10053} 8147}
10054#endif /* CONFIG_RT_GROUP_SCHED */ 8148#endif /* CONFIG_RT_GROUP_SCHED */
10055 8149
10056#ifdef CONFIG_GROUP_SCHED 8150#ifdef CONFIG_CGROUP_SCHED
10057static void free_sched_group(struct task_group *tg) 8151static void free_sched_group(struct task_group *tg)
10058{ 8152{
10059 free_fair_sched_group(tg); 8153 free_fair_sched_group(tg);
@@ -10158,11 +8252,11 @@ void sched_move_task(struct task_struct *tsk)
10158 if (unlikely(running)) 8252 if (unlikely(running))
10159 tsk->sched_class->set_curr_task(rq); 8253 tsk->sched_class->set_curr_task(rq);
10160 if (on_rq) 8254 if (on_rq)
10161 enqueue_task(rq, tsk, 0); 8255 enqueue_task(rq, tsk, 0, false);
10162 8256
10163 task_rq_unlock(rq, &flags); 8257 task_rq_unlock(rq, &flags);
10164} 8258}
10165#endif /* CONFIG_GROUP_SCHED */ 8259#endif /* CONFIG_CGROUP_SCHED */
10166 8260
10167#ifdef CONFIG_FAIR_GROUP_SCHED 8261#ifdef CONFIG_FAIR_GROUP_SCHED
10168static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8262static void __set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -10304,13 +8398,6 @@ static int tg_schedulable(struct task_group *tg, void *data)
10304 runtime = d->rt_runtime; 8398 runtime = d->rt_runtime;
10305 } 8399 }
10306 8400
10307#ifdef CONFIG_USER_SCHED
10308 if (tg == &root_task_group) {
10309 period = global_rt_period();
10310 runtime = global_rt_runtime();
10311 }
10312#endif
10313
10314 /* 8401 /*
10315 * Cannot have more runtime than the period. 8402 * Cannot have more runtime than the period.
10316 */ 8403 */
@@ -10930,12 +9017,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10930} 9017}
10931 9018
10932/* 9019/*
9020 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9021 * in cputime_t units. As a result, cpuacct_update_stats calls
9022 * percpu_counter_add with values large enough to always overflow the
9023 * per cpu batch limit causing bad SMP scalability.
9024 *
9025 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9026 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9027 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9028 */
9029#ifdef CONFIG_SMP
9030#define CPUACCT_BATCH \
9031 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9032#else
9033#define CPUACCT_BATCH 0
9034#endif
9035
9036/*
10933 * Charge the system/user time to the task's accounting group. 9037 * Charge the system/user time to the task's accounting group.
10934 */ 9038 */
10935static void cpuacct_update_stats(struct task_struct *tsk, 9039static void cpuacct_update_stats(struct task_struct *tsk,
10936 enum cpuacct_stat_index idx, cputime_t val) 9040 enum cpuacct_stat_index idx, cputime_t val)
10937{ 9041{
10938 struct cpuacct *ca; 9042 struct cpuacct *ca;
9043 int batch = CPUACCT_BATCH;
10939 9044
10940 if (unlikely(!cpuacct_subsys.active)) 9045 if (unlikely(!cpuacct_subsys.active))
10941 return; 9046 return;
@@ -10944,7 +9049,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,
10944 ca = task_ca(tsk); 9049 ca = task_ca(tsk);
10945 9050
10946 do { 9051 do {
10947 percpu_counter_add(&ca->cpustat[idx], val); 9052 __percpu_counter_add(&ca->cpustat[idx], val, batch);
10948 ca = ca->parent; 9053 ca = ca->parent;
10949 } while (ca); 9054 } while (ca);
10950 rcu_read_unlock(); 9055 rcu_read_unlock();
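
The CPUACCT_BATCH scaling added above can be checked with ordinary arithmetic: the per-cpu counter batch grows with cputime_one_jiffy and is capped at INT_MAX. A small stand-alone example; the cputime_one_jiffy value is invented to show the cap being hit.

#include <stdio.h>
#include <limits.h>

int main(void)
{
	long percpu_counter_batch = 32;				/* typical default batch */
	unsigned long long cputime_one_jiffy = 1ULL << 27;	/* assumed, VIRT accounting */
	unsigned long long scaled = percpu_counter_batch * cputime_one_jiffy;
	long batch = scaled > INT_MAX ? INT_MAX : (long)scaled;

	/* 32 << 27 overflows INT_MAX, so the batch is clamped */
	printf("batch = %ld\n", batch);
	return 0;
}
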
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 597b33099dfa..eeb3506c4834 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -47,9 +47,7 @@ static int convert_prio(int prio)
47} 47}
48 48
49#define for_each_cpupri_active(array, idx) \ 49#define for_each_cpupri_active(array, idx) \
50 for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ 50 for_each_bit(idx, array, CPUPRI_NR_PRIORITIES)
51 idx < CPUPRI_NR_PRIORITIES; \
52 idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
53 51
54/** 52/**
55 * cpupri_find - find the best (lowest-pri) CPU in the system 53 * cpupri_find - find the best (lowest-pri) CPU in the system
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 8fe7ee81c552..ff7692ccda89 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1053,7 +1053,8 @@ static inline void hrtick_update(struct rq *rq)
1053 * increased. Here we update the fair scheduling stats and 1053 * increased. Here we update the fair scheduling stats and
1054 * then put the task into the rbtree: 1054 * then put the task into the rbtree:
1055 */ 1055 */
1056static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) 1056static void
1057enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1057{ 1058{
1058 struct cfs_rq *cfs_rq; 1059 struct cfs_rq *cfs_rq;
1059 struct sched_entity *se = &p->se; 1060 struct sched_entity *se = &p->se;
@@ -1815,57 +1816,164 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1815 */ 1816 */
1816 1817
1817/* 1818/*
1818 * Load-balancing iterator. Note: while the runqueue stays locked 1819 * pull_task - move a task from a remote runqueue to the local runqueue.
1819 * during the whole iteration, the current task might be 1820 * Both runqueues must be locked.
1820 * dequeued so the iterator has to be dequeue-safe. Here we
1821 * achieve that by always pre-iterating before returning
1822 * the current task:
1823 */ 1821 */
1824static struct task_struct * 1822static void pull_task(struct rq *src_rq, struct task_struct *p,
1825__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) 1823 struct rq *this_rq, int this_cpu)
1826{ 1824{
1827 struct task_struct *p = NULL; 1825 deactivate_task(src_rq, p, 0);
1828 struct sched_entity *se; 1826 set_task_cpu(p, this_cpu);
1827 activate_task(this_rq, p, 0);
1828 check_preempt_curr(this_rq, p, 0);
1829}
1829 1830
1830 if (next == &cfs_rq->tasks) 1831/*
1831 return NULL; 1832 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1833 */
1834static
1835int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1836 struct sched_domain *sd, enum cpu_idle_type idle,
1837 int *all_pinned)
1838{
1839 int tsk_cache_hot = 0;
1840 /*
1841 * We do not migrate tasks that are:
1842 * 1) running (obviously), or
1843 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1844 * 3) are cache-hot on their current CPU.
1845 */
1846 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
1847 schedstat_inc(p, se.nr_failed_migrations_affine);
1848 return 0;
1849 }
1850 *all_pinned = 0;
1832 1851
1833 se = list_entry(next, struct sched_entity, group_node); 1852 if (task_running(rq, p)) {
1834 p = task_of(se); 1853 schedstat_inc(p, se.nr_failed_migrations_running);
1835 cfs_rq->balance_iterator = next->next; 1854 return 0;
1855 }
1836 1856
1837 return p; 1857 /*
1838} 1858 * Aggressive migration if:
1859 * 1) task is cache cold, or
1860 * 2) too many balance attempts have failed.
1861 */
1839 1862
1840static struct task_struct *load_balance_start_fair(void *arg) 1863 tsk_cache_hot = task_hot(p, rq->clock, sd);
1841{ 1864 if (!tsk_cache_hot ||
1842 struct cfs_rq *cfs_rq = arg; 1865 sd->nr_balance_failed > sd->cache_nice_tries) {
1866#ifdef CONFIG_SCHEDSTATS
1867 if (tsk_cache_hot) {
1868 schedstat_inc(sd, lb_hot_gained[idle]);
1869 schedstat_inc(p, se.nr_forced_migrations);
1870 }
1871#endif
1872 return 1;
1873 }
1843 1874
1844 return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); 1875 if (tsk_cache_hot) {
1876 schedstat_inc(p, se.nr_failed_migrations_hot);
1877 return 0;
1878 }
1879 return 1;
1845} 1880}
1846 1881
1847static struct task_struct *load_balance_next_fair(void *arg) 1882/*
1883 * move_one_task tries to move exactly one task from busiest to this_rq, as
1884 * part of active balancing operations within "domain".
1885 * Returns 1 if successful and 0 otherwise.
1886 *
1887 * Called with both runqueues locked.
1888 */
1889static int
1890move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1891 struct sched_domain *sd, enum cpu_idle_type idle)
1848{ 1892{
1849 struct cfs_rq *cfs_rq = arg; 1893 struct task_struct *p, *n;
1894 struct cfs_rq *cfs_rq;
1895 int pinned = 0;
1896
1897 for_each_leaf_cfs_rq(busiest, cfs_rq) {
1898 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
1899
1900 if (!can_migrate_task(p, busiest, this_cpu,
1901 sd, idle, &pinned))
1902 continue;
1850 1903
1851 return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); 1904 pull_task(busiest, p, this_rq, this_cpu);
1905 /*
1906 * Right now, this is only the second place pull_task()
1907 * is called, so we can safely collect pull_task()
1908 * stats here rather than inside pull_task().
1909 */
1910 schedstat_inc(sd, lb_gained[idle]);
1911 return 1;
1912 }
1913 }
1914
1915 return 0;
1852} 1916}
1853 1917
1854static unsigned long 1918static unsigned long
1855__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1919balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1856 unsigned long max_load_move, struct sched_domain *sd, 1920 unsigned long max_load_move, struct sched_domain *sd,
1857 enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, 1921 enum cpu_idle_type idle, int *all_pinned,
1858 struct cfs_rq *cfs_rq) 1922 int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
1859{ 1923{
1860 struct rq_iterator cfs_rq_iterator; 1924 int loops = 0, pulled = 0, pinned = 0;
1925 long rem_load_move = max_load_move;
1926 struct task_struct *p, *n;
1861 1927
1862 cfs_rq_iterator.start = load_balance_start_fair; 1928 if (max_load_move == 0)
1863 cfs_rq_iterator.next = load_balance_next_fair; 1929 goto out;
1864 cfs_rq_iterator.arg = cfs_rq;
1865 1930
1866 return balance_tasks(this_rq, this_cpu, busiest, 1931 pinned = 1;
1867 max_load_move, sd, idle, all_pinned, 1932
1868 this_best_prio, &cfs_rq_iterator); 1933 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
1934 if (loops++ > sysctl_sched_nr_migrate)
1935 break;
1936
1937 if ((p->se.load.weight >> 1) > rem_load_move ||
1938 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
1939 continue;
1940
1941 pull_task(busiest, p, this_rq, this_cpu);
1942 pulled++;
1943 rem_load_move -= p->se.load.weight;
1944
1945#ifdef CONFIG_PREEMPT
1946 /*
1947 * NEWIDLE balancing is a source of latency, so preemptible
1948 * kernels will stop after the first task is pulled to minimize
1949 * the critical section.
1950 */
1951 if (idle == CPU_NEWLY_IDLE)
1952 break;
1953#endif
1954
1955 /*
1956 * We only want to steal up to the prescribed amount of
1957 * weighted load.
1958 */
1959 if (rem_load_move <= 0)
1960 break;
1961
1962 if (p->prio < *this_best_prio)
1963 *this_best_prio = p->prio;
1964 }
1965out:
1966 /*
1967 * Right now, this is one of only two places pull_task() is called,
1968 * so we can safely collect pull_task() stats here rather than
1969 * inside pull_task().
1970 */
1971 schedstat_add(sd, lb_gained[idle], pulled);
1972
1973 if (all_pinned)
1974 *all_pinned = pinned;
1975
1976 return max_load_move - rem_load_move;
1869} 1977}
1870 1978
1871#ifdef CONFIG_FAIR_GROUP_SCHED 1979#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1897,9 +2005,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1897 rem_load = (u64)rem_load_move * busiest_weight; 2005 rem_load = (u64)rem_load_move * busiest_weight;
1898 rem_load = div_u64(rem_load, busiest_h_load + 1); 2006 rem_load = div_u64(rem_load, busiest_h_load + 1);
1899 2007
1900 moved_load = __load_balance_fair(this_rq, this_cpu, busiest, 2008 moved_load = balance_tasks(this_rq, this_cpu, busiest,
1901 rem_load, sd, idle, all_pinned, this_best_prio, 2009 rem_load, sd, idle, all_pinned, this_best_prio,
1902 tg->cfs_rq[busiest_cpu]); 2010 busiest_cfs_rq);
1903 2011
1904 if (!moved_load) 2012 if (!moved_load)
1905 continue; 2013 continue;
@@ -1922,35 +2030,1499 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1922 struct sched_domain *sd, enum cpu_idle_type idle, 2030 struct sched_domain *sd, enum cpu_idle_type idle,
1923 int *all_pinned, int *this_best_prio) 2031 int *all_pinned, int *this_best_prio)
1924{ 2032{
1925 return __load_balance_fair(this_rq, this_cpu, busiest, 2033 return balance_tasks(this_rq, this_cpu, busiest,
1926 max_load_move, sd, idle, all_pinned, 2034 max_load_move, sd, idle, all_pinned,
1927 this_best_prio, &busiest->cfs); 2035 this_best_prio, &busiest->cfs);
1928} 2036}
1929#endif 2037#endif
1930 2038
1931static int 2039/*
1932move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2040 * move_tasks tries to move up to max_load_move weighted load from busiest to
1933 struct sched_domain *sd, enum cpu_idle_type idle) 2041 * this_rq, as part of a balancing operation within domain "sd".
2042 * Returns 1 if successful and 0 otherwise.
2043 *
2044 * Called with both runqueues locked.
2045 */
2046static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2047 unsigned long max_load_move,
2048 struct sched_domain *sd, enum cpu_idle_type idle,
2049 int *all_pinned)
2050{
2051 unsigned long total_load_moved = 0, load_moved;
2052 int this_best_prio = this_rq->curr->prio;
2053
2054 do {
2055 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
2056 max_load_move - total_load_moved,
2057 sd, idle, all_pinned, &this_best_prio);
2058
2059 total_load_moved += load_moved;
2060
2061#ifdef CONFIG_PREEMPT
2062 /*
2063 * NEWIDLE balancing is a source of latency, so preemptible
2064 * kernels will stop after the first task is pulled to minimize
2065 * the critical section.
2066 */
2067 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
2068 break;
2069
2070 if (raw_spin_is_contended(&this_rq->lock) ||
2071 raw_spin_is_contended(&busiest->lock))
2072 break;
2073#endif
2074 } while (load_moved && max_load_move > total_load_moved);
2075
2076 return total_load_moved > 0;
2077}
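
move_tasks() is a thin loop around the class-specific balancer: keep pulling until the requested weighted load has been moved or a pass makes no progress. The shape of that loop in miniature; pull_some() is a stand-in, not a real kernel function.

#include <stdio.h>

static unsigned long pull_some(unsigned long want)
{
	/* pretend each pass can move at most 300 units of weighted load */
	return want < 300 ? want : 300;
}

int main(void)
{
	unsigned long max_load_move = 1000, total = 0, moved;

	do {
		moved = pull_some(max_load_move - total);
		total += moved;
	} while (moved && max_load_move > total);

	printf("moved %lu of %lu, success=%d\n", total, max_load_move, total > 0);
	return 0;
}
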
2078
2079/********** Helpers for find_busiest_group ************************/
2080/*
2081 * sd_lb_stats - Structure to store the statistics of a sched_domain
2082 * during load balancing.
2083 */
2084struct sd_lb_stats {
2085 struct sched_group *busiest; /* Busiest group in this sd */
2086 struct sched_group *this; /* Local group in this sd */
2087 unsigned long total_load; /* Total load of all groups in sd */
2088 unsigned long total_pwr; /* Total power of all groups in sd */
2089 unsigned long avg_load; /* Average load across all groups in sd */
2090
2091 /** Statistics of this group */
2092 unsigned long this_load;
2093 unsigned long this_load_per_task;
2094 unsigned long this_nr_running;
2095
2096 /* Statistics of the busiest group */
2097 unsigned long max_load;
2098 unsigned long busiest_load_per_task;
2099 unsigned long busiest_nr_running;
2100
2101 int group_imb; /* Is there imbalance in this sd */
2102#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2103 int power_savings_balance; /* Is powersave balance needed for this sd */
2104 struct sched_group *group_min; /* Least loaded group in sd */
2105 struct sched_group *group_leader; /* Group which relieves group_min */
2106 unsigned long min_load_per_task; /* load_per_task in group_min */
2107 unsigned long leader_nr_running; /* Nr running of group_leader */
2108 unsigned long min_nr_running; /* Nr running of group_min */
2109#endif
2110};
2111
2112/*
2113 * sg_lb_stats - stats of a sched_group required for load_balancing
2114 */
2115struct sg_lb_stats {
2116 unsigned long avg_load; /*Avg load across the CPUs of the group */
2117 unsigned long group_load; /* Total load over the CPUs of the group */
2118 unsigned long sum_nr_running; /* Nr tasks running in the group */
2119 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2120 unsigned long group_capacity;
2121 int group_imb; /* Is there an imbalance in the group ? */
2122};
2123
2124/**
2125 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
2126 * @group: The group whose first cpu is to be returned.
2127 */
2128static inline unsigned int group_first_cpu(struct sched_group *group)
2129{
2130 return cpumask_first(sched_group_cpus(group));
2131}
2132
2133/**
2134 * get_sd_load_idx - Obtain the load index for a given sched domain.
2135 * @sd: The sched_domain whose load_idx is to be obtained.
2136 * @idle: The idle status of the CPU whose sd's load_idx is obtained.
2137 */
2138static inline int get_sd_load_idx(struct sched_domain *sd,
2139 enum cpu_idle_type idle)
2140{
2141 int load_idx;
2142
2143 switch (idle) {
2144 case CPU_NOT_IDLE:
2145 load_idx = sd->busy_idx;
2146 break;
2147
2148 case CPU_NEWLY_IDLE:
2149 load_idx = sd->newidle_idx;
2150 break;
2151 default:
2152 load_idx = sd->idle_idx;
2153 break;
2154 }
2155
2156 return load_idx;
2157}
2158
2159
2160#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2161/**
2162 * init_sd_power_savings_stats - Initialize power savings statistics for
2163 * the given sched_domain, during load balancing.
2164 *
2165 * @sd: Sched domain whose power-savings statistics are to be initialized.
2166 * @sds: Variable containing the statistics for sd.
2167 * @idle: Idle status of the CPU at which we're performing load-balancing.
2168 */
2169static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2170 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2171{
2172 /*
2173 * Busy processors will not participate in power savings
2174 * balance.
2175 */
2176 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2177 sds->power_savings_balance = 0;
2178 else {
2179 sds->power_savings_balance = 1;
2180 sds->min_nr_running = ULONG_MAX;
2181 sds->leader_nr_running = 0;
2182 }
2183}
2184
2185/**
2186 * update_sd_power_savings_stats - Update the power saving stats for a
2187 * sched_domain while performing load balancing.
2188 *
2189 * @group: sched_group belonging to the sched_domain under consideration.
2190 * @sds: Variable containing the statistics of the sched_domain
2191 * @local_group: Does group contain the CPU for which we're performing
2192 * load balancing ?
2193 * @sgs: Variable containing the statistics of the group.
2194 */
2195static inline void update_sd_power_savings_stats(struct sched_group *group,
2196 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2197{
2198
2199 if (!sds->power_savings_balance)
2200 return;
2201
2202 /*
2203 * If the local group is idle or completely loaded
2204 * no need to do power savings balance at this domain
2205 */
2206 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
2207 !sds->this_nr_running))
2208 sds->power_savings_balance = 0;
2209
2210 /*
2211 * If a group is already running at full capacity or idle,
2212 * don't include that group in power savings calculations
2213 */
2214 if (!sds->power_savings_balance ||
2215 sgs->sum_nr_running >= sgs->group_capacity ||
2216 !sgs->sum_nr_running)
2217 return;
2218
2219 /*
2220 * Calculate the group which has the least non-idle load.
2221 * This is the group from where we need to pick up the load
2222 * for saving power
2223 */
2224 if ((sgs->sum_nr_running < sds->min_nr_running) ||
2225 (sgs->sum_nr_running == sds->min_nr_running &&
2226 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
2227 sds->group_min = group;
2228 sds->min_nr_running = sgs->sum_nr_running;
2229 sds->min_load_per_task = sgs->sum_weighted_load /
2230 sgs->sum_nr_running;
2231 }
2232
2233 /*
2234 * Calculate the group which is almost near its
2235 * capacity but still has some space to pick up some load
2236 * from other group and save more power
2237 */
2238 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
2239 return;
2240
2241 if (sgs->sum_nr_running > sds->leader_nr_running ||
2242 (sgs->sum_nr_running == sds->leader_nr_running &&
2243 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
2244 sds->group_leader = group;
2245 sds->leader_nr_running = sgs->sum_nr_running;
2246 }
2247}
2248
2249/**
2250 * check_power_save_busiest_group - see if there is potential for some power-savings balance
2251 * @sds: Variable containing the statistics of the sched_domain
2252 * under consideration.
2253 * @this_cpu: Cpu at which we're currently performing load-balancing.
2254 * @imbalance: Variable to store the imbalance.
2255 *
2256 * Description:
2257 * Check if we have potential to perform some power-savings balance.
2258 * If yes, set the busiest group to be the least loaded group in the
2259 * sched_domain, so that its CPUs can be put to idle.
2260 *
2261 * Returns 1 if there is potential to perform power-savings balance.
2262 * Else returns 0.
2263 */
2264static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2265 int this_cpu, unsigned long *imbalance)
2266{
2267 if (!sds->power_savings_balance)
2268 return 0;
2269
2270 if (sds->this != sds->group_leader ||
2271 sds->group_leader == sds->group_min)
2272 return 0;
2273
2274 *imbalance = sds->min_load_per_task;
2275 sds->busiest = sds->group_min;
2276
2277 return 1;
2278
2279}
2280#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2281static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2282 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2283{
2284 return;
2285}
2286
2287static inline void update_sd_power_savings_stats(struct sched_group *group,
2288 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2289{
2290 return;
2291}
2292
2293static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2294 int this_cpu, unsigned long *imbalance)
2295{
2296 return 0;
2297}
2298#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2299
2300
2301unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
2302{
2303 return SCHED_LOAD_SCALE;
2304}
2305
2306unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
2307{
2308 return default_scale_freq_power(sd, cpu);
2309}
2310
2311unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
2312{
2313 unsigned long weight = cpumask_weight(sched_domain_span(sd));
2314 unsigned long smt_gain = sd->smt_gain;
2315
2316 smt_gain /= weight;
2317
2318 return smt_gain;
2319}
2320
2321unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
2322{
2323 return default_scale_smt_power(sd, cpu);
2324}
2325
2326unsigned long scale_rt_power(int cpu)
2327{
2328 struct rq *rq = cpu_rq(cpu);
2329 u64 total, available;
2330
2331 sched_avg_update(rq);
2332
2333 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2334 available = total - rq->rt_avg;
2335
2336 if (unlikely((s64)total < SCHED_LOAD_SCALE))
2337 total = SCHED_LOAD_SCALE;
2338
2339 total >>= SCHED_LOAD_SHIFT;
2340
2341 return div_u64(available, total);
2342}
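
scale_rt_power() returns the fraction of the averaging window not consumed by realtime tasks, expressed against SCHED_LOAD_SCALE. A numeric example with an invented window and rt_avg:

#include <stdio.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)

int main(void)
{
	unsigned long long total = 1000000;	/* ns in the averaging window, assumed */
	unsigned long long rt_avg = 250000;	/* ns spent running RT tasks, assumed */
	unsigned long long available = total - rt_avg;

	if ((long long)total < (long long)SCHED_LOAD_SCALE)
		total = SCHED_LOAD_SCALE;

	total >>= SCHED_LOAD_SHIFT;

	/* about 75% of the window is left for CFS: prints 768 of 1024 */
	printf("rt scale factor: %llu of %lu\n", available / total, SCHED_LOAD_SCALE);
	return 0;
}
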
2343
2344static void update_cpu_power(struct sched_domain *sd, int cpu)
2345{
2346 unsigned long weight = cpumask_weight(sched_domain_span(sd));
2347 unsigned long power = SCHED_LOAD_SCALE;
2348 struct sched_group *sdg = sd->groups;
2349
2350 if (sched_feat(ARCH_POWER))
2351 power *= arch_scale_freq_power(sd, cpu);
2352 else
2353 power *= default_scale_freq_power(sd, cpu);
2354
2355 power >>= SCHED_LOAD_SHIFT;
2356
2357 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
2358 if (sched_feat(ARCH_POWER))
2359 power *= arch_scale_smt_power(sd, cpu);
2360 else
2361 power *= default_scale_smt_power(sd, cpu);
2362
2363 power >>= SCHED_LOAD_SHIFT;
2364 }
2365
2366 power *= scale_rt_power(cpu);
2367 power >>= SCHED_LOAD_SHIFT;
2368
2369 if (!power)
2370 power = 1;
2371
2372 sdg->cpu_power = power;
2373}
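
update_cpu_power() is a chain of fixed-point multiplies, each shifted back down by SCHED_LOAD_SHIFT. The same arithmetic with example scale factors; the frequency and RT factors are assumed, while 589 matches the default smt_gain of 1178 split across two hardware threads.

#include <stdio.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)

int main(void)
{
	unsigned long power = SCHED_LOAD_SCALE;
	unsigned long freq_scale = 1024;	/* stand-in for arch_scale_freq_power() */
	unsigned long smt_scale  = 589;		/* smt_gain 1178 shared by 2 threads */
	unsigned long rt_scale   = 768;		/* 75% of the window left after RT */

	power = (power * freq_scale) >> SCHED_LOAD_SHIFT;
	power = (power * smt_scale)  >> SCHED_LOAD_SHIFT;
	power = (power * rt_scale)   >> SCHED_LOAD_SHIFT;

	if (!power)
		power = 1;

	printf("cpu_power = %lu (SCHED_LOAD_SCALE = %lu)\n", power, SCHED_LOAD_SCALE);
	return 0;
}
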
2374
2375static void update_group_power(struct sched_domain *sd, int cpu)
2376{
2377 struct sched_domain *child = sd->child;
2378 struct sched_group *group, *sdg = sd->groups;
2379 unsigned long power;
2380
2381 if (!child) {
2382 update_cpu_power(sd, cpu);
2383 return;
2384 }
2385
2386 power = 0;
2387
2388 group = child->groups;
2389 do {
2390 power += group->cpu_power;
2391 group = group->next;
2392 } while (group != child->groups);
2393
2394 sdg->cpu_power = power;
2395}
2396
2397/**
2398 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
2399 * @sd: The sched_domain whose statistics are to be updated.
2400 * @group: sched_group whose statistics are to be updated.
2401 * @this_cpu: Cpu for which load balance is currently performed.
2402 * @idle: Idle status of this_cpu
2403 * @load_idx: Load index of sched_domain of this_cpu for load calc.
2404 * @sd_idle: Idle status of the sched_domain containing group.
2405 * @local_group: Does group contain this_cpu.
2406 * @cpus: Set of cpus considered for load balancing.
2407 * @balance: Should we balance.
2408 * @sgs: variable to hold the statistics for this group.
2409 */
2410static inline void update_sg_lb_stats(struct sched_domain *sd,
2411 struct sched_group *group, int this_cpu,
2412 enum cpu_idle_type idle, int load_idx, int *sd_idle,
2413 int local_group, const struct cpumask *cpus,
2414 int *balance, struct sg_lb_stats *sgs)
2415{
2416 unsigned long load, max_cpu_load, min_cpu_load;
2417 int i;
2418 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2419 unsigned long sum_avg_load_per_task;
2420 unsigned long avg_load_per_task;
2421
2422 if (local_group)
2423 balance_cpu = group_first_cpu(group);
2424
2425 /* Tally up the load of all CPUs in the group */
2426 sum_avg_load_per_task = avg_load_per_task = 0;
2427 max_cpu_load = 0;
2428 min_cpu_load = ~0UL;
2429
2430 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2431 struct rq *rq = cpu_rq(i);
2432
2433 if (*sd_idle && rq->nr_running)
2434 *sd_idle = 0;
2435
2436 /* Bias balancing toward cpus of our domain */
2437 if (local_group) {
2438 if (idle_cpu(i) && !first_idle_cpu) {
2439 first_idle_cpu = 1;
2440 balance_cpu = i;
2441 }
2442
2443 load = target_load(i, load_idx);
2444 } else {
2445 load = source_load(i, load_idx);
2446 if (load > max_cpu_load)
2447 max_cpu_load = load;
2448 if (min_cpu_load > load)
2449 min_cpu_load = load;
2450 }
2451
2452 sgs->group_load += load;
2453 sgs->sum_nr_running += rq->nr_running;
2454 sgs->sum_weighted_load += weighted_cpuload(i);
2455
2456 sum_avg_load_per_task += cpu_avg_load_per_task(i);
2457 }
2458
2459 /*
2460	 * The first idle cpu or the first cpu (busiest) in this sched group
2461	 * is eligible for doing load balancing at this and above
2462	 * domains. In the newly idle case, we will allow all the cpus
2463 * to do the newly idle load balance.
2464 */
2465 if (idle != CPU_NEWLY_IDLE && local_group &&
2466 balance_cpu != this_cpu) {
2467 *balance = 0;
2468 return;
2469 }
2470
2471 update_group_power(sd, this_cpu);
2472
2473 /* Adjust by relative CPU power of the group */
2474 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
2475
2476
2477 /*
2478 * Consider the group unbalanced when the imbalance is larger
2479 * than the average weight of two tasks.
2480 *
2481 * APZ: with cgroup the avg task weight can vary wildly and
2482 * might not be a suitable number - should we keep a
2483 * normalized nr_running number somewhere that negates
2484 * the hierarchy?
2485 */
2486 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
2487 group->cpu_power;
2488
2489 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
2490 sgs->group_imb = 1;
2491
2492 sgs->group_capacity =
2493 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2494}
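
The two derived values at the end of update_sg_lb_stats() can be seen in a small stand-alone sketch (figures invented): avg_load normalizes the raw group load by the group's cpu_power, and group_capacity rounds that power to a whole number of task slots.

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

int main(void)
{
	unsigned long group_load = 3072;	/* sum of the per-cpu loads */
	unsigned long cpu_power = 1536;		/* e.g. two throttled SMT siblings */

	unsigned long avg_load = group_load * SCHED_LOAD_SCALE / cpu_power;
	unsigned long capacity = DIV_ROUND_CLOSEST(cpu_power, SCHED_LOAD_SCALE);

	printf("avg_load=%lu capacity=%lu\n", avg_load, capacity);	/* 2048, 2 */
	return 0;
}
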
2495
2496/**
2497 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
2498 * @sd: sched_domain whose statistics are to be updated.
2499 * @this_cpu: Cpu for which load balance is currently performed.
2500 * @idle: Idle status of this_cpu
2501 * @sd_idle: Idle status of the sched_domain containing group.
2502 * @cpus: Set of cpus considered for load balancing.
2503 * @balance: Should we balance.
2504 * @sds: variable to hold the statistics for this sched_domain.
2505 */
2506static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2507 enum cpu_idle_type idle, int *sd_idle,
2508 const struct cpumask *cpus, int *balance,
2509 struct sd_lb_stats *sds)
2510{
2511 struct sched_domain *child = sd->child;
2512 struct sched_group *group = sd->groups;
2513 struct sg_lb_stats sgs;
2514 int load_idx, prefer_sibling = 0;
2515
2516 if (child && child->flags & SD_PREFER_SIBLING)
2517 prefer_sibling = 1;
2518
2519 init_sd_power_savings_stats(sd, sds, idle);
2520 load_idx = get_sd_load_idx(sd, idle);
2521
2522 do {
2523 int local_group;
2524
2525 local_group = cpumask_test_cpu(this_cpu,
2526 sched_group_cpus(group));
2527 memset(&sgs, 0, sizeof(sgs));
2528 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
2529 local_group, cpus, balance, &sgs);
2530
2531 if (local_group && !(*balance))
2532 return;
2533
2534 sds->total_load += sgs.group_load;
2535 sds->total_pwr += group->cpu_power;
2536
2537 /*
2538 * In case the child domain prefers tasks go to siblings
2539 * first, lower the group capacity to one so that we'll try
2540 * and move all the excess tasks away.
2541 */
2542 if (prefer_sibling)
2543 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2544
2545 if (local_group) {
2546 sds->this_load = sgs.avg_load;
2547 sds->this = group;
2548 sds->this_nr_running = sgs.sum_nr_running;
2549 sds->this_load_per_task = sgs.sum_weighted_load;
2550 } else if (sgs.avg_load > sds->max_load &&
2551 (sgs.sum_nr_running > sgs.group_capacity ||
2552 sgs.group_imb)) {
2553 sds->max_load = sgs.avg_load;
2554 sds->busiest = group;
2555 sds->busiest_nr_running = sgs.sum_nr_running;
2556 sds->busiest_load_per_task = sgs.sum_weighted_load;
2557 sds->group_imb = sgs.group_imb;
2558 }
2559
2560 update_sd_power_savings_stats(group, sds, local_group, &sgs);
2561 group = group->next;
2562 } while (group != sd->groups);
2563}
2564
2565/**
2566 * fix_small_imbalance - Calculate the minor imbalance that exists
2567 * amongst the groups of a sched_domain, during
2568 * load balancing.
2569 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
2570 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
2571 * @imbalance: Variable to store the imbalance.
2572 */
2573static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2574 int this_cpu, unsigned long *imbalance)
2575{
2576 unsigned long tmp, pwr_now = 0, pwr_move = 0;
2577 unsigned int imbn = 2;
2578
2579 if (sds->this_nr_running) {
2580 sds->this_load_per_task /= sds->this_nr_running;
2581 if (sds->busiest_load_per_task >
2582 sds->this_load_per_task)
2583 imbn = 1;
2584 } else
2585 sds->this_load_per_task =
2586 cpu_avg_load_per_task(this_cpu);
2587
2588 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
2589 sds->busiest_load_per_task * imbn) {
2590 *imbalance = sds->busiest_load_per_task;
2591 return;
2592 }
2593
2594 /*
2595 * OK, we don't have enough imbalance to justify moving tasks,
2596 * however we may be able to increase total CPU power used by
2597 * moving them.
2598 */
2599
2600 pwr_now += sds->busiest->cpu_power *
2601 min(sds->busiest_load_per_task, sds->max_load);
2602 pwr_now += sds->this->cpu_power *
2603 min(sds->this_load_per_task, sds->this_load);
2604 pwr_now /= SCHED_LOAD_SCALE;
2605
2606 /* Amount of load we'd subtract */
2607 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2608 sds->busiest->cpu_power;
2609 if (sds->max_load > tmp)
2610 pwr_move += sds->busiest->cpu_power *
2611 min(sds->busiest_load_per_task, sds->max_load - tmp);
2612
2613 /* Amount of load we'd add */
2614 if (sds->max_load * sds->busiest->cpu_power <
2615 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
2616 tmp = (sds->max_load * sds->busiest->cpu_power) /
2617 sds->this->cpu_power;
2618 else
2619 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2620 sds->this->cpu_power;
2621 pwr_move += sds->this->cpu_power *
2622 min(sds->this_load_per_task, sds->this_load + tmp);
2623 pwr_move /= SCHED_LOAD_SCALE;
2624
2625 /* Move if we gain throughput */
2626 if (pwr_move > pwr_now)
2627 *imbalance = sds->busiest_load_per_task;
2628}
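
A simplified user-space sketch of the throughput comparison above, assuming both groups run at full cpu_power and dropping the extra guards and branches the kernel applies; it only shows how pwr_now and pwr_move are weighed against each other (all numbers invented):

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long busiest_power = 1024, this_power = 1024;
	unsigned long max_load = 900, this_load = 200, task = 600;
	unsigned long pwr_now, pwr_move, tmp;

	/* Throughput served right now, weighted by each group's cpu_power. */
	pwr_now  = busiest_power * min_ul(task, max_load);
	pwr_now += this_power * min_ul(task, this_load);
	pwr_now /= SCHED_LOAD_SCALE;

	/* Throughput if one task of weight `task` were moved over here. */
	tmp = task * SCHED_LOAD_SCALE / busiest_power;
	pwr_move  = busiest_power * min_ul(task, max_load - tmp);
	tmp = task * SCHED_LOAD_SCALE / this_power;
	pwr_move += this_power * min_ul(task, this_load + tmp);
	pwr_move /= SCHED_LOAD_SCALE;

	printf("pwr_now=%lu pwr_move=%lu -> %s\n", pwr_now, pwr_move,
	       pwr_move > pwr_now ? "move one task" : "leave it alone");
	return 0;
}
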
2629
2630/**
2631 * calculate_imbalance - Calculate the amount of imbalance present within the
2632 * groups of a given sched_domain during load balance.
2633 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
2634 * @this_cpu: Cpu for which currently load balance is being performed.
2635 * @imbalance: The variable to store the imbalance.
2636 */
2637static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2638 unsigned long *imbalance)
2639{
2640 unsigned long max_pull;
2641 /*
2642 * In the presence of smp nice balancing, certain scenarios can have
2643	 * max load less than avg load (as we skip the groups at or below
2644	 * its cpu_power while calculating max_load).
2645 */
2646 if (sds->max_load < sds->avg_load) {
2647 *imbalance = 0;
2648 return fix_small_imbalance(sds, this_cpu, imbalance);
2649 }
2650
2651 /* Don't want to pull so many tasks that a group would go idle */
2652 max_pull = min(sds->max_load - sds->avg_load,
2653 sds->max_load - sds->busiest_load_per_task);
2654
2655 /* How much load to actually move to equalise the imbalance */
2656 *imbalance = min(max_pull * sds->busiest->cpu_power,
2657 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
2658 / SCHED_LOAD_SCALE;
2659
2660 /*
2661 * if *imbalance is less than the average load per runnable task
2662	 * there is no guarantee that any tasks will be moved, so we'll have
2663 * a think about bumping its value to force at least one task to be
2664 * moved
2665 */
2666 if (*imbalance < sds->busiest_load_per_task)
2667 return fix_small_imbalance(sds, this_cpu, imbalance);
2668
2669}
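
A worked example of calculate_imbalance(), with invented loads and both groups at full cpu_power; note how the result falls below one task's load and would therefore be handed to fix_small_imbalance():

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long max_load = 1800, avg_load = 1200, this_load = 900;
	unsigned long busiest_load_per_task = 400;
	unsigned long busiest_power = 1024, this_power = 1024;
	unsigned long max_pull, imbalance;

	/* Don't pull so much that the busiest group drops below the average
	 * or gives up more than it holds beyond one task's worth of load. */
	max_pull = min_ul(max_load - avg_load,
			  max_load - busiest_load_per_task);

	/* Move the smaller of what busiest can give and what we can absorb. */
	imbalance = min_ul(max_pull * busiest_power,
			   (avg_load - this_load) * this_power) / SCHED_LOAD_SCALE;

	printf("max_pull=%lu imbalance=%lu\n", max_pull, imbalance);
	/* 300 < busiest_load_per_task (400), so the kernel would now consult
	 * fix_small_imbalance() to decide whether to bump the value. */
	return 0;
}
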
2670/******* find_busiest_group() helpers end here *********************/
2671
2672/**
2673 * find_busiest_group - Returns the busiest group within the sched_domain
2674 * if there is an imbalance. If there isn't an imbalance, and
2675 * the user has opted for power-savings, it returns a group whose
2676 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
2677 * such a group exists.
2678 *
2679 * Also calculates the amount of weighted load which should be moved
2680 * to restore balance.
2681 *
2682 * @sd: The sched_domain whose busiest group is to be returned.
2683 * @this_cpu: The cpu for which load balancing is currently being performed.
2684 * @imbalance: Variable which stores amount of weighted load which should
2685 * be moved to restore balance/put a group to idle.
2686 * @idle: The idle status of this_cpu.
2687 * @sd_idle: The idleness of sd
2688 * @cpus: The set of CPUs under consideration for load-balancing.
2689 * @balance: Pointer to a variable indicating if this_cpu
2690 * is the appropriate cpu to perform load balancing at this_level.
2691 *
2692 * Returns: - the busiest group if imbalance exists.
2693 * - If no imbalance and user has opted for power-savings balance,
2694 * return the least loaded group whose CPUs can be
2695 * put to idle by rebalancing its tasks onto our group.
2696 */
2697static struct sched_group *
2698find_busiest_group(struct sched_domain *sd, int this_cpu,
2699 unsigned long *imbalance, enum cpu_idle_type idle,
2700 int *sd_idle, const struct cpumask *cpus, int *balance)
2701{
2702 struct sd_lb_stats sds;
2703
2704 memset(&sds, 0, sizeof(sds));
2705
2706 /*
2707	 * Compute the various statistics relevant for load balancing at
2708 * this level.
2709 */
2710 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
2711 balance, &sds);
2712
2713 /* Cases where imbalance does not exist from POV of this_cpu */
2714 /* 1) this_cpu is not the appropriate cpu to perform load balancing
2715 * at this level.
2716 * 2) There is no busy sibling group to pull from.
2717 * 3) This group is the busiest group.
2718	 * 4) This group is busier than the average busyness at this
2719 * sched_domain.
2720 * 5) The imbalance is within the specified limit.
2721 * 6) Any rebalance would lead to ping-pong
2722 */
2723 if (!(*balance))
2724 goto ret;
2725
2726 if (!sds.busiest || sds.busiest_nr_running == 0)
2727 goto out_balanced;
2728
2729 if (sds.this_load >= sds.max_load)
2730 goto out_balanced;
2731
2732 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
2733
2734 if (sds.this_load >= sds.avg_load)
2735 goto out_balanced;
2736
2737 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
2738 goto out_balanced;
2739
2740 sds.busiest_load_per_task /= sds.busiest_nr_running;
2741 if (sds.group_imb)
2742 sds.busiest_load_per_task =
2743 min(sds.busiest_load_per_task, sds.avg_load);
2744
2745 /*
2746 * We're trying to get all the cpus to the average_load, so we don't
2747 * want to push ourselves above the average load, nor do we wish to
2748 * reduce the max loaded cpu below the average load, as either of these
2749 * actions would just result in more rebalancing later, and ping-pong
2750 * tasks around. Thus we look for the minimum possible imbalance.
2751 * Negative imbalances (*we* are more loaded than anyone else) will
2752 * be counted as no imbalance for these purposes -- we can't fix that
2753 * by pulling tasks to us. Be careful of negative numbers as they'll
2754 * appear as very large values with unsigned longs.
2755 */
2756 if (sds.max_load <= sds.busiest_load_per_task)
2757 goto out_balanced;
2758
2759 /* Looks like there is an imbalance. Compute it */
2760 calculate_imbalance(&sds, this_cpu, imbalance);
2761 return sds.busiest;
2762
2763out_balanced:
2764 /*
2765 * There is no obvious imbalance. But check if we can do some balancing
2766 * to save power.
2767 */
2768 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
2769 return sds.busiest;
2770ret:
2771 *imbalance = 0;
2772 return NULL;
2773}
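
Check 5 in the list above ("within the specified limit") is the imbalance_pct test. A tiny sketch using 125 as the threshold, which is believed to be a typical sched_domain default (meaning the busiest group must be at least 25% more loaded than we are before we act):

#include <stdio.h>

int main(void)
{
	unsigned int imbalance_pct = 125;	/* assumed typical default */
	unsigned long this_load = 800, max_load = 950;

	if (100 * max_load <= imbalance_pct * this_load)
		printf("balanced enough (%lu vs %lu)\n", max_load, this_load);
	else
		printf("imbalance worth acting on\n");
	return 0;
}
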
2774
2775/*
2776 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2777 */
2778static struct rq *
2779find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2780 unsigned long imbalance, const struct cpumask *cpus)
1934{ 2781{
1935 struct cfs_rq *busy_cfs_rq; 2782 struct rq *busiest = NULL, *rq;
1936 struct rq_iterator cfs_rq_iterator; 2783 unsigned long max_load = 0;
2784 int i;
2785
2786 for_each_cpu(i, sched_group_cpus(group)) {
2787 unsigned long power = power_of(i);
2788 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
2789 unsigned long wl;
2790
2791 if (!cpumask_test_cpu(i, cpus))
2792 continue;
1937 2793
1938 cfs_rq_iterator.start = load_balance_start_fair; 2794 rq = cpu_rq(i);
1939 cfs_rq_iterator.next = load_balance_next_fair; 2795 wl = weighted_cpuload(i);
1940 2796
1941 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
1942 /* 2797 /*
1943 * pass busy_cfs_rq argument into 2798 * When comparing with imbalance, use weighted_cpuload()
1944 * load_balance_[start|next]_fair iterators 2799 * which is not scaled with the cpu power.
1945 */ 2800 */
1946 cfs_rq_iterator.arg = busy_cfs_rq; 2801 if (capacity && rq->nr_running == 1 && wl > imbalance)
1947 if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, 2802 continue;
1948 &cfs_rq_iterator)) 2803
1949 return 1; 2804 /*
2805 * For the load comparisons with the other cpu's, consider
2806 * the weighted_cpuload() scaled with the cpu power, so that
2807 * the load can be moved away from the cpu that is potentially
2808 * running at a lower capacity.
2809 */
2810 wl = (wl * SCHED_LOAD_SCALE) / power;
2811
2812 if (wl > max_load) {
2813 max_load = wl;
2814 busiest = rq;
2815 }
1950 } 2816 }
1951 2817
2818 return busiest;
2819}
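
The rescaling step in find_busiest_queue() can be seen in isolation: the same raw weighted load looks twice as heavy on a CPU whose effective power is half of SCHED_LOAD_SCALE, so such a CPU is preferred as the source. A stand-alone sketch with invented numbers:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

int main(void)
{
	unsigned long wl = 2048;		/* raw weighted load on the cpu */
	unsigned long full_power = 1024;	/* unconstrained cpu */
	unsigned long half_power = 512;		/* e.g. a busy SMT sibling */

	printf("scaled on full-power cpu: %lu\n", wl * SCHED_LOAD_SCALE / full_power);
	printf("scaled on half-power cpu: %lu\n", wl * SCHED_LOAD_SCALE / half_power);
	return 0;
}
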
2820
2821/*
2822 * Max backoff if we encounter pinned tasks. Pretty arbitrary value; it
2823 * just needs to be large enough.
2824 */
2825#define MAX_PINNED_INTERVAL 512
2826
2827/* Working cpumask for load_balance and load_balance_newidle. */
2828static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2829
2830static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
2831{
2832 if (idle == CPU_NEWLY_IDLE) {
2833 /*
2834 * The only task running in a non-idle cpu can be moved to this
2835		 * cpu in an attempt to completely free up the other CPU
2836 * package.
2837 *
2838 * The package power saving logic comes from
2839		 * find_busiest_group(). If there is no imbalance, then
2840 * f_b_g() will return NULL. However when sched_mc={1,2} then
2841 * f_b_g() will select a group from which a running task may be
2842 * pulled to this cpu in order to make the other package idle.
2843 * If there is no opportunity to make a package idle and if
2844		 * there is no imbalance, then f_b_g() will return NULL and no
2845 * action will be taken in load_balance_newidle().
2846 *
2847 * Under normal task pull operation due to imbalance, there
2848 * will be more than one task in the source run queue and
2849 * move_tasks() will succeed. ld_moved will be true and this
2850 * active balance code will not be triggered.
2851 */
2852 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2853 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2854 return 0;
2855
2856 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
2857 return 0;
2858 }
2859
2860 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
2861}
2862
2863/*
2864 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2865 * tasks if there is an imbalance.
2866 */
2867static int load_balance(int this_cpu, struct rq *this_rq,
2868 struct sched_domain *sd, enum cpu_idle_type idle,
2869 int *balance)
2870{
2871 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2872 struct sched_group *group;
2873 unsigned long imbalance;
2874 struct rq *busiest;
2875 unsigned long flags;
2876 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
2877
2878 cpumask_copy(cpus, cpu_active_mask);
2879
2880 /*
2881 * When power savings policy is enabled for the parent domain, idle
2882 * sibling can pick up load irrespective of busy siblings. In this case,
2883 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2884 * portraying it as CPU_NOT_IDLE.
2885 */
2886 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2887 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2888 sd_idle = 1;
2889
2890 schedstat_inc(sd, lb_count[idle]);
2891
2892redo:
2893 update_shares(sd);
2894 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2895 cpus, balance);
2896
2897 if (*balance == 0)
2898 goto out_balanced;
2899
2900 if (!group) {
2901 schedstat_inc(sd, lb_nobusyg[idle]);
2902 goto out_balanced;
2903 }
2904
2905 busiest = find_busiest_queue(group, idle, imbalance, cpus);
2906 if (!busiest) {
2907 schedstat_inc(sd, lb_nobusyq[idle]);
2908 goto out_balanced;
2909 }
2910
2911 BUG_ON(busiest == this_rq);
2912
2913 schedstat_add(sd, lb_imbalance[idle], imbalance);
2914
2915 ld_moved = 0;
2916 if (busiest->nr_running > 1) {
2917 /*
2918 * Attempt to move tasks. If find_busiest_group has found
2919 * an imbalance but busiest->nr_running <= 1, the group is
2920 * still unbalanced. ld_moved simply stays zero, so it is
2921 * correctly treated as an imbalance.
2922 */
2923 local_irq_save(flags);
2924 double_rq_lock(this_rq, busiest);
2925 ld_moved = move_tasks(this_rq, this_cpu, busiest,
2926 imbalance, sd, idle, &all_pinned);
2927 double_rq_unlock(this_rq, busiest);
2928 local_irq_restore(flags);
2929
2930 /*
2931 * some other cpu did the load balance for us.
2932 */
2933 if (ld_moved && this_cpu != smp_processor_id())
2934 resched_cpu(this_cpu);
2935
2936 /* All tasks on this runqueue were pinned by CPU affinity */
2937 if (unlikely(all_pinned)) {
2938 cpumask_clear_cpu(cpu_of(busiest), cpus);
2939 if (!cpumask_empty(cpus))
2940 goto redo;
2941 goto out_balanced;
2942 }
2943 }
2944
2945 if (!ld_moved) {
2946 schedstat_inc(sd, lb_failed[idle]);
2947 sd->nr_balance_failed++;
2948
2949 if (need_active_balance(sd, sd_idle, idle)) {
2950 raw_spin_lock_irqsave(&busiest->lock, flags);
2951
2952			/* don't kick the migration_thread if the curr
2953 * task on busiest cpu can't be moved to this_cpu
2954 */
2955 if (!cpumask_test_cpu(this_cpu,
2956 &busiest->curr->cpus_allowed)) {
2957 raw_spin_unlock_irqrestore(&busiest->lock,
2958 flags);
2959 all_pinned = 1;
2960 goto out_one_pinned;
2961 }
2962
2963 if (!busiest->active_balance) {
2964 busiest->active_balance = 1;
2965 busiest->push_cpu = this_cpu;
2966 active_balance = 1;
2967 }
2968 raw_spin_unlock_irqrestore(&busiest->lock, flags);
2969 if (active_balance)
2970 wake_up_process(busiest->migration_thread);
2971
2972 /*
2973 * We've kicked active balancing, reset the failure
2974 * counter.
2975 */
2976 sd->nr_balance_failed = sd->cache_nice_tries+1;
2977 }
2978 } else
2979 sd->nr_balance_failed = 0;
2980
2981 if (likely(!active_balance)) {
2982 /* We were unbalanced, so reset the balancing interval */
2983 sd->balance_interval = sd->min_interval;
2984 } else {
2985 /*
2986 * If we've begun active balancing, start to back off. This
2987 * case may not be covered by the all_pinned logic if there
2988 * is only 1 task on the busy runqueue (because we don't call
2989 * move_tasks).
2990 */
2991 if (sd->balance_interval < sd->max_interval)
2992 sd->balance_interval *= 2;
2993 }
2994
2995 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2996 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2997 ld_moved = -1;
2998
2999 goto out;
3000
3001out_balanced:
3002 schedstat_inc(sd, lb_balanced[idle]);
3003
3004 sd->nr_balance_failed = 0;
3005
3006out_one_pinned:
3007 /* tune up the balancing interval */
3008 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
3009 (sd->balance_interval < sd->max_interval))
3010 sd->balance_interval *= 2;
3011
3012 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3013 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3014 ld_moved = -1;
3015 else
3016 ld_moved = 0;
3017out:
3018 if (ld_moved)
3019 update_shares(sd);
3020 return ld_moved;
3021}
3022
3023/*
3024 * idle_balance is called by schedule() if this_cpu is about to become
3025 * idle. Attempts to pull tasks from other CPUs.
3026 */
3027static void idle_balance(int this_cpu, struct rq *this_rq)
3028{
3029 struct sched_domain *sd;
3030 int pulled_task = 0;
3031 unsigned long next_balance = jiffies + HZ;
3032
3033 this_rq->idle_stamp = this_rq->clock;
3034
3035 if (this_rq->avg_idle < sysctl_sched_migration_cost)
3036 return;
3037
3038 /*
3039 * Drop the rq->lock, but keep IRQ/preempt disabled.
3040 */
3041 raw_spin_unlock(&this_rq->lock);
3042
3043 for_each_domain(this_cpu, sd) {
3044 unsigned long interval;
3045 int balance = 1;
3046
3047 if (!(sd->flags & SD_LOAD_BALANCE))
3048 continue;
3049
3050 if (sd->flags & SD_BALANCE_NEWIDLE) {
3051 /* If we've pulled tasks over stop searching: */
3052 pulled_task = load_balance(this_cpu, this_rq,
3053 sd, CPU_NEWLY_IDLE, &balance);
3054 }
3055
3056 interval = msecs_to_jiffies(sd->balance_interval);
3057 if (time_after(next_balance, sd->last_balance + interval))
3058 next_balance = sd->last_balance + interval;
3059 if (pulled_task) {
3060 this_rq->idle_stamp = 0;
3061 break;
3062 }
3063 }
3064
3065 raw_spin_lock(&this_rq->lock);
3066
3067 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
3068 /*
3069 * We are going idle. next_balance may be set based on
3070 * a busy processor. So reset next_balance.
3071 */
3072 this_rq->next_balance = next_balance;
3073 }
3074}
3075
3076/*
3077 * active_load_balance is run by migration threads. It pushes running tasks
3078 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
3079 * running on each physical CPU where possible, and avoids physical /
3080 * logical imbalances.
3081 *
3082 * Called with busiest_rq locked.
3083 */
3084static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3085{
3086 int target_cpu = busiest_rq->push_cpu;
3087 struct sched_domain *sd;
3088 struct rq *target_rq;
3089
3090 /* Is there any task to move? */
3091 if (busiest_rq->nr_running <= 1)
3092 return;
3093
3094 target_rq = cpu_rq(target_cpu);
3095
3096 /*
3097 * This condition is "impossible", if it occurs
3098 * we need to fix it. Originally reported by
3099 * Bjorn Helgaas on a 128-cpu setup.
3100 */
3101 BUG_ON(busiest_rq == target_rq);
3102
3103 /* move a task from busiest_rq to target_rq */
3104 double_lock_balance(busiest_rq, target_rq);
3105 update_rq_clock(busiest_rq);
3106 update_rq_clock(target_rq);
3107
3108 /* Search for an sd spanning us and the target CPU. */
3109 for_each_domain(target_cpu, sd) {
3110 if ((sd->flags & SD_LOAD_BALANCE) &&
3111 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
3112 break;
3113 }
3114
3115 if (likely(sd)) {
3116 schedstat_inc(sd, alb_count);
3117
3118 if (move_one_task(target_rq, target_cpu, busiest_rq,
3119 sd, CPU_IDLE))
3120 schedstat_inc(sd, alb_pushed);
3121 else
3122 schedstat_inc(sd, alb_failed);
3123 }
3124 double_unlock_balance(busiest_rq, target_rq);
3125}
3126
3127#ifdef CONFIG_NO_HZ
3128static struct {
3129 atomic_t load_balancer;
3130 cpumask_var_t cpu_mask;
3131 cpumask_var_t ilb_grp_nohz_mask;
3132} nohz ____cacheline_aligned = {
3133 .load_balancer = ATOMIC_INIT(-1),
3134};
3135
3136int get_nohz_load_balancer(void)
3137{
3138 return atomic_read(&nohz.load_balancer);
3139}
3140
3141#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3142/**
3143 * lowest_flag_domain - Return lowest sched_domain containing flag.
3144 * @cpu: The cpu whose lowest level of sched domain is to
3145 * be returned.
3146 * @flag: The flag to check for the lowest sched_domain
3147 * for the given cpu.
3148 *
3149 * Returns the lowest sched_domain of a cpu which contains the given flag.
3150 */
3151static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3152{
3153 struct sched_domain *sd;
3154
3155 for_each_domain(cpu, sd)
3156 if (sd && (sd->flags & flag))
3157 break;
3158
3159 return sd;
3160}
3161
3162/**
3163 * for_each_flag_domain - Iterates over sched_domains containing the flag.
3164 * @cpu: The cpu whose domains we're iterating over.
3165 * @sd: variable holding the value of the power_savings_sd
3166 * for cpu.
3167 * @flag: The flag to filter the sched_domains to be iterated.
3168 *
3169 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
3170 * set, starting from the lowest sched_domain to the highest.
3171 */
3172#define for_each_flag_domain(cpu, sd, flag) \
3173 for (sd = lowest_flag_domain(cpu, flag); \
3174 (sd && (sd->flags & flag)); sd = sd->parent)
3175
3176/**
3177 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
3178 * @ilb_group: group to be checked for semi-idleness
3179 *
3180 * Returns: 1 if the group is semi-idle. 0 otherwise.
3181 *
3182 * We define a sched_group to be semi-idle if it has at least one idle CPU
3183 * and at least one non-idle CPU. This helper function checks if the given
3184 * sched_group is semi-idle or not.
3185 */
3186static inline int is_semi_idle_group(struct sched_group *ilb_group)
3187{
3188 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
3189 sched_group_cpus(ilb_group));
3190
3191 /*
3192	 * A sched_group is semi-idle when it has at least one busy cpu
3193	 * and at least one idle cpu.
3194 */
3195 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
3196 return 0;
3197
3198 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
3199 return 0;
3200
3201 return 1;
3202}
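
A bit-mask sketch of the same semi-idle test, with plain words standing in for struct cpumask (editor's illustration only): the intersection of the group's CPUs with the nohz (idle, tick-stopped) mask must be neither empty nor the whole group.

#include <stdio.h>

static int is_semi_idle(unsigned long group_mask, unsigned long nohz_mask)
{
	unsigned long idle_in_group = group_mask & nohz_mask;

	if (idle_in_group == 0)			/* nobody idle: fully busy */
		return 0;
	if (idle_in_group == group_mask)	/* everybody idle */
		return 0;
	return 1;
}

int main(void)
{
	printf("%d\n", is_semi_idle(0x0f, 0x03));	/* cpus 0-3, 0-1 idle -> 1 */
	printf("%d\n", is_semi_idle(0x0f, 0x0f));	/* all idle -> 0 */
	printf("%d\n", is_semi_idle(0x0f, 0x30));	/* no overlap -> 0 */
	return 0;
}
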
3203/**
3204 * find_new_ilb - Finds the optimum idle load balancer for nomination.
3205 * @cpu: The cpu which is nominating a new idle_load_balancer.
3206 *
3207 * Returns: the id of the idle load balancer if one exists,
3208 * else a value >= nr_cpu_ids.
3209 *
3210 * This algorithm picks the idle load balancer such that it belongs to a
3211 * semi-idle powersavings sched_domain. The idea is to try and avoid
3212 * completely idle packages/cores just for the purpose of idle load balancing
3213 * when there are other idle cpus which are better suited for that job.
3214 */
3215static int find_new_ilb(int cpu)
3216{
3217 struct sched_domain *sd;
3218 struct sched_group *ilb_group;
3219
3220 /*
3221	 * Select the idle load balancer from semi-idle packages only
3222	 * when power-aware load balancing is enabled.
3223 */
3224 if (!(sched_smt_power_savings || sched_mc_power_savings))
3225 goto out_done;
3226
3227 /*
3228 * Optimize for the case when we have no idle CPUs or only one
3229 * idle CPU. Don't walk the sched_domain hierarchy in such cases
3230 */
3231 if (cpumask_weight(nohz.cpu_mask) < 2)
3232 goto out_done;
3233
3234 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3235 ilb_group = sd->groups;
3236
3237 do {
3238 if (is_semi_idle_group(ilb_group))
3239 return cpumask_first(nohz.ilb_grp_nohz_mask);
3240
3241 ilb_group = ilb_group->next;
3242
3243 } while (ilb_group != sd->groups);
3244 }
3245
3246out_done:
3247 return cpumask_first(nohz.cpu_mask);
3248}
3249#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3250static inline int find_new_ilb(int call_cpu)
3251{
3252 return cpumask_first(nohz.cpu_mask);
3253}
3254#endif
3255
3256/*
3257 * This routine will try to nominate the ilb (idle load balancing)
3258 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
3259 * load balancing on behalf of all those cpus. If all the cpus in the system
3260 * go into this tickless mode, then there will be no ilb owner (as there is
3261 * no need for one) and all the cpus will sleep till the next wakeup event
3262 * arrives...
3263 *
3264 * For the ilb owner, tick is not stopped. And this tick will be used
3265 * for idle load balancing. ilb owner will still be part of
3266 * nohz.cpu_mask..
3267 *
3268 * While stopping the tick, this cpu will become the ilb owner if there
3269 * is no other owner. And will be the owner till that cpu becomes busy
3270 * or if all cpus in the system stop their ticks at which point
3271 * there is no need for ilb owner.
3272 *
3273 * When the ilb owner becomes busy, it nominates another owner, during the
3274 * next busy scheduler_tick()
3275 */
3276int select_nohz_load_balancer(int stop_tick)
3277{
3278 int cpu = smp_processor_id();
3279
3280 if (stop_tick) {
3281 cpu_rq(cpu)->in_nohz_recently = 1;
3282
3283 if (!cpu_active(cpu)) {
3284 if (atomic_read(&nohz.load_balancer) != cpu)
3285 return 0;
3286
3287 /*
3288 * If we are going offline and still the leader,
3289 * give up!
3290 */
3291 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3292 BUG();
3293
3294 return 0;
3295 }
3296
3297 cpumask_set_cpu(cpu, nohz.cpu_mask);
3298
3299 /* time for ilb owner also to sleep */
3300 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
3301 if (atomic_read(&nohz.load_balancer) == cpu)
3302 atomic_set(&nohz.load_balancer, -1);
3303 return 0;
3304 }
3305
3306 if (atomic_read(&nohz.load_balancer) == -1) {
3307 /* make me the ilb owner */
3308 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3309 return 1;
3310 } else if (atomic_read(&nohz.load_balancer) == cpu) {
3311 int new_ilb;
3312
3313 if (!(sched_smt_power_savings ||
3314 sched_mc_power_savings))
3315 return 1;
3316 /*
3317 * Check to see if there is a more power-efficient
3318 * ilb.
3319 */
3320 new_ilb = find_new_ilb(cpu);
3321 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3322 atomic_set(&nohz.load_balancer, -1);
3323 resched_cpu(new_ilb);
3324 return 0;
3325 }
3326 return 1;
3327 }
3328 } else {
3329 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
3330 return 0;
3331
3332 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3333
3334 if (atomic_read(&nohz.load_balancer) == cpu)
3335 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3336 BUG();
3337 }
1952 return 0; 3338 return 0;
1953} 3339}
3340#endif
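
The owner hand-off in select_nohz_load_balancer() rests on atomic compare-and-swap against the sentinel -1. A minimal user-space sketch of that election protocol, with C11 atomics standing in for the kernel's atomic_cmpxchg (an editor's illustration, not part of the patch):

#include <stdio.h>
#include <stdatomic.h>

static atomic_int load_balancer = ATOMIC_VAR_INIT(-1);

static int try_claim(int cpu)
{
	int expected = -1;

	/* Become the owner only if nobody else already is. */
	return atomic_compare_exchange_strong(&load_balancer, &expected, cpu);
}

static void release(int cpu)
{
	int expected = cpu;

	/* Only the current owner may clear the slot. */
	atomic_compare_exchange_strong(&load_balancer, &expected, -1);
}

int main(void)
{
	printf("cpu2 claims: %d\n", try_claim(2));	/* 1: now the owner */
	printf("cpu5 claims: %d\n", try_claim(5));	/* 0: already taken */
	release(2);
	printf("cpu5 claims: %d\n", try_claim(5));	/* 1: ownership handed over */
	return 0;
}
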
3341
3342static DEFINE_SPINLOCK(balancing);
3343
3344/*
3345 * It checks each scheduling domain to see if it is due to be balanced,
3346 * and initiates a balancing operation if so.
3347 *
3348 * Balancing parameters are set up in arch_init_sched_domains.
3349 */
3350static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3351{
3352 int balance = 1;
3353 struct rq *rq = cpu_rq(cpu);
3354 unsigned long interval;
3355 struct sched_domain *sd;
3356 /* Earliest time when we have to do rebalance again */
3357 unsigned long next_balance = jiffies + 60*HZ;
3358 int update_next_balance = 0;
3359 int need_serialize;
3360
3361 for_each_domain(cpu, sd) {
3362 if (!(sd->flags & SD_LOAD_BALANCE))
3363 continue;
3364
3365 interval = sd->balance_interval;
3366 if (idle != CPU_IDLE)
3367 interval *= sd->busy_factor;
3368
3369 /* scale ms to jiffies */
3370 interval = msecs_to_jiffies(interval);
3371 if (unlikely(!interval))
3372 interval = 1;
3373 if (interval > HZ*NR_CPUS/10)
3374 interval = HZ*NR_CPUS/10;
3375
3376 need_serialize = sd->flags & SD_SERIALIZE;
3377
3378 if (need_serialize) {
3379 if (!spin_trylock(&balancing))
3380 goto out;
3381 }
3382
3383 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3384 if (load_balance(cpu, rq, sd, idle, &balance)) {
3385 /*
3386 * We've pulled tasks over so either we're no
3387 * longer idle, or one of our SMT siblings is
3388 * not idle.
3389 */
3390 idle = CPU_NOT_IDLE;
3391 }
3392 sd->last_balance = jiffies;
3393 }
3394 if (need_serialize)
3395 spin_unlock(&balancing);
3396out:
3397 if (time_after(next_balance, sd->last_balance + interval)) {
3398 next_balance = sd->last_balance + interval;
3399 update_next_balance = 1;
3400 }
3401
3402 /*
3403 * Stop the load balance at this level. There is another
3404 * CPU in our sched group which is doing load balancing more
3405 * actively.
3406 */
3407 if (!balance)
3408 break;
3409 }
3410
3411 /*
3412 * next_balance will be updated only when there is a need.
3413 * When the cpu is attached to null domain for ex, it will not be
3414 * updated.
3415 */
3416 if (likely(update_next_balance))
3417 rq->next_balance = next_balance;
3418}
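
The interval handling at the top of the loop in rebalance_domains() can be followed with a small sketch; HZ, NR_CPUS and the values below are assumptions chosen only to show the busy-factor stretch, the ms-to-jiffies conversion and the clamping:

#include <stdio.h>

#define HZ 250
#define NR_CPUS 8

static unsigned long msecs_to_jiffies_approx(unsigned long ms)
{
	return (ms * HZ + 999) / 1000;	/* round up, as the kernel does */
}

int main(void)
{
	unsigned long interval = 64;	/* sd->balance_interval in ms */
	int busy_factor = 32;
	int cpu_is_idle = 0;

	if (!cpu_is_idle)
		interval *= busy_factor;	/* balance less often when busy */

	interval = msecs_to_jiffies_approx(interval);
	if (!interval)
		interval = 1;
	if (interval > HZ * NR_CPUS / 10)
		interval = HZ * NR_CPUS / 10;	/* clamp the upper bound */

	printf("next balance in %lu jiffies\n", interval);
	return 0;
}
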
3419
3420/*
3421 * run_rebalance_domains is triggered when needed from the scheduler tick.
3422 * In the CONFIG_NO_HZ case, the idle load balance owner will do the
3423 * rebalancing for all the cpus whose scheduler ticks are stopped.
3424 */
3425static void run_rebalance_domains(struct softirq_action *h)
3426{
3427 int this_cpu = smp_processor_id();
3428 struct rq *this_rq = cpu_rq(this_cpu);
3429 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3430 CPU_IDLE : CPU_NOT_IDLE;
3431
3432 rebalance_domains(this_cpu, idle);
3433
3434#ifdef CONFIG_NO_HZ
3435 /*
3436 * If this cpu is the owner for idle load balancing, then do the
3437 * balancing on behalf of the other idle cpus whose ticks are
3438 * stopped.
3439 */
3440 if (this_rq->idle_at_tick &&
3441 atomic_read(&nohz.load_balancer) == this_cpu) {
3442 struct rq *rq;
3443 int balance_cpu;
3444
3445 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3446 if (balance_cpu == this_cpu)
3447 continue;
3448
3449 /*
3450 * If this cpu gets work to do, stop the load balancing
3451 * work being done for other cpus. Next load
3452 * balancing owner will pick it up.
3453 */
3454 if (need_resched())
3455 break;
3456
3457 rebalance_domains(balance_cpu, CPU_IDLE);
3458
3459 rq = cpu_rq(balance_cpu);
3460 if (time_after(this_rq->next_balance, rq->next_balance))
3461 this_rq->next_balance = rq->next_balance;
3462 }
3463 }
3464#endif
3465}
3466
3467static inline int on_null_domain(int cpu)
3468{
3469 return !rcu_dereference(cpu_rq(cpu)->sd);
3470}
3471
3472/*
3473 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3474 *
3475 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3476 * idle load balancing owner or decide to stop the periodic load balancing,
3477 * if the whole system is idle.
3478 */
3479static inline void trigger_load_balance(struct rq *rq, int cpu)
3480{
3481#ifdef CONFIG_NO_HZ
3482 /*
3483 * If we were in the nohz mode recently and busy at the current
3484	 * scheduler tick, then check if we need to nominate a new idle
3485 * load balancer.
3486 */
3487 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3488 rq->in_nohz_recently = 0;
3489
3490 if (atomic_read(&nohz.load_balancer) == cpu) {
3491 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3492 atomic_set(&nohz.load_balancer, -1);
3493 }
3494
3495 if (atomic_read(&nohz.load_balancer) == -1) {
3496 int ilb = find_new_ilb(cpu);
3497
3498 if (ilb < nr_cpu_ids)
3499 resched_cpu(ilb);
3500 }
3501 }
3502
3503 /*
3504 * If this cpu is idle and doing idle load balancing for all the
3505 * cpus with ticks stopped, is it time for that to stop?
3506 */
3507 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3508 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3509 resched_cpu(cpu);
3510 return;
3511 }
3512
3513 /*
3514 * If this cpu is idle and the idle load balancing is done by
3515	 * someone else, then there is no need to raise the SCHED_SOFTIRQ.
3516 */
3517 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3518 cpumask_test_cpu(cpu, nohz.cpu_mask))
3519 return;
3520#endif
3521 /* Don't need to rebalance while attached to NULL domain */
3522 if (time_after_eq(jiffies, rq->next_balance) &&
3523 likely(!on_null_domain(cpu)))
3524 raise_softirq(SCHED_SOFTIRQ);
3525}
1954 3526
1955static void rq_online_fair(struct rq *rq) 3527static void rq_online_fair(struct rq *rq)
1956{ 3528{
@@ -1962,6 +3534,15 @@ static void rq_offline_fair(struct rq *rq)
1962 update_sysctl(); 3534 update_sysctl();
1963} 3535}
1964 3536
3537#else /* CONFIG_SMP */
3538
3539/*
3540 * on UP we do not need to balance between CPUs:
3541 */
3542static inline void idle_balance(int cpu, struct rq *rq)
3543{
3544}
3545
1965#endif /* CONFIG_SMP */ 3546#endif /* CONFIG_SMP */
1966 3547
1967/* 3548/*
@@ -2076,7 +3657,7 @@ static void moved_group_fair(struct task_struct *p, int on_rq)
2076} 3657}
2077#endif 3658#endif
2078 3659
2079unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 3660static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
2080{ 3661{
2081 struct sched_entity *se = &task->se; 3662 struct sched_entity *se = &task->se;
2082 unsigned int rr_interval = 0; 3663 unsigned int rr_interval = 0;
@@ -2108,8 +3689,6 @@ static const struct sched_class fair_sched_class = {
2108#ifdef CONFIG_SMP 3689#ifdef CONFIG_SMP
2109 .select_task_rq = select_task_rq_fair, 3690 .select_task_rq = select_task_rq_fair,
2110 3691
2111 .load_balance = load_balance_fair,
2112 .move_one_task = move_one_task_fair,
2113 .rq_online = rq_online_fair, 3692 .rq_online = rq_online_fair,
2114 .rq_offline = rq_offline_fair, 3693 .rq_offline = rq_offline_fair,
2115 3694
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 5f93b570d383..a8a6d8a50947 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -44,24 +44,6 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
44{ 44{
45} 45}
46 46
47#ifdef CONFIG_SMP
48static unsigned long
49load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
50 unsigned long max_load_move,
51 struct sched_domain *sd, enum cpu_idle_type idle,
52 int *all_pinned, int *this_best_prio)
53{
54 return 0;
55}
56
57static int
58move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
59 struct sched_domain *sd, enum cpu_idle_type idle)
60{
61 return 0;
62}
63#endif
64
65static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 47static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
66{ 48{
67} 49}
@@ -97,7 +79,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 79 check_preempt_curr(rq, p, 0);
98} 80}
99 81
100unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) 82static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
101{ 83{
102 return 0; 84 return 0;
103} 85}
@@ -119,9 +101,6 @@ static const struct sched_class idle_sched_class = {
119 101
120#ifdef CONFIG_SMP 102#ifdef CONFIG_SMP
121 .select_task_rq = select_task_rq_idle, 103 .select_task_rq = select_task_rq_idle,
122
123 .load_balance = load_balance_idle,
124 .move_one_task = move_one_task_idle,
125#endif 104#endif
126 105
127 .set_curr_task = set_curr_task_idle, 106 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f48328ac216f..bf3e38fdbe6d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -194,17 +194,20 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
194 return rt_se->my_q; 194 return rt_se->my_q;
195} 195}
196 196
197static void enqueue_rt_entity(struct sched_rt_entity *rt_se); 197static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
198static void dequeue_rt_entity(struct sched_rt_entity *rt_se); 198static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
199 199
200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
201{ 201{
202 int this_cpu = smp_processor_id();
202 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 203 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
203 struct sched_rt_entity *rt_se = rt_rq->rt_se; 204 struct sched_rt_entity *rt_se;
205
206 rt_se = rt_rq->tg->rt_se[this_cpu];
204 207
205 if (rt_rq->rt_nr_running) { 208 if (rt_rq->rt_nr_running) {
206 if (rt_se && !on_rt_rq(rt_se)) 209 if (rt_se && !on_rt_rq(rt_se))
207 enqueue_rt_entity(rt_se); 210 enqueue_rt_entity(rt_se, false);
208 if (rt_rq->highest_prio.curr < curr->prio) 211 if (rt_rq->highest_prio.curr < curr->prio)
209 resched_task(curr); 212 resched_task(curr);
210 } 213 }
@@ -212,7 +215,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
212 215
213static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 216static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
214{ 217{
215 struct sched_rt_entity *rt_se = rt_rq->rt_se; 218 int this_cpu = smp_processor_id();
219 struct sched_rt_entity *rt_se;
220
221 rt_se = rt_rq->tg->rt_se[this_cpu];
216 222
217 if (rt_se && on_rt_rq(rt_se)) 223 if (rt_se && on_rt_rq(rt_se))
218 dequeue_rt_entity(rt_se); 224 dequeue_rt_entity(rt_se);
@@ -803,7 +809,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
803 dec_rt_group(rt_se, rt_rq); 809 dec_rt_group(rt_se, rt_rq);
804} 810}
805 811
806static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) 812static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
807{ 813{
808 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 814 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
809 struct rt_prio_array *array = &rt_rq->active; 815 struct rt_prio_array *array = &rt_rq->active;
@@ -819,7 +825,10 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
819 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 825 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
820 return; 826 return;
821 827
822 list_add_tail(&rt_se->run_list, queue); 828 if (head)
829 list_add(&rt_se->run_list, queue);
830 else
831 list_add_tail(&rt_se->run_list, queue);
823 __set_bit(rt_se_prio(rt_se), array->bitmap); 832 __set_bit(rt_se_prio(rt_se), array->bitmap);
824 833
825 inc_rt_tasks(rt_se, rt_rq); 834 inc_rt_tasks(rt_se, rt_rq);
@@ -856,11 +865,11 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
856 } 865 }
857} 866}
858 867
859static void enqueue_rt_entity(struct sched_rt_entity *rt_se) 868static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
860{ 869{
861 dequeue_rt_stack(rt_se); 870 dequeue_rt_stack(rt_se);
862 for_each_sched_rt_entity(rt_se) 871 for_each_sched_rt_entity(rt_se)
863 __enqueue_rt_entity(rt_se); 872 __enqueue_rt_entity(rt_se, head);
864} 873}
865 874
866static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 875static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
@@ -871,21 +880,22 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
871 struct rt_rq *rt_rq = group_rt_rq(rt_se); 880 struct rt_rq *rt_rq = group_rt_rq(rt_se);
872 881
873 if (rt_rq && rt_rq->rt_nr_running) 882 if (rt_rq && rt_rq->rt_nr_running)
874 __enqueue_rt_entity(rt_se); 883 __enqueue_rt_entity(rt_se, false);
875 } 884 }
876} 885}
877 886
878/* 887/*
879 * Adding/removing a task to/from a priority array: 888 * Adding/removing a task to/from a priority array:
880 */ 889 */
881static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) 890static void
891enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head)
882{ 892{
883 struct sched_rt_entity *rt_se = &p->rt; 893 struct sched_rt_entity *rt_se = &p->rt;
884 894
885 if (wakeup) 895 if (wakeup)
886 rt_se->timeout = 0; 896 rt_se->timeout = 0;
887 897
888 enqueue_rt_entity(rt_se); 898 enqueue_rt_entity(rt_se, head);
889 899
890 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
891 enqueue_pushable_task(rq, p); 901 enqueue_pushable_task(rq, p);
@@ -1481,24 +1491,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1481 push_rt_tasks(rq); 1491 push_rt_tasks(rq);
1482} 1492}
1483 1493
1484static unsigned long
1485load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1486 unsigned long max_load_move,
1487 struct sched_domain *sd, enum cpu_idle_type idle,
1488 int *all_pinned, int *this_best_prio)
1489{
1490 /* don't touch RT tasks */
1491 return 0;
1492}
1493
1494static int
1495move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1496 struct sched_domain *sd, enum cpu_idle_type idle)
1497{
1498 /* don't touch RT tasks */
1499 return 0;
1500}
1501
1502static void set_cpus_allowed_rt(struct task_struct *p, 1494static void set_cpus_allowed_rt(struct task_struct *p,
1503 const struct cpumask *new_mask) 1495 const struct cpumask *new_mask)
1504{ 1496{
@@ -1721,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq)
1721 dequeue_pushable_task(rq, p); 1713 dequeue_pushable_task(rq, p);
1722} 1714}
1723 1715
1724unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) 1716static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1725{ 1717{
1726 /* 1718 /*
1727 * Time slice is 0 for SCHED_FIFO tasks 1719 * Time slice is 0 for SCHED_FIFO tasks
@@ -1746,8 +1738,6 @@ static const struct sched_class rt_sched_class = {
1746#ifdef CONFIG_SMP 1738#ifdef CONFIG_SMP
1747 .select_task_rq = select_task_rq_rt, 1739 .select_task_rq = select_task_rq_rt,
1748 1740
1749 .load_balance = load_balance_rt,
1750 .move_one_task = move_one_task_rt,
1751 .set_cpus_allowed = set_cpus_allowed_rt, 1741 .set_cpus_allowed = set_cpus_allowed_rt,
1752 .rq_online = rq_online_rt, 1742 .rq_online = rq_online_rt,
1753 .rq_offline = rq_offline_rt, 1743 .rq_offline = rq_offline_rt,
diff --git a/kernel/sys.c b/kernel/sys.c
index 26a6b73a6b85..f75bf0936f47 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -569,11 +569,6 @@ static int set_user(struct cred *new)
569 if (!new_user) 569 if (!new_user)
570 return -EAGAIN; 570 return -EAGAIN;
571 571
572 if (!task_can_switch_user(new_user, current)) {
573 free_uid(new_user);
574 return -EINVAL;
575 }
576
577 if (atomic_read(&new_user->processes) >= 572 if (atomic_read(&new_user->processes) >=
578 current->signal->rlim[RLIMIT_NPROC].rlim_cur && 573 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
579 new_user != INIT_USER) { 574 new_user != INIT_USER) {
diff --git a/kernel/user.c b/kernel/user.c
index 46d0165ca70c..766467b3bcb7 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -56,9 +56,6 @@ struct user_struct root_user = {
56 .sigpending = ATOMIC_INIT(0), 56 .sigpending = ATOMIC_INIT(0),
57 .locked_shm = 0, 57 .locked_shm = 0,
58 .user_ns = &init_user_ns, 58 .user_ns = &init_user_ns,
59#ifdef CONFIG_USER_SCHED
60 .tg = &init_task_group,
61#endif
62}; 59};
63 60
64/* 61/*
@@ -75,268 +72,6 @@ static void uid_hash_remove(struct user_struct *up)
75 put_user_ns(up->user_ns); 72 put_user_ns(up->user_ns);
76} 73}
77 74
78#ifdef CONFIG_USER_SCHED
79
80static void sched_destroy_user(struct user_struct *up)
81{
82 sched_destroy_group(up->tg);
83}
84
85static int sched_create_user(struct user_struct *up)
86{
87 int rc = 0;
88
89 up->tg = sched_create_group(&root_task_group);
90 if (IS_ERR(up->tg))
91 rc = -ENOMEM;
92
93 set_tg_uid(up);
94
95 return rc;
96}
97
98#else /* CONFIG_USER_SCHED */
99
100static void sched_destroy_user(struct user_struct *up) { }
101static int sched_create_user(struct user_struct *up) { return 0; }
102
103#endif /* CONFIG_USER_SCHED */
104
105#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
106
107static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
108{
109 struct user_struct *user;
110 struct hlist_node *h;
111
112 hlist_for_each_entry(user, h, hashent, uidhash_node) {
113 if (user->uid == uid) {
114 /* possibly resurrect an "almost deleted" object */
115 if (atomic_inc_return(&user->__count) == 1)
116 cancel_delayed_work(&user->work);
117 return user;
118 }
119 }
120
121 return NULL;
122}
123
124static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
125static DEFINE_MUTEX(uids_mutex);
126
127static inline void uids_mutex_lock(void)
128{
129 mutex_lock(&uids_mutex);
130}
131
132static inline void uids_mutex_unlock(void)
133{
134 mutex_unlock(&uids_mutex);
135}
136
137/* uid directory attributes */
138#ifdef CONFIG_FAIR_GROUP_SCHED
139static ssize_t cpu_shares_show(struct kobject *kobj,
140 struct kobj_attribute *attr,
141 char *buf)
142{
143 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
144
145 return sprintf(buf, "%lu\n", sched_group_shares(up->tg));
146}
147
148static ssize_t cpu_shares_store(struct kobject *kobj,
149 struct kobj_attribute *attr,
150 const char *buf, size_t size)
151{
152 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
153 unsigned long shares;
154 int rc;
155
156 sscanf(buf, "%lu", &shares);
157
158 rc = sched_group_set_shares(up->tg, shares);
159
160 return (rc ? rc : size);
161}
162
163static struct kobj_attribute cpu_share_attr =
164 __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
165#endif
166
167#ifdef CONFIG_RT_GROUP_SCHED
168static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
169 struct kobj_attribute *attr,
170 char *buf)
171{
172 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
173
174 return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
175}
176
177static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
178 struct kobj_attribute *attr,
179 const char *buf, size_t size)
180{
181 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
182 unsigned long rt_runtime;
183 int rc;
184
185 sscanf(buf, "%ld", &rt_runtime);
186
187 rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
188
189 return (rc ? rc : size);
190}
191
192static struct kobj_attribute cpu_rt_runtime_attr =
193 __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
194
195static ssize_t cpu_rt_period_show(struct kobject *kobj,
196 struct kobj_attribute *attr,
197 char *buf)
198{
199 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
200
201 return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg));
202}
203
204static ssize_t cpu_rt_period_store(struct kobject *kobj,
205 struct kobj_attribute *attr,
206 const char *buf, size_t size)
207{
208 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
209 unsigned long rt_period;
210 int rc;
211
212 sscanf(buf, "%lu", &rt_period);
213
214 rc = sched_group_set_rt_period(up->tg, rt_period);
215
216 return (rc ? rc : size);
217}
218
219static struct kobj_attribute cpu_rt_period_attr =
220 __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store);
221#endif
222
223/* default attributes per uid directory */
224static struct attribute *uids_attributes[] = {
225#ifdef CONFIG_FAIR_GROUP_SCHED
226 &cpu_share_attr.attr,
227#endif
228#ifdef CONFIG_RT_GROUP_SCHED
229 &cpu_rt_runtime_attr.attr,
230 &cpu_rt_period_attr.attr,
231#endif
232 NULL
233};
234
235/* the lifetime of user_struct is not managed by the core (now) */
236static void uids_release(struct kobject *kobj)
237{
238 return;
239}
240
241static struct kobj_type uids_ktype = {
242 .sysfs_ops = &kobj_sysfs_ops,
243 .default_attrs = uids_attributes,
244 .release = uids_release,
245};
246
247/*
248 * Create /sys/kernel/uids/<uid>/cpu_share file for this user
249 * We do not create this file for users in a user namespace (until
250 * sysfs tagging is implemented).
251 *
252 * See Documentation/scheduler/sched-design-CFS.txt for ramifications.
253 */
254static int uids_user_create(struct user_struct *up)
255{
256 struct kobject *kobj = &up->kobj;
257 int error;
258
259 memset(kobj, 0, sizeof(struct kobject));
260 if (up->user_ns != &init_user_ns)
261 return 0;
262 kobj->kset = uids_kset;
263 error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
264 if (error) {
265 kobject_put(kobj);
266 goto done;
267 }
268
269 kobject_uevent(kobj, KOBJ_ADD);
270done:
271 return error;
272}
273
274/* create these entries in sysfs:
275 * "/sys/kernel/uids" directory
276 * "/sys/kernel/uids/0" directory (for root user)
277 * "/sys/kernel/uids/0/cpu_share" file (for root user)
278 */
279int __init uids_sysfs_init(void)
280{
281 uids_kset = kset_create_and_add("uids", NULL, kernel_kobj);
282 if (!uids_kset)
283 return -ENOMEM;
284
285 return uids_user_create(&root_user);
286}
287
288/* delayed work function to remove sysfs directory for a user and free up
289 * corresponding structures.
290 */
291static void cleanup_user_struct(struct work_struct *w)
292{
293 struct user_struct *up = container_of(w, struct user_struct, work.work);
294 unsigned long flags;
295 int remove_user = 0;
296
297 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
298 * atomic.
299 */
300 uids_mutex_lock();
301
302 spin_lock_irqsave(&uidhash_lock, flags);
303 if (atomic_read(&up->__count) == 0) {
304 uid_hash_remove(up);
305 remove_user = 1;
306 }
307 spin_unlock_irqrestore(&uidhash_lock, flags);
308
309 if (!remove_user)
310 goto done;
311
312 if (up->user_ns == &init_user_ns) {
313 kobject_uevent(&up->kobj, KOBJ_REMOVE);
314 kobject_del(&up->kobj);
315 kobject_put(&up->kobj);
316 }
317
318 sched_destroy_user(up);
319 key_put(up->uid_keyring);
320 key_put(up->session_keyring);
321 kmem_cache_free(uid_cachep, up);
322
323done:
324 uids_mutex_unlock();
325}
326
327/* IRQs are disabled and uidhash_lock is held upon function entry.
328 * IRQ state (as stored in flags) is restored and uidhash_lock released
329 * upon function exit.
330 */
331static void free_user(struct user_struct *up, unsigned long flags)
332{
333 INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
334 schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
335 spin_unlock_irqrestore(&uidhash_lock, flags);
336}
337
338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
339
340static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) 75static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
341{ 76{
342 struct user_struct *user; 77 struct user_struct *user;
@@ -352,11 +87,6 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
352 return NULL; 87 return NULL;
353} 88}
354 89
355int uids_sysfs_init(void) { return 0; }
356static inline int uids_user_create(struct user_struct *up) { return 0; }
357static inline void uids_mutex_lock(void) { }
358static inline void uids_mutex_unlock(void) { }
359
360/* IRQs are disabled and uidhash_lock is held upon function entry. 90/* IRQs are disabled and uidhash_lock is held upon function entry.
361 * IRQ state (as stored in flags) is restored and uidhash_lock released 91 * IRQ state (as stored in flags) is restored and uidhash_lock released
362 * upon function exit. 92 * upon function exit.
@@ -365,32 +95,11 @@ static void free_user(struct user_struct *up, unsigned long flags)
365{ 95{
366 uid_hash_remove(up); 96 uid_hash_remove(up);
367 spin_unlock_irqrestore(&uidhash_lock, flags); 97 spin_unlock_irqrestore(&uidhash_lock, flags);
368 sched_destroy_user(up);
369 key_put(up->uid_keyring); 98 key_put(up->uid_keyring);
370 key_put(up->session_keyring); 99 key_put(up->session_keyring);
371 kmem_cache_free(uid_cachep, up); 100 kmem_cache_free(uid_cachep, up);
372} 101}
373 102
374#endif
375
376#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
377/*
378 * We need to check if a setuid can take place. This function should be called
379 * before successfully completing the setuid.
380 */
381int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
382{
383
384 return sched_rt_can_attach(up->tg, tsk);
385
386}
387#else
388int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
389{
390 return 1;
391}
392#endif
393
394/* 103/*
395 * Locate the user_struct for the passed UID. If found, take a ref on it. The 104 * Locate the user_struct for the passed UID. If found, take a ref on it. The
396 * caller must undo that ref with free_uid(). 105 * caller must undo that ref with free_uid().
@@ -431,8 +140,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
431 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() 140 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
432 * atomic. 141 * atomic.
433 */ 142 */
434 uids_mutex_lock();
435
436 spin_lock_irq(&uidhash_lock); 143 spin_lock_irq(&uidhash_lock);
437 up = uid_hash_find(uid, hashent); 144 up = uid_hash_find(uid, hashent);
438 spin_unlock_irq(&uidhash_lock); 145 spin_unlock_irq(&uidhash_lock);
@@ -445,14 +152,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
445 new->uid = uid; 152 new->uid = uid;
446 atomic_set(&new->__count, 1); 153 atomic_set(&new->__count, 1);
447 154
448 if (sched_create_user(new) < 0)
449 goto out_free_user;
450
451 new->user_ns = get_user_ns(ns); 155 new->user_ns = get_user_ns(ns);
452 156
453 if (uids_user_create(new))
454 goto out_destoy_sched;
455
456 /* 157 /*
457 * Before adding this, check whether we raced 158 * Before adding this, check whether we raced
458 * on adding the same user already.. 159 * on adding the same user already..
@@ -475,17 +176,11 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
475 spin_unlock_irq(&uidhash_lock); 176 spin_unlock_irq(&uidhash_lock);
476 } 177 }
477 178
478 uids_mutex_unlock();
479
480 return up; 179 return up;
481 180
482out_destoy_sched:
483 sched_destroy_user(new);
484 put_user_ns(new->user_ns); 181 put_user_ns(new->user_ns);
485out_free_user:
486 kmem_cache_free(uid_cachep, new); 182 kmem_cache_free(uid_cachep, new);
487out_unlock: 183out_unlock:
488 uids_mutex_unlock();
489 return NULL; 184 return NULL;
490} 185}
491 186