-rw-r--r--   Documentation/feature-removal-schedule.txt |   15
-rw-r--r--   include/linux/kernel.h                      |    5
-rw-r--r--   include/linux/sched.h                       |   25
-rw-r--r--   init/Kconfig                                |   81
-rw-r--r--   kernel/ksysfs.c                             |    8
-rw-r--r--   kernel/sched.c                              | 2088
-rw-r--r--   kernel/sched_cpupri.c                       |    4
-rw-r--r--   kernel/sched_fair.c                         | 1678
-rw-r--r--   kernel/sched_idletask.c                     |   23
-rw-r--r--   kernel/sched_rt.c                           |   54
-rw-r--r--   kernel/sys.c                                |    5
-rw-r--r--   kernel/user.c                               |  305
12 files changed, 1776 insertions(+), 2515 deletions(-)
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 0a46833c1b76..dbc12067872d 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -6,21 +6,6 @@ be removed from this file.
6 6
7--------------------------- 7---------------------------
8 8
9What: USER_SCHED
10When: 2.6.34
11
12Why: USER_SCHED was implemented as a proof of concept for group scheduling.
13 The effect of USER_SCHED can already be achieved from userspace with
14 the help of libcgroup. The removal of USER_SCHED will also simplify
15 the scheduler code with the removal of one major ifdef. There are also
16 issues USER_SCHED has with USER_NS. A decision was taken not to fix
17 those and instead remove USER_SCHED. Also new group scheduling
18 features will not be implemented for USER_SCHED.
19
20Who: Dhaval Giani <dhaval@linux.vnet.ibm.com>
21
22---------------------------
23
24What: PRISM54 9What: PRISM54
25When: 2.6.34 10When: 2.6.34
26 11
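The removal entry above names libcgroup as the userspace replacement for per-uid grouping. Purely as an illustrative sketch under assumed paths (the cgroup v1 cpu controller mounted at /sys/fs/cgroup/cpu; nothing below is part of this patch), equivalent per-user groups can be created directly through cgroupfs:

/* Sketch: put a task into a per-uid CPU group via the cgroup v1 cpu controller.
 * Assumes the controller is mounted at /sys/fs/cgroup/cpu; adjust to taste.
 */
#include <errno.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>

static int put_task_in_uid_group(unsigned int uid, pid_t pid)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/fs/cgroup/cpu/uid_%u", uid);
	if (mkdir(path, 0755) && errno != EEXIST)
		return -errno;

	/* cpu.shares defaults to 1024, giving each uid group equal weight;
	 * write a different value there to skew the split. */

	snprintf(path, sizeof(path), "/sys/fs/cgroup/cpu/uid_%u/tasks", uid);
	f = fopen(path, "w");
	if (!f)
		return -errno;
	fprintf(f, "%ld\n", (long)pid);
	return fclose(f) ? -errno : 0;
}

libcgroup's cgcreate/cgclassify tools wrap the same filesystem operations.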
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 328bca609b9b..1221d2331a6d 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -124,7 +124,7 @@ extern int _cond_resched(void);
124#endif 124#endif
125 125
126#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 126#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
127 void __might_sleep(char *file, int line, int preempt_offset); 127 void __might_sleep(const char *file, int line, int preempt_offset);
128/** 128/**
129 * might_sleep - annotation for functions that can sleep 129 * might_sleep - annotation for functions that can sleep
130 * 130 *
@@ -138,7 +138,8 @@ extern int _cond_resched(void);
138# define might_sleep() \ 138# define might_sleep() \
139 do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) 139 do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
140#else 140#else
141 static inline void __might_sleep(char *file, int line, int preempt_offset) { } 141 static inline void __might_sleep(const char *file, int line,
142 int preempt_offset) { }
142# define might_sleep() do { might_resched(); } while (0) 143# define might_sleep() do { might_resched(); } while (0)
143#endif 144#endif
144 145
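For context on the __might_sleep() prototype change (char * becomes const char *): might_sleep() is the annotation a blocking function places at its top, and with CONFIG_DEBUG_SPINLOCK_SLEEP it expands to __might_sleep(__FILE__, __LINE__, 0) plus might_resched(), as the macro above shows. A minimal, illustrative use (not taken from this patch):

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/string.h>

/* Sketch only: a helper that may block announces the fact, so callers in
 * atomic context get flagged by the debug check. */
static void *copy_to_new_buffer(const void *src, size_t len)
{
	void *buf;

	might_sleep();			/* may be called only from process context */

	buf = kmalloc(len, GFP_KERNEL);	/* GFP_KERNEL allocations can sleep */
	if (buf)
		memcpy(buf, src, len);
	return buf;
}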
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 78efe7c485ac..b1b8d84c5805 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -740,14 +740,6 @@ struct user_struct {
740 uid_t uid; 740 uid_t uid;
741 struct user_namespace *user_ns; 741 struct user_namespace *user_ns;
742 742
743#ifdef CONFIG_USER_SCHED
744 struct task_group *tg;
745#ifdef CONFIG_SYSFS
746 struct kobject kobj;
747 struct delayed_work work;
748#endif
749#endif
750
751#ifdef CONFIG_PERF_EVENTS 743#ifdef CONFIG_PERF_EVENTS
752 atomic_long_t locked_vm; 744 atomic_long_t locked_vm;
753#endif 745#endif
@@ -1084,7 +1076,8 @@ struct sched_domain;
1084struct sched_class { 1076struct sched_class {
1085 const struct sched_class *next; 1077 const struct sched_class *next;
1086 1078
1087 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); 1079 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup,
1080 bool head);
1088 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); 1081 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
1089 void (*yield_task) (struct rq *rq); 1082 void (*yield_task) (struct rq *rq);
1090 1083
@@ -1096,14 +1089,6 @@ struct sched_class {
1096#ifdef CONFIG_SMP 1089#ifdef CONFIG_SMP
1097 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); 1090 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
1098 1091
1099 unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
1100 struct rq *busiest, unsigned long max_load_move,
1101 struct sched_domain *sd, enum cpu_idle_type idle,
1102 int *all_pinned, int *this_best_prio);
1103
1104 int (*move_one_task) (struct rq *this_rq, int this_cpu,
1105 struct rq *busiest, struct sched_domain *sd,
1106 enum cpu_idle_type idle);
1107 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); 1092 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
1108 void (*post_schedule) (struct rq *this_rq); 1093 void (*post_schedule) (struct rq *this_rq);
1109 void (*task_waking) (struct rq *this_rq, struct task_struct *task); 1094 void (*task_waking) (struct rq *this_rq, struct task_struct *task);
@@ -2517,13 +2502,9 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
2517 2502
2518extern void normalize_rt_tasks(void); 2503extern void normalize_rt_tasks(void);
2519 2504
2520#ifdef CONFIG_GROUP_SCHED 2505#ifdef CONFIG_CGROUP_SCHED
2521 2506
2522extern struct task_group init_task_group; 2507extern struct task_group init_task_group;
2523#ifdef CONFIG_USER_SCHED
2524extern struct task_group root_task_group;
2525extern void set_tg_uid(struct user_struct *user);
2526#endif
2527 2508
2528extern struct task_group *sched_create_group(struct task_group *parent); 2509extern struct task_group *sched_create_group(struct task_group *parent);
2529extern void sched_destroy_group(struct task_group *tg); 2510extern void sched_destroy_group(struct task_group *tg);
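The enqueue_task() hook above gains a bool head argument selecting head versus tail insertion into the class's runqueue. As a rough sketch of what an implementation might do with the flag (deliberately not the CFS or RT code touched by this series):

#include <linux/list.h>
#include <linux/types.h>

/* Toy run queue keyed off a plain list, for illustration only. */
struct toy_rq {
	struct list_head queue;
};

static void toy_enqueue_task(struct toy_rq *rq, struct list_head *node,
			     int wakeup, bool head)
{
	if (head)
		list_add(node, &rq->queue);		/* requeue at the head */
	else
		list_add_tail(node, &rq->queue);	/* normal case: tail */
}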
diff --git a/init/Kconfig b/init/Kconfig
index d95ca7cd5d45..ed9c19e02f93 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -445,57 +445,6 @@ config LOG_BUF_SHIFT
445config HAVE_UNSTABLE_SCHED_CLOCK 445config HAVE_UNSTABLE_SCHED_CLOCK
446 bool 446 bool
447 447
448config GROUP_SCHED
449 bool "Group CPU scheduler"
450 depends on EXPERIMENTAL
451 default n
452 help
453 This feature lets CPU scheduler recognize task groups and control CPU
454 bandwidth allocation to such task groups.
455 In order to create a group from arbitrary set of processes, use
456 CONFIG_CGROUPS. (See Control Group support.)
457
458config FAIR_GROUP_SCHED
459 bool "Group scheduling for SCHED_OTHER"
460 depends on GROUP_SCHED
461 default GROUP_SCHED
462
463config RT_GROUP_SCHED
464 bool "Group scheduling for SCHED_RR/FIFO"
465 depends on EXPERIMENTAL
466 depends on GROUP_SCHED
467 default n
468 help
469 This feature lets you explicitly allocate real CPU bandwidth
470 to users or control groups (depending on the "Basis for grouping tasks"
471 setting below. If enabled, it will also make it impossible to
472 schedule realtime tasks for non-root users until you allocate
473 realtime bandwidth for them.
474 See Documentation/scheduler/sched-rt-group.txt for more information.
475
476choice
477 depends on GROUP_SCHED
478 prompt "Basis for grouping tasks"
479 default USER_SCHED
480
481config USER_SCHED
482 bool "user id"
483 help
484 This option will choose userid as the basis for grouping
485 tasks, thus providing equal CPU bandwidth to each user.
486
487config CGROUP_SCHED
488 bool "Control groups"
489 depends on CGROUPS
490 help
491 This option allows you to create arbitrary task groups
492 using the "cgroup" pseudo filesystem and control
493 the cpu bandwidth allocated to each such task group.
494 Refer to Documentation/cgroups/cgroups.txt for more
495 information on "cgroup" pseudo filesystem.
496
497endchoice
498
499menuconfig CGROUPS 448menuconfig CGROUPS
500 boolean "Control Group support" 449 boolean "Control Group support"
501 help 450 help
@@ -616,6 +565,36 @@ config CGROUP_MEM_RES_CTLR_SWAP
616 Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page 565 Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
617 size is 4096bytes, 512k per 1Gbytes of swap. 566 size is 4096bytes, 512k per 1Gbytes of swap.
618 567
568menuconfig CGROUP_SCHED
569 bool "Group CPU scheduler"
570 depends on EXPERIMENTAL && CGROUPS
571 default n
572 help
573 This feature lets CPU scheduler recognize task groups and control CPU
574 bandwidth allocation to such task groups. It uses cgroups to group
575 tasks.
576
577if CGROUP_SCHED
578config FAIR_GROUP_SCHED
579 bool "Group scheduling for SCHED_OTHER"
580 depends on CGROUP_SCHED
581 default CGROUP_SCHED
582
583config RT_GROUP_SCHED
584 bool "Group scheduling for SCHED_RR/FIFO"
585 depends on EXPERIMENTAL
586 depends on CGROUP_SCHED
587 default n
588 help
589 This feature lets you explicitly allocate real CPU bandwidth
590 to users or control groups (depending on the "Basis for grouping tasks"
591 setting below. If enabled, it will also make it impossible to
592 schedule realtime tasks for non-root users until you allocate
593 realtime bandwidth for them.
594 See Documentation/scheduler/sched-rt-group.txt for more information.
595
596endif #CGROUP_SCHED
597
619endif # CGROUPS 598endif # CGROUPS
620 599
621config MM_OWNER 600config MM_OWNER
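With the USER_SCHED choice gone, group scheduling is selected purely through the cgroup-based options added above. An illustrative .config fragment (the values are an example, not mandated by the patch):

CONFIG_EXPERIMENTAL=y
CONFIG_CGROUPS=y
CONFIG_CGROUP_SCHED=y
CONFIG_FAIR_GROUP_SCHED=y
# CONFIG_RT_GROUP_SCHED is not set

FAIR_GROUP_SCHED defaults to the value of CGROUP_SCHED, so it comes on automatically once the menu is enabled; RT_GROUP_SCHED still defaults to n.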
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 3feaf5a74514..6b1ccc3f0205 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -197,16 +197,8 @@ static int __init ksysfs_init(void)
197 goto group_exit; 197 goto group_exit;
198 } 198 }
199 199
200 /* create the /sys/kernel/uids/ directory */
201 error = uids_sysfs_init();
202 if (error)
203 goto notes_exit;
204
205 return 0; 200 return 0;
206 201
207notes_exit:
208 if (notes_size > 0)
209 sysfs_remove_bin_file(kernel_kobj, &notes_attr);
210group_exit: 202group_exit:
211 sysfs_remove_group(kernel_kobj, &kernel_attr_group); 203 sysfs_remove_group(kernel_kobj, &kernel_attr_group);
212kset_exit: 204kset_exit:
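The hunk above removes the uids_sysfs_init() step together with its notes_exit unwind label, shrinking the usual reverse-order goto cleanup chain by one rung. For readers unfamiliar with the idiom, a generic sketch (hypothetical step_a/step_b helpers, not the ksysfs code):

static int __init example_init(void)
{
	int error;

	error = step_a();
	if (error)
		goto out;

	error = step_b();
	if (error)
		goto undo_a;		/* unwind completed steps in reverse order */

	return 0;

undo_a:
	undo_step_a();
out:
	return error;
}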
diff --git a/kernel/sched.c b/kernel/sched.c
index e3199df426e3..f96be9370b75 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -233,7 +233,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
233 */ 233 */
234static DEFINE_MUTEX(sched_domains_mutex); 234static DEFINE_MUTEX(sched_domains_mutex);
235 235
236#ifdef CONFIG_GROUP_SCHED 236#ifdef CONFIG_CGROUP_SCHED
237 237
238#include <linux/cgroup.h> 238#include <linux/cgroup.h>
239 239
@@ -243,13 +243,7 @@ static LIST_HEAD(task_groups);
243 243
244/* task group related information */ 244/* task group related information */
245struct task_group { 245struct task_group {
246#ifdef CONFIG_CGROUP_SCHED
247 struct cgroup_subsys_state css; 246 struct cgroup_subsys_state css;
248#endif
249
250#ifdef CONFIG_USER_SCHED
251 uid_t uid;
252#endif
253 247
254#ifdef CONFIG_FAIR_GROUP_SCHED 248#ifdef CONFIG_FAIR_GROUP_SCHED
255 /* schedulable entities of this group on each cpu */ 249 /* schedulable entities of this group on each cpu */
@@ -274,35 +268,7 @@ struct task_group {
274 struct list_head children; 268 struct list_head children;
275}; 269};
276 270
277#ifdef CONFIG_USER_SCHED
278
279/* Helper function to pass uid information to create_sched_user() */
280void set_tg_uid(struct user_struct *user)
281{
282 user->tg->uid = user->uid;
283}
284
285/*
286 * Root task group.
287 * Every UID task group (including init_task_group aka UID-0) will
288 * be a child to this group.
289 */
290struct task_group root_task_group;
291
292#ifdef CONFIG_FAIR_GROUP_SCHED
293/* Default task group's sched entity on each cpu */
294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
295/* Default task group's cfs_rq on each cpu */
296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
297#endif /* CONFIG_FAIR_GROUP_SCHED */
298
299#ifdef CONFIG_RT_GROUP_SCHED
300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
302#endif /* CONFIG_RT_GROUP_SCHED */
303#else /* !CONFIG_USER_SCHED */
304#define root_task_group init_task_group 271#define root_task_group init_task_group
305#endif /* CONFIG_USER_SCHED */
306 272
307/* task_group_lock serializes add/remove of task groups and also changes to 273/* task_group_lock serializes add/remove of task groups and also changes to
308 * a task group's cpu shares. 274 * a task group's cpu shares.
@@ -318,11 +284,7 @@ static int root_task_group_empty(void)
318} 284}
319#endif 285#endif
320 286
321#ifdef CONFIG_USER_SCHED
322# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
323#else /* !CONFIG_USER_SCHED */
324# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 287# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
325#endif /* CONFIG_USER_SCHED */
326 288
327/* 289/*
328 * A weight of 0 or 1 can cause arithmetics problems. 290 * A weight of 0 or 1 can cause arithmetics problems.
@@ -348,11 +310,7 @@ static inline struct task_group *task_group(struct task_struct *p)
348{ 310{
349 struct task_group *tg; 311 struct task_group *tg;
350 312
351#ifdef CONFIG_USER_SCHED 313#ifdef CONFIG_CGROUP_SCHED
352 rcu_read_lock();
353 tg = __task_cred(p)->user->tg;
354 rcu_read_unlock();
355#elif defined(CONFIG_CGROUP_SCHED)
356 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 314 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
357 struct task_group, css); 315 struct task_group, css);
358#else 316#else
@@ -383,7 +341,7 @@ static inline struct task_group *task_group(struct task_struct *p)
383 return NULL; 341 return NULL;
384} 342}
385 343
386#endif /* CONFIG_GROUP_SCHED */ 344#endif /* CONFIG_CGROUP_SCHED */
387 345
388/* CFS-related fields in a runqueue */ 346/* CFS-related fields in a runqueue */
389struct cfs_rq { 347struct cfs_rq {
@@ -478,7 +436,6 @@ struct rt_rq {
478 struct rq *rq; 436 struct rq *rq;
479 struct list_head leaf_rt_rq_list; 437 struct list_head leaf_rt_rq_list;
480 struct task_group *tg; 438 struct task_group *tg;
481 struct sched_rt_entity *rt_se;
482#endif 439#endif
483}; 440};
484 441
@@ -1390,32 +1347,6 @@ static const u32 prio_to_wmult[40] = {
1390 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1347 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1391}; 1348};
1392 1349
1393static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1394
1395/*
1396 * runqueue iterator, to support SMP load-balancing between different
1397 * scheduling classes, without having to expose their internal data
1398 * structures to the load-balancing proper:
1399 */
1400struct rq_iterator {
1401 void *arg;
1402 struct task_struct *(*start)(void *);
1403 struct task_struct *(*next)(void *);
1404};
1405
1406#ifdef CONFIG_SMP
1407static unsigned long
1408balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1409 unsigned long max_load_move, struct sched_domain *sd,
1410 enum cpu_idle_type idle, int *all_pinned,
1411 int *this_best_prio, struct rq_iterator *iterator);
1412
1413static int
1414iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1415 struct sched_domain *sd, enum cpu_idle_type idle,
1416 struct rq_iterator *iterator);
1417#endif
1418
1419/* Time spent by the tasks of the cpu accounting group executing in ... */ 1350/* Time spent by the tasks of the cpu accounting group executing in ... */
1420enum cpuacct_stat_index { 1351enum cpuacct_stat_index {
1421 CPUACCT_STAT_USER, /* ... user mode */ 1352 CPUACCT_STAT_USER, /* ... user mode */
@@ -1701,16 +1632,6 @@ static void update_shares(struct sched_domain *sd)
1701 } 1632 }
1702} 1633}
1703 1634
1704static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1705{
1706 if (root_task_group_empty())
1707 return;
1708
1709 raw_spin_unlock(&rq->lock);
1710 update_shares(sd);
1711 raw_spin_lock(&rq->lock);
1712}
1713
1714static void update_h_load(long cpu) 1635static void update_h_load(long cpu)
1715{ 1636{
1716 if (root_task_group_empty()) 1637 if (root_task_group_empty())
@@ -1725,10 +1646,6 @@ static inline void update_shares(struct sched_domain *sd)
1725{ 1646{
1726} 1647}
1727 1648
1728static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1729{
1730}
1731
1732#endif 1649#endif
1733 1650
1734#ifdef CONFIG_PREEMPT 1651#ifdef CONFIG_PREEMPT
@@ -1805,6 +1722,51 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1805 raw_spin_unlock(&busiest->lock); 1722 raw_spin_unlock(&busiest->lock);
1806 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1723 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1807} 1724}
1725
1726/*
1727 * double_rq_lock - safely lock two runqueues
1728 *
1729 * Note this does not disable interrupts like task_rq_lock,
1730 * you need to do so manually before calling.
1731 */
1732static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1733 __acquires(rq1->lock)
1734 __acquires(rq2->lock)
1735{
1736 BUG_ON(!irqs_disabled());
1737 if (rq1 == rq2) {
1738 raw_spin_lock(&rq1->lock);
1739 __acquire(rq2->lock); /* Fake it out ;) */
1740 } else {
1741 if (rq1 < rq2) {
1742 raw_spin_lock(&rq1->lock);
1743 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1744 } else {
1745 raw_spin_lock(&rq2->lock);
1746 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1747 }
1748 }
1749 update_rq_clock(rq1);
1750 update_rq_clock(rq2);
1751}
1752
1753/*
1754 * double_rq_unlock - safely unlock two runqueues
1755 *
1756 * Note this does not restore interrupts like task_rq_unlock,
1757 * you need to do so manually after calling.
1758 */
1759static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1760 __releases(rq1->lock)
1761 __releases(rq2->lock)
1762{
1763 raw_spin_unlock(&rq1->lock);
1764 if (rq1 != rq2)
1765 raw_spin_unlock(&rq2->lock);
1766 else
1767 __release(rq2->lock);
1768}
1769
1808#endif 1770#endif
1809 1771
1810#ifdef CONFIG_FAIR_GROUP_SCHED 1772#ifdef CONFIG_FAIR_GROUP_SCHED
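double_rq_lock()/double_rq_unlock() are moved up here unchanged. They avoid ABBA deadlocks by always taking the lower-addressed runqueue lock first, and they expect the caller to have disabled interrupts. The caller-side pattern, visible in load_balance() further down, is:

	unsigned long flags;

	/* Sketch of the calling pattern (cf. load_balance() below). */
	local_irq_save(flags);
	double_rq_lock(this_rq, busiest);
	/* ... migrate tasks between the two runqueues ... */
	double_rq_unlock(this_rq, busiest);
	local_irq_restore(flags);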
@@ -1834,18 +1796,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1834#endif 1796#endif
1835} 1797}
1836 1798
1837#include "sched_stats.h" 1799static const struct sched_class rt_sched_class;
1838#include "sched_idletask.c"
1839#include "sched_fair.c"
1840#include "sched_rt.c"
1841#ifdef CONFIG_SCHED_DEBUG
1842# include "sched_debug.c"
1843#endif
1844 1800
1845#define sched_class_highest (&rt_sched_class) 1801#define sched_class_highest (&rt_sched_class)
1846#define for_each_class(class) \ 1802#define for_each_class(class) \
1847 for (class = sched_class_highest; class; class = class->next) 1803 for (class = sched_class_highest; class; class = class->next)
1848 1804
1805#include "sched_stats.h"
1806
1849static void inc_nr_running(struct rq *rq) 1807static void inc_nr_running(struct rq *rq)
1850{ 1808{
1851 rq->nr_running++; 1809 rq->nr_running++;
@@ -1883,13 +1841,14 @@ static void update_avg(u64 *avg, u64 sample)
1883 *avg += diff >> 3; 1841 *avg += diff >> 3;
1884} 1842}
1885 1843
1886static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1844static void
1845enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1887{ 1846{
1888 if (wakeup) 1847 if (wakeup)
1889 p->se.start_runtime = p->se.sum_exec_runtime; 1848 p->se.start_runtime = p->se.sum_exec_runtime;
1890 1849
1891 sched_info_queued(p); 1850 sched_info_queued(p);
1892 p->sched_class->enqueue_task(rq, p, wakeup); 1851 p->sched_class->enqueue_task(rq, p, wakeup, head);
1893 p->se.on_rq = 1; 1852 p->se.on_rq = 1;
1894} 1853}
1895 1854
@@ -1912,6 +1871,37 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1912} 1871}
1913 1872
1914/* 1873/*
1874 * activate_task - move a task to the runqueue.
1875 */
1876static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1877{
1878 if (task_contributes_to_load(p))
1879 rq->nr_uninterruptible--;
1880
1881 enqueue_task(rq, p, wakeup, false);
1882 inc_nr_running(rq);
1883}
1884
1885/*
1886 * deactivate_task - remove a task from the runqueue.
1887 */
1888static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1889{
1890 if (task_contributes_to_load(p))
1891 rq->nr_uninterruptible++;
1892
1893 dequeue_task(rq, p, sleep);
1894 dec_nr_running(rq);
1895}
1896
1897#include "sched_idletask.c"
1898#include "sched_fair.c"
1899#include "sched_rt.c"
1900#ifdef CONFIG_SCHED_DEBUG
1901# include "sched_debug.c"
1902#endif
1903
1904/*
1915 * __normal_prio - return the priority that is based on the static prio 1905 * __normal_prio - return the priority that is based on the static prio
1916 */ 1906 */
1917static inline int __normal_prio(struct task_struct *p) 1907static inline int __normal_prio(struct task_struct *p)
@@ -1957,30 +1947,6 @@ static int effective_prio(struct task_struct *p)
1957 return p->prio; 1947 return p->prio;
1958} 1948}
1959 1949
1960/*
1961 * activate_task - move a task to the runqueue.
1962 */
1963static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1964{
1965 if (task_contributes_to_load(p))
1966 rq->nr_uninterruptible--;
1967
1968 enqueue_task(rq, p, wakeup);
1969 inc_nr_running(rq);
1970}
1971
1972/*
1973 * deactivate_task - remove a task from the runqueue.
1974 */
1975static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1976{
1977 if (task_contributes_to_load(p))
1978 rq->nr_uninterruptible++;
1979
1980 dequeue_task(rq, p, sleep);
1981 dec_nr_running(rq);
1982}
1983
1984/** 1950/**
1985 * task_curr - is this task currently executing on a CPU? 1951 * task_curr - is this task currently executing on a CPU?
1986 * @p: the task in question. 1952 * @p: the task in question.
@@ -3099,50 +3065,6 @@ static void update_cpu_load(struct rq *this_rq)
3099#ifdef CONFIG_SMP 3065#ifdef CONFIG_SMP
3100 3066
3101/* 3067/*
3102 * double_rq_lock - safely lock two runqueues
3103 *
3104 * Note this does not disable interrupts like task_rq_lock,
3105 * you need to do so manually before calling.
3106 */
3107static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3108 __acquires(rq1->lock)
3109 __acquires(rq2->lock)
3110{
3111 BUG_ON(!irqs_disabled());
3112 if (rq1 == rq2) {
3113 raw_spin_lock(&rq1->lock);
3114 __acquire(rq2->lock); /* Fake it out ;) */
3115 } else {
3116 if (rq1 < rq2) {
3117 raw_spin_lock(&rq1->lock);
3118 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3119 } else {
3120 raw_spin_lock(&rq2->lock);
3121 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3122 }
3123 }
3124 update_rq_clock(rq1);
3125 update_rq_clock(rq2);
3126}
3127
3128/*
3129 * double_rq_unlock - safely unlock two runqueues
3130 *
3131 * Note this does not restore interrupts like task_rq_unlock,
3132 * you need to do so manually after calling.
3133 */
3134static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3135 __releases(rq1->lock)
3136 __releases(rq2->lock)
3137{
3138 raw_spin_unlock(&rq1->lock);
3139 if (rq1 != rq2)
3140 raw_spin_unlock(&rq2->lock);
3141 else
3142 __release(rq2->lock);
3143}
3144
3145/*
3146 * sched_exec - execve() is a valuable balancing opportunity, because at 3068 * sched_exec - execve() is a valuable balancing opportunity, because at
3147 * this point the task has the smallest effective memory and cache footprint. 3069 * this point the task has the smallest effective memory and cache footprint.
3148 */ 3070 */
@@ -3190,1771 +3112,6 @@ again:
3190 task_rq_unlock(rq, &flags); 3112 task_rq_unlock(rq, &flags);
3191} 3113}
3192 3114
3193/*
3194 * pull_task - move a task from a remote runqueue to the local runqueue.
3195 * Both runqueues must be locked.
3196 */
3197static void pull_task(struct rq *src_rq, struct task_struct *p,
3198 struct rq *this_rq, int this_cpu)
3199{
3200 deactivate_task(src_rq, p, 0);
3201 set_task_cpu(p, this_cpu);
3202 activate_task(this_rq, p, 0);
3203 check_preempt_curr(this_rq, p, 0);
3204}
3205
3206/*
3207 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3208 */
3209static
3210int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3211 struct sched_domain *sd, enum cpu_idle_type idle,
3212 int *all_pinned)
3213{
3214 int tsk_cache_hot = 0;
3215 /*
3216 * We do not migrate tasks that are:
3217 * 1) running (obviously), or
3218 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3219 * 3) are cache-hot on their current CPU.
3220 */
3221 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3222 schedstat_inc(p, se.nr_failed_migrations_affine);
3223 return 0;
3224 }
3225 *all_pinned = 0;
3226
3227 if (task_running(rq, p)) {
3228 schedstat_inc(p, se.nr_failed_migrations_running);
3229 return 0;
3230 }
3231
3232 /*
3233 * Aggressive migration if:
3234 * 1) task is cache cold, or
3235 * 2) too many balance attempts have failed.
3236 */
3237
3238 tsk_cache_hot = task_hot(p, rq->clock, sd);
3239 if (!tsk_cache_hot ||
3240 sd->nr_balance_failed > sd->cache_nice_tries) {
3241#ifdef CONFIG_SCHEDSTATS
3242 if (tsk_cache_hot) {
3243 schedstat_inc(sd, lb_hot_gained[idle]);
3244 schedstat_inc(p, se.nr_forced_migrations);
3245 }
3246#endif
3247 return 1;
3248 }
3249
3250 if (tsk_cache_hot) {
3251 schedstat_inc(p, se.nr_failed_migrations_hot);
3252 return 0;
3253 }
3254 return 1;
3255}
3256
3257static unsigned long
3258balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3259 unsigned long max_load_move, struct sched_domain *sd,
3260 enum cpu_idle_type idle, int *all_pinned,
3261 int *this_best_prio, struct rq_iterator *iterator)
3262{
3263 int loops = 0, pulled = 0, pinned = 0;
3264 struct task_struct *p;
3265 long rem_load_move = max_load_move;
3266
3267 if (max_load_move == 0)
3268 goto out;
3269
3270 pinned = 1;
3271
3272 /*
3273 * Start the load-balancing iterator:
3274 */
3275 p = iterator->start(iterator->arg);
3276next:
3277 if (!p || loops++ > sysctl_sched_nr_migrate)
3278 goto out;
3279
3280 if ((p->se.load.weight >> 1) > rem_load_move ||
3281 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3282 p = iterator->next(iterator->arg);
3283 goto next;
3284 }
3285
3286 pull_task(busiest, p, this_rq, this_cpu);
3287 pulled++;
3288 rem_load_move -= p->se.load.weight;
3289
3290#ifdef CONFIG_PREEMPT
3291 /*
3292 * NEWIDLE balancing is a source of latency, so preemptible kernels
3293 * will stop after the first task is pulled to minimize the critical
3294 * section.
3295 */
3296 if (idle == CPU_NEWLY_IDLE)
3297 goto out;
3298#endif
3299
3300 /*
3301 * We only want to steal up to the prescribed amount of weighted load.
3302 */
3303 if (rem_load_move > 0) {
3304 if (p->prio < *this_best_prio)
3305 *this_best_prio = p->prio;
3306 p = iterator->next(iterator->arg);
3307 goto next;
3308 }
3309out:
3310 /*
3311 * Right now, this is one of only two places pull_task() is called,
3312 * so we can safely collect pull_task() stats here rather than
3313 * inside pull_task().
3314 */
3315 schedstat_add(sd, lb_gained[idle], pulled);
3316
3317 if (all_pinned)
3318 *all_pinned = pinned;
3319
3320 return max_load_move - rem_load_move;
3321}
3322
3323/*
3324 * move_tasks tries to move up to max_load_move weighted load from busiest to
3325 * this_rq, as part of a balancing operation within domain "sd".
3326 * Returns 1 if successful and 0 otherwise.
3327 *
3328 * Called with both runqueues locked.
3329 */
3330static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3331 unsigned long max_load_move,
3332 struct sched_domain *sd, enum cpu_idle_type idle,
3333 int *all_pinned)
3334{
3335 const struct sched_class *class = sched_class_highest;
3336 unsigned long total_load_moved = 0;
3337 int this_best_prio = this_rq->curr->prio;
3338
3339 do {
3340 total_load_moved +=
3341 class->load_balance(this_rq, this_cpu, busiest,
3342 max_load_move - total_load_moved,
3343 sd, idle, all_pinned, &this_best_prio);
3344 class = class->next;
3345
3346#ifdef CONFIG_PREEMPT
3347 /*
3348 * NEWIDLE balancing is a source of latency, so preemptible
3349 * kernels will stop after the first task is pulled to minimize
3350 * the critical section.
3351 */
3352 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3353 break;
3354#endif
3355 } while (class && max_load_move > total_load_moved);
3356
3357 return total_load_moved > 0;
3358}
3359
3360static int
3361iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3362 struct sched_domain *sd, enum cpu_idle_type idle,
3363 struct rq_iterator *iterator)
3364{
3365 struct task_struct *p = iterator->start(iterator->arg);
3366 int pinned = 0;
3367
3368 while (p) {
3369 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3370 pull_task(busiest, p, this_rq, this_cpu);
3371 /*
3372 * Right now, this is only the second place pull_task()
3373 * is called, so we can safely collect pull_task()
3374 * stats here rather than inside pull_task().
3375 */
3376 schedstat_inc(sd, lb_gained[idle]);
3377
3378 return 1;
3379 }
3380 p = iterator->next(iterator->arg);
3381 }
3382
3383 return 0;
3384}
3385
3386/*
3387 * move_one_task tries to move exactly one task from busiest to this_rq, as
3388 * part of active balancing operations within "domain".
3389 * Returns 1 if successful and 0 otherwise.
3390 *
3391 * Called with both runqueues locked.
3392 */
3393static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3394 struct sched_domain *sd, enum cpu_idle_type idle)
3395{
3396 const struct sched_class *class;
3397
3398 for_each_class(class) {
3399 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3400 return 1;
3401 }
3402
3403 return 0;
3404}
3405/********** Helpers for find_busiest_group ************************/
3406/*
3407 * sd_lb_stats - Structure to store the statistics of a sched_domain
3408 * during load balancing.
3409 */
3410struct sd_lb_stats {
3411 struct sched_group *busiest; /* Busiest group in this sd */
3412 struct sched_group *this; /* Local group in this sd */
3413 unsigned long total_load; /* Total load of all groups in sd */
3414 unsigned long total_pwr; /* Total power of all groups in sd */
3415 unsigned long avg_load; /* Average load across all groups in sd */
3416
3417 /** Statistics of this group */
3418 unsigned long this_load;
3419 unsigned long this_load_per_task;
3420 unsigned long this_nr_running;
3421
3422 /* Statistics of the busiest group */
3423 unsigned long max_load;
3424 unsigned long busiest_load_per_task;
3425 unsigned long busiest_nr_running;
3426
3427 int group_imb; /* Is there imbalance in this sd */
3428#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3429 int power_savings_balance; /* Is powersave balance needed for this sd */
3430 struct sched_group *group_min; /* Least loaded group in sd */
3431 struct sched_group *group_leader; /* Group which relieves group_min */
3432 unsigned long min_load_per_task; /* load_per_task in group_min */
3433 unsigned long leader_nr_running; /* Nr running of group_leader */
3434 unsigned long min_nr_running; /* Nr running of group_min */
3435#endif
3436};
3437
3438/*
3439 * sg_lb_stats - stats of a sched_group required for load_balancing
3440 */
3441struct sg_lb_stats {
3442 unsigned long avg_load; /*Avg load across the CPUs of the group */
3443 unsigned long group_load; /* Total load over the CPUs of the group */
3444 unsigned long sum_nr_running; /* Nr tasks running in the group */
3445 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3446 unsigned long group_capacity;
3447 int group_imb; /* Is there an imbalance in the group ? */
3448};
3449
3450/**
3451 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3452 * @group: The group whose first cpu is to be returned.
3453 */
3454static inline unsigned int group_first_cpu(struct sched_group *group)
3455{
3456 return cpumask_first(sched_group_cpus(group));
3457}
3458
3459/**
3460 * get_sd_load_idx - Obtain the load index for a given sched domain.
3461 * @sd: The sched_domain whose load_idx is to be obtained.
3462 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
3463 */
3464static inline int get_sd_load_idx(struct sched_domain *sd,
3465 enum cpu_idle_type idle)
3466{
3467 int load_idx;
3468
3469 switch (idle) {
3470 case CPU_NOT_IDLE:
3471 load_idx = sd->busy_idx;
3472 break;
3473
3474 case CPU_NEWLY_IDLE:
3475 load_idx = sd->newidle_idx;
3476 break;
3477 default:
3478 load_idx = sd->idle_idx;
3479 break;
3480 }
3481
3482 return load_idx;
3483}
3484
3485
3486#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3487/**
3488 * init_sd_power_savings_stats - Initialize power savings statistics for
3489 * the given sched_domain, during load balancing.
3490 *
3491 * @sd: Sched domain whose power-savings statistics are to be initialized.
3492 * @sds: Variable containing the statistics for sd.
3493 * @idle: Idle status of the CPU at which we're performing load-balancing.
3494 */
3495static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3496 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3497{
3498 /*
3499 * Busy processors will not participate in power savings
3500 * balance.
3501 */
3502 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3503 sds->power_savings_balance = 0;
3504 else {
3505 sds->power_savings_balance = 1;
3506 sds->min_nr_running = ULONG_MAX;
3507 sds->leader_nr_running = 0;
3508 }
3509}
3510
3511/**
3512 * update_sd_power_savings_stats - Update the power saving stats for a
3513 * sched_domain while performing load balancing.
3514 *
3515 * @group: sched_group belonging to the sched_domain under consideration.
3516 * @sds: Variable containing the statistics of the sched_domain
3517 * @local_group: Does group contain the CPU for which we're performing
3518 * load balancing ?
3519 * @sgs: Variable containing the statistics of the group.
3520 */
3521static inline void update_sd_power_savings_stats(struct sched_group *group,
3522 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3523{
3524
3525 if (!sds->power_savings_balance)
3526 return;
3527
3528 /*
3529 * If the local group is idle or completely loaded
3530 * no need to do power savings balance at this domain
3531 */
3532 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3533 !sds->this_nr_running))
3534 sds->power_savings_balance = 0;
3535
3536 /*
3537 * If a group is already running at full capacity or idle,
3538 * don't include that group in power savings calculations
3539 */
3540 if (!sds->power_savings_balance ||
3541 sgs->sum_nr_running >= sgs->group_capacity ||
3542 !sgs->sum_nr_running)
3543 return;
3544
3545 /*
3546 * Calculate the group which has the least non-idle load.
3547 * This is the group from where we need to pick up the load
3548 * for saving power
3549 */
3550 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3551 (sgs->sum_nr_running == sds->min_nr_running &&
3552 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3553 sds->group_min = group;
3554 sds->min_nr_running = sgs->sum_nr_running;
3555 sds->min_load_per_task = sgs->sum_weighted_load /
3556 sgs->sum_nr_running;
3557 }
3558
3559 /*
3560 * Calculate the group which is almost near its
3561 * capacity but still has some space to pick up some load
3562 * from other group and save more power
3563 */
3564 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3565 return;
3566
3567 if (sgs->sum_nr_running > sds->leader_nr_running ||
3568 (sgs->sum_nr_running == sds->leader_nr_running &&
3569 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3570 sds->group_leader = group;
3571 sds->leader_nr_running = sgs->sum_nr_running;
3572 }
3573}
3574
3575/**
3576 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3577 * @sds: Variable containing the statistics of the sched_domain
3578 * under consideration.
3579 * @this_cpu: Cpu at which we're currently performing load-balancing.
3580 * @imbalance: Variable to store the imbalance.
3581 *
3582 * Description:
3583 * Check if we have potential to perform some power-savings balance.
3584 * If yes, set the busiest group to be the least loaded group in the
3585 * sched_domain, so that it's CPUs can be put to idle.
3586 *
3587 * Returns 1 if there is potential to perform power-savings balance.
3588 * Else returns 0.
3589 */
3590static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3591 int this_cpu, unsigned long *imbalance)
3592{
3593 if (!sds->power_savings_balance)
3594 return 0;
3595
3596 if (sds->this != sds->group_leader ||
3597 sds->group_leader == sds->group_min)
3598 return 0;
3599
3600 *imbalance = sds->min_load_per_task;
3601 sds->busiest = sds->group_min;
3602
3603 return 1;
3604
3605}
3606#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3607static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3608 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3609{
3610 return;
3611}
3612
3613static inline void update_sd_power_savings_stats(struct sched_group *group,
3614 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3615{
3616 return;
3617}
3618
3619static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3620 int this_cpu, unsigned long *imbalance)
3621{
3622 return 0;
3623}
3624#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3625
3626
3627unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3628{
3629 return SCHED_LOAD_SCALE;
3630}
3631
3632unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3633{
3634 return default_scale_freq_power(sd, cpu);
3635}
3636
3637unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3638{
3639 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3640 unsigned long smt_gain = sd->smt_gain;
3641
3642 smt_gain /= weight;
3643
3644 return smt_gain;
3645}
3646
3647unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3648{
3649 return default_scale_smt_power(sd, cpu);
3650}
3651
3652unsigned long scale_rt_power(int cpu)
3653{
3654 struct rq *rq = cpu_rq(cpu);
3655 u64 total, available;
3656
3657 sched_avg_update(rq);
3658
3659 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3660 available = total - rq->rt_avg;
3661
3662 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3663 total = SCHED_LOAD_SCALE;
3664
3665 total >>= SCHED_LOAD_SHIFT;
3666
3667 return div_u64(available, total);
3668}
3669
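scale_rt_power() above expresses, relative to SCHED_LOAD_SCALE, how much of a CPU is left over after real-time activity. A worked example with assumed numbers (not taken from the patch): if the averaging window gives total = 1,000,000 ns and rq->rt_avg = 250,000 ns, then available = 750,000; total >> SCHED_LOAD_SHIFT is 976, and div_u64(750000, 976) = 768, i.e. roughly 75% of SCHED_LOAD_SCALE (1024).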
3670static void update_cpu_power(struct sched_domain *sd, int cpu)
3671{
3672 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3673 unsigned long power = SCHED_LOAD_SCALE;
3674 struct sched_group *sdg = sd->groups;
3675
3676 if (sched_feat(ARCH_POWER))
3677 power *= arch_scale_freq_power(sd, cpu);
3678 else
3679 power *= default_scale_freq_power(sd, cpu);
3680
3681 power >>= SCHED_LOAD_SHIFT;
3682
3683 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3684 if (sched_feat(ARCH_POWER))
3685 power *= arch_scale_smt_power(sd, cpu);
3686 else
3687 power *= default_scale_smt_power(sd, cpu);
3688
3689 power >>= SCHED_LOAD_SHIFT;
3690 }
3691
3692 power *= scale_rt_power(cpu);
3693 power >>= SCHED_LOAD_SHIFT;
3694
3695 if (!power)
3696 power = 1;
3697
3698 sdg->cpu_power = power;
3699}
3700
3701static void update_group_power(struct sched_domain *sd, int cpu)
3702{
3703 struct sched_domain *child = sd->child;
3704 struct sched_group *group, *sdg = sd->groups;
3705 unsigned long power;
3706
3707 if (!child) {
3708 update_cpu_power(sd, cpu);
3709 return;
3710 }
3711
3712 power = 0;
3713
3714 group = child->groups;
3715 do {
3716 power += group->cpu_power;
3717 group = group->next;
3718 } while (group != child->groups);
3719
3720 sdg->cpu_power = power;
3721}
3722
3723/**
3724 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3725 * @sd: The sched_domain whose statistics are to be updated.
3726 * @group: sched_group whose statistics are to be updated.
3727 * @this_cpu: Cpu for which load balance is currently performed.
3728 * @idle: Idle status of this_cpu
3729 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3730 * @sd_idle: Idle status of the sched_domain containing group.
3731 * @local_group: Does group contain this_cpu.
3732 * @cpus: Set of cpus considered for load balancing.
3733 * @balance: Should we balance.
3734 * @sgs: variable to hold the statistics for this group.
3735 */
3736static inline void update_sg_lb_stats(struct sched_domain *sd,
3737 struct sched_group *group, int this_cpu,
3738 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3739 int local_group, const struct cpumask *cpus,
3740 int *balance, struct sg_lb_stats *sgs)
3741{
3742 unsigned long load, max_cpu_load, min_cpu_load;
3743 int i;
3744 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3745 unsigned long sum_avg_load_per_task;
3746 unsigned long avg_load_per_task;
3747
3748 if (local_group) {
3749 balance_cpu = group_first_cpu(group);
3750 if (balance_cpu == this_cpu)
3751 update_group_power(sd, this_cpu);
3752 }
3753
3754 /* Tally up the load of all CPUs in the group */
3755 sum_avg_load_per_task = avg_load_per_task = 0;
3756 max_cpu_load = 0;
3757 min_cpu_load = ~0UL;
3758
3759 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3760 struct rq *rq = cpu_rq(i);
3761
3762 if (*sd_idle && rq->nr_running)
3763 *sd_idle = 0;
3764
3765 /* Bias balancing toward cpus of our domain */
3766 if (local_group) {
3767 if (idle_cpu(i) && !first_idle_cpu) {
3768 first_idle_cpu = 1;
3769 balance_cpu = i;
3770 }
3771
3772 load = target_load(i, load_idx);
3773 } else {
3774 load = source_load(i, load_idx);
3775 if (load > max_cpu_load)
3776 max_cpu_load = load;
3777 if (min_cpu_load > load)
3778 min_cpu_load = load;
3779 }
3780
3781 sgs->group_load += load;
3782 sgs->sum_nr_running += rq->nr_running;
3783 sgs->sum_weighted_load += weighted_cpuload(i);
3784
3785 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3786 }
3787
3788 /*
3789 * First idle cpu or the first cpu(busiest) in this sched group
3790 * is eligible for doing load balancing at this and above
3791 * domains. In the newly idle case, we will allow all the cpu's
3792 * to do the newly idle load balance.
3793 */
3794 if (idle != CPU_NEWLY_IDLE && local_group &&
3795 balance_cpu != this_cpu && balance) {
3796 *balance = 0;
3797 return;
3798 }
3799
3800 /* Adjust by relative CPU power of the group */
3801 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3802
3803
3804 /*
3805 * Consider the group unbalanced when the imbalance is larger
3806 * than the average weight of two tasks.
3807 *
3808 * APZ: with cgroup the avg task weight can vary wildly and
3809 * might not be a suitable number - should we keep a
3810 * normalized nr_running number somewhere that negates
3811 * the hierarchy?
3812 */
3813 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3814 group->cpu_power;
3815
3816 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3817 sgs->group_imb = 1;
3818
3819 sgs->group_capacity =
3820 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3821}
3822
3823/**
3824 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
3825 * @sd: sched_domain whose statistics are to be updated.
3826 * @this_cpu: Cpu for which load balance is currently performed.
3827 * @idle: Idle status of this_cpu
3828 * @sd_idle: Idle status of the sched_domain containing group.
3829 * @cpus: Set of cpus considered for load balancing.
3830 * @balance: Should we balance.
3831 * @sds: variable to hold the statistics for this sched_domain.
3832 */
3833static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3834 enum cpu_idle_type idle, int *sd_idle,
3835 const struct cpumask *cpus, int *balance,
3836 struct sd_lb_stats *sds)
3837{
3838 struct sched_domain *child = sd->child;
3839 struct sched_group *group = sd->groups;
3840 struct sg_lb_stats sgs;
3841 int load_idx, prefer_sibling = 0;
3842
3843 if (child && child->flags & SD_PREFER_SIBLING)
3844 prefer_sibling = 1;
3845
3846 init_sd_power_savings_stats(sd, sds, idle);
3847 load_idx = get_sd_load_idx(sd, idle);
3848
3849 do {
3850 int local_group;
3851
3852 local_group = cpumask_test_cpu(this_cpu,
3853 sched_group_cpus(group));
3854 memset(&sgs, 0, sizeof(sgs));
3855 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3856 local_group, cpus, balance, &sgs);
3857
3858 if (local_group && balance && !(*balance))
3859 return;
3860
3861 sds->total_load += sgs.group_load;
3862 sds->total_pwr += group->cpu_power;
3863
3864 /*
3865 * In case the child domain prefers tasks go to siblings
3866 * first, lower the group capacity to one so that we'll try
3867 * and move all the excess tasks away.
3868 */
3869 if (prefer_sibling)
3870 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3871
3872 if (local_group) {
3873 sds->this_load = sgs.avg_load;
3874 sds->this = group;
3875 sds->this_nr_running = sgs.sum_nr_running;
3876 sds->this_load_per_task = sgs.sum_weighted_load;
3877 } else if (sgs.avg_load > sds->max_load &&
3878 (sgs.sum_nr_running > sgs.group_capacity ||
3879 sgs.group_imb)) {
3880 sds->max_load = sgs.avg_load;
3881 sds->busiest = group;
3882 sds->busiest_nr_running = sgs.sum_nr_running;
3883 sds->busiest_load_per_task = sgs.sum_weighted_load;
3884 sds->group_imb = sgs.group_imb;
3885 }
3886
3887 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3888 group = group->next;
3889 } while (group != sd->groups);
3890}
3891
3892/**
3893 * fix_small_imbalance - Calculate the minor imbalance that exists
3894 * amongst the groups of a sched_domain, during
3895 * load balancing.
3896 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3897 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3898 * @imbalance: Variable to store the imbalance.
3899 */
3900static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3901 int this_cpu, unsigned long *imbalance)
3902{
3903 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3904 unsigned int imbn = 2;
3905
3906 if (sds->this_nr_running) {
3907 sds->this_load_per_task /= sds->this_nr_running;
3908 if (sds->busiest_load_per_task >
3909 sds->this_load_per_task)
3910 imbn = 1;
3911 } else
3912 sds->this_load_per_task =
3913 cpu_avg_load_per_task(this_cpu);
3914
3915 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3916 sds->busiest_load_per_task * imbn) {
3917 *imbalance = sds->busiest_load_per_task;
3918 return;
3919 }
3920
3921 /*
3922 * OK, we don't have enough imbalance to justify moving tasks,
3923 * however we may be able to increase total CPU power used by
3924 * moving them.
3925 */
3926
3927 pwr_now += sds->busiest->cpu_power *
3928 min(sds->busiest_load_per_task, sds->max_load);
3929 pwr_now += sds->this->cpu_power *
3930 min(sds->this_load_per_task, sds->this_load);
3931 pwr_now /= SCHED_LOAD_SCALE;
3932
3933 /* Amount of load we'd subtract */
3934 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3935 sds->busiest->cpu_power;
3936 if (sds->max_load > tmp)
3937 pwr_move += sds->busiest->cpu_power *
3938 min(sds->busiest_load_per_task, sds->max_load - tmp);
3939
3940 /* Amount of load we'd add */
3941 if (sds->max_load * sds->busiest->cpu_power <
3942 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3943 tmp = (sds->max_load * sds->busiest->cpu_power) /
3944 sds->this->cpu_power;
3945 else
3946 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3947 sds->this->cpu_power;
3948 pwr_move += sds->this->cpu_power *
3949 min(sds->this_load_per_task, sds->this_load + tmp);
3950 pwr_move /= SCHED_LOAD_SCALE;
3951
3952 /* Move if we gain throughput */
3953 if (pwr_move > pwr_now)
3954 *imbalance = sds->busiest_load_per_task;
3955}
3956
3957/**
3958 * calculate_imbalance - Calculate the amount of imbalance present within the
3959 * groups of a given sched_domain during load balance.
3960 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3961 * @this_cpu: Cpu for which currently load balance is being performed.
3962 * @imbalance: The variable to store the imbalance.
3963 */
3964static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3965 unsigned long *imbalance)
3966{
3967 unsigned long max_pull;
3968 /*
3969 * In the presence of smp nice balancing, certain scenarios can have
3970 * max load less than avg load(as we skip the groups at or below
3971 * its cpu_power, while calculating max_load..)
3972 */
3973 if (sds->max_load < sds->avg_load) {
3974 *imbalance = 0;
3975 return fix_small_imbalance(sds, this_cpu, imbalance);
3976 }
3977
3978 /* Don't want to pull so many tasks that a group would go idle */
3979 max_pull = min(sds->max_load - sds->avg_load,
3980 sds->max_load - sds->busiest_load_per_task);
3981
3982 /* How much load to actually move to equalise the imbalance */
3983 *imbalance = min(max_pull * sds->busiest->cpu_power,
3984 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3985 / SCHED_LOAD_SCALE;
3986
3987 /*
3988 * if *imbalance is less than the average load per runnable task
3989 * there is no gaurantee that any tasks will be moved so we'll have
3990 * a think about bumping its value to force at least one task to be
3991 * moved
3992 */
3993 if (*imbalance < sds->busiest_load_per_task)
3994 return fix_small_imbalance(sds, this_cpu, imbalance);
3995
3996}
3997/******* find_busiest_group() helpers end here *********************/
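To make the arithmetic in calculate_imbalance() above concrete, a worked example with assumed, SCHED_LOAD_SCALE-scaled numbers (not taken from the patch): with avg_load = 1200, max_load = 2000, this_load = 800, busiest_load_per_task = 600 and both groups at cpu_power = 1024, max_pull = min(2000 - 1200, 2000 - 600) = 800, and *imbalance = min(800 * 1024, (1200 - 800) * 1024) / 1024 = 400. Since 400 is below busiest_load_per_task (600), the final branch hands the decision to fix_small_imbalance().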
3998
3999/**
4000 * find_busiest_group - Returns the busiest group within the sched_domain
4001 * if there is an imbalance. If there isn't an imbalance, and
4002 * the user has opted for power-savings, it returns a group whose
4003 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
4004 * such a group exists.
4005 *
4006 * Also calculates the amount of weighted load which should be moved
4007 * to restore balance.
4008 *
4009 * @sd: The sched_domain whose busiest group is to be returned.
4010 * @this_cpu: The cpu for which load balancing is currently being performed.
4011 * @imbalance: Variable which stores amount of weighted load which should
4012 * be moved to restore balance/put a group to idle.
4013 * @idle: The idle status of this_cpu.
4014 * @sd_idle: The idleness of sd
4015 * @cpus: The set of CPUs under consideration for load-balancing.
4016 * @balance: Pointer to a variable indicating if this_cpu
4017 * is the appropriate cpu to perform load balancing at this_level.
4018 *
4019 * Returns: - the busiest group if imbalance exists.
4020 * - If no imbalance and user has opted for power-savings balance,
4021 * return the least loaded group whose CPUs can be
4022 * put to idle by rebalancing its tasks onto our group.
4023 */
4024static struct sched_group *
4025find_busiest_group(struct sched_domain *sd, int this_cpu,
4026 unsigned long *imbalance, enum cpu_idle_type idle,
4027 int *sd_idle, const struct cpumask *cpus, int *balance)
4028{
4029 struct sd_lb_stats sds;
4030
4031 memset(&sds, 0, sizeof(sds));
4032
4033 /*
4034 * Compute the various statistics relavent for load balancing at
4035 * this level.
4036 */
4037 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
4038 balance, &sds);
4039
4040 /* Cases where imbalance does not exist from POV of this_cpu */
4041 /* 1) this_cpu is not the appropriate cpu to perform load balancing
4042 * at this level.
4043 * 2) There is no busy sibling group to pull from.
4044 * 3) This group is the busiest group.
4045 * 4) This group is more busy than the avg busieness at this
4046 * sched_domain.
4047 * 5) The imbalance is within the specified limit.
4048 * 6) Any rebalance would lead to ping-pong
4049 */
4050 if (balance && !(*balance))
4051 goto ret;
4052
4053 if (!sds.busiest || sds.busiest_nr_running == 0)
4054 goto out_balanced;
4055
4056 if (sds.this_load >= sds.max_load)
4057 goto out_balanced;
4058
4059 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
4060
4061 if (sds.this_load >= sds.avg_load)
4062 goto out_balanced;
4063
4064 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
4065 goto out_balanced;
4066
4067 sds.busiest_load_per_task /= sds.busiest_nr_running;
4068 if (sds.group_imb)
4069 sds.busiest_load_per_task =
4070 min(sds.busiest_load_per_task, sds.avg_load);
4071
4072 /*
4073 * We're trying to get all the cpus to the average_load, so we don't
4074 * want to push ourselves above the average load, nor do we wish to
4075 * reduce the max loaded cpu below the average load, as either of these
4076 * actions would just result in more rebalancing later, and ping-pong
4077 * tasks around. Thus we look for the minimum possible imbalance.
4078 * Negative imbalances (*we* are more loaded than anyone else) will
4079 * be counted as no imbalance for these purposes -- we can't fix that
4080 * by pulling tasks to us. Be careful of negative numbers as they'll
4081 * appear as very large values with unsigned longs.
4082 */
4083 if (sds.max_load <= sds.busiest_load_per_task)
4084 goto out_balanced;
4085
4086 /* Looks like there is an imbalance. Compute it */
4087 calculate_imbalance(&sds, this_cpu, imbalance);
4088 return sds.busiest;
4089
4090out_balanced:
4091 /*
4092 * There is no obvious imbalance. But check if we can do some balancing
4093 * to save power.
4094 */
4095 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4096 return sds.busiest;
4097ret:
4098 *imbalance = 0;
4099 return NULL;
4100}
4101
4102/*
4103 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4104 */
4105static struct rq *
4106find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
4107 unsigned long imbalance, const struct cpumask *cpus)
4108{
4109 struct rq *busiest = NULL, *rq;
4110 unsigned long max_load = 0;
4111 int i;
4112
4113 for_each_cpu(i, sched_group_cpus(group)) {
4114 unsigned long power = power_of(i);
4115 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
4116 unsigned long wl;
4117
4118 if (!cpumask_test_cpu(i, cpus))
4119 continue;
4120
4121 rq = cpu_rq(i);
4122 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4123 wl /= power;
4124
4125 if (capacity && rq->nr_running == 1 && wl > imbalance)
4126 continue;
4127
4128 if (wl > max_load) {
4129 max_load = wl;
4130 busiest = rq;
4131 }
4132 }
4133
4134 return busiest;
4135}
4136
4137/*
4138 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
4139 * so long as it is large enough.
4140 */
4141#define MAX_PINNED_INTERVAL 512
4142
4143/* Working cpumask for load_balance and load_balance_newidle. */
4144static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4145
4146/*
4147 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4148 * tasks if there is an imbalance.
4149 */
4150static int load_balance(int this_cpu, struct rq *this_rq,
4151 struct sched_domain *sd, enum cpu_idle_type idle,
4152 int *balance)
4153{
4154 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
4155 struct sched_group *group;
4156 unsigned long imbalance;
4157 struct rq *busiest;
4158 unsigned long flags;
4159 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4160
4161 cpumask_copy(cpus, cpu_active_mask);
4162
4163 /*
4164 * When power savings policy is enabled for the parent domain, idle
4165 * sibling can pick up load irrespective of busy siblings. In this case,
4166 * let the state of idle sibling percolate up as CPU_IDLE, instead of
4167 * portraying it as CPU_NOT_IDLE.
4168 */
4169 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
4170 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4171 sd_idle = 1;
4172
4173 schedstat_inc(sd, lb_count[idle]);
4174
4175redo:
4176 update_shares(sd);
4177 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
4178 cpus, balance);
4179
4180 if (*balance == 0)
4181 goto out_balanced;
4182
4183 if (!group) {
4184 schedstat_inc(sd, lb_nobusyg[idle]);
4185 goto out_balanced;
4186 }
4187
4188 busiest = find_busiest_queue(group, idle, imbalance, cpus);
4189 if (!busiest) {
4190 schedstat_inc(sd, lb_nobusyq[idle]);
4191 goto out_balanced;
4192 }
4193
4194 BUG_ON(busiest == this_rq);
4195
4196 schedstat_add(sd, lb_imbalance[idle], imbalance);
4197
4198 ld_moved = 0;
4199 if (busiest->nr_running > 1) {
4200 /*
4201 * Attempt to move tasks. If find_busiest_group has found
4202 * an imbalance but busiest->nr_running <= 1, the group is
4203 * still unbalanced. ld_moved simply stays zero, so it is
4204 * correctly treated as an imbalance.
4205 */
4206 local_irq_save(flags);
4207 double_rq_lock(this_rq, busiest);
4208 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4209 imbalance, sd, idle, &all_pinned);
4210 double_rq_unlock(this_rq, busiest);
4211 local_irq_restore(flags);
4212
4213 /*
4214 * some other cpu did the load balance for us.
4215 */
4216 if (ld_moved && this_cpu != smp_processor_id())
4217 resched_cpu(this_cpu);
4218
4219 /* All tasks on this runqueue were pinned by CPU affinity */
4220 if (unlikely(all_pinned)) {
4221 cpumask_clear_cpu(cpu_of(busiest), cpus);
4222 if (!cpumask_empty(cpus))
4223 goto redo;
4224 goto out_balanced;
4225 }
4226 }
4227
4228 if (!ld_moved) {
4229 schedstat_inc(sd, lb_failed[idle]);
4230 sd->nr_balance_failed++;
4231
4232 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4233
4234 raw_spin_lock_irqsave(&busiest->lock, flags);
4235
4236 /* Don't kick the migration_thread if the current
4237 * task on the busiest cpu can't be moved to this_cpu.
4238 */
4239 if (!cpumask_test_cpu(this_cpu,
4240 &busiest->curr->cpus_allowed)) {
4241 raw_spin_unlock_irqrestore(&busiest->lock,
4242 flags);
4243 all_pinned = 1;
4244 goto out_one_pinned;
4245 }
4246
4247 if (!busiest->active_balance) {
4248 busiest->active_balance = 1;
4249 busiest->push_cpu = this_cpu;
4250 active_balance = 1;
4251 }
4252 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4253 if (active_balance)
4254 wake_up_process(busiest->migration_thread);
4255
4256 /*
4257 * We've kicked active balancing, reset the failure
4258 * counter.
4259 */
4260 sd->nr_balance_failed = sd->cache_nice_tries+1;
4261 }
4262 } else
4263 sd->nr_balance_failed = 0;
4264
4265 if (likely(!active_balance)) {
4266 /* We were unbalanced, so reset the balancing interval */
4267 sd->balance_interval = sd->min_interval;
4268 } else {
4269 /*
4270 * If we've begun active balancing, start to back off. This
4271 * case may not be covered by the all_pinned logic if there
4272 * is only 1 task on the busy runqueue (because we don't call
4273 * move_tasks).
4274 */
4275 if (sd->balance_interval < sd->max_interval)
4276 sd->balance_interval *= 2;
4277 }
4278
4279 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4280 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4281 ld_moved = -1;
4282
4283 goto out;
4284
4285out_balanced:
4286 schedstat_inc(sd, lb_balanced[idle]);
4287
4288 sd->nr_balance_failed = 0;
4289
4290out_one_pinned:
4291 /* tune up the balancing interval */
4292 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4293 (sd->balance_interval < sd->max_interval))
4294 sd->balance_interval *= 2;
4295
4296 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4297 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4298 ld_moved = -1;
4299 else
4300 ld_moved = 0;
4301out:
4302 if (ld_moved)
4303 update_shares(sd);
4304 return ld_moved;
4305}
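A failed balance attempt makes load_balance() back off by doubling sd->balance_interval, bounded by sd->max_interval in the ordinary case and by MAX_PINNED_INTERVAL when every candidate task was pinned; a successful, non-active balance resets it to sd->min_interval. A standalone sketch of that backoff with illustrative millisecond values (min_interval, max_interval and the attempt count are made up):

#include <stdio.h>

#define MAX_PINNED_INTERVAL 512

int main(void)
{
	unsigned long min_interval = 8, max_interval = 64;	/* made-up, in ms */
	unsigned long interval = min_interval;

	/* Repeated failed attempts with all tasks pinned: keep backing off. */
	for (int attempt = 1; attempt <= 8; attempt++) {
		int all_pinned = 1;

		if ((all_pinned && interval < MAX_PINNED_INTERVAL) ||
		    (interval < max_interval))
			interval *= 2;

		printf("attempt %d: balance_interval = %lu ms\n",
		       attempt, interval);
	}

	/* A successful (non-active) balance resets it, as in load_balance(). */
	interval = min_interval;
	printf("after successful balance: %lu ms\n", interval);
	return 0;
}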
4306
4307/*
4308 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4309 * tasks if there is an imbalance.
4310 *
4311 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4312 * this_rq is locked.
4313 */
4314static int
4315load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4316{
4317 struct sched_group *group;
4318 struct rq *busiest = NULL;
4319 unsigned long imbalance;
4320 int ld_moved = 0;
4321 int sd_idle = 0;
4322 int all_pinned = 0;
4323 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4324
4325 cpumask_copy(cpus, cpu_active_mask);
4326
4327 /*
4328 * When the power savings policy is enabled for the parent domain, an idle
4329 * sibling can pick up load irrespective of busy siblings. In this case,
4330 * let the state of the idle sibling percolate up as IDLE, instead of
4331 * portraying it as CPU_NOT_IDLE.
4332 */
4333 if (sd->flags & SD_SHARE_CPUPOWER &&
4334 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4335 sd_idle = 1;
4336
4337 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4338redo:
4339 update_shares_locked(this_rq, sd);
4340 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4341 &sd_idle, cpus, NULL);
4342 if (!group) {
4343 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4344 goto out_balanced;
4345 }
4346
4347 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4348 if (!busiest) {
4349 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4350 goto out_balanced;
4351 }
4352
4353 BUG_ON(busiest == this_rq);
4354
4355 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4356
4357 ld_moved = 0;
4358 if (busiest->nr_running > 1) {
4359 /* Attempt to move tasks */
4360 double_lock_balance(this_rq, busiest);
4361 /* this_rq->clock is already updated */
4362 update_rq_clock(busiest);
4363 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4364 imbalance, sd, CPU_NEWLY_IDLE,
4365 &all_pinned);
4366 double_unlock_balance(this_rq, busiest);
4367
4368 if (unlikely(all_pinned)) {
4369 cpumask_clear_cpu(cpu_of(busiest), cpus);
4370 if (!cpumask_empty(cpus))
4371 goto redo;
4372 }
4373 }
4374
4375 if (!ld_moved) {
4376 int active_balance = 0;
4377
4378 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4379 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4380 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4381 return -1;
4382
4383 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4384 return -1;
4385
4386 if (sd->nr_balance_failed++ < 2)
4387 return -1;
4388
4389 /*
4390 * The only task running on a non-idle cpu can be moved to this
4391 * cpu in an attempt to completely free up the other CPU
4392 * package. The same method used to move tasks in load_balance()
4393 * has been extended to load_balance_newidle() to speed up
4394 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2).
4395 *
4396 * The package power saving logic comes from
4397 * find_busiest_group(). If there is no imbalance, then
4398 * f_b_g() will return NULL. However, when sched_mc={1,2},
4399 * f_b_g() will select a group from which a running task may be
4400 * pulled to this cpu in order to make the other package idle.
4401 * If there is no opportunity to make a package idle and if
4402 * there is no imbalance, then f_b_g() will return NULL and no
4403 * action will be taken in load_balance_newidle().
4404 *
4405 * Under normal task pull operation due to imbalance, there
4406 * will be more than one task in the source run queue and
4407 * move_tasks() will succeed. ld_moved will be true and this
4408 * active balance code will not be triggered.
4409 */
4410
4411 /* Lock busiest in correct order while this_rq is held */
4412 double_lock_balance(this_rq, busiest);
4413
4414 /*
4415 * Don't kick the migration_thread if the current
4416 * task on the busiest cpu can't be moved to this_cpu.
4417 */
4418 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4419 double_unlock_balance(this_rq, busiest);
4420 all_pinned = 1;
4421 return ld_moved;
4422 }
4423
4424 if (!busiest->active_balance) {
4425 busiest->active_balance = 1;
4426 busiest->push_cpu = this_cpu;
4427 active_balance = 1;
4428 }
4429
4430 double_unlock_balance(this_rq, busiest);
4431 /*
4432 * Should not call ttwu while holding a rq->lock
4433 */
4434 raw_spin_unlock(&this_rq->lock);
4435 if (active_balance)
4436 wake_up_process(busiest->migration_thread);
4437 raw_spin_lock(&this_rq->lock);
4438
4439 } else
4440 sd->nr_balance_failed = 0;
4441
4442 update_shares_locked(this_rq, sd);
4443 return ld_moved;
4444
4445out_balanced:
4446 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4447 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4448 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4449 return -1;
4450 sd->nr_balance_failed = 0;
4451
4452 return 0;
4453}
4454
4455/*
4456 * idle_balance is called by schedule() if this_cpu is about to become
4457 * idle. Attempts to pull tasks from other CPUs.
4458 */
4459static void idle_balance(int this_cpu, struct rq *this_rq)
4460{
4461 struct sched_domain *sd;
4462 int pulled_task = 0;
4463 unsigned long next_balance = jiffies + HZ;
4464
4465 this_rq->idle_stamp = this_rq->clock;
4466
4467 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4468 return;
4469
4470 for_each_domain(this_cpu, sd) {
4471 unsigned long interval;
4472
4473 if (!(sd->flags & SD_LOAD_BALANCE))
4474 continue;
4475
4476 if (sd->flags & SD_BALANCE_NEWIDLE)
4477 /* If we've pulled tasks over, stop searching: */
4478 pulled_task = load_balance_newidle(this_cpu, this_rq,
4479 sd);
4480
4481 interval = msecs_to_jiffies(sd->balance_interval);
4482 if (time_after(next_balance, sd->last_balance + interval))
4483 next_balance = sd->last_balance + interval;
4484 if (pulled_task) {
4485 this_rq->idle_stamp = 0;
4486 break;
4487 }
4488 }
4489 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4490 /*
4491 * We are going idle. next_balance may be set based on
4492 * a busy processor. So reset next_balance.
4493 */
4494 this_rq->next_balance = next_balance;
4495 }
4496}
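Besides pulling tasks, idle_balance() records in rq->next_balance the earliest last_balance + interval over the domains it walked, so the next periodic balance is not scheduled from stale busy-CPU data. A standalone sketch of that minimum computation; the domain values are invented and a plain '>' stands in for the wraparound-safe time_after():

#include <stdio.h>

/* Hypothetical per-domain state: when it last balanced and how often it wants to. */
struct dom {
	unsigned long last_balance;	/* in jiffies */
	unsigned long interval;		/* in jiffies */
};

int main(void)
{
	unsigned long jiffies = 1000;
	unsigned long next_balance = jiffies + 250;	/* stand-in for jiffies + HZ */
	struct dom doms[] = {
		{ .last_balance =  990, .interval =  32 },
		{ .last_balance =  900, .interval =  64 },
		{ .last_balance = 1000, .interval = 128 },
	};

	/* Keep the earliest due time across the domains, as idle_balance() does. */
	for (int i = 0; i < 3; i++) {
		unsigned long due = doms[i].last_balance + doms[i].interval;

		if (next_balance > due)		/* simplification of time_after() */
			next_balance = due;
	}
	printf("rq->next_balance := %lu (jiffies now %lu)\n", next_balance, jiffies);
	return 0;
}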
4497
4498/*
4499 * active_load_balance is run by migration threads. It pushes running tasks
4500 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4501 * running on each physical CPU where possible, and avoids physical /
4502 * logical imbalances.
4503 *
4504 * Called with busiest_rq locked.
4505 */
4506static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4507{
4508 int target_cpu = busiest_rq->push_cpu;
4509 struct sched_domain *sd;
4510 struct rq *target_rq;
4511
4512 /* Is there any task to move? */
4513 if (busiest_rq->nr_running <= 1)
4514 return;
4515
4516 target_rq = cpu_rq(target_cpu);
4517
4518 /*
4519 * This condition is "impossible"; if it occurs
4520 * we need to fix it. Originally reported by
4521 * Bjorn Helgaas on a 128-cpu setup.
4522 */
4523 BUG_ON(busiest_rq == target_rq);
4524
4525 /* move a task from busiest_rq to target_rq */
4526 double_lock_balance(busiest_rq, target_rq);
4527 update_rq_clock(busiest_rq);
4528 update_rq_clock(target_rq);
4529
4530 /* Search for an sd spanning us and the target CPU. */
4531 for_each_domain(target_cpu, sd) {
4532 if ((sd->flags & SD_LOAD_BALANCE) &&
4533 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4534 break;
4535 }
4536
4537 if (likely(sd)) {
4538 schedstat_inc(sd, alb_count);
4539
4540 if (move_one_task(target_rq, target_cpu, busiest_rq,
4541 sd, CPU_IDLE))
4542 schedstat_inc(sd, alb_pushed);
4543 else
4544 schedstat_inc(sd, alb_failed);
4545 }
4546 double_unlock_balance(busiest_rq, target_rq);
4547}
4548
4549#ifdef CONFIG_NO_HZ
4550static struct {
4551 atomic_t load_balancer;
4552 cpumask_var_t cpu_mask;
4553 cpumask_var_t ilb_grp_nohz_mask;
4554} nohz ____cacheline_aligned = {
4555 .load_balancer = ATOMIC_INIT(-1),
4556};
4557
4558int get_nohz_load_balancer(void)
4559{
4560 return atomic_read(&nohz.load_balancer);
4561}
4562
4563#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4564/**
4565 * lowest_flag_domain - Return lowest sched_domain containing flag.
4566 * @cpu: The cpu whose lowest level of sched domain is to
4567 * be returned.
4568 * @flag: The flag to check for the lowest sched_domain
4569 * for the given cpu.
4570 *
4571 * Returns the lowest sched_domain of a cpu which contains the given flag.
4572 */
4573static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4574{
4575 struct sched_domain *sd;
4576
4577 for_each_domain(cpu, sd)
4578 if (sd && (sd->flags & flag))
4579 break;
4580
4581 return sd;
4582}
4583
4584/**
4585 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4586 * @cpu: The cpu whose domains we're iterating over.
4587 * @sd: variable holding the value of the power_savings_sd
4588 * for cpu.
4589 * @flag: The flag to filter the sched_domains to be iterated.
4590 *
4591 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
4592 * set, starting from the lowest sched_domain to the highest.
4593 */
4594#define for_each_flag_domain(cpu, sd, flag) \
4595 for (sd = lowest_flag_domain(cpu, flag); \
4596 (sd && (sd->flags & flag)); sd = sd->parent)
4597
4598/**
4599 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4600 * @ilb_group: group to be checked for semi-idleness
4601 *
4602 * Returns: 1 if the group is semi-idle. 0 otherwise.
4603 *
4604 * We define a sched_group to be semi-idle if it has at least one idle CPU
4605 * and at least one non-idle CPU. This helper function checks if the given
4606 * sched_group is semi-idle or not.
4607 */
4608static inline int is_semi_idle_group(struct sched_group *ilb_group)
4609{
4610 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4611 sched_group_cpus(ilb_group));
4612
4613 /*
4614 * A sched_group is semi-idle when it has at least one busy cpu
4615 * and at least one idle cpu.
4616 */
4617 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4618 return 0;
4619
4620 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4621 return 0;
4622
4623 return 1;
4624}
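The cpumask calls above boil down to two bitmask tests: the intersection of the group with the set of tickless CPUs must be neither empty nor equal to the whole group. A standalone sketch using plain unsigned long bitmasks in place of struct cpumask:

#include <stdio.h>

/* Treat a cpumask as a plain bitmask: bit i set => cpu i is in the mask. */
static int is_semi_idle(unsigned long group_mask, unsigned long nohz_mask)
{
	unsigned long idle_in_group = group_mask & nohz_mask;

	if (idle_in_group == 0)			/* no idle cpu in the group   */
		return 0;
	if (idle_in_group == group_mask)	/* every cpu in group is idle */
		return 0;
	return 1;				/* mix of idle and busy cpus  */
}

int main(void)
{
	unsigned long group = 0x0f;	/* cpus 0-3 form the group */

	printf("%d\n", is_semi_idle(group, 0x00));	/* all busy  -> 0 */
	printf("%d\n", is_semi_idle(group, 0x0f));	/* all idle  -> 0 */
	printf("%d\n", is_semi_idle(group, 0x06));	/* 2 idle    -> 1 */
	return 0;
}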
4625/**
4626 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4627 * @cpu: The cpu which is nominating a new idle_load_balancer.
4628 *
4629 * Returns: The id of the idle load balancer if one exists;
4630 * else, a value >= nr_cpu_ids.
4631 *
4632 * This algorithm picks the idle load balancer such that it belongs to a
4633 * semi-idle powersavings sched_domain. The idea is to try to avoid
4634 * completely idle packages/cores just for the purpose of idle load balancing
4635 * when there are other idle cpus which are better suited for that job.
4636 */
4637static int find_new_ilb(int cpu)
4638{
4639 struct sched_domain *sd;
4640 struct sched_group *ilb_group;
4641
4642 /*
4643 * Select the idle load balancer from semi-idle packages only
4644 * when power-aware load balancing is enabled.
4645 */
4646 if (!(sched_smt_power_savings || sched_mc_power_savings))
4647 goto out_done;
4648
4649 /*
4650 * Optimize for the case when we have no idle CPUs or only one
4651 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4652 */
4653 if (cpumask_weight(nohz.cpu_mask) < 2)
4654 goto out_done;
4655
4656 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4657 ilb_group = sd->groups;
4658
4659 do {
4660 if (is_semi_idle_group(ilb_group))
4661 return cpumask_first(nohz.ilb_grp_nohz_mask);
4662
4663 ilb_group = ilb_group->next;
4664
4665 } while (ilb_group != sd->groups);
4666 }
4667
4668out_done:
4669 return cpumask_first(nohz.cpu_mask);
4670}
4671#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4672static inline int find_new_ilb(int call_cpu)
4673{
4674 return cpumask_first(nohz.cpu_mask);
4675}
4676#endif
4677
4678/*
4679 * This routine tries to nominate the ilb (idle load balancing)
4680 * owner among the cpus whose ticks are stopped. The ilb owner does the idle
4681 * load balancing on behalf of all those cpus. If all the cpus in the system
4682 * go into this tickless mode, then there will be no ilb owner (as there is
4683 * no need for one) and all the cpus will sleep till the next wakeup event
4684 * arrives.
4685 *
4686 * The tick is not stopped for the ilb owner; this tick is used
4687 * for idle load balancing. The ilb owner remains part of
4688 * nohz.cpu_mask.
4689 *
4690 * While stopping its tick, a cpu becomes the ilb owner if there
4691 * is no other owner, and remains the owner until it becomes busy
4692 * or until all cpus in the system stop their ticks, at which point
4693 * there is no need for an ilb owner.
4694 *
4695 * When the ilb owner becomes busy, it nominates another owner during the
4696 * next busy scheduler_tick().
4697 */
4698int select_nohz_load_balancer(int stop_tick)
4699{
4700 int cpu = smp_processor_id();
4701
4702 if (stop_tick) {
4703 cpu_rq(cpu)->in_nohz_recently = 1;
4704
4705 if (!cpu_active(cpu)) {
4706 if (atomic_read(&nohz.load_balancer) != cpu)
4707 return 0;
4708
4709 /*
4710 * If we are going offline and still the leader,
4711 * give up!
4712 */
4713 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4714 BUG();
4715
4716 return 0;
4717 }
4718
4719 cpumask_set_cpu(cpu, nohz.cpu_mask);
4720
4721 /* time for ilb owner also to sleep */
4722 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4723 if (atomic_read(&nohz.load_balancer) == cpu)
4724 atomic_set(&nohz.load_balancer, -1);
4725 return 0;
4726 }
4727
4728 if (atomic_read(&nohz.load_balancer) == -1) {
4729 /* make me the ilb owner */
4730 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4731 return 1;
4732 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4733 int new_ilb;
4734
4735 if (!(sched_smt_power_savings ||
4736 sched_mc_power_savings))
4737 return 1;
4738 /*
4739 * Check to see if there is a more power-efficient
4740 * ilb.
4741 */
4742 new_ilb = find_new_ilb(cpu);
4743 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4744 atomic_set(&nohz.load_balancer, -1);
4745 resched_cpu(new_ilb);
4746 return 0;
4747 }
4748 return 1;
4749 }
4750 } else {
4751 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4752 return 0;
4753
4754 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4755
4756 if (atomic_read(&nohz.load_balancer) == cpu)
4757 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4758 BUG();
4759 }
4760 return 0;
4761}
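Ownership of the idle-load-balancer role is passed around with compare-and-swap on nohz.load_balancer, so only one CPU can win a nomination race and only the current owner can give the role up. A standalone sketch of that protocol using C11 atomics in place of the kernel's atomic_t API; claim_ilb() and release_ilb() are hypothetical helpers for illustration only:

#include <stdatomic.h>
#include <stdio.h>

/* -1 means "no idle load balancer", mirroring nohz.load_balancer. */
static atomic_int load_balancer = ATOMIC_VAR_INIT(-1);

/* Try to become the ilb owner; only one caller can win the cmpxchg race. */
static int claim_ilb(int cpu)
{
	int expected = -1;

	return atomic_compare_exchange_strong(&load_balancer, &expected, cpu);
}

/* Give up ownership; fails (the kernel BUG()s) if we weren't the owner. */
static int release_ilb(int cpu)
{
	int expected = cpu;

	return atomic_compare_exchange_strong(&load_balancer, &expected, -1);
}

int main(void)
{
	printf("cpu2 claims: %d\n", claim_ilb(2));	/* 1: won the race  */
	printf("cpu5 claims: %d\n", claim_ilb(5));	/* 0: already owned */
	printf("cpu2 releases: %d\n", release_ilb(2));	/* 1 */
	printf("owner now: %d\n", atomic_load(&load_balancer));
	return 0;
}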
4762#endif
4763
4764static DEFINE_SPINLOCK(balancing);
4765
4766/*
4767 * It checks each scheduling domain to see if it is due to be balanced,
4768 * and initiates a balancing operation if so.
4769 *
4770 * Balancing parameters are set up in arch_init_sched_domains.
4771 */
4772static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4773{
4774 int balance = 1;
4775 struct rq *rq = cpu_rq(cpu);
4776 unsigned long interval;
4777 struct sched_domain *sd;
4778 /* Earliest time when we have to do rebalance again */
4779 unsigned long next_balance = jiffies + 60*HZ;
4780 int update_next_balance = 0;
4781 int need_serialize;
4782
4783 for_each_domain(cpu, sd) {
4784 if (!(sd->flags & SD_LOAD_BALANCE))
4785 continue;
4786
4787 interval = sd->balance_interval;
4788 if (idle != CPU_IDLE)
4789 interval *= sd->busy_factor;
4790
4791 /* scale ms to jiffies */
4792 interval = msecs_to_jiffies(interval);
4793 if (unlikely(!interval))
4794 interval = 1;
4795 if (interval > HZ*NR_CPUS/10)
4796 interval = HZ*NR_CPUS/10;
4797
4798 need_serialize = sd->flags & SD_SERIALIZE;
4799
4800 if (need_serialize) {
4801 if (!spin_trylock(&balancing))
4802 goto out;
4803 }
4804
4805 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4806 if (load_balance(cpu, rq, sd, idle, &balance)) {
4807 /*
4808 * We've pulled tasks over so either we're no
4809 * longer idle, or one of our SMT siblings is
4810 * not idle.
4811 */
4812 idle = CPU_NOT_IDLE;
4813 }
4814 sd->last_balance = jiffies;
4815 }
4816 if (need_serialize)
4817 spin_unlock(&balancing);
4818out:
4819 if (time_after(next_balance, sd->last_balance + interval)) {
4820 next_balance = sd->last_balance + interval;
4821 update_next_balance = 1;
4822 }
4823
4824 /*
4825 * Stop the load balance at this level. There is another
4826 * CPU in our sched group which is doing load balancing more
4827 * actively.
4828 */
4829 if (!balance)
4830 break;
4831 }
4832
4833 /*
4834 * next_balance will be updated only when there is a need.
4835 * When the cpu is attached to the null domain, for example, it will not be
4836 * updated.
4837 */
4838 if (likely(update_next_balance))
4839 rq->next_balance = next_balance;
4840}
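The effective period used by rebalance_domains() is the per-domain balance_interval, stretched by busy_factor on a busy CPU, converted from milliseconds to jiffies and clamped to at least 1 and at most HZ*NR_CPUS/10. A standalone sketch of that arithmetic; HZ=250, NR_CPUS=64 and the simplified msecs_to_jiffies() are assumptions for the example only:

#include <stdio.h>

#define HZ		250	/* assumed tick rate for this sketch */
#define NR_CPUS		64	/* assumed; only used for the clamp  */

static unsigned long msecs_to_jiffies(unsigned long ms)
{
	return ms * HZ / 1000;		/* good enough for the example */
}

int main(void)
{
	unsigned long balance_interval = 64;	/* ms, per-domain value */
	unsigned int busy_factor = 32;
	int cpu_is_idle = 0;

	unsigned long interval = balance_interval;

	if (!cpu_is_idle)			/* busy cpu: balance less often */
		interval *= busy_factor;

	interval = msecs_to_jiffies(interval);
	if (!interval)
		interval = 1;
	if (interval > HZ * NR_CPUS / 10)	/* same clamp as rebalance_domains() */
		interval = HZ * NR_CPUS / 10;

	printf("effective interval: %lu jiffies (%lu ms at HZ=%d)\n",
	       interval, interval * 1000 / HZ, HZ);
	return 0;
}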
4841
4842/*
4843 * run_rebalance_domains is triggered when needed from the scheduler tick.
4844 * In the CONFIG_NO_HZ case, the idle load balance owner will do the
4845 * rebalancing for all the cpus for whom scheduler ticks are stopped.
4846 */
4847static void run_rebalance_domains(struct softirq_action *h)
4848{
4849 int this_cpu = smp_processor_id();
4850 struct rq *this_rq = cpu_rq(this_cpu);
4851 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4852 CPU_IDLE : CPU_NOT_IDLE;
4853
4854 rebalance_domains(this_cpu, idle);
4855
4856#ifdef CONFIG_NO_HZ
4857 /*
4858 * If this cpu is the owner for idle load balancing, then do the
4859 * balancing on behalf of the other idle cpus whose ticks are
4860 * stopped.
4861 */
4862 if (this_rq->idle_at_tick &&
4863 atomic_read(&nohz.load_balancer) == this_cpu) {
4864 struct rq *rq;
4865 int balance_cpu;
4866
4867 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4868 if (balance_cpu == this_cpu)
4869 continue;
4870
4871 /*
4872 * If this cpu gets work to do, stop the load balancing
4873 * work being done for other cpus. Next load
4874 * balancing owner will pick it up.
4875 */
4876 if (need_resched())
4877 break;
4878
4879 rebalance_domains(balance_cpu, CPU_IDLE);
4880
4881 rq = cpu_rq(balance_cpu);
4882 if (time_after(this_rq->next_balance, rq->next_balance))
4883 this_rq->next_balance = rq->next_balance;
4884 }
4885 }
4886#endif
4887}
4888
4889static inline int on_null_domain(int cpu)
4890{
4891 return !rcu_dereference(cpu_rq(cpu)->sd);
4892}
4893
4894/*
4895 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4896 *
4897 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
4898 * idle load balancing owner or decide to stop the periodic load balancing,
4899 * if the whole system is idle.
4900 */
4901static inline void trigger_load_balance(struct rq *rq, int cpu)
4902{
4903#ifdef CONFIG_NO_HZ
4904 /*
4905 * If we were in the nohz mode recently and busy at the current
4906 * scheduler tick, then check if we need to nominate a new idle
4907 * load balancer.
4908 */
4909 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4910 rq->in_nohz_recently = 0;
4911
4912 if (atomic_read(&nohz.load_balancer) == cpu) {
4913 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4914 atomic_set(&nohz.load_balancer, -1);
4915 }
4916
4917 if (atomic_read(&nohz.load_balancer) == -1) {
4918 int ilb = find_new_ilb(cpu);
4919
4920 if (ilb < nr_cpu_ids)
4921 resched_cpu(ilb);
4922 }
4923 }
4924
4925 /*
4926 * If this cpu is idle and doing idle load balancing for all the
4927 * cpus with ticks stopped, is it time for that to stop?
4928 */
4929 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4930 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4931 resched_cpu(cpu);
4932 return;
4933 }
4934
4935 /*
4936 * If this cpu is idle and the idle load balancing is done by
4937 * someone else, then there is no need to raise the SCHED_SOFTIRQ.
4938 */
4939 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4940 cpumask_test_cpu(cpu, nohz.cpu_mask))
4941 return;
4942#endif
4943 /* Don't need to rebalance while attached to NULL domain */
4944 if (time_after_eq(jiffies, rq->next_balance) &&
4945 likely(!on_null_domain(cpu)))
4946 raise_softirq(SCHED_SOFTIRQ);
4947}
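The time_after_eq() test above remains correct across a jiffies wraparound because the comparison is made on the signed difference of the two counters. A standalone sketch of the trick, essentially the kernel macro minus its type checking:

#include <stdio.h>

/*
 * Wraparound-safe jiffies comparison: the subtraction is done in unsigned
 * arithmetic and the result is interpreted as signed.
 */
#define time_after_eq(a, b)	((long)((a) - (b)) >= 0)

int main(void)
{
	unsigned long next_balance = (unsigned long)-5;	/* just before wrap */
	unsigned long jiffies;

	jiffies = (unsigned long)-10;	/* still before next_balance  */
	printf("%d\n", time_after_eq(jiffies, next_balance));	/* 0 */

	jiffies = 3;			/* after the counter wrapped  */
	printf("%d\n", time_after_eq(jiffies, next_balance));	/* 1 */
	return 0;
}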
4948
4949#else /* CONFIG_SMP */
4950
4951/*
4952 * on UP we do not need to balance between CPUs:
4953 */
4954static inline void idle_balance(int cpu, struct rq *rq)
4955{
4956}
4957
4958#endif 3115#endif
4959 3116
4960DEFINE_PER_CPU(struct kernel_stat, kstat); 3117DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -6079,7 +4236,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6079 if (running) 4236 if (running)
6080 p->sched_class->set_curr_task(rq); 4237 p->sched_class->set_curr_task(rq);
6081 if (on_rq) { 4238 if (on_rq) {
6082 enqueue_task(rq, p, 0); 4239 enqueue_task(rq, p, 0, oldprio < prio);
6083 4240
6084 check_class_changed(rq, p, prev_class, oldprio, running); 4241 check_class_changed(rq, p, prev_class, oldprio, running);
6085 } 4242 }
@@ -6123,7 +4280,7 @@ void set_user_nice(struct task_struct *p, long nice)
6123 delta = p->prio - old_prio; 4280 delta = p->prio - old_prio;
6124 4281
6125 if (on_rq) { 4282 if (on_rq) {
6126 enqueue_task(rq, p, 0); 4283 enqueue_task(rq, p, 0, false);
6127 /* 4284 /*
6128 * If the task increased its priority or is running and 4285 * If the task increased its priority or is running and
6129 * lowered its priority, then reschedule its CPU: 4286 * lowered its priority, then reschedule its CPU:
@@ -9452,7 +7609,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
9452 tg->rt_rq[cpu] = rt_rq; 7609 tg->rt_rq[cpu] = rt_rq;
9453 init_rt_rq(rt_rq, rq); 7610 init_rt_rq(rt_rq, rq);
9454 rt_rq->tg = tg; 7611 rt_rq->tg = tg;
9455 rt_rq->rt_se = rt_se;
9456 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7612 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
9457 if (add) 7613 if (add)
9458 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7614 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
@@ -9483,9 +7639,6 @@ void __init sched_init(void)
9483#ifdef CONFIG_RT_GROUP_SCHED 7639#ifdef CONFIG_RT_GROUP_SCHED
9484 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7640 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
9485#endif 7641#endif
9486#ifdef CONFIG_USER_SCHED
9487 alloc_size *= 2;
9488#endif
9489#ifdef CONFIG_CPUMASK_OFFSTACK 7642#ifdef CONFIG_CPUMASK_OFFSTACK
9490 alloc_size += num_possible_cpus() * cpumask_size(); 7643 alloc_size += num_possible_cpus() * cpumask_size();
9491#endif 7644#endif
@@ -9499,13 +7652,6 @@ void __init sched_init(void)
9499 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7652 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
9500 ptr += nr_cpu_ids * sizeof(void **); 7653 ptr += nr_cpu_ids * sizeof(void **);
9501 7654
9502#ifdef CONFIG_USER_SCHED
9503 root_task_group.se = (struct sched_entity **)ptr;
9504 ptr += nr_cpu_ids * sizeof(void **);
9505
9506 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
9507 ptr += nr_cpu_ids * sizeof(void **);
9508#endif /* CONFIG_USER_SCHED */
9509#endif /* CONFIG_FAIR_GROUP_SCHED */ 7655#endif /* CONFIG_FAIR_GROUP_SCHED */
9510#ifdef CONFIG_RT_GROUP_SCHED 7656#ifdef CONFIG_RT_GROUP_SCHED
9511 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7657 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -9514,13 +7660,6 @@ void __init sched_init(void)
9514 init_task_group.rt_rq = (struct rt_rq **)ptr; 7660 init_task_group.rt_rq = (struct rt_rq **)ptr;
9515 ptr += nr_cpu_ids * sizeof(void **); 7661 ptr += nr_cpu_ids * sizeof(void **);
9516 7662
9517#ifdef CONFIG_USER_SCHED
9518 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9519 ptr += nr_cpu_ids * sizeof(void **);
9520
9521 root_task_group.rt_rq = (struct rt_rq **)ptr;
9522 ptr += nr_cpu_ids * sizeof(void **);
9523#endif /* CONFIG_USER_SCHED */
9524#endif /* CONFIG_RT_GROUP_SCHED */ 7663#endif /* CONFIG_RT_GROUP_SCHED */
9525#ifdef CONFIG_CPUMASK_OFFSTACK 7664#ifdef CONFIG_CPUMASK_OFFSTACK
9526 for_each_possible_cpu(i) { 7665 for_each_possible_cpu(i) {
@@ -9540,22 +7679,13 @@ void __init sched_init(void)
9540#ifdef CONFIG_RT_GROUP_SCHED 7679#ifdef CONFIG_RT_GROUP_SCHED
9541 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7680 init_rt_bandwidth(&init_task_group.rt_bandwidth,
9542 global_rt_period(), global_rt_runtime()); 7681 global_rt_period(), global_rt_runtime());
9543#ifdef CONFIG_USER_SCHED
9544 init_rt_bandwidth(&root_task_group.rt_bandwidth,
9545 global_rt_period(), RUNTIME_INF);
9546#endif /* CONFIG_USER_SCHED */
9547#endif /* CONFIG_RT_GROUP_SCHED */ 7682#endif /* CONFIG_RT_GROUP_SCHED */
9548 7683
9549#ifdef CONFIG_GROUP_SCHED 7684#ifdef CONFIG_CGROUP_SCHED
9550 list_add(&init_task_group.list, &task_groups); 7685 list_add(&init_task_group.list, &task_groups);
9551 INIT_LIST_HEAD(&init_task_group.children); 7686 INIT_LIST_HEAD(&init_task_group.children);
9552 7687
9553#ifdef CONFIG_USER_SCHED 7688#endif /* CONFIG_CGROUP_SCHED */
9554 INIT_LIST_HEAD(&root_task_group.children);
9555 init_task_group.parent = &root_task_group;
9556 list_add(&init_task_group.siblings, &root_task_group.children);
9557#endif /* CONFIG_USER_SCHED */
9558#endif /* CONFIG_GROUP_SCHED */
9559 7689
9560#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP 7690#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9561 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), 7691 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
@@ -9595,25 +7725,6 @@ void __init sched_init(void)
9595 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7725 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
9596 */ 7726 */
9597 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7727 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
9598#elif defined CONFIG_USER_SCHED
9599 root_task_group.shares = NICE_0_LOAD;
9600 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
9601 /*
9602 * In case of task-groups formed thr' the user id of tasks,
9603 * init_task_group represents tasks belonging to root user.
9604 * Hence it forms a sibling of all subsequent groups formed.
9605 * In this case, init_task_group gets only a fraction of overall
9606 * system cpu resource, based on the weight assigned to root
9607 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9608 * by letting tasks of init_task_group sit in a separate cfs_rq
9609 * (init_tg_cfs_rq) and having one entity represent this group of
9610 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9611 */
9612 init_tg_cfs_entry(&init_task_group,
9613 &per_cpu(init_tg_cfs_rq, i),
9614 &per_cpu(init_sched_entity, i), i, 1,
9615 root_task_group.se[i]);
9616
9617#endif 7728#endif
9618#endif /* CONFIG_FAIR_GROUP_SCHED */ 7729#endif /* CONFIG_FAIR_GROUP_SCHED */
9619 7730
@@ -9622,12 +7733,6 @@ void __init sched_init(void)
9622 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7733 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
9623#ifdef CONFIG_CGROUP_SCHED 7734#ifdef CONFIG_CGROUP_SCHED
9624 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 7735 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
9625#elif defined CONFIG_USER_SCHED
9626 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9627 init_tg_rt_entry(&init_task_group,
9628 &per_cpu(init_rt_rq_var, i),
9629 &per_cpu(init_sched_rt_entity, i), i, 1,
9630 root_task_group.rt_se[i]);
9631#endif 7736#endif
9632#endif 7737#endif
9633 7738
@@ -9712,7 +7817,7 @@ static inline int preempt_count_equals(int preempt_offset)
9712 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 7817 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9713} 7818}
9714 7819
9715void __might_sleep(char *file, int line, int preempt_offset) 7820void __might_sleep(const char *file, int line, int preempt_offset)
9716{ 7821{
9717#ifdef in_atomic 7822#ifdef in_atomic
9718 static unsigned long prev_jiffy; /* ratelimiting */ 7823 static unsigned long prev_jiffy; /* ratelimiting */
@@ -10023,7 +8128,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
10023} 8128}
10024#endif /* CONFIG_RT_GROUP_SCHED */ 8129#endif /* CONFIG_RT_GROUP_SCHED */
10025 8130
10026#ifdef CONFIG_GROUP_SCHED 8131#ifdef CONFIG_CGROUP_SCHED
10027static void free_sched_group(struct task_group *tg) 8132static void free_sched_group(struct task_group *tg)
10028{ 8133{
10029 free_fair_sched_group(tg); 8134 free_fair_sched_group(tg);
@@ -10128,11 +8233,11 @@ void sched_move_task(struct task_struct *tsk)
10128 if (unlikely(running)) 8233 if (unlikely(running))
10129 tsk->sched_class->set_curr_task(rq); 8234 tsk->sched_class->set_curr_task(rq);
10130 if (on_rq) 8235 if (on_rq)
10131 enqueue_task(rq, tsk, 0); 8236 enqueue_task(rq, tsk, 0, false);
10132 8237
10133 task_rq_unlock(rq, &flags); 8238 task_rq_unlock(rq, &flags);
10134} 8239}
10135#endif /* CONFIG_GROUP_SCHED */ 8240#endif /* CONFIG_CGROUP_SCHED */
10136 8241
10137#ifdef CONFIG_FAIR_GROUP_SCHED 8242#ifdef CONFIG_FAIR_GROUP_SCHED
10138static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8243static void __set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -10274,13 +8379,6 @@ static int tg_schedulable(struct task_group *tg, void *data)
10274 runtime = d->rt_runtime; 8379 runtime = d->rt_runtime;
10275 } 8380 }
10276 8381
10277#ifdef CONFIG_USER_SCHED
10278 if (tg == &root_task_group) {
10279 period = global_rt_period();
10280 runtime = global_rt_runtime();
10281 }
10282#endif
10283
10284 /* 8382 /*
10285 * Cannot have more runtime than the period. 8383 * Cannot have more runtime than the period.
10286 */ 8384 */
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 597b33099dfa..eeb3506c4834 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -47,9 +47,7 @@ static int convert_prio(int prio)
47} 47}
48 48
49#define for_each_cpupri_active(array, idx) \ 49#define for_each_cpupri_active(array, idx) \
50 for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ 50 for_each_bit(idx, array, CPUPRI_NR_PRIORITIES)
51 idx < CPUPRI_NR_PRIORITIES; \
52 idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
53 51
54/** 52/**
55 * cpupri_find - find the best (lowest-pri) CPU in the system 53 * cpupri_find - find the best (lowest-pri) CPU in the system
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 8fe7ee81c552..b45abbe55067 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1053,7 +1053,8 @@ static inline void hrtick_update(struct rq *rq)
1053 * increased. Here we update the fair scheduling stats and 1053 * increased. Here we update the fair scheduling stats and
1054 * then put the task into the rbtree: 1054 * then put the task into the rbtree:
1055 */ 1055 */
1056static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) 1056static void
1057enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1057{ 1058{
1058 struct cfs_rq *cfs_rq; 1059 struct cfs_rq *cfs_rq;
1059 struct sched_entity *se = &p->se; 1060 struct sched_entity *se = &p->se;
@@ -1815,57 +1816,164 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1815 */ 1816 */
1816 1817
1817/* 1818/*
1818 * Load-balancing iterator. Note: while the runqueue stays locked 1819 * pull_task - move a task from a remote runqueue to the local runqueue.
1819 * during the whole iteration, the current task might be 1820 * Both runqueues must be locked.
1820 * dequeued so the iterator has to be dequeue-safe. Here we
1821 * achieve that by always pre-iterating before returning
1822 * the current task:
1823 */ 1821 */
1824static struct task_struct * 1822static void pull_task(struct rq *src_rq, struct task_struct *p,
1825__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) 1823 struct rq *this_rq, int this_cpu)
1826{ 1824{
1827 struct task_struct *p = NULL; 1825 deactivate_task(src_rq, p, 0);
1828 struct sched_entity *se; 1826 set_task_cpu(p, this_cpu);
1827 activate_task(this_rq, p, 0);
1828 check_preempt_curr(this_rq, p, 0);
1829}
1829 1830
1830 if (next == &cfs_rq->tasks) 1831/*
1831 return NULL; 1832 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1833 */
1834static
1835int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1836 struct sched_domain *sd, enum cpu_idle_type idle,
1837 int *all_pinned)
1838{
1839 int tsk_cache_hot = 0;
1840 /*
1841 * We do not migrate tasks that are:
1842 * 1) running (obviously), or
1843 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1844 * 3) are cache-hot on their current CPU.
1845 */
1846 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
1847 schedstat_inc(p, se.nr_failed_migrations_affine);
1848 return 0;
1849 }
1850 *all_pinned = 0;
1832 1851
1833 se = list_entry(next, struct sched_entity, group_node); 1852 if (task_running(rq, p)) {
1834 p = task_of(se); 1853 schedstat_inc(p, se.nr_failed_migrations_running);
1835 cfs_rq->balance_iterator = next->next; 1854 return 0;
1855 }
1836 1856
1837 return p; 1857 /*
1838} 1858 * Aggressive migration if:
1859 * 1) task is cache cold, or
1860 * 2) too many balance attempts have failed.
1861 */
1839 1862
1840static struct task_struct *load_balance_start_fair(void *arg) 1863 tsk_cache_hot = task_hot(p, rq->clock, sd);
1841{ 1864 if (!tsk_cache_hot ||
1842 struct cfs_rq *cfs_rq = arg; 1865 sd->nr_balance_failed > sd->cache_nice_tries) {
1866#ifdef CONFIG_SCHEDSTATS
1867 if (tsk_cache_hot) {
1868 schedstat_inc(sd, lb_hot_gained[idle]);
1869 schedstat_inc(p, se.nr_forced_migrations);
1870 }
1871#endif
1872 return 1;
1873 }
1843 1874
1844 return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); 1875 if (tsk_cache_hot) {
1876 schedstat_inc(p, se.nr_failed_migrations_hot);
1877 return 0;
1878 }
1879 return 1;
1845} 1880}
1846 1881
1847static struct task_struct *load_balance_next_fair(void *arg) 1882/*
1883 * move_one_task tries to move exactly one task from busiest to this_rq, as
1884 * part of active balancing operations within "domain".
1885 * Returns 1 if successful and 0 otherwise.
1886 *
1887 * Called with both runqueues locked.
1888 */
1889static int
1890move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1891 struct sched_domain *sd, enum cpu_idle_type idle)
1848{ 1892{
1849 struct cfs_rq *cfs_rq = arg; 1893 struct task_struct *p, *n;
1894 struct cfs_rq *cfs_rq;
1895 int pinned = 0;
1896
1897 for_each_leaf_cfs_rq(busiest, cfs_rq) {
1898 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
1899
1900 if (!can_migrate_task(p, busiest, this_cpu,
1901 sd, idle, &pinned))
1902 continue;
1903
1904 pull_task(busiest, p, this_rq, this_cpu);
1905 /*
1906 * Right now, this is only the second place pull_task()
1907 * is called, so we can safely collect pull_task()
1908 * stats here rather than inside pull_task().
1909 */
1910 schedstat_inc(sd, lb_gained[idle]);
1911 return 1;
1912 }
1913 }
1850 1914
1851 return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); 1915 return 0;
1852} 1916}
1853 1917
1854static unsigned long 1918static unsigned long
1855__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1919balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1856 unsigned long max_load_move, struct sched_domain *sd, 1920 unsigned long max_load_move, struct sched_domain *sd,
1857 enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, 1921 enum cpu_idle_type idle, int *all_pinned,
1858 struct cfs_rq *cfs_rq) 1922 int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
1859{ 1923{
1860 struct rq_iterator cfs_rq_iterator; 1924 int loops = 0, pulled = 0, pinned = 0;
1925 long rem_load_move = max_load_move;
1926 struct task_struct *p, *n;
1861 1927
1862 cfs_rq_iterator.start = load_balance_start_fair; 1928 if (max_load_move == 0)
1863 cfs_rq_iterator.next = load_balance_next_fair; 1929 goto out;
1864 cfs_rq_iterator.arg = cfs_rq;
1865 1930
1866 return balance_tasks(this_rq, this_cpu, busiest, 1931 pinned = 1;
1867 max_load_move, sd, idle, all_pinned, 1932
1868 this_best_prio, &cfs_rq_iterator); 1933 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
1934 if (loops++ > sysctl_sched_nr_migrate)
1935 break;
1936
1937 if ((p->se.load.weight >> 1) > rem_load_move ||
1938 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
1939 continue;
1940
1941 pull_task(busiest, p, this_rq, this_cpu);
1942 pulled++;
1943 rem_load_move -= p->se.load.weight;
1944
1945#ifdef CONFIG_PREEMPT
1946 /*
1947 * NEWIDLE balancing is a source of latency, so preemptible
1948 * kernels will stop after the first task is pulled to minimize
1949 * the critical section.
1950 */
1951 if (idle == CPU_NEWLY_IDLE)
1952 break;
1953#endif
1954
1955 /*
1956 * We only want to steal up to the prescribed amount of
1957 * weighted load.
1958 */
1959 if (rem_load_move <= 0)
1960 break;
1961
1962 if (p->prio < *this_best_prio)
1963 *this_best_prio = p->prio;
1964 }
1965out:
1966 /*
1967 * Right now, this is one of only two places pull_task() is called,
1968 * so we can safely collect pull_task() stats here rather than
1969 * inside pull_task().
1970 */
1971 schedstat_add(sd, lb_gained[idle], pulled);
1972
1973 if (all_pinned)
1974 *all_pinned = pinned;
1975
1976 return max_load_move - rem_load_move;
1869} 1977}
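balance_tasks() walks the busiest cfs_rq's task list and keeps pulling until the weighted-load budget handed down from move_tasks() is exhausted, skipping any task whose half-weight alone no longer fits. A standalone sketch of that budget accounting; the task weights are made up:

#include <stdio.h>

/* Hypothetical candidate tasks, represented only by their load weight. */
static const unsigned long weights[] = { 512, 3072, 1024, 1024, 2048 };

int main(void)
{
	long rem_load_move = 2048;	/* budget handed down by move_tasks() */
	int pulled = 0;

	for (unsigned int i = 0; i < sizeof(weights) / sizeof(weights[0]); i++) {
		/*
		 * Same filter as balance_tasks(): skip a task if even half
		 * its weight does not fit into the remaining budget.
		 */
		if ((long)(weights[i] >> 1) > rem_load_move)
			continue;

		rem_load_move -= (long)weights[i];	/* "pull" the task */
		pulled++;

		if (rem_load_move <= 0)		/* stole the prescribed amount */
			break;
	}
	printf("pulled %d tasks, remaining budget %ld\n", pulled, rem_load_move);
	return 0;
}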
1870 1978
1871#ifdef CONFIG_FAIR_GROUP_SCHED 1979#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1897,9 +2005,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1897 rem_load = (u64)rem_load_move * busiest_weight; 2005 rem_load = (u64)rem_load_move * busiest_weight;
1898 rem_load = div_u64(rem_load, busiest_h_load + 1); 2006 rem_load = div_u64(rem_load, busiest_h_load + 1);
1899 2007
1900 moved_load = __load_balance_fair(this_rq, this_cpu, busiest, 2008 moved_load = balance_tasks(this_rq, this_cpu, busiest,
1901 rem_load, sd, idle, all_pinned, this_best_prio, 2009 rem_load, sd, idle, all_pinned, this_best_prio,
1902 tg->cfs_rq[busiest_cpu]); 2010 busiest_cfs_rq);
1903 2011
1904 if (!moved_load) 2012 if (!moved_load)
1905 continue; 2013 continue;
@@ -1922,35 +2030,1488 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1922 struct sched_domain *sd, enum cpu_idle_type idle, 2030 struct sched_domain *sd, enum cpu_idle_type idle,
1923 int *all_pinned, int *this_best_prio) 2031 int *all_pinned, int *this_best_prio)
1924{ 2032{
1925 return __load_balance_fair(this_rq, this_cpu, busiest, 2033 return balance_tasks(this_rq, this_cpu, busiest,
1926 max_load_move, sd, idle, all_pinned, 2034 max_load_move, sd, idle, all_pinned,
1927 this_best_prio, &busiest->cfs); 2035 this_best_prio, &busiest->cfs);
1928} 2036}
1929#endif 2037#endif
1930 2038
1931static int 2039/*
1932move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2040 * move_tasks tries to move up to max_load_move weighted load from busiest to
1933 struct sched_domain *sd, enum cpu_idle_type idle) 2041 * this_rq, as part of a balancing operation within domain "sd".
2042 * Returns 1 if successful and 0 otherwise.
2043 *
2044 * Called with both runqueues locked.
2045 */
2046static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2047 unsigned long max_load_move,
2048 struct sched_domain *sd, enum cpu_idle_type idle,
2049 int *all_pinned)
1934{ 2050{
1935 struct cfs_rq *busy_cfs_rq; 2051 unsigned long total_load_moved = 0, load_moved;
1936 struct rq_iterator cfs_rq_iterator; 2052 int this_best_prio = this_rq->curr->prio;
2053
2054 do {
2055 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
2056 max_load_move - total_load_moved,
2057 sd, idle, all_pinned, &this_best_prio);
1937 2058
1938 cfs_rq_iterator.start = load_balance_start_fair; 2059 total_load_moved += load_moved;
1939 cfs_rq_iterator.next = load_balance_next_fair;
1940 2060
1941 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 2061#ifdef CONFIG_PREEMPT
1942 /* 2062 /*
1943 * pass busy_cfs_rq argument into 2063 * NEWIDLE balancing is a source of latency, so preemptible
1944 * load_balance_[start|next]_fair iterators 2064 * kernels will stop after the first task is pulled to minimize
2065 * the critical section.
1945 */ 2066 */
1946 cfs_rq_iterator.arg = busy_cfs_rq; 2067 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
1947 if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, 2068 break;
1948 &cfs_rq_iterator)) 2069
1949 return 1; 2070 if (raw_spin_is_contended(&this_rq->lock) ||
2071 raw_spin_is_contended(&busiest->lock))
2072 break;
2073#endif
2074 } while (load_moved && max_load_move > total_load_moved);
2075
2076 return total_load_moved > 0;
2077}
2078
2079/********** Helpers for find_busiest_group ************************/
2080/*
2081 * sd_lb_stats - Structure to store the statistics of a sched_domain
2082 * during load balancing.
2083 */
2084struct sd_lb_stats {
2085 struct sched_group *busiest; /* Busiest group in this sd */
2086 struct sched_group *this; /* Local group in this sd */
2087 unsigned long total_load; /* Total load of all groups in sd */
2088 unsigned long total_pwr; /* Total power of all groups in sd */
2089 unsigned long avg_load; /* Average load across all groups in sd */
2090
2091 /** Statistics of this group */
2092 unsigned long this_load;
2093 unsigned long this_load_per_task;
2094 unsigned long this_nr_running;
2095
2096 /* Statistics of the busiest group */
2097 unsigned long max_load;
2098 unsigned long busiest_load_per_task;
2099 unsigned long busiest_nr_running;
2100
2101 int group_imb; /* Is there imbalance in this sd */
2102#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2103 int power_savings_balance; /* Is powersave balance needed for this sd */
2104 struct sched_group *group_min; /* Least loaded group in sd */
2105 struct sched_group *group_leader; /* Group which relieves group_min */
2106 unsigned long min_load_per_task; /* load_per_task in group_min */
2107 unsigned long leader_nr_running; /* Nr running of group_leader */
2108 unsigned long min_nr_running; /* Nr running of group_min */
2109#endif
2110};
2111
2112/*
2113 * sg_lb_stats - stats of a sched_group required for load balancing
2114 */
2115struct sg_lb_stats {
2116 unsigned long avg_load; /*Avg load across the CPUs of the group */
2117 unsigned long group_load; /* Total load over the CPUs of the group */
2118 unsigned long sum_nr_running; /* Nr tasks running in the group */
2119 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2120 unsigned long group_capacity;
2121 int group_imb; /* Is there an imbalance in the group ? */
2122};
2123
2124/**
2125 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
2126 * @group: The group whose first cpu is to be returned.
2127 */
2128static inline unsigned int group_first_cpu(struct sched_group *group)
2129{
2130 return cpumask_first(sched_group_cpus(group));
2131}
2132
2133/**
2134 * get_sd_load_idx - Obtain the load index for a given sched domain.
2135 * @sd: The sched_domain whose load_idx is to be obtained.
2136 * @idle: The idle status of the CPU whose sd load_idx is being obtained.
2137 */
2138static inline int get_sd_load_idx(struct sched_domain *sd,
2139 enum cpu_idle_type idle)
2140{
2141 int load_idx;
2142
2143 switch (idle) {
2144 case CPU_NOT_IDLE:
2145 load_idx = sd->busy_idx;
2146 break;
2147
2148 case CPU_NEWLY_IDLE:
2149 load_idx = sd->newidle_idx;
2150 break;
2151 default:
2152 load_idx = sd->idle_idx;
2153 break;
2154 }
2155
2156 return load_idx;
2157}
2158
2159
2160#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2161/**
2162 * init_sd_power_savings_stats - Initialize power savings statistics for
2163 * the given sched_domain, during load balancing.
2164 *
2165 * @sd: Sched domain whose power-savings statistics are to be initialized.
2166 * @sds: Variable containing the statistics for sd.
2167 * @idle: Idle status of the CPU at which we're performing load-balancing.
2168 */
2169static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2170 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2171{
2172 /*
2173 * Busy processors will not participate in power savings
2174 * balance.
2175 */
2176 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2177 sds->power_savings_balance = 0;
2178 else {
2179 sds->power_savings_balance = 1;
2180 sds->min_nr_running = ULONG_MAX;
2181 sds->leader_nr_running = 0;
2182 }
2183}
2184
2185/**
2186 * update_sd_power_savings_stats - Update the power saving stats for a
2187 * sched_domain while performing load balancing.
2188 *
2189 * @group: sched_group belonging to the sched_domain under consideration.
2190 * @sds: Variable containing the statistics of the sched_domain
2191 * @local_group: Does group contain the CPU for which we're performing
2192 * load balancing ?
2193 * @sgs: Variable containing the statistics of the group.
2194 */
2195static inline void update_sd_power_savings_stats(struct sched_group *group,
2196 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2197{
2198
2199 if (!sds->power_savings_balance)
2200 return;
2201
2202 /*
2203 * If the local group is idle or completely loaded,
2204 * there is no need to do power savings balance at this domain.
2205 */
2206 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
2207 !sds->this_nr_running))
2208 sds->power_savings_balance = 0;
2209
2210 /*
2211 * If a group is already running at full capacity or idle,
2212 * don't include that group in power savings calculations
2213 */
2214 if (!sds->power_savings_balance ||
2215 sgs->sum_nr_running >= sgs->group_capacity ||
2216 !sgs->sum_nr_running)
2217 return;
2218
2219 /*
2220 * Calculate the group which has the least non-idle load.
2221 * This is the group from which we need to pick up the load
2222 * in order to save power.
2223 */
2224 if ((sgs->sum_nr_running < sds->min_nr_running) ||
2225 (sgs->sum_nr_running == sds->min_nr_running &&
2226 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
2227 sds->group_min = group;
2228 sds->min_nr_running = sgs->sum_nr_running;
2229 sds->min_load_per_task = sgs->sum_weighted_load /
2230 sgs->sum_nr_running;
1950 } 2231 }
1951 2232
2233 /*
2234 * Calculate the group which is nearly at its
2235 * capacity but still has some space to pick up some load
2236 * from other groups and save more power.
2237 */
2238 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
2239 return;
2240
2241 if (sgs->sum_nr_running > sds->leader_nr_running ||
2242 (sgs->sum_nr_running == sds->leader_nr_running &&
2243 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
2244 sds->group_leader = group;
2245 sds->leader_nr_running = sgs->sum_nr_running;
2246 }
2247}
2248
2249/**
2250 * check_power_save_busiest_group - see if there is potential for some power-savings balance
2251 * @sds: Variable containing the statistics of the sched_domain
2252 * under consideration.
2253 * @this_cpu: Cpu at which we're currently performing load-balancing.
2254 * @imbalance: Variable to store the imbalance.
2255 *
2256 * Description:
2257 * Check if we have the potential to perform some power-savings balance.
2258 * If yes, set the busiest group to be the least loaded group in the
2259 * sched_domain, so that its CPUs can be put to idle.
2260 *
2261 * Returns 1 if there is potential to perform power-savings balance.
2262 * Else returns 0.
2263 */
2264static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2265 int this_cpu, unsigned long *imbalance)
2266{
2267 if (!sds->power_savings_balance)
2268 return 0;
2269
2270 if (sds->this != sds->group_leader ||
2271 sds->group_leader == sds->group_min)
2272 return 0;
2273
2274 *imbalance = sds->min_load_per_task;
2275 sds->busiest = sds->group_min;
2276
2277 return 1;
2278
2279}
2280#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2281static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2282 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2283{
2284 return;
2285}
2286
2287static inline void update_sd_power_savings_stats(struct sched_group *group,
2288 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2289{
2290 return;
2291}
2292
2293static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2294 int this_cpu, unsigned long *imbalance)
2295{
1952 return 0; 2296 return 0;
1953} 2297}
2298#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2299
2300
2301unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
2302{
2303 return SCHED_LOAD_SCALE;
2304}
2305
2306unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
2307{
2308 return default_scale_freq_power(sd, cpu);
2309}
2310
2311unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
2312{
2313 unsigned long weight = cpumask_weight(sched_domain_span(sd));
2314 unsigned long smt_gain = sd->smt_gain;
2315
2316 smt_gain /= weight;
2317
2318 return smt_gain;
2319}
2320
2321unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
2322{
2323 return default_scale_smt_power(sd, cpu);
2324}
2325
2326unsigned long scale_rt_power(int cpu)
2327{
2328 struct rq *rq = cpu_rq(cpu);
2329 u64 total, available;
2330
2331 sched_avg_update(rq);
2332
2333 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2334 available = total - rq->rt_avg;
2335
2336 if (unlikely((s64)total < SCHED_LOAD_SCALE))
2337 total = SCHED_LOAD_SCALE;
2338
2339 total >>= SCHED_LOAD_SHIFT;
2340
2341 return div_u64(available, total);
2342}
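scale_rt_power() returns, in 1/1024 units, the fraction of the averaging window not consumed by real-time tasks; update_cpu_power() later multiplies cpu_power by that fraction. A standalone sketch of the same arithmetic with made-up nanosecond values:

#include <stdio.h>

#define SCHED_LOAD_SCALE	1024ULL
#define SCHED_LOAD_SHIFT	10

/* Fraction of CPU time left for CFS after RT tasks, in 1/1024 units. */
static unsigned long long rt_scale(unsigned long long total_ns,
				   unsigned long long rt_avg_ns)
{
	unsigned long long available = total_ns - rt_avg_ns;

	if (total_ns < SCHED_LOAD_SCALE)
		total_ns = SCHED_LOAD_SCALE;
	total_ns >>= SCHED_LOAD_SHIFT;

	return available / total_ns;	/* ~ available * 1024 / total */
}

int main(void)
{
	/* RT tasks used 250ms out of a 1s averaging window (made-up numbers). */
	printf("%llu / 1024\n", rt_scale(1000000000ULL, 250000000ULL));
	return 0;
}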
2343
2344static void update_cpu_power(struct sched_domain *sd, int cpu)
2345{
2346 unsigned long weight = cpumask_weight(sched_domain_span(sd));
2347 unsigned long power = SCHED_LOAD_SCALE;
2348 struct sched_group *sdg = sd->groups;
2349
2350 if (sched_feat(ARCH_POWER))
2351 power *= arch_scale_freq_power(sd, cpu);
2352 else
2353 power *= default_scale_freq_power(sd, cpu);
2354
2355 power >>= SCHED_LOAD_SHIFT;
2356
2357 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
2358 if (sched_feat(ARCH_POWER))
2359 power *= arch_scale_smt_power(sd, cpu);
2360 else
2361 power *= default_scale_smt_power(sd, cpu);
2362
2363 power >>= SCHED_LOAD_SHIFT;
2364 }
2365
2366 power *= scale_rt_power(cpu);
2367 power >>= SCHED_LOAD_SHIFT;
2368
2369 if (!power)
2370 power = 1;
2371
2372 sdg->cpu_power = power;
2373}
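Each correction applied by update_cpu_power() is a fixed-point multiply by a factor expressed in 1/1024 units followed by a right shift, so the factors simply compose. A standalone sketch chaining a few illustrative factors; 589 stands in for a halved smt_gain and 768 for a CPU that loses a quarter of its time to RT, both made-up values:

#include <stdio.h>

#define SCHED_LOAD_SCALE	1024UL
#define SCHED_LOAD_SHIFT	10

/* Apply one 1024-based scale factor to a fixed-point power value. */
static unsigned long apply_scale(unsigned long power, unsigned long factor)
{
	return (power * factor) >> SCHED_LOAD_SHIFT;
}

int main(void)
{
	unsigned long power = SCHED_LOAD_SCALE;	/* start at nominal 1024 */

	power = apply_scale(power, 1024);	/* freq scaling: no change */
	power = apply_scale(power, 589);	/* SMT sibling share       */
	power = apply_scale(power, 768);	/* time eaten by RT tasks  */

	if (!power)		/* update_cpu_power() never lets it hit 0 */
		power = 1;

	printf("effective cpu_power: %lu (nominal %lu)\n",
	       power, SCHED_LOAD_SCALE);
	return 0;
}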
2374
2375static void update_group_power(struct sched_domain *sd, int cpu)
2376{
2377 struct sched_domain *child = sd->child;
2378 struct sched_group *group, *sdg = sd->groups;
2379 unsigned long power;
2380
2381 if (!child) {
2382 update_cpu_power(sd, cpu);
2383 return;
2384 }
2385
2386 power = 0;
2387
2388 group = child->groups;
2389 do {
2390 power += group->cpu_power;
2391 group = group->next;
2392 } while (group != child->groups);
2393
2394 sdg->cpu_power = power;
2395}
2396
2397/**
2398 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
2399 * @sd: The sched_domain whose statistics are to be updated.
2400 * @group: sched_group whose statistics are to be updated.
2401 * @this_cpu: Cpu for which load balance is currently performed.
2402 * @idle: Idle status of this_cpu
2403 * @load_idx: Load index of sched_domain of this_cpu for load calc.
2404 * @sd_idle: Idle status of the sched_domain containing group.
2405 * @local_group: Does group contain this_cpu.
2406 * @cpus: Set of cpus considered for load balancing.
2407 * @balance: Should we balance.
2408 * @sgs: variable to hold the statistics for this group.
2409 */
2410static inline void update_sg_lb_stats(struct sched_domain *sd,
2411 struct sched_group *group, int this_cpu,
2412 enum cpu_idle_type idle, int load_idx, int *sd_idle,
2413 int local_group, const struct cpumask *cpus,
2414 int *balance, struct sg_lb_stats *sgs)
2415{
2416 unsigned long load, max_cpu_load, min_cpu_load;
2417 int i;
2418 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2419 unsigned long sum_avg_load_per_task;
2420 unsigned long avg_load_per_task;
2421
2422 if (local_group)
2423 balance_cpu = group_first_cpu(group);
2424
2425 /* Tally up the load of all CPUs in the group */
2426 sum_avg_load_per_task = avg_load_per_task = 0;
2427 max_cpu_load = 0;
2428 min_cpu_load = ~0UL;
2429
2430 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2431 struct rq *rq = cpu_rq(i);
2432
2433 if (*sd_idle && rq->nr_running)
2434 *sd_idle = 0;
2435
2436 /* Bias balancing toward cpus of our domain */
2437 if (local_group) {
2438 if (idle_cpu(i) && !first_idle_cpu) {
2439 first_idle_cpu = 1;
2440 balance_cpu = i;
2441 }
2442
2443 load = target_load(i, load_idx);
2444 } else {
2445 load = source_load(i, load_idx);
2446 if (load > max_cpu_load)
2447 max_cpu_load = load;
2448 if (min_cpu_load > load)
2449 min_cpu_load = load;
2450 }
2451
2452 sgs->group_load += load;
2453 sgs->sum_nr_running += rq->nr_running;
2454 sgs->sum_weighted_load += weighted_cpuload(i);
2455
2456 sum_avg_load_per_task += cpu_avg_load_per_task(i);
2457 }
2458
2459 /*
2460 * The first idle cpu or the first cpu (busiest) in this sched group
2461 * is eligible for doing load balancing at this and higher
2462 * domains. In the newly idle case, we allow all the cpus
2463 * to do the newly idle load balance.
2464 */
2465 if (idle != CPU_NEWLY_IDLE && local_group &&
2466 balance_cpu != this_cpu) {
2467 *balance = 0;
2468 return;
2469 }
2470
2471 update_group_power(sd, this_cpu);
2472
2473 /* Adjust by relative CPU power of the group */
2474 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
2475
2476
2477 /*
2478 * Consider the group unbalanced when the imbalance is larger
2479 * than the average weight of two tasks.
2480 *
2481 * APZ: with cgroup the avg task weight can vary wildly and
2482 * might not be a suitable number - should we keep a
2483 * normalized nr_running number somewhere that negates
2484 * the hierarchy?
2485 */
2486 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
2487 group->cpu_power;
2488
2489 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
2490 sgs->group_imb = 1;
2491
2492 sgs->group_capacity =
2493 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2494}
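The two derived values computed at the end, avg_load and group_capacity, normalize the group's accumulated load by its cpu_power and round cpu_power into a whole-task capacity. A standalone sketch with made-up numbers for a group consisting of one full CPU plus one weak sibling; DIV_ROUND_CLOSEST mirrors the kernel macro of the same name:

#include <stdio.h>

#define SCHED_LOAD_SCALE	1024UL

#define DIV_ROUND_CLOSEST(x, d)	(((x) + ((d) / 2)) / (d))

int main(void)
{
	/* Made-up group: total weighted load and total cpu_power of its CPUs. */
	unsigned long group_load = 3072;
	unsigned long group_power = 1465;	/* one full CPU + one weak sibling */

	/* Same normalization as update_sg_lb_stats(). */
	unsigned long avg_load = group_load * SCHED_LOAD_SCALE / group_power;
	unsigned long capacity = DIV_ROUND_CLOSEST(group_power, SCHED_LOAD_SCALE);

	printf("avg_load = %lu, group_capacity = %lu task(s)\n",
	       avg_load, capacity);
	return 0;
}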
2495
2496/**
2497 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
2498 * @sd: sched_domain whose statistics are to be updated.
2499 * @this_cpu: Cpu for which load balance is currently performed.
2500 * @idle: Idle status of this_cpu
2501 * @sd_idle: Idle status of the sched_domain containing group.
2502 * @cpus: Set of cpus considered for load balancing.
2503 * @balance: Should we balance.
2504 * @sds: variable to hold the statistics for this sched_domain.
2505 */
2506static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2507 enum cpu_idle_type idle, int *sd_idle,
2508 const struct cpumask *cpus, int *balance,
2509 struct sd_lb_stats *sds)
2510{
2511 struct sched_domain *child = sd->child;
2512 struct sched_group *group = sd->groups;
2513 struct sg_lb_stats sgs;
2514 int load_idx, prefer_sibling = 0;
2515
2516 if (child && child->flags & SD_PREFER_SIBLING)
2517 prefer_sibling = 1;
2518
2519 init_sd_power_savings_stats(sd, sds, idle);
2520 load_idx = get_sd_load_idx(sd, idle);
2521
2522 do {
2523 int local_group;
2524
2525 local_group = cpumask_test_cpu(this_cpu,
2526 sched_group_cpus(group));
2527 memset(&sgs, 0, sizeof(sgs));
2528 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
2529 local_group, cpus, balance, &sgs);
2530
2531 if (local_group && !(*balance))
2532 return;
2533
2534 sds->total_load += sgs.group_load;
2535 sds->total_pwr += group->cpu_power;
2536
2537 /*
2538 * In case the child domain prefers tasks go to siblings
2539 * first, lower the group capacity to one so that we'll try
2540 * and move all the excess tasks away.
2541 */
2542 if (prefer_sibling)
2543 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2544
2545 if (local_group) {
2546 sds->this_load = sgs.avg_load;
2547 sds->this = group;
2548 sds->this_nr_running = sgs.sum_nr_running;
2549 sds->this_load_per_task = sgs.sum_weighted_load;
2550 } else if (sgs.avg_load > sds->max_load &&
2551 (sgs.sum_nr_running > sgs.group_capacity ||
2552 sgs.group_imb)) {
2553 sds->max_load = sgs.avg_load;
2554 sds->busiest = group;
2555 sds->busiest_nr_running = sgs.sum_nr_running;
2556 sds->busiest_load_per_task = sgs.sum_weighted_load;
2557 sds->group_imb = sgs.group_imb;
2558 }
2559
2560 update_sd_power_savings_stats(group, sds, local_group, &sgs);
2561 group = group->next;
2562 } while (group != sd->groups);
2563}
2564
2565/**
2566 * fix_small_imbalance - Calculate the minor imbalance that exists
2567 * amongst the groups of a sched_domain, during
2568 * load balancing.
2569 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
2570 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
2571 * @imbalance: Variable to store the imbalance.
2572 */
2573static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2574 int this_cpu, unsigned long *imbalance)
2575{
2576 unsigned long tmp, pwr_now = 0, pwr_move = 0;
2577 unsigned int imbn = 2;
2578
2579 if (sds->this_nr_running) {
2580 sds->this_load_per_task /= sds->this_nr_running;
2581 if (sds->busiest_load_per_task >
2582 sds->this_load_per_task)
2583 imbn = 1;
2584 } else
2585 sds->this_load_per_task =
2586 cpu_avg_load_per_task(this_cpu);
2587
2588 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
2589 sds->busiest_load_per_task * imbn) {
2590 *imbalance = sds->busiest_load_per_task;
2591 return;
2592 }
2593
2594 /*
2595 * OK, we don't have enough imbalance to justify moving tasks,
2596 * however we may be able to increase total CPU power used by
2597 * moving them.
2598 */
2599
2600 pwr_now += sds->busiest->cpu_power *
2601 min(sds->busiest_load_per_task, sds->max_load);
2602 pwr_now += sds->this->cpu_power *
2603 min(sds->this_load_per_task, sds->this_load);
2604 pwr_now /= SCHED_LOAD_SCALE;
2605
2606 /* Amount of load we'd subtract */
2607 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2608 sds->busiest->cpu_power;
2609 if (sds->max_load > tmp)
2610 pwr_move += sds->busiest->cpu_power *
2611 min(sds->busiest_load_per_task, sds->max_load - tmp);
2612
2613 /* Amount of load we'd add */
2614 if (sds->max_load * sds->busiest->cpu_power <
2615 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
2616 tmp = (sds->max_load * sds->busiest->cpu_power) /
2617 sds->this->cpu_power;
2618 else
2619 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2620 sds->this->cpu_power;
2621 pwr_move += sds->this->cpu_power *
2622 min(sds->this_load_per_task, sds->this_load + tmp);
2623 pwr_move /= SCHED_LOAD_SCALE;
2624
2625 /* Move if we gain throughput */
2626 if (pwr_move > pwr_now)
2627 *imbalance = sds->busiest_load_per_task;
2628}
2629
2630/**
2631 * calculate_imbalance - Calculate the amount of imbalance present within the
2632 * groups of a given sched_domain during load balance.
2633 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
2634 * @this_cpu: Cpu for which currently load balance is being performed.
2635 * @imbalance: The variable to store the imbalance.
2636 */
2637static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2638 unsigned long *imbalance)
2639{
2640 unsigned long max_pull;
2641 /*
2642 * In the presence of smp nice balancing, certain scenarios can have
2643	 * max load less than avg load (as we skip groups loaded at or below
2644	 * their cpu_power while calculating max_load).
2645 */
2646 if (sds->max_load < sds->avg_load) {
2647 *imbalance = 0;
2648 return fix_small_imbalance(sds, this_cpu, imbalance);
2649 }
2650
2651 /* Don't want to pull so many tasks that a group would go idle */
2652 max_pull = min(sds->max_load - sds->avg_load,
2653 sds->max_load - sds->busiest_load_per_task);
2654
2655 /* How much load to actually move to equalise the imbalance */
2656 *imbalance = min(max_pull * sds->busiest->cpu_power,
2657 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
2658 / SCHED_LOAD_SCALE;
2659
2660 /*
2661	 * If *imbalance is less than the average load per runnable task,
2662	 * there is no guarantee that any tasks will be moved, so we
2663	 * consider bumping its value to force at least one task to be
2664	 * moved.
2665 */
2666 if (*imbalance < sds->busiest_load_per_task)
2667 return fix_small_imbalance(sds, this_cpu, imbalance);
2668
2669}
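
The imbalance formula itself is equally easy to check in isolation: max_pull caps the transfer so the busiest group is not drained below the domain average (or below one task's worth of load), and the final value is the smaller of what the busiest group can give and what this group can take. A minimal plain-C sketch with hypothetical statistics, again assuming SCHED_LOAD_SCALE is 1024:

/*
 * Standalone replay of the core of calculate_imbalance() with toy numbers.
 */
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* hypothetical sd_lb_stats values */
	unsigned long max_load = 3000, avg_load = 2000, this_load = 1000;
	unsigned long busiest_load_per_task = 600;
	unsigned long busiest_power = 1024, this_power = 1024;

	/* don't pull so many tasks that the busiest group would go idle */
	unsigned long max_pull = min_ul(max_load - avg_load,
					max_load - busiest_load_per_task);

	/* weighted load to move so both sides converge on avg_load */
	unsigned long imbalance = min_ul(max_pull * busiest_power,
					 (avg_load - this_load) * this_power)
					/ SCHED_LOAD_SCALE;

	printf("max_pull=%lu imbalance=%lu\n", max_pull, imbalance);	/* 1000, 1000 */
	return 0;
}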
2670/******* find_busiest_group() helpers end here *********************/
2671
2672/**
2673 * find_busiest_group - Returns the busiest group within the sched_domain
2674 * if there is an imbalance. If there isn't an imbalance, and
2675 * the user has opted for power-savings, it returns a group whose
2676 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
2677 * such a group exists.
2678 *
2679 * Also calculates the amount of weighted load which should be moved
2680 * to restore balance.
2681 *
2682 * @sd: The sched_domain whose busiest group is to be returned.
2683 * @this_cpu: The cpu for which load balancing is currently being performed.
2684 * @imbalance: Variable which stores amount of weighted load which should
2685 * be moved to restore balance/put a group to idle.
2686 * @idle: The idle status of this_cpu.
2687 * @sd_idle: The idleness of sd
2688 * @cpus: The set of CPUs under consideration for load-balancing.
2689 * @balance: Pointer to a variable indicating if this_cpu
2690 * is the appropriate cpu to perform load balancing at this_level.
2691 *			is the appropriate cpu to perform load balancing at this level.
2692 * Returns: - the busiest group if imbalance exists.
2693 * - If no imbalance and user has opted for power-savings balance,
2694 * return the least loaded group whose CPUs can be
2695 * put to idle by rebalancing its tasks onto our group.
2696 */
2697static struct sched_group *
2698find_busiest_group(struct sched_domain *sd, int this_cpu,
2699 unsigned long *imbalance, enum cpu_idle_type idle,
2700 int *sd_idle, const struct cpumask *cpus, int *balance)
2701{
2702 struct sd_lb_stats sds;
2703
2704 memset(&sds, 0, sizeof(sds));
2705
2706 /*
2707	 * Compute the various statistics relevant for load balancing at
2708 * this level.
2709 */
2710 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
2711 balance, &sds);
2712
2713 /* Cases where imbalance does not exist from POV of this_cpu */
2714 /* 1) this_cpu is not the appropriate cpu to perform load balancing
2715 * at this level.
2716 * 2) There is no busy sibling group to pull from.
2717 * 3) This group is the busiest group.
2718	 * 4) This group is busier than the average busyness at this
2719 * sched_domain.
2720 * 5) The imbalance is within the specified limit.
2721 * 6) Any rebalance would lead to ping-pong
2722 */
2723 if (!(*balance))
2724 goto ret;
2725
2726 if (!sds.busiest || sds.busiest_nr_running == 0)
2727 goto out_balanced;
2728
2729 if (sds.this_load >= sds.max_load)
2730 goto out_balanced;
2731
2732 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
2733
2734 if (sds.this_load >= sds.avg_load)
2735 goto out_balanced;
2736
2737 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
2738 goto out_balanced;
2739
2740 sds.busiest_load_per_task /= sds.busiest_nr_running;
2741 if (sds.group_imb)
2742 sds.busiest_load_per_task =
2743 min(sds.busiest_load_per_task, sds.avg_load);
2744
2745 /*
2746 * We're trying to get all the cpus to the average_load, so we don't
2747 * want to push ourselves above the average load, nor do we wish to
2748 * reduce the max loaded cpu below the average load, as either of these
2749 * actions would just result in more rebalancing later, and ping-pong
2750 * tasks around. Thus we look for the minimum possible imbalance.
2751 * Negative imbalances (*we* are more loaded than anyone else) will
2752 * be counted as no imbalance for these purposes -- we can't fix that
2753 * by pulling tasks to us. Be careful of negative numbers as they'll
2754 * appear as very large values with unsigned longs.
2755 */
2756 if (sds.max_load <= sds.busiest_load_per_task)
2757 goto out_balanced;
2758
2759 /* Looks like there is an imbalance. Compute it */
2760 calculate_imbalance(&sds, this_cpu, imbalance);
2761 return sds.busiest;
2762
2763out_balanced:
2764 /*
2765 * There is no obvious imbalance. But check if we can do some balancing
2766 * to save power.
2767 */
2768 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
2769 return sds.busiest;
2770ret:
2771 *imbalance = 0;
2772 return NULL;
2773}
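
One of the checks above that is easy to misread is the imbalance_pct cut-off: the domain is treated as balanced unless the busiest group's load exceeds imbalance_pct percent of this group's load. A short plain-C illustration, assuming a hypothetical imbalance_pct of 125 (a typical default for these domains):

/*
 * Illustration of the imbalance_pct cut-off in find_busiest_group().
 * The values are hypothetical; 125 means "busiest must exceed 125% of
 * our load before we bother balancing".
 */
#include <stdio.h>

int main(void)
{
	unsigned long imbalance_pct = 125;
	unsigned long this_load = 1000;
	unsigned long max_load;

	for (max_load = 1200; max_load <= 1300; max_load += 100)
		printf("max_load=%lu balanced=%d\n", max_load,
		       100 * max_load <= imbalance_pct * this_load);
	/* prints: max_load=1200 balanced=1, max_load=1300 balanced=0 */
	return 0;
}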
2774
2775/*
2776 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2777 */
2778static struct rq *
2779find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2780 unsigned long imbalance, const struct cpumask *cpus)
2781{
2782 struct rq *busiest = NULL, *rq;
2783 unsigned long max_load = 0;
2784 int i;
2785
2786 for_each_cpu(i, sched_group_cpus(group)) {
2787 unsigned long power = power_of(i);
2788 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
2789 unsigned long wl;
2790
2791 if (!cpumask_test_cpu(i, cpus))
2792 continue;
2793
2794 rq = cpu_rq(i);
2795 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
2796 wl /= power;
2797
2798 if (capacity && rq->nr_running == 1 && wl > imbalance)
2799 continue;
2800
2801 if (wl > max_load) {
2802 max_load = wl;
2803 busiest = rq;
2804 }
2805 }
2806
2807 return busiest;
2808}
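
Note that find_busiest_queue() compares power-scaled loads: each runqueue's raw weighted load is multiplied by SCHED_LOAD_SCALE and divided by that cpu's power, so a lower-power cpu (for example an SMT sibling) carrying the same raw load ranks as busier. A plain-C sketch with hypothetical power values:

/*
 * Power scaling used in find_busiest_queue(): identical raw load looks
 * heavier on a lower-power cpu. The power values are made up.
 */
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

int main(void)
{
	unsigned long raw_load = 2048;
	unsigned long full_power = 1024;	/* nominal full-power cpu */
	unsigned long half_power = 512;		/* e.g. an SMT sibling */

	printf("full-power cpu: wl=%lu\n", raw_load * SCHED_LOAD_SCALE / full_power);
	printf("half-power cpu: wl=%lu\n", raw_load * SCHED_LOAD_SCALE / half_power);
	/* 2048 vs 4096: the weaker cpu is picked as busiest */
	return 0;
}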
2809
2810/*
2811 * Max backoff if we encounter pinned tasks. The exact value is fairly
2812 * arbitrary; it just needs to be large enough.
2813 */
2814#define MAX_PINNED_INTERVAL 512
2815
2816/* Working cpumask for load_balance and load_balance_newidle. */
2817static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2818
2819static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
2820{
2821 if (idle == CPU_NEWLY_IDLE) {
2822 /*
2823		 * The only task running on a non-idle cpu can be moved to this
2824		 * cpu in an attempt to completely free up the other CPU
2825 * package.
2826 *
2827 * The package power saving logic comes from
2828		 * find_busiest_group(). If there is no imbalance, then
2829 * f_b_g() will return NULL. However when sched_mc={1,2} then
2830 * f_b_g() will select a group from which a running task may be
2831 * pulled to this cpu in order to make the other package idle.
2832 * If there is no opportunity to make a package idle and if
2833		 * there is no imbalance, then f_b_g() will return NULL and no
2834 * action will be taken in load_balance_newidle().
2835 *
2836 * Under normal task pull operation due to imbalance, there
2837 * will be more than one task in the source run queue and
2838 * move_tasks() will succeed. ld_moved will be true and this
2839 * active balance code will not be triggered.
2840 */
2841 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2842 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2843 return 0;
2844
2845 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
2846 return 0;
2847 }
2848
2849 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
2850}
2851
2852/*
2853 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2854 * tasks if there is an imbalance.
2855 */
2856static int load_balance(int this_cpu, struct rq *this_rq,
2857 struct sched_domain *sd, enum cpu_idle_type idle,
2858 int *balance)
2859{
2860 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2861 struct sched_group *group;
2862 unsigned long imbalance;
2863 struct rq *busiest;
2864 unsigned long flags;
2865 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
2866
2867 cpumask_copy(cpus, cpu_active_mask);
2868
2869 /*
2870	 * When power savings policy is enabled for the parent domain, an idle
2871	 * sibling can pick up load irrespective of busy siblings. In this case,
2872	 * let the state of the idle sibling percolate up as CPU_IDLE, instead of
2873	 * portraying it as CPU_NOT_IDLE.
2874 */
2875 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2876 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2877 sd_idle = 1;
2878
2879 schedstat_inc(sd, lb_count[idle]);
2880
2881redo:
2882 update_shares(sd);
2883 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2884 cpus, balance);
2885
2886 if (*balance == 0)
2887 goto out_balanced;
2888
2889 if (!group) {
2890 schedstat_inc(sd, lb_nobusyg[idle]);
2891 goto out_balanced;
2892 }
2893
2894 busiest = find_busiest_queue(group, idle, imbalance, cpus);
2895 if (!busiest) {
2896 schedstat_inc(sd, lb_nobusyq[idle]);
2897 goto out_balanced;
2898 }
2899
2900 BUG_ON(busiest == this_rq);
2901
2902 schedstat_add(sd, lb_imbalance[idle], imbalance);
2903
2904 ld_moved = 0;
2905 if (busiest->nr_running > 1) {
2906 /*
2907 * Attempt to move tasks. If find_busiest_group has found
2908 * an imbalance but busiest->nr_running <= 1, the group is
2909 * still unbalanced. ld_moved simply stays zero, so it is
2910 * correctly treated as an imbalance.
2911 */
2912 local_irq_save(flags);
2913 double_rq_lock(this_rq, busiest);
2914 ld_moved = move_tasks(this_rq, this_cpu, busiest,
2915 imbalance, sd, idle, &all_pinned);
2916 double_rq_unlock(this_rq, busiest);
2917 local_irq_restore(flags);
2918
2919 /*
2920 * some other cpu did the load balance for us.
2921 */
2922 if (ld_moved && this_cpu != smp_processor_id())
2923 resched_cpu(this_cpu);
2924
2925 /* All tasks on this runqueue were pinned by CPU affinity */
2926 if (unlikely(all_pinned)) {
2927 cpumask_clear_cpu(cpu_of(busiest), cpus);
2928 if (!cpumask_empty(cpus))
2929 goto redo;
2930 goto out_balanced;
2931 }
2932 }
2933
2934 if (!ld_moved) {
2935 schedstat_inc(sd, lb_failed[idle]);
2936 sd->nr_balance_failed++;
2937
2938 if (need_active_balance(sd, sd_idle, idle)) {
2939 raw_spin_lock_irqsave(&busiest->lock, flags);
2940
2941			/* don't kick the migration_thread if the curr
2942 * task on busiest cpu can't be moved to this_cpu
2943 */
2944 if (!cpumask_test_cpu(this_cpu,
2945 &busiest->curr->cpus_allowed)) {
2946 raw_spin_unlock_irqrestore(&busiest->lock,
2947 flags);
2948 all_pinned = 1;
2949 goto out_one_pinned;
2950 }
2951
2952 if (!busiest->active_balance) {
2953 busiest->active_balance = 1;
2954 busiest->push_cpu = this_cpu;
2955 active_balance = 1;
2956 }
2957 raw_spin_unlock_irqrestore(&busiest->lock, flags);
2958 if (active_balance)
2959 wake_up_process(busiest->migration_thread);
2960
2961 /*
2962 * We've kicked active balancing, reset the failure
2963 * counter.
2964 */
2965 sd->nr_balance_failed = sd->cache_nice_tries+1;
2966 }
2967 } else
2968 sd->nr_balance_failed = 0;
2969
2970 if (likely(!active_balance)) {
2971 /* We were unbalanced, so reset the balancing interval */
2972 sd->balance_interval = sd->min_interval;
2973 } else {
2974 /*
2975 * If we've begun active balancing, start to back off. This
2976 * case may not be covered by the all_pinned logic if there
2977 * is only 1 task on the busy runqueue (because we don't call
2978 * move_tasks).
2979 */
2980 if (sd->balance_interval < sd->max_interval)
2981 sd->balance_interval *= 2;
2982 }
2983
2984 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2985 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2986 ld_moved = -1;
2987
2988 goto out;
2989
2990out_balanced:
2991 schedstat_inc(sd, lb_balanced[idle]);
2992
2993 sd->nr_balance_failed = 0;
2994
2995out_one_pinned:
2996 /* tune up the balancing interval */
2997 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2998 (sd->balance_interval < sd->max_interval))
2999 sd->balance_interval *= 2;
3000
3001 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3002 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3003 ld_moved = -1;
3004 else
3005 ld_moved = 0;
3006out:
3007 if (ld_moved)
3008 update_shares(sd);
3009 return ld_moved;
3010}
3011
3012/*
3013 * idle_balance is called by schedule() if this_cpu is about to become
3014 * idle. Attempts to pull tasks from other CPUs.
3015 */
3016static void idle_balance(int this_cpu, struct rq *this_rq)
3017{
3018 struct sched_domain *sd;
3019 int pulled_task = 0;
3020 unsigned long next_balance = jiffies + HZ;
3021
3022 this_rq->idle_stamp = this_rq->clock;
3023
3024 if (this_rq->avg_idle < sysctl_sched_migration_cost)
3025 return;
3026
3027 /*
3028 * Drop the rq->lock, but keep IRQ/preempt disabled.
3029 */
3030 raw_spin_unlock(&this_rq->lock);
3031
3032 for_each_domain(this_cpu, sd) {
3033 unsigned long interval;
3034 int balance = 1;
3035
3036 if (!(sd->flags & SD_LOAD_BALANCE))
3037 continue;
3038
3039 if (sd->flags & SD_BALANCE_NEWIDLE) {
3040 /* If we've pulled tasks over stop searching: */
3041 pulled_task = load_balance(this_cpu, this_rq,
3042 sd, CPU_NEWLY_IDLE, &balance);
3043 }
3044
3045 interval = msecs_to_jiffies(sd->balance_interval);
3046 if (time_after(next_balance, sd->last_balance + interval))
3047 next_balance = sd->last_balance + interval;
3048 if (pulled_task) {
3049 this_rq->idle_stamp = 0;
3050 break;
3051 }
3052 }
3053
3054 raw_spin_lock(&this_rq->lock);
3055
3056 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
3057 /*
3058 * We are going idle. next_balance may be set based on
3059 * a busy processor. So reset next_balance.
3060 */
3061 this_rq->next_balance = next_balance;
3062 }
3063}
3064
3065/*
3066 * active_load_balance is run by migration threads. It pushes running tasks
3067 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
3068 * running on each physical CPU where possible, and avoids physical /
3069 * logical imbalances.
3070 *
3071 * Called with busiest_rq locked.
3072 */
3073static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3074{
3075 int target_cpu = busiest_rq->push_cpu;
3076 struct sched_domain *sd;
3077 struct rq *target_rq;
3078
3079 /* Is there any task to move? */
3080 if (busiest_rq->nr_running <= 1)
3081 return;
3082
3083 target_rq = cpu_rq(target_cpu);
3084
3085 /*
3086	 * This condition is "impossible"; if it occurs
3087 * we need to fix it. Originally reported by
3088 * Bjorn Helgaas on a 128-cpu setup.
3089 */
3090 BUG_ON(busiest_rq == target_rq);
3091
3092 /* move a task from busiest_rq to target_rq */
3093 double_lock_balance(busiest_rq, target_rq);
3094 update_rq_clock(busiest_rq);
3095 update_rq_clock(target_rq);
3096
3097 /* Search for an sd spanning us and the target CPU. */
3098 for_each_domain(target_cpu, sd) {
3099 if ((sd->flags & SD_LOAD_BALANCE) &&
3100 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
3101 break;
3102 }
3103
3104 if (likely(sd)) {
3105 schedstat_inc(sd, alb_count);
3106
3107 if (move_one_task(target_rq, target_cpu, busiest_rq,
3108 sd, CPU_IDLE))
3109 schedstat_inc(sd, alb_pushed);
3110 else
3111 schedstat_inc(sd, alb_failed);
3112 }
3113 double_unlock_balance(busiest_rq, target_rq);
3114}
3115
3116#ifdef CONFIG_NO_HZ
3117static struct {
3118 atomic_t load_balancer;
3119 cpumask_var_t cpu_mask;
3120 cpumask_var_t ilb_grp_nohz_mask;
3121} nohz ____cacheline_aligned = {
3122 .load_balancer = ATOMIC_INIT(-1),
3123};
3124
3125int get_nohz_load_balancer(void)
3126{
3127 return atomic_read(&nohz.load_balancer);
3128}
3129
3130#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3131/**
3132 * lowest_flag_domain - Return lowest sched_domain containing flag.
3133 * @cpu: The cpu whose lowest level of sched domain is to
3134 * be returned.
3135 * @flag: The flag to check for the lowest sched_domain
3136 * for the given cpu.
3137 *
3138 * Returns the lowest sched_domain of a cpu which contains the given flag.
3139 */
3140static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3141{
3142 struct sched_domain *sd;
3143
3144 for_each_domain(cpu, sd)
3145 if (sd && (sd->flags & flag))
3146 break;
3147
3148 return sd;
3149}
3150
3151/**
3152 * for_each_flag_domain - Iterates over sched_domains containing the flag.
3153 * @cpu: The cpu whose domains we're iterating over.
3154 * @sd: variable holding the value of the power_savings_sd
3155 * for cpu.
3156 * @flag: The flag to filter the sched_domains to be iterated.
3157 *
3158 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
3159 * set, starting from the lowest sched_domain to the highest.
3160 */
3161#define for_each_flag_domain(cpu, sd, flag) \
3162 for (sd = lowest_flag_domain(cpu, flag); \
3163 (sd && (sd->flags & flag)); sd = sd->parent)
3164
3165/**
3166 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
3167 * @ilb_group: group to be checked for semi-idleness
3168 *
3169 * Returns: 1 if the group is semi-idle. 0 otherwise.
3170 *
3171 * We define a sched_group to be semi-idle if it has at least one idle CPU
3172 * and at least one non-idle CPU. This helper function checks if the given
3173 * sched_group is semi-idle or not.
3174 */
3175static inline int is_semi_idle_group(struct sched_group *ilb_group)
3176{
3177 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
3178 sched_group_cpus(ilb_group));
3179
3180 /*
3181	 * A sched_group is semi-idle when it has at least one busy cpu
3182	 * and at least one idle cpu.
3183 */
3184 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
3185 return 0;
3186
3187 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
3188 return 0;
3189
3190 return 1;
3191}
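
The semi-idle test boils down to two mask comparisons: the intersection of the group with the nohz (idle) mask must be neither empty nor the whole group. A self-contained sketch using plain bitmasks in place of struct cpumask (an assumption made purely to keep the example standalone):

/* Semi-idle check sketched with plain bitmasks instead of struct cpumask. */
#include <stdio.h>

static int is_semi_idle(unsigned long nohz_cpus, unsigned long group_cpus)
{
	unsigned long idle_in_group = nohz_cpus & group_cpus;

	if (!idle_in_group)			/* no idle cpu in the group */
		return 0;
	if (idle_in_group == group_cpus)	/* every cpu in the group is idle */
		return 0;
	return 1;				/* at least one idle and one busy cpu */
}

int main(void)
{
	printf("%d\n", is_semi_idle(0x2, 0x3));	/* cpu1 idle, cpu0 busy -> 1 */
	printf("%d\n", is_semi_idle(0x3, 0x3));	/* whole group idle     -> 0 */
	printf("%d\n", is_semi_idle(0x0, 0x3));	/* whole group busy     -> 0 */
	return 0;
}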
3192/**
3193 * find_new_ilb - Finds the optimum idle load balancer for nomination.
3194 * @cpu: The cpu which is nominating a new idle_load_balancer.
3195 *
3196 * Returns: The id of the idle load balancer if it exists;
3197 * else, a value >= nr_cpu_ids.
3198 *
3199 * This algorithm picks the idle load balancer such that it belongs to a
3200 * semi-idle powersavings sched_domain. The idea is to avoid waking up
3201 * completely idle packages/cores just for the purpose of idle load balancing
3202 * when there are other idle cpus which are better suited for that job.
3203 */
3204static int find_new_ilb(int cpu)
3205{
3206 struct sched_domain *sd;
3207 struct sched_group *ilb_group;
3208
3209 /*
3210 * Have idle load balancer selection from semi-idle packages only
3211 * when power-aware load balancing is enabled
3212 */
3213 if (!(sched_smt_power_savings || sched_mc_power_savings))
3214 goto out_done;
3215
3216 /*
3217 * Optimize for the case when we have no idle CPUs or only one
3218 * idle CPU. Don't walk the sched_domain hierarchy in such cases
3219 */
3220 if (cpumask_weight(nohz.cpu_mask) < 2)
3221 goto out_done;
3222
3223 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3224 ilb_group = sd->groups;
3225
3226 do {
3227 if (is_semi_idle_group(ilb_group))
3228 return cpumask_first(nohz.ilb_grp_nohz_mask);
3229
3230 ilb_group = ilb_group->next;
3231
3232 } while (ilb_group != sd->groups);
3233 }
3234
3235out_done:
3236 return cpumask_first(nohz.cpu_mask);
3237}
3238#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3239static inline int find_new_ilb(int call_cpu)
3240{
3241 return cpumask_first(nohz.cpu_mask);
3242}
3243#endif
3244
3245/*
3246 * This routine tries to nominate the ilb (idle load balancing)
3247 * owner among the cpus whose ticks are stopped. The ilb owner does the idle
3248 * load balancing on behalf of all those cpus. If all the cpus in the system
3249 * go into this tickless mode, then there will be no ilb owner (as there is
3250 * no need for one) and all the cpus will sleep until the next wakeup event
3251 * arrives...
3252 *
3253 * For the ilb owner, the tick is not stopped and is used
3254 * for idle load balancing. The ilb owner remains part of
3255 * nohz.cpu_mask.
3256 *
3257 * While stopping the tick, this cpu becomes the ilb owner if there
3258 * is no other owner, and remains the owner until that cpu becomes busy
3259 * or until all cpus in the system stop their ticks, at which point
3260 * there is no need for an ilb owner.
3261 *
3262 * When the ilb owner becomes busy, it nominates another owner during the
3263 * next busy scheduler_tick().
3264 */
3265int select_nohz_load_balancer(int stop_tick)
3266{
3267 int cpu = smp_processor_id();
3268
3269 if (stop_tick) {
3270 cpu_rq(cpu)->in_nohz_recently = 1;
3271
3272 if (!cpu_active(cpu)) {
3273 if (atomic_read(&nohz.load_balancer) != cpu)
3274 return 0;
3275
3276 /*
3277 * If we are going offline and still the leader,
3278 * give up!
3279 */
3280 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3281 BUG();
3282
3283 return 0;
3284 }
3285
3286 cpumask_set_cpu(cpu, nohz.cpu_mask);
3287
3288 /* time for ilb owner also to sleep */
3289 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
3290 if (atomic_read(&nohz.load_balancer) == cpu)
3291 atomic_set(&nohz.load_balancer, -1);
3292 return 0;
3293 }
3294
3295 if (atomic_read(&nohz.load_balancer) == -1) {
3296 /* make me the ilb owner */
3297 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3298 return 1;
3299 } else if (atomic_read(&nohz.load_balancer) == cpu) {
3300 int new_ilb;
3301
3302 if (!(sched_smt_power_savings ||
3303 sched_mc_power_savings))
3304 return 1;
3305 /*
3306 * Check to see if there is a more power-efficient
3307 * ilb.
3308 */
3309 new_ilb = find_new_ilb(cpu);
3310 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3311 atomic_set(&nohz.load_balancer, -1);
3312 resched_cpu(new_ilb);
3313 return 0;
3314 }
3315 return 1;
3316 }
3317 } else {
3318 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
3319 return 0;
3320
3321 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3322
3323 if (atomic_read(&nohz.load_balancer) == cpu)
3324 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3325 BUG();
3326 }
3327 return 0;
3328}
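
Ownership of the idle load balancer is claimed and released with a single compare-and-swap on nohz.load_balancer, where -1 means "no owner". The pattern can be sketched with C11 atomics; the kernel's atomic_cmpxchg() returns the previous value rather than a boolean, but the claim/release logic is the same. Everything below is a standalone illustration, not kernel code.

/* ilb-owner election pattern, sketched with C11 atomics; -1 = no owner. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int load_balancer = ATOMIC_VAR_INIT(-1);

static int try_claim_ilb(int cpu)
{
	int expected = -1;

	/* succeeds only if nobody owns it yet */
	return atomic_compare_exchange_strong(&load_balancer, &expected, cpu);
}

static void release_ilb(int cpu)
{
	int expected = cpu;

	/* only the current owner may put the slot back to -1 */
	atomic_compare_exchange_strong(&load_balancer, &expected, -1);
}

int main(void)
{
	printf("cpu0 claims: %d\n", try_claim_ilb(0));	/* 1 */
	printf("cpu1 claims: %d\n", try_claim_ilb(1));	/* 0: cpu0 already owns it */
	release_ilb(0);
	printf("cpu1 claims: %d\n", try_claim_ilb(1));	/* 1 */
	return 0;
}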
3329#endif
3330
3331static DEFINE_SPINLOCK(balancing);
3332
3333/*
3334 * It checks each scheduling domain to see if it is due to be balanced,
3335 * and initiates a balancing operation if so.
3336 *
3337 * Balancing parameters are set up in arch_init_sched_domains.
3338 */
3339static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3340{
3341 int balance = 1;
3342 struct rq *rq = cpu_rq(cpu);
3343 unsigned long interval;
3344 struct sched_domain *sd;
3345 /* Earliest time when we have to do rebalance again */
3346 unsigned long next_balance = jiffies + 60*HZ;
3347 int update_next_balance = 0;
3348 int need_serialize;
3349
3350 for_each_domain(cpu, sd) {
3351 if (!(sd->flags & SD_LOAD_BALANCE))
3352 continue;
3353
3354 interval = sd->balance_interval;
3355 if (idle != CPU_IDLE)
3356 interval *= sd->busy_factor;
3357
3358 /* scale ms to jiffies */
3359 interval = msecs_to_jiffies(interval);
3360 if (unlikely(!interval))
3361 interval = 1;
3362 if (interval > HZ*NR_CPUS/10)
3363 interval = HZ*NR_CPUS/10;
3364
3365 need_serialize = sd->flags & SD_SERIALIZE;
3366
3367 if (need_serialize) {
3368 if (!spin_trylock(&balancing))
3369 goto out;
3370 }
3371
3372 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3373 if (load_balance(cpu, rq, sd, idle, &balance)) {
3374 /*
3375 * We've pulled tasks over so either we're no
3376 * longer idle, or one of our SMT siblings is
3377 * not idle.
3378 */
3379 idle = CPU_NOT_IDLE;
3380 }
3381 sd->last_balance = jiffies;
3382 }
3383 if (need_serialize)
3384 spin_unlock(&balancing);
3385out:
3386 if (time_after(next_balance, sd->last_balance + interval)) {
3387 next_balance = sd->last_balance + interval;
3388 update_next_balance = 1;
3389 }
3390
3391 /*
3392 * Stop the load balance at this level. There is another
3393 * CPU in our sched group which is doing load balancing more
3394 * actively.
3395 */
3396 if (!balance)
3397 break;
3398 }
3399
3400 /*
3401 * next_balance will be updated only when there is a need.
3402	 * When the cpu is attached to a null domain, for example, it will not be
3403 * updated.
3404 */
3405 if (likely(update_next_balance))
3406 rq->next_balance = next_balance;
3407}
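
The per-domain balance interval above is kept in milliseconds, stretched by busy_factor when the cpu is not idle, converted to jiffies and clamped. A standalone sketch of that computation with hypothetical values (balance_interval, busy_factor and NR_CPUS are made up, and msecs_to_jiffies() is simplified by assuming HZ == 1000):

/* Balance-interval computation from rebalance_domains(), with toy values. */
#include <stdio.h>

#define HZ	1000UL
#define NR_CPUS	8UL

static unsigned long msecs_to_jiffies(unsigned long ms)
{
	return ms * HZ / 1000;		/* exact only because HZ == 1000 here */
}

int main(void)
{
	unsigned long balance_interval = 64;	/* ms, hypothetical sd value */
	unsigned long busy_factor = 32;		/* hypothetical sd->busy_factor */
	int cpu_is_idle = 0;
	unsigned long interval = balance_interval;

	if (!cpu_is_idle)
		interval *= busy_factor;	/* balance less often when busy */

	interval = msecs_to_jiffies(interval);
	if (!interval)
		interval = 1;
	if (interval > HZ * NR_CPUS / 10)	/* same clamp as above */
		interval = HZ * NR_CPUS / 10;

	printf("interval = %lu jiffies\n", interval);	/* clamped to 800 */
	return 0;
}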
3408
3409/*
3410 * run_rebalance_domains is triggered when needed from the scheduler tick.
3411 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3412 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3413 */
3414static void run_rebalance_domains(struct softirq_action *h)
3415{
3416 int this_cpu = smp_processor_id();
3417 struct rq *this_rq = cpu_rq(this_cpu);
3418 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3419 CPU_IDLE : CPU_NOT_IDLE;
3420
3421 rebalance_domains(this_cpu, idle);
3422
3423#ifdef CONFIG_NO_HZ
3424 /*
3425 * If this cpu is the owner for idle load balancing, then do the
3426 * balancing on behalf of the other idle cpus whose ticks are
3427 * stopped.
3428 */
3429 if (this_rq->idle_at_tick &&
3430 atomic_read(&nohz.load_balancer) == this_cpu) {
3431 struct rq *rq;
3432 int balance_cpu;
3433
3434 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3435 if (balance_cpu == this_cpu)
3436 continue;
3437
3438 /*
3439 * If this cpu gets work to do, stop the load balancing
3440			 * work being done for other cpus. The next load
3441 * balancing owner will pick it up.
3442 */
3443 if (need_resched())
3444 break;
3445
3446 rebalance_domains(balance_cpu, CPU_IDLE);
3447
3448 rq = cpu_rq(balance_cpu);
3449 if (time_after(this_rq->next_balance, rq->next_balance))
3450 this_rq->next_balance = rq->next_balance;
3451 }
3452 }
3453#endif
3454}
3455
3456static inline int on_null_domain(int cpu)
3457{
3458 return !rcu_dereference(cpu_rq(cpu)->sd);
3459}
3460
3461/*
3462 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3463 *
3464 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3465 * idle load balancing owner or decide to stop the periodic load balancing,
3466 * if the whole system is idle.
3467 */
3468static inline void trigger_load_balance(struct rq *rq, int cpu)
3469{
3470#ifdef CONFIG_NO_HZ
3471 /*
3472 * If we were in the nohz mode recently and busy at the current
3473 * scheduler tick, then check if we need to nominate new idle
3474	 * scheduler tick, then check if we need to nominate a new idle
3475 */
3476 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3477 rq->in_nohz_recently = 0;
3478
3479 if (atomic_read(&nohz.load_balancer) == cpu) {
3480 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3481 atomic_set(&nohz.load_balancer, -1);
3482 }
3483
3484 if (atomic_read(&nohz.load_balancer) == -1) {
3485 int ilb = find_new_ilb(cpu);
3486
3487 if (ilb < nr_cpu_ids)
3488 resched_cpu(ilb);
3489 }
3490 }
3491
3492 /*
3493 * If this cpu is idle and doing idle load balancing for all the
3494 * cpus with ticks stopped, is it time for that to stop?
3495 */
3496 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3497 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3498 resched_cpu(cpu);
3499 return;
3500 }
3501
3502 /*
3503 * If this cpu is idle and the idle load balancing is done by
3504	 * someone else, then there is no need to raise the SCHED_SOFTIRQ
3505 */
3506 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3507 cpumask_test_cpu(cpu, nohz.cpu_mask))
3508 return;
3509#endif
3510 /* Don't need to rebalance while attached to NULL domain */
3511 if (time_after_eq(jiffies, rq->next_balance) &&
3512 likely(!on_null_domain(cpu)))
3513 raise_softirq(SCHED_SOFTIRQ);
3514}
1954 3515
1955static void rq_online_fair(struct rq *rq) 3516static void rq_online_fair(struct rq *rq)
1956{ 3517{
@@ -1962,6 +3523,15 @@ static void rq_offline_fair(struct rq *rq)
1962 update_sysctl(); 3523 update_sysctl();
1963} 3524}
1964 3525
3526#else /* CONFIG_SMP */
3527
3528/*
3529 * on UP we do not need to balance between CPUs:
3530 */
3531static inline void idle_balance(int cpu, struct rq *rq)
3532{
3533}
3534
1965#endif /* CONFIG_SMP */ 3535#endif /* CONFIG_SMP */
1966 3536
1967/* 3537/*
@@ -2076,7 +3646,7 @@ static void moved_group_fair(struct task_struct *p, int on_rq)
2076} 3646}
2077#endif 3647#endif
2078 3648
2079unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 3649static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
2080{ 3650{
2081 struct sched_entity *se = &task->se; 3651 struct sched_entity *se = &task->se;
2082 unsigned int rr_interval = 0; 3652 unsigned int rr_interval = 0;
@@ -2108,8 +3678,6 @@ static const struct sched_class fair_sched_class = {
2108#ifdef CONFIG_SMP 3678#ifdef CONFIG_SMP
2109 .select_task_rq = select_task_rq_fair, 3679 .select_task_rq = select_task_rq_fair,
2110 3680
2111 .load_balance = load_balance_fair,
2112 .move_one_task = move_one_task_fair,
2113 .rq_online = rq_online_fair, 3681 .rq_online = rq_online_fair,
2114 .rq_offline = rq_offline_fair, 3682 .rq_offline = rq_offline_fair,
2115 3683
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 5f93b570d383..a8a6d8a50947 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -44,24 +44,6 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
44{ 44{
45} 45}
46 46
47#ifdef CONFIG_SMP
48static unsigned long
49load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
50 unsigned long max_load_move,
51 struct sched_domain *sd, enum cpu_idle_type idle,
52 int *all_pinned, int *this_best_prio)
53{
54 return 0;
55}
56
57static int
58move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
59 struct sched_domain *sd, enum cpu_idle_type idle)
60{
61 return 0;
62}
63#endif
64
65static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 47static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
66{ 48{
67} 49}
@@ -97,7 +79,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 79 check_preempt_curr(rq, p, 0);
98} 80}
99 81
100unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) 82static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
101{ 83{
102 return 0; 84 return 0;
103} 85}
@@ -119,9 +101,6 @@ static const struct sched_class idle_sched_class = {
119 101
120#ifdef CONFIG_SMP 102#ifdef CONFIG_SMP
121 .select_task_rq = select_task_rq_idle, 103 .select_task_rq = select_task_rq_idle,
122
123 .load_balance = load_balance_idle,
124 .move_one_task = move_one_task_idle,
125#endif 104#endif
126 105
127 .set_curr_task = set_curr_task_idle, 106 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f48328ac216f..bf3e38fdbe6d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -194,17 +194,20 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
194 return rt_se->my_q; 194 return rt_se->my_q;
195} 195}
196 196
197static void enqueue_rt_entity(struct sched_rt_entity *rt_se); 197static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
198static void dequeue_rt_entity(struct sched_rt_entity *rt_se); 198static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
199 199
200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
201{ 201{
202 int this_cpu = smp_processor_id();
202 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 203 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
203 struct sched_rt_entity *rt_se = rt_rq->rt_se; 204 struct sched_rt_entity *rt_se;
205
206 rt_se = rt_rq->tg->rt_se[this_cpu];
204 207
205 if (rt_rq->rt_nr_running) { 208 if (rt_rq->rt_nr_running) {
206 if (rt_se && !on_rt_rq(rt_se)) 209 if (rt_se && !on_rt_rq(rt_se))
207 enqueue_rt_entity(rt_se); 210 enqueue_rt_entity(rt_se, false);
208 if (rt_rq->highest_prio.curr < curr->prio) 211 if (rt_rq->highest_prio.curr < curr->prio)
209 resched_task(curr); 212 resched_task(curr);
210 } 213 }
@@ -212,7 +215,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
212 215
213static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 216static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
214{ 217{
215 struct sched_rt_entity *rt_se = rt_rq->rt_se; 218 int this_cpu = smp_processor_id();
219 struct sched_rt_entity *rt_se;
220
221 rt_se = rt_rq->tg->rt_se[this_cpu];
216 222
217 if (rt_se && on_rt_rq(rt_se)) 223 if (rt_se && on_rt_rq(rt_se))
218 dequeue_rt_entity(rt_se); 224 dequeue_rt_entity(rt_se);
@@ -803,7 +809,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
803 dec_rt_group(rt_se, rt_rq); 809 dec_rt_group(rt_se, rt_rq);
804} 810}
805 811
806static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) 812static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
807{ 813{
808 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 814 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
809 struct rt_prio_array *array = &rt_rq->active; 815 struct rt_prio_array *array = &rt_rq->active;
@@ -819,7 +825,10 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
819 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 825 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
820 return; 826 return;
821 827
822 list_add_tail(&rt_se->run_list, queue); 828 if (head)
829 list_add(&rt_se->run_list, queue);
830 else
831 list_add_tail(&rt_se->run_list, queue);
823 __set_bit(rt_se_prio(rt_se), array->bitmap); 832 __set_bit(rt_se_prio(rt_se), array->bitmap);
824 833
825 inc_rt_tasks(rt_se, rt_rq); 834 inc_rt_tasks(rt_se, rt_rq);
@@ -856,11 +865,11 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
856 } 865 }
857} 866}
858 867
859static void enqueue_rt_entity(struct sched_rt_entity *rt_se) 868static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
860{ 869{
861 dequeue_rt_stack(rt_se); 870 dequeue_rt_stack(rt_se);
862 for_each_sched_rt_entity(rt_se) 871 for_each_sched_rt_entity(rt_se)
863 __enqueue_rt_entity(rt_se); 872 __enqueue_rt_entity(rt_se, head);
864} 873}
865 874
866static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 875static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
@@ -871,21 +880,22 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
871 struct rt_rq *rt_rq = group_rt_rq(rt_se); 880 struct rt_rq *rt_rq = group_rt_rq(rt_se);
872 881
873 if (rt_rq && rt_rq->rt_nr_running) 882 if (rt_rq && rt_rq->rt_nr_running)
874 __enqueue_rt_entity(rt_se); 883 __enqueue_rt_entity(rt_se, false);
875 } 884 }
876} 885}
877 886
878/* 887/*
879 * Adding/removing a task to/from a priority array: 888 * Adding/removing a task to/from a priority array:
880 */ 889 */
881static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) 890static void
891enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head)
882{ 892{
883 struct sched_rt_entity *rt_se = &p->rt; 893 struct sched_rt_entity *rt_se = &p->rt;
884 894
885 if (wakeup) 895 if (wakeup)
886 rt_se->timeout = 0; 896 rt_se->timeout = 0;
887 897
888 enqueue_rt_entity(rt_se); 898 enqueue_rt_entity(rt_se, head);
889 899
890 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
891 enqueue_pushable_task(rq, p); 901 enqueue_pushable_task(rq, p);
@@ -1481,24 +1491,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1481 push_rt_tasks(rq); 1491 push_rt_tasks(rq);
1482} 1492}
1483 1493
1484static unsigned long
1485load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1486 unsigned long max_load_move,
1487 struct sched_domain *sd, enum cpu_idle_type idle,
1488 int *all_pinned, int *this_best_prio)
1489{
1490 /* don't touch RT tasks */
1491 return 0;
1492}
1493
1494static int
1495move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1496 struct sched_domain *sd, enum cpu_idle_type idle)
1497{
1498 /* don't touch RT tasks */
1499 return 0;
1500}
1501
1502static void set_cpus_allowed_rt(struct task_struct *p, 1494static void set_cpus_allowed_rt(struct task_struct *p,
1503 const struct cpumask *new_mask) 1495 const struct cpumask *new_mask)
1504{ 1496{
@@ -1721,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq)
1721 dequeue_pushable_task(rq, p); 1713 dequeue_pushable_task(rq, p);
1722} 1714}
1723 1715
1724unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) 1716static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1725{ 1717{
1726 /* 1718 /*
1727 * Time slice is 0 for SCHED_FIFO tasks 1719 * Time slice is 0 for SCHED_FIFO tasks
@@ -1746,8 +1738,6 @@ static const struct sched_class rt_sched_class = {
1746#ifdef CONFIG_SMP 1738#ifdef CONFIG_SMP
1747 .select_task_rq = select_task_rq_rt, 1739 .select_task_rq = select_task_rq_rt,
1748 1740
1749 .load_balance = load_balance_rt,
1750 .move_one_task = move_one_task_rt,
1751 .set_cpus_allowed = set_cpus_allowed_rt, 1741 .set_cpus_allowed = set_cpus_allowed_rt,
1752 .rq_online = rq_online_rt, 1742 .rq_online = rq_online_rt,
1753 .rq_offline = rq_offline_rt, 1743 .rq_offline = rq_offline_rt,
diff --git a/kernel/sys.c b/kernel/sys.c
index 26a6b73a6b85..f75bf0936f47 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -569,11 +569,6 @@ static int set_user(struct cred *new)
569 if (!new_user) 569 if (!new_user)
570 return -EAGAIN; 570 return -EAGAIN;
571 571
572 if (!task_can_switch_user(new_user, current)) {
573 free_uid(new_user);
574 return -EINVAL;
575 }
576
577 if (atomic_read(&new_user->processes) >= 572 if (atomic_read(&new_user->processes) >=
578 current->signal->rlim[RLIMIT_NPROC].rlim_cur && 573 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
579 new_user != INIT_USER) { 574 new_user != INIT_USER) {
diff --git a/kernel/user.c b/kernel/user.c
index 46d0165ca70c..766467b3bcb7 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -56,9 +56,6 @@ struct user_struct root_user = {
56 .sigpending = ATOMIC_INIT(0), 56 .sigpending = ATOMIC_INIT(0),
57 .locked_shm = 0, 57 .locked_shm = 0,
58 .user_ns = &init_user_ns, 58 .user_ns = &init_user_ns,
59#ifdef CONFIG_USER_SCHED
60 .tg = &init_task_group,
61#endif
62}; 59};
63 60
64/* 61/*
@@ -75,268 +72,6 @@ static void uid_hash_remove(struct user_struct *up)
75 put_user_ns(up->user_ns); 72 put_user_ns(up->user_ns);
76} 73}
77 74
78#ifdef CONFIG_USER_SCHED
79
80static void sched_destroy_user(struct user_struct *up)
81{
82 sched_destroy_group(up->tg);
83}
84
85static int sched_create_user(struct user_struct *up)
86{
87 int rc = 0;
88
89 up->tg = sched_create_group(&root_task_group);
90 if (IS_ERR(up->tg))
91 rc = -ENOMEM;
92
93 set_tg_uid(up);
94
95 return rc;
96}
97
98#else /* CONFIG_USER_SCHED */
99
100static void sched_destroy_user(struct user_struct *up) { }
101static int sched_create_user(struct user_struct *up) { return 0; }
102
103#endif /* CONFIG_USER_SCHED */
104
105#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
106
107static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
108{
109 struct user_struct *user;
110 struct hlist_node *h;
111
112 hlist_for_each_entry(user, h, hashent, uidhash_node) {
113 if (user->uid == uid) {
114 /* possibly resurrect an "almost deleted" object */
115 if (atomic_inc_return(&user->__count) == 1)
116 cancel_delayed_work(&user->work);
117 return user;
118 }
119 }
120
121 return NULL;
122}
123
124static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
125static DEFINE_MUTEX(uids_mutex);
126
127static inline void uids_mutex_lock(void)
128{
129 mutex_lock(&uids_mutex);
130}
131
132static inline void uids_mutex_unlock(void)
133{
134 mutex_unlock(&uids_mutex);
135}
136
137/* uid directory attributes */
138#ifdef CONFIG_FAIR_GROUP_SCHED
139static ssize_t cpu_shares_show(struct kobject *kobj,
140 struct kobj_attribute *attr,
141 char *buf)
142{
143 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
144
145 return sprintf(buf, "%lu\n", sched_group_shares(up->tg));
146}
147
148static ssize_t cpu_shares_store(struct kobject *kobj,
149 struct kobj_attribute *attr,
150 const char *buf, size_t size)
151{
152 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
153 unsigned long shares;
154 int rc;
155
156 sscanf(buf, "%lu", &shares);
157
158 rc = sched_group_set_shares(up->tg, shares);
159
160 return (rc ? rc : size);
161}
162
163static struct kobj_attribute cpu_share_attr =
164 __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
165#endif
166
167#ifdef CONFIG_RT_GROUP_SCHED
168static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
169 struct kobj_attribute *attr,
170 char *buf)
171{
172 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
173
174 return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
175}
176
177static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
178 struct kobj_attribute *attr,
179 const char *buf, size_t size)
180{
181 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
182 unsigned long rt_runtime;
183 int rc;
184
185 sscanf(buf, "%ld", &rt_runtime);
186
187 rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
188
189 return (rc ? rc : size);
190}
191
192static struct kobj_attribute cpu_rt_runtime_attr =
193 __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
194
195static ssize_t cpu_rt_period_show(struct kobject *kobj,
196 struct kobj_attribute *attr,
197 char *buf)
198{
199 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
200
201 return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg));
202}
203
204static ssize_t cpu_rt_period_store(struct kobject *kobj,
205 struct kobj_attribute *attr,
206 const char *buf, size_t size)
207{
208 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
209 unsigned long rt_period;
210 int rc;
211
212 sscanf(buf, "%lu", &rt_period);
213
214 rc = sched_group_set_rt_period(up->tg, rt_period);
215
216 return (rc ? rc : size);
217}
218
219static struct kobj_attribute cpu_rt_period_attr =
220 __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store);
221#endif
222
223/* default attributes per uid directory */
224static struct attribute *uids_attributes[] = {
225#ifdef CONFIG_FAIR_GROUP_SCHED
226 &cpu_share_attr.attr,
227#endif
228#ifdef CONFIG_RT_GROUP_SCHED
229 &cpu_rt_runtime_attr.attr,
230 &cpu_rt_period_attr.attr,
231#endif
232 NULL
233};
234
235/* the lifetime of user_struct is not managed by the core (now) */
236static void uids_release(struct kobject *kobj)
237{
238 return;
239}
240
241static struct kobj_type uids_ktype = {
242 .sysfs_ops = &kobj_sysfs_ops,
243 .default_attrs = uids_attributes,
244 .release = uids_release,
245};
246
247/*
248 * Create /sys/kernel/uids/<uid>/cpu_share file for this user
249 * We do not create this file for users in a user namespace (until
250 * sysfs tagging is implemented).
251 *
252 * See Documentation/scheduler/sched-design-CFS.txt for ramifications.
253 */
254static int uids_user_create(struct user_struct *up)
255{
256 struct kobject *kobj = &up->kobj;
257 int error;
258
259 memset(kobj, 0, sizeof(struct kobject));
260 if (up->user_ns != &init_user_ns)
261 return 0;
262 kobj->kset = uids_kset;
263 error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
264 if (error) {
265 kobject_put(kobj);
266 goto done;
267 }
268
269 kobject_uevent(kobj, KOBJ_ADD);
270done:
271 return error;
272}
273
274/* create these entries in sysfs:
275 * "/sys/kernel/uids" directory
276 * "/sys/kernel/uids/0" directory (for root user)
277 * "/sys/kernel/uids/0/cpu_share" file (for root user)
278 */
279int __init uids_sysfs_init(void)
280{
281 uids_kset = kset_create_and_add("uids", NULL, kernel_kobj);
282 if (!uids_kset)
283 return -ENOMEM;
284
285 return uids_user_create(&root_user);
286}
287
288/* delayed work function to remove sysfs directory for a user and free up
289 * corresponding structures.
290 */
291static void cleanup_user_struct(struct work_struct *w)
292{
293 struct user_struct *up = container_of(w, struct user_struct, work.work);
294 unsigned long flags;
295 int remove_user = 0;
296
297 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
298 * atomic.
299 */
300 uids_mutex_lock();
301
302 spin_lock_irqsave(&uidhash_lock, flags);
303 if (atomic_read(&up->__count) == 0) {
304 uid_hash_remove(up);
305 remove_user = 1;
306 }
307 spin_unlock_irqrestore(&uidhash_lock, flags);
308
309 if (!remove_user)
310 goto done;
311
312 if (up->user_ns == &init_user_ns) {
313 kobject_uevent(&up->kobj, KOBJ_REMOVE);
314 kobject_del(&up->kobj);
315 kobject_put(&up->kobj);
316 }
317
318 sched_destroy_user(up);
319 key_put(up->uid_keyring);
320 key_put(up->session_keyring);
321 kmem_cache_free(uid_cachep, up);
322
323done:
324 uids_mutex_unlock();
325}
326
327/* IRQs are disabled and uidhash_lock is held upon function entry.
328 * IRQ state (as stored in flags) is restored and uidhash_lock released
329 * upon function exit.
330 */
331static void free_user(struct user_struct *up, unsigned long flags)
332{
333 INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
334 schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
335 spin_unlock_irqrestore(&uidhash_lock, flags);
336}
337
338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
339
340static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) 75static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
341{ 76{
342 struct user_struct *user; 77 struct user_struct *user;
@@ -352,11 +87,6 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
352 return NULL; 87 return NULL;
353} 88}
354 89
355int uids_sysfs_init(void) { return 0; }
356static inline int uids_user_create(struct user_struct *up) { return 0; }
357static inline void uids_mutex_lock(void) { }
358static inline void uids_mutex_unlock(void) { }
359
360/* IRQs are disabled and uidhash_lock is held upon function entry. 90/* IRQs are disabled and uidhash_lock is held upon function entry.
361 * IRQ state (as stored in flags) is restored and uidhash_lock released 91 * IRQ state (as stored in flags) is restored and uidhash_lock released
362 * upon function exit. 92 * upon function exit.
@@ -365,32 +95,11 @@ static void free_user(struct user_struct *up, unsigned long flags)
365{ 95{
366 uid_hash_remove(up); 96 uid_hash_remove(up);
367 spin_unlock_irqrestore(&uidhash_lock, flags); 97 spin_unlock_irqrestore(&uidhash_lock, flags);
368 sched_destroy_user(up);
369 key_put(up->uid_keyring); 98 key_put(up->uid_keyring);
370 key_put(up->session_keyring); 99 key_put(up->session_keyring);
371 kmem_cache_free(uid_cachep, up); 100 kmem_cache_free(uid_cachep, up);
372} 101}
373 102
374#endif
375
376#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
377/*
378 * We need to check if a setuid can take place. This function should be called
379 * before successfully completing the setuid.
380 */
381int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
382{
383
384 return sched_rt_can_attach(up->tg, tsk);
385
386}
387#else
388int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
389{
390 return 1;
391}
392#endif
393
394/* 103/*
395 * Locate the user_struct for the passed UID. If found, take a ref on it. The 104 * Locate the user_struct for the passed UID. If found, take a ref on it. The
396 * caller must undo that ref with free_uid(). 105 * caller must undo that ref with free_uid().
@@ -431,8 +140,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
431 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() 140 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
432 * atomic. 141 * atomic.
433 */ 142 */
434 uids_mutex_lock();
435
436 spin_lock_irq(&uidhash_lock); 143 spin_lock_irq(&uidhash_lock);
437 up = uid_hash_find(uid, hashent); 144 up = uid_hash_find(uid, hashent);
438 spin_unlock_irq(&uidhash_lock); 145 spin_unlock_irq(&uidhash_lock);
@@ -445,14 +152,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
445 new->uid = uid; 152 new->uid = uid;
446 atomic_set(&new->__count, 1); 153 atomic_set(&new->__count, 1);
447 154
448 if (sched_create_user(new) < 0)
449 goto out_free_user;
450
451 new->user_ns = get_user_ns(ns); 155 new->user_ns = get_user_ns(ns);
452 156
453 if (uids_user_create(new))
454 goto out_destoy_sched;
455
456 /* 157 /*
457 * Before adding this, check whether we raced 158 * Before adding this, check whether we raced
458 * on adding the same user already.. 159 * on adding the same user already..
@@ -475,17 +176,11 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
475 spin_unlock_irq(&uidhash_lock); 176 spin_unlock_irq(&uidhash_lock);
476 } 177 }
477 178
478 uids_mutex_unlock();
479
480 return up; 179 return up;
481 180
482out_destoy_sched:
483 sched_destroy_user(new);
484 put_user_ns(new->user_ns); 181 put_user_ns(new->user_ns);
485out_free_user:
486 kmem_cache_free(uid_cachep, new); 182 kmem_cache_free(uid_cachep, new);
487out_unlock: 183out_unlock:
488 uids_mutex_unlock();
489 return NULL; 184 return NULL;
490} 185}
491 186