path: root/kernel/sched.c
author    Dmitry Torokhov <dmitry.torokhov@gmail.com>    2008-07-21 00:55:14 -0400
committer Dmitry Torokhov <dmitry.torokhov@gmail.com>    2008-07-21 00:55:14 -0400
commit    908cf4b925e419bc74f3297b2f0e51d6f8a81da2 (patch)
tree      6c2da79366d4695a9c2560ab18259eca8a2a25b4 /kernel/sched.c
parent    92c49890922d54cba4b1eadeb0b185773c2c9570 (diff)
parent    14b395e35d1afdd8019d11b92e28041fad591b71 (diff)
Merge master.kernel.org:/pub/scm/linux/kernel/git/torvalds/linux-2.6 into next
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  972
1 file changed, 489 insertions(+), 483 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index cfa222a91539..99e6d850ecab 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -70,10 +70,13 @@
70#include <linux/bootmem.h> 70#include <linux/bootmem.h>
71#include <linux/debugfs.h> 71#include <linux/debugfs.h>
72#include <linux/ctype.h> 72#include <linux/ctype.h>
73#include <linux/ftrace.h>
73 74
74#include <asm/tlb.h> 75#include <asm/tlb.h>
75#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
76 77
78#include "sched_cpupri.h"
79
77/* 80/*
78 * Convert user-nice values [ -20 ... 0 ... 19 ] 81 * Convert user-nice values [ -20 ... 0 ... 19 ]
79 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 82 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -136,7 +139,7 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
136 139
137static inline int rt_policy(int policy) 140static inline int rt_policy(int policy)
138{ 141{
139 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) 142 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
140 return 1; 143 return 1;
141 return 0; 144 return 0;
142} 145}
@@ -289,15 +292,15 @@ struct task_group root_task_group;
289static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 292static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
290/* Default task group's cfs_rq on each cpu */ 293/* Default task group's cfs_rq on each cpu */
291static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 294static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
292#endif 295#endif /* CONFIG_FAIR_GROUP_SCHED */
293 296
294#ifdef CONFIG_RT_GROUP_SCHED 297#ifdef CONFIG_RT_GROUP_SCHED
295static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 298static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
296static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 299static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
297#endif 300#endif /* CONFIG_RT_GROUP_SCHED */
298#else 301#else /* !CONFIG_FAIR_GROUP_SCHED */
299#define root_task_group init_task_group 302#define root_task_group init_task_group
300#endif 303#endif /* CONFIG_FAIR_GROUP_SCHED */
301 304
302/* task_group_lock serializes add/remove of task groups and also changes to 305/* task_group_lock serializes add/remove of task groups and also changes to
303 * a task group's cpu shares. 306 * a task group's cpu shares.
@@ -307,17 +310,20 @@ static DEFINE_SPINLOCK(task_group_lock);
307#ifdef CONFIG_FAIR_GROUP_SCHED 310#ifdef CONFIG_FAIR_GROUP_SCHED
308#ifdef CONFIG_USER_SCHED 311#ifdef CONFIG_USER_SCHED
309# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 312# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
310#else 313#else /* !CONFIG_USER_SCHED */
311# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 314# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
312#endif 315#endif /* CONFIG_USER_SCHED */
313 316
314/* 317/*
315 * A weight of 0, 1 or ULONG_MAX can cause arithmetics problems. 318 * A weight of 0 or 1 can cause arithmetics problems.
319 * A weight of a cfs_rq is the sum of weights of which entities
320 * are queued on this cfs_rq, so a weight of a entity should not be
321 * too large, so as the shares value of a task group.
316 * (The default weight is 1024 - so there's no practical 322 * (The default weight is 1024 - so there's no practical
317 * limitation from this.) 323 * limitation from this.)
318 */ 324 */
319#define MIN_SHARES 2 325#define MIN_SHARES 2
320#define MAX_SHARES (ULONG_MAX - 1) 326#define MAX_SHARES (1UL << 18)
321 327
322static int init_task_group_load = INIT_TASK_GROUP_LOAD; 328static int init_task_group_load = INIT_TASK_GROUP_LOAD;
323#endif 329#endif
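
The MAX_SHARES cap drops from ULONG_MAX - 1 to 1UL << 18 = 262144, i.e. 256 times NICE_0_LOAD, matching the new comment: group shares feed into entity weights, so they are kept small enough that the fixed-point weight arithmetic elsewhere in this file never sees absurdly large operands. A minimal sketch of the clamp as it is applied when per-cpu group shares are recomputed (the same bounds appear in __update_group_shares_cpu() later in this patch):

    /* Illustrative only: confine a computed share to [MIN_SHARES, MAX_SHARES]. */
    static unsigned long clamp_group_shares(unsigned long shares)
    {
            if (shares < MIN_SHARES)                /* 2 */
                    shares = MIN_SHARES;
            else if (shares > MAX_SHARES)           /* 1UL << 18 == 262144 */
                    shares = MAX_SHARES;
            return shares;
    }
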
@@ -360,6 +366,10 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
360#else 366#else
361 367
362static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 368static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
369static inline struct task_group *task_group(struct task_struct *p)
370{
371 return NULL;
372}
363 373
364#endif /* CONFIG_GROUP_SCHED */ 374#endif /* CONFIG_GROUP_SCHED */
365 375
@@ -370,6 +380,7 @@ struct cfs_rq {
370 380
371 u64 exec_clock; 381 u64 exec_clock;
372 u64 min_vruntime; 382 u64 min_vruntime;
383 u64 pair_start;
373 384
374 struct rb_root tasks_timeline; 385 struct rb_root tasks_timeline;
375 struct rb_node *rb_leftmost; 386 struct rb_node *rb_leftmost;
@@ -400,40 +411,28 @@ struct cfs_rq {
400 struct task_group *tg; /* group that "owns" this runqueue */ 411 struct task_group *tg; /* group that "owns" this runqueue */
401 412
402#ifdef CONFIG_SMP 413#ifdef CONFIG_SMP
403 unsigned long task_weight;
404 unsigned long shares;
405 /* 414 /*
406 * We need space to build a sched_domain wide view of the full task 415 * the part of load.weight contributed by tasks
407 * group tree, in order to avoid depending on dynamic memory allocation
408 * during the load balancing we place this in the per cpu task group
409 * hierarchy. This limits the load balancing to one instance per cpu,
410 * but more should not be needed anyway.
411 */ 416 */
412 struct aggregate_struct { 417 unsigned long task_weight;
413 /*
414 * load = weight(cpus) * f(tg)
415 *
416 * Where f(tg) is the recursive weight fraction assigned to
417 * this group.
418 */
419 unsigned long load;
420 418
421 /* 419 /*
422 * part of the group weight distributed to this span. 420 * h_load = weight * f(tg)
423 */ 421 *
424 unsigned long shares; 422 * Where f(tg) is the recursive weight fraction assigned to
423 * this group.
424 */
425 unsigned long h_load;
425 426
426 /* 427 /*
427 * The sum of all runqueue weights within this span. 428 * this cpu's part of tg->shares
428 */ 429 */
429 unsigned long rq_weight; 430 unsigned long shares;
430 431
431 /* 432 /*
432 * Weight contributed by tasks; this is the part we can 433 * load.weight at the time we set shares
433 * influence by moving tasks around. 434 */
434 */ 435 unsigned long rq_weight;
435 unsigned long task_weight;
436 } aggregate;
437#endif 436#endif
438#endif 437#endif
439}; 438};
@@ -486,6 +485,9 @@ struct root_domain {
486 */ 485 */
487 cpumask_t rto_mask; 486 cpumask_t rto_mask;
488 atomic_t rto_count; 487 atomic_t rto_count;
488#ifdef CONFIG_SMP
489 struct cpupri cpupri;
490#endif
489}; 491};
490 492
491/* 493/*
@@ -560,6 +562,9 @@ struct rq {
560 int push_cpu; 562 int push_cpu;
561 /* cpu of this runqueue: */ 563 /* cpu of this runqueue: */
562 int cpu; 564 int cpu;
565 int online;
566
567 unsigned long avg_load_per_task;
563 568
564 struct task_struct *migration_thread; 569 struct task_struct *migration_thread;
565 struct list_head migration_queue; 570 struct list_head migration_queue;
@@ -641,6 +646,24 @@ static inline void update_rq_clock(struct rq *rq)
641# define const_debug static const 646# define const_debug static const
642#endif 647#endif
643 648
649/**
650 * runqueue_is_locked
651 *
652 * Returns true if the current cpu runqueue is locked.
653 * This interface allows printk to be called with the runqueue lock
654 * held and know whether or not it is OK to wake up the klogd.
655 */
656int runqueue_is_locked(void)
657{
658 int cpu = get_cpu();
659 struct rq *rq = cpu_rq(cpu);
660 int ret;
661
662 ret = spin_is_locked(&rq->lock);
663 put_cpu();
664 return ret;
665}
666
644/* 667/*
645 * Debugging: various feature bits 668 * Debugging: various feature bits
646 */ 669 */
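
runqueue_is_locked() is added as a global helper so that code which may run with the current CPU's runqueue lock held (the comment names printk and the klogd wakeup) can first check whether that lock is taken. A hedged usage sketch; the caller and wait queue are illustrative, not taken from this patch:

    #include <linux/wait.h>

    static DECLARE_WAIT_QUEUE_HEAD(hypothetical_log_wait);

    /*
     * Illustrative caller: only attempt a wakeup when the runqueue lock is
     * not already held on this CPU, so the wakeup path cannot self-deadlock.
     */
    static void maybe_wake_log_daemon(void)
    {
            if (!runqueue_is_locked())
                    wake_up_interruptible(&hypothetical_log_wait);
    }
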
@@ -783,6 +806,12 @@ late_initcall(sched_init_debug);
783const_debug unsigned int sysctl_sched_nr_migrate = 32; 806const_debug unsigned int sysctl_sched_nr_migrate = 32;
784 807
785/* 808/*
809 * ratelimit for updating the group shares.
810 * default: 0.5ms
811 */
812const_debug unsigned int sysctl_sched_shares_ratelimit = 500000;
813
814/*
786 * period over which we measure -rt task cpu usage in us. 815 * period over which we measure -rt task cpu usage in us.
787 * default: 1s 816 * default: 1s
788 */ 817 */
@@ -809,82 +838,6 @@ static inline u64 global_rt_runtime(void)
809 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 838 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
810} 839}
811 840
812unsigned long long time_sync_thresh = 100000;
813
814static DEFINE_PER_CPU(unsigned long long, time_offset);
815static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
816
817/*
818 * Global lock which we take every now and then to synchronize
819 * the CPUs time. This method is not warp-safe, but it's good
820 * enough to synchronize slowly diverging time sources and thus
821 * it's good enough for tracing:
822 */
823static DEFINE_SPINLOCK(time_sync_lock);
824static unsigned long long prev_global_time;
825
826static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
827{
828 /*
829 * We want this inlined, to not get tracer function calls
830 * in this critical section:
831 */
832 spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
833 __raw_spin_lock(&time_sync_lock.raw_lock);
834
835 if (time < prev_global_time) {
836 per_cpu(time_offset, cpu) += prev_global_time - time;
837 time = prev_global_time;
838 } else {
839 prev_global_time = time;
840 }
841
842 __raw_spin_unlock(&time_sync_lock.raw_lock);
843 spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
844
845 return time;
846}
847
848static unsigned long long __cpu_clock(int cpu)
849{
850 unsigned long long now;
851
852 /*
853 * Only call sched_clock() if the scheduler has already been
854 * initialized (some code might call cpu_clock() very early):
855 */
856 if (unlikely(!scheduler_running))
857 return 0;
858
859 now = sched_clock_cpu(cpu);
860
861 return now;
862}
863
864/*
865 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
866 * clock constructed from sched_clock():
867 */
868unsigned long long cpu_clock(int cpu)
869{
870 unsigned long long prev_cpu_time, time, delta_time;
871 unsigned long flags;
872
873 local_irq_save(flags);
874 prev_cpu_time = per_cpu(prev_cpu_time, cpu);
875 time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
876 delta_time = time-prev_cpu_time;
877
878 if (unlikely(delta_time > time_sync_thresh)) {
879 time = __sync_cpu_clock(time, cpu);
880 per_cpu(prev_cpu_time, cpu) = time;
881 }
882 local_irq_restore(flags);
883
884 return time;
885}
886EXPORT_SYMBOL_GPL(cpu_clock);
887
888#ifndef prepare_arch_switch 841#ifndef prepare_arch_switch
889# define prepare_arch_switch(next) do { } while (0) 842# define prepare_arch_switch(next) do { } while (0)
890#endif 843#endif
@@ -1161,6 +1114,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
1161 return HRTIMER_NORESTART; 1114 return HRTIMER_NORESTART;
1162} 1115}
1163 1116
1117#ifdef CONFIG_SMP
1164static void hotplug_hrtick_disable(int cpu) 1118static void hotplug_hrtick_disable(int cpu)
1165{ 1119{
1166 struct rq *rq = cpu_rq(cpu); 1120 struct rq *rq = cpu_rq(cpu);
@@ -1216,6 +1170,7 @@ static void init_hrtick(void)
1216{ 1170{
1217 hotcpu_notifier(hotplug_hrtick, 0); 1171 hotcpu_notifier(hotplug_hrtick, 0);
1218} 1172}
1173#endif /* CONFIG_SMP */
1219 1174
1220static void init_rq_hrtick(struct rq *rq) 1175static void init_rq_hrtick(struct rq *rq)
1221{ 1176{
@@ -1345,15 +1300,15 @@ void wake_up_idle_cpu(int cpu)
1345 if (!tsk_is_polling(rq->idle)) 1300 if (!tsk_is_polling(rq->idle))
1346 smp_send_reschedule(cpu); 1301 smp_send_reschedule(cpu);
1347} 1302}
1348#endif 1303#endif /* CONFIG_NO_HZ */
1349 1304
1350#else 1305#else /* !CONFIG_SMP */
1351static void __resched_task(struct task_struct *p, int tif_bit) 1306static void __resched_task(struct task_struct *p, int tif_bit)
1352{ 1307{
1353 assert_spin_locked(&task_rq(p)->lock); 1308 assert_spin_locked(&task_rq(p)->lock);
1354 set_tsk_thread_flag(p, tif_bit); 1309 set_tsk_thread_flag(p, tif_bit);
1355} 1310}
1356#endif 1311#endif /* CONFIG_SMP */
1357 1312
1358#if BITS_PER_LONG == 32 1313#if BITS_PER_LONG == 32
1359# define WMULT_CONST (~0UL) 1314# define WMULT_CONST (~0UL)
@@ -1377,8 +1332,13 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1377{ 1332{
1378 u64 tmp; 1333 u64 tmp;
1379 1334
1380 if (!lw->inv_weight) 1335 if (!lw->inv_weight) {
1381 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)/(lw->weight+1); 1336 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1337 lw->inv_weight = 1;
1338 else
1339 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1340 / (lw->weight+1);
1341 }
1382 1342
1383 tmp = (u64)delta_exec * weight; 1343 tmp = (u64)delta_exec * weight;
1384 /* 1344 /*
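
For context on the hunk above: lw->inv_weight caches a fixed-point reciprocal of the load weight. On 32-bit, WMULT_CONST is ~0UL (about 2^32), so inv_weight is roughly 2^32 / weight and calc_delta_mine() can scale delta_exec by weight/lw->weight with a multiply and a shift instead of a division; the new branch pins inv_weight to 1 when a 64-bit weight already exceeds WMULT_CONST so the reciprocal cannot degenerate. A small standalone sketch of the approximation, assuming a 32-bit shift as suggested by the surrounding constants and ignoring the overflow splitting the real function does for huge products:

    #include <stdint.h>
    #include <stdio.h>

    #define WMULT_CONST 0xffffffffUL        /* ~2^32, mirroring the 32-bit case */
    #define WMULT_SHIFT 32

    /* delta_exec * weight / lw_weight, done as multiply + shift. */
    static uint64_t scale_delta(uint64_t delta_exec, unsigned long weight,
                                unsigned long lw_weight)
    {
            uint64_t inv = 1 + (WMULT_CONST - lw_weight / 2) / (lw_weight + 1);

            return (delta_exec * weight * inv) >> WMULT_SHIFT;
    }

    int main(void)
    {
            /*
             * 3 ms of runtime for a NICE_0_LOAD entity on a queue weighing 3072:
             * the exact answer is 1 ms; the fixed-point result lands within ~0.1%.
             */
            printf("%llu\n", (unsigned long long)scale_delta(3000000, 1024, 3072));
            return 0;
    }
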
@@ -1503,63 +1463,35 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1503#ifdef CONFIG_SMP 1463#ifdef CONFIG_SMP
1504static unsigned long source_load(int cpu, int type); 1464static unsigned long source_load(int cpu, int type);
1505static unsigned long target_load(int cpu, int type); 1465static unsigned long target_load(int cpu, int type);
1506static unsigned long cpu_avg_load_per_task(int cpu);
1507static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1466static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1508 1467
1509#ifdef CONFIG_FAIR_GROUP_SCHED 1468static unsigned long cpu_avg_load_per_task(int cpu)
1469{
1470 struct rq *rq = cpu_rq(cpu);
1510 1471
1511/* 1472 if (rq->nr_running)
1512 * Group load balancing. 1473 rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1513 *
1514 * We calculate a few balance domain wide aggregate numbers; load and weight.
1515 * Given the pictures below, and assuming each item has equal weight:
1516 *
1517 * root 1 - thread
1518 * / | \ A - group
1519 * A 1 B
1520 * /|\ / \
1521 * C 2 D 3 4
1522 * | |
1523 * 5 6
1524 *
1525 * load:
1526 * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
1527 * which equals 1/9-th of the total load.
1528 *
1529 * shares:
1530 * The weight of this group on the selected cpus.
1531 *
1532 * rq_weight:
1533 * Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
1534 * B would get 2.
1535 *
1536 * task_weight:
1537 * Part of the rq_weight contributed by tasks; all groups except B would
1538 * get 1, B gets 2.
1539 */
1540 1474
1541static inline struct aggregate_struct * 1475 return rq->avg_load_per_task;
1542aggregate(struct task_group *tg, struct sched_domain *sd)
1543{
1544 return &tg->cfs_rq[sd->first_cpu]->aggregate;
1545} 1476}
1546 1477
1547typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); 1478#ifdef CONFIG_FAIR_GROUP_SCHED
1479
1480typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
1548 1481
1549/* 1482/*
1550 * Iterate the full tree, calling @down when first entering a node and @up when 1483 * Iterate the full tree, calling @down when first entering a node and @up when
1551 * leaving it for the final time. 1484 * leaving it for the final time.
1552 */ 1485 */
1553static 1486static void
1554void aggregate_walk_tree(aggregate_func down, aggregate_func up, 1487walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
1555 struct sched_domain *sd)
1556{ 1488{
1557 struct task_group *parent, *child; 1489 struct task_group *parent, *child;
1558 1490
1559 rcu_read_lock(); 1491 rcu_read_lock();
1560 parent = &root_task_group; 1492 parent = &root_task_group;
1561down: 1493down:
1562 (*down)(parent, sd); 1494 (*down)(parent, cpu, sd);
1563 list_for_each_entry_rcu(child, &parent->children, siblings) { 1495 list_for_each_entry_rcu(child, &parent->children, siblings) {
1564 parent = child; 1496 parent = child;
1565 goto down; 1497 goto down;
@@ -1567,7 +1499,7 @@ down:
1567up: 1499up:
1568 continue; 1500 continue;
1569 } 1501 }
1570 (*up)(parent, sd); 1502 (*up)(parent, cpu, sd);
1571 1503
1572 child = parent; 1504 child = parent;
1573 parent = parent->parent; 1505 parent = parent->parent;
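
walk_tg_tree() above replaces the sched_domain-specific aggregate_walk_tree(): it iterates the task-group hierarchy depth-first without recursion, calling @down when a group is first entered and @up once all of its children have been handled. Later in this patch it is used both ways round: walk_tg_tree(tg_nop, tg_shares_up, 0, sd) for the bottom-up shares pass and walk_tg_tree(tg_load_down, tg_nop, cpu, NULL) for the top-down h_load pass. A generic, self-contained sketch of the same enter/leave pattern; the node layout is illustrative, not the kernel's struct task_group:

    struct node {
            struct node *parent;
            struct node *first_child;
            struct node *next_sibling;
    };

    typedef void (*visitor)(struct node *n, void *data);

    /* Call down() when first entering a node, up() when leaving it for good. */
    static void walk_tree(struct node *root, visitor down, visitor up, void *data)
    {
            struct node *n = root;

            down(n, data);
            while (n) {
                    if (n->first_child) {                   /* descend */
                            n = n->first_child;
                            down(n, data);
                            continue;
                    }
                    up(n, data);                            /* done with a leaf */
                    while (n != root && !n->next_sibling) {
                            n = n->parent;                  /* children finished */
                            up(n, data);
                    }
                    if (n == root)
                            break;
                    n = n->next_sibling;                    /* next subtree */
                    down(n, data);
            }
    }
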
@@ -1576,90 +1508,23 @@ up:
1576 rcu_read_unlock(); 1508 rcu_read_unlock();
1577} 1509}
1578 1510
1579/*
1580 * Calculate the aggregate runqueue weight.
1581 */
1582static
1583void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
1584{
1585 unsigned long rq_weight = 0;
1586 unsigned long task_weight = 0;
1587 int i;
1588
1589 for_each_cpu_mask(i, sd->span) {
1590 rq_weight += tg->cfs_rq[i]->load.weight;
1591 task_weight += tg->cfs_rq[i]->task_weight;
1592 }
1593
1594 aggregate(tg, sd)->rq_weight = rq_weight;
1595 aggregate(tg, sd)->task_weight = task_weight;
1596}
1597
1598/*
1599 * Compute the weight of this group on the given cpus.
1600 */
1601static
1602void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
1603{
1604 unsigned long shares = 0;
1605 int i;
1606
1607 for_each_cpu_mask(i, sd->span)
1608 shares += tg->cfs_rq[i]->shares;
1609
1610 if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
1611 shares = tg->shares;
1612
1613 aggregate(tg, sd)->shares = shares;
1614}
1615
1616/*
1617 * Compute the load fraction assigned to this group, relies on the aggregate
1618 * weight and this group's parent's load, i.e. top-down.
1619 */
1620static
1621void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
1622{
1623 unsigned long load;
1624
1625 if (!tg->parent) {
1626 int i;
1627
1628 load = 0;
1629 for_each_cpu_mask(i, sd->span)
1630 load += cpu_rq(i)->load.weight;
1631
1632 } else {
1633 load = aggregate(tg->parent, sd)->load;
1634
1635 /*
1636 * shares is our weight in the parent's rq so
1637 * shares/parent->rq_weight gives our fraction of the load
1638 */
1639 load *= aggregate(tg, sd)->shares;
1640 load /= aggregate(tg->parent, sd)->rq_weight + 1;
1641 }
1642
1643 aggregate(tg, sd)->load = load;
1644}
1645
1646static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1511static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1647 1512
1648/* 1513/*
1649 * Calculate and set the cpu's group shares. 1514 * Calculate and set the cpu's group shares.
1650 */ 1515 */
1651static void 1516static void
1652__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, 1517__update_group_shares_cpu(struct task_group *tg, int cpu,
1653 int tcpu) 1518 unsigned long sd_shares, unsigned long sd_rq_weight)
1654{ 1519{
1655 int boost = 0; 1520 int boost = 0;
1656 unsigned long shares; 1521 unsigned long shares;
1657 unsigned long rq_weight; 1522 unsigned long rq_weight;
1658 1523
1659 if (!tg->se[tcpu]) 1524 if (!tg->se[cpu])
1660 return; 1525 return;
1661 1526
1662 rq_weight = tg->cfs_rq[tcpu]->load.weight; 1527 rq_weight = tg->cfs_rq[cpu]->load.weight;
1663 1528
1664 /* 1529 /*
1665 * If there are currently no tasks on the cpu pretend there is one of 1530 * If there are currently no tasks on the cpu pretend there is one of
@@ -1671,170 +1536,139 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
1671 rq_weight = NICE_0_LOAD; 1536 rq_weight = NICE_0_LOAD;
1672 } 1537 }
1673 1538
1539 if (unlikely(rq_weight > sd_rq_weight))
1540 rq_weight = sd_rq_weight;
1541
1674 /* 1542 /*
1675 * \Sum shares * rq_weight 1543 * \Sum shares * rq_weight
1676 * shares = ----------------------- 1544 * shares = -----------------------
1677 * \Sum rq_weight 1545 * \Sum rq_weight
1678 * 1546 *
1679 */ 1547 */
1680 shares = aggregate(tg, sd)->shares * rq_weight; 1548 shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
1681 shares /= aggregate(tg, sd)->rq_weight + 1;
1682 1549
1683 /* 1550 /*
1684 * record the actual number of shares, not the boosted amount. 1551 * record the actual number of shares, not the boosted amount.
1685 */ 1552 */
1686 tg->cfs_rq[tcpu]->shares = boost ? 0 : shares; 1553 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1554 tg->cfs_rq[cpu]->rq_weight = rq_weight;
1687 1555
1688 if (shares < MIN_SHARES) 1556 if (shares < MIN_SHARES)
1689 shares = MIN_SHARES; 1557 shares = MIN_SHARES;
1690 else if (shares > MAX_SHARES) 1558 else if (shares > MAX_SHARES)
1691 shares = MAX_SHARES; 1559 shares = MAX_SHARES;
1692 1560
1693 __set_se_shares(tg->se[tcpu], shares); 1561 __set_se_shares(tg->se[cpu], shares);
1694} 1562}
1695 1563
1696/* 1564/*
1697 * Re-adjust the weights on the cpu the task came from and on the cpu the 1565 * Re-compute the task group their per cpu shares over the given domain.
1698 * task went to. 1566 * This needs to be done in a bottom-up fashion because the rq weight of a
1567 * parent group depends on the shares of its child groups.
1699 */ 1568 */
1700static void 1569static void
1701__move_group_shares(struct task_group *tg, struct sched_domain *sd, 1570tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1702 int scpu, int dcpu)
1703{ 1571{
1704 unsigned long shares; 1572 unsigned long rq_weight = 0;
1705 1573 unsigned long shares = 0;
1706 shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; 1574 int i;
1707 1575
1708 __update_group_shares_cpu(tg, sd, scpu); 1576 for_each_cpu_mask(i, sd->span) {
1709 __update_group_shares_cpu(tg, sd, dcpu); 1577 rq_weight += tg->cfs_rq[i]->load.weight;
1578 shares += tg->cfs_rq[i]->shares;
1579 }
1710 1580
1711 /* 1581 if ((!shares && rq_weight) || shares > tg->shares)
1712 * ensure we never loose shares due to rounding errors in the 1582 shares = tg->shares;
1713 * above redistribution.
1714 */
1715 shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
1716 if (shares)
1717 tg->cfs_rq[dcpu]->shares += shares;
1718}
1719 1583
1720/* 1584 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1721 * Because changing a group's shares changes the weight of the super-group 1585 shares = tg->shares;
1722 * we need to walk up the tree and change all shares until we hit the root.
1723 */
1724static void
1725move_group_shares(struct task_group *tg, struct sched_domain *sd,
1726 int scpu, int dcpu)
1727{
1728 while (tg) {
1729 __move_group_shares(tg, sd, scpu, dcpu);
1730 tg = tg->parent;
1731 }
1732}
1733 1586
1734static 1587 if (!rq_weight)
1735void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) 1588 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
1736{
1737 unsigned long shares = aggregate(tg, sd)->shares;
1738 int i;
1739 1589
1740 for_each_cpu_mask(i, sd->span) { 1590 for_each_cpu_mask(i, sd->span) {
1741 struct rq *rq = cpu_rq(i); 1591 struct rq *rq = cpu_rq(i);
1742 unsigned long flags; 1592 unsigned long flags;
1743 1593
1744 spin_lock_irqsave(&rq->lock, flags); 1594 spin_lock_irqsave(&rq->lock, flags);
1745 __update_group_shares_cpu(tg, sd, i); 1595 __update_group_shares_cpu(tg, i, shares, rq_weight);
1746 spin_unlock_irqrestore(&rq->lock, flags); 1596 spin_unlock_irqrestore(&rq->lock, flags);
1747 } 1597 }
1748
1749 aggregate_group_shares(tg, sd);
1750
1751 /*
1752 * ensure we never loose shares due to rounding errors in the
1753 * above redistribution.
1754 */
1755 shares -= aggregate(tg, sd)->shares;
1756 if (shares) {
1757 tg->cfs_rq[sd->first_cpu]->shares += shares;
1758 aggregate(tg, sd)->shares += shares;
1759 }
1760} 1598}
1761 1599
1762/* 1600/*
1763 * Calculate the accumulative weight and recursive load of each task group 1601 * Compute the cpu's hierarchical load factor for each task group.
1764 * while walking down the tree. 1602 * This needs to be done in a top-down fashion because the load of a child
1603 * group is a fraction of its parents load.
1765 */ 1604 */
1766static 1605static void
1767void aggregate_get_down(struct task_group *tg, struct sched_domain *sd) 1606tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
1768{ 1607{
1769 aggregate_group_weight(tg, sd); 1608 unsigned long load;
1770 aggregate_group_shares(tg, sd);
1771 aggregate_group_load(tg, sd);
1772}
1773 1609
1774/* 1610 if (!tg->parent) {
1775 * Rebalance the cpu shares while walking back up the tree. 1611 load = cpu_rq(cpu)->load.weight;
1776 */ 1612 } else {
1777static 1613 load = tg->parent->cfs_rq[cpu]->h_load;
1778void aggregate_get_up(struct task_group *tg, struct sched_domain *sd) 1614 load *= tg->cfs_rq[cpu]->shares;
1779{ 1615 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1780 aggregate_group_set_shares(tg, sd); 1616 }
1781}
1782 1617
1783static DEFINE_PER_CPU(spinlock_t, aggregate_lock); 1618 tg->cfs_rq[cpu]->h_load = load;
1619}
1784 1620
1785static void __init init_aggregate(void) 1621static void
1622tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
1786{ 1623{
1787 int i;
1788
1789 for_each_possible_cpu(i)
1790 spin_lock_init(&per_cpu(aggregate_lock, i));
1791} 1624}
1792 1625
1793static int get_aggregate(struct sched_domain *sd) 1626static void update_shares(struct sched_domain *sd)
1794{ 1627{
1795 if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu))) 1628 u64 now = cpu_clock(raw_smp_processor_id());
1796 return 0; 1629 s64 elapsed = now - sd->last_update;
1797 1630
1798 aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd); 1631 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1799 return 1; 1632 sd->last_update = now;
1633 walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
1634 }
1800} 1635}
1801 1636
1802static void put_aggregate(struct sched_domain *sd) 1637static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1803{ 1638{
1804 spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu)); 1639 spin_unlock(&rq->lock);
1640 update_shares(sd);
1641 spin_lock(&rq->lock);
1805} 1642}
1806 1643
1807static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) 1644static void update_h_load(int cpu)
1808{ 1645{
1809 cfs_rq->shares = shares; 1646 walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
1810} 1647}
1811 1648
1812#else 1649#else
1813 1650
1814static inline void init_aggregate(void) 1651static inline void update_shares(struct sched_domain *sd)
1815{ 1652{
1816} 1653}
1817 1654
1818static inline int get_aggregate(struct sched_domain *sd) 1655static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1819{ 1656{
1820 return 0;
1821} 1657}
1822 1658
1823static inline void put_aggregate(struct sched_domain *sd)
1824{
1825}
1826#endif 1659#endif
1827 1660
1828#else /* CONFIG_SMP */ 1661#endif
1829 1662
1830#ifdef CONFIG_FAIR_GROUP_SCHED 1663#ifdef CONFIG_FAIR_GROUP_SCHED
1831static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) 1664static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1832{ 1665{
1666#ifdef CONFIG_SMP
1667 cfs_rq->shares = shares;
1668#endif
1833} 1669}
1834#endif 1670#endif
1835 1671
1836#endif /* CONFIG_SMP */
1837
1838#include "sched_stats.h" 1672#include "sched_stats.h"
1839#include "sched_idletask.c" 1673#include "sched_idletask.c"
1840#include "sched_fair.c" 1674#include "sched_fair.c"
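
To make the two new passes above concrete, here is a worked example under simple assumptions: one task group with tg->shares = 1024 spread over two CPUs whose cfs_rq weights are 2048 and 1024. tg_shares_up() hands each CPU a slice of the group's shares proportional to its runqueue weight, and tg_load_down() later scales the parent's h_load by that same per-cpu fraction. A tiny standalone program that just reproduces the arithmetic:

    #include <stdio.h>

    int main(void)
    {
            unsigned long tg_shares = 1024;                 /* group's total shares */
            unsigned long rq_weight[2] = { 2048, 1024 };    /* per-cpu cfs_rq weights */
            unsigned long sd_rq_weight = rq_weight[0] + rq_weight[1];

            for (int i = 0; i < 2; i++) {
                    /* shares = (sd_shares * rq_weight) / (sd_rq_weight + 1) */
                    unsigned long shares =
                            (tg_shares * rq_weight[i]) / (sd_rq_weight + 1);
                    printf("cpu%d shares = %lu\n", i, shares);  /* 682, then 341 */
            }
            return 0;
    }
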
@@ -1844,6 +1678,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1844#endif 1678#endif
1845 1679
1846#define sched_class_highest (&rt_sched_class) 1680#define sched_class_highest (&rt_sched_class)
1681#define for_each_class(class) \
1682 for (class = sched_class_highest; class; class = class->next)
1847 1683
1848static void inc_nr_running(struct rq *rq) 1684static void inc_nr_running(struct rq *rq)
1849{ 1685{
@@ -1876,6 +1712,12 @@ static void set_load_weight(struct task_struct *p)
1876 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1712 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1877} 1713}
1878 1714
1715static void update_avg(u64 *avg, u64 sample)
1716{
1717 s64 diff = sample - *avg;
1718 *avg += diff >> 3;
1719}
1720
1879static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1721static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1880{ 1722{
1881 sched_info_queued(p); 1723 sched_info_queued(p);
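
update_avg() above is an exponential moving average with weight 1/8: each call moves the stored average one eighth of the way toward the new sample (the >> 3). Starting from 0 and feeding a constant sample of 800 gives 100, 187, 263, ... converging on 800; it is used just below to track a task's wakeup overlap. A standalone sketch:

    #include <stdint.h>
    #include <stdio.h>

    /* Same shape as the kernel helper: avg += (sample - avg) / 8. */
    static void ewma_update(int64_t *avg, int64_t sample)
    {
            int64_t diff = sample - *avg;
            *avg += diff >> 3;
    }

    int main(void)
    {
            int64_t avg = 0;

            for (int i = 0; i < 3; i++) {
                    ewma_update(&avg, 800);
                    printf("%lld\n", (long long)avg);       /* 100, 187, 263 */
            }
            return 0;
    }
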
@@ -1885,6 +1727,13 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1885 1727
1886static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1728static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1887{ 1729{
1730 if (sleep && p->se.last_wakeup) {
1731 update_avg(&p->se.avg_overlap,
1732 p->se.sum_exec_runtime - p->se.last_wakeup);
1733 p->se.last_wakeup = 0;
1734 }
1735
1736 sched_info_dequeued(p);
1888 p->sched_class->dequeue_task(rq, p, sleep); 1737 p->sched_class->dequeue_task(rq, p, sleep);
1889 p->se.on_rq = 0; 1738 p->se.on_rq = 0;
1890} 1739}
@@ -1968,12 +1817,6 @@ inline int task_curr(const struct task_struct *p)
1968 return cpu_curr(task_cpu(p)) == p; 1817 return cpu_curr(task_cpu(p)) == p;
1969} 1818}
1970 1819
1971/* Used instead of source_load when we know the type == 0 */
1972unsigned long weighted_cpuload(const int cpu)
1973{
1974 return cpu_rq(cpu)->load.weight;
1975}
1976
1977static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1820static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1978{ 1821{
1979 set_task_rq(p, cpu); 1822 set_task_rq(p, cpu);
@@ -2002,6 +1845,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2002 1845
2003#ifdef CONFIG_SMP 1846#ifdef CONFIG_SMP
2004 1847
1848/* Used instead of source_load when we know the type == 0 */
1849static unsigned long weighted_cpuload(const int cpu)
1850{
1851 return cpu_rq(cpu)->load.weight;
1852}
1853
2005/* 1854/*
2006 * Is this task likely cache-hot: 1855 * Is this task likely cache-hot:
2007 */ 1856 */
@@ -2212,7 +2061,7 @@ static unsigned long source_load(int cpu, int type)
2212 struct rq *rq = cpu_rq(cpu); 2061 struct rq *rq = cpu_rq(cpu);
2213 unsigned long total = weighted_cpuload(cpu); 2062 unsigned long total = weighted_cpuload(cpu);
2214 2063
2215 if (type == 0) 2064 if (type == 0 || !sched_feat(LB_BIAS))
2216 return total; 2065 return total;
2217 2066
2218 return min(rq->cpu_load[type-1], total); 2067 return min(rq->cpu_load[type-1], total);
@@ -2227,25 +2076,13 @@ static unsigned long target_load(int cpu, int type)
2227 struct rq *rq = cpu_rq(cpu); 2076 struct rq *rq = cpu_rq(cpu);
2228 unsigned long total = weighted_cpuload(cpu); 2077 unsigned long total = weighted_cpuload(cpu);
2229 2078
2230 if (type == 0) 2079 if (type == 0 || !sched_feat(LB_BIAS))
2231 return total; 2080 return total;
2232 2081
2233 return max(rq->cpu_load[type-1], total); 2082 return max(rq->cpu_load[type-1], total);
2234} 2083}
2235 2084
2236/* 2085/*
2237 * Return the average load per task on the cpu's run queue
2238 */
2239static unsigned long cpu_avg_load_per_task(int cpu)
2240{
2241 struct rq *rq = cpu_rq(cpu);
2242 unsigned long total = weighted_cpuload(cpu);
2243 unsigned long n = rq->nr_running;
2244
2245 return n ? total / n : SCHED_LOAD_SCALE;
2246}
2247
2248/*
2249 * find_idlest_group finds and returns the least busy CPU group within the 2086 * find_idlest_group finds and returns the least busy CPU group within the
2250 * domain. 2087 * domain.
2251 */ 2088 */
@@ -2351,6 +2188,9 @@ static int sched_balance_self(int cpu, int flag)
2351 sd = tmp; 2188 sd = tmp;
2352 } 2189 }
2353 2190
2191 if (sd)
2192 update_shares(sd);
2193
2354 while (sd) { 2194 while (sd) {
2355 cpumask_t span, tmpmask; 2195 cpumask_t span, tmpmask;
2356 struct sched_group *group; 2196 struct sched_group *group;
@@ -2417,6 +2257,22 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2417 if (!sched_feat(SYNC_WAKEUPS)) 2257 if (!sched_feat(SYNC_WAKEUPS))
2418 sync = 0; 2258 sync = 0;
2419 2259
2260#ifdef CONFIG_SMP
2261 if (sched_feat(LB_WAKEUP_UPDATE)) {
2262 struct sched_domain *sd;
2263
2264 this_cpu = raw_smp_processor_id();
2265 cpu = task_cpu(p);
2266
2267 for_each_domain(this_cpu, sd) {
2268 if (cpu_isset(cpu, sd->span)) {
2269 update_shares(sd);
2270 break;
2271 }
2272 }
2273 }
2274#endif
2275
2420 smp_wmb(); 2276 smp_wmb();
2421 rq = task_rq_lock(p, &flags); 2277 rq = task_rq_lock(p, &flags);
2422 old_state = p->state; 2278 old_state = p->state;
@@ -2463,7 +2319,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2463 } 2319 }
2464 } 2320 }
2465 } 2321 }
2466#endif 2322#endif /* CONFIG_SCHEDSTATS */
2467 2323
2468out_activate: 2324out_activate:
2469#endif /* CONFIG_SMP */ 2325#endif /* CONFIG_SMP */
@@ -2481,6 +2337,9 @@ out_activate:
2481 success = 1; 2337 success = 1;
2482 2338
2483out_running: 2339out_running:
2340 trace_mark(kernel_sched_wakeup,
2341 "pid %d state %ld ## rq %p task %p rq->curr %p",
2342 p->pid, p->state, rq, p, rq->curr);
2484 check_preempt_curr(rq, p); 2343 check_preempt_curr(rq, p);
2485 2344
2486 p->state = TASK_RUNNING; 2345 p->state = TASK_RUNNING;
@@ -2489,6 +2348,8 @@ out_running:
2489 p->sched_class->task_wake_up(rq, p); 2348 p->sched_class->task_wake_up(rq, p);
2490#endif 2349#endif
2491out: 2350out:
2351 current->se.last_wakeup = current->se.sum_exec_runtime;
2352
2492 task_rq_unlock(rq, &flags); 2353 task_rq_unlock(rq, &flags);
2493 2354
2494 return success; 2355 return success;
@@ -2611,6 +2472,9 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2611 p->sched_class->task_new(rq, p); 2472 p->sched_class->task_new(rq, p);
2612 inc_nr_running(rq); 2473 inc_nr_running(rq);
2613 } 2474 }
2475 trace_mark(kernel_sched_wakeup_new,
2476 "pid %d state %ld ## rq %p task %p rq->curr %p",
2477 p->pid, p->state, rq, p, rq->curr);
2614 check_preempt_curr(rq, p); 2478 check_preempt_curr(rq, p);
2615#ifdef CONFIG_SMP 2479#ifdef CONFIG_SMP
2616 if (p->sched_class->task_wake_up) 2480 if (p->sched_class->task_wake_up)
@@ -2663,7 +2527,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
2663 notifier->ops->sched_out(notifier, next); 2527 notifier->ops->sched_out(notifier, next);
2664} 2528}
2665 2529
2666#else 2530#else /* !CONFIG_PREEMPT_NOTIFIERS */
2667 2531
2668static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2532static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2669{ 2533{
@@ -2675,7 +2539,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
2675{ 2539{
2676} 2540}
2677 2541
2678#endif 2542#endif /* CONFIG_PREEMPT_NOTIFIERS */
2679 2543
2680/** 2544/**
2681 * prepare_task_switch - prepare to switch tasks 2545 * prepare_task_switch - prepare to switch tasks
@@ -2783,6 +2647,11 @@ context_switch(struct rq *rq, struct task_struct *prev,
2783 struct mm_struct *mm, *oldmm; 2647 struct mm_struct *mm, *oldmm;
2784 2648
2785 prepare_task_switch(rq, prev, next); 2649 prepare_task_switch(rq, prev, next);
2650 trace_mark(kernel_sched_schedule,
2651 "prev_pid %d next_pid %d prev_state %ld "
2652 "## rq %p prev %p next %p",
2653 prev->pid, next->pid, prev->state,
2654 rq, prev, next);
2786 mm = next->mm; 2655 mm = next->mm;
2787 oldmm = prev->active_mm; 2656 oldmm = prev->active_mm;
2788 /* 2657 /*
@@ -3117,7 +2986,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3117 enum cpu_idle_type idle, int *all_pinned, 2986 enum cpu_idle_type idle, int *all_pinned,
3118 int *this_best_prio, struct rq_iterator *iterator) 2987 int *this_best_prio, struct rq_iterator *iterator)
3119{ 2988{
3120 int loops = 0, pulled = 0, pinned = 0, skip_for_load; 2989 int loops = 0, pulled = 0, pinned = 0;
3121 struct task_struct *p; 2990 struct task_struct *p;
3122 long rem_load_move = max_load_move; 2991 long rem_load_move = max_load_move;
3123 2992
@@ -3133,14 +3002,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3133next: 3002next:
3134 if (!p || loops++ > sysctl_sched_nr_migrate) 3003 if (!p || loops++ > sysctl_sched_nr_migrate)
3135 goto out; 3004 goto out;
3136 /* 3005
3137 * To help distribute high priority tasks across CPUs we don't 3006 if ((p->se.load.weight >> 1) > rem_load_move ||
3138 * skip a task if it will be the highest priority task (i.e. smallest
3139 * prio value) on its new queue regardless of its load weight
3140 */
3141 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
3142 SCHED_LOAD_SCALE_FUZZ;
3143 if ((skip_for_load && p->prio >= *this_best_prio) ||
3144 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { 3007 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3145 p = iterator->next(iterator->arg); 3008 p = iterator->next(iterator->arg);
3146 goto next; 3009 goto next;
@@ -3195,6 +3058,10 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3195 max_load_move - total_load_moved, 3058 max_load_move - total_load_moved,
3196 sd, idle, all_pinned, &this_best_prio); 3059 sd, idle, all_pinned, &this_best_prio);
3197 class = class->next; 3060 class = class->next;
3061
3062 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3063 break;
3064
3198 } while (class && max_load_move > total_load_moved); 3065 } while (class && max_load_move > total_load_moved);
3199 3066
3200 return total_load_moved > 0; 3067 return total_load_moved > 0;
@@ -3271,6 +3138,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3271 max_load = this_load = total_load = total_pwr = 0; 3138 max_load = this_load = total_load = total_pwr = 0;
3272 busiest_load_per_task = busiest_nr_running = 0; 3139 busiest_load_per_task = busiest_nr_running = 0;
3273 this_load_per_task = this_nr_running = 0; 3140 this_load_per_task = this_nr_running = 0;
3141
3274 if (idle == CPU_NOT_IDLE) 3142 if (idle == CPU_NOT_IDLE)
3275 load_idx = sd->busy_idx; 3143 load_idx = sd->busy_idx;
3276 else if (idle == CPU_NEWLY_IDLE) 3144 else if (idle == CPU_NEWLY_IDLE)
@@ -3285,6 +3153,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3285 int __group_imb = 0; 3153 int __group_imb = 0;
3286 unsigned int balance_cpu = -1, first_idle_cpu = 0; 3154 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3287 unsigned long sum_nr_running, sum_weighted_load; 3155 unsigned long sum_nr_running, sum_weighted_load;
3156 unsigned long sum_avg_load_per_task;
3157 unsigned long avg_load_per_task;
3288 3158
3289 local_group = cpu_isset(this_cpu, group->cpumask); 3159 local_group = cpu_isset(this_cpu, group->cpumask);
3290 3160
@@ -3293,6 +3163,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3293 3163
3294 /* Tally up the load of all CPUs in the group */ 3164 /* Tally up the load of all CPUs in the group */
3295 sum_weighted_load = sum_nr_running = avg_load = 0; 3165 sum_weighted_load = sum_nr_running = avg_load = 0;
3166 sum_avg_load_per_task = avg_load_per_task = 0;
3167
3296 max_cpu_load = 0; 3168 max_cpu_load = 0;
3297 min_cpu_load = ~0UL; 3169 min_cpu_load = ~0UL;
3298 3170
@@ -3326,6 +3198,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3326 avg_load += load; 3198 avg_load += load;
3327 sum_nr_running += rq->nr_running; 3199 sum_nr_running += rq->nr_running;
3328 sum_weighted_load += weighted_cpuload(i); 3200 sum_weighted_load += weighted_cpuload(i);
3201
3202 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3329 } 3203 }
3330 3204
3331 /* 3205 /*
@@ -3347,7 +3221,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3347 avg_load = sg_div_cpu_power(group, 3221 avg_load = sg_div_cpu_power(group,
3348 avg_load * SCHED_LOAD_SCALE); 3222 avg_load * SCHED_LOAD_SCALE);
3349 3223
3350 if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE) 3224
3225 /*
3226 * Consider the group unbalanced when the imbalance is larger
3227 * than the average weight of two tasks.
3228 *
3229 * APZ: with cgroup the avg task weight can vary wildly and
3230 * might not be a suitable number - should we keep a
3231 * normalized nr_running number somewhere that negates
3232 * the hierarchy?
3233 */
3234 avg_load_per_task = sg_div_cpu_power(group,
3235 sum_avg_load_per_task * SCHED_LOAD_SCALE);
3236
3237 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3351 __group_imb = 1; 3238 __group_imb = 1;
3352 3239
3353 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 3240 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
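
The group-imbalance test above now compares the spread of per-cpu loads against twice the group's average task weight instead of the fixed SCHED_LOAD_SCALE, so a spread smaller than a single (possibly very heavy) task no longer flags the group, which appears to be the intent behind dropping the constant. A worked comparison under assumed numbers:

    old test:  max_cpu_load - min_cpu_load > SCHED_LOAD_SCALE          (fixed 1024)
    new test:  max_cpu_load - min_cpu_load > 2 * avg_load_per_task

    e.g. spread = 3072 - 1024 = 2048
         average task weight 512   ->  2048 > 2*512  = 1024  ->  __group_imb = 1
         average task weight 4096  ->  2048 < 2*4096 = 8192  ->  group not flagged
                                        (the old fixed test would still have fired)
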
@@ -3488,9 +3375,9 @@ small_imbalance:
3488 if (busiest_load_per_task > this_load_per_task) 3375 if (busiest_load_per_task > this_load_per_task)
3489 imbn = 1; 3376 imbn = 1;
3490 } else 3377 } else
3491 this_load_per_task = SCHED_LOAD_SCALE; 3378 this_load_per_task = cpu_avg_load_per_task(this_cpu);
3492 3379
3493 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= 3380 if (max_load - this_load + 2*busiest_load_per_task >=
3494 busiest_load_per_task * imbn) { 3381 busiest_load_per_task * imbn) {
3495 *imbalance = busiest_load_per_task; 3382 *imbalance = busiest_load_per_task;
3496 return busiest; 3383 return busiest;
@@ -3600,12 +3487,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3600 unsigned long imbalance; 3487 unsigned long imbalance;
3601 struct rq *busiest; 3488 struct rq *busiest;
3602 unsigned long flags; 3489 unsigned long flags;
3603 int unlock_aggregate;
3604 3490
3605 cpus_setall(*cpus); 3491 cpus_setall(*cpus);
3606 3492
3607 unlock_aggregate = get_aggregate(sd);
3608
3609 /* 3493 /*
3610 * When power savings policy is enabled for the parent domain, idle 3494 * When power savings policy is enabled for the parent domain, idle
3611 * sibling can pick up load irrespective of busy siblings. In this case, 3495 * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3619,6 +3503,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3619 schedstat_inc(sd, lb_count[idle]); 3503 schedstat_inc(sd, lb_count[idle]);
3620 3504
3621redo: 3505redo:
3506 update_shares(sd);
3622 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3507 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3623 cpus, balance); 3508 cpus, balance);
3624 3509
@@ -3742,8 +3627,8 @@ out_one_pinned:
3742 else 3627 else
3743 ld_moved = 0; 3628 ld_moved = 0;
3744out: 3629out:
3745 if (unlock_aggregate) 3630 if (ld_moved)
3746 put_aggregate(sd); 3631 update_shares(sd);
3747 return ld_moved; 3632 return ld_moved;
3748} 3633}
3749 3634
@@ -3779,6 +3664,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3779 3664
3780 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); 3665 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
3781redo: 3666redo:
3667 update_shares_locked(this_rq, sd);
3782 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, 3668 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
3783 &sd_idle, cpus, NULL); 3669 &sd_idle, cpus, NULL);
3784 if (!group) { 3670 if (!group) {
@@ -3822,6 +3708,7 @@ redo:
3822 } else 3708 } else
3823 sd->nr_balance_failed = 0; 3709 sd->nr_balance_failed = 0;
3824 3710
3711 update_shares_locked(this_rq, sd);
3825 return ld_moved; 3712 return ld_moved;
3826 3713
3827out_balanced: 3714out_balanced:
@@ -4013,6 +3900,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4013 /* Earliest time when we have to do rebalance again */ 3900 /* Earliest time when we have to do rebalance again */
4014 unsigned long next_balance = jiffies + 60*HZ; 3901 unsigned long next_balance = jiffies + 60*HZ;
4015 int update_next_balance = 0; 3902 int update_next_balance = 0;
3903 int need_serialize;
4016 cpumask_t tmp; 3904 cpumask_t tmp;
4017 3905
4018 for_each_domain(cpu, sd) { 3906 for_each_domain(cpu, sd) {
@@ -4030,8 +3918,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4030 if (interval > HZ*NR_CPUS/10) 3918 if (interval > HZ*NR_CPUS/10)
4031 interval = HZ*NR_CPUS/10; 3919 interval = HZ*NR_CPUS/10;
4032 3920
3921 need_serialize = sd->flags & SD_SERIALIZE;
4033 3922
4034 if (sd->flags & SD_SERIALIZE) { 3923 if (need_serialize) {
4035 if (!spin_trylock(&balancing)) 3924 if (!spin_trylock(&balancing))
4036 goto out; 3925 goto out;
4037 } 3926 }
@@ -4047,7 +3936,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4047 } 3936 }
4048 sd->last_balance = jiffies; 3937 sd->last_balance = jiffies;
4049 } 3938 }
4050 if (sd->flags & SD_SERIALIZE) 3939 if (need_serialize)
4051 spin_unlock(&balancing); 3940 spin_unlock(&balancing);
4052out: 3941out:
4053 if (time_after(next_balance, sd->last_balance + interval)) { 3942 if (time_after(next_balance, sd->last_balance + interval)) {
@@ -4362,26 +4251,44 @@ void scheduler_tick(void)
4362#endif 4251#endif
4363} 4252}
4364 4253
4365#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) 4254#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4255 defined(CONFIG_PREEMPT_TRACER))
4256
4257static inline unsigned long get_parent_ip(unsigned long addr)
4258{
4259 if (in_lock_functions(addr)) {
4260 addr = CALLER_ADDR2;
4261 if (in_lock_functions(addr))
4262 addr = CALLER_ADDR3;
4263 }
4264 return addr;
4265}
4366 4266
4367void __kprobes add_preempt_count(int val) 4267void __kprobes add_preempt_count(int val)
4368{ 4268{
4269#ifdef CONFIG_DEBUG_PREEMPT
4369 /* 4270 /*
4370 * Underflow? 4271 * Underflow?
4371 */ 4272 */
4372 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 4273 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4373 return; 4274 return;
4275#endif
4374 preempt_count() += val; 4276 preempt_count() += val;
4277#ifdef CONFIG_DEBUG_PREEMPT
4375 /* 4278 /*
4376 * Spinlock count overflowing soon? 4279 * Spinlock count overflowing soon?
4377 */ 4280 */
4378 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 4281 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4379 PREEMPT_MASK - 10); 4282 PREEMPT_MASK - 10);
4283#endif
4284 if (preempt_count() == val)
4285 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4380} 4286}
4381EXPORT_SYMBOL(add_preempt_count); 4287EXPORT_SYMBOL(add_preempt_count);
4382 4288
4383void __kprobes sub_preempt_count(int val) 4289void __kprobes sub_preempt_count(int val)
4384{ 4290{
4291#ifdef CONFIG_DEBUG_PREEMPT
4385 /* 4292 /*
4386 * Underflow? 4293 * Underflow?
4387 */ 4294 */
@@ -4393,7 +4300,10 @@ void __kprobes sub_preempt_count(int val)
4393 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 4300 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4394 !(preempt_count() & PREEMPT_MASK))) 4301 !(preempt_count() & PREEMPT_MASK)))
4395 return; 4302 return;
4303#endif
4396 4304
4305 if (preempt_count() == val)
4306 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4397 preempt_count() -= val; 4307 preempt_count() -= val;
4398} 4308}
4399EXPORT_SYMBOL(sub_preempt_count); 4309EXPORT_SYMBOL(sub_preempt_count);
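
With the DEBUG_PREEMPT checks now conditional, add_preempt_count()/sub_preempt_count() are also built for CONFIG_PREEMPT_TRACER, and the new hooks fire only on the outermost transition: trace_preempt_off() when the count after the increment equals the value just added (preemption was enabled before), and trace_preempt_on() just before the matching decrement returns the count to zero. Nested disables therefore produce no extra events, as in this illustrative (non-kernel-source) nesting:

    #include <linux/preempt.h>

    /* Illustrative nesting only; real callers are spread all over the kernel. */
    static void nested_preempt_example(void)
    {
            preempt_disable();      /* count 0 -> 1: trace_preempt_off() fires */
            preempt_disable();      /* count 1 -> 2: no event */
            preempt_enable();       /* count 2 -> 1: no event */
            preempt_enable();       /* count 1 -> 0: trace_preempt_on() fires */
    }
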
@@ -4411,6 +4321,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
4411 prev->comm, prev->pid, preempt_count()); 4321 prev->comm, prev->pid, preempt_count());
4412 4322
4413 debug_show_held_locks(prev); 4323 debug_show_held_locks(prev);
4324 print_modules();
4414 if (irqs_disabled()) 4325 if (irqs_disabled())
4415 print_irqtrace_events(prev); 4326 print_irqtrace_events(prev);
4416 4327
@@ -4430,7 +4341,7 @@ static inline void schedule_debug(struct task_struct *prev)
4430 * schedule() atomically, we ignore that path for now. 4341 * schedule() atomically, we ignore that path for now.
4431 * Otherwise, whine if we are scheduling when we should not be. 4342 * Otherwise, whine if we are scheduling when we should not be.
4432 */ 4343 */
4433 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state)) 4344 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
4434 __schedule_bug(prev); 4345 __schedule_bug(prev);
4435 4346
4436 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4347 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -4484,7 +4395,7 @@ asmlinkage void __sched schedule(void)
4484 struct task_struct *prev, *next; 4395 struct task_struct *prev, *next;
4485 unsigned long *switch_count; 4396 unsigned long *switch_count;
4486 struct rq *rq; 4397 struct rq *rq;
4487 int cpu; 4398 int cpu, hrtick = sched_feat(HRTICK);
4488 4399
4489need_resched: 4400need_resched:
4490 preempt_disable(); 4401 preempt_disable();
@@ -4499,7 +4410,8 @@ need_resched_nonpreemptible:
4499 4410
4500 schedule_debug(prev); 4411 schedule_debug(prev);
4501 4412
4502 hrtick_clear(rq); 4413 if (hrtick)
4414 hrtick_clear(rq);
4503 4415
4504 /* 4416 /*
4505 * Do the rq-clock update outside the rq lock: 4417 * Do the rq-clock update outside the rq lock:
@@ -4510,12 +4422,10 @@ need_resched_nonpreemptible:
4510 clear_tsk_need_resched(prev); 4422 clear_tsk_need_resched(prev);
4511 4423
4512 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 4424 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
4513 if (unlikely((prev->state & TASK_INTERRUPTIBLE) && 4425 if (unlikely(signal_pending_state(prev->state, prev)))
4514 signal_pending(prev))) {
4515 prev->state = TASK_RUNNING; 4426 prev->state = TASK_RUNNING;
4516 } else { 4427 else
4517 deactivate_task(rq, prev, 1); 4428 deactivate_task(rq, prev, 1);
4518 }
4519 switch_count = &prev->nvcsw; 4429 switch_count = &prev->nvcsw;
4520 } 4430 }
4521 4431
@@ -4547,7 +4457,8 @@ need_resched_nonpreemptible:
4547 } else 4457 } else
4548 spin_unlock_irq(&rq->lock); 4458 spin_unlock_irq(&rq->lock);
4549 4459
4550 hrtick_set(rq); 4460 if (hrtick)
4461 hrtick_set(rq);
4551 4462
4552 if (unlikely(reacquire_kernel_lock(current) < 0)) 4463 if (unlikely(reacquire_kernel_lock(current) < 0))
4553 goto need_resched_nonpreemptible; 4464 goto need_resched_nonpreemptible;
@@ -4741,22 +4652,20 @@ do_wait_for_common(struct completion *x, long timeout, int state)
4741 signal_pending(current)) || 4652 signal_pending(current)) ||
4742 (state == TASK_KILLABLE && 4653 (state == TASK_KILLABLE &&
4743 fatal_signal_pending(current))) { 4654 fatal_signal_pending(current))) {
4744 __remove_wait_queue(&x->wait, &wait); 4655 timeout = -ERESTARTSYS;
4745 return -ERESTARTSYS; 4656 break;
4746 } 4657 }
4747 __set_current_state(state); 4658 __set_current_state(state);
4748 spin_unlock_irq(&x->wait.lock); 4659 spin_unlock_irq(&x->wait.lock);
4749 timeout = schedule_timeout(timeout); 4660 timeout = schedule_timeout(timeout);
4750 spin_lock_irq(&x->wait.lock); 4661 spin_lock_irq(&x->wait.lock);
4751 if (!timeout) { 4662 } while (!x->done && timeout);
4752 __remove_wait_queue(&x->wait, &wait);
4753 return timeout;
4754 }
4755 } while (!x->done);
4756 __remove_wait_queue(&x->wait, &wait); 4663 __remove_wait_queue(&x->wait, &wait);
4664 if (!x->done)
4665 return timeout;
4757 } 4666 }
4758 x->done--; 4667 x->done--;
4759 return timeout; 4668 return timeout ?: 1;
4760} 4669}
4761 4670
4762static long __sched 4671static long __sched
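
The rewritten do_wait_for_common() records either -ERESTARTSYS or the expired timeout in `timeout`, breaks out of the loop, removes itself from the wait queue exactly once, and finally returns `timeout ?: 1` so that a wait which did complete never reports 0 even when the remaining time rounded down to zero. Callers of the timeout variants rely on exactly that convention; a hedged, driver-style usage sketch (names are illustrative, not from this patch):

    #include <linux/completion.h>
    #include <linux/errno.h>
    #include <linux/jiffies.h>

    /* Illustrative caller: distinguish "timed out" from "completed". */
    static int wait_for_device_ready(struct completion *done)
    {
            unsigned long left = wait_for_completion_timeout(done,
                                                             msecs_to_jiffies(100));
            if (!left)
                    return -ETIMEDOUT;      /* 0: the full timeout elapsed */
            return 0;                       /* > 0: completed, jiffies left over */
    }
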
@@ -5086,16 +4995,8 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
5086 set_load_weight(p); 4995 set_load_weight(p);
5087} 4996}
5088 4997
5089/** 4998static int __sched_setscheduler(struct task_struct *p, int policy,
5090 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 4999 struct sched_param *param, bool user)
5091 * @p: the task in question.
5092 * @policy: new policy.
5093 * @param: structure containing the new RT priority.
5094 *
5095 * NOTE that the task may be already dead.
5096 */
5097int sched_setscheduler(struct task_struct *p, int policy,
5098 struct sched_param *param)
5099{ 5000{
5100 int retval, oldprio, oldpolicy = -1, on_rq, running; 5001 int retval, oldprio, oldpolicy = -1, on_rq, running;
5101 unsigned long flags; 5002 unsigned long flags;
@@ -5127,7 +5028,7 @@ recheck:
5127 /* 5028 /*
5128 * Allow unprivileged RT tasks to decrease priority: 5029 * Allow unprivileged RT tasks to decrease priority:
5129 */ 5030 */
5130 if (!capable(CAP_SYS_NICE)) { 5031 if (user && !capable(CAP_SYS_NICE)) {
5131 if (rt_policy(policy)) { 5032 if (rt_policy(policy)) {
5132 unsigned long rlim_rtprio; 5033 unsigned long rlim_rtprio;
5133 5034
@@ -5163,7 +5064,8 @@ recheck:
5163 * Do not allow realtime tasks into groups that have no runtime 5064 * Do not allow realtime tasks into groups that have no runtime
5164 * assigned. 5065 * assigned.
5165 */ 5066 */
5166 if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) 5067 if (user
5068 && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
5167 return -EPERM; 5069 return -EPERM;
5168#endif 5070#endif
5169 5071
@@ -5212,8 +5114,39 @@ recheck:
5212 5114
5213 return 0; 5115 return 0;
5214} 5116}
5117
5118/**
5119 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
5120 * @p: the task in question.
5121 * @policy: new policy.
5122 * @param: structure containing the new RT priority.
5123 *
5124 * NOTE that the task may be already dead.
5125 */
5126int sched_setscheduler(struct task_struct *p, int policy,
5127 struct sched_param *param)
5128{
5129 return __sched_setscheduler(p, policy, param, true);
5130}
5215EXPORT_SYMBOL_GPL(sched_setscheduler); 5131EXPORT_SYMBOL_GPL(sched_setscheduler);
5216 5132
5133/**
5134 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
5135 * @p: the task in question.
5136 * @policy: new policy.
5137 * @param: structure containing the new RT priority.
5138 *
5139 * Just like sched_setscheduler, only don't bother checking if the
5140 * current context has permission. For example, this is needed in
5141 * stop_machine(): we create temporary high priority worker threads,
5142 * but our caller might not have that capability.
5143 */
5144int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5145 struct sched_param *param)
5146{
5147 return __sched_setscheduler(p, policy, param, false);
5148}
5149
5217static int 5150static int
5218do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 5151do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
5219{ 5152{
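
sched_setscheduler_nocheck() gives in-kernel callers a way to set a policy without the CAP_SYS_NICE and RT-group-runtime checks applied to userspace; the comment's stop_machine() workers are the motivating case, since those temporary high-priority kthreads may be created on behalf of a caller lacking that capability. A typical in-kernel call would look like this sketch (the worker and priority choice are illustrative):

    #include <linux/sched.h>

    /* Illustrative: promote a freshly created kthread to high-priority FIFO. */
    static void make_worker_rt(struct task_struct *worker)
    {
            struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };

            sched_setscheduler_nocheck(worker, SCHED_FIFO, &param);
    }
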
@@ -5412,24 +5345,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
5412 return sched_setaffinity(pid, &new_mask); 5345 return sched_setaffinity(pid, &new_mask);
5413} 5346}
5414 5347
5415/*
5416 * Represents all cpu's present in the system
5417 * In systems capable of hotplug, this map could dynamically grow
5418 * as new cpu's are detected in the system via any platform specific
5419 * method, such as ACPI for e.g.
5420 */
5421
5422cpumask_t cpu_present_map __read_mostly;
5423EXPORT_SYMBOL(cpu_present_map);
5424
5425#ifndef CONFIG_SMP
5426cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
5427EXPORT_SYMBOL(cpu_online_map);
5428
5429cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
5430EXPORT_SYMBOL(cpu_possible_map);
5431#endif
5432
5433long sched_getaffinity(pid_t pid, cpumask_t *mask) 5348long sched_getaffinity(pid_t pid, cpumask_t *mask)
5434{ 5349{
5435 struct task_struct *p; 5350 struct task_struct *p;
@@ -5726,7 +5641,7 @@ out_unlock:
5726 return retval; 5641 return retval;
5727} 5642}
5728 5643
5729static const char stat_nam[] = "RSDTtZX"; 5644static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5730 5645
5731void sched_show_task(struct task_struct *p) 5646void sched_show_task(struct task_struct *p)
5732{ 5647{
@@ -5913,6 +5828,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5913 goto out; 5828 goto out;
5914 } 5829 }
5915 5830
5831 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5832 !cpus_equal(p->cpus_allowed, *new_mask))) {
5833 ret = -EINVAL;
5834 goto out;
5835 }
5836
5916 if (p->sched_class->set_cpus_allowed) 5837 if (p->sched_class->set_cpus_allowed)
5917 p->sched_class->set_cpus_allowed(p, new_mask); 5838 p->sched_class->set_cpus_allowed(p, new_mask);
5918 else { 5839 else {
@@ -5964,10 +5885,10 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5964 double_rq_lock(rq_src, rq_dest); 5885 double_rq_lock(rq_src, rq_dest);
5965 /* Already moved. */ 5886 /* Already moved. */
5966 if (task_cpu(p) != src_cpu) 5887 if (task_cpu(p) != src_cpu)
5967 goto out; 5888 goto done;
5968 /* Affinity changed (again). */ 5889 /* Affinity changed (again). */
5969 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 5890 if (!cpu_isset(dest_cpu, p->cpus_allowed))
5970 goto out; 5891 goto fail;
5971 5892
5972 on_rq = p->se.on_rq; 5893 on_rq = p->se.on_rq;
5973 if (on_rq) 5894 if (on_rq)
@@ -5978,8 +5899,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5978 activate_task(rq_dest, p, 0); 5899 activate_task(rq_dest, p, 0);
5979 check_preempt_curr(rq_dest, p); 5900 check_preempt_curr(rq_dest, p);
5980 } 5901 }
5902done:
5981 ret = 1; 5903 ret = 1;
5982out: 5904fail:
5983 double_rq_unlock(rq_src, rq_dest); 5905 double_rq_unlock(rq_src, rq_dest);
5984 return ret; 5906 return ret;
5985} 5907}
@@ -6229,6 +6151,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
6229 next = pick_next_task(rq, rq->curr); 6151 next = pick_next_task(rq, rq->curr);
6230 if (!next) 6152 if (!next)
6231 break; 6153 break;
6154 next->sched_class->put_prev_task(rq, next);
6232 migrate_dead(dead_cpu, next); 6155 migrate_dead(dead_cpu, next);
6233 6156
6234 } 6157 }
@@ -6400,6 +6323,36 @@ static void unregister_sched_domain_sysctl(void)
6400} 6323}
6401#endif 6324#endif
6402 6325
6326static void set_rq_online(struct rq *rq)
6327{
6328 if (!rq->online) {
6329 const struct sched_class *class;
6330
6331 cpu_set(rq->cpu, rq->rd->online);
6332 rq->online = 1;
6333
6334 for_each_class(class) {
6335 if (class->rq_online)
6336 class->rq_online(rq);
6337 }
6338 }
6339}
6340
6341static void set_rq_offline(struct rq *rq)
6342{
6343 if (rq->online) {
6344 const struct sched_class *class;
6345
6346 for_each_class(class) {
6347 if (class->rq_offline)
6348 class->rq_offline(rq);
6349 }
6350
6351 cpu_clear(rq->cpu, rq->rd->online);
6352 rq->online = 0;
6353 }
6354}
6355
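set_rq_online() and set_rq_offline() walk every scheduling class and invoke its new rq_online/rq_offline hooks, replacing the old join_domain/leave_domain callbacks. As a hedged sketch (not part of the diff), a class that wants to track which runqueues are live could wire the callbacks up as below; the function names and bodies are purely illustrative:

static void example_rq_online(struct rq *rq)
{
	/* publish rq->cpu to class-private state, e.g. a priority map */
}

static void example_rq_offline(struct rq *rq)
{
	/* retract rq->cpu from class-private state */
}

/*
 * Wired up in the hypothetical class definition:
 *	.rq_online	= example_rq_online,
 *	.rq_offline	= example_rq_offline,
 */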
6403/* 6356/*
6404 * migration_call - callback that gets triggered when a CPU is added. 6357 * migration_call - callback that gets triggered when a CPU is added.
6405 * Here we can start up the necessary migration thread for the new CPU. 6358 * Here we can start up the necessary migration thread for the new CPU.
@@ -6437,7 +6390,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6437 spin_lock_irqsave(&rq->lock, flags); 6390 spin_lock_irqsave(&rq->lock, flags);
6438 if (rq->rd) { 6391 if (rq->rd) {
6439 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6392 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6440 cpu_set(cpu, rq->rd->online); 6393
6394 set_rq_online(rq);
6441 } 6395 }
6442 spin_unlock_irqrestore(&rq->lock, flags); 6396 spin_unlock_irqrestore(&rq->lock, flags);
6443 break; 6397 break;
@@ -6498,7 +6452,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6498 spin_lock_irqsave(&rq->lock, flags); 6452 spin_lock_irqsave(&rq->lock, flags);
6499 if (rq->rd) { 6453 if (rq->rd) {
6500 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6454 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6501 cpu_clear(cpu, rq->rd->online); 6455 set_rq_offline(rq);
6502 } 6456 }
6503 spin_unlock_irqrestore(&rq->lock, flags); 6457 spin_unlock_irqrestore(&rq->lock, flags);
6504 break; 6458 break;
@@ -6532,6 +6486,28 @@ void __init migration_init(void)
6532 6486
6533#ifdef CONFIG_SCHED_DEBUG 6487#ifdef CONFIG_SCHED_DEBUG
6534 6488
6489static inline const char *sd_level_to_string(enum sched_domain_level lvl)
6490{
6491 switch (lvl) {
6492 case SD_LV_NONE:
6493 return "NONE";
6494 case SD_LV_SIBLING:
6495 return "SIBLING";
6496 case SD_LV_MC:
6497 return "MC";
6498 case SD_LV_CPU:
6499 return "CPU";
6500 case SD_LV_NODE:
6501 return "NODE";
6502 case SD_LV_ALLNODES:
6503 return "ALLNODES";
6504 case SD_LV_MAX:
6505 return "MAX";
6506
6507 }
6508 return "MAX";
6509}
6510
6535static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6511static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6536 cpumask_t *groupmask) 6512 cpumask_t *groupmask)
6537{ 6513{
@@ -6551,7 +6527,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6551 return -1; 6527 return -1;
6552 } 6528 }
6553 6529
6554 printk(KERN_CONT "span %s\n", str); 6530 printk(KERN_CONT "span %s level %s\n",
6531 str, sd_level_to_string(sd->level));
6555 6532
6556 if (!cpu_isset(cpu, sd->span)) { 6533 if (!cpu_isset(cpu, sd->span)) {
6557 printk(KERN_ERR "ERROR: domain->span does not contain " 6534 printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6635,9 +6612,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6635 } 6612 }
6636 kfree(groupmask); 6613 kfree(groupmask);
6637} 6614}
6638#else 6615#else /* !CONFIG_SCHED_DEBUG */
6639# define sched_domain_debug(sd, cpu) do { } while (0) 6616# define sched_domain_debug(sd, cpu) do { } while (0)
6640#endif 6617#endif /* CONFIG_SCHED_DEBUG */
6641 6618
6642static int sd_degenerate(struct sched_domain *sd) 6619static int sd_degenerate(struct sched_domain *sd)
6643{ 6620{
@@ -6697,20 +6674,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6697static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6674static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6698{ 6675{
6699 unsigned long flags; 6676 unsigned long flags;
6700 const struct sched_class *class;
6701 6677
6702 spin_lock_irqsave(&rq->lock, flags); 6678 spin_lock_irqsave(&rq->lock, flags);
6703 6679
6704 if (rq->rd) { 6680 if (rq->rd) {
6705 struct root_domain *old_rd = rq->rd; 6681 struct root_domain *old_rd = rq->rd;
6706 6682
6707 for (class = sched_class_highest; class; class = class->next) { 6683 if (cpu_isset(rq->cpu, old_rd->online))
6708 if (class->leave_domain) 6684 set_rq_offline(rq);
6709 class->leave_domain(rq);
6710 }
6711 6685
6712 cpu_clear(rq->cpu, old_rd->span); 6686 cpu_clear(rq->cpu, old_rd->span);
6713 cpu_clear(rq->cpu, old_rd->online);
6714 6687
6715 if (atomic_dec_and_test(&old_rd->refcount)) 6688 if (atomic_dec_and_test(&old_rd->refcount))
6716 kfree(old_rd); 6689 kfree(old_rd);
@@ -6721,12 +6694,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6721 6694
6722 cpu_set(rq->cpu, rd->span); 6695 cpu_set(rq->cpu, rd->span);
6723 if (cpu_isset(rq->cpu, cpu_online_map)) 6696 if (cpu_isset(rq->cpu, cpu_online_map))
6724 cpu_set(rq->cpu, rd->online); 6697 set_rq_online(rq);
6725
6726 for (class = sched_class_highest; class; class = class->next) {
6727 if (class->join_domain)
6728 class->join_domain(rq);
6729 }
6730 6698
6731 spin_unlock_irqrestore(&rq->lock, flags); 6699 spin_unlock_irqrestore(&rq->lock, flags);
6732} 6700}
@@ -6737,6 +6705,8 @@ static void init_rootdomain(struct root_domain *rd)
6737 6705
6738 cpus_clear(rd->span); 6706 cpus_clear(rd->span);
6739 cpus_clear(rd->online); 6707 cpus_clear(rd->online);
6708
6709 cpupri_init(&rd->cpupri);
6740} 6710}
6741 6711
6742static void init_defrootdomain(void) 6712static void init_defrootdomain(void)
@@ -6879,9 +6849,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
6879 6849
6880 min_val = INT_MAX; 6850 min_val = INT_MAX;
6881 6851
6882 for (i = 0; i < MAX_NUMNODES; i++) { 6852 for (i = 0; i < nr_node_ids; i++) {
6883 /* Start at @node */ 6853 /* Start at @node */
6884 n = (node + i) % MAX_NUMNODES; 6854 n = (node + i) % nr_node_ids;
6885 6855
6886 if (!nr_cpus_node(n)) 6856 if (!nr_cpus_node(n))
6887 continue; 6857 continue;
@@ -6931,7 +6901,7 @@ static void sched_domain_node_span(int node, cpumask_t *span)
6931 cpus_or(*span, *span, *nodemask); 6901 cpus_or(*span, *span, *nodemask);
6932 } 6902 }
6933} 6903}
6934#endif 6904#endif /* CONFIG_NUMA */
6935 6905
6936int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6906int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6937 6907
@@ -6950,7 +6920,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6950 *sg = &per_cpu(sched_group_cpus, cpu); 6920 *sg = &per_cpu(sched_group_cpus, cpu);
6951 return cpu; 6921 return cpu;
6952} 6922}
6953#endif 6923#endif /* CONFIG_SCHED_SMT */
6954 6924
6955/* 6925/*
6956 * multi-core sched-domains: 6926 * multi-core sched-domains:
@@ -6958,7 +6928,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6958#ifdef CONFIG_SCHED_MC 6928#ifdef CONFIG_SCHED_MC
6959static DEFINE_PER_CPU(struct sched_domain, core_domains); 6929static DEFINE_PER_CPU(struct sched_domain, core_domains);
6960static DEFINE_PER_CPU(struct sched_group, sched_group_core); 6930static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6961#endif 6931#endif /* CONFIG_SCHED_MC */
6962 6932
6963#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6933#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6964static int 6934static int
@@ -7060,7 +7030,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
7060 sg = sg->next; 7030 sg = sg->next;
7061 } while (sg != group_head); 7031 } while (sg != group_head);
7062} 7032}
7063#endif 7033#endif /* CONFIG_NUMA */
7064 7034
7065#ifdef CONFIG_NUMA 7035#ifdef CONFIG_NUMA
7066/* Free memory allocated for various sched_group structures */ 7036/* Free memory allocated for various sched_group structures */
@@ -7075,7 +7045,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
7075 if (!sched_group_nodes) 7045 if (!sched_group_nodes)
7076 continue; 7046 continue;
7077 7047
7078 for (i = 0; i < MAX_NUMNODES; i++) { 7048 for (i = 0; i < nr_node_ids; i++) {
7079 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7049 struct sched_group *oldsg, *sg = sched_group_nodes[i];
7080 7050
7081 *nodemask = node_to_cpumask(i); 7051 *nodemask = node_to_cpumask(i);
@@ -7097,11 +7067,11 @@ next_sg:
7097 sched_group_nodes_bycpu[cpu] = NULL; 7067 sched_group_nodes_bycpu[cpu] = NULL;
7098 } 7068 }
7099} 7069}
7100#else 7070#else /* !CONFIG_NUMA */
7101static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 7071static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
7102{ 7072{
7103} 7073}
7104#endif 7074#endif /* CONFIG_NUMA */
7105 7075
7106/* 7076/*
7107 * Initialize sched groups cpu_power. 7077 * Initialize sched groups cpu_power.
@@ -7219,7 +7189,12 @@ static int default_relax_domain_level = -1;
7219 7189
7220static int __init setup_relax_domain_level(char *str) 7190static int __init setup_relax_domain_level(char *str)
7221{ 7191{
7222 default_relax_domain_level = simple_strtoul(str, NULL, 0); 7192 unsigned long val;
7193
7194 val = simple_strtoul(str, NULL, 0);
7195 if (val < SD_LV_MAX)
7196 default_relax_domain_level = val;
7197
7223 return 1; 7198 return 1;
7224} 7199}
7225__setup("relax_domain_level=", setup_relax_domain_level); 7200__setup("relax_domain_level=", setup_relax_domain_level);
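With the added bounds check, an out-of-range relax_domain_level boot parameter no longer lands in default_relax_domain_level. Illustrative only; the numeric values assume the SD_LV_* ordering shown in sd_level_to_string() above:

/*
 *	relax_domain_level=1	accepted: 1 < SD_LV_MAX, default updated
 *	relax_domain_level=99	ignored:  99 >= SD_LV_MAX, default left unchanged
 */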
@@ -7263,7 +7238,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7263 /* 7238 /*
7264 * Allocate the per-node list of sched groups 7239 * Allocate the per-node list of sched groups
7265 */ 7240 */
7266 sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), 7241 sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
7267 GFP_KERNEL); 7242 GFP_KERNEL);
7268 if (!sched_group_nodes) { 7243 if (!sched_group_nodes) {
7269 printk(KERN_WARNING "Can not alloc sched group node list\n"); 7244 printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -7316,7 +7291,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7316 SD_INIT(sd, ALLNODES); 7291 SD_INIT(sd, ALLNODES);
7317 set_domain_attribute(sd, attr); 7292 set_domain_attribute(sd, attr);
7318 sd->span = *cpu_map; 7293 sd->span = *cpu_map;
7319 sd->first_cpu = first_cpu(sd->span);
7320 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); 7294 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
7321 p = sd; 7295 p = sd;
7322 sd_allnodes = 1; 7296 sd_allnodes = 1;
@@ -7327,7 +7301,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7327 SD_INIT(sd, NODE); 7301 SD_INIT(sd, NODE);
7328 set_domain_attribute(sd, attr); 7302 set_domain_attribute(sd, attr);
7329 sched_domain_node_span(cpu_to_node(i), &sd->span); 7303 sched_domain_node_span(cpu_to_node(i), &sd->span);
7330 sd->first_cpu = first_cpu(sd->span);
7331 sd->parent = p; 7304 sd->parent = p;
7332 if (p) 7305 if (p)
7333 p->child = sd; 7306 p->child = sd;
@@ -7339,7 +7312,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7339 SD_INIT(sd, CPU); 7312 SD_INIT(sd, CPU);
7340 set_domain_attribute(sd, attr); 7313 set_domain_attribute(sd, attr);
7341 sd->span = *nodemask; 7314 sd->span = *nodemask;
7342 sd->first_cpu = first_cpu(sd->span);
7343 sd->parent = p; 7315 sd->parent = p;
7344 if (p) 7316 if (p)
7345 p->child = sd; 7317 p->child = sd;
@@ -7351,7 +7323,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7351 SD_INIT(sd, MC); 7323 SD_INIT(sd, MC);
7352 set_domain_attribute(sd, attr); 7324 set_domain_attribute(sd, attr);
7353 sd->span = cpu_coregroup_map(i); 7325 sd->span = cpu_coregroup_map(i);
7354 sd->first_cpu = first_cpu(sd->span);
7355 cpus_and(sd->span, sd->span, *cpu_map); 7326 cpus_and(sd->span, sd->span, *cpu_map);
7356 sd->parent = p; 7327 sd->parent = p;
7357 p->child = sd; 7328 p->child = sd;
@@ -7364,7 +7335,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7364 SD_INIT(sd, SIBLING); 7335 SD_INIT(sd, SIBLING);
7365 set_domain_attribute(sd, attr); 7336 set_domain_attribute(sd, attr);
7366 sd->span = per_cpu(cpu_sibling_map, i); 7337 sd->span = per_cpu(cpu_sibling_map, i);
7367 sd->first_cpu = first_cpu(sd->span);
7368 cpus_and(sd->span, sd->span, *cpu_map); 7338 cpus_and(sd->span, sd->span, *cpu_map);
7369 sd->parent = p; 7339 sd->parent = p;
7370 p->child = sd; 7340 p->child = sd;
@@ -7407,7 +7377,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7407#endif 7377#endif
7408 7378
7409 /* Set up physical groups */ 7379 /* Set up physical groups */
7410 for (i = 0; i < MAX_NUMNODES; i++) { 7380 for (i = 0; i < nr_node_ids; i++) {
7411 SCHED_CPUMASK_VAR(nodemask, allmasks); 7381 SCHED_CPUMASK_VAR(nodemask, allmasks);
7412 SCHED_CPUMASK_VAR(send_covered, allmasks); 7382 SCHED_CPUMASK_VAR(send_covered, allmasks);
7413 7383
@@ -7431,7 +7401,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7431 send_covered, tmpmask); 7401 send_covered, tmpmask);
7432 } 7402 }
7433 7403
7434 for (i = 0; i < MAX_NUMNODES; i++) { 7404 for (i = 0; i < nr_node_ids; i++) {
7435 /* Set up node groups */ 7405 /* Set up node groups */
7436 struct sched_group *sg, *prev; 7406 struct sched_group *sg, *prev;
7437 SCHED_CPUMASK_VAR(nodemask, allmasks); 7407 SCHED_CPUMASK_VAR(nodemask, allmasks);
@@ -7470,9 +7440,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7470 cpus_or(*covered, *covered, *nodemask); 7440 cpus_or(*covered, *covered, *nodemask);
7471 prev = sg; 7441 prev = sg;
7472 7442
7473 for (j = 0; j < MAX_NUMNODES; j++) { 7443 for (j = 0; j < nr_node_ids; j++) {
7474 SCHED_CPUMASK_VAR(notcovered, allmasks); 7444 SCHED_CPUMASK_VAR(notcovered, allmasks);
7475 int n = (i + j) % MAX_NUMNODES; 7445 int n = (i + j) % nr_node_ids;
7476 node_to_cpumask_ptr(pnodemask, n); 7446 node_to_cpumask_ptr(pnodemask, n);
7477 7447
7478 cpus_complement(*notcovered, *covered); 7448 cpus_complement(*notcovered, *covered);
@@ -7525,7 +7495,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7525 } 7495 }
7526 7496
7527#ifdef CONFIG_NUMA 7497#ifdef CONFIG_NUMA
7528 for (i = 0; i < MAX_NUMNODES; i++) 7498 for (i = 0; i < nr_node_ids; i++)
7529 init_numa_sched_groups_power(sched_group_nodes[i]); 7499 init_numa_sched_groups_power(sched_group_nodes[i]);
7530 7500
7531 if (sd_allnodes) { 7501 if (sd_allnodes) {
@@ -7568,8 +7538,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
7568 7538
7569static cpumask_t *doms_cur; /* current sched domains */ 7539static cpumask_t *doms_cur; /* current sched domains */
7570static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 7540static int ndoms_cur; /* number of sched domains in 'doms_cur' */
7571static struct sched_domain_attr *dattr_cur; /* attributes of custom domains 7541static struct sched_domain_attr *dattr_cur;
7572 in 'doms_cur' */ 7542 /* attributes of custom domains in 'doms_cur' */
7573 7543
7574/* 7544/*
7575 * Special case: If a kmalloc of a doms_cur partition (array of 7545 * Special case: If a kmalloc of a doms_cur partition (array of
@@ -7583,6 +7553,18 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
7583} 7553}
7584 7554
7585/* 7555/*
7556 * Free current domain masks.
7557 * Called after all cpus are attached to NULL domain.
7558 */
7559static void free_sched_domains(void)
7560{
7561 ndoms_cur = 0;
7562 if (doms_cur != &fallback_doms)
7563 kfree(doms_cur);
7564 doms_cur = &fallback_doms;
7565}
7566
7567/*
7586 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 7568 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
7587 * For now this just excludes isolated cpus, but could be used to 7569 * For now this just excludes isolated cpus, but could be used to
7588 * exclude other special cases in the future. 7570 * exclude other special cases in the future.
@@ -7729,6 +7711,7 @@ int arch_reinit_sched_domains(void)
7729 get_online_cpus(); 7711 get_online_cpus();
7730 mutex_lock(&sched_domains_mutex); 7712 mutex_lock(&sched_domains_mutex);
7731 detach_destroy_domains(&cpu_online_map); 7713 detach_destroy_domains(&cpu_online_map);
7714 free_sched_domains();
7732 err = arch_init_sched_domains(&cpu_online_map); 7715 err = arch_init_sched_domains(&cpu_online_map);
7733 mutex_unlock(&sched_domains_mutex); 7716 mutex_unlock(&sched_domains_mutex);
7734 put_online_cpus(); 7717 put_online_cpus();
@@ -7797,7 +7780,7 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7797#endif 7780#endif
7798 return err; 7781 return err;
7799} 7782}
7800#endif 7783#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7801 7784
7802/* 7785/*
7803 * Force a reinitialization of the sched domains hierarchy. The domains 7786 * Force a reinitialization of the sched domains hierarchy. The domains
@@ -7808,20 +7791,28 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7808static int update_sched_domains(struct notifier_block *nfb, 7791static int update_sched_domains(struct notifier_block *nfb,
7809 unsigned long action, void *hcpu) 7792 unsigned long action, void *hcpu)
7810{ 7793{
7794 int cpu = (int)(long)hcpu;
7795
7811 switch (action) { 7796 switch (action) {
7812 case CPU_UP_PREPARE:
7813 case CPU_UP_PREPARE_FROZEN:
7814 case CPU_DOWN_PREPARE: 7797 case CPU_DOWN_PREPARE:
7815 case CPU_DOWN_PREPARE_FROZEN: 7798 case CPU_DOWN_PREPARE_FROZEN:
7799 disable_runtime(cpu_rq(cpu));
7800 /* fall-through */
7801 case CPU_UP_PREPARE:
7802 case CPU_UP_PREPARE_FROZEN:
7816 detach_destroy_domains(&cpu_online_map); 7803 detach_destroy_domains(&cpu_online_map);
7804 free_sched_domains();
7817 return NOTIFY_OK; 7805 return NOTIFY_OK;
7818 7806
7819 case CPU_UP_CANCELED: 7807
7820 case CPU_UP_CANCELED_FROZEN:
7821 case CPU_DOWN_FAILED: 7808 case CPU_DOWN_FAILED:
7822 case CPU_DOWN_FAILED_FROZEN: 7809 case CPU_DOWN_FAILED_FROZEN:
7823 case CPU_ONLINE: 7810 case CPU_ONLINE:
7824 case CPU_ONLINE_FROZEN: 7811 case CPU_ONLINE_FROZEN:
7812 enable_runtime(cpu_rq(cpu));
7813 /* fall-through */
7814 case CPU_UP_CANCELED:
7815 case CPU_UP_CANCELED_FROZEN:
7825 case CPU_DEAD: 7816 case CPU_DEAD:
7826 case CPU_DEAD_FROZEN: 7817 case CPU_DEAD_FROZEN:
7827 /* 7818 /*
@@ -7832,8 +7823,16 @@ static int update_sched_domains(struct notifier_block *nfb,
7832 return NOTIFY_DONE; 7823 return NOTIFY_DONE;
7833 } 7824 }
7834 7825
7826#ifndef CONFIG_CPUSETS
7827 /*
7828 * Create default domain partitioning if cpusets are disabled.
7829 * Otherwise we let cpusets rebuild the domains based on the
7830 * current setup.
7831 */
7832
7835 /* The hotplug lock is already held by cpu_up/cpu_down */ 7833 /* The hotplug lock is already held by cpu_up/cpu_down */
7836 arch_init_sched_domains(&cpu_online_map); 7834 arch_init_sched_domains(&cpu_online_map);
7835#endif
7837 7836
7838 return NOTIFY_OK; 7837 return NOTIFY_OK;
7839} 7838}
@@ -7973,7 +7972,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7973 else 7972 else
7974 rt_se->rt_rq = parent->my_q; 7973 rt_se->rt_rq = parent->my_q;
7975 7974
7976 rt_se->rt_rq = &rq->rt;
7977 rt_se->my_q = rt_rq; 7975 rt_se->my_q = rt_rq;
7978 rt_se->parent = parent; 7976 rt_se->parent = parent;
7979 INIT_LIST_HEAD(&rt_se->run_list); 7977 INIT_LIST_HEAD(&rt_se->run_list);
@@ -8014,8 +8012,8 @@ void __init sched_init(void)
8014 8012
8015 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 8013 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
8016 ptr += nr_cpu_ids * sizeof(void **); 8014 ptr += nr_cpu_ids * sizeof(void **);
8017#endif 8015#endif /* CONFIG_USER_SCHED */
8018#endif 8016#endif /* CONFIG_FAIR_GROUP_SCHED */
8019#ifdef CONFIG_RT_GROUP_SCHED 8017#ifdef CONFIG_RT_GROUP_SCHED
8020 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 8018 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
8021 ptr += nr_cpu_ids * sizeof(void **); 8019 ptr += nr_cpu_ids * sizeof(void **);
@@ -8029,12 +8027,11 @@ void __init sched_init(void)
8029 8027
8030 root_task_group.rt_rq = (struct rt_rq **)ptr; 8028 root_task_group.rt_rq = (struct rt_rq **)ptr;
8031 ptr += nr_cpu_ids * sizeof(void **); 8029 ptr += nr_cpu_ids * sizeof(void **);
8032#endif 8030#endif /* CONFIG_USER_SCHED */
8033#endif 8031#endif /* CONFIG_RT_GROUP_SCHED */
8034 } 8032 }
8035 8033
8036#ifdef CONFIG_SMP 8034#ifdef CONFIG_SMP
8037 init_aggregate();
8038 init_defrootdomain(); 8035 init_defrootdomain();
8039#endif 8036#endif
8040 8037
@@ -8047,8 +8044,8 @@ void __init sched_init(void)
8047#ifdef CONFIG_USER_SCHED 8044#ifdef CONFIG_USER_SCHED
8048 init_rt_bandwidth(&root_task_group.rt_bandwidth, 8045 init_rt_bandwidth(&root_task_group.rt_bandwidth,
8049 global_rt_period(), RUNTIME_INF); 8046 global_rt_period(), RUNTIME_INF);
8050#endif 8047#endif /* CONFIG_USER_SCHED */
8051#endif 8048#endif /* CONFIG_RT_GROUP_SCHED */
8052 8049
8053#ifdef CONFIG_GROUP_SCHED 8050#ifdef CONFIG_GROUP_SCHED
8054 list_add(&init_task_group.list, &task_groups); 8051 list_add(&init_task_group.list, &task_groups);
@@ -8058,8 +8055,8 @@ void __init sched_init(void)
8058 INIT_LIST_HEAD(&root_task_group.children); 8055 INIT_LIST_HEAD(&root_task_group.children);
8059 init_task_group.parent = &root_task_group; 8056 init_task_group.parent = &root_task_group;
8060 list_add(&init_task_group.siblings, &root_task_group.children); 8057 list_add(&init_task_group.siblings, &root_task_group.children);
8061#endif 8058#endif /* CONFIG_USER_SCHED */
8062#endif 8059#endif /* CONFIG_GROUP_SCHED */
8063 8060
8064 for_each_possible_cpu(i) { 8061 for_each_possible_cpu(i) {
8065 struct rq *rq; 8062 struct rq *rq;
@@ -8139,6 +8136,7 @@ void __init sched_init(void)
8139 rq->next_balance = jiffies; 8136 rq->next_balance = jiffies;
8140 rq->push_cpu = 0; 8137 rq->push_cpu = 0;
8141 rq->cpu = i; 8138 rq->cpu = i;
8139 rq->online = 0;
8142 rq->migration_thread = NULL; 8140 rq->migration_thread = NULL;
8143 INIT_LIST_HEAD(&rq->migration_queue); 8141 INIT_LIST_HEAD(&rq->migration_queue);
8144 rq_attach_root(rq, &def_root_domain); 8142 rq_attach_root(rq, &def_root_domain);
@@ -8154,7 +8152,7 @@ void __init sched_init(void)
8154#endif 8152#endif
8155 8153
8156#ifdef CONFIG_SMP 8154#ifdef CONFIG_SMP
8157 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); 8155 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8158#endif 8156#endif
8159 8157
8160#ifdef CONFIG_RT_MUTEXES 8158#ifdef CONFIG_RT_MUTEXES
@@ -8378,7 +8376,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8378{ 8376{
8379 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8377 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
8380} 8378}
8381#else 8379#else /* !CONFIG_FAIR_GROUP_SCHED */
8382static inline void free_fair_sched_group(struct task_group *tg) 8380static inline void free_fair_sched_group(struct task_group *tg)
8383{ 8381{
8384} 8382}
@@ -8396,7 +8394,7 @@ static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8396static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8394static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8397{ 8395{
8398} 8396}
8399#endif 8397#endif /* CONFIG_FAIR_GROUP_SCHED */
8400 8398
8401#ifdef CONFIG_RT_GROUP_SCHED 8399#ifdef CONFIG_RT_GROUP_SCHED
8402static void free_rt_sched_group(struct task_group *tg) 8400static void free_rt_sched_group(struct task_group *tg)
@@ -8467,7 +8465,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8467{ 8465{
8468 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); 8466 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8469} 8467}
8470#else 8468#else /* !CONFIG_RT_GROUP_SCHED */
8471static inline void free_rt_sched_group(struct task_group *tg) 8469static inline void free_rt_sched_group(struct task_group *tg)
8472{ 8470{
8473} 8471}
@@ -8485,7 +8483,7 @@ static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8485static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 8483static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8486{ 8484{
8487} 8485}
8488#endif 8486#endif /* CONFIG_RT_GROUP_SCHED */
8489 8487
8490#ifdef CONFIG_GROUP_SCHED 8488#ifdef CONFIG_GROUP_SCHED
8491static void free_sched_group(struct task_group *tg) 8489static void free_sched_group(struct task_group *tg)
@@ -8596,7 +8594,7 @@ void sched_move_task(struct task_struct *tsk)
8596 8594
8597 task_rq_unlock(rq, &flags); 8595 task_rq_unlock(rq, &flags);
8598} 8596}
8599#endif 8597#endif /* CONFIG_GROUP_SCHED */
8600 8598
8601#ifdef CONFIG_FAIR_GROUP_SCHED 8599#ifdef CONFIG_FAIR_GROUP_SCHED
8602static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8600static void __set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -8731,7 +8729,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8731 } 8729 }
8732 rcu_read_unlock(); 8730 rcu_read_unlock();
8733 8731
8734 return total + to_ratio(period, runtime) < 8732 return total + to_ratio(period, runtime) <=
8735 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), 8733 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
8736 parent->rt_bandwidth.rt_runtime); 8734 parent->rt_bandwidth.rt_runtime);
8737} 8735}
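A hedged worked example of what relaxing '<' to '<=' permits, treating to_ratio() simply as the runtime/period fraction (the exact fixed-point scaling does not affect the comparison):

/*
 * Illustrative arithmetic only, not part of the diff:
 *	parent:        rt_period = 1 s, rt_runtime = 0.95 s  -> ratio 0.95
 *	two children:  rt_period = 1 s, rt_runtime = 0.475 s -> ratio 0.475 each
 *
 * The children sum to exactly 0.95.  The old '<' test rejected this
 * exact fit; with '<=' a child set that consumes precisely the parent's
 * budget is accepted, while anything beyond it still fails.
 */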
@@ -8834,6 +8832,9 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8834 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 8832 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
8835 rt_runtime = tg->rt_bandwidth.rt_runtime; 8833 rt_runtime = tg->rt_bandwidth.rt_runtime;
8836 8834
8835 if (rt_period == 0)
8836 return -EINVAL;
8837
8837 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8838 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8838} 8839}
8839 8840
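sched_group_set_rt_period() now rejects a zero period before it can reach tg_set_bandwidth(). A hedged usage sketch through the cgroup cpu controller, where the mount point and group name are illustrative:

/*
 *	# echo 1000000 > /cgroup/rtgroup/cpu.rt_period_us	accepted (1 s period)
 *	# echo 0       > /cgroup/rtgroup/cpu.rt_period_us	now returns -EINVAL
 */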
@@ -8848,16 +8849,21 @@ long sched_group_rt_period(struct task_group *tg)
8848 8849
8849static int sched_rt_global_constraints(void) 8850static int sched_rt_global_constraints(void)
8850{ 8851{
8852 struct task_group *tg = &root_task_group;
8853 u64 rt_runtime, rt_period;
8851 int ret = 0; 8854 int ret = 0;
8852 8855
8856 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8857 rt_runtime = tg->rt_bandwidth.rt_runtime;
8858
8853 mutex_lock(&rt_constraints_mutex); 8859 mutex_lock(&rt_constraints_mutex);
8854 if (!__rt_schedulable(NULL, 1, 0)) 8860 if (!__rt_schedulable(tg, rt_period, rt_runtime))
8855 ret = -EINVAL; 8861 ret = -EINVAL;
8856 mutex_unlock(&rt_constraints_mutex); 8862 mutex_unlock(&rt_constraints_mutex);
8857 8863
8858 return ret; 8864 return ret;
8859} 8865}
8860#else 8866#else /* !CONFIG_RT_GROUP_SCHED */
8861static int sched_rt_global_constraints(void) 8867static int sched_rt_global_constraints(void)
8862{ 8868{
8863 unsigned long flags; 8869 unsigned long flags;
@@ -8875,7 +8881,7 @@ static int sched_rt_global_constraints(void)
8875 8881
8876 return 0; 8882 return 0;
8877} 8883}
8878#endif 8884#endif /* CONFIG_RT_GROUP_SCHED */
8879 8885
8880int sched_rt_handler(struct ctl_table *table, int write, 8886int sched_rt_handler(struct ctl_table *table, int write,
8881 struct file *filp, void __user *buffer, size_t *lenp, 8887 struct file *filp, void __user *buffer, size_t *lenp,
@@ -8983,7 +8989,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8983 8989
8984 return (u64) tg->shares; 8990 return (u64) tg->shares;
8985} 8991}
8986#endif 8992#endif /* CONFIG_FAIR_GROUP_SCHED */
8987 8993
8988#ifdef CONFIG_RT_GROUP_SCHED 8994#ifdef CONFIG_RT_GROUP_SCHED
8989static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 8995static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
@@ -9007,7 +9013,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
9007{ 9013{
9008 return sched_group_rt_period(cgroup_tg(cgrp)); 9014 return sched_group_rt_period(cgroup_tg(cgrp));
9009} 9015}
9010#endif 9016#endif /* CONFIG_RT_GROUP_SCHED */
9011 9017
9012static struct cftype cpu_files[] = { 9018static struct cftype cpu_files[] = {
9013#ifdef CONFIG_FAIR_GROUP_SCHED 9019#ifdef CONFIG_FAIR_GROUP_SCHED