about summary refs log tree commit diff stats
path: root/kernel/sched.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- kernel/sched.c 857
1 files changed, 598 insertions, 259 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 3aaa5c8cb421..99e6d850ecab 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -70,10 +70,13 @@
70#include <linux/bootmem.h> 70#include <linux/bootmem.h>
71#include <linux/debugfs.h> 71#include <linux/debugfs.h>
72#include <linux/ctype.h> 72#include <linux/ctype.h>
73#include <linux/ftrace.h>
73 74
74#include <asm/tlb.h> 75#include <asm/tlb.h>
75#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
76 77
78#include "sched_cpupri.h"
79
77/* 80/*
78 * Convert user-nice values [ -20 ... 0 ... 19 ] 81 * Convert user-nice values [ -20 ... 0 ... 19 ]
79 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 82 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -289,15 +292,15 @@ struct task_group root_task_group;
289static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 292static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
290/* Default task group's cfs_rq on each cpu */ 293/* Default task group's cfs_rq on each cpu */
291static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 294static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
292#endif 295#endif /* CONFIG_FAIR_GROUP_SCHED */
293 296
294#ifdef CONFIG_RT_GROUP_SCHED 297#ifdef CONFIG_RT_GROUP_SCHED
295static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 298static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
296static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 299static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
297#endif 300#endif /* CONFIG_RT_GROUP_SCHED */
298#else 301#else /* !CONFIG_FAIR_GROUP_SCHED */
299#define root_task_group init_task_group 302#define root_task_group init_task_group
300#endif 303#endif /* CONFIG_FAIR_GROUP_SCHED */
301 304
302/* task_group_lock serializes add/remove of task groups and also changes to 305/* task_group_lock serializes add/remove of task groups and also changes to
303 * a task group's cpu shares. 306 * a task group's cpu shares.
@@ -307,9 +310,9 @@ static DEFINE_SPINLOCK(task_group_lock);
307#ifdef CONFIG_FAIR_GROUP_SCHED 310#ifdef CONFIG_FAIR_GROUP_SCHED
308#ifdef CONFIG_USER_SCHED 311#ifdef CONFIG_USER_SCHED
309# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 312# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
310#else 313#else /* !CONFIG_USER_SCHED */
311# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 314# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
312#endif 315#endif /* CONFIG_USER_SCHED */
313 316
314/* 317/*
315 * A weight of 0 or 1 can cause arithmetics problems. 318 * A weight of 0 or 1 can cause arithmetics problems.
@@ -363,6 +366,10 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
363#else 366#else
364 367
365static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 368static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
/* Stub for builds without CONFIG_GROUP_SCHED: there is no task group to return. */
369static inline struct task_group *task_group(struct task_struct *p)
370{
371 return NULL;
372}
366 373
367#endif /* CONFIG_GROUP_SCHED */ 374#endif /* CONFIG_GROUP_SCHED */
368 375
@@ -373,6 +380,7 @@ struct cfs_rq {
373 380
374 u64 exec_clock; 381 u64 exec_clock;
375 u64 min_vruntime; 382 u64 min_vruntime;
383 u64 pair_start;
376 384
377 struct rb_root tasks_timeline; 385 struct rb_root tasks_timeline;
378 struct rb_node *rb_leftmost; 386 struct rb_node *rb_leftmost;
@@ -401,6 +409,31 @@ struct cfs_rq {
401 */ 409 */
402 struct list_head leaf_cfs_rq_list; 410 struct list_head leaf_cfs_rq_list;
403 struct task_group *tg; /* group that "owns" this runqueue */ 411 struct task_group *tg; /* group that "owns" this runqueue */
412
413#ifdef CONFIG_SMP
414 /*
415 * the part of load.weight contributed by tasks
416 */
417 unsigned long task_weight;
418
419 /*
420 * h_load = weight * f(tg)
421 *
422 * Where f(tg) is the recursive weight fraction assigned to
423 * this group.
424 */
425 unsigned long h_load;
426
427 /*
428 * this cpu's part of tg->shares
429 */
430 unsigned long shares;
431
432 /*
433 * load.weight at the time we set shares
434 */
435 unsigned long rq_weight;
436#endif
404#endif 437#endif
405}; 438};
406 439
@@ -452,6 +485,9 @@ struct root_domain {
452 */ 485 */
453 cpumask_t rto_mask; 486 cpumask_t rto_mask;
454 atomic_t rto_count; 487 atomic_t rto_count;
488#ifdef CONFIG_SMP
489 struct cpupri cpupri;
490#endif
455}; 491};
456 492
457/* 493/*
@@ -526,6 +562,9 @@ struct rq {
526 int push_cpu; 562 int push_cpu;
527 /* cpu of this runqueue: */ 563 /* cpu of this runqueue: */
528 int cpu; 564 int cpu;
565 int online;
566
567 unsigned long avg_load_per_task;
529 568
530 struct task_struct *migration_thread; 569 struct task_struct *migration_thread;
531 struct list_head migration_queue; 570 struct list_head migration_queue;
@@ -607,6 +646,24 @@ static inline void update_rq_clock(struct rq *rq)
607# define const_debug static const 646# define const_debug static const
608#endif 647#endif
609 648
649/**
650 * runqueue_is_locked - test whether the current cpu's runqueue lock is held
651 *
652 * Returns true if the current cpu runqueue is locked.
653 * This interface allows printk to be called with the runqueue lock
654 * held and know whether or not it is OK to wake up the klogd.
 *
 * The lock is only sampled, never taken, so the result describes the
 * moment of the check.
655 */
656int runqueue_is_locked(void)
657{
658 int cpu = get_cpu(); /* paired with the put_cpu() below */
659 struct rq *rq = cpu_rq(cpu);
660 int ret;
661
662 ret = spin_is_locked(&rq->lock); /* query the lock state without acquiring it */
663 put_cpu();
664 return ret;
665}
666
610/* 667/*
611 * Debugging: various feature bits 668 * Debugging: various feature bits
612 */ 669 */
@@ -749,6 +806,12 @@ late_initcall(sched_init_debug);
749const_debug unsigned int sysctl_sched_nr_migrate = 32; 806const_debug unsigned int sysctl_sched_nr_migrate = 32;
750 807
751/* 808/*
809 * ratelimit for updating the group shares.
810 * default: 0.5ms
811 */
812const_debug unsigned int sysctl_sched_shares_ratelimit = 500000;
813
814/*
752 * period over which we measure -rt task cpu usage in us. 815 * period over which we measure -rt task cpu usage in us.
753 * default: 1s 816 * default: 1s
754 */ 817 */
@@ -775,82 +838,6 @@ static inline u64 global_rt_runtime(void)
775 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 838 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
776} 839}
777 840
778unsigned long long time_sync_thresh = 100000;
779
780static DEFINE_PER_CPU(unsigned long long, time_offset);
781static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
782
783/*
784 * Global lock which we take every now and then to synchronize
785 * the CPUs time. This method is not warp-safe, but it's good
786 * enough to synchronize slowly diverging time sources and thus
787 * it's good enough for tracing:
788 */
789static DEFINE_SPINLOCK(time_sync_lock);
790static unsigned long long prev_global_time;
791
792static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
793{
794 /*
795 * We want this inlined, to not get tracer function calls
796 * in this critical section:
797 */
798 spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
799 __raw_spin_lock(&time_sync_lock.raw_lock);
800
801 if (time < prev_global_time) {
802 per_cpu(time_offset, cpu) += prev_global_time - time;
803 time = prev_global_time;
804 } else {
805 prev_global_time = time;
806 }
807
808 __raw_spin_unlock(&time_sync_lock.raw_lock);
809 spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
810
811 return time;
812}
813
814static unsigned long long __cpu_clock(int cpu)
815{
816 unsigned long long now;
817
818 /*
819 * Only call sched_clock() if the scheduler has already been
820 * initialized (some code might call cpu_clock() very early):
821 */
822 if (unlikely(!scheduler_running))
823 return 0;
824
825 now = sched_clock_cpu(cpu);
826
827 return now;
828}
829
830/*
831 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
832 * clock constructed from sched_clock():
833 */
834unsigned long long cpu_clock(int cpu)
835{
836 unsigned long long prev_cpu_time, time, delta_time;
837 unsigned long flags;
838
839 local_irq_save(flags);
840 prev_cpu_time = per_cpu(prev_cpu_time, cpu);
841 time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
842 delta_time = time-prev_cpu_time;
843
844 if (unlikely(delta_time > time_sync_thresh)) {
845 time = __sync_cpu_clock(time, cpu);
846 per_cpu(prev_cpu_time, cpu) = time;
847 }
848 local_irq_restore(flags);
849
850 return time;
851}
852EXPORT_SYMBOL_GPL(cpu_clock);
853
854#ifndef prepare_arch_switch 841#ifndef prepare_arch_switch
855# define prepare_arch_switch(next) do { } while (0) 842# define prepare_arch_switch(next) do { } while (0)
856#endif 843#endif
@@ -1313,15 +1300,15 @@ void wake_up_idle_cpu(int cpu)
1313 if (!tsk_is_polling(rq->idle)) 1300 if (!tsk_is_polling(rq->idle))
1314 smp_send_reschedule(cpu); 1301 smp_send_reschedule(cpu);
1315} 1302}
1316#endif 1303#endif /* CONFIG_NO_HZ */
1317 1304
1318#else 1305#else /* !CONFIG_SMP */
1319static void __resched_task(struct task_struct *p, int tif_bit) 1306static void __resched_task(struct task_struct *p, int tif_bit)
1320{ 1307{
1321 assert_spin_locked(&task_rq(p)->lock); 1308 assert_spin_locked(&task_rq(p)->lock);
1322 set_tsk_thread_flag(p, tif_bit); 1309 set_tsk_thread_flag(p, tif_bit);
1323} 1310}
1324#endif 1311#endif /* CONFIG_SMP */
1325 1312
1326#if BITS_PER_LONG == 32 1313#if BITS_PER_LONG == 32
1327# define WMULT_CONST (~0UL) 1314# define WMULT_CONST (~0UL)
@@ -1336,6 +1323,9 @@ static void __resched_task(struct task_struct *p, int tif_bit)
1336 */ 1323 */
1337#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 1324#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1338 1325
1326/*
1327 * delta *= weight / lw
1328 */
1339static unsigned long 1329static unsigned long
1340calc_delta_mine(unsigned long delta_exec, unsigned long weight, 1330calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1341 struct load_weight *lw) 1331 struct load_weight *lw)
@@ -1363,12 +1353,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1363 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 1353 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1364} 1354}
1365 1355
1366static inline unsigned long
1367calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
1368{
1369 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
1370}
1371
1372static inline void update_load_add(struct load_weight *lw, unsigned long inc) 1356static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1373{ 1357{
1374 lw->weight += inc; 1358 lw->weight += inc;
@@ -1479,17 +1463,211 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1479#ifdef CONFIG_SMP 1463#ifdef CONFIG_SMP
1480static unsigned long source_load(int cpu, int type); 1464static unsigned long source_load(int cpu, int type);
1481static unsigned long target_load(int cpu, int type); 1465static unsigned long target_load(int cpu, int type);
1482static unsigned long cpu_avg_load_per_task(int cpu);
1483static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1466static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1484#else /* CONFIG_SMP */ 1467
/*
 * Return the average load per task on @cpu's runqueue.
 *
 * The result is cached in rq->avg_load_per_task and is only recomputed while
 * the runqueue has runnable tasks; when nr_running is zero the previously
 * stored value is returned unchanged.
 */
1468static unsigned long cpu_avg_load_per_task(int cpu)
1469{
1470 struct rq *rq = cpu_rq(cpu);
1471
1472 if (rq->nr_running) /* also keeps the division below away from zero */
1473 rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1474
1475 return rq->avg_load_per_task;
1476}
1485 1477
1486#ifdef CONFIG_FAIR_GROUP_SCHED 1478#ifdef CONFIG_FAIR_GROUP_SCHED
1487static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) 1479
/*
 * Callback type used by walk_tg_tree(): it receives the task group being
 * visited plus the cpu and sched_domain that were handed to the walk.
 */
1480typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
1481
1482/*
1483 * Iterate the full tree, calling @down when first entering a node and @up when
1484 * leaving it for the final time.
 *
 * The walk starts at root_task_group and uses no recursion: descending into
 * a child jumps back to the "down" label, and returning from a finished
 * subtree jumps to the "up" label inside the loop body so that the sibling
 * iteration resumes where it left off. @cpu and @sd are passed through to
 * both callbacks unmodified. The whole walk runs under rcu_read_lock().
1485 */
1486static void
1487walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
1488{
1489 struct task_group *parent, *child;
1490
1491 rcu_read_lock();
1492 parent = &root_task_group;
1493down:
1494 (*down)(parent, cpu, sd);
1495 list_for_each_entry_rcu(child, &parent->children, siblings) {
1496 parent = child; /* descend: the child becomes the current node */
1497 goto down;
1498
1499up:
1500 continue; /* re-entered from below: advance to the next sibling */
1501 }
1502 (*up)(parent, cpu, sd); /* every child of this node has been visited */
1503
1504 child = parent; /* restore the loop cursor to the node just finished */
1505 parent = parent->parent;
1506 if (parent) /* a NULL parent means we are back above the start node */
1507 goto up;
1508 rcu_read_unlock();
1509}
1510
1511static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1512
1513/*
1514 * Calculate and set the cpu's group shares.
 *
 * @tg:           task group whose per-cpu entity is updated
 * @cpu:          cpu whose portion of the group's shares is computed
 * @sd_shares:    shares to be distributed over the domain
 * @sd_rq_weight: the group's cfs_rq load weight summed over the domain
 *
 * Does nothing if the group has no sched entity on @cpu. tg_shares_up()
 * calls this with the rq->lock of @cpu held.
1515 */
1516static void
1517__update_group_shares_cpu(struct task_group *tg, int cpu,
1518 unsigned long sd_shares, unsigned long sd_rq_weight)
1519{
1520 int boost = 0;
1521 unsigned long shares;
1522 unsigned long rq_weight;
1523
1524 if (!tg->se[cpu])
1525 return;
1526
1527 rq_weight = tg->cfs_rq[cpu]->load.weight;
1528
1529 /*
1530 * If there are currently no tasks on the cpu pretend there is one of
1531 * average load so that when a new task gets to run here it will not
1532 * get delayed by group starvation.
1533 */
1534 if (!rq_weight) {
1535 boost = 1;
1536 rq_weight = NICE_0_LOAD;
1537 }
1538
1539 if (unlikely(rq_weight > sd_rq_weight)) /* never weigh more than the domain total */
1540 rq_weight = sd_rq_weight;
1541
1542 /*
1543 *          \Sum shares * rq_weight
1544 * shares = -----------------------
1545 *               \Sum rq_weight
1546 *
 * NOTE(review): the "+ 1" in the divisor presumably guards against a
 * zero sd_rq_weight - confirm.
1547 */
1548 shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
1549
1550 /*
1551 * record the actual number of shares, not the boosted amount.
1552 */
1553 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1554 tg->cfs_rq[cpu]->rq_weight = rq_weight;
1555
 /* clamp what is applied to the entity into [MIN_SHARES, MAX_SHARES] */
1556 if (shares < MIN_SHARES)
1557 shares = MIN_SHARES;
1558 else if (shares > MAX_SHARES)
1559 shares = MAX_SHARES;
1560
1561 __set_se_shares(tg->se[cpu], shares);
1562}
1563
1564/*
1565 * Re-compute a task group's per-cpu shares over the given domain.
1566 * This needs to be done in a bottom-up fashion because the rq weight of a
1567 * parent group depends on the shares of its child groups.
 *
 * Matches the tg_visitor signature; @cpu is not used, all cpus in
 * sd->span are processed.
1568 */
1569static void
1570tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1571{
1572 unsigned long rq_weight = 0;
1573 unsigned long shares = 0;
1574 int i;
1575
 /* sum the group's queue weight and recorded shares across the domain */
1576 for_each_cpu_mask(i, sd->span) {
1577 rq_weight += tg->cfs_rq[i]->load.weight;
1578 shares += tg->cfs_rq[i]->shares;
1579 }
1580
 /* weight but no recorded shares, or more than the group owns: use tg->shares */
1581 if ((!shares && rq_weight) || shares > tg->shares)
1582 shares = tg->shares;
1583
 /* no parent domain, or one that does not load-balance: hand out all of tg->shares */
1584 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1585 shares = tg->shares;
1586
 /* nothing queued anywhere: assume NICE_0_LOAD on every cpu of the span */
1587 if (!rq_weight)
1588 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
1589
1590 for_each_cpu_mask(i, sd->span) {
1591 struct rq *rq = cpu_rq(i);
1592 unsigned long flags;
1593
1594 spin_lock_irqsave(&rq->lock, flags); /* each cpu is updated under its own rq lock */
1595 __update_group_shares_cpu(tg, i, shares, rq_weight);
1596 spin_unlock_irqrestore(&rq->lock, flags);
1597 }
1598}
1599
1600/*
1601 * Compute the cpu's hierarchical load factor for each task group.
1602 * This needs to be done in a top-down fashion because the load of a child
1603 * group is a fraction of its parent's load.
 *
 * Matches the tg_visitor signature; @sd is not used. The result is stored
 * in tg->cfs_rq[cpu]->h_load.
1604 */
1605static void
1606tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
1607{
1608 unsigned long load;
1609
1610 if (!tg->parent) {
1611 load = cpu_rq(cpu)->load.weight; /* group without a parent: start from the rq's total weight */
1612 } else {
 /* parent's h_load scaled by this group's shares over the parent's queue weight */
1613 load = tg->parent->cfs_rq[cpu]->h_load;
1614 load *= tg->cfs_rq[cpu]->shares;
1615 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; /* NOTE(review): +1 presumably avoids a zero divisor - confirm */
1616 }
1617
1618 tg->cfs_rq[cpu]->h_load = load;
1619}
1620
/* Visitor that does nothing, for tree walks that need only one of the two callbacks. */
1621static void
1622tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
1488{ 1623{
1489} 1624}
1625
/*
 * Re-compute the group shares of every task group over @sd, rate-limited:
 * the tree is only walked when at least sysctl_sched_shares_ratelimit of
 * cpu_clock() time has passed since sd->last_update.
 */
1626static void update_shares(struct sched_domain *sd)
1627{
1628 u64 now = cpu_clock(raw_smp_processor_id());
1629 s64 elapsed = now - sd->last_update;
1630
1631 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1632 sd->last_update = now;
1633 walk_tg_tree(tg_nop, tg_shares_up, 0, sd); /* cpu argument is a dummy: tg_shares_up() ignores it */
1634 }
1635}
1636
/*
 * Variant of update_shares() for callers that hold @rq->lock: the lock is
 * released around the update and re-acquired afterwards. tg_shares_up()
 * takes the rq lock of every cpu in the domain, so the update is not run
 * with @rq->lock still held.
 * NOTE(review): runqueue state may change while the lock is dropped;
 * callers presumably tolerate that - confirm.
 */
1637static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1638{
1639 spin_unlock(&rq->lock);
1640 update_shares(sd);
1641 spin_lock(&rq->lock);
1642}
1643
/*
 * Refresh h_load for every task group on @cpu by walking the group tree
 * top-down with tg_load_down(); no sched_domain is involved, hence NULL.
 */
1644static void update_h_load(int cpu)
1645{
1646 walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
1647}
1648
1649#else
1650
/* No-op stub used when CONFIG_FAIR_GROUP_SCHED is not set. */
1651static inline void update_shares(struct sched_domain *sd)
1652{
1653}
1654
/* No-op stub used when CONFIG_FAIR_GROUP_SCHED is not set; rq->lock is left untouched. */
1655static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1656{
1657}
1658
1490#endif 1659#endif
1491 1660
1492#endif /* CONFIG_SMP */ 1661#endif
1662
1663#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Record @shares in @cfs_rq. The shares field is only present on
 * CONFIG_SMP builds, so the function body is empty otherwise.
 */
1664static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1665{
1666#ifdef CONFIG_SMP
1667 cfs_rq->shares = shares;
1668#endif
1669}
1670#endif
1493 1671
1494#include "sched_stats.h" 1672#include "sched_stats.h"
1495#include "sched_idletask.c" 1673#include "sched_idletask.c"
@@ -1500,27 +1678,17 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1500#endif 1678#endif
1501 1679
1502#define sched_class_highest (&rt_sched_class) 1680#define sched_class_highest (&rt_sched_class)
/* Iterate @class over the scheduling classes, from sched_class_highest along ->next. */
1681#define for_each_class(class) \
1682 for (class = sched_class_highest; class; class = class->next)
1503 1683
1504static inline void inc_load(struct rq *rq, const struct task_struct *p) 1684static void inc_nr_running(struct rq *rq)
1505{
1506 update_load_add(&rq->load, p->se.load.weight);
1507}
1508
1509static inline void dec_load(struct rq *rq, const struct task_struct *p)
1510{
1511 update_load_sub(&rq->load, p->se.load.weight);
1512}
1513
1514static void inc_nr_running(struct task_struct *p, struct rq *rq)
1515{ 1685{
1516 rq->nr_running++; 1686 rq->nr_running++;
1517 inc_load(rq, p);
1518} 1687}
1519 1688
1520static void dec_nr_running(struct task_struct *p, struct rq *rq) 1689static void dec_nr_running(struct rq *rq)
1521{ 1690{
1522 rq->nr_running--; 1691 rq->nr_running--;
1523 dec_load(rq, p);
1524} 1692}
1525 1693
1526static void set_load_weight(struct task_struct *p) 1694static void set_load_weight(struct task_struct *p)
@@ -1544,6 +1712,12 @@ static void set_load_weight(struct task_struct *p)
1544 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1712 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1545} 1713}
1546 1714
/*
 * Fold @sample into the running average stored at @avg: the average moves
 * one eighth of the distance towards the sample (diff >> 3).
 */
1715static void update_avg(u64 *avg, u64 sample)
1716{
1717 s64 diff = sample - *avg; /* signed, so the average can move down as well as up */
1718 *avg += diff >> 3;
1719}
1720
1547static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1721static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1548{ 1722{
1549 sched_info_queued(p); 1723 sched_info_queued(p);
@@ -1553,6 +1727,13 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1553 1727
1554static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1728static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1555{ 1729{
1730 if (sleep && p->se.last_wakeup) {
1731 update_avg(&p->se.avg_overlap,
1732 p->se.sum_exec_runtime - p->se.last_wakeup);
1733 p->se.last_wakeup = 0;
1734 }
1735
1736 sched_info_dequeued(p);
1556 p->sched_class->dequeue_task(rq, p, sleep); 1737 p->sched_class->dequeue_task(rq, p, sleep);
1557 p->se.on_rq = 0; 1738 p->se.on_rq = 0;
1558} 1739}
@@ -1612,7 +1793,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1612 rq->nr_uninterruptible--; 1793 rq->nr_uninterruptible--;
1613 1794
1614 enqueue_task(rq, p, wakeup); 1795 enqueue_task(rq, p, wakeup);
1615 inc_nr_running(p, rq); 1796 inc_nr_running(rq);
1616} 1797}
1617 1798
1618/* 1799/*
@@ -1624,7 +1805,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1624 rq->nr_uninterruptible++; 1805 rq->nr_uninterruptible++;
1625 1806
1626 dequeue_task(rq, p, sleep); 1807 dequeue_task(rq, p, sleep);
1627 dec_nr_running(p, rq); 1808 dec_nr_running(rq);
1628} 1809}
1629 1810
1630/** 1811/**
@@ -1636,12 +1817,6 @@ inline int task_curr(const struct task_struct *p)
1636 return cpu_curr(task_cpu(p)) == p; 1817 return cpu_curr(task_cpu(p)) == p;
1637} 1818}
1638 1819
1639/* Used instead of source_load when we know the type == 0 */
1640unsigned long weighted_cpuload(const int cpu)
1641{
1642 return cpu_rq(cpu)->load.weight;
1643}
1644
1645static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1820static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1646{ 1821{
1647 set_task_rq(p, cpu); 1822 set_task_rq(p, cpu);
@@ -1670,6 +1845,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1670 1845
1671#ifdef CONFIG_SMP 1846#ifdef CONFIG_SMP
1672 1847
1848/* Used instead of source_load when we know the type == 0 */
1849static unsigned long weighted_cpuload(const int cpu)
1850{
1851 return cpu_rq(cpu)->load.weight;
1852}
1853
1673/* 1854/*
1674 * Is this task likely cache-hot: 1855 * Is this task likely cache-hot:
1675 */ 1856 */
@@ -1880,7 +2061,7 @@ static unsigned long source_load(int cpu, int type)
1880 struct rq *rq = cpu_rq(cpu); 2061 struct rq *rq = cpu_rq(cpu);
1881 unsigned long total = weighted_cpuload(cpu); 2062 unsigned long total = weighted_cpuload(cpu);
1882 2063
1883 if (type == 0) 2064 if (type == 0 || !sched_feat(LB_BIAS))
1884 return total; 2065 return total;
1885 2066
1886 return min(rq->cpu_load[type-1], total); 2067 return min(rq->cpu_load[type-1], total);
@@ -1895,25 +2076,13 @@ static unsigned long target_load(int cpu, int type)
1895 struct rq *rq = cpu_rq(cpu); 2076 struct rq *rq = cpu_rq(cpu);
1896 unsigned long total = weighted_cpuload(cpu); 2077 unsigned long total = weighted_cpuload(cpu);
1897 2078
1898 if (type == 0) 2079 if (type == 0 || !sched_feat(LB_BIAS))
1899 return total; 2080 return total;
1900 2081
1901 return max(rq->cpu_load[type-1], total); 2082 return max(rq->cpu_load[type-1], total);
1902} 2083}
1903 2084
1904/* 2085/*
1905 * Return the average load per task on the cpu's run queue
1906 */
1907static unsigned long cpu_avg_load_per_task(int cpu)
1908{
1909 struct rq *rq = cpu_rq(cpu);
1910 unsigned long total = weighted_cpuload(cpu);
1911 unsigned long n = rq->nr_running;
1912
1913 return n ? total / n : SCHED_LOAD_SCALE;
1914}
1915
1916/*
1917 * find_idlest_group finds and returns the least busy CPU group within the 2086 * find_idlest_group finds and returns the least busy CPU group within the
1918 * domain. 2087 * domain.
1919 */ 2088 */
@@ -2019,6 +2188,9 @@ static int sched_balance_self(int cpu, int flag)
2019 sd = tmp; 2188 sd = tmp;
2020 } 2189 }
2021 2190
2191 if (sd)
2192 update_shares(sd);
2193
2022 while (sd) { 2194 while (sd) {
2023 cpumask_t span, tmpmask; 2195 cpumask_t span, tmpmask;
2024 struct sched_group *group; 2196 struct sched_group *group;
@@ -2085,6 +2257,22 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2085 if (!sched_feat(SYNC_WAKEUPS)) 2257 if (!sched_feat(SYNC_WAKEUPS))
2086 sync = 0; 2258 sync = 0;
2087 2259
2260#ifdef CONFIG_SMP
2261 if (sched_feat(LB_WAKEUP_UPDATE)) {
2262 struct sched_domain *sd;
2263
2264 this_cpu = raw_smp_processor_id();
2265 cpu = task_cpu(p);
2266
2267 for_each_domain(this_cpu, sd) {
2268 if (cpu_isset(cpu, sd->span)) {
2269 update_shares(sd);
2270 break;
2271 }
2272 }
2273 }
2274#endif
2275
2088 smp_wmb(); 2276 smp_wmb();
2089 rq = task_rq_lock(p, &flags); 2277 rq = task_rq_lock(p, &flags);
2090 old_state = p->state; 2278 old_state = p->state;
@@ -2131,7 +2319,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2131 } 2319 }
2132 } 2320 }
2133 } 2321 }
2134#endif 2322#endif /* CONFIG_SCHEDSTATS */
2135 2323
2136out_activate: 2324out_activate:
2137#endif /* CONFIG_SMP */ 2325#endif /* CONFIG_SMP */
@@ -2149,6 +2337,9 @@ out_activate:
2149 success = 1; 2337 success = 1;
2150 2338
2151out_running: 2339out_running:
2340 trace_mark(kernel_sched_wakeup,
2341 "pid %d state %ld ## rq %p task %p rq->curr %p",
2342 p->pid, p->state, rq, p, rq->curr);
2152 check_preempt_curr(rq, p); 2343 check_preempt_curr(rq, p);
2153 2344
2154 p->state = TASK_RUNNING; 2345 p->state = TASK_RUNNING;
@@ -2157,6 +2348,8 @@ out_running:
2157 p->sched_class->task_wake_up(rq, p); 2348 p->sched_class->task_wake_up(rq, p);
2158#endif 2349#endif
2159out: 2350out:
2351 current->se.last_wakeup = current->se.sum_exec_runtime;
2352
2160 task_rq_unlock(rq, &flags); 2353 task_rq_unlock(rq, &flags);
2161 2354
2162 return success; 2355 return success;
@@ -2277,8 +2470,11 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2277 * management (if any): 2470 * management (if any):
2278 */ 2471 */
2279 p->sched_class->task_new(rq, p); 2472 p->sched_class->task_new(rq, p);
2280 inc_nr_running(p, rq); 2473 inc_nr_running(rq);
2281 } 2474 }
2475 trace_mark(kernel_sched_wakeup_new,
2476 "pid %d state %ld ## rq %p task %p rq->curr %p",
2477 p->pid, p->state, rq, p, rq->curr);
2282 check_preempt_curr(rq, p); 2478 check_preempt_curr(rq, p);
2283#ifdef CONFIG_SMP 2479#ifdef CONFIG_SMP
2284 if (p->sched_class->task_wake_up) 2480 if (p->sched_class->task_wake_up)
@@ -2331,7 +2527,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
2331 notifier->ops->sched_out(notifier, next); 2527 notifier->ops->sched_out(notifier, next);
2332} 2528}
2333 2529
2334#else 2530#else /* !CONFIG_PREEMPT_NOTIFIERS */
2335 2531
2336static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2532static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2337{ 2533{
@@ -2343,7 +2539,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
2343{ 2539{
2344} 2540}
2345 2541
2346#endif 2542#endif /* CONFIG_PREEMPT_NOTIFIERS */
2347 2543
2348/** 2544/**
2349 * prepare_task_switch - prepare to switch tasks 2545 * prepare_task_switch - prepare to switch tasks
@@ -2451,6 +2647,11 @@ context_switch(struct rq *rq, struct task_struct *prev,
2451 struct mm_struct *mm, *oldmm; 2647 struct mm_struct *mm, *oldmm;
2452 2648
2453 prepare_task_switch(rq, prev, next); 2649 prepare_task_switch(rq, prev, next);
2650 trace_mark(kernel_sched_schedule,
2651 "prev_pid %d next_pid %d prev_state %ld "
2652 "## rq %p prev %p next %p",
2653 prev->pid, next->pid, prev->state,
2654 rq, prev, next);
2454 mm = next->mm; 2655 mm = next->mm;
2455 oldmm = prev->active_mm; 2656 oldmm = prev->active_mm;
2456 /* 2657 /*
@@ -2785,7 +2986,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2785 enum cpu_idle_type idle, int *all_pinned, 2986 enum cpu_idle_type idle, int *all_pinned,
2786 int *this_best_prio, struct rq_iterator *iterator) 2987 int *this_best_prio, struct rq_iterator *iterator)
2787{ 2988{
2788 int loops = 0, pulled = 0, pinned = 0, skip_for_load; 2989 int loops = 0, pulled = 0, pinned = 0;
2789 struct task_struct *p; 2990 struct task_struct *p;
2790 long rem_load_move = max_load_move; 2991 long rem_load_move = max_load_move;
2791 2992
@@ -2801,14 +3002,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2801next: 3002next:
2802 if (!p || loops++ > sysctl_sched_nr_migrate) 3003 if (!p || loops++ > sysctl_sched_nr_migrate)
2803 goto out; 3004 goto out;
2804 /* 3005
2805 * To help distribute high priority tasks across CPUs we don't 3006 if ((p->se.load.weight >> 1) > rem_load_move ||
2806 * skip a task if it will be the highest priority task (i.e. smallest
2807 * prio value) on its new queue regardless of its load weight
2808 */
2809 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2810 SCHED_LOAD_SCALE_FUZZ;
2811 if ((skip_for_load && p->prio >= *this_best_prio) ||
2812 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { 3007 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2813 p = iterator->next(iterator->arg); 3008 p = iterator->next(iterator->arg);
2814 goto next; 3009 goto next;
@@ -2863,6 +3058,10 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2863 max_load_move - total_load_moved, 3058 max_load_move - total_load_moved,
2864 sd, idle, all_pinned, &this_best_prio); 3059 sd, idle, all_pinned, &this_best_prio);
2865 class = class->next; 3060 class = class->next;
3061
3062 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3063 break;
3064
2866 } while (class && max_load_move > total_load_moved); 3065 } while (class && max_load_move > total_load_moved);
2867 3066
2868 return total_load_moved > 0; 3067 return total_load_moved > 0;
@@ -2939,6 +3138,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2939 max_load = this_load = total_load = total_pwr = 0; 3138 max_load = this_load = total_load = total_pwr = 0;
2940 busiest_load_per_task = busiest_nr_running = 0; 3139 busiest_load_per_task = busiest_nr_running = 0;
2941 this_load_per_task = this_nr_running = 0; 3140 this_load_per_task = this_nr_running = 0;
3141
2942 if (idle == CPU_NOT_IDLE) 3142 if (idle == CPU_NOT_IDLE)
2943 load_idx = sd->busy_idx; 3143 load_idx = sd->busy_idx;
2944 else if (idle == CPU_NEWLY_IDLE) 3144 else if (idle == CPU_NEWLY_IDLE)
@@ -2953,6 +3153,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2953 int __group_imb = 0; 3153 int __group_imb = 0;
2954 unsigned int balance_cpu = -1, first_idle_cpu = 0; 3154 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2955 unsigned long sum_nr_running, sum_weighted_load; 3155 unsigned long sum_nr_running, sum_weighted_load;
3156 unsigned long sum_avg_load_per_task;
3157 unsigned long avg_load_per_task;
2956 3158
2957 local_group = cpu_isset(this_cpu, group->cpumask); 3159 local_group = cpu_isset(this_cpu, group->cpumask);
2958 3160
@@ -2961,6 +3163,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2961 3163
2962 /* Tally up the load of all CPUs in the group */ 3164 /* Tally up the load of all CPUs in the group */
2963 sum_weighted_load = sum_nr_running = avg_load = 0; 3165 sum_weighted_load = sum_nr_running = avg_load = 0;
3166 sum_avg_load_per_task = avg_load_per_task = 0;
3167
2964 max_cpu_load = 0; 3168 max_cpu_load = 0;
2965 min_cpu_load = ~0UL; 3169 min_cpu_load = ~0UL;
2966 3170
@@ -2994,6 +3198,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2994 avg_load += load; 3198 avg_load += load;
2995 sum_nr_running += rq->nr_running; 3199 sum_nr_running += rq->nr_running;
2996 sum_weighted_load += weighted_cpuload(i); 3200 sum_weighted_load += weighted_cpuload(i);
3201
3202 sum_avg_load_per_task += cpu_avg_load_per_task(i);
2997 } 3203 }
2998 3204
2999 /* 3205 /*
@@ -3015,7 +3221,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3015 avg_load = sg_div_cpu_power(group, 3221 avg_load = sg_div_cpu_power(group,
3016 avg_load * SCHED_LOAD_SCALE); 3222 avg_load * SCHED_LOAD_SCALE);
3017 3223
3018 if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE) 3224
3225 /*
3226 * Consider the group unbalanced when the imbalance is larger
3227 * than the average weight of two tasks.
3228 *
3229 * APZ: with cgroup the avg task weight can vary wildly and
3230 * might not be a suitable number - should we keep a
3231 * normalized nr_running number somewhere that negates
3232 * the hierarchy?
3233 */
3234 avg_load_per_task = sg_div_cpu_power(group,
3235 sum_avg_load_per_task * SCHED_LOAD_SCALE);
3236
3237 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3019 __group_imb = 1; 3238 __group_imb = 1;
3020 3239
3021 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 3240 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
@@ -3156,9 +3375,9 @@ small_imbalance:
3156 if (busiest_load_per_task > this_load_per_task) 3375 if (busiest_load_per_task > this_load_per_task)
3157 imbn = 1; 3376 imbn = 1;
3158 } else 3377 } else
3159 this_load_per_task = SCHED_LOAD_SCALE; 3378 this_load_per_task = cpu_avg_load_per_task(this_cpu);
3160 3379
3161 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= 3380 if (max_load - this_load + 2*busiest_load_per_task >=
3162 busiest_load_per_task * imbn) { 3381 busiest_load_per_task * imbn) {
3163 *imbalance = busiest_load_per_task; 3382 *imbalance = busiest_load_per_task;
3164 return busiest; 3383 return busiest;
@@ -3284,6 +3503,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3284 schedstat_inc(sd, lb_count[idle]); 3503 schedstat_inc(sd, lb_count[idle]);
3285 3504
3286redo: 3505redo:
3506 update_shares(sd);
3287 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3507 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3288 cpus, balance); 3508 cpus, balance);
3289 3509
@@ -3386,8 +3606,9 @@ redo:
3386 3606
3387 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3607 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3388 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3608 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3389 return -1; 3609 ld_moved = -1;
3390 return ld_moved; 3610
3611 goto out;
3391 3612
3392out_balanced: 3613out_balanced:
3393 schedstat_inc(sd, lb_balanced[idle]); 3614 schedstat_inc(sd, lb_balanced[idle]);
@@ -3402,8 +3623,13 @@ out_one_pinned:
3402 3623
3403 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3624 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3404 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3625 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3405 return -1; 3626 ld_moved = -1;
3406 return 0; 3627 else
3628 ld_moved = 0;
3629out:
3630 if (ld_moved)
3631 update_shares(sd);
3632 return ld_moved;
3407} 3633}
3408 3634
3409/* 3635/*
@@ -3438,6 +3664,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3438 3664
3439 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); 3665 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
3440redo: 3666redo:
3667 update_shares_locked(this_rq, sd);
3441 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, 3668 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
3442 &sd_idle, cpus, NULL); 3669 &sd_idle, cpus, NULL);
3443 if (!group) { 3670 if (!group) {
@@ -3481,6 +3708,7 @@ redo:
3481 } else 3708 } else
3482 sd->nr_balance_failed = 0; 3709 sd->nr_balance_failed = 0;
3483 3710
3711 update_shares_locked(this_rq, sd);
3484 return ld_moved; 3712 return ld_moved;
3485 3713
3486out_balanced: 3714out_balanced:
@@ -3672,6 +3900,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3672 /* Earliest time when we have to do rebalance again */ 3900 /* Earliest time when we have to do rebalance again */
3673 unsigned long next_balance = jiffies + 60*HZ; 3901 unsigned long next_balance = jiffies + 60*HZ;
3674 int update_next_balance = 0; 3902 int update_next_balance = 0;
3903 int need_serialize;
3675 cpumask_t tmp; 3904 cpumask_t tmp;
3676 3905
3677 for_each_domain(cpu, sd) { 3906 for_each_domain(cpu, sd) {
@@ -3689,8 +3918,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3689 if (interval > HZ*NR_CPUS/10) 3918 if (interval > HZ*NR_CPUS/10)
3690 interval = HZ*NR_CPUS/10; 3919 interval = HZ*NR_CPUS/10;
3691 3920
3921 need_serialize = sd->flags & SD_SERIALIZE;
3692 3922
3693 if (sd->flags & SD_SERIALIZE) { 3923 if (need_serialize) {
3694 if (!spin_trylock(&balancing)) 3924 if (!spin_trylock(&balancing))
3695 goto out; 3925 goto out;
3696 } 3926 }
@@ -3706,7 +3936,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3706 } 3936 }
3707 sd->last_balance = jiffies; 3937 sd->last_balance = jiffies;
3708 } 3938 }
3709 if (sd->flags & SD_SERIALIZE) 3939 if (need_serialize)
3710 spin_unlock(&balancing); 3940 spin_unlock(&balancing);
3711out: 3941out:
3712 if (time_after(next_balance, sd->last_balance + interval)) { 3942 if (time_after(next_balance, sd->last_balance + interval)) {
@@ -4021,26 +4251,44 @@ void scheduler_tick(void)
4021#endif 4251#endif
4022} 4252}
4023 4253
4024#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) 4254#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4255 defined(CONFIG_PREEMPT_TRACER))
4256
/*
 * get_parent_ip - pick the parent address reported to the preempt tracer.
 * @addr: candidate address; add_preempt_count() passes CALLER_ADDR1.
 *
 * If @addr lies inside the lock functions (as reported by
 * in_lock_functions()), CALLER_ADDR2 is used instead, and if that is
 * also inside them, CALLER_ADDR3. Otherwise @addr is returned unchanged.
 */
4257static inline unsigned long get_parent_ip(unsigned long addr)
4258{
4259 if (in_lock_functions(addr)) {
4260 addr = CALLER_ADDR2;
4261 if (in_lock_functions(addr))
4262 addr = CALLER_ADDR3;
4263 }
4264 return addr;
4265}
4025 4266
4026void __kprobes add_preempt_count(int val) 4267void __kprobes add_preempt_count(int val)
4027{ 4268{
4269#ifdef CONFIG_DEBUG_PREEMPT
4028 /* 4270 /*
4029 * Underflow? 4271 * Underflow?
4030 */ 4272 */
4031 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 4273 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4032 return; 4274 return;
4275#endif
4033 preempt_count() += val; 4276 preempt_count() += val;
4277#ifdef CONFIG_DEBUG_PREEMPT
4034 /* 4278 /*
4035 * Spinlock count overflowing soon? 4279 * Spinlock count overflowing soon?
4036 */ 4280 */
4037 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 4281 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4038 PREEMPT_MASK - 10); 4282 PREEMPT_MASK - 10);
4283#endif
4284 if (preempt_count() == val)
4285 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4039} 4286}
4040EXPORT_SYMBOL(add_preempt_count); 4287EXPORT_SYMBOL(add_preempt_count);
4041 4288
4042void __kprobes sub_preempt_count(int val) 4289void __kprobes sub_preempt_count(int val)
4043{ 4290{
4291#ifdef CONFIG_DEBUG_PREEMPT
4044 /* 4292 /*
4045 * Underflow? 4293 * Underflow?
4046 */ 4294 */
@@ -4052,7 +4300,10 @@ void __kprobes sub_preempt_count(int val)
4052 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 4300 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4053 !(preempt_count() & PREEMPT_MASK))) 4301 !(preempt_count() & PREEMPT_MASK)))
4054 return; 4302 return;
4303#endif
4055 4304
4305 if (preempt_count() == val)
4306 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4056 preempt_count() -= val; 4307 preempt_count() -= val;
4057} 4308}
4058EXPORT_SYMBOL(sub_preempt_count); 4309EXPORT_SYMBOL(sub_preempt_count);
@@ -4070,6 +4321,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
4070 prev->comm, prev->pid, preempt_count()); 4321 prev->comm, prev->pid, preempt_count());
4071 4322
4072 debug_show_held_locks(prev); 4323 debug_show_held_locks(prev);
4324 print_modules();
4073 if (irqs_disabled()) 4325 if (irqs_disabled())
4074 print_irqtrace_events(prev); 4326 print_irqtrace_events(prev);
4075 4327
@@ -4143,7 +4395,7 @@ asmlinkage void __sched schedule(void)
4143 struct task_struct *prev, *next; 4395 struct task_struct *prev, *next;
4144 unsigned long *switch_count; 4396 unsigned long *switch_count;
4145 struct rq *rq; 4397 struct rq *rq;
4146 int cpu; 4398 int cpu, hrtick = sched_feat(HRTICK);
4147 4399
4148need_resched: 4400need_resched:
4149 preempt_disable(); 4401 preempt_disable();
@@ -4158,7 +4410,8 @@ need_resched_nonpreemptible:
4158 4410
4159 schedule_debug(prev); 4411 schedule_debug(prev);
4160 4412
4161 hrtick_clear(rq); 4413 if (hrtick)
4414 hrtick_clear(rq);
4162 4415
4163 /* 4416 /*
4164 * Do the rq-clock update outside the rq lock: 4417 * Do the rq-clock update outside the rq lock:
@@ -4204,7 +4457,8 @@ need_resched_nonpreemptible:
4204 } else 4457 } else
4205 spin_unlock_irq(&rq->lock); 4458 spin_unlock_irq(&rq->lock);
4206 4459
4207 hrtick_set(rq); 4460 if (hrtick)
4461 hrtick_set(rq);
4208 4462
4209 if (unlikely(reacquire_kernel_lock(current) < 0)) 4463 if (unlikely(reacquire_kernel_lock(current) < 0))
4210 goto need_resched_nonpreemptible; 4464 goto need_resched_nonpreemptible;
@@ -4586,10 +4840,8 @@ void set_user_nice(struct task_struct *p, long nice)
4586 goto out_unlock; 4840 goto out_unlock;
4587 } 4841 }
4588 on_rq = p->se.on_rq; 4842 on_rq = p->se.on_rq;
4589 if (on_rq) { 4843 if (on_rq)
4590 dequeue_task(rq, p, 0); 4844 dequeue_task(rq, p, 0);
4591 dec_load(rq, p);
4592 }
4593 4845
4594 p->static_prio = NICE_TO_PRIO(nice); 4846 p->static_prio = NICE_TO_PRIO(nice);
4595 set_load_weight(p); 4847 set_load_weight(p);
@@ -4599,7 +4851,6 @@ void set_user_nice(struct task_struct *p, long nice)
4599 4851
4600 if (on_rq) { 4852 if (on_rq) {
4601 enqueue_task(rq, p, 0); 4853 enqueue_task(rq, p, 0);
4602 inc_load(rq, p);
4603 /* 4854 /*
4604 * If the task increased its priority or is running and 4855 * If the task increased its priority or is running and
4605 * lowered its priority, then reschedule its CPU: 4856 * lowered its priority, then reschedule its CPU:
@@ -4744,16 +4995,8 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4744 set_load_weight(p); 4995 set_load_weight(p);
4745} 4996}
4746 4997
4747/** 4998static int __sched_setscheduler(struct task_struct *p, int policy,
4748 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 4999 struct sched_param *param, bool user)
4749 * @p: the task in question.
4750 * @policy: new policy.
4751 * @param: structure containing the new RT priority.
4752 *
4753 * NOTE that the task may be already dead.
4754 */
4755int sched_setscheduler(struct task_struct *p, int policy,
4756 struct sched_param *param)
4757{ 5000{
4758 int retval, oldprio, oldpolicy = -1, on_rq, running; 5001 int retval, oldprio, oldpolicy = -1, on_rq, running;
4759 unsigned long flags; 5002 unsigned long flags;
@@ -4785,7 +5028,7 @@ recheck:
4785 /* 5028 /*
4786 * Allow unprivileged RT tasks to decrease priority: 5029 * Allow unprivileged RT tasks to decrease priority:
4787 */ 5030 */
4788 if (!capable(CAP_SYS_NICE)) { 5031 if (user && !capable(CAP_SYS_NICE)) {
4789 if (rt_policy(policy)) { 5032 if (rt_policy(policy)) {
4790 unsigned long rlim_rtprio; 5033 unsigned long rlim_rtprio;
4791 5034
@@ -4821,7 +5064,8 @@ recheck:
4821 * Do not allow realtime tasks into groups that have no runtime 5064 * Do not allow realtime tasks into groups that have no runtime
4822 * assigned. 5065 * assigned.
4823 */ 5066 */
4824 if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) 5067 if (user
5068 && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
4825 return -EPERM; 5069 return -EPERM;
4826#endif 5070#endif
4827 5071
@@ -4870,8 +5114,39 @@ recheck:
4870 5114
4871 return 0; 5115 return 0;
4872} 5116}
5117
5118/**
5119 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
5120 * @p: the task in question.
5121 * @policy: new policy.
5122 * @param: structure containing the new RT priority.
5123 *
5124 * NOTE that the task may be already dead.
 *
 * Thin wrapper around __sched_setscheduler() with its 'user' argument set
 * to true, so the checks guarded by 'user' there (the CAP_SYS_NICE check
 * and the RT-group zero-runtime check) stay in effect.
 *
 * Returns whatever __sched_setscheduler() returns.
5125 */
5126int sched_setscheduler(struct task_struct *p, int policy,
5127 struct sched_param *param)
5128{
5129 return __sched_setscheduler(p, policy, param, true);
5130}
4873EXPORT_SYMBOL_GPL(sched_setscheduler); 5131EXPORT_SYMBOL_GPL(sched_setscheduler);
4874 5132
5133/**
5134 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
5135 * @p: the task in question.
5136 * @policy: new policy.
5137 * @param: structure containing the new RT priority.
5138 *
5139 * Just like sched_setscheduler, only don't bother checking if the
5140 * current context has permission. For example, this is needed in
5141 * stop_machine(): we create temporary high priority worker threads,
5142 * but our caller might not have that capability.
 *
 * Implemented by calling __sched_setscheduler() with its 'user' argument
 * set to false, which skips the checks guarded by 'user' there (the
 * CAP_SYS_NICE check and the RT-group zero-runtime check).
 *
 * Returns whatever __sched_setscheduler() returns.
5143 */
5144int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5145 struct sched_param *param)
5146{
5147 return __sched_setscheduler(p, policy, param, false);
5148}
5149
4875static int 5150static int
4876do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 5151do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4877{ 5152{
@@ -5070,24 +5345,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
5070 return sched_setaffinity(pid, &new_mask); 5345 return sched_setaffinity(pid, &new_mask);
5071} 5346}
5072 5347
5073/*
5074 * Represents all cpu's present in the system
5075 * In systems capable of hotplug, this map could dynamically grow
5076 * as new cpu's are detected in the system via any platform specific
5077 * method, such as ACPI for e.g.
5078 */
5079
5080cpumask_t cpu_present_map __read_mostly;
5081EXPORT_SYMBOL(cpu_present_map);
5082
5083#ifndef CONFIG_SMP
5084cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
5085EXPORT_SYMBOL(cpu_online_map);
5086
5087cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
5088EXPORT_SYMBOL(cpu_possible_map);
5089#endif
5090
5091long sched_getaffinity(pid_t pid, cpumask_t *mask) 5348long sched_getaffinity(pid_t pid, cpumask_t *mask)
5092{ 5349{
5093 struct task_struct *p; 5350 struct task_struct *p;
@@ -5384,7 +5641,7 @@ out_unlock:
5384 return retval; 5641 return retval;
5385} 5642}
5386 5643
5387static const char stat_nam[] = "RSDTtZX"; 5644static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5388 5645
5389void sched_show_task(struct task_struct *p) 5646void sched_show_task(struct task_struct *p)
5390{ 5647{
@@ -5571,6 +5828,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5571 goto out; 5828 goto out;
5572 } 5829 }
5573 5830
5831 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5832 !cpus_equal(p->cpus_allowed, *new_mask))) {
5833 ret = -EINVAL;
5834 goto out;
5835 }
5836
5574 if (p->sched_class->set_cpus_allowed) 5837 if (p->sched_class->set_cpus_allowed)
5575 p->sched_class->set_cpus_allowed(p, new_mask); 5838 p->sched_class->set_cpus_allowed(p, new_mask);
5576 else { 5839 else {
@@ -5622,10 +5885,10 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5622 double_rq_lock(rq_src, rq_dest); 5885 double_rq_lock(rq_src, rq_dest);
5623 /* Already moved. */ 5886 /* Already moved. */
5624 if (task_cpu(p) != src_cpu) 5887 if (task_cpu(p) != src_cpu)
5625 goto out; 5888 goto done;
5626 /* Affinity changed (again). */ 5889 /* Affinity changed (again). */
5627 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 5890 if (!cpu_isset(dest_cpu, p->cpus_allowed))
5628 goto out; 5891 goto fail;
5629 5892
5630 on_rq = p->se.on_rq; 5893 on_rq = p->se.on_rq;
5631 if (on_rq) 5894 if (on_rq)
@@ -5636,8 +5899,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5636 activate_task(rq_dest, p, 0); 5899 activate_task(rq_dest, p, 0);
5637 check_preempt_curr(rq_dest, p); 5900 check_preempt_curr(rq_dest, p);
5638 } 5901 }
5902done:
5639 ret = 1; 5903 ret = 1;
5640out: 5904fail:
5641 double_rq_unlock(rq_src, rq_dest); 5905 double_rq_unlock(rq_src, rq_dest);
5642 return ret; 5906 return ret;
5643} 5907}
@@ -5887,6 +6151,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
5887 next = pick_next_task(rq, rq->curr); 6151 next = pick_next_task(rq, rq->curr);
5888 if (!next) 6152 if (!next)
5889 break; 6153 break;
6154 next->sched_class->put_prev_task(rq, next);
5890 migrate_dead(dead_cpu, next); 6155 migrate_dead(dead_cpu, next);
5891 6156
5892 } 6157 }
@@ -6058,6 +6323,36 @@ static void unregister_sched_domain_sysctl(void)
6058} 6323}
6059#endif 6324#endif
6060 6325
/*
 * set_rq_online - mark a runqueue online in its root domain.
 * @rq: the runqueue to bring online.
 *
 * Does nothing if rq->online is already set. Otherwise sets this CPU's
 * bit in rq->rd->online, sets rq->online, and then calls the rq_online()
 * hook of every scheduling class that provides one.
 *
 * rq->rd is dereferenced without a NULL check here.
 * NOTE(review): the callers visible in this file invoke this with
 * rq->lock held - confirm that this is a requirement.
 */
6326static void set_rq_online(struct rq *rq)
6327{
6328 if (!rq->online) {
6329 const struct sched_class *class;
6330
6331 cpu_set(rq->cpu, rq->rd->online);
6332 rq->online = 1;
6333
6334 for_each_class(class) {
6335 if (class->rq_online)
6336 class->rq_online(rq);
6337 }
6338 }
6339}
6340
/*
 * set_rq_offline - mark a runqueue offline in its root domain.
 * @rq: the runqueue to take offline.
 *
 * Does nothing if rq->online is not set. Otherwise calls the rq_offline()
 * hook of every scheduling class that provides one, and only afterwards
 * clears this CPU's bit in rq->rd->online and resets rq->online. This is
 * the reverse of the order used by set_rq_online().
 *
 * rq->rd is dereferenced without a NULL check here.
 * NOTE(review): the callers visible in this file invoke this with
 * rq->lock held - confirm that this is a requirement.
 */
6341static void set_rq_offline(struct rq *rq)
6342{
6343 if (rq->online) {
6344 const struct sched_class *class;
6345
6346 for_each_class(class) {
6347 if (class->rq_offline)
6348 class->rq_offline(rq);
6349 }
6350
6351 cpu_clear(rq->cpu, rq->rd->online);
6352 rq->online = 0;
6353 }
6354}
6355
6061/* 6356/*
6062 * migration_call - callback that gets triggered when a CPU is added. 6357 * migration_call - callback that gets triggered when a CPU is added.
6063 * Here we can start up the necessary migration thread for the new CPU. 6358 * Here we can start up the necessary migration thread for the new CPU.
@@ -6095,7 +6390,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6095 spin_lock_irqsave(&rq->lock, flags); 6390 spin_lock_irqsave(&rq->lock, flags);
6096 if (rq->rd) { 6391 if (rq->rd) {
6097 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6392 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6098 cpu_set(cpu, rq->rd->online); 6393
6394 set_rq_online(rq);
6099 } 6395 }
6100 spin_unlock_irqrestore(&rq->lock, flags); 6396 spin_unlock_irqrestore(&rq->lock, flags);
6101 break; 6397 break;
@@ -6156,7 +6452,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6156 spin_lock_irqsave(&rq->lock, flags); 6452 spin_lock_irqsave(&rq->lock, flags);
6157 if (rq->rd) { 6453 if (rq->rd) {
6158 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6454 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6159 cpu_clear(cpu, rq->rd->online); 6455 set_rq_offline(rq);
6160 } 6456 }
6161 spin_unlock_irqrestore(&rq->lock, flags); 6457 spin_unlock_irqrestore(&rq->lock, flags);
6162 break; 6458 break;
@@ -6190,6 +6486,28 @@ void __init migration_init(void)
6190 6486
6191#ifdef CONFIG_SCHED_DEBUG 6487#ifdef CONFIG_SCHED_DEBUG
6192 6488
/*
 * sd_level_to_string - printable name for a sched-domain level.
 * @lvl: the level to name.
 *
 * Returns a string literal for each enumerator handled below. Any value
 * not matched by a case falls out of the switch and is reported as "MAX".
 * Used by sched_domain_debug_one() when printing a domain's span.
 */
6489static inline const char *sd_level_to_string(enum sched_domain_level lvl)
6490{
6491 switch (lvl) {
6492 case SD_LV_NONE:
6493 return "NONE";
6494 case SD_LV_SIBLING:
6495 return "SIBLING";
6496 case SD_LV_MC:
6497 return "MC";
6498 case SD_LV_CPU:
6499 return "CPU";
6500 case SD_LV_NODE:
6501 return "NODE";
6502 case SD_LV_ALLNODES:
6503 return "ALLNODES";
6504 case SD_LV_MAX:
6505 return "MAX";
6506
6507 }
/* Fallback for values not handled by the switch above. */
6508 return "MAX";
6509}
6510
6193static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6511static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6194 cpumask_t *groupmask) 6512 cpumask_t *groupmask)
6195{ 6513{
@@ -6209,7 +6527,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6209 return -1; 6527 return -1;
6210 } 6528 }
6211 6529
6212 printk(KERN_CONT "span %s\n", str); 6530 printk(KERN_CONT "span %s level %s\n",
6531 str, sd_level_to_string(sd->level));
6213 6532
6214 if (!cpu_isset(cpu, sd->span)) { 6533 if (!cpu_isset(cpu, sd->span)) {
6215 printk(KERN_ERR "ERROR: domain->span does not contain " 6534 printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6293,9 +6612,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6293 } 6612 }
6294 kfree(groupmask); 6613 kfree(groupmask);
6295} 6614}
6296#else 6615#else /* !CONFIG_SCHED_DEBUG */
6297# define sched_domain_debug(sd, cpu) do { } while (0) 6616# define sched_domain_debug(sd, cpu) do { } while (0)
6298#endif 6617#endif /* CONFIG_SCHED_DEBUG */
6299 6618
6300static int sd_degenerate(struct sched_domain *sd) 6619static int sd_degenerate(struct sched_domain *sd)
6301{ 6620{
@@ -6355,20 +6674,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6355static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6674static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6356{ 6675{
6357 unsigned long flags; 6676 unsigned long flags;
6358 const struct sched_class *class;
6359 6677
6360 spin_lock_irqsave(&rq->lock, flags); 6678 spin_lock_irqsave(&rq->lock, flags);
6361 6679
6362 if (rq->rd) { 6680 if (rq->rd) {
6363 struct root_domain *old_rd = rq->rd; 6681 struct root_domain *old_rd = rq->rd;
6364 6682
6365 for (class = sched_class_highest; class; class = class->next) { 6683 if (cpu_isset(rq->cpu, old_rd->online))
6366 if (class->leave_domain) 6684 set_rq_offline(rq);
6367 class->leave_domain(rq);
6368 }
6369 6685
6370 cpu_clear(rq->cpu, old_rd->span); 6686 cpu_clear(rq->cpu, old_rd->span);
6371 cpu_clear(rq->cpu, old_rd->online);
6372 6687
6373 if (atomic_dec_and_test(&old_rd->refcount)) 6688 if (atomic_dec_and_test(&old_rd->refcount))
6374 kfree(old_rd); 6689 kfree(old_rd);
@@ -6379,12 +6694,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6379 6694
6380 cpu_set(rq->cpu, rd->span); 6695 cpu_set(rq->cpu, rd->span);
6381 if (cpu_isset(rq->cpu, cpu_online_map)) 6696 if (cpu_isset(rq->cpu, cpu_online_map))
6382 cpu_set(rq->cpu, rd->online); 6697 set_rq_online(rq);
6383
6384 for (class = sched_class_highest; class; class = class->next) {
6385 if (class->join_domain)
6386 class->join_domain(rq);
6387 }
6388 6698
6389 spin_unlock_irqrestore(&rq->lock, flags); 6699 spin_unlock_irqrestore(&rq->lock, flags);
6390} 6700}
@@ -6395,6 +6705,8 @@ static void init_rootdomain(struct root_domain *rd)
6395 6705
6396 cpus_clear(rd->span); 6706 cpus_clear(rd->span);
6397 cpus_clear(rd->online); 6707 cpus_clear(rd->online);
6708
6709 cpupri_init(&rd->cpupri);
6398} 6710}
6399 6711
6400static void init_defrootdomain(void) 6712static void init_defrootdomain(void)
@@ -6537,9 +6849,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
6537 6849
6538 min_val = INT_MAX; 6850 min_val = INT_MAX;
6539 6851
6540 for (i = 0; i < MAX_NUMNODES; i++) { 6852 for (i = 0; i < nr_node_ids; i++) {
6541 /* Start at @node */ 6853 /* Start at @node */
6542 n = (node + i) % MAX_NUMNODES; 6854 n = (node + i) % nr_node_ids;
6543 6855
6544 if (!nr_cpus_node(n)) 6856 if (!nr_cpus_node(n))
6545 continue; 6857 continue;
@@ -6589,7 +6901,7 @@ static void sched_domain_node_span(int node, cpumask_t *span)
6589 cpus_or(*span, *span, *nodemask); 6901 cpus_or(*span, *span, *nodemask);
6590 } 6902 }
6591} 6903}
6592#endif 6904#endif /* CONFIG_NUMA */
6593 6905
6594int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6906int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6595 6907
@@ -6608,7 +6920,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6608 *sg = &per_cpu(sched_group_cpus, cpu); 6920 *sg = &per_cpu(sched_group_cpus, cpu);
6609 return cpu; 6921 return cpu;
6610} 6922}
6611#endif 6923#endif /* CONFIG_SCHED_SMT */
6612 6924
6613/* 6925/*
6614 * multi-core sched-domains: 6926 * multi-core sched-domains:
@@ -6616,7 +6928,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6616#ifdef CONFIG_SCHED_MC 6928#ifdef CONFIG_SCHED_MC
6617static DEFINE_PER_CPU(struct sched_domain, core_domains); 6929static DEFINE_PER_CPU(struct sched_domain, core_domains);
6618static DEFINE_PER_CPU(struct sched_group, sched_group_core); 6930static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6619#endif 6931#endif /* CONFIG_SCHED_MC */
6620 6932
6621#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6933#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6622static int 6934static int
@@ -6718,7 +7030,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
6718 sg = sg->next; 7030 sg = sg->next;
6719 } while (sg != group_head); 7031 } while (sg != group_head);
6720} 7032}
6721#endif 7033#endif /* CONFIG_NUMA */
6722 7034
6723#ifdef CONFIG_NUMA 7035#ifdef CONFIG_NUMA
6724/* Free memory allocated for various sched_group structures */ 7036/* Free memory allocated for various sched_group structures */
@@ -6733,7 +7045,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
6733 if (!sched_group_nodes) 7045 if (!sched_group_nodes)
6734 continue; 7046 continue;
6735 7047
6736 for (i = 0; i < MAX_NUMNODES; i++) { 7048 for (i = 0; i < nr_node_ids; i++) {
6737 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7049 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6738 7050
6739 *nodemask = node_to_cpumask(i); 7051 *nodemask = node_to_cpumask(i);
@@ -6755,11 +7067,11 @@ next_sg:
6755 sched_group_nodes_bycpu[cpu] = NULL; 7067 sched_group_nodes_bycpu[cpu] = NULL;
6756 } 7068 }
6757} 7069}
6758#else 7070#else /* !CONFIG_NUMA */
6759static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 7071static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
6760{ 7072{
6761} 7073}
6762#endif 7074#endif /* CONFIG_NUMA */
6763 7075
6764/* 7076/*
6765 * Initialize sched groups cpu_power. 7077 * Initialize sched groups cpu_power.
@@ -6926,7 +7238,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
6926 /* 7238 /*
6927 * Allocate the per-node list of sched groups 7239 * Allocate the per-node list of sched groups
6928 */ 7240 */
6929 sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), 7241 sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
6930 GFP_KERNEL); 7242 GFP_KERNEL);
6931 if (!sched_group_nodes) { 7243 if (!sched_group_nodes) {
6932 printk(KERN_WARNING "Can not alloc sched group node list\n"); 7244 printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -7065,7 +7377,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7065#endif 7377#endif
7066 7378
7067 /* Set up physical groups */ 7379 /* Set up physical groups */
7068 for (i = 0; i < MAX_NUMNODES; i++) { 7380 for (i = 0; i < nr_node_ids; i++) {
7069 SCHED_CPUMASK_VAR(nodemask, allmasks); 7381 SCHED_CPUMASK_VAR(nodemask, allmasks);
7070 SCHED_CPUMASK_VAR(send_covered, allmasks); 7382 SCHED_CPUMASK_VAR(send_covered, allmasks);
7071 7383
@@ -7089,7 +7401,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7089 send_covered, tmpmask); 7401 send_covered, tmpmask);
7090 } 7402 }
7091 7403
7092 for (i = 0; i < MAX_NUMNODES; i++) { 7404 for (i = 0; i < nr_node_ids; i++) {
7093 /* Set up node groups */ 7405 /* Set up node groups */
7094 struct sched_group *sg, *prev; 7406 struct sched_group *sg, *prev;
7095 SCHED_CPUMASK_VAR(nodemask, allmasks); 7407 SCHED_CPUMASK_VAR(nodemask, allmasks);
@@ -7128,9 +7440,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7128 cpus_or(*covered, *covered, *nodemask); 7440 cpus_or(*covered, *covered, *nodemask);
7129 prev = sg; 7441 prev = sg;
7130 7442
7131 for (j = 0; j < MAX_NUMNODES; j++) { 7443 for (j = 0; j < nr_node_ids; j++) {
7132 SCHED_CPUMASK_VAR(notcovered, allmasks); 7444 SCHED_CPUMASK_VAR(notcovered, allmasks);
7133 int n = (i + j) % MAX_NUMNODES; 7445 int n = (i + j) % nr_node_ids;
7134 node_to_cpumask_ptr(pnodemask, n); 7446 node_to_cpumask_ptr(pnodemask, n);
7135 7447
7136 cpus_complement(*notcovered, *covered); 7448 cpus_complement(*notcovered, *covered);
@@ -7183,7 +7495,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7183 } 7495 }
7184 7496
7185#ifdef CONFIG_NUMA 7497#ifdef CONFIG_NUMA
7186 for (i = 0; i < MAX_NUMNODES; i++) 7498 for (i = 0; i < nr_node_ids; i++)
7187 init_numa_sched_groups_power(sched_group_nodes[i]); 7499 init_numa_sched_groups_power(sched_group_nodes[i]);
7188 7500
7189 if (sd_allnodes) { 7501 if (sd_allnodes) {
@@ -7468,7 +7780,7 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7468#endif 7780#endif
7469 return err; 7781 return err;
7470} 7782}
7471#endif 7783#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7472 7784
7473/* 7785/*
7474 * Force a reinitialization of the sched domains hierarchy. The domains 7786 * Force a reinitialization of the sched domains hierarchy. The domains
@@ -7479,21 +7791,28 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7479static int update_sched_domains(struct notifier_block *nfb, 7791static int update_sched_domains(struct notifier_block *nfb,
7480 unsigned long action, void *hcpu) 7792 unsigned long action, void *hcpu)
7481{ 7793{
7794 int cpu = (int)(long)hcpu;
7795
7482 switch (action) { 7796 switch (action) {
7483 case CPU_UP_PREPARE:
7484 case CPU_UP_PREPARE_FROZEN:
7485 case CPU_DOWN_PREPARE: 7797 case CPU_DOWN_PREPARE:
7486 case CPU_DOWN_PREPARE_FROZEN: 7798 case CPU_DOWN_PREPARE_FROZEN:
7799 disable_runtime(cpu_rq(cpu));
7800 /* fall-through */
7801 case CPU_UP_PREPARE:
7802 case CPU_UP_PREPARE_FROZEN:
7487 detach_destroy_domains(&cpu_online_map); 7803 detach_destroy_domains(&cpu_online_map);
7488 free_sched_domains(); 7804 free_sched_domains();
7489 return NOTIFY_OK; 7805 return NOTIFY_OK;
7490 7806
7491 case CPU_UP_CANCELED: 7807
7492 case CPU_UP_CANCELED_FROZEN:
7493 case CPU_DOWN_FAILED: 7808 case CPU_DOWN_FAILED:
7494 case CPU_DOWN_FAILED_FROZEN: 7809 case CPU_DOWN_FAILED_FROZEN:
7495 case CPU_ONLINE: 7810 case CPU_ONLINE:
7496 case CPU_ONLINE_FROZEN: 7811 case CPU_ONLINE_FROZEN:
7812 enable_runtime(cpu_rq(cpu));
7813 /* fall-through */
7814 case CPU_UP_CANCELED:
7815 case CPU_UP_CANCELED_FROZEN:
7497 case CPU_DEAD: 7816 case CPU_DEAD:
7498 case CPU_DEAD_FROZEN: 7817 case CPU_DEAD_FROZEN:
7499 /* 7818 /*
@@ -7693,8 +8012,8 @@ void __init sched_init(void)
7693 8012
7694 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 8013 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7695 ptr += nr_cpu_ids * sizeof(void **); 8014 ptr += nr_cpu_ids * sizeof(void **);
7696#endif 8015#endif /* CONFIG_USER_SCHED */
7697#endif 8016#endif /* CONFIG_FAIR_GROUP_SCHED */
7698#ifdef CONFIG_RT_GROUP_SCHED 8017#ifdef CONFIG_RT_GROUP_SCHED
7699 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 8018 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
7700 ptr += nr_cpu_ids * sizeof(void **); 8019 ptr += nr_cpu_ids * sizeof(void **);
@@ -7708,8 +8027,8 @@ void __init sched_init(void)
7708 8027
7709 root_task_group.rt_rq = (struct rt_rq **)ptr; 8028 root_task_group.rt_rq = (struct rt_rq **)ptr;
7710 ptr += nr_cpu_ids * sizeof(void **); 8029 ptr += nr_cpu_ids * sizeof(void **);
7711#endif 8030#endif /* CONFIG_USER_SCHED */
7712#endif 8031#endif /* CONFIG_RT_GROUP_SCHED */
7713 } 8032 }
7714 8033
7715#ifdef CONFIG_SMP 8034#ifdef CONFIG_SMP
@@ -7725,8 +8044,8 @@ void __init sched_init(void)
7725#ifdef CONFIG_USER_SCHED 8044#ifdef CONFIG_USER_SCHED
7726 init_rt_bandwidth(&root_task_group.rt_bandwidth, 8045 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7727 global_rt_period(), RUNTIME_INF); 8046 global_rt_period(), RUNTIME_INF);
7728#endif 8047#endif /* CONFIG_USER_SCHED */
7729#endif 8048#endif /* CONFIG_RT_GROUP_SCHED */
7730 8049
7731#ifdef CONFIG_GROUP_SCHED 8050#ifdef CONFIG_GROUP_SCHED
7732 list_add(&init_task_group.list, &task_groups); 8051 list_add(&init_task_group.list, &task_groups);
@@ -7736,8 +8055,8 @@ void __init sched_init(void)
7736 INIT_LIST_HEAD(&root_task_group.children); 8055 INIT_LIST_HEAD(&root_task_group.children);
7737 init_task_group.parent = &root_task_group; 8056 init_task_group.parent = &root_task_group;
7738 list_add(&init_task_group.siblings, &root_task_group.children); 8057 list_add(&init_task_group.siblings, &root_task_group.children);
7739#endif 8058#endif /* CONFIG_USER_SCHED */
7740#endif 8059#endif /* CONFIG_GROUP_SCHED */
7741 8060
7742 for_each_possible_cpu(i) { 8061 for_each_possible_cpu(i) {
7743 struct rq *rq; 8062 struct rq *rq;
@@ -7817,6 +8136,7 @@ void __init sched_init(void)
7817 rq->next_balance = jiffies; 8136 rq->next_balance = jiffies;
7818 rq->push_cpu = 0; 8137 rq->push_cpu = 0;
7819 rq->cpu = i; 8138 rq->cpu = i;
8139 rq->online = 0;
7820 rq->migration_thread = NULL; 8140 rq->migration_thread = NULL;
7821 INIT_LIST_HEAD(&rq->migration_queue); 8141 INIT_LIST_HEAD(&rq->migration_queue);
7822 rq_attach_root(rq, &def_root_domain); 8142 rq_attach_root(rq, &def_root_domain);
@@ -7832,7 +8152,7 @@ void __init sched_init(void)
7832#endif 8152#endif
7833 8153
7834#ifdef CONFIG_SMP 8154#ifdef CONFIG_SMP
7835 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); 8155 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
7836#endif 8156#endif
7837 8157
7838#ifdef CONFIG_RT_MUTEXES 8158#ifdef CONFIG_RT_MUTEXES
@@ -8056,7 +8376,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8056{ 8376{
8057 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8377 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
8058} 8378}
8059#else 8379#else /* !CONFIG_FAIR_GROUP_SCHED */
8060static inline void free_fair_sched_group(struct task_group *tg) 8380static inline void free_fair_sched_group(struct task_group *tg)
8061{ 8381{
8062} 8382}
@@ -8074,7 +8394,7 @@ static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8074static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8394static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8075{ 8395{
8076} 8396}
8077#endif 8397#endif /* CONFIG_FAIR_GROUP_SCHED */
8078 8398
8079#ifdef CONFIG_RT_GROUP_SCHED 8399#ifdef CONFIG_RT_GROUP_SCHED
8080static void free_rt_sched_group(struct task_group *tg) 8400static void free_rt_sched_group(struct task_group *tg)
@@ -8145,7 +8465,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8145{ 8465{
8146 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); 8466 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8147} 8467}
8148#else 8468#else /* !CONFIG_RT_GROUP_SCHED */
8149static inline void free_rt_sched_group(struct task_group *tg) 8469static inline void free_rt_sched_group(struct task_group *tg)
8150{ 8470{
8151} 8471}
@@ -8163,7 +8483,7 @@ static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8163static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 8483static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8164{ 8484{
8165} 8485}
8166#endif 8486#endif /* CONFIG_RT_GROUP_SCHED */
8167 8487
8168#ifdef CONFIG_GROUP_SCHED 8488#ifdef CONFIG_GROUP_SCHED
8169static void free_sched_group(struct task_group *tg) 8489static void free_sched_group(struct task_group *tg)
@@ -8274,17 +8594,14 @@ void sched_move_task(struct task_struct *tsk)
8274 8594
8275 task_rq_unlock(rq, &flags); 8595 task_rq_unlock(rq, &flags);
8276} 8596}
8277#endif 8597#endif /* CONFIG_GROUP_SCHED */
8278 8598
8279#ifdef CONFIG_FAIR_GROUP_SCHED 8599#ifdef CONFIG_FAIR_GROUP_SCHED
8280static void set_se_shares(struct sched_entity *se, unsigned long shares) 8600static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8281{ 8601{
8282 struct cfs_rq *cfs_rq = se->cfs_rq; 8602 struct cfs_rq *cfs_rq = se->cfs_rq;
8283 struct rq *rq = cfs_rq->rq;
8284 int on_rq; 8603 int on_rq;
8285 8604
8286 spin_lock_irq(&rq->lock);
8287
8288 on_rq = se->on_rq; 8605 on_rq = se->on_rq;
8289 if (on_rq) 8606 if (on_rq)
8290 dequeue_entity(cfs_rq, se, 0); 8607 dequeue_entity(cfs_rq, se, 0);
@@ -8294,8 +8611,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
8294 8611
8295 if (on_rq) 8612 if (on_rq)
8296 enqueue_entity(cfs_rq, se, 0); 8613 enqueue_entity(cfs_rq, se, 0);
8614}
8297 8615
8298 spin_unlock_irq(&rq->lock); 8616static void set_se_shares(struct sched_entity *se, unsigned long shares)
8617{
8618 struct cfs_rq *cfs_rq = se->cfs_rq;
8619 struct rq *rq = cfs_rq->rq;
8620 unsigned long flags;
8621
8622 spin_lock_irqsave(&rq->lock, flags);
8623 __set_se_shares(se, shares);
8624 spin_unlock_irqrestore(&rq->lock, flags);
8299} 8625}
8300 8626
8301static DEFINE_MUTEX(shares_mutex); 8627static DEFINE_MUTEX(shares_mutex);
@@ -8334,8 +8660,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8334 * w/o tripping rebalance_share or load_balance_fair. 8660 * w/o tripping rebalance_share or load_balance_fair.
8335 */ 8661 */
8336 tg->shares = shares; 8662 tg->shares = shares;
8337 for_each_possible_cpu(i) 8663 for_each_possible_cpu(i) {
8664 /*
8665 * force a rebalance
8666 */
8667 cfs_rq_set_shares(tg->cfs_rq[i], 0);
8338 set_se_shares(tg->se[i], shares); 8668 set_se_shares(tg->se[i], shares);
8669 }
8339 8670
8340 /* 8671 /*
8341 * Enable load balance activity on this group, by inserting it back on 8672 * Enable load balance activity on this group, by inserting it back on
@@ -8374,7 +8705,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
8374#ifdef CONFIG_CGROUP_SCHED 8705#ifdef CONFIG_CGROUP_SCHED
8375static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8706static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8376{ 8707{
8377 struct task_group *tgi, *parent = tg ? tg->parent : NULL; 8708 struct task_group *tgi, *parent = tg->parent;
8378 unsigned long total = 0; 8709 unsigned long total = 0;
8379 8710
8380 if (!parent) { 8711 if (!parent) {
@@ -8398,7 +8729,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8398 } 8729 }
8399 rcu_read_unlock(); 8730 rcu_read_unlock();
8400 8731
8401 return total + to_ratio(period, runtime) < 8732 return total + to_ratio(period, runtime) <=
8402 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), 8733 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
8403 parent->rt_bandwidth.rt_runtime); 8734 parent->rt_bandwidth.rt_runtime);
8404} 8735}
@@ -8501,6 +8832,9 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8501 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 8832 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
8502 rt_runtime = tg->rt_bandwidth.rt_runtime; 8833 rt_runtime = tg->rt_bandwidth.rt_runtime;
8503 8834
8835 if (rt_period == 0)
8836 return -EINVAL;
8837
8504 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8838 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8505} 8839}
8506 8840
@@ -8515,16 +8849,21 @@ long sched_group_rt_period(struct task_group *tg)
8515 8849
8516static int sched_rt_global_constraints(void) 8850static int sched_rt_global_constraints(void)
8517{ 8851{
8852 struct task_group *tg = &root_task_group;
8853 u64 rt_runtime, rt_period;
8518 int ret = 0; 8854 int ret = 0;
8519 8855
8856 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8857 rt_runtime = tg->rt_bandwidth.rt_runtime;
8858
8520 mutex_lock(&rt_constraints_mutex); 8859 mutex_lock(&rt_constraints_mutex);
8521 if (!__rt_schedulable(NULL, 1, 0)) 8860 if (!__rt_schedulable(tg, rt_period, rt_runtime))
8522 ret = -EINVAL; 8861 ret = -EINVAL;
8523 mutex_unlock(&rt_constraints_mutex); 8862 mutex_unlock(&rt_constraints_mutex);
8524 8863
8525 return ret; 8864 return ret;
8526} 8865}
8527#else 8866#else /* !CONFIG_RT_GROUP_SCHED */
8528static int sched_rt_global_constraints(void) 8867static int sched_rt_global_constraints(void)
8529{ 8868{
8530 unsigned long flags; 8869 unsigned long flags;
@@ -8542,7 +8881,7 @@ static int sched_rt_global_constraints(void)
8542 8881
8543 return 0; 8882 return 0;
8544} 8883}
8545#endif 8884#endif /* CONFIG_RT_GROUP_SCHED */
8546 8885
8547int sched_rt_handler(struct ctl_table *table, int write, 8886int sched_rt_handler(struct ctl_table *table, int write,
8548 struct file *filp, void __user *buffer, size_t *lenp, 8887 struct file *filp, void __user *buffer, size_t *lenp,
@@ -8650,7 +8989,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8650 8989
8651 return (u64) tg->shares; 8990 return (u64) tg->shares;
8652} 8991}
8653#endif 8992#endif /* CONFIG_FAIR_GROUP_SCHED */
8654 8993
8655#ifdef CONFIG_RT_GROUP_SCHED 8994#ifdef CONFIG_RT_GROUP_SCHED
8656static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 8995static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
@@ -8674,7 +9013,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
8674{ 9013{
8675 return sched_group_rt_period(cgroup_tg(cgrp)); 9014 return sched_group_rt_period(cgroup_tg(cgrp));
8676} 9015}
8677#endif 9016#endif /* CONFIG_RT_GROUP_SCHED */
8678 9017
8679static struct cftype cpu_files[] = { 9018static struct cftype cpu_files[] = {
8680#ifdef CONFIG_FAIR_GROUP_SCHED 9019#ifdef CONFIG_FAIR_GROUP_SCHED