Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  899
1 file changed, 632 insertions, 267 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index eaf6751e7612..2317a2178104 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -70,10 +70,13 @@
70#include <linux/bootmem.h> 70#include <linux/bootmem.h>
71#include <linux/debugfs.h> 71#include <linux/debugfs.h>
72#include <linux/ctype.h> 72#include <linux/ctype.h>
73#include <linux/ftrace.h>
73 74
74#include <asm/tlb.h> 75#include <asm/tlb.h>
75#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
76 77
78#include "sched_cpupri.h"
79
77/* 80/*
78 * Convert user-nice values [ -20 ... 0 ... 19 ] 81 * Convert user-nice values [ -20 ... 0 ... 19 ]
79 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 82 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -289,15 +292,15 @@ struct task_group root_task_group;
289static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 292static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
290/* Default task group's cfs_rq on each cpu */ 293/* Default task group's cfs_rq on each cpu */
291static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 294static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
292#endif 295#endif /* CONFIG_FAIR_GROUP_SCHED */
293 296
294#ifdef CONFIG_RT_GROUP_SCHED 297#ifdef CONFIG_RT_GROUP_SCHED
295static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 298static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
296static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 299static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
297#endif 300#endif /* CONFIG_RT_GROUP_SCHED */
298#else 301#else /* !CONFIG_FAIR_GROUP_SCHED */
299#define root_task_group init_task_group 302#define root_task_group init_task_group
300#endif 303#endif /* CONFIG_FAIR_GROUP_SCHED */
301 304
302/* task_group_lock serializes add/remove of task groups and also changes to 305/* task_group_lock serializes add/remove of task groups and also changes to
303 * a task group's cpu shares. 306 * a task group's cpu shares.
@@ -307,9 +310,9 @@ static DEFINE_SPINLOCK(task_group_lock);
307#ifdef CONFIG_FAIR_GROUP_SCHED 310#ifdef CONFIG_FAIR_GROUP_SCHED
308#ifdef CONFIG_USER_SCHED 311#ifdef CONFIG_USER_SCHED
309# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 312# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
310#else 313#else /* !CONFIG_USER_SCHED */
311# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 314# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
312#endif 315#endif /* CONFIG_USER_SCHED */
313 316
314/* 317/*
315 * A weight of 0 or 1 can cause arithmetics problems. 318 * A weight of 0 or 1 can cause arithmetics problems.
@@ -363,6 +366,10 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
363#else 366#else
364 367
365static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 368static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
369static inline struct task_group *task_group(struct task_struct *p)
370{
371 return NULL;
372}
366 373
367#endif /* CONFIG_GROUP_SCHED */ 374#endif /* CONFIG_GROUP_SCHED */
368 375
@@ -373,6 +380,7 @@ struct cfs_rq {
373 380
374 u64 exec_clock; 381 u64 exec_clock;
375 u64 min_vruntime; 382 u64 min_vruntime;
383 u64 pair_start;
376 384
377 struct rb_root tasks_timeline; 385 struct rb_root tasks_timeline;
378 struct rb_node *rb_leftmost; 386 struct rb_node *rb_leftmost;
@@ -401,6 +409,31 @@ struct cfs_rq {
401 */ 409 */
402 struct list_head leaf_cfs_rq_list; 410 struct list_head leaf_cfs_rq_list;
403 struct task_group *tg; /* group that "owns" this runqueue */ 411 struct task_group *tg; /* group that "owns" this runqueue */
412
413#ifdef CONFIG_SMP
414 /*
415 * the part of load.weight contributed by tasks
416 */
417 unsigned long task_weight;
418
419 /*
420 * h_load = weight * f(tg)
421 *
422 * Where f(tg) is the recursive weight fraction assigned to
423 * this group.
424 */
425 unsigned long h_load;
426
427 /*
428 * this cpu's part of tg->shares
429 */
430 unsigned long shares;
431
432 /*
433 * load.weight at the time we set shares
434 */
435 unsigned long rq_weight;
436#endif
404#endif 437#endif
405}; 438};
406 439
@@ -452,6 +485,9 @@ struct root_domain {
452 */ 485 */
453 cpumask_t rto_mask; 486 cpumask_t rto_mask;
454 atomic_t rto_count; 487 atomic_t rto_count;
488#ifdef CONFIG_SMP
489 struct cpupri cpupri;
490#endif
455}; 491};
456 492
457/* 493/*
@@ -526,6 +562,9 @@ struct rq {
526 int push_cpu; 562 int push_cpu;
527 /* cpu of this runqueue: */ 563 /* cpu of this runqueue: */
528 int cpu; 564 int cpu;
565 int online;
566
567 unsigned long avg_load_per_task;
529 568
530 struct task_struct *migration_thread; 569 struct task_struct *migration_thread;
531 struct list_head migration_queue; 570 struct list_head migration_queue;
@@ -607,6 +646,24 @@ static inline void update_rq_clock(struct rq *rq)
607# define const_debug static const 646# define const_debug static const
608#endif 647#endif
609 648
649/**
650 * runqueue_is_locked
651 *
652 * Returns true if the current cpu runqueue is locked.
653 * This interface allows printk to be called with the runqueue lock
654 * held and know whether or not it is OK to wake up the klogd.
655 */
656int runqueue_is_locked(void)
657{
658 int cpu = get_cpu();
659 struct rq *rq = cpu_rq(cpu);
660 int ret;
661
662 ret = spin_is_locked(&rq->lock);
663 put_cpu();
664 return ret;
665}
666
610/* 667/*
611 * Debugging: various feature bits 668 * Debugging: various feature bits
612 */ 669 */
@@ -749,6 +806,12 @@ late_initcall(sched_init_debug);
749const_debug unsigned int sysctl_sched_nr_migrate = 32; 806const_debug unsigned int sysctl_sched_nr_migrate = 32;
750 807
751/* 808/*
809 * ratelimit for updating the group shares.
810 * default: 0.5ms
811 */
812const_debug unsigned int sysctl_sched_shares_ratelimit = 500000;
813
814/*
752 * period over which we measure -rt task cpu usage in us. 815 * period over which we measure -rt task cpu usage in us.
753 * default: 1s 816 * default: 1s
754 */ 817 */
@@ -775,82 +838,6 @@ static inline u64 global_rt_runtime(void)
775 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 838 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
776} 839}
777 840
778unsigned long long time_sync_thresh = 100000;
779
780static DEFINE_PER_CPU(unsigned long long, time_offset);
781static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
782
783/*
784 * Global lock which we take every now and then to synchronize
785 * the CPUs time. This method is not warp-safe, but it's good
786 * enough to synchronize slowly diverging time sources and thus
787 * it's good enough for tracing:
788 */
789static DEFINE_SPINLOCK(time_sync_lock);
790static unsigned long long prev_global_time;
791
792static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
793{
794 /*
795 * We want this inlined, to not get tracer function calls
796 * in this critical section:
797 */
798 spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
799 __raw_spin_lock(&time_sync_lock.raw_lock);
800
801 if (time < prev_global_time) {
802 per_cpu(time_offset, cpu) += prev_global_time - time;
803 time = prev_global_time;
804 } else {
805 prev_global_time = time;
806 }
807
808 __raw_spin_unlock(&time_sync_lock.raw_lock);
809 spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
810
811 return time;
812}
813
814static unsigned long long __cpu_clock(int cpu)
815{
816 unsigned long long now;
817
818 /*
819 * Only call sched_clock() if the scheduler has already been
820 * initialized (some code might call cpu_clock() very early):
821 */
822 if (unlikely(!scheduler_running))
823 return 0;
824
825 now = sched_clock_cpu(cpu);
826
827 return now;
828}
829
830/*
831 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
832 * clock constructed from sched_clock():
833 */
834unsigned long long cpu_clock(int cpu)
835{
836 unsigned long long prev_cpu_time, time, delta_time;
837 unsigned long flags;
838
839 local_irq_save(flags);
840 prev_cpu_time = per_cpu(prev_cpu_time, cpu);
841 time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
842 delta_time = time-prev_cpu_time;
843
844 if (unlikely(delta_time > time_sync_thresh)) {
845 time = __sync_cpu_clock(time, cpu);
846 per_cpu(prev_cpu_time, cpu) = time;
847 }
848 local_irq_restore(flags);
849
850 return time;
851}
852EXPORT_SYMBOL_GPL(cpu_clock);
853
854#ifndef prepare_arch_switch 841#ifndef prepare_arch_switch
855# define prepare_arch_switch(next) do { } while (0) 842# define prepare_arch_switch(next) do { } while (0)
856#endif 843#endif
@@ -1127,6 +1114,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
1127 return HRTIMER_NORESTART; 1114 return HRTIMER_NORESTART;
1128} 1115}
1129 1116
1117#ifdef CONFIG_SMP
1130static void hotplug_hrtick_disable(int cpu) 1118static void hotplug_hrtick_disable(int cpu)
1131{ 1119{
1132 struct rq *rq = cpu_rq(cpu); 1120 struct rq *rq = cpu_rq(cpu);
@@ -1182,6 +1170,7 @@ static void init_hrtick(void)
1182{ 1170{
1183 hotcpu_notifier(hotplug_hrtick, 0); 1171 hotcpu_notifier(hotplug_hrtick, 0);
1184} 1172}
1173#endif /* CONFIG_SMP */
1185 1174
1186static void init_rq_hrtick(struct rq *rq) 1175static void init_rq_hrtick(struct rq *rq)
1187{ 1176{
@@ -1311,15 +1300,15 @@ void wake_up_idle_cpu(int cpu)
1311 if (!tsk_is_polling(rq->idle)) 1300 if (!tsk_is_polling(rq->idle))
1312 smp_send_reschedule(cpu); 1301 smp_send_reschedule(cpu);
1313} 1302}
1314#endif 1303#endif /* CONFIG_NO_HZ */
1315 1304
1316#else 1305#else /* !CONFIG_SMP */
1317static void __resched_task(struct task_struct *p, int tif_bit) 1306static void __resched_task(struct task_struct *p, int tif_bit)
1318{ 1307{
1319 assert_spin_locked(&task_rq(p)->lock); 1308 assert_spin_locked(&task_rq(p)->lock);
1320 set_tsk_thread_flag(p, tif_bit); 1309 set_tsk_thread_flag(p, tif_bit);
1321} 1310}
1322#endif 1311#endif /* CONFIG_SMP */
1323 1312
1324#if BITS_PER_LONG == 32 1313#if BITS_PER_LONG == 32
1325# define WMULT_CONST (~0UL) 1314# define WMULT_CONST (~0UL)
@@ -1334,6 +1323,9 @@ static void __resched_task(struct task_struct *p, int tif_bit)
1334 */ 1323 */
1335#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 1324#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1336 1325
1326/*
1327 * delta *= weight / lw
1328 */
1337static unsigned long 1329static unsigned long
1338calc_delta_mine(unsigned long delta_exec, unsigned long weight, 1330calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1339 struct load_weight *lw) 1331 struct load_weight *lw)
@@ -1361,12 +1353,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1361 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 1353 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1362} 1354}
1363 1355
1364static inline unsigned long
1365calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
1366{
1367 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
1368}
1369
1370static inline void update_load_add(struct load_weight *lw, unsigned long inc) 1356static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1371{ 1357{
1372 lw->weight += inc; 1358 lw->weight += inc;
@@ -1477,17 +1463,211 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1477#ifdef CONFIG_SMP 1463#ifdef CONFIG_SMP
1478static unsigned long source_load(int cpu, int type); 1464static unsigned long source_load(int cpu, int type);
1479static unsigned long target_load(int cpu, int type); 1465static unsigned long target_load(int cpu, int type);
1480static unsigned long cpu_avg_load_per_task(int cpu);
1481static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1466static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1482#else /* CONFIG_SMP */ 1467
1468static unsigned long cpu_avg_load_per_task(int cpu)
1469{
1470 struct rq *rq = cpu_rq(cpu);
1471
1472 if (rq->nr_running)
1473 rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1474
1475 return rq->avg_load_per_task;
1476}
1483 1477
1484#ifdef CONFIG_FAIR_GROUP_SCHED 1478#ifdef CONFIG_FAIR_GROUP_SCHED
1485static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) 1479
1480typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
1481
1482/*
1483 * Iterate the full tree, calling @down when first entering a node and @up when
1484 * leaving it for the final time.
1485 */
1486static void
1487walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
1486{ 1488{
1489 struct task_group *parent, *child;
1490
1491 rcu_read_lock();
1492 parent = &root_task_group;
1493down:
1494 (*down)(parent, cpu, sd);
1495 list_for_each_entry_rcu(child, &parent->children, siblings) {
1496 parent = child;
1497 goto down;
1498
1499up:
1500 continue;
1501 }
1502 (*up)(parent, cpu, sd);
1503
1504 child = parent;
1505 parent = parent->parent;
1506 if (parent)
1507 goto up;
1508 rcu_read_unlock();
1487} 1509}
1510
1511static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1512
1513/*
1514 * Calculate and set the cpu's group shares.
1515 */
1516static void
1517__update_group_shares_cpu(struct task_group *tg, int cpu,
1518 unsigned long sd_shares, unsigned long sd_rq_weight)
1519{
1520 int boost = 0;
1521 unsigned long shares;
1522 unsigned long rq_weight;
1523
1524 if (!tg->se[cpu])
1525 return;
1526
1527 rq_weight = tg->cfs_rq[cpu]->load.weight;
1528
1529 /*
1530 * If there are currently no tasks on the cpu pretend there is one of
1531 * average load so that when a new task gets to run here it will not
1532 * get delayed by group starvation.
1533 */
1534 if (!rq_weight) {
1535 boost = 1;
1536 rq_weight = NICE_0_LOAD;
1537 }
1538
1539 if (unlikely(rq_weight > sd_rq_weight))
1540 rq_weight = sd_rq_weight;
1541
1542 /*
1543 * \Sum shares * rq_weight
1544 * shares = -----------------------
1545 * \Sum rq_weight
1546 *
1547 */
1548 shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
1549
1550 /*
1551 * record the actual number of shares, not the boosted amount.
1552 */
1553 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1554 tg->cfs_rq[cpu]->rq_weight = rq_weight;
1555
1556 if (shares < MIN_SHARES)
1557 shares = MIN_SHARES;
1558 else if (shares > MAX_SHARES)
1559 shares = MAX_SHARES;
1560
1561 __set_se_shares(tg->se[cpu], shares);
1562}
1563
1564/*
1565 * Re-compute the task group their per cpu shares over the given domain.
1566 * This needs to be done in a bottom-up fashion because the rq weight of a
1567 * parent group depends on the shares of its child groups.
1568 */
1569static void
1570tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1571{
1572 unsigned long rq_weight = 0;
1573 unsigned long shares = 0;
1574 int i;
1575
1576 for_each_cpu_mask(i, sd->span) {
1577 rq_weight += tg->cfs_rq[i]->load.weight;
1578 shares += tg->cfs_rq[i]->shares;
1579 }
1580
1581 if ((!shares && rq_weight) || shares > tg->shares)
1582 shares = tg->shares;
1583
1584 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1585 shares = tg->shares;
1586
1587 if (!rq_weight)
1588 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
1589
1590 for_each_cpu_mask(i, sd->span) {
1591 struct rq *rq = cpu_rq(i);
1592 unsigned long flags;
1593
1594 spin_lock_irqsave(&rq->lock, flags);
1595 __update_group_shares_cpu(tg, i, shares, rq_weight);
1596 spin_unlock_irqrestore(&rq->lock, flags);
1597 }
1598}
1599
1600/*
1601 * Compute the cpu's hierarchical load factor for each task group.
1602 * This needs to be done in a top-down fashion because the load of a child
1603 * group is a fraction of its parents load.
1604 */
1605static void
1606tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
1607{
1608 unsigned long load;
1609
1610 if (!tg->parent) {
1611 load = cpu_rq(cpu)->load.weight;
1612 } else {
1613 load = tg->parent->cfs_rq[cpu]->h_load;
1614 load *= tg->cfs_rq[cpu]->shares;
1615 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1616 }
1617
1618 tg->cfs_rq[cpu]->h_load = load;
1619}
1620
1621static void
1622tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
1623{
1624}
1625
1626static void update_shares(struct sched_domain *sd)
1627{
1628 u64 now = cpu_clock(raw_smp_processor_id());
1629 s64 elapsed = now - sd->last_update;
1630
1631 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1632 sd->last_update = now;
1633 walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
1634 }
1635}
1636
1637static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1638{
1639 spin_unlock(&rq->lock);
1640 update_shares(sd);
1641 spin_lock(&rq->lock);
1642}
1643
1644static void update_h_load(int cpu)
1645{
1646 walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
1647}
1648
1649#else
1650
1651static inline void update_shares(struct sched_domain *sd)
1652{
1653}
1654
1655static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1656{
1657}
1658
1488#endif 1659#endif
1489 1660
1490#endif /* CONFIG_SMP */ 1661#endif
1662
1663#ifdef CONFIG_FAIR_GROUP_SCHED
1664static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1665{
1666#ifdef CONFIG_SMP
1667 cfs_rq->shares = shares;
1668#endif
1669}
1670#endif
1491 1671
1492#include "sched_stats.h" 1672#include "sched_stats.h"
1493#include "sched_idletask.c" 1673#include "sched_idletask.c"
@@ -1498,27 +1678,17 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1498#endif 1678#endif
1499 1679
1500#define sched_class_highest (&rt_sched_class) 1680#define sched_class_highest (&rt_sched_class)
1681#define for_each_class(class) \
1682 for (class = sched_class_highest; class; class = class->next)
1501 1683
1502static inline void inc_load(struct rq *rq, const struct task_struct *p) 1684static void inc_nr_running(struct rq *rq)
1503{
1504 update_load_add(&rq->load, p->se.load.weight);
1505}
1506
1507static inline void dec_load(struct rq *rq, const struct task_struct *p)
1508{
1509 update_load_sub(&rq->load, p->se.load.weight);
1510}
1511
1512static void inc_nr_running(struct task_struct *p, struct rq *rq)
1513{ 1685{
1514 rq->nr_running++; 1686 rq->nr_running++;
1515 inc_load(rq, p);
1516} 1687}
1517 1688
1518static void dec_nr_running(struct task_struct *p, struct rq *rq) 1689static void dec_nr_running(struct rq *rq)
1519{ 1690{
1520 rq->nr_running--; 1691 rq->nr_running--;
1521 dec_load(rq, p);
1522} 1692}
1523 1693
1524static void set_load_weight(struct task_struct *p) 1694static void set_load_weight(struct task_struct *p)
@@ -1542,6 +1712,12 @@ static void set_load_weight(struct task_struct *p)
1542 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1712 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1543} 1713}
1544 1714
1715static void update_avg(u64 *avg, u64 sample)
1716{
1717 s64 diff = sample - *avg;
1718 *avg += diff >> 3;
1719}
1720
1545static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1721static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1546{ 1722{
1547 sched_info_queued(p); 1723 sched_info_queued(p);
@@ -1551,6 +1727,13 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1551 1727
1552static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1728static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1553{ 1729{
1730 if (sleep && p->se.last_wakeup) {
1731 update_avg(&p->se.avg_overlap,
1732 p->se.sum_exec_runtime - p->se.last_wakeup);
1733 p->se.last_wakeup = 0;
1734 }
1735
1736 sched_info_dequeued(p);
1554 p->sched_class->dequeue_task(rq, p, sleep); 1737 p->sched_class->dequeue_task(rq, p, sleep);
1555 p->se.on_rq = 0; 1738 p->se.on_rq = 0;
1556} 1739}
@@ -1610,7 +1793,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1610 rq->nr_uninterruptible--; 1793 rq->nr_uninterruptible--;
1611 1794
1612 enqueue_task(rq, p, wakeup); 1795 enqueue_task(rq, p, wakeup);
1613 inc_nr_running(p, rq); 1796 inc_nr_running(rq);
1614} 1797}
1615 1798
1616/* 1799/*
@@ -1622,7 +1805,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1622 rq->nr_uninterruptible++; 1805 rq->nr_uninterruptible++;
1623 1806
1624 dequeue_task(rq, p, sleep); 1807 dequeue_task(rq, p, sleep);
1625 dec_nr_running(p, rq); 1808 dec_nr_running(rq);
1626} 1809}
1627 1810
1628/** 1811/**
@@ -1634,12 +1817,6 @@ inline int task_curr(const struct task_struct *p)
1634 return cpu_curr(task_cpu(p)) == p; 1817 return cpu_curr(task_cpu(p)) == p;
1635} 1818}
1636 1819
1637/* Used instead of source_load when we know the type == 0 */
1638unsigned long weighted_cpuload(const int cpu)
1639{
1640 return cpu_rq(cpu)->load.weight;
1641}
1642
1643static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1820static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1644{ 1821{
1645 set_task_rq(p, cpu); 1822 set_task_rq(p, cpu);
@@ -1668,6 +1845,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1668 1845
1669#ifdef CONFIG_SMP 1846#ifdef CONFIG_SMP
1670 1847
1848/* Used instead of source_load when we know the type == 0 */
1849static unsigned long weighted_cpuload(const int cpu)
1850{
1851 return cpu_rq(cpu)->load.weight;
1852}
1853
1671/* 1854/*
1672 * Is this task likely cache-hot: 1855 * Is this task likely cache-hot:
1673 */ 1856 */
@@ -1878,7 +2061,7 @@ static unsigned long source_load(int cpu, int type)
1878 struct rq *rq = cpu_rq(cpu); 2061 struct rq *rq = cpu_rq(cpu);
1879 unsigned long total = weighted_cpuload(cpu); 2062 unsigned long total = weighted_cpuload(cpu);
1880 2063
1881 if (type == 0) 2064 if (type == 0 || !sched_feat(LB_BIAS))
1882 return total; 2065 return total;
1883 2066
1884 return min(rq->cpu_load[type-1], total); 2067 return min(rq->cpu_load[type-1], total);
@@ -1893,25 +2076,13 @@ static unsigned long target_load(int cpu, int type)
1893 struct rq *rq = cpu_rq(cpu); 2076 struct rq *rq = cpu_rq(cpu);
1894 unsigned long total = weighted_cpuload(cpu); 2077 unsigned long total = weighted_cpuload(cpu);
1895 2078
1896 if (type == 0) 2079 if (type == 0 || !sched_feat(LB_BIAS))
1897 return total; 2080 return total;
1898 2081
1899 return max(rq->cpu_load[type-1], total); 2082 return max(rq->cpu_load[type-1], total);
1900} 2083}
1901 2084
1902/* 2085/*
1903 * Return the average load per task on the cpu's run queue
1904 */
1905static unsigned long cpu_avg_load_per_task(int cpu)
1906{
1907 struct rq *rq = cpu_rq(cpu);
1908 unsigned long total = weighted_cpuload(cpu);
1909 unsigned long n = rq->nr_running;
1910
1911 return n ? total / n : SCHED_LOAD_SCALE;
1912}
1913
1914/*
1915 * find_idlest_group finds and returns the least busy CPU group within the 2086 * find_idlest_group finds and returns the least busy CPU group within the
1916 * domain. 2087 * domain.
1917 */ 2088 */
@@ -2017,6 +2188,9 @@ static int sched_balance_self(int cpu, int flag)
2017 sd = tmp; 2188 sd = tmp;
2018 } 2189 }
2019 2190
2191 if (sd)
2192 update_shares(sd);
2193
2020 while (sd) { 2194 while (sd) {
2021 cpumask_t span, tmpmask; 2195 cpumask_t span, tmpmask;
2022 struct sched_group *group; 2196 struct sched_group *group;
@@ -2083,6 +2257,22 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2083 if (!sched_feat(SYNC_WAKEUPS)) 2257 if (!sched_feat(SYNC_WAKEUPS))
2084 sync = 0; 2258 sync = 0;
2085 2259
2260#ifdef CONFIG_SMP
2261 if (sched_feat(LB_WAKEUP_UPDATE)) {
2262 struct sched_domain *sd;
2263
2264 this_cpu = raw_smp_processor_id();
2265 cpu = task_cpu(p);
2266
2267 for_each_domain(this_cpu, sd) {
2268 if (cpu_isset(cpu, sd->span)) {
2269 update_shares(sd);
2270 break;
2271 }
2272 }
2273 }
2274#endif
2275
2086 smp_wmb(); 2276 smp_wmb();
2087 rq = task_rq_lock(p, &flags); 2277 rq = task_rq_lock(p, &flags);
2088 old_state = p->state; 2278 old_state = p->state;
@@ -2129,7 +2319,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2129 } 2319 }
2130 } 2320 }
2131 } 2321 }
2132#endif 2322#endif /* CONFIG_SCHEDSTATS */
2133 2323
2134out_activate: 2324out_activate:
2135#endif /* CONFIG_SMP */ 2325#endif /* CONFIG_SMP */
@@ -2147,6 +2337,9 @@ out_activate:
2147 success = 1; 2337 success = 1;
2148 2338
2149out_running: 2339out_running:
2340 trace_mark(kernel_sched_wakeup,
2341 "pid %d state %ld ## rq %p task %p rq->curr %p",
2342 p->pid, p->state, rq, p, rq->curr);
2150 check_preempt_curr(rq, p); 2343 check_preempt_curr(rq, p);
2151 2344
2152 p->state = TASK_RUNNING; 2345 p->state = TASK_RUNNING;
@@ -2155,6 +2348,8 @@ out_running:
2155 p->sched_class->task_wake_up(rq, p); 2348 p->sched_class->task_wake_up(rq, p);
2156#endif 2349#endif
2157out: 2350out:
2351 current->se.last_wakeup = current->se.sum_exec_runtime;
2352
2158 task_rq_unlock(rq, &flags); 2353 task_rq_unlock(rq, &flags);
2159 2354
2160 return success; 2355 return success;
@@ -2275,8 +2470,11 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2275 * management (if any): 2470 * management (if any):
2276 */ 2471 */
2277 p->sched_class->task_new(rq, p); 2472 p->sched_class->task_new(rq, p);
2278 inc_nr_running(p, rq); 2473 inc_nr_running(rq);
2279 } 2474 }
2475 trace_mark(kernel_sched_wakeup_new,
2476 "pid %d state %ld ## rq %p task %p rq->curr %p",
2477 p->pid, p->state, rq, p, rq->curr);
2280 check_preempt_curr(rq, p); 2478 check_preempt_curr(rq, p);
2281#ifdef CONFIG_SMP 2479#ifdef CONFIG_SMP
2282 if (p->sched_class->task_wake_up) 2480 if (p->sched_class->task_wake_up)
@@ -2329,7 +2527,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
2329 notifier->ops->sched_out(notifier, next); 2527 notifier->ops->sched_out(notifier, next);
2330} 2528}
2331 2529
2332#else 2530#else /* !CONFIG_PREEMPT_NOTIFIERS */
2333 2531
2334static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2532static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2335{ 2533{
@@ -2341,7 +2539,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
2341{ 2539{
2342} 2540}
2343 2541
2344#endif 2542#endif /* CONFIG_PREEMPT_NOTIFIERS */
2345 2543
2346/** 2544/**
2347 * prepare_task_switch - prepare to switch tasks 2545 * prepare_task_switch - prepare to switch tasks
@@ -2449,6 +2647,11 @@ context_switch(struct rq *rq, struct task_struct *prev,
2449 struct mm_struct *mm, *oldmm; 2647 struct mm_struct *mm, *oldmm;
2450 2648
2451 prepare_task_switch(rq, prev, next); 2649 prepare_task_switch(rq, prev, next);
2650 trace_mark(kernel_sched_schedule,
2651 "prev_pid %d next_pid %d prev_state %ld "
2652 "## rq %p prev %p next %p",
2653 prev->pid, next->pid, prev->state,
2654 rq, prev, next);
2452 mm = next->mm; 2655 mm = next->mm;
2453 oldmm = prev->active_mm; 2656 oldmm = prev->active_mm;
2454 /* 2657 /*
@@ -2783,7 +2986,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2783 enum cpu_idle_type idle, int *all_pinned, 2986 enum cpu_idle_type idle, int *all_pinned,
2784 int *this_best_prio, struct rq_iterator *iterator) 2987 int *this_best_prio, struct rq_iterator *iterator)
2785{ 2988{
2786 int loops = 0, pulled = 0, pinned = 0, skip_for_load; 2989 int loops = 0, pulled = 0, pinned = 0;
2787 struct task_struct *p; 2990 struct task_struct *p;
2788 long rem_load_move = max_load_move; 2991 long rem_load_move = max_load_move;
2789 2992
@@ -2799,14 +3002,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2799next: 3002next:
2800 if (!p || loops++ > sysctl_sched_nr_migrate) 3003 if (!p || loops++ > sysctl_sched_nr_migrate)
2801 goto out; 3004 goto out;
2802 /* 3005
2803 * To help distribute high priority tasks across CPUs we don't 3006 if ((p->se.load.weight >> 1) > rem_load_move ||
2804 * skip a task if it will be the highest priority task (i.e. smallest
2805 * prio value) on its new queue regardless of its load weight
2806 */
2807 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2808 SCHED_LOAD_SCALE_FUZZ;
2809 if ((skip_for_load && p->prio >= *this_best_prio) ||
2810 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { 3007 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2811 p = iterator->next(iterator->arg); 3008 p = iterator->next(iterator->arg);
2812 goto next; 3009 goto next;
@@ -2861,6 +3058,10 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2861 max_load_move - total_load_moved, 3058 max_load_move - total_load_moved,
2862 sd, idle, all_pinned, &this_best_prio); 3059 sd, idle, all_pinned, &this_best_prio);
2863 class = class->next; 3060 class = class->next;
3061
3062 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3063 break;
3064
2864 } while (class && max_load_move > total_load_moved); 3065 } while (class && max_load_move > total_load_moved);
2865 3066
2866 return total_load_moved > 0; 3067 return total_load_moved > 0;
@@ -2937,6 +3138,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2937 max_load = this_load = total_load = total_pwr = 0; 3138 max_load = this_load = total_load = total_pwr = 0;
2938 busiest_load_per_task = busiest_nr_running = 0; 3139 busiest_load_per_task = busiest_nr_running = 0;
2939 this_load_per_task = this_nr_running = 0; 3140 this_load_per_task = this_nr_running = 0;
3141
2940 if (idle == CPU_NOT_IDLE) 3142 if (idle == CPU_NOT_IDLE)
2941 load_idx = sd->busy_idx; 3143 load_idx = sd->busy_idx;
2942 else if (idle == CPU_NEWLY_IDLE) 3144 else if (idle == CPU_NEWLY_IDLE)
@@ -2951,6 +3153,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2951 int __group_imb = 0; 3153 int __group_imb = 0;
2952 unsigned int balance_cpu = -1, first_idle_cpu = 0; 3154 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2953 unsigned long sum_nr_running, sum_weighted_load; 3155 unsigned long sum_nr_running, sum_weighted_load;
3156 unsigned long sum_avg_load_per_task;
3157 unsigned long avg_load_per_task;
2954 3158
2955 local_group = cpu_isset(this_cpu, group->cpumask); 3159 local_group = cpu_isset(this_cpu, group->cpumask);
2956 3160
@@ -2959,6 +3163,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2959 3163
2960 /* Tally up the load of all CPUs in the group */ 3164 /* Tally up the load of all CPUs in the group */
2961 sum_weighted_load = sum_nr_running = avg_load = 0; 3165 sum_weighted_load = sum_nr_running = avg_load = 0;
3166 sum_avg_load_per_task = avg_load_per_task = 0;
3167
2962 max_cpu_load = 0; 3168 max_cpu_load = 0;
2963 min_cpu_load = ~0UL; 3169 min_cpu_load = ~0UL;
2964 3170
@@ -2992,6 +3198,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2992 avg_load += load; 3198 avg_load += load;
2993 sum_nr_running += rq->nr_running; 3199 sum_nr_running += rq->nr_running;
2994 sum_weighted_load += weighted_cpuload(i); 3200 sum_weighted_load += weighted_cpuload(i);
3201
3202 sum_avg_load_per_task += cpu_avg_load_per_task(i);
2995 } 3203 }
2996 3204
2997 /* 3205 /*
@@ -3013,7 +3221,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3013 avg_load = sg_div_cpu_power(group, 3221 avg_load = sg_div_cpu_power(group,
3014 avg_load * SCHED_LOAD_SCALE); 3222 avg_load * SCHED_LOAD_SCALE);
3015 3223
3016 if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE) 3224
3225 /*
3226 * Consider the group unbalanced when the imbalance is larger
3227 * than the average weight of two tasks.
3228 *
3229 * APZ: with cgroup the avg task weight can vary wildly and
3230 * might not be a suitable number - should we keep a
3231 * normalized nr_running number somewhere that negates
3232 * the hierarchy?
3233 */
3234 avg_load_per_task = sg_div_cpu_power(group,
3235 sum_avg_load_per_task * SCHED_LOAD_SCALE);
3236
3237 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3017 __group_imb = 1; 3238 __group_imb = 1;
3018 3239
3019 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 3240 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
@@ -3154,9 +3375,9 @@ small_imbalance:
3154 if (busiest_load_per_task > this_load_per_task) 3375 if (busiest_load_per_task > this_load_per_task)
3155 imbn = 1; 3376 imbn = 1;
3156 } else 3377 } else
3157 this_load_per_task = SCHED_LOAD_SCALE; 3378 this_load_per_task = cpu_avg_load_per_task(this_cpu);
3158 3379
3159 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= 3380 if (max_load - this_load + 2*busiest_load_per_task >=
3160 busiest_load_per_task * imbn) { 3381 busiest_load_per_task * imbn) {
3161 *imbalance = busiest_load_per_task; 3382 *imbalance = busiest_load_per_task;
3162 return busiest; 3383 return busiest;
@@ -3282,6 +3503,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3282 schedstat_inc(sd, lb_count[idle]); 3503 schedstat_inc(sd, lb_count[idle]);
3283 3504
3284redo: 3505redo:
3506 update_shares(sd);
3285 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3507 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3286 cpus, balance); 3508 cpus, balance);
3287 3509
@@ -3384,8 +3606,9 @@ redo:
3384 3606
3385 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3607 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3386 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3608 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3387 return -1; 3609 ld_moved = -1;
3388 return ld_moved; 3610
3611 goto out;
3389 3612
3390out_balanced: 3613out_balanced:
3391 schedstat_inc(sd, lb_balanced[idle]); 3614 schedstat_inc(sd, lb_balanced[idle]);
@@ -3400,8 +3623,13 @@ out_one_pinned:
3400 3623
3401 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3624 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3402 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3625 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3403 return -1; 3626 ld_moved = -1;
3404 return 0; 3627 else
3628 ld_moved = 0;
3629out:
3630 if (ld_moved)
3631 update_shares(sd);
3632 return ld_moved;
3405} 3633}
3406 3634
3407/* 3635/*
@@ -3436,6 +3664,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3436 3664
3437 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); 3665 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
3438redo: 3666redo:
3667 update_shares_locked(this_rq, sd);
3439 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, 3668 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
3440 &sd_idle, cpus, NULL); 3669 &sd_idle, cpus, NULL);
3441 if (!group) { 3670 if (!group) {
@@ -3479,6 +3708,7 @@ redo:
3479 } else 3708 } else
3480 sd->nr_balance_failed = 0; 3709 sd->nr_balance_failed = 0;
3481 3710
3711 update_shares_locked(this_rq, sd);
3482 return ld_moved; 3712 return ld_moved;
3483 3713
3484out_balanced: 3714out_balanced:
@@ -3670,6 +3900,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3670 /* Earliest time when we have to do rebalance again */ 3900 /* Earliest time when we have to do rebalance again */
3671 unsigned long next_balance = jiffies + 60*HZ; 3901 unsigned long next_balance = jiffies + 60*HZ;
3672 int update_next_balance = 0; 3902 int update_next_balance = 0;
3903 int need_serialize;
3673 cpumask_t tmp; 3904 cpumask_t tmp;
3674 3905
3675 for_each_domain(cpu, sd) { 3906 for_each_domain(cpu, sd) {
@@ -3687,8 +3918,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3687 if (interval > HZ*NR_CPUS/10) 3918 if (interval > HZ*NR_CPUS/10)
3688 interval = HZ*NR_CPUS/10; 3919 interval = HZ*NR_CPUS/10;
3689 3920
3921 need_serialize = sd->flags & SD_SERIALIZE;
3690 3922
3691 if (sd->flags & SD_SERIALIZE) { 3923 if (need_serialize) {
3692 if (!spin_trylock(&balancing)) 3924 if (!spin_trylock(&balancing))
3693 goto out; 3925 goto out;
3694 } 3926 }
@@ -3704,7 +3936,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3704 } 3936 }
3705 sd->last_balance = jiffies; 3937 sd->last_balance = jiffies;
3706 } 3938 }
3707 if (sd->flags & SD_SERIALIZE) 3939 if (need_serialize)
3708 spin_unlock(&balancing); 3940 spin_unlock(&balancing);
3709out: 3941out:
3710 if (time_after(next_balance, sd->last_balance + interval)) { 3942 if (time_after(next_balance, sd->last_balance + interval)) {
@@ -4019,26 +4251,44 @@ void scheduler_tick(void)
4019#endif 4251#endif
4020} 4252}
4021 4253
4022#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) 4254#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4255 defined(CONFIG_PREEMPT_TRACER))
4256
4257static inline unsigned long get_parent_ip(unsigned long addr)
4258{
4259 if (in_lock_functions(addr)) {
4260 addr = CALLER_ADDR2;
4261 if (in_lock_functions(addr))
4262 addr = CALLER_ADDR3;
4263 }
4264 return addr;
4265}
4023 4266
4024void __kprobes add_preempt_count(int val) 4267void __kprobes add_preempt_count(int val)
4025{ 4268{
4269#ifdef CONFIG_DEBUG_PREEMPT
4026 /* 4270 /*
4027 * Underflow? 4271 * Underflow?
4028 */ 4272 */
4029 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 4273 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4030 return; 4274 return;
4275#endif
4031 preempt_count() += val; 4276 preempt_count() += val;
4277#ifdef CONFIG_DEBUG_PREEMPT
4032 /* 4278 /*
4033 * Spinlock count overflowing soon? 4279 * Spinlock count overflowing soon?
4034 */ 4280 */
4035 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 4281 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4036 PREEMPT_MASK - 10); 4282 PREEMPT_MASK - 10);
4283#endif
4284 if (preempt_count() == val)
4285 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4037} 4286}
4038EXPORT_SYMBOL(add_preempt_count); 4287EXPORT_SYMBOL(add_preempt_count);
4039 4288
4040void __kprobes sub_preempt_count(int val) 4289void __kprobes sub_preempt_count(int val)
4041{ 4290{
4291#ifdef CONFIG_DEBUG_PREEMPT
4042 /* 4292 /*
4043 * Underflow? 4293 * Underflow?
4044 */ 4294 */
@@ -4050,7 +4300,10 @@ void __kprobes sub_preempt_count(int val)
4050 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 4300 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4051 !(preempt_count() & PREEMPT_MASK))) 4301 !(preempt_count() & PREEMPT_MASK)))
4052 return; 4302 return;
4303#endif
4053 4304
4305 if (preempt_count() == val)
4306 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4054 preempt_count() -= val; 4307 preempt_count() -= val;
4055} 4308}
4056EXPORT_SYMBOL(sub_preempt_count); 4309EXPORT_SYMBOL(sub_preempt_count);
@@ -4068,6 +4321,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
4068 prev->comm, prev->pid, preempt_count()); 4321 prev->comm, prev->pid, preempt_count());
4069 4322
4070 debug_show_held_locks(prev); 4323 debug_show_held_locks(prev);
4324 print_modules();
4071 if (irqs_disabled()) 4325 if (irqs_disabled())
4072 print_irqtrace_events(prev); 4326 print_irqtrace_events(prev);
4073 4327
@@ -4141,7 +4395,7 @@ asmlinkage void __sched schedule(void)
4141 struct task_struct *prev, *next; 4395 struct task_struct *prev, *next;
4142 unsigned long *switch_count; 4396 unsigned long *switch_count;
4143 struct rq *rq; 4397 struct rq *rq;
4144 int cpu; 4398 int cpu, hrtick = sched_feat(HRTICK);
4145 4399
4146need_resched: 4400need_resched:
4147 preempt_disable(); 4401 preempt_disable();
@@ -4156,7 +4410,8 @@ need_resched_nonpreemptible:
4156 4410
4157 schedule_debug(prev); 4411 schedule_debug(prev);
4158 4412
4159 hrtick_clear(rq); 4413 if (hrtick)
4414 hrtick_clear(rq);
4160 4415
4161 /* 4416 /*
4162 * Do the rq-clock update outside the rq lock: 4417 * Do the rq-clock update outside the rq lock:
@@ -4202,7 +4457,8 @@ need_resched_nonpreemptible:
4202 } else 4457 } else
4203 spin_unlock_irq(&rq->lock); 4458 spin_unlock_irq(&rq->lock);
4204 4459
4205 hrtick_set(rq); 4460 if (hrtick)
4461 hrtick_set(rq);
4206 4462
4207 if (unlikely(reacquire_kernel_lock(current) < 0)) 4463 if (unlikely(reacquire_kernel_lock(current) < 0))
4208 goto need_resched_nonpreemptible; 4464 goto need_resched_nonpreemptible;
@@ -4396,22 +4652,20 @@ do_wait_for_common(struct completion *x, long timeout, int state)
4396 signal_pending(current)) || 4652 signal_pending(current)) ||
4397 (state == TASK_KILLABLE && 4653 (state == TASK_KILLABLE &&
4398 fatal_signal_pending(current))) { 4654 fatal_signal_pending(current))) {
4399 __remove_wait_queue(&x->wait, &wait); 4655 timeout = -ERESTARTSYS;
4400 return -ERESTARTSYS; 4656 break;
4401 } 4657 }
4402 __set_current_state(state); 4658 __set_current_state(state);
4403 spin_unlock_irq(&x->wait.lock); 4659 spin_unlock_irq(&x->wait.lock);
4404 timeout = schedule_timeout(timeout); 4660 timeout = schedule_timeout(timeout);
4405 spin_lock_irq(&x->wait.lock); 4661 spin_lock_irq(&x->wait.lock);
4406 if (!timeout) { 4662 } while (!x->done && timeout);
4407 __remove_wait_queue(&x->wait, &wait);
4408 return timeout;
4409 }
4410 } while (!x->done);
4411 __remove_wait_queue(&x->wait, &wait); 4663 __remove_wait_queue(&x->wait, &wait);
4664 if (!x->done)
4665 return timeout;
4412 } 4666 }
4413 x->done--; 4667 x->done--;
4414 return timeout; 4668 return timeout ?: 1;
4415} 4669}
4416 4670
4417static long __sched 4671static long __sched
@@ -4586,10 +4840,8 @@ void set_user_nice(struct task_struct *p, long nice)
4586 goto out_unlock; 4840 goto out_unlock;
4587 } 4841 }
4588 on_rq = p->se.on_rq; 4842 on_rq = p->se.on_rq;
4589 if (on_rq) { 4843 if (on_rq)
4590 dequeue_task(rq, p, 0); 4844 dequeue_task(rq, p, 0);
4591 dec_load(rq, p);
4592 }
4593 4845
4594 p->static_prio = NICE_TO_PRIO(nice); 4846 p->static_prio = NICE_TO_PRIO(nice);
4595 set_load_weight(p); 4847 set_load_weight(p);
@@ -4599,7 +4851,6 @@ void set_user_nice(struct task_struct *p, long nice)
4599 4851
4600 if (on_rq) { 4852 if (on_rq) {
4601 enqueue_task(rq, p, 0); 4853 enqueue_task(rq, p, 0);
4602 inc_load(rq, p);
4603 /* 4854 /*
4604 * If the task increased its priority or is running and 4855 * If the task increased its priority or is running and
4605 * lowered its priority, then reschedule its CPU: 4856 * lowered its priority, then reschedule its CPU:
@@ -4744,16 +4995,8 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4744 set_load_weight(p); 4995 set_load_weight(p);
4745} 4996}
4746 4997
4747/** 4998static int __sched_setscheduler(struct task_struct *p, int policy,
4748 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 4999 struct sched_param *param, bool user)
4749 * @p: the task in question.
4750 * @policy: new policy.
4751 * @param: structure containing the new RT priority.
4752 *
4753 * NOTE that the task may be already dead.
4754 */
4755int sched_setscheduler(struct task_struct *p, int policy,
4756 struct sched_param *param)
4757{ 5000{
4758 int retval, oldprio, oldpolicy = -1, on_rq, running; 5001 int retval, oldprio, oldpolicy = -1, on_rq, running;
4759 unsigned long flags; 5002 unsigned long flags;
@@ -4785,7 +5028,7 @@ recheck:
4785 /* 5028 /*
4786 * Allow unprivileged RT tasks to decrease priority: 5029 * Allow unprivileged RT tasks to decrease priority:
4787 */ 5030 */
4788 if (!capable(CAP_SYS_NICE)) { 5031 if (user && !capable(CAP_SYS_NICE)) {
4789 if (rt_policy(policy)) { 5032 if (rt_policy(policy)) {
4790 unsigned long rlim_rtprio; 5033 unsigned long rlim_rtprio;
4791 5034
@@ -4821,7 +5064,8 @@ recheck:
4821 * Do not allow realtime tasks into groups that have no runtime 5064 * Do not allow realtime tasks into groups that have no runtime
4822 * assigned. 5065 * assigned.
4823 */ 5066 */
4824 if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) 5067 if (user
5068 && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
4825 return -EPERM; 5069 return -EPERM;
4826#endif 5070#endif
4827 5071
@@ -4870,8 +5114,39 @@ recheck:
4870 5114
4871 return 0; 5115 return 0;
4872} 5116}
5117
5118/**
5119 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
5120 * @p: the task in question.
5121 * @policy: new policy.
5122 * @param: structure containing the new RT priority.
5123 *
5124 * NOTE that the task may be already dead.
5125 */
5126int sched_setscheduler(struct task_struct *p, int policy,
5127 struct sched_param *param)
5128{
5129 return __sched_setscheduler(p, policy, param, true);
5130}
4873EXPORT_SYMBOL_GPL(sched_setscheduler); 5131EXPORT_SYMBOL_GPL(sched_setscheduler);
4874 5132
5133/**
5134 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
5135 * @p: the task in question.
5136 * @policy: new policy.
5137 * @param: structure containing the new RT priority.
5138 *
5139 * Just like sched_setscheduler, only don't bother checking if the
5140 * current context has permission. For example, this is needed in
5141 * stop_machine(): we create temporary high priority worker threads,
5142 * but our caller might not have that capability.
5143 */
5144int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5145 struct sched_param *param)
5146{
5147 return __sched_setscheduler(p, policy, param, false);
5148}
5149
4875static int 5150static int
4876do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 5151do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4877{ 5152{
@@ -5070,24 +5345,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
5070 return sched_setaffinity(pid, &new_mask); 5345 return sched_setaffinity(pid, &new_mask);
5071} 5346}
5072 5347
5073/*
5074 * Represents all cpu's present in the system
5075 * In systems capable of hotplug, this map could dynamically grow
5076 * as new cpu's are detected in the system via any platform specific
5077 * method, such as ACPI for e.g.
5078 */
5079
5080cpumask_t cpu_present_map __read_mostly;
5081EXPORT_SYMBOL(cpu_present_map);
5082
5083#ifndef CONFIG_SMP
5084cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
5085EXPORT_SYMBOL(cpu_online_map);
5086
5087cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
5088EXPORT_SYMBOL(cpu_possible_map);
5089#endif
5090
5091long sched_getaffinity(pid_t pid, cpumask_t *mask) 5348long sched_getaffinity(pid_t pid, cpumask_t *mask)
5092{ 5349{
5093 struct task_struct *p; 5350 struct task_struct *p;
@@ -5384,7 +5641,7 @@ out_unlock:
5384 return retval; 5641 return retval;
5385} 5642}
5386 5643
5387static const char stat_nam[] = "RSDTtZX"; 5644static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5388 5645
5389void sched_show_task(struct task_struct *p) 5646void sched_show_task(struct task_struct *p)
5390{ 5647{
@@ -5571,6 +5828,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5571 goto out; 5828 goto out;
5572 } 5829 }
5573 5830
5831 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5832 !cpus_equal(p->cpus_allowed, *new_mask))) {
5833 ret = -EINVAL;
5834 goto out;
5835 }
5836
5574 if (p->sched_class->set_cpus_allowed) 5837 if (p->sched_class->set_cpus_allowed)
5575 p->sched_class->set_cpus_allowed(p, new_mask); 5838 p->sched_class->set_cpus_allowed(p, new_mask);
5576 else { 5839 else {
@@ -5622,10 +5885,10 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5622 double_rq_lock(rq_src, rq_dest); 5885 double_rq_lock(rq_src, rq_dest);
5623 /* Already moved. */ 5886 /* Already moved. */
5624 if (task_cpu(p) != src_cpu) 5887 if (task_cpu(p) != src_cpu)
5625 goto out; 5888 goto done;
5626 /* Affinity changed (again). */ 5889 /* Affinity changed (again). */
5627 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 5890 if (!cpu_isset(dest_cpu, p->cpus_allowed))
5628 goto out; 5891 goto fail;
5629 5892
5630 on_rq = p->se.on_rq; 5893 on_rq = p->se.on_rq;
5631 if (on_rq) 5894 if (on_rq)
@@ -5636,8 +5899,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5636 activate_task(rq_dest, p, 0); 5899 activate_task(rq_dest, p, 0);
5637 check_preempt_curr(rq_dest, p); 5900 check_preempt_curr(rq_dest, p);
5638 } 5901 }
5902done:
5639 ret = 1; 5903 ret = 1;
5640out: 5904fail:
5641 double_rq_unlock(rq_src, rq_dest); 5905 double_rq_unlock(rq_src, rq_dest);
5642 return ret; 5906 return ret;
5643} 5907}
@@ -5887,6 +6151,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
5887 next = pick_next_task(rq, rq->curr); 6151 next = pick_next_task(rq, rq->curr);
5888 if (!next) 6152 if (!next)
5889 break; 6153 break;
6154 next->sched_class->put_prev_task(rq, next);
5890 migrate_dead(dead_cpu, next); 6155 migrate_dead(dead_cpu, next);
5891 6156
5892 } 6157 }
@@ -6058,6 +6323,36 @@ static void unregister_sched_domain_sysctl(void)
6058} 6323}
6059#endif 6324#endif
6060 6325
6326static void set_rq_online(struct rq *rq)
6327{
6328 if (!rq->online) {
6329 const struct sched_class *class;
6330
6331 cpu_set(rq->cpu, rq->rd->online);
6332 rq->online = 1;
6333
6334 for_each_class(class) {
6335 if (class->rq_online)
6336 class->rq_online(rq);
6337 }
6338 }
6339}
6340
6341static void set_rq_offline(struct rq *rq)
6342{
6343 if (rq->online) {
6344 const struct sched_class *class;
6345
6346 for_each_class(class) {
6347 if (class->rq_offline)
6348 class->rq_offline(rq);
6349 }
6350
6351 cpu_clear(rq->cpu, rq->rd->online);
6352 rq->online = 0;
6353 }
6354}
6355
6061/* 6356/*
6062 * migration_call - callback that gets triggered when a CPU is added. 6357 * migration_call - callback that gets triggered when a CPU is added.
6063 * Here we can start up the necessary migration thread for the new CPU. 6358 * Here we can start up the necessary migration thread for the new CPU.
@@ -6095,7 +6390,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6095 spin_lock_irqsave(&rq->lock, flags); 6390 spin_lock_irqsave(&rq->lock, flags);
6096 if (rq->rd) { 6391 if (rq->rd) {
6097 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6392 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6098 cpu_set(cpu, rq->rd->online); 6393
6394 set_rq_online(rq);
6099 } 6395 }
6100 spin_unlock_irqrestore(&rq->lock, flags); 6396 spin_unlock_irqrestore(&rq->lock, flags);
6101 break; 6397 break;
@@ -6156,7 +6452,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6156 spin_lock_irqsave(&rq->lock, flags); 6452 spin_lock_irqsave(&rq->lock, flags);
6157 if (rq->rd) { 6453 if (rq->rd) {
6158 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6454 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6159 cpu_clear(cpu, rq->rd->online); 6455 set_rq_offline(rq);
6160 } 6456 }
6161 spin_unlock_irqrestore(&rq->lock, flags); 6457 spin_unlock_irqrestore(&rq->lock, flags);
6162 break; 6458 break;
@@ -6190,6 +6486,28 @@ void __init migration_init(void)
6190 6486
6191#ifdef CONFIG_SCHED_DEBUG 6487#ifdef CONFIG_SCHED_DEBUG
6192 6488
6489static inline const char *sd_level_to_string(enum sched_domain_level lvl)
6490{
6491 switch (lvl) {
6492 case SD_LV_NONE:
6493 return "NONE";
6494 case SD_LV_SIBLING:
6495 return "SIBLING";
6496 case SD_LV_MC:
6497 return "MC";
6498 case SD_LV_CPU:
6499 return "CPU";
6500 case SD_LV_NODE:
6501 return "NODE";
6502 case SD_LV_ALLNODES:
6503 return "ALLNODES";
6504 case SD_LV_MAX:
6505 return "MAX";
6506
6507 }
6508 return "MAX";
6509}
6510
6193static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6511static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6194 cpumask_t *groupmask) 6512 cpumask_t *groupmask)
6195{ 6513{
@@ -6209,7 +6527,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6209 return -1; 6527 return -1;
6210 } 6528 }
6211 6529
6212 printk(KERN_CONT "span %s\n", str); 6530 printk(KERN_CONT "span %s level %s\n",
6531 str, sd_level_to_string(sd->level));
6213 6532
6214 if (!cpu_isset(cpu, sd->span)) { 6533 if (!cpu_isset(cpu, sd->span)) {
6215 printk(KERN_ERR "ERROR: domain->span does not contain " 6534 printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6293,9 +6612,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6293 } 6612 }
6294 kfree(groupmask); 6613 kfree(groupmask);
6295} 6614}
6296#else 6615#else /* !CONFIG_SCHED_DEBUG */
6297# define sched_domain_debug(sd, cpu) do { } while (0) 6616# define sched_domain_debug(sd, cpu) do { } while (0)
6298#endif 6617#endif /* CONFIG_SCHED_DEBUG */
6299 6618
6300static int sd_degenerate(struct sched_domain *sd) 6619static int sd_degenerate(struct sched_domain *sd)
6301{ 6620{
@@ -6355,20 +6674,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6355static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6674static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6356{ 6675{
6357 unsigned long flags; 6676 unsigned long flags;
6358 const struct sched_class *class;
6359 6677
6360 spin_lock_irqsave(&rq->lock, flags); 6678 spin_lock_irqsave(&rq->lock, flags);
6361 6679
6362 if (rq->rd) { 6680 if (rq->rd) {
6363 struct root_domain *old_rd = rq->rd; 6681 struct root_domain *old_rd = rq->rd;
6364 6682
6365 for (class = sched_class_highest; class; class = class->next) { 6683 if (cpu_isset(rq->cpu, old_rd->online))
6366 if (class->leave_domain) 6684 set_rq_offline(rq);
6367 class->leave_domain(rq);
6368 }
6369 6685
6370 cpu_clear(rq->cpu, old_rd->span); 6686 cpu_clear(rq->cpu, old_rd->span);
6371 cpu_clear(rq->cpu, old_rd->online);
6372 6687
6373 if (atomic_dec_and_test(&old_rd->refcount)) 6688 if (atomic_dec_and_test(&old_rd->refcount))
6374 kfree(old_rd); 6689 kfree(old_rd);
@@ -6379,12 +6694,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6379 6694
6380 cpu_set(rq->cpu, rd->span); 6695 cpu_set(rq->cpu, rd->span);
6381 if (cpu_isset(rq->cpu, cpu_online_map)) 6696 if (cpu_isset(rq->cpu, cpu_online_map))
6382 cpu_set(rq->cpu, rd->online); 6697 set_rq_online(rq);
6383
6384 for (class = sched_class_highest; class; class = class->next) {
6385 if (class->join_domain)
6386 class->join_domain(rq);
6387 }
6388 6698
6389 spin_unlock_irqrestore(&rq->lock, flags); 6699 spin_unlock_irqrestore(&rq->lock, flags);
6390} 6700}
@@ -6395,6 +6705,8 @@ static void init_rootdomain(struct root_domain *rd)
6395 6705
6396 cpus_clear(rd->span); 6706 cpus_clear(rd->span);
6397 cpus_clear(rd->online); 6707 cpus_clear(rd->online);
6708
6709 cpupri_init(&rd->cpupri);
6398} 6710}
6399 6711
6400static void init_defrootdomain(void) 6712static void init_defrootdomain(void)
@@ -6537,9 +6849,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
6537 6849
6538 min_val = INT_MAX; 6850 min_val = INT_MAX;
6539 6851
6540 for (i = 0; i < MAX_NUMNODES; i++) { 6852 for (i = 0; i < nr_node_ids; i++) {
6541 /* Start at @node */ 6853 /* Start at @node */
6542 n = (node + i) % MAX_NUMNODES; 6854 n = (node + i) % nr_node_ids;
6543 6855
6544 if (!nr_cpus_node(n)) 6856 if (!nr_cpus_node(n))
6545 continue; 6857 continue;
@@ -6589,7 +6901,7 @@ static void sched_domain_node_span(int node, cpumask_t *span)
6589 cpus_or(*span, *span, *nodemask); 6901 cpus_or(*span, *span, *nodemask);
6590 } 6902 }
6591} 6903}
6592#endif 6904#endif /* CONFIG_NUMA */
6593 6905
6594int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6906int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6595 6907
@@ -6608,7 +6920,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6608 *sg = &per_cpu(sched_group_cpus, cpu); 6920 *sg = &per_cpu(sched_group_cpus, cpu);
6609 return cpu; 6921 return cpu;
6610} 6922}
6611#endif 6923#endif /* CONFIG_SCHED_SMT */
6612 6924
6613/* 6925/*
6614 * multi-core sched-domains: 6926 * multi-core sched-domains:
@@ -6616,7 +6928,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6616#ifdef CONFIG_SCHED_MC 6928#ifdef CONFIG_SCHED_MC
6617static DEFINE_PER_CPU(struct sched_domain, core_domains); 6929static DEFINE_PER_CPU(struct sched_domain, core_domains);
6618static DEFINE_PER_CPU(struct sched_group, sched_group_core); 6930static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6619#endif 6931#endif /* CONFIG_SCHED_MC */
6620 6932
6621#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6933#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6622static int 6934static int
@@ -6718,7 +7030,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
6718 sg = sg->next; 7030 sg = sg->next;
6719 } while (sg != group_head); 7031 } while (sg != group_head);
6720} 7032}
6721#endif 7033#endif /* CONFIG_NUMA */
6722 7034
6723#ifdef CONFIG_NUMA 7035#ifdef CONFIG_NUMA
6724/* Free memory allocated for various sched_group structures */ 7036/* Free memory allocated for various sched_group structures */
@@ -6733,7 +7045,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
6733 if (!sched_group_nodes) 7045 if (!sched_group_nodes)
6734 continue; 7046 continue;
6735 7047
6736 for (i = 0; i < MAX_NUMNODES; i++) { 7048 for (i = 0; i < nr_node_ids; i++) {
6737 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7049 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6738 7050
6739 *nodemask = node_to_cpumask(i); 7051 *nodemask = node_to_cpumask(i);
@@ -6755,11 +7067,11 @@ next_sg:
6755 sched_group_nodes_bycpu[cpu] = NULL; 7067 sched_group_nodes_bycpu[cpu] = NULL;
6756 } 7068 }
6757} 7069}
6758#else 7070#else /* !CONFIG_NUMA */
6759static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 7071static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
6760{ 7072{
6761} 7073}
6762#endif 7074#endif /* CONFIG_NUMA */
6763 7075
6764/* 7076/*
6765 * Initialize sched groups cpu_power. 7077 * Initialize sched groups cpu_power.
@@ -6877,7 +7189,12 @@ static int default_relax_domain_level = -1;
6877 7189
6878static int __init setup_relax_domain_level(char *str) 7190static int __init setup_relax_domain_level(char *str)
6879{ 7191{
6880 default_relax_domain_level = simple_strtoul(str, NULL, 0); 7192 unsigned long val;
7193
7194 val = simple_strtoul(str, NULL, 0);
7195 if (val < SD_LV_MAX)
7196 default_relax_domain_level = val;
7197
6881 return 1; 7198 return 1;
6882} 7199}
6883__setup("relax_domain_level=", setup_relax_domain_level); 7200__setup("relax_domain_level=", setup_relax_domain_level);
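
Reviewer note: the parser change above only commits the boot value when it is below SD_LV_MAX, so an out-of-range relax_domain_level= argument is ignored instead of being stored. A standalone sketch of the same validate-before-assign parsing pattern (MAX_LEVEL and the -1 default are placeholders for SD_LV_MAX and the kernel's default_relax_domain_level):

#include <stdio.h>
#include <stdlib.h>

#define MAX_LEVEL 10		/* placeholder for SD_LV_MAX */

static int default_level = -1;	/* -1 means "not set on the command line" */

/* Parse the argument but only commit it when it is in range. */
static int setup_level(const char *str)
{
	unsigned long val = strtoul(str, NULL, 0);

	if (val < MAX_LEVEL)
		default_level = (int)val;

	return 1;	/* __setup() handlers return 1 to consume the option */
}

int main(void)
{
	setup_level("3");			/* in range: accepted */
	printf("level = %d\n", default_level);	/* 3 */
	setup_level("99");			/* out of range: ignored */
	printf("level = %d\n", default_level);	/* still 3 */
	return 0;
}
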
@@ -6921,7 +7238,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
6921 /* 7238 /*
6922 * Allocate the per-node list of sched groups 7239 * Allocate the per-node list of sched groups
6923 */ 7240 */
6924 sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), 7241 sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
6925 GFP_KERNEL); 7242 GFP_KERNEL);
6926 if (!sched_group_nodes) { 7243 if (!sched_group_nodes) {
6927 printk(KERN_WARNING "Can not alloc sched group node list\n"); 7244 printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -7060,7 +7377,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7060#endif 7377#endif
7061 7378
7062 /* Set up physical groups */ 7379 /* Set up physical groups */
7063 for (i = 0; i < MAX_NUMNODES; i++) { 7380 for (i = 0; i < nr_node_ids; i++) {
7064 SCHED_CPUMASK_VAR(nodemask, allmasks); 7381 SCHED_CPUMASK_VAR(nodemask, allmasks);
7065 SCHED_CPUMASK_VAR(send_covered, allmasks); 7382 SCHED_CPUMASK_VAR(send_covered, allmasks);
7066 7383
@@ -7084,7 +7401,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7084 send_covered, tmpmask); 7401 send_covered, tmpmask);
7085 } 7402 }
7086 7403
7087 for (i = 0; i < MAX_NUMNODES; i++) { 7404 for (i = 0; i < nr_node_ids; i++) {
7088 /* Set up node groups */ 7405 /* Set up node groups */
7089 struct sched_group *sg, *prev; 7406 struct sched_group *sg, *prev;
7090 SCHED_CPUMASK_VAR(nodemask, allmasks); 7407 SCHED_CPUMASK_VAR(nodemask, allmasks);
@@ -7123,9 +7440,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7123 cpus_or(*covered, *covered, *nodemask); 7440 cpus_or(*covered, *covered, *nodemask);
7124 prev = sg; 7441 prev = sg;
7125 7442
7126 for (j = 0; j < MAX_NUMNODES; j++) { 7443 for (j = 0; j < nr_node_ids; j++) {
7127 SCHED_CPUMASK_VAR(notcovered, allmasks); 7444 SCHED_CPUMASK_VAR(notcovered, allmasks);
7128 int n = (i + j) % MAX_NUMNODES; 7445 int n = (i + j) % nr_node_ids;
7129 node_to_cpumask_ptr(pnodemask, n); 7446 node_to_cpumask_ptr(pnodemask, n);
7130 7447
7131 cpus_complement(*notcovered, *covered); 7448 cpus_complement(*notcovered, *covered);
@@ -7178,7 +7495,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7178 } 7495 }
7179 7496
7180#ifdef CONFIG_NUMA 7497#ifdef CONFIG_NUMA
7181 for (i = 0; i < MAX_NUMNODES; i++) 7498 for (i = 0; i < nr_node_ids; i++)
7182 init_numa_sched_groups_power(sched_group_nodes[i]); 7499 init_numa_sched_groups_power(sched_group_nodes[i]);
7183 7500
7184 if (sd_allnodes) { 7501 if (sd_allnodes) {
@@ -7236,6 +7553,18 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
7236} 7553}
7237 7554
7238/* 7555/*
7556 * Free current domain masks.
7557 * Called after all cpus are attached to NULL domain.
7558 */
7559static void free_sched_domains(void)
7560{
7561 ndoms_cur = 0;
7562 if (doms_cur != &fallback_doms)
7563 kfree(doms_cur);
7564 doms_cur = &fallback_doms;
7565}
7566
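
Reviewer note: the new free_sched_domains() helper above resets the domain-mask bookkeeping: the array is only freed when it was dynamically allocated, and the pointer always falls back to the static fallback_doms. A standalone sketch of that "free unless it is the static fallback" idiom (the names mirror the kernel ones, the types are simplified and free() stands in for kfree()):

#include <stdlib.h>

typedef unsigned long cpumask_t;	/* simplified stand-in */

static cpumask_t fallback_doms;		/* static storage, never freed */
static cpumask_t *doms_cur = &fallback_doms;
static int ndoms_cur;

/* Drop the current domain masks; safe whether or not doms_cur was
 * ever replaced by a dynamically allocated array. */
static void free_sched_domains(void)
{
	ndoms_cur = 0;
	if (doms_cur != &fallback_doms)
		free(doms_cur);
	doms_cur = &fallback_doms;
}

int main(void)
{
	/* Pretend a rebuild installed a dynamically allocated array. */
	doms_cur = calloc(4, sizeof(*doms_cur));
	ndoms_cur = 4;

	free_sched_domains();	/* frees the array ... */
	free_sched_domains();	/* ... and is harmless to call again */
	return 0;
}
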
7567/*
7239 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 7568 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
7240 * For now this just excludes isolated cpus, but could be used to 7569 * For now this just excludes isolated cpus, but could be used to
7241 * exclude other special cases in the future. 7570 * exclude other special cases in the future.
@@ -7382,6 +7711,7 @@ int arch_reinit_sched_domains(void)
7382 get_online_cpus(); 7711 get_online_cpus();
7383 mutex_lock(&sched_domains_mutex); 7712 mutex_lock(&sched_domains_mutex);
7384 detach_destroy_domains(&cpu_online_map); 7713 detach_destroy_domains(&cpu_online_map);
7714 free_sched_domains();
7385 err = arch_init_sched_domains(&cpu_online_map); 7715 err = arch_init_sched_domains(&cpu_online_map);
7386 mutex_unlock(&sched_domains_mutex); 7716 mutex_unlock(&sched_domains_mutex);
7387 put_online_cpus(); 7717 put_online_cpus();
@@ -7450,7 +7780,7 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7450#endif 7780#endif
7451 return err; 7781 return err;
7452} 7782}
7453#endif 7783#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7454 7784
7455/* 7785/*
7456 * Force a reinitialization of the sched domains hierarchy. The domains 7786 * Force a reinitialization of the sched domains hierarchy. The domains
@@ -7461,20 +7791,28 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7461static int update_sched_domains(struct notifier_block *nfb, 7791static int update_sched_domains(struct notifier_block *nfb,
7462 unsigned long action, void *hcpu) 7792 unsigned long action, void *hcpu)
7463{ 7793{
7794 int cpu = (int)(long)hcpu;
7795
7464 switch (action) { 7796 switch (action) {
7465 case CPU_UP_PREPARE:
7466 case CPU_UP_PREPARE_FROZEN:
7467 case CPU_DOWN_PREPARE: 7797 case CPU_DOWN_PREPARE:
7468 case CPU_DOWN_PREPARE_FROZEN: 7798 case CPU_DOWN_PREPARE_FROZEN:
7799 disable_runtime(cpu_rq(cpu));
7800 /* fall-through */
7801 case CPU_UP_PREPARE:
7802 case CPU_UP_PREPARE_FROZEN:
7469 detach_destroy_domains(&cpu_online_map); 7803 detach_destroy_domains(&cpu_online_map);
7804 free_sched_domains();
7470 return NOTIFY_OK; 7805 return NOTIFY_OK;
7471 7806
7472 case CPU_UP_CANCELED: 7807
7473 case CPU_UP_CANCELED_FROZEN:
7474 case CPU_DOWN_FAILED: 7808 case CPU_DOWN_FAILED:
7475 case CPU_DOWN_FAILED_FROZEN: 7809 case CPU_DOWN_FAILED_FROZEN:
7476 case CPU_ONLINE: 7810 case CPU_ONLINE:
7477 case CPU_ONLINE_FROZEN: 7811 case CPU_ONLINE_FROZEN:
7812 enable_runtime(cpu_rq(cpu));
7813 /* fall-through */
7814 case CPU_UP_CANCELED:
7815 case CPU_UP_CANCELED_FROZEN:
7478 case CPU_DEAD: 7816 case CPU_DEAD:
7479 case CPU_DEAD_FROZEN: 7817 case CPU_DEAD_FROZEN:
7480 /* 7818 /*
@@ -7485,8 +7823,16 @@ static int update_sched_domains(struct notifier_block *nfb,
7485 return NOTIFY_DONE; 7823 return NOTIFY_DONE;
7486 } 7824 }
7487 7825
7826#ifndef CONFIG_CPUSETS
7827 /*
7828 * Create default domain partitioning if cpusets are disabled.
7829 * Otherwise we let cpusets rebuild the domains based on the
7830 * current setup.
7831 */
7832
7488 /* The hotplug lock is already held by cpu_up/cpu_down */ 7833 /* The hotplug lock is already held by cpu_up/cpu_down */
7489 arch_init_sched_domains(&cpu_online_map); 7834 arch_init_sched_domains(&cpu_online_map);
7835#endif
7490 7836
7491 return NOTIFY_OK; 7837 return NOTIFY_OK;
7492} 7838}
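
Reviewer note: the reworked notifier above splits the hotplug cases into a "going down" path that calls disable_runtime() and a "coming back" path that calls enable_runtime(), each falling through into the shared detach-and-rebuild work, with the cpu recovered from hcpu via the usual (int)(long) cast. A standalone sketch of that fall-through dispatch shape (the event names and the three helpers are stand-ins, not the kernel notifier API):

#include <stdio.h>

enum hp_event { DOWN_PREPARE, UP_PREPARE, DOWN_FAILED, ONLINE, UP_CANCELED };

static void disable_runtime(int cpu) { printf("disable runtime on cpu %d\n", cpu); }
static void enable_runtime(int cpu)  { printf("enable runtime on cpu %d\n", cpu); }
static void rebuild_domains(void)    { printf("detach + rebuild domains\n"); }

/* Mirror of the notifier shape: per-direction side effect first, then
 * deliberate fall-through into the common domain-rebuild work. */
static int hotplug_notify(enum hp_event ev, void *hcpu)
{
	int cpu = (int)(long)hcpu;

	switch (ev) {
	case DOWN_PREPARE:
		disable_runtime(cpu);
		/* fall through */
	case UP_PREPARE:
		rebuild_domains();
		return 0;

	case DOWN_FAILED:
	case ONLINE:
		enable_runtime(cpu);
		/* fall through */
	case UP_CANCELED:
		rebuild_domains();
		return 0;
	}
	return -1;
}

int main(void)
{
	hotplug_notify(DOWN_PREPARE, (void *)2L);
	hotplug_notify(ONLINE, (void *)2L);
	return 0;
}
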
@@ -7626,7 +7972,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7626 else 7972 else
7627 rt_se->rt_rq = parent->my_q; 7973 rt_se->rt_rq = parent->my_q;
7628 7974
7629 rt_se->rt_rq = &rq->rt;
7630 rt_se->my_q = rt_rq; 7975 rt_se->my_q = rt_rq;
7631 rt_se->parent = parent; 7976 rt_se->parent = parent;
7632 INIT_LIST_HEAD(&rt_se->run_list); 7977 INIT_LIST_HEAD(&rt_se->run_list);
@@ -7667,8 +8012,8 @@ void __init sched_init(void)
7667 8012
7668 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 8013 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7669 ptr += nr_cpu_ids * sizeof(void **); 8014 ptr += nr_cpu_ids * sizeof(void **);
7670#endif 8015#endif /* CONFIG_USER_SCHED */
7671#endif 8016#endif /* CONFIG_FAIR_GROUP_SCHED */
7672#ifdef CONFIG_RT_GROUP_SCHED 8017#ifdef CONFIG_RT_GROUP_SCHED
7673 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 8018 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
7674 ptr += nr_cpu_ids * sizeof(void **); 8019 ptr += nr_cpu_ids * sizeof(void **);
@@ -7682,8 +8027,8 @@ void __init sched_init(void)
7682 8027
7683 root_task_group.rt_rq = (struct rt_rq **)ptr; 8028 root_task_group.rt_rq = (struct rt_rq **)ptr;
7684 ptr += nr_cpu_ids * sizeof(void **); 8029 ptr += nr_cpu_ids * sizeof(void **);
7685#endif 8030#endif /* CONFIG_USER_SCHED */
7686#endif 8031#endif /* CONFIG_RT_GROUP_SCHED */
7687 } 8032 }
7688 8033
7689#ifdef CONFIG_SMP 8034#ifdef CONFIG_SMP
@@ -7699,8 +8044,8 @@ void __init sched_init(void)
7699#ifdef CONFIG_USER_SCHED 8044#ifdef CONFIG_USER_SCHED
7700 init_rt_bandwidth(&root_task_group.rt_bandwidth, 8045 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7701 global_rt_period(), RUNTIME_INF); 8046 global_rt_period(), RUNTIME_INF);
7702#endif 8047#endif /* CONFIG_USER_SCHED */
7703#endif 8048#endif /* CONFIG_RT_GROUP_SCHED */
7704 8049
7705#ifdef CONFIG_GROUP_SCHED 8050#ifdef CONFIG_GROUP_SCHED
7706 list_add(&init_task_group.list, &task_groups); 8051 list_add(&init_task_group.list, &task_groups);
@@ -7710,8 +8055,8 @@ void __init sched_init(void)
7710 INIT_LIST_HEAD(&root_task_group.children); 8055 INIT_LIST_HEAD(&root_task_group.children);
7711 init_task_group.parent = &root_task_group; 8056 init_task_group.parent = &root_task_group;
7712 list_add(&init_task_group.siblings, &root_task_group.children); 8057 list_add(&init_task_group.siblings, &root_task_group.children);
7713#endif 8058#endif /* CONFIG_USER_SCHED */
7714#endif 8059#endif /* CONFIG_GROUP_SCHED */
7715 8060
7716 for_each_possible_cpu(i) { 8061 for_each_possible_cpu(i) {
7717 struct rq *rq; 8062 struct rq *rq;
@@ -7791,6 +8136,7 @@ void __init sched_init(void)
7791 rq->next_balance = jiffies; 8136 rq->next_balance = jiffies;
7792 rq->push_cpu = 0; 8137 rq->push_cpu = 0;
7793 rq->cpu = i; 8138 rq->cpu = i;
8139 rq->online = 0;
7794 rq->migration_thread = NULL; 8140 rq->migration_thread = NULL;
7795 INIT_LIST_HEAD(&rq->migration_queue); 8141 INIT_LIST_HEAD(&rq->migration_queue);
7796 rq_attach_root(rq, &def_root_domain); 8142 rq_attach_root(rq, &def_root_domain);
@@ -8030,7 +8376,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8030{ 8376{
8031 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8377 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
8032} 8378}
8033#else 8379#else /* !CONFIG_FAIR_GROUP_SCHED */
8034static inline void free_fair_sched_group(struct task_group *tg) 8380static inline void free_fair_sched_group(struct task_group *tg)
8035{ 8381{
8036} 8382}
@@ -8048,7 +8394,7 @@ static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8048static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8394static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8049{ 8395{
8050} 8396}
8051#endif 8397#endif /* CONFIG_FAIR_GROUP_SCHED */
8052 8398
8053#ifdef CONFIG_RT_GROUP_SCHED 8399#ifdef CONFIG_RT_GROUP_SCHED
8054static void free_rt_sched_group(struct task_group *tg) 8400static void free_rt_sched_group(struct task_group *tg)
@@ -8119,7 +8465,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8119{ 8465{
8120 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); 8466 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8121} 8467}
8122#else 8468#else /* !CONFIG_RT_GROUP_SCHED */
8123static inline void free_rt_sched_group(struct task_group *tg) 8469static inline void free_rt_sched_group(struct task_group *tg)
8124{ 8470{
8125} 8471}
@@ -8137,7 +8483,7 @@ static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8137static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 8483static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8138{ 8484{
8139} 8485}
8140#endif 8486#endif /* CONFIG_RT_GROUP_SCHED */
8141 8487
8142#ifdef CONFIG_GROUP_SCHED 8488#ifdef CONFIG_GROUP_SCHED
8143static void free_sched_group(struct task_group *tg) 8489static void free_sched_group(struct task_group *tg)
@@ -8248,17 +8594,14 @@ void sched_move_task(struct task_struct *tsk)
8248 8594
8249 task_rq_unlock(rq, &flags); 8595 task_rq_unlock(rq, &flags);
8250} 8596}
8251#endif 8597#endif /* CONFIG_GROUP_SCHED */
8252 8598
8253#ifdef CONFIG_FAIR_GROUP_SCHED 8599#ifdef CONFIG_FAIR_GROUP_SCHED
8254static void set_se_shares(struct sched_entity *se, unsigned long shares) 8600static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8255{ 8601{
8256 struct cfs_rq *cfs_rq = se->cfs_rq; 8602 struct cfs_rq *cfs_rq = se->cfs_rq;
8257 struct rq *rq = cfs_rq->rq;
8258 int on_rq; 8603 int on_rq;
8259 8604
8260 spin_lock_irq(&rq->lock);
8261
8262 on_rq = se->on_rq; 8605 on_rq = se->on_rq;
8263 if (on_rq) 8606 if (on_rq)
8264 dequeue_entity(cfs_rq, se, 0); 8607 dequeue_entity(cfs_rq, se, 0);
@@ -8268,8 +8611,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
8268 8611
8269 if (on_rq) 8612 if (on_rq)
8270 enqueue_entity(cfs_rq, se, 0); 8613 enqueue_entity(cfs_rq, se, 0);
8614}
8271 8615
8272 spin_unlock_irq(&rq->lock); 8616static void set_se_shares(struct sched_entity *se, unsigned long shares)
8617{
8618 struct cfs_rq *cfs_rq = se->cfs_rq;
8619 struct rq *rq = cfs_rq->rq;
8620 unsigned long flags;
8621
8622 spin_lock_irqsave(&rq->lock, flags);
8623 __set_se_shares(se, shares);
8624 spin_unlock_irqrestore(&rq->lock, flags);
8273} 8625}
8274 8626
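
Reviewer note: set_se_shares() is split above into a lock-free __set_se_shares() and a thin wrapper that takes rq->lock with spin_lock_irqsave(), so callers that already hold the runqueue lock can call the double-underscore variant directly. A standalone sketch of that locked/lock-free helper pair, using a pthread mutex as a stand-in for the runqueue spinlock:

#include <pthread.h>
#include <stdio.h>

struct entity {
	pthread_mutex_t lock;	/* stand-in for the owning rq->lock */
	unsigned long weight;
};

/* Lock-free worker: caller must already hold e->lock. */
static void __set_shares(struct entity *e, unsigned long shares)
{
	/* dequeue / update / re-enqueue would happen here in the scheduler */
	e->weight = shares;
}

/* Locking wrapper for callers that do not hold the lock yet. */
static void set_shares(struct entity *e, unsigned long shares)
{
	pthread_mutex_lock(&e->lock);	/* spin_lock_irqsave() in the kernel */
	__set_shares(e, shares);
	pthread_mutex_unlock(&e->lock);	/* spin_unlock_irqrestore() */
}

int main(void)
{
	struct entity e = { PTHREAD_MUTEX_INITIALIZER, 1024 };

	set_shares(&e, 2048);
	printf("weight = %lu\n", e.weight);
	return 0;
}
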
8275static DEFINE_MUTEX(shares_mutex); 8627static DEFINE_MUTEX(shares_mutex);
@@ -8308,8 +8660,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8308 * w/o tripping rebalance_share or load_balance_fair. 8660 * w/o tripping rebalance_share or load_balance_fair.
8309 */ 8661 */
8310 tg->shares = shares; 8662 tg->shares = shares;
8311 for_each_possible_cpu(i) 8663 for_each_possible_cpu(i) {
8664 /*
8665 * force a rebalance
8666 */
8667 cfs_rq_set_shares(tg->cfs_rq[i], 0);
8312 set_se_shares(tg->se[i], shares); 8668 set_se_shares(tg->se[i], shares);
8669 }
8313 8670
8314 /* 8671 /*
8315 * Enable load balance activity on this group, by inserting it back on 8672 * Enable load balance activity on this group, by inserting it back on
@@ -8372,7 +8729,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8372 } 8729 }
8373 rcu_read_unlock(); 8730 rcu_read_unlock();
8374 8731
8375 return total + to_ratio(period, runtime) < 8732 return total + to_ratio(period, runtime) <=
8376 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), 8733 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
8377 parent->rt_bandwidth.rt_runtime); 8734 parent->rt_bandwidth.rt_runtime);
8378} 8735}
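
Reviewer note: relaxing the comparison above from < to <= lets child groups reserve exactly the bandwidth the parent has left rather than strictly less. The check compares fixed-point period/runtime ratios; here is a small worked sketch with a hypothetical ratio() helper (the real to_ratio() scaling factor is not shown in this hunk, so the 16-bit shift is an assumption used only for illustration):

#include <stdio.h>

typedef unsigned long long u64;

/* Hypothetical stand-in for to_ratio(): runtime/period in 16-bit fixed point. */
static u64 ratio(u64 period_ns, u64 runtime_ns)
{
	return (runtime_ns << 16) / period_ns;
}

int main(void)
{
	u64 parent  = ratio(1000000000ULL, 950000000ULL);	/* 0.95 of a cpu */
	u64 child_a = ratio(1000000000ULL, 500000000ULL);	/* 0.50 */
	u64 child_b = ratio(1000000000ULL, 450000000ULL);	/* 0.45 */
	u64 total   = child_a + child_b;	/* sums to the parent's ratio */

	/* With '<' this exact fit is rejected; '<=' accepts it. */
	printf("strict  <  : %s\n", total <  parent ? "ok" : "rejected");
	printf("relaxed <= : %s\n", total <= parent ? "ok" : "rejected");
	return 0;
}
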
@@ -8475,6 +8832,9 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8475 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 8832 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
8476 rt_runtime = tg->rt_bandwidth.rt_runtime; 8833 rt_runtime = tg->rt_bandwidth.rt_runtime;
8477 8834
8835 if (rt_period == 0)
8836 return -EINVAL;
8837
8478 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8838 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8479} 8839}
8480 8840
@@ -8489,16 +8849,21 @@ long sched_group_rt_period(struct task_group *tg)
8489 8849
8490static int sched_rt_global_constraints(void) 8850static int sched_rt_global_constraints(void)
8491{ 8851{
8852 struct task_group *tg = &root_task_group;
8853 u64 rt_runtime, rt_period;
8492 int ret = 0; 8854 int ret = 0;
8493 8855
8856 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8857 rt_runtime = tg->rt_bandwidth.rt_runtime;
8858
8494 mutex_lock(&rt_constraints_mutex); 8859 mutex_lock(&rt_constraints_mutex);
8495 if (!__rt_schedulable(NULL, 1, 0)) 8860 if (!__rt_schedulable(tg, rt_period, rt_runtime))
8496 ret = -EINVAL; 8861 ret = -EINVAL;
8497 mutex_unlock(&rt_constraints_mutex); 8862 mutex_unlock(&rt_constraints_mutex);
8498 8863
8499 return ret; 8864 return ret;
8500} 8865}
8501#else 8866#else /* !CONFIG_RT_GROUP_SCHED */
8502static int sched_rt_global_constraints(void) 8867static int sched_rt_global_constraints(void)
8503{ 8868{
8504 unsigned long flags; 8869 unsigned long flags;
@@ -8516,7 +8881,7 @@ static int sched_rt_global_constraints(void)
8516 8881
8517 return 0; 8882 return 0;
8518} 8883}
8519#endif 8884#endif /* CONFIG_RT_GROUP_SCHED */
8520 8885
8521int sched_rt_handler(struct ctl_table *table, int write, 8886int sched_rt_handler(struct ctl_table *table, int write,
8522 struct file *filp, void __user *buffer, size_t *lenp, 8887 struct file *filp, void __user *buffer, size_t *lenp,
@@ -8624,7 +8989,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8624 8989
8625 return (u64) tg->shares; 8990 return (u64) tg->shares;
8626} 8991}
8627#endif 8992#endif /* CONFIG_FAIR_GROUP_SCHED */
8628 8993
8629#ifdef CONFIG_RT_GROUP_SCHED 8994#ifdef CONFIG_RT_GROUP_SCHED
8630static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 8995static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
@@ -8648,7 +9013,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
8648{ 9013{
8649 return sched_group_rt_period(cgroup_tg(cgrp)); 9014 return sched_group_rt_period(cgroup_tg(cgrp));
8650} 9015}
8651#endif 9016#endif /* CONFIG_RT_GROUP_SCHED */
8652 9017
8653static struct cftype cpu_files[] = { 9018static struct cftype cpu_files[] = {
8654#ifdef CONFIG_FAIR_GROUP_SCHED 9019#ifdef CONFIG_FAIR_GROUP_SCHED