diff options
Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 855 |
1 files changed, 597 insertions, 258 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index 56958359d20c..99e6d850ecab 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -70,10 +70,13 @@ | |||
70 | #include <linux/bootmem.h> | 70 | #include <linux/bootmem.h> |
71 | #include <linux/debugfs.h> | 71 | #include <linux/debugfs.h> |
72 | #include <linux/ctype.h> | 72 | #include <linux/ctype.h> |
73 | #include <linux/ftrace.h> | ||
73 | 74 | ||
74 | #include <asm/tlb.h> | 75 | #include <asm/tlb.h> |
75 | #include <asm/irq_regs.h> | 76 | #include <asm/irq_regs.h> |
76 | 77 | ||
78 | #include "sched_cpupri.h" | ||
79 | |||
77 | /* | 80 | /* |
78 | * Convert user-nice values [ -20 ... 0 ... 19 ] | 81 | * Convert user-nice values [ -20 ... 0 ... 19 ] |
79 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], | 82 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], |
@@ -289,15 +292,15 @@ struct task_group root_task_group; | |||
289 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | 292 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); |
290 | /* Default task group's cfs_rq on each cpu */ | 293 | /* Default task group's cfs_rq on each cpu */ |
291 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | 294 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; |
292 | #endif | 295 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
293 | 296 | ||
294 | #ifdef CONFIG_RT_GROUP_SCHED | 297 | #ifdef CONFIG_RT_GROUP_SCHED |
295 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | 298 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); |
296 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | 299 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; |
297 | #endif | 300 | #endif /* CONFIG_RT_GROUP_SCHED */ |
298 | #else | 301 | #else /* !CONFIG_FAIR_GROUP_SCHED */ |
299 | #define root_task_group init_task_group | 302 | #define root_task_group init_task_group |
300 | #endif | 303 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
301 | 304 | ||
302 | /* task_group_lock serializes add/remove of task groups and also changes to | 305 | /* task_group_lock serializes add/remove of task groups and also changes to |
303 | * a task group's cpu shares. | 306 | * a task group's cpu shares. |
@@ -307,9 +310,9 @@ static DEFINE_SPINLOCK(task_group_lock); | |||
307 | #ifdef CONFIG_FAIR_GROUP_SCHED | 310 | #ifdef CONFIG_FAIR_GROUP_SCHED |
308 | #ifdef CONFIG_USER_SCHED | 311 | #ifdef CONFIG_USER_SCHED |
309 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) | 312 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) |
310 | #else | 313 | #else /* !CONFIG_USER_SCHED */ |
311 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD | 314 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD |
312 | #endif | 315 | #endif /* CONFIG_USER_SCHED */ |
313 | 316 | ||
314 | /* | 317 | /* |
315 | * A weight of 0 or 1 can cause arithmetics problems. | 318 | * A weight of 0 or 1 can cause arithmetics problems. |
@@ -363,6 +366,10 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) | |||
363 | #else | 366 | #else |
364 | 367 | ||
365 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } | 368 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } |
369 | static inline struct task_group *task_group(struct task_struct *p) | ||
370 | { | ||
371 | return NULL; | ||
372 | } | ||
366 | 373 | ||
367 | #endif /* CONFIG_GROUP_SCHED */ | 374 | #endif /* CONFIG_GROUP_SCHED */ |
368 | 375 | ||
@@ -373,6 +380,7 @@ struct cfs_rq { | |||
373 | 380 | ||
374 | u64 exec_clock; | 381 | u64 exec_clock; |
375 | u64 min_vruntime; | 382 | u64 min_vruntime; |
383 | u64 pair_start; | ||
376 | 384 | ||
377 | struct rb_root tasks_timeline; | 385 | struct rb_root tasks_timeline; |
378 | struct rb_node *rb_leftmost; | 386 | struct rb_node *rb_leftmost; |
@@ -401,6 +409,31 @@ struct cfs_rq { | |||
401 | */ | 409 | */ |
402 | struct list_head leaf_cfs_rq_list; | 410 | struct list_head leaf_cfs_rq_list; |
403 | struct task_group *tg; /* group that "owns" this runqueue */ | 411 | struct task_group *tg; /* group that "owns" this runqueue */ |
412 | |||
413 | #ifdef CONFIG_SMP | ||
414 | /* | ||
415 | * the part of load.weight contributed by tasks | ||
416 | */ | ||
417 | unsigned long task_weight; | ||
418 | |||
419 | /* | ||
420 | * h_load = weight * f(tg) | ||
421 | * | ||
422 | * Where f(tg) is the recursive weight fraction assigned to | ||
423 | * this group. | ||
424 | */ | ||
425 | unsigned long h_load; | ||
426 | |||
427 | /* | ||
428 | * this cpu's part of tg->shares | ||
429 | */ | ||
430 | unsigned long shares; | ||
431 | |||
432 | /* | ||
433 | * load.weight at the time we set shares | ||
434 | */ | ||
435 | unsigned long rq_weight; | ||
436 | #endif | ||
404 | #endif | 437 | #endif |
405 | }; | 438 | }; |
406 | 439 | ||
@@ -452,6 +485,9 @@ struct root_domain { | |||
452 | */ | 485 | */ |
453 | cpumask_t rto_mask; | 486 | cpumask_t rto_mask; |
454 | atomic_t rto_count; | 487 | atomic_t rto_count; |
488 | #ifdef CONFIG_SMP | ||
489 | struct cpupri cpupri; | ||
490 | #endif | ||
455 | }; | 491 | }; |
456 | 492 | ||
457 | /* | 493 | /* |
@@ -526,6 +562,9 @@ struct rq { | |||
526 | int push_cpu; | 562 | int push_cpu; |
527 | /* cpu of this runqueue: */ | 563 | /* cpu of this runqueue: */ |
528 | int cpu; | 564 | int cpu; |
565 | int online; | ||
566 | |||
567 | unsigned long avg_load_per_task; | ||
529 | 568 | ||
530 | struct task_struct *migration_thread; | 569 | struct task_struct *migration_thread; |
531 | struct list_head migration_queue; | 570 | struct list_head migration_queue; |
@@ -607,6 +646,24 @@ static inline void update_rq_clock(struct rq *rq) | |||
607 | # define const_debug static const | 646 | # define const_debug static const |
608 | #endif | 647 | #endif |
609 | 648 | ||
649 | /** | ||
650 | * runqueue_is_locked | ||
651 | * | ||
652 | * Returns true if the current cpu runqueue is locked. | ||
653 | * This interface allows printk to be called with the runqueue lock | ||
654 | * held and know whether or not it is OK to wake up the klogd. | ||
655 | */ | ||
656 | int runqueue_is_locked(void) | ||
657 | { | ||
658 | int cpu = get_cpu(); | ||
659 | struct rq *rq = cpu_rq(cpu); | ||
660 | int ret; | ||
661 | |||
662 | ret = spin_is_locked(&rq->lock); | ||
663 | put_cpu(); | ||
664 | return ret; | ||
665 | } | ||
666 | |||
610 | /* | 667 | /* |
611 | * Debugging: various feature bits | 668 | * Debugging: various feature bits |
612 | */ | 669 | */ |
@@ -749,6 +806,12 @@ late_initcall(sched_init_debug); | |||
749 | const_debug unsigned int sysctl_sched_nr_migrate = 32; | 806 | const_debug unsigned int sysctl_sched_nr_migrate = 32; |
750 | 807 | ||
751 | /* | 808 | /* |
809 | * ratelimit for updating the group shares. | ||
810 | * default: 0.5ms | ||
811 | */ | ||
812 | const_debug unsigned int sysctl_sched_shares_ratelimit = 500000; | ||
813 | |||
814 | /* | ||
752 | * period over which we measure -rt task cpu usage in us. | 815 | * period over which we measure -rt task cpu usage in us. |
753 | * default: 1s | 816 | * default: 1s |
754 | */ | 817 | */ |
@@ -775,82 +838,6 @@ static inline u64 global_rt_runtime(void) | |||
775 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | 838 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; |
776 | } | 839 | } |
777 | 840 | ||
778 | unsigned long long time_sync_thresh = 100000; | ||
779 | |||
780 | static DEFINE_PER_CPU(unsigned long long, time_offset); | ||
781 | static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); | ||
782 | |||
783 | /* | ||
784 | * Global lock which we take every now and then to synchronize | ||
785 | * the CPUs time. This method is not warp-safe, but it's good | ||
786 | * enough to synchronize slowly diverging time sources and thus | ||
787 | * it's good enough for tracing: | ||
788 | */ | ||
789 | static DEFINE_SPINLOCK(time_sync_lock); | ||
790 | static unsigned long long prev_global_time; | ||
791 | |||
792 | static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu) | ||
793 | { | ||
794 | /* | ||
795 | * We want this inlined, to not get tracer function calls | ||
796 | * in this critical section: | ||
797 | */ | ||
798 | spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_); | ||
799 | __raw_spin_lock(&time_sync_lock.raw_lock); | ||
800 | |||
801 | if (time < prev_global_time) { | ||
802 | per_cpu(time_offset, cpu) += prev_global_time - time; | ||
803 | time = prev_global_time; | ||
804 | } else { | ||
805 | prev_global_time = time; | ||
806 | } | ||
807 | |||
808 | __raw_spin_unlock(&time_sync_lock.raw_lock); | ||
809 | spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_); | ||
810 | |||
811 | return time; | ||
812 | } | ||
813 | |||
814 | static unsigned long long __cpu_clock(int cpu) | ||
815 | { | ||
816 | unsigned long long now; | ||
817 | |||
818 | /* | ||
819 | * Only call sched_clock() if the scheduler has already been | ||
820 | * initialized (some code might call cpu_clock() very early): | ||
821 | */ | ||
822 | if (unlikely(!scheduler_running)) | ||
823 | return 0; | ||
824 | |||
825 | now = sched_clock_cpu(cpu); | ||
826 | |||
827 | return now; | ||
828 | } | ||
829 | |||
830 | /* | ||
831 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | ||
832 | * clock constructed from sched_clock(): | ||
833 | */ | ||
834 | unsigned long long cpu_clock(int cpu) | ||
835 | { | ||
836 | unsigned long long prev_cpu_time, time, delta_time; | ||
837 | unsigned long flags; | ||
838 | |||
839 | local_irq_save(flags); | ||
840 | prev_cpu_time = per_cpu(prev_cpu_time, cpu); | ||
841 | time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); | ||
842 | delta_time = time-prev_cpu_time; | ||
843 | |||
844 | if (unlikely(delta_time > time_sync_thresh)) { | ||
845 | time = __sync_cpu_clock(time, cpu); | ||
846 | per_cpu(prev_cpu_time, cpu) = time; | ||
847 | } | ||
848 | local_irq_restore(flags); | ||
849 | |||
850 | return time; | ||
851 | } | ||
852 | EXPORT_SYMBOL_GPL(cpu_clock); | ||
853 | |||
854 | #ifndef prepare_arch_switch | 841 | #ifndef prepare_arch_switch |
855 | # define prepare_arch_switch(next) do { } while (0) | 842 | # define prepare_arch_switch(next) do { } while (0) |
856 | #endif | 843 | #endif |
@@ -1313,15 +1300,15 @@ void wake_up_idle_cpu(int cpu) | |||
1313 | if (!tsk_is_polling(rq->idle)) | 1300 | if (!tsk_is_polling(rq->idle)) |
1314 | smp_send_reschedule(cpu); | 1301 | smp_send_reschedule(cpu); |
1315 | } | 1302 | } |
1316 | #endif | 1303 | #endif /* CONFIG_NO_HZ */ |
1317 | 1304 | ||
1318 | #else | 1305 | #else /* !CONFIG_SMP */ |
1319 | static void __resched_task(struct task_struct *p, int tif_bit) | 1306 | static void __resched_task(struct task_struct *p, int tif_bit) |
1320 | { | 1307 | { |
1321 | assert_spin_locked(&task_rq(p)->lock); | 1308 | assert_spin_locked(&task_rq(p)->lock); |
1322 | set_tsk_thread_flag(p, tif_bit); | 1309 | set_tsk_thread_flag(p, tif_bit); |
1323 | } | 1310 | } |
1324 | #endif | 1311 | #endif /* CONFIG_SMP */ |
1325 | 1312 | ||
1326 | #if BITS_PER_LONG == 32 | 1313 | #if BITS_PER_LONG == 32 |
1327 | # define WMULT_CONST (~0UL) | 1314 | # define WMULT_CONST (~0UL) |
@@ -1336,6 +1323,9 @@ static void __resched_task(struct task_struct *p, int tif_bit) | |||
1336 | */ | 1323 | */ |
1337 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) | 1324 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) |
1338 | 1325 | ||
1326 | /* | ||
1327 | * delta *= weight / lw | ||
1328 | */ | ||
1339 | static unsigned long | 1329 | static unsigned long |
1340 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, | 1330 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, |
1341 | struct load_weight *lw) | 1331 | struct load_weight *lw) |
@@ -1363,12 +1353,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, | |||
1363 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); | 1353 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); |
1364 | } | 1354 | } |
1365 | 1355 | ||
1366 | static inline unsigned long | ||
1367 | calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) | ||
1368 | { | ||
1369 | return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); | ||
1370 | } | ||
1371 | |||
1372 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | 1356 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) |
1373 | { | 1357 | { |
1374 | lw->weight += inc; | 1358 | lw->weight += inc; |
@@ -1479,17 +1463,211 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) | |||
1479 | #ifdef CONFIG_SMP | 1463 | #ifdef CONFIG_SMP |
1480 | static unsigned long source_load(int cpu, int type); | 1464 | static unsigned long source_load(int cpu, int type); |
1481 | static unsigned long target_load(int cpu, int type); | 1465 | static unsigned long target_load(int cpu, int type); |
1482 | static unsigned long cpu_avg_load_per_task(int cpu); | ||
1483 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | 1466 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); |
1484 | #else /* CONFIG_SMP */ | 1467 | |
1468 | static unsigned long cpu_avg_load_per_task(int cpu) | ||
1469 | { | ||
1470 | struct rq *rq = cpu_rq(cpu); | ||
1471 | |||
1472 | if (rq->nr_running) | ||
1473 | rq->avg_load_per_task = rq->load.weight / rq->nr_running; | ||
1474 | |||
1475 | return rq->avg_load_per_task; | ||
1476 | } | ||
1485 | 1477 | ||
1486 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1478 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1487 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | 1479 | |
1480 | typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); | ||
1481 | |||
1482 | /* | ||
1483 | * Iterate the full tree, calling @down when first entering a node and @up when | ||
1484 | * leaving it for the final time. | ||
1485 | */ | ||
1486 | static void | ||
1487 | walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) | ||
1488 | { | ||
1489 | struct task_group *parent, *child; | ||
1490 | |||
1491 | rcu_read_lock(); | ||
1492 | parent = &root_task_group; | ||
1493 | down: | ||
1494 | (*down)(parent, cpu, sd); | ||
1495 | list_for_each_entry_rcu(child, &parent->children, siblings) { | ||
1496 | parent = child; | ||
1497 | goto down; | ||
1498 | |||
1499 | up: | ||
1500 | continue; | ||
1501 | } | ||
1502 | (*up)(parent, cpu, sd); | ||
1503 | |||
1504 | child = parent; | ||
1505 | parent = parent->parent; | ||
1506 | if (parent) | ||
1507 | goto up; | ||
1508 | rcu_read_unlock(); | ||
1509 | } | ||
1510 | |||
1511 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | ||
1512 | |||
1513 | /* | ||
1514 | * Calculate and set the cpu's group shares. | ||
1515 | */ | ||
1516 | static void | ||
1517 | __update_group_shares_cpu(struct task_group *tg, int cpu, | ||
1518 | unsigned long sd_shares, unsigned long sd_rq_weight) | ||
1519 | { | ||
1520 | int boost = 0; | ||
1521 | unsigned long shares; | ||
1522 | unsigned long rq_weight; | ||
1523 | |||
1524 | if (!tg->se[cpu]) | ||
1525 | return; | ||
1526 | |||
1527 | rq_weight = tg->cfs_rq[cpu]->load.weight; | ||
1528 | |||
1529 | /* | ||
1530 | * If there are currently no tasks on the cpu pretend there is one of | ||
1531 | * average load so that when a new task gets to run here it will not | ||
1532 | * get delayed by group starvation. | ||
1533 | */ | ||
1534 | if (!rq_weight) { | ||
1535 | boost = 1; | ||
1536 | rq_weight = NICE_0_LOAD; | ||
1537 | } | ||
1538 | |||
1539 | if (unlikely(rq_weight > sd_rq_weight)) | ||
1540 | rq_weight = sd_rq_weight; | ||
1541 | |||
1542 | /* | ||
1543 | * \Sum shares * rq_weight | ||
1544 | * shares = ----------------------- | ||
1545 | * \Sum rq_weight | ||
1546 | * | ||
1547 | */ | ||
1548 | shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); | ||
1549 | |||
1550 | /* | ||
1551 | * record the actual number of shares, not the boosted amount. | ||
1552 | */ | ||
1553 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; | ||
1554 | tg->cfs_rq[cpu]->rq_weight = rq_weight; | ||
1555 | |||
1556 | if (shares < MIN_SHARES) | ||
1557 | shares = MIN_SHARES; | ||
1558 | else if (shares > MAX_SHARES) | ||
1559 | shares = MAX_SHARES; | ||
1560 | |||
1561 | __set_se_shares(tg->se[cpu], shares); | ||
1562 | } | ||
1563 | |||
1564 | /* | ||
1565 | * Re-compute the task group their per cpu shares over the given domain. | ||
1566 | * This needs to be done in a bottom-up fashion because the rq weight of a | ||
1567 | * parent group depends on the shares of its child groups. | ||
1568 | */ | ||
1569 | static void | ||
1570 | tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) | ||
1571 | { | ||
1572 | unsigned long rq_weight = 0; | ||
1573 | unsigned long shares = 0; | ||
1574 | int i; | ||
1575 | |||
1576 | for_each_cpu_mask(i, sd->span) { | ||
1577 | rq_weight += tg->cfs_rq[i]->load.weight; | ||
1578 | shares += tg->cfs_rq[i]->shares; | ||
1579 | } | ||
1580 | |||
1581 | if ((!shares && rq_weight) || shares > tg->shares) | ||
1582 | shares = tg->shares; | ||
1583 | |||
1584 | if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) | ||
1585 | shares = tg->shares; | ||
1586 | |||
1587 | if (!rq_weight) | ||
1588 | rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; | ||
1589 | |||
1590 | for_each_cpu_mask(i, sd->span) { | ||
1591 | struct rq *rq = cpu_rq(i); | ||
1592 | unsigned long flags; | ||
1593 | |||
1594 | spin_lock_irqsave(&rq->lock, flags); | ||
1595 | __update_group_shares_cpu(tg, i, shares, rq_weight); | ||
1596 | spin_unlock_irqrestore(&rq->lock, flags); | ||
1597 | } | ||
1598 | } | ||
1599 | |||
1600 | /* | ||
1601 | * Compute the cpu's hierarchical load factor for each task group. | ||
1602 | * This needs to be done in a top-down fashion because the load of a child | ||
1603 | * group is a fraction of its parents load. | ||
1604 | */ | ||
1605 | static void | ||
1606 | tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) | ||
1607 | { | ||
1608 | unsigned long load; | ||
1609 | |||
1610 | if (!tg->parent) { | ||
1611 | load = cpu_rq(cpu)->load.weight; | ||
1612 | } else { | ||
1613 | load = tg->parent->cfs_rq[cpu]->h_load; | ||
1614 | load *= tg->cfs_rq[cpu]->shares; | ||
1615 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; | ||
1616 | } | ||
1617 | |||
1618 | tg->cfs_rq[cpu]->h_load = load; | ||
1619 | } | ||
1620 | |||
1621 | static void | ||
1622 | tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) | ||
1488 | { | 1623 | { |
1489 | } | 1624 | } |
1625 | |||
1626 | static void update_shares(struct sched_domain *sd) | ||
1627 | { | ||
1628 | u64 now = cpu_clock(raw_smp_processor_id()); | ||
1629 | s64 elapsed = now - sd->last_update; | ||
1630 | |||
1631 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | ||
1632 | sd->last_update = now; | ||
1633 | walk_tg_tree(tg_nop, tg_shares_up, 0, sd); | ||
1634 | } | ||
1635 | } | ||
1636 | |||
1637 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | ||
1638 | { | ||
1639 | spin_unlock(&rq->lock); | ||
1640 | update_shares(sd); | ||
1641 | spin_lock(&rq->lock); | ||
1642 | } | ||
1643 | |||
1644 | static void update_h_load(int cpu) | ||
1645 | { | ||
1646 | walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); | ||
1647 | } | ||
1648 | |||
1649 | #else | ||
1650 | |||
1651 | static inline void update_shares(struct sched_domain *sd) | ||
1652 | { | ||
1653 | } | ||
1654 | |||
1655 | static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) | ||
1656 | { | ||
1657 | } | ||
1658 | |||
1490 | #endif | 1659 | #endif |
1491 | 1660 | ||
1492 | #endif /* CONFIG_SMP */ | 1661 | #endif |
1662 | |||
1663 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1664 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | ||
1665 | { | ||
1666 | #ifdef CONFIG_SMP | ||
1667 | cfs_rq->shares = shares; | ||
1668 | #endif | ||
1669 | } | ||
1670 | #endif | ||
1493 | 1671 | ||
1494 | #include "sched_stats.h" | 1672 | #include "sched_stats.h" |
1495 | #include "sched_idletask.c" | 1673 | #include "sched_idletask.c" |
@@ -1500,27 +1678,17 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | |||
1500 | #endif | 1678 | #endif |
1501 | 1679 | ||
1502 | #define sched_class_highest (&rt_sched_class) | 1680 | #define sched_class_highest (&rt_sched_class) |
1681 | #define for_each_class(class) \ | ||
1682 | for (class = sched_class_highest; class; class = class->next) | ||
1503 | 1683 | ||
1504 | static inline void inc_load(struct rq *rq, const struct task_struct *p) | 1684 | static void inc_nr_running(struct rq *rq) |
1505 | { | ||
1506 | update_load_add(&rq->load, p->se.load.weight); | ||
1507 | } | ||
1508 | |||
1509 | static inline void dec_load(struct rq *rq, const struct task_struct *p) | ||
1510 | { | ||
1511 | update_load_sub(&rq->load, p->se.load.weight); | ||
1512 | } | ||
1513 | |||
1514 | static void inc_nr_running(struct task_struct *p, struct rq *rq) | ||
1515 | { | 1685 | { |
1516 | rq->nr_running++; | 1686 | rq->nr_running++; |
1517 | inc_load(rq, p); | ||
1518 | } | 1687 | } |
1519 | 1688 | ||
1520 | static void dec_nr_running(struct task_struct *p, struct rq *rq) | 1689 | static void dec_nr_running(struct rq *rq) |
1521 | { | 1690 | { |
1522 | rq->nr_running--; | 1691 | rq->nr_running--; |
1523 | dec_load(rq, p); | ||
1524 | } | 1692 | } |
1525 | 1693 | ||
1526 | static void set_load_weight(struct task_struct *p) | 1694 | static void set_load_weight(struct task_struct *p) |
@@ -1544,6 +1712,12 @@ static void set_load_weight(struct task_struct *p) | |||
1544 | p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; | 1712 | p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; |
1545 | } | 1713 | } |
1546 | 1714 | ||
1715 | static void update_avg(u64 *avg, u64 sample) | ||
1716 | { | ||
1717 | s64 diff = sample - *avg; | ||
1718 | *avg += diff >> 3; | ||
1719 | } | ||
1720 | |||
1547 | static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) | 1721 | static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) |
1548 | { | 1722 | { |
1549 | sched_info_queued(p); | 1723 | sched_info_queued(p); |
@@ -1553,6 +1727,13 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) | |||
1553 | 1727 | ||
1554 | static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) | 1728 | static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) |
1555 | { | 1729 | { |
1730 | if (sleep && p->se.last_wakeup) { | ||
1731 | update_avg(&p->se.avg_overlap, | ||
1732 | p->se.sum_exec_runtime - p->se.last_wakeup); | ||
1733 | p->se.last_wakeup = 0; | ||
1734 | } | ||
1735 | |||
1736 | sched_info_dequeued(p); | ||
1556 | p->sched_class->dequeue_task(rq, p, sleep); | 1737 | p->sched_class->dequeue_task(rq, p, sleep); |
1557 | p->se.on_rq = 0; | 1738 | p->se.on_rq = 0; |
1558 | } | 1739 | } |
@@ -1612,7 +1793,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | |||
1612 | rq->nr_uninterruptible--; | 1793 | rq->nr_uninterruptible--; |
1613 | 1794 | ||
1614 | enqueue_task(rq, p, wakeup); | 1795 | enqueue_task(rq, p, wakeup); |
1615 | inc_nr_running(p, rq); | 1796 | inc_nr_running(rq); |
1616 | } | 1797 | } |
1617 | 1798 | ||
1618 | /* | 1799 | /* |
@@ -1624,7 +1805,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) | |||
1624 | rq->nr_uninterruptible++; | 1805 | rq->nr_uninterruptible++; |
1625 | 1806 | ||
1626 | dequeue_task(rq, p, sleep); | 1807 | dequeue_task(rq, p, sleep); |
1627 | dec_nr_running(p, rq); | 1808 | dec_nr_running(rq); |
1628 | } | 1809 | } |
1629 | 1810 | ||
1630 | /** | 1811 | /** |
@@ -1636,12 +1817,6 @@ inline int task_curr(const struct task_struct *p) | |||
1636 | return cpu_curr(task_cpu(p)) == p; | 1817 | return cpu_curr(task_cpu(p)) == p; |
1637 | } | 1818 | } |
1638 | 1819 | ||
1639 | /* Used instead of source_load when we know the type == 0 */ | ||
1640 | unsigned long weighted_cpuload(const int cpu) | ||
1641 | { | ||
1642 | return cpu_rq(cpu)->load.weight; | ||
1643 | } | ||
1644 | |||
1645 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | 1820 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) |
1646 | { | 1821 | { |
1647 | set_task_rq(p, cpu); | 1822 | set_task_rq(p, cpu); |
@@ -1670,6 +1845,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
1670 | 1845 | ||
1671 | #ifdef CONFIG_SMP | 1846 | #ifdef CONFIG_SMP |
1672 | 1847 | ||
1848 | /* Used instead of source_load when we know the type == 0 */ | ||
1849 | static unsigned long weighted_cpuload(const int cpu) | ||
1850 | { | ||
1851 | return cpu_rq(cpu)->load.weight; | ||
1852 | } | ||
1853 | |||
1673 | /* | 1854 | /* |
1674 | * Is this task likely cache-hot: | 1855 | * Is this task likely cache-hot: |
1675 | */ | 1856 | */ |
@@ -1880,7 +2061,7 @@ static unsigned long source_load(int cpu, int type) | |||
1880 | struct rq *rq = cpu_rq(cpu); | 2061 | struct rq *rq = cpu_rq(cpu); |
1881 | unsigned long total = weighted_cpuload(cpu); | 2062 | unsigned long total = weighted_cpuload(cpu); |
1882 | 2063 | ||
1883 | if (type == 0) | 2064 | if (type == 0 || !sched_feat(LB_BIAS)) |
1884 | return total; | 2065 | return total; |
1885 | 2066 | ||
1886 | return min(rq->cpu_load[type-1], total); | 2067 | return min(rq->cpu_load[type-1], total); |
@@ -1895,25 +2076,13 @@ static unsigned long target_load(int cpu, int type) | |||
1895 | struct rq *rq = cpu_rq(cpu); | 2076 | struct rq *rq = cpu_rq(cpu); |
1896 | unsigned long total = weighted_cpuload(cpu); | 2077 | unsigned long total = weighted_cpuload(cpu); |
1897 | 2078 | ||
1898 | if (type == 0) | 2079 | if (type == 0 || !sched_feat(LB_BIAS)) |
1899 | return total; | 2080 | return total; |
1900 | 2081 | ||
1901 | return max(rq->cpu_load[type-1], total); | 2082 | return max(rq->cpu_load[type-1], total); |
1902 | } | 2083 | } |
1903 | 2084 | ||
1904 | /* | 2085 | /* |
1905 | * Return the average load per task on the cpu's run queue | ||
1906 | */ | ||
1907 | static unsigned long cpu_avg_load_per_task(int cpu) | ||
1908 | { | ||
1909 | struct rq *rq = cpu_rq(cpu); | ||
1910 | unsigned long total = weighted_cpuload(cpu); | ||
1911 | unsigned long n = rq->nr_running; | ||
1912 | |||
1913 | return n ? total / n : SCHED_LOAD_SCALE; | ||
1914 | } | ||
1915 | |||
1916 | /* | ||
1917 | * find_idlest_group finds and returns the least busy CPU group within the | 2086 | * find_idlest_group finds and returns the least busy CPU group within the |
1918 | * domain. | 2087 | * domain. |
1919 | */ | 2088 | */ |
@@ -2019,6 +2188,9 @@ static int sched_balance_self(int cpu, int flag) | |||
2019 | sd = tmp; | 2188 | sd = tmp; |
2020 | } | 2189 | } |
2021 | 2190 | ||
2191 | if (sd) | ||
2192 | update_shares(sd); | ||
2193 | |||
2022 | while (sd) { | 2194 | while (sd) { |
2023 | cpumask_t span, tmpmask; | 2195 | cpumask_t span, tmpmask; |
2024 | struct sched_group *group; | 2196 | struct sched_group *group; |
@@ -2085,6 +2257,22 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
2085 | if (!sched_feat(SYNC_WAKEUPS)) | 2257 | if (!sched_feat(SYNC_WAKEUPS)) |
2086 | sync = 0; | 2258 | sync = 0; |
2087 | 2259 | ||
2260 | #ifdef CONFIG_SMP | ||
2261 | if (sched_feat(LB_WAKEUP_UPDATE)) { | ||
2262 | struct sched_domain *sd; | ||
2263 | |||
2264 | this_cpu = raw_smp_processor_id(); | ||
2265 | cpu = task_cpu(p); | ||
2266 | |||
2267 | for_each_domain(this_cpu, sd) { | ||
2268 | if (cpu_isset(cpu, sd->span)) { | ||
2269 | update_shares(sd); | ||
2270 | break; | ||
2271 | } | ||
2272 | } | ||
2273 | } | ||
2274 | #endif | ||
2275 | |||
2088 | smp_wmb(); | 2276 | smp_wmb(); |
2089 | rq = task_rq_lock(p, &flags); | 2277 | rq = task_rq_lock(p, &flags); |
2090 | old_state = p->state; | 2278 | old_state = p->state; |
@@ -2131,7 +2319,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
2131 | } | 2319 | } |
2132 | } | 2320 | } |
2133 | } | 2321 | } |
2134 | #endif | 2322 | #endif /* CONFIG_SCHEDSTATS */ |
2135 | 2323 | ||
2136 | out_activate: | 2324 | out_activate: |
2137 | #endif /* CONFIG_SMP */ | 2325 | #endif /* CONFIG_SMP */ |
@@ -2149,6 +2337,9 @@ out_activate: | |||
2149 | success = 1; | 2337 | success = 1; |
2150 | 2338 | ||
2151 | out_running: | 2339 | out_running: |
2340 | trace_mark(kernel_sched_wakeup, | ||
2341 | "pid %d state %ld ## rq %p task %p rq->curr %p", | ||
2342 | p->pid, p->state, rq, p, rq->curr); | ||
2152 | check_preempt_curr(rq, p); | 2343 | check_preempt_curr(rq, p); |
2153 | 2344 | ||
2154 | p->state = TASK_RUNNING; | 2345 | p->state = TASK_RUNNING; |
@@ -2157,6 +2348,8 @@ out_running: | |||
2157 | p->sched_class->task_wake_up(rq, p); | 2348 | p->sched_class->task_wake_up(rq, p); |
2158 | #endif | 2349 | #endif |
2159 | out: | 2350 | out: |
2351 | current->se.last_wakeup = current->se.sum_exec_runtime; | ||
2352 | |||
2160 | task_rq_unlock(rq, &flags); | 2353 | task_rq_unlock(rq, &flags); |
2161 | 2354 | ||
2162 | return success; | 2355 | return success; |
@@ -2277,8 +2470,11 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
2277 | * management (if any): | 2470 | * management (if any): |
2278 | */ | 2471 | */ |
2279 | p->sched_class->task_new(rq, p); | 2472 | p->sched_class->task_new(rq, p); |
2280 | inc_nr_running(p, rq); | 2473 | inc_nr_running(rq); |
2281 | } | 2474 | } |
2475 | trace_mark(kernel_sched_wakeup_new, | ||
2476 | "pid %d state %ld ## rq %p task %p rq->curr %p", | ||
2477 | p->pid, p->state, rq, p, rq->curr); | ||
2282 | check_preempt_curr(rq, p); | 2478 | check_preempt_curr(rq, p); |
2283 | #ifdef CONFIG_SMP | 2479 | #ifdef CONFIG_SMP |
2284 | if (p->sched_class->task_wake_up) | 2480 | if (p->sched_class->task_wake_up) |
@@ -2331,7 +2527,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, | |||
2331 | notifier->ops->sched_out(notifier, next); | 2527 | notifier->ops->sched_out(notifier, next); |
2332 | } | 2528 | } |
2333 | 2529 | ||
2334 | #else | 2530 | #else /* !CONFIG_PREEMPT_NOTIFIERS */ |
2335 | 2531 | ||
2336 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) | 2532 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) |
2337 | { | 2533 | { |
@@ -2343,7 +2539,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, | |||
2343 | { | 2539 | { |
2344 | } | 2540 | } |
2345 | 2541 | ||
2346 | #endif | 2542 | #endif /* CONFIG_PREEMPT_NOTIFIERS */ |
2347 | 2543 | ||
2348 | /** | 2544 | /** |
2349 | * prepare_task_switch - prepare to switch tasks | 2545 | * prepare_task_switch - prepare to switch tasks |
@@ -2451,6 +2647,11 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2451 | struct mm_struct *mm, *oldmm; | 2647 | struct mm_struct *mm, *oldmm; |
2452 | 2648 | ||
2453 | prepare_task_switch(rq, prev, next); | 2649 | prepare_task_switch(rq, prev, next); |
2650 | trace_mark(kernel_sched_schedule, | ||
2651 | "prev_pid %d next_pid %d prev_state %ld " | ||
2652 | "## rq %p prev %p next %p", | ||
2653 | prev->pid, next->pid, prev->state, | ||
2654 | rq, prev, next); | ||
2454 | mm = next->mm; | 2655 | mm = next->mm; |
2455 | oldmm = prev->active_mm; | 2656 | oldmm = prev->active_mm; |
2456 | /* | 2657 | /* |
@@ -2785,7 +2986,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2785 | enum cpu_idle_type idle, int *all_pinned, | 2986 | enum cpu_idle_type idle, int *all_pinned, |
2786 | int *this_best_prio, struct rq_iterator *iterator) | 2987 | int *this_best_prio, struct rq_iterator *iterator) |
2787 | { | 2988 | { |
2788 | int loops = 0, pulled = 0, pinned = 0, skip_for_load; | 2989 | int loops = 0, pulled = 0, pinned = 0; |
2789 | struct task_struct *p; | 2990 | struct task_struct *p; |
2790 | long rem_load_move = max_load_move; | 2991 | long rem_load_move = max_load_move; |
2791 | 2992 | ||
@@ -2801,14 +3002,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2801 | next: | 3002 | next: |
2802 | if (!p || loops++ > sysctl_sched_nr_migrate) | 3003 | if (!p || loops++ > sysctl_sched_nr_migrate) |
2803 | goto out; | 3004 | goto out; |
2804 | /* | 3005 | |
2805 | * To help distribute high priority tasks across CPUs we don't | 3006 | if ((p->se.load.weight >> 1) > rem_load_move || |
2806 | * skip a task if it will be the highest priority task (i.e. smallest | ||
2807 | * prio value) on its new queue regardless of its load weight | ||
2808 | */ | ||
2809 | skip_for_load = (p->se.load.weight >> 1) > rem_load_move + | ||
2810 | SCHED_LOAD_SCALE_FUZZ; | ||
2811 | if ((skip_for_load && p->prio >= *this_best_prio) || | ||
2812 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { | 3007 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { |
2813 | p = iterator->next(iterator->arg); | 3008 | p = iterator->next(iterator->arg); |
2814 | goto next; | 3009 | goto next; |
@@ -2863,6 +3058,10 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2863 | max_load_move - total_load_moved, | 3058 | max_load_move - total_load_moved, |
2864 | sd, idle, all_pinned, &this_best_prio); | 3059 | sd, idle, all_pinned, &this_best_prio); |
2865 | class = class->next; | 3060 | class = class->next; |
3061 | |||
3062 | if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) | ||
3063 | break; | ||
3064 | |||
2866 | } while (class && max_load_move > total_load_moved); | 3065 | } while (class && max_load_move > total_load_moved); |
2867 | 3066 | ||
2868 | return total_load_moved > 0; | 3067 | return total_load_moved > 0; |
@@ -2939,6 +3138,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2939 | max_load = this_load = total_load = total_pwr = 0; | 3138 | max_load = this_load = total_load = total_pwr = 0; |
2940 | busiest_load_per_task = busiest_nr_running = 0; | 3139 | busiest_load_per_task = busiest_nr_running = 0; |
2941 | this_load_per_task = this_nr_running = 0; | 3140 | this_load_per_task = this_nr_running = 0; |
3141 | |||
2942 | if (idle == CPU_NOT_IDLE) | 3142 | if (idle == CPU_NOT_IDLE) |
2943 | load_idx = sd->busy_idx; | 3143 | load_idx = sd->busy_idx; |
2944 | else if (idle == CPU_NEWLY_IDLE) | 3144 | else if (idle == CPU_NEWLY_IDLE) |
@@ -2953,6 +3153,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2953 | int __group_imb = 0; | 3153 | int __group_imb = 0; |
2954 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | 3154 | unsigned int balance_cpu = -1, first_idle_cpu = 0; |
2955 | unsigned long sum_nr_running, sum_weighted_load; | 3155 | unsigned long sum_nr_running, sum_weighted_load; |
3156 | unsigned long sum_avg_load_per_task; | ||
3157 | unsigned long avg_load_per_task; | ||
2956 | 3158 | ||
2957 | local_group = cpu_isset(this_cpu, group->cpumask); | 3159 | local_group = cpu_isset(this_cpu, group->cpumask); |
2958 | 3160 | ||
@@ -2961,6 +3163,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2961 | 3163 | ||
2962 | /* Tally up the load of all CPUs in the group */ | 3164 | /* Tally up the load of all CPUs in the group */ |
2963 | sum_weighted_load = sum_nr_running = avg_load = 0; | 3165 | sum_weighted_load = sum_nr_running = avg_load = 0; |
3166 | sum_avg_load_per_task = avg_load_per_task = 0; | ||
3167 | |||
2964 | max_cpu_load = 0; | 3168 | max_cpu_load = 0; |
2965 | min_cpu_load = ~0UL; | 3169 | min_cpu_load = ~0UL; |
2966 | 3170 | ||
@@ -2994,6 +3198,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2994 | avg_load += load; | 3198 | avg_load += load; |
2995 | sum_nr_running += rq->nr_running; | 3199 | sum_nr_running += rq->nr_running; |
2996 | sum_weighted_load += weighted_cpuload(i); | 3200 | sum_weighted_load += weighted_cpuload(i); |
3201 | |||
3202 | sum_avg_load_per_task += cpu_avg_load_per_task(i); | ||
2997 | } | 3203 | } |
2998 | 3204 | ||
2999 | /* | 3205 | /* |
@@ -3015,7 +3221,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
3015 | avg_load = sg_div_cpu_power(group, | 3221 | avg_load = sg_div_cpu_power(group, |
3016 | avg_load * SCHED_LOAD_SCALE); | 3222 | avg_load * SCHED_LOAD_SCALE); |
3017 | 3223 | ||
3018 | if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE) | 3224 | |
3225 | /* | ||
3226 | * Consider the group unbalanced when the imbalance is larger | ||
3227 | * than the average weight of two tasks. | ||
3228 | * | ||
3229 | * APZ: with cgroup the avg task weight can vary wildly and | ||
3230 | * might not be a suitable number - should we keep a | ||
3231 | * normalized nr_running number somewhere that negates | ||
3232 | * the hierarchy? | ||
3233 | */ | ||
3234 | avg_load_per_task = sg_div_cpu_power(group, | ||
3235 | sum_avg_load_per_task * SCHED_LOAD_SCALE); | ||
3236 | |||
3237 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | ||
3019 | __group_imb = 1; | 3238 | __group_imb = 1; |
3020 | 3239 | ||
3021 | group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; | 3240 | group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; |
@@ -3156,9 +3375,9 @@ small_imbalance: | |||
3156 | if (busiest_load_per_task > this_load_per_task) | 3375 | if (busiest_load_per_task > this_load_per_task) |
3157 | imbn = 1; | 3376 | imbn = 1; |
3158 | } else | 3377 | } else |
3159 | this_load_per_task = SCHED_LOAD_SCALE; | 3378 | this_load_per_task = cpu_avg_load_per_task(this_cpu); |
3160 | 3379 | ||
3161 | if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= | 3380 | if (max_load - this_load + 2*busiest_load_per_task >= |
3162 | busiest_load_per_task * imbn) { | 3381 | busiest_load_per_task * imbn) { |
3163 | *imbalance = busiest_load_per_task; | 3382 | *imbalance = busiest_load_per_task; |
3164 | return busiest; | 3383 | return busiest; |
@@ -3284,6 +3503,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
3284 | schedstat_inc(sd, lb_count[idle]); | 3503 | schedstat_inc(sd, lb_count[idle]); |
3285 | 3504 | ||
3286 | redo: | 3505 | redo: |
3506 | update_shares(sd); | ||
3287 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 3507 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, |
3288 | cpus, balance); | 3508 | cpus, balance); |
3289 | 3509 | ||
@@ -3386,8 +3606,9 @@ redo: | |||
3386 | 3606 | ||
3387 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 3607 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
3388 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 3608 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
3389 | return -1; | 3609 | ld_moved = -1; |
3390 | return ld_moved; | 3610 | |
3611 | goto out; | ||
3391 | 3612 | ||
3392 | out_balanced: | 3613 | out_balanced: |
3393 | schedstat_inc(sd, lb_balanced[idle]); | 3614 | schedstat_inc(sd, lb_balanced[idle]); |
@@ -3402,8 +3623,13 @@ out_one_pinned: | |||
3402 | 3623 | ||
3403 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 3624 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
3404 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 3625 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
3405 | return -1; | 3626 | ld_moved = -1; |
3406 | return 0; | 3627 | else |
3628 | ld_moved = 0; | ||
3629 | out: | ||
3630 | if (ld_moved) | ||
3631 | update_shares(sd); | ||
3632 | return ld_moved; | ||
3407 | } | 3633 | } |
3408 | 3634 | ||
3409 | /* | 3635 | /* |
@@ -3438,6 +3664,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, | |||
3438 | 3664 | ||
3439 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); | 3665 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); |
3440 | redo: | 3666 | redo: |
3667 | update_shares_locked(this_rq, sd); | ||
3441 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, | 3668 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, |
3442 | &sd_idle, cpus, NULL); | 3669 | &sd_idle, cpus, NULL); |
3443 | if (!group) { | 3670 | if (!group) { |
@@ -3481,6 +3708,7 @@ redo: | |||
3481 | } else | 3708 | } else |
3482 | sd->nr_balance_failed = 0; | 3709 | sd->nr_balance_failed = 0; |
3483 | 3710 | ||
3711 | update_shares_locked(this_rq, sd); | ||
3484 | return ld_moved; | 3712 | return ld_moved; |
3485 | 3713 | ||
3486 | out_balanced: | 3714 | out_balanced: |
@@ -3672,6 +3900,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3672 | /* Earliest time when we have to do rebalance again */ | 3900 | /* Earliest time when we have to do rebalance again */ |
3673 | unsigned long next_balance = jiffies + 60*HZ; | 3901 | unsigned long next_balance = jiffies + 60*HZ; |
3674 | int update_next_balance = 0; | 3902 | int update_next_balance = 0; |
3903 | int need_serialize; | ||
3675 | cpumask_t tmp; | 3904 | cpumask_t tmp; |
3676 | 3905 | ||
3677 | for_each_domain(cpu, sd) { | 3906 | for_each_domain(cpu, sd) { |
@@ -3689,8 +3918,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3689 | if (interval > HZ*NR_CPUS/10) | 3918 | if (interval > HZ*NR_CPUS/10) |
3690 | interval = HZ*NR_CPUS/10; | 3919 | interval = HZ*NR_CPUS/10; |
3691 | 3920 | ||
3921 | need_serialize = sd->flags & SD_SERIALIZE; | ||
3692 | 3922 | ||
3693 | if (sd->flags & SD_SERIALIZE) { | 3923 | if (need_serialize) { |
3694 | if (!spin_trylock(&balancing)) | 3924 | if (!spin_trylock(&balancing)) |
3695 | goto out; | 3925 | goto out; |
3696 | } | 3926 | } |
@@ -3706,7 +3936,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3706 | } | 3936 | } |
3707 | sd->last_balance = jiffies; | 3937 | sd->last_balance = jiffies; |
3708 | } | 3938 | } |
3709 | if (sd->flags & SD_SERIALIZE) | 3939 | if (need_serialize) |
3710 | spin_unlock(&balancing); | 3940 | spin_unlock(&balancing); |
3711 | out: | 3941 | out: |
3712 | if (time_after(next_balance, sd->last_balance + interval)) { | 3942 | if (time_after(next_balance, sd->last_balance + interval)) { |
@@ -4021,26 +4251,44 @@ void scheduler_tick(void) | |||
4021 | #endif | 4251 | #endif |
4022 | } | 4252 | } |
4023 | 4253 | ||
4024 | #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) | 4254 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ |
4255 | defined(CONFIG_PREEMPT_TRACER)) | ||
4256 | |||
4257 | static inline unsigned long get_parent_ip(unsigned long addr) | ||
4258 | { | ||
4259 | if (in_lock_functions(addr)) { | ||
4260 | addr = CALLER_ADDR2; | ||
4261 | if (in_lock_functions(addr)) | ||
4262 | addr = CALLER_ADDR3; | ||
4263 | } | ||
4264 | return addr; | ||
4265 | } | ||
4025 | 4266 | ||
4026 | void __kprobes add_preempt_count(int val) | 4267 | void __kprobes add_preempt_count(int val) |
4027 | { | 4268 | { |
4269 | #ifdef CONFIG_DEBUG_PREEMPT | ||
4028 | /* | 4270 | /* |
4029 | * Underflow? | 4271 | * Underflow? |
4030 | */ | 4272 | */ |
4031 | if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) | 4273 | if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) |
4032 | return; | 4274 | return; |
4275 | #endif | ||
4033 | preempt_count() += val; | 4276 | preempt_count() += val; |
4277 | #ifdef CONFIG_DEBUG_PREEMPT | ||
4034 | /* | 4278 | /* |
4035 | * Spinlock count overflowing soon? | 4279 | * Spinlock count overflowing soon? |
4036 | */ | 4280 | */ |
4037 | DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= | 4281 | DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= |
4038 | PREEMPT_MASK - 10); | 4282 | PREEMPT_MASK - 10); |
4283 | #endif | ||
4284 | if (preempt_count() == val) | ||
4285 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | ||
4039 | } | 4286 | } |
4040 | EXPORT_SYMBOL(add_preempt_count); | 4287 | EXPORT_SYMBOL(add_preempt_count); |
4041 | 4288 | ||
4042 | void __kprobes sub_preempt_count(int val) | 4289 | void __kprobes sub_preempt_count(int val) |
4043 | { | 4290 | { |
4291 | #ifdef CONFIG_DEBUG_PREEMPT | ||
4044 | /* | 4292 | /* |
4045 | * Underflow? | 4293 | * Underflow? |
4046 | */ | 4294 | */ |
@@ -4052,7 +4300,10 @@ void __kprobes sub_preempt_count(int val) | |||
4052 | if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && | 4300 | if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && |
4053 | !(preempt_count() & PREEMPT_MASK))) | 4301 | !(preempt_count() & PREEMPT_MASK))) |
4054 | return; | 4302 | return; |
4303 | #endif | ||
4055 | 4304 | ||
4305 | if (preempt_count() == val) | ||
4306 | trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | ||
4056 | preempt_count() -= val; | 4307 | preempt_count() -= val; |
4057 | } | 4308 | } |
4058 | EXPORT_SYMBOL(sub_preempt_count); | 4309 | EXPORT_SYMBOL(sub_preempt_count); |
@@ -4070,6 +4321,7 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
4070 | prev->comm, prev->pid, preempt_count()); | 4321 | prev->comm, prev->pid, preempt_count()); |
4071 | 4322 | ||
4072 | debug_show_held_locks(prev); | 4323 | debug_show_held_locks(prev); |
4324 | print_modules(); | ||
4073 | if (irqs_disabled()) | 4325 | if (irqs_disabled()) |
4074 | print_irqtrace_events(prev); | 4326 | print_irqtrace_events(prev); |
4075 | 4327 | ||
@@ -4143,7 +4395,7 @@ asmlinkage void __sched schedule(void) | |||
4143 | struct task_struct *prev, *next; | 4395 | struct task_struct *prev, *next; |
4144 | unsigned long *switch_count; | 4396 | unsigned long *switch_count; |
4145 | struct rq *rq; | 4397 | struct rq *rq; |
4146 | int cpu; | 4398 | int cpu, hrtick = sched_feat(HRTICK); |
4147 | 4399 | ||
4148 | need_resched: | 4400 | need_resched: |
4149 | preempt_disable(); | 4401 | preempt_disable(); |
@@ -4158,7 +4410,8 @@ need_resched_nonpreemptible: | |||
4158 | 4410 | ||
4159 | schedule_debug(prev); | 4411 | schedule_debug(prev); |
4160 | 4412 | ||
4161 | hrtick_clear(rq); | 4413 | if (hrtick) |
4414 | hrtick_clear(rq); | ||
4162 | 4415 | ||
4163 | /* | 4416 | /* |
4164 | * Do the rq-clock update outside the rq lock: | 4417 | * Do the rq-clock update outside the rq lock: |
@@ -4204,7 +4457,8 @@ need_resched_nonpreemptible: | |||
4204 | } else | 4457 | } else |
4205 | spin_unlock_irq(&rq->lock); | 4458 | spin_unlock_irq(&rq->lock); |
4206 | 4459 | ||
4207 | hrtick_set(rq); | 4460 | if (hrtick) |
4461 | hrtick_set(rq); | ||
4208 | 4462 | ||
4209 | if (unlikely(reacquire_kernel_lock(current) < 0)) | 4463 | if (unlikely(reacquire_kernel_lock(current) < 0)) |
4210 | goto need_resched_nonpreemptible; | 4464 | goto need_resched_nonpreemptible; |
@@ -4586,10 +4840,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4586 | goto out_unlock; | 4840 | goto out_unlock; |
4587 | } | 4841 | } |
4588 | on_rq = p->se.on_rq; | 4842 | on_rq = p->se.on_rq; |
4589 | if (on_rq) { | 4843 | if (on_rq) |
4590 | dequeue_task(rq, p, 0); | 4844 | dequeue_task(rq, p, 0); |
4591 | dec_load(rq, p); | ||
4592 | } | ||
4593 | 4845 | ||
4594 | p->static_prio = NICE_TO_PRIO(nice); | 4846 | p->static_prio = NICE_TO_PRIO(nice); |
4595 | set_load_weight(p); | 4847 | set_load_weight(p); |
@@ -4599,7 +4851,6 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4599 | 4851 | ||
4600 | if (on_rq) { | 4852 | if (on_rq) { |
4601 | enqueue_task(rq, p, 0); | 4853 | enqueue_task(rq, p, 0); |
4602 | inc_load(rq, p); | ||
4603 | /* | 4854 | /* |
4604 | * If the task increased its priority or is running and | 4855 | * If the task increased its priority or is running and |
4605 | * lowered its priority, then reschedule its CPU: | 4856 | * lowered its priority, then reschedule its CPU: |
@@ -4744,16 +4995,8 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | |||
4744 | set_load_weight(p); | 4995 | set_load_weight(p); |
4745 | } | 4996 | } |
4746 | 4997 | ||
4747 | /** | 4998 | static int __sched_setscheduler(struct task_struct *p, int policy, |
4748 | * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. | 4999 | struct sched_param *param, bool user) |
4749 | * @p: the task in question. | ||
4750 | * @policy: new policy. | ||
4751 | * @param: structure containing the new RT priority. | ||
4752 | * | ||
4753 | * NOTE that the task may be already dead. | ||
4754 | */ | ||
4755 | int sched_setscheduler(struct task_struct *p, int policy, | ||
4756 | struct sched_param *param) | ||
4757 | { | 5000 | { |
4758 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 5001 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
4759 | unsigned long flags; | 5002 | unsigned long flags; |
@@ -4785,7 +5028,7 @@ recheck: | |||
4785 | /* | 5028 | /* |
4786 | * Allow unprivileged RT tasks to decrease priority: | 5029 | * Allow unprivileged RT tasks to decrease priority: |
4787 | */ | 5030 | */ |
4788 | if (!capable(CAP_SYS_NICE)) { | 5031 | if (user && !capable(CAP_SYS_NICE)) { |
4789 | if (rt_policy(policy)) { | 5032 | if (rt_policy(policy)) { |
4790 | unsigned long rlim_rtprio; | 5033 | unsigned long rlim_rtprio; |
4791 | 5034 | ||
@@ -4821,7 +5064,8 @@ recheck: | |||
4821 | * Do not allow realtime tasks into groups that have no runtime | 5064 | * Do not allow realtime tasks into groups that have no runtime |
4822 | * assigned. | 5065 | * assigned. |
4823 | */ | 5066 | */ |
4824 | if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) | 5067 | if (user |
5068 | && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) | ||
4825 | return -EPERM; | 5069 | return -EPERM; |
4826 | #endif | 5070 | #endif |
4827 | 5071 | ||
@@ -4870,8 +5114,39 @@ recheck: | |||
4870 | 5114 | ||
4871 | return 0; | 5115 | return 0; |
4872 | } | 5116 | } |
5117 | |||
5118 | /** | ||
5119 | * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. | ||
5120 | * @p: the task in question. | ||
5121 | * @policy: new policy. | ||
5122 | * @param: structure containing the new RT priority. | ||
5123 | * | ||
5124 | * NOTE that the task may be already dead. | ||
5125 | */ | ||
5126 | int sched_setscheduler(struct task_struct *p, int policy, | ||
5127 | struct sched_param *param) | ||
5128 | { | ||
5129 | return __sched_setscheduler(p, policy, param, true); | ||
5130 | } | ||
4873 | EXPORT_SYMBOL_GPL(sched_setscheduler); | 5131 | EXPORT_SYMBOL_GPL(sched_setscheduler); |
4874 | 5132 | ||
5133 | /** | ||
5134 | * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. | ||
5135 | * @p: the task in question. | ||
5136 | * @policy: new policy. | ||
5137 | * @param: structure containing the new RT priority. | ||
5138 | * | ||
5139 | * Just like sched_setscheduler, only don't bother checking if the | ||
5140 | * current context has permission. For example, this is needed in | ||
5141 | * stop_machine(): we create temporary high priority worker threads, | ||
5142 | * but our caller might not have that capability. | ||
5143 | */ | ||
5144 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, | ||
5145 | struct sched_param *param) | ||
5146 | { | ||
5147 | return __sched_setscheduler(p, policy, param, false); | ||
5148 | } | ||
5149 | |||
4875 | static int | 5150 | static int |
4876 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | 5151 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) |
4877 | { | 5152 | { |
@@ -5070,24 +5345,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, | |||
5070 | return sched_setaffinity(pid, &new_mask); | 5345 | return sched_setaffinity(pid, &new_mask); |
5071 | } | 5346 | } |
5072 | 5347 | ||
5073 | /* | ||
5074 | * Represents all cpu's present in the system | ||
5075 | * In systems capable of hotplug, this map could dynamically grow | ||
5076 | * as new cpu's are detected in the system via any platform specific | ||
5077 | * method, such as ACPI for e.g. | ||
5078 | */ | ||
5079 | |||
5080 | cpumask_t cpu_present_map __read_mostly; | ||
5081 | EXPORT_SYMBOL(cpu_present_map); | ||
5082 | |||
5083 | #ifndef CONFIG_SMP | ||
5084 | cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; | ||
5085 | EXPORT_SYMBOL(cpu_online_map); | ||
5086 | |||
5087 | cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; | ||
5088 | EXPORT_SYMBOL(cpu_possible_map); | ||
5089 | #endif | ||
5090 | |||
5091 | long sched_getaffinity(pid_t pid, cpumask_t *mask) | 5348 | long sched_getaffinity(pid_t pid, cpumask_t *mask) |
5092 | { | 5349 | { |
5093 | struct task_struct *p; | 5350 | struct task_struct *p; |
@@ -5384,7 +5641,7 @@ out_unlock: | |||
5384 | return retval; | 5641 | return retval; |
5385 | } | 5642 | } |
5386 | 5643 | ||
5387 | static const char stat_nam[] = "RSDTtZX"; | 5644 | static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; |
5388 | 5645 | ||
5389 | void sched_show_task(struct task_struct *p) | 5646 | void sched_show_task(struct task_struct *p) |
5390 | { | 5647 | { |
@@ -5571,6 +5828,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) | |||
5571 | goto out; | 5828 | goto out; |
5572 | } | 5829 | } |
5573 | 5830 | ||
5831 | if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && | ||
5832 | !cpus_equal(p->cpus_allowed, *new_mask))) { | ||
5833 | ret = -EINVAL; | ||
5834 | goto out; | ||
5835 | } | ||
5836 | |||
5574 | if (p->sched_class->set_cpus_allowed) | 5837 | if (p->sched_class->set_cpus_allowed) |
5575 | p->sched_class->set_cpus_allowed(p, new_mask); | 5838 | p->sched_class->set_cpus_allowed(p, new_mask); |
5576 | else { | 5839 | else { |
@@ -5622,10 +5885,10 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5622 | double_rq_lock(rq_src, rq_dest); | 5885 | double_rq_lock(rq_src, rq_dest); |
5623 | /* Already moved. */ | 5886 | /* Already moved. */ |
5624 | if (task_cpu(p) != src_cpu) | 5887 | if (task_cpu(p) != src_cpu) |
5625 | goto out; | 5888 | goto done; |
5626 | /* Affinity changed (again). */ | 5889 | /* Affinity changed (again). */ |
5627 | if (!cpu_isset(dest_cpu, p->cpus_allowed)) | 5890 | if (!cpu_isset(dest_cpu, p->cpus_allowed)) |
5628 | goto out; | 5891 | goto fail; |
5629 | 5892 | ||
5630 | on_rq = p->se.on_rq; | 5893 | on_rq = p->se.on_rq; |
5631 | if (on_rq) | 5894 | if (on_rq) |
@@ -5636,8 +5899,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5636 | activate_task(rq_dest, p, 0); | 5899 | activate_task(rq_dest, p, 0); |
5637 | check_preempt_curr(rq_dest, p); | 5900 | check_preempt_curr(rq_dest, p); |
5638 | } | 5901 | } |
5902 | done: | ||
5639 | ret = 1; | 5903 | ret = 1; |
5640 | out: | 5904 | fail: |
5641 | double_rq_unlock(rq_src, rq_dest); | 5905 | double_rq_unlock(rq_src, rq_dest); |
5642 | return ret; | 5906 | return ret; |
5643 | } | 5907 | } |
@@ -5887,6 +6151,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu) | |||
5887 | next = pick_next_task(rq, rq->curr); | 6151 | next = pick_next_task(rq, rq->curr); |
5888 | if (!next) | 6152 | if (!next) |
5889 | break; | 6153 | break; |
6154 | next->sched_class->put_prev_task(rq, next); | ||
5890 | migrate_dead(dead_cpu, next); | 6155 | migrate_dead(dead_cpu, next); |
5891 | 6156 | ||
5892 | } | 6157 | } |
@@ -6058,6 +6323,36 @@ static void unregister_sched_domain_sysctl(void) | |||
6058 | } | 6323 | } |
6059 | #endif | 6324 | #endif |
6060 | 6325 | ||
6326 | static void set_rq_online(struct rq *rq) | ||
6327 | { | ||
6328 | if (!rq->online) { | ||
6329 | const struct sched_class *class; | ||
6330 | |||
6331 | cpu_set(rq->cpu, rq->rd->online); | ||
6332 | rq->online = 1; | ||
6333 | |||
6334 | for_each_class(class) { | ||
6335 | if (class->rq_online) | ||
6336 | class->rq_online(rq); | ||
6337 | } | ||
6338 | } | ||
6339 | } | ||
6340 | |||
6341 | static void set_rq_offline(struct rq *rq) | ||
6342 | { | ||
6343 | if (rq->online) { | ||
6344 | const struct sched_class *class; | ||
6345 | |||
6346 | for_each_class(class) { | ||
6347 | if (class->rq_offline) | ||
6348 | class->rq_offline(rq); | ||
6349 | } | ||
6350 | |||
6351 | cpu_clear(rq->cpu, rq->rd->online); | ||
6352 | rq->online = 0; | ||
6353 | } | ||
6354 | } | ||
6355 | |||
6061 | /* | 6356 | /* |
6062 | * migration_call - callback that gets triggered when a CPU is added. | 6357 | * migration_call - callback that gets triggered when a CPU is added. |
6063 | * Here we can start up the necessary migration thread for the new CPU. | 6358 | * Here we can start up the necessary migration thread for the new CPU. |
@@ -6095,7 +6390,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6095 | spin_lock_irqsave(&rq->lock, flags); | 6390 | spin_lock_irqsave(&rq->lock, flags); |
6096 | if (rq->rd) { | 6391 | if (rq->rd) { |
6097 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); | 6392 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); |
6098 | cpu_set(cpu, rq->rd->online); | 6393 | |
6394 | set_rq_online(rq); | ||
6099 | } | 6395 | } |
6100 | spin_unlock_irqrestore(&rq->lock, flags); | 6396 | spin_unlock_irqrestore(&rq->lock, flags); |
6101 | break; | 6397 | break; |
@@ -6156,7 +6452,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6156 | spin_lock_irqsave(&rq->lock, flags); | 6452 | spin_lock_irqsave(&rq->lock, flags); |
6157 | if (rq->rd) { | 6453 | if (rq->rd) { |
6158 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); | 6454 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); |
6159 | cpu_clear(cpu, rq->rd->online); | 6455 | set_rq_offline(rq); |
6160 | } | 6456 | } |
6161 | spin_unlock_irqrestore(&rq->lock, flags); | 6457 | spin_unlock_irqrestore(&rq->lock, flags); |
6162 | break; | 6458 | break; |
@@ -6190,6 +6486,28 @@ void __init migration_init(void) | |||
6190 | 6486 | ||
6191 | #ifdef CONFIG_SCHED_DEBUG | 6487 | #ifdef CONFIG_SCHED_DEBUG |
6192 | 6488 | ||
6489 | static inline const char *sd_level_to_string(enum sched_domain_level lvl) | ||
6490 | { | ||
6491 | switch (lvl) { | ||
6492 | case SD_LV_NONE: | ||
6493 | return "NONE"; | ||
6494 | case SD_LV_SIBLING: | ||
6495 | return "SIBLING"; | ||
6496 | case SD_LV_MC: | ||
6497 | return "MC"; | ||
6498 | case SD_LV_CPU: | ||
6499 | return "CPU"; | ||
6500 | case SD_LV_NODE: | ||
6501 | return "NODE"; | ||
6502 | case SD_LV_ALLNODES: | ||
6503 | return "ALLNODES"; | ||
6504 | case SD_LV_MAX: | ||
6505 | return "MAX"; | ||
6506 | |||
6507 | } | ||
6508 | return "MAX"; | ||
6509 | } | ||
6510 | |||
6193 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | 6511 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, |
6194 | cpumask_t *groupmask) | 6512 | cpumask_t *groupmask) |
6195 | { | 6513 | { |
@@ -6209,7 +6527,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
6209 | return -1; | 6527 | return -1; |
6210 | } | 6528 | } |
6211 | 6529 | ||
6212 | printk(KERN_CONT "span %s\n", str); | 6530 | printk(KERN_CONT "span %s level %s\n", |
6531 | str, sd_level_to_string(sd->level)); | ||
6213 | 6532 | ||
6214 | if (!cpu_isset(cpu, sd->span)) { | 6533 | if (!cpu_isset(cpu, sd->span)) { |
6215 | printk(KERN_ERR "ERROR: domain->span does not contain " | 6534 | printk(KERN_ERR "ERROR: domain->span does not contain " |
@@ -6293,9 +6612,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
6293 | } | 6612 | } |
6294 | kfree(groupmask); | 6613 | kfree(groupmask); |
6295 | } | 6614 | } |
6296 | #else | 6615 | #else /* !CONFIG_SCHED_DEBUG */ |
6297 | # define sched_domain_debug(sd, cpu) do { } while (0) | 6616 | # define sched_domain_debug(sd, cpu) do { } while (0) |
6298 | #endif | 6617 | #endif /* CONFIG_SCHED_DEBUG */ |
6299 | 6618 | ||
6300 | static int sd_degenerate(struct sched_domain *sd) | 6619 | static int sd_degenerate(struct sched_domain *sd) |
6301 | { | 6620 | { |
@@ -6355,20 +6674,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
6355 | static void rq_attach_root(struct rq *rq, struct root_domain *rd) | 6674 | static void rq_attach_root(struct rq *rq, struct root_domain *rd) |
6356 | { | 6675 | { |
6357 | unsigned long flags; | 6676 | unsigned long flags; |
6358 | const struct sched_class *class; | ||
6359 | 6677 | ||
6360 | spin_lock_irqsave(&rq->lock, flags); | 6678 | spin_lock_irqsave(&rq->lock, flags); |
6361 | 6679 | ||
6362 | if (rq->rd) { | 6680 | if (rq->rd) { |
6363 | struct root_domain *old_rd = rq->rd; | 6681 | struct root_domain *old_rd = rq->rd; |
6364 | 6682 | ||
6365 | for (class = sched_class_highest; class; class = class->next) { | 6683 | if (cpu_isset(rq->cpu, old_rd->online)) |
6366 | if (class->leave_domain) | 6684 | set_rq_offline(rq); |
6367 | class->leave_domain(rq); | ||
6368 | } | ||
6369 | 6685 | ||
6370 | cpu_clear(rq->cpu, old_rd->span); | 6686 | cpu_clear(rq->cpu, old_rd->span); |
6371 | cpu_clear(rq->cpu, old_rd->online); | ||
6372 | 6687 | ||
6373 | if (atomic_dec_and_test(&old_rd->refcount)) | 6688 | if (atomic_dec_and_test(&old_rd->refcount)) |
6374 | kfree(old_rd); | 6689 | kfree(old_rd); |
@@ -6379,12 +6694,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
6379 | 6694 | ||
6380 | cpu_set(rq->cpu, rd->span); | 6695 | cpu_set(rq->cpu, rd->span); |
6381 | if (cpu_isset(rq->cpu, cpu_online_map)) | 6696 | if (cpu_isset(rq->cpu, cpu_online_map)) |
6382 | cpu_set(rq->cpu, rd->online); | 6697 | set_rq_online(rq); |
6383 | |||
6384 | for (class = sched_class_highest; class; class = class->next) { | ||
6385 | if (class->join_domain) | ||
6386 | class->join_domain(rq); | ||
6387 | } | ||
6388 | 6698 | ||
6389 | spin_unlock_irqrestore(&rq->lock, flags); | 6699 | spin_unlock_irqrestore(&rq->lock, flags); |
6390 | } | 6700 | } |
@@ -6395,6 +6705,8 @@ static void init_rootdomain(struct root_domain *rd) | |||
6395 | 6705 | ||
6396 | cpus_clear(rd->span); | 6706 | cpus_clear(rd->span); |
6397 | cpus_clear(rd->online); | 6707 | cpus_clear(rd->online); |
6708 | |||
6709 | cpupri_init(&rd->cpupri); | ||
6398 | } | 6710 | } |
6399 | 6711 | ||
6400 | static void init_defrootdomain(void) | 6712 | static void init_defrootdomain(void) |
@@ -6537,9 +6849,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) | |||
6537 | 6849 | ||
6538 | min_val = INT_MAX; | 6850 | min_val = INT_MAX; |
6539 | 6851 | ||
6540 | for (i = 0; i < MAX_NUMNODES; i++) { | 6852 | for (i = 0; i < nr_node_ids; i++) { |
6541 | /* Start at @node */ | 6853 | /* Start at @node */ |
6542 | n = (node + i) % MAX_NUMNODES; | 6854 | n = (node + i) % nr_node_ids; |
6543 | 6855 | ||
6544 | if (!nr_cpus_node(n)) | 6856 | if (!nr_cpus_node(n)) |
6545 | continue; | 6857 | continue; |
@@ -6589,7 +6901,7 @@ static void sched_domain_node_span(int node, cpumask_t *span) | |||
6589 | cpus_or(*span, *span, *nodemask); | 6901 | cpus_or(*span, *span, *nodemask); |
6590 | } | 6902 | } |
6591 | } | 6903 | } |
6592 | #endif | 6904 | #endif /* CONFIG_NUMA */ |
6593 | 6905 | ||
6594 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | 6906 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; |
6595 | 6907 | ||
@@ -6608,7 +6920,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, | |||
6608 | *sg = &per_cpu(sched_group_cpus, cpu); | 6920 | *sg = &per_cpu(sched_group_cpus, cpu); |
6609 | return cpu; | 6921 | return cpu; |
6610 | } | 6922 | } |
6611 | #endif | 6923 | #endif /* CONFIG_SCHED_SMT */ |
6612 | 6924 | ||
6613 | /* | 6925 | /* |
6614 | * multi-core sched-domains: | 6926 | * multi-core sched-domains: |
@@ -6616,7 +6928,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, | |||
6616 | #ifdef CONFIG_SCHED_MC | 6928 | #ifdef CONFIG_SCHED_MC |
6617 | static DEFINE_PER_CPU(struct sched_domain, core_domains); | 6929 | static DEFINE_PER_CPU(struct sched_domain, core_domains); |
6618 | static DEFINE_PER_CPU(struct sched_group, sched_group_core); | 6930 | static DEFINE_PER_CPU(struct sched_group, sched_group_core); |
6619 | #endif | 6931 | #endif /* CONFIG_SCHED_MC */ |
6620 | 6932 | ||
6621 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 6933 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) |
6622 | static int | 6934 | static int |
@@ -6718,7 +7030,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) | |||
6718 | sg = sg->next; | 7030 | sg = sg->next; |
6719 | } while (sg != group_head); | 7031 | } while (sg != group_head); |
6720 | } | 7032 | } |
6721 | #endif | 7033 | #endif /* CONFIG_NUMA */ |
6722 | 7034 | ||
6723 | #ifdef CONFIG_NUMA | 7035 | #ifdef CONFIG_NUMA |
6724 | /* Free memory allocated for various sched_group structures */ | 7036 | /* Free memory allocated for various sched_group structures */ |
@@ -6733,7 +7045,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) | |||
6733 | if (!sched_group_nodes) | 7045 | if (!sched_group_nodes) |
6734 | continue; | 7046 | continue; |
6735 | 7047 | ||
6736 | for (i = 0; i < MAX_NUMNODES; i++) { | 7048 | for (i = 0; i < nr_node_ids; i++) { |
6737 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | 7049 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; |
6738 | 7050 | ||
6739 | *nodemask = node_to_cpumask(i); | 7051 | *nodemask = node_to_cpumask(i); |
@@ -6755,11 +7067,11 @@ next_sg: | |||
6755 | sched_group_nodes_bycpu[cpu] = NULL; | 7067 | sched_group_nodes_bycpu[cpu] = NULL; |
6756 | } | 7068 | } |
6757 | } | 7069 | } |
6758 | #else | 7070 | #else /* !CONFIG_NUMA */ |
6759 | static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) | 7071 | static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) |
6760 | { | 7072 | { |
6761 | } | 7073 | } |
6762 | #endif | 7074 | #endif /* CONFIG_NUMA */ |
6763 | 7075 | ||
6764 | /* | 7076 | /* |
6765 | * Initialize sched groups cpu_power. | 7077 | * Initialize sched groups cpu_power. |
@@ -6926,7 +7238,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
6926 | /* | 7238 | /* |
6927 | * Allocate the per-node list of sched groups | 7239 | * Allocate the per-node list of sched groups |
6928 | */ | 7240 | */ |
6929 | sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), | 7241 | sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), |
6930 | GFP_KERNEL); | 7242 | GFP_KERNEL); |
6931 | if (!sched_group_nodes) { | 7243 | if (!sched_group_nodes) { |
6932 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 7244 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
@@ -7065,7 +7377,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7065 | #endif | 7377 | #endif |
7066 | 7378 | ||
7067 | /* Set up physical groups */ | 7379 | /* Set up physical groups */ |
7068 | for (i = 0; i < MAX_NUMNODES; i++) { | 7380 | for (i = 0; i < nr_node_ids; i++) { |
7069 | SCHED_CPUMASK_VAR(nodemask, allmasks); | 7381 | SCHED_CPUMASK_VAR(nodemask, allmasks); |
7070 | SCHED_CPUMASK_VAR(send_covered, allmasks); | 7382 | SCHED_CPUMASK_VAR(send_covered, allmasks); |
7071 | 7383 | ||
@@ -7089,7 +7401,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7089 | send_covered, tmpmask); | 7401 | send_covered, tmpmask); |
7090 | } | 7402 | } |
7091 | 7403 | ||
7092 | for (i = 0; i < MAX_NUMNODES; i++) { | 7404 | for (i = 0; i < nr_node_ids; i++) { |
7093 | /* Set up node groups */ | 7405 | /* Set up node groups */ |
7094 | struct sched_group *sg, *prev; | 7406 | struct sched_group *sg, *prev; |
7095 | SCHED_CPUMASK_VAR(nodemask, allmasks); | 7407 | SCHED_CPUMASK_VAR(nodemask, allmasks); |
@@ -7128,9 +7440,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7128 | cpus_or(*covered, *covered, *nodemask); | 7440 | cpus_or(*covered, *covered, *nodemask); |
7129 | prev = sg; | 7441 | prev = sg; |
7130 | 7442 | ||
7131 | for (j = 0; j < MAX_NUMNODES; j++) { | 7443 | for (j = 0; j < nr_node_ids; j++) { |
7132 | SCHED_CPUMASK_VAR(notcovered, allmasks); | 7444 | SCHED_CPUMASK_VAR(notcovered, allmasks); |
7133 | int n = (i + j) % MAX_NUMNODES; | 7445 | int n = (i + j) % nr_node_ids; |
7134 | node_to_cpumask_ptr(pnodemask, n); | 7446 | node_to_cpumask_ptr(pnodemask, n); |
7135 | 7447 | ||
7136 | cpus_complement(*notcovered, *covered); | 7448 | cpus_complement(*notcovered, *covered); |
@@ -7183,7 +7495,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7183 | } | 7495 | } |
7184 | 7496 | ||
7185 | #ifdef CONFIG_NUMA | 7497 | #ifdef CONFIG_NUMA |
7186 | for (i = 0; i < MAX_NUMNODES; i++) | 7498 | for (i = 0; i < nr_node_ids; i++) |
7187 | init_numa_sched_groups_power(sched_group_nodes[i]); | 7499 | init_numa_sched_groups_power(sched_group_nodes[i]); |
7188 | 7500 | ||
7189 | if (sd_allnodes) { | 7501 | if (sd_allnodes) { |
@@ -7468,7 +7780,7 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | |||
7468 | #endif | 7780 | #endif |
7469 | return err; | 7781 | return err; |
7470 | } | 7782 | } |
7471 | #endif | 7783 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ |
7472 | 7784 | ||
7473 | /* | 7785 | /* |
7474 | * Force a reinitialization of the sched domains hierarchy. The domains | 7786 | * Force a reinitialization of the sched domains hierarchy. The domains |
@@ -7479,21 +7791,28 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | |||
7479 | static int update_sched_domains(struct notifier_block *nfb, | 7791 | static int update_sched_domains(struct notifier_block *nfb, |
7480 | unsigned long action, void *hcpu) | 7792 | unsigned long action, void *hcpu) |
7481 | { | 7793 | { |
7794 | int cpu = (int)(long)hcpu; | ||
7795 | |||
7482 | switch (action) { | 7796 | switch (action) { |
7483 | case CPU_UP_PREPARE: | ||
7484 | case CPU_UP_PREPARE_FROZEN: | ||
7485 | case CPU_DOWN_PREPARE: | 7797 | case CPU_DOWN_PREPARE: |
7486 | case CPU_DOWN_PREPARE_FROZEN: | 7798 | case CPU_DOWN_PREPARE_FROZEN: |
7799 | disable_runtime(cpu_rq(cpu)); | ||
7800 | /* fall-through */ | ||
7801 | case CPU_UP_PREPARE: | ||
7802 | case CPU_UP_PREPARE_FROZEN: | ||
7487 | detach_destroy_domains(&cpu_online_map); | 7803 | detach_destroy_domains(&cpu_online_map); |
7488 | free_sched_domains(); | 7804 | free_sched_domains(); |
7489 | return NOTIFY_OK; | 7805 | return NOTIFY_OK; |
7490 | 7806 | ||
7491 | case CPU_UP_CANCELED: | 7807 | |
7492 | case CPU_UP_CANCELED_FROZEN: | ||
7493 | case CPU_DOWN_FAILED: | 7808 | case CPU_DOWN_FAILED: |
7494 | case CPU_DOWN_FAILED_FROZEN: | 7809 | case CPU_DOWN_FAILED_FROZEN: |
7495 | case CPU_ONLINE: | 7810 | case CPU_ONLINE: |
7496 | case CPU_ONLINE_FROZEN: | 7811 | case CPU_ONLINE_FROZEN: |
7812 | enable_runtime(cpu_rq(cpu)); | ||
7813 | /* fall-through */ | ||
7814 | case CPU_UP_CANCELED: | ||
7815 | case CPU_UP_CANCELED_FROZEN: | ||
7497 | case CPU_DEAD: | 7816 | case CPU_DEAD: |
7498 | case CPU_DEAD_FROZEN: | 7817 | case CPU_DEAD_FROZEN: |
7499 | /* | 7818 | /* |
@@ -7693,8 +8012,8 @@ void __init sched_init(void) | |||
7693 | 8012 | ||
7694 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; | 8013 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; |
7695 | ptr += nr_cpu_ids * sizeof(void **); | 8014 | ptr += nr_cpu_ids * sizeof(void **); |
7696 | #endif | 8015 | #endif /* CONFIG_USER_SCHED */ |
7697 | #endif | 8016 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
7698 | #ifdef CONFIG_RT_GROUP_SCHED | 8017 | #ifdef CONFIG_RT_GROUP_SCHED |
7699 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; | 8018 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; |
7700 | ptr += nr_cpu_ids * sizeof(void **); | 8019 | ptr += nr_cpu_ids * sizeof(void **); |
@@ -7708,8 +8027,8 @@ void __init sched_init(void) | |||
7708 | 8027 | ||
7709 | root_task_group.rt_rq = (struct rt_rq **)ptr; | 8028 | root_task_group.rt_rq = (struct rt_rq **)ptr; |
7710 | ptr += nr_cpu_ids * sizeof(void **); | 8029 | ptr += nr_cpu_ids * sizeof(void **); |
7711 | #endif | 8030 | #endif /* CONFIG_USER_SCHED */ |
7712 | #endif | 8031 | #endif /* CONFIG_RT_GROUP_SCHED */ |
7713 | } | 8032 | } |
7714 | 8033 | ||
7715 | #ifdef CONFIG_SMP | 8034 | #ifdef CONFIG_SMP |
@@ -7725,8 +8044,8 @@ void __init sched_init(void) | |||
7725 | #ifdef CONFIG_USER_SCHED | 8044 | #ifdef CONFIG_USER_SCHED |
7726 | init_rt_bandwidth(&root_task_group.rt_bandwidth, | 8045 | init_rt_bandwidth(&root_task_group.rt_bandwidth, |
7727 | global_rt_period(), RUNTIME_INF); | 8046 | global_rt_period(), RUNTIME_INF); |
7728 | #endif | 8047 | #endif /* CONFIG_USER_SCHED */ |
7729 | #endif | 8048 | #endif /* CONFIG_RT_GROUP_SCHED */ |
7730 | 8049 | ||
7731 | #ifdef CONFIG_GROUP_SCHED | 8050 | #ifdef CONFIG_GROUP_SCHED |
7732 | list_add(&init_task_group.list, &task_groups); | 8051 | list_add(&init_task_group.list, &task_groups); |
@@ -7736,8 +8055,8 @@ void __init sched_init(void) | |||
7736 | INIT_LIST_HEAD(&root_task_group.children); | 8055 | INIT_LIST_HEAD(&root_task_group.children); |
7737 | init_task_group.parent = &root_task_group; | 8056 | init_task_group.parent = &root_task_group; |
7738 | list_add(&init_task_group.siblings, &root_task_group.children); | 8057 | list_add(&init_task_group.siblings, &root_task_group.children); |
7739 | #endif | 8058 | #endif /* CONFIG_USER_SCHED */ |
7740 | #endif | 8059 | #endif /* CONFIG_GROUP_SCHED */ |
7741 | 8060 | ||
7742 | for_each_possible_cpu(i) { | 8061 | for_each_possible_cpu(i) { |
7743 | struct rq *rq; | 8062 | struct rq *rq; |
@@ -7817,6 +8136,7 @@ void __init sched_init(void) | |||
7817 | rq->next_balance = jiffies; | 8136 | rq->next_balance = jiffies; |
7818 | rq->push_cpu = 0; | 8137 | rq->push_cpu = 0; |
7819 | rq->cpu = i; | 8138 | rq->cpu = i; |
8139 | rq->online = 0; | ||
7820 | rq->migration_thread = NULL; | 8140 | rq->migration_thread = NULL; |
7821 | INIT_LIST_HEAD(&rq->migration_queue); | 8141 | INIT_LIST_HEAD(&rq->migration_queue); |
7822 | rq_attach_root(rq, &def_root_domain); | 8142 | rq_attach_root(rq, &def_root_domain); |
@@ -8056,7 +8376,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | |||
8056 | { | 8376 | { |
8057 | list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); | 8377 | list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); |
8058 | } | 8378 | } |
8059 | #else | 8379 | #else /* !CONFIG_FAIR_GROUP_SCHED */ |
8060 | static inline void free_fair_sched_group(struct task_group *tg) | 8380 | static inline void free_fair_sched_group(struct task_group *tg) |
8061 | { | 8381 | { |
8062 | } | 8382 | } |
@@ -8074,7 +8394,7 @@ static inline void register_fair_sched_group(struct task_group *tg, int cpu) | |||
8074 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | 8394 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) |
8075 | { | 8395 | { |
8076 | } | 8396 | } |
8077 | #endif | 8397 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
8078 | 8398 | ||
8079 | #ifdef CONFIG_RT_GROUP_SCHED | 8399 | #ifdef CONFIG_RT_GROUP_SCHED |
8080 | static void free_rt_sched_group(struct task_group *tg) | 8400 | static void free_rt_sched_group(struct task_group *tg) |
@@ -8145,7 +8465,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | |||
8145 | { | 8465 | { |
8146 | list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); | 8466 | list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); |
8147 | } | 8467 | } |
8148 | #else | 8468 | #else /* !CONFIG_RT_GROUP_SCHED */ |
8149 | static inline void free_rt_sched_group(struct task_group *tg) | 8469 | static inline void free_rt_sched_group(struct task_group *tg) |
8150 | { | 8470 | { |
8151 | } | 8471 | } |
@@ -8163,7 +8483,7 @@ static inline void register_rt_sched_group(struct task_group *tg, int cpu) | |||
8163 | static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | 8483 | static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) |
8164 | { | 8484 | { |
8165 | } | 8485 | } |
8166 | #endif | 8486 | #endif /* CONFIG_RT_GROUP_SCHED */ |
8167 | 8487 | ||
8168 | #ifdef CONFIG_GROUP_SCHED | 8488 | #ifdef CONFIG_GROUP_SCHED |
8169 | static void free_sched_group(struct task_group *tg) | 8489 | static void free_sched_group(struct task_group *tg) |
@@ -8274,17 +8594,14 @@ void sched_move_task(struct task_struct *tsk) | |||
8274 | 8594 | ||
8275 | task_rq_unlock(rq, &flags); | 8595 | task_rq_unlock(rq, &flags); |
8276 | } | 8596 | } |
8277 | #endif | 8597 | #endif /* CONFIG_GROUP_SCHED */ |
8278 | 8598 | ||
8279 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8599 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8280 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | 8600 | static void __set_se_shares(struct sched_entity *se, unsigned long shares) |
8281 | { | 8601 | { |
8282 | struct cfs_rq *cfs_rq = se->cfs_rq; | 8602 | struct cfs_rq *cfs_rq = se->cfs_rq; |
8283 | struct rq *rq = cfs_rq->rq; | ||
8284 | int on_rq; | 8603 | int on_rq; |
8285 | 8604 | ||
8286 | spin_lock_irq(&rq->lock); | ||
8287 | |||
8288 | on_rq = se->on_rq; | 8605 | on_rq = se->on_rq; |
8289 | if (on_rq) | 8606 | if (on_rq) |
8290 | dequeue_entity(cfs_rq, se, 0); | 8607 | dequeue_entity(cfs_rq, se, 0); |
@@ -8294,8 +8611,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) | |||
8294 | 8611 | ||
8295 | if (on_rq) | 8612 | if (on_rq) |
8296 | enqueue_entity(cfs_rq, se, 0); | 8613 | enqueue_entity(cfs_rq, se, 0); |
8614 | } | ||
8297 | 8615 | ||
8298 | spin_unlock_irq(&rq->lock); | 8616 | static void set_se_shares(struct sched_entity *se, unsigned long shares) |
8617 | { | ||
8618 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
8619 | struct rq *rq = cfs_rq->rq; | ||
8620 | unsigned long flags; | ||
8621 | |||
8622 | spin_lock_irqsave(&rq->lock, flags); | ||
8623 | __set_se_shares(se, shares); | ||
8624 | spin_unlock_irqrestore(&rq->lock, flags); | ||
8299 | } | 8625 | } |
8300 | 8626 | ||
8301 | static DEFINE_MUTEX(shares_mutex); | 8627 | static DEFINE_MUTEX(shares_mutex); |
@@ -8334,8 +8660,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
8334 | * w/o tripping rebalance_share or load_balance_fair. | 8660 | * w/o tripping rebalance_share or load_balance_fair. |
8335 | */ | 8661 | */ |
8336 | tg->shares = shares; | 8662 | tg->shares = shares; |
8337 | for_each_possible_cpu(i) | 8663 | for_each_possible_cpu(i) { |
8664 | /* | ||
8665 | * force a rebalance | ||
8666 | */ | ||
8667 | cfs_rq_set_shares(tg->cfs_rq[i], 0); | ||
8338 | set_se_shares(tg->se[i], shares); | 8668 | set_se_shares(tg->se[i], shares); |
8669 | } | ||
8339 | 8670 | ||
8340 | /* | 8671 | /* |
8341 | * Enable load balance activity on this group, by inserting it back on | 8672 | * Enable load balance activity on this group, by inserting it back on |
@@ -8374,7 +8705,7 @@ static unsigned long to_ratio(u64 period, u64 runtime) | |||
8374 | #ifdef CONFIG_CGROUP_SCHED | 8705 | #ifdef CONFIG_CGROUP_SCHED |
8375 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | 8706 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) |
8376 | { | 8707 | { |
8377 | struct task_group *tgi, *parent = tg ? tg->parent : NULL; | 8708 | struct task_group *tgi, *parent = tg->parent; |
8378 | unsigned long total = 0; | 8709 | unsigned long total = 0; |
8379 | 8710 | ||
8380 | if (!parent) { | 8711 | if (!parent) { |
@@ -8398,7 +8729,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | |||
8398 | } | 8729 | } |
8399 | rcu_read_unlock(); | 8730 | rcu_read_unlock(); |
8400 | 8731 | ||
8401 | return total + to_ratio(period, runtime) < | 8732 | return total + to_ratio(period, runtime) <= |
8402 | to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), | 8733 | to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), |
8403 | parent->rt_bandwidth.rt_runtime); | 8734 | parent->rt_bandwidth.rt_runtime); |
8404 | } | 8735 | } |
@@ -8501,6 +8832,9 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) | |||
8501 | rt_period = (u64)rt_period_us * NSEC_PER_USEC; | 8832 | rt_period = (u64)rt_period_us * NSEC_PER_USEC; |
8502 | rt_runtime = tg->rt_bandwidth.rt_runtime; | 8833 | rt_runtime = tg->rt_bandwidth.rt_runtime; |
8503 | 8834 | ||
8835 | if (rt_period == 0) | ||
8836 | return -EINVAL; | ||
8837 | |||
8504 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | 8838 | return tg_set_bandwidth(tg, rt_period, rt_runtime); |
8505 | } | 8839 | } |
8506 | 8840 | ||
@@ -8515,16 +8849,21 @@ long sched_group_rt_period(struct task_group *tg) | |||
8515 | 8849 | ||
8516 | static int sched_rt_global_constraints(void) | 8850 | static int sched_rt_global_constraints(void) |
8517 | { | 8851 | { |
8852 | struct task_group *tg = &root_task_group; | ||
8853 | u64 rt_runtime, rt_period; | ||
8518 | int ret = 0; | 8854 | int ret = 0; |
8519 | 8855 | ||
8856 | rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); | ||
8857 | rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
8858 | |||
8520 | mutex_lock(&rt_constraints_mutex); | 8859 | mutex_lock(&rt_constraints_mutex); |
8521 | if (!__rt_schedulable(NULL, 1, 0)) | 8860 | if (!__rt_schedulable(tg, rt_period, rt_runtime)) |
8522 | ret = -EINVAL; | 8861 | ret = -EINVAL; |
8523 | mutex_unlock(&rt_constraints_mutex); | 8862 | mutex_unlock(&rt_constraints_mutex); |
8524 | 8863 | ||
8525 | return ret; | 8864 | return ret; |
8526 | } | 8865 | } |
8527 | #else | 8866 | #else /* !CONFIG_RT_GROUP_SCHED */ |
8528 | static int sched_rt_global_constraints(void) | 8867 | static int sched_rt_global_constraints(void) |
8529 | { | 8868 | { |
8530 | unsigned long flags; | 8869 | unsigned long flags; |
@@ -8542,7 +8881,7 @@ static int sched_rt_global_constraints(void) | |||
8542 | 8881 | ||
8543 | return 0; | 8882 | return 0; |
8544 | } | 8883 | } |
8545 | #endif | 8884 | #endif /* CONFIG_RT_GROUP_SCHED */ |
8546 | 8885 | ||
8547 | int sched_rt_handler(struct ctl_table *table, int write, | 8886 | int sched_rt_handler(struct ctl_table *table, int write, |
8548 | struct file *filp, void __user *buffer, size_t *lenp, | 8887 | struct file *filp, void __user *buffer, size_t *lenp, |
@@ -8650,7 +8989,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) | |||
8650 | 8989 | ||
8651 | return (u64) tg->shares; | 8990 | return (u64) tg->shares; |
8652 | } | 8991 | } |
8653 | #endif | 8992 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
8654 | 8993 | ||
8655 | #ifdef CONFIG_RT_GROUP_SCHED | 8994 | #ifdef CONFIG_RT_GROUP_SCHED |
8656 | static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, | 8995 | static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, |
@@ -8674,7 +9013,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) | |||
8674 | { | 9013 | { |
8675 | return sched_group_rt_period(cgroup_tg(cgrp)); | 9014 | return sched_group_rt_period(cgroup_tg(cgrp)); |
8676 | } | 9015 | } |
8677 | #endif | 9016 | #endif /* CONFIG_RT_GROUP_SCHED */ |
8678 | 9017 | ||
8679 | static struct cftype cpu_files[] = { | 9018 | static struct cftype cpu_files[] = { |
8680 | #ifdef CONFIG_FAIR_GROUP_SCHED | 9019 | #ifdef CONFIG_FAIR_GROUP_SCHED |