author     Dmitry Torokhov <dmitry.torokhov@gmail.com>   2008-07-21 00:55:14 -0400
committer  Dmitry Torokhov <dmitry.torokhov@gmail.com>   2008-07-21 00:55:14 -0400
commit     908cf4b925e419bc74f3297b2f0e51d6f8a81da2 (patch)
tree       6c2da79366d4695a9c2560ab18259eca8a2a25b4 /kernel/sched.c
parent     92c49890922d54cba4b1eadeb0b185773c2c9570 (diff)
parent     14b395e35d1afdd8019d11b92e28041fad591b71 (diff)
Merge master.kernel.org:/pub/scm/linux/kernel/git/torvalds/linux-2.6 into next
Diffstat (limited to 'kernel/sched.c')
 -rw-r--r--  kernel/sched.c | 972
 1 file changed, 489 insertions(+), 483 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index cfa222a91539..99e6d850ecab 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -70,10 +70,13 @@ | |||
70 | #include <linux/bootmem.h> | 70 | #include <linux/bootmem.h> |
71 | #include <linux/debugfs.h> | 71 | #include <linux/debugfs.h> |
72 | #include <linux/ctype.h> | 72 | #include <linux/ctype.h> |
73 | #include <linux/ftrace.h> | ||
73 | 74 | ||
74 | #include <asm/tlb.h> | 75 | #include <asm/tlb.h> |
75 | #include <asm/irq_regs.h> | 76 | #include <asm/irq_regs.h> |
76 | 77 | ||
78 | #include "sched_cpupri.h" | ||
79 | |||
77 | /* | 80 | /* |
78 | * Convert user-nice values [ -20 ... 0 ... 19 ] | 81 | * Convert user-nice values [ -20 ... 0 ... 19 ] |
79 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], | 82 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], |
@@ -136,7 +139,7 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) | |||
136 | 139 | ||
137 | static inline int rt_policy(int policy) | 140 | static inline int rt_policy(int policy) |
138 | { | 141 | { |
139 | if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) | 142 | if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) |
140 | return 1; | 143 | return 1; |
141 | return 0; | 144 | return 0; |
142 | } | 145 | } |
@@ -289,15 +292,15 @@ struct task_group root_task_group; | |||
289 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | 292 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); |
290 | /* Default task group's cfs_rq on each cpu */ | 293 | /* Default task group's cfs_rq on each cpu */ |
291 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | 294 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; |
292 | #endif | 295 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
293 | 296 | ||
294 | #ifdef CONFIG_RT_GROUP_SCHED | 297 | #ifdef CONFIG_RT_GROUP_SCHED |
295 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | 298 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); |
296 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | 299 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; |
297 | #endif | 300 | #endif /* CONFIG_RT_GROUP_SCHED */ |
298 | #else | 301 | #else /* !CONFIG_FAIR_GROUP_SCHED */ |
299 | #define root_task_group init_task_group | 302 | #define root_task_group init_task_group |
300 | #endif | 303 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
301 | 304 | ||
302 | /* task_group_lock serializes add/remove of task groups and also changes to | 305 | /* task_group_lock serializes add/remove of task groups and also changes to |
303 | * a task group's cpu shares. | 306 | * a task group's cpu shares. |
@@ -307,17 +310,20 @@ static DEFINE_SPINLOCK(task_group_lock); | |||
307 | #ifdef CONFIG_FAIR_GROUP_SCHED | 310 | #ifdef CONFIG_FAIR_GROUP_SCHED |
308 | #ifdef CONFIG_USER_SCHED | 311 | #ifdef CONFIG_USER_SCHED |
309 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) | 312 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) |
310 | #else | 313 | #else /* !CONFIG_USER_SCHED */ |
311 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD | 314 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD |
312 | #endif | 315 | #endif /* CONFIG_USER_SCHED */ |
313 | 316 | ||
314 | /* | 317 | /* |
315 | * A weight of 0, 1 or ULONG_MAX can cause arithmetics problems. | 318 | * A weight of 0 or 1 can cause arithmetics problems. |
319 | * A weight of a cfs_rq is the sum of weights of which entities | ||
320 | * are queued on this cfs_rq, so a weight of a entity should not be | ||
321 | * too large, so as the shares value of a task group. | ||
316 | * (The default weight is 1024 - so there's no practical | 322 | * (The default weight is 1024 - so there's no practical |
317 | * limitation from this.) | 323 | * limitation from this.) |
318 | */ | 324 | */ |
319 | #define MIN_SHARES 2 | 325 | #define MIN_SHARES 2 |
320 | #define MAX_SHARES (ULONG_MAX - 1) | 326 | #define MAX_SHARES (1UL << 18) |
321 | 327 | ||
322 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; | 328 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; |
323 | #endif | 329 | #endif |
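The new MAX_SHARES caps a group's shares at 1UL << 18 = 262144; per the comment added above, that keeps entity weights and the summed cfs_rq weight well inside the range the scheduler's fixed-point weight arithmetic can handle, unlike the old ULONG_MAX - 1 bound. A minimal standalone sketch of the clamp behaviour (the in-kernel clamp appears later in this diff, in __update_group_shares_cpu(); the helper and sample values below are illustrative only):

#include <stdio.h>

#define MIN_SHARES	2
#define MAX_SHARES	(1UL << 18)

/* Standalone model of how a requested group-shares value gets clamped. */
static unsigned long clamp_shares(unsigned long shares)
{
	if (shares < MIN_SHARES)
		return MIN_SHARES;
	if (shares > MAX_SHARES)
		return MAX_SHARES;
	return shares;
}

int main(void)
{
	printf("MAX_SHARES = %lu\n", MAX_SHARES);		/* 262144 */
	printf("%lu %lu %lu\n", clamp_shares(0), clamp_shares(1024),
	       clamp_shares(1UL << 20));			/* 2 1024 262144 */
	return 0;
}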
@@ -360,6 +366,10 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) | |||
360 | #else | 366 | #else |
361 | 367 | ||
362 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } | 368 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } |
369 | static inline struct task_group *task_group(struct task_struct *p) | ||
370 | { | ||
371 | return NULL; | ||
372 | } | ||
363 | 373 | ||
364 | #endif /* CONFIG_GROUP_SCHED */ | 374 | #endif /* CONFIG_GROUP_SCHED */ |
365 | 375 | ||
@@ -370,6 +380,7 @@ struct cfs_rq { | |||
370 | 380 | ||
371 | u64 exec_clock; | 381 | u64 exec_clock; |
372 | u64 min_vruntime; | 382 | u64 min_vruntime; |
383 | u64 pair_start; | ||
373 | 384 | ||
374 | struct rb_root tasks_timeline; | 385 | struct rb_root tasks_timeline; |
375 | struct rb_node *rb_leftmost; | 386 | struct rb_node *rb_leftmost; |
@@ -400,40 +411,28 @@ struct cfs_rq { | |||
400 | struct task_group *tg; /* group that "owns" this runqueue */ | 411 | struct task_group *tg; /* group that "owns" this runqueue */ |
401 | 412 | ||
402 | #ifdef CONFIG_SMP | 413 | #ifdef CONFIG_SMP |
403 | unsigned long task_weight; | ||
404 | unsigned long shares; | ||
405 | /* | 414 | /* |
406 | * We need space to build a sched_domain wide view of the full task | 415 | * the part of load.weight contributed by tasks |
407 | * group tree, in order to avoid depending on dynamic memory allocation | ||
408 | * during the load balancing we place this in the per cpu task group | ||
409 | * hierarchy. This limits the load balancing to one instance per cpu, | ||
410 | * but more should not be needed anyway. | ||
411 | */ | 416 | */ |
412 | struct aggregate_struct { | 417 | unsigned long task_weight; |
413 | /* | ||
414 | * load = weight(cpus) * f(tg) | ||
415 | * | ||
416 | * Where f(tg) is the recursive weight fraction assigned to | ||
417 | * this group. | ||
418 | */ | ||
419 | unsigned long load; | ||
420 | 418 | ||
421 | /* | 419 | /* |
422 | * part of the group weight distributed to this span. | 420 | * h_load = weight * f(tg) |
423 | */ | 421 | * |
424 | unsigned long shares; | 422 | * Where f(tg) is the recursive weight fraction assigned to |
423 | * this group. | ||
424 | */ | ||
425 | unsigned long h_load; | ||
425 | 426 | ||
426 | /* | 427 | /* |
427 | * The sum of all runqueue weights within this span. | 428 | * this cpu's part of tg->shares |
428 | */ | 429 | */ |
429 | unsigned long rq_weight; | 430 | unsigned long shares; |
430 | 431 | ||
431 | /* | 432 | /* |
432 | * Weight contributed by tasks; this is the part we can | 433 | * load.weight at the time we set shares |
433 | * influence by moving tasks around. | 434 | */ |
434 | */ | 435 | unsigned long rq_weight; |
435 | unsigned long task_weight; | ||
436 | } aggregate; | ||
437 | #endif | 436 | #endif |
438 | #endif | 437 | #endif |
439 | }; | 438 | }; |
@@ -486,6 +485,9 @@ struct root_domain { | |||
486 | */ | 485 | */ |
487 | cpumask_t rto_mask; | 486 | cpumask_t rto_mask; |
488 | atomic_t rto_count; | 487 | atomic_t rto_count; |
488 | #ifdef CONFIG_SMP | ||
489 | struct cpupri cpupri; | ||
490 | #endif | ||
489 | }; | 491 | }; |
490 | 492 | ||
491 | /* | 493 | /* |
@@ -560,6 +562,9 @@ struct rq { | |||
560 | int push_cpu; | 562 | int push_cpu; |
561 | /* cpu of this runqueue: */ | 563 | /* cpu of this runqueue: */ |
562 | int cpu; | 564 | int cpu; |
565 | int online; | ||
566 | |||
567 | unsigned long avg_load_per_task; | ||
563 | 568 | ||
564 | struct task_struct *migration_thread; | 569 | struct task_struct *migration_thread; |
565 | struct list_head migration_queue; | 570 | struct list_head migration_queue; |
@@ -641,6 +646,24 @@ static inline void update_rq_clock(struct rq *rq) | |||
641 | # define const_debug static const | 646 | # define const_debug static const |
642 | #endif | 647 | #endif |
643 | 648 | ||
649 | /** | ||
650 | * runqueue_is_locked | ||
651 | * | ||
652 | * Returns true if the current cpu runqueue is locked. | ||
653 | * This interface allows printk to be called with the runqueue lock | ||
654 | * held and know whether or not it is OK to wake up the klogd. | ||
655 | */ | ||
656 | int runqueue_is_locked(void) | ||
657 | { | ||
658 | int cpu = get_cpu(); | ||
659 | struct rq *rq = cpu_rq(cpu); | ||
660 | int ret; | ||
661 | |||
662 | ret = spin_is_locked(&rq->lock); | ||
663 | put_cpu(); | ||
664 | return ret; | ||
665 | } | ||
666 | |||
644 | /* | 667 | /* |
645 | * Debugging: various feature bits | 668 | * Debugging: various feature bits |
646 | */ | 669 | */ |
@@ -783,6 +806,12 @@ late_initcall(sched_init_debug); | |||
783 | const_debug unsigned int sysctl_sched_nr_migrate = 32; | 806 | const_debug unsigned int sysctl_sched_nr_migrate = 32; |
784 | 807 | ||
785 | /* | 808 | /* |
809 | * ratelimit for updating the group shares. | ||
810 | * default: 0.5ms | ||
811 | */ | ||
812 | const_debug unsigned int sysctl_sched_shares_ratelimit = 500000; | ||
813 | |||
814 | /* | ||
786 | * period over which we measure -rt task cpu usage in us. | 815 | * period over which we measure -rt task cpu usage in us. |
787 | * default: 1s | 816 | * default: 1s |
788 | */ | 817 | */ |
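The new knob appears to be in nanoseconds (500000 matching the "0.5ms" in its comment), since update_shares() later in this diff compares it against cpu_clock() deltas. A tiny standalone sketch of that rate-limit check, with made-up clock values:

#include <stdio.h>

int main(void)
{
	unsigned int sysctl_sched_shares_ratelimit = 500000;	/* ns, i.e. 0.5ms */
	unsigned long long last_update = 0, now = 1200000;	/* made-up clock readings, ns */
	long long elapsed = (long long)(now - last_update);

	if (elapsed >= (long long)sysctl_sched_shares_ratelimit)
		printf("recompute shares (%.1fms since last update)\n", elapsed / 1e6);
	return 0;
}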
@@ -809,82 +838,6 @@ static inline u64 global_rt_runtime(void) | |||
809 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | 838 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; |
810 | } | 839 | } |
811 | 840 | ||
812 | unsigned long long time_sync_thresh = 100000; | ||
813 | |||
814 | static DEFINE_PER_CPU(unsigned long long, time_offset); | ||
815 | static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); | ||
816 | |||
817 | /* | ||
818 | * Global lock which we take every now and then to synchronize | ||
819 | * the CPUs time. This method is not warp-safe, but it's good | ||
820 | * enough to synchronize slowly diverging time sources and thus | ||
821 | * it's good enough for tracing: | ||
822 | */ | ||
823 | static DEFINE_SPINLOCK(time_sync_lock); | ||
824 | static unsigned long long prev_global_time; | ||
825 | |||
826 | static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu) | ||
827 | { | ||
828 | /* | ||
829 | * We want this inlined, to not get tracer function calls | ||
830 | * in this critical section: | ||
831 | */ | ||
832 | spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_); | ||
833 | __raw_spin_lock(&time_sync_lock.raw_lock); | ||
834 | |||
835 | if (time < prev_global_time) { | ||
836 | per_cpu(time_offset, cpu) += prev_global_time - time; | ||
837 | time = prev_global_time; | ||
838 | } else { | ||
839 | prev_global_time = time; | ||
840 | } | ||
841 | |||
842 | __raw_spin_unlock(&time_sync_lock.raw_lock); | ||
843 | spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_); | ||
844 | |||
845 | return time; | ||
846 | } | ||
847 | |||
848 | static unsigned long long __cpu_clock(int cpu) | ||
849 | { | ||
850 | unsigned long long now; | ||
851 | |||
852 | /* | ||
853 | * Only call sched_clock() if the scheduler has already been | ||
854 | * initialized (some code might call cpu_clock() very early): | ||
855 | */ | ||
856 | if (unlikely(!scheduler_running)) | ||
857 | return 0; | ||
858 | |||
859 | now = sched_clock_cpu(cpu); | ||
860 | |||
861 | return now; | ||
862 | } | ||
863 | |||
864 | /* | ||
865 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | ||
866 | * clock constructed from sched_clock(): | ||
867 | */ | ||
868 | unsigned long long cpu_clock(int cpu) | ||
869 | { | ||
870 | unsigned long long prev_cpu_time, time, delta_time; | ||
871 | unsigned long flags; | ||
872 | |||
873 | local_irq_save(flags); | ||
874 | prev_cpu_time = per_cpu(prev_cpu_time, cpu); | ||
875 | time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); | ||
876 | delta_time = time-prev_cpu_time; | ||
877 | |||
878 | if (unlikely(delta_time > time_sync_thresh)) { | ||
879 | time = __sync_cpu_clock(time, cpu); | ||
880 | per_cpu(prev_cpu_time, cpu) = time; | ||
881 | } | ||
882 | local_irq_restore(flags); | ||
883 | |||
884 | return time; | ||
885 | } | ||
886 | EXPORT_SYMBOL_GPL(cpu_clock); | ||
887 | |||
888 | #ifndef prepare_arch_switch | 841 | #ifndef prepare_arch_switch |
889 | # define prepare_arch_switch(next) do { } while (0) | 842 | # define prepare_arch_switch(next) do { } while (0) |
890 | #endif | 843 | #endif |
@@ -1161,6 +1114,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) | |||
1161 | return HRTIMER_NORESTART; | 1114 | return HRTIMER_NORESTART; |
1162 | } | 1115 | } |
1163 | 1116 | ||
1117 | #ifdef CONFIG_SMP | ||
1164 | static void hotplug_hrtick_disable(int cpu) | 1118 | static void hotplug_hrtick_disable(int cpu) |
1165 | { | 1119 | { |
1166 | struct rq *rq = cpu_rq(cpu); | 1120 | struct rq *rq = cpu_rq(cpu); |
@@ -1216,6 +1170,7 @@ static void init_hrtick(void) | |||
1216 | { | 1170 | { |
1217 | hotcpu_notifier(hotplug_hrtick, 0); | 1171 | hotcpu_notifier(hotplug_hrtick, 0); |
1218 | } | 1172 | } |
1173 | #endif /* CONFIG_SMP */ | ||
1219 | 1174 | ||
1220 | static void init_rq_hrtick(struct rq *rq) | 1175 | static void init_rq_hrtick(struct rq *rq) |
1221 | { | 1176 | { |
@@ -1345,15 +1300,15 @@ void wake_up_idle_cpu(int cpu) | |||
1345 | if (!tsk_is_polling(rq->idle)) | 1300 | if (!tsk_is_polling(rq->idle)) |
1346 | smp_send_reschedule(cpu); | 1301 | smp_send_reschedule(cpu); |
1347 | } | 1302 | } |
1348 | #endif | 1303 | #endif /* CONFIG_NO_HZ */ |
1349 | 1304 | ||
1350 | #else | 1305 | #else /* !CONFIG_SMP */ |
1351 | static void __resched_task(struct task_struct *p, int tif_bit) | 1306 | static void __resched_task(struct task_struct *p, int tif_bit) |
1352 | { | 1307 | { |
1353 | assert_spin_locked(&task_rq(p)->lock); | 1308 | assert_spin_locked(&task_rq(p)->lock); |
1354 | set_tsk_thread_flag(p, tif_bit); | 1309 | set_tsk_thread_flag(p, tif_bit); |
1355 | } | 1310 | } |
1356 | #endif | 1311 | #endif /* CONFIG_SMP */ |
1357 | 1312 | ||
1358 | #if BITS_PER_LONG == 32 | 1313 | #if BITS_PER_LONG == 32 |
1359 | # define WMULT_CONST (~0UL) | 1314 | # define WMULT_CONST (~0UL) |
@@ -1377,8 +1332,13 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, | |||
1377 | { | 1332 | { |
1378 | u64 tmp; | 1333 | u64 tmp; |
1379 | 1334 | ||
1380 | if (!lw->inv_weight) | 1335 | if (!lw->inv_weight) { |
1381 | lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)/(lw->weight+1); | 1336 | if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) |
1337 | lw->inv_weight = 1; | ||
1338 | else | ||
1339 | lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) | ||
1340 | / (lw->weight+1); | ||
1341 | } | ||
1382 | 1342 | ||
1383 | tmp = (u64)delta_exec * weight; | 1343 | tmp = (u64)delta_exec * weight; |
1384 | /* | 1344 | /* |
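For reference, calc_delta_mine() caches a fixed-point reciprocal: inv_weight is roughly 2^32 / weight, and the scaled result is delta_exec * weight * inv_weight shifted right again. The new branch only avoids computing that reciprocal for weights at or above WMULT_CONST on 64-bit builds, where the subtraction would underflow. A standalone sketch of the same arithmetic, assuming WMULT_SHIFT is 32 and skipping the overflow-aware rounding the real code applies to the 64-bit product:

#include <stdint.h>
#include <stdio.h>

#define WMULT_CONST	(~0U)	/* 2^32 - 1, the 32-bit BITS_PER_LONG value */
#define WMULT_SHIFT	32	/* assumed here; the reciprocal is ~2^32/weight */

/* Standalone model: result ~= delta_exec * weight / lw_weight via the cached
 * reciprocal.  The kernel stores inv in lw->inv_weight and guards the 64-bit
 * product; this sketch ignores overflow, so keep the inputs small. */
static uint64_t scale_delta(uint64_t delta_exec, uint32_t weight, uint32_t lw_weight)
{
	uint32_t inv = 1 + (WMULT_CONST - lw_weight / 2) / (lw_weight + 1);

	return (delta_exec * weight * (uint64_t)inv) >> WMULT_SHIFT;
}

int main(void)
{
	/* 10ms for a nice-0 entity (weight 1024) on a queue of total weight 3072:
	 * roughly a third of the 10ms comes back. */
	printf("%llu\n", (unsigned long long)scale_delta(10000000, 1024, 3072));
	return 0;
}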
@@ -1503,63 +1463,35 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) | |||
1503 | #ifdef CONFIG_SMP | 1463 | #ifdef CONFIG_SMP |
1504 | static unsigned long source_load(int cpu, int type); | 1464 | static unsigned long source_load(int cpu, int type); |
1505 | static unsigned long target_load(int cpu, int type); | 1465 | static unsigned long target_load(int cpu, int type); |
1506 | static unsigned long cpu_avg_load_per_task(int cpu); | ||
1507 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | 1466 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); |
1508 | 1467 | ||
1509 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1468 | static unsigned long cpu_avg_load_per_task(int cpu) |
1469 | { | ||
1470 | struct rq *rq = cpu_rq(cpu); | ||
1510 | 1471 | ||
1511 | /* | 1472 | if (rq->nr_running) |
1512 | * Group load balancing. | 1473 | rq->avg_load_per_task = rq->load.weight / rq->nr_running; |
1513 | * | ||
1514 | * We calculate a few balance domain wide aggregate numbers; load and weight. | ||
1515 | * Given the pictures below, and assuming each item has equal weight: | ||
1516 | * | ||
1517 | * root 1 - thread | ||
1518 | * / | \ A - group | ||
1519 | * A 1 B | ||
1520 | * /|\ / \ | ||
1521 | * C 2 D 3 4 | ||
1522 | * | | | ||
1523 | * 5 6 | ||
1524 | * | ||
1525 | * load: | ||
1526 | * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd, | ||
1527 | * which equals 1/9-th of the total load. | ||
1528 | * | ||
1529 | * shares: | ||
1530 | * The weight of this group on the selected cpus. | ||
1531 | * | ||
1532 | * rq_weight: | ||
1533 | * Direct sum of all the cpu's their rq weight, e.g. A would get 3 while | ||
1534 | * B would get 2. | ||
1535 | * | ||
1536 | * task_weight: | ||
1537 | * Part of the rq_weight contributed by tasks; all groups except B would | ||
1538 | * get 1, B gets 2. | ||
1539 | */ | ||
1540 | 1474 | ||
1541 | static inline struct aggregate_struct * | 1475 | return rq->avg_load_per_task; |
1542 | aggregate(struct task_group *tg, struct sched_domain *sd) | ||
1543 | { | ||
1544 | return &tg->cfs_rq[sd->first_cpu]->aggregate; | ||
1545 | } | 1476 | } |
1546 | 1477 | ||
1547 | typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); | 1478 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1479 | |||
1480 | typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); | ||
1548 | 1481 | ||
1549 | /* | 1482 | /* |
1550 | * Iterate the full tree, calling @down when first entering a node and @up when | 1483 | * Iterate the full tree, calling @down when first entering a node and @up when |
1551 | * leaving it for the final time. | 1484 | * leaving it for the final time. |
1552 | */ | 1485 | */ |
1553 | static | 1486 | static void |
1554 | void aggregate_walk_tree(aggregate_func down, aggregate_func up, | 1487 | walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) |
1555 | struct sched_domain *sd) | ||
1556 | { | 1488 | { |
1557 | struct task_group *parent, *child; | 1489 | struct task_group *parent, *child; |
1558 | 1490 | ||
1559 | rcu_read_lock(); | 1491 | rcu_read_lock(); |
1560 | parent = &root_task_group; | 1492 | parent = &root_task_group; |
1561 | down: | 1493 | down: |
1562 | (*down)(parent, sd); | 1494 | (*down)(parent, cpu, sd); |
1563 | list_for_each_entry_rcu(child, &parent->children, siblings) { | 1495 | list_for_each_entry_rcu(child, &parent->children, siblings) { |
1564 | parent = child; | 1496 | parent = child; |
1565 | goto down; | 1497 | goto down; |
@@ -1567,7 +1499,7 @@ down: | |||
1567 | up: | 1499 | up: |
1568 | continue; | 1500 | continue; |
1569 | } | 1501 | } |
1570 | (*up)(parent, sd); | 1502 | (*up)(parent, cpu, sd); |
1571 | 1503 | ||
1572 | child = parent; | 1504 | child = parent; |
1573 | parent = parent->parent; | 1505 | parent = parent->parent; |
@@ -1576,90 +1508,23 @@ up: | |||
1576 | rcu_read_unlock(); | 1508 | rcu_read_unlock(); |
1577 | } | 1509 | } |
1578 | 1510 | ||
1579 | /* | ||
1580 | * Calculate the aggregate runqueue weight. | ||
1581 | */ | ||
1582 | static | ||
1583 | void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd) | ||
1584 | { | ||
1585 | unsigned long rq_weight = 0; | ||
1586 | unsigned long task_weight = 0; | ||
1587 | int i; | ||
1588 | |||
1589 | for_each_cpu_mask(i, sd->span) { | ||
1590 | rq_weight += tg->cfs_rq[i]->load.weight; | ||
1591 | task_weight += tg->cfs_rq[i]->task_weight; | ||
1592 | } | ||
1593 | |||
1594 | aggregate(tg, sd)->rq_weight = rq_weight; | ||
1595 | aggregate(tg, sd)->task_weight = task_weight; | ||
1596 | } | ||
1597 | |||
1598 | /* | ||
1599 | * Compute the weight of this group on the given cpus. | ||
1600 | */ | ||
1601 | static | ||
1602 | void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd) | ||
1603 | { | ||
1604 | unsigned long shares = 0; | ||
1605 | int i; | ||
1606 | |||
1607 | for_each_cpu_mask(i, sd->span) | ||
1608 | shares += tg->cfs_rq[i]->shares; | ||
1609 | |||
1610 | if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares) | ||
1611 | shares = tg->shares; | ||
1612 | |||
1613 | aggregate(tg, sd)->shares = shares; | ||
1614 | } | ||
1615 | |||
1616 | /* | ||
1617 | * Compute the load fraction assigned to this group, relies on the aggregate | ||
1618 | * weight and this group's parent's load, i.e. top-down. | ||
1619 | */ | ||
1620 | static | ||
1621 | void aggregate_group_load(struct task_group *tg, struct sched_domain *sd) | ||
1622 | { | ||
1623 | unsigned long load; | ||
1624 | |||
1625 | if (!tg->parent) { | ||
1626 | int i; | ||
1627 | |||
1628 | load = 0; | ||
1629 | for_each_cpu_mask(i, sd->span) | ||
1630 | load += cpu_rq(i)->load.weight; | ||
1631 | |||
1632 | } else { | ||
1633 | load = aggregate(tg->parent, sd)->load; | ||
1634 | |||
1635 | /* | ||
1636 | * shares is our weight in the parent's rq so | ||
1637 | * shares/parent->rq_weight gives our fraction of the load | ||
1638 | */ | ||
1639 | load *= aggregate(tg, sd)->shares; | ||
1640 | load /= aggregate(tg->parent, sd)->rq_weight + 1; | ||
1641 | } | ||
1642 | |||
1643 | aggregate(tg, sd)->load = load; | ||
1644 | } | ||
1645 | |||
1646 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | 1511 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); |
1647 | 1512 | ||
1648 | /* | 1513 | /* |
1649 | * Calculate and set the cpu's group shares. | 1514 | * Calculate and set the cpu's group shares. |
1650 | */ | 1515 | */ |
1651 | static void | 1516 | static void |
1652 | __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, | 1517 | __update_group_shares_cpu(struct task_group *tg, int cpu, |
1653 | int tcpu) | 1518 | unsigned long sd_shares, unsigned long sd_rq_weight) |
1654 | { | 1519 | { |
1655 | int boost = 0; | 1520 | int boost = 0; |
1656 | unsigned long shares; | 1521 | unsigned long shares; |
1657 | unsigned long rq_weight; | 1522 | unsigned long rq_weight; |
1658 | 1523 | ||
1659 | if (!tg->se[tcpu]) | 1524 | if (!tg->se[cpu]) |
1660 | return; | 1525 | return; |
1661 | 1526 | ||
1662 | rq_weight = tg->cfs_rq[tcpu]->load.weight; | 1527 | rq_weight = tg->cfs_rq[cpu]->load.weight; |
1663 | 1528 | ||
1664 | /* | 1529 | /* |
1665 | * If there are currently no tasks on the cpu pretend there is one of | 1530 | * If there are currently no tasks on the cpu pretend there is one of |
@@ -1671,170 +1536,139 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, | |||
1671 | rq_weight = NICE_0_LOAD; | 1536 | rq_weight = NICE_0_LOAD; |
1672 | } | 1537 | } |
1673 | 1538 | ||
1539 | if (unlikely(rq_weight > sd_rq_weight)) | ||
1540 | rq_weight = sd_rq_weight; | ||
1541 | |||
1674 | /* | 1542 | /* |
1675 | * \Sum shares * rq_weight | 1543 | * \Sum shares * rq_weight |
1676 | * shares = ----------------------- | 1544 | * shares = ----------------------- |
1677 | * \Sum rq_weight | 1545 | * \Sum rq_weight |
1678 | * | 1546 | * |
1679 | */ | 1547 | */ |
1680 | shares = aggregate(tg, sd)->shares * rq_weight; | 1548 | shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); |
1681 | shares /= aggregate(tg, sd)->rq_weight + 1; | ||
1682 | 1549 | ||
1683 | /* | 1550 | /* |
1684 | * record the actual number of shares, not the boosted amount. | 1551 | * record the actual number of shares, not the boosted amount. |
1685 | */ | 1552 | */ |
1686 | tg->cfs_rq[tcpu]->shares = boost ? 0 : shares; | 1553 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; |
1554 | tg->cfs_rq[cpu]->rq_weight = rq_weight; | ||
1687 | 1555 | ||
1688 | if (shares < MIN_SHARES) | 1556 | if (shares < MIN_SHARES) |
1689 | shares = MIN_SHARES; | 1557 | shares = MIN_SHARES; |
1690 | else if (shares > MAX_SHARES) | 1558 | else if (shares > MAX_SHARES) |
1691 | shares = MAX_SHARES; | 1559 | shares = MAX_SHARES; |
1692 | 1560 | ||
1693 | __set_se_shares(tg->se[tcpu], shares); | 1561 | __set_se_shares(tg->se[cpu], shares); |
1694 | } | 1562 | } |
1695 | 1563 | ||
1696 | /* | 1564 | /* |
1697 | * Re-adjust the weights on the cpu the task came from and on the cpu the | 1565 | * Re-compute the task group their per cpu shares over the given domain. |
1698 | * task went to. | 1566 | * This needs to be done in a bottom-up fashion because the rq weight of a |
1567 | * parent group depends on the shares of its child groups. | ||
1699 | */ | 1568 | */ |
1700 | static void | 1569 | static void |
1701 | __move_group_shares(struct task_group *tg, struct sched_domain *sd, | 1570 | tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) |
1702 | int scpu, int dcpu) | ||
1703 | { | 1571 | { |
1704 | unsigned long shares; | 1572 | unsigned long rq_weight = 0; |
1705 | 1573 | unsigned long shares = 0; | |
1706 | shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; | 1574 | int i; |
1707 | 1575 | ||
1708 | __update_group_shares_cpu(tg, sd, scpu); | 1576 | for_each_cpu_mask(i, sd->span) { |
1709 | __update_group_shares_cpu(tg, sd, dcpu); | 1577 | rq_weight += tg->cfs_rq[i]->load.weight; |
1578 | shares += tg->cfs_rq[i]->shares; | ||
1579 | } | ||
1710 | 1580 | ||
1711 | /* | 1581 | if ((!shares && rq_weight) || shares > tg->shares) |
1712 | * ensure we never loose shares due to rounding errors in the | 1582 | shares = tg->shares; |
1713 | * above redistribution. | ||
1714 | */ | ||
1715 | shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; | ||
1716 | if (shares) | ||
1717 | tg->cfs_rq[dcpu]->shares += shares; | ||
1718 | } | ||
1719 | 1583 | ||
1720 | /* | 1584 | if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) |
1721 | * Because changing a group's shares changes the weight of the super-group | 1585 | shares = tg->shares; |
1722 | * we need to walk up the tree and change all shares until we hit the root. | ||
1723 | */ | ||
1724 | static void | ||
1725 | move_group_shares(struct task_group *tg, struct sched_domain *sd, | ||
1726 | int scpu, int dcpu) | ||
1727 | { | ||
1728 | while (tg) { | ||
1729 | __move_group_shares(tg, sd, scpu, dcpu); | ||
1730 | tg = tg->parent; | ||
1731 | } | ||
1732 | } | ||
1733 | 1586 | ||
1734 | static | 1587 | if (!rq_weight) |
1735 | void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) | 1588 | rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; |
1736 | { | ||
1737 | unsigned long shares = aggregate(tg, sd)->shares; | ||
1738 | int i; | ||
1739 | 1589 | ||
1740 | for_each_cpu_mask(i, sd->span) { | 1590 | for_each_cpu_mask(i, sd->span) { |
1741 | struct rq *rq = cpu_rq(i); | 1591 | struct rq *rq = cpu_rq(i); |
1742 | unsigned long flags; | 1592 | unsigned long flags; |
1743 | 1593 | ||
1744 | spin_lock_irqsave(&rq->lock, flags); | 1594 | spin_lock_irqsave(&rq->lock, flags); |
1745 | __update_group_shares_cpu(tg, sd, i); | 1595 | __update_group_shares_cpu(tg, i, shares, rq_weight); |
1746 | spin_unlock_irqrestore(&rq->lock, flags); | 1596 | spin_unlock_irqrestore(&rq->lock, flags); |
1747 | } | 1597 | } |
1748 | |||
1749 | aggregate_group_shares(tg, sd); | ||
1750 | |||
1751 | /* | ||
1752 | * ensure we never loose shares due to rounding errors in the | ||
1753 | * above redistribution. | ||
1754 | */ | ||
1755 | shares -= aggregate(tg, sd)->shares; | ||
1756 | if (shares) { | ||
1757 | tg->cfs_rq[sd->first_cpu]->shares += shares; | ||
1758 | aggregate(tg, sd)->shares += shares; | ||
1759 | } | ||
1760 | } | 1598 | } |
1761 | 1599 | ||
1762 | /* | 1600 | /* |
1763 | * Calculate the accumulative weight and recursive load of each task group | 1601 | * Compute the cpu's hierarchical load factor for each task group. |
1764 | * while walking down the tree. | 1602 | * This needs to be done in a top-down fashion because the load of a child |
1603 | * group is a fraction of its parents load. | ||
1765 | */ | 1604 | */ |
1766 | static | 1605 | static void |
1767 | void aggregate_get_down(struct task_group *tg, struct sched_domain *sd) | 1606 | tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) |
1768 | { | 1607 | { |
1769 | aggregate_group_weight(tg, sd); | 1608 | unsigned long load; |
1770 | aggregate_group_shares(tg, sd); | ||
1771 | aggregate_group_load(tg, sd); | ||
1772 | } | ||
1773 | 1609 | ||
1774 | /* | 1610 | if (!tg->parent) { |
1775 | * Rebalance the cpu shares while walking back up the tree. | 1611 | load = cpu_rq(cpu)->load.weight; |
1776 | */ | 1612 | } else { |
1777 | static | 1613 | load = tg->parent->cfs_rq[cpu]->h_load; |
1778 | void aggregate_get_up(struct task_group *tg, struct sched_domain *sd) | 1614 | load *= tg->cfs_rq[cpu]->shares; |
1779 | { | 1615 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; |
1780 | aggregate_group_set_shares(tg, sd); | 1616 | } |
1781 | } | ||
1782 | 1617 | ||
1783 | static DEFINE_PER_CPU(spinlock_t, aggregate_lock); | 1618 | tg->cfs_rq[cpu]->h_load = load; |
1619 | } | ||
1784 | 1620 | ||
1785 | static void __init init_aggregate(void) | 1621 | static void |
1622 | tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) | ||
1786 | { | 1623 | { |
1787 | int i; | ||
1788 | |||
1789 | for_each_possible_cpu(i) | ||
1790 | spin_lock_init(&per_cpu(aggregate_lock, i)); | ||
1791 | } | 1624 | } |
1792 | 1625 | ||
1793 | static int get_aggregate(struct sched_domain *sd) | 1626 | static void update_shares(struct sched_domain *sd) |
1794 | { | 1627 | { |
1795 | if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu))) | 1628 | u64 now = cpu_clock(raw_smp_processor_id()); |
1796 | return 0; | 1629 | s64 elapsed = now - sd->last_update; |
1797 | 1630 | ||
1798 | aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd); | 1631 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { |
1799 | return 1; | 1632 | sd->last_update = now; |
1633 | walk_tg_tree(tg_nop, tg_shares_up, 0, sd); | ||
1634 | } | ||
1800 | } | 1635 | } |
1801 | 1636 | ||
1802 | static void put_aggregate(struct sched_domain *sd) | 1637 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) |
1803 | { | 1638 | { |
1804 | spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu)); | 1639 | spin_unlock(&rq->lock); |
1640 | update_shares(sd); | ||
1641 | spin_lock(&rq->lock); | ||
1805 | } | 1642 | } |
1806 | 1643 | ||
1807 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | 1644 | static void update_h_load(int cpu) |
1808 | { | 1645 | { |
1809 | cfs_rq->shares = shares; | 1646 | walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); |
1810 | } | 1647 | } |
1811 | 1648 | ||
1812 | #else | 1649 | #else |
1813 | 1650 | ||
1814 | static inline void init_aggregate(void) | 1651 | static inline void update_shares(struct sched_domain *sd) |
1815 | { | 1652 | { |
1816 | } | 1653 | } |
1817 | 1654 | ||
1818 | static inline int get_aggregate(struct sched_domain *sd) | 1655 | static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) |
1819 | { | 1656 | { |
1820 | return 0; | ||
1821 | } | 1657 | } |
1822 | 1658 | ||
1823 | static inline void put_aggregate(struct sched_domain *sd) | ||
1824 | { | ||
1825 | } | ||
1826 | #endif | 1659 | #endif |
1827 | 1660 | ||
1828 | #else /* CONFIG_SMP */ | 1661 | #endif |
1829 | 1662 | ||
1830 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1663 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1831 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | 1664 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) |
1832 | { | 1665 | { |
1666 | #ifdef CONFIG_SMP | ||
1667 | cfs_rq->shares = shares; | ||
1668 | #endif | ||
1833 | } | 1669 | } |
1834 | #endif | 1670 | #endif |
1835 | 1671 | ||
1836 | #endif /* CONFIG_SMP */ | ||
1837 | |||
1838 | #include "sched_stats.h" | 1672 | #include "sched_stats.h" |
1839 | #include "sched_idletask.c" | 1673 | #include "sched_idletask.c" |
1840 | #include "sched_fair.c" | 1674 | #include "sched_fair.c" |
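The replacement code boils the old aggregate machinery down to two per-cpu quantities: cfs_rq->shares, the slice of tg->shares this cpu gets (computed bottom-up by tg_shares_up), and cfs_rq->h_load, the hierarchical load fraction (computed top-down by tg_load_down). A standalone sketch of the shares split used in __update_group_shares_cpu(), with made-up weights (the helper below is a model, not kernel code):

#include <stdio.h>

/* Model of the split: each cpu gets sd_shares scaled by its runqueue weight,
 * shares = (sd_shares * rq_weight) / (sd_rq_weight + 1). */
static unsigned long cpu_share(unsigned long sd_shares,
			       unsigned long rq_weight,
			       unsigned long sd_rq_weight)
{
	return (sd_shares * rq_weight) / (sd_rq_weight + 1);
}

int main(void)
{
	unsigned long tg_shares = 1024;		/* group's configured shares	*/
	unsigned long w[2] = { 2048, 1024 };	/* per-cpu cfs_rq load.weight	*/
	unsigned long sum = w[0] + w[1];

	for (int i = 0; i < 2; i++)
		printf("cpu%d shares = %lu\n", i, cpu_share(tg_shares, w[i], sum));
	/* prints roughly 682 and 341: a 2:1 split of the group's 1024 shares */
	return 0;
}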
@@ -1844,6 +1678,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | |||
1844 | #endif | 1678 | #endif |
1845 | 1679 | ||
1846 | #define sched_class_highest (&rt_sched_class) | 1680 | #define sched_class_highest (&rt_sched_class) |
1681 | #define for_each_class(class) \ | ||
1682 | for (class = sched_class_highest; class; class = class->next) | ||
1847 | 1683 | ||
1848 | static void inc_nr_running(struct rq *rq) | 1684 | static void inc_nr_running(struct rq *rq) |
1849 | { | 1685 | { |
@@ -1876,6 +1712,12 @@ static void set_load_weight(struct task_struct *p) | |||
1876 | p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; | 1712 | p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; |
1877 | } | 1713 | } |
1878 | 1714 | ||
1715 | static void update_avg(u64 *avg, u64 sample) | ||
1716 | { | ||
1717 | s64 diff = sample - *avg; | ||
1718 | *avg += diff >> 3; | ||
1719 | } | ||
1720 | |||
1879 | static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) | 1721 | static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) |
1880 | { | 1722 | { |
1881 | sched_info_queued(p); | 1723 | sched_info_queued(p); |
@@ -1885,6 +1727,13 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) | |||
1885 | 1727 | ||
1886 | static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) | 1728 | static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) |
1887 | { | 1729 | { |
1730 | if (sleep && p->se.last_wakeup) { | ||
1731 | update_avg(&p->se.avg_overlap, | ||
1732 | p->se.sum_exec_runtime - p->se.last_wakeup); | ||
1733 | p->se.last_wakeup = 0; | ||
1734 | } | ||
1735 | |||
1736 | sched_info_dequeued(p); | ||
1888 | p->sched_class->dequeue_task(rq, p, sleep); | 1737 | p->sched_class->dequeue_task(rq, p, sleep); |
1889 | p->se.on_rq = 0; | 1738 | p->se.on_rq = 0; |
1890 | } | 1739 | } |
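update_avg() is a simple exponential moving average with a 1/8 step (diff >> 3); dequeue_task() now feeds it sum_exec_runtime - last_wakeup to maintain se.avg_overlap. A standalone copy showing how it converges on a constant sample:

#include <stdint.h>
#include <stdio.h>

/* Same 1/8-step exponential average as update_avg() above; like the kernel's
 * s64 version it relies on arithmetic right shift for negative differences. */
static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = sample - *avg;

	*avg += diff >> 3;
}

int main(void)
{
	uint64_t avg = 0;

	/* Feed a constant 80000ns overlap: the average moves 1/8 of the
	 * remaining distance each sample (10000, 18750, 26406, ...). */
	for (int i = 1; i <= 5; i++) {
		update_avg(&avg, 80000);
		printf("after sample %d: avg = %llu\n", i, (unsigned long long)avg);
	}
	return 0;
}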
@@ -1968,12 +1817,6 @@ inline int task_curr(const struct task_struct *p) | |||
1968 | return cpu_curr(task_cpu(p)) == p; | 1817 | return cpu_curr(task_cpu(p)) == p; |
1969 | } | 1818 | } |
1970 | 1819 | ||
1971 | /* Used instead of source_load when we know the type == 0 */ | ||
1972 | unsigned long weighted_cpuload(const int cpu) | ||
1973 | { | ||
1974 | return cpu_rq(cpu)->load.weight; | ||
1975 | } | ||
1976 | |||
1977 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | 1820 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) |
1978 | { | 1821 | { |
1979 | set_task_rq(p, cpu); | 1822 | set_task_rq(p, cpu); |
@@ -2002,6 +1845,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
2002 | 1845 | ||
2003 | #ifdef CONFIG_SMP | 1846 | #ifdef CONFIG_SMP |
2004 | 1847 | ||
1848 | /* Used instead of source_load when we know the type == 0 */ | ||
1849 | static unsigned long weighted_cpuload(const int cpu) | ||
1850 | { | ||
1851 | return cpu_rq(cpu)->load.weight; | ||
1852 | } | ||
1853 | |||
2005 | /* | 1854 | /* |
2006 | * Is this task likely cache-hot: | 1855 | * Is this task likely cache-hot: |
2007 | */ | 1856 | */ |
@@ -2212,7 +2061,7 @@ static unsigned long source_load(int cpu, int type) | |||
2212 | struct rq *rq = cpu_rq(cpu); | 2061 | struct rq *rq = cpu_rq(cpu); |
2213 | unsigned long total = weighted_cpuload(cpu); | 2062 | unsigned long total = weighted_cpuload(cpu); |
2214 | 2063 | ||
2215 | if (type == 0) | 2064 | if (type == 0 || !sched_feat(LB_BIAS)) |
2216 | return total; | 2065 | return total; |
2217 | 2066 | ||
2218 | return min(rq->cpu_load[type-1], total); | 2067 | return min(rq->cpu_load[type-1], total); |
@@ -2227,25 +2076,13 @@ static unsigned long target_load(int cpu, int type) | |||
2227 | struct rq *rq = cpu_rq(cpu); | 2076 | struct rq *rq = cpu_rq(cpu); |
2228 | unsigned long total = weighted_cpuload(cpu); | 2077 | unsigned long total = weighted_cpuload(cpu); |
2229 | 2078 | ||
2230 | if (type == 0) | 2079 | if (type == 0 || !sched_feat(LB_BIAS)) |
2231 | return total; | 2080 | return total; |
2232 | 2081 | ||
2233 | return max(rq->cpu_load[type-1], total); | 2082 | return max(rq->cpu_load[type-1], total); |
2234 | } | 2083 | } |
2235 | 2084 | ||
2236 | /* | 2085 | /* |
2237 | * Return the average load per task on the cpu's run queue | ||
2238 | */ | ||
2239 | static unsigned long cpu_avg_load_per_task(int cpu) | ||
2240 | { | ||
2241 | struct rq *rq = cpu_rq(cpu); | ||
2242 | unsigned long total = weighted_cpuload(cpu); | ||
2243 | unsigned long n = rq->nr_running; | ||
2244 | |||
2245 | return n ? total / n : SCHED_LOAD_SCALE; | ||
2246 | } | ||
2247 | |||
2248 | /* | ||
2249 | * find_idlest_group finds and returns the least busy CPU group within the | 2086 | * find_idlest_group finds and returns the least busy CPU group within the |
2250 | * domain. | 2087 | * domain. |
2251 | */ | 2088 | */ |
@@ -2351,6 +2188,9 @@ static int sched_balance_self(int cpu, int flag) | |||
2351 | sd = tmp; | 2188 | sd = tmp; |
2352 | } | 2189 | } |
2353 | 2190 | ||
2191 | if (sd) | ||
2192 | update_shares(sd); | ||
2193 | |||
2354 | while (sd) { | 2194 | while (sd) { |
2355 | cpumask_t span, tmpmask; | 2195 | cpumask_t span, tmpmask; |
2356 | struct sched_group *group; | 2196 | struct sched_group *group; |
@@ -2417,6 +2257,22 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
2417 | if (!sched_feat(SYNC_WAKEUPS)) | 2257 | if (!sched_feat(SYNC_WAKEUPS)) |
2418 | sync = 0; | 2258 | sync = 0; |
2419 | 2259 | ||
2260 | #ifdef CONFIG_SMP | ||
2261 | if (sched_feat(LB_WAKEUP_UPDATE)) { | ||
2262 | struct sched_domain *sd; | ||
2263 | |||
2264 | this_cpu = raw_smp_processor_id(); | ||
2265 | cpu = task_cpu(p); | ||
2266 | |||
2267 | for_each_domain(this_cpu, sd) { | ||
2268 | if (cpu_isset(cpu, sd->span)) { | ||
2269 | update_shares(sd); | ||
2270 | break; | ||
2271 | } | ||
2272 | } | ||
2273 | } | ||
2274 | #endif | ||
2275 | |||
2420 | smp_wmb(); | 2276 | smp_wmb(); |
2421 | rq = task_rq_lock(p, &flags); | 2277 | rq = task_rq_lock(p, &flags); |
2422 | old_state = p->state; | 2278 | old_state = p->state; |
@@ -2463,7 +2319,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
2463 | } | 2319 | } |
2464 | } | 2320 | } |
2465 | } | 2321 | } |
2466 | #endif | 2322 | #endif /* CONFIG_SCHEDSTATS */ |
2467 | 2323 | ||
2468 | out_activate: | 2324 | out_activate: |
2469 | #endif /* CONFIG_SMP */ | 2325 | #endif /* CONFIG_SMP */ |
@@ -2481,6 +2337,9 @@ out_activate: | |||
2481 | success = 1; | 2337 | success = 1; |
2482 | 2338 | ||
2483 | out_running: | 2339 | out_running: |
2340 | trace_mark(kernel_sched_wakeup, | ||
2341 | "pid %d state %ld ## rq %p task %p rq->curr %p", | ||
2342 | p->pid, p->state, rq, p, rq->curr); | ||
2484 | check_preempt_curr(rq, p); | 2343 | check_preempt_curr(rq, p); |
2485 | 2344 | ||
2486 | p->state = TASK_RUNNING; | 2345 | p->state = TASK_RUNNING; |
@@ -2489,6 +2348,8 @@ out_running: | |||
2489 | p->sched_class->task_wake_up(rq, p); | 2348 | p->sched_class->task_wake_up(rq, p); |
2490 | #endif | 2349 | #endif |
2491 | out: | 2350 | out: |
2351 | current->se.last_wakeup = current->se.sum_exec_runtime; | ||
2352 | |||
2492 | task_rq_unlock(rq, &flags); | 2353 | task_rq_unlock(rq, &flags); |
2493 | 2354 | ||
2494 | return success; | 2355 | return success; |
@@ -2611,6 +2472,9 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
2611 | p->sched_class->task_new(rq, p); | 2472 | p->sched_class->task_new(rq, p); |
2612 | inc_nr_running(rq); | 2473 | inc_nr_running(rq); |
2613 | } | 2474 | } |
2475 | trace_mark(kernel_sched_wakeup_new, | ||
2476 | "pid %d state %ld ## rq %p task %p rq->curr %p", | ||
2477 | p->pid, p->state, rq, p, rq->curr); | ||
2614 | check_preempt_curr(rq, p); | 2478 | check_preempt_curr(rq, p); |
2615 | #ifdef CONFIG_SMP | 2479 | #ifdef CONFIG_SMP |
2616 | if (p->sched_class->task_wake_up) | 2480 | if (p->sched_class->task_wake_up) |
@@ -2663,7 +2527,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, | |||
2663 | notifier->ops->sched_out(notifier, next); | 2527 | notifier->ops->sched_out(notifier, next); |
2664 | } | 2528 | } |
2665 | 2529 | ||
2666 | #else | 2530 | #else /* !CONFIG_PREEMPT_NOTIFIERS */ |
2667 | 2531 | ||
2668 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) | 2532 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) |
2669 | { | 2533 | { |
@@ -2675,7 +2539,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, | |||
2675 | { | 2539 | { |
2676 | } | 2540 | } |
2677 | 2541 | ||
2678 | #endif | 2542 | #endif /* CONFIG_PREEMPT_NOTIFIERS */ |
2679 | 2543 | ||
2680 | /** | 2544 | /** |
2681 | * prepare_task_switch - prepare to switch tasks | 2545 | * prepare_task_switch - prepare to switch tasks |
@@ -2783,6 +2647,11 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2783 | struct mm_struct *mm, *oldmm; | 2647 | struct mm_struct *mm, *oldmm; |
2784 | 2648 | ||
2785 | prepare_task_switch(rq, prev, next); | 2649 | prepare_task_switch(rq, prev, next); |
2650 | trace_mark(kernel_sched_schedule, | ||
2651 | "prev_pid %d next_pid %d prev_state %ld " | ||
2652 | "## rq %p prev %p next %p", | ||
2653 | prev->pid, next->pid, prev->state, | ||
2654 | rq, prev, next); | ||
2786 | mm = next->mm; | 2655 | mm = next->mm; |
2787 | oldmm = prev->active_mm; | 2656 | oldmm = prev->active_mm; |
2788 | /* | 2657 | /* |
@@ -3117,7 +2986,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
3117 | enum cpu_idle_type idle, int *all_pinned, | 2986 | enum cpu_idle_type idle, int *all_pinned, |
3118 | int *this_best_prio, struct rq_iterator *iterator) | 2987 | int *this_best_prio, struct rq_iterator *iterator) |
3119 | { | 2988 | { |
3120 | int loops = 0, pulled = 0, pinned = 0, skip_for_load; | 2989 | int loops = 0, pulled = 0, pinned = 0; |
3121 | struct task_struct *p; | 2990 | struct task_struct *p; |
3122 | long rem_load_move = max_load_move; | 2991 | long rem_load_move = max_load_move; |
3123 | 2992 | ||
@@ -3133,14 +3002,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
3133 | next: | 3002 | next: |
3134 | if (!p || loops++ > sysctl_sched_nr_migrate) | 3003 | if (!p || loops++ > sysctl_sched_nr_migrate) |
3135 | goto out; | 3004 | goto out; |
3136 | /* | 3005 | |
3137 | * To help distribute high priority tasks across CPUs we don't | 3006 | if ((p->se.load.weight >> 1) > rem_load_move || |
3138 | * skip a task if it will be the highest priority task (i.e. smallest | ||
3139 | * prio value) on its new queue regardless of its load weight | ||
3140 | */ | ||
3141 | skip_for_load = (p->se.load.weight >> 1) > rem_load_move + | ||
3142 | SCHED_LOAD_SCALE_FUZZ; | ||
3143 | if ((skip_for_load && p->prio >= *this_best_prio) || | ||
3144 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { | 3007 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { |
3145 | p = iterator->next(iterator->arg); | 3008 | p = iterator->next(iterator->arg); |
3146 | goto next; | 3009 | goto next; |
@@ -3195,6 +3058,10 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
3195 | max_load_move - total_load_moved, | 3058 | max_load_move - total_load_moved, |
3196 | sd, idle, all_pinned, &this_best_prio); | 3059 | sd, idle, all_pinned, &this_best_prio); |
3197 | class = class->next; | 3060 | class = class->next; |
3061 | |||
3062 | if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) | ||
3063 | break; | ||
3064 | |||
3198 | } while (class && max_load_move > total_load_moved); | 3065 | } while (class && max_load_move > total_load_moved); |
3199 | 3066 | ||
3200 | return total_load_moved > 0; | 3067 | return total_load_moved > 0; |
@@ -3271,6 +3138,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
3271 | max_load = this_load = total_load = total_pwr = 0; | 3138 | max_load = this_load = total_load = total_pwr = 0; |
3272 | busiest_load_per_task = busiest_nr_running = 0; | 3139 | busiest_load_per_task = busiest_nr_running = 0; |
3273 | this_load_per_task = this_nr_running = 0; | 3140 | this_load_per_task = this_nr_running = 0; |
3141 | |||
3274 | if (idle == CPU_NOT_IDLE) | 3142 | if (idle == CPU_NOT_IDLE) |
3275 | load_idx = sd->busy_idx; | 3143 | load_idx = sd->busy_idx; |
3276 | else if (idle == CPU_NEWLY_IDLE) | 3144 | else if (idle == CPU_NEWLY_IDLE) |
@@ -3285,6 +3153,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
3285 | int __group_imb = 0; | 3153 | int __group_imb = 0; |
3286 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | 3154 | unsigned int balance_cpu = -1, first_idle_cpu = 0; |
3287 | unsigned long sum_nr_running, sum_weighted_load; | 3155 | unsigned long sum_nr_running, sum_weighted_load; |
3156 | unsigned long sum_avg_load_per_task; | ||
3157 | unsigned long avg_load_per_task; | ||
3288 | 3158 | ||
3289 | local_group = cpu_isset(this_cpu, group->cpumask); | 3159 | local_group = cpu_isset(this_cpu, group->cpumask); |
3290 | 3160 | ||
@@ -3293,6 +3163,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
3293 | 3163 | ||
3294 | /* Tally up the load of all CPUs in the group */ | 3164 | /* Tally up the load of all CPUs in the group */ |
3295 | sum_weighted_load = sum_nr_running = avg_load = 0; | 3165 | sum_weighted_load = sum_nr_running = avg_load = 0; |
3166 | sum_avg_load_per_task = avg_load_per_task = 0; | ||
3167 | |||
3296 | max_cpu_load = 0; | 3168 | max_cpu_load = 0; |
3297 | min_cpu_load = ~0UL; | 3169 | min_cpu_load = ~0UL; |
3298 | 3170 | ||
@@ -3326,6 +3198,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
3326 | avg_load += load; | 3198 | avg_load += load; |
3327 | sum_nr_running += rq->nr_running; | 3199 | sum_nr_running += rq->nr_running; |
3328 | sum_weighted_load += weighted_cpuload(i); | 3200 | sum_weighted_load += weighted_cpuload(i); |
3201 | |||
3202 | sum_avg_load_per_task += cpu_avg_load_per_task(i); | ||
3329 | } | 3203 | } |
3330 | 3204 | ||
3331 | /* | 3205 | /* |
@@ -3347,7 +3221,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
3347 | avg_load = sg_div_cpu_power(group, | 3221 | avg_load = sg_div_cpu_power(group, |
3348 | avg_load * SCHED_LOAD_SCALE); | 3222 | avg_load * SCHED_LOAD_SCALE); |
3349 | 3223 | ||
3350 | if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE) | 3224 | |
3225 | /* | ||
3226 | * Consider the group unbalanced when the imbalance is larger | ||
3227 | * than the average weight of two tasks. | ||
3228 | * | ||
3229 | * APZ: with cgroup the avg task weight can vary wildly and | ||
3230 | * might not be a suitable number - should we keep a | ||
3231 | * normalized nr_running number somewhere that negates | ||
3232 | * the hierarchy? | ||
3233 | */ | ||
3234 | avg_load_per_task = sg_div_cpu_power(group, | ||
3235 | sum_avg_load_per_task * SCHED_LOAD_SCALE); | ||
3236 | |||
3237 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | ||
3351 | __group_imb = 1; | 3238 | __group_imb = 1; |
3352 | 3239 | ||
3353 | group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; | 3240 | group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; |
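The group-imbalance test now scales with the observed average task weight instead of the fixed SCHED_LOAD_SCALE: a group is flagged when the spread between its most and least loaded cpu exceeds twice avg_load_per_task, which is what the APZ comment is qualifying for the cgroup case. A toy check with made-up loads, assuming uniform cpu_power so the sg_div_cpu_power() scaling cancels out:

#include <stdio.h>

int main(void)
{
	unsigned long max_cpu_load = 4096, min_cpu_load = 1024;	/* made-up weights */
	unsigned long avg_load_per_task = 1024;			/* e.g. nice-0 tasks */

	if ((max_cpu_load - min_cpu_load) > 2 * avg_load_per_task)
		printf("__group_imb set: 3072 > 2048\n");
	else
		printf("group considered balanced\n");
	return 0;
}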
@@ -3488,9 +3375,9 @@ small_imbalance: | |||
3488 | if (busiest_load_per_task > this_load_per_task) | 3375 | if (busiest_load_per_task > this_load_per_task) |
3489 | imbn = 1; | 3376 | imbn = 1; |
3490 | } else | 3377 | } else |
3491 | this_load_per_task = SCHED_LOAD_SCALE; | 3378 | this_load_per_task = cpu_avg_load_per_task(this_cpu); |
3492 | 3379 | ||
3493 | if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= | 3380 | if (max_load - this_load + 2*busiest_load_per_task >= |
3494 | busiest_load_per_task * imbn) { | 3381 | busiest_load_per_task * imbn) { |
3495 | *imbalance = busiest_load_per_task; | 3382 | *imbalance = busiest_load_per_task; |
3496 | return busiest; | 3383 | return busiest; |
@@ -3600,12 +3487,9 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
3600 | unsigned long imbalance; | 3487 | unsigned long imbalance; |
3601 | struct rq *busiest; | 3488 | struct rq *busiest; |
3602 | unsigned long flags; | 3489 | unsigned long flags; |
3603 | int unlock_aggregate; | ||
3604 | 3490 | ||
3605 | cpus_setall(*cpus); | 3491 | cpus_setall(*cpus); |
3606 | 3492 | ||
3607 | unlock_aggregate = get_aggregate(sd); | ||
3608 | |||
3609 | /* | 3493 | /* |
3610 | * When power savings policy is enabled for the parent domain, idle | 3494 | * When power savings policy is enabled for the parent domain, idle |
3611 | * sibling can pick up load irrespective of busy siblings. In this case, | 3495 | * sibling can pick up load irrespective of busy siblings. In this case, |
@@ -3619,6 +3503,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
3619 | schedstat_inc(sd, lb_count[idle]); | 3503 | schedstat_inc(sd, lb_count[idle]); |
3620 | 3504 | ||
3621 | redo: | 3505 | redo: |
3506 | update_shares(sd); | ||
3622 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 3507 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, |
3623 | cpus, balance); | 3508 | cpus, balance); |
3624 | 3509 | ||
@@ -3742,8 +3627,8 @@ out_one_pinned: | |||
3742 | else | 3627 | else |
3743 | ld_moved = 0; | 3628 | ld_moved = 0; |
3744 | out: | 3629 | out: |
3745 | if (unlock_aggregate) | 3630 | if (ld_moved) |
3746 | put_aggregate(sd); | 3631 | update_shares(sd); |
3747 | return ld_moved; | 3632 | return ld_moved; |
3748 | } | 3633 | } |
3749 | 3634 | ||
@@ -3779,6 +3664,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, | |||
3779 | 3664 | ||
3780 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); | 3665 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); |
3781 | redo: | 3666 | redo: |
3667 | update_shares_locked(this_rq, sd); | ||
3782 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, | 3668 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, |
3783 | &sd_idle, cpus, NULL); | 3669 | &sd_idle, cpus, NULL); |
3784 | if (!group) { | 3670 | if (!group) { |
@@ -3822,6 +3708,7 @@ redo: | |||
3822 | } else | 3708 | } else |
3823 | sd->nr_balance_failed = 0; | 3709 | sd->nr_balance_failed = 0; |
3824 | 3710 | ||
3711 | update_shares_locked(this_rq, sd); | ||
3825 | return ld_moved; | 3712 | return ld_moved; |
3826 | 3713 | ||
3827 | out_balanced: | 3714 | out_balanced: |
@@ -4013,6 +3900,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
4013 | /* Earliest time when we have to do rebalance again */ | 3900 | /* Earliest time when we have to do rebalance again */ |
4014 | unsigned long next_balance = jiffies + 60*HZ; | 3901 | unsigned long next_balance = jiffies + 60*HZ; |
4015 | int update_next_balance = 0; | 3902 | int update_next_balance = 0; |
3903 | int need_serialize; | ||
4016 | cpumask_t tmp; | 3904 | cpumask_t tmp; |
4017 | 3905 | ||
4018 | for_each_domain(cpu, sd) { | 3906 | for_each_domain(cpu, sd) { |
@@ -4030,8 +3918,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
4030 | if (interval > HZ*NR_CPUS/10) | 3918 | if (interval > HZ*NR_CPUS/10) |
4031 | interval = HZ*NR_CPUS/10; | 3919 | interval = HZ*NR_CPUS/10; |
4032 | 3920 | ||
3921 | need_serialize = sd->flags & SD_SERIALIZE; | ||
4033 | 3922 | ||
4034 | if (sd->flags & SD_SERIALIZE) { | 3923 | if (need_serialize) { |
4035 | if (!spin_trylock(&balancing)) | 3924 | if (!spin_trylock(&balancing)) |
4036 | goto out; | 3925 | goto out; |
4037 | } | 3926 | } |
@@ -4047,7 +3936,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
4047 | } | 3936 | } |
4048 | sd->last_balance = jiffies; | 3937 | sd->last_balance = jiffies; |
4049 | } | 3938 | } |
4050 | if (sd->flags & SD_SERIALIZE) | 3939 | if (need_serialize) |
4051 | spin_unlock(&balancing); | 3940 | spin_unlock(&balancing); |
4052 | out: | 3941 | out: |
4053 | if (time_after(next_balance, sd->last_balance + interval)) { | 3942 | if (time_after(next_balance, sd->last_balance + interval)) { |
@@ -4362,26 +4251,44 @@ void scheduler_tick(void) | |||
4362 | #endif | 4251 | #endif |
4363 | } | 4252 | } |
4364 | 4253 | ||
4365 | #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) | 4254 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ |
4255 | defined(CONFIG_PREEMPT_TRACER)) | ||
4256 | |||
4257 | static inline unsigned long get_parent_ip(unsigned long addr) | ||
4258 | { | ||
4259 | if (in_lock_functions(addr)) { | ||
4260 | addr = CALLER_ADDR2; | ||
4261 | if (in_lock_functions(addr)) | ||
4262 | addr = CALLER_ADDR3; | ||
4263 | } | ||
4264 | return addr; | ||
4265 | } | ||
4366 | 4266 | ||
4367 | void __kprobes add_preempt_count(int val) | 4267 | void __kprobes add_preempt_count(int val) |
4368 | { | 4268 | { |
4269 | #ifdef CONFIG_DEBUG_PREEMPT | ||
4369 | /* | 4270 | /* |
4370 | * Underflow? | 4271 | * Underflow? |
4371 | */ | 4272 | */ |
4372 | if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) | 4273 | if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) |
4373 | return; | 4274 | return; |
4275 | #endif | ||
4374 | preempt_count() += val; | 4276 | preempt_count() += val; |
4277 | #ifdef CONFIG_DEBUG_PREEMPT | ||
4375 | /* | 4278 | /* |
4376 | * Spinlock count overflowing soon? | 4279 | * Spinlock count overflowing soon? |
4377 | */ | 4280 | */ |
4378 | DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= | 4281 | DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= |
4379 | PREEMPT_MASK - 10); | 4282 | PREEMPT_MASK - 10); |
4283 | #endif | ||
4284 | if (preempt_count() == val) | ||
4285 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | ||
4380 | } | 4286 | } |
4381 | EXPORT_SYMBOL(add_preempt_count); | 4287 | EXPORT_SYMBOL(add_preempt_count); |
4382 | 4288 | ||
4383 | void __kprobes sub_preempt_count(int val) | 4289 | void __kprobes sub_preempt_count(int val) |
4384 | { | 4290 | { |
4291 | #ifdef CONFIG_DEBUG_PREEMPT | ||
4385 | /* | 4292 | /* |
4386 | * Underflow? | 4293 | * Underflow? |
4387 | */ | 4294 | */ |
@@ -4393,7 +4300,10 @@ void __kprobes sub_preempt_count(int val) | |||
4393 | if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && | 4300 | if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && |
4394 | !(preempt_count() & PREEMPT_MASK))) | 4301 | !(preempt_count() & PREEMPT_MASK))) |
4395 | return; | 4302 | return; |
4303 | #endif | ||
4396 | 4304 | ||
4305 | if (preempt_count() == val) | ||
4306 | trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | ||
4397 | preempt_count() -= val; | 4307 | preempt_count() -= val; |
4398 | } | 4308 | } |
4399 | EXPORT_SYMBOL(sub_preempt_count); | 4309 | EXPORT_SYMBOL(sub_preempt_count); |
@@ -4411,6 +4321,7 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
4411 | prev->comm, prev->pid, preempt_count()); | 4321 | prev->comm, prev->pid, preempt_count()); |
4412 | 4322 | ||
4413 | debug_show_held_locks(prev); | 4323 | debug_show_held_locks(prev); |
4324 | print_modules(); | ||
4414 | if (irqs_disabled()) | 4325 | if (irqs_disabled()) |
4415 | print_irqtrace_events(prev); | 4326 | print_irqtrace_events(prev); |
4416 | 4327 | ||
@@ -4430,7 +4341,7 @@ static inline void schedule_debug(struct task_struct *prev) | |||
4430 | * schedule() atomically, we ignore that path for now. | 4341 | * schedule() atomically, we ignore that path for now. |
4431 | * Otherwise, whine if we are scheduling when we should not be. | 4342 | * Otherwise, whine if we are scheduling when we should not be. |
4432 | */ | 4343 | */ |
4433 | if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state)) | 4344 | if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) |
4434 | __schedule_bug(prev); | 4345 | __schedule_bug(prev); |
4435 | 4346 | ||
4436 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 4347 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
@@ -4484,7 +4395,7 @@ asmlinkage void __sched schedule(void) | |||
4484 | struct task_struct *prev, *next; | 4395 | struct task_struct *prev, *next; |
4485 | unsigned long *switch_count; | 4396 | unsigned long *switch_count; |
4486 | struct rq *rq; | 4397 | struct rq *rq; |
4487 | int cpu; | 4398 | int cpu, hrtick = sched_feat(HRTICK); |
4488 | 4399 | ||
4489 | need_resched: | 4400 | need_resched: |
4490 | preempt_disable(); | 4401 | preempt_disable(); |
@@ -4499,7 +4410,8 @@ need_resched_nonpreemptible: | |||
4499 | 4410 | ||
4500 | schedule_debug(prev); | 4411 | schedule_debug(prev); |
4501 | 4412 | ||
4502 | hrtick_clear(rq); | 4413 | if (hrtick) |
4414 | hrtick_clear(rq); | ||
4503 | 4415 | ||
4504 | /* | 4416 | /* |
4505 | * Do the rq-clock update outside the rq lock: | 4417 | * Do the rq-clock update outside the rq lock: |
@@ -4510,12 +4422,10 @@ need_resched_nonpreemptible: | |||
4510 | clear_tsk_need_resched(prev); | 4422 | clear_tsk_need_resched(prev); |
4511 | 4423 | ||
4512 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 4424 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
4513 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && | 4425 | if (unlikely(signal_pending_state(prev->state, prev))) |
4514 | signal_pending(prev))) { | ||
4515 | prev->state = TASK_RUNNING; | 4426 | prev->state = TASK_RUNNING; |
4516 | } else { | 4427 | else |
4517 | deactivate_task(rq, prev, 1); | 4428 | deactivate_task(rq, prev, 1); |
4518 | } | ||
4519 | switch_count = &prev->nvcsw; | 4429 | switch_count = &prev->nvcsw; |
4520 | } | 4430 | } |
4521 | 4431 | ||
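Editor's note: the old open-coded test only woke TASK_INTERRUPTIBLE sleepers on any pending signal; signal_pending_state() folds that together with the killable case. A userspace sketch of the decision it is expected to make (not the real helper; the flag values and parameter names are illustrative stand-ins):

#include <stdio.h>

#define ST_INTERRUPTIBLE 0x1
#define ST_WAKEKILL      0x2

static int pending_for_state(long state, int any_pending, int fatal_pending)
{
	if (!(state & (ST_INTERRUPTIBLE | ST_WAKEKILL)))
		return 0;			/* uninterruptible: signals ignored */
	if (!any_pending)
		return 0;
	return (state & ST_INTERRUPTIBLE) || fatal_pending;
}

int main(void)
{
	/* a killable sleeper ignores an ordinary signal but reacts to a fatal one */
	printf("%d %d\n",
	       pending_for_state(ST_WAKEKILL, 1, 0),
	       pending_for_state(ST_WAKEKILL, 1, 1));
	return 0;
}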
@@ -4547,7 +4457,8 @@ need_resched_nonpreemptible: | |||
4547 | } else | 4457 | } else |
4548 | spin_unlock_irq(&rq->lock); | 4458 | spin_unlock_irq(&rq->lock); |
4549 | 4459 | ||
4550 | hrtick_set(rq); | 4460 | if (hrtick) |
4461 | hrtick_set(rq); | ||
4551 | 4462 | ||
4552 | if (unlikely(reacquire_kernel_lock(current) < 0)) | 4463 | if (unlikely(reacquire_kernel_lock(current) < 0)) |
4553 | goto need_resched_nonpreemptible; | 4464 | goto need_resched_nonpreemptible; |
@@ -4741,22 +4652,20 @@ do_wait_for_common(struct completion *x, long timeout, int state) | |||
4741 | signal_pending(current)) || | 4652 | signal_pending(current)) || |
4742 | (state == TASK_KILLABLE && | 4653 | (state == TASK_KILLABLE && |
4743 | fatal_signal_pending(current))) { | 4654 | fatal_signal_pending(current))) { |
4744 | __remove_wait_queue(&x->wait, &wait); | 4655 | timeout = -ERESTARTSYS; |
4745 | return -ERESTARTSYS; | 4656 | break; |
4746 | } | 4657 | } |
4747 | __set_current_state(state); | 4658 | __set_current_state(state); |
4748 | spin_unlock_irq(&x->wait.lock); | 4659 | spin_unlock_irq(&x->wait.lock); |
4749 | timeout = schedule_timeout(timeout); | 4660 | timeout = schedule_timeout(timeout); |
4750 | spin_lock_irq(&x->wait.lock); | 4661 | spin_lock_irq(&x->wait.lock); |
4751 | if (!timeout) { | 4662 | } while (!x->done && timeout); |
4752 | __remove_wait_queue(&x->wait, &wait); | ||
4753 | return timeout; | ||
4754 | } | ||
4755 | } while (!x->done); | ||
4756 | __remove_wait_queue(&x->wait, &wait); | 4663 | __remove_wait_queue(&x->wait, &wait); |
4664 | if (!x->done) | ||
4665 | return timeout; | ||
4757 | } | 4666 | } |
4758 | x->done--; | 4667 | x->done--; |
4759 | return timeout; | 4668 | return timeout ?: 1; |
4760 | } | 4669 | } |
4761 | 4670 | ||
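Editor's note: the do_wait_for_common() rework folds the error into the timeout variable, keeps a single __remove_wait_queue() exit, and returns "timeout ?: 1" (the GNU "a ?: b" extension, i.e. a ? a : b) so a completion that consumed the whole timeout still reports success. A runnable userspace sketch of that shape (not kernel code; wait_sketch and its parameters are illustrative, and the ?: form needs GNU C):

#include <stdio.h>

static long wait_sketch(int *done, long timeout, int interrupted)
{
	do {
		if (interrupted) {
			timeout = -1;		/* stand-in for -ERESTARTSYS */
			break;
		}
		timeout--;			/* stand-in for schedule_timeout() */
	} while (!*done && timeout);

	/* single cleanup point, like the single __remove_wait_queue() above */
	if (!*done)
		return timeout;			/* 0 = timed out, negative = signal */

	*done -= 1;
	return timeout ?: 1;			/* GNU extension: timeout ? timeout : 1 */
}

int main(void)
{
	int done = 1;
	printf("%ld\n", wait_sketch(&done, 3, 0));	/* completed: positive */
	done = 0;
	printf("%ld\n", wait_sketch(&done, 3, 0));	/* timed out: 0 */
	return 0;
}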
4762 | static long __sched | 4671 | static long __sched |
@@ -5086,16 +4995,8 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | |||
5086 | set_load_weight(p); | 4995 | set_load_weight(p); |
5087 | } | 4996 | } |
5088 | 4997 | ||
5089 | /** | 4998 | static int __sched_setscheduler(struct task_struct *p, int policy, |
5090 | * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. | 4999 | struct sched_param *param, bool user) |
5091 | * @p: the task in question. | ||
5092 | * @policy: new policy. | ||
5093 | * @param: structure containing the new RT priority. | ||
5094 | * | ||
5095 | * NOTE that the task may be already dead. | ||
5096 | */ | ||
5097 | int sched_setscheduler(struct task_struct *p, int policy, | ||
5098 | struct sched_param *param) | ||
5099 | { | 5000 | { |
5100 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 5001 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
5101 | unsigned long flags; | 5002 | unsigned long flags; |
@@ -5127,7 +5028,7 @@ recheck: | |||
5127 | /* | 5028 | /* |
5128 | * Allow unprivileged RT tasks to decrease priority: | 5029 | * Allow unprivileged RT tasks to decrease priority: |
5129 | */ | 5030 | */ |
5130 | if (!capable(CAP_SYS_NICE)) { | 5031 | if (user && !capable(CAP_SYS_NICE)) { |
5131 | if (rt_policy(policy)) { | 5032 | if (rt_policy(policy)) { |
5132 | unsigned long rlim_rtprio; | 5033 | unsigned long rlim_rtprio; |
5133 | 5034 | ||
@@ -5163,7 +5064,8 @@ recheck: | |||
5163 | * Do not allow realtime tasks into groups that have no runtime | 5064 | * Do not allow realtime tasks into groups that have no runtime |
5164 | * assigned. | 5065 | * assigned. |
5165 | */ | 5066 | */ |
5166 | if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) | 5067 | if (user |
5068 | && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) | ||
5167 | return -EPERM; | 5069 | return -EPERM; |
5168 | #endif | 5070 | #endif |
5169 | 5071 | ||
@@ -5212,8 +5114,39 @@ recheck: | |||
5212 | 5114 | ||
5213 | return 0; | 5115 | return 0; |
5214 | } | 5116 | } |
5117 | |||
5118 | /** | ||
5119 | * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. | ||
5120 | * @p: the task in question. | ||
5121 | * @policy: new policy. | ||
5122 | * @param: structure containing the new RT priority. | ||
5123 | * | ||
5124 | * NOTE that the task may be already dead. | ||
5125 | */ | ||
5126 | int sched_setscheduler(struct task_struct *p, int policy, | ||
5127 | struct sched_param *param) | ||
5128 | { | ||
5129 | return __sched_setscheduler(p, policy, param, true); | ||
5130 | } | ||
5215 | EXPORT_SYMBOL_GPL(sched_setscheduler); | 5131 | EXPORT_SYMBOL_GPL(sched_setscheduler); |
5216 | 5132 | ||
5133 | /** | ||
5134 | * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. | ||
5135 | * @p: the task in question. | ||
5136 | * @policy: new policy. | ||
5137 | * @param: structure containing the new RT priority. | ||
5138 | * | ||
5139 | * Just like sched_setscheduler, only don't bother checking if the | ||
5140 | * current context has permission. For example, this is needed in | ||
5141 | * stop_machine(): we create temporary high priority worker threads, | ||
5142 | * but our caller might not have that capability. | ||
5143 | */ | ||
5144 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, | ||
5145 | struct sched_param *param) | ||
5146 | { | ||
5147 | return __sched_setscheduler(p, policy, param, false); | ||
5148 | } | ||
5149 | |||
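Editor's note: the kerneldoc above names stop_machine() as the motivating caller, an in-kernel thread that must become SCHED_FIFO even though the surrounding context lacks CAP_SYS_NICE. A hedged kernel-style sketch of that use (kernel context and the usual kthread/sched headers assumed; my_boost_thread is illustrative, only sched_setscheduler_nocheck() itself comes from the hunk above):

static int my_boost_thread(void *unused)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };

	/* trusted in-kernel caller: skip the CAP_SYS_NICE check */
	sched_setscheduler_nocheck(current, SCHED_FIFO, &param);

	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
	}
	return 0;
}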
5217 | static int | 5150 | static int |
5218 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | 5151 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) |
5219 | { | 5152 | { |
@@ -5412,24 +5345,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, | |||
5412 | return sched_setaffinity(pid, &new_mask); | 5345 | return sched_setaffinity(pid, &new_mask); |
5413 | } | 5346 | } |
5414 | 5347 | ||
5415 | /* | ||
5416 | * Represents all cpu's present in the system | ||
5417 | * In systems capable of hotplug, this map could dynamically grow | ||
5418 | * as new cpu's are detected in the system via any platform specific | ||
5419 | * method, such as ACPI for e.g. | ||
5420 | */ | ||
5421 | |||
5422 | cpumask_t cpu_present_map __read_mostly; | ||
5423 | EXPORT_SYMBOL(cpu_present_map); | ||
5424 | |||
5425 | #ifndef CONFIG_SMP | ||
5426 | cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; | ||
5427 | EXPORT_SYMBOL(cpu_online_map); | ||
5428 | |||
5429 | cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; | ||
5430 | EXPORT_SYMBOL(cpu_possible_map); | ||
5431 | #endif | ||
5432 | |||
5433 | long sched_getaffinity(pid_t pid, cpumask_t *mask) | 5348 | long sched_getaffinity(pid_t pid, cpumask_t *mask) |
5434 | { | 5349 | { |
5435 | struct task_struct *p; | 5350 | struct task_struct *p; |
@@ -5726,7 +5641,7 @@ out_unlock: | |||
5726 | return retval; | 5641 | return retval; |
5727 | } | 5642 | } |
5728 | 5643 | ||
5729 | static const char stat_nam[] = "RSDTtZX"; | 5644 | static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; |
5730 | 5645 | ||
5731 | void sched_show_task(struct task_struct *p) | 5646 | void sched_show_task(struct task_struct *p) |
5732 | { | 5647 | { |
@@ -5913,6 +5828,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) | |||
5913 | goto out; | 5828 | goto out; |
5914 | } | 5829 | } |
5915 | 5830 | ||
5831 | if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && | ||
5832 | !cpus_equal(p->cpus_allowed, *new_mask))) { | ||
5833 | ret = -EINVAL; | ||
5834 | goto out; | ||
5835 | } | ||
5836 | |||
5916 | if (p->sched_class->set_cpus_allowed) | 5837 | if (p->sched_class->set_cpus_allowed) |
5917 | p->sched_class->set_cpus_allowed(p, new_mask); | 5838 | p->sched_class->set_cpus_allowed(p, new_mask); |
5918 | else { | 5839 | else { |
@@ -5964,10 +5885,10 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5964 | double_rq_lock(rq_src, rq_dest); | 5885 | double_rq_lock(rq_src, rq_dest); |
5965 | /* Already moved. */ | 5886 | /* Already moved. */ |
5966 | if (task_cpu(p) != src_cpu) | 5887 | if (task_cpu(p) != src_cpu) |
5967 | goto out; | 5888 | goto done; |
5968 | /* Affinity changed (again). */ | 5889 | /* Affinity changed (again). */ |
5969 | if (!cpu_isset(dest_cpu, p->cpus_allowed)) | 5890 | if (!cpu_isset(dest_cpu, p->cpus_allowed)) |
5970 | goto out; | 5891 | goto fail; |
5971 | 5892 | ||
5972 | on_rq = p->se.on_rq; | 5893 | on_rq = p->se.on_rq; |
5973 | if (on_rq) | 5894 | if (on_rq) |
@@ -5978,8 +5899,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5978 | activate_task(rq_dest, p, 0); | 5899 | activate_task(rq_dest, p, 0); |
5979 | check_preempt_curr(rq_dest, p); | 5900 | check_preempt_curr(rq_dest, p); |
5980 | } | 5901 | } |
5902 | done: | ||
5981 | ret = 1; | 5903 | ret = 1; |
5982 | out: | 5904 | fail: |
5983 | double_rq_unlock(rq_src, rq_dest); | 5905 | double_rq_unlock(rq_src, rq_dest); |
5984 | return ret; | 5906 | return ret; |
5985 | } | 5907 | } |
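Editor's note: splitting the old "out" label into done:/fail: lets "task already moved" count as success while "affinity changed again" still reports failure, with both paths sharing the unlock. A userspace sketch of that goto shape (not kernel code; try_move and its flags are illustrative):

#include <stdio.h>

static int try_move(int already_there, int allowed)
{
	int ret = 0;

	/* double_rq_lock() would go here */
	if (already_there)
		goto done;		/* nothing to do, but still a success */
	if (!allowed)
		goto fail;		/* affinity changed again: report failure */

	/* ... perform the migration ... */
done:
	ret = 1;
fail:
	/* double_rq_unlock() would go here */
	return ret;
}

int main(void)
{
	printf("%d %d %d\n", try_move(1, 0), try_move(0, 0), try_move(0, 1));	/* 1 0 1 */
	return 0;
}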
@@ -6229,6 +6151,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu) | |||
6229 | next = pick_next_task(rq, rq->curr); | 6151 | next = pick_next_task(rq, rq->curr); |
6230 | if (!next) | 6152 | if (!next) |
6231 | break; | 6153 | break; |
6154 | next->sched_class->put_prev_task(rq, next); | ||
6232 | migrate_dead(dead_cpu, next); | 6155 | migrate_dead(dead_cpu, next); |
6233 | 6156 | ||
6234 | } | 6157 | } |
@@ -6400,6 +6323,36 @@ static void unregister_sched_domain_sysctl(void) | |||
6400 | } | 6323 | } |
6401 | #endif | 6324 | #endif |
6402 | 6325 | ||
6326 | static void set_rq_online(struct rq *rq) | ||
6327 | { | ||
6328 | if (!rq->online) { | ||
6329 | const struct sched_class *class; | ||
6330 | |||
6331 | cpu_set(rq->cpu, rq->rd->online); | ||
6332 | rq->online = 1; | ||
6333 | |||
6334 | for_each_class(class) { | ||
6335 | if (class->rq_online) | ||
6336 | class->rq_online(rq); | ||
6337 | } | ||
6338 | } | ||
6339 | } | ||
6340 | |||
6341 | static void set_rq_offline(struct rq *rq) | ||
6342 | { | ||
6343 | if (rq->online) { | ||
6344 | const struct sched_class *class; | ||
6345 | |||
6346 | for_each_class(class) { | ||
6347 | if (class->rq_offline) | ||
6348 | class->rq_offline(rq); | ||
6349 | } | ||
6350 | |||
6351 | cpu_clear(rq->cpu, rq->rd->online); | ||
6352 | rq->online = 0; | ||
6353 | } | ||
6354 | } | ||
6355 | |||
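Editor's note: set_rq_online()/set_rq_offline() walk the scheduler-class list and call the optional rq_online/rq_offline hooks, which later in this diff replaces the open-coded join_domain/leave_domain loops in rq_attach_root(). A standalone sketch of that hook-iteration shape (not kernel code; the *_sketch types and rt_online are illustrative):

#include <stdio.h>

struct rq;					/* opaque here */

struct sched_class_sketch {
	const struct sched_class_sketch *next;
	void (*rq_online)(struct rq *rq);	/* optional per-class hook */
};

static void rt_online(struct rq *rq) { puts("rt: cpu joined the root domain"); }

static const struct sched_class_sketch fair_class = { .next = NULL };
static const struct sched_class_sketch rt_class   = { .next = &fair_class,
						      .rq_online = rt_online };

static void notify_online(struct rq *rq)
{
	const struct sched_class_sketch *class;

	for (class = &rt_class; class; class = class->next)
		if (class->rq_online)		/* classes without a hook are skipped */
			class->rq_online(rq);
}

int main(void)
{
	notify_online(NULL);
	return 0;
}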
6403 | /* | 6356 | /* |
6404 | * migration_call - callback that gets triggered when a CPU is added. | 6357 | * migration_call - callback that gets triggered when a CPU is added. |
6405 | * Here we can start up the necessary migration thread for the new CPU. | 6358 | * Here we can start up the necessary migration thread for the new CPU. |
@@ -6437,7 +6390,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6437 | spin_lock_irqsave(&rq->lock, flags); | 6390 | spin_lock_irqsave(&rq->lock, flags); |
6438 | if (rq->rd) { | 6391 | if (rq->rd) { |
6439 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); | 6392 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); |
6440 | cpu_set(cpu, rq->rd->online); | 6393 | |
6394 | set_rq_online(rq); | ||
6441 | } | 6395 | } |
6442 | spin_unlock_irqrestore(&rq->lock, flags); | 6396 | spin_unlock_irqrestore(&rq->lock, flags); |
6443 | break; | 6397 | break; |
@@ -6498,7 +6452,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6498 | spin_lock_irqsave(&rq->lock, flags); | 6452 | spin_lock_irqsave(&rq->lock, flags); |
6499 | if (rq->rd) { | 6453 | if (rq->rd) { |
6500 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); | 6454 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); |
6501 | cpu_clear(cpu, rq->rd->online); | 6455 | set_rq_offline(rq); |
6502 | } | 6456 | } |
6503 | spin_unlock_irqrestore(&rq->lock, flags); | 6457 | spin_unlock_irqrestore(&rq->lock, flags); |
6504 | break; | 6458 | break; |
@@ -6532,6 +6486,28 @@ void __init migration_init(void) | |||
6532 | 6486 | ||
6533 | #ifdef CONFIG_SCHED_DEBUG | 6487 | #ifdef CONFIG_SCHED_DEBUG |
6534 | 6488 | ||
6489 | static inline const char *sd_level_to_string(enum sched_domain_level lvl) | ||
6490 | { | ||
6491 | switch (lvl) { | ||
6492 | case SD_LV_NONE: | ||
6493 | return "NONE"; | ||
6494 | case SD_LV_SIBLING: | ||
6495 | return "SIBLING"; | ||
6496 | case SD_LV_MC: | ||
6497 | return "MC"; | ||
6498 | case SD_LV_CPU: | ||
6499 | return "CPU"; | ||
6500 | case SD_LV_NODE: | ||
6501 | return "NODE"; | ||
6502 | case SD_LV_ALLNODES: | ||
6503 | return "ALLNODES"; | ||
6504 | case SD_LV_MAX: | ||
6505 | return "MAX"; | ||
6506 | |||
6507 | } | ||
6508 | return "MAX"; | ||
6509 | } | ||
6510 | |||
6535 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | 6511 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, |
6536 | cpumask_t *groupmask) | 6512 | cpumask_t *groupmask) |
6537 | { | 6513 | { |
@@ -6551,7 +6527,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
6551 | return -1; | 6527 | return -1; |
6552 | } | 6528 | } |
6553 | 6529 | ||
6554 | printk(KERN_CONT "span %s\n", str); | 6530 | printk(KERN_CONT "span %s level %s\n", |
6531 | str, sd_level_to_string(sd->level)); | ||
6555 | 6532 | ||
6556 | if (!cpu_isset(cpu, sd->span)) { | 6533 | if (!cpu_isset(cpu, sd->span)) { |
6557 | printk(KERN_ERR "ERROR: domain->span does not contain " | 6534 | printk(KERN_ERR "ERROR: domain->span does not contain " |
@@ -6635,9 +6612,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
6635 | } | 6612 | } |
6636 | kfree(groupmask); | 6613 | kfree(groupmask); |
6637 | } | 6614 | } |
6638 | #else | 6615 | #else /* !CONFIG_SCHED_DEBUG */ |
6639 | # define sched_domain_debug(sd, cpu) do { } while (0) | 6616 | # define sched_domain_debug(sd, cpu) do { } while (0) |
6640 | #endif | 6617 | #endif /* CONFIG_SCHED_DEBUG */ |
6641 | 6618 | ||
6642 | static int sd_degenerate(struct sched_domain *sd) | 6619 | static int sd_degenerate(struct sched_domain *sd) |
6643 | { | 6620 | { |
@@ -6697,20 +6674,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
6697 | static void rq_attach_root(struct rq *rq, struct root_domain *rd) | 6674 | static void rq_attach_root(struct rq *rq, struct root_domain *rd) |
6698 | { | 6675 | { |
6699 | unsigned long flags; | 6676 | unsigned long flags; |
6700 | const struct sched_class *class; | ||
6701 | 6677 | ||
6702 | spin_lock_irqsave(&rq->lock, flags); | 6678 | spin_lock_irqsave(&rq->lock, flags); |
6703 | 6679 | ||
6704 | if (rq->rd) { | 6680 | if (rq->rd) { |
6705 | struct root_domain *old_rd = rq->rd; | 6681 | struct root_domain *old_rd = rq->rd; |
6706 | 6682 | ||
6707 | for (class = sched_class_highest; class; class = class->next) { | 6683 | if (cpu_isset(rq->cpu, old_rd->online)) |
6708 | if (class->leave_domain) | 6684 | set_rq_offline(rq); |
6709 | class->leave_domain(rq); | ||
6710 | } | ||
6711 | 6685 | ||
6712 | cpu_clear(rq->cpu, old_rd->span); | 6686 | cpu_clear(rq->cpu, old_rd->span); |
6713 | cpu_clear(rq->cpu, old_rd->online); | ||
6714 | 6687 | ||
6715 | if (atomic_dec_and_test(&old_rd->refcount)) | 6688 | if (atomic_dec_and_test(&old_rd->refcount)) |
6716 | kfree(old_rd); | 6689 | kfree(old_rd); |
@@ -6721,12 +6694,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
6721 | 6694 | ||
6722 | cpu_set(rq->cpu, rd->span); | 6695 | cpu_set(rq->cpu, rd->span); |
6723 | if (cpu_isset(rq->cpu, cpu_online_map)) | 6696 | if (cpu_isset(rq->cpu, cpu_online_map)) |
6724 | cpu_set(rq->cpu, rd->online); | 6697 | set_rq_online(rq); |
6725 | |||
6726 | for (class = sched_class_highest; class; class = class->next) { | ||
6727 | if (class->join_domain) | ||
6728 | class->join_domain(rq); | ||
6729 | } | ||
6730 | 6698 | ||
6731 | spin_unlock_irqrestore(&rq->lock, flags); | 6699 | spin_unlock_irqrestore(&rq->lock, flags); |
6732 | } | 6700 | } |
@@ -6737,6 +6705,8 @@ static void init_rootdomain(struct root_domain *rd) | |||
6737 | 6705 | ||
6738 | cpus_clear(rd->span); | 6706 | cpus_clear(rd->span); |
6739 | cpus_clear(rd->online); | 6707 | cpus_clear(rd->online); |
6708 | |||
6709 | cpupri_init(&rd->cpupri); | ||
6740 | } | 6710 | } |
6741 | 6711 | ||
6742 | static void init_defrootdomain(void) | 6712 | static void init_defrootdomain(void) |
@@ -6879,9 +6849,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) | |||
6879 | 6849 | ||
6880 | min_val = INT_MAX; | 6850 | min_val = INT_MAX; |
6881 | 6851 | ||
6882 | for (i = 0; i < MAX_NUMNODES; i++) { | 6852 | for (i = 0; i < nr_node_ids; i++) { |
6883 | /* Start at @node */ | 6853 | /* Start at @node */ |
6884 | n = (node + i) % MAX_NUMNODES; | 6854 | n = (node + i) % nr_node_ids; |
6885 | 6855 | ||
6886 | if (!nr_cpus_node(n)) | 6856 | if (!nr_cpus_node(n)) |
6887 | continue; | 6857 | continue; |
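Editor's note: this and the later hunks switch the node loops and allocations from the compile-time MAX_NUMNODES ceiling to nr_node_ids, the number of node ids actually possible on the running system. A tiny userspace sketch of the wrap-around scan find_next_best_node() uses (not kernel code; the values are arbitrary):

#include <stdio.h>

int main(void)
{
	int nr_node_ids = 4, node = 2;

	/* visit every node id exactly once, starting at @node and wrapping */
	for (int i = 0; i < nr_node_ids; i++)
		printf("%d ", (node + i) % nr_node_ids);	/* prints: 2 3 0 1 */
	printf("\n");
	return 0;
}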
@@ -6931,7 +6901,7 @@ static void sched_domain_node_span(int node, cpumask_t *span) | |||
6931 | cpus_or(*span, *span, *nodemask); | 6901 | cpus_or(*span, *span, *nodemask); |
6932 | } | 6902 | } |
6933 | } | 6903 | } |
6934 | #endif | 6904 | #endif /* CONFIG_NUMA */ |
6935 | 6905 | ||
6936 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | 6906 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; |
6937 | 6907 | ||
@@ -6950,7 +6920,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, | |||
6950 | *sg = &per_cpu(sched_group_cpus, cpu); | 6920 | *sg = &per_cpu(sched_group_cpus, cpu); |
6951 | return cpu; | 6921 | return cpu; |
6952 | } | 6922 | } |
6953 | #endif | 6923 | #endif /* CONFIG_SCHED_SMT */ |
6954 | 6924 | ||
6955 | /* | 6925 | /* |
6956 | * multi-core sched-domains: | 6926 | * multi-core sched-domains: |
@@ -6958,7 +6928,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, | |||
6958 | #ifdef CONFIG_SCHED_MC | 6928 | #ifdef CONFIG_SCHED_MC |
6959 | static DEFINE_PER_CPU(struct sched_domain, core_domains); | 6929 | static DEFINE_PER_CPU(struct sched_domain, core_domains); |
6960 | static DEFINE_PER_CPU(struct sched_group, sched_group_core); | 6930 | static DEFINE_PER_CPU(struct sched_group, sched_group_core); |
6961 | #endif | 6931 | #endif /* CONFIG_SCHED_MC */ |
6962 | 6932 | ||
6963 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 6933 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) |
6964 | static int | 6934 | static int |
@@ -7060,7 +7030,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) | |||
7060 | sg = sg->next; | 7030 | sg = sg->next; |
7061 | } while (sg != group_head); | 7031 | } while (sg != group_head); |
7062 | } | 7032 | } |
7063 | #endif | 7033 | #endif /* CONFIG_NUMA */ |
7064 | 7034 | ||
7065 | #ifdef CONFIG_NUMA | 7035 | #ifdef CONFIG_NUMA |
7066 | /* Free memory allocated for various sched_group structures */ | 7036 | /* Free memory allocated for various sched_group structures */ |
@@ -7075,7 +7045,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) | |||
7075 | if (!sched_group_nodes) | 7045 | if (!sched_group_nodes) |
7076 | continue; | 7046 | continue; |
7077 | 7047 | ||
7078 | for (i = 0; i < MAX_NUMNODES; i++) { | 7048 | for (i = 0; i < nr_node_ids; i++) { |
7079 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | 7049 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; |
7080 | 7050 | ||
7081 | *nodemask = node_to_cpumask(i); | 7051 | *nodemask = node_to_cpumask(i); |
@@ -7097,11 +7067,11 @@ next_sg: | |||
7097 | sched_group_nodes_bycpu[cpu] = NULL; | 7067 | sched_group_nodes_bycpu[cpu] = NULL; |
7098 | } | 7068 | } |
7099 | } | 7069 | } |
7100 | #else | 7070 | #else /* !CONFIG_NUMA */ |
7101 | static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) | 7071 | static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) |
7102 | { | 7072 | { |
7103 | } | 7073 | } |
7104 | #endif | 7074 | #endif /* CONFIG_NUMA */ |
7105 | 7075 | ||
7106 | /* | 7076 | /* |
7107 | * Initialize sched groups cpu_power. | 7077 | * Initialize sched groups cpu_power. |
@@ -7219,7 +7189,12 @@ static int default_relax_domain_level = -1; | |||
7219 | 7189 | ||
7220 | static int __init setup_relax_domain_level(char *str) | 7190 | static int __init setup_relax_domain_level(char *str) |
7221 | { | 7191 | { |
7222 | default_relax_domain_level = simple_strtoul(str, NULL, 0); | 7192 | unsigned long val; |
7193 | |||
7194 | val = simple_strtoul(str, NULL, 0); | ||
7195 | if (val < SD_LV_MAX) | ||
7196 | default_relax_domain_level = val; | ||
7197 | |||
7223 | return 1; | 7198 | return 1; |
7224 | } | 7199 | } |
7225 | __setup("relax_domain_level=", setup_relax_domain_level); | 7200 | __setup("relax_domain_level=", setup_relax_domain_level); |
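Editor's note: the boot parameter is now parsed into a temporary and only committed when it is below SD_LV_MAX, so a bogus relax_domain_level= value leaves the default untouched. A userspace sketch of that validate-before-commit shape (not kernel code; LEVEL_MAX stands in for SD_LV_MAX):

#include <stdio.h>
#include <stdlib.h>

#define LEVEL_MAX 7			/* stand-in for SD_LV_MAX */

static int default_level = -1;		/* -1 means "keep the built-in default" */

static void setup_level(const char *str)
{
	unsigned long val = strtoul(str, NULL, 0);

	if (val < LEVEL_MAX)		/* out-of-range input is silently ignored */
		default_level = val;
}

int main(void)
{
	setup_level("3");  printf("%d\n", default_level);	/* 3 */
	setup_level("99"); printf("%d\n", default_level);	/* still 3 */
	return 0;
}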
@@ -7263,7 +7238,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7263 | /* | 7238 | /* |
7264 | * Allocate the per-node list of sched groups | 7239 | * Allocate the per-node list of sched groups |
7265 | */ | 7240 | */ |
7266 | sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), | 7241 | sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), |
7267 | GFP_KERNEL); | 7242 | GFP_KERNEL); |
7268 | if (!sched_group_nodes) { | 7243 | if (!sched_group_nodes) { |
7269 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 7244 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
@@ -7316,7 +7291,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7316 | SD_INIT(sd, ALLNODES); | 7291 | SD_INIT(sd, ALLNODES); |
7317 | set_domain_attribute(sd, attr); | 7292 | set_domain_attribute(sd, attr); |
7318 | sd->span = *cpu_map; | 7293 | sd->span = *cpu_map; |
7319 | sd->first_cpu = first_cpu(sd->span); | ||
7320 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); | 7294 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); |
7321 | p = sd; | 7295 | p = sd; |
7322 | sd_allnodes = 1; | 7296 | sd_allnodes = 1; |
@@ -7327,7 +7301,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7327 | SD_INIT(sd, NODE); | 7301 | SD_INIT(sd, NODE); |
7328 | set_domain_attribute(sd, attr); | 7302 | set_domain_attribute(sd, attr); |
7329 | sched_domain_node_span(cpu_to_node(i), &sd->span); | 7303 | sched_domain_node_span(cpu_to_node(i), &sd->span); |
7330 | sd->first_cpu = first_cpu(sd->span); | ||
7331 | sd->parent = p; | 7304 | sd->parent = p; |
7332 | if (p) | 7305 | if (p) |
7333 | p->child = sd; | 7306 | p->child = sd; |
@@ -7339,7 +7312,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7339 | SD_INIT(sd, CPU); | 7312 | SD_INIT(sd, CPU); |
7340 | set_domain_attribute(sd, attr); | 7313 | set_domain_attribute(sd, attr); |
7341 | sd->span = *nodemask; | 7314 | sd->span = *nodemask; |
7342 | sd->first_cpu = first_cpu(sd->span); | ||
7343 | sd->parent = p; | 7315 | sd->parent = p; |
7344 | if (p) | 7316 | if (p) |
7345 | p->child = sd; | 7317 | p->child = sd; |
@@ -7351,7 +7323,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7351 | SD_INIT(sd, MC); | 7323 | SD_INIT(sd, MC); |
7352 | set_domain_attribute(sd, attr); | 7324 | set_domain_attribute(sd, attr); |
7353 | sd->span = cpu_coregroup_map(i); | 7325 | sd->span = cpu_coregroup_map(i); |
7354 | sd->first_cpu = first_cpu(sd->span); | ||
7355 | cpus_and(sd->span, sd->span, *cpu_map); | 7326 | cpus_and(sd->span, sd->span, *cpu_map); |
7356 | sd->parent = p; | 7327 | sd->parent = p; |
7357 | p->child = sd; | 7328 | p->child = sd; |
@@ -7364,7 +7335,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7364 | SD_INIT(sd, SIBLING); | 7335 | SD_INIT(sd, SIBLING); |
7365 | set_domain_attribute(sd, attr); | 7336 | set_domain_attribute(sd, attr); |
7366 | sd->span = per_cpu(cpu_sibling_map, i); | 7337 | sd->span = per_cpu(cpu_sibling_map, i); |
7367 | sd->first_cpu = first_cpu(sd->span); | ||
7368 | cpus_and(sd->span, sd->span, *cpu_map); | 7338 | cpus_and(sd->span, sd->span, *cpu_map); |
7369 | sd->parent = p; | 7339 | sd->parent = p; |
7370 | p->child = sd; | 7340 | p->child = sd; |
@@ -7407,7 +7377,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7407 | #endif | 7377 | #endif |
7408 | 7378 | ||
7409 | /* Set up physical groups */ | 7379 | /* Set up physical groups */ |
7410 | for (i = 0; i < MAX_NUMNODES; i++) { | 7380 | for (i = 0; i < nr_node_ids; i++) { |
7411 | SCHED_CPUMASK_VAR(nodemask, allmasks); | 7381 | SCHED_CPUMASK_VAR(nodemask, allmasks); |
7412 | SCHED_CPUMASK_VAR(send_covered, allmasks); | 7382 | SCHED_CPUMASK_VAR(send_covered, allmasks); |
7413 | 7383 | ||
@@ -7431,7 +7401,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7431 | send_covered, tmpmask); | 7401 | send_covered, tmpmask); |
7432 | } | 7402 | } |
7433 | 7403 | ||
7434 | for (i = 0; i < MAX_NUMNODES; i++) { | 7404 | for (i = 0; i < nr_node_ids; i++) { |
7435 | /* Set up node groups */ | 7405 | /* Set up node groups */ |
7436 | struct sched_group *sg, *prev; | 7406 | struct sched_group *sg, *prev; |
7437 | SCHED_CPUMASK_VAR(nodemask, allmasks); | 7407 | SCHED_CPUMASK_VAR(nodemask, allmasks); |
@@ -7470,9 +7440,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7470 | cpus_or(*covered, *covered, *nodemask); | 7440 | cpus_or(*covered, *covered, *nodemask); |
7471 | prev = sg; | 7441 | prev = sg; |
7472 | 7442 | ||
7473 | for (j = 0; j < MAX_NUMNODES; j++) { | 7443 | for (j = 0; j < nr_node_ids; j++) { |
7474 | SCHED_CPUMASK_VAR(notcovered, allmasks); | 7444 | SCHED_CPUMASK_VAR(notcovered, allmasks); |
7475 | int n = (i + j) % MAX_NUMNODES; | 7445 | int n = (i + j) % nr_node_ids; |
7476 | node_to_cpumask_ptr(pnodemask, n); | 7446 | node_to_cpumask_ptr(pnodemask, n); |
7477 | 7447 | ||
7478 | cpus_complement(*notcovered, *covered); | 7448 | cpus_complement(*notcovered, *covered); |
@@ -7525,7 +7495,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, | |||
7525 | } | 7495 | } |
7526 | 7496 | ||
7527 | #ifdef CONFIG_NUMA | 7497 | #ifdef CONFIG_NUMA |
7528 | for (i = 0; i < MAX_NUMNODES; i++) | 7498 | for (i = 0; i < nr_node_ids; i++) |
7529 | init_numa_sched_groups_power(sched_group_nodes[i]); | 7499 | init_numa_sched_groups_power(sched_group_nodes[i]); |
7530 | 7500 | ||
7531 | if (sd_allnodes) { | 7501 | if (sd_allnodes) { |
@@ -7568,8 +7538,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
7568 | 7538 | ||
7569 | static cpumask_t *doms_cur; /* current sched domains */ | 7539 | static cpumask_t *doms_cur; /* current sched domains */ |
7570 | static int ndoms_cur; /* number of sched domains in 'doms_cur' */ | 7540 | static int ndoms_cur; /* number of sched domains in 'doms_cur' */ |
7571 | static struct sched_domain_attr *dattr_cur; /* attributes of custom domains | 7541 | static struct sched_domain_attr *dattr_cur; |
7572 | in 'doms_cur' */ | 7542 | /* attributes of custom domains in 'doms_cur' */ |
7573 | 7543 | ||
7574 | /* | 7544 | /* |
7575 | * Special case: If a kmalloc of a doms_cur partition (array of | 7545 | * Special case: If a kmalloc of a doms_cur partition (array of |
@@ -7583,6 +7553,18 @@ void __attribute__((weak)) arch_update_cpu_topology(void) | |||
7583 | } | 7553 | } |
7584 | 7554 | ||
7585 | /* | 7555 | /* |
7556 | * Free current domain masks. | ||
7557 | * Called after all cpus are attached to NULL domain. | ||
7558 | */ | ||
7559 | static void free_sched_domains(void) | ||
7560 | { | ||
7561 | ndoms_cur = 0; | ||
7562 | if (doms_cur != &fallback_doms) | ||
7563 | kfree(doms_cur); | ||
7564 | doms_cur = &fallback_doms; | ||
7565 | } | ||
7566 | |||
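Editor's note: free_sched_domains() resets the partition back to the static fallback_doms and frees the array only when it was dynamically allocated, matching the "Special case" comment just above this region. A standalone sketch of that ownership rule (not kernel code; the int arrays stand in for the cpumask_t partitions):

#include <stdlib.h>

static int fallback_doms[1];
static int *doms_cur = fallback_doms;
static int ndoms_cur = 1;

static void free_domains(void)
{
	ndoms_cur = 0;
	if (doms_cur != fallback_doms)	/* never free the static fallback */
		free(doms_cur);
	doms_cur = fallback_doms;
}

int main(void)
{
	doms_cur = calloc(4, sizeof(*doms_cur));	/* pretend a rebuild allocated 4 domains */
	ndoms_cur = 4;
	free_domains();		/* frees the dynamic array */
	free_domains();		/* idempotent: the fallback is left alone */
	return 0;
}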
7567 | /* | ||
7586 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 7568 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
7587 | * For now this just excludes isolated cpus, but could be used to | 7569 | * For now this just excludes isolated cpus, but could be used to |
7588 | * exclude other special cases in the future. | 7570 | * exclude other special cases in the future. |
@@ -7729,6 +7711,7 @@ int arch_reinit_sched_domains(void) | |||
7729 | get_online_cpus(); | 7711 | get_online_cpus(); |
7730 | mutex_lock(&sched_domains_mutex); | 7712 | mutex_lock(&sched_domains_mutex); |
7731 | detach_destroy_domains(&cpu_online_map); | 7713 | detach_destroy_domains(&cpu_online_map); |
7714 | free_sched_domains(); | ||
7732 | err = arch_init_sched_domains(&cpu_online_map); | 7715 | err = arch_init_sched_domains(&cpu_online_map); |
7733 | mutex_unlock(&sched_domains_mutex); | 7716 | mutex_unlock(&sched_domains_mutex); |
7734 | put_online_cpus(); | 7717 | put_online_cpus(); |
@@ -7797,7 +7780,7 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | |||
7797 | #endif | 7780 | #endif |
7798 | return err; | 7781 | return err; |
7799 | } | 7782 | } |
7800 | #endif | 7783 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ |
7801 | 7784 | ||
7802 | /* | 7785 | /* |
7803 | * Force a reinitialization of the sched domains hierarchy. The domains | 7786 | * Force a reinitialization of the sched domains hierarchy. The domains |
@@ -7808,20 +7791,28 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | |||
7808 | static int update_sched_domains(struct notifier_block *nfb, | 7791 | static int update_sched_domains(struct notifier_block *nfb, |
7809 | unsigned long action, void *hcpu) | 7792 | unsigned long action, void *hcpu) |
7810 | { | 7793 | { |
7794 | int cpu = (int)(long)hcpu; | ||
7795 | |||
7811 | switch (action) { | 7796 | switch (action) { |
7812 | case CPU_UP_PREPARE: | ||
7813 | case CPU_UP_PREPARE_FROZEN: | ||
7814 | case CPU_DOWN_PREPARE: | 7797 | case CPU_DOWN_PREPARE: |
7815 | case CPU_DOWN_PREPARE_FROZEN: | 7798 | case CPU_DOWN_PREPARE_FROZEN: |
7799 | disable_runtime(cpu_rq(cpu)); | ||
7800 | /* fall-through */ | ||
7801 | case CPU_UP_PREPARE: | ||
7802 | case CPU_UP_PREPARE_FROZEN: | ||
7816 | detach_destroy_domains(&cpu_online_map); | 7803 | detach_destroy_domains(&cpu_online_map); |
7804 | free_sched_domains(); | ||
7817 | return NOTIFY_OK; | 7805 | return NOTIFY_OK; |
7818 | 7806 | ||
7819 | case CPU_UP_CANCELED: | 7807 | |
7820 | case CPU_UP_CANCELED_FROZEN: | ||
7821 | case CPU_DOWN_FAILED: | 7808 | case CPU_DOWN_FAILED: |
7822 | case CPU_DOWN_FAILED_FROZEN: | 7809 | case CPU_DOWN_FAILED_FROZEN: |
7823 | case CPU_ONLINE: | 7810 | case CPU_ONLINE: |
7824 | case CPU_ONLINE_FROZEN: | 7811 | case CPU_ONLINE_FROZEN: |
7812 | enable_runtime(cpu_rq(cpu)); | ||
7813 | /* fall-through */ | ||
7814 | case CPU_UP_CANCELED: | ||
7815 | case CPU_UP_CANCELED_FROZEN: | ||
7825 | case CPU_DEAD: | 7816 | case CPU_DEAD: |
7826 | case CPU_DEAD_FROZEN: | 7817 | case CPU_DEAD_FROZEN: |
7827 | /* | 7818 | /* |
@@ -7832,8 +7823,16 @@ static int update_sched_domains(struct notifier_block *nfb, | |||
7832 | return NOTIFY_DONE; | 7823 | return NOTIFY_DONE; |
7833 | } | 7824 | } |
7834 | 7825 | ||
7826 | #ifndef CONFIG_CPUSETS | ||
7827 | /* | ||
7828 | * Create default domain partitioning if cpusets are disabled. | ||
7829 | * Otherwise we let cpusets rebuild the domains based on the | ||
7830 | * current setup. | ||
7831 | */ | ||
7832 | |||
7835 | /* The hotplug lock is already held by cpu_up/cpu_down */ | 7833 | /* The hotplug lock is already held by cpu_up/cpu_down */ |
7836 | arch_init_sched_domains(&cpu_online_map); | 7834 | arch_init_sched_domains(&cpu_online_map); |
7835 | #endif | ||
7837 | 7836 | ||
7838 | return NOTIFY_OK; | 7837 | return NOTIFY_OK; |
7839 | } | 7838 | } |
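Editor's note: the notifier above now pairs the DOWN_PREPARE events with disable_runtime() and the DOWN_FAILED/ONLINE events with enable_runtime(), then deliberately falls through into the shared teardown/rebuild cases, and skips the rebuild entirely when CONFIG_CPUSETS owns domain layout. A userspace sketch of that fall-through grouping (not kernel code; the enum values are simplified stand-ins for the CPU_* notifier actions):

#include <stdio.h>

enum ev { DOWN_PREPARE, UP_PREPARE, DOWN_FAILED, ONLINE, DEAD };

static void handle(enum ev e)
{
	switch (e) {
	case DOWN_PREPARE:
		puts("disable runtime sharing on this cpu");
		/* fall-through */
	case UP_PREPARE:
		puts("detach and destroy the current domains");
		return;

	case DOWN_FAILED:
	case ONLINE:
		puts("re-enable runtime sharing on this cpu");
		/* fall-through */
	case DEAD:
		puts("rebuild default domains (unless cpusets will)");
		return;
	}
}

int main(void)
{
	handle(DOWN_PREPARE);	/* disables runtime, then tears down */
	handle(ONLINE);		/* re-enables runtime, then rebuilds */
	return 0;
}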
@@ -7973,7 +7972,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | |||
7973 | else | 7972 | else |
7974 | rt_se->rt_rq = parent->my_q; | 7973 | rt_se->rt_rq = parent->my_q; |
7975 | 7974 | ||
7976 | rt_se->rt_rq = &rq->rt; | ||
7977 | rt_se->my_q = rt_rq; | 7975 | rt_se->my_q = rt_rq; |
7978 | rt_se->parent = parent; | 7976 | rt_se->parent = parent; |
7979 | INIT_LIST_HEAD(&rt_se->run_list); | 7977 | INIT_LIST_HEAD(&rt_se->run_list); |
@@ -8014,8 +8012,8 @@ void __init sched_init(void) | |||
8014 | 8012 | ||
8015 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; | 8013 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; |
8016 | ptr += nr_cpu_ids * sizeof(void **); | 8014 | ptr += nr_cpu_ids * sizeof(void **); |
8017 | #endif | 8015 | #endif /* CONFIG_USER_SCHED */ |
8018 | #endif | 8016 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
8019 | #ifdef CONFIG_RT_GROUP_SCHED | 8017 | #ifdef CONFIG_RT_GROUP_SCHED |
8020 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; | 8018 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; |
8021 | ptr += nr_cpu_ids * sizeof(void **); | 8019 | ptr += nr_cpu_ids * sizeof(void **); |
@@ -8029,12 +8027,11 @@ void __init sched_init(void) | |||
8029 | 8027 | ||
8030 | root_task_group.rt_rq = (struct rt_rq **)ptr; | 8028 | root_task_group.rt_rq = (struct rt_rq **)ptr; |
8031 | ptr += nr_cpu_ids * sizeof(void **); | 8029 | ptr += nr_cpu_ids * sizeof(void **); |
8032 | #endif | 8030 | #endif /* CONFIG_USER_SCHED */ |
8033 | #endif | 8031 | #endif /* CONFIG_RT_GROUP_SCHED */ |
8034 | } | 8032 | } |
8035 | 8033 | ||
8036 | #ifdef CONFIG_SMP | 8034 | #ifdef CONFIG_SMP |
8037 | init_aggregate(); | ||
8038 | init_defrootdomain(); | 8035 | init_defrootdomain(); |
8039 | #endif | 8036 | #endif |
8040 | 8037 | ||
@@ -8047,8 +8044,8 @@ void __init sched_init(void) | |||
8047 | #ifdef CONFIG_USER_SCHED | 8044 | #ifdef CONFIG_USER_SCHED |
8048 | init_rt_bandwidth(&root_task_group.rt_bandwidth, | 8045 | init_rt_bandwidth(&root_task_group.rt_bandwidth, |
8049 | global_rt_period(), RUNTIME_INF); | 8046 | global_rt_period(), RUNTIME_INF); |
8050 | #endif | 8047 | #endif /* CONFIG_USER_SCHED */ |
8051 | #endif | 8048 | #endif /* CONFIG_RT_GROUP_SCHED */ |
8052 | 8049 | ||
8053 | #ifdef CONFIG_GROUP_SCHED | 8050 | #ifdef CONFIG_GROUP_SCHED |
8054 | list_add(&init_task_group.list, &task_groups); | 8051 | list_add(&init_task_group.list, &task_groups); |
@@ -8058,8 +8055,8 @@ void __init sched_init(void) | |||
8058 | INIT_LIST_HEAD(&root_task_group.children); | 8055 | INIT_LIST_HEAD(&root_task_group.children); |
8059 | init_task_group.parent = &root_task_group; | 8056 | init_task_group.parent = &root_task_group; |
8060 | list_add(&init_task_group.siblings, &root_task_group.children); | 8057 | list_add(&init_task_group.siblings, &root_task_group.children); |
8061 | #endif | 8058 | #endif /* CONFIG_USER_SCHED */ |
8062 | #endif | 8059 | #endif /* CONFIG_GROUP_SCHED */ |
8063 | 8060 | ||
8064 | for_each_possible_cpu(i) { | 8061 | for_each_possible_cpu(i) { |
8065 | struct rq *rq; | 8062 | struct rq *rq; |
@@ -8139,6 +8136,7 @@ void __init sched_init(void) | |||
8139 | rq->next_balance = jiffies; | 8136 | rq->next_balance = jiffies; |
8140 | rq->push_cpu = 0; | 8137 | rq->push_cpu = 0; |
8141 | rq->cpu = i; | 8138 | rq->cpu = i; |
8139 | rq->online = 0; | ||
8142 | rq->migration_thread = NULL; | 8140 | rq->migration_thread = NULL; |
8143 | INIT_LIST_HEAD(&rq->migration_queue); | 8141 | INIT_LIST_HEAD(&rq->migration_queue); |
8144 | rq_attach_root(rq, &def_root_domain); | 8142 | rq_attach_root(rq, &def_root_domain); |
@@ -8154,7 +8152,7 @@ void __init sched_init(void) | |||
8154 | #endif | 8152 | #endif |
8155 | 8153 | ||
8156 | #ifdef CONFIG_SMP | 8154 | #ifdef CONFIG_SMP |
8157 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); | 8155 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); |
8158 | #endif | 8156 | #endif |
8159 | 8157 | ||
8160 | #ifdef CONFIG_RT_MUTEXES | 8158 | #ifdef CONFIG_RT_MUTEXES |
@@ -8378,7 +8376,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | |||
8378 | { | 8376 | { |
8379 | list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); | 8377 | list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); |
8380 | } | 8378 | } |
8381 | #else | 8379 | #else /* !CONFIG_FAIR_GROUP_SCHED */ |
8382 | static inline void free_fair_sched_group(struct task_group *tg) | 8380 | static inline void free_fair_sched_group(struct task_group *tg) |
8383 | { | 8381 | { |
8384 | } | 8382 | } |
@@ -8396,7 +8394,7 @@ static inline void register_fair_sched_group(struct task_group *tg, int cpu) | |||
8396 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | 8394 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) |
8397 | { | 8395 | { |
8398 | } | 8396 | } |
8399 | #endif | 8397 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
8400 | 8398 | ||
8401 | #ifdef CONFIG_RT_GROUP_SCHED | 8399 | #ifdef CONFIG_RT_GROUP_SCHED |
8402 | static void free_rt_sched_group(struct task_group *tg) | 8400 | static void free_rt_sched_group(struct task_group *tg) |
@@ -8467,7 +8465,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | |||
8467 | { | 8465 | { |
8468 | list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); | 8466 | list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); |
8469 | } | 8467 | } |
8470 | #else | 8468 | #else /* !CONFIG_RT_GROUP_SCHED */ |
8471 | static inline void free_rt_sched_group(struct task_group *tg) | 8469 | static inline void free_rt_sched_group(struct task_group *tg) |
8472 | { | 8470 | { |
8473 | } | 8471 | } |
@@ -8485,7 +8483,7 @@ static inline void register_rt_sched_group(struct task_group *tg, int cpu) | |||
8485 | static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | 8483 | static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) |
8486 | { | 8484 | { |
8487 | } | 8485 | } |
8488 | #endif | 8486 | #endif /* CONFIG_RT_GROUP_SCHED */ |
8489 | 8487 | ||
8490 | #ifdef CONFIG_GROUP_SCHED | 8488 | #ifdef CONFIG_GROUP_SCHED |
8491 | static void free_sched_group(struct task_group *tg) | 8489 | static void free_sched_group(struct task_group *tg) |
@@ -8596,7 +8594,7 @@ void sched_move_task(struct task_struct *tsk) | |||
8596 | 8594 | ||
8597 | task_rq_unlock(rq, &flags); | 8595 | task_rq_unlock(rq, &flags); |
8598 | } | 8596 | } |
8599 | #endif | 8597 | #endif /* CONFIG_GROUP_SCHED */ |
8600 | 8598 | ||
8601 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8599 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8602 | static void __set_se_shares(struct sched_entity *se, unsigned long shares) | 8600 | static void __set_se_shares(struct sched_entity *se, unsigned long shares) |
@@ -8731,7 +8729,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | |||
8731 | } | 8729 | } |
8732 | rcu_read_unlock(); | 8730 | rcu_read_unlock(); |
8733 | 8731 | ||
8734 | return total + to_ratio(period, runtime) < | 8732 | return total + to_ratio(period, runtime) <= |
8735 | to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), | 8733 | to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), |
8736 | parent->rt_bandwidth.rt_runtime); | 8734 | parent->rt_bandwidth.rt_runtime); |
8737 | } | 8735 | } |
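Editor's note: with to_ratio() treating bandwidth as a fixed-point runtime/period fraction, changing "<" to "<=" lets a group's children consume exactly their parent's allocation instead of being forced strictly below it. A worked userspace example (not kernel code; the 16-bit fixed point is an assumption for illustration):

#include <stdio.h>
#include <stdint.h>

static unsigned long ratio(uint64_t period_ns, uint64_t runtime_ns)
{
	return (unsigned long)((runtime_ns << 16) / period_ns);	/* 16-bit fixed point */
}

int main(void)
{
	uint64_t sec = 1000000000ULL;
	unsigned long parent = ratio(sec, 500000000ULL);	/* 500 ms of each second */
	unsigned long kids   = 2 * ratio(sec, 250000000ULL);	/* two groups of 250 ms  */

	/* children exactly fill the parent: rejected by "<", admitted by "<=" */
	printf("parent=%lu children=%lu admit(<)=%d admit(<=)=%d\n",
	       parent, kids, kids < parent, kids <= parent);
	return 0;
}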
@@ -8834,6 +8832,9 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) | |||
8834 | rt_period = (u64)rt_period_us * NSEC_PER_USEC; | 8832 | rt_period = (u64)rt_period_us * NSEC_PER_USEC; |
8835 | rt_runtime = tg->rt_bandwidth.rt_runtime; | 8833 | rt_runtime = tg->rt_bandwidth.rt_runtime; |
8836 | 8834 | ||
8835 | if (rt_period == 0) | ||
8836 | return -EINVAL; | ||
8837 | |||
8837 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | 8838 | return tg_set_bandwidth(tg, rt_period, rt_runtime); |
8838 | } | 8839 | } |
8839 | 8840 | ||
@@ -8848,16 +8849,21 @@ long sched_group_rt_period(struct task_group *tg) | |||
8848 | 8849 | ||
8849 | static int sched_rt_global_constraints(void) | 8850 | static int sched_rt_global_constraints(void) |
8850 | { | 8851 | { |
8852 | struct task_group *tg = &root_task_group; | ||
8853 | u64 rt_runtime, rt_period; | ||
8851 | int ret = 0; | 8854 | int ret = 0; |
8852 | 8855 | ||
8856 | rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); | ||
8857 | rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
8858 | |||
8853 | mutex_lock(&rt_constraints_mutex); | 8859 | mutex_lock(&rt_constraints_mutex); |
8854 | if (!__rt_schedulable(NULL, 1, 0)) | 8860 | if (!__rt_schedulable(tg, rt_period, rt_runtime)) |
8855 | ret = -EINVAL; | 8861 | ret = -EINVAL; |
8856 | mutex_unlock(&rt_constraints_mutex); | 8862 | mutex_unlock(&rt_constraints_mutex); |
8857 | 8863 | ||
8858 | return ret; | 8864 | return ret; |
8859 | } | 8865 | } |
8860 | #else | 8866 | #else /* !CONFIG_RT_GROUP_SCHED */ |
8861 | static int sched_rt_global_constraints(void) | 8867 | static int sched_rt_global_constraints(void) |
8862 | { | 8868 | { |
8863 | unsigned long flags; | 8869 | unsigned long flags; |
@@ -8875,7 +8881,7 @@ static int sched_rt_global_constraints(void) | |||
8875 | 8881 | ||
8876 | return 0; | 8882 | return 0; |
8877 | } | 8883 | } |
8878 | #endif | 8884 | #endif /* CONFIG_RT_GROUP_SCHED */ |
8879 | 8885 | ||
8880 | int sched_rt_handler(struct ctl_table *table, int write, | 8886 | int sched_rt_handler(struct ctl_table *table, int write, |
8881 | struct file *filp, void __user *buffer, size_t *lenp, | 8887 | struct file *filp, void __user *buffer, size_t *lenp, |
@@ -8983,7 +8989,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) | |||
8983 | 8989 | ||
8984 | return (u64) tg->shares; | 8990 | return (u64) tg->shares; |
8985 | } | 8991 | } |
8986 | #endif | 8992 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
8987 | 8993 | ||
8988 | #ifdef CONFIG_RT_GROUP_SCHED | 8994 | #ifdef CONFIG_RT_GROUP_SCHED |
8989 | static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, | 8995 | static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, |
@@ -9007,7 +9013,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) | |||
9007 | { | 9013 | { |
9008 | return sched_group_rt_period(cgroup_tg(cgrp)); | 9014 | return sched_group_rt_period(cgroup_tg(cgrp)); |
9009 | } | 9015 | } |
9010 | #endif | 9016 | #endif /* CONFIG_RT_GROUP_SCHED */ |
9011 | 9017 | ||
9012 | static struct cftype cpu_files[] = { | 9018 | static struct cftype cpu_files[] = { |
9013 | #ifdef CONFIG_FAIR_GROUP_SCHED | 9019 | #ifdef CONFIG_FAIR_GROUP_SCHED |