diff options
Diffstat (limited to 'kernel/sched.c')
| -rw-r--r-- | kernel/sched.c | 2054 |
1 files changed, 1230 insertions, 824 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index 1b59e265273b..e7f2cfa6a257 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -39,7 +39,7 @@ | |||
| 39 | #include <linux/completion.h> | 39 | #include <linux/completion.h> |
| 40 | #include <linux/kernel_stat.h> | 40 | #include <linux/kernel_stat.h> |
| 41 | #include <linux/debug_locks.h> | 41 | #include <linux/debug_locks.h> |
| 42 | #include <linux/perf_counter.h> | 42 | #include <linux/perf_event.h> |
| 43 | #include <linux/security.h> | 43 | #include <linux/security.h> |
| 44 | #include <linux/notifier.h> | 44 | #include <linux/notifier.h> |
| 45 | #include <linux/profile.h> | 45 | #include <linux/profile.h> |
| @@ -64,7 +64,6 @@ | |||
| 64 | #include <linux/tsacct_kern.h> | 64 | #include <linux/tsacct_kern.h> |
| 65 | #include <linux/kprobes.h> | 65 | #include <linux/kprobes.h> |
| 66 | #include <linux/delayacct.h> | 66 | #include <linux/delayacct.h> |
| 67 | #include <linux/reciprocal_div.h> | ||
| 68 | #include <linux/unistd.h> | 67 | #include <linux/unistd.h> |
| 69 | #include <linux/pagemap.h> | 68 | #include <linux/pagemap.h> |
| 70 | #include <linux/hrtimer.h> | 69 | #include <linux/hrtimer.h> |
| @@ -120,30 +119,6 @@ | |||
| 120 | */ | 119 | */ |
| 121 | #define RUNTIME_INF ((u64)~0ULL) | 120 | #define RUNTIME_INF ((u64)~0ULL) |
| 122 | 121 | ||
| 123 | #ifdef CONFIG_SMP | ||
| 124 | |||
| 125 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); | ||
| 126 | |||
| 127 | /* | ||
| 128 | * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) | ||
| 129 | * Since cpu_power is a 'constant', we can use a reciprocal divide. | ||
| 130 | */ | ||
| 131 | static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) | ||
| 132 | { | ||
| 133 | return reciprocal_divide(load, sg->reciprocal_cpu_power); | ||
| 134 | } | ||
| 135 | |||
| 136 | /* | ||
| 137 | * Each time a sched group cpu_power is changed, | ||
| 138 | * we must compute its reciprocal value | ||
| 139 | */ | ||
| 140 | static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) | ||
| 141 | { | ||
| 142 | sg->__cpu_power += val; | ||
| 143 | sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); | ||
| 144 | } | ||
| 145 | #endif | ||
| 146 | |||
| 147 | static inline int rt_policy(int policy) | 122 | static inline int rt_policy(int policy) |
| 148 | { | 123 | { |
| 149 | if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) | 124 | if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) |
| @@ -309,8 +284,8 @@ void set_tg_uid(struct user_struct *user) | |||
| 309 | 284 | ||
| 310 | /* | 285 | /* |
| 311 | * Root task group. | 286 | * Root task group. |
| 312 | * Every UID task group (including init_task_group aka UID-0) will | 287 | * Every UID task group (including init_task_group aka UID-0) will |
| 313 | * be a child to this group. | 288 | * be a child to this group. |
| 314 | */ | 289 | */ |
| 315 | struct task_group root_task_group; | 290 | struct task_group root_task_group; |
| 316 | 291 | ||
| @@ -318,12 +293,12 @@ struct task_group root_task_group; | |||
| 318 | /* Default task group's sched entity on each cpu */ | 293 | /* Default task group's sched entity on each cpu */ |
| 319 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | 294 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); |
| 320 | /* Default task group's cfs_rq on each cpu */ | 295 | /* Default task group's cfs_rq on each cpu */ |
| 321 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | 296 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq); |
| 322 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 297 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 323 | 298 | ||
| 324 | #ifdef CONFIG_RT_GROUP_SCHED | 299 | #ifdef CONFIG_RT_GROUP_SCHED |
| 325 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | 300 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); |
| 326 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | 301 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); |
| 327 | #endif /* CONFIG_RT_GROUP_SCHED */ | 302 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 328 | #else /* !CONFIG_USER_SCHED */ | 303 | #else /* !CONFIG_USER_SCHED */ |
| 329 | #define root_task_group init_task_group | 304 | #define root_task_group init_task_group |
| @@ -334,6 +309,8 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | |||
| 334 | */ | 309 | */ |
| 335 | static DEFINE_SPINLOCK(task_group_lock); | 310 | static DEFINE_SPINLOCK(task_group_lock); |
| 336 | 311 | ||
| 312 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 313 | |||
| 337 | #ifdef CONFIG_SMP | 314 | #ifdef CONFIG_SMP |
| 338 | static int root_task_group_empty(void) | 315 | static int root_task_group_empty(void) |
| 339 | { | 316 | { |
| @@ -341,7 +318,6 @@ static int root_task_group_empty(void) | |||
| 341 | } | 318 | } |
| 342 | #endif | 319 | #endif |
| 343 | 320 | ||
| 344 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 345 | #ifdef CONFIG_USER_SCHED | 321 | #ifdef CONFIG_USER_SCHED |
| 346 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) | 322 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) |
| 347 | #else /* !CONFIG_USER_SCHED */ | 323 | #else /* !CONFIG_USER_SCHED */ |
| @@ -401,13 +377,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) | |||
| 401 | 377 | ||
| 402 | #else | 378 | #else |
| 403 | 379 | ||
| 404 | #ifdef CONFIG_SMP | ||
| 405 | static int root_task_group_empty(void) | ||
| 406 | { | ||
| 407 | return 1; | ||
| 408 | } | ||
| 409 | #endif | ||
| 410 | |||
| 411 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } | 380 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } |
| 412 | static inline struct task_group *task_group(struct task_struct *p) | 381 | static inline struct task_group *task_group(struct task_struct *p) |
| 413 | { | 382 | { |
| @@ -537,14 +506,6 @@ struct root_domain { | |||
| 537 | #ifdef CONFIG_SMP | 506 | #ifdef CONFIG_SMP |
| 538 | struct cpupri cpupri; | 507 | struct cpupri cpupri; |
| 539 | #endif | 508 | #endif |
| 540 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
| 541 | /* | ||
| 542 | * Preferred wake up cpu nominated by sched_mc balance that will be | ||
| 543 | * used when most cpus are idle in the system indicating overall very | ||
| 544 | * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2) | ||
| 545 | */ | ||
| 546 | unsigned int sched_mc_preferred_wakeup_cpu; | ||
| 547 | #endif | ||
| 548 | }; | 509 | }; |
| 549 | 510 | ||
| 550 | /* | 511 | /* |
| @@ -574,14 +535,12 @@ struct rq { | |||
| 574 | #define CPU_LOAD_IDX_MAX 5 | 535 | #define CPU_LOAD_IDX_MAX 5 |
| 575 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 536 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
| 576 | #ifdef CONFIG_NO_HZ | 537 | #ifdef CONFIG_NO_HZ |
| 577 | unsigned long last_tick_seen; | ||
| 578 | unsigned char in_nohz_recently; | 538 | unsigned char in_nohz_recently; |
| 579 | #endif | 539 | #endif |
| 580 | /* capture load from *all* tasks on this cpu: */ | 540 | /* capture load from *all* tasks on this cpu: */ |
| 581 | struct load_weight load; | 541 | struct load_weight load; |
| 582 | unsigned long nr_load_updates; | 542 | unsigned long nr_load_updates; |
| 583 | u64 nr_switches; | 543 | u64 nr_switches; |
| 584 | u64 nr_migrations_in; | ||
| 585 | 544 | ||
| 586 | struct cfs_rq cfs; | 545 | struct cfs_rq cfs; |
| 587 | struct rt_rq rt; | 546 | struct rt_rq rt; |
| @@ -616,6 +575,7 @@ struct rq { | |||
| 616 | 575 | ||
| 617 | unsigned char idle_at_tick; | 576 | unsigned char idle_at_tick; |
| 618 | /* For active balancing */ | 577 | /* For active balancing */ |
| 578 | int post_schedule; | ||
| 619 | int active_balance; | 579 | int active_balance; |
| 620 | int push_cpu; | 580 | int push_cpu; |
| 621 | /* cpu of this runqueue: */ | 581 | /* cpu of this runqueue: */ |
| @@ -626,6 +586,11 @@ struct rq { | |||
| 626 | 586 | ||
| 627 | struct task_struct *migration_thread; | 587 | struct task_struct *migration_thread; |
| 628 | struct list_head migration_queue; | 588 | struct list_head migration_queue; |
| 589 | |||
| 590 | u64 rt_avg; | ||
| 591 | u64 age_stamp; | ||
| 592 | u64 idle_stamp; | ||
| 593 | u64 avg_idle; | ||
| 629 | #endif | 594 | #endif |
| 630 | 595 | ||
| 631 | /* calc_load related fields */ | 596 | /* calc_load related fields */ |
| @@ -665,9 +630,10 @@ struct rq { | |||
| 665 | 630 | ||
| 666 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 631 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
| 667 | 632 | ||
| 668 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) | 633 | static inline |
| 634 | void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | ||
| 669 | { | 635 | { |
| 670 | rq->curr->sched_class->check_preempt_curr(rq, p, sync); | 636 | rq->curr->sched_class->check_preempt_curr(rq, p, flags); |
| 671 | } | 637 | } |
| 672 | 638 | ||
| 673 | static inline int cpu_of(struct rq *rq) | 639 | static inline int cpu_of(struct rq *rq) |
| @@ -693,6 +659,7 @@ static inline int cpu_of(struct rq *rq) | |||
| 693 | #define this_rq() (&__get_cpu_var(runqueues)) | 659 | #define this_rq() (&__get_cpu_var(runqueues)) |
| 694 | #define task_rq(p) cpu_rq(task_cpu(p)) | 660 | #define task_rq(p) cpu_rq(task_cpu(p)) |
| 695 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 661 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
| 662 | #define raw_rq() (&__raw_get_cpu_var(runqueues)) | ||
| 696 | 663 | ||
| 697 | inline void update_rq_clock(struct rq *rq) | 664 | inline void update_rq_clock(struct rq *rq) |
| 698 | { | 665 | { |
| @@ -710,20 +677,15 @@ inline void update_rq_clock(struct rq *rq) | |||
| 710 | 677 | ||
| 711 | /** | 678 | /** |
| 712 | * runqueue_is_locked | 679 | * runqueue_is_locked |
| 680 | * @cpu: the processor in question. | ||
| 713 | * | 681 | * |
| 714 | * Returns true if the current cpu runqueue is locked. | 682 | * Returns true if the current cpu runqueue is locked. |
| 715 | * This interface allows printk to be called with the runqueue lock | 683 | * This interface allows printk to be called with the runqueue lock |
| 716 | * held and know whether or not it is OK to wake up the klogd. | 684 | * held and know whether or not it is OK to wake up the klogd. |
| 717 | */ | 685 | */ |
| 718 | int runqueue_is_locked(void) | 686 | int runqueue_is_locked(int cpu) |
| 719 | { | 687 | { |
| 720 | int cpu = get_cpu(); | 688 | return spin_is_locked(&cpu_rq(cpu)->lock); |
| 721 | struct rq *rq = cpu_rq(cpu); | ||
| 722 | int ret; | ||
| 723 | |||
| 724 | ret = spin_is_locked(&rq->lock); | ||
| 725 | put_cpu(); | ||
| 726 | return ret; | ||
| 727 | } | 689 | } |
| 728 | 690 | ||
| 729 | /* | 691 | /* |
| @@ -810,7 +772,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
| 810 | if (!sched_feat_names[i]) | 772 | if (!sched_feat_names[i]) |
| 811 | return -EINVAL; | 773 | return -EINVAL; |
| 812 | 774 | ||
| 813 | filp->f_pos += cnt; | 775 | *ppos += cnt; |
| 814 | 776 | ||
| 815 | return cnt; | 777 | return cnt; |
| 816 | } | 778 | } |
| @@ -820,7 +782,7 @@ static int sched_feat_open(struct inode *inode, struct file *filp) | |||
| 820 | return single_open(filp, sched_feat_show, NULL); | 782 | return single_open(filp, sched_feat_show, NULL); |
| 821 | } | 783 | } |
| 822 | 784 | ||
| 823 | static struct file_operations sched_feat_fops = { | 785 | static const struct file_operations sched_feat_fops = { |
| 824 | .open = sched_feat_open, | 786 | .open = sched_feat_open, |
| 825 | .write = sched_feat_write, | 787 | .write = sched_feat_write, |
| 826 | .read = seq_read, | 788 | .read = seq_read, |
| @@ -861,6 +823,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000; | |||
| 861 | unsigned int sysctl_sched_shares_thresh = 4; | 823 | unsigned int sysctl_sched_shares_thresh = 4; |
| 862 | 824 | ||
| 863 | /* | 825 | /* |
| 826 | * period over which we average the RT time consumption, measured | ||
| 827 | * in ms. | ||
| 828 | * | ||
| 829 | * default: 1s | ||
| 830 | */ | ||
| 831 | const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; | ||
| 832 | |||
| 833 | /* | ||
| 864 | * period over which we measure -rt task cpu usage in us. | 834 | * period over which we measure -rt task cpu usage in us. |
| 865 | * default: 1s | 835 | * default: 1s |
| 866 | */ | 836 | */ |
| @@ -1278,12 +1248,37 @@ void wake_up_idle_cpu(int cpu) | |||
| 1278 | } | 1248 | } |
| 1279 | #endif /* CONFIG_NO_HZ */ | 1249 | #endif /* CONFIG_NO_HZ */ |
| 1280 | 1250 | ||
| 1251 | static u64 sched_avg_period(void) | ||
| 1252 | { | ||
| 1253 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; | ||
| 1254 | } | ||
| 1255 | |||
| 1256 | static void sched_avg_update(struct rq *rq) | ||
| 1257 | { | ||
| 1258 | s64 period = sched_avg_period(); | ||
| 1259 | |||
| 1260 | while ((s64)(rq->clock - rq->age_stamp) > period) { | ||
| 1261 | rq->age_stamp += period; | ||
| 1262 | rq->rt_avg /= 2; | ||
| 1263 | } | ||
| 1264 | } | ||
| 1265 | |||
| 1266 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
| 1267 | { | ||
| 1268 | rq->rt_avg += rt_delta; | ||
| 1269 | sched_avg_update(rq); | ||
| 1270 | } | ||
| 1271 | |||
| 1281 | #else /* !CONFIG_SMP */ | 1272 | #else /* !CONFIG_SMP */ |
| 1282 | static void resched_task(struct task_struct *p) | 1273 | static void resched_task(struct task_struct *p) |
| 1283 | { | 1274 | { |
| 1284 | assert_spin_locked(&task_rq(p)->lock); | 1275 | assert_spin_locked(&task_rq(p)->lock); |
| 1285 | set_tsk_need_resched(p); | 1276 | set_tsk_need_resched(p); |
| 1286 | } | 1277 | } |
| 1278 | |||
| 1279 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
| 1280 | { | ||
| 1281 | } | ||
| 1287 | #endif /* CONFIG_SMP */ | 1282 | #endif /* CONFIG_SMP */ |
| 1288 | 1283 | ||
| 1289 | #if BITS_PER_LONG == 32 | 1284 | #if BITS_PER_LONG == 32 |
| @@ -1494,8 +1489,65 @@ static int tg_nop(struct task_group *tg, void *data) | |||
| 1494 | #endif | 1489 | #endif |
| 1495 | 1490 | ||
| 1496 | #ifdef CONFIG_SMP | 1491 | #ifdef CONFIG_SMP |
| 1497 | static unsigned long source_load(int cpu, int type); | 1492 | /* Used instead of source_load when we know the type == 0 */ |
| 1498 | static unsigned long target_load(int cpu, int type); | 1493 | static unsigned long weighted_cpuload(const int cpu) |
| 1494 | { | ||
| 1495 | return cpu_rq(cpu)->load.weight; | ||
| 1496 | } | ||
| 1497 | |||
| 1498 | /* | ||
| 1499 | * Return a low guess at the load of a migration-source cpu weighted | ||
| 1500 | * according to the scheduling class and "nice" value. | ||
| 1501 | * | ||
| 1502 | * We want to under-estimate the load of migration sources, to | ||
| 1503 | * balance conservatively. | ||
| 1504 | */ | ||
| 1505 | static unsigned long source_load(int cpu, int type) | ||
| 1506 | { | ||
| 1507 | struct rq *rq = cpu_rq(cpu); | ||
| 1508 | unsigned long total = weighted_cpuload(cpu); | ||
| 1509 | |||
| 1510 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
| 1511 | return total; | ||
| 1512 | |||
| 1513 | return min(rq->cpu_load[type-1], total); | ||
| 1514 | } | ||
| 1515 | |||
| 1516 | /* | ||
| 1517 | * Return a high guess at the load of a migration-target cpu weighted | ||
| 1518 | * according to the scheduling class and "nice" value. | ||
| 1519 | */ | ||
| 1520 | static unsigned long target_load(int cpu, int type) | ||
| 1521 | { | ||
| 1522 | struct rq *rq = cpu_rq(cpu); | ||
| 1523 | unsigned long total = weighted_cpuload(cpu); | ||
| 1524 | |||
| 1525 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
| 1526 | return total; | ||
| 1527 | |||
| 1528 | return max(rq->cpu_load[type-1], total); | ||
| 1529 | } | ||
| 1530 | |||
| 1531 | static struct sched_group *group_of(int cpu) | ||
| 1532 | { | ||
| 1533 | struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); | ||
| 1534 | |||
| 1535 | if (!sd) | ||
| 1536 | return NULL; | ||
| 1537 | |||
| 1538 | return sd->groups; | ||
| 1539 | } | ||
| 1540 | |||
| 1541 | static unsigned long power_of(int cpu) | ||
| 1542 | { | ||
| 1543 | struct sched_group *group = group_of(cpu); | ||
| 1544 | |||
| 1545 | if (!group) | ||
| 1546 | return SCHED_LOAD_SCALE; | ||
| 1547 | |||
| 1548 | return group->cpu_power; | ||
| 1549 | } | ||
| 1550 | |||
| 1499 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | 1551 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); |
| 1500 | 1552 | ||
| 1501 | static unsigned long cpu_avg_load_per_task(int cpu) | 1553 | static unsigned long cpu_avg_load_per_task(int cpu) |
| @@ -1513,28 +1565,31 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
| 1513 | 1565 | ||
| 1514 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1566 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 1515 | 1567 | ||
| 1568 | static __read_mostly unsigned long *update_shares_data; | ||
| 1569 | |||
| 1516 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | 1570 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); |
| 1517 | 1571 | ||
| 1518 | /* | 1572 | /* |
| 1519 | * Calculate and set the cpu's group shares. | 1573 | * Calculate and set the cpu's group shares. |
| 1520 | */ | 1574 | */ |
| 1521 | static void | 1575 | static void update_group_shares_cpu(struct task_group *tg, int cpu, |
| 1522 | update_group_shares_cpu(struct task_group *tg, int cpu, | 1576 | unsigned long sd_shares, |
| 1523 | unsigned long sd_shares, unsigned long sd_rq_weight) | 1577 | unsigned long sd_rq_weight, |
| 1578 | unsigned long *usd_rq_weight) | ||
| 1524 | { | 1579 | { |
| 1525 | unsigned long shares; | 1580 | unsigned long shares, rq_weight; |
| 1526 | unsigned long rq_weight; | 1581 | int boost = 0; |
| 1527 | 1582 | ||
| 1528 | if (!tg->se[cpu]) | 1583 | rq_weight = usd_rq_weight[cpu]; |
| 1529 | return; | 1584 | if (!rq_weight) { |
| 1530 | 1585 | boost = 1; | |
| 1531 | rq_weight = tg->cfs_rq[cpu]->rq_weight; | 1586 | rq_weight = NICE_0_LOAD; |
| 1587 | } | ||
| 1532 | 1588 | ||
| 1533 | /* | 1589 | /* |
| 1534 | * \Sum shares * rq_weight | 1590 | * \Sum_j shares_j * rq_weight_i |
| 1535 | * shares = ----------------------- | 1591 | * shares_i = ----------------------------- |
| 1536 | * \Sum rq_weight | 1592 | * \Sum_j rq_weight_j |
| 1537 | * | ||
| 1538 | */ | 1593 | */ |
| 1539 | shares = (sd_shares * rq_weight) / sd_rq_weight; | 1594 | shares = (sd_shares * rq_weight) / sd_rq_weight; |
| 1540 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); | 1595 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); |
| @@ -1545,8 +1600,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu, | |||
| 1545 | unsigned long flags; | 1600 | unsigned long flags; |
| 1546 | 1601 | ||
| 1547 | spin_lock_irqsave(&rq->lock, flags); | 1602 | spin_lock_irqsave(&rq->lock, flags); |
| 1548 | tg->cfs_rq[cpu]->shares = shares; | 1603 | tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; |
| 1549 | 1604 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; | |
| 1550 | __set_se_shares(tg->se[cpu], shares); | 1605 | __set_se_shares(tg->se[cpu], shares); |
| 1551 | spin_unlock_irqrestore(&rq->lock, flags); | 1606 | spin_unlock_irqrestore(&rq->lock, flags); |
| 1552 | } | 1607 | } |
| @@ -1559,22 +1614,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu, | |||
| 1559 | */ | 1614 | */ |
| 1560 | static int tg_shares_up(struct task_group *tg, void *data) | 1615 | static int tg_shares_up(struct task_group *tg, void *data) |
| 1561 | { | 1616 | { |
| 1562 | unsigned long weight, rq_weight = 0; | 1617 | unsigned long weight, rq_weight = 0, shares = 0; |
| 1563 | unsigned long shares = 0; | 1618 | unsigned long *usd_rq_weight; |
| 1564 | struct sched_domain *sd = data; | 1619 | struct sched_domain *sd = data; |
| 1620 | unsigned long flags; | ||
| 1565 | int i; | 1621 | int i; |
| 1566 | 1622 | ||
| 1623 | if (!tg->se[0]) | ||
| 1624 | return 0; | ||
| 1625 | |||
| 1626 | local_irq_save(flags); | ||
| 1627 | usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); | ||
| 1628 | |||
| 1567 | for_each_cpu(i, sched_domain_span(sd)) { | 1629 | for_each_cpu(i, sched_domain_span(sd)) { |
| 1630 | weight = tg->cfs_rq[i]->load.weight; | ||
| 1631 | usd_rq_weight[i] = weight; | ||
| 1632 | |||
| 1568 | /* | 1633 | /* |
| 1569 | * If there are currently no tasks on the cpu pretend there | 1634 | * If there are currently no tasks on the cpu pretend there |
| 1570 | * is one of average load so that when a new task gets to | 1635 | * is one of average load so that when a new task gets to |
| 1571 | * run here it will not get delayed by group starvation. | 1636 | * run here it will not get delayed by group starvation. |
| 1572 | */ | 1637 | */ |
| 1573 | weight = tg->cfs_rq[i]->load.weight; | ||
| 1574 | if (!weight) | 1638 | if (!weight) |
| 1575 | weight = NICE_0_LOAD; | 1639 | weight = NICE_0_LOAD; |
| 1576 | 1640 | ||
| 1577 | tg->cfs_rq[i]->rq_weight = weight; | ||
| 1578 | rq_weight += weight; | 1641 | rq_weight += weight; |
| 1579 | shares += tg->cfs_rq[i]->shares; | 1642 | shares += tg->cfs_rq[i]->shares; |
| 1580 | } | 1643 | } |
| @@ -1586,7 +1649,9 @@ static int tg_shares_up(struct task_group *tg, void *data) | |||
| 1586 | shares = tg->shares; | 1649 | shares = tg->shares; |
| 1587 | 1650 | ||
| 1588 | for_each_cpu(i, sched_domain_span(sd)) | 1651 | for_each_cpu(i, sched_domain_span(sd)) |
| 1589 | update_group_shares_cpu(tg, i, shares, rq_weight); | 1652 | update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); |
| 1653 | |||
| 1654 | local_irq_restore(flags); | ||
| 1590 | 1655 | ||
| 1591 | return 0; | 1656 | return 0; |
| 1592 | } | 1657 | } |
| @@ -1616,8 +1681,14 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
| 1616 | 1681 | ||
| 1617 | static void update_shares(struct sched_domain *sd) | 1682 | static void update_shares(struct sched_domain *sd) |
| 1618 | { | 1683 | { |
| 1619 | u64 now = cpu_clock(raw_smp_processor_id()); | 1684 | s64 elapsed; |
| 1620 | s64 elapsed = now - sd->last_update; | 1685 | u64 now; |
| 1686 | |||
| 1687 | if (root_task_group_empty()) | ||
| 1688 | return; | ||
| 1689 | |||
| 1690 | now = cpu_clock(raw_smp_processor_id()); | ||
| 1691 | elapsed = now - sd->last_update; | ||
| 1621 | 1692 | ||
| 1622 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | 1693 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { |
| 1623 | sd->last_update = now; | 1694 | sd->last_update = now; |
| @@ -1627,6 +1698,9 @@ static void update_shares(struct sched_domain *sd) | |||
| 1627 | 1698 | ||
| 1628 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | 1699 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) |
| 1629 | { | 1700 | { |
| 1701 | if (root_task_group_empty()) | ||
| 1702 | return; | ||
| 1703 | |||
| 1630 | spin_unlock(&rq->lock); | 1704 | spin_unlock(&rq->lock); |
| 1631 | update_shares(sd); | 1705 | update_shares(sd); |
| 1632 | spin_lock(&rq->lock); | 1706 | spin_lock(&rq->lock); |
| @@ -1634,6 +1708,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | |||
| 1634 | 1708 | ||
| 1635 | static void update_h_load(long cpu) | 1709 | static void update_h_load(long cpu) |
| 1636 | { | 1710 | { |
| 1711 | if (root_task_group_empty()) | ||
| 1712 | return; | ||
| 1713 | |||
| 1637 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 1714 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); |
| 1638 | } | 1715 | } |
| 1639 | 1716 | ||
| @@ -1651,6 +1728,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) | |||
| 1651 | 1728 | ||
| 1652 | #ifdef CONFIG_PREEMPT | 1729 | #ifdef CONFIG_PREEMPT |
| 1653 | 1730 | ||
| 1731 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); | ||
| 1732 | |||
| 1654 | /* | 1733 | /* |
| 1655 | * fair double_lock_balance: Safely acquires both rq->locks in a fair | 1734 | * fair double_lock_balance: Safely acquires both rq->locks in a fair |
| 1656 | * way at the expense of forcing extra atomic operations in all | 1735 | * way at the expense of forcing extra atomic operations in all |
| @@ -1914,14 +1993,40 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
| 1914 | p->sched_class->prio_changed(rq, p, oldprio, running); | 1993 | p->sched_class->prio_changed(rq, p, oldprio, running); |
| 1915 | } | 1994 | } |
| 1916 | 1995 | ||
| 1917 | #ifdef CONFIG_SMP | 1996 | /** |
| 1918 | 1997 | * kthread_bind - bind a just-created kthread to a cpu. | |
| 1919 | /* Used instead of source_load when we know the type == 0 */ | 1998 | * @p: thread created by kthread_create(). |
| 1920 | static unsigned long weighted_cpuload(const int cpu) | 1999 | * @cpu: cpu (might not be online, must be possible) for @k to run on. |
| 2000 | * | ||
| 2001 | * Description: This function is equivalent to set_cpus_allowed(), | ||
| 2002 | * except that @cpu doesn't need to be online, and the thread must be | ||
| 2003 | * stopped (i.e., just returned from kthread_create()). | ||
| 2004 | * | ||
| 2005 | * Function lives here instead of kthread.c because it messes with | ||
| 2006 | * scheduler internals which require locking. | ||
| 2007 | */ | ||
| 2008 | void kthread_bind(struct task_struct *p, unsigned int cpu) | ||
| 1921 | { | 2009 | { |
| 1922 | return cpu_rq(cpu)->load.weight; | 2010 | struct rq *rq = cpu_rq(cpu); |
| 2011 | unsigned long flags; | ||
| 2012 | |||
| 2013 | /* Must have done schedule() in kthread() before we set_task_cpu */ | ||
| 2014 | if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { | ||
| 2015 | WARN_ON(1); | ||
| 2016 | return; | ||
| 2017 | } | ||
| 2018 | |||
| 2019 | spin_lock_irqsave(&rq->lock, flags); | ||
| 2020 | update_rq_clock(rq); | ||
| 2021 | set_task_cpu(p, cpu); | ||
| 2022 | p->cpus_allowed = cpumask_of_cpu(cpu); | ||
| 2023 | p->rt.nr_cpus_allowed = 1; | ||
| 2024 | p->flags |= PF_THREAD_BOUND; | ||
| 2025 | spin_unlock_irqrestore(&rq->lock, flags); | ||
| 1923 | } | 2026 | } |
| 2027 | EXPORT_SYMBOL(kthread_bind); | ||
| 1924 | 2028 | ||
| 2029 | #ifdef CONFIG_SMP | ||
| 1925 | /* | 2030 | /* |
| 1926 | * Is this task likely cache-hot: | 2031 | * Is this task likely cache-hot: |
| 1927 | */ | 2032 | */ |
| @@ -1933,7 +2038,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
| 1933 | /* | 2038 | /* |
| 1934 | * Buddy candidates are cache hot: | 2039 | * Buddy candidates are cache hot: |
| 1935 | */ | 2040 | */ |
| 1936 | if (sched_feat(CACHE_HOT_BUDDY) && | 2041 | if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && |
| 1937 | (&p->se == cfs_rq_of(&p->se)->next || | 2042 | (&p->se == cfs_rq_of(&p->se)->next || |
| 1938 | &p->se == cfs_rq_of(&p->se)->last)) | 2043 | &p->se == cfs_rq_of(&p->se)->last)) |
| 1939 | return 1; | 2044 | return 1; |
| @@ -1974,12 +2079,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
| 1974 | #endif | 2079 | #endif |
| 1975 | if (old_cpu != new_cpu) { | 2080 | if (old_cpu != new_cpu) { |
| 1976 | p->se.nr_migrations++; | 2081 | p->se.nr_migrations++; |
| 1977 | new_rq->nr_migrations_in++; | ||
| 1978 | #ifdef CONFIG_SCHEDSTATS | 2082 | #ifdef CONFIG_SCHEDSTATS |
| 1979 | if (task_hot(p, old_rq->clock, NULL)) | 2083 | if (task_hot(p, old_rq->clock, NULL)) |
| 1980 | schedstat_inc(p, se.nr_forced2_migrations); | 2084 | schedstat_inc(p, se.nr_forced2_migrations); |
| 1981 | #endif | 2085 | #endif |
| 1982 | perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, | 2086 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, |
| 1983 | 1, 1, NULL, 0); | 2087 | 1, 1, NULL, 0); |
| 1984 | } | 2088 | } |
| 1985 | p->se.vruntime -= old_cfsrq->min_vruntime - | 2089 | p->se.vruntime -= old_cfsrq->min_vruntime - |
| @@ -2011,6 +2115,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) | |||
| 2011 | * it is sufficient to simply update the task's cpu field. | 2115 | * it is sufficient to simply update the task's cpu field. |
| 2012 | */ | 2116 | */ |
| 2013 | if (!p->se.on_rq && !task_running(rq, p)) { | 2117 | if (!p->se.on_rq && !task_running(rq, p)) { |
| 2118 | update_rq_clock(rq); | ||
| 2014 | set_task_cpu(p, dest_cpu); | 2119 | set_task_cpu(p, dest_cpu); |
| 2015 | return 0; | 2120 | return 0; |
| 2016 | } | 2121 | } |
| @@ -2195,186 +2300,6 @@ void kick_process(struct task_struct *p) | |||
| 2195 | preempt_enable(); | 2300 | preempt_enable(); |
| 2196 | } | 2301 | } |
| 2197 | EXPORT_SYMBOL_GPL(kick_process); | 2302 | EXPORT_SYMBOL_GPL(kick_process); |
| 2198 | |||
| 2199 | /* | ||
| 2200 | * Return a low guess at the load of a migration-source cpu weighted | ||
| 2201 | * according to the scheduling class and "nice" value. | ||
| 2202 | * | ||
| 2203 | * We want to under-estimate the load of migration sources, to | ||
| 2204 | * balance conservatively. | ||
| 2205 | */ | ||
| 2206 | static unsigned long source_load(int cpu, int type) | ||
| 2207 | { | ||
| 2208 | struct rq *rq = cpu_rq(cpu); | ||
| 2209 | unsigned long total = weighted_cpuload(cpu); | ||
| 2210 | |||
| 2211 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
| 2212 | return total; | ||
| 2213 | |||
| 2214 | return min(rq->cpu_load[type-1], total); | ||
| 2215 | } | ||
| 2216 | |||
| 2217 | /* | ||
| 2218 | * Return a high guess at the load of a migration-target cpu weighted | ||
| 2219 | * according to the scheduling class and "nice" value. | ||
| 2220 | */ | ||
| 2221 | static unsigned long target_load(int cpu, int type) | ||
| 2222 | { | ||
| 2223 | struct rq *rq = cpu_rq(cpu); | ||
| 2224 | unsigned long total = weighted_cpuload(cpu); | ||
| 2225 | |||
| 2226 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
| 2227 | return total; | ||
| 2228 | |||
| 2229 | return max(rq->cpu_load[type-1], total); | ||
| 2230 | } | ||
| 2231 | |||
| 2232 | /* | ||
| 2233 | * find_idlest_group finds and returns the least busy CPU group within the | ||
| 2234 | * domain. | ||
| 2235 | */ | ||
| 2236 | static struct sched_group * | ||
| 2237 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | ||
| 2238 | { | ||
| 2239 | struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; | ||
| 2240 | unsigned long min_load = ULONG_MAX, this_load = 0; | ||
| 2241 | int load_idx = sd->forkexec_idx; | ||
| 2242 | int imbalance = 100 + (sd->imbalance_pct-100)/2; | ||
| 2243 | |||
| 2244 | do { | ||
| 2245 | unsigned long load, avg_load; | ||
| 2246 | int local_group; | ||
| 2247 | int i; | ||
| 2248 | |||
| 2249 | /* Skip over this group if it has no CPUs allowed */ | ||
| 2250 | if (!cpumask_intersects(sched_group_cpus(group), | ||
| 2251 | &p->cpus_allowed)) | ||
| 2252 | continue; | ||
| 2253 | |||
| 2254 | local_group = cpumask_test_cpu(this_cpu, | ||
| 2255 | sched_group_cpus(group)); | ||
| 2256 | |||
| 2257 | /* Tally up the load of all CPUs in the group */ | ||
| 2258 | avg_load = 0; | ||
| 2259 | |||
| 2260 | for_each_cpu(i, sched_group_cpus(group)) { | ||
| 2261 | /* Bias balancing toward cpus of our domain */ | ||
| 2262 | if (local_group) | ||
| 2263 | load = source_load(i, load_idx); | ||
| 2264 | else | ||
| 2265 | load = target_load(i, load_idx); | ||
| 2266 | |||
| 2267 | avg_load += load; | ||
| 2268 | } | ||
| 2269 | |||
| 2270 | /* Adjust by relative CPU power of the group */ | ||
| 2271 | avg_load = sg_div_cpu_power(group, | ||
| 2272 | avg_load * SCHED_LOAD_SCALE); | ||
| 2273 | |||
| 2274 | if (local_group) { | ||
| 2275 | this_load = avg_load; | ||
| 2276 | this = group; | ||
| 2277 | } else if (avg_load < min_load) { | ||
| 2278 | min_load = avg_load; | ||
| 2279 | idlest = group; | ||
| 2280 | } | ||
| 2281 | } while (group = group->next, group != sd->groups); | ||
| 2282 | |||
| 2283 | if (!idlest || 100*this_load < imbalance*min_load) | ||
| 2284 | return NULL; | ||
| 2285 | return idlest; | ||
| 2286 | } | ||
| 2287 | |||
| 2288 | /* | ||
| 2289 | * find_idlest_cpu - find the idlest cpu among the cpus in group. | ||
| 2290 | */ | ||
| 2291 | static int | ||
| 2292 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | ||
| 2293 | { | ||
| 2294 | unsigned long load, min_load = ULONG_MAX; | ||
| 2295 | int idlest = -1; | ||
| 2296 | int i; | ||
| 2297 | |||
| 2298 | /* Traverse only the allowed CPUs */ | ||
| 2299 | for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { | ||
| 2300 | load = weighted_cpuload(i); | ||
| 2301 | |||
| 2302 | if (load < min_load || (load == min_load && i == this_cpu)) { | ||
| 2303 | min_load = load; | ||
| 2304 | idlest = i; | ||
| 2305 | } | ||
| 2306 | } | ||
| 2307 | |||
| 2308 | return idlest; | ||
| 2309 | } | ||
| 2310 | |||
| 2311 | /* | ||
| 2312 | * sched_balance_self: balance the current task (running on cpu) in domains | ||
| 2313 | * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and | ||
| 2314 | * SD_BALANCE_EXEC. | ||
| 2315 | * | ||
| 2316 | * Balance, ie. select the least loaded group. | ||
| 2317 | * | ||
| 2318 | * Returns the target CPU number, or the same CPU if no balancing is needed. | ||
| 2319 | * | ||
| 2320 | * preempt must be disabled. | ||
| 2321 | */ | ||
| 2322 | static int sched_balance_self(int cpu, int flag) | ||
| 2323 | { | ||
| 2324 | struct task_struct *t = current; | ||
| 2325 | struct sched_domain *tmp, *sd = NULL; | ||
| 2326 | |||
| 2327 | for_each_domain(cpu, tmp) { | ||
| 2328 | /* | ||
| 2329 | * If power savings logic is enabled for a domain, stop there. | ||
| 2330 | */ | ||
| 2331 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | ||
| 2332 | break; | ||
| 2333 | if (tmp->flags & flag) | ||
| 2334 | sd = tmp; | ||
| 2335 | } | ||
| 2336 | |||
| 2337 | if (sd) | ||
| 2338 | update_shares(sd); | ||
| 2339 | |||
| 2340 | while (sd) { | ||
| 2341 | struct sched_group *group; | ||
| 2342 | int new_cpu, weight; | ||
| 2343 | |||
| 2344 | if (!(sd->flags & flag)) { | ||
| 2345 | sd = sd->child; | ||
| 2346 | continue; | ||
| 2347 | } | ||
| 2348 | |||
| 2349 | group = find_idlest_group(sd, t, cpu); | ||
| 2350 | if (!group) { | ||
| 2351 | sd = sd->child; | ||
| 2352 | continue; | ||
| 2353 | } | ||
| 2354 | |||
| 2355 | new_cpu = find_idlest_cpu(group, t, cpu); | ||
| 2356 | if (new_cpu == -1 || new_cpu == cpu) { | ||
| 2357 | /* Now try balancing at a lower domain level of cpu */ | ||
| 2358 | sd = sd->child; | ||
| 2359 | continue; | ||
| 2360 | } | ||
| 2361 | |||
| 2362 | /* Now try balancing at a lower domain level of new_cpu */ | ||
| 2363 | cpu = new_cpu; | ||
| 2364 | weight = cpumask_weight(sched_domain_span(sd)); | ||
| 2365 | sd = NULL; | ||
| 2366 | for_each_domain(cpu, tmp) { | ||
| 2367 | if (weight <= cpumask_weight(sched_domain_span(tmp))) | ||
| 2368 | break; | ||
| 2369 | if (tmp->flags & flag) | ||
| 2370 | sd = tmp; | ||
| 2371 | } | ||
| 2372 | /* while loop will break here if sd == NULL */ | ||
| 2373 | } | ||
| 2374 | |||
| 2375 | return cpu; | ||
| 2376 | } | ||
| 2377 | |||
| 2378 | #endif /* CONFIG_SMP */ | 2303 | #endif /* CONFIG_SMP */ |
| 2379 | 2304 | ||
| 2380 | /** | 2305 | /** |
| @@ -2412,37 +2337,22 @@ void task_oncpu_function_call(struct task_struct *p, | |||
| 2412 | * | 2337 | * |
| 2413 | * returns failure only if the task is already active. | 2338 | * returns failure only if the task is already active. |
| 2414 | */ | 2339 | */ |
| 2415 | static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | 2340 | static int try_to_wake_up(struct task_struct *p, unsigned int state, |
| 2341 | int wake_flags) | ||
| 2416 | { | 2342 | { |
| 2417 | int cpu, orig_cpu, this_cpu, success = 0; | 2343 | int cpu, orig_cpu, this_cpu, success = 0; |
| 2418 | unsigned long flags; | 2344 | unsigned long flags; |
| 2419 | long old_state; | 2345 | struct rq *rq, *orig_rq; |
| 2420 | struct rq *rq; | ||
| 2421 | 2346 | ||
| 2422 | if (!sched_feat(SYNC_WAKEUPS)) | 2347 | if (!sched_feat(SYNC_WAKEUPS)) |
| 2423 | sync = 0; | 2348 | wake_flags &= ~WF_SYNC; |
| 2424 | |||
| 2425 | #ifdef CONFIG_SMP | ||
| 2426 | if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) { | ||
| 2427 | struct sched_domain *sd; | ||
| 2428 | |||
| 2429 | this_cpu = raw_smp_processor_id(); | ||
| 2430 | cpu = task_cpu(p); | ||
| 2431 | 2349 | ||
| 2432 | for_each_domain(this_cpu, sd) { | 2350 | this_cpu = get_cpu(); |
| 2433 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
| 2434 | update_shares(sd); | ||
| 2435 | break; | ||
| 2436 | } | ||
| 2437 | } | ||
| 2438 | } | ||
| 2439 | #endif | ||
| 2440 | 2351 | ||
| 2441 | smp_wmb(); | 2352 | smp_wmb(); |
| 2442 | rq = task_rq_lock(p, &flags); | 2353 | rq = orig_rq = task_rq_lock(p, &flags); |
| 2443 | update_rq_clock(rq); | 2354 | update_rq_clock(rq); |
| 2444 | old_state = p->state; | 2355 | if (!(p->state & state)) |
| 2445 | if (!(old_state & state)) | ||
| 2446 | goto out; | 2356 | goto out; |
| 2447 | 2357 | ||
| 2448 | if (p->se.on_rq) | 2358 | if (p->se.on_rq) |
| @@ -2450,27 +2360,34 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
| 2450 | 2360 | ||
| 2451 | cpu = task_cpu(p); | 2361 | cpu = task_cpu(p); |
| 2452 | orig_cpu = cpu; | 2362 | orig_cpu = cpu; |
| 2453 | this_cpu = smp_processor_id(); | ||
| 2454 | 2363 | ||
| 2455 | #ifdef CONFIG_SMP | 2364 | #ifdef CONFIG_SMP |
| 2456 | if (unlikely(task_running(rq, p))) | 2365 | if (unlikely(task_running(rq, p))) |
| 2457 | goto out_activate; | 2366 | goto out_activate; |
| 2458 | 2367 | ||
| 2459 | cpu = p->sched_class->select_task_rq(p, sync); | 2368 | /* |
| 2369 | * In order to handle concurrent wakeups and release the rq->lock | ||
| 2370 | * we put the task in TASK_WAKING state. | ||
| 2371 | * | ||
| 2372 | * First fix up the nr_uninterruptible count: | ||
| 2373 | */ | ||
| 2374 | if (task_contributes_to_load(p)) | ||
| 2375 | rq->nr_uninterruptible--; | ||
| 2376 | p->state = TASK_WAKING; | ||
| 2377 | task_rq_unlock(rq, &flags); | ||
| 2378 | |||
| 2379 | cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); | ||
| 2460 | if (cpu != orig_cpu) { | 2380 | if (cpu != orig_cpu) { |
| 2381 | local_irq_save(flags); | ||
| 2382 | rq = cpu_rq(cpu); | ||
| 2383 | update_rq_clock(rq); | ||
| 2461 | set_task_cpu(p, cpu); | 2384 | set_task_cpu(p, cpu); |
| 2462 | task_rq_unlock(rq, &flags); | 2385 | local_irq_restore(flags); |
| 2463 | /* might preempt at this point */ | ||
| 2464 | rq = task_rq_lock(p, &flags); | ||
| 2465 | old_state = p->state; | ||
| 2466 | if (!(old_state & state)) | ||
| 2467 | goto out; | ||
| 2468 | if (p->se.on_rq) | ||
| 2469 | goto out_running; | ||
| 2470 | |||
| 2471 | this_cpu = smp_processor_id(); | ||
| 2472 | cpu = task_cpu(p); | ||
| 2473 | } | 2386 | } |
| 2387 | rq = task_rq_lock(p, &flags); | ||
| 2388 | |||
| 2389 | WARN_ON(p->state != TASK_WAKING); | ||
| 2390 | cpu = task_cpu(p); | ||
| 2474 | 2391 | ||
| 2475 | #ifdef CONFIG_SCHEDSTATS | 2392 | #ifdef CONFIG_SCHEDSTATS |
| 2476 | schedstat_inc(rq, ttwu_count); | 2393 | schedstat_inc(rq, ttwu_count); |
| @@ -2490,7 +2407,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
| 2490 | out_activate: | 2407 | out_activate: |
| 2491 | #endif /* CONFIG_SMP */ | 2408 | #endif /* CONFIG_SMP */ |
| 2492 | schedstat_inc(p, se.nr_wakeups); | 2409 | schedstat_inc(p, se.nr_wakeups); |
| 2493 | if (sync) | 2410 | if (wake_flags & WF_SYNC) |
| 2494 | schedstat_inc(p, se.nr_wakeups_sync); | 2411 | schedstat_inc(p, se.nr_wakeups_sync); |
| 2495 | if (orig_cpu != cpu) | 2412 | if (orig_cpu != cpu) |
| 2496 | schedstat_inc(p, se.nr_wakeups_migrate); | 2413 | schedstat_inc(p, se.nr_wakeups_migrate); |
| @@ -2519,15 +2436,27 @@ out_activate: | |||
| 2519 | 2436 | ||
| 2520 | out_running: | 2437 | out_running: |
| 2521 | trace_sched_wakeup(rq, p, success); | 2438 | trace_sched_wakeup(rq, p, success); |
| 2522 | check_preempt_curr(rq, p, sync); | 2439 | check_preempt_curr(rq, p, wake_flags); |
| 2523 | 2440 | ||
| 2524 | p->state = TASK_RUNNING; | 2441 | p->state = TASK_RUNNING; |
| 2525 | #ifdef CONFIG_SMP | 2442 | #ifdef CONFIG_SMP |
| 2526 | if (p->sched_class->task_wake_up) | 2443 | if (p->sched_class->task_wake_up) |
| 2527 | p->sched_class->task_wake_up(rq, p); | 2444 | p->sched_class->task_wake_up(rq, p); |
| 2445 | |||
| 2446 | if (unlikely(rq->idle_stamp)) { | ||
| 2447 | u64 delta = rq->clock - rq->idle_stamp; | ||
| 2448 | u64 max = 2*sysctl_sched_migration_cost; | ||
| 2449 | |||
| 2450 | if (delta > max) | ||
| 2451 | rq->avg_idle = max; | ||
| 2452 | else | ||
| 2453 | update_avg(&rq->avg_idle, delta); | ||
| 2454 | rq->idle_stamp = 0; | ||
| 2455 | } | ||
| 2528 | #endif | 2456 | #endif |
| 2529 | out: | 2457 | out: |
| 2530 | task_rq_unlock(rq, &flags); | 2458 | task_rq_unlock(rq, &flags); |
| 2459 | put_cpu(); | ||
| 2531 | 2460 | ||
| 2532 | return success; | 2461 | return success; |
| 2533 | } | 2462 | } |
| @@ -2570,6 +2499,7 @@ static void __sched_fork(struct task_struct *p) | |||
| 2570 | p->se.avg_overlap = 0; | 2499 | p->se.avg_overlap = 0; |
| 2571 | p->se.start_runtime = 0; | 2500 | p->se.start_runtime = 0; |
| 2572 | p->se.avg_wakeup = sysctl_sched_wakeup_granularity; | 2501 | p->se.avg_wakeup = sysctl_sched_wakeup_granularity; |
| 2502 | p->se.avg_running = 0; | ||
| 2573 | 2503 | ||
| 2574 | #ifdef CONFIG_SCHEDSTATS | 2504 | #ifdef CONFIG_SCHEDSTATS |
| 2575 | p->se.wait_start = 0; | 2505 | p->se.wait_start = 0; |
| @@ -2628,21 +2558,48 @@ static void __sched_fork(struct task_struct *p) | |||
| 2628 | void sched_fork(struct task_struct *p, int clone_flags) | 2558 | void sched_fork(struct task_struct *p, int clone_flags) |
| 2629 | { | 2559 | { |
| 2630 | int cpu = get_cpu(); | 2560 | int cpu = get_cpu(); |
| 2561 | unsigned long flags; | ||
| 2631 | 2562 | ||
| 2632 | __sched_fork(p); | 2563 | __sched_fork(p); |
| 2633 | 2564 | ||
| 2634 | #ifdef CONFIG_SMP | 2565 | /* |
| 2635 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); | 2566 | * Revert to default priority/policy on fork if requested. |
| 2636 | #endif | 2567 | */ |
| 2637 | set_task_cpu(p, cpu); | 2568 | if (unlikely(p->sched_reset_on_fork)) { |
| 2569 | if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { | ||
| 2570 | p->policy = SCHED_NORMAL; | ||
| 2571 | p->normal_prio = p->static_prio; | ||
| 2572 | } | ||
| 2573 | |||
| 2574 | if (PRIO_TO_NICE(p->static_prio) < 0) { | ||
| 2575 | p->static_prio = NICE_TO_PRIO(0); | ||
| 2576 | p->normal_prio = p->static_prio; | ||
| 2577 | set_load_weight(p); | ||
| 2578 | } | ||
| 2579 | |||
| 2580 | /* | ||
| 2581 | * We don't need the reset flag anymore after the fork. It has | ||
| 2582 | * fulfilled its duty: | ||
| 2583 | */ | ||
| 2584 | p->sched_reset_on_fork = 0; | ||
| 2585 | } | ||
| 2638 | 2586 | ||
| 2639 | /* | 2587 | /* |
| 2640 | * Make sure we do not leak PI boosting priority to the child: | 2588 | * Make sure we do not leak PI boosting priority to the child. |
| 2641 | */ | 2589 | */ |
| 2642 | p->prio = current->normal_prio; | 2590 | p->prio = current->normal_prio; |
| 2591 | |||
| 2643 | if (!rt_prio(p->prio)) | 2592 | if (!rt_prio(p->prio)) |
| 2644 | p->sched_class = &fair_sched_class; | 2593 | p->sched_class = &fair_sched_class; |
| 2645 | 2594 | ||
| 2595 | #ifdef CONFIG_SMP | ||
| 2596 | cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); | ||
| 2597 | #endif | ||
| 2598 | local_irq_save(flags); | ||
| 2599 | update_rq_clock(cpu_rq(cpu)); | ||
| 2600 | set_task_cpu(p, cpu); | ||
| 2601 | local_irq_restore(flags); | ||
| 2602 | |||
| 2646 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 2603 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
| 2647 | if (likely(sched_info_on())) | 2604 | if (likely(sched_info_on())) |
| 2648 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 2605 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
| @@ -2675,8 +2632,6 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
| 2675 | BUG_ON(p->state != TASK_RUNNING); | 2632 | BUG_ON(p->state != TASK_RUNNING); |
| 2676 | update_rq_clock(rq); | 2633 | update_rq_clock(rq); |
| 2677 | 2634 | ||
| 2678 | p->prio = effective_prio(p); | ||
| 2679 | |||
| 2680 | if (!p->sched_class->task_new || !current->se.on_rq) { | 2635 | if (!p->sched_class->task_new || !current->se.on_rq) { |
| 2681 | activate_task(rq, p, 0); | 2636 | activate_task(rq, p, 0); |
| 2682 | } else { | 2637 | } else { |
| @@ -2688,7 +2643,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
| 2688 | inc_nr_running(rq); | 2643 | inc_nr_running(rq); |
| 2689 | } | 2644 | } |
| 2690 | trace_sched_wakeup_new(rq, p, 1); | 2645 | trace_sched_wakeup_new(rq, p, 1); |
| 2691 | check_preempt_curr(rq, p, 0); | 2646 | check_preempt_curr(rq, p, WF_FORK); |
| 2692 | #ifdef CONFIG_SMP | 2647 | #ifdef CONFIG_SMP |
| 2693 | if (p->sched_class->task_wake_up) | 2648 | if (p->sched_class->task_wake_up) |
| 2694 | p->sched_class->task_wake_up(rq, p); | 2649 | p->sched_class->task_wake_up(rq, p); |
| @@ -2796,12 +2751,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
| 2796 | { | 2751 | { |
| 2797 | struct mm_struct *mm = rq->prev_mm; | 2752 | struct mm_struct *mm = rq->prev_mm; |
| 2798 | long prev_state; | 2753 | long prev_state; |
| 2799 | #ifdef CONFIG_SMP | ||
| 2800 | int post_schedule = 0; | ||
| 2801 | |||
| 2802 | if (current->sched_class->needs_post_schedule) | ||
| 2803 | post_schedule = current->sched_class->needs_post_schedule(rq); | ||
| 2804 | #endif | ||
| 2805 | 2754 | ||
| 2806 | rq->prev_mm = NULL; | 2755 | rq->prev_mm = NULL; |
| 2807 | 2756 | ||
| @@ -2818,12 +2767,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
| 2818 | */ | 2767 | */ |
| 2819 | prev_state = prev->state; | 2768 | prev_state = prev->state; |
| 2820 | finish_arch_switch(prev); | 2769 | finish_arch_switch(prev); |
| 2821 | perf_counter_task_sched_in(current, cpu_of(rq)); | 2770 | perf_event_task_sched_in(current, cpu_of(rq)); |
| 2822 | finish_lock_switch(rq, prev); | 2771 | finish_lock_switch(rq, prev); |
| 2823 | #ifdef CONFIG_SMP | ||
| 2824 | if (post_schedule) | ||
| 2825 | current->sched_class->post_schedule(rq); | ||
| 2826 | #endif | ||
| 2827 | 2772 | ||
| 2828 | fire_sched_in_preempt_notifiers(current); | 2773 | fire_sched_in_preempt_notifiers(current); |
| 2829 | if (mm) | 2774 | if (mm) |
| @@ -2838,6 +2783,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
| 2838 | } | 2783 | } |
| 2839 | } | 2784 | } |
| 2840 | 2785 | ||
| 2786 | #ifdef CONFIG_SMP | ||
| 2787 | |||
| 2788 | /* assumes rq->lock is held */ | ||
| 2789 | static inline void pre_schedule(struct rq *rq, struct task_struct *prev) | ||
| 2790 | { | ||
| 2791 | if (prev->sched_class->pre_schedule) | ||
| 2792 | prev->sched_class->pre_schedule(rq, prev); | ||
| 2793 | } | ||
| 2794 | |||
| 2795 | /* rq->lock is NOT held, but preemption is disabled */ | ||
| 2796 | static inline void post_schedule(struct rq *rq) | ||
| 2797 | { | ||
| 2798 | if (rq->post_schedule) { | ||
| 2799 | unsigned long flags; | ||
| 2800 | |||
| 2801 | spin_lock_irqsave(&rq->lock, flags); | ||
| 2802 | if (rq->curr->sched_class->post_schedule) | ||
| 2803 | rq->curr->sched_class->post_schedule(rq); | ||
| 2804 | spin_unlock_irqrestore(&rq->lock, flags); | ||
| 2805 | |||
| 2806 | rq->post_schedule = 0; | ||
| 2807 | } | ||
| 2808 | } | ||
| 2809 | |||
| 2810 | #else | ||
| 2811 | |||
| 2812 | static inline void pre_schedule(struct rq *rq, struct task_struct *p) | ||
| 2813 | { | ||
| 2814 | } | ||
| 2815 | |||
| 2816 | static inline void post_schedule(struct rq *rq) | ||
| 2817 | { | ||
| 2818 | } | ||
| 2819 | |||
| 2820 | #endif | ||
| 2821 | |||
| 2841 | /** | 2822 | /** |
| 2842 | * schedule_tail - first thing a freshly forked thread must call. | 2823 | * schedule_tail - first thing a freshly forked thread must call. |
| 2843 | * @prev: the thread we just switched away from. | 2824 | * @prev: the thread we just switched away from. |
| @@ -2848,6 +2829,13 @@ asmlinkage void schedule_tail(struct task_struct *prev) | |||
| 2848 | struct rq *rq = this_rq(); | 2829 | struct rq *rq = this_rq(); |
| 2849 | 2830 | ||
| 2850 | finish_task_switch(rq, prev); | 2831 | finish_task_switch(rq, prev); |
| 2832 | |||
| 2833 | /* | ||
| 2834 | * FIXME: do we need to worry about rq being invalidated by the | ||
| 2835 | * task_switch? | ||
| 2836 | */ | ||
| 2837 | post_schedule(rq); | ||
| 2838 | |||
| 2851 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | 2839 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW |
| 2852 | /* In this case, finish_task_switch does not reenable preemption */ | 2840 | /* In this case, finish_task_switch does not reenable preemption */ |
| 2853 | preempt_enable(); | 2841 | preempt_enable(); |
| @@ -2877,14 +2865,14 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
| 2877 | */ | 2865 | */ |
| 2878 | arch_start_context_switch(prev); | 2866 | arch_start_context_switch(prev); |
| 2879 | 2867 | ||
| 2880 | if (unlikely(!mm)) { | 2868 | if (likely(!mm)) { |
| 2881 | next->active_mm = oldmm; | 2869 | next->active_mm = oldmm; |
| 2882 | atomic_inc(&oldmm->mm_count); | 2870 | atomic_inc(&oldmm->mm_count); |
| 2883 | enter_lazy_tlb(oldmm, next); | 2871 | enter_lazy_tlb(oldmm, next); |
| 2884 | } else | 2872 | } else |
| 2885 | switch_mm(oldmm, mm, next); | 2873 | switch_mm(oldmm, mm, next); |
| 2886 | 2874 | ||
| 2887 | if (unlikely(!prev->mm)) { | 2875 | if (likely(!prev->mm)) { |
| 2888 | prev->active_mm = NULL; | 2876 | prev->active_mm = NULL; |
| 2889 | rq->prev_mm = oldmm; | 2877 | rq->prev_mm = oldmm; |
| 2890 | } | 2878 | } |
| @@ -2965,6 +2953,19 @@ unsigned long nr_iowait(void) | |||
| 2965 | return sum; | 2953 | return sum; |
| 2966 | } | 2954 | } |
| 2967 | 2955 | ||
| 2956 | unsigned long nr_iowait_cpu(void) | ||
| 2957 | { | ||
| 2958 | struct rq *this = this_rq(); | ||
| 2959 | return atomic_read(&this->nr_iowait); | ||
| 2960 | } | ||
| 2961 | |||
| 2962 | unsigned long this_cpu_load(void) | ||
| 2963 | { | ||
| 2964 | struct rq *this = this_rq(); | ||
| 2965 | return this->cpu_load[0]; | ||
| 2966 | } | ||
| 2967 | |||
| 2968 | |||
| 2968 | /* Variables and functions for calc_load */ | 2969 | /* Variables and functions for calc_load */ |
| 2969 | static atomic_long_t calc_load_tasks; | 2970 | static atomic_long_t calc_load_tasks; |
| 2970 | static unsigned long calc_load_update; | 2971 | static unsigned long calc_load_update; |
| @@ -3034,15 +3035,6 @@ static void calc_load_account_active(struct rq *this_rq) | |||
| 3034 | } | 3035 | } |
| 3035 | 3036 | ||
| 3036 | /* | 3037 | /* |
| 3037 | * Externally visible per-cpu scheduler statistics: | ||
| 3038 | * cpu_nr_migrations(cpu) - number of migrations into that cpu | ||
| 3039 | */ | ||
| 3040 | u64 cpu_nr_migrations(int cpu) | ||
| 3041 | { | ||
| 3042 | return cpu_rq(cpu)->nr_migrations_in; | ||
| 3043 | } | ||
| 3044 | |||
| 3045 | /* | ||
| 3046 | * Update rq->cpu_load[] statistics. This function is usually called every | 3038 | * Update rq->cpu_load[] statistics. This function is usually called every |
| 3047 | * scheduler tick (TICK_NSEC). | 3039 | * scheduler tick (TICK_NSEC). |
| 3048 | */ | 3040 | */ |
| @@ -3164,7 +3156,7 @@ out: | |||
| 3164 | void sched_exec(void) | 3156 | void sched_exec(void) |
| 3165 | { | 3157 | { |
| 3166 | int new_cpu, this_cpu = get_cpu(); | 3158 | int new_cpu, this_cpu = get_cpu(); |
| 3167 | new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); | 3159 | new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); |
| 3168 | put_cpu(); | 3160 | put_cpu(); |
| 3169 | if (new_cpu != this_cpu) | 3161 | if (new_cpu != this_cpu) |
| 3170 | sched_migrate_task(current, new_cpu); | 3162 | sched_migrate_task(current, new_cpu); |
| @@ -3379,9 +3371,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 3379 | { | 3371 | { |
| 3380 | const struct sched_class *class; | 3372 | const struct sched_class *class; |
| 3381 | 3373 | ||
| 3382 | for (class = sched_class_highest; class; class = class->next) | 3374 | for_each_class(class) { |
| 3383 | if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) | 3375 | if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) |
| 3384 | return 1; | 3376 | return 1; |
| 3377 | } | ||
| 3385 | 3378 | ||
| 3386 | return 0; | 3379 | return 0; |
| 3387 | } | 3380 | } |
| @@ -3544,7 +3537,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group, | |||
| 3544 | * capacity but still has some space to pick up some load | 3537 | * capacity but still has some space to pick up some load |
| 3545 | * from other group and save more power | 3538 | * from other group and save more power |
| 3546 | */ | 3539 | */ |
| 3547 | if (sgs->sum_nr_running > sgs->group_capacity - 1) | 3540 | if (sgs->sum_nr_running + 1 > sgs->group_capacity) |
| 3548 | return; | 3541 | return; |
| 3549 | 3542 | ||
| 3550 | if (sgs->sum_nr_running > sds->leader_nr_running || | 3543 | if (sgs->sum_nr_running > sds->leader_nr_running || |
| @@ -3583,11 +3576,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | |||
| 3583 | *imbalance = sds->min_load_per_task; | 3576 | *imbalance = sds->min_load_per_task; |
| 3584 | sds->busiest = sds->group_min; | 3577 | sds->busiest = sds->group_min; |
| 3585 | 3578 | ||
| 3586 | if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { | ||
| 3587 | cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = | ||
| 3588 | group_first_cpu(sds->group_leader); | ||
| 3589 | } | ||
| 3590 | |||
| 3591 | return 1; | 3579 | return 1; |
| 3592 | 3580 | ||
| 3593 | } | 3581 | } |
| @@ -3612,8 +3600,105 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | |||
| 3612 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | 3600 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ |
| 3613 | 3601 | ||
| 3614 | 3602 | ||
| 3603 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | ||
| 3604 | { | ||
| 3605 | return SCHED_LOAD_SCALE; | ||
| 3606 | } | ||
| 3607 | |||
| 3608 | unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) | ||
| 3609 | { | ||
| 3610 | return default_scale_freq_power(sd, cpu); | ||
| 3611 | } | ||
| 3612 | |||
| 3613 | unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) | ||
| 3614 | { | ||
| 3615 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
| 3616 | unsigned long smt_gain = sd->smt_gain; | ||
| 3617 | |||
| 3618 | smt_gain /= weight; | ||
| 3619 | |||
| 3620 | return smt_gain; | ||
| 3621 | } | ||
| 3622 | |||
| 3623 | unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | ||
| 3624 | { | ||
| 3625 | return default_scale_smt_power(sd, cpu); | ||
| 3626 | } | ||
| 3627 | |||
| 3628 | unsigned long scale_rt_power(int cpu) | ||
| 3629 | { | ||
| 3630 | struct rq *rq = cpu_rq(cpu); | ||
| 3631 | u64 total, available; | ||
| 3632 | |||
| 3633 | sched_avg_update(rq); | ||
| 3634 | |||
| 3635 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | ||
| 3636 | available = total - rq->rt_avg; | ||
| 3637 | |||
| 3638 | if (unlikely((s64)total < SCHED_LOAD_SCALE)) | ||
| 3639 | total = SCHED_LOAD_SCALE; | ||
| 3640 | |||
| 3641 | total >>= SCHED_LOAD_SHIFT; | ||
| 3642 | |||
| 3643 | return div_u64(available, total); | ||
| 3644 | } | ||
| 3645 | |||
| 3646 | static void update_cpu_power(struct sched_domain *sd, int cpu) | ||
| 3647 | { | ||
| 3648 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
| 3649 | unsigned long power = SCHED_LOAD_SCALE; | ||
| 3650 | struct sched_group *sdg = sd->groups; | ||
| 3651 | |||
| 3652 | if (sched_feat(ARCH_POWER)) | ||
| 3653 | power *= arch_scale_freq_power(sd, cpu); | ||
| 3654 | else | ||
| 3655 | power *= default_scale_freq_power(sd, cpu); | ||
| 3656 | |||
| 3657 | power >>= SCHED_LOAD_SHIFT; | ||
| 3658 | |||
| 3659 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | ||
| 3660 | if (sched_feat(ARCH_POWER)) | ||
| 3661 | power *= arch_scale_smt_power(sd, cpu); | ||
| 3662 | else | ||
| 3663 | power *= default_scale_smt_power(sd, cpu); | ||
| 3664 | |||
| 3665 | power >>= SCHED_LOAD_SHIFT; | ||
| 3666 | } | ||
| 3667 | |||
| 3668 | power *= scale_rt_power(cpu); | ||
| 3669 | power >>= SCHED_LOAD_SHIFT; | ||
| 3670 | |||
| 3671 | if (!power) | ||
| 3672 | power = 1; | ||
| 3673 | |||
| 3674 | sdg->cpu_power = power; | ||
| 3675 | } | ||
| 3676 | |||
| 3677 | static void update_group_power(struct sched_domain *sd, int cpu) | ||
| 3678 | { | ||
| 3679 | struct sched_domain *child = sd->child; | ||
| 3680 | struct sched_group *group, *sdg = sd->groups; | ||
| 3681 | unsigned long power; | ||
| 3682 | |||
| 3683 | if (!child) { | ||
| 3684 | update_cpu_power(sd, cpu); | ||
| 3685 | return; | ||
| 3686 | } | ||
| 3687 | |||
| 3688 | power = 0; | ||
| 3689 | |||
| 3690 | group = child->groups; | ||
| 3691 | do { | ||
| 3692 | power += group->cpu_power; | ||
| 3693 | group = group->next; | ||
| 3694 | } while (group != child->groups); | ||
| 3695 | |||
| 3696 | sdg->cpu_power = power; | ||
| 3697 | } | ||
| 3698 | |||
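The new update_cpu_power() above builds a group's cpu_power as a chain of fixed-point scalings: start at SCHED_LOAD_SCALE, multiply in the frequency factor, the SMT factor and the RT/IRQ headroom from scale_rt_power(), shifting by SCHED_LOAD_SHIFT after each step. A standalone sketch of that arithmetic; the factor values (full frequency, an smt_gain of 1178 split over two siblings, roughly 10% of time lost to RT/IRQ work) are illustrative assumptions, not values taken from this patch:

    #include <stdio.h>

    #define SCHED_LOAD_SHIFT 10
    #define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)   /* 1024 */

    int main(void)
    {
        unsigned long power = SCHED_LOAD_SCALE;
        unsigned long freq  = 1024;  /* arch_scale_freq_power(): full speed          */
        unsigned long smt   =  589;  /* default_scale_smt_power(): 1178 / 2 siblings */
        unsigned long rt    =  922;  /* scale_rt_power(): ~10% of time in RT/IRQ     */

        power = (power * freq) >> SCHED_LOAD_SHIFT;
        power = (power * smt)  >> SCHED_LOAD_SHIFT;
        power = (power * rt)   >> SCHED_LOAD_SHIFT;
        if (!power)                  /* update_cpu_power() never lets it reach zero */
            power = 1;

        printf("effective cpu_power = %lu of %lu\n", power, SCHED_LOAD_SCALE);
        return 0;
    }

With these inputs the sibling ends up contributing roughly 530 of 1024, which is the figure the later capacity and imbalance calculations consume.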
| 3615 | /** | 3699 | /** |
| 3616 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 3700 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
| 3701 | * @sd: The sched_domain whose statistics are to be updated. | ||
| 3617 | * @group: sched_group whose statistics are to be updated. | 3702 | * @group: sched_group whose statistics are to be updated. |
| 3618 | * @this_cpu: Cpu for which load balance is currently performed. | 3703 | * @this_cpu: Cpu for which load balance is currently performed. |
| 3619 | * @idle: Idle status of this_cpu | 3704 | * @idle: Idle status of this_cpu |
| @@ -3624,7 +3709,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | |||
| 3624 | * @balance: Should we balance. | 3709 | * @balance: Should we balance. |
| 3625 | * @sgs: variable to hold the statistics for this group. | 3710 | * @sgs: variable to hold the statistics for this group. |
| 3626 | */ | 3711 | */ |
| 3627 | static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | 3712 | static inline void update_sg_lb_stats(struct sched_domain *sd, |
| 3713 | struct sched_group *group, int this_cpu, | ||
| 3628 | enum cpu_idle_type idle, int load_idx, int *sd_idle, | 3714 | enum cpu_idle_type idle, int load_idx, int *sd_idle, |
| 3629 | int local_group, const struct cpumask *cpus, | 3715 | int local_group, const struct cpumask *cpus, |
| 3630 | int *balance, struct sg_lb_stats *sgs) | 3716 | int *balance, struct sg_lb_stats *sgs) |
| @@ -3635,8 +3721,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | |||
| 3635 | unsigned long sum_avg_load_per_task; | 3721 | unsigned long sum_avg_load_per_task; |
| 3636 | unsigned long avg_load_per_task; | 3722 | unsigned long avg_load_per_task; |
| 3637 | 3723 | ||
| 3638 | if (local_group) | 3724 | if (local_group) { |
| 3639 | balance_cpu = group_first_cpu(group); | 3725 | balance_cpu = group_first_cpu(group); |
| 3726 | if (balance_cpu == this_cpu) | ||
| 3727 | update_group_power(sd, this_cpu); | ||
| 3728 | } | ||
| 3640 | 3729 | ||
| 3641 | /* Tally up the load of all CPUs in the group */ | 3730 | /* Tally up the load of all CPUs in the group */ |
| 3642 | sum_avg_load_per_task = avg_load_per_task = 0; | 3731 | sum_avg_load_per_task = avg_load_per_task = 0; |
| @@ -3685,8 +3774,7 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | |||
| 3685 | } | 3774 | } |
| 3686 | 3775 | ||
| 3687 | /* Adjust by relative CPU power of the group */ | 3776 | /* Adjust by relative CPU power of the group */ |
| 3688 | sgs->avg_load = sg_div_cpu_power(group, | 3777 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; |
| 3689 | sgs->group_load * SCHED_LOAD_SCALE); | ||
| 3690 | 3778 | ||
| 3691 | 3779 | ||
| 3692 | /* | 3780 | /* |
| @@ -3698,14 +3786,14 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | |||
| 3698 | * normalized nr_running number somewhere that negates | 3786 | * normalized nr_running number somewhere that negates |
| 3699 | * the hierarchy? | 3787 | * the hierarchy? |
| 3700 | */ | 3788 | */ |
| 3701 | avg_load_per_task = sg_div_cpu_power(group, | 3789 | avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) / |
| 3702 | sum_avg_load_per_task * SCHED_LOAD_SCALE); | 3790 | group->cpu_power; |
| 3703 | 3791 | ||
| 3704 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | 3792 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) |
| 3705 | sgs->group_imb = 1; | 3793 | sgs->group_imb = 1; |
| 3706 | 3794 | ||
| 3707 | sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; | 3795 | sgs->group_capacity = |
| 3708 | 3796 | DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | |
| 3709 | } | 3797 | } |
| 3710 | 3798 | ||
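group_capacity is now derived from the scaled cpu_power with DIV_ROUND_CLOSEST rather than truncating division, so a group whose power sits a little below a whole multiple of SCHED_LOAD_SCALE still counts the CPUs it really has. A tiny comparison with hypothetical power values:

    #include <stdio.h>

    #define SCHED_LOAD_SCALE 1024UL
    #define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))  /* as in kernel.h, positive x */

    int main(void)
    {
        unsigned long power[] = { 1178, 1800, 2048 };        /* invented group powers */

        for (int i = 0; i < 3; i++)
            printf("power %4lu -> truncated %lu, capacity %lu\n",
                   power[i],
                   power[i] / SCHED_LOAD_SCALE,
                   DIV_ROUND_CLOSEST(power[i], SCHED_LOAD_SCALE));
        return 0;
    }

The 1800 case is the interesting one: plain division reports capacity 1 while the rounded version keeps it at 2, so the group is not treated as overloaded the moment it runs two tasks.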
| 3711 | /** | 3799 | /** |
| @@ -3723,9 +3811,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
| 3723 | const struct cpumask *cpus, int *balance, | 3811 | const struct cpumask *cpus, int *balance, |
| 3724 | struct sd_lb_stats *sds) | 3812 | struct sd_lb_stats *sds) |
| 3725 | { | 3813 | { |
| 3814 | struct sched_domain *child = sd->child; | ||
| 3726 | struct sched_group *group = sd->groups; | 3815 | struct sched_group *group = sd->groups; |
| 3727 | struct sg_lb_stats sgs; | 3816 | struct sg_lb_stats sgs; |
| 3728 | int load_idx; | 3817 | int load_idx, prefer_sibling = 0; |
| 3818 | |||
| 3819 | if (child && child->flags & SD_PREFER_SIBLING) | ||
| 3820 | prefer_sibling = 1; | ||
| 3729 | 3821 | ||
| 3730 | init_sd_power_savings_stats(sd, sds, idle); | 3822 | init_sd_power_savings_stats(sd, sds, idle); |
| 3731 | load_idx = get_sd_load_idx(sd, idle); | 3823 | load_idx = get_sd_load_idx(sd, idle); |
| @@ -3736,14 +3828,22 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
| 3736 | local_group = cpumask_test_cpu(this_cpu, | 3828 | local_group = cpumask_test_cpu(this_cpu, |
| 3737 | sched_group_cpus(group)); | 3829 | sched_group_cpus(group)); |
| 3738 | memset(&sgs, 0, sizeof(sgs)); | 3830 | memset(&sgs, 0, sizeof(sgs)); |
| 3739 | update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, | 3831 | update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, |
| 3740 | local_group, cpus, balance, &sgs); | 3832 | local_group, cpus, balance, &sgs); |
| 3741 | 3833 | ||
| 3742 | if (local_group && balance && !(*balance)) | 3834 | if (local_group && balance && !(*balance)) |
| 3743 | return; | 3835 | return; |
| 3744 | 3836 | ||
| 3745 | sds->total_load += sgs.group_load; | 3837 | sds->total_load += sgs.group_load; |
| 3746 | sds->total_pwr += group->__cpu_power; | 3838 | sds->total_pwr += group->cpu_power; |
| 3839 | |||
| 3840 | /* | ||
| 3841 | * In case the child domain prefers tasks go to siblings | ||
| 3842 | * first, lower the group capacity to one so that we'll try | ||
| 3843 | * and move all the excess tasks away. | ||
| 3844 | */ | ||
| 3845 | if (prefer_sibling) | ||
| 3846 | sgs.group_capacity = min(sgs.group_capacity, 1UL); | ||
| 3747 | 3847 | ||
| 3748 | if (local_group) { | 3848 | if (local_group) { |
| 3749 | sds->this_load = sgs.avg_load; | 3849 | sds->this_load = sgs.avg_load; |
| @@ -3763,7 +3863,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
| 3763 | update_sd_power_savings_stats(group, sds, local_group, &sgs); | 3863 | update_sd_power_savings_stats(group, sds, local_group, &sgs); |
| 3764 | group = group->next; | 3864 | group = group->next; |
| 3765 | } while (group != sd->groups); | 3865 | } while (group != sd->groups); |
| 3766 | |||
| 3767 | } | 3866 | } |
| 3768 | 3867 | ||
| 3769 | /** | 3868 | /** |
| @@ -3801,28 +3900,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
| 3801 | * moving them. | 3900 | * moving them. |
| 3802 | */ | 3901 | */ |
| 3803 | 3902 | ||
| 3804 | pwr_now += sds->busiest->__cpu_power * | 3903 | pwr_now += sds->busiest->cpu_power * |
| 3805 | min(sds->busiest_load_per_task, sds->max_load); | 3904 | min(sds->busiest_load_per_task, sds->max_load); |
| 3806 | pwr_now += sds->this->__cpu_power * | 3905 | pwr_now += sds->this->cpu_power * |
| 3807 | min(sds->this_load_per_task, sds->this_load); | 3906 | min(sds->this_load_per_task, sds->this_load); |
| 3808 | pwr_now /= SCHED_LOAD_SCALE; | 3907 | pwr_now /= SCHED_LOAD_SCALE; |
| 3809 | 3908 | ||
| 3810 | /* Amount of load we'd subtract */ | 3909 | /* Amount of load we'd subtract */ |
| 3811 | tmp = sg_div_cpu_power(sds->busiest, | 3910 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / |
| 3812 | sds->busiest_load_per_task * SCHED_LOAD_SCALE); | 3911 | sds->busiest->cpu_power; |
| 3813 | if (sds->max_load > tmp) | 3912 | if (sds->max_load > tmp) |
| 3814 | pwr_move += sds->busiest->__cpu_power * | 3913 | pwr_move += sds->busiest->cpu_power * |
| 3815 | min(sds->busiest_load_per_task, sds->max_load - tmp); | 3914 | min(sds->busiest_load_per_task, sds->max_load - tmp); |
| 3816 | 3915 | ||
| 3817 | /* Amount of load we'd add */ | 3916 | /* Amount of load we'd add */ |
| 3818 | if (sds->max_load * sds->busiest->__cpu_power < | 3917 | if (sds->max_load * sds->busiest->cpu_power < |
| 3819 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) | 3918 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) |
| 3820 | tmp = sg_div_cpu_power(sds->this, | 3919 | tmp = (sds->max_load * sds->busiest->cpu_power) / |
| 3821 | sds->max_load * sds->busiest->__cpu_power); | 3920 | sds->this->cpu_power; |
| 3822 | else | 3921 | else |
| 3823 | tmp = sg_div_cpu_power(sds->this, | 3922 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / |
| 3824 | sds->busiest_load_per_task * SCHED_LOAD_SCALE); | 3923 | sds->this->cpu_power; |
| 3825 | pwr_move += sds->this->__cpu_power * | 3924 | pwr_move += sds->this->cpu_power * |
| 3826 | min(sds->this_load_per_task, sds->this_load + tmp); | 3925 | min(sds->this_load_per_task, sds->this_load + tmp); |
| 3827 | pwr_move /= SCHED_LOAD_SCALE; | 3926 | pwr_move /= SCHED_LOAD_SCALE; |
| 3828 | 3927 | ||
| @@ -3857,8 +3956,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
| 3857 | sds->max_load - sds->busiest_load_per_task); | 3956 | sds->max_load - sds->busiest_load_per_task); |
| 3858 | 3957 | ||
| 3859 | /* How much load to actually move to equalise the imbalance */ | 3958 | /* How much load to actually move to equalise the imbalance */ |
| 3860 | *imbalance = min(max_pull * sds->busiest->__cpu_power, | 3959 | *imbalance = min(max_pull * sds->busiest->cpu_power, |
| 3861 | (sds->avg_load - sds->this_load) * sds->this->__cpu_power) | 3960 | (sds->avg_load - sds->this_load) * sds->this->cpu_power) |
| 3862 | / SCHED_LOAD_SCALE; | 3961 | / SCHED_LOAD_SCALE; |
| 3863 | 3962 | ||
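The imbalance above is the smaller of what the busiest group can shed (max_pull, weighted by its cpu_power) and what the local group can absorb (avg_load - this_load, weighted by its own cpu_power), converted back out of the SCHED_LOAD_SCALE fixed point. A worked example with invented figures:

    #include <stdio.h>

    #define SCHED_LOAD_SCALE 1024UL

    int main(void)
    {
        /* Hypothetical normalized loads, already scaled by SCHED_LOAD_SCALE. */
        unsigned long max_pull      = 512;   /* busiest group's excess load */
        unsigned long busiest_power = 2048;  /* two full CPUs               */
        unsigned long room          = 256;   /* avg_load - this_load        */
        unsigned long this_power    = 1024;  /* one full CPU                */

        unsigned long shed   = max_pull * busiest_power;   /* 1048576 */
        unsigned long absorb = room * this_power;          /*  262144 */
        unsigned long imbalance =
            (shed < absorb ? shed : absorb) / SCHED_LOAD_SCALE;

        printf("imbalance = %lu\n", imbalance);            /* 256 */
        return 0;
    }

So the subsequent load-balancing pass will try to migrate about 256 units of weighted load, capped by the smaller of the two sides.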
| 3864 | /* | 3963 | /* |
| @@ -3988,15 +4087,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | |||
| 3988 | int i; | 4087 | int i; |
| 3989 | 4088 | ||
| 3990 | for_each_cpu(i, sched_group_cpus(group)) { | 4089 | for_each_cpu(i, sched_group_cpus(group)) { |
| 4090 | unsigned long power = power_of(i); | ||
| 4091 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | ||
| 3991 | unsigned long wl; | 4092 | unsigned long wl; |
| 3992 | 4093 | ||
| 3993 | if (!cpumask_test_cpu(i, cpus)) | 4094 | if (!cpumask_test_cpu(i, cpus)) |
| 3994 | continue; | 4095 | continue; |
| 3995 | 4096 | ||
| 3996 | rq = cpu_rq(i); | 4097 | rq = cpu_rq(i); |
| 3997 | wl = weighted_cpuload(i); | 4098 | wl = weighted_cpuload(i) * SCHED_LOAD_SCALE; |
| 4099 | wl /= power; | ||
| 3998 | 4100 | ||
| 3999 | if (rq->nr_running == 1 && wl > imbalance) | 4101 | if (capacity && rq->nr_running == 1 && wl > imbalance) |
| 4000 | continue; | 4102 | continue; |
| 4001 | 4103 | ||
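find_busiest_queue() now rescales each runqueue's raw weighted load by that CPU's power before comparing, so a frequency-limited or SMT-sharing CPU can be the busiest even when its raw number is lower, and the single-task skip above only applies when the CPU's rounded capacity is non-zero. A short illustration with made-up loads:

    #include <stdio.h>

    #define SCHED_LOAD_SCALE 1024UL

    int main(void)
    {
        /* weighted_cpuload() and power_of() values, both hypothetical. */
        unsigned long load_a = 2048, power_a = 1024;  /* full-power CPU */
        unsigned long load_b = 1536, power_b =  589;  /* SMT sibling    */

        unsigned long wl_a = load_a * SCHED_LOAD_SCALE / power_a;  /* 2048  */
        unsigned long wl_b = load_b * SCHED_LOAD_SCALE / power_b;  /* ~2670 */

        printf("busiest: %s\n",
               wl_b > wl_a ? "B, despite the smaller raw load" : "A");
        return 0;
    }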
| 4002 | if (wl > max_load) { | 4104 | if (wl > max_load) { |
| @@ -4032,7 +4134,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 4032 | unsigned long flags; | 4134 | unsigned long flags; |
| 4033 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | 4135 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); |
| 4034 | 4136 | ||
| 4035 | cpumask_setall(cpus); | 4137 | cpumask_copy(cpus, cpu_online_mask); |
| 4036 | 4138 | ||
| 4037 | /* | 4139 | /* |
| 4038 | * When power savings policy is enabled for the parent domain, idle | 4140 | * When power savings policy is enabled for the parent domain, idle |
| @@ -4195,7 +4297,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | |||
| 4195 | int all_pinned = 0; | 4297 | int all_pinned = 0; |
| 4196 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | 4298 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); |
| 4197 | 4299 | ||
| 4198 | cpumask_setall(cpus); | 4300 | cpumask_copy(cpus, cpu_online_mask); |
| 4199 | 4301 | ||
| 4200 | /* | 4302 | /* |
| 4201 | * When power savings policy is enabled for the parent domain, idle | 4303 | * When power savings policy is enabled for the parent domain, idle |
| @@ -4335,6 +4437,11 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 4335 | int pulled_task = 0; | 4437 | int pulled_task = 0; |
| 4336 | unsigned long next_balance = jiffies + HZ; | 4438 | unsigned long next_balance = jiffies + HZ; |
| 4337 | 4439 | ||
| 4440 | this_rq->idle_stamp = this_rq->clock; | ||
| 4441 | |||
| 4442 | if (this_rq->avg_idle < sysctl_sched_migration_cost) | ||
| 4443 | return; | ||
| 4444 | |||
| 4338 | for_each_domain(this_cpu, sd) { | 4445 | for_each_domain(this_cpu, sd) { |
| 4339 | unsigned long interval; | 4446 | unsigned long interval; |
| 4340 | 4447 | ||
| @@ -4349,8 +4456,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 4349 | interval = msecs_to_jiffies(sd->balance_interval); | 4456 | interval = msecs_to_jiffies(sd->balance_interval); |
| 4350 | if (time_after(next_balance, sd->last_balance + interval)) | 4457 | if (time_after(next_balance, sd->last_balance + interval)) |
| 4351 | next_balance = sd->last_balance + interval; | 4458 | next_balance = sd->last_balance + interval; |
| 4352 | if (pulled_task) | 4459 | if (pulled_task) { |
| 4460 | this_rq->idle_stamp = 0; | ||
| 4353 | break; | 4461 | break; |
| 4462 | } | ||
| 4354 | } | 4463 | } |
| 4355 | if (pulled_task || time_after(jiffies, this_rq->next_balance)) { | 4464 | if (pulled_task || time_after(jiffies, this_rq->next_balance)) { |
| 4356 | /* | 4465 | /* |
| @@ -4952,8 +5061,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, | |||
| 4952 | p->gtime = cputime_add(p->gtime, cputime); | 5061 | p->gtime = cputime_add(p->gtime, cputime); |
| 4953 | 5062 | ||
| 4954 | /* Add guest time to cpustat. */ | 5063 | /* Add guest time to cpustat. */ |
| 4955 | cpustat->user = cputime64_add(cpustat->user, tmp); | 5064 | if (TASK_NICE(p) > 0) { |
| 4956 | cpustat->guest = cputime64_add(cpustat->guest, tmp); | 5065 | cpustat->nice = cputime64_add(cpustat->nice, tmp); |
| 5066 | cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); | ||
| 5067 | } else { | ||
| 5068 | cpustat->user = cputime64_add(cpustat->user, tmp); | ||
| 5069 | cpustat->guest = cputime64_add(cpustat->guest, tmp); | ||
| 5070 | } | ||
| 4957 | } | 5071 | } |
| 4958 | 5072 | ||
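With this change, guest time run by a niced vcpu thread is charged to the nice and guest_nice buckets instead of user and guest; either way the guest figure remains a subset of the user or nice figure, as the paired additions above show. Both buckets are exported as the last two fields of the cpu lines in /proc/stat (layout as documented in proc(5), an assumption not visible in this hunk); a minimal reader:

    #include <stdio.h>

    int main(void)
    {
        /* user nice system idle iowait irq softirq steal guest guest_nice */
        unsigned long long v[10];
        FILE *f = fopen("/proc/stat", "r");

        if (!f)
            return 1;
        if (fscanf(f, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
                   &v[0], &v[1], &v[2], &v[3], &v[4],
                   &v[5], &v[6], &v[7], &v[8], &v[9]) != 10) {
            fclose(f);
            return 1;
        }
        fclose(f);
        printf("guest=%llu guest_nice=%llu (USER_HZ ticks)\n", v[8], v[9]);
        return 0;
    }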
| 4959 | /* | 5073 | /* |
| @@ -5031,17 +5145,16 @@ void account_idle_time(cputime_t cputime) | |||
| 5031 | */ | 5145 | */ |
| 5032 | void account_process_tick(struct task_struct *p, int user_tick) | 5146 | void account_process_tick(struct task_struct *p, int user_tick) |
| 5033 | { | 5147 | { |
| 5034 | cputime_t one_jiffy = jiffies_to_cputime(1); | 5148 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); |
| 5035 | cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy); | ||
| 5036 | struct rq *rq = this_rq(); | 5149 | struct rq *rq = this_rq(); |
| 5037 | 5150 | ||
| 5038 | if (user_tick) | 5151 | if (user_tick) |
| 5039 | account_user_time(p, one_jiffy, one_jiffy_scaled); | 5152 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); |
| 5040 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | 5153 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) |
| 5041 | account_system_time(p, HARDIRQ_OFFSET, one_jiffy, | 5154 | account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, |
| 5042 | one_jiffy_scaled); | 5155 | one_jiffy_scaled); |
| 5043 | else | 5156 | else |
| 5044 | account_idle_time(one_jiffy); | 5157 | account_idle_time(cputime_one_jiffy); |
| 5045 | } | 5158 | } |
| 5046 | 5159 | ||
| 5047 | /* | 5160 | /* |
| @@ -5069,60 +5182,86 @@ void account_idle_ticks(unsigned long ticks) | |||
| 5069 | * Use precise platform statistics if available: | 5182 | * Use precise platform statistics if available: |
| 5070 | */ | 5183 | */ |
| 5071 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | 5184 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING |
| 5072 | cputime_t task_utime(struct task_struct *p) | 5185 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) |
| 5073 | { | 5186 | { |
| 5074 | return p->utime; | 5187 | *ut = p->utime; |
| 5188 | *st = p->stime; | ||
| 5075 | } | 5189 | } |
| 5076 | 5190 | ||
| 5077 | cputime_t task_stime(struct task_struct *p) | 5191 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) |
| 5078 | { | 5192 | { |
| 5079 | return p->stime; | 5193 | struct task_cputime cputime; |
| 5194 | |||
| 5195 | thread_group_cputime(p, &cputime); | ||
| 5196 | |||
| 5197 | *ut = cputime.utime; | ||
| 5198 | *st = cputime.stime; | ||
| 5080 | } | 5199 | } |
| 5081 | #else | 5200 | #else |
| 5082 | cputime_t task_utime(struct task_struct *p) | 5201 | |
| 5202 | #ifndef nsecs_to_cputime | ||
| 5203 | # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) | ||
| 5204 | #endif | ||
| 5205 | |||
| 5206 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
| 5083 | { | 5207 | { |
| 5084 | clock_t utime = cputime_to_clock_t(p->utime), | 5208 | cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); |
| 5085 | total = utime + cputime_to_clock_t(p->stime); | ||
| 5086 | u64 temp; | ||
| 5087 | 5209 | ||
| 5088 | /* | 5210 | /* |
| 5089 | * Use CFS's precise accounting: | 5211 | * Use CFS's precise accounting: |
| 5090 | */ | 5212 | */ |
| 5091 | temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); | 5213 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); |
| 5092 | 5214 | ||
| 5093 | if (total) { | 5215 | if (total) { |
| 5094 | temp *= utime; | 5216 | u64 temp; |
| 5217 | |||
| 5218 | temp = (u64)(rtime * utime); | ||
| 5095 | do_div(temp, total); | 5219 | do_div(temp, total); |
| 5096 | } | 5220 | utime = (cputime_t)temp; |
| 5097 | utime = (clock_t)temp; | 5221 | } else |
| 5222 | utime = rtime; | ||
| 5223 | |||
| 5224 | /* | ||
| 5225 | * Compare with previous values, to keep monotonicity: | ||
| 5226 | */ | ||
| 5227 | p->prev_utime = max(p->prev_utime, utime); | ||
| 5228 | p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); | ||
| 5098 | 5229 | ||
| 5099 | p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); | 5230 | *ut = p->prev_utime; |
| 5100 | return p->prev_utime; | 5231 | *st = p->prev_stime; |
| 5101 | } | 5232 | } |
| 5102 | 5233 | ||
| 5103 | cputime_t task_stime(struct task_struct *p) | 5234 | /* |
| 5235 | * Must be called with siglock held. | ||
| 5236 | */ | ||
| 5237 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
| 5104 | { | 5238 | { |
| 5105 | clock_t stime; | 5239 | struct signal_struct *sig = p->signal; |
| 5240 | struct task_cputime cputime; | ||
| 5241 | cputime_t rtime, utime, total; | ||
| 5106 | 5242 | ||
| 5107 | /* | 5243 | thread_group_cputime(p, &cputime); |
| 5108 | * Use CFS's precise accounting. (we subtract utime from | ||
| 5109 | * the total, to make sure the total observed by userspace | ||
| 5110 | * grows monotonically - apps rely on that): | ||
| 5111 | */ | ||
| 5112 | stime = nsec_to_clock_t(p->se.sum_exec_runtime) - | ||
| 5113 | cputime_to_clock_t(task_utime(p)); | ||
| 5114 | 5244 | ||
| 5115 | if (stime >= 0) | 5245 | total = cputime_add(cputime.utime, cputime.stime); |
| 5116 | p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); | 5246 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); |
| 5117 | 5247 | ||
| 5118 | return p->prev_stime; | 5248 | if (total) { |
| 5119 | } | 5249 | u64 temp; |
| 5120 | #endif | ||
| 5121 | 5250 | ||
| 5122 | inline cputime_t task_gtime(struct task_struct *p) | 5251 | temp = (u64)(rtime * cputime.utime); |
| 5123 | { | 5252 | do_div(temp, total); |
| 5124 | return p->gtime; | 5253 | utime = (cputime_t)temp; |
| 5254 | } else | ||
| 5255 | utime = rtime; | ||
| 5256 | |||
| 5257 | sig->prev_utime = max(sig->prev_utime, utime); | ||
| 5258 | sig->prev_stime = max(sig->prev_stime, | ||
| 5259 | cputime_sub(rtime, sig->prev_utime)); | ||
| 5260 | |||
| 5261 | *ut = sig->prev_utime; | ||
| 5262 | *st = sig->prev_stime; | ||
| 5125 | } | 5263 | } |
| 5264 | #endif | ||
| 5126 | 5265 | ||
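task_times() now splits the nanosecond-accurate sum_exec_runtime between user and system time in the ratio of the tick-sampled utime and stime, then clamps against the values last handed to user space so the reported numbers never go backwards (thread_group_times() applies the same scheme group-wide, under siglock). A plain-integer sketch of the arithmetic, ignoring the cputime_t conversions:

    #include <stdio.h>

    static unsigned long long prev_utime, prev_stime;  /* per-task state in the kernel */

    static void split(unsigned long long rtime,   /* precise runtime        */
                      unsigned long long utime,   /* tick-based user time   */
                      unsigned long long stime,   /* tick-based system time */
                      unsigned long long *ut, unsigned long long *st)
    {
        unsigned long long total = utime + stime;

        utime = total ? rtime * utime / total : rtime;

        if (utime > prev_utime)
            prev_utime = utime;                   /* monotonic user time   */
        if (rtime - prev_utime > prev_stime)
            prev_stime = rtime - prev_utime;      /* monotonic system time */

        *ut = prev_utime;
        *st = prev_stime;
    }

    int main(void)
    {
        unsigned long long ut, st;

        split(1000, 30, 10, &ut, &st);   /* 3:1 user/system ratio  */
        printf("%llu %llu\n", ut, st);   /* 750 250                */
        split(1010, 30, 10, &ut, &st);   /* runtime advanced a bit */
        printf("%llu %llu\n", ut, st);   /* 757 253                */
        return 0;
    }

The two max-style comparisons only ever let the reported values grow, which is exactly what the prev_utime/prev_stime fields exist for.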
| 5127 | /* | 5266 | /* |
| 5128 | * This function gets called by the timer code, with HZ frequency. | 5267 | * This function gets called by the timer code, with HZ frequency. |
| @@ -5145,7 +5284,7 @@ void scheduler_tick(void) | |||
| 5145 | curr->sched_class->task_tick(rq, curr, 0); | 5284 | curr->sched_class->task_tick(rq, curr, 0); |
| 5146 | spin_unlock(&rq->lock); | 5285 | spin_unlock(&rq->lock); |
| 5147 | 5286 | ||
| 5148 | perf_counter_task_tick(curr, cpu); | 5287 | perf_event_task_tick(curr, cpu); |
| 5149 | 5288 | ||
| 5150 | #ifdef CONFIG_SMP | 5289 | #ifdef CONFIG_SMP |
| 5151 | rq->idle_at_tick = idle_cpu(cpu); | 5290 | rq->idle_at_tick = idle_cpu(cpu); |
| @@ -5257,14 +5396,13 @@ static inline void schedule_debug(struct task_struct *prev) | |||
| 5257 | #endif | 5396 | #endif |
| 5258 | } | 5397 | } |
| 5259 | 5398 | ||
| 5260 | static void put_prev_task(struct rq *rq, struct task_struct *prev) | 5399 | static void put_prev_task(struct rq *rq, struct task_struct *p) |
| 5261 | { | 5400 | { |
| 5262 | if (prev->state == TASK_RUNNING) { | 5401 | u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; |
| 5263 | u64 runtime = prev->se.sum_exec_runtime; | ||
| 5264 | 5402 | ||
| 5265 | runtime -= prev->se.prev_sum_exec_runtime; | 5403 | update_avg(&p->se.avg_running, runtime); |
| 5266 | runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); | ||
| 5267 | 5404 | ||
| 5405 | if (p->state == TASK_RUNNING) { | ||
| 5268 | /* | 5406 | /* |
| 5269 | * In order to avoid avg_overlap growing stale when we are | 5407 | * In order to avoid avg_overlap growing stale when we are |
| 5270 | * indeed overlapping and hence not getting put to sleep, grow | 5408 | * indeed overlapping and hence not getting put to sleep, grow |
| @@ -5274,9 +5412,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev) | |||
| 5274 | * correlates to the amount of cache footprint a task can | 5412 | * correlates to the amount of cache footprint a task can |
| 5275 | * build up. | 5413 | * build up. |
| 5276 | */ | 5414 | */ |
| 5277 | update_avg(&prev->se.avg_overlap, runtime); | 5415 | runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); |
| 5416 | update_avg(&p->se.avg_overlap, runtime); | ||
| 5417 | } else { | ||
| 5418 | update_avg(&p->se.avg_running, 0); | ||
| 5278 | } | 5419 | } |
| 5279 | prev->sched_class->put_prev_task(rq, prev); | 5420 | p->sched_class->put_prev_task(rq, p); |
| 5280 | } | 5421 | } |
| 5281 | 5422 | ||
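put_prev_task() now also feeds se.avg_running: the just-finished stint of runtime while the task stayed runnable, or 0 when it blocked. update_avg() itself is not shown in this section; elsewhere in sched.c it is a simple exponential moving average that folds in one eighth of the difference per sample, and the sketch below assumes that shape:

    #include <stdio.h>

    /* Assumed helper: new_avg = old_avg + (sample - old_avg) / 8 */
    static void update_avg(unsigned long long *avg, unsigned long long sample)
    {
        long long diff = (long long)sample - (long long)*avg;
        *avg += diff / 8;
    }

    int main(void)
    {
        unsigned long long avg_running = 0;
        unsigned long long samples[] = { 800000, 900000, 0, 850000 };  /* ns, invented */

        for (int i = 0; i < 4; i++) {
            update_avg(&avg_running, samples[i]);  /* a 0 models a task that blocked */
            printf("sample %d -> avg_running = %llu\n", i, avg_running);
        }
        return 0;
    }

The average decays quickly once a task starts sleeping, which makes avg_running a rough measure of how long the task tends to run when it does get the CPU.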
| 5282 | /* | 5423 | /* |
| @@ -5325,7 +5466,7 @@ need_resched: | |||
| 5325 | preempt_disable(); | 5466 | preempt_disable(); |
| 5326 | cpu = smp_processor_id(); | 5467 | cpu = smp_processor_id(); |
| 5327 | rq = cpu_rq(cpu); | 5468 | rq = cpu_rq(cpu); |
| 5328 | rcu_qsctr_inc(cpu); | 5469 | rcu_sched_qs(cpu); |
| 5329 | prev = rq->curr; | 5470 | prev = rq->curr; |
| 5330 | switch_count = &prev->nivcsw; | 5471 | switch_count = &prev->nivcsw; |
| 5331 | 5472 | ||
| @@ -5349,10 +5490,7 @@ need_resched_nonpreemptible: | |||
| 5349 | switch_count = &prev->nvcsw; | 5490 | switch_count = &prev->nvcsw; |
| 5350 | } | 5491 | } |
| 5351 | 5492 | ||
| 5352 | #ifdef CONFIG_SMP | 5493 | pre_schedule(rq, prev); |
| 5353 | if (prev->sched_class->pre_schedule) | ||
| 5354 | prev->sched_class->pre_schedule(rq, prev); | ||
| 5355 | #endif | ||
| 5356 | 5494 | ||
| 5357 | if (unlikely(!rq->nr_running)) | 5495 | if (unlikely(!rq->nr_running)) |
| 5358 | idle_balance(cpu, rq); | 5496 | idle_balance(cpu, rq); |
| @@ -5362,7 +5500,7 @@ need_resched_nonpreemptible: | |||
| 5362 | 5500 | ||
| 5363 | if (likely(prev != next)) { | 5501 | if (likely(prev != next)) { |
| 5364 | sched_info_switch(prev, next); | 5502 | sched_info_switch(prev, next); |
| 5365 | perf_counter_task_sched_out(prev, next, cpu); | 5503 | perf_event_task_sched_out(prev, next, cpu); |
| 5366 | 5504 | ||
| 5367 | rq->nr_switches++; | 5505 | rq->nr_switches++; |
| 5368 | rq->curr = next; | 5506 | rq->curr = next; |
| @@ -5378,6 +5516,8 @@ need_resched_nonpreemptible: | |||
| 5378 | } else | 5516 | } else |
| 5379 | spin_unlock_irq(&rq->lock); | 5517 | spin_unlock_irq(&rq->lock); |
| 5380 | 5518 | ||
| 5519 | post_schedule(rq); | ||
| 5520 | |||
| 5381 | if (unlikely(reacquire_kernel_lock(current) < 0)) | 5521 | if (unlikely(reacquire_kernel_lock(current) < 0)) |
| 5382 | goto need_resched_nonpreemptible; | 5522 | goto need_resched_nonpreemptible; |
| 5383 | 5523 | ||
| @@ -5387,7 +5527,7 @@ need_resched_nonpreemptible: | |||
| 5387 | } | 5527 | } |
| 5388 | EXPORT_SYMBOL(schedule); | 5528 | EXPORT_SYMBOL(schedule); |
| 5389 | 5529 | ||
| 5390 | #ifdef CONFIG_SMP | 5530 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
| 5391 | /* | 5531 | /* |
| 5392 | * Look out! "owner" is an entirely speculative pointer | 5532 | * Look out! "owner" is an entirely speculative pointer |
| 5393 | * access and not reliable. | 5533 | * access and not reliable. |
| @@ -5509,10 +5649,10 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
| 5509 | 5649 | ||
| 5510 | #endif /* CONFIG_PREEMPT */ | 5650 | #endif /* CONFIG_PREEMPT */ |
| 5511 | 5651 | ||
| 5512 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, | 5652 | int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, |
| 5513 | void *key) | 5653 | void *key) |
| 5514 | { | 5654 | { |
| 5515 | return try_to_wake_up(curr->private, mode, sync); | 5655 | return try_to_wake_up(curr->private, mode, wake_flags); |
| 5516 | } | 5656 | } |
| 5517 | EXPORT_SYMBOL(default_wake_function); | 5657 | EXPORT_SYMBOL(default_wake_function); |
| 5518 | 5658 | ||
| @@ -5526,14 +5666,14 @@ EXPORT_SYMBOL(default_wake_function); | |||
| 5526 | * zero in this (rare) case, and we handle it by continuing to scan the queue. | 5666 | * zero in this (rare) case, and we handle it by continuing to scan the queue. |
| 5527 | */ | 5667 | */ |
| 5528 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | 5668 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, |
| 5529 | int nr_exclusive, int sync, void *key) | 5669 | int nr_exclusive, int wake_flags, void *key) |
| 5530 | { | 5670 | { |
| 5531 | wait_queue_t *curr, *next; | 5671 | wait_queue_t *curr, *next; |
| 5532 | 5672 | ||
| 5533 | list_for_each_entry_safe(curr, next, &q->task_list, task_list) { | 5673 | list_for_each_entry_safe(curr, next, &q->task_list, task_list) { |
| 5534 | unsigned flags = curr->flags; | 5674 | unsigned flags = curr->flags; |
| 5535 | 5675 | ||
| 5536 | if (curr->func(curr, mode, sync, key) && | 5676 | if (curr->func(curr, mode, wake_flags, key) && |
| 5537 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) | 5677 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) |
| 5538 | break; | 5678 | break; |
| 5539 | } | 5679 | } |
| @@ -5594,16 +5734,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, | |||
| 5594 | int nr_exclusive, void *key) | 5734 | int nr_exclusive, void *key) |
| 5595 | { | 5735 | { |
| 5596 | unsigned long flags; | 5736 | unsigned long flags; |
| 5597 | int sync = 1; | 5737 | int wake_flags = WF_SYNC; |
| 5598 | 5738 | ||
| 5599 | if (unlikely(!q)) | 5739 | if (unlikely(!q)) |
| 5600 | return; | 5740 | return; |
| 5601 | 5741 | ||
| 5602 | if (unlikely(!nr_exclusive)) | 5742 | if (unlikely(!nr_exclusive)) |
| 5603 | sync = 0; | 5743 | wake_flags = 0; |
| 5604 | 5744 | ||
| 5605 | spin_lock_irqsave(&q->lock, flags); | 5745 | spin_lock_irqsave(&q->lock, flags); |
| 5606 | __wake_up_common(q, mode, nr_exclusive, sync, key); | 5746 | __wake_up_common(q, mode, nr_exclusive, wake_flags, key); |
| 5607 | spin_unlock_irqrestore(&q->lock, flags); | 5747 | spin_unlock_irqrestore(&q->lock, flags); |
| 5608 | } | 5748 | } |
| 5609 | EXPORT_SYMBOL_GPL(__wake_up_sync_key); | 5749 | EXPORT_SYMBOL_GPL(__wake_up_sync_key); |
| @@ -6081,22 +6221,14 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | |||
| 6081 | BUG_ON(p->se.on_rq); | 6221 | BUG_ON(p->se.on_rq); |
| 6082 | 6222 | ||
| 6083 | p->policy = policy; | 6223 | p->policy = policy; |
| 6084 | switch (p->policy) { | ||
| 6085 | case SCHED_NORMAL: | ||
| 6086 | case SCHED_BATCH: | ||
| 6087 | case SCHED_IDLE: | ||
| 6088 | p->sched_class = &fair_sched_class; | ||
| 6089 | break; | ||
| 6090 | case SCHED_FIFO: | ||
| 6091 | case SCHED_RR: | ||
| 6092 | p->sched_class = &rt_sched_class; | ||
| 6093 | break; | ||
| 6094 | } | ||
| 6095 | |||
| 6096 | p->rt_priority = prio; | 6224 | p->rt_priority = prio; |
| 6097 | p->normal_prio = normal_prio(p); | 6225 | p->normal_prio = normal_prio(p); |
| 6098 | /* we are holding p->pi_lock already */ | 6226 | /* we are holding p->pi_lock already */ |
| 6099 | p->prio = rt_mutex_getprio(p); | 6227 | p->prio = rt_mutex_getprio(p); |
| 6228 | if (rt_prio(p->prio)) | ||
| 6229 | p->sched_class = &rt_sched_class; | ||
| 6230 | else | ||
| 6231 | p->sched_class = &fair_sched_class; | ||
| 6100 | set_load_weight(p); | 6232 | set_load_weight(p); |
| 6101 | } | 6233 | } |
| 6102 | 6234 | ||
| @@ -6123,17 +6255,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy, | |||
| 6123 | unsigned long flags; | 6255 | unsigned long flags; |
| 6124 | const struct sched_class *prev_class = p->sched_class; | 6256 | const struct sched_class *prev_class = p->sched_class; |
| 6125 | struct rq *rq; | 6257 | struct rq *rq; |
| 6258 | int reset_on_fork; | ||
| 6126 | 6259 | ||
| 6127 | /* may grab non-irq protected spin_locks */ | 6260 | /* may grab non-irq protected spin_locks */ |
| 6128 | BUG_ON(in_interrupt()); | 6261 | BUG_ON(in_interrupt()); |
| 6129 | recheck: | 6262 | recheck: |
| 6130 | /* double check policy once rq lock held */ | 6263 | /* double check policy once rq lock held */ |
| 6131 | if (policy < 0) | 6264 | if (policy < 0) { |
| 6265 | reset_on_fork = p->sched_reset_on_fork; | ||
| 6132 | policy = oldpolicy = p->policy; | 6266 | policy = oldpolicy = p->policy; |
| 6133 | else if (policy != SCHED_FIFO && policy != SCHED_RR && | 6267 | } else { |
| 6134 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | 6268 | reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); |
| 6135 | policy != SCHED_IDLE) | 6269 | policy &= ~SCHED_RESET_ON_FORK; |
| 6136 | return -EINVAL; | 6270 | |
| 6271 | if (policy != SCHED_FIFO && policy != SCHED_RR && | ||
| 6272 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | ||
| 6273 | policy != SCHED_IDLE) | ||
| 6274 | return -EINVAL; | ||
| 6275 | } | ||
| 6276 | |||
| 6137 | /* | 6277 | /* |
| 6138 | * Valid priorities for SCHED_FIFO and SCHED_RR are | 6278 | * Valid priorities for SCHED_FIFO and SCHED_RR are |
| 6139 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, | 6279 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, |
| @@ -6177,6 +6317,10 @@ recheck: | |||
| 6177 | /* can't change other user's priorities */ | 6317 | /* can't change other user's priorities */ |
| 6178 | if (!check_same_owner(p)) | 6318 | if (!check_same_owner(p)) |
| 6179 | return -EPERM; | 6319 | return -EPERM; |
| 6320 | |||
| 6321 | /* Normal users shall not reset the sched_reset_on_fork flag */ | ||
| 6322 | if (p->sched_reset_on_fork && !reset_on_fork) | ||
| 6323 | return -EPERM; | ||
| 6180 | } | 6324 | } |
| 6181 | 6325 | ||
| 6182 | if (user) { | 6326 | if (user) { |
| @@ -6220,6 +6364,8 @@ recheck: | |||
| 6220 | if (running) | 6364 | if (running) |
| 6221 | p->sched_class->put_prev_task(rq, p); | 6365 | p->sched_class->put_prev_task(rq, p); |
| 6222 | 6366 | ||
| 6367 | p->sched_reset_on_fork = reset_on_fork; | ||
| 6368 | |||
| 6223 | oldprio = p->prio; | 6369 | oldprio = p->prio; |
| 6224 | __setscheduler(rq, p, policy, param->sched_priority); | 6370 | __setscheduler(rq, p, policy, param->sched_priority); |
| 6225 | 6371 | ||
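The reset_on_fork plumbing above lets user space OR a SCHED_RESET_ON_FORK flag into the policy argument of sched_setscheduler(): unprivileged callers may set it but not clear it, and sched_getscheduler() reports it back OR-ed into the policy (see the following hunk). A hedged user-space example; the flag's numeric value comes from the companion sched.h change, not from this file:

    #include <sched.h>
    #include <stdio.h>

    #ifndef SCHED_RESET_ON_FORK
    #define SCHED_RESET_ON_FORK 0x40000000  /* assumed value from the header patch */
    #endif

    int main(void)
    {
        struct sched_param sp = { .sched_priority = 10 };

        /* This task runs SCHED_FIFO, but any child it forks starts out as a
         * normal task again instead of inheriting the RT policy. */
        if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp) == -1) {
            perror("sched_setscheduler");
            return 1;
        }
        printf("policy+flags now %#x\n", sched_getscheduler(0));
        return 0;
    }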
| @@ -6336,14 +6482,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) | |||
| 6336 | if (p) { | 6482 | if (p) { |
| 6337 | retval = security_task_getscheduler(p); | 6483 | retval = security_task_getscheduler(p); |
| 6338 | if (!retval) | 6484 | if (!retval) |
| 6339 | retval = p->policy; | 6485 | retval = p->policy |
| 6486 | | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); | ||
| 6340 | } | 6487 | } |
| 6341 | read_unlock(&tasklist_lock); | 6488 | read_unlock(&tasklist_lock); |
| 6342 | return retval; | 6489 | return retval; |
| 6343 | } | 6490 | } |
| 6344 | 6491 | ||
| 6345 | /** | 6492 | /** |
| 6346 | * sys_sched_getscheduler - get the RT priority of a thread | 6493 | * sys_sched_getparam - get the RT priority of a thread |
| 6347 | * @pid: the pid in question. | 6494 | * @pid: the pid in question. |
| 6348 | * @param: structure containing the RT priority. | 6495 | * @param: structure containing the RT priority. |
| 6349 | */ | 6496 | */ |
| @@ -6571,19 +6718,9 @@ static inline int should_resched(void) | |||
| 6571 | 6718 | ||
| 6572 | static void __cond_resched(void) | 6719 | static void __cond_resched(void) |
| 6573 | { | 6720 | { |
| 6574 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 6721 | add_preempt_count(PREEMPT_ACTIVE); |
| 6575 | __might_sleep(__FILE__, __LINE__); | 6722 | schedule(); |
| 6576 | #endif | 6723 | sub_preempt_count(PREEMPT_ACTIVE); |
| 6577 | /* | ||
| 6578 | * The BKS might be reacquired before we have dropped | ||
| 6579 | * PREEMPT_ACTIVE, which could trigger a second | ||
| 6580 | * cond_resched() call. | ||
| 6581 | */ | ||
| 6582 | do { | ||
| 6583 | add_preempt_count(PREEMPT_ACTIVE); | ||
| 6584 | schedule(); | ||
| 6585 | sub_preempt_count(PREEMPT_ACTIVE); | ||
| 6586 | } while (need_resched()); | ||
| 6587 | } | 6724 | } |
| 6588 | 6725 | ||
| 6589 | int __sched _cond_resched(void) | 6726 | int __sched _cond_resched(void) |
| @@ -6597,18 +6734,20 @@ int __sched _cond_resched(void) | |||
| 6597 | EXPORT_SYMBOL(_cond_resched); | 6734 | EXPORT_SYMBOL(_cond_resched); |
| 6598 | 6735 | ||
| 6599 | /* | 6736 | /* |
| 6600 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, | 6737 | * __cond_resched_lock() - if a reschedule is pending, drop the given lock, |
| 6601 | * call schedule, and on return reacquire the lock. | 6738 | * call schedule, and on return reacquire the lock. |
| 6602 | * | 6739 | * |
| 6603 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level | 6740 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level |
| 6604 | * operations here to prevent schedule() from being called twice (once via | 6741 | * operations here to prevent schedule() from being called twice (once via |
| 6605 | * spin_unlock(), once by hand). | 6742 | * spin_unlock(), once by hand). |
| 6606 | */ | 6743 | */ |
| 6607 | int cond_resched_lock(spinlock_t *lock) | 6744 | int __cond_resched_lock(spinlock_t *lock) |
| 6608 | { | 6745 | { |
| 6609 | int resched = should_resched(); | 6746 | int resched = should_resched(); |
| 6610 | int ret = 0; | 6747 | int ret = 0; |
| 6611 | 6748 | ||
| 6749 | lockdep_assert_held(lock); | ||
| 6750 | |||
| 6612 | if (spin_needbreak(lock) || resched) { | 6751 | if (spin_needbreak(lock) || resched) { |
| 6613 | spin_unlock(lock); | 6752 | spin_unlock(lock); |
| 6614 | if (resched) | 6753 | if (resched) |
| @@ -6620,9 +6759,9 @@ int cond_resched_lock(spinlock_t *lock) | |||
| 6620 | } | 6759 | } |
| 6621 | return ret; | 6760 | return ret; |
| 6622 | } | 6761 | } |
| 6623 | EXPORT_SYMBOL(cond_resched_lock); | 6762 | EXPORT_SYMBOL(__cond_resched_lock); |
| 6624 | 6763 | ||
| 6625 | int __sched cond_resched_softirq(void) | 6764 | int __sched __cond_resched_softirq(void) |
| 6626 | { | 6765 | { |
| 6627 | BUG_ON(!in_softirq()); | 6766 | BUG_ON(!in_softirq()); |
| 6628 | 6767 | ||
| @@ -6634,7 +6773,7 @@ int __sched cond_resched_softirq(void) | |||
| 6634 | } | 6773 | } |
| 6635 | return 0; | 6774 | return 0; |
| 6636 | } | 6775 | } |
| 6637 | EXPORT_SYMBOL(cond_resched_softirq); | 6776 | EXPORT_SYMBOL(__cond_resched_softirq); |
| 6638 | 6777 | ||
| 6639 | /** | 6778 | /** |
| 6640 | * yield - yield the current processor to other threads. | 6779 | * yield - yield the current processor to other threads. |
| @@ -6652,17 +6791,16 @@ EXPORT_SYMBOL(yield); | |||
| 6652 | /* | 6791 | /* |
| 6653 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so | 6792 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so |
| 6654 | * that process accounting knows that this is a task in IO wait state. | 6793 | * that process accounting knows that this is a task in IO wait state. |
| 6655 | * | ||
| 6656 | * But don't do that if it is a deliberate, throttling IO wait (this task | ||
| 6657 | * has set its backing_dev_info: the queue against which it should throttle) | ||
| 6658 | */ | 6794 | */ |
| 6659 | void __sched io_schedule(void) | 6795 | void __sched io_schedule(void) |
| 6660 | { | 6796 | { |
| 6661 | struct rq *rq = &__raw_get_cpu_var(runqueues); | 6797 | struct rq *rq = raw_rq(); |
| 6662 | 6798 | ||
| 6663 | delayacct_blkio_start(); | 6799 | delayacct_blkio_start(); |
| 6664 | atomic_inc(&rq->nr_iowait); | 6800 | atomic_inc(&rq->nr_iowait); |
| 6801 | current->in_iowait = 1; | ||
| 6665 | schedule(); | 6802 | schedule(); |
| 6803 | current->in_iowait = 0; | ||
| 6666 | atomic_dec(&rq->nr_iowait); | 6804 | atomic_dec(&rq->nr_iowait); |
| 6667 | delayacct_blkio_end(); | 6805 | delayacct_blkio_end(); |
| 6668 | } | 6806 | } |
| @@ -6670,12 +6808,14 @@ EXPORT_SYMBOL(io_schedule); | |||
| 6670 | 6808 | ||
| 6671 | long __sched io_schedule_timeout(long timeout) | 6809 | long __sched io_schedule_timeout(long timeout) |
| 6672 | { | 6810 | { |
| 6673 | struct rq *rq = &__raw_get_cpu_var(runqueues); | 6811 | struct rq *rq = raw_rq(); |
| 6674 | long ret; | 6812 | long ret; |
| 6675 | 6813 | ||
| 6676 | delayacct_blkio_start(); | 6814 | delayacct_blkio_start(); |
| 6677 | atomic_inc(&rq->nr_iowait); | 6815 | atomic_inc(&rq->nr_iowait); |
| 6816 | current->in_iowait = 1; | ||
| 6678 | ret = schedule_timeout(timeout); | 6817 | ret = schedule_timeout(timeout); |
| 6818 | current->in_iowait = 0; | ||
| 6679 | atomic_dec(&rq->nr_iowait); | 6819 | atomic_dec(&rq->nr_iowait); |
| 6680 | delayacct_blkio_end(); | 6820 | delayacct_blkio_end(); |
| 6681 | return ret; | 6821 | return ret; |
| @@ -6759,23 +6899,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, | |||
| 6759 | if (retval) | 6899 | if (retval) |
| 6760 | goto out_unlock; | 6900 | goto out_unlock; |
| 6761 | 6901 | ||
| 6762 | /* | 6902 | time_slice = p->sched_class->get_rr_interval(p); |
| 6763 | * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER | ||
| 6764 | * tasks that are on an otherwise idle runqueue: | ||
| 6765 | */ | ||
| 6766 | time_slice = 0; | ||
| 6767 | if (p->policy == SCHED_RR) { | ||
| 6768 | time_slice = DEF_TIMESLICE; | ||
| 6769 | } else if (p->policy != SCHED_FIFO) { | ||
| 6770 | struct sched_entity *se = &p->se; | ||
| 6771 | unsigned long flags; | ||
| 6772 | struct rq *rq; | ||
| 6773 | 6903 | ||
| 6774 | rq = task_rq_lock(p, &flags); | ||
| 6775 | if (rq->cfs.load.weight) | ||
| 6776 | time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); | ||
| 6777 | task_rq_unlock(rq, &flags); | ||
| 6778 | } | ||
| 6779 | read_unlock(&tasklist_lock); | 6904 | read_unlock(&tasklist_lock); |
| 6780 | jiffies_to_timespec(time_slice, &t); | 6905 | jiffies_to_timespec(time_slice, &t); |
| 6781 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | 6906 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; |
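sys_sched_rr_get_interval() now simply asks the scheduling class for the slice via get_rr_interval() instead of open-coding the SCHED_RR and CFS cases; the per-class behaviour (the RR quantum, zero for SCHED_FIFO, a CFS-derived slice for fair tasks) lives in the class implementations, not in this file. A quick probe from user space:

    #include <sched.h>
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
        struct timespec ts;

        if (sched_rr_get_interval(0, &ts) == -1) {  /* 0 == the calling thread */
            perror("sched_rr_get_interval");
            return 1;
        }
        printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, (long)ts.tv_nsec);
        return 0;
    }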
| @@ -6848,7 +6973,7 @@ void show_state_filter(unsigned long state_filter) | |||
| 6848 | /* | 6973 | /* |
| 6849 | * Only show locks if all tasks are dumped: | 6974 | * Only show locks if all tasks are dumped: |
| 6850 | */ | 6975 | */ |
| 6851 | if (state_filter == -1) | 6976 | if (!state_filter) |
| 6852 | debug_show_all_locks(); | 6977 | debug_show_all_locks(); |
| 6853 | } | 6978 | } |
| 6854 | 6979 | ||
| @@ -6992,8 +7117,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
| 6992 | 7117 | ||
| 6993 | if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { | 7118 | if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { |
| 6994 | /* Need help from migration thread: drop lock and wait. */ | 7119 | /* Need help from migration thread: drop lock and wait. */ |
| 7120 | struct task_struct *mt = rq->migration_thread; | ||
| 7121 | |||
| 7122 | get_task_struct(mt); | ||
| 6995 | task_rq_unlock(rq, &flags); | 7123 | task_rq_unlock(rq, &flags); |
| 6996 | wake_up_process(rq->migration_thread); | 7124 | wake_up_process(rq->migration_thread); |
| 7125 | put_task_struct(mt); | ||
| 6997 | wait_for_completion(&req.done); | 7126 | wait_for_completion(&req.done); |
| 6998 | tlb_migrate_finish(p->mm); | 7127 | tlb_migrate_finish(p->mm); |
| 6999 | return 0; | 7128 | return 0; |
| @@ -7051,6 +7180,11 @@ fail: | |||
| 7051 | return ret; | 7180 | return ret; |
| 7052 | } | 7181 | } |
| 7053 | 7182 | ||
| 7183 | #define RCU_MIGRATION_IDLE 0 | ||
| 7184 | #define RCU_MIGRATION_NEED_QS 1 | ||
| 7185 | #define RCU_MIGRATION_GOT_QS 2 | ||
| 7186 | #define RCU_MIGRATION_MUST_SYNC 3 | ||
| 7187 | |||
| 7054 | /* | 7188 | /* |
| 7055 | * migration_thread - this is a highprio system thread that performs | 7189 | * migration_thread - this is a highprio system thread that performs |
| 7056 | * thread migration by bumping thread off CPU then 'pushing' onto | 7190 | * thread migration by bumping thread off CPU then 'pushing' onto |
| @@ -7058,6 +7192,7 @@ fail: | |||
| 7058 | */ | 7192 | */ |
| 7059 | static int migration_thread(void *data) | 7193 | static int migration_thread(void *data) |
| 7060 | { | 7194 | { |
| 7195 | int badcpu; | ||
| 7061 | int cpu = (long)data; | 7196 | int cpu = (long)data; |
| 7062 | struct rq *rq; | 7197 | struct rq *rq; |
| 7063 | 7198 | ||
| @@ -7092,8 +7227,17 @@ static int migration_thread(void *data) | |||
| 7092 | req = list_entry(head->next, struct migration_req, list); | 7227 | req = list_entry(head->next, struct migration_req, list); |
| 7093 | list_del_init(head->next); | 7228 | list_del_init(head->next); |
| 7094 | 7229 | ||
| 7095 | spin_unlock(&rq->lock); | 7230 | if (req->task != NULL) { |
| 7096 | __migrate_task(req->task, cpu, req->dest_cpu); | 7231 | spin_unlock(&rq->lock); |
| 7232 | __migrate_task(req->task, cpu, req->dest_cpu); | ||
| 7233 | } else if (likely(cpu == (badcpu = smp_processor_id()))) { | ||
| 7234 | req->dest_cpu = RCU_MIGRATION_GOT_QS; | ||
| 7235 | spin_unlock(&rq->lock); | ||
| 7236 | } else { | ||
| 7237 | req->dest_cpu = RCU_MIGRATION_MUST_SYNC; | ||
| 7238 | spin_unlock(&rq->lock); | ||
| 7239 | WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); | ||
| 7240 | } | ||
| 7097 | local_irq_enable(); | 7241 | local_irq_enable(); |
| 7098 | 7242 | ||
| 7099 | complete(&req->done); | 7243 | complete(&req->done); |
| @@ -7300,17 +7444,16 @@ static struct ctl_table sd_ctl_dir[] = { | |||
| 7300 | .procname = "sched_domain", | 7444 | .procname = "sched_domain", |
| 7301 | .mode = 0555, | 7445 | .mode = 0555, |
| 7302 | }, | 7446 | }, |
| 7303 | {0, }, | 7447 | {} |
| 7304 | }; | 7448 | }; |
| 7305 | 7449 | ||
| 7306 | static struct ctl_table sd_ctl_root[] = { | 7450 | static struct ctl_table sd_ctl_root[] = { |
| 7307 | { | 7451 | { |
| 7308 | .ctl_name = CTL_KERN, | ||
| 7309 | .procname = "kernel", | 7452 | .procname = "kernel", |
| 7310 | .mode = 0555, | 7453 | .mode = 0555, |
| 7311 | .child = sd_ctl_dir, | 7454 | .child = sd_ctl_dir, |
| 7312 | }, | 7455 | }, |
| 7313 | {0, }, | 7456 | {} |
| 7314 | }; | 7457 | }; |
| 7315 | 7458 | ||
| 7316 | static struct ctl_table *sd_alloc_ctl_entry(int n) | 7459 | static struct ctl_table *sd_alloc_ctl_entry(int n) |
| @@ -7607,7 +7750,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 7607 | /* | 7750 | /* |
| 7608 | * Register at high priority so that task migration (migrate_all_tasks) | 7751 | * Register at high priority so that task migration (migrate_all_tasks) |
| 7609 | * happens before everything else. This has to be lower priority than | 7752 | * happens before everything else. This has to be lower priority than |
| 7610 | * the notifier in the perf_counter subsystem, though. | 7753 | * the notifier in the perf_event subsystem, though. |
| 7611 | */ | 7754 | */ |
| 7612 | static struct notifier_block __cpuinitdata migration_notifier = { | 7755 | static struct notifier_block __cpuinitdata migration_notifier = { |
| 7613 | .notifier_call = migration_call, | 7756 | .notifier_call = migration_call, |
| @@ -7625,7 +7768,7 @@ static int __init migration_init(void) | |||
| 7625 | migration_call(&migration_notifier, CPU_ONLINE, cpu); | 7768 | migration_call(&migration_notifier, CPU_ONLINE, cpu); |
| 7626 | register_cpu_notifier(&migration_notifier); | 7769 | register_cpu_notifier(&migration_notifier); |
| 7627 | 7770 | ||
| 7628 | return err; | 7771 | return 0; |
| 7629 | } | 7772 | } |
| 7630 | early_initcall(migration_init); | 7773 | early_initcall(migration_init); |
| 7631 | #endif | 7774 | #endif |
| @@ -7634,6 +7777,16 @@ early_initcall(migration_init); | |||
| 7634 | 7777 | ||
| 7635 | #ifdef CONFIG_SCHED_DEBUG | 7778 | #ifdef CONFIG_SCHED_DEBUG |
| 7636 | 7779 | ||
| 7780 | static __read_mostly int sched_domain_debug_enabled; | ||
| 7781 | |||
| 7782 | static int __init sched_domain_debug_setup(char *str) | ||
| 7783 | { | ||
| 7784 | sched_domain_debug_enabled = 1; | ||
| 7785 | |||
| 7786 | return 0; | ||
| 7787 | } | ||
| 7788 | early_param("sched_debug", sched_domain_debug_setup); | ||
| 7789 | |||
| 7637 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | 7790 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, |
| 7638 | struct cpumask *groupmask) | 7791 | struct cpumask *groupmask) |
| 7639 | { | 7792 | { |
| @@ -7672,7 +7825,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
| 7672 | break; | 7825 | break; |
| 7673 | } | 7826 | } |
| 7674 | 7827 | ||
| 7675 | if (!group->__cpu_power) { | 7828 | if (!group->cpu_power) { |
| 7676 | printk(KERN_CONT "\n"); | 7829 | printk(KERN_CONT "\n"); |
| 7677 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 7830 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
| 7678 | "set\n"); | 7831 | "set\n"); |
| @@ -7696,9 +7849,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
| 7696 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); | 7849 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); |
| 7697 | 7850 | ||
| 7698 | printk(KERN_CONT " %s", str); | 7851 | printk(KERN_CONT " %s", str); |
| 7699 | if (group->__cpu_power != SCHED_LOAD_SCALE) { | 7852 | if (group->cpu_power != SCHED_LOAD_SCALE) { |
| 7700 | printk(KERN_CONT " (__cpu_power = %d)", | 7853 | printk(KERN_CONT " (cpu_power = %d)", |
| 7701 | group->__cpu_power); | 7854 | group->cpu_power); |
| 7702 | } | 7855 | } |
| 7703 | 7856 | ||
| 7704 | group = group->next; | 7857 | group = group->next; |
| @@ -7720,6 +7873,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
| 7720 | cpumask_var_t groupmask; | 7873 | cpumask_var_t groupmask; |
| 7721 | int level = 0; | 7874 | int level = 0; |
| 7722 | 7875 | ||
| 7876 | if (!sched_domain_debug_enabled) | ||
| 7877 | return; | ||
| 7878 | |||
| 7723 | if (!sd) { | 7879 | if (!sd) { |
| 7724 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); | 7880 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); |
| 7725 | return; | 7881 | return; |
| @@ -7763,9 +7919,7 @@ static int sd_degenerate(struct sched_domain *sd) | |||
| 7763 | } | 7919 | } |
| 7764 | 7920 | ||
| 7765 | /* Following flags don't use groups */ | 7921 | /* Following flags don't use groups */ |
| 7766 | if (sd->flags & (SD_WAKE_IDLE | | 7922 | if (sd->flags & (SD_WAKE_AFFINE)) |
| 7767 | SD_WAKE_AFFINE | | ||
| 7768 | SD_WAKE_BALANCE)) | ||
| 7769 | return 0; | 7923 | return 0; |
| 7770 | 7924 | ||
| 7771 | return 1; | 7925 | return 1; |
| @@ -7782,10 +7936,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
| 7782 | if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) | 7936 | if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) |
| 7783 | return 0; | 7937 | return 0; |
| 7784 | 7938 | ||
| 7785 | /* Does parent contain flags not in child? */ | ||
| 7786 | /* WAKE_BALANCE is a subset of WAKE_AFFINE */ | ||
| 7787 | if (cflags & SD_WAKE_AFFINE) | ||
| 7788 | pflags &= ~SD_WAKE_BALANCE; | ||
| 7789 | /* Flags needing groups don't count if only 1 group in parent */ | 7939 | /* Flags needing groups don't count if only 1 group in parent */ |
| 7790 | if (parent->groups == parent->groups->next) { | 7940 | if (parent->groups == parent->groups->next) { |
| 7791 | pflags &= ~(SD_LOAD_BALANCE | | 7941 | pflags &= ~(SD_LOAD_BALANCE | |
| @@ -7805,6 +7955,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
| 7805 | 7955 | ||
| 7806 | static void free_rootdomain(struct root_domain *rd) | 7956 | static void free_rootdomain(struct root_domain *rd) |
| 7807 | { | 7957 | { |
| 7958 | synchronize_sched(); | ||
| 7959 | |||
| 7808 | cpupri_cleanup(&rd->cpupri); | 7960 | cpupri_cleanup(&rd->cpupri); |
| 7809 | 7961 | ||
| 7810 | free_cpumask_var(rd->rto_mask); | 7962 | free_cpumask_var(rd->rto_mask); |
| @@ -7841,7 +7993,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
| 7841 | rq->rd = rd; | 7993 | rq->rd = rd; |
| 7842 | 7994 | ||
| 7843 | cpumask_set_cpu(rq->cpu, rd->span); | 7995 | cpumask_set_cpu(rq->cpu, rd->span); |
| 7844 | if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) | 7996 | if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) |
| 7845 | set_rq_online(rq); | 7997 | set_rq_online(rq); |
| 7846 | 7998 | ||
| 7847 | spin_unlock_irqrestore(&rq->lock, flags); | 7999 | spin_unlock_irqrestore(&rq->lock, flags); |
| @@ -7945,6 +8097,7 @@ static cpumask_var_t cpu_isolated_map; | |||
| 7945 | /* Setup the mask of cpus configured for isolated domains */ | 8097 | /* Setup the mask of cpus configured for isolated domains */ |
| 7946 | static int __init isolated_cpu_setup(char *str) | 8098 | static int __init isolated_cpu_setup(char *str) |
| 7947 | { | 8099 | { |
| 8100 | alloc_bootmem_cpumask_var(&cpu_isolated_map); | ||
| 7948 | cpulist_parse(str, cpu_isolated_map); | 8101 | cpulist_parse(str, cpu_isolated_map); |
| 7949 | return 1; | 8102 | return 1; |
| 7950 | } | 8103 | } |
| @@ -7983,7 +8136,7 @@ init_sched_build_groups(const struct cpumask *span, | |||
| 7983 | continue; | 8136 | continue; |
| 7984 | 8137 | ||
| 7985 | cpumask_clear(sched_group_cpus(sg)); | 8138 | cpumask_clear(sched_group_cpus(sg)); |
| 7986 | sg->__cpu_power = 0; | 8139 | sg->cpu_power = 0; |
| 7987 | 8140 | ||
| 7988 | for_each_cpu(j, span) { | 8141 | for_each_cpu(j, span) { |
| 7989 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) | 8142 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) |
| @@ -8091,6 +8244,39 @@ struct static_sched_domain { | |||
| 8091 | DECLARE_BITMAP(span, CONFIG_NR_CPUS); | 8244 | DECLARE_BITMAP(span, CONFIG_NR_CPUS); |
| 8092 | }; | 8245 | }; |
| 8093 | 8246 | ||
| 8247 | struct s_data { | ||
| 8248 | #ifdef CONFIG_NUMA | ||
| 8249 | int sd_allnodes; | ||
| 8250 | cpumask_var_t domainspan; | ||
| 8251 | cpumask_var_t covered; | ||
| 8252 | cpumask_var_t notcovered; | ||
| 8253 | #endif | ||
| 8254 | cpumask_var_t nodemask; | ||
| 8255 | cpumask_var_t this_sibling_map; | ||
| 8256 | cpumask_var_t this_core_map; | ||
| 8257 | cpumask_var_t send_covered; | ||
| 8258 | cpumask_var_t tmpmask; | ||
| 8259 | struct sched_group **sched_group_nodes; | ||
| 8260 | struct root_domain *rd; | ||
| 8261 | }; | ||
| 8262 | |||
| 8263 | enum s_alloc { | ||
| 8264 | sa_sched_groups = 0, | ||
| 8265 | sa_rootdomain, | ||
| 8266 | sa_tmpmask, | ||
| 8267 | sa_send_covered, | ||
| 8268 | sa_this_core_map, | ||
| 8269 | sa_this_sibling_map, | ||
| 8270 | sa_nodemask, | ||
| 8271 | sa_sched_group_nodes, | ||
| 8272 | #ifdef CONFIG_NUMA | ||
| 8273 | sa_notcovered, | ||
| 8274 | sa_covered, | ||
| 8275 | sa_domainspan, | ||
| 8276 | #endif | ||
| 8277 | sa_none, | ||
| 8278 | }; | ||
| 8279 | |||
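The s_data bundle and the ordered enum s_alloc exist so that domain construction can record exactly how far its allocations got; the matching teardown (__free_domain_allocs(), whose signature appears at the end of this section) can then start at the recorded stage and release everything below it. A generic sketch of that pattern, not the kernel function itself:

    #include <stdlib.h>

    /* Stages listed from "everything done" down to "nothing allocated". */
    enum stage { st_all, st_buf_b, st_buf_a, st_none };

    static void unwind(enum stage how_far, void *a, void *b)
    {
        switch (how_far) {
        case st_all:          /* fall through: undo each completed stage */
        case st_buf_b:
            free(b);          /* fall through */
        case st_buf_a:
            free(a);
        case st_none:
            break;
        }
    }

    int main(void)
    {
        void *a = malloc(32);
        void *b = malloc(32);

        if (!b) {                    /* second stage failed: free only a */
            unwind(st_buf_a, a, NULL);
            return 1;
        }
        unwind(st_all, a, b);        /* normal teardown */
        return 0;
    }

The enum in the patch is ordered the same way, sa_sched_groups down to sa_none, which is what lets the fall-through cases line up with the allocation order.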
| 8094 | /* | 8280 | /* |
| 8095 | * SMT sched-domains: | 8281 | * SMT sched-domains: |
| 8096 | */ | 8282 | */ |
| @@ -8208,11 +8394,76 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) | |||
| 8208 | continue; | 8394 | continue; |
| 8209 | } | 8395 | } |
| 8210 | 8396 | ||
| 8211 | sg_inc_cpu_power(sg, sd->groups->__cpu_power); | 8397 | sg->cpu_power += sd->groups->cpu_power; |
| 8212 | } | 8398 | } |
| 8213 | sg = sg->next; | 8399 | sg = sg->next; |
| 8214 | } while (sg != group_head); | 8400 | } while (sg != group_head); |
| 8215 | } | 8401 | } |
| 8402 | |||
| 8403 | static int build_numa_sched_groups(struct s_data *d, | ||
| 8404 | const struct cpumask *cpu_map, int num) | ||
| 8405 | { | ||
| 8406 | struct sched_domain *sd; | ||
| 8407 | struct sched_group *sg, *prev; | ||
| 8408 | int n, j; | ||
| 8409 | |||
| 8410 | cpumask_clear(d->covered); | ||
| 8411 | cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); | ||
| 8412 | if (cpumask_empty(d->nodemask)) { | ||
| 8413 | d->sched_group_nodes[num] = NULL; | ||
| 8414 | goto out; | ||
| 8415 | } | ||
| 8416 | |||
| 8417 | sched_domain_node_span(num, d->domainspan); | ||
| 8418 | cpumask_and(d->domainspan, d->domainspan, cpu_map); | ||
| 8419 | |||
| 8420 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
| 8421 | GFP_KERNEL, num); | ||
| 8422 | if (!sg) { | ||
| 8423 | printk(KERN_WARNING "Can not alloc domain group for node %d\n", | ||
| 8424 | num); | ||
| 8425 | return -ENOMEM; | ||
| 8426 | } | ||
| 8427 | d->sched_group_nodes[num] = sg; | ||
| 8428 | |||
| 8429 | for_each_cpu(j, d->nodemask) { | ||
| 8430 | sd = &per_cpu(node_domains, j).sd; | ||
| 8431 | sd->groups = sg; | ||
| 8432 | } | ||
| 8433 | |||
| 8434 | sg->cpu_power = 0; | ||
| 8435 | cpumask_copy(sched_group_cpus(sg), d->nodemask); | ||
| 8436 | sg->next = sg; | ||
| 8437 | cpumask_or(d->covered, d->covered, d->nodemask); | ||
| 8438 | |||
| 8439 | prev = sg; | ||
| 8440 | for (j = 0; j < nr_node_ids; j++) { | ||
| 8441 | n = (num + j) % nr_node_ids; | ||
| 8442 | cpumask_complement(d->notcovered, d->covered); | ||
| 8443 | cpumask_and(d->tmpmask, d->notcovered, cpu_map); | ||
| 8444 | cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); | ||
| 8445 | if (cpumask_empty(d->tmpmask)) | ||
| 8446 | break; | ||
| 8447 | cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); | ||
| 8448 | if (cpumask_empty(d->tmpmask)) | ||
| 8449 | continue; | ||
| 8450 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
| 8451 | GFP_KERNEL, num); | ||
| 8452 | if (!sg) { | ||
| 8453 | printk(KERN_WARNING | ||
| 8454 | "Can not alloc domain group for node %d\n", j); | ||
| 8455 | return -ENOMEM; | ||
| 8456 | } | ||
| 8457 | sg->cpu_power = 0; | ||
| 8458 | cpumask_copy(sched_group_cpus(sg), d->tmpmask); | ||
| 8459 | sg->next = prev->next; | ||
| 8460 | cpumask_or(d->covered, d->covered, d->tmpmask); | ||
| 8461 | prev->next = sg; | ||
| 8462 | prev = sg; | ||
| 8463 | } | ||
| 8464 | out: | ||
| 8465 | return 0; | ||
| 8466 | } | ||
| 8216 | #endif /* CONFIG_NUMA */ | 8467 | #endif /* CONFIG_NUMA */ |
| 8217 | 8468 | ||
| 8218 | #ifdef CONFIG_NUMA | 8469 | #ifdef CONFIG_NUMA |
| @@ -8266,15 +8517,13 @@ static void free_sched_groups(const struct cpumask *cpu_map, | |||
| 8266 | * there are asymmetries in the topology. If there are asymmetries, group | 8517 | * there are asymmetries in the topology. If there are asymmetries, group |
| 8267 | * having more cpu_power will pickup more load compared to the group having | 8518 | * having more cpu_power will pickup more load compared to the group having |
| 8268 | * less cpu_power. | 8519 | * less cpu_power. |
| 8269 | * | ||
| 8270 | * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents | ||
| 8271 | * the maximum number of tasks a group can handle in the presence of other idle | ||
| 8272 | * or lightly loaded groups in the same sched domain. | ||
| 8273 | */ | 8520 | */ |
| 8274 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) | 8521 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) |
| 8275 | { | 8522 | { |
| 8276 | struct sched_domain *child; | 8523 | struct sched_domain *child; |
| 8277 | struct sched_group *group; | 8524 | struct sched_group *group; |
| 8525 | long power; | ||
| 8526 | int weight; | ||
| 8278 | 8527 | ||
| 8279 | WARN_ON(!sd || !sd->groups); | 8528 | WARN_ON(!sd || !sd->groups); |
| 8280 | 8529 | ||
| @@ -8283,28 +8532,32 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
| 8283 | 8532 | ||
| 8284 | child = sd->child; | 8533 | child = sd->child; |
| 8285 | 8534 | ||
| 8286 | sd->groups->__cpu_power = 0; | 8535 | sd->groups->cpu_power = 0; |
| 8287 | 8536 | ||
| 8288 | /* | 8537 | if (!child) { |
| 8289 | * For perf policy, if the groups in child domain share resources | 8538 | power = SCHED_LOAD_SCALE; |
| 8290 | * (for example cores sharing some portions of the cache hierarchy | 8539 | weight = cpumask_weight(sched_domain_span(sd)); |
| 8291 | * or SMT), then set this domain groups cpu_power such that each group | 8540 | /* |
| 8292 | * can handle only one task, when there are other idle groups in the | 8541 | * SMT siblings share the power of a single core. |
| 8293 | * same sched domain. | 8542 | * Usually multiple threads get a better yield out of |
| 8294 | */ | 8543 | * that one core than a single thread would have, |
| 8295 | if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && | 8544 | * reflect that in sd->smt_gain. |
| 8296 | (child->flags & | 8545 | */ |
| 8297 | (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { | 8546 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { |
| 8298 | sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); | 8547 | power *= sd->smt_gain; |
| 8548 | power /= weight; | ||
| 8549 | power >>= SCHED_LOAD_SHIFT; | ||
| 8550 | } | ||
| 8551 | sd->groups->cpu_power += power; | ||
| 8299 | return; | 8552 | return; |
| 8300 | } | 8553 | } |
| 8301 | 8554 | ||
| 8302 | /* | 8555 | /* |
| 8303 | * add cpu_power of each child group to this group's cpu_power | 8556 | * Add cpu_power of each child group to this group's cpu_power. |
| 8304 | */ | 8557 | */ |
| 8305 | group = child->groups; | 8558 | group = child->groups; |
| 8306 | do { | 8559 | do { |
| 8307 | sg_inc_cpu_power(sd->groups, group->__cpu_power); | 8560 | sd->groups->cpu_power += group->cpu_power; |
| 8308 | group = group->next; | 8561 | group = group->next; |
| 8309 | } while (group != child->groups); | 8562 | } while (group != child->groups); |
| 8310 | } | 8563 | } |
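The rewritten init_sched_groups_power() above replaces the old power-savings heuristic at the lowest level with explicit arithmetic: a CPU that shares nothing contributes SCHED_LOAD_SCALE, while SMT siblings split a single core's power scaled by sd->smt_gain. Below is a minimal standalone sketch of that arithmetic; SCHED_LOAD_SHIFT = 10 and an smt_gain of 1178 (roughly one core plus 15%) are the conventional values from kernels of this era and are used here only as illustrative assumptions.

```c
#include <stdio.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)

/* Power one CPU contributes to its bottom-level group. */
static long cpu_power_contribution(int smt_siblings, long smt_gain)
{
        long power = SCHED_LOAD_SCALE;

        if (smt_siblings > 1) {         /* the SD_SHARE_CPUPOWER case */
                power *= smt_gain;      /* the whole core is worth smt_gain */
                power /= smt_siblings;  /* split it between the siblings */
                power >>= SCHED_LOAD_SHIFT;
        }
        return power;
}

int main(void)
{
        long gain = 1178;               /* assumed "core + ~15%" SMT gain */

        printf("non-SMT cpu:      %ld\n", cpu_power_contribution(1, gain));     /* 1024 */
        printf("2-way SMT thread: %ld\n", cpu_power_contribution(2, gain));     /*  589 */
        printf("2-way SMT core:   %ld\n", 2 * cpu_power_contribution(2, gain)); /* 1178 */
        return 0;
}
```

The right shift by SCHED_LOAD_SHIFT is there because smt_gain is itself expressed in SCHED_LOAD_SCALE units, so two hyperthreads together advertise roughly 1178 rather than two full cores' worth of 2048.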
| @@ -8371,287 +8624,292 @@ static void set_domain_attribute(struct sched_domain *sd, | |||
| 8371 | request = attr->relax_domain_level; | 8624 | request = attr->relax_domain_level; |
| 8372 | if (request < sd->level) { | 8625 | if (request < sd->level) { |
| 8373 | /* turn off idle balance on this domain */ | 8626 | /* turn off idle balance on this domain */ |
| 8374 | sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); | 8627 | sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); |
| 8375 | } else { | 8628 | } else { |
| 8376 | /* turn on idle balance on this domain */ | 8629 | /* turn on idle balance on this domain */ |
| 8377 | sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); | 8630 | sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); |
| 8631 | } | ||
| 8632 | } | ||
| 8633 | |||
| 8634 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | ||
| 8635 | const struct cpumask *cpu_map) | ||
| 8636 | { | ||
| 8637 | switch (what) { | ||
| 8638 | case sa_sched_groups: | ||
| 8639 | free_sched_groups(cpu_map, d->tmpmask); /* fall through */ | ||
| 8640 | d->sched_group_nodes = NULL; | ||
| 8641 | case sa_rootdomain: | ||
| 8642 | free_rootdomain(d->rd); /* fall through */ | ||
| 8643 | case sa_tmpmask: | ||
| 8644 | free_cpumask_var(d->tmpmask); /* fall through */ | ||
| 8645 | case sa_send_covered: | ||
| 8646 | free_cpumask_var(d->send_covered); /* fall through */ | ||
| 8647 | case sa_this_core_map: | ||
| 8648 | free_cpumask_var(d->this_core_map); /* fall through */ | ||
| 8649 | case sa_this_sibling_map: | ||
| 8650 | free_cpumask_var(d->this_sibling_map); /* fall through */ | ||
| 8651 | case sa_nodemask: | ||
| 8652 | free_cpumask_var(d->nodemask); /* fall through */ | ||
| 8653 | case sa_sched_group_nodes: | ||
| 8654 | #ifdef CONFIG_NUMA | ||
| 8655 | kfree(d->sched_group_nodes); /* fall through */ | ||
| 8656 | case sa_notcovered: | ||
| 8657 | free_cpumask_var(d->notcovered); /* fall through */ | ||
| 8658 | case sa_covered: | ||
| 8659 | free_cpumask_var(d->covered); /* fall through */ | ||
| 8660 | case sa_domainspan: | ||
| 8661 | free_cpumask_var(d->domainspan); /* fall through */ | ||
| 8662 | #endif | ||
| 8663 | case sa_none: | ||
| 8664 | break; | ||
| 8378 | } | 8665 | } |
| 8379 | } | 8666 | } |
| 8380 | 8667 | ||
| 8381 | /* | 8668 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, |
| 8382 | * Build sched domains for a given set of cpus and attach the sched domains | 8669 | const struct cpumask *cpu_map) |
| 8383 | * to the individual cpus | ||
| 8384 | */ | ||
| 8385 | static int __build_sched_domains(const struct cpumask *cpu_map, | ||
| 8386 | struct sched_domain_attr *attr) | ||
| 8387 | { | 8670 | { |
| 8388 | int i, err = -ENOMEM; | ||
| 8389 | struct root_domain *rd; | ||
| 8390 | cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, | ||
| 8391 | tmpmask; | ||
| 8392 | #ifdef CONFIG_NUMA | ||
| 8393 | cpumask_var_t domainspan, covered, notcovered; | ||
| 8394 | struct sched_group **sched_group_nodes = NULL; | ||
| 8395 | int sd_allnodes = 0; | ||
| 8396 | |||
| 8397 | if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) | ||
| 8398 | goto out; | ||
| 8399 | if (!alloc_cpumask_var(&covered, GFP_KERNEL)) | ||
| 8400 | goto free_domainspan; | ||
| 8401 | if (!alloc_cpumask_var(¬covered, GFP_KERNEL)) | ||
| 8402 | goto free_covered; | ||
| 8403 | #endif | ||
| 8404 | |||
| 8405 | if (!alloc_cpumask_var(&nodemask, GFP_KERNEL)) | ||
| 8406 | goto free_notcovered; | ||
| 8407 | if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL)) | ||
| 8408 | goto free_nodemask; | ||
| 8409 | if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL)) | ||
| 8410 | goto free_this_sibling_map; | ||
| 8411 | if (!alloc_cpumask_var(&send_covered, GFP_KERNEL)) | ||
| 8412 | goto free_this_core_map; | ||
| 8413 | if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) | ||
| 8414 | goto free_send_covered; | ||
| 8415 | |||
| 8416 | #ifdef CONFIG_NUMA | 8671 | #ifdef CONFIG_NUMA |
| 8417 | /* | 8672 | if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) |
| 8418 | * Allocate the per-node list of sched groups | 8673 | return sa_none; |
| 8419 | */ | 8674 | if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) |
| 8420 | sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), | 8675 | return sa_domainspan; |
| 8421 | GFP_KERNEL); | 8676 | if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) |
| 8422 | if (!sched_group_nodes) { | 8677 | return sa_covered; |
| 8678 | /* Allocate the per-node list of sched groups */ | ||
| 8679 | d->sched_group_nodes = kcalloc(nr_node_ids, | ||
| 8680 | sizeof(struct sched_group *), GFP_KERNEL); | ||
| 8681 | if (!d->sched_group_nodes) { | ||
| 8423 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 8682 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
| 8424 | goto free_tmpmask; | 8683 | return sa_notcovered; |
| 8425 | } | 8684 | } |
| 8426 | #endif | 8685 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; |
| 8427 | 8686 | #endif | |
| 8428 | rd = alloc_rootdomain(); | 8687 | if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) |
| 8429 | if (!rd) { | 8688 | return sa_sched_group_nodes; |
| 8689 | if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) | ||
| 8690 | return sa_nodemask; | ||
| 8691 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) | ||
| 8692 | return sa_this_sibling_map; | ||
| 8693 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | ||
| 8694 | return sa_this_core_map; | ||
| 8695 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) | ||
| 8696 | return sa_send_covered; | ||
| 8697 | d->rd = alloc_rootdomain(); | ||
| 8698 | if (!d->rd) { | ||
| 8430 | printk(KERN_WARNING "Cannot alloc root domain\n"); | 8699 | printk(KERN_WARNING "Cannot alloc root domain\n"); |
| 8431 | goto free_sched_groups; | 8700 | return sa_tmpmask; |
| 8432 | } | 8701 | } |
| 8702 | return sa_rootdomain; | ||
| 8703 | } | ||
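__visit_domain_allocation_hell() and __free_domain_allocs() above are a matched pair: the allocator returns an enum identifying how far it got, and the teardown switch starts at that stage and deliberately falls through every earlier one. Below is a standalone sketch of the same staged-allocation idiom using plain malloc(); the names (alloc_bufs, free_bufs, st_*) are invented for the example and are not kernel API.

```c
#include <stdio.h>
#include <stdlib.h>

enum stage { st_none, st_a, st_b, st_all };     /* how far allocation got */

struct bufs { void *a, *b, *c; };

/* Teardown mirrors __free_domain_allocs(): start at the reached stage
 * and deliberately fall through everything allocated before it. */
static void free_bufs(struct bufs *d, enum stage reached)
{
        switch (reached) {
        case st_all:
                free(d->c);             /* fall through */
        case st_b:
                free(d->b);             /* fall through */
        case st_a:
                free(d->a);             /* fall through */
        case st_none:
                break;
        }
}

/* Allocator mirrors __visit_domain_allocation_hell(): on failure it
 * returns the last stage that succeeded, so the caller knows what to undo. */
static enum stage alloc_bufs(struct bufs *d)
{
        if (!(d->a = malloc(64)))
                return st_none;
        if (!(d->b = malloc(64)))
                return st_a;
        if (!(d->c = malloc(64)))
                return st_b;
        return st_all;
}

int main(void)
{
        struct bufs d = { 0 };
        enum stage got = alloc_bufs(&d);

        if (got != st_all) {            /* partial failure: unwind what we got */
                free_bufs(&d, got);
                return 1;
        }
        puts("all allocations succeeded");
        free_bufs(&d, st_all);
        return 0;
}
```

__build_sched_domains() uses the pair the same way: on any failure it calls the teardown with whatever stage the allocator reported, and on success it frees only the temporaries (sa_tmpmask) while keeping the per-node group array alive.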
| 8433 | 8704 | ||
| 8705 | static struct sched_domain *__build_numa_sched_domains(struct s_data *d, | ||
| 8706 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) | ||
| 8707 | { | ||
| 8708 | struct sched_domain *sd = NULL; | ||
| 8434 | #ifdef CONFIG_NUMA | 8709 | #ifdef CONFIG_NUMA |
| 8435 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; | 8710 | struct sched_domain *parent; |
| 8436 | #endif | ||
| 8437 | |||
| 8438 | /* | ||
| 8439 | * Set up domains for cpus specified by the cpu_map. | ||
| 8440 | */ | ||
| 8441 | for_each_cpu(i, cpu_map) { | ||
| 8442 | struct sched_domain *sd = NULL, *p; | ||
| 8443 | |||
| 8444 | cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); | ||
| 8445 | |||
| 8446 | #ifdef CONFIG_NUMA | ||
| 8447 | if (cpumask_weight(cpu_map) > | ||
| 8448 | SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { | ||
| 8449 | sd = &per_cpu(allnodes_domains, i).sd; | ||
| 8450 | SD_INIT(sd, ALLNODES); | ||
| 8451 | set_domain_attribute(sd, attr); | ||
| 8452 | cpumask_copy(sched_domain_span(sd), cpu_map); | ||
| 8453 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); | ||
| 8454 | p = sd; | ||
| 8455 | sd_allnodes = 1; | ||
| 8456 | } else | ||
| 8457 | p = NULL; | ||
| 8458 | 8711 | ||
| 8459 | sd = &per_cpu(node_domains, i).sd; | 8712 | d->sd_allnodes = 0; |
| 8460 | SD_INIT(sd, NODE); | 8713 | if (cpumask_weight(cpu_map) > |
| 8714 | SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { | ||
| 8715 | sd = &per_cpu(allnodes_domains, i).sd; | ||
| 8716 | SD_INIT(sd, ALLNODES); | ||
| 8461 | set_domain_attribute(sd, attr); | 8717 | set_domain_attribute(sd, attr); |
| 8462 | sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); | 8718 | cpumask_copy(sched_domain_span(sd), cpu_map); |
| 8463 | sd->parent = p; | 8719 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); |
| 8464 | if (p) | 8720 | d->sd_allnodes = 1; |
| 8465 | p->child = sd; | 8721 | } |
| 8466 | cpumask_and(sched_domain_span(sd), | 8722 | parent = sd; |
| 8467 | sched_domain_span(sd), cpu_map); | 8723 | |
| 8724 | sd = &per_cpu(node_domains, i).sd; | ||
| 8725 | SD_INIT(sd, NODE); | ||
| 8726 | set_domain_attribute(sd, attr); | ||
| 8727 | sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); | ||
| 8728 | sd->parent = parent; | ||
| 8729 | if (parent) | ||
| 8730 | parent->child = sd; | ||
| 8731 | cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); | ||
| 8468 | #endif | 8732 | #endif |
| 8733 | return sd; | ||
| 8734 | } | ||
| 8469 | 8735 | ||
| 8470 | p = sd; | 8736 | static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, |
| 8471 | sd = &per_cpu(phys_domains, i).sd; | 8737 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, |
| 8472 | SD_INIT(sd, CPU); | 8738 | struct sched_domain *parent, int i) |
| 8473 | set_domain_attribute(sd, attr); | 8739 | { |
| 8474 | cpumask_copy(sched_domain_span(sd), nodemask); | 8740 | struct sched_domain *sd; |
| 8475 | sd->parent = p; | 8741 | sd = &per_cpu(phys_domains, i).sd; |
| 8476 | if (p) | 8742 | SD_INIT(sd, CPU); |
| 8477 | p->child = sd; | 8743 | set_domain_attribute(sd, attr); |
| 8478 | cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); | 8744 | cpumask_copy(sched_domain_span(sd), d->nodemask); |
| 8745 | sd->parent = parent; | ||
| 8746 | if (parent) | ||
| 8747 | parent->child = sd; | ||
| 8748 | cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
| 8749 | return sd; | ||
| 8750 | } | ||
| 8479 | 8751 | ||
| 8752 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, | ||
| 8753 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
| 8754 | struct sched_domain *parent, int i) | ||
| 8755 | { | ||
| 8756 | struct sched_domain *sd = parent; | ||
| 8480 | #ifdef CONFIG_SCHED_MC | 8757 | #ifdef CONFIG_SCHED_MC |
| 8481 | p = sd; | 8758 | sd = &per_cpu(core_domains, i).sd; |
| 8482 | sd = &per_cpu(core_domains, i).sd; | 8759 | SD_INIT(sd, MC); |
| 8483 | SD_INIT(sd, MC); | 8760 | set_domain_attribute(sd, attr); |
| 8484 | set_domain_attribute(sd, attr); | 8761 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); |
| 8485 | cpumask_and(sched_domain_span(sd), cpu_map, | 8762 | sd->parent = parent; |
| 8486 | cpu_coregroup_mask(i)); | 8763 | parent->child = sd; |
| 8487 | sd->parent = p; | 8764 | cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); |
| 8488 | p->child = sd; | ||
| 8489 | cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); | ||
| 8490 | #endif | 8765 | #endif |
| 8766 | return sd; | ||
| 8767 | } | ||
| 8491 | 8768 | ||
| 8769 | static struct sched_domain *__build_smt_sched_domain(struct s_data *d, | ||
| 8770 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
| 8771 | struct sched_domain *parent, int i) | ||
| 8772 | { | ||
| 8773 | struct sched_domain *sd = parent; | ||
| 8492 | #ifdef CONFIG_SCHED_SMT | 8774 | #ifdef CONFIG_SCHED_SMT |
| 8493 | p = sd; | 8775 | sd = &per_cpu(cpu_domains, i).sd; |
| 8494 | sd = &per_cpu(cpu_domains, i).sd; | 8776 | SD_INIT(sd, SIBLING); |
| 8495 | SD_INIT(sd, SIBLING); | 8777 | set_domain_attribute(sd, attr); |
| 8496 | set_domain_attribute(sd, attr); | 8778 | cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); |
| 8497 | cpumask_and(sched_domain_span(sd), | 8779 | sd->parent = parent; |
| 8498 | topology_thread_cpumask(i), cpu_map); | 8780 | parent->child = sd; |
| 8499 | sd->parent = p; | 8781 | cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); |
| 8500 | p->child = sd; | ||
| 8501 | cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); | ||
| 8502 | #endif | 8782 | #endif |
| 8503 | } | 8783 | return sd; |
| 8784 | } | ||
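Each of the __build_*_sched_domain() helpers above takes the domain chain built so far as parent, hooks the new, narrower level beneath it (sd->parent = parent; parent->child = sd) and returns the new bottom, which is why __build_sched_domains() can simply thread one sd pointer through the four calls. A toy standalone sketch of that chaining follows; the struct and function names are invented and are not kernel structures.

```c
#include <stdio.h>

struct level {                          /* toy stand-in for struct sched_domain */
        const char *name;
        struct level *parent;           /* wider level above this one */
        struct level *child;            /* narrower level below this one */
};

/* Hook 'lv' underneath 'parent' and return the new bottom of the chain,
 * the way each __build_*_sched_domain() helper returns its sd. */
static struct level *attach_level(struct level *lv, struct level *parent)
{
        lv->parent = parent;
        if (parent)
                parent->child = lv;
        return lv;
}

int main(void)
{
        struct level node = { "NODE" }, phys = { "CPU" },
                     mc = { "MC" }, smt = { "SIBLING" };
        struct level *sd = NULL;

        sd = attach_level(&node, sd);   /* NUMA node level (top)      */
        sd = attach_level(&phys, sd);   /* physical package level     */
        sd = attach_level(&mc,   sd);   /* multi-core level           */
        sd = attach_level(&smt,  sd);   /* SMT sibling level (base)   */

        for (; sd; sd = sd->parent)     /* walk from base back to top */
                printf("%s\n", sd->name);
        return 0;
}
```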
| 8504 | 8785 | ||
| 8786 | static void build_sched_groups(struct s_data *d, enum sched_domain_level l, | ||
| 8787 | const struct cpumask *cpu_map, int cpu) | ||
| 8788 | { | ||
| 8789 | switch (l) { | ||
| 8505 | #ifdef CONFIG_SCHED_SMT | 8790 | #ifdef CONFIG_SCHED_SMT |
| 8506 | /* Set up CPU (sibling) groups */ | 8791 | case SD_LV_SIBLING: /* set up CPU (sibling) groups */ |
| 8507 | for_each_cpu(i, cpu_map) { | 8792 | cpumask_and(d->this_sibling_map, cpu_map, |
| 8508 | cpumask_and(this_sibling_map, | 8793 | topology_thread_cpumask(cpu)); |
| 8509 | topology_thread_cpumask(i), cpu_map); | 8794 | if (cpu == cpumask_first(d->this_sibling_map)) |
| 8510 | if (i != cpumask_first(this_sibling_map)) | 8795 | init_sched_build_groups(d->this_sibling_map, cpu_map, |
| 8511 | continue; | 8796 | &cpu_to_cpu_group, |
| 8512 | 8797 | d->send_covered, d->tmpmask); | |
| 8513 | init_sched_build_groups(this_sibling_map, cpu_map, | 8798 | break; |
| 8514 | &cpu_to_cpu_group, | ||
| 8515 | send_covered, tmpmask); | ||
| 8516 | } | ||
| 8517 | #endif | 8799 | #endif |
| 8518 | |||
| 8519 | #ifdef CONFIG_SCHED_MC | 8800 | #ifdef CONFIG_SCHED_MC |
| 8520 | /* Set up multi-core groups */ | 8801 | case SD_LV_MC: /* set up multi-core groups */ |
| 8521 | for_each_cpu(i, cpu_map) { | 8802 | cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); |
| 8522 | cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); | 8803 | if (cpu == cpumask_first(d->this_core_map)) |
| 8523 | if (i != cpumask_first(this_core_map)) | 8804 | init_sched_build_groups(d->this_core_map, cpu_map, |
| 8524 | continue; | 8805 | &cpu_to_core_group, |
| 8525 | 8806 | d->send_covered, d->tmpmask); | |
| 8526 | init_sched_build_groups(this_core_map, cpu_map, | 8807 | break; |
| 8527 | &cpu_to_core_group, | ||
| 8528 | send_covered, tmpmask); | ||
| 8529 | } | ||
| 8530 | #endif | 8808 | #endif |
| 8531 | 8809 | case SD_LV_CPU: /* set up physical groups */ | |
| 8532 | /* Set up physical groups */ | 8810 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); |
| 8533 | for (i = 0; i < nr_node_ids; i++) { | 8811 | if (!cpumask_empty(d->nodemask)) |
| 8534 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | 8812 | init_sched_build_groups(d->nodemask, cpu_map, |
| 8535 | if (cpumask_empty(nodemask)) | 8813 | &cpu_to_phys_group, |
| 8536 | continue; | 8814 | d->send_covered, d->tmpmask); |
| 8537 | 8815 | break; | |
| 8538 | init_sched_build_groups(nodemask, cpu_map, | ||
| 8539 | &cpu_to_phys_group, | ||
| 8540 | send_covered, tmpmask); | ||
| 8541 | } | ||
| 8542 | |||
| 8543 | #ifdef CONFIG_NUMA | 8816 | #ifdef CONFIG_NUMA |
| 8544 | /* Set up node groups */ | 8817 | case SD_LV_ALLNODES: |
| 8545 | if (sd_allnodes) { | 8818 | init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, |
| 8546 | init_sched_build_groups(cpu_map, cpu_map, | 8819 | d->send_covered, d->tmpmask); |
| 8547 | &cpu_to_allnodes_group, | 8820 | break; |
| 8548 | send_covered, tmpmask); | 8821 | #endif |
| 8822 | default: | ||
| 8823 | break; | ||
| 8549 | } | 8824 | } |
| 8825 | } | ||
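In build_sched_groups() above, the SMT and MC cases run once per CPU but only do work when the CPU is the first one in its sibling or core mask, so every group is initialised exactly once. A standalone illustration of that first-CPU filter using 64-bit masks (the topology values are invented; __builtin_ctzll is used as a GCC/Clang analog of cpumask_first()):

```c
#include <stdint.h>
#include <stdio.h>

/* Index of the lowest set bit; mask must be non-zero. */
static int mask_first(uint64_t mask)
{
        return __builtin_ctzll(mask);
}

int main(void)
{
        /* Four CPUs paired into two SMT cores: {0,1} and {2,3}. */
        const uint64_t sibling_mask_of[4] = { 0x3, 0x3, 0xc, 0xc };
        const uint64_t cpu_map = 0xf;
        int cpu;

        for (cpu = 0; cpu < 4; cpu++) {
                uint64_t this_sibling_map = sibling_mask_of[cpu] & cpu_map;

                if (cpu != mask_first(this_sibling_map))
                        continue;       /* another CPU already built this group */
                printf("cpu %d builds the group for mask %#llx\n",
                       cpu, (unsigned long long)this_sibling_map);
        }
        return 0;
}
```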
| 8550 | 8826 | ||
| 8551 | for (i = 0; i < nr_node_ids; i++) { | 8827 | /* |
| 8552 | /* Set up node groups */ | 8828 | * Build sched domains for a given set of cpus and attach the sched domains |
| 8553 | struct sched_group *sg, *prev; | 8829 | * to the individual cpus |
| 8554 | int j; | 8830 | */ |
| 8555 | 8831 | static int __build_sched_domains(const struct cpumask *cpu_map, | |
| 8556 | cpumask_clear(covered); | 8832 | struct sched_domain_attr *attr) |
| 8557 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | 8833 | { |
| 8558 | if (cpumask_empty(nodemask)) { | 8834 | enum s_alloc alloc_state = sa_none; |
| 8559 | sched_group_nodes[i] = NULL; | 8835 | struct s_data d; |
| 8560 | continue; | 8836 | struct sched_domain *sd; |
| 8561 | } | 8837 | int i; |
| 8838 | #ifdef CONFIG_NUMA | ||
| 8839 | d.sd_allnodes = 0; | ||
| 8840 | #endif | ||
| 8562 | 8841 | ||
| 8563 | sched_domain_node_span(i, domainspan); | 8842 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); |
| 8564 | cpumask_and(domainspan, domainspan, cpu_map); | 8843 | if (alloc_state != sa_rootdomain) |
| 8844 | goto error; | ||
| 8845 | alloc_state = sa_sched_groups; | ||
| 8565 | 8846 | ||
| 8566 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | 8847 | /* |
| 8567 | GFP_KERNEL, i); | 8848 | * Set up domains for cpus specified by the cpu_map. |
| 8568 | if (!sg) { | 8849 | */ |
| 8569 | printk(KERN_WARNING "Can not alloc domain group for " | 8850 | for_each_cpu(i, cpu_map) { |
| 8570 | "node %d\n", i); | 8851 | cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), |
| 8571 | goto error; | 8852 | cpu_map); |
| 8572 | } | ||
| 8573 | sched_group_nodes[i] = sg; | ||
| 8574 | for_each_cpu(j, nodemask) { | ||
| 8575 | struct sched_domain *sd; | ||
| 8576 | 8853 | ||
| 8577 | sd = &per_cpu(node_domains, j).sd; | 8854 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); |
| 8578 | sd->groups = sg; | 8855 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); |
| 8579 | } | 8856 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); |
| 8580 | sg->__cpu_power = 0; | 8857 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); |
| 8581 | cpumask_copy(sched_group_cpus(sg), nodemask); | 8858 | } |
| 8582 | sg->next = sg; | ||
| 8583 | cpumask_or(covered, covered, nodemask); | ||
| 8584 | prev = sg; | ||
| 8585 | 8859 | ||
| 8586 | for (j = 0; j < nr_node_ids; j++) { | 8860 | for_each_cpu(i, cpu_map) { |
| 8587 | int n = (i + j) % nr_node_ids; | 8861 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); |
| 8862 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); | ||
| 8863 | } | ||
| 8588 | 8864 | ||
| 8589 | cpumask_complement(notcovered, covered); | 8865 | /* Set up physical groups */ |
| 8590 | cpumask_and(tmpmask, notcovered, cpu_map); | 8866 | for (i = 0; i < nr_node_ids; i++) |
| 8591 | cpumask_and(tmpmask, tmpmask, domainspan); | 8867 | build_sched_groups(&d, SD_LV_CPU, cpu_map, i); |
| 8592 | if (cpumask_empty(tmpmask)) | ||
| 8593 | break; | ||
| 8594 | 8868 | ||
| 8595 | cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); | 8869 | #ifdef CONFIG_NUMA |
| 8596 | if (cpumask_empty(tmpmask)) | 8870 | /* Set up node groups */ |
| 8597 | continue; | 8871 | if (d.sd_allnodes) |
| 8872 | build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); | ||
| 8598 | 8873 | ||
| 8599 | sg = kmalloc_node(sizeof(struct sched_group) + | 8874 | for (i = 0; i < nr_node_ids; i++) |
| 8600 | cpumask_size(), | 8875 | if (build_numa_sched_groups(&d, cpu_map, i)) |
| 8601 | GFP_KERNEL, i); | 8876 | goto error; |
| 8602 | if (!sg) { | ||
| 8603 | printk(KERN_WARNING | ||
| 8604 | "Can not alloc domain group for node %d\n", j); | ||
| 8605 | goto error; | ||
| 8606 | } | ||
| 8607 | sg->__cpu_power = 0; | ||
| 8608 | cpumask_copy(sched_group_cpus(sg), tmpmask); | ||
| 8609 | sg->next = prev->next; | ||
| 8610 | cpumask_or(covered, covered, tmpmask); | ||
| 8611 | prev->next = sg; | ||
| 8612 | prev = sg; | ||
| 8613 | } | ||
| 8614 | } | ||
| 8615 | #endif | 8877 | #endif |
| 8616 | 8878 | ||
| 8617 | /* Calculate CPU power for physical packages and nodes */ | 8879 | /* Calculate CPU power for physical packages and nodes */ |
| 8618 | #ifdef CONFIG_SCHED_SMT | 8880 | #ifdef CONFIG_SCHED_SMT |
| 8619 | for_each_cpu(i, cpu_map) { | 8881 | for_each_cpu(i, cpu_map) { |
| 8620 | struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; | 8882 | sd = &per_cpu(cpu_domains, i).sd; |
| 8621 | |||
| 8622 | init_sched_groups_power(i, sd); | 8883 | init_sched_groups_power(i, sd); |
| 8623 | } | 8884 | } |
| 8624 | #endif | 8885 | #endif |
| 8625 | #ifdef CONFIG_SCHED_MC | 8886 | #ifdef CONFIG_SCHED_MC |
| 8626 | for_each_cpu(i, cpu_map) { | 8887 | for_each_cpu(i, cpu_map) { |
| 8627 | struct sched_domain *sd = &per_cpu(core_domains, i).sd; | 8888 | sd = &per_cpu(core_domains, i).sd; |
| 8628 | |||
| 8629 | init_sched_groups_power(i, sd); | 8889 | init_sched_groups_power(i, sd); |
| 8630 | } | 8890 | } |
| 8631 | #endif | 8891 | #endif |
| 8632 | 8892 | ||
| 8633 | for_each_cpu(i, cpu_map) { | 8893 | for_each_cpu(i, cpu_map) { |
| 8634 | struct sched_domain *sd = &per_cpu(phys_domains, i).sd; | 8894 | sd = &per_cpu(phys_domains, i).sd; |
| 8635 | |||
| 8636 | init_sched_groups_power(i, sd); | 8895 | init_sched_groups_power(i, sd); |
| 8637 | } | 8896 | } |
| 8638 | 8897 | ||
| 8639 | #ifdef CONFIG_NUMA | 8898 | #ifdef CONFIG_NUMA |
| 8640 | for (i = 0; i < nr_node_ids; i++) | 8899 | for (i = 0; i < nr_node_ids; i++) |
| 8641 | init_numa_sched_groups_power(sched_group_nodes[i]); | 8900 | init_numa_sched_groups_power(d.sched_group_nodes[i]); |
| 8642 | 8901 | ||
| 8643 | if (sd_allnodes) { | 8902 | if (d.sd_allnodes) { |
| 8644 | struct sched_group *sg; | 8903 | struct sched_group *sg; |
| 8645 | 8904 | ||
| 8646 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, | 8905 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, |
| 8647 | tmpmask); | 8906 | d.tmpmask); |
| 8648 | init_numa_sched_groups_power(sg); | 8907 | init_numa_sched_groups_power(sg); |
| 8649 | } | 8908 | } |
| 8650 | #endif | 8909 | #endif |
| 8651 | 8910 | ||
| 8652 | /* Attach the domains */ | 8911 | /* Attach the domains */ |
| 8653 | for_each_cpu(i, cpu_map) { | 8912 | for_each_cpu(i, cpu_map) { |
| 8654 | struct sched_domain *sd; | ||
| 8655 | #ifdef CONFIG_SCHED_SMT | 8913 | #ifdef CONFIG_SCHED_SMT |
| 8656 | sd = &per_cpu(cpu_domains, i).sd; | 8914 | sd = &per_cpu(cpu_domains, i).sd; |
| 8657 | #elif defined(CONFIG_SCHED_MC) | 8915 | #elif defined(CONFIG_SCHED_MC) |
| @@ -8659,44 +8917,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
| 8659 | #else | 8917 | #else |
| 8660 | sd = &per_cpu(phys_domains, i).sd; | 8918 | sd = &per_cpu(phys_domains, i).sd; |
| 8661 | #endif | 8919 | #endif |
| 8662 | cpu_attach_domain(sd, rd, i); | 8920 | cpu_attach_domain(sd, d.rd, i); |
| 8663 | } | 8921 | } |
| 8664 | 8922 | ||
| 8665 | err = 0; | 8923 | d.sched_group_nodes = NULL; /* don't free this we still need it */ |
| 8666 | 8924 | __free_domain_allocs(&d, sa_tmpmask, cpu_map); | |
| 8667 | free_tmpmask: | 8925 | return 0; |
| 8668 | free_cpumask_var(tmpmask); | ||
| 8669 | free_send_covered: | ||
| 8670 | free_cpumask_var(send_covered); | ||
| 8671 | free_this_core_map: | ||
| 8672 | free_cpumask_var(this_core_map); | ||
| 8673 | free_this_sibling_map: | ||
| 8674 | free_cpumask_var(this_sibling_map); | ||
| 8675 | free_nodemask: | ||
| 8676 | free_cpumask_var(nodemask); | ||
| 8677 | free_notcovered: | ||
| 8678 | #ifdef CONFIG_NUMA | ||
| 8679 | free_cpumask_var(notcovered); | ||
| 8680 | free_covered: | ||
| 8681 | free_cpumask_var(covered); | ||
| 8682 | free_domainspan: | ||
| 8683 | free_cpumask_var(domainspan); | ||
| 8684 | out: | ||
| 8685 | #endif | ||
| 8686 | return err; | ||
| 8687 | |||
| 8688 | free_sched_groups: | ||
| 8689 | #ifdef CONFIG_NUMA | ||
| 8690 | kfree(sched_group_nodes); | ||
| 8691 | #endif | ||
| 8692 | goto free_tmpmask; | ||
| 8693 | 8926 | ||
| 8694 | #ifdef CONFIG_NUMA | ||
| 8695 | error: | 8927 | error: |
| 8696 | free_sched_groups(cpu_map, tmpmask); | 8928 | __free_domain_allocs(&d, alloc_state, cpu_map); |
| 8697 | free_rootdomain(rd); | 8929 | return -ENOMEM; |
| 8698 | goto free_tmpmask; | ||
| 8699 | #endif | ||
| 8700 | } | 8930 | } |
| 8701 | 8931 | ||
| 8702 | static int build_sched_domains(const struct cpumask *cpu_map) | 8932 | static int build_sched_domains(const struct cpumask *cpu_map) |
| @@ -8704,7 +8934,7 @@ static int build_sched_domains(const struct cpumask *cpu_map) | |||
| 8704 | return __build_sched_domains(cpu_map, NULL); | 8934 | return __build_sched_domains(cpu_map, NULL); |
| 8705 | } | 8935 | } |
| 8706 | 8936 | ||
| 8707 | static struct cpumask *doms_cur; /* current sched domains */ | 8937 | static cpumask_var_t *doms_cur; /* current sched domains */ |
| 8708 | static int ndoms_cur; /* number of sched domains in 'doms_cur' */ | 8938 | static int ndoms_cur; /* number of sched domains in 'doms_cur' */ |
| 8709 | static struct sched_domain_attr *dattr_cur; | 8939 | static struct sched_domain_attr *dattr_cur; |
| 8710 | /* attributes of custom domains in 'doms_cur' */ | 8940 | /* attributes of custom domains in 'doms_cur' */ |
| @@ -8726,6 +8956,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void) | |||
| 8726 | return 0; | 8956 | return 0; |
| 8727 | } | 8957 | } |
| 8728 | 8958 | ||
| 8959 | cpumask_var_t *alloc_sched_domains(unsigned int ndoms) | ||
| 8960 | { | ||
| 8961 | int i; | ||
| 8962 | cpumask_var_t *doms; | ||
| 8963 | |||
| 8964 | doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); | ||
| 8965 | if (!doms) | ||
| 8966 | return NULL; | ||
| 8967 | for (i = 0; i < ndoms; i++) { | ||
| 8968 | if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { | ||
| 8969 | free_sched_domains(doms, i); | ||
| 8970 | return NULL; | ||
| 8971 | } | ||
| 8972 | } | ||
| 8973 | return doms; | ||
| 8974 | } | ||
| 8975 | |||
| 8976 | void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) | ||
| 8977 | { | ||
| 8978 | unsigned int i; | ||
| 8979 | for (i = 0; i < ndoms; i++) | ||
| 8980 | free_cpumask_var(doms[i]); | ||
| 8981 | kfree(doms); | ||
| 8982 | } | ||
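alloc_sched_domains() and free_sched_domains() above give callers a properly allocated array of cpumask_var_t, which partition_sched_domains() now takes ownership of. The fragment below is only an illustrative, kernel-context sketch of how a caller might use the pair; the function name is invented, it is not compilable outside the kernel, and a real caller must hold the hotplug lock and handle errors from its own setup.

```c
/* Illustrative only: hand a two-domain partition to the scheduler.
 * Per the comment on partition_sched_domains(), the hotplug lock
 * must be held around the call. */
static int example_repartition(const struct cpumask *set_a,
                               const struct cpumask *set_b)
{
        cpumask_var_t *doms = alloc_sched_domains(2);

        if (!doms)
                return -ENOMEM;
        cpumask_copy(doms[0], set_a);
        cpumask_copy(doms[1], set_b);
        /* partition_sched_domains() owns 'doms' from here on and will
         * free_sched_domains() it when the partition is replaced. */
        partition_sched_domains(2, doms, NULL);
        return 0;
}
```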
| 8983 | |||
| 8729 | /* | 8984 | /* |
| 8730 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 8985 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
| 8731 | * For now this just excludes isolated cpus, but could be used to | 8986 | * For now this just excludes isolated cpus, but could be used to |
| @@ -8737,12 +8992,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map) | |||
| 8737 | 8992 | ||
| 8738 | arch_update_cpu_topology(); | 8993 | arch_update_cpu_topology(); |
| 8739 | ndoms_cur = 1; | 8994 | ndoms_cur = 1; |
| 8740 | doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); | 8995 | doms_cur = alloc_sched_domains(ndoms_cur); |
| 8741 | if (!doms_cur) | 8996 | if (!doms_cur) |
| 8742 | doms_cur = fallback_doms; | 8997 | doms_cur = &fallback_doms; |
| 8743 | cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); | 8998 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); |
| 8744 | dattr_cur = NULL; | 8999 | dattr_cur = NULL; |
| 8745 | err = build_sched_domains(doms_cur); | 9000 | err = build_sched_domains(doms_cur[0]); |
| 8746 | register_sched_domain_sysctl(); | 9001 | register_sched_domain_sysctl(); |
| 8747 | 9002 | ||
| 8748 | return err; | 9003 | return err; |
| @@ -8792,19 +9047,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | |||
| 8792 | * doms_new[] to the current sched domain partitioning, doms_cur[]. | 9047 | * doms_new[] to the current sched domain partitioning, doms_cur[]. |
| 8793 | * It destroys each deleted domain and builds each new domain. | 9048 | * It destroys each deleted domain and builds each new domain. |
| 8794 | * | 9049 | * |
| 8795 | * 'doms_new' is an array of cpumask's of length 'ndoms_new'. | 9050 | * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. |
| 8796 | * The masks don't intersect (don't overlap.) We should setup one | 9051 | * The masks don't intersect (don't overlap.) We should setup one |
| 8797 | * sched domain for each mask. CPUs not in any of the cpumasks will | 9052 | * sched domain for each mask. CPUs not in any of the cpumasks will |
| 8798 | * not be load balanced. If the same cpumask appears both in the | 9053 | * not be load balanced. If the same cpumask appears both in the |
| 8799 | * current 'doms_cur' domains and in the new 'doms_new', we can leave | 9054 | * current 'doms_cur' domains and in the new 'doms_new', we can leave |
| 8800 | * it as it is. | 9055 | * it as it is. |
| 8801 | * | 9056 | * |
| 8802 | * The passed in 'doms_new' should be kmalloc'd. This routine takes | 9057 | * The passed in 'doms_new' should be allocated using |
| 8803 | * ownership of it and will kfree it when done with it. If the caller | 9058 | * alloc_sched_domains. This routine takes ownership of it and will |
| 8804 | * failed the kmalloc call, then it can pass in doms_new == NULL && | 9059 | * free_sched_domains it when done with it. If the caller failed the |
| 8805 | * ndoms_new == 1, and partition_sched_domains() will fallback to | 9060 | * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, |
| 8806 | * the single partition 'fallback_doms', it also forces the domains | 9061 | * and partition_sched_domains() will fallback to the single partition |
| 8807 | * to be rebuilt. | 9062 | * 'fallback_doms', it also forces the domains to be rebuilt. |
| 8808 | * | 9063 | * |
| 8809 | * If doms_new == NULL it will be replaced with cpu_online_mask. | 9064 | * If doms_new == NULL it will be replaced with cpu_online_mask. |
| 8810 | * ndoms_new == 0 is a special case for destroying existing domains, | 9065 | * ndoms_new == 0 is a special case for destroying existing domains, |
| @@ -8812,8 +9067,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | |||
| 8812 | * | 9067 | * |
| 8813 | * Call with hotplug lock held | 9068 | * Call with hotplug lock held |
| 8814 | */ | 9069 | */ |
| 8815 | /* FIXME: Change to struct cpumask *doms_new[] */ | 9070 | void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], |
| 8816 | void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, | ||
| 8817 | struct sched_domain_attr *dattr_new) | 9071 | struct sched_domain_attr *dattr_new) |
| 8818 | { | 9072 | { |
| 8819 | int i, j, n; | 9073 | int i, j, n; |
| @@ -8832,40 +9086,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, | |||
| 8832 | /* Destroy deleted domains */ | 9086 | /* Destroy deleted domains */ |
| 8833 | for (i = 0; i < ndoms_cur; i++) { | 9087 | for (i = 0; i < ndoms_cur; i++) { |
| 8834 | for (j = 0; j < n && !new_topology; j++) { | 9088 | for (j = 0; j < n && !new_topology; j++) { |
| 8835 | if (cpumask_equal(&doms_cur[i], &doms_new[j]) | 9089 | if (cpumask_equal(doms_cur[i], doms_new[j]) |
| 8836 | && dattrs_equal(dattr_cur, i, dattr_new, j)) | 9090 | && dattrs_equal(dattr_cur, i, dattr_new, j)) |
| 8837 | goto match1; | 9091 | goto match1; |
| 8838 | } | 9092 | } |
| 8839 | /* no match - a current sched domain not in new doms_new[] */ | 9093 | /* no match - a current sched domain not in new doms_new[] */ |
| 8840 | detach_destroy_domains(doms_cur + i); | 9094 | detach_destroy_domains(doms_cur[i]); |
| 8841 | match1: | 9095 | match1: |
| 8842 | ; | 9096 | ; |
| 8843 | } | 9097 | } |
| 8844 | 9098 | ||
| 8845 | if (doms_new == NULL) { | 9099 | if (doms_new == NULL) { |
| 8846 | ndoms_cur = 0; | 9100 | ndoms_cur = 0; |
| 8847 | doms_new = fallback_doms; | 9101 | doms_new = &fallback_doms; |
| 8848 | cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); | 9102 | cpumask_andnot(doms_new[0], cpu_online_mask, cpu_isolated_map); |
| 8849 | WARN_ON_ONCE(dattr_new); | 9103 | WARN_ON_ONCE(dattr_new); |
| 8850 | } | 9104 | } |
| 8851 | 9105 | ||
| 8852 | /* Build new domains */ | 9106 | /* Build new domains */ |
| 8853 | for (i = 0; i < ndoms_new; i++) { | 9107 | for (i = 0; i < ndoms_new; i++) { |
| 8854 | for (j = 0; j < ndoms_cur && !new_topology; j++) { | 9108 | for (j = 0; j < ndoms_cur && !new_topology; j++) { |
| 8855 | if (cpumask_equal(&doms_new[i], &doms_cur[j]) | 9109 | if (cpumask_equal(doms_new[i], doms_cur[j]) |
| 8856 | && dattrs_equal(dattr_new, i, dattr_cur, j)) | 9110 | && dattrs_equal(dattr_new, i, dattr_cur, j)) |
| 8857 | goto match2; | 9111 | goto match2; |
| 8858 | } | 9112 | } |
| 8859 | /* no match - add a new doms_new */ | 9113 | /* no match - add a new doms_new */ |
| 8860 | __build_sched_domains(doms_new + i, | 9114 | __build_sched_domains(doms_new[i], |
| 8861 | dattr_new ? dattr_new + i : NULL); | 9115 | dattr_new ? dattr_new + i : NULL); |
| 8862 | match2: | 9116 | match2: |
| 8863 | ; | 9117 | ; |
| 8864 | } | 9118 | } |
| 8865 | 9119 | ||
| 8866 | /* Remember the new sched domains */ | 9120 | /* Remember the new sched domains */ |
| 8867 | if (doms_cur != fallback_doms) | 9121 | if (doms_cur != &fallback_doms) |
| 8868 | kfree(doms_cur); | 9122 | free_sched_domains(doms_cur, ndoms_cur); |
| 8869 | kfree(dattr_cur); /* kfree(NULL) is safe */ | 9123 | kfree(dattr_cur); /* kfree(NULL) is safe */ |
| 8870 | doms_cur = doms_new; | 9124 | doms_cur = doms_new; |
| 8871 | dattr_cur = dattr_new; | 9125 | dattr_cur = dattr_new; |
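The two matching loops in partition_sched_domains() above are a straightforward set reconciliation: current domains with no equal counterpart in the new set are destroyed, new domains with no equal counterpart in the current set are built, and exact matches are left alone. A standalone sketch of that shape, with integers standing in for cpumasks and invented destroy/build helpers:

```c
#include <stdio.h>

static void destroy_domain(int d) { printf("destroy %d\n", d); }
static void build_domain(int d)   { printf("build   %d\n", d); }

/* Integers stand in for cpumasks; '==' stands in for cpumask_equal(). */
static void reconcile(const int *cur, int ncur, const int *new, int nnew)
{
        int i, j;

        for (i = 0; i < ncur; i++) {    /* destroy deleted domains */
                for (j = 0; j < nnew; j++)
                        if (cur[i] == new[j])
                                goto match1;
                destroy_domain(cur[i]);
match1:         ;
        }

        for (i = 0; i < nnew; i++) {    /* build genuinely new domains */
                for (j = 0; j < ncur; j++)
                        if (new[i] == cur[j])
                                goto match2;
                build_domain(new[i]);
match2:         ;
        }
}

int main(void)
{
        int cur[] = { 1, 2, 3 };
        int new[] = { 2, 3, 4 };

        reconcile(cur, 3, new, 3);      /* prints "destroy 1" and "build 4" */
        return 0;
}
```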
| @@ -9015,6 +9269,7 @@ void __init sched_init_smp(void) | |||
| 9015 | cpumask_var_t non_isolated_cpus; | 9269 | cpumask_var_t non_isolated_cpus; |
| 9016 | 9270 | ||
| 9017 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); | 9271 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); |
| 9272 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); | ||
| 9018 | 9273 | ||
| 9019 | #if defined(CONFIG_NUMA) | 9274 | #if defined(CONFIG_NUMA) |
| 9020 | sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), | 9275 | sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), |
| @@ -9046,7 +9301,6 @@ void __init sched_init_smp(void) | |||
| 9046 | sched_init_granularity(); | 9301 | sched_init_granularity(); |
| 9047 | free_cpumask_var(non_isolated_cpus); | 9302 | free_cpumask_var(non_isolated_cpus); |
| 9048 | 9303 | ||
| 9049 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); | ||
| 9050 | init_sched_rt_class(); | 9304 | init_sched_rt_class(); |
| 9051 | } | 9305 | } |
| 9052 | #else | 9306 | #else |
| @@ -9187,10 +9441,6 @@ void __init sched_init(void) | |||
| 9187 | #ifdef CONFIG_CPUMASK_OFFSTACK | 9441 | #ifdef CONFIG_CPUMASK_OFFSTACK |
| 9188 | alloc_size += num_possible_cpus() * cpumask_size(); | 9442 | alloc_size += num_possible_cpus() * cpumask_size(); |
| 9189 | #endif | 9443 | #endif |
| 9190 | /* | ||
| 9191 | * As sched_init() is called before page_alloc is setup, | ||
| 9192 | * we use alloc_bootmem(). | ||
| 9193 | */ | ||
| 9194 | if (alloc_size) { | 9444 | if (alloc_size) { |
| 9195 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); | 9445 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); |
| 9196 | 9446 | ||
| @@ -9259,6 +9509,10 @@ void __init sched_init(void) | |||
| 9259 | #endif /* CONFIG_USER_SCHED */ | 9509 | #endif /* CONFIG_USER_SCHED */ |
| 9260 | #endif /* CONFIG_GROUP_SCHED */ | 9510 | #endif /* CONFIG_GROUP_SCHED */ |
| 9261 | 9511 | ||
| 9512 | #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP | ||
| 9513 | update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), | ||
| 9514 | __alignof__(unsigned long)); | ||
| 9515 | #endif | ||
| 9262 | for_each_possible_cpu(i) { | 9516 | for_each_possible_cpu(i) { |
| 9263 | struct rq *rq; | 9517 | struct rq *rq; |
| 9264 | 9518 | ||
| @@ -9304,11 +9558,11 @@ void __init sched_init(void) | |||
| 9304 | * system cpu resource, based on the weight assigned to root | 9558 | * system cpu resource, based on the weight assigned to root |
| 9305 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished | 9559 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished |
| 9306 | * by letting tasks of init_task_group sit in a separate cfs_rq | 9560 | * by letting tasks of init_task_group sit in a separate cfs_rq |
| 9307 | * (init_cfs_rq) and having one entity represent this group of | 9561 | * (init_tg_cfs_rq) and having one entity represent this group of |
| 9308 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). | 9562 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). |
| 9309 | */ | 9563 | */ |
| 9310 | init_tg_cfs_entry(&init_task_group, | 9564 | init_tg_cfs_entry(&init_task_group, |
| 9311 | &per_cpu(init_cfs_rq, i), | 9565 | &per_cpu(init_tg_cfs_rq, i), |
| 9312 | &per_cpu(init_sched_entity, i), i, 1, | 9566 | &per_cpu(init_sched_entity, i), i, 1, |
| 9313 | root_task_group.se[i]); | 9567 | root_task_group.se[i]); |
| 9314 | 9568 | ||
| @@ -9334,12 +9588,15 @@ void __init sched_init(void) | |||
| 9334 | #ifdef CONFIG_SMP | 9588 | #ifdef CONFIG_SMP |
| 9335 | rq->sd = NULL; | 9589 | rq->sd = NULL; |
| 9336 | rq->rd = NULL; | 9590 | rq->rd = NULL; |
| 9591 | rq->post_schedule = 0; | ||
| 9337 | rq->active_balance = 0; | 9592 | rq->active_balance = 0; |
| 9338 | rq->next_balance = jiffies; | 9593 | rq->next_balance = jiffies; |
| 9339 | rq->push_cpu = 0; | 9594 | rq->push_cpu = 0; |
| 9340 | rq->cpu = i; | 9595 | rq->cpu = i; |
| 9341 | rq->online = 0; | 9596 | rq->online = 0; |
| 9342 | rq->migration_thread = NULL; | 9597 | rq->migration_thread = NULL; |
| 9598 | rq->idle_stamp = 0; | ||
| 9599 | rq->avg_idle = 2*sysctl_sched_migration_cost; | ||
| 9343 | INIT_LIST_HEAD(&rq->migration_queue); | 9600 | INIT_LIST_HEAD(&rq->migration_queue); |
| 9344 | rq_attach_root(rq, &def_root_domain); | 9601 | rq_attach_root(rq, &def_root_domain); |
| 9345 | #endif | 9602 | #endif |
| @@ -9383,28 +9640,37 @@ void __init sched_init(void) | |||
| 9383 | current->sched_class = &fair_sched_class; | 9640 | current->sched_class = &fair_sched_class; |
| 9384 | 9641 | ||
| 9385 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ | 9642 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ |
| 9386 | alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); | 9643 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); |
| 9387 | #ifdef CONFIG_SMP | 9644 | #ifdef CONFIG_SMP |
| 9388 | #ifdef CONFIG_NO_HZ | 9645 | #ifdef CONFIG_NO_HZ |
| 9389 | alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); | 9646 | zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); |
| 9390 | alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); | 9647 | alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); |
| 9391 | #endif | 9648 | #endif |
| 9392 | alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 9649 | /* May be allocated at isolcpus cmdline parse time */ |
| 9650 | if (cpu_isolated_map == NULL) | ||
| 9651 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | ||
| 9393 | #endif /* SMP */ | 9652 | #endif /* SMP */ |
| 9394 | 9653 | ||
| 9395 | perf_counter_init(); | 9654 | perf_event_init(); |
| 9396 | 9655 | ||
| 9397 | scheduler_running = 1; | 9656 | scheduler_running = 1; |
| 9398 | } | 9657 | } |
| 9399 | 9658 | ||
| 9400 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 9659 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
| 9401 | void __might_sleep(char *file, int line) | 9660 | static inline int preempt_count_equals(int preempt_offset) |
| 9661 | { | ||
| 9662 | int nested = preempt_count() & ~PREEMPT_ACTIVE; | ||
| 9663 | |||
| 9664 | return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); | ||
| 9665 | } | ||
| 9666 | |||
| 9667 | void __might_sleep(char *file, int line, int preempt_offset) | ||
| 9402 | { | 9668 | { |
| 9403 | #ifdef in_atomic | 9669 | #ifdef in_atomic |
| 9404 | static unsigned long prev_jiffy; /* ratelimiting */ | 9670 | static unsigned long prev_jiffy; /* ratelimiting */ |
| 9405 | 9671 | ||
| 9406 | if ((!in_atomic() && !irqs_disabled()) || | 9672 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || |
| 9407 | system_state != SYSTEM_RUNNING || oops_in_progress) | 9673 | system_state != SYSTEM_RUNNING || oops_in_progress) |
| 9408 | return; | 9674 | return; |
| 9409 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) | 9675 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) |
| 9410 | return; | 9676 | return; |
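The __might_sleep() hunk above changes only the "may this context sleep" test, comparing the preempt count against an expected offset; the once-per-second ratelimit via prev_jiffy is unchanged. For reference, that ratelimit idiom looks like this in standalone form, using wall-clock seconds instead of jiffies; it is an analog, not the kernel code.

```c
#include <stdio.h>
#include <time.h>

/* Emit a warning at most once per second, like the prev_jiffy check. */
static void warn_ratelimited(const char *msg)
{
        static time_t prev;
        time_t now = time(NULL);

        if (prev && now < prev + 1)
                return;                 /* fired less than a second ago */
        prev = now;
        fprintf(stderr, "warning: %s\n", msg);
}

int main(void)
{
        int i;

        for (i = 0; i < 5; i++)         /* typically only the first call prints */
                warn_ratelimited("called from atomic context");
        return 0;
}
```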
| @@ -10157,7 +10423,7 @@ static int sched_rt_global_constraints(void) | |||
| 10157 | #endif /* CONFIG_RT_GROUP_SCHED */ | 10423 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 10158 | 10424 | ||
| 10159 | int sched_rt_handler(struct ctl_table *table, int write, | 10425 | int sched_rt_handler(struct ctl_table *table, int write, |
| 10160 | struct file *filp, void __user *buffer, size_t *lenp, | 10426 | void __user *buffer, size_t *lenp, |
| 10161 | loff_t *ppos) | 10427 | loff_t *ppos) |
| 10162 | { | 10428 | { |
| 10163 | int ret; | 10429 | int ret; |
| @@ -10168,7 +10434,7 @@ int sched_rt_handler(struct ctl_table *table, int write, | |||
| 10168 | old_period = sysctl_sched_rt_period; | 10434 | old_period = sysctl_sched_rt_period; |
| 10169 | old_runtime = sysctl_sched_rt_runtime; | 10435 | old_runtime = sysctl_sched_rt_runtime; |
| 10170 | 10436 | ||
| 10171 | ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); | 10437 | ret = proc_dointvec(table, write, buffer, lenp, ppos); |
| 10172 | 10438 | ||
| 10173 | if (!ret && write) { | 10439 | if (!ret && write) { |
| 10174 | ret = sched_rt_global_constraints(); | 10440 | ret = sched_rt_global_constraints(); |
| @@ -10222,8 +10488,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
| 10222 | } | 10488 | } |
| 10223 | 10489 | ||
| 10224 | static int | 10490 | static int |
| 10225 | cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | 10491 | cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
| 10226 | struct task_struct *tsk) | ||
| 10227 | { | 10492 | { |
| 10228 | #ifdef CONFIG_RT_GROUP_SCHED | 10493 | #ifdef CONFIG_RT_GROUP_SCHED |
| 10229 | if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) | 10494 | if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) |
| @@ -10233,15 +10498,45 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | |||
| 10233 | if (tsk->sched_class != &fair_sched_class) | 10498 | if (tsk->sched_class != &fair_sched_class) |
| 10234 | return -EINVAL; | 10499 | return -EINVAL; |
| 10235 | #endif | 10500 | #endif |
| 10501 | return 0; | ||
| 10502 | } | ||
| 10236 | 10503 | ||
| 10504 | static int | ||
| 10505 | cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | ||
| 10506 | struct task_struct *tsk, bool threadgroup) | ||
| 10507 | { | ||
| 10508 | int retval = cpu_cgroup_can_attach_task(cgrp, tsk); | ||
| 10509 | if (retval) | ||
| 10510 | return retval; | ||
| 10511 | if (threadgroup) { | ||
| 10512 | struct task_struct *c; | ||
| 10513 | rcu_read_lock(); | ||
| 10514 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | ||
| 10515 | retval = cpu_cgroup_can_attach_task(cgrp, c); | ||
| 10516 | if (retval) { | ||
| 10517 | rcu_read_unlock(); | ||
| 10518 | return retval; | ||
| 10519 | } | ||
| 10520 | } | ||
| 10521 | rcu_read_unlock(); | ||
| 10522 | } | ||
| 10237 | return 0; | 10523 | return 0; |
| 10238 | } | 10524 | } |
| 10239 | 10525 | ||
| 10240 | static void | 10526 | static void |
| 10241 | cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | 10527 | cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, |
| 10242 | struct cgroup *old_cont, struct task_struct *tsk) | 10528 | struct cgroup *old_cont, struct task_struct *tsk, |
| 10529 | bool threadgroup) | ||
| 10243 | { | 10530 | { |
| 10244 | sched_move_task(tsk); | 10531 | sched_move_task(tsk); |
| 10532 | if (threadgroup) { | ||
| 10533 | struct task_struct *c; | ||
| 10534 | rcu_read_lock(); | ||
| 10535 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | ||
| 10536 | sched_move_task(c); | ||
| 10537 | } | ||
| 10538 | rcu_read_unlock(); | ||
| 10539 | } | ||
| 10245 | } | 10540 | } |
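The threadgroup variants of can_attach/attach above follow a two-phase shape: every thread in the group is checked first, and only if all of them are eligible is every thread actually moved, so one ineligible thread vetoes the whole group. A standalone sketch of that validate-then-apply pattern, with integers standing in for tasks and an invented eligibility rule:

```c
#include <stdio.h>

static int  can_move(int task)  { return task % 2 == 0; }   /* invented rule */
static void move_task(int task) { printf("moved %d\n", task); }

/* Validate every member first, then move every member, so a single
 * ineligible member vetoes the whole group (the can_attach/attach split). */
static int move_group(const int *tasks, int n)
{
        int i;

        for (i = 0; i < n; i++)
                if (!can_move(tasks[i]))
                        return -1;      /* like can_attach() failing */
        for (i = 0; i < n; i++)
                move_task(tasks[i]);    /* like attach() moving each thread */
        return 0;
}

int main(void)
{
        int ok[]  = { 2, 4, 6 };
        int bad[] = { 2, 3, 6 };

        printf("ok group:  %d\n", move_group(ok, 3));   /* moves all three */
        printf("bad group: %d\n", move_group(bad, 3));  /* moves nothing   */
        return 0;
}
```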
| 10246 | 10541 | ||
| 10247 | #ifdef CONFIG_FAIR_GROUP_SCHED | 10542 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| @@ -10581,3 +10876,114 @@ struct cgroup_subsys cpuacct_subsys = { | |||
| 10581 | .subsys_id = cpuacct_subsys_id, | 10876 | .subsys_id = cpuacct_subsys_id, |
| 10582 | }; | 10877 | }; |
| 10583 | #endif /* CONFIG_CGROUP_CPUACCT */ | 10878 | #endif /* CONFIG_CGROUP_CPUACCT */ |
| 10879 | |||
| 10880 | #ifndef CONFIG_SMP | ||
| 10881 | |||
| 10882 | int rcu_expedited_torture_stats(char *page) | ||
| 10883 | { | ||
| 10884 | return 0; | ||
| 10885 | } | ||
| 10886 | EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); | ||
| 10887 | |||
| 10888 | void synchronize_sched_expedited(void) | ||
| 10889 | { | ||
| 10890 | } | ||
| 10891 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
| 10892 | |||
| 10893 | #else /* #ifndef CONFIG_SMP */ | ||
| 10894 | |||
| 10895 | static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); | ||
| 10896 | static DEFINE_MUTEX(rcu_sched_expedited_mutex); | ||
| 10897 | |||
| 10898 | #define RCU_EXPEDITED_STATE_POST -2 | ||
| 10899 | #define RCU_EXPEDITED_STATE_IDLE -1 | ||
| 10900 | |||
| 10901 | static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; | ||
| 10902 | |||
| 10903 | int rcu_expedited_torture_stats(char *page) | ||
| 10904 | { | ||
| 10905 | int cnt = 0; | ||
| 10906 | int cpu; | ||
| 10907 | |||
| 10908 | cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); | ||
| 10909 | for_each_online_cpu(cpu) { | ||
| 10910 | cnt += sprintf(&page[cnt], " %d:%d", | ||
| 10911 | cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); | ||
| 10912 | } | ||
| 10913 | cnt += sprintf(&page[cnt], "\n"); | ||
| 10914 | return cnt; | ||
| 10915 | } | ||
| 10916 | EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); | ||
| 10917 | |||
| 10918 | static long synchronize_sched_expedited_count; | ||
| 10919 | |||
| 10920 | /* | ||
| 10921 | * Wait for an rcu-sched grace period to elapse, but use "big hammer" | ||
| 10922 | * approach to force grace period to end quickly. This consumes | ||
| 10923 | * significant time on all CPUs, and is thus not recommended for | ||
| 10924 | * any sort of common-case code. | ||
| 10925 | * | ||
| 10926 | * Note that it is illegal to call this function while holding any | ||
| 10927 | * lock that is acquired by a CPU-hotplug notifier. Failing to | ||
| 10928 | * observe this restriction will result in deadlock. | ||
| 10929 | */ | ||
| 10930 | void synchronize_sched_expedited(void) | ||
| 10931 | { | ||
| 10932 | int cpu; | ||
| 10933 | unsigned long flags; | ||
| 10934 | bool need_full_sync = 0; | ||
| 10935 | struct rq *rq; | ||
| 10936 | struct migration_req *req; | ||
| 10937 | long snap; | ||
| 10938 | int trycount = 0; | ||
| 10939 | |||
| 10940 | smp_mb(); /* ensure prior mod happens before capturing snap. */ | ||
| 10941 | snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; | ||
| 10942 | get_online_cpus(); | ||
| 10943 | while (!mutex_trylock(&rcu_sched_expedited_mutex)) { | ||
| 10944 | put_online_cpus(); | ||
| 10945 | if (trycount++ < 10) | ||
| 10946 | udelay(trycount * num_online_cpus()); | ||
| 10947 | else { | ||
| 10948 | synchronize_sched(); | ||
| 10949 | return; | ||
| 10950 | } | ||
| 10951 | if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { | ||
| 10952 | smp_mb(); /* ensure test happens before caller kfree */ | ||
| 10953 | return; | ||
| 10954 | } | ||
| 10955 | get_online_cpus(); | ||
| 10956 | } | ||
| 10957 | rcu_expedited_state = RCU_EXPEDITED_STATE_POST; | ||
| 10958 | for_each_online_cpu(cpu) { | ||
| 10959 | rq = cpu_rq(cpu); | ||
| 10960 | req = &per_cpu(rcu_migration_req, cpu); | ||
| 10961 | init_completion(&req->done); | ||
| 10962 | req->task = NULL; | ||
| 10963 | req->dest_cpu = RCU_MIGRATION_NEED_QS; | ||
| 10964 | spin_lock_irqsave(&rq->lock, flags); | ||
| 10965 | list_add(&req->list, &rq->migration_queue); | ||
| 10966 | spin_unlock_irqrestore(&rq->lock, flags); | ||
| 10967 | wake_up_process(rq->migration_thread); | ||
| 10968 | } | ||
| 10969 | for_each_online_cpu(cpu) { | ||
| 10970 | rcu_expedited_state = cpu; | ||
| 10971 | req = &per_cpu(rcu_migration_req, cpu); | ||
| 10972 | rq = cpu_rq(cpu); | ||
| 10973 | wait_for_completion(&req->done); | ||
| 10974 | spin_lock_irqsave(&rq->lock, flags); | ||
| 10975 | if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) | ||
| 10976 | need_full_sync = 1; | ||
| 10977 | req->dest_cpu = RCU_MIGRATION_IDLE; | ||
| 10978 | spin_unlock_irqrestore(&rq->lock, flags); | ||
| 10979 | } | ||
| 10980 | rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; | ||
| 10981 | synchronize_sched_expedited_count++; | ||
| 10982 | mutex_unlock(&rcu_sched_expedited_mutex); | ||
| 10983 | put_online_cpus(); | ||
| 10984 | if (need_full_sync) | ||
| 10985 | synchronize_sched(); | ||
| 10986 | } | ||
| 10987 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
| 10988 | |||
| 10989 | #endif /* #else #ifndef CONFIG_SMP */ | ||
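synchronize_sched_expedited() above snapshots a completion counter before trying the mutex; if it cannot get the mutex and the counter has since advanced past the snapshot, another caller's expedited pass already covers it and it returns early, falling back to synchronize_sched() after too many retries. Below is a simplified standalone pthread analog of that counter-snapshot idiom; the kernel's actual check is slightly stricter so the covering pass is guaranteed to have started after the snapshot, and the udelay-style backoff and full-sync fallback are omitted here.

```c
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static pthread_mutex_t pass_mutex = PTHREAD_MUTEX_INITIALIZER;
static long passes_done;                /* bumped after every completed pass */

static void do_heavy_global_pass(void)
{
        /* stand-in for queueing work on every CPU and waiting for it */
}

/* Return once at least one full pass has completed since we were called. */
static void expedited_pass(void)
{
        long snap = __atomic_load_n(&passes_done, __ATOMIC_SEQ_CST);

        while (pthread_mutex_trylock(&pass_mutex) != 0) {
                /* Someone else is mid-pass.  If the counter has moved past
                 * our snapshot, a pass completed after we started: done. */
                if (__atomic_load_n(&passes_done, __ATOMIC_SEQ_CST) > snap)
                        return;
                sched_yield();
        }
        do_heavy_global_pass();
        __atomic_add_fetch(&passes_done, 1, __ATOMIC_SEQ_CST);
        pthread_mutex_unlock(&pass_mutex);
}

int main(void)
{
        expedited_pass();               /* build with -pthread */
        printf("passes completed: %ld\n",
               __atomic_load_n(&passes_done, __ATOMIC_SEQ_CST));
        return 0;
}
```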
