Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/Makefile    |   2
-rw-r--r--  kernel/sched/core.c      | 667
-rw-r--r--  kernel/sched/debug.c     |  12
-rw-r--r--  kernel/sched/fair.c      | 543
-rw-r--r--  kernel/sched/features.h  |   1
-rw-r--r--  kernel/sched/idle_task.c |   2
-rw-r--r--  kernel/sched/rt.c        | 107
-rw-r--r--  kernel/sched/sched.h     |  10
8 files changed, 635 insertions(+), 709 deletions(-)
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 9a7dd35102a3..173ea52f3af0 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,5 +16,3 @@ obj-$(CONFIG_SMP) += cpupri.o | |||
16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
17 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 17 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
18 | obj-$(CONFIG_SCHED_DEBUG) += debug.o | 18 | obj-$(CONFIG_SCHED_DEBUG) += debug.o |
19 | |||
20 | |||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4603b9d8f30a..d5594a4268d4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -83,6 +83,7 @@ | |||
83 | 83 | ||
84 | #include "sched.h" | 84 | #include "sched.h" |
85 | #include "../workqueue_sched.h" | 85 | #include "../workqueue_sched.h" |
86 | #include "../smpboot.h" | ||
86 | 87 | ||
87 | #define CREATE_TRACE_POINTS | 88 | #define CREATE_TRACE_POINTS |
88 | #include <trace/events/sched.h> | 89 | #include <trace/events/sched.h> |
@@ -141,9 +142,8 @@ const_debug unsigned int sysctl_sched_features = | |||
141 | #define SCHED_FEAT(name, enabled) \ | 142 | #define SCHED_FEAT(name, enabled) \ |
142 | #name , | 143 | #name , |
143 | 144 | ||
144 | static __read_mostly char *sched_feat_names[] = { | 145 | static const char * const sched_feat_names[] = { |
145 | #include "features.h" | 146 | #include "features.h" |
146 | NULL | ||
147 | }; | 147 | }; |
148 | 148 | ||
149 | #undef SCHED_FEAT | 149 | #undef SCHED_FEAT |
@@ -692,8 +692,6 @@ int tg_nop(struct task_group *tg, void *data) | |||
692 | } | 692 | } |
693 | #endif | 693 | #endif |
694 | 694 | ||
695 | void update_cpu_load(struct rq *this_rq); | ||
696 | |||
697 | static void set_load_weight(struct task_struct *p) | 695 | static void set_load_weight(struct task_struct *p) |
698 | { | 696 | { |
699 | int prio = p->static_prio - MAX_RT_PRIO; | 697 | int prio = p->static_prio - MAX_RT_PRIO; |
@@ -2083,6 +2081,7 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2083 | #endif | 2081 | #endif |
2084 | 2082 | ||
2085 | /* Here we just switch the register state and the stack. */ | 2083 | /* Here we just switch the register state and the stack. */ |
2084 | rcu_switch_from(prev); | ||
2086 | switch_to(prev, next, prev); | 2085 | switch_to(prev, next, prev); |
2087 | 2086 | ||
2088 | barrier(); | 2087 | barrier(); |
@@ -2486,22 +2485,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | |||
2486 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called | 2485 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called |
2487 | * every tick. We fix it up based on jiffies. | 2486 | * every tick. We fix it up based on jiffies. |
2488 | */ | 2487 | */ |
2489 | void update_cpu_load(struct rq *this_rq) | 2488 | static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, |
2489 | unsigned long pending_updates) | ||
2490 | { | 2490 | { |
2491 | unsigned long this_load = this_rq->load.weight; | ||
2492 | unsigned long curr_jiffies = jiffies; | ||
2493 | unsigned long pending_updates; | ||
2494 | int i, scale; | 2491 | int i, scale; |
2495 | 2492 | ||
2496 | this_rq->nr_load_updates++; | 2493 | this_rq->nr_load_updates++; |
2497 | 2494 | ||
2498 | /* Avoid repeated calls on same jiffy, when moving in and out of idle */ | ||
2499 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
2500 | return; | ||
2501 | |||
2502 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
2503 | this_rq->last_load_update_tick = curr_jiffies; | ||
2504 | |||
2505 | /* Update our load: */ | 2495 | /* Update our load: */ |
2506 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ | 2496 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ |
2507 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | 2497 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { |
@@ -2526,9 +2516,78 @@ void update_cpu_load(struct rq *this_rq) | |||
2526 | sched_avg_update(this_rq); | 2516 | sched_avg_update(this_rq); |
2527 | } | 2517 | } |
2528 | 2518 | ||
2519 | #ifdef CONFIG_NO_HZ | ||
2520 | /* | ||
2521 | * There is no sane way to deal with nohz on smp when using jiffies because the | ||
2522 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading | ||
2523 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. | ||
2524 | * | ||
2525 | * Therefore we cannot use the delta approach from the regular tick since that | ||
2526 | * would seriously skew the load calculation. However we'll make do for those | ||
2527 | * updates happening while idle (nohz_idle_balance) or coming out of idle | ||
2528 | * (tick_nohz_idle_exit). | ||
2529 | * | ||
2530 | * This means we might still be one tick off for nohz periods. | ||
2531 | */ | ||
2532 | |||
2533 | /* | ||
2534 | * Called from nohz_idle_balance() to update the load ratings before doing the | ||
2535 | * idle balance. | ||
2536 | */ | ||
2537 | void update_idle_cpu_load(struct rq *this_rq) | ||
2538 | { | ||
2539 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
2540 | unsigned long load = this_rq->load.weight; | ||
2541 | unsigned long pending_updates; | ||
2542 | |||
2543 | /* | ||
2544 | * bail if there's load or we're actually up-to-date. | ||
2545 | */ | ||
2546 | if (load || curr_jiffies == this_rq->last_load_update_tick) | ||
2547 | return; | ||
2548 | |||
2549 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
2550 | this_rq->last_load_update_tick = curr_jiffies; | ||
2551 | |||
2552 | __update_cpu_load(this_rq, load, pending_updates); | ||
2553 | } | ||
2554 | |||
2555 | /* | ||
2556 | * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. | ||
2557 | */ | ||
2558 | void update_cpu_load_nohz(void) | ||
2559 | { | ||
2560 | struct rq *this_rq = this_rq(); | ||
2561 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
2562 | unsigned long pending_updates; | ||
2563 | |||
2564 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
2565 | return; | ||
2566 | |||
2567 | raw_spin_lock(&this_rq->lock); | ||
2568 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
2569 | if (pending_updates) { | ||
2570 | this_rq->last_load_update_tick = curr_jiffies; | ||
2571 | /* | ||
2572 | * We were idle, this means load 0, the current load might be | ||
2573 | * !0 due to remote wakeups and the sort. | ||
2574 | */ | ||
2575 | __update_cpu_load(this_rq, 0, pending_updates); | ||
2576 | } | ||
2577 | raw_spin_unlock(&this_rq->lock); | ||
2578 | } | ||
2579 | #endif /* CONFIG_NO_HZ */ | ||
2580 | |||
2581 | /* | ||
2582 | * Called from scheduler_tick() | ||
2583 | */ | ||
2529 | static void update_cpu_load_active(struct rq *this_rq) | 2584 | static void update_cpu_load_active(struct rq *this_rq) |
2530 | { | 2585 | { |
2531 | update_cpu_load(this_rq); | 2586 | /* |
2587 | * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). | ||
2588 | */ | ||
2589 | this_rq->last_load_update_tick = jiffies; | ||
2590 | __update_cpu_load(this_rq, this_rq->load.weight, 1); | ||
2532 | 2591 | ||
2533 | calc_load_account_active(this_rq); | 2592 | calc_load_account_active(this_rq); |
2534 | } | 2593 | } |
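For reference, a rough userspace sketch (not kernel code) of what the pending_updates argument buys: when a CPU has been tickless for several jiffies, the per-index cpu_load[] history has to be decayed as if that many updates had happened before the current load is folded in. The kernel replaces the inner loop below with a precomputed table (decay_load_missed()) and slightly different rounding; the array size and names here merely mirror the hunk above.

/*
 * Illustrative sketch only -- catching up a decayed load history after
 * "pending_updates" missed ticks.  Per-tick decay is new = old * (scale-1)/scale.
 */
#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5

static unsigned long cpu_load[CPU_LOAD_IDX_MAX];

static void catch_up_cpu_load(unsigned long this_load,
                              unsigned long pending_updates)
{
        int i, scale;

        cpu_load[0] = this_load;        /* idx 0 tracks the instantaneous load */
        for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
                unsigned long old_load = cpu_load[i], n;

                /* decay the history once per missed tick */
                for (n = 0; n < pending_updates; n++)
                        old_load = old_load * (scale - 1) / scale;

                /* then blend in the current load */
                cpu_load[i] = (old_load * (scale - 1) + this_load) / scale;
        }
}

int main(void)
{
        int i;

        for (i = 0; i < CPU_LOAD_IDX_MAX; i++)
                cpu_load[i] = 1024;

        catch_up_cpu_load(0, 8);        /* CPU was idle for 8 ticks */
        for (i = 0; i < CPU_LOAD_IDX_MAX; i++)
                printf("cpu_load[%d] = %lu\n", i, cpu_load[i]);
        return 0;
}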
@@ -3113,6 +3172,7 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
3113 | if (irqs_disabled()) | 3172 | if (irqs_disabled()) |
3114 | print_irqtrace_events(prev); | 3173 | print_irqtrace_events(prev); |
3115 | dump_stack(); | 3174 | dump_stack(); |
3175 | add_taint(TAINT_WARN); | ||
3116 | } | 3176 | } |
3117 | 3177 | ||
3118 | /* | 3178 | /* |
@@ -4042,11 +4102,8 @@ static bool check_same_owner(struct task_struct *p) | |||
4042 | 4102 | ||
4043 | rcu_read_lock(); | 4103 | rcu_read_lock(); |
4044 | pcred = __task_cred(p); | 4104 | pcred = __task_cred(p); |
4045 | if (cred->user->user_ns == pcred->user->user_ns) | 4105 | match = (uid_eq(cred->euid, pcred->euid) || |
4046 | match = (cred->euid == pcred->euid || | 4106 | uid_eq(cred->euid, pcred->uid)); |
4047 | cred->euid == pcred->uid); | ||
4048 | else | ||
4049 | match = false; | ||
4050 | rcu_read_unlock(); | 4107 | rcu_read_unlock(); |
4051 | return match; | 4108 | return match; |
4052 | } | 4109 | } |
@@ -4957,7 +5014,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | |||
4957 | p->sched_class->set_cpus_allowed(p, new_mask); | 5014 | p->sched_class->set_cpus_allowed(p, new_mask); |
4958 | 5015 | ||
4959 | cpumask_copy(&p->cpus_allowed, new_mask); | 5016 | cpumask_copy(&p->cpus_allowed, new_mask); |
4960 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); | 5017 | p->nr_cpus_allowed = cpumask_weight(new_mask); |
4961 | } | 5018 | } |
4962 | 5019 | ||
4963 | /* | 5020 | /* |
@@ -5499,15 +5556,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ | |||
5499 | 5556 | ||
5500 | #ifdef CONFIG_SCHED_DEBUG | 5557 | #ifdef CONFIG_SCHED_DEBUG |
5501 | 5558 | ||
5502 | static __read_mostly int sched_domain_debug_enabled; | 5559 | static __read_mostly int sched_debug_enabled; |
5503 | 5560 | ||
5504 | static int __init sched_domain_debug_setup(char *str) | 5561 | static int __init sched_debug_setup(char *str) |
5505 | { | 5562 | { |
5506 | sched_domain_debug_enabled = 1; | 5563 | sched_debug_enabled = 1; |
5507 | 5564 | ||
5508 | return 0; | 5565 | return 0; |
5509 | } | 5566 | } |
5510 | early_param("sched_debug", sched_domain_debug_setup); | 5567 | early_param("sched_debug", sched_debug_setup); |
5568 | |||
5569 | static inline bool sched_debug(void) | ||
5570 | { | ||
5571 | return sched_debug_enabled; | ||
5572 | } | ||
5511 | 5573 | ||
5512 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | 5574 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, |
5513 | struct cpumask *groupmask) | 5575 | struct cpumask *groupmask) |
@@ -5547,7 +5609,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
5547 | break; | 5609 | break; |
5548 | } | 5610 | } |
5549 | 5611 | ||
5550 | if (!group->sgp->power) { | 5612 | /* |
5613 | * Even though we initialize ->power to something semi-sane, | ||
5614 | * we leave power_orig unset. This allows us to detect if | ||
5615 | * domain iteration is still funny without causing /0 traps. | ||
5616 | */ | ||
5617 | if (!group->sgp->power_orig) { | ||
5551 | printk(KERN_CONT "\n"); | 5618 | printk(KERN_CONT "\n"); |
5552 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 5619 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
5553 | "set\n"); | 5620 | "set\n"); |
@@ -5560,7 +5627,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
5560 | break; | 5627 | break; |
5561 | } | 5628 | } |
5562 | 5629 | ||
5563 | if (cpumask_intersects(groupmask, sched_group_cpus(group))) { | 5630 | if (!(sd->flags & SD_OVERLAP) && |
5631 | cpumask_intersects(groupmask, sched_group_cpus(group))) { | ||
5564 | printk(KERN_CONT "\n"); | 5632 | printk(KERN_CONT "\n"); |
5565 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | 5633 | printk(KERN_ERR "ERROR: repeated CPUs\n"); |
5566 | break; | 5634 | break; |
@@ -5594,7 +5662,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
5594 | { | 5662 | { |
5595 | int level = 0; | 5663 | int level = 0; |
5596 | 5664 | ||
5597 | if (!sched_domain_debug_enabled) | 5665 | if (!sched_debug_enabled) |
5598 | return; | 5666 | return; |
5599 | 5667 | ||
5600 | if (!sd) { | 5668 | if (!sd) { |
@@ -5615,6 +5683,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
5615 | } | 5683 | } |
5616 | #else /* !CONFIG_SCHED_DEBUG */ | 5684 | #else /* !CONFIG_SCHED_DEBUG */ |
5617 | # define sched_domain_debug(sd, cpu) do { } while (0) | 5685 | # define sched_domain_debug(sd, cpu) do { } while (0) |
5686 | static inline bool sched_debug(void) | ||
5687 | { | ||
5688 | return false; | ||
5689 | } | ||
5618 | #endif /* CONFIG_SCHED_DEBUG */ | 5690 | #endif /* CONFIG_SCHED_DEBUG */ |
5619 | 5691 | ||
5620 | static int sd_degenerate(struct sched_domain *sd) | 5692 | static int sd_degenerate(struct sched_domain *sd) |
@@ -5898,99 +5970,11 @@ static int __init isolated_cpu_setup(char *str) | |||
5898 | 5970 | ||
5899 | __setup("isolcpus=", isolated_cpu_setup); | 5971 | __setup("isolcpus=", isolated_cpu_setup); |
5900 | 5972 | ||
5901 | #ifdef CONFIG_NUMA | ||
5902 | |||
5903 | /** | ||
5904 | * find_next_best_node - find the next node to include in a sched_domain | ||
5905 | * @node: node whose sched_domain we're building | ||
5906 | * @used_nodes: nodes already in the sched_domain | ||
5907 | * | ||
5908 | * Find the next node to include in a given scheduling domain. Simply | ||
5909 | * finds the closest node not already in the @used_nodes map. | ||
5910 | * | ||
5911 | * Should use nodemask_t. | ||
5912 | */ | ||
5913 | static int find_next_best_node(int node, nodemask_t *used_nodes) | ||
5914 | { | ||
5915 | int i, n, val, min_val, best_node = -1; | ||
5916 | |||
5917 | min_val = INT_MAX; | ||
5918 | |||
5919 | for (i = 0; i < nr_node_ids; i++) { | ||
5920 | /* Start at @node */ | ||
5921 | n = (node + i) % nr_node_ids; | ||
5922 | |||
5923 | if (!nr_cpus_node(n)) | ||
5924 | continue; | ||
5925 | |||
5926 | /* Skip already used nodes */ | ||
5927 | if (node_isset(n, *used_nodes)) | ||
5928 | continue; | ||
5929 | |||
5930 | /* Simple min distance search */ | ||
5931 | val = node_distance(node, n); | ||
5932 | |||
5933 | if (val < min_val) { | ||
5934 | min_val = val; | ||
5935 | best_node = n; | ||
5936 | } | ||
5937 | } | ||
5938 | |||
5939 | if (best_node != -1) | ||
5940 | node_set(best_node, *used_nodes); | ||
5941 | return best_node; | ||
5942 | } | ||
5943 | |||
5944 | /** | ||
5945 | * sched_domain_node_span - get a cpumask for a node's sched_domain | ||
5946 | * @node: node whose cpumask we're constructing | ||
5947 | * @span: resulting cpumask | ||
5948 | * | ||
5949 | * Given a node, construct a good cpumask for its sched_domain to span. It | ||
5950 | * should be one that prevents unnecessary balancing, but also spreads tasks | ||
5951 | * out optimally. | ||
5952 | */ | ||
5953 | static void sched_domain_node_span(int node, struct cpumask *span) | ||
5954 | { | ||
5955 | nodemask_t used_nodes; | ||
5956 | int i; | ||
5957 | |||
5958 | cpumask_clear(span); | ||
5959 | nodes_clear(used_nodes); | ||
5960 | |||
5961 | cpumask_or(span, span, cpumask_of_node(node)); | ||
5962 | node_set(node, used_nodes); | ||
5963 | |||
5964 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | ||
5965 | int next_node = find_next_best_node(node, &used_nodes); | ||
5966 | if (next_node < 0) | ||
5967 | break; | ||
5968 | cpumask_or(span, span, cpumask_of_node(next_node)); | ||
5969 | } | ||
5970 | } | ||
5971 | |||
5972 | static const struct cpumask *cpu_node_mask(int cpu) | ||
5973 | { | ||
5974 | lockdep_assert_held(&sched_domains_mutex); | ||
5975 | |||
5976 | sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); | ||
5977 | |||
5978 | return sched_domains_tmpmask; | ||
5979 | } | ||
5980 | |||
5981 | static const struct cpumask *cpu_allnodes_mask(int cpu) | ||
5982 | { | ||
5983 | return cpu_possible_mask; | ||
5984 | } | ||
5985 | #endif /* CONFIG_NUMA */ | ||
5986 | |||
5987 | static const struct cpumask *cpu_cpu_mask(int cpu) | 5973 | static const struct cpumask *cpu_cpu_mask(int cpu) |
5988 | { | 5974 | { |
5989 | return cpumask_of_node(cpu_to_node(cpu)); | 5975 | return cpumask_of_node(cpu_to_node(cpu)); |
5990 | } | 5976 | } |
5991 | 5977 | ||
5992 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | ||
5993 | |||
5994 | struct sd_data { | 5978 | struct sd_data { |
5995 | struct sched_domain **__percpu sd; | 5979 | struct sched_domain **__percpu sd; |
5996 | struct sched_group **__percpu sg; | 5980 | struct sched_group **__percpu sg; |
@@ -6020,9 +6004,48 @@ struct sched_domain_topology_level { | |||
6020 | sched_domain_init_f init; | 6004 | sched_domain_init_f init; |
6021 | sched_domain_mask_f mask; | 6005 | sched_domain_mask_f mask; |
6022 | int flags; | 6006 | int flags; |
6007 | int numa_level; | ||
6023 | struct sd_data data; | 6008 | struct sd_data data; |
6024 | }; | 6009 | }; |
6025 | 6010 | ||
6011 | /* | ||
6012 | * Build an iteration mask that can exclude certain CPUs from the upwards | ||
6013 | * domain traversal. | ||
6014 | * | ||
6015 | * Asymmetric node setups can result in situations where the domain tree is of | ||
6016 | * unequal depth, make sure to skip domains that already cover the entire | ||
6017 | * range. | ||
6018 | * | ||
6019 | * In that case build_sched_domains() will have terminated the iteration early | ||
6020 | * and our sibling sd spans will be empty. Domains should always include the | ||
6021 | * cpu they're built on, so check that. | ||
6022 | * | ||
6023 | */ | ||
6024 | static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) | ||
6025 | { | ||
6026 | const struct cpumask *span = sched_domain_span(sd); | ||
6027 | struct sd_data *sdd = sd->private; | ||
6028 | struct sched_domain *sibling; | ||
6029 | int i; | ||
6030 | |||
6031 | for_each_cpu(i, span) { | ||
6032 | sibling = *per_cpu_ptr(sdd->sd, i); | ||
6033 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) | ||
6034 | continue; | ||
6035 | |||
6036 | cpumask_set_cpu(i, sched_group_mask(sg)); | ||
6037 | } | ||
6038 | } | ||
6039 | |||
6040 | /* | ||
6041 | * Return the canonical balance cpu for this group, this is the first cpu | ||
6042 | * of this group that's also in the iteration mask. | ||
6043 | */ | ||
6044 | int group_balance_cpu(struct sched_group *sg) | ||
6045 | { | ||
6046 | return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); | ||
6047 | } | ||
6048 | |||
6026 | static int | 6049 | static int |
6027 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) | 6050 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) |
6028 | { | 6051 | { |
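A minimal standalone illustration (plain C, not kernel code) of what group_balance_cpu() changes: with overlapping groups several CPUs can instantiate a group over the same span, so "first CPU of the group" stops being a stable tie-breaker. Intersecting the group's CPUs with the mask built by build_group_mask() and letting the first CPU of that intersection act picks exactly one balancer per group. The bitmap representation and names below are local assumptions.

/* Illustrative sketch only: cpumasks modelled as unsigned long bitmaps. */
#include <stdio.h>

static int first_cpu_and(unsigned long cpus, unsigned long mask)
{
        unsigned long both = cpus & mask;

        return both ? __builtin_ctzl(both) : -1;        /* like cpumask_first_and() */
}

int main(void)
{
        unsigned long group_cpus = 0x3c;        /* CPUs 2-5 in the group span */
        unsigned long group_mask = 0x30;        /* only CPUs 4,5 may iterate this group */
        int this_cpu = 4;

        /* only the canonical balance CPU updates group-wide state */
        if (this_cpu == first_cpu_and(group_cpus, group_mask))
                printf("cpu%d updates this group's power\n", this_cpu);
        return 0;
}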
@@ -6041,6 +6064,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
6041 | if (cpumask_test_cpu(i, covered)) | 6064 | if (cpumask_test_cpu(i, covered)) |
6042 | continue; | 6065 | continue; |
6043 | 6066 | ||
6067 | child = *per_cpu_ptr(sdd->sd, i); | ||
6068 | |||
6069 | /* See the comment near build_group_mask(). */ | ||
6070 | if (!cpumask_test_cpu(i, sched_domain_span(child))) | ||
6071 | continue; | ||
6072 | |||
6044 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | 6073 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), |
6045 | GFP_KERNEL, cpu_to_node(cpu)); | 6074 | GFP_KERNEL, cpu_to_node(cpu)); |
6046 | 6075 | ||
@@ -6048,8 +6077,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
6048 | goto fail; | 6077 | goto fail; |
6049 | 6078 | ||
6050 | sg_span = sched_group_cpus(sg); | 6079 | sg_span = sched_group_cpus(sg); |
6051 | |||
6052 | child = *per_cpu_ptr(sdd->sd, i); | ||
6053 | if (child->child) { | 6080 | if (child->child) { |
6054 | child = child->child; | 6081 | child = child->child; |
6055 | cpumask_copy(sg_span, sched_domain_span(child)); | 6082 | cpumask_copy(sg_span, sched_domain_span(child)); |
@@ -6058,10 +6085,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
6058 | 6085 | ||
6059 | cpumask_or(covered, covered, sg_span); | 6086 | cpumask_or(covered, covered, sg_span); |
6060 | 6087 | ||
6061 | sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); | 6088 | sg->sgp = *per_cpu_ptr(sdd->sgp, i); |
6062 | atomic_inc(&sg->sgp->ref); | 6089 | if (atomic_inc_return(&sg->sgp->ref) == 1) |
6090 | build_group_mask(sd, sg); | ||
6063 | 6091 | ||
6064 | if (cpumask_test_cpu(cpu, sg_span)) | 6092 | /* |
6093 | * Initialize sgp->power such that even if we mess up the | ||
6094 | * domains and no possible iteration will get us here, we won't | ||
6095 | * die on a /0 trap. | ||
6096 | */ | ||
6097 | sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); | ||
6098 | |||
6099 | /* | ||
6100 | * Make sure the first group of this domain contains the | ||
6101 | * canonical balance cpu. Otherwise the sched_domain iteration | ||
6102 | * breaks. See update_sg_lb_stats(). | ||
6103 | */ | ||
6104 | if ((!groups && cpumask_test_cpu(cpu, sg_span)) || | ||
6105 | group_balance_cpu(sg) == cpu) | ||
6065 | groups = sg; | 6106 | groups = sg; |
6066 | 6107 | ||
6067 | if (!first) | 6108 | if (!first) |
@@ -6135,6 +6176,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) | |||
6135 | 6176 | ||
6136 | cpumask_clear(sched_group_cpus(sg)); | 6177 | cpumask_clear(sched_group_cpus(sg)); |
6137 | sg->sgp->power = 0; | 6178 | sg->sgp->power = 0; |
6179 | cpumask_setall(sched_group_mask(sg)); | ||
6138 | 6180 | ||
6139 | for_each_cpu(j, span) { | 6181 | for_each_cpu(j, span) { |
6140 | if (get_group(j, sdd, NULL) != group) | 6182 | if (get_group(j, sdd, NULL) != group) |
@@ -6176,7 +6218,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
6176 | sg = sg->next; | 6218 | sg = sg->next; |
6177 | } while (sg != sd->groups); | 6219 | } while (sg != sd->groups); |
6178 | 6220 | ||
6179 | if (cpu != group_first_cpu(sg)) | 6221 | if (cpu != group_balance_cpu(sg)) |
6180 | return; | 6222 | return; |
6181 | 6223 | ||
6182 | update_group_power(sd, cpu); | 6224 | update_group_power(sd, cpu); |
@@ -6211,10 +6253,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ | |||
6211 | } | 6253 | } |
6212 | 6254 | ||
6213 | SD_INIT_FUNC(CPU) | 6255 | SD_INIT_FUNC(CPU) |
6214 | #ifdef CONFIG_NUMA | ||
6215 | SD_INIT_FUNC(ALLNODES) | ||
6216 | SD_INIT_FUNC(NODE) | ||
6217 | #endif | ||
6218 | #ifdef CONFIG_SCHED_SMT | 6256 | #ifdef CONFIG_SCHED_SMT |
6219 | SD_INIT_FUNC(SIBLING) | 6257 | SD_INIT_FUNC(SIBLING) |
6220 | #endif | 6258 | #endif |
@@ -6230,11 +6268,8 @@ int sched_domain_level_max; | |||
6230 | 6268 | ||
6231 | static int __init setup_relax_domain_level(char *str) | 6269 | static int __init setup_relax_domain_level(char *str) |
6232 | { | 6270 | { |
6233 | unsigned long val; | 6271 | if (kstrtoint(str, 0, &default_relax_domain_level)) |
6234 | 6272 | pr_warn("Unable to set relax_domain_level\n"); | |
6235 | val = simple_strtoul(str, NULL, 0); | ||
6236 | if (val < sched_domain_level_max) | ||
6237 | default_relax_domain_level = val; | ||
6238 | 6273 | ||
6239 | return 1; | 6274 | return 1; |
6240 | } | 6275 | } |
@@ -6336,15 +6371,236 @@ static struct sched_domain_topology_level default_topology[] = { | |||
6336 | { sd_init_BOOK, cpu_book_mask, }, | 6371 | { sd_init_BOOK, cpu_book_mask, }, |
6337 | #endif | 6372 | #endif |
6338 | { sd_init_CPU, cpu_cpu_mask, }, | 6373 | { sd_init_CPU, cpu_cpu_mask, }, |
6339 | #ifdef CONFIG_NUMA | ||
6340 | { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, | ||
6341 | { sd_init_ALLNODES, cpu_allnodes_mask, }, | ||
6342 | #endif | ||
6343 | { NULL, }, | 6374 | { NULL, }, |
6344 | }; | 6375 | }; |
6345 | 6376 | ||
6346 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; | 6377 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; |
6347 | 6378 | ||
6379 | #ifdef CONFIG_NUMA | ||
6380 | |||
6381 | static int sched_domains_numa_levels; | ||
6382 | static int *sched_domains_numa_distance; | ||
6383 | static struct cpumask ***sched_domains_numa_masks; | ||
6384 | static int sched_domains_curr_level; | ||
6385 | |||
6386 | static inline int sd_local_flags(int level) | ||
6387 | { | ||
6388 | if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) | ||
6389 | return 0; | ||
6390 | |||
6391 | return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; | ||
6392 | } | ||
6393 | |||
6394 | static struct sched_domain * | ||
6395 | sd_numa_init(struct sched_domain_topology_level *tl, int cpu) | ||
6396 | { | ||
6397 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); | ||
6398 | int level = tl->numa_level; | ||
6399 | int sd_weight = cpumask_weight( | ||
6400 | sched_domains_numa_masks[level][cpu_to_node(cpu)]); | ||
6401 | |||
6402 | *sd = (struct sched_domain){ | ||
6403 | .min_interval = sd_weight, | ||
6404 | .max_interval = 2*sd_weight, | ||
6405 | .busy_factor = 32, | ||
6406 | .imbalance_pct = 125, | ||
6407 | .cache_nice_tries = 2, | ||
6408 | .busy_idx = 3, | ||
6409 | .idle_idx = 2, | ||
6410 | .newidle_idx = 0, | ||
6411 | .wake_idx = 0, | ||
6412 | .forkexec_idx = 0, | ||
6413 | |||
6414 | .flags = 1*SD_LOAD_BALANCE | ||
6415 | | 1*SD_BALANCE_NEWIDLE | ||
6416 | | 0*SD_BALANCE_EXEC | ||
6417 | | 0*SD_BALANCE_FORK | ||
6418 | | 0*SD_BALANCE_WAKE | ||
6419 | | 0*SD_WAKE_AFFINE | ||
6420 | | 0*SD_PREFER_LOCAL | ||
6421 | | 0*SD_SHARE_CPUPOWER | ||
6422 | | 0*SD_SHARE_PKG_RESOURCES | ||
6423 | | 1*SD_SERIALIZE | ||
6424 | | 0*SD_PREFER_SIBLING | ||
6425 | | sd_local_flags(level) | ||
6426 | , | ||
6427 | .last_balance = jiffies, | ||
6428 | .balance_interval = sd_weight, | ||
6429 | }; | ||
6430 | SD_INIT_NAME(sd, NUMA); | ||
6431 | sd->private = &tl->data; | ||
6432 | |||
6433 | /* | ||
6434 | * Ugly hack to pass state to sd_numa_mask()... | ||
6435 | */ | ||
6436 | sched_domains_curr_level = tl->numa_level; | ||
6437 | |||
6438 | return sd; | ||
6439 | } | ||
6440 | |||
6441 | static const struct cpumask *sd_numa_mask(int cpu) | ||
6442 | { | ||
6443 | return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; | ||
6444 | } | ||
6445 | |||
6446 | static void sched_numa_warn(const char *str) | ||
6447 | { | ||
6448 | static int done = false; | ||
6449 | int i,j; | ||
6450 | |||
6451 | if (done) | ||
6452 | return; | ||
6453 | |||
6454 | done = true; | ||
6455 | |||
6456 | printk(KERN_WARNING "ERROR: %s\n\n", str); | ||
6457 | |||
6458 | for (i = 0; i < nr_node_ids; i++) { | ||
6459 | printk(KERN_WARNING " "); | ||
6460 | for (j = 0; j < nr_node_ids; j++) | ||
6461 | printk(KERN_CONT "%02d ", node_distance(i,j)); | ||
6462 | printk(KERN_CONT "\n"); | ||
6463 | } | ||
6464 | printk(KERN_WARNING "\n"); | ||
6465 | } | ||
6466 | |||
6467 | static bool find_numa_distance(int distance) | ||
6468 | { | ||
6469 | int i; | ||
6470 | |||
6471 | if (distance == node_distance(0, 0)) | ||
6472 | return true; | ||
6473 | |||
6474 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
6475 | if (sched_domains_numa_distance[i] == distance) | ||
6476 | return true; | ||
6477 | } | ||
6478 | |||
6479 | return false; | ||
6480 | } | ||
6481 | |||
6482 | static void sched_init_numa(void) | ||
6483 | { | ||
6484 | int next_distance, curr_distance = node_distance(0, 0); | ||
6485 | struct sched_domain_topology_level *tl; | ||
6486 | int level = 0; | ||
6487 | int i, j, k; | ||
6488 | |||
6489 | sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); | ||
6490 | if (!sched_domains_numa_distance) | ||
6491 | return; | ||
6492 | |||
6493 | /* | ||
6494 | * O(nr_nodes^2) deduplicating selection sort -- in order to find the | ||
6495 | * unique distances in the node_distance() table. | ||
6496 | * | ||
6497 | * Assumes node_distance(0,j) includes all distances in | ||
6498 | * node_distance(i,j) in order to avoid cubic time. | ||
6499 | */ | ||
6500 | next_distance = curr_distance; | ||
6501 | for (i = 0; i < nr_node_ids; i++) { | ||
6502 | for (j = 0; j < nr_node_ids; j++) { | ||
6503 | for (k = 0; k < nr_node_ids; k++) { | ||
6504 | int distance = node_distance(i, k); | ||
6505 | |||
6506 | if (distance > curr_distance && | ||
6507 | (distance < next_distance || | ||
6508 | next_distance == curr_distance)) | ||
6509 | next_distance = distance; | ||
6510 | |||
6511 | /* | ||
6512 | * While not a strong assumption it would be nice to know | ||
6513 | * about cases where if node A is connected to B, B is not | ||
6514 | * equally connected to A. | ||
6515 | */ | ||
6516 | if (sched_debug() && node_distance(k, i) != distance) | ||
6517 | sched_numa_warn("Node-distance not symmetric"); | ||
6518 | |||
6519 | if (sched_debug() && i && !find_numa_distance(distance)) | ||
6520 | sched_numa_warn("Node-0 not representative"); | ||
6521 | } | ||
6522 | if (next_distance != curr_distance) { | ||
6523 | sched_domains_numa_distance[level++] = next_distance; | ||
6524 | sched_domains_numa_levels = level; | ||
6525 | curr_distance = next_distance; | ||
6526 | } else break; | ||
6527 | } | ||
6528 | |||
6529 | /* | ||
6530 | * In case of sched_debug() we verify the above assumption. | ||
6531 | */ | ||
6532 | if (!sched_debug()) | ||
6533 | break; | ||
6534 | } | ||
6535 | /* | ||
6536 | * 'level' contains the number of unique distances, excluding the | ||
6537 | * identity distance node_distance(i,i). | ||
6538 | * | ||
6539 | * The sched_domains_numa_distance[] array includes the actual distance | ||
6540 | * numbers. | ||
6541 | */ | ||
6542 | |||
6543 | sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); | ||
6544 | if (!sched_domains_numa_masks) | ||
6545 | return; | ||
6546 | |||
6547 | /* | ||
6548 | * Now for each level, construct a mask per node which contains all | ||
6549 | * cpus of nodes that are that many hops away from us. | ||
6550 | */ | ||
6551 | for (i = 0; i < level; i++) { | ||
6552 | sched_domains_numa_masks[i] = | ||
6553 | kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); | ||
6554 | if (!sched_domains_numa_masks[i]) | ||
6555 | return; | ||
6556 | |||
6557 | for (j = 0; j < nr_node_ids; j++) { | ||
6558 | struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); | ||
6559 | if (!mask) | ||
6560 | return; | ||
6561 | |||
6562 | sched_domains_numa_masks[i][j] = mask; | ||
6563 | |||
6564 | for (k = 0; k < nr_node_ids; k++) { | ||
6565 | if (node_distance(j, k) > sched_domains_numa_distance[i]) | ||
6566 | continue; | ||
6567 | |||
6568 | cpumask_or(mask, mask, cpumask_of_node(k)); | ||
6569 | } | ||
6570 | } | ||
6571 | } | ||
6572 | |||
6573 | tl = kzalloc((ARRAY_SIZE(default_topology) + level) * | ||
6574 | sizeof(struct sched_domain_topology_level), GFP_KERNEL); | ||
6575 | if (!tl) | ||
6576 | return; | ||
6577 | |||
6578 | /* | ||
6579 | * Copy the default topology bits.. | ||
6580 | */ | ||
6581 | for (i = 0; default_topology[i].init; i++) | ||
6582 | tl[i] = default_topology[i]; | ||
6583 | |||
6584 | /* | ||
6585 | * .. and append 'j' levels of NUMA goodness. | ||
6586 | */ | ||
6587 | for (j = 0; j < level; i++, j++) { | ||
6588 | tl[i] = (struct sched_domain_topology_level){ | ||
6589 | .init = sd_numa_init, | ||
6590 | .mask = sd_numa_mask, | ||
6591 | .flags = SDTL_OVERLAP, | ||
6592 | .numa_level = j, | ||
6593 | }; | ||
6594 | } | ||
6595 | |||
6596 | sched_domain_topology = tl; | ||
6597 | } | ||
6598 | #else | ||
6599 | static inline void sched_init_numa(void) | ||
6600 | { | ||
6601 | } | ||
6602 | #endif /* CONFIG_NUMA */ | ||
6603 | |||
6348 | static int __sdt_alloc(const struct cpumask *cpu_map) | 6604 | static int __sdt_alloc(const struct cpumask *cpu_map) |
6349 | { | 6605 | { |
6350 | struct sched_domain_topology_level *tl; | 6606 | struct sched_domain_topology_level *tl; |
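For readers following sched_init_numa() above, a compact userspace sketch of the unique-distance extraction it performs, one distance per additional NUMA topology level. This is a simplification: the kernel interleaves the search with the level recording and relies on node 0's row being representative. The 4-node distance table is invented for the example.

/* Illustrative sketch only -- not the kernel's sched_init_numa(). */
#include <stdio.h>

#define NR_NODES 4

static const int dist[NR_NODES][NR_NODES] = {
        { 10, 20, 20, 30 },
        { 20, 10, 20, 30 },
        { 20, 20, 10, 30 },
        { 30, 30, 30, 10 },
};

int main(void)
{
        int levels[NR_NODES * NR_NODES];
        int curr = dist[0][0], next;
        int level = 0, i, j;

        /* repeatedly pick the smallest distance strictly above 'curr' */
        for (;;) {
                next = curr;
                for (i = 0; i < NR_NODES; i++)
                        for (j = 0; j < NR_NODES; j++)
                                if (dist[i][j] > curr &&
                                    (dist[i][j] < next || next == curr))
                                        next = dist[i][j];
                if (next == curr)
                        break;
                levels[level++] = next;
                curr = next;
        }

        for (i = 0; i < level; i++)
                printf("NUMA level %d: distance %d\n", i, levels[i]);
        return 0;       /* prints 20 then 30: two levels beyond the local node */
}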
@@ -6382,9 +6638,11 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
6382 | if (!sg) | 6638 | if (!sg) |
6383 | return -ENOMEM; | 6639 | return -ENOMEM; |
6384 | 6640 | ||
6641 | sg->next = sg; | ||
6642 | |||
6385 | *per_cpu_ptr(sdd->sg, j) = sg; | 6643 | *per_cpu_ptr(sdd->sg, j) = sg; |
6386 | 6644 | ||
6387 | sgp = kzalloc_node(sizeof(struct sched_group_power), | 6645 | sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), |
6388 | GFP_KERNEL, cpu_to_node(j)); | 6646 | GFP_KERNEL, cpu_to_node(j)); |
6389 | if (!sgp) | 6647 | if (!sgp) |
6390 | return -ENOMEM; | 6648 | return -ENOMEM; |
@@ -6405,16 +6663,26 @@ static void __sdt_free(const struct cpumask *cpu_map) | |||
6405 | struct sd_data *sdd = &tl->data; | 6663 | struct sd_data *sdd = &tl->data; |
6406 | 6664 | ||
6407 | for_each_cpu(j, cpu_map) { | 6665 | for_each_cpu(j, cpu_map) { |
6408 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); | 6666 | struct sched_domain *sd; |
6409 | if (sd && (sd->flags & SD_OVERLAP)) | 6667 | |
6410 | free_sched_groups(sd->groups, 0); | 6668 | if (sdd->sd) { |
6411 | kfree(*per_cpu_ptr(sdd->sd, j)); | 6669 | sd = *per_cpu_ptr(sdd->sd, j); |
6412 | kfree(*per_cpu_ptr(sdd->sg, j)); | 6670 | if (sd && (sd->flags & SD_OVERLAP)) |
6413 | kfree(*per_cpu_ptr(sdd->sgp, j)); | 6671 | free_sched_groups(sd->groups, 0); |
6672 | kfree(*per_cpu_ptr(sdd->sd, j)); | ||
6673 | } | ||
6674 | |||
6675 | if (sdd->sg) | ||
6676 | kfree(*per_cpu_ptr(sdd->sg, j)); | ||
6677 | if (sdd->sgp) | ||
6678 | kfree(*per_cpu_ptr(sdd->sgp, j)); | ||
6414 | } | 6679 | } |
6415 | free_percpu(sdd->sd); | 6680 | free_percpu(sdd->sd); |
6681 | sdd->sd = NULL; | ||
6416 | free_percpu(sdd->sg); | 6682 | free_percpu(sdd->sg); |
6683 | sdd->sg = NULL; | ||
6417 | free_percpu(sdd->sgp); | 6684 | free_percpu(sdd->sgp); |
6685 | sdd->sgp = NULL; | ||
6418 | } | 6686 | } |
6419 | } | 6687 | } |
6420 | 6688 | ||
@@ -6427,7 +6695,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
6427 | if (!sd) | 6695 | if (!sd) |
6428 | return child; | 6696 | return child; |
6429 | 6697 | ||
6430 | set_domain_attribute(sd, attr); | ||
6431 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | 6698 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); |
6432 | if (child) { | 6699 | if (child) { |
6433 | sd->level = child->level + 1; | 6700 | sd->level = child->level + 1; |
@@ -6435,6 +6702,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
6435 | child->parent = sd; | 6702 | child->parent = sd; |
6436 | } | 6703 | } |
6437 | sd->child = child; | 6704 | sd->child = child; |
6705 | set_domain_attribute(sd, attr); | ||
6438 | 6706 | ||
6439 | return sd; | 6707 | return sd; |
6440 | } | 6708 | } |
@@ -6575,7 +6843,6 @@ static int init_sched_domains(const struct cpumask *cpu_map) | |||
6575 | if (!doms_cur) | 6843 | if (!doms_cur) |
6576 | doms_cur = &fallback_doms; | 6844 | doms_cur = &fallback_doms; |
6577 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); | 6845 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); |
6578 | dattr_cur = NULL; | ||
6579 | err = build_sched_domains(doms_cur[0], NULL); | 6846 | err = build_sched_domains(doms_cur[0], NULL); |
6580 | register_sched_domain_sysctl(); | 6847 | register_sched_domain_sysctl(); |
6581 | 6848 | ||
@@ -6700,97 +6967,6 @@ match2: | |||
6700 | mutex_unlock(&sched_domains_mutex); | 6967 | mutex_unlock(&sched_domains_mutex); |
6701 | } | 6968 | } |
6702 | 6969 | ||
6703 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
6704 | static void reinit_sched_domains(void) | ||
6705 | { | ||
6706 | get_online_cpus(); | ||
6707 | |||
6708 | /* Destroy domains first to force the rebuild */ | ||
6709 | partition_sched_domains(0, NULL, NULL); | ||
6710 | |||
6711 | rebuild_sched_domains(); | ||
6712 | put_online_cpus(); | ||
6713 | } | ||
6714 | |||
6715 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | ||
6716 | { | ||
6717 | unsigned int level = 0; | ||
6718 | |||
6719 | if (sscanf(buf, "%u", &level) != 1) | ||
6720 | return -EINVAL; | ||
6721 | |||
6722 | /* | ||
6723 | * level is always be positive so don't check for | ||
6724 | * level < POWERSAVINGS_BALANCE_NONE which is 0 | ||
6725 | * What happens on 0 or 1 byte write, | ||
6726 | * need to check for count as well? | ||
6727 | */ | ||
6728 | |||
6729 | if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) | ||
6730 | return -EINVAL; | ||
6731 | |||
6732 | if (smt) | ||
6733 | sched_smt_power_savings = level; | ||
6734 | else | ||
6735 | sched_mc_power_savings = level; | ||
6736 | |||
6737 | reinit_sched_domains(); | ||
6738 | |||
6739 | return count; | ||
6740 | } | ||
6741 | |||
6742 | #ifdef CONFIG_SCHED_MC | ||
6743 | static ssize_t sched_mc_power_savings_show(struct device *dev, | ||
6744 | struct device_attribute *attr, | ||
6745 | char *buf) | ||
6746 | { | ||
6747 | return sprintf(buf, "%u\n", sched_mc_power_savings); | ||
6748 | } | ||
6749 | static ssize_t sched_mc_power_savings_store(struct device *dev, | ||
6750 | struct device_attribute *attr, | ||
6751 | const char *buf, size_t count) | ||
6752 | { | ||
6753 | return sched_power_savings_store(buf, count, 0); | ||
6754 | } | ||
6755 | static DEVICE_ATTR(sched_mc_power_savings, 0644, | ||
6756 | sched_mc_power_savings_show, | ||
6757 | sched_mc_power_savings_store); | ||
6758 | #endif | ||
6759 | |||
6760 | #ifdef CONFIG_SCHED_SMT | ||
6761 | static ssize_t sched_smt_power_savings_show(struct device *dev, | ||
6762 | struct device_attribute *attr, | ||
6763 | char *buf) | ||
6764 | { | ||
6765 | return sprintf(buf, "%u\n", sched_smt_power_savings); | ||
6766 | } | ||
6767 | static ssize_t sched_smt_power_savings_store(struct device *dev, | ||
6768 | struct device_attribute *attr, | ||
6769 | const char *buf, size_t count) | ||
6770 | { | ||
6771 | return sched_power_savings_store(buf, count, 1); | ||
6772 | } | ||
6773 | static DEVICE_ATTR(sched_smt_power_savings, 0644, | ||
6774 | sched_smt_power_savings_show, | ||
6775 | sched_smt_power_savings_store); | ||
6776 | #endif | ||
6777 | |||
6778 | int __init sched_create_sysfs_power_savings_entries(struct device *dev) | ||
6779 | { | ||
6780 | int err = 0; | ||
6781 | |||
6782 | #ifdef CONFIG_SCHED_SMT | ||
6783 | if (smt_capable()) | ||
6784 | err = device_create_file(dev, &dev_attr_sched_smt_power_savings); | ||
6785 | #endif | ||
6786 | #ifdef CONFIG_SCHED_MC | ||
6787 | if (!err && mc_capable()) | ||
6788 | err = device_create_file(dev, &dev_attr_sched_mc_power_savings); | ||
6789 | #endif | ||
6790 | return err; | ||
6791 | } | ||
6792 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
6793 | |||
6794 | /* | 6970 | /* |
6795 | * Update cpusets according to cpu_active mask. If cpusets are | 6971 | * Update cpusets according to cpu_active mask. If cpusets are |
6796 | * disabled, cpuset_update_active_cpus() becomes a simple wrapper | 6972 | * disabled, cpuset_update_active_cpus() becomes a simple wrapper |
@@ -6828,6 +7004,8 @@ void __init sched_init_smp(void) | |||
6828 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); | 7004 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); |
6829 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); | 7005 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); |
6830 | 7006 | ||
7007 | sched_init_numa(); | ||
7008 | |||
6831 | get_online_cpus(); | 7009 | get_online_cpus(); |
6832 | mutex_lock(&sched_domains_mutex); | 7010 | mutex_lock(&sched_domains_mutex); |
6833 | init_sched_domains(cpu_active_mask); | 7011 | init_sched_domains(cpu_active_mask); |
@@ -7049,6 +7227,7 @@ void __init sched_init(void) | |||
7049 | /* May be allocated at isolcpus cmdline parse time */ | 7227 | /* May be allocated at isolcpus cmdline parse time */ |
7050 | if (cpu_isolated_map == NULL) | 7228 | if (cpu_isolated_map == NULL) |
7051 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 7229 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
7230 | idle_thread_set_boot_cpu(); | ||
7052 | #endif | 7231 | #endif |
7053 | init_sched_fair_class(); | 7232 | init_sched_fair_class(); |
7054 | 7233 | ||
@@ -7970,13 +8149,9 @@ static struct cftype cpu_files[] = { | |||
7970 | .write_u64 = cpu_rt_period_write_uint, | 8149 | .write_u64 = cpu_rt_period_write_uint, |
7971 | }, | 8150 | }, |
7972 | #endif | 8151 | #endif |
8152 | { } /* terminate */ | ||
7973 | }; | 8153 | }; |
7974 | 8154 | ||
7975 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) | ||
7976 | { | ||
7977 | return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); | ||
7978 | } | ||
7979 | |||
7980 | struct cgroup_subsys cpu_cgroup_subsys = { | 8155 | struct cgroup_subsys cpu_cgroup_subsys = { |
7981 | .name = "cpu", | 8156 | .name = "cpu", |
7982 | .create = cpu_cgroup_create, | 8157 | .create = cpu_cgroup_create, |
@@ -7984,8 +8159,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
7984 | .can_attach = cpu_cgroup_can_attach, | 8159 | .can_attach = cpu_cgroup_can_attach, |
7985 | .attach = cpu_cgroup_attach, | 8160 | .attach = cpu_cgroup_attach, |
7986 | .exit = cpu_cgroup_exit, | 8161 | .exit = cpu_cgroup_exit, |
7987 | .populate = cpu_cgroup_populate, | ||
7988 | .subsys_id = cpu_cgroup_subsys_id, | 8162 | .subsys_id = cpu_cgroup_subsys_id, |
8163 | .base_cftypes = cpu_files, | ||
7989 | .early_init = 1, | 8164 | .early_init = 1, |
7990 | }; | 8165 | }; |
7991 | 8166 | ||
@@ -8170,13 +8345,9 @@ static struct cftype files[] = { | |||
8170 | .name = "stat", | 8345 | .name = "stat", |
8171 | .read_map = cpuacct_stats_show, | 8346 | .read_map = cpuacct_stats_show, |
8172 | }, | 8347 | }, |
8348 | { } /* terminate */ | ||
8173 | }; | 8349 | }; |
8174 | 8350 | ||
8175 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) | ||
8176 | { | ||
8177 | return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); | ||
8178 | } | ||
8179 | |||
8180 | /* | 8351 | /* |
8181 | * charge this task's execution time to its accounting group. | 8352 | * charge this task's execution time to its accounting group. |
8182 | * | 8353 | * |
@@ -8208,7 +8379,7 @@ struct cgroup_subsys cpuacct_subsys = { | |||
8208 | .name = "cpuacct", | 8379 | .name = "cpuacct", |
8209 | .create = cpuacct_create, | 8380 | .create = cpuacct_create, |
8210 | .destroy = cpuacct_destroy, | 8381 | .destroy = cpuacct_destroy, |
8211 | .populate = cpuacct_populate, | ||
8212 | .subsys_id = cpuacct_subsys_id, | 8382 | .subsys_id = cpuacct_subsys_id, |
8383 | .base_cftypes = files, | ||
8213 | }; | 8384 | }; |
8214 | #endif /* CONFIG_CGROUP_CPUACCT */ | 8385 | #endif /* CONFIG_CGROUP_CPUACCT */ |
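Both cgroup hunks above switch from a ->populate() callback to a sentinel-terminated cftype array handed over via .base_cftypes, which the cgroup core walks on its own. A minimal standalone sketch of that array-walking pattern; the struct and file names below are made up for the example.

/* Illustrative sketch only -- not the cgroup API itself. */
#include <stdio.h>

struct cftype_like {
        const char *name;       /* NULL name marks the terminating entry */
};

static struct cftype_like cpu_files_example[] = {
        { .name = "shares" },
        { .name = "rt_runtime_us" },
        { .name = "rt_period_us" },
        { }                     /* terminate */
};

static void register_base_files(const struct cftype_like *cft)
{
        for (; cft->name; cft++)        /* walk until the empty sentinel */
                printf("registering control file: %s\n", cft->name);
}

int main(void)
{
        register_base_files(cpu_files_example);
        return 0;
}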
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 09acaa15161d..6f79596e0ea9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -202,7 +202,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
202 | SPLIT_NS(spread0)); | 202 | SPLIT_NS(spread0)); |
203 | SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", | 203 | SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", |
204 | cfs_rq->nr_spread_over); | 204 | cfs_rq->nr_spread_over); |
205 | SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); | 205 | SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); |
206 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | 206 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); |
207 | #ifdef CONFIG_FAIR_GROUP_SCHED | 207 | #ifdef CONFIG_FAIR_GROUP_SCHED |
208 | #ifdef CONFIG_SMP | 208 | #ifdef CONFIG_SMP |
@@ -260,8 +260,14 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
260 | SEQ_printf(m, "\ncpu#%d\n", cpu); | 260 | SEQ_printf(m, "\ncpu#%d\n", cpu); |
261 | #endif | 261 | #endif |
262 | 262 | ||
263 | #define P(x) \ | 263 | #define P(x) \ |
264 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) | 264 | do { \ |
265 | if (sizeof(rq->x) == 4) \ | ||
266 | SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \ | ||
267 | else \ | ||
268 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\ | ||
269 | } while (0) | ||
270 | |||
265 | #define PN(x) \ | 271 | #define PN(x) \ |
266 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) | 272 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) |
267 | 273 | ||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0d97ebdc58f0..c099cc6eebe3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -784,7 +784,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
784 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); | 784 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); |
785 | #ifdef CONFIG_SMP | 785 | #ifdef CONFIG_SMP |
786 | if (entity_is_task(se)) | 786 | if (entity_is_task(se)) |
787 | list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); | 787 | list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); |
788 | #endif | 788 | #endif |
789 | cfs_rq->nr_running++; | 789 | cfs_rq->nr_running++; |
790 | } | 790 | } |
@@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
2703 | int want_sd = 1; | 2703 | int want_sd = 1; |
2704 | int sync = wake_flags & WF_SYNC; | 2704 | int sync = wake_flags & WF_SYNC; |
2705 | 2705 | ||
2706 | if (p->rt.nr_cpus_allowed == 1) | 2706 | if (p->nr_cpus_allowed == 1) |
2707 | return prev_cpu; | 2707 | return prev_cpu; |
2708 | 2708 | ||
2709 | if (sd_flag & SD_BALANCE_WAKE) { | 2709 | if (sd_flag & SD_BALANCE_WAKE) { |
@@ -2721,7 +2721,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
2721 | * If power savings logic is enabled for a domain, see if we | 2721 | * If power savings logic is enabled for a domain, see if we |
2722 | * are not overloaded, if so, don't balance wider. | 2722 | * are not overloaded, if so, don't balance wider. |
2723 | */ | 2723 | */ |
2724 | if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { | 2724 | if (tmp->flags & (SD_PREFER_LOCAL)) { |
2725 | unsigned long power = 0; | 2725 | unsigned long power = 0; |
2726 | unsigned long nr_running = 0; | 2726 | unsigned long nr_running = 0; |
2727 | unsigned long capacity; | 2727 | unsigned long capacity; |
@@ -2734,9 +2734,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
2734 | 2734 | ||
2735 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); | 2735 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); |
2736 | 2736 | ||
2737 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | ||
2738 | nr_running /= 2; | ||
2739 | |||
2740 | if (nr_running < capacity) | 2737 | if (nr_running < capacity) |
2741 | want_sd = 0; | 2738 | want_sd = 0; |
2742 | } | 2739 | } |
@@ -3082,7 +3079,7 @@ struct lb_env { | |||
3082 | struct rq *dst_rq; | 3079 | struct rq *dst_rq; |
3083 | 3080 | ||
3084 | enum cpu_idle_type idle; | 3081 | enum cpu_idle_type idle; |
3085 | long load_move; | 3082 | long imbalance; |
3086 | unsigned int flags; | 3083 | unsigned int flags; |
3087 | 3084 | ||
3088 | unsigned int loop; | 3085 | unsigned int loop; |
@@ -3215,8 +3212,10 @@ static int move_one_task(struct lb_env *env) | |||
3215 | 3212 | ||
3216 | static unsigned long task_h_load(struct task_struct *p); | 3213 | static unsigned long task_h_load(struct task_struct *p); |
3217 | 3214 | ||
3215 | static const unsigned int sched_nr_migrate_break = 32; | ||
3216 | |||
3218 | /* | 3217 | /* |
3219 | * move_tasks tries to move up to load_move weighted load from busiest to | 3218 | * move_tasks tries to move up to imbalance weighted load from busiest to |
3220 | * this_rq, as part of a balancing operation within domain "sd". | 3219 | * this_rq, as part of a balancing operation within domain "sd". |
3221 | * Returns 1 if successful and 0 otherwise. | 3220 | * Returns 1 if successful and 0 otherwise. |
3222 | * | 3221 | * |
@@ -3229,7 +3228,7 @@ static int move_tasks(struct lb_env *env) | |||
3229 | unsigned long load; | 3228 | unsigned long load; |
3230 | int pulled = 0; | 3229 | int pulled = 0; |
3231 | 3230 | ||
3232 | if (env->load_move <= 0) | 3231 | if (env->imbalance <= 0) |
3233 | return 0; | 3232 | return 0; |
3234 | 3233 | ||
3235 | while (!list_empty(tasks)) { | 3234 | while (!list_empty(tasks)) { |
@@ -3242,7 +3241,7 @@ static int move_tasks(struct lb_env *env) | |||
3242 | 3241 | ||
3243 | /* take a breather every nr_migrate tasks */ | 3242 | /* take a breather every nr_migrate tasks */ |
3244 | if (env->loop > env->loop_break) { | 3243 | if (env->loop > env->loop_break) { |
3245 | env->loop_break += sysctl_sched_nr_migrate; | 3244 | env->loop_break += sched_nr_migrate_break; |
3246 | env->flags |= LBF_NEED_BREAK; | 3245 | env->flags |= LBF_NEED_BREAK; |
3247 | break; | 3246 | break; |
3248 | } | 3247 | } |
@@ -3252,10 +3251,10 @@ static int move_tasks(struct lb_env *env) | |||
3252 | 3251 | ||
3253 | load = task_h_load(p); | 3252 | load = task_h_load(p); |
3254 | 3253 | ||
3255 | if (load < 16 && !env->sd->nr_balance_failed) | 3254 | if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) |
3256 | goto next; | 3255 | goto next; |
3257 | 3256 | ||
3258 | if ((load / 2) > env->load_move) | 3257 | if ((load / 2) > env->imbalance) |
3259 | goto next; | 3258 | goto next; |
3260 | 3259 | ||
3261 | if (!can_migrate_task(p, env)) | 3260 | if (!can_migrate_task(p, env)) |
@@ -3263,7 +3262,7 @@ static int move_tasks(struct lb_env *env) | |||
3263 | 3262 | ||
3264 | move_task(p, env); | 3263 | move_task(p, env); |
3265 | pulled++; | 3264 | pulled++; |
3266 | env->load_move -= load; | 3265 | env->imbalance -= load; |
3267 | 3266 | ||
3268 | #ifdef CONFIG_PREEMPT | 3267 | #ifdef CONFIG_PREEMPT |
3269 | /* | 3268 | /* |
@@ -3279,7 +3278,7 @@ static int move_tasks(struct lb_env *env) | |||
3279 | * We only want to steal up to the prescribed amount of | 3278 | * We only want to steal up to the prescribed amount of |
3280 | * weighted load. | 3279 | * weighted load. |
3281 | */ | 3280 | */ |
3282 | if (env->load_move <= 0) | 3281 | if (env->imbalance <= 0) |
3283 | break; | 3282 | break; |
3284 | 3283 | ||
3285 | continue; | 3284 | continue; |
@@ -3433,14 +3432,6 @@ struct sd_lb_stats { | |||
3433 | unsigned int busiest_group_weight; | 3432 | unsigned int busiest_group_weight; |
3434 | 3433 | ||
3435 | int group_imb; /* Is there imbalance in this sd */ | 3434 | int group_imb; /* Is there imbalance in this sd */ |
3436 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
3437 | int power_savings_balance; /* Is powersave balance needed for this sd */ | ||
3438 | struct sched_group *group_min; /* Least loaded group in sd */ | ||
3439 | struct sched_group *group_leader; /* Group which relieves group_min */ | ||
3440 | unsigned long min_load_per_task; /* load_per_task in group_min */ | ||
3441 | unsigned long leader_nr_running; /* Nr running of group_leader */ | ||
3442 | unsigned long min_nr_running; /* Nr running of group_min */ | ||
3443 | #endif | ||
3444 | }; | 3435 | }; |
3445 | 3436 | ||
3446 | /* | 3437 | /* |
@@ -3484,148 +3475,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, | |||
3484 | return load_idx; | 3475 | return load_idx; |
3485 | } | 3476 | } |
3486 | 3477 | ||
3487 | |||
3488 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
3489 | /** | ||
3490 | * init_sd_power_savings_stats - Initialize power savings statistics for | ||
3491 | * the given sched_domain, during load balancing. | ||
3492 | * | ||
3493 | * @sd: Sched domain whose power-savings statistics are to be initialized. | ||
3494 | * @sds: Variable containing the statistics for sd. | ||
3495 | * @idle: Idle status of the CPU at which we're performing load-balancing. | ||
3496 | */ | ||
3497 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
3498 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
3499 | { | ||
3500 | /* | ||
3501 | * Busy processors will not participate in power savings | ||
3502 | * balance. | ||
3503 | */ | ||
3504 | if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
3505 | sds->power_savings_balance = 0; | ||
3506 | else { | ||
3507 | sds->power_savings_balance = 1; | ||
3508 | sds->min_nr_running = ULONG_MAX; | ||
3509 | sds->leader_nr_running = 0; | ||
3510 | } | ||
3511 | } | ||
3512 | |||
3513 | /** | ||
3514 | * update_sd_power_savings_stats - Update the power saving stats for a | ||
3515 | * sched_domain while performing load balancing. | ||
3516 | * | ||
3517 | * @group: sched_group belonging to the sched_domain under consideration. | ||
3518 | * @sds: Variable containing the statistics of the sched_domain | ||
3519 | * @local_group: Does group contain the CPU for which we're performing | ||
3520 | * load balancing ? | ||
3521 | * @sgs: Variable containing the statistics of the group. | ||
3522 | */ | ||
3523 | static inline void update_sd_power_savings_stats(struct sched_group *group, | ||
3524 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) | ||
3525 | { | ||
3526 | |||
3527 | if (!sds->power_savings_balance) | ||
3528 | return; | ||
3529 | |||
3530 | /* | ||
3531 | * If the local group is idle or completely loaded | ||
3532 | * no need to do power savings balance at this domain | ||
3533 | */ | ||
3534 | if (local_group && (sds->this_nr_running >= sgs->group_capacity || | ||
3535 | !sds->this_nr_running)) | ||
3536 | sds->power_savings_balance = 0; | ||
3537 | |||
3538 | /* | ||
3539 | * If a group is already running at full capacity or idle, | ||
3540 | * don't include that group in power savings calculations | ||
3541 | */ | ||
3542 | if (!sds->power_savings_balance || | ||
3543 | sgs->sum_nr_running >= sgs->group_capacity || | ||
3544 | !sgs->sum_nr_running) | ||
3545 | return; | ||
3546 | |||
3547 | /* | ||
3548 | * Calculate the group which has the least non-idle load. | ||
3549 | * This is the group from where we need to pick up the load | ||
3550 | * for saving power | ||
3551 | */ | ||
3552 | if ((sgs->sum_nr_running < sds->min_nr_running) || | ||
3553 | (sgs->sum_nr_running == sds->min_nr_running && | ||
3554 | group_first_cpu(group) > group_first_cpu(sds->group_min))) { | ||
3555 | sds->group_min = group; | ||
3556 | sds->min_nr_running = sgs->sum_nr_running; | ||
3557 | sds->min_load_per_task = sgs->sum_weighted_load / | ||
3558 | sgs->sum_nr_running; | ||
3559 | } | ||
3560 | |||
3561 | /* | ||
3562 | * Calculate the group which is almost near its | ||
3563 | * capacity but still has some space to pick up some load | ||
3564 | * from other group and save more power | ||
3565 | */ | ||
3566 | if (sgs->sum_nr_running + 1 > sgs->group_capacity) | ||
3567 | return; | ||
3568 | |||
3569 | if (sgs->sum_nr_running > sds->leader_nr_running || | ||
3570 | (sgs->sum_nr_running == sds->leader_nr_running && | ||
3571 | group_first_cpu(group) < group_first_cpu(sds->group_leader))) { | ||
3572 | sds->group_leader = group; | ||
3573 | sds->leader_nr_running = sgs->sum_nr_running; | ||
3574 | } | ||
3575 | } | ||
3576 | |||
3577 | /** | ||
3578 | * check_power_save_busiest_group - see if there is potential for some power-savings balance | ||
3579 | * @sds: Variable containing the statistics of the sched_domain | ||
3580 | * under consideration. | ||
3581 | * @this_cpu: Cpu at which we're currently performing load-balancing. | ||
3582 | * @imbalance: Variable to store the imbalance. | ||
3583 | * | ||
3584 | * Description: | ||
3585 | * Check if we have potential to perform some power-savings balance. | ||
3586 | * If yes, set the busiest group to be the least loaded group in the | ||
3587 | * sched_domain, so that it's CPUs can be put to idle. | ||
3588 | * | ||
3589 | * Returns 1 if there is potential to perform power-savings balance. | ||
3590 | * Else returns 0. | ||
3591 | */ | ||
3592 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
3593 | int this_cpu, unsigned long *imbalance) | ||
3594 | { | ||
3595 | if (!sds->power_savings_balance) | ||
3596 | return 0; | ||
3597 | |||
3598 | if (sds->this != sds->group_leader || | ||
3599 | sds->group_leader == sds->group_min) | ||
3600 | return 0; | ||
3601 | |||
3602 | *imbalance = sds->min_load_per_task; | ||
3603 | sds->busiest = sds->group_min; | ||
3604 | |||
3605 | return 1; | ||
3606 | |||
3607 | } | ||
3608 | #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
3609 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
3610 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
3611 | { | ||
3612 | return; | ||
3613 | } | ||
3614 | |||
3615 | static inline void update_sd_power_savings_stats(struct sched_group *group, | ||
3616 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) | ||
3617 | { | ||
3618 | return; | ||
3619 | } | ||
3620 | |||
3621 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
3622 | int this_cpu, unsigned long *imbalance) | ||
3623 | { | ||
3624 | return 0; | ||
3625 | } | ||
3626 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
3627 | |||
3628 | |||
3629 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | 3478 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) |
3630 | { | 3479 | { |
3631 | return SCHED_POWER_SCALE; | 3480 | return SCHED_POWER_SCALE; |
@@ -3654,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | |||
3654 | unsigned long scale_rt_power(int cpu) | 3503 | unsigned long scale_rt_power(int cpu) |
3655 | { | 3504 | { |
3656 | struct rq *rq = cpu_rq(cpu); | 3505 | struct rq *rq = cpu_rq(cpu); |
3657 | u64 total, available; | 3506 | u64 total, available, age_stamp, avg; |
3507 | |||
3508 | /* | ||
3509 | * Since we're reading these variables without serialization, make sure | ||
3510 | * we read them once before doing sanity checks on them. | ||
3511 | */ | ||
3512 | age_stamp = ACCESS_ONCE(rq->age_stamp); | ||
3513 | avg = ACCESS_ONCE(rq->rt_avg); | ||
3658 | 3514 | ||
3659 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | 3515 | total = sched_avg_period() + (rq->clock - age_stamp); |
3660 | 3516 | ||
3661 | if (unlikely(total < rq->rt_avg)) { | 3517 | if (unlikely(total < avg)) { |
3662 | /* Ensures that power won't end up being negative */ | 3518 | /* Ensures that power won't end up being negative */ |
3663 | available = 0; | 3519 | available = 0; |
3664 | } else { | 3520 | } else { |
3665 | available = total - rq->rt_avg; | 3521 | available = total - avg; |
3666 | } | 3522 | } |
3667 | 3523 | ||
3668 | if (unlikely((s64)total < SCHED_POWER_SCALE)) | 3524 | if (unlikely((s64)total < SCHED_POWER_SCALE)) |
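The two ACCESS_ONCE() reads added above snapshot rq->age_stamp and rq->rt_avg before any arithmetic, so the underflow check and the subtraction see the same values even though nothing serializes against concurrent updaters. A minimal self-contained sketch of the same read-once idiom, in plain userspace C with hypothetical names (ACCESS_ONCE is approximated here by a volatile cast, which is essentially what the kernel macro expands to):

#include <stdio.h>
#include <stdint.h>

/* Rough stand-in for the kernel's ACCESS_ONCE(): force a single volatile read. */
#define READ_ONCE_U64(x)	(*(volatile uint64_t *)&(x))

static uint64_t age_stamp;	/* updated concurrently, without locking */
static uint64_t rt_avg;

static uint64_t rt_headroom(uint64_t period, uint64_t clock)
{
	/* Snapshot both fields once, then reason only about the local copies. */
	uint64_t stamp = READ_ONCE_U64(age_stamp);
	uint64_t avg   = READ_ONCE_U64(rt_avg);
	uint64_t total = period + (clock - stamp);

	return total < avg ? 0 : total - avg;	/* clamp, never "negative" */
}

int main(void)
{
	age_stamp = 100; rt_avg = 300;		/* illustrative values */
	printf("%llu\n", (unsigned long long)rt_headroom(1000, 150));	/* 750 */
	return 0;
}

Without the snapshots, a second read of rt_avg between the comparison and the subtraction could observe a larger value and wrap the unsigned result.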
@@ -3725,13 +3581,28 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
3725 | 3581 | ||
3726 | power = 0; | 3582 | power = 0; |
3727 | 3583 | ||
3728 | group = child->groups; | 3584 | if (child->flags & SD_OVERLAP) { |
3729 | do { | 3585 | /* |
3730 | power += group->sgp->power; | 3586 | * SD_OVERLAP domains cannot assume that child groups |
3731 | group = group->next; | 3587 | * span the current group. |
3732 | } while (group != child->groups); | 3588 | */ |
3733 | 3589 | ||
3734 | sdg->sgp->power = power; | 3590 | for_each_cpu(cpu, sched_group_cpus(sdg)) |
3591 | power += power_of(cpu); | ||
3592 | } else { | ||
3593 | /* | ||
3594 | * !SD_OVERLAP domains can assume that child groups | ||
3595 | * span the current group. | ||
3596 | */ | ||
3597 | |||
3598 | group = child->groups; | ||
3599 | do { | ||
3600 | power += group->sgp->power; | ||
3601 | group = group->next; | ||
3602 | } while (group != child->groups); | ||
3603 | } | ||
3604 | |||
3605 | sdg->sgp->power_orig = sdg->sgp->power = power; | ||
3735 | } | 3606 | } |
3736 | 3607 | ||
3737 | /* | 3608 | /* |
@@ -3761,41 +3632,43 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
3761 | 3632 | ||
3762 | /** | 3633 | /** |
3763 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 3634 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
3764 | * @sd: The sched_domain whose statistics are to be updated. | 3635 | * @env: The load balancing environment. |
3765 | * @group: sched_group whose statistics are to be updated. | 3636 | * @group: sched_group whose statistics are to be updated. |
3766 | * @this_cpu: Cpu for which load balance is currently performed. | ||
3767 | * @idle: Idle status of this_cpu | ||
3768 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | 3637 | * @load_idx: Load index of sched_domain of this_cpu for load calc. |
3769 | * @local_group: Does group contain this_cpu. | 3638 | * @local_group: Does group contain this_cpu. |
3770 | * @cpus: Set of cpus considered for load balancing. | 3639 | * @cpus: Set of cpus considered for load balancing. |
3771 | * @balance: Should we balance. | 3640 | * @balance: Should we balance. |
3772 | * @sgs: variable to hold the statistics for this group. | 3641 | * @sgs: variable to hold the statistics for this group. |
3773 | */ | 3642 | */ |
3774 | static inline void update_sg_lb_stats(struct sched_domain *sd, | 3643 | static inline void update_sg_lb_stats(struct lb_env *env, |
3775 | struct sched_group *group, int this_cpu, | 3644 | struct sched_group *group, int load_idx, |
3776 | enum cpu_idle_type idle, int load_idx, | ||
3777 | int local_group, const struct cpumask *cpus, | 3645 | int local_group, const struct cpumask *cpus, |
3778 | int *balance, struct sg_lb_stats *sgs) | 3646 | int *balance, struct sg_lb_stats *sgs) |
3779 | { | 3647 | { |
3780 | unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; | 3648 | unsigned long nr_running, max_nr_running, min_nr_running; |
3781 | int i; | 3649 | unsigned long load, max_cpu_load, min_cpu_load; |
3782 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | 3650 | unsigned int balance_cpu = -1, first_idle_cpu = 0; |
3783 | unsigned long avg_load_per_task = 0; | 3651 | unsigned long avg_load_per_task = 0; |
3652 | int i; | ||
3784 | 3653 | ||
3785 | if (local_group) | 3654 | if (local_group) |
3786 | balance_cpu = group_first_cpu(group); | 3655 | balance_cpu = group_balance_cpu(group); |
3787 | 3656 | ||
3788 | /* Tally up the load of all CPUs in the group */ | 3657 | /* Tally up the load of all CPUs in the group */ |
3789 | max_cpu_load = 0; | 3658 | max_cpu_load = 0; |
3790 | min_cpu_load = ~0UL; | 3659 | min_cpu_load = ~0UL; |
3791 | max_nr_running = 0; | 3660 | max_nr_running = 0; |
3661 | min_nr_running = ~0UL; | ||
3792 | 3662 | ||
3793 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { | 3663 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { |
3794 | struct rq *rq = cpu_rq(i); | 3664 | struct rq *rq = cpu_rq(i); |
3795 | 3665 | ||
3666 | nr_running = rq->nr_running; | ||
3667 | |||
3796 | /* Bias balancing toward cpus of our domain */ | 3668 | /* Bias balancing toward cpus of our domain */ |
3797 | if (local_group) { | 3669 | if (local_group) { |
3798 | if (idle_cpu(i) && !first_idle_cpu) { | 3670 | if (idle_cpu(i) && !first_idle_cpu && |
3671 | cpumask_test_cpu(i, sched_group_mask(group))) { | ||
3799 | first_idle_cpu = 1; | 3672 | first_idle_cpu = 1; |
3800 | balance_cpu = i; | 3673 | balance_cpu = i; |
3801 | } | 3674 | } |
@@ -3803,16 +3676,19 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3803 | load = target_load(i, load_idx); | 3676 | load = target_load(i, load_idx); |
3804 | } else { | 3677 | } else { |
3805 | load = source_load(i, load_idx); | 3678 | load = source_load(i, load_idx); |
3806 | if (load > max_cpu_load) { | 3679 | if (load > max_cpu_load) |
3807 | max_cpu_load = load; | 3680 | max_cpu_load = load; |
3808 | max_nr_running = rq->nr_running; | ||
3809 | } | ||
3810 | if (min_cpu_load > load) | 3681 | if (min_cpu_load > load) |
3811 | min_cpu_load = load; | 3682 | min_cpu_load = load; |
3683 | |||
3684 | if (nr_running > max_nr_running) | ||
3685 | max_nr_running = nr_running; | ||
3686 | if (min_nr_running > nr_running) | ||
3687 | min_nr_running = nr_running; | ||
3812 | } | 3688 | } |
3813 | 3689 | ||
3814 | sgs->group_load += load; | 3690 | sgs->group_load += load; |
3815 | sgs->sum_nr_running += rq->nr_running; | 3691 | sgs->sum_nr_running += nr_running; |
3816 | sgs->sum_weighted_load += weighted_cpuload(i); | 3692 | sgs->sum_weighted_load += weighted_cpuload(i); |
3817 | if (idle_cpu(i)) | 3693 | if (idle_cpu(i)) |
3818 | sgs->idle_cpus++; | 3694 | sgs->idle_cpus++; |
@@ -3825,14 +3701,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3825 | * to do the newly idle load balance. | 3701 | * to do the newly idle load balance. |
3826 | */ | 3702 | */ |
3827 | if (local_group) { | 3703 | if (local_group) { |
3828 | if (idle != CPU_NEWLY_IDLE) { | 3704 | if (env->idle != CPU_NEWLY_IDLE) { |
3829 | if (balance_cpu != this_cpu) { | 3705 | if (balance_cpu != env->dst_cpu) { |
3830 | *balance = 0; | 3706 | *balance = 0; |
3831 | return; | 3707 | return; |
3832 | } | 3708 | } |
3833 | update_group_power(sd, this_cpu); | 3709 | update_group_power(env->sd, env->dst_cpu); |
3834 | } else if (time_after_eq(jiffies, group->sgp->next_update)) | 3710 | } else if (time_after_eq(jiffies, group->sgp->next_update)) |
3835 | update_group_power(sd, this_cpu); | 3711 | update_group_power(env->sd, env->dst_cpu); |
3836 | } | 3712 | } |
3837 | 3713 | ||
3838 | /* Adjust by relative CPU power of the group */ | 3714 | /* Adjust by relative CPU power of the group */ |
@@ -3850,13 +3726,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3850 | if (sgs->sum_nr_running) | 3726 | if (sgs->sum_nr_running) |
3851 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 3727 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
3852 | 3728 | ||
3853 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) | 3729 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && |
3730 | (max_nr_running - min_nr_running) > 1) | ||
3854 | sgs->group_imb = 1; | 3731 | sgs->group_imb = 1; |
3855 | 3732 | ||
3856 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, | 3733 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, |
3857 | SCHED_POWER_SCALE); | 3734 | SCHED_POWER_SCALE); |
3858 | if (!sgs->group_capacity) | 3735 | if (!sgs->group_capacity) |
3859 | sgs->group_capacity = fix_small_capacity(sd, group); | 3736 | sgs->group_capacity = fix_small_capacity(env->sd, group); |
3860 | sgs->group_weight = group->group_weight; | 3737 | sgs->group_weight = group->group_weight; |
3861 | 3738 | ||
3862 | if (sgs->group_capacity > sgs->sum_nr_running) | 3739 | if (sgs->group_capacity > sgs->sum_nr_running) |
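The reworked test above now declares a group imbalanced only when both the per-CPU load spread and the per-CPU task-count spread are significant, so a single heavy task no longer trips it. A tiny self-contained check with made-up numbers (illustrative only, not taken from the patch):

#include <stdio.h>

int main(void)
{
	/* Hypothetical per-group statistics. */
	unsigned long max_cpu_load = 3072, min_cpu_load = 1024;
	unsigned long max_nr_running = 3, min_nr_running = 1;
	unsigned long avg_load_per_task = 1024;

	/* Same shape as the new condition in update_sg_lb_stats(). */
	int group_imb = (max_cpu_load - min_cpu_load) >= avg_load_per_task &&
			(max_nr_running - min_nr_running) > 1;
	printf("group_imb = %d\n", group_imb);		/* prints 1 */

	/* One busy CPU next to one idle CPU no longer counts as imbalanced. */
	max_nr_running = 1;
	min_nr_running = 0;
	group_imb = (max_cpu_load - min_cpu_load) >= avg_load_per_task &&
		    (max_nr_running - min_nr_running) > 1;
	printf("group_imb = %d\n", group_imb);		/* prints 0 */

	return 0;
}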
@@ -3865,20 +3742,18 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3865 | 3742 | ||
3866 | /** | 3743 | /** |
3867 | * update_sd_pick_busiest - return 1 on busiest group | 3744 | * update_sd_pick_busiest - return 1 on busiest group |
3868 | * @sd: sched_domain whose statistics are to be checked | 3745 | * @env: The load balancing environment. |
3869 | * @sds: sched_domain statistics | 3746 | * @sds: sched_domain statistics |
3870 | * @sg: sched_group candidate to be checked for being the busiest | 3747 | * @sg: sched_group candidate to be checked for being the busiest |
3871 | * @sgs: sched_group statistics | 3748 | * @sgs: sched_group statistics |
3872 | * @this_cpu: the current cpu | ||
3873 | * | 3749 | * |
3874 | * Determine if @sg is a busier group than the previously selected | 3750 | * Determine if @sg is a busier group than the previously selected |
3875 | * busiest group. | 3751 | * busiest group. |
3876 | */ | 3752 | */ |
3877 | static bool update_sd_pick_busiest(struct sched_domain *sd, | 3753 | static bool update_sd_pick_busiest(struct lb_env *env, |
3878 | struct sd_lb_stats *sds, | 3754 | struct sd_lb_stats *sds, |
3879 | struct sched_group *sg, | 3755 | struct sched_group *sg, |
3880 | struct sg_lb_stats *sgs, | 3756 | struct sg_lb_stats *sgs) |
3881 | int this_cpu) | ||
3882 | { | 3757 | { |
3883 | if (sgs->avg_load <= sds->max_load) | 3758 | if (sgs->avg_load <= sds->max_load) |
3884 | return false; | 3759 | return false; |
@@ -3894,8 +3769,8 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, | |||
3894 | * numbered CPUs in the group, therefore mark all groups | 3769 | * numbered CPUs in the group, therefore mark all groups |
3895 | * higher than ourself as busy. | 3770 | * higher than ourself as busy. |
3896 | */ | 3771 | */ |
3897 | if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && | 3772 | if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && |
3898 | this_cpu < group_first_cpu(sg)) { | 3773 | env->dst_cpu < group_first_cpu(sg)) { |
3899 | if (!sds->busiest) | 3774 | if (!sds->busiest) |
3900 | return true; | 3775 | return true; |
3901 | 3776 | ||
@@ -3908,35 +3783,32 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, | |||
3908 | 3783 | ||
3909 | /** | 3784 | /** |
3910 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. | 3785 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. |
3911 | * @sd: sched_domain whose statistics are to be updated. | 3786 | * @env: The load balancing environment. |
3912 | * @this_cpu: Cpu for which load balance is currently performed. | ||
3913 | * @idle: Idle status of this_cpu | ||
3914 | * @cpus: Set of cpus considered for load balancing. | 3787 | * @cpus: Set of cpus considered for load balancing. |
3915 | * @balance: Should we balance. | 3788 | * @balance: Should we balance. |
3916 | * @sds: variable to hold the statistics for this sched_domain. | 3789 | * @sds: variable to hold the statistics for this sched_domain. |
3917 | */ | 3790 | */ |
3918 | static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | 3791 | static inline void update_sd_lb_stats(struct lb_env *env, |
3919 | enum cpu_idle_type idle, const struct cpumask *cpus, | 3792 | const struct cpumask *cpus, |
3920 | int *balance, struct sd_lb_stats *sds) | 3793 | int *balance, struct sd_lb_stats *sds) |
3921 | { | 3794 | { |
3922 | struct sched_domain *child = sd->child; | 3795 | struct sched_domain *child = env->sd->child; |
3923 | struct sched_group *sg = sd->groups; | 3796 | struct sched_group *sg = env->sd->groups; |
3924 | struct sg_lb_stats sgs; | 3797 | struct sg_lb_stats sgs; |
3925 | int load_idx, prefer_sibling = 0; | 3798 | int load_idx, prefer_sibling = 0; |
3926 | 3799 | ||
3927 | if (child && child->flags & SD_PREFER_SIBLING) | 3800 | if (child && child->flags & SD_PREFER_SIBLING) |
3928 | prefer_sibling = 1; | 3801 | prefer_sibling = 1; |
3929 | 3802 | ||
3930 | init_sd_power_savings_stats(sd, sds, idle); | 3803 | load_idx = get_sd_load_idx(env->sd, env->idle); |
3931 | load_idx = get_sd_load_idx(sd, idle); | ||
3932 | 3804 | ||
3933 | do { | 3805 | do { |
3934 | int local_group; | 3806 | int local_group; |
3935 | 3807 | ||
3936 | local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); | 3808 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); |
3937 | memset(&sgs, 0, sizeof(sgs)); | 3809 | memset(&sgs, 0, sizeof(sgs)); |
3938 | update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, | 3810 | update_sg_lb_stats(env, sg, load_idx, local_group, |
3939 | local_group, cpus, balance, &sgs); | 3811 | cpus, balance, &sgs); |
3940 | 3812 | ||
3941 | if (local_group && !(*balance)) | 3813 | if (local_group && !(*balance)) |
3942 | return; | 3814 | return; |
@@ -3964,7 +3836,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3964 | sds->this_load_per_task = sgs.sum_weighted_load; | 3836 | sds->this_load_per_task = sgs.sum_weighted_load; |
3965 | sds->this_has_capacity = sgs.group_has_capacity; | 3837 | sds->this_has_capacity = sgs.group_has_capacity; |
3966 | sds->this_idle_cpus = sgs.idle_cpus; | 3838 | sds->this_idle_cpus = sgs.idle_cpus; |
3967 | } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { | 3839 | } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) { |
3968 | sds->max_load = sgs.avg_load; | 3840 | sds->max_load = sgs.avg_load; |
3969 | sds->busiest = sg; | 3841 | sds->busiest = sg; |
3970 | sds->busiest_nr_running = sgs.sum_nr_running; | 3842 | sds->busiest_nr_running = sgs.sum_nr_running; |
@@ -3976,9 +3848,8 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3976 | sds->group_imb = sgs.group_imb; | 3848 | sds->group_imb = sgs.group_imb; |
3977 | } | 3849 | } |
3978 | 3850 | ||
3979 | update_sd_power_savings_stats(sg, sds, local_group, &sgs); | ||
3980 | sg = sg->next; | 3851 | sg = sg->next; |
3981 | } while (sg != sd->groups); | 3852 | } while (sg != env->sd->groups); |
3982 | } | 3853 | } |
3983 | 3854 | ||
3984 | /** | 3855 | /** |
@@ -4001,29 +3872,26 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
4001 | * Returns 1 when packing is required and a task should be moved to | 3872 | * Returns 1 when packing is required and a task should be moved to |
4002 | * this CPU. The amount of the imbalance is returned in *imbalance. | 3873 | * this CPU. The amount of the imbalance is returned in *imbalance. |
4003 | * | 3874 | * |
4004 | * @sd: The sched_domain whose packing is to be checked. | 3875 | * @env: The load balancing environment. |
4005 | * @sds: Statistics of the sched_domain which is to be packed | 3876 | * @sds: Statistics of the sched_domain which is to be packed |
4006 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | ||
4007 | * @imbalance: returns amount of imbalance due to packing. | ||
4008 | */ | 3877 | */ |
4009 | static int check_asym_packing(struct sched_domain *sd, | 3878 | static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) |
4010 | struct sd_lb_stats *sds, | ||
4011 | int this_cpu, unsigned long *imbalance) | ||
4012 | { | 3879 | { |
4013 | int busiest_cpu; | 3880 | int busiest_cpu; |
4014 | 3881 | ||
4015 | if (!(sd->flags & SD_ASYM_PACKING)) | 3882 | if (!(env->sd->flags & SD_ASYM_PACKING)) |
4016 | return 0; | 3883 | return 0; |
4017 | 3884 | ||
4018 | if (!sds->busiest) | 3885 | if (!sds->busiest) |
4019 | return 0; | 3886 | return 0; |
4020 | 3887 | ||
4021 | busiest_cpu = group_first_cpu(sds->busiest); | 3888 | busiest_cpu = group_first_cpu(sds->busiest); |
4022 | if (this_cpu > busiest_cpu) | 3889 | if (env->dst_cpu > busiest_cpu) |
4023 | return 0; | 3890 | return 0; |
4024 | 3891 | ||
4025 | *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, | 3892 | env->imbalance = DIV_ROUND_CLOSEST( |
4026 | SCHED_POWER_SCALE); | 3893 | sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); |
3894 | |||
4027 | return 1; | 3895 | return 1; |
4028 | } | 3896 | } |
4029 | 3897 | ||
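In the asym-packing branch just above, the imbalance is the busiest group's power-normalized average load converted back into weighted task load by rescaling with that group's cpu power. A self-contained arithmetic sketch (the rounding macro is simplified for unsigned operands and the numbers are made up):

#include <stdio.h>

#define DIV_ROUND_CLOSEST(x, divisor)	(((x) + (divisor) / 2) / (divisor))
#define SCHED_POWER_SCALE		1024UL	/* nominal power of one CPU */

int main(void)
{
	unsigned long max_load = 1536;		/* busiest group's avg_load           */
	unsigned long busiest_power = 2048;	/* e.g. a two-CPU group at full power */

	unsigned long imbalance = DIV_ROUND_CLOSEST(max_load * busiest_power,
						    SCHED_POWER_SCALE);
	printf("imbalance = %lu\n", imbalance);	/* prints 3072 */
	return 0;
}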
@@ -4031,12 +3899,11 @@ static int check_asym_packing(struct sched_domain *sd, | |||
4031 | * fix_small_imbalance - Calculate the minor imbalance that exists | 3899 | * fix_small_imbalance - Calculate the minor imbalance that exists |
4032 | * amongst the groups of a sched_domain, during | 3900 | * amongst the groups of a sched_domain, during |
4033 | * load balancing. | 3901 | * load balancing. |
3902 | * @env: The load balancing environment. | ||
4034 | * @sds: Statistics of the sched_domain whose imbalance is to be calculated. | 3903 | * @sds: Statistics of the sched_domain whose imbalance is to be calculated. |
4035 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | ||
4036 | * @imbalance: Variable to store the imbalance. | ||
4037 | */ | 3904 | */ |
4038 | static inline void fix_small_imbalance(struct sd_lb_stats *sds, | 3905 | static inline |
4039 | int this_cpu, unsigned long *imbalance) | 3906 | void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) |
4040 | { | 3907 | { |
4041 | unsigned long tmp, pwr_now = 0, pwr_move = 0; | 3908 | unsigned long tmp, pwr_now = 0, pwr_move = 0; |
4042 | unsigned int imbn = 2; | 3909 | unsigned int imbn = 2; |
@@ -4047,9 +3914,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
4047 | if (sds->busiest_load_per_task > | 3914 | if (sds->busiest_load_per_task > |
4048 | sds->this_load_per_task) | 3915 | sds->this_load_per_task) |
4049 | imbn = 1; | 3916 | imbn = 1; |
4050 | } else | 3917 | } else { |
4051 | sds->this_load_per_task = | 3918 | sds->this_load_per_task = |
4052 | cpu_avg_load_per_task(this_cpu); | 3919 | cpu_avg_load_per_task(env->dst_cpu); |
3920 | } | ||
4053 | 3921 | ||
4054 | scaled_busy_load_per_task = sds->busiest_load_per_task | 3922 | scaled_busy_load_per_task = sds->busiest_load_per_task |
4055 | * SCHED_POWER_SCALE; | 3923 | * SCHED_POWER_SCALE; |
@@ -4057,7 +3925,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
4057 | 3925 | ||
4058 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= | 3926 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= |
4059 | (scaled_busy_load_per_task * imbn)) { | 3927 | (scaled_busy_load_per_task * imbn)) { |
4060 | *imbalance = sds->busiest_load_per_task; | 3928 | env->imbalance = sds->busiest_load_per_task; |
4061 | return; | 3929 | return; |
4062 | } | 3930 | } |
4063 | 3931 | ||
@@ -4094,18 +3962,16 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
4094 | 3962 | ||
4095 | /* Move if we gain throughput */ | 3963 | /* Move if we gain throughput */ |
4096 | if (pwr_move > pwr_now) | 3964 | if (pwr_move > pwr_now) |
4097 | *imbalance = sds->busiest_load_per_task; | 3965 | env->imbalance = sds->busiest_load_per_task; |
4098 | } | 3966 | } |
4099 | 3967 | ||
4100 | /** | 3968 | /** |
4101 | * calculate_imbalance - Calculate the amount of imbalance present within the | 3969 | * calculate_imbalance - Calculate the amount of imbalance present within the |
4102 | * groups of a given sched_domain during load balance. | 3970 | * groups of a given sched_domain during load balance. |
3971 | * @env: load balance environment | ||
4103 | * @sds: statistics of the sched_domain whose imbalance is to be calculated. | 3972 | * @sds: statistics of the sched_domain whose imbalance is to be calculated. |
4104 | * @this_cpu: Cpu for which load balance is currently being performed. | ||
4105 | * @imbalance: The variable to store the imbalance. | ||
4106 | */ | 3973 | */ |
4107 | static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | 3974 | static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) |
4108 | unsigned long *imbalance) | ||
4109 | { | 3975 | { |
4110 | unsigned long max_pull, load_above_capacity = ~0UL; | 3976 | unsigned long max_pull, load_above_capacity = ~0UL; |
4111 | 3977 | ||
@@ -4121,8 +3987,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4121 | * its cpu_power, while calculating max_load..) | 3987 | * its cpu_power, while calculating max_load..) |
4122 | */ | 3988 | */ |
4123 | if (sds->max_load < sds->avg_load) { | 3989 | if (sds->max_load < sds->avg_load) { |
4124 | *imbalance = 0; | 3990 | env->imbalance = 0; |
4125 | return fix_small_imbalance(sds, this_cpu, imbalance); | 3991 | return fix_small_imbalance(env, sds); |
4126 | } | 3992 | } |
4127 | 3993 | ||
4128 | if (!sds->group_imb) { | 3994 | if (!sds->group_imb) { |
@@ -4150,7 +4016,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4150 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); | 4016 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); |
4151 | 4017 | ||
4152 | /* How much load to actually move to equalise the imbalance */ | 4018 | /* How much load to actually move to equalise the imbalance */ |
4153 | *imbalance = min(max_pull * sds->busiest->sgp->power, | 4019 | env->imbalance = min(max_pull * sds->busiest->sgp->power, |
4154 | (sds->avg_load - sds->this_load) * sds->this->sgp->power) | 4020 | (sds->avg_load - sds->this_load) * sds->this->sgp->power) |
4155 | / SCHED_POWER_SCALE; | 4021 | / SCHED_POWER_SCALE; |
4156 | 4022 | ||
@@ -4160,8 +4026,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4160 | * a think about bumping its value to force at least one task to be | 4026 | * a think about bumping its value to force at least one task to be |
4161 | * moved | 4027 | * moved |
4162 | */ | 4028 | */ |
4163 | if (*imbalance < sds->busiest_load_per_task) | 4029 | if (env->imbalance < sds->busiest_load_per_task) |
4164 | return fix_small_imbalance(sds, this_cpu, imbalance); | 4030 | return fix_small_imbalance(env, sds); |
4165 | 4031 | ||
4166 | } | 4032 | } |
4167 | 4033 | ||
@@ -4177,11 +4043,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4177 | * Also calculates the amount of weighted load which should be moved | 4043 | * Also calculates the amount of weighted load which should be moved |
4178 | * to restore balance. | 4044 | * to restore balance. |
4179 | * | 4045 | * |
4180 | * @sd: The sched_domain whose busiest group is to be returned. | 4046 | * @env: The load balancing environment. |
4181 | * @this_cpu: The cpu for which load balancing is currently being performed. | ||
4182 | * @imbalance: Variable which stores amount of weighted load which should | ||
4183 | * be moved to restore balance/put a group to idle. | ||
4184 | * @idle: The idle status of this_cpu. | ||
4185 | * @cpus: The set of CPUs under consideration for load-balancing. | 4047 | * @cpus: The set of CPUs under consideration for load-balancing. |
4186 | * @balance: Pointer to a variable indicating if this_cpu | 4048 | * @balance: Pointer to a variable indicating if this_cpu |
4187 | * is the appropriate cpu to perform load balancing at this_level. | 4049 | * is the appropriate cpu to perform load balancing at this_level. |
@@ -4192,9 +4054,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4192 | * put to idle by rebalancing its tasks onto our group. | 4054 | * put to idle by rebalancing its tasks onto our group. |
4193 | */ | 4055 | */ |
4194 | static struct sched_group * | 4056 | static struct sched_group * |
4195 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 4057 | find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance) |
4196 | unsigned long *imbalance, enum cpu_idle_type idle, | ||
4197 | const struct cpumask *cpus, int *balance) | ||
4198 | { | 4058 | { |
4199 | struct sd_lb_stats sds; | 4059 | struct sd_lb_stats sds; |
4200 | 4060 | ||
@@ -4204,7 +4064,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4204 | * Compute the various statistics relevant for load balancing at | 4064 | * Compute the various statistics relevant for load balancing at |
4205 | * this level. | 4065 | * this level. |
4206 | */ | 4066 | */ |
4207 | update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); | 4067 | update_sd_lb_stats(env, cpus, balance, &sds); |
4208 | 4068 | ||
4209 | /* | 4069 | /* |
4210 | * this_cpu is not the appropriate cpu to perform load balancing at | 4070 | * this_cpu is not the appropriate cpu to perform load balancing at |
@@ -4213,8 +4073,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4213 | if (!(*balance)) | 4073 | if (!(*balance)) |
4214 | goto ret; | 4074 | goto ret; |
4215 | 4075 | ||
4216 | if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && | 4076 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && |
4217 | check_asym_packing(sd, &sds, this_cpu, imbalance)) | 4077 | check_asym_packing(env, &sds)) |
4218 | return sds.busiest; | 4078 | return sds.busiest; |
4219 | 4079 | ||
4220 | /* There is no busy sibling group to pull tasks from */ | 4080 | /* There is no busy sibling group to pull tasks from */ |
@@ -4232,7 +4092,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4232 | goto force_balance; | 4092 | goto force_balance; |
4233 | 4093 | ||
4234 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 4094 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ |
4235 | if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && | 4095 | if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity && |
4236 | !sds.busiest_has_capacity) | 4096 | !sds.busiest_has_capacity) |
4237 | goto force_balance; | 4097 | goto force_balance; |
4238 | 4098 | ||
@@ -4250,7 +4110,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4250 | if (sds.this_load >= sds.avg_load) | 4110 | if (sds.this_load >= sds.avg_load) |
4251 | goto out_balanced; | 4111 | goto out_balanced; |
4252 | 4112 | ||
4253 | if (idle == CPU_IDLE) { | 4113 | if (env->idle == CPU_IDLE) { |
4254 | /* | 4114 | /* |
4255 | * This cpu is idle. If the busiest group load doesn't | 4115 | * This cpu is idle. If the busiest group load doesn't |
4256 | * have more tasks than the number of available cpus and | 4116 | * have more tasks than the number of available cpus and |
@@ -4265,34 +4125,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4265 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use | 4125 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use |
4266 | * imbalance_pct to be conservative. | 4126 | * imbalance_pct to be conservative. |
4267 | */ | 4127 | */ |
4268 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | 4128 | if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load) |
4269 | goto out_balanced; | 4129 | goto out_balanced; |
4270 | } | 4130 | } |
4271 | 4131 | ||
4272 | force_balance: | 4132 | force_balance: |
4273 | /* Looks like there is an imbalance. Compute it */ | 4133 | /* Looks like there is an imbalance. Compute it */ |
4274 | calculate_imbalance(&sds, this_cpu, imbalance); | 4134 | calculate_imbalance(env, &sds); |
4275 | return sds.busiest; | 4135 | return sds.busiest; |
4276 | 4136 | ||
4277 | out_balanced: | 4137 | out_balanced: |
4278 | /* | ||
4279 | * There is no obvious imbalance. But check if we can do some balancing | ||
4280 | * to save power. | ||
4281 | */ | ||
4282 | if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) | ||
4283 | return sds.busiest; | ||
4284 | ret: | 4138 | ret: |
4285 | *imbalance = 0; | 4139 | env->imbalance = 0; |
4286 | return NULL; | 4140 | return NULL; |
4287 | } | 4141 | } |
4288 | 4142 | ||
4289 | /* | 4143 | /* |
4290 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 4144 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
4291 | */ | 4145 | */ |
4292 | static struct rq * | 4146 | static struct rq *find_busiest_queue(struct lb_env *env, |
4293 | find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | 4147 | struct sched_group *group, |
4294 | enum cpu_idle_type idle, unsigned long imbalance, | 4148 | const struct cpumask *cpus) |
4295 | const struct cpumask *cpus) | ||
4296 | { | 4149 | { |
4297 | struct rq *busiest = NULL, *rq; | 4150 | struct rq *busiest = NULL, *rq; |
4298 | unsigned long max_load = 0; | 4151 | unsigned long max_load = 0; |
@@ -4305,7 +4158,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
4305 | unsigned long wl; | 4158 | unsigned long wl; |
4306 | 4159 | ||
4307 | if (!capacity) | 4160 | if (!capacity) |
4308 | capacity = fix_small_capacity(sd, group); | 4161 | capacity = fix_small_capacity(env->sd, group); |
4309 | 4162 | ||
4310 | if (!cpumask_test_cpu(i, cpus)) | 4163 | if (!cpumask_test_cpu(i, cpus)) |
4311 | continue; | 4164 | continue; |
@@ -4317,7 +4170,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
4317 | * When comparing with imbalance, use weighted_cpuload() | 4170 | * When comparing with imbalance, use weighted_cpuload() |
4318 | * which is not scaled with the cpu power. | 4171 | * which is not scaled with the cpu power. |
4319 | */ | 4172 | */ |
4320 | if (capacity && rq->nr_running == 1 && wl > imbalance) | 4173 | if (capacity && rq->nr_running == 1 && wl > env->imbalance) |
4321 | continue; | 4174 | continue; |
4322 | 4175 | ||
4323 | /* | 4176 | /* |
@@ -4346,40 +4199,19 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
4346 | /* Working cpumask for load_balance and load_balance_newidle. */ | 4199 | /* Working cpumask for load_balance and load_balance_newidle. */ |
4347 | DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 4200 | DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); |
4348 | 4201 | ||
4349 | static int need_active_balance(struct sched_domain *sd, int idle, | 4202 | static int need_active_balance(struct lb_env *env) |
4350 | int busiest_cpu, int this_cpu) | ||
4351 | { | 4203 | { |
4352 | if (idle == CPU_NEWLY_IDLE) { | 4204 | struct sched_domain *sd = env->sd; |
4205 | |||
4206 | if (env->idle == CPU_NEWLY_IDLE) { | ||
4353 | 4207 | ||
4354 | /* | 4208 | /* |
4355 | * ASYM_PACKING needs to force migrate tasks from busy but | 4209 | * ASYM_PACKING needs to force migrate tasks from busy but |
4356 | * higher numbered CPUs in order to pack all tasks in the | 4210 | * higher numbered CPUs in order to pack all tasks in the |
4357 | * lowest numbered CPUs. | 4211 | * lowest numbered CPUs. |
4358 | */ | 4212 | */ |
4359 | if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) | 4213 | if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu) |
4360 | return 1; | 4214 | return 1; |
4361 | |||
4362 | /* | ||
4363 | * The only task running in a non-idle cpu can be moved to this | ||
4364 | * cpu in an attempt to completely freeup the other CPU | ||
4365 | * package. | ||
4366 | * | ||
4367 | * The package power saving logic comes from | ||
4368 | * find_busiest_group(). If there is no imbalance, then | ||
4369 | * f_b_g() will return NULL. However when sched_mc={1,2} then | ||
4370 | * f_b_g() will select a group from which a running task may be | ||
4371 | * pulled to this cpu in order to make the other package idle. | ||
4372 | * If there is no opportunity to make a package idle and if | ||
4373 | * there is no imbalance, then f_b_g() will return NULL and no | ||
4374 | * action will be taken in load_balance_newidle(). | ||
4375 | * | ||
4376 | * Under normal task pull operation due to imbalance, there | ||
4377 | * will be more than one task in the source run queue and | ||
4378 | * move_tasks() will succeed. ld_moved will be true and this | ||
4379 | * active balance code will not be triggered. | ||
4380 | */ | ||
4381 | if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) | ||
4382 | return 0; | ||
4383 | } | 4215 | } |
4384 | 4216 | ||
4385 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); | 4217 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); |
@@ -4397,7 +4229,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4397 | { | 4229 | { |
4398 | int ld_moved, active_balance = 0; | 4230 | int ld_moved, active_balance = 0; |
4399 | struct sched_group *group; | 4231 | struct sched_group *group; |
4400 | unsigned long imbalance; | ||
4401 | struct rq *busiest; | 4232 | struct rq *busiest; |
4402 | unsigned long flags; | 4233 | unsigned long flags; |
4403 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | 4234 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); |
@@ -4407,7 +4238,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4407 | .dst_cpu = this_cpu, | 4238 | .dst_cpu = this_cpu, |
4408 | .dst_rq = this_rq, | 4239 | .dst_rq = this_rq, |
4409 | .idle = idle, | 4240 | .idle = idle, |
4410 | .loop_break = sysctl_sched_nr_migrate, | 4241 | .loop_break = sched_nr_migrate_break, |
4411 | }; | 4242 | }; |
4412 | 4243 | ||
4413 | cpumask_copy(cpus, cpu_active_mask); | 4244 | cpumask_copy(cpus, cpu_active_mask); |
@@ -4415,8 +4246,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4415 | schedstat_inc(sd, lb_count[idle]); | 4246 | schedstat_inc(sd, lb_count[idle]); |
4416 | 4247 | ||
4417 | redo: | 4248 | redo: |
4418 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, | 4249 | group = find_busiest_group(&env, cpus, balance); |
4419 | cpus, balance); | ||
4420 | 4250 | ||
4421 | if (*balance == 0) | 4251 | if (*balance == 0) |
4422 | goto out_balanced; | 4252 | goto out_balanced; |
@@ -4426,7 +4256,7 @@ redo: | |||
4426 | goto out_balanced; | 4256 | goto out_balanced; |
4427 | } | 4257 | } |
4428 | 4258 | ||
4429 | busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); | 4259 | busiest = find_busiest_queue(&env, group, cpus); |
4430 | if (!busiest) { | 4260 | if (!busiest) { |
4431 | schedstat_inc(sd, lb_nobusyq[idle]); | 4261 | schedstat_inc(sd, lb_nobusyq[idle]); |
4432 | goto out_balanced; | 4262 | goto out_balanced; |
@@ -4434,7 +4264,7 @@ redo: | |||
4434 | 4264 | ||
4435 | BUG_ON(busiest == this_rq); | 4265 | BUG_ON(busiest == this_rq); |
4436 | 4266 | ||
4437 | schedstat_add(sd, lb_imbalance[idle], imbalance); | 4267 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); |
4438 | 4268 | ||
4439 | ld_moved = 0; | 4269 | ld_moved = 0; |
4440 | if (busiest->nr_running > 1) { | 4270 | if (busiest->nr_running > 1) { |
@@ -4445,10 +4275,9 @@ redo: | |||
4445 | * correctly treated as an imbalance. | 4275 | * correctly treated as an imbalance. |
4446 | */ | 4276 | */ |
4447 | env.flags |= LBF_ALL_PINNED; | 4277 | env.flags |= LBF_ALL_PINNED; |
4448 | env.load_move = imbalance; | 4278 | env.src_cpu = busiest->cpu; |
4449 | env.src_cpu = busiest->cpu; | 4279 | env.src_rq = busiest; |
4450 | env.src_rq = busiest; | 4280 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); |
4451 | env.loop_max = busiest->nr_running; | ||
4452 | 4281 | ||
4453 | more_balance: | 4282 | more_balance: |
4454 | local_irq_save(flags); | 4283 | local_irq_save(flags); |
@@ -4490,7 +4319,7 @@ more_balance: | |||
4490 | if (idle != CPU_NEWLY_IDLE) | 4319 | if (idle != CPU_NEWLY_IDLE) |
4491 | sd->nr_balance_failed++; | 4320 | sd->nr_balance_failed++; |
4492 | 4321 | ||
4493 | if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { | 4322 | if (need_active_balance(&env)) { |
4494 | raw_spin_lock_irqsave(&busiest->lock, flags); | 4323 | raw_spin_lock_irqsave(&busiest->lock, flags); |
4495 | 4324 | ||
4496 | /* don't kick the active_load_balance_cpu_stop, | 4325 | /* don't kick the active_load_balance_cpu_stop, |
@@ -4517,10 +4346,11 @@ more_balance: | |||
4517 | } | 4346 | } |
4518 | raw_spin_unlock_irqrestore(&busiest->lock, flags); | 4347 | raw_spin_unlock_irqrestore(&busiest->lock, flags); |
4519 | 4348 | ||
4520 | if (active_balance) | 4349 | if (active_balance) { |
4521 | stop_one_cpu_nowait(cpu_of(busiest), | 4350 | stop_one_cpu_nowait(cpu_of(busiest), |
4522 | active_load_balance_cpu_stop, busiest, | 4351 | active_load_balance_cpu_stop, busiest, |
4523 | &busiest->active_balance_work); | 4352 | &busiest->active_balance_work); |
4353 | } | ||
4524 | 4354 | ||
4525 | /* | 4355 | /* |
4526 | * We've kicked active balancing, reset the failure | 4356 | * We've kicked active balancing, reset the failure |
@@ -4701,104 +4531,15 @@ static struct { | |||
4701 | unsigned long next_balance; /* in jiffy units */ | 4531 | unsigned long next_balance; /* in jiffy units */ |
4702 | } nohz ____cacheline_aligned; | 4532 | } nohz ____cacheline_aligned; |
4703 | 4533 | ||
4704 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 4534 | static inline int find_new_ilb(int call_cpu) |
4705 | /** | ||
4706 | * lowest_flag_domain - Return lowest sched_domain containing flag. | ||
4707 | * @cpu: The cpu whose lowest level of sched domain is to | ||
4708 | * be returned. | ||
4709 | * @flag: The flag to check for the lowest sched_domain | ||
4710 | * for the given cpu. | ||
4711 | * | ||
4712 | * Returns the lowest sched_domain of a cpu which contains the given flag. | ||
4713 | */ | ||
4714 | static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | ||
4715 | { | ||
4716 | struct sched_domain *sd; | ||
4717 | |||
4718 | for_each_domain(cpu, sd) | ||
4719 | if (sd->flags & flag) | ||
4720 | break; | ||
4721 | |||
4722 | return sd; | ||
4723 | } | ||
4724 | |||
4725 | /** | ||
4726 | * for_each_flag_domain - Iterates over sched_domains containing the flag. | ||
4727 | * @cpu: The cpu whose domains we're iterating over. | ||
4728 | * @sd: variable holding the value of the power_savings_sd | ||
4729 | * for cpu. | ||
4730 | * @flag: The flag to filter the sched_domains to be iterated. | ||
4731 | * | ||
4732 | * Iterates over all the scheduler domains for a given cpu that has the 'flag' | ||
4733 | * set, starting from the lowest sched_domain to the highest. | ||
4734 | */ | ||
4735 | #define for_each_flag_domain(cpu, sd, flag) \ | ||
4736 | for (sd = lowest_flag_domain(cpu, flag); \ | ||
4737 | (sd && (sd->flags & flag)); sd = sd->parent) | ||
4738 | |||
4739 | /** | ||
4740 | * find_new_ilb - Finds the optimum idle load balancer for nomination. | ||
4741 | * @cpu: The cpu which is nominating a new idle_load_balancer. | ||
4742 | * | ||
4743 | * Returns: Returns the id of the idle load balancer if it exists, | ||
4744 | * Else, returns >= nr_cpu_ids. | ||
4745 | * | ||
4746 | * This algorithm picks the idle load balancer such that it belongs to a | ||
4747 | * semi-idle powersavings sched_domain. The idea is to try and avoid | ||
4748 | * completely idle packages/cores just for the purpose of idle load balancing | ||
4749 | * when there are other idle CPUs which are better suited for that job. | ||
4750 | */ | ||
4751 | static int find_new_ilb(int cpu) | ||
4752 | { | 4535 | { |
4753 | int ilb = cpumask_first(nohz.idle_cpus_mask); | 4536 | int ilb = cpumask_first(nohz.idle_cpus_mask); |
4754 | struct sched_group *ilbg; | ||
4755 | struct sched_domain *sd; | ||
4756 | |||
4757 | /* | ||
4758 | * Have idle load balancer selection from semi-idle packages only | ||
4759 | * when power-aware load balancing is enabled | ||
4760 | */ | ||
4761 | if (!(sched_smt_power_savings || sched_mc_power_savings)) | ||
4762 | goto out_done; | ||
4763 | |||
4764 | /* | ||
4765 | * Optimize for the case when we have no idle CPUs or only one | ||
4766 | * idle CPU. Don't walk the sched_domain hierarchy in such cases | ||
4767 | */ | ||
4768 | if (cpumask_weight(nohz.idle_cpus_mask) < 2) | ||
4769 | goto out_done; | ||
4770 | 4537 | ||
4771 | rcu_read_lock(); | ||
4772 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | ||
4773 | ilbg = sd->groups; | ||
4774 | |||
4775 | do { | ||
4776 | if (ilbg->group_weight != | ||
4777 | atomic_read(&ilbg->sgp->nr_busy_cpus)) { | ||
4778 | ilb = cpumask_first_and(nohz.idle_cpus_mask, | ||
4779 | sched_group_cpus(ilbg)); | ||
4780 | goto unlock; | ||
4781 | } | ||
4782 | |||
4783 | ilbg = ilbg->next; | ||
4784 | |||
4785 | } while (ilbg != sd->groups); | ||
4786 | } | ||
4787 | unlock: | ||
4788 | rcu_read_unlock(); | ||
4789 | |||
4790 | out_done: | ||
4791 | if (ilb < nr_cpu_ids && idle_cpu(ilb)) | 4538 | if (ilb < nr_cpu_ids && idle_cpu(ilb)) |
4792 | return ilb; | 4539 | return ilb; |
4793 | 4540 | ||
4794 | return nr_cpu_ids; | 4541 | return nr_cpu_ids; |
4795 | } | 4542 | } |
4796 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | ||
4797 | static inline int find_new_ilb(int call_cpu) | ||
4798 | { | ||
4799 | return nr_cpu_ids; | ||
4800 | } | ||
4801 | #endif | ||
4802 | 4543 | ||
4803 | /* | 4544 | /* |
4804 | * Kick a CPU to do the nohz balancing, if it is time for it. We pick the | 4545 | * Kick a CPU to do the nohz balancing, if it is time for it. We pick the |
@@ -5021,7 +4762,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | |||
5021 | 4762 | ||
5022 | raw_spin_lock_irq(&this_rq->lock); | 4763 | raw_spin_lock_irq(&this_rq->lock); |
5023 | update_rq_clock(this_rq); | 4764 | update_rq_clock(this_rq); |
5024 | update_cpu_load(this_rq); | 4765 | update_idle_cpu_load(this_rq); |
5025 | raw_spin_unlock_irq(&this_rq->lock); | 4766 | raw_spin_unlock_irq(&this_rq->lock); |
5026 | 4767 | ||
5027 | rebalance_domains(balance_cpu, CPU_IDLE); | 4768 | rebalance_domains(balance_cpu, CPU_IDLE); |
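Taken together, most of the fair.c changes above are one mechanical refactoring: the long (sd, this_cpu, idle, *imbalance, ...) argument lists are folded into the struct lb_env that load_balance() fills in, and helpers such as find_busiest_group(), check_asym_packing() and need_active_balance() read their context from it. A condensed, self-contained sketch of that pattern (field names follow the hunks above; the types and the flag bit are userspace stand-ins):

#include <stdio.h>

enum cpu_idle_type { CPU_IDLE, CPU_NOT_IDLE, CPU_NEWLY_IDLE };

struct sched_domain { unsigned int flags; };	/* stand-in for the real struct */
#define SD_ASYM_PACKING	0x0800			/* illustrative flag bit */

/* Subset of the fields the patch gathers into struct lb_env. */
struct lb_env {
	struct sched_domain	*sd;
	int			src_cpu, dst_cpu;
	enum cpu_idle_type	idle;
	unsigned long		imbalance;	/* replaces the *imbalance out-parameter */
};

/* Helpers take the environment instead of half a dozen scalars. */
static int wants_asym_packing(struct lb_env *env, unsigned long max_load)
{
	if (!(env->sd->flags & SD_ASYM_PACKING))
		return 0;

	env->imbalance = max_load;		/* result travels back through env */
	return env->dst_cpu < env->src_cpu;
}

int main(void)
{
	struct sched_domain sd = { .flags = SD_ASYM_PACKING };
	struct lb_env env = {
		.sd = &sd, .src_cpu = 3, .dst_cpu = 0, .idle = CPU_NEWLY_IDLE,
	};
	int pack = wants_asym_packing(&env, 1536);

	printf("pack=%d imbalance=%lu\n", pack, env.imbalance);	/* pack=1 imbalance=1536 */
	return 0;
}

Passing one context object also lets results such as env.imbalance flow back to the caller without extra output parameters, which is why the *imbalance argument disappears throughout.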
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index e61fd73913d0..de00a486c5c6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
@@ -68,3 +68,4 @@ SCHED_FEAT(TTWU_QUEUE, true) | |||
68 | 68 | ||
69 | SCHED_FEAT(FORCE_SD_OVERLAP, false) | 69 | SCHED_FEAT(FORCE_SD_OVERLAP, false) |
70 | SCHED_FEAT(RT_RUNTIME_SHARE, true) | 70 | SCHED_FEAT(RT_RUNTIME_SHARE, true) |
71 | SCHED_FEAT(LB_MIN, false) | ||
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 91b4c957f289..b44d604b35d1 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
@@ -4,7 +4,7 @@ | |||
4 | * idle-task scheduling class. | 4 | * idle-task scheduling class. |
5 | * | 5 | * |
6 | * (NOTE: these are not related to SCHED_IDLE tasks which are | 6 | * (NOTE: these are not related to SCHED_IDLE tasks which are |
7 | * handled in sched_fair.c) | 7 | * handled in sched/fair.c) |
8 | */ | 8 | */ |
9 | 9 | ||
10 | #ifdef CONFIG_SMP | 10 | #ifdef CONFIG_SMP |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 44af55e6d5d0..573e1ca01102 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq) | |||
274 | 274 | ||
275 | static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | 275 | static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) |
276 | { | 276 | { |
277 | struct task_struct *p; | ||
278 | |||
277 | if (!rt_entity_is_task(rt_se)) | 279 | if (!rt_entity_is_task(rt_se)) |
278 | return; | 280 | return; |
279 | 281 | ||
282 | p = rt_task_of(rt_se); | ||
280 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; | 283 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; |
281 | 284 | ||
282 | rt_rq->rt_nr_total++; | 285 | rt_rq->rt_nr_total++; |
283 | if (rt_se->nr_cpus_allowed > 1) | 286 | if (p->nr_cpus_allowed > 1) |
284 | rt_rq->rt_nr_migratory++; | 287 | rt_rq->rt_nr_migratory++; |
285 | 288 | ||
286 | update_rt_migration(rt_rq); | 289 | update_rt_migration(rt_rq); |
@@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
288 | 291 | ||
289 | static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | 292 | static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) |
290 | { | 293 | { |
294 | struct task_struct *p; | ||
295 | |||
291 | if (!rt_entity_is_task(rt_se)) | 296 | if (!rt_entity_is_task(rt_se)) |
292 | return; | 297 | return; |
293 | 298 | ||
299 | p = rt_task_of(rt_se); | ||
294 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; | 300 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; |
295 | 301 | ||
296 | rt_rq->rt_nr_total--; | 302 | rt_rq->rt_nr_total--; |
297 | if (rt_se->nr_cpus_allowed > 1) | 303 | if (p->nr_cpus_allowed > 1) |
298 | rt_rq->rt_nr_migratory--; | 304 | rt_rq->rt_nr_migratory--; |
299 | 305 | ||
300 | update_rt_migration(rt_rq); | 306 | update_rt_migration(rt_rq); |
@@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
1161 | 1167 | ||
1162 | enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); | 1168 | enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); |
1163 | 1169 | ||
1164 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) | 1170 | if (!task_current(rq, p) && p->nr_cpus_allowed > 1) |
1165 | enqueue_pushable_task(rq, p); | 1171 | enqueue_pushable_task(rq, p); |
1166 | 1172 | ||
1167 | inc_nr_running(rq); | 1173 | inc_nr_running(rq); |
@@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
1225 | 1231 | ||
1226 | cpu = task_cpu(p); | 1232 | cpu = task_cpu(p); |
1227 | 1233 | ||
1228 | if (p->rt.nr_cpus_allowed == 1) | 1234 | if (p->nr_cpus_allowed == 1) |
1229 | goto out; | 1235 | goto out; |
1230 | 1236 | ||
1231 | /* For anything but wake ups, just return the task_cpu */ | 1237 | /* For anything but wake ups, just return the task_cpu */ |
@@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
1260 | * will have to sort it out. | 1266 | * will have to sort it out. |
1261 | */ | 1267 | */ |
1262 | if (curr && unlikely(rt_task(curr)) && | 1268 | if (curr && unlikely(rt_task(curr)) && |
1263 | (curr->rt.nr_cpus_allowed < 2 || | 1269 | (curr->nr_cpus_allowed < 2 || |
1264 | curr->prio <= p->prio) && | 1270 | curr->prio <= p->prio) && |
1265 | (p->rt.nr_cpus_allowed > 1)) { | 1271 | (p->nr_cpus_allowed > 1)) { |
1266 | int target = find_lowest_rq(p); | 1272 | int target = find_lowest_rq(p); |
1267 | 1273 | ||
1268 | if (target != -1) | 1274 | if (target != -1) |
@@ -1276,10 +1282,10 @@ out: | |||
1276 | 1282 | ||
1277 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) | 1283 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) |
1278 | { | 1284 | { |
1279 | if (rq->curr->rt.nr_cpus_allowed == 1) | 1285 | if (rq->curr->nr_cpus_allowed == 1) |
1280 | return; | 1286 | return; |
1281 | 1287 | ||
1282 | if (p->rt.nr_cpus_allowed != 1 | 1288 | if (p->nr_cpus_allowed != 1 |
1283 | && cpupri_find(&rq->rd->cpupri, p, NULL)) | 1289 | && cpupri_find(&rq->rd->cpupri, p, NULL)) |
1284 | return; | 1290 | return; |
1285 | 1291 | ||
@@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
1395 | * The previous task needs to be made eligible for pushing | 1401 | * The previous task needs to be made eligible for pushing |
1396 | * if it is still active | 1402 | * if it is still active |
1397 | */ | 1403 | */ |
1398 | if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) | 1404 | if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) |
1399 | enqueue_pushable_task(rq, p); | 1405 | enqueue_pushable_task(rq, p); |
1400 | } | 1406 | } |
1401 | 1407 | ||
@@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) | |||
1408 | { | 1414 | { |
1409 | if (!task_running(rq, p) && | 1415 | if (!task_running(rq, p) && |
1410 | (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && | 1416 | (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && |
1411 | (p->rt.nr_cpus_allowed > 1)) | 1417 | (p->nr_cpus_allowed > 1)) |
1412 | return 1; | 1418 | return 1; |
1413 | return 0; | 1419 | return 0; |
1414 | } | 1420 | } |
@@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task) | |||
1464 | if (unlikely(!lowest_mask)) | 1470 | if (unlikely(!lowest_mask)) |
1465 | return -1; | 1471 | return -1; |
1466 | 1472 | ||
1467 | if (task->rt.nr_cpus_allowed == 1) | 1473 | if (task->nr_cpus_allowed == 1) |
1468 | return -1; /* No other targets possible */ | 1474 | return -1; /* No other targets possible */ |
1469 | 1475 | ||
1470 | if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) | 1476 | if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) |
@@ -1556,7 +1562,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | |||
1556 | task_running(rq, task) || | 1562 | task_running(rq, task) || |
1557 | !task->on_rq)) { | 1563 | !task->on_rq)) { |
1558 | 1564 | ||
1559 | raw_spin_unlock(&lowest_rq->lock); | 1565 | double_unlock_balance(rq, lowest_rq); |
1560 | lowest_rq = NULL; | 1566 | lowest_rq = NULL; |
1561 | break; | 1567 | break; |
1562 | } | 1568 | } |
@@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) | |||
1586 | 1592 | ||
1587 | BUG_ON(rq->cpu != task_cpu(p)); | 1593 | BUG_ON(rq->cpu != task_cpu(p)); |
1588 | BUG_ON(task_current(rq, p)); | 1594 | BUG_ON(task_current(rq, p)); |
1589 | BUG_ON(p->rt.nr_cpus_allowed <= 1); | 1595 | BUG_ON(p->nr_cpus_allowed <= 1); |
1590 | 1596 | ||
1591 | BUG_ON(!p->on_rq); | 1597 | BUG_ON(!p->on_rq); |
1592 | BUG_ON(!rt_task(p)); | 1598 | BUG_ON(!rt_task(p)); |
@@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
1793 | if (!task_running(rq, p) && | 1799 | if (!task_running(rq, p) && |
1794 | !test_tsk_need_resched(rq->curr) && | 1800 | !test_tsk_need_resched(rq->curr) && |
1795 | has_pushable_tasks(rq) && | 1801 | has_pushable_tasks(rq) && |
1796 | p->rt.nr_cpus_allowed > 1 && | 1802 | p->nr_cpus_allowed > 1 && |
1797 | rt_task(rq->curr) && | 1803 | rt_task(rq->curr) && |
1798 | (rq->curr->rt.nr_cpus_allowed < 2 || | 1804 | (rq->curr->nr_cpus_allowed < 2 || |
1799 | rq->curr->prio <= p->prio)) | 1805 | rq->curr->prio <= p->prio)) |
1800 | push_rt_tasks(rq); | 1806 | push_rt_tasks(rq); |
1801 | } | 1807 | } |
@@ -1803,44 +1809,40 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
1803 | static void set_cpus_allowed_rt(struct task_struct *p, | 1809 | static void set_cpus_allowed_rt(struct task_struct *p, |
1804 | const struct cpumask *new_mask) | 1810 | const struct cpumask *new_mask) |
1805 | { | 1811 | { |
1806 | int weight = cpumask_weight(new_mask); | 1812 | struct rq *rq; |
1813 | int weight; | ||
1807 | 1814 | ||
1808 | BUG_ON(!rt_task(p)); | 1815 | BUG_ON(!rt_task(p)); |
1809 | 1816 | ||
1810 | /* | 1817 | if (!p->on_rq) |
1811 | * Update the migration status of the RQ if we have an RT task | 1818 | return; |
1812 | * which is running AND changing its weight value. | ||
1813 | */ | ||
1814 | if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) { | ||
1815 | struct rq *rq = task_rq(p); | ||
1816 | 1819 | ||
1817 | if (!task_current(rq, p)) { | 1820 | weight = cpumask_weight(new_mask); |
1818 | /* | ||
1819 | * Make sure we dequeue this task from the pushable list | ||
1820 | * before going further. It will either remain off of | ||
1821 | * the list because we are no longer pushable, or it | ||
1822 | * will be requeued. | ||
1823 | */ | ||
1824 | if (p->rt.nr_cpus_allowed > 1) | ||
1825 | dequeue_pushable_task(rq, p); | ||
1826 | 1821 | ||
1827 | /* | 1822 | /* |
1828 | * Requeue if our weight is changing and still > 1 | 1823 | * Only update if the process changes its state from whether it |
1829 | */ | 1824 | * can migrate or not. |
1830 | if (weight > 1) | 1825 | */ |
1831 | enqueue_pushable_task(rq, p); | 1826 | if ((p->nr_cpus_allowed > 1) == (weight > 1)) |
1832 | 1827 | return; | |
1833 | } | ||
1834 | 1828 | ||
1835 | if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { | 1829 | rq = task_rq(p); |
1836 | rq->rt.rt_nr_migratory++; | ||
1837 | } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { | ||
1838 | BUG_ON(!rq->rt.rt_nr_migratory); | ||
1839 | rq->rt.rt_nr_migratory--; | ||
1840 | } | ||
1841 | 1830 | ||
1842 | update_rt_migration(&rq->rt); | 1831 | /* |
1832 | * The process used to be able to migrate OR it can now migrate | ||
1833 | */ | ||
1834 | if (weight <= 1) { | ||
1835 | if (!task_current(rq, p)) | ||
1836 | dequeue_pushable_task(rq, p); | ||
1837 | BUG_ON(!rq->rt.rt_nr_migratory); | ||
1838 | rq->rt.rt_nr_migratory--; | ||
1839 | } else { | ||
1840 | if (!task_current(rq, p)) | ||
1841 | enqueue_pushable_task(rq, p); | ||
1842 | rq->rt.rt_nr_migratory++; | ||
1843 | } | 1843 | } |
1844 | |||
1845 | update_rt_migration(&rq->rt); | ||
1844 | } | 1846 | } |
1845 | 1847 | ||
1846 | /* Assumes rq->lock is held */ | 1848 | /* Assumes rq->lock is held */ |
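The rewritten set_cpus_allowed_rt() above returns early unless the task crosses the one-CPU boundary, because the test (p->nr_cpus_allowed > 1) == (weight > 1) holds exactly when migratability is unchanged. A tiny self-contained check of that condition (values are illustrative):

#include <stdio.h>

/* 1 when the old and the new affinity agree on "can this task migrate at all?". */
static int migratability_unchanged(int old_nr_allowed, int new_weight)
{
	return (old_nr_allowed > 1) == (new_weight > 1);
}

int main(void)
{
	printf("%d\n", migratability_unchanged(1, 1));	/* 1: still pinned      */
	printf("%d\n", migratability_unchanged(1, 4));	/* 0: becomes migratory */
	printf("%d\n", migratability_unchanged(4, 2));	/* 1: still migratory   */
	printf("%d\n", migratability_unchanged(4, 1));	/* 0: becomes pinned    */
	return 0;
}

Skipping the unchanged cases means rt_nr_migratory and the pushable-task list are only touched when the answer to "can this task migrate?" actually flips.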
@@ -1983,6 +1985,8 @@ static void watchdog(struct rq *rq, struct task_struct *p) | |||
1983 | 1985 | ||
1984 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | 1986 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) |
1985 | { | 1987 | { |
1988 | struct sched_rt_entity *rt_se = &p->rt; | ||
1989 | |||
1986 | update_curr_rt(rq); | 1990 | update_curr_rt(rq); |
1987 | 1991 | ||
1988 | watchdog(rq, p); | 1992 | watchdog(rq, p); |
@@ -2000,12 +2004,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | |||
2000 | p->rt.time_slice = RR_TIMESLICE; | 2004 | p->rt.time_slice = RR_TIMESLICE; |
2001 | 2005 | ||
2002 | /* | 2006 | /* |
2003 | * Requeue to the end of queue if we are not the only element | 2007 | * Requeue to the end of queue if we (or any of our ancestors) are |
2004 | * on the queue: | 2008 | * not the only element on the queue |
2005 | */ | 2009 | */ |
2006 | if (p->rt.run_list.prev != p->rt.run_list.next) { | 2010 | for_each_sched_rt_entity(rt_se) { |
2007 | requeue_task_rt(rq, p, 0); | 2011 | if (rt_se->run_list.prev != rt_se->run_list.next) { |
2008 | set_tsk_need_resched(p); | 2012 | requeue_task_rt(rq, p, 0); |
2013 | set_tsk_need_resched(p); | ||
2014 | return; | ||
2015 | } | ||
2009 | } | 2016 | } |
2010 | } | 2017 | } |
2011 | 2018 | ||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fb3acba4d52e..6d52cea7f33d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -201,7 +201,7 @@ struct cfs_bandwidth { }; | |||
201 | /* CFS-related fields in a runqueue */ | 201 | /* CFS-related fields in a runqueue */ |
202 | struct cfs_rq { | 202 | struct cfs_rq { |
203 | struct load_weight load; | 203 | struct load_weight load; |
204 | unsigned long nr_running, h_nr_running; | 204 | unsigned int nr_running, h_nr_running; |
205 | 205 | ||
206 | u64 exec_clock; | 206 | u64 exec_clock; |
207 | u64 min_vruntime; | 207 | u64 min_vruntime; |
@@ -279,7 +279,7 @@ static inline int rt_bandwidth_enabled(void) | |||
279 | /* Real-Time classes' related field in a runqueue: */ | 279 | /* Real-Time classes' related field in a runqueue: */ |
280 | struct rt_rq { | 280 | struct rt_rq { |
281 | struct rt_prio_array active; | 281 | struct rt_prio_array active; |
282 | unsigned long rt_nr_running; | 282 | unsigned int rt_nr_running; |
283 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED | 283 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED |
284 | struct { | 284 | struct { |
285 | int curr; /* highest queued rt task prio */ | 285 | int curr; /* highest queued rt task prio */ |
@@ -353,7 +353,7 @@ struct rq { | |||
353 | * nr_running and cpu_load should be in the same cacheline because | 353 | * nr_running and cpu_load should be in the same cacheline because |
354 | * remote CPUs use both these fields when doing load calculation. | 354 | * remote CPUs use both these fields when doing load calculation. |
355 | */ | 355 | */ |
356 | unsigned long nr_running; | 356 | unsigned int nr_running; |
357 | #define CPU_LOAD_IDX_MAX 5 | 357 | #define CPU_LOAD_IDX_MAX 5 |
358 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 358 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
359 | unsigned long last_load_update_tick; | 359 | unsigned long last_load_update_tick; |
@@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) | |||
526 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); | 526 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); |
527 | DECLARE_PER_CPU(int, sd_llc_id); | 527 | DECLARE_PER_CPU(int, sd_llc_id); |
528 | 528 | ||
529 | extern int group_balance_cpu(struct sched_group *sg); | ||
530 | |||
529 | #endif /* CONFIG_SMP */ | 531 | #endif /* CONFIG_SMP */ |
530 | 532 | ||
531 | #include "stats.h" | 533 | #include "stats.h" |
@@ -876,7 +878,7 @@ extern void resched_cpu(int cpu); | |||
876 | extern struct rt_bandwidth def_rt_bandwidth; | 878 | extern struct rt_bandwidth def_rt_bandwidth; |
877 | extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); | 879 | extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); |
878 | 880 | ||
879 | extern void update_cpu_load(struct rq *this_rq); | 881 | extern void update_idle_cpu_load(struct rq *this_rq); |
880 | 882 | ||
881 | #ifdef CONFIG_CGROUP_CPUACCT | 883 | #ifdef CONFIG_CGROUP_CPUACCT |
882 | #include <linux/cgroup.h> | 884 | #include <linux/cgroup.h> |