Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--	kernel/sched/fair.c | 109
1 file changed, 49 insertions, 60 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c7395d97e4cb..9b4c4f320130 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -872,15 +872,6 @@ static unsigned int task_scan_max(struct task_struct *p)
 	return max(smin, smax);
 }
 
-/*
- * Once a preferred node is selected the scheduler balancer will prefer moving
- * a task to that node for sysctl_numa_balancing_settle_count number of PTE
- * scans. This will give the process the chance to accumulate more faults on
- * the preferred node but still allow the scheduler to move the task again if
- * the nodes CPUs are overloaded.
- */
-unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
-
 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
 {
 	rq->nr_numa_running += (p->numa_preferred_nid != -1);
@@ -930,7 +921,8 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
 	if (!p->numa_group)
 		return 0;
 
-	return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1];
+	return p->numa_group->faults[task_faults_idx(nid, 0)] +
+		p->numa_group->faults[task_faults_idx(nid, 1)];
 }
 
 /*
@@ -1023,7 +1015,7 @@ struct task_numa_env {
 
 	struct numa_stats src_stats, dst_stats;
 
-	int imbalance_pct, idx;
+	int imbalance_pct;
 
 	struct task_struct *best_task;
 	long best_imp;
@@ -1211,7 +1203,7 @@ static int task_numa_migrate(struct task_struct *p)
 	 * elsewhere, so there is no point in (re)trying.
 	 */
 	if (unlikely(!sd)) {
-		p->numa_preferred_nid = cpu_to_node(task_cpu(p));
+		p->numa_preferred_nid = task_node(p);
 		return -EINVAL;
 	}
 
@@ -1258,11 +1250,15 @@ static int task_numa_migrate(struct task_struct *p)
 	p->numa_scan_period = task_scan_min(p);
 
 	if (env.best_task == NULL) {
-		int ret = migrate_task_to(p, env.best_cpu);
+		ret = migrate_task_to(p, env.best_cpu);
+		if (ret != 0)
+			trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
 		return ret;
 	}
 
 	ret = migrate_swap(p, env.best_task);
+	if (ret != 0)
+		trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
 	put_task_struct(env.best_task);
 	return ret;
 }
@@ -1278,7 +1274,7 @@ static void numa_migrate_preferred(struct task_struct *p)
 	p->numa_migrate_retry = jiffies + HZ;
 
 	/* Success if task is already running on preferred CPU */
-	if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid)
+	if (task_node(p) == p->numa_preferred_nid)
 		return;
 
 	/* Otherwise, try migrate to a CPU on the preferred node */
@@ -1350,7 +1346,6 @@ static void update_task_scan_period(struct task_struct *p,
 		 * scanning faster if shared accesses dominate as it may
 		 * simply bounce migrations uselessly
 		 */
-		period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
 		ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
 		diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
 	}
@@ -1762,6 +1757,8 @@ void task_numa_work(struct callback_head *work)
 			start = end;
 			if (pages <= 0)
 				goto out;
+
+			cond_resched();
 		} while (end != vma->vm_end);
 	}
 
@@ -2365,13 +2362,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
 		}
 		wakeup = 0;
 	} else {
-		/*
-		 * Task re-woke on same cpu (or else migrate_task_rq_fair()
-		 * would have made count negative); we must be careful to avoid
-		 * double-accounting blocked time after synchronizing decays.
-		 */
-		se->avg.last_runnable_update += __synchronize_entity_decay(se)
-							<< 20;
+		__synchronize_entity_decay(se);
 	}
 
 	/* migrated tasks did not contribute to our blocked load */
@@ -3923,7 +3914,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
 
-	if (!tg->parent || !wl) /* the trivial, non-cgroup case */
+	if (!tg->parent) /* the trivial, non-cgroup case */
 		return wl;
 
 	for_each_sched_entity(se) {
@@ -4101,12 +4092,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
  */
 static struct sched_group *
 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
-		  int this_cpu, int load_idx)
+		  int this_cpu, int sd_flag)
 {
 	struct sched_group *idlest = NULL, *group = sd->groups;
 	unsigned long min_load = ULONG_MAX, this_load = 0;
+	int load_idx = sd->forkexec_idx;
 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
+	if (sd_flag & SD_BALANCE_WAKE)
+		load_idx = sd->wake_idx;
+
 	do {
 		unsigned long load, avg_load;
 		int local_group;
@@ -4274,7 +4269,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 	}
 
 	while (sd) {
-		int load_idx = sd->forkexec_idx;
 		struct sched_group *group;
 		int weight;
 
@@ -4283,10 +4277,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 			continue;
 		}
 
-		if (sd_flag & SD_BALANCE_WAKE)
-			load_idx = sd->wake_idx;
-
-		group = find_idlest_group(sd, p, cpu, load_idx);
+		group = find_idlest_group(sd, p, cpu, sd_flag);
 		if (!group) {
 			sd = sd->child;
 			continue;
@@ -5512,7 +5503,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 			struct sched_group *group, int load_idx,
 			int local_group, struct sg_lb_stats *sgs)
 {
-	unsigned long nr_running;
 	unsigned long load;
 	int i;
 
@@ -5521,8 +5511,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
 
-		nr_running = rq->nr_running;
-
 		/* Bias balancing toward cpus of our domain */
 		if (local_group)
 			load = target_load(i, load_idx);
@@ -5530,7 +5518,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 			load = source_load(i, load_idx);
 
 		sgs->group_load += load;
-		sgs->sum_nr_running += nr_running;
+		sgs->sum_nr_running += rq->nr_running;
 #ifdef CONFIG_NUMA_BALANCING
 		sgs->nr_numa_running += rq->nr_numa_running;
 		sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -6521,7 +6509,7 @@ static struct {
 	unsigned long next_balance;	/* in jiffy units */
 } nohz ____cacheline_aligned;
 
-static inline int find_new_ilb(int call_cpu)
+static inline int find_new_ilb(void)
 {
 	int ilb = cpumask_first(nohz.idle_cpus_mask);
 
@@ -6536,13 +6524,13 @@ static inline int find_new_ilb(int call_cpu)
  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
  * CPU (if there is one).
  */
-static void nohz_balancer_kick(int cpu)
+static void nohz_balancer_kick(void)
 {
 	int ilb_cpu;
 
 	nohz.next_balance++;
 
-	ilb_cpu = find_new_ilb(cpu);
+	ilb_cpu = find_new_ilb();
 
 	if (ilb_cpu >= nr_cpu_ids)
 		return;
@@ -6652,10 +6640,10 @@ void update_max_interval(void)
  *
 * Balancing parameters are set up in init_sched_domains.
  */
-static void rebalance_domains(int cpu, enum cpu_idle_type idle)
+static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
 {
 	int continue_balancing = 1;
-	struct rq *rq = cpu_rq(cpu);
+	int cpu = rq->cpu;
 	unsigned long interval;
 	struct sched_domain *sd;
 	/* Earliest time when we have to do rebalance again */
@@ -6752,9 +6740,9 @@ out:
  * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
  * rebalancing for all the cpus for whom scheduler ticks are stopped.
  */
-static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
+static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 {
-	struct rq *this_rq = cpu_rq(this_cpu);
+	int this_cpu = this_rq->cpu;
 	struct rq *rq;
 	int balance_cpu;
 
@@ -6781,7 +6769,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
 		update_idle_cpu_load(rq);
 		raw_spin_unlock_irq(&rq->lock);
 
-		rebalance_domains(balance_cpu, CPU_IDLE);
+		rebalance_domains(rq, CPU_IDLE);
 
 		if (time_after(this_rq->next_balance, rq->next_balance))
 			this_rq->next_balance = rq->next_balance;
@@ -6800,14 +6788,14 @@ end:
  * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
  *   domain span are idle.
  */
-static inline int nohz_kick_needed(struct rq *rq, int cpu)
+static inline int nohz_kick_needed(struct rq *rq)
 {
 	unsigned long now = jiffies;
 	struct sched_domain *sd;
 	struct sched_group_power *sgp;
-	int nr_busy;
+	int nr_busy, cpu = rq->cpu;
 
-	if (unlikely(idle_cpu(cpu)))
+	if (unlikely(rq->idle_balance))
 		return 0;
 
 	/*
@@ -6856,7 +6844,7 @@ need_kick:
 	return 1;
 }
 #else
-static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
+static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
 #endif
 
 /*
@@ -6865,38 +6853,39 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
  */
 static void run_rebalance_domains(struct softirq_action *h)
 {
-	int this_cpu = smp_processor_id();
-	struct rq *this_rq = cpu_rq(this_cpu);
+	struct rq *this_rq = this_rq();
 	enum cpu_idle_type idle = this_rq->idle_balance ?
 						CPU_IDLE : CPU_NOT_IDLE;
 
-	rebalance_domains(this_cpu, idle);
+	rebalance_domains(this_rq, idle);
 
 	/*
 	 * If this cpu has a pending nohz_balance_kick, then do the
 	 * balancing on behalf of the other idle cpus whose ticks are
 	 * stopped.
 	 */
-	nohz_idle_balance(this_cpu, idle);
+	nohz_idle_balance(this_rq, idle);
 }
 
-static inline int on_null_domain(int cpu)
+static inline int on_null_domain(struct rq *rq)
 {
-	return !rcu_dereference_sched(cpu_rq(cpu)->sd);
+	return !rcu_dereference_sched(rq->sd);
 }
 
 /*
  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
  */
-void trigger_load_balance(struct rq *rq, int cpu)
+void trigger_load_balance(struct rq *rq)
 {
 	/* Don't need to rebalance while attached to NULL domain */
-	if (time_after_eq(jiffies, rq->next_balance) &&
-	    likely(!on_null_domain(cpu)))
+	if (unlikely(on_null_domain(rq)))
+		return;
+
+	if (time_after_eq(jiffies, rq->next_balance))
 		raise_softirq(SCHED_SOFTIRQ);
 #ifdef CONFIG_NO_HZ_COMMON
-	if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
-		nohz_balancer_kick(cpu);
+	if (nohz_kick_needed(rq))
+		nohz_balancer_kick();
 #endif
 }
 
@@ -7012,15 +7001,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 	/*
-	 * Ensure the task's vruntime is normalized, so that when its
+	 * Ensure the task's vruntime is normalized, so that when it's
 	 * switched back to the fair class the enqueue_entity(.flags=0) will
 	 * do the right thing.
 	 *
-	 * If it was on_rq, then the dequeue_entity(.flags=0) will already
-	 * have normalized the vruntime, if it was !on_rq, then only when
+	 * If it's on_rq, then the dequeue_entity(.flags=0) will already
+	 * have normalized the vruntime, if it's !on_rq, then only when
 	 * the task is sleeping will it still have non-normalized vruntime.
 	 */
-	if (!se->on_rq && p->state != TASK_RUNNING) {
+	if (!p->on_rq && p->state != TASK_RUNNING) {
 		/*
 		 * Fix up our vruntime so that the current sleep doesn't
 		 * cause 'unlimited' sleep bonus.