Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--  kernel/sched/fair.c  109
1 file changed, 49 insertions(+), 60 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c7395d97e4cb..9b4c4f320130 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -872,15 +872,6 @@ static unsigned int task_scan_max(struct task_struct *p)
 	return max(smin, smax);
 }
 
-/*
- * Once a preferred node is selected the scheduler balancer will prefer moving
- * a task to that node for sysctl_numa_balancing_settle_count number of PTE
- * scans. This will give the process the chance to accumulate more faults on
- * the preferred node but still allow the scheduler to move the task again if
- * the nodes CPUs are overloaded.
- */
-unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
-
 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
 {
 	rq->nr_numa_running += (p->numa_preferred_nid != -1);
@@ -930,7 +921,8 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
 	if (!p->numa_group)
 		return 0;
 
-	return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1];
+	return p->numa_group->faults[task_faults_idx(nid, 0)] +
+		p->numa_group->faults[task_faults_idx(nid, 1)];
 }
 
 /*
@@ -1023,7 +1015,7 @@ struct task_numa_env {
 
 	struct numa_stats src_stats, dst_stats;
 
-	int imbalance_pct, idx;
+	int imbalance_pct;
 
 	struct task_struct *best_task;
 	long best_imp;
@@ -1211,7 +1203,7 @@ static int task_numa_migrate(struct task_struct *p)
 	 * elsewhere, so there is no point in (re)trying.
 	 */
 	if (unlikely(!sd)) {
-		p->numa_preferred_nid = cpu_to_node(task_cpu(p));
+		p->numa_preferred_nid = task_node(p);
 		return -EINVAL;
 	}
 
@@ -1258,11 +1250,15 @@ static int task_numa_migrate(struct task_struct *p)
 	p->numa_scan_period = task_scan_min(p);
 
 	if (env.best_task == NULL) {
-		int ret = migrate_task_to(p, env.best_cpu);
+		ret = migrate_task_to(p, env.best_cpu);
+		if (ret != 0)
+			trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
 		return ret;
 	}
 
 	ret = migrate_swap(p, env.best_task);
+	if (ret != 0)
+		trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
 	put_task_struct(env.best_task);
 	return ret;
 }
@@ -1278,7 +1274,7 @@ static void numa_migrate_preferred(struct task_struct *p)
 	p->numa_migrate_retry = jiffies + HZ;
 
 	/* Success if task is already running on preferred CPU */
-	if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid)
+	if (task_node(p) == p->numa_preferred_nid)
 		return;
 
 	/* Otherwise, try migrate to a CPU on the preferred node */
@@ -1350,7 +1346,6 @@ static void update_task_scan_period(struct task_struct *p,
 		 * scanning faster if shared accesses dominate as it may
 		 * simply bounce migrations uselessly
 		 */
-		period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
 		ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
 		diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
 	}
@@ -1762,6 +1757,8 @@ void task_numa_work(struct callback_head *work)
 			start = end;
 			if (pages <= 0)
 				goto out;
+
+			cond_resched();
 		} while (end != vma->vm_end);
 	}
 
@@ -2365,13 +2362,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
 		}
 		wakeup = 0;
 	} else {
-		/*
-		 * Task re-woke on same cpu (or else migrate_task_rq_fair()
-		 * would have made count negative); we must be careful to avoid
-		 * double-accounting blocked time after synchronizing decays.
-		 */
-		se->avg.last_runnable_update += __synchronize_entity_decay(se)
-							<< 20;
+		__synchronize_entity_decay(se);
 	}
 
 	/* migrated tasks did not contribute to our blocked load */
@@ -3923,7 +3914,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
 
-	if (!tg->parent || !wl)	/* the trivial, non-cgroup case */
+	if (!tg->parent)	/* the trivial, non-cgroup case */
 		return wl;
 
 	for_each_sched_entity(se) {
@@ -4101,12 +4092,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
  */
 static struct sched_group *
 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
-		  int this_cpu, int load_idx)
+		  int this_cpu, int sd_flag)
 {
 	struct sched_group *idlest = NULL, *group = sd->groups;
 	unsigned long min_load = ULONG_MAX, this_load = 0;
+	int load_idx = sd->forkexec_idx;
 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
+	if (sd_flag & SD_BALANCE_WAKE)
+		load_idx = sd->wake_idx;
+
 	do {
 		unsigned long load, avg_load;
 		int local_group;
@@ -4274,7 +4269,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 	}
 
 	while (sd) {
-		int load_idx = sd->forkexec_idx;
 		struct sched_group *group;
 		int weight;
 
@@ -4283,10 +4277,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 			continue;
 		}
 
-		if (sd_flag & SD_BALANCE_WAKE)
-			load_idx = sd->wake_idx;
-
-		group = find_idlest_group(sd, p, cpu, load_idx);
+		group = find_idlest_group(sd, p, cpu, sd_flag);
 		if (!group) {
 			sd = sd->child;
 			continue;
@@ -5512,7 +5503,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 			struct sched_group *group, int load_idx,
 			int local_group, struct sg_lb_stats *sgs)
 {
-	unsigned long nr_running;
 	unsigned long load;
 	int i;
 
@@ -5521,8 +5511,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
 
-		nr_running = rq->nr_running;
-
 		/* Bias balancing toward cpus of our domain */
 		if (local_group)
 			load = target_load(i, load_idx);
@@ -5530,7 +5518,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 			load = source_load(i, load_idx);
 
 		sgs->group_load += load;
-		sgs->sum_nr_running += nr_running;
+		sgs->sum_nr_running += rq->nr_running;
 #ifdef CONFIG_NUMA_BALANCING
 		sgs->nr_numa_running += rq->nr_numa_running;
 		sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -6521,7 +6509,7 @@ static struct {
 	unsigned long next_balance;     /* in jiffy units */
 } nohz ____cacheline_aligned;
 
-static inline int find_new_ilb(int call_cpu)
+static inline int find_new_ilb(void)
 {
 	int ilb = cpumask_first(nohz.idle_cpus_mask);
 
@@ -6536,13 +6524,13 @@ static inline int find_new_ilb(int call_cpu)
  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
  * CPU (if there is one).
  */
-static void nohz_balancer_kick(int cpu)
+static void nohz_balancer_kick(void)
 {
 	int ilb_cpu;
 
 	nohz.next_balance++;
 
-	ilb_cpu = find_new_ilb(cpu);
+	ilb_cpu = find_new_ilb();
 
 	if (ilb_cpu >= nr_cpu_ids)
 		return;
@@ -6652,10 +6640,10 @@ void update_max_interval(void)
  *
  * Balancing parameters are set up in init_sched_domains.
  */
-static void rebalance_domains(int cpu, enum cpu_idle_type idle)
+static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
 {
 	int continue_balancing = 1;
-	struct rq *rq = cpu_rq(cpu);
+	int cpu = rq->cpu;
 	unsigned long interval;
 	struct sched_domain *sd;
 	/* Earliest time when we have to do rebalance again */
@@ -6752,9 +6740,9 @@ out:
  * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
 * rebalancing for all the cpus for whom scheduler ticks are stopped.
 */
-static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
+static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 {
-	struct rq *this_rq = cpu_rq(this_cpu);
+	int this_cpu = this_rq->cpu;
 	struct rq *rq;
 	int balance_cpu;
 
@@ -6781,7 +6769,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
 		update_idle_cpu_load(rq);
 		raw_spin_unlock_irq(&rq->lock);
 
-		rebalance_domains(balance_cpu, CPU_IDLE);
+		rebalance_domains(rq, CPU_IDLE);
 
 		if (time_after(this_rq->next_balance, rq->next_balance))
 			this_rq->next_balance = rq->next_balance;
@@ -6800,14 +6788,14 @@ end:
  * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
  *   domain span are idle.
  */
-static inline int nohz_kick_needed(struct rq *rq, int cpu)
+static inline int nohz_kick_needed(struct rq *rq)
 {
 	unsigned long now = jiffies;
 	struct sched_domain *sd;
 	struct sched_group_power *sgp;
-	int nr_busy;
+	int nr_busy, cpu = rq->cpu;
 
-	if (unlikely(idle_cpu(cpu)))
+	if (unlikely(rq->idle_balance))
 		return 0;
 
 	/*
@@ -6856,7 +6844,7 @@ need_kick:
 	return 1;
 }
 #else
-static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
+static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
 #endif
 
 /*
@@ -6865,38 +6853,39 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
  */
 static void run_rebalance_domains(struct softirq_action *h)
 {
-	int this_cpu = smp_processor_id();
-	struct rq *this_rq = cpu_rq(this_cpu);
+	struct rq *this_rq = this_rq();
 	enum cpu_idle_type idle = this_rq->idle_balance ?
 						CPU_IDLE : CPU_NOT_IDLE;
 
-	rebalance_domains(this_cpu, idle);
+	rebalance_domains(this_rq, idle);
 
 	/*
 	 * If this cpu has a pending nohz_balance_kick, then do the
 	 * balancing on behalf of the other idle cpus whose ticks are
 	 * stopped.
 	 */
-	nohz_idle_balance(this_cpu, idle);
+	nohz_idle_balance(this_rq, idle);
 }
 
-static inline int on_null_domain(int cpu)
+static inline int on_null_domain(struct rq *rq)
 {
-	return !rcu_dereference_sched(cpu_rq(cpu)->sd);
+	return !rcu_dereference_sched(rq->sd);
 }
 
 /*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 */
-void trigger_load_balance(struct rq *rq, int cpu)
+void trigger_load_balance(struct rq *rq)
 {
 	/* Don't need to rebalance while attached to NULL domain */
-	if (time_after_eq(jiffies, rq->next_balance) &&
-	    likely(!on_null_domain(cpu)))
+	if (unlikely(on_null_domain(rq)))
+		return;
+
+	if (time_after_eq(jiffies, rq->next_balance))
 		raise_softirq(SCHED_SOFTIRQ);
 #ifdef CONFIG_NO_HZ_COMMON
-	if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
-		nohz_balancer_kick(cpu);
+	if (nohz_kick_needed(rq))
+		nohz_balancer_kick();
 #endif
 }
 
@@ -7012,15 +7001,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 	/*
-	 * Ensure the task's vruntime is normalized, so that when its
+	 * Ensure the task's vruntime is normalized, so that when it's
	 * switched back to the fair class the enqueue_entity(.flags=0) will
	 * do the right thing.
	 *
-	 * If it was on_rq, then the dequeue_entity(.flags=0) will already
-	 * have normalized the vruntime, if it was !on_rq, then only when
+	 * If it's on_rq, then the dequeue_entity(.flags=0) will already
+	 * have normalized the vruntime, if it's !on_rq, then only when
	 * the task is sleeping will it still have non-normalized vruntime.
	 */
-	if (!se->on_rq && p->state != TASK_RUNNING) {
+	if (!p->on_rq && p->state != TASK_RUNNING) {
		/*
		 * Fix up our vruntime so that the current sleep doesn't
		 * cause 'unlimited' sleep bonus.