 kernel/sched/core.c  |  29
 kernel/sched/fair.c  | 120
 kernel/sched/sched.h |   5
 3 files changed, 142 insertions(+), 12 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3e2c893df173..8cfd51f62241 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4468,6 +4468,35 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
 
 	return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
 }
+
+/*
+ * Requeue a task on a given node and accurately track the number of NUMA
+ * tasks on the runqueues
+ */
+void sched_setnuma(struct task_struct *p, int nid)
+{
+	struct rq *rq;
+	unsigned long flags;
+	bool on_rq, running;
+
+	rq = task_rq_lock(p, &flags);
+	on_rq = p->on_rq;
+	running = task_current(rq, p);
+
+	if (on_rq)
+		dequeue_task(rq, p, 0);
+	if (running)
+		p->sched_class->put_prev_task(rq, p);
+
+	p->numa_preferred_nid = nid;
+	p->numa_migrate_seq = 1;
+
+	if (running)
+		p->sched_class->set_curr_task(rq);
+	if (on_rq)
+		enqueue_task(rq, p, 0);
+	task_rq_unlock(rq, p, &flags);
+}
 #endif
 
 /*
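Side note (not part of the patch): sched_setnuma() changes ->numa_preferred_nid only while the task is dequeued, so the account_numa_enqueue()/account_numa_dequeue() hooks added in fair.c below always subtract exactly what they previously added. The following is a minimal userspace sketch of that invariant; the toy_* structs and helpers are simplified stand-ins for illustration, not the kernel API.

/* Toy model of why sched_setnuma() dequeues before changing the nid:
 * the enqueue/dequeue accounting reads numa_preferred_nid, so dequeue
 * must see the same value that enqueue saw. All names here are
 * simplified stand-ins, not the kernel API. */
#include <assert.h>
#include <stdio.h>

struct toy_rq   { unsigned int nr_numa_running, nr_preferred_running; };
struct toy_task { int numa_preferred_nid; int node; int queued; };

static void toy_enqueue(struct toy_rq *rq, struct toy_task *p)
{
	rq->nr_numa_running      += (p->numa_preferred_nid != -1);
	rq->nr_preferred_running += (p->numa_preferred_nid == p->node);
	p->queued = 1;
}

static void toy_dequeue(struct toy_rq *rq, struct toy_task *p)
{
	rq->nr_numa_running      -= (p->numa_preferred_nid != -1);
	rq->nr_preferred_running -= (p->numa_preferred_nid == p->node);
	p->queued = 0;
}

/* Mirrors the sched_setnuma() pattern: dequeue, modify, requeue. */
static void toy_setnuma(struct toy_rq *rq, struct toy_task *p, int nid)
{
	int queued = p->queued;

	if (queued)
		toy_dequeue(rq, p);
	p->numa_preferred_nid = nid;
	if (queued)
		toy_enqueue(rq, p);
}

int main(void)
{
	struct toy_rq rq = { 0, 0 };
	struct toy_task p = { .numa_preferred_nid = -1, .node = 0, .queued = 0 };

	toy_enqueue(&rq, &p);
	toy_setnuma(&rq, &p, 0);	/* now preferred and on its node */
	assert(rq.nr_numa_running == 1 && rq.nr_preferred_running == 1);

	toy_dequeue(&rq, &p);		/* counters drop back to zero */
	assert(rq.nr_numa_running == 0 && rq.nr_preferred_running == 0);
	printf("counters consistent\n");
	return 0;
}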
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 423316cdee07..5166b9b1af70 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -888,6 +888,18 @@ static unsigned int task_scan_max(struct task_struct *p)
  */
 unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
 
+static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+	rq->nr_numa_running += (p->numa_preferred_nid != -1);
+	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
+}
+
+static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
+}
+
 struct numa_group {
 	atomic_t refcount;
 
@@ -1227,6 +1239,8 @@ static int task_numa_migrate(struct task_struct *p)
 	if (env.best_cpu == -1)
 		return -EAGAIN;
 
+	sched_setnuma(p, env.dst_nid);
+
 	if (env.best_task == NULL) {
 		int ret = migrate_task_to(p, env.best_cpu);
 		return ret;
@@ -1342,8 +1356,7 @@ static void task_numa_placement(struct task_struct *p)
 	/* Preferred node as the node with the most faults */
 	if (max_faults && max_nid != p->numa_preferred_nid) {
 		/* Update the preferred nid and migrate task if possible */
-		p->numa_preferred_nid = max_nid;
-		p->numa_migrate_seq = 1;
+		sched_setnuma(p, max_nid);
 		numa_migrate_preferred(p);
 	}
 }
@@ -1741,6 +1754,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 {
 }
+
+static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 static void
@@ -1750,8 +1771,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	if (!parent_entity(se))
 		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
 #ifdef CONFIG_SMP
-	if (entity_is_task(se))
-		list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+	if (entity_is_task(se)) {
+		struct rq *rq = rq_of(cfs_rq);
+
+		account_numa_enqueue(rq, task_of(se));
+		list_add(&se->group_node, &rq->cfs_tasks);
+	}
 #endif
 	cfs_rq->nr_running++;
 }
@@ -1762,8 +1787,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	update_load_sub(&cfs_rq->load, se->load.weight);
 	if (!parent_entity(se))
 		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
-	if (entity_is_task(se))
+	if (entity_is_task(se)) {
+		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
 		list_del_init(&se->group_node);
+	}
 	cfs_rq->nr_running--;
 }
 
@@ -4605,6 +4632,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 
 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
+enum fbq_type { regular, remote, all };
+
 #define LBF_ALL_PINNED	0x01
 #define LBF_NEED_BREAK	0x02
 #define LBF_DST_PINNED	0x04
@@ -4631,6 +4660,8 @@ struct lb_env {
 	unsigned int		loop;
 	unsigned int		loop_break;
 	unsigned int		loop_max;
+
+	enum fbq_type		fbq_type;
 };
 
 /*
@@ -5092,6 +5123,10 @@ struct sg_lb_stats {
 	unsigned int group_weight;
 	int group_imb; /* Is there an imbalance in the group ? */
 	int group_has_capacity; /* Is there extra capacity in the group? */
+#ifdef CONFIG_NUMA_BALANCING
+	unsigned int nr_numa_running;
+	unsigned int nr_preferred_running;
+#endif
 };
 
 /*
@@ -5409,6 +5444,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
 		sgs->group_load += load;
 		sgs->sum_nr_running += nr_running;
+#ifdef CONFIG_NUMA_BALANCING
+		sgs->nr_numa_running += rq->nr_numa_running;
+		sgs->nr_preferred_running += rq->nr_preferred_running;
+#endif
 		sgs->sum_weighted_load += weighted_cpuload(i);
 		if (idle_cpu(i))
 			sgs->idle_cpus++;
@@ -5474,14 +5513,43 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 	return false;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+	if (sgs->sum_nr_running > sgs->nr_numa_running)
+		return regular;
+	if (sgs->sum_nr_running > sgs->nr_preferred_running)
+		return remote;
+	return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+	if (rq->nr_running > rq->nr_numa_running)
+		return regular;
+	if (rq->nr_running > rq->nr_preferred_running)
+		return remote;
+	return all;
+}
+#else
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+	return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+	return regular;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 /**
  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  * @env: The load balancing environment.
  * @balance: Should we balance.
  * @sds: variable to hold the statistics for this sched_domain.
  */
-static inline void update_sd_lb_stats(struct lb_env *env,
-				      struct sd_lb_stats *sds)
+static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
 	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
@@ -5538,6 +5606,9 @@ next_group:
 
 		sg = sg->next;
 	} while (sg != env->sd->groups);
+
+	if (env->sd->flags & SD_NUMA)
+		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
 }
 
 /**
@@ -5841,15 +5912,39 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 	int i;
 
 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
-		unsigned long power = power_of(i);
-		unsigned long capacity = DIV_ROUND_CLOSEST(power,
-							   SCHED_POWER_SCALE);
-		unsigned long wl;
+		unsigned long power, capacity, wl;
+		enum fbq_type rt;
+
+		rq = cpu_rq(i);
+		rt = fbq_classify_rq(rq);
 
+		/*
+		 * We classify groups/runqueues into three groups:
+		 *  - regular: there are !numa tasks
+		 *  - remote:  there are numa tasks that run on the 'wrong' node
+		 *  - all:     there is no distinction
+		 *
+		 * In order to avoid migrating ideally placed numa tasks,
+		 * ignore those when there's better options.
+		 *
+		 * If we ignore the actual busiest queue to migrate another
+		 * task, the next balance pass can still reduce the busiest
+		 * queue by moving tasks around inside the node.
+		 *
+		 * If we cannot move enough load due to this classification
+		 * the next pass will adjust the group classification and
+		 * allow migration of more tasks.
+		 *
+		 * Both cases only affect the total convergence complexity.
+		 */
+		if (rt > env->fbq_type)
+			continue;
+
+		power = power_of(i);
+		capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
 		if (!capacity)
 			capacity = fix_small_capacity(env->sd, group);
 
-		rq = cpu_rq(i);
 		wl = weighted_cpuload(i);
 
 		/*
@@ -5966,6 +6061,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.idle		= idle,
 		.loop_break	= sched_nr_migrate_break,
 		.cpus		= cpus,
+		.fbq_type	= all,
 	};
 
 	/*
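Side note (not part of the patch): the regular/remote/all classification above can be exercised in isolation. Below is a small standalone C sketch of the fbq_classify_rq() logic and the 'if (rt > env->fbq_type) continue;' filter used in find_busiest_queue(); the toy_rq struct and the sample values are illustrative only, not the kernel's data structures.

/* Standalone model of the fbq_type filter added to find_busiest_queue():
 * runqueues whose tasks are better placed than the busiest group as a
 * whole are skipped. Structs here are simplified stand-ins. */
#include <stdio.h>

enum fbq_type { regular, remote, all };

struct toy_rq {
	unsigned int nr_running;
	unsigned int nr_numa_running;      /* tasks with a preferred node */
	unsigned int nr_preferred_running; /* tasks running on that node  */
};

static enum fbq_type classify_rq(const struct toy_rq *rq)
{
	if (rq->nr_running > rq->nr_numa_running)
		return regular;	/* some tasks have no NUMA preference */
	if (rq->nr_running > rq->nr_preferred_running)
		return remote;	/* all NUMA tasks, some on the wrong node */
	return all;		/* every task is on its preferred node */
}

int main(void)
{
	/* Busiest group classified as 'remote': prefer pulling from
	 * runqueues that also hold misplaced or non-NUMA tasks. */
	enum fbq_type group_type = remote;

	struct toy_rq rqs[] = {
		{ .nr_running = 3, .nr_numa_running = 1, .nr_preferred_running = 1 }, /* regular */
		{ .nr_running = 2, .nr_numa_running = 2, .nr_preferred_running = 1 }, /* remote  */
		{ .nr_running = 2, .nr_numa_running = 2, .nr_preferred_running = 2 }, /* all     */
	};

	for (int i = 0; i < 3; i++) {
		enum fbq_type rt = classify_rq(&rqs[i]);

		/* Same test as the patch: skip rqs whose tasks are already
		 * ideally placed relative to the group. */
		if (rt > group_type) {
			printf("rq %d: type %d > %d, skipped\n", i, rt, group_type);
			continue;
		}
		printf("rq %d: type %d, candidate for balancing\n", i, rt);
	}
	return 0;
}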
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index eeb1923812a1..d69cb325c27e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -409,6 +409,10 @@ struct rq {
 	 * remote CPUs use both these fields when doing load calculation.
 	 */
 	unsigned int nr_running;
+#ifdef CONFIG_NUMA_BALANCING
+	unsigned int nr_numa_running;
+	unsigned int nr_preferred_running;
+#endif
 	#define CPU_LOAD_IDX_MAX 5
 	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 	unsigned long last_load_update_tick;
@@ -557,6 +561,7 @@ static inline u64 rq_clock_task(struct rq *rq)
 }
 
 #ifdef CONFIG_NUMA_BALANCING
+extern void sched_setnuma(struct task_struct *p, int node);
 extern int migrate_task_to(struct task_struct *p, int cpu);
 extern int migrate_swap(struct task_struct *, struct task_struct *);
 #endif /* CONFIG_NUMA_BALANCING */