aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorPeter Zijlstra <peterz@infradead.org>2013-10-07 06:29:33 -0400
committerIngo Molnar <mingo@kernel.org>2013-10-09 08:48:10 -0400
commit0ec8aa00f2b4dc457836ef4e2662b02483e94fb7 (patch)
treeffd621a5f639a10b1a7213892649e512ea0ee1ba /kernel
parentca28aa53dd95868c9e38917b9881c09dacfacf1a (diff)
sched/numa: Avoid migrating tasks that are placed on their preferred node
This patch classifies scheduler domains and runqueues into types depending on the number of tasks that care about their NUMA placement and the number that are currently running on their preferred node. The types are: regular: There are tasks running that do not care about their NUMA placement; remote: There are tasks running that care about their placement but are currently running on a node remote to their ideal placement; all: No distinction. To implement this, the patch tracks the number of tasks that are optimally NUMA placed (rq->nr_preferred_running) and the number of tasks running that care about their placement (nr_numa_running). The load balancer uses this information to avoid migrating ideally placed NUMA tasks as long as better options for load balancing exist. For example, it will not consider balancing between a group whose tasks are all perfectly placed and a group with remote tasks. Signed-off-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Mel Gorman <mgorman@suse.de> Reviewed-by: Rik van Riel <riel@redhat.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Link: http://lkml.kernel.org/r/1381141781-10992-56-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/sched/core.c29
-rw-r--r--kernel/sched/fair.c120
-rw-r--r--kernel/sched/sched.h5
3 files changed, 142 insertions, 12 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3e2c893df173..8cfd51f62241 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4468,6 +4468,35 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
4468 4468
4469 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); 4469 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
4470} 4470}
4471
4472/*
4473 * Requeue a task on a given node and accurately track the number of NUMA
4474 * tasks on the runqueues
4475 */
4476void sched_setnuma(struct task_struct *p, int nid)
4477{
4478 struct rq *rq;
4479 unsigned long flags;
4480 bool on_rq, running;
4481
4482 rq = task_rq_lock(p, &flags);
4483 on_rq = p->on_rq;
4484 running = task_current(rq, p);
4485
4486 if (on_rq)
4487 dequeue_task(rq, p, 0);
4488 if (running)
4489 p->sched_class->put_prev_task(rq, p);
4490
4491 p->numa_preferred_nid = nid;
4492 p->numa_migrate_seq = 1;
4493
4494 if (running)
4495 p->sched_class->set_curr_task(rq);
4496 if (on_rq)
4497 enqueue_task(rq, p, 0);
4498 task_rq_unlock(rq, p, &flags);
4499}
4471#endif 4500#endif
4472 4501
4473/* 4502/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 423316cdee07..5166b9b1af70 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -888,6 +888,18 @@ static unsigned int task_scan_max(struct task_struct *p)
888 */ 888 */
889unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4; 889unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
890 890
891static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
892{
893 rq->nr_numa_running += (p->numa_preferred_nid != -1);
894 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
895}
896
897static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
898{
899 rq->nr_numa_running -= (p->numa_preferred_nid != -1);
900 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
901}
902
891struct numa_group { 903struct numa_group {
892 atomic_t refcount; 904 atomic_t refcount;
893 905
@@ -1227,6 +1239,8 @@ static int task_numa_migrate(struct task_struct *p)
1227 if (env.best_cpu == -1) 1239 if (env.best_cpu == -1)
1228 return -EAGAIN; 1240 return -EAGAIN;
1229 1241
1242 sched_setnuma(p, env.dst_nid);
1243
1230 if (env.best_task == NULL) { 1244 if (env.best_task == NULL) {
1231 int ret = migrate_task_to(p, env.best_cpu); 1245 int ret = migrate_task_to(p, env.best_cpu);
1232 return ret; 1246 return ret;
@@ -1342,8 +1356,7 @@ static void task_numa_placement(struct task_struct *p)
1342 /* Preferred node as the node with the most faults */ 1356 /* Preferred node as the node with the most faults */
1343 if (max_faults && max_nid != p->numa_preferred_nid) { 1357 if (max_faults && max_nid != p->numa_preferred_nid) {
1344 /* Update the preferred nid and migrate task if possible */ 1358 /* Update the preferred nid and migrate task if possible */
1345 p->numa_preferred_nid = max_nid; 1359 sched_setnuma(p, max_nid);
1346 p->numa_migrate_seq = 1;
1347 numa_migrate_preferred(p); 1360 numa_migrate_preferred(p);
1348 } 1361 }
1349} 1362}
@@ -1741,6 +1754,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
1741static void task_tick_numa(struct rq *rq, struct task_struct *curr) 1754static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1742{ 1755{
1743} 1756}
1757
1758static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1759{
1760}
1761
1762static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1763{
1764}
1744#endif /* CONFIG_NUMA_BALANCING */ 1765#endif /* CONFIG_NUMA_BALANCING */
1745 1766
1746static void 1767static void
@@ -1750,8 +1771,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1750 if (!parent_entity(se)) 1771 if (!parent_entity(se))
1751 update_load_add(&rq_of(cfs_rq)->load, se->load.weight); 1772 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
1752#ifdef CONFIG_SMP 1773#ifdef CONFIG_SMP
1753 if (entity_is_task(se)) 1774 if (entity_is_task(se)) {
1754 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); 1775 struct rq *rq = rq_of(cfs_rq);
1776
1777 account_numa_enqueue(rq, task_of(se));
1778 list_add(&se->group_node, &rq->cfs_tasks);
1779 }
1755#endif 1780#endif
1756 cfs_rq->nr_running++; 1781 cfs_rq->nr_running++;
1757} 1782}
@@ -1762,8 +1787,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1762 update_load_sub(&cfs_rq->load, se->load.weight); 1787 update_load_sub(&cfs_rq->load, se->load.weight);
1763 if (!parent_entity(se)) 1788 if (!parent_entity(se))
1764 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); 1789 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
1765 if (entity_is_task(se)) 1790 if (entity_is_task(se)) {
1791 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
1766 list_del_init(&se->group_node); 1792 list_del_init(&se->group_node);
1793 }
1767 cfs_rq->nr_running--; 1794 cfs_rq->nr_running--;
1768} 1795}
1769 1796
@@ -4605,6 +4632,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
4605 4632
4606static unsigned long __read_mostly max_load_balance_interval = HZ/10; 4633static unsigned long __read_mostly max_load_balance_interval = HZ/10;
4607 4634
4635enum fbq_type { regular, remote, all };
4636
4608#define LBF_ALL_PINNED 0x01 4637#define LBF_ALL_PINNED 0x01
4609#define LBF_NEED_BREAK 0x02 4638#define LBF_NEED_BREAK 0x02
4610#define LBF_DST_PINNED 0x04 4639#define LBF_DST_PINNED 0x04
@@ -4631,6 +4660,8 @@ struct lb_env {
4631 unsigned int loop; 4660 unsigned int loop;
4632 unsigned int loop_break; 4661 unsigned int loop_break;
4633 unsigned int loop_max; 4662 unsigned int loop_max;
4663
4664 enum fbq_type fbq_type;
4634}; 4665};
4635 4666
4636/* 4667/*
@@ -5092,6 +5123,10 @@ struct sg_lb_stats {
5092 unsigned int group_weight; 5123 unsigned int group_weight;
5093 int group_imb; /* Is there an imbalance in the group ? */ 5124 int group_imb; /* Is there an imbalance in the group ? */
5094 int group_has_capacity; /* Is there extra capacity in the group? */ 5125 int group_has_capacity; /* Is there extra capacity in the group? */
5126#ifdef CONFIG_NUMA_BALANCING
5127 unsigned int nr_numa_running;
5128 unsigned int nr_preferred_running;
5129#endif
5095}; 5130};
5096 5131
5097/* 5132/*
@@ -5409,6 +5444,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5409 5444
5410 sgs->group_load += load; 5445 sgs->group_load += load;
5411 sgs->sum_nr_running += nr_running; 5446 sgs->sum_nr_running += nr_running;
5447#ifdef CONFIG_NUMA_BALANCING
5448 sgs->nr_numa_running += rq->nr_numa_running;
5449 sgs->nr_preferred_running += rq->nr_preferred_running;
5450#endif
5412 sgs->sum_weighted_load += weighted_cpuload(i); 5451 sgs->sum_weighted_load += weighted_cpuload(i);
5413 if (idle_cpu(i)) 5452 if (idle_cpu(i))
5414 sgs->idle_cpus++; 5453 sgs->idle_cpus++;
@@ -5474,14 +5513,43 @@ static bool update_sd_pick_busiest(struct lb_env *env,
5474 return false; 5513 return false;
5475} 5514}
5476 5515
5516#ifdef CONFIG_NUMA_BALANCING
5517static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5518{
5519 if (sgs->sum_nr_running > sgs->nr_numa_running)
5520 return regular;
5521 if (sgs->sum_nr_running > sgs->nr_preferred_running)
5522 return remote;
5523 return all;
5524}
5525
5526static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5527{
5528 if (rq->nr_running > rq->nr_numa_running)
5529 return regular;
5530 if (rq->nr_running > rq->nr_preferred_running)
5531 return remote;
5532 return all;
5533}
5534#else
5535static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5536{
5537 return all;
5538}
5539
5540static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5541{
5542 return regular;
5543}
5544#endif /* CONFIG_NUMA_BALANCING */
5545
5477/** 5546/**
5478 * update_sd_lb_stats - Update sched_domain's statistics for load balancing. 5547 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
5479 * @env: The load balancing environment. 5548 * @env: The load balancing environment.
5480 * @balance: Should we balance. 5549 * @balance: Should we balance.
5481 * @sds: variable to hold the statistics for this sched_domain. 5550 * @sds: variable to hold the statistics for this sched_domain.
5482 */ 5551 */
5483static inline void update_sd_lb_stats(struct lb_env *env, 5552static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
5484 struct sd_lb_stats *sds)
5485{ 5553{
5486 struct sched_domain *child = env->sd->child; 5554 struct sched_domain *child = env->sd->child;
5487 struct sched_group *sg = env->sd->groups; 5555 struct sched_group *sg = env->sd->groups;
@@ -5538,6 +5606,9 @@ next_group:
5538 5606
5539 sg = sg->next; 5607 sg = sg->next;
5540 } while (sg != env->sd->groups); 5608 } while (sg != env->sd->groups);
5609
5610 if (env->sd->flags & SD_NUMA)
5611 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
5541} 5612}
5542 5613
5543/** 5614/**
@@ -5841,15 +5912,39 @@ static struct rq *find_busiest_queue(struct lb_env *env,
5841 int i; 5912 int i;
5842 5913
5843 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 5914 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
5844 unsigned long power = power_of(i); 5915 unsigned long power, capacity, wl;
5845 unsigned long capacity = DIV_ROUND_CLOSEST(power, 5916 enum fbq_type rt;
5846 SCHED_POWER_SCALE); 5917
5847 unsigned long wl; 5918 rq = cpu_rq(i);
5919 rt = fbq_classify_rq(rq);
5848 5920
5921 /*
5922 * We classify groups/runqueues into three groups:
5923 * - regular: there are !numa tasks
5924 * - remote: there are numa tasks that run on the 'wrong' node
5925 * - all: there is no distinction
5926 *
5927 * In order to avoid migrating ideally placed numa tasks,
5928 * ignore those when there's better options.
5929 *
5930 * If we ignore the actual busiest queue to migrate another
5931 * task, the next balance pass can still reduce the busiest
5932 * queue by moving tasks around inside the node.
5933 *
5934 * If we cannot move enough load due to this classification
5935 * the next pass will adjust the group classification and
5936 * allow migration of more tasks.
5937 *
5938 * Both cases only affect the total convergence complexity.
5939 */
5940 if (rt > env->fbq_type)
5941 continue;
5942
5943 power = power_of(i);
5944 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
5849 if (!capacity) 5945 if (!capacity)
5850 capacity = fix_small_capacity(env->sd, group); 5946 capacity = fix_small_capacity(env->sd, group);
5851 5947
5852 rq = cpu_rq(i);
5853 wl = weighted_cpuload(i); 5948 wl = weighted_cpuload(i);
5854 5949
5855 /* 5950 /*
@@ -5966,6 +6061,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5966 .idle = idle, 6061 .idle = idle,
5967 .loop_break = sched_nr_migrate_break, 6062 .loop_break = sched_nr_migrate_break,
5968 .cpus = cpus, 6063 .cpus = cpus,
6064 .fbq_type = all,
5969 }; 6065 };
5970 6066
5971 /* 6067 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index eeb1923812a1..d69cb325c27e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -409,6 +409,10 @@ struct rq {
409 * remote CPUs use both these fields when doing load calculation. 409 * remote CPUs use both these fields when doing load calculation.
410 */ 410 */
411 unsigned int nr_running; 411 unsigned int nr_running;
412#ifdef CONFIG_NUMA_BALANCING
413 unsigned int nr_numa_running;
414 unsigned int nr_preferred_running;
415#endif
412 #define CPU_LOAD_IDX_MAX 5 416 #define CPU_LOAD_IDX_MAX 5
413 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 417 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
414 unsigned long last_load_update_tick; 418 unsigned long last_load_update_tick;
@@ -557,6 +561,7 @@ static inline u64 rq_clock_task(struct rq *rq)
557} 561}
558 562
559#ifdef CONFIG_NUMA_BALANCING 563#ifdef CONFIG_NUMA_BALANCING
564extern void sched_setnuma(struct task_struct *p, int node);
560extern int migrate_task_to(struct task_struct *p, int cpu); 565extern int migrate_task_to(struct task_struct *p, int cpu);
561extern int migrate_swap(struct task_struct *, struct task_struct *); 566extern int migrate_swap(struct task_struct *, struct task_struct *);
562#endif /* CONFIG_NUMA_BALANCING */ 567#endif /* CONFIG_NUMA_BALANCING */