-rw-r--r--	kernel/sched/core.c	29
-rw-r--r--	kernel/sched/fair.c	120
-rw-r--r--	kernel/sched/sched.h	5
3 files changed, 142 insertions, 12 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3e2c893df173..8cfd51f62241 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4468,6 +4468,35 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
 
 	return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
 }
+
+/*
+ * Requeue a task on a given node and accurately track the number of NUMA
+ * tasks on the runqueues
+ */
+void sched_setnuma(struct task_struct *p, int nid)
+{
+	struct rq *rq;
+	unsigned long flags;
+	bool on_rq, running;
+
+	rq = task_rq_lock(p, &flags);
+	on_rq = p->on_rq;
+	running = task_current(rq, p);
+
+	if (on_rq)
+		dequeue_task(rq, p, 0);
+	if (running)
+		p->sched_class->put_prev_task(rq, p);
+
+	p->numa_preferred_nid = nid;
+	p->numa_migrate_seq = 1;
+
+	if (running)
+		p->sched_class->set_curr_task(rq);
+	if (on_rq)
+		enqueue_task(rq, p, 0);
+	task_rq_unlock(rq, p, &flags);
+}
 #endif
 
 /*
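
The sched_setnuma() helper added above deliberately dequeues the task (and, if it is currently running, calls put_prev_task()) before touching numa_preferred_nid, then requeues it afterwards. That matters because the per-runqueue NUMA counters introduced in fair.c below are only maintained from the enqueue/dequeue paths, so changing the preferred node while the task stays queued would leave them stale. A minimal userspace sketch of that pattern; toy_rq, toy_task and the other names here are illustrative assumptions, not kernel API:

/*
 * Sketch (not kernel code) of the dequeue -> update -> re-enqueue pattern:
 * the NUMA counters are only adjusted on enqueue/dequeue, so the attribute
 * change is bracketed by a requeue to keep them consistent.
 */
#include <assert.h>
#include <stdio.h>

struct toy_rq {
	unsigned int nr_running;
	unsigned int nr_numa_running;      /* tasks with a preferred node */
	unsigned int nr_preferred_running; /* of those, running on that node */
};

struct toy_task {
	int numa_preferred_nid;            /* -1: no preference */
	int cpu_nid;                       /* node of the CPU it runs on */
	int on_rq;
};

static void toy_enqueue(struct toy_rq *rq, struct toy_task *p)
{
	rq->nr_running++;
	rq->nr_numa_running += (p->numa_preferred_nid != -1);
	rq->nr_preferred_running += (p->numa_preferred_nid == p->cpu_nid);
	p->on_rq = 1;
}

static void toy_dequeue(struct toy_rq *rq, struct toy_task *p)
{
	rq->nr_running--;
	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
	rq->nr_preferred_running -= (p->numa_preferred_nid == p->cpu_nid);
	p->on_rq = 0;
}

/* Analogue of sched_setnuma(): requeue around the attribute change. */
static void toy_setnuma(struct toy_rq *rq, struct toy_task *p, int nid)
{
	int was_queued = p->on_rq;

	if (was_queued)
		toy_dequeue(rq, p);
	p->numa_preferred_nid = nid;
	if (was_queued)
		toy_enqueue(rq, p);
}

int main(void)
{
	struct toy_rq rq = { 0 };
	struct toy_task p = { .numa_preferred_nid = -1, .cpu_nid = 0 };

	toy_enqueue(&rq, &p);
	toy_setnuma(&rq, &p, 0);	/* preferred and well placed */
	assert(rq.nr_numa_running == 1 && rq.nr_preferred_running == 1);
	toy_setnuma(&rq, &p, 1);	/* preferred node changed: misplaced */
	assert(rq.nr_numa_running == 1 && rq.nr_preferred_running == 0);
	printf("counters stay consistent\n");
	return 0;
}
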
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 423316cdee07..5166b9b1af70 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -888,6 +888,18 @@ static unsigned int task_scan_max(struct task_struct *p)
  */
 unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
 
+static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+	rq->nr_numa_running += (p->numa_preferred_nid != -1);
+	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
+}
+
+static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
+}
+
 struct numa_group {
 	atomic_t refcount;
 
@@ -1227,6 +1239,8 @@ static int task_numa_migrate(struct task_struct *p)
 	if (env.best_cpu == -1)
 		return -EAGAIN;
 
+	sched_setnuma(p, env.dst_nid);
+
 	if (env.best_task == NULL) {
 		int ret = migrate_task_to(p, env.best_cpu);
 		return ret;
@@ -1342,8 +1356,7 @@ static void task_numa_placement(struct task_struct *p)
 	/* Preferred node as the node with the most faults */
 	if (max_faults && max_nid != p->numa_preferred_nid) {
 		/* Update the preferred nid and migrate task if possible */
-		p->numa_preferred_nid = max_nid;
-		p->numa_migrate_seq = 1;
+		sched_setnuma(p, max_nid);
 		numa_migrate_preferred(p);
 	}
 }
@@ -1741,6 +1754,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 {
 }
+
+static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 static void
@@ -1750,8 +1771,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	if (!parent_entity(se))
 		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
 #ifdef CONFIG_SMP
-	if (entity_is_task(se))
-		list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+	if (entity_is_task(se)) {
+		struct rq *rq = rq_of(cfs_rq);
+
+		account_numa_enqueue(rq, task_of(se));
+		list_add(&se->group_node, &rq->cfs_tasks);
+	}
 #endif
 	cfs_rq->nr_running++;
 }
@@ -1762,8 +1787,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	update_load_sub(&cfs_rq->load, se->load.weight);
 	if (!parent_entity(se))
 		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
-	if (entity_is_task(se))
+	if (entity_is_task(se)) {
+		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
 		list_del_init(&se->group_node);
+	}
 	cfs_rq->nr_running--;
 }
 
@@ -4605,6 +4632,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 
 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
+enum fbq_type { regular, remote, all };
+
 #define LBF_ALL_PINNED	0x01
 #define LBF_NEED_BREAK	0x02
 #define LBF_DST_PINNED  0x04
@@ -4631,6 +4660,8 @@ struct lb_env {
 	unsigned int		loop;
 	unsigned int		loop_break;
 	unsigned int		loop_max;
+
+	enum fbq_type		fbq_type;
 };
 
 /*
@@ -5092,6 +5123,10 @@ struct sg_lb_stats {
 	unsigned int group_weight;
 	int group_imb; /* Is there an imbalance in the group ? */
 	int group_has_capacity; /* Is there extra capacity in the group? */
+#ifdef CONFIG_NUMA_BALANCING
+	unsigned int nr_numa_running;
+	unsigned int nr_preferred_running;
+#endif
 };
 
 /*
@@ -5409,6 +5444,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
 		sgs->group_load += load;
 		sgs->sum_nr_running += nr_running;
+#ifdef CONFIG_NUMA_BALANCING
+		sgs->nr_numa_running += rq->nr_numa_running;
+		sgs->nr_preferred_running += rq->nr_preferred_running;
+#endif
 		sgs->sum_weighted_load += weighted_cpuload(i);
 		if (idle_cpu(i))
 			sgs->idle_cpus++;
@@ -5474,14 +5513,43 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 	return false;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+	if (sgs->sum_nr_running > sgs->nr_numa_running)
+		return regular;
+	if (sgs->sum_nr_running > sgs->nr_preferred_running)
+		return remote;
+	return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+	if (rq->nr_running > rq->nr_numa_running)
+		return regular;
+	if (rq->nr_running > rq->nr_preferred_running)
+		return remote;
+	return all;
+}
+#else
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+	return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+	return regular;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 /**
  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  * @env: The load balancing environment.
  * @balance: Should we balance.
  * @sds: variable to hold the statistics for this sched_domain.
  */
-static inline void update_sd_lb_stats(struct lb_env *env,
-					struct sd_lb_stats *sds)
+static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
 	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
@@ -5538,6 +5606,9 @@ next_group:
 
 		sg = sg->next;
 	} while (sg != env->sd->groups);
+
+	if (env->sd->flags & SD_NUMA)
+		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
 }
 
 /**
@@ -5841,15 +5912,39 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 	int i;
 
 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
-		unsigned long power = power_of(i);
-		unsigned long capacity = DIV_ROUND_CLOSEST(power,
-							   SCHED_POWER_SCALE);
-		unsigned long wl;
+		unsigned long power, capacity, wl;
+		enum fbq_type rt;
+
+		rq = cpu_rq(i);
+		rt = fbq_classify_rq(rq);
 
+		/*
+		 * We classify groups/runqueues into three groups:
+		 *  - regular: there are !numa tasks
+		 *  - remote: there are numa tasks that run on the 'wrong' node
+		 *  - all: there is no distinction
+		 *
+		 * In order to avoid migrating ideally placed numa tasks,
+		 * ignore those when there's better options.
+		 *
+		 * If we ignore the actual busiest queue to migrate another
+		 * task, the next balance pass can still reduce the busiest
+		 * queue by moving tasks around inside the node.
+		 *
+		 * If we cannot move enough load due to this classification
+		 * the next pass will adjust the group classification and
+		 * allow migration of more tasks.
+		 *
+		 * Both cases only affect the total convergence complexity.
+		 */
+		if (rt > env->fbq_type)
+			continue;
+
+		power = power_of(i);
+		capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
 		if (!capacity)
 			capacity = fix_small_capacity(env->sd, group);
 
-		rq = cpu_rq(i);
 		wl = weighted_cpuload(i);
 
 		/*
@@ -5966,6 +6061,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.idle		= idle,
 		.loop_break	= sched_nr_migrate_break,
 		.cpus		= cpus,
+		.fbq_type	= all,
 	};
 
 	/*
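
The filter in find_busiest_queue() above relies on the declaration order of enum fbq_type: regular (0) < remote (1) < all (2). A runqueue is skipped when its own class is numerically greater than the class of the busiest group ("if (rt > env->fbq_type) continue;"), so a regular group only yields runqueues that still hold non-NUMA tasks, a remote group additionally yields runqueues with misplaced NUMA tasks, and only an all group exposes runqueues whose tasks are all ideally placed. A self-contained sketch of that ordering; classify() and may_pick() are illustrative names, not kernel code:

/*
 * Standalone sketch (assumed, not kernel code) of the fbq_type filter:
 * a runqueue is only a candidate when its class is no stricter than the
 * class of the busiest group, relying on regular(0) < remote(1) < all(2).
 */
#include <stdio.h>

enum fbq_type { regular, remote, all };

struct counts {
	unsigned int nr_running;
	unsigned int nr_numa_running;      /* tasks with a preferred node */
	unsigned int nr_preferred_running; /* of those, on that node */
};

static enum fbq_type classify(const struct counts *c)
{
	if (c->nr_running > c->nr_numa_running)
		return regular;	/* some tasks have no NUMA preference */
	if (c->nr_running > c->nr_preferred_running)
		return remote;	/* some NUMA tasks run on the wrong node */
	return all;		/* everything is ideally placed */
}

static int may_pick(enum fbq_type rq_type, enum fbq_type group_type)
{
	/* mirrors: if (rt > env->fbq_type) continue; */
	return rq_type <= group_type;
}

int main(void)
{
	struct counts group = { .nr_running = 4, .nr_numa_running = 4,
				.nr_preferred_running = 2 };	/* -> remote */
	struct counts rq_ok = { 2, 2, 1 };	/* remote: misplaced NUMA task */
	struct counts rq_skip = { 2, 2, 2 };	/* all: ideally placed tasks */

	enum fbq_type gt = classify(&group);
	printf("group=%d rq_ok=%d rq_skip=%d\n", gt,
	       may_pick(classify(&rq_ok), gt),
	       may_pick(classify(&rq_skip), gt));	/* prints 1 and 0 */
	return 0;
}

Note also that load_balance() initializes .fbq_type = all and update_sd_lb_stats() only overrides it on SD_NUMA domains, so on domains without SD_NUMA the new test never skips a runqueue and load balancing behaves as before.
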
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index eeb1923812a1..d69cb325c27e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -409,6 +409,10 @@ struct rq {
 	 * remote CPUs use both these fields when doing load calculation.
 	 */
 	unsigned int nr_running;
+#ifdef CONFIG_NUMA_BALANCING
+	unsigned int nr_numa_running;
+	unsigned int nr_preferred_running;
+#endif
 #define CPU_LOAD_IDX_MAX 5
 	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 	unsigned long last_load_update_tick;
@@ -557,6 +561,7 @@ static inline u64 rq_clock_task(struct rq *rq)
 }
 
 #ifdef CONFIG_NUMA_BALANCING
+extern void sched_setnuma(struct task_struct *p, int node);
 extern int migrate_task_to(struct task_struct *p, int cpu);
 extern int migrate_swap(struct task_struct *, struct task_struct *);
 #endif /* CONFIG_NUMA_BALANCING */
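
By construction of account_numa_enqueue()/account_numa_dequeue(), the new rq fields should satisfy nr_preferred_running <= nr_numa_running <= nr_running (a task only counts as preferred if it also has a preferred node), and that ordering survives the summation done in update_sg_lb_stats(), which is why fbq_classify_group() can apply the same two comparisons to group totals that fbq_classify_rq() applies per runqueue. A small standalone sketch of that aggregation step, under the stated assumption; struct counts and group_sum() are illustrative names, not kernel code:

/*
 * Sketch (assumed, not kernel code): summing per-rq counters that each
 * satisfy pref <= numa <= run yields group totals with the same ordering.
 */
#include <assert.h>
#include <stdio.h>

struct counts { unsigned int run, numa, pref; };

static struct counts group_sum(const struct counts *rqs, int n)
{
	struct counts g = { 0, 0, 0 };

	for (int i = 0; i < n; i++) {	/* as update_sg_lb_stats() iterates CPUs */
		g.run += rqs[i].run;
		g.numa += rqs[i].numa;
		g.pref += rqs[i].pref;
	}
	return g;
}

int main(void)
{
	/* per-rq counters, each satisfying pref <= numa <= run */
	struct counts rqs[] = { { 3, 2, 1 }, { 1, 1, 1 }, { 2, 0, 0 } };
	struct counts g = group_sum(rqs, 3);

	assert(g.pref <= g.numa && g.numa <= g.run);	/* invariant survives */
	printf("group: run=%u numa=%u pref=%u\n", g.run, g.numa, g.pref);
	return 0;
}
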