Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/core.c  | 24
-rw-r--r--  kernel/sched/fair.c  | 66
-rw-r--r--  kernel/sched/psi.c   | 71
-rw-r--r--  kernel/sched/sched.h |  4
-rw-r--r--  kernel/sched/stats.h |  8
5 files changed, 112 insertions(+), 61 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f12225f26b70..6fedf3a98581 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5738,15 +5738,10 @@ int sched_cpu_activate(unsigned int cpu)
 
 #ifdef CONFIG_SCHED_SMT
         /*
-         * The sched_smt_present static key needs to be evaluated on every
-         * hotplug event because at boot time SMT might be disabled when
-         * the number of booted CPUs is limited.
-         *
-         * If then later a sibling gets hotplugged, then the key would stay
-         * off and SMT scheduling would never be functional.
+         * When going up, increment the number of cores with SMT present.
          */
-        if (cpumask_weight(cpu_smt_mask(cpu)) > 1)
-                static_branch_enable_cpuslocked(&sched_smt_present);
+        if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+                static_branch_inc_cpuslocked(&sched_smt_present);
 #endif
         set_cpu_active(cpu, true);
 
@@ -5790,6 +5785,14 @@ int sched_cpu_deactivate(unsigned int cpu)
          */
         synchronize_rcu_mult(call_rcu, call_rcu_sched);
 
+#ifdef CONFIG_SCHED_SMT
+        /*
+         * When going down, decrement the number of cores with SMT present.
+         */
+        if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+                static_branch_dec_cpuslocked(&sched_smt_present);
+#endif
+
         if (!sched_smp_initialized)
                 return 0;
 
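The two hunks above turn sched_smt_present from a one-way enable into a reference count: each core bumps the key when its second sibling comes online and drops it when that sibling goes away, so "SMT present" only turns off once the last core with an online sibling loses it. A minimal standalone sketch of that counting idea (plain C, not kernel code; all names below are illustrative):

#include <stdio.h>
#include <stdbool.h>

static int smt_cores;                     /* models sched_smt_present's count */

static void sibling_online(int online_siblings)
{
        if (online_siblings == 2)         /* second sibling just came up */
                smt_cores++;
}

static void sibling_offline(int online_siblings)
{
        if (online_siblings == 2)         /* about to drop back to one */
                smt_cores--;
}

static bool smt_present(void)
{
        return smt_cores > 0;
}

int main(void)
{
        sibling_online(2);                /* core 0 gains its sibling */
        sibling_online(2);                /* core 1 gains its sibling */
        sibling_offline(2);               /* core 0 loses its sibling */
        printf("SMT still present: %s\n", smt_present() ? "yes" : "no");
        sibling_offline(2);               /* core 1 loses its sibling */
        printf("SMT still present: %s\n", smt_present() ? "yes" : "no");
        return 0;
}

With a plain boolean key, the first offline would have switched SMT scheduling off for every remaining core; the counter keeps it on until the last one.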
@@ -5851,11 +5854,14 @@ void __init sched_init_smp(void)
         /*
          * There's no userspace yet to cause hotplug operations; hence all the
          * CPU masks are stable and all blatant races in the below code cannot
-         * happen.
+         * happen. The hotplug lock is nevertheless taken to satisfy lockdep,
+         * but there won't be any contention on it.
          */
+        cpus_read_lock();
         mutex_lock(&sched_domains_mutex);
         sched_init_domains(cpu_active_mask);
         mutex_unlock(&sched_domains_mutex);
+        cpus_read_unlock();
 
         /* Move init over to a non-isolated CPU */
         if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
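The sched_init_smp() hunk wraps the domain build in cpus_read_lock()/cpus_read_unlock() purely so lockdep sees the same hotplug-lock-then-sched_domains_mutex ordering as the runtime hotplug paths; nothing can contend at this point in boot. A rough standalone model of that ordering, using a pthread rwlock as a stand-in for the hotplug lock (illustrative only, not kernel code):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t hotplug_lock = PTHREAD_RWLOCK_INITIALIZER; /* models cpus_read_lock() */
static pthread_mutex_t domains_mutex = PTHREAD_MUTEX_INITIALIZER;  /* models sched_domains_mutex */

static void init_domains(void)
{
        printf("building scheduler domains\n");
}

int main(void)
{
        /* take the outer read lock first, then the mutex, like the hotplug paths do */
        pthread_rwlock_rdlock(&hotplug_lock);
        pthread_mutex_lock(&domains_mutex);
        init_domains();
        pthread_mutex_unlock(&domains_mutex);
        pthread_rwlock_unlock(&hotplug_lock);
        return 0;
}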
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ee271bb661cc..ac855b2f4774 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2400,8 +2400,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
                 local = 1;
 
         /*
-         * Retry task to preferred node migration periodically, in case it
-         * case it previously failed, or the scheduler moved us.
+         * Retry to migrate task to preferred node periodically, in case it
+         * previously failed, or the scheduler moved us.
          */
         if (time_after(jiffies, p->numa_migrate_retry)) {
                 task_numa_placement(p);
@@ -5674,11 +5674,11 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
         return target;
 }
 
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
+static unsigned long cpu_util_without(int cpu, struct task_struct *p);
 
-static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
 {
-        return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
+        return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
 }
 
 /*
@@ -5738,7 +5738,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 
                         avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
 
-                        spare_cap = capacity_spare_wake(i, p);
+                        spare_cap = capacity_spare_without(i, p);
 
                         if (spare_cap > max_spare_cap)
                                 max_spare_cap = spare_cap;
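capacity_spare_without(), renamed above and used here to pick the group with the most headroom, is just the CPU's capacity minus the utilization of everything except the waking task, clamped at zero. A tiny standalone model of that arithmetic (plain C, illustrative values, not kernel code):

#include <stdio.h>

static long capacity_spare_without(long cpu_capacity, long cpu_util_without_p)
{
        long spare = cpu_capacity - cpu_util_without_p;

        return spare > 0 ? spare : 0;   /* models max_t(long, ..., 0) */
}

int main(void)
{
        printf("%ld\n", capacity_spare_without(1024, 300));    /* 724 spare */
        printf("%ld\n", capacity_spare_without(1024, 1500));   /* over-utilized: clamped to 0 */
        return 0;
}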
@@ -5889,8 +5889,8 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
                 return prev_cpu;
 
         /*
-         * We need task's util for capacity_spare_wake, sync it up to prev_cpu's
-         * last_update_time.
+         * We need task's util for capacity_spare_without, sync it up to
+         * prev_cpu's last_update_time.
          */
         if (!(sd_flag & SD_BALANCE_FORK))
                 sync_entity_load_avg(&p->se);
@@ -6216,10 +6216,19 @@ static inline unsigned long cpu_util(int cpu)
 }
 
 /*
- * cpu_util_wake: Compute CPU utilization with any contributions from
- * the waking task p removed.
+ * cpu_util_without: compute cpu utilization without any contributions from *p
+ * @cpu: the CPU which utilization is requested
+ * @p: the task which utilization should be discounted
+ *
+ * The utilization of a CPU is defined by the utilization of tasks currently
+ * enqueued on that CPU as well as tasks which are currently sleeping after an
+ * execution on that CPU.
+ *
+ * This method returns the utilization of the specified CPU by discounting the
+ * utilization of the specified task, whenever the task is currently
+ * contributing to the CPU utilization.
  */
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
+static unsigned long cpu_util_without(int cpu, struct task_struct *p)
 {
         struct cfs_rq *cfs_rq;
         unsigned int util;
@@ -6231,7 +6240,7 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
         cfs_rq = &cpu_rq(cpu)->cfs;
         util = READ_ONCE(cfs_rq->avg.util_avg);
 
-        /* Discount task's blocked util from CPU's util */
+        /* Discount task's util from CPU's util */
         util -= min_t(unsigned int, util, task_util(p));
 
         /*
@@ -6240,14 +6249,14 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
          * a) if *p is the only task sleeping on this CPU, then:
          *      cpu_util (== task_util) > util_est (== 0)
          *    and thus we return:
-         *      cpu_util_wake = (cpu_util - task_util) = 0
+         *      cpu_util_without = (cpu_util - task_util) = 0
          *
          * b) if other tasks are SLEEPING on this CPU, which is now exiting
          *    IDLE, then:
          *      cpu_util >= task_util
          *      cpu_util > util_est (== 0)
          *    and thus we discount *p's blocked utilization to return:
-         *      cpu_util_wake = (cpu_util - task_util) >= 0
+         *      cpu_util_without = (cpu_util - task_util) >= 0
          *
          * c) if other tasks are RUNNABLE on that CPU and
          *      util_est > cpu_util
@@ -6260,8 +6269,33 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
          * covered by the following code when estimated utilization is
          * enabled.
          */
-        if (sched_feat(UTIL_EST))
-                util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+        if (sched_feat(UTIL_EST)) {
+                unsigned int estimated =
+                        READ_ONCE(cfs_rq->avg.util_est.enqueued);
+
+                /*
+                 * Despite the following checks we still have a small window
+                 * for a possible race, when an execl's select_task_rq_fair()
+                 * races with LB's detach_task():
+                 *
+                 *   detach_task()
+                 *     p->on_rq = TASK_ON_RQ_MIGRATING;
+                 *     ---------------------------------- A
+                 *     deactivate_task()                      \
+                 *       dequeue_task()                        + RaceTime
+                 *         util_est_dequeue()                 /
+                 *     ---------------------------------- B
+                 *
+                 * The additional check on "current == p" it's required to
+                 * properly fix the execl regression and it helps in further
+                 * reducing the chances for the above race.
+                 */
+                if (unlikely(task_on_rq_queued(p) || current == p)) {
+                        estimated -= min_t(unsigned int, estimated,
+                                           (_task_util_est(p) | UTIL_AVG_UNCHANGED));
+                }
+                util = max(util, estimated);
+        }
 
         /*
          * Utilization (estimated) can exceed the CPU capacity, thus let's
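The cpu_util_without() changes above discount the task from both signals: its average utilization is always subtracted (saturating at zero), while its estimated utilization is subtracted only while the task is still accounted on the CPU (queued, or currently running as in the execl case); the result is the max of the two. A standalone model of that logic (plain C; parameter names and values are illustrative, not kernel code):

#include <stdio.h>
#include <stdbool.h>

static unsigned int sat_sub(unsigned int a, unsigned int b)
{
        return a - (a < b ? a : b);     /* models util -= min_t(util, task_util) */
}

static unsigned int cpu_util_without(unsigned int util_avg, unsigned int util_est,
                                     unsigned int task_util, unsigned int task_util_est,
                                     bool task_still_accounted)
{
        unsigned int util = sat_sub(util_avg, task_util);

        if (task_still_accounted)       /* models task_on_rq_queued(p) || current == p */
                util_est = sat_sub(util_est, task_util_est);

        return util > util_est ? util : util_est;
}

int main(void)
{
        /* p is the only (sleeping) task on the CPU: both terms collapse to 0 */
        printf("%u\n", cpu_util_without(250, 0, 250, 250, true));
        /* other runnable tasks dominate through the estimated utilization: 600 */
        printf("%u\n", cpu_util_without(300, 700, 100, 100, true));
        return 0;
}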
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 7cdecfc010af..fe24de3fbc93 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -136,8 +136,18 @@
 
 static int psi_bug __read_mostly;
 
-bool psi_disabled __read_mostly;
-core_param(psi_disabled, psi_disabled, bool, 0644);
+DEFINE_STATIC_KEY_FALSE(psi_disabled);
+
+#ifdef CONFIG_PSI_DEFAULT_DISABLED
+bool psi_enable;
+#else
+bool psi_enable = true;
+#endif
+static int __init setup_psi(char *str)
+{
+        return kstrtobool(str, &psi_enable) == 0;
+}
+__setup("psi=", setup_psi);
 
 /* Running averages - we need to be higher-res than loadavg */
 #define PSI_FREQ        (2*HZ+1)        /* 2 sec intervals */
@@ -169,8 +179,10 @@ static void group_init(struct psi_group *group)
 
 void __init psi_init(void)
 {
-        if (psi_disabled)
+        if (!psi_enable) {
+                static_branch_enable(&psi_disabled);
                 return;
+        }
 
         psi_period = jiffies_to_nsecs(PSI_FREQ);
         group_init(&psi_system);
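The psi.c hunks above split the knob in two: the boot-time "psi=" parameter fills in psi_enable, and psi_init() flips the internal psi_disabled static key only when that flag is off, so every hot path can branch on a runtime-patched jump instead of loading a bool. A standalone model of that inversion (plain C; the parsing only approximates kstrtobool(), not kernel code):

#include <stdio.h>
#include <string.h>
#include <stdbool.h>

static bool psi_enable = true;          /* default; CONFIG_PSI_DEFAULT_DISABLED would flip it */
static bool psi_disabled;               /* models the static key consumed on hot paths */

static void parse_psi_param(const char *str)
{
        /* models kstrtobool() accepting the usual 0/1/on/off spellings */
        if (!strcmp(str, "0") || !strcmp(str, "n") || !strcmp(str, "off"))
                psi_enable = false;
        else if (!strcmp(str, "1") || !strcmp(str, "y") || !strcmp(str, "on"))
                psi_enable = true;
}

static void psi_init(void)
{
        if (!psi_enable)
                psi_disabled = true;    /* models static_branch_enable(&psi_disabled) */
}

int main(void)
{
        parse_psi_param("0");           /* booted with psi=0 */
        psi_init();
        printf("psi accounting %s\n", psi_disabled ? "off" : "on");
        return 0;
}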
@@ -549,7 +561,7 @@ void psi_memstall_enter(unsigned long *flags)
         struct rq_flags rf;
         struct rq *rq;
 
-        if (psi_disabled)
+        if (static_branch_likely(&psi_disabled))
                 return;
 
         *flags = current->flags & PF_MEMSTALL;
@@ -579,7 +591,7 @@ void psi_memstall_leave(unsigned long *flags)
         struct rq_flags rf;
         struct rq *rq;
 
-        if (psi_disabled)
+        if (static_branch_likely(&psi_disabled))
                 return;
 
         if (*flags)
@@ -600,7 +612,7 @@ void psi_memstall_leave(unsigned long *flags)
 #ifdef CONFIG_CGROUPS
 int psi_cgroup_alloc(struct cgroup *cgroup)
 {
-        if (psi_disabled)
+        if (static_branch_likely(&psi_disabled))
                 return 0;
 
         cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu);
@@ -612,7 +624,7 @@ int psi_cgroup_alloc(struct cgroup *cgroup)
 
 void psi_cgroup_free(struct cgroup *cgroup)
 {
-        if (psi_disabled)
+        if (static_branch_likely(&psi_disabled))
                 return;
 
         cancel_delayed_work_sync(&cgroup->psi.clock_work);
@@ -633,38 +645,39 @@ void psi_cgroup_free(struct cgroup *cgroup)
  */
 void cgroup_move_task(struct task_struct *task, struct css_set *to)
 {
-        bool move_psi = !psi_disabled;
         unsigned int task_flags = 0;
         struct rq_flags rf;
         struct rq *rq;
 
-        if (move_psi) {
-                rq = task_rq_lock(task, &rf);
+        if (static_branch_likely(&psi_disabled)) {
+                /*
+                 * Lame to do this here, but the scheduler cannot be locked
+                 * from the outside, so we move cgroups from inside sched/.
+                 */
+                rcu_assign_pointer(task->cgroups, to);
+                return;
+        }
 
-                if (task_on_rq_queued(task))
-                        task_flags = TSK_RUNNING;
-                else if (task->in_iowait)
-                        task_flags = TSK_IOWAIT;
+        rq = task_rq_lock(task, &rf);
 
-                if (task->flags & PF_MEMSTALL)
-                        task_flags |= TSK_MEMSTALL;
+        if (task_on_rq_queued(task))
+                task_flags = TSK_RUNNING;
+        else if (task->in_iowait)
+                task_flags = TSK_IOWAIT;
 
-                if (task_flags)
-                        psi_task_change(task, task_flags, 0);
-        }
+        if (task->flags & PF_MEMSTALL)
+                task_flags |= TSK_MEMSTALL;
 
-        /*
-         * Lame to do this here, but the scheduler cannot be locked
-         * from the outside, so we move cgroups from inside sched/.
-         */
-        rcu_assign_pointer(task->cgroups, to);
+        if (task_flags)
+                psi_task_change(task, task_flags, 0);
 
-        if (move_psi) {
-                if (task_flags)
-                        psi_task_change(task, 0, task_flags);
+        /* See comment above */
+        rcu_assign_pointer(task->cgroups, to);
 
-                task_rq_unlock(rq, task, &rf);
-        }
+        if (task_flags)
+                psi_task_change(task, 0, task_flags);
+
+        task_rq_unlock(rq, task, &rf);
 }
 #endif /* CONFIG_CGROUPS */
 
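The rewritten cgroup_move_task() above either just switches the cgroup pointer (PSI disabled) or, under the rq lock, clears the task's PSI flags in the old group, switches the pointer, and re-sets the same flags in the new group. A toy standalone model of that clear/move/set ordering (plain C, illustrative accounting, not kernel code):

#include <stdio.h>
#include <stdbool.h>

struct group { int nr_running; };

struct task {
        struct group *grp;
        bool running;
};

static bool psi_disabled;

static void psi_task_change(struct task *t, int clear, int set)
{
        t->grp->nr_running += set - clear;      /* toy stand-in for the real accounting */
}

static void cgroup_move_task(struct task *t, struct group *to)
{
        bool running = t->running;

        if (psi_disabled) {
                t->grp = to;                    /* nothing to account, just switch groups */
                return;
        }

        if (running)
                psi_task_change(t, 1, 0);       /* leave old group's counters */
        t->grp = to;
        if (running)
                psi_task_change(t, 0, 1);       /* enter new group's counters */
}

int main(void)
{
        struct group a = { .nr_running = 1 }, b = { .nr_running = 0 };
        struct task t = { .grp = &a, .running = true };

        cgroup_move_task(&t, &b);
        printf("a=%d b=%d\n", a.nr_running, b.nr_running);     /* a=0 b=1 */
        return 0;
}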
@@ -672,7 +685,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 {
         int full;
 
-        if (psi_disabled)
+        if (static_branch_likely(&psi_disabled))
                 return -EOPNOTSUPP;
 
         update_stats(group);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 618577fc9aa8..4e524ab589c9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -23,6 +23,7 @@
 #include <linux/sched/prio.h>
 #include <linux/sched/rt.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/smt.h>
 #include <linux/sched/stat.h>
 #include <linux/sched/sysctl.h>
 #include <linux/sched/task.h>
@@ -936,9 +937,6 @@ static inline int cpu_of(struct rq *rq)
 
 
 #ifdef CONFIG_SCHED_SMT
-
-extern struct static_key_false sched_smt_present;
-
 extern void __update_idle_core(struct rq *rq);
 
 static inline void update_idle_core(struct rq *rq)
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 4904c4677000..aa0de240fb41 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -66,7 +66,7 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
 {
         int clear = 0, set = TSK_RUNNING;
 
-        if (psi_disabled)
+        if (static_branch_likely(&psi_disabled))
                 return;
 
         if (!wakeup || p->sched_psi_wake_requeue) {
@@ -86,7 +86,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
 {
         int clear = TSK_RUNNING, set = 0;
 
-        if (psi_disabled)
+        if (static_branch_likely(&psi_disabled))
                 return;
 
         if (!sleep) {
@@ -102,7 +102,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
 
 static inline void psi_ttwu_dequeue(struct task_struct *p)
 {
-        if (psi_disabled)
+        if (static_branch_likely(&psi_disabled))
                 return;
         /*
          * Is the task being migrated during a wakeup? Make sure to
@@ -128,7 +128,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p)
 
 static inline void psi_task_tick(struct rq *rq)
 {
-        if (psi_disabled)
+        if (static_branch_likely(&psi_disabled))
                 return;
 
         if (unlikely(rq->curr->flags & PF_MEMSTALL))
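All of the stats.h helpers above now share the same shape: one predicted check on the psi_disabled static key at the top, so the entire PSI bookkeeping collapses to an early return when the feature is off. A rough standalone sketch of that guard pattern (plain C; __builtin_expect only approximates what the runtime-patched static key does, and the names are illustrative):

#include <stdio.h>
#include <stdbool.h>

/* toy approximation of static_branch_likely(); the real key patches a jump */
#define branch_likely(k)        __builtin_expect(!!(k), 1)

static bool psi_disabled = true;

static void psi_enqueue(int *nr_running)
{
        if (branch_likely(psi_disabled))
                return;                 /* fast path: no accounting at all */

        (*nr_running)++;                /* toy stand-in for psi_task_change() */
}

int main(void)
{
        int nr_running = 0;

        psi_enqueue(&nr_running);
        printf("accounted tasks: %d\n", nr_running);    /* 0: PSI is off */
        return 0;
}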