Diffstat (limited to 'kernel/sched')
-rw-r--r--	kernel/sched/core.c	 10
-rw-r--r--	kernel/sched/fair.c	178
-rw-r--r--	kernel/sched/rt.c	 14
3 files changed, 115 insertions, 87 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c1808606ee5f..a88f4a485c5e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2660,6 +2660,7 @@ asmlinkage void __sched notrace preempt_schedule(void)
 	} while (need_resched());
 }
 EXPORT_SYMBOL(preempt_schedule);
+#endif /* CONFIG_PREEMPT */
 
 /*
  * this is the entry point to schedule() from kernel preemption
@@ -2693,8 +2694,6 @@ asmlinkage void __sched preempt_schedule_irq(void)
 	exception_exit(prev_state);
 }
 
-#endif /* CONFIG_PREEMPT */
-
 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
 			  void *key)
 {
@@ -4762,7 +4761,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 		cpumask_clear_cpu(rq->cpu, old_rd->span);
 
 		/*
-		 * If we dont want to free the old_rt yet then
+		 * If we dont want to free the old_rd yet then
 		 * set old_rd to NULL to skip the freeing later
 		 * in this function:
 		 */
@@ -4903,6 +4902,7 @@ DEFINE_PER_CPU(struct sched_domain *, sd_asym);
 static void update_top_cache_domain(int cpu)
 {
 	struct sched_domain *sd;
+	struct sched_domain *busy_sd = NULL;
 	int id = cpu;
 	int size = 1;
 
@@ -4910,8 +4910,9 @@ static void update_top_cache_domain(int cpu)
 	if (sd) {
 		id = cpumask_first(sched_domain_span(sd));
 		size = cpumask_weight(sched_domain_span(sd));
-		rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent);
+		busy_sd = sd->parent; /* sd_busy */
 	}
+	rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
 
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_size, cpu) = size;
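
Note: the hunk above fixes a stale-pointer case. Before the change, sd_busy was only written inside the if (sd) branch, so tearing down a domain could leave the old per-cpu pointer in place. A minimal standalone model of the always-publish pattern (ordinary C, not kernel code; rcu_assign_pointer() is stubbed as a plain store):

#include <stdio.h>

struct sched_domain { struct sched_domain *parent; };

static struct sched_domain *sd_busy;	/* stands in for per_cpu(sd_busy, cpu) */

static void update_top_cache_domain_sketch(struct sched_domain *sd)
{
	struct sched_domain *busy_sd = NULL;

	if (sd)
		busy_sd = sd->parent;
	sd_busy = busy_sd;	/* the kernel uses rcu_assign_pointer() here */
}

int main(void)
{
	struct sched_domain parent = { NULL };
	struct sched_domain child = { &parent };

	update_top_cache_domain_sketch(&child);
	printf("domain present: sd_busy = %p\n", (void *)sd_busy);	/* &parent */

	update_top_cache_domain_sketch(NULL);
	printf("domain gone:    sd_busy = %p\n", (void *)sd_busy);	/* NULL, not stale */
	return 0;
}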
@@ -5112,6 +5113,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 		 * die on a /0 trap.
 		 */
 		sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
+		sg->sgp->power_orig = sg->sgp->power;
 
 		/*
 		 * Make sure the first group of this domain contains the
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e8b652ebe027..c7395d97e4cb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -178,59 +178,61 @@ void sched_init_granularity(void)
 	update_sysctl();
 }
 
-#if BITS_PER_LONG == 32
-# define WMULT_CONST	(~0UL)
-#else
-# define WMULT_CONST	(1UL << 32)
-#endif
-
+#define WMULT_CONST	(~0U)
 #define WMULT_SHIFT	32
 
-/*
- * Shift right and round:
- */
-#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+static void __update_inv_weight(struct load_weight *lw)
+{
+	unsigned long w;
+
+	if (likely(lw->inv_weight))
+		return;
+
+	w = scale_load_down(lw->weight);
+
+	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
+		lw->inv_weight = 1;
+	else if (unlikely(!w))
+		lw->inv_weight = WMULT_CONST;
+	else
+		lw->inv_weight = WMULT_CONST / w;
+}
 
 /*
- * delta *= weight / lw
+ * delta_exec * weight / lw.weight
+ *   OR
+ * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
+ *
+ * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
+ * we're guaranteed shift stays positive because inv_weight is guaranteed to
+ * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
+ *
+ * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
+ * weight/lw.weight <= 1, and therefore our shift will also be positive.
  */
-static unsigned long
-calc_delta_mine(unsigned long delta_exec, unsigned long weight,
-		struct load_weight *lw)
+static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
 {
-	u64 tmp;
+	u64 fact = scale_load_down(weight);
+	int shift = WMULT_SHIFT;
 
-	/*
-	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
-	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
-	 * 2^SCHED_LOAD_RESOLUTION.
-	 */
-	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
-		tmp = (u64)delta_exec * scale_load_down(weight);
-	else
-		tmp = (u64)delta_exec;
+	__update_inv_weight(lw);
 
-	if (!lw->inv_weight) {
-		unsigned long w = scale_load_down(lw->weight);
-
-		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
-			lw->inv_weight = 1;
-		else if (unlikely(!w))
-			lw->inv_weight = WMULT_CONST;
-		else
-			lw->inv_weight = WMULT_CONST / w;
-	}
+	if (unlikely(fact >> 32)) {
+		while (fact >> 32) {
+			fact >>= 1;
+			shift--;
+		}
+	}
 
-	/*
-	 * Check whether we'd overflow the 64-bit multiplication:
-	 */
-	if (unlikely(tmp > WMULT_CONST))
-		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
-			WMULT_SHIFT/2);
-	else
-		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
+	/* hint to use a 32x32->64 mul */
+	fact = (u64)(u32)fact * lw->inv_weight;
 
-	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
+	while (fact >> 32) {
+		fact >>= 1;
+		shift--;
+	}
+
+	return mul_u64_u32_shr(delta_exec, fact, shift);
 }
 
 
@@ -443,7 +445,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 
 /**************************************************************
  * Scheduling class tree data structure manipulation methods:
@@ -612,11 +614,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 /*
  * delta /= w
  */
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
 {
 	if (unlikely(se->load.weight != NICE_0_LOAD))
-		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
+		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
 
 	return delta;
 }
@@ -665,7 +666,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 			update_load_add(&lw, se->load.weight);
 			load = &lw;
 		}
-		slice = calc_delta_mine(slice, se->load.weight, load);
+		slice = __calc_delta(slice, se->load.weight, load);
 	}
 	return slice;
 }
@@ -703,47 +704,32 @@ void init_task_runnable_average(struct task_struct *p)
 #endif
 
 /*
- * Update the current task's runtime statistics. Skip current tasks that
- * are not in our scheduling class.
+ * Update the current task's runtime statistics.
  */
-static inline void
-__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
-	      unsigned long delta_exec)
-{
-	unsigned long delta_exec_weighted;
-
-	schedstat_set(curr->statistics.exec_max,
-			max((u64)delta_exec, curr->statistics.exec_max));
-
-	curr->sum_exec_runtime += delta_exec;
-	schedstat_add(cfs_rq, exec_clock, delta_exec);
-	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
-
-	curr->vruntime += delta_exec_weighted;
-	update_min_vruntime(cfs_rq);
-}
-
 static void update_curr(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *curr = cfs_rq->curr;
 	u64 now = rq_clock_task(rq_of(cfs_rq));
-	unsigned long delta_exec;
+	u64 delta_exec;
 
 	if (unlikely(!curr))
 		return;
 
-	/*
-	 * Get the amount of time the current task was running
-	 * since the last time we changed load (this cannot
-	 * overflow on 32 bits):
-	 */
-	delta_exec = (unsigned long)(now - curr->exec_start);
-	if (!delta_exec)
+	delta_exec = now - curr->exec_start;
+	if (unlikely((s64)delta_exec <= 0))
 		return;
 
-	__update_curr(cfs_rq, curr, delta_exec);
 	curr->exec_start = now;
 
+	schedstat_set(curr->statistics.exec_max,
+		      max(delta_exec, curr->statistics.exec_max));
+
+	curr->sum_exec_runtime += delta_exec;
+	schedstat_add(cfs_rq, exec_clock, delta_exec);
+
+	curr->vruntime += calc_delta_fair(delta_exec, curr);
+	update_min_vruntime(cfs_rq);
+
 	if (entity_is_task(curr)) {
 		struct task_struct *curtask = task_of(curr);
 
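Note: update_curr() now feeds the raw u64 delta straight into calc_delta_fair(), which scales it by NICE_0_LOAD/weight before adding it to vruntime, so heavier (lower-nice) tasks accumulate virtual time more slowly and get correspondingly longer slices. A standalone sketch of that scaling (ordinary C; 3121, 1024 and 335 are the prio_to_weight[] entries for nice -5, 0 and +5):

#include <stdint.h>
#include <stdio.h>

#define NICE_0_LOAD	1024ULL

/* vruntime advance for delta_exec ns of wall-clock runtime */
static uint64_t vruntime_delta(uint64_t delta_exec, uint64_t weight)
{
	if (weight == NICE_0_LOAD)	/* fast path, as in calc_delta_fair() */
		return delta_exec;
	return delta_exec * NICE_0_LOAD / weight;
}

int main(void)
{
	uint64_t delta = 1000000;	/* 1 ms of runtime, in ns */

	printf("nice -5: vruntime += %llu ns\n",
	       (unsigned long long)vruntime_delta(delta, 3121));	/* ~0.33 ms */
	printf("nice  0: vruntime += %llu ns\n",
	       (unsigned long long)vruntime_delta(delta, 1024));	/* exactly 1 ms */
	printf("nice +5: vruntime += %llu ns\n",
	       (unsigned long long)vruntime_delta(delta, 335));	/* ~2.99 ms */
	return 0;
}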
@@ -1752,6 +1738,13 @@ void task_numa_work(struct callback_head *work)
 		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
 			continue;
 
+		/*
+		 * Skip inaccessible VMAs to avoid any confusion between
+		 * PROT_NONE and NUMA hinting ptes
+		 */
+		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+			continue;
+
 		do {
 			start = max(start, vma->vm_start);
 			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
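Note: the new check skips VMAs with none of the access bits set, because NUMA balancing marks pages PROT_NONE to generate hinting faults and must not mistake genuinely inaccessible mappings for its own markers. A standalone model of the test (ordinary C; the flag values mirror the kernel's VM_READ/VM_WRITE/VM_EXEC bits but are defined locally here):

#include <stdio.h>

#define VM_READ		0x00000001UL
#define VM_WRITE	0x00000002UL
#define VM_EXEC		0x00000004UL

/* a PROT_NONE mapping has none of the access bits set */
static int vma_is_accessible(unsigned long vm_flags)
{
	return (vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) != 0;
}

int main(void)
{
	printf("PROT_NONE vma: %s\n", vma_is_accessible(0) ? "scan" : "skip");
	printf("readable vma:  %s\n", vma_is_accessible(VM_READ) ? "scan" : "skip");
	printf("exec-only vma: %s\n", vma_is_accessible(VM_EXEC) ? "scan" : "skip");
	return 0;
}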
@@ -3015,8 +3008,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	}
 }
 
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-				     unsigned long delta_exec)
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
 	/* dock delta_exec before expiring quota (as it could span periods) */
 	cfs_rq->runtime_remaining -= delta_exec;
@@ -3034,7 +3026,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 }
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
 	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
 		return;
@@ -3574,8 +3566,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 	return rq_clock_task(rq_of(cfs_rq));
 }
 
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-				   unsigned long delta_exec) {}
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -5379,10 +5370,31 @@ void update_group_power(struct sched_domain *sd, int cpu)
 		 */
 
 		for_each_cpu(cpu, sched_group_cpus(sdg)) {
-			struct sched_group *sg = cpu_rq(cpu)->sd->groups;
+			struct sched_group_power *sgp;
+			struct rq *rq = cpu_rq(cpu);
+
+			/*
+			 * build_sched_domains() -> init_sched_groups_power()
+			 * gets here before we've attached the domains to the
+			 * runqueues.
+			 *
+			 * Use power_of(), which is set irrespective of domains
+			 * in update_cpu_power().
+			 *
+			 * This avoids power/power_orig from being 0 and
+			 * causing divide-by-zero issues on boot.
+			 *
+			 * Runtime updates will correct power_orig.
+			 */
+			if (unlikely(!rq->sd)) {
+				power_orig += power_of(cpu);
+				power += power_of(cpu);
+				continue;
+			}
 
-			power_orig += sg->sgp->power_orig;
-			power += sg->sgp->power;
+			sgp = rq->sd->groups->sgp;
+			power_orig += sgp->power_orig;
+			power += sgp->power;
 		}
 	} else {
 		/*
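
Note: the guard above matters because group power later ends up as a divisor in the load balancer's capacity math, so a zero contribution from a not-yet-attached domain would trap at boot. A standalone model of the fallback (ordinary C, not kernel structures; cpu_power stands in for the per-rq value that update_cpu_power() maintains independently of the domain tree):

#include <stdio.h>

#define SCHED_POWER_SCALE	1024UL

struct rq_model {
	int sd_attached;		/* stands in for rq->sd != NULL */
	unsigned long group_power;	/* rq->sd->groups->sgp->power */
	unsigned long cpu_power;	/* what power_of(cpu) would return */
};

static unsigned long power_contribution(const struct rq_model *rq)
{
	if (!rq->sd_attached)		/* boot: domains not attached yet */
		return rq->cpu_power;	/* never 0, so no /0 later */
	return rq->group_power;
}

int main(void)
{
	struct rq_model booting = { 0, 0, SCHED_POWER_SCALE };
	struct rq_model running = { 1, 2 * SCHED_POWER_SCALE, SCHED_POWER_SCALE };

	printf("boot:    +%lu\n", power_contribution(&booting));	/* 1024 */
	printf("runtime: +%lu\n", power_contribution(&running));	/* 2048 */
	return 0;
}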
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7d57275fc396..1c4065575fa2 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -901,6 +901,13 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 {
 	struct rq *rq = rq_of_rt_rq(rt_rq);
 
+#ifdef CONFIG_RT_GROUP_SCHED
+	/*
+	 * Change rq's cpupri only if rt_rq is the top queue.
+	 */
+	if (&rq->rt != rt_rq)
+		return;
+#endif
 	if (rq->online && prio < prev_prio)
 		cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
 }
@@ -910,6 +917,13 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 {
 	struct rq *rq = rq_of_rt_rq(rt_rq);
 
+#ifdef CONFIG_RT_GROUP_SCHED
+	/*
+	 * Change rq's cpupri only if rt_rq is the top queue.
+	 */
+	if (&rq->rt != rt_rq)
+		return;
+#endif
 	if (rq->online && rt_rq->highest_prio.curr != prev_prio)
 		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
 }
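
Note: with CONFIG_RT_GROUP_SCHED each task group has its own per-CPU rt_rq, but cpupri is a per-CPU property and must track only the root queue embedded in the rq. The guard in both hunks is a plain address comparison; a standalone model (ordinary C, not the kernel's structures):

#include <stdio.h>

/* Each rq embeds its root rt_rq; group rt_rqs are separate objects. */
struct rt_rq { int highest_prio; };
struct rq   { struct rt_rq rt; };

static int is_top_rt_rq(struct rq *rq, struct rt_rq *rt_rq)
{
	return rt_rq == &rq->rt;	/* address identity picks out the root */
}

int main(void)
{
	struct rq rq = { { 99 } };
	struct rt_rq group_rt_rq = { 99 };	/* a task group's per-CPU queue */

	printf("root queue:  %s cpupri\n",
	       is_top_rt_rq(&rq, &rq.rt) ? "update" : "skip");
	printf("group queue: %s cpupri\n",
	       is_top_rt_rq(&rq, &group_rt_rq) ? "update" : "skip");
	return 0;
}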