Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--	kernel/sched/fair.c	178
1 file changed, 95 insertions(+), 83 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e8b652ebe027..c7395d97e4cb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -178,59 +178,61 @@ void sched_init_granularity(void)
 	update_sysctl();
 }
 
-#if BITS_PER_LONG == 32
-# define WMULT_CONST	(~0UL)
-#else
-# define WMULT_CONST	(1UL << 32)
-#endif
-
+#define WMULT_CONST	(~0U)
 #define WMULT_SHIFT	32
 
-/*
- * Shift right and round:
- */
-#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+static void __update_inv_weight(struct load_weight *lw)
+{
+	unsigned long w;
+
+	if (likely(lw->inv_weight))
+		return;
+
+	w = scale_load_down(lw->weight);
+
+	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
+		lw->inv_weight = 1;
+	else if (unlikely(!w))
+		lw->inv_weight = WMULT_CONST;
+	else
+		lw->inv_weight = WMULT_CONST / w;
+}
 
 /*
- * delta *= weight / lw
+ * delta_exec * weight / lw.weight
+ *   OR
+ * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
+ *
+ * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
+ * we're guaranteed shift stays positive because inv_weight is guaranteed to
+ * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
+ *
+ * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
+ * weight/lw.weight <= 1, and therefore our shift will also be positive.
  */
-static unsigned long
-calc_delta_mine(unsigned long delta_exec, unsigned long weight,
-		struct load_weight *lw)
+static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
 {
-	u64 tmp;
+	u64 fact = scale_load_down(weight);
+	int shift = WMULT_SHIFT;
 
-	/*
-	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
-	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
-	 * 2^SCHED_LOAD_RESOLUTION.
-	 */
-	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
-		tmp = (u64)delta_exec * scale_load_down(weight);
-	else
-		tmp = (u64)delta_exec;
+	__update_inv_weight(lw);
 
-	if (!lw->inv_weight) {
-		unsigned long w = scale_load_down(lw->weight);
-
-		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
-			lw->inv_weight = 1;
-		else if (unlikely(!w))
-			lw->inv_weight = WMULT_CONST;
-		else
-			lw->inv_weight = WMULT_CONST / w;
+	if (unlikely(fact >> 32)) {
+		while (fact >> 32) {
+			fact >>= 1;
+			shift--;
+		}
 	}
 
-	/*
-	 * Check whether we'd overflow the 64-bit multiplication:
-	 */
-	if (unlikely(tmp > WMULT_CONST))
-		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
-			WMULT_SHIFT/2);
-	else
-		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
+	/* hint to use a 32x32->64 mul */
+	fact = (u64)(u32)fact * lw->inv_weight;
 
-	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
+	while (fact >> 32) {
+		fact >>= 1;
+		shift--;
+	}
+
+	return mul_u64_u32_shr(delta_exec, fact, shift);
 }
 
 
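Note: the following is a minimal userspace sketch of the fixed-point arithmetic used by the new __calc_delta() above, for illustration only. It assumes a local stand-in for the kernel's mul_u64_u32_shr() helper (here built on the GCC/Clang __int128 extension) and omits scale_load_down()/load-resolution handling and the w >= WMULT_CONST corner case.

/*
 * delta_exec * (weight / lw_weight) evaluated as
 * (delta_exec * (weight * inv_weight)) >> shift, with inv_weight ~= 2^32 / lw_weight.
 */
#include <stdint.h>
#include <stdio.h>

#define WMULT_CONST	(~0U)
#define WMULT_SHIFT	32

/* stand-in for the kernel's mul_u64_u32_shr(); needs 64-bit GCC/Clang */
static uint64_t mul_u64_u32_shr(uint64_t a, uint32_t mul, unsigned int shift)
{
	return (uint64_t)(((unsigned __int128)a * mul) >> shift);
}

static uint64_t calc_delta(uint64_t delta_exec, unsigned long weight,
			   unsigned long lw_weight)
{
	uint32_t inv_weight = lw_weight ? WMULT_CONST / lw_weight : WMULT_CONST;
	uint64_t fact = weight;
	int shift = WMULT_SHIFT;

	/* squeeze 'fact' into 32 bits, trading precision for range */
	while (fact >> 32) {
		fact >>= 1;
		shift--;
	}

	/* 32x32->64 multiply, then fold the product back into 32 bits */
	fact = (uint64_t)(uint32_t)fact * inv_weight;
	while (fact >> 32) {
		fact >>= 1;
		shift--;
	}

	return mul_u64_u32_shr(delta_exec, fact, shift);
}

int main(void)
{
	/* 6 ms of runtime, nice-0 weight 1024, runqueue weight 3072: expect ~2 ms */
	printf("%llu\n", (unsigned long long)calc_delta(6000000ULL, 1024, 3072));
	return 0;
}

With inv_weight precomputed by __update_inv_weight(), the hot path costs a couple of 32x32->64 multiplies and shifts instead of a 64-bit division.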
@@ -443,7 +445,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 
 /**************************************************************
  * Scheduling class tree data structure manipulation methods:
@@ -612,11 +614,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 /*
  * delta /= w
  */
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
 {
 	if (unlikely(se->load.weight != NICE_0_LOAD))
-		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
+		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
 
 	return delta;
 }
@@ -665,7 +666,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 			update_load_add(&lw, se->load.weight);
 			load = &lw;
 		}
-		slice = calc_delta_mine(slice, se->load.weight, load);
+		slice = __calc_delta(slice, se->load.weight, load);
 	}
 	return slice;
 }
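As a worked example of what this call computes (illustrative numbers, not part of the patch): with a 6 ms scheduling period and two runnable nice-0 entities (weight 1024 each, runqueue weight 2048), __calc_delta(6 ms, 1024, load) returns 6 ms * 1024/2048 = 3 ms per entity; with one nice-0 (weight 1024) and one nice-5 entity (weight 335), the slices become roughly 6 ms * 1024/1359 ≈ 4.5 ms and 6 ms * 335/1359 ≈ 1.5 ms.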
@@ -703,47 +704,32 @@ void init_task_runnable_average(struct task_struct *p)
 #endif
 
 /*
- * Update the current task's runtime statistics. Skip current tasks that
- * are not in our scheduling class.
+ * Update the current task's runtime statistics.
  */
-static inline void
-__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
-	      unsigned long delta_exec)
-{
-	unsigned long delta_exec_weighted;
-
-	schedstat_set(curr->statistics.exec_max,
-			max((u64)delta_exec, curr->statistics.exec_max));
-
-	curr->sum_exec_runtime += delta_exec;
-	schedstat_add(cfs_rq, exec_clock, delta_exec);
-	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
-
-	curr->vruntime += delta_exec_weighted;
-	update_min_vruntime(cfs_rq);
-}
-
 static void update_curr(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *curr = cfs_rq->curr;
 	u64 now = rq_clock_task(rq_of(cfs_rq));
-	unsigned long delta_exec;
+	u64 delta_exec;
 
 	if (unlikely(!curr))
 		return;
 
-	/*
-	 * Get the amount of time the current task was running
-	 * since the last time we changed load (this cannot
-	 * overflow on 32 bits):
-	 */
-	delta_exec = (unsigned long)(now - curr->exec_start);
-	if (!delta_exec)
+	delta_exec = now - curr->exec_start;
+	if (unlikely((s64)delta_exec <= 0))
 		return;
 
-	__update_curr(cfs_rq, curr, delta_exec);
 	curr->exec_start = now;
 
+	schedstat_set(curr->statistics.exec_max,
+		      max(delta_exec, curr->statistics.exec_max));
+
+	curr->sum_exec_runtime += delta_exec;
+	schedstat_add(cfs_rq, exec_clock, delta_exec);
+
+	curr->vruntime += calc_delta_fair(delta_exec, curr);
+	update_min_vruntime(cfs_rq);
+
 	if (entity_is_task(curr)) {
 		struct task_struct *curtask = task_of(curr);
 
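Note on the new "(s64)delta_exec <= 0" guard above: with delta_exec now a u64, a clock sample that is not ahead of exec_start wraps to a huge unsigned value whose signed interpretation is negative, so both the zero and the backwards cases are filtered with one test. A stand-alone illustration with made-up values, not kernel code:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t now = 1000, exec_start = 1005;	/* clock appears to have gone backwards */
	uint64_t delta_exec = now - exec_start;	/* wraps to 2^64 - 5 */

	printf("u64 delta: %llu\n", (unsigned long long)delta_exec);
	printf("skip accounting: %d\n", (int64_t)delta_exec <= 0);	/* prints 1 */
	return 0;
}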
@@ -1752,6 +1738,13 @@ void task_numa_work(struct callback_head *work)
 		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
 			continue;
 
+		/*
+		 * Skip inaccessible VMAs to avoid any confusion between
+		 * PROT_NONE and NUMA hinting ptes
+		 */
+		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+			continue;
+
 		do {
 			start = max(start, vma->vm_start);
 			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
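For illustration, an "inaccessible VMA" in the sense of the new check is simply a mapping with none of the read/write/execute permissions, e.g. one created with PROT_NONE from userspace (a sketch, not part of the patch):

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	/* PROT_NONE: no VM_READ/VM_WRITE/VM_EXEC, so task_numa_work() skips this VMA */
	void *p = mmap(NULL, 4096, PROT_NONE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	printf("inaccessible mapping at %p\n", p);
	munmap(p, 4096);
	return 0;
}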
@@ -3015,8 +3008,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	}
 }
 
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-				     unsigned long delta_exec)
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
 	/* dock delta_exec before expiring quota (as it could span periods) */
 	cfs_rq->runtime_remaining -= delta_exec;
@@ -3034,7 +3026,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 }
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
 	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
 		return;
@@ -3574,8 +3566,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 	return rq_clock_task(rq_of(cfs_rq));
 }
 
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-				     unsigned long delta_exec) {}
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -5379,10 +5370,31 @@ void update_group_power(struct sched_domain *sd, int cpu)
 		 */
 
 		for_each_cpu(cpu, sched_group_cpus(sdg)) {
-			struct sched_group *sg = cpu_rq(cpu)->sd->groups;
+			struct sched_group_power *sgp;
+			struct rq *rq = cpu_rq(cpu);
+
+			/*
+			 * build_sched_domains() -> init_sched_groups_power()
+			 * gets here before we've attached the domains to the
+			 * runqueues.
+			 *
+			 * Use power_of(), which is set irrespective of domains
+			 * in update_cpu_power().
+			 *
+			 * This avoids power/power_orig from being 0 and
+			 * causing divide-by-zero issues on boot.
+			 *
+			 * Runtime updates will correct power_orig.
+			 */
+			if (unlikely(!rq->sd)) {
+				power_orig += power_of(cpu);
+				power += power_of(cpu);
+				continue;
+			}
 
-			power_orig += sg->sgp->power_orig;
-			power += sg->sgp->power;
+			sgp = rq->sd->groups->sgp;
+			power_orig += sgp->power_orig;
+			power += sgp->power;
 		}
 	} else {
 		/*