Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--  kernel/sched/core.c | 292
1 file changed, 149 insertions(+), 143 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6edbef296ece..268a45ea238c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -73,6 +73,7 @@
 #include <linux/init_task.h>
 #include <linux/binfmts.h>
 #include <linux/context_tracking.h>
+#include <linux/compiler.h>
 
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
@@ -432,7 +433,7 @@ void hrtick_start(struct rq *rq, u64 delay)
         if (rq == this_rq()) {
                 __hrtick_restart(rq);
         } else if (!rq->hrtick_csd_pending) {
-                __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
+                smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
                 rq->hrtick_csd_pending = 1;
         }
 }
@@ -555,12 +556,15 @@ void resched_cpu(int cpu)
  * selecting an idle cpu will add more delays to the timers than intended
  * (as that cpu's timer base may not be uptodate wrt jiffies etc).
  */
-int get_nohz_timer_target(void)
+int get_nohz_timer_target(int pinned)
 {
         int cpu = smp_processor_id();
         int i;
         struct sched_domain *sd;
 
+        if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
+                return cpu;
+
         rcu_read_lock();
         for_each_domain(cpu, sd) {
                 for_each_cpu(i, sched_domain_span(sd)) {
@@ -823,19 +827,13 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 #endif
 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
         if (static_key_false((&paravirt_steal_rq_enabled))) {
-                u64 st;
-
                 steal = paravirt_steal_clock(cpu_of(rq));
                 steal -= rq->prev_steal_time_rq;
 
                 if (unlikely(steal > delta))
                         steal = delta;
 
-                st = steal_ticks(steal);
-                steal = st * TICK_NSEC;
-
                 rq->prev_steal_time_rq += steal;
-
                 delta -= steal;
         }
 #endif
@@ -1745,8 +1743,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
         p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
         p->numa_scan_period = sysctl_numa_balancing_scan_delay;
         p->numa_work.next = &p->numa_work;
-        p->numa_faults = NULL;
-        p->numa_faults_buffer = NULL;
+        p->numa_faults_memory = NULL;
+        p->numa_faults_buffer_memory = NULL;
+        p->last_task_numa_placement = 0;
+        p->last_sum_exec_runtime = 0;
 
         INIT_LIST_HEAD(&p->numa_entry);
         p->numa_group = NULL;
@@ -2149,8 +2149,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
         if (mm)
                 mmdrop(mm);
         if (unlikely(prev_state == TASK_DEAD)) {
-                task_numa_free(prev);
-
                 if (prev->sched_class->task_dead)
                         prev->sched_class->task_dead(prev);
 
@@ -2167,13 +2165,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 
 #ifdef CONFIG_SMP
 
-/* assumes rq->lock is held */
-static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
-{
-        if (prev->sched_class->pre_schedule)
-                prev->sched_class->pre_schedule(rq, prev);
-}
-
 /* rq->lock is NOT held, but preemption is disabled */
 static inline void post_schedule(struct rq *rq)
 {
@@ -2191,10 +2182,6 @@ static inline void post_schedule(struct rq *rq)
 
 #else
 
-static inline void pre_schedule(struct rq *rq, struct task_struct *p)
-{
-}
-
 static inline void post_schedule(struct rq *rq)
 {
 }
@@ -2510,8 +2497,13 @@ void __kprobes preempt_count_add(int val)
         DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
                                 PREEMPT_MASK - 10);
 #endif
-        if (preempt_count() == val)
-                trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+        if (preempt_count() == val) {
+                unsigned long ip = get_parent_ip(CALLER_ADDR1);
+#ifdef CONFIG_DEBUG_PREEMPT
+                current->preempt_disable_ip = ip;
+#endif
+                trace_preempt_off(CALLER_ADDR0, ip);
+        }
 }
 EXPORT_SYMBOL(preempt_count_add);
 
@@ -2554,6 +2546,13 @@ static noinline void __schedule_bug(struct task_struct *prev)
         print_modules();
         if (irqs_disabled())
                 print_irqtrace_events(prev);
+#ifdef CONFIG_DEBUG_PREEMPT
+        if (in_atomic_preempt_off()) {
+                pr_err("Preemption disabled at:");
+                print_ip_sym(current->preempt_disable_ip);
+                pr_cont("\n");
+        }
+#endif
         dump_stack();
         add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 }
@@ -2577,36 +2576,34 @@ static inline void schedule_debug(struct task_struct *prev)
         schedstat_inc(this_rq(), sched_count);
 }
 
-static void put_prev_task(struct rq *rq, struct task_struct *prev)
-{
-        if (prev->on_rq || rq->skip_clock_update < 0)
-                update_rq_clock(rq);
-        prev->sched_class->put_prev_task(rq, prev);
-}
-
 /*
  * Pick up the highest-prio task:
  */
 static inline struct task_struct *
-pick_next_task(struct rq *rq)
+pick_next_task(struct rq *rq, struct task_struct *prev)
 {
-        const struct sched_class *class;
+        const struct sched_class *class = &fair_sched_class;
         struct task_struct *p;
 
         /*
          * Optimization: we know that if all tasks are in
          * the fair class we can call that function directly:
          */
-        if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
-                p = fair_sched_class.pick_next_task(rq);
-                if (likely(p))
+        if (likely(prev->sched_class == class &&
+                   rq->nr_running == rq->cfs.h_nr_running)) {
+                p = fair_sched_class.pick_next_task(rq, prev);
+                if (likely(p && p != RETRY_TASK))
                         return p;
         }
 
+again:
         for_each_class(class) {
-                p = class->pick_next_task(rq);
-                if (p)
+                p = class->pick_next_task(rq, prev);
+                if (p) {
+                        if (unlikely(p == RETRY_TASK))
+                                goto again;
                         return p;
+                }
         }
 
         BUG(); /* the idle class will always have a runnable task */
@@ -2700,13 +2697,10 @@ need_resched:
                 switch_count = &prev->nvcsw;
         }
 
-        pre_schedule(rq, prev);
-
-        if (unlikely(!rq->nr_running))
-                idle_balance(cpu, rq);
+        if (prev->on_rq || rq->skip_clock_update < 0)
+                update_rq_clock(rq);
 
-        put_prev_task(rq, prev);
-        next = pick_next_task(rq);
+        next = pick_next_task(rq, prev);
         clear_tsk_need_resched(prev);
         clear_preempt_need_resched();
         rq->skip_clock_update = 0;
@@ -2852,52 +2846,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
 }
 EXPORT_SYMBOL(default_wake_function);
 
-static long __sched
-sleep_on_common(wait_queue_head_t *q, int state, long timeout)
-{
-        unsigned long flags;
-        wait_queue_t wait;
-
-        init_waitqueue_entry(&wait, current);
-
-        __set_current_state(state);
-
-        spin_lock_irqsave(&q->lock, flags);
-        __add_wait_queue(q, &wait);
-        spin_unlock(&q->lock);
-        timeout = schedule_timeout(timeout);
-        spin_lock_irq(&q->lock);
-        __remove_wait_queue(q, &wait);
-        spin_unlock_irqrestore(&q->lock, flags);
-
-        return timeout;
-}
-
-void __sched interruptible_sleep_on(wait_queue_head_t *q)
-{
-        sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
-}
-EXPORT_SYMBOL(interruptible_sleep_on);
-
-long __sched
-interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
-{
-        return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
-}
-EXPORT_SYMBOL(interruptible_sleep_on_timeout);
-
-void __sched sleep_on(wait_queue_head_t *q)
-{
-        sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
-}
-EXPORT_SYMBOL(sleep_on);
-
-long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
-{
-        return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
-}
-EXPORT_SYMBOL(sleep_on_timeout);
-
 #ifdef CONFIG_RT_MUTEXES
 
 /*
@@ -2908,7 +2856,8 @@ EXPORT_SYMBOL(sleep_on_timeout);
  * This function changes the 'effective' priority of a task. It does
  * not touch ->normal_prio like __setscheduler().
  *
- * Used by the rt_mutex code to implement priority inheritance logic.
+ * Used by the rt_mutex code to implement priority inheritance
+ * logic. Call site only calls if the priority of the task changed.
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
@@ -2998,7 +2947,7 @@ void set_user_nice(struct task_struct *p, long nice)
         unsigned long flags;
         struct rq *rq;
 
-        if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
+        if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
                 return;
         /*
          * We have to be careful, if called from sys_setpriority(),
@@ -3076,11 +3025,11 @@ SYSCALL_DEFINE1(nice, int, increment)
         if (increment > 40)
                 increment = 40;
 
-        nice = TASK_NICE(current) + increment;
-        if (nice < -20)
-                nice = -20;
-        if (nice > 19)
-                nice = 19;
+        nice = task_nice(current) + increment;
+        if (nice < MIN_NICE)
+                nice = MIN_NICE;
+        if (nice > MAX_NICE)
+                nice = MAX_NICE;
 
         if (increment < 0 && !can_nice(current, nice))
                 return -EPERM;
@@ -3109,18 +3058,6 @@ int task_prio(const struct task_struct *p)
 }
 
 /**
- * task_nice - return the nice value of a given task.
- * @p: the task in question.
- *
- * Return: The nice value [ -20 ... 0 ... 19 ].
- */
-int task_nice(const struct task_struct *p)
-{
-        return TASK_NICE(p);
-}
-EXPORT_SYMBOL(task_nice);
-
-/**
  * idle_cpu - is a given cpu idle currently?
  * @cpu: the processor in question.
  *
@@ -3189,9 +3126,8 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
         dl_se->dl_new = 1;
 }
 
-/* Actually do priority change: must hold pi & rq lock. */
-static void __setscheduler(struct rq *rq, struct task_struct *p,
-                           const struct sched_attr *attr)
+static void __setscheduler_params(struct task_struct *p,
+                const struct sched_attr *attr)
 {
         int policy = attr->sched_policy;
 
@@ -3211,9 +3147,21 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
          * getparam()/getattr() don't report silly values for !rt tasks.
          */
         p->rt_priority = attr->sched_priority;
-
         p->normal_prio = normal_prio(p);
-        p->prio = rt_mutex_getprio(p);
+        set_load_weight(p);
+}
+
+/* Actually do priority change: must hold pi & rq lock. */
+static void __setscheduler(struct rq *rq, struct task_struct *p,
+                           const struct sched_attr *attr)
+{
+        __setscheduler_params(p, attr);
+
+        /*
+         * If we get here, there was no pi waiters boosting the
+         * task. It is safe to use the normal prio.
+         */
+        p->prio = normal_prio(p);
 
         if (dl_prio(p->prio))
                 p->sched_class = &dl_sched_class;
@@ -3221,8 +3169,6 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
                 p->sched_class = &rt_sched_class;
         else
                 p->sched_class = &fair_sched_class;
-
-        set_load_weight(p);
 }
 
 static void
@@ -3275,6 +3221,8 @@ static int __sched_setscheduler(struct task_struct *p,
                                 const struct sched_attr *attr,
                                 bool user)
 {
+        int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
+                      MAX_RT_PRIO - 1 - attr->sched_priority;
         int retval, oldprio, oldpolicy = -1, on_rq, running;
         int policy = attr->sched_policy;
         unsigned long flags;
@@ -3319,7 +3267,7 @@ recheck:
          */
         if (user && !capable(CAP_SYS_NICE)) {
                 if (fair_policy(policy)) {
-                        if (attr->sched_nice < TASK_NICE(p) &&
+                        if (attr->sched_nice < task_nice(p) &&
                             !can_nice(p, attr->sched_nice))
                                 return -EPERM;
                 }
@@ -3338,12 +3286,21 @@ recheck:
                         return -EPERM;
                 }
 
+                /*
+                 * Can't set/change SCHED_DEADLINE policy at all for now
+                 * (safest behavior); in the future we would like to allow
+                 * unprivileged DL tasks to increase their relative deadline
+                 * or reduce their runtime (both ways reducing utilization)
+                 */
+                if (dl_policy(policy))
+                        return -EPERM;
+
                 /*
                  * Treat SCHED_IDLE as nice 20. Only allow a switch to
                  * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
                  */
                 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
-                        if (!can_nice(p, TASK_NICE(p)))
+                        if (!can_nice(p, task_nice(p)))
                                 return -EPERM;
                 }
 
@@ -3380,16 +3337,18 @@ recheck:
         }
 
         /*
-         * If not changing anything there's no need to proceed further:
+         * If not changing anything there's no need to proceed further,
+         * but store a possible modification of reset_on_fork.
          */
         if (unlikely(policy == p->policy)) {
-                if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p))
+                if (fair_policy(policy) && attr->sched_nice != task_nice(p))
                         goto change;
                 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
                         goto change;
                 if (dl_policy(policy))
                         goto change;
 
+                p->sched_reset_on_fork = reset_on_fork;
                 task_rq_unlock(rq, p, &flags);
                 return 0;
         }
@@ -3443,6 +3402,24 @@ change:
                 return -EBUSY;
         }
 
+        p->sched_reset_on_fork = reset_on_fork;
+        oldprio = p->prio;
+
+        /*
+         * Special case for priority boosted tasks.
+         *
+         * If the new priority is lower or equal (user space view)
+         * than the current (boosted) priority, we just store the new
+         * normal parameters and do not touch the scheduler class and
+         * the runqueue. This will be done when the task deboost
+         * itself.
+         */
+        if (rt_mutex_check_prio(p, newprio)) {
+                __setscheduler_params(p, attr);
+                task_rq_unlock(rq, p, &flags);
+                return 0;
+        }
+
         on_rq = p->on_rq;
         running = task_current(rq, p);
         if (on_rq)
@@ -3450,16 +3427,18 @@ change:
         if (running)
                 p->sched_class->put_prev_task(rq, p);
 
-        p->sched_reset_on_fork = reset_on_fork;
-
-        oldprio = p->prio;
         prev_class = p->sched_class;
         __setscheduler(rq, p, attr);
 
         if (running)
                 p->sched_class->set_curr_task(rq);
-        if (on_rq)
-                enqueue_task(rq, p, 0);
+        if (on_rq) {
+                /*
+                 * We enqueue to tail when the priority of a task is
+                 * increased (user space view).
+                 */
+                enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+        }
 
         check_class_changed(rq, p, prev_class, oldprio);
         task_rq_unlock(rq, p, &flags);
@@ -3615,7 +3594,7 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
          * XXX: do we want to be lenient like existing syscalls; or do we want
          * to be strict and return an error on out-of-bounds values?
          */
-        attr->sched_nice = clamp(attr->sched_nice, -20, 19);
+        attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
 
 out:
         return ret;
@@ -3836,7 +3815,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
         else if (task_has_rt_policy(p))
                 attr.sched_priority = p->rt_priority;
         else
-                attr.sched_nice = TASK_NICE(p);
+                attr.sched_nice = task_nice(p);
 
         rcu_read_unlock();
 
@@ -4474,6 +4453,7 @@ void init_idle(struct task_struct *idle, int cpu)
         rcu_read_unlock();
 
         rq->curr = rq->idle = idle;
+        idle->on_rq = 1;
 #if defined(CONFIG_SMP)
         idle->on_cpu = 1;
 #endif
@@ -4693,8 +4673,10 @@ void idle_task_exit(void)
 
         BUG_ON(cpu_online(smp_processor_id()));
 
-        if (mm != &init_mm)
+        if (mm != &init_mm) {
                 switch_mm(mm, &init_mm, current);
+                finish_arch_post_lock_switch();
+        }
         mmdrop(mm);
 }
 
@@ -4712,6 +4694,22 @@ static void calc_load_migrate(struct rq *rq)
         atomic_long_add(delta, &calc_load_tasks);
 }
 
+static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+{
+}
+
+static const struct sched_class fake_sched_class = {
+        .put_prev_task = put_prev_task_fake,
+};
+
+static struct task_struct fake_task = {
+        /*
+         * Avoid pull_{rt,dl}_task()
+         */
+        .prio = MAX_PRIO + 1,
+        .sched_class = &fake_sched_class,
+};
+
 /*
  * Migrate all tasks from the rq, sleeping tasks will be migrated by
  * try_to_wake_up()->select_task_rq().
@@ -4752,7 +4750,7 @@ static void migrate_tasks(unsigned int dead_cpu)
                 if (rq->nr_running == 1)
                         break;
 
-                next = pick_next_task(rq);
+                next = pick_next_task(rq, &fake_task);
                 BUG_ON(!next);
                 next->sched_class->put_prev_task(rq, next);
 
@@ -4842,7 +4840,7 @@ set_table_entry(struct ctl_table *entry,
 static struct ctl_table *
 sd_alloc_ctl_domain_table(struct sched_domain *sd)
 {
-        struct ctl_table *table = sd_alloc_ctl_entry(13);
+        struct ctl_table *table = sd_alloc_ctl_entry(14);
 
         if (table == NULL)
                 return NULL;
@@ -4870,9 +4868,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
                 sizeof(int), 0644, proc_dointvec_minmax, false);
         set_table_entry(&table[10], "flags", &sd->flags,
                 sizeof(int), 0644, proc_dointvec_minmax, false);
-        set_table_entry(&table[11], "name", sd->name,
+        set_table_entry(&table[11], "max_newidle_lb_cost",
+                &sd->max_newidle_lb_cost,
+                sizeof(long), 0644, proc_doulongvec_minmax, false);
+        set_table_entry(&table[12], "name", sd->name,
                 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
-        /* &table[12] is terminator */
+        /* &table[13] is terminator */
 
         return table;
 }
@@ -6452,7 +6453,7 @@ static cpumask_var_t fallback_doms;
  * cpu core maps. It is supposed to return 1 if the topology changed
  * or 0 if it stayed the same.
  */
-int __attribute__((weak)) arch_update_cpu_topology(void)
+int __weak arch_update_cpu_topology(void)
 {
         return 0;
 }
@@ -6849,7 +6850,6 @@ void __init sched_init(void)
 
                 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
 #ifdef CONFIG_RT_GROUP_SCHED
-                INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
                 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
 #endif
 
@@ -6938,7 +6938,8 @@ void __might_sleep(const char *file, int line, int preempt_offset)
         static unsigned long prev_jiffy;        /* ratelimiting */
 
         rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
-        if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
+        if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
+             !is_idle_task(current)) ||
             system_state != SYSTEM_RUNNING || oops_in_progress)
                 return;
         if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
@@ -6956,6 +6957,13 @@ void __might_sleep(const char *file, int line, int preempt_offset)
         debug_show_held_locks(current);
         if (irqs_disabled())
                 print_irqtrace_events(current);
+#ifdef CONFIG_DEBUG_PREEMPT
+        if (!preempt_count_equals(preempt_offset)) {
+                pr_err("Preemption disabled at:");
+                print_ip_sym(current->preempt_disable_ip);
+                pr_cont("\n");
+        }
+#endif
         dump_stack();
 }
 EXPORT_SYMBOL(__might_sleep);
@@ -7009,7 +7017,7 @@ void normalize_rt_tasks(void)
                  * Renice negative nice level userspace
                  * tasks back to 0:
                  */
-                if (TASK_NICE(p) < 0 && p->mm)
+                if (task_nice(p) < 0 && p->mm)
                         set_user_nice(p, 0);
                 continue;
         }
@@ -7177,7 +7185,7 @@ void sched_move_task(struct task_struct *tsk)
         if (unlikely(running))
                 tsk->sched_class->put_prev_task(rq, tsk);
 
-        tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id,
+        tg = container_of(task_css_check(tsk, cpu_cgrp_id,
                                 lockdep_is_held(&tsk->sighand->siglock)),
                           struct task_group, css);
         tg = autogroup_task_group(tsk, tg);
@@ -7604,7 +7612,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
 {
         struct task_struct *task;
 
-        cgroup_taskset_for_each(task, css, tset) {
+        cgroup_taskset_for_each(task, tset) {
 #ifdef CONFIG_RT_GROUP_SCHED
                 if (!sched_rt_can_attach(css_tg(css), task))
                         return -EINVAL;
@@ -7622,7 +7630,7 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
 {
         struct task_struct *task;
 
-        cgroup_taskset_for_each(task, css, tset)
+        cgroup_taskset_for_each(task, tset)
                 sched_move_task(task);
 }
 
@@ -7961,8 +7969,7 @@ static struct cftype cpu_files[] = {
         { }     /* terminate */
 };
 
-struct cgroup_subsys cpu_cgroup_subsys = {
-        .name           = "cpu",
+struct cgroup_subsys cpu_cgrp_subsys = {
         .css_alloc      = cpu_cgroup_css_alloc,
         .css_free       = cpu_cgroup_css_free,
         .css_online     = cpu_cgroup_css_online,
@@ -7970,7 +7977,6 @@ struct cgroup_subsys cpu_cgroup_subsys = {
         .can_attach     = cpu_cgroup_can_attach,
         .attach         = cpu_cgroup_attach,
         .exit           = cpu_cgroup_exit,
-        .subsys_id      = cpu_cgroup_subsys_id,
         .base_cftypes   = cpu_files,
         .early_init     = 1,
 };