Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/Makefile        2
-rw-r--r--  kernel/sched/auto_group.c    2
-rw-r--r--  kernel/sched/clock.c         7
-rw-r--r--  kernel/sched/core.c        292
-rw-r--r--  kernel/sched/cpuacct.c       6
-rw-r--r--  kernel/sched/cpudeadline.c   4
-rw-r--r--  kernel/sched/cputime.c      20
-rw-r--r--  kernel/sched/deadline.c     66
-rw-r--r--  kernel/sched/debug.c        10
-rw-r--r--  kernel/sched/fair.c        608
-rw-r--r--  kernel/sched/idle.c        265
-rw-r--r--  kernel/sched/idle_task.c    25
-rw-r--r--  kernel/sched/rt.c          110
-rw-r--r--  kernel/sched/sched.h        75
-rw-r--r--  kernel/sched/stats.c         2
-rw-r--r--  kernel/sched/stop_task.c    15
16 files changed, 1085 insertions(+), 424 deletions(-)
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 9a95c8c2af2a..ab32b7b0db5c 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -13,7 +13,7 @@ endif
 
 obj-y += core.o proc.o clock.o cputime.o
 obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-y += wait.o completion.o
+obj-y += wait.o completion.o idle.o
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 4a073539c58e..e73efba98301 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -203,7 +203,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
 	struct autogroup *ag;
 	int err;
 
-	if (nice < -20 || nice > 19)
+	if (nice < MIN_NICE || nice > MAX_NICE)
 		return -EINVAL;
 
 	err = security_task_setnice(current, nice);
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 43c2bcc35761..3ef6451e972e 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -60,13 +60,14 @@
 #include <linux/sched.h>
 #include <linux/static_key.h>
 #include <linux/workqueue.h>
+#include <linux/compiler.h>
 
 /*
  * Scheduler clock - returns current time in nanosec units.
  * This is default implementation.
  * Architectures and sub-architectures can override this.
  */
-unsigned long long __attribute__((weak)) sched_clock(void)
+unsigned long long __weak sched_clock(void)
 {
 	return (unsigned long long)(jiffies - INITIAL_JIFFIES)
 					* (NSEC_PER_SEC / HZ);
@@ -301,14 +302,14 @@ u64 sched_clock_cpu(int cpu)
 	if (unlikely(!sched_clock_running))
 		return 0ull;
 
-	preempt_disable();
+	preempt_disable_notrace();
 	scd = cpu_sdc(cpu);
 
 	if (cpu != smp_processor_id())
 		clock = sched_clock_remote(scd);
 	else
 		clock = sched_clock_local(scd);
-	preempt_enable();
+	preempt_enable_notrace();
 
 	return clock;
 }
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6edbef296ece..268a45ea238c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -73,6 +73,7 @@
 #include <linux/init_task.h>
 #include <linux/binfmts.h>
 #include <linux/context_tracking.h>
+#include <linux/compiler.h>
 
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
@@ -432,7 +433,7 @@ void hrtick_start(struct rq *rq, u64 delay)
 	if (rq == this_rq()) {
 		__hrtick_restart(rq);
 	} else if (!rq->hrtick_csd_pending) {
-		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
+		smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
 		rq->hrtick_csd_pending = 1;
 	}
 }
@@ -555,12 +556,15 @@ void resched_cpu(int cpu)
  * selecting an idle cpu will add more delays to the timers than intended
  * (as that cpu's timer base may not be uptodate wrt jiffies etc).
  */
-int get_nohz_timer_target(void)
+int get_nohz_timer_target(int pinned)
 {
 	int cpu = smp_processor_id();
 	int i;
 	struct sched_domain *sd;
 
+	if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
+		return cpu;
+
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
 		for_each_cpu(i, sched_domain_span(sd)) {
@@ -823,19 +827,13 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 #endif
 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
 	if (static_key_false((&paravirt_steal_rq_enabled))) {
-		u64 st;
-
 		steal = paravirt_steal_clock(cpu_of(rq));
 		steal -= rq->prev_steal_time_rq;
 
 		if (unlikely(steal > delta))
 			steal = delta;
 
-		st = steal_ticks(steal);
-		steal = st * TICK_NSEC;
-
 		rq->prev_steal_time_rq += steal;
-
 		delta -= steal;
 	}
 #endif
@@ -1745,8 +1743,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
 	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
 	p->numa_work.next = &p->numa_work;
-	p->numa_faults = NULL;
-	p->numa_faults_buffer = NULL;
+	p->numa_faults_memory = NULL;
+	p->numa_faults_buffer_memory = NULL;
+	p->last_task_numa_placement = 0;
+	p->last_sum_exec_runtime = 0;
 
 	INIT_LIST_HEAD(&p->numa_entry);
 	p->numa_group = NULL;
@@ -2149,8 +2149,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	if (mm)
 		mmdrop(mm);
 	if (unlikely(prev_state == TASK_DEAD)) {
-		task_numa_free(prev);
-
 		if (prev->sched_class->task_dead)
 			prev->sched_class->task_dead(prev);
 
@@ -2167,13 +2165,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 
 #ifdef CONFIG_SMP
 
-/* assumes rq->lock is held */
-static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
-{
-	if (prev->sched_class->pre_schedule)
-		prev->sched_class->pre_schedule(rq, prev);
-}
-
 /* rq->lock is NOT held, but preemption is disabled */
 static inline void post_schedule(struct rq *rq)
 {
@@ -2191,10 +2182,6 @@ static inline void post_schedule(struct rq *rq)
 
 #else
 
-static inline void pre_schedule(struct rq *rq, struct task_struct *p)
-{
-}
-
 static inline void post_schedule(struct rq *rq)
 {
 }
@@ -2510,8 +2497,13 @@ void __kprobes preempt_count_add(int val)
 	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
 				PREEMPT_MASK - 10);
 #endif
-	if (preempt_count() == val)
-		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+	if (preempt_count() == val) {
+		unsigned long ip = get_parent_ip(CALLER_ADDR1);
+#ifdef CONFIG_DEBUG_PREEMPT
+		current->preempt_disable_ip = ip;
+#endif
+		trace_preempt_off(CALLER_ADDR0, ip);
+	}
 }
 EXPORT_SYMBOL(preempt_count_add);
 
@@ -2554,6 +2546,13 @@ static noinline void __schedule_bug(struct task_struct *prev)
 	print_modules();
 	if (irqs_disabled())
 		print_irqtrace_events(prev);
+#ifdef CONFIG_DEBUG_PREEMPT
+	if (in_atomic_preempt_off()) {
+		pr_err("Preemption disabled at:");
+		print_ip_sym(current->preempt_disable_ip);
+		pr_cont("\n");
+	}
+#endif
 	dump_stack();
 	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 }
@@ -2577,36 +2576,34 @@ static inline void schedule_debug(struct task_struct *prev)
 	schedstat_inc(this_rq(), sched_count);
 }
 
-static void put_prev_task(struct rq *rq, struct task_struct *prev)
-{
-	if (prev->on_rq || rq->skip_clock_update < 0)
-		update_rq_clock(rq);
-	prev->sched_class->put_prev_task(rq, prev);
-}
-
 /*
  * Pick up the highest-prio task:
  */
 static inline struct task_struct *
-pick_next_task(struct rq *rq)
+pick_next_task(struct rq *rq, struct task_struct *prev)
 {
-	const struct sched_class *class;
+	const struct sched_class *class = &fair_sched_class;
 	struct task_struct *p;
 
 	/*
 	 * Optimization: we know that if all tasks are in
 	 * the fair class we can call that function directly:
 	 */
-	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
-		p = fair_sched_class.pick_next_task(rq);
-		if (likely(p))
+	if (likely(prev->sched_class == class &&
+		   rq->nr_running == rq->cfs.h_nr_running)) {
+		p = fair_sched_class.pick_next_task(rq, prev);
+		if (likely(p && p != RETRY_TASK))
 			return p;
 	}
 
+again:
 	for_each_class(class) {
-		p = class->pick_next_task(rq);
-		if (p)
+		p = class->pick_next_task(rq, prev);
+		if (p) {
+			if (unlikely(p == RETRY_TASK))
+				goto again;
 			return p;
+		}
 	}
 
 	BUG(); /* the idle class will always have a runnable task */
@@ -2700,13 +2697,10 @@ need_resched:
 		switch_count = &prev->nvcsw;
 	}
 
-	pre_schedule(rq, prev);
-
-	if (unlikely(!rq->nr_running))
-		idle_balance(cpu, rq);
+	if (prev->on_rq || rq->skip_clock_update < 0)
+		update_rq_clock(rq);
 
-	put_prev_task(rq, prev);
-	next = pick_next_task(rq);
+	next = pick_next_task(rq, prev);
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
 	rq->skip_clock_update = 0;
@@ -2852,52 +2846,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
 }
 EXPORT_SYMBOL(default_wake_function);
 
-static long __sched
-sleep_on_common(wait_queue_head_t *q, int state, long timeout)
-{
-	unsigned long flags;
-	wait_queue_t wait;
-
-	init_waitqueue_entry(&wait, current);
-
-	__set_current_state(state);
-
-	spin_lock_irqsave(&q->lock, flags);
-	__add_wait_queue(q, &wait);
-	spin_unlock(&q->lock);
-	timeout = schedule_timeout(timeout);
-	spin_lock_irq(&q->lock);
-	__remove_wait_queue(q, &wait);
-	spin_unlock_irqrestore(&q->lock, flags);
-
-	return timeout;
-}
-
-void __sched interruptible_sleep_on(wait_queue_head_t *q)
-{
-	sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
-}
-EXPORT_SYMBOL(interruptible_sleep_on);
-
-long __sched
-interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
-{
-	return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
-}
-EXPORT_SYMBOL(interruptible_sleep_on_timeout);
-
-void __sched sleep_on(wait_queue_head_t *q)
-{
-	sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
-}
-EXPORT_SYMBOL(sleep_on);
-
-long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
-{
-	return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
-}
-EXPORT_SYMBOL(sleep_on_timeout);
-
 #ifdef CONFIG_RT_MUTEXES
 
 /*
@@ -2908,7 +2856,8 @@ EXPORT_SYMBOL(sleep_on_timeout);
  * This function changes the 'effective' priority of a task. It does
  * not touch ->normal_prio like __setscheduler().
  *
- * Used by the rt_mutex code to implement priority inheritance logic.
+ * Used by the rt_mutex code to implement priority inheritance
+ * logic. Call site only calls if the priority of the task changed.
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
@@ -2998,7 +2947,7 @@ void set_user_nice(struct task_struct *p, long nice)
 	unsigned long flags;
 	struct rq *rq;
 
-	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
+	if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
 		return;
 	/*
 	 * We have to be careful, if called from sys_setpriority(),
@@ -3076,11 +3025,11 @@ SYSCALL_DEFINE1(nice, int, increment)
 	if (increment > 40)
 		increment = 40;
 
-	nice = TASK_NICE(current) + increment;
-	if (nice < -20)
-		nice = -20;
-	if (nice > 19)
-		nice = 19;
+	nice = task_nice(current) + increment;
+	if (nice < MIN_NICE)
+		nice = MIN_NICE;
+	if (nice > MAX_NICE)
+		nice = MAX_NICE;
 
 	if (increment < 0 && !can_nice(current, nice))
 		return -EPERM;
@@ -3109,18 +3058,6 @@ int task_prio(const struct task_struct *p)
 }
 
 /**
- * task_nice - return the nice value of a given task.
- * @p: the task in question.
- *
- * Return: The nice value [ -20 ... 0 ... 19 ].
- */
-int task_nice(const struct task_struct *p)
-{
-	return TASK_NICE(p);
-}
-EXPORT_SYMBOL(task_nice);
-
-/**
  * idle_cpu - is a given cpu idle currently?
  * @cpu: the processor in question.
  *
@@ -3189,9 +3126,8 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
 	dl_se->dl_new = 1;
 }
 
-/* Actually do priority change: must hold pi & rq lock. */
-static void __setscheduler(struct rq *rq, struct task_struct *p,
-			   const struct sched_attr *attr)
+static void __setscheduler_params(struct task_struct *p,
+		const struct sched_attr *attr)
 {
 	int policy = attr->sched_policy;
 
@@ -3211,9 +3147,21 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
 	 * getparam()/getattr() don't report silly values for !rt tasks.
 	 */
 	p->rt_priority = attr->sched_priority;
-
 	p->normal_prio = normal_prio(p);
-	p->prio = rt_mutex_getprio(p);
+	set_load_weight(p);
+}
+
+/* Actually do priority change: must hold pi & rq lock. */
+static void __setscheduler(struct rq *rq, struct task_struct *p,
+			   const struct sched_attr *attr)
+{
+	__setscheduler_params(p, attr);
+
+	/*
+	 * If we get here, there was no pi waiters boosting the
+	 * task. It is safe to use the normal prio.
+	 */
+	p->prio = normal_prio(p);
 
 	if (dl_prio(p->prio))
 		p->sched_class = &dl_sched_class;
@@ -3221,8 +3169,6 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
 		p->sched_class = &rt_sched_class;
 	else
 		p->sched_class = &fair_sched_class;
-
-	set_load_weight(p);
 }
 
 static void
@@ -3275,6 +3221,8 @@ static int __sched_setscheduler(struct task_struct *p,
 				const struct sched_attr *attr,
 				bool user)
 {
+	int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
+		      MAX_RT_PRIO - 1 - attr->sched_priority;
 	int retval, oldprio, oldpolicy = -1, on_rq, running;
 	int policy = attr->sched_policy;
 	unsigned long flags;
@@ -3319,7 +3267,7 @@ recheck:
 	 */
 	if (user && !capable(CAP_SYS_NICE)) {
 		if (fair_policy(policy)) {
-			if (attr->sched_nice < TASK_NICE(p) &&
+			if (attr->sched_nice < task_nice(p) &&
 			    !can_nice(p, attr->sched_nice))
 				return -EPERM;
 		}
@@ -3338,12 +3286,21 @@ recheck:
 				return -EPERM;
 		}
 
+		/*
+		 * Can't set/change SCHED_DEADLINE policy at all for now
+		 * (safest behavior); in the future we would like to allow
+		 * unprivileged DL tasks to increase their relative deadline
+		 * or reduce their runtime (both ways reducing utilization)
+		 */
+		if (dl_policy(policy))
+			return -EPERM;
+
 		/*
 		 * Treat SCHED_IDLE as nice 20. Only allow a switch to
 		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
 		 */
 		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
-			if (!can_nice(p, TASK_NICE(p)))
+			if (!can_nice(p, task_nice(p)))
 				return -EPERM;
 		}
 
@@ -3380,16 +3337,18 @@ recheck:
 	}
 
 	/*
-	 * If not changing anything there's no need to proceed further:
+	 * If not changing anything there's no need to proceed further,
+	 * but store a possible modification of reset_on_fork.
 	 */
 	if (unlikely(policy == p->policy)) {
-		if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p))
+		if (fair_policy(policy) && attr->sched_nice != task_nice(p))
 			goto change;
 		if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
 			goto change;
 		if (dl_policy(policy))
 			goto change;
 
+		p->sched_reset_on_fork = reset_on_fork;
 		task_rq_unlock(rq, p, &flags);
 		return 0;
 	}
@@ -3443,6 +3402,24 @@ change:
 		return -EBUSY;
 	}
 
+	p->sched_reset_on_fork = reset_on_fork;
+	oldprio = p->prio;
+
+	/*
+	 * Special case for priority boosted tasks.
+	 *
+	 * If the new priority is lower or equal (user space view)
+	 * than the current (boosted) priority, we just store the new
+	 * normal parameters and do not touch the scheduler class and
+	 * the runqueue. This will be done when the task deboost
+	 * itself.
+	 */
+	if (rt_mutex_check_prio(p, newprio)) {
+		__setscheduler_params(p, attr);
+		task_rq_unlock(rq, p, &flags);
+		return 0;
+	}
+
 	on_rq = p->on_rq;
 	running = task_current(rq, p);
 	if (on_rq)
@@ -3450,16 +3427,18 @@ change:
 	if (running)
 		p->sched_class->put_prev_task(rq, p);
 
-	p->sched_reset_on_fork = reset_on_fork;
-
-	oldprio = p->prio;
 	prev_class = p->sched_class;
 	__setscheduler(rq, p, attr);
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (on_rq)
-		enqueue_task(rq, p, 0);
+	if (on_rq) {
+		/*
+		 * We enqueue to tail when the priority of a task is
+		 * increased (user space view).
+		 */
+		enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+	}
 
 	check_class_changed(rq, p, prev_class, oldprio);
 	task_rq_unlock(rq, p, &flags);
@@ -3615,7 +3594,7 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
 	 * XXX: do we want to be lenient like existing syscalls; or do we want
 	 * to be strict and return an error on out-of-bounds values?
 	 */
-	attr->sched_nice = clamp(attr->sched_nice, -20, 19);
+	attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
 
 out:
 	return ret;
@@ -3836,7 +3815,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 	else if (task_has_rt_policy(p))
 		attr.sched_priority = p->rt_priority;
 	else
-		attr.sched_nice = TASK_NICE(p);
+		attr.sched_nice = task_nice(p);
 
 	rcu_read_unlock();
 
@@ -4474,6 +4453,7 @@ void init_idle(struct task_struct *idle, int cpu)
 	rcu_read_unlock();
 
 	rq->curr = rq->idle = idle;
+	idle->on_rq = 1;
#if defined(CONFIG_SMP)
 	idle->on_cpu = 1;
 #endif
@@ -4693,8 +4673,10 @@ void idle_task_exit(void)
 
 	BUG_ON(cpu_online(smp_processor_id()));
 
-	if (mm != &init_mm)
+	if (mm != &init_mm) {
 		switch_mm(mm, &init_mm, current);
+		finish_arch_post_lock_switch();
+	}
 	mmdrop(mm);
 }
 
@@ -4712,6 +4694,22 @@ static void calc_load_migrate(struct rq *rq)
 		atomic_long_add(delta, &calc_load_tasks);
 }
 
+static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+{
+}
+
+static const struct sched_class fake_sched_class = {
+	.put_prev_task = put_prev_task_fake,
+};
+
+static struct task_struct fake_task = {
+	/*
+	 * Avoid pull_{rt,dl}_task()
+	 */
+	.prio = MAX_PRIO + 1,
+	.sched_class = &fake_sched_class,
+};
+
 /*
  * Migrate all tasks from the rq, sleeping tasks will be migrated by
  * try_to_wake_up()->select_task_rq().
@@ -4752,7 +4750,7 @@ static void migrate_tasks(unsigned int dead_cpu)
 		if (rq->nr_running == 1)
 			break;
 
-		next = pick_next_task(rq);
+		next = pick_next_task(rq, &fake_task);
 		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
 
@@ -4842,7 +4840,7 @@ set_table_entry(struct ctl_table *entry,
 static struct ctl_table *
 sd_alloc_ctl_domain_table(struct sched_domain *sd)
 {
-	struct ctl_table *table = sd_alloc_ctl_entry(13);
+	struct ctl_table *table = sd_alloc_ctl_entry(14);
 
 	if (table == NULL)
 		return NULL;
@@ -4870,9 +4868,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
 		sizeof(int), 0644, proc_dointvec_minmax, false);
 	set_table_entry(&table[10], "flags", &sd->flags,
 		sizeof(int), 0644, proc_dointvec_minmax, false);
-	set_table_entry(&table[11], "name", sd->name,
+	set_table_entry(&table[11], "max_newidle_lb_cost",
+		&sd->max_newidle_lb_cost,
+		sizeof(long), 0644, proc_doulongvec_minmax, false);
+	set_table_entry(&table[12], "name", sd->name,
 		CORENAME_MAX_SIZE, 0444, proc_dostring, false);
-	/* &table[12] is terminator */
+	/* &table[13] is terminator */
 
 	return table;
 }
@@ -6452,7 +6453,7 @@ static cpumask_var_t fallback_doms;
  * cpu core maps. It is supposed to return 1 if the topology changed
  * or 0 if it stayed the same.
  */
-int __attribute__((weak)) arch_update_cpu_topology(void)
+int __weak arch_update_cpu_topology(void)
 {
 	return 0;
 }
@@ -6849,7 +6850,6 @@ void __init sched_init(void)
 
 		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
 #ifdef CONFIG_RT_GROUP_SCHED
-		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
 		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
 #endif
 
@@ -6938,7 +6938,8 @@ void __might_sleep(const char *file, int line, int preempt_offset)
 	static unsigned long prev_jiffy; /* ratelimiting */
 
 	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
-	if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
+	if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
+	     !is_idle_task(current)) ||
 	    system_state != SYSTEM_RUNNING || oops_in_progress)
 		return;
 	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
@@ -6956,6 +6957,13 @@ void __might_sleep(const char *file, int line, int preempt_offset)
 	debug_show_held_locks(current);
 	if (irqs_disabled())
 		print_irqtrace_events(current);
+#ifdef CONFIG_DEBUG_PREEMPT
+	if (!preempt_count_equals(preempt_offset)) {
+		pr_err("Preemption disabled at:");
+		print_ip_sym(current->preempt_disable_ip);
+		pr_cont("\n");
+	}
+#endif
 	dump_stack();
 }
 EXPORT_SYMBOL(__might_sleep);
@@ -7009,7 +7017,7 @@ void normalize_rt_tasks(void)
 			 * Renice negative nice level userspace
 			 * tasks back to 0:
 			 */
-			if (TASK_NICE(p) < 0 && p->mm)
+			if (task_nice(p) < 0 && p->mm)
 				set_user_nice(p, 0);
 			continue;
 		}
@@ -7177,7 +7185,7 @@ void sched_move_task(struct task_struct *tsk)
 	if (unlikely(running))
 		tsk->sched_class->put_prev_task(rq, tsk);
 
-	tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id,
+	tg = container_of(task_css_check(tsk, cpu_cgrp_id,
 				lockdep_is_held(&tsk->sighand->siglock)),
 			  struct task_group, css);
 	tg = autogroup_task_group(tsk, tg);
@@ -7604,7 +7612,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
 {
 	struct task_struct *task;
 
-	cgroup_taskset_for_each(task, css, tset) {
+	cgroup_taskset_for_each(task, tset) {
 #ifdef CONFIG_RT_GROUP_SCHED
 		if (!sched_rt_can_attach(css_tg(css), task))
 			return -EINVAL;
@@ -7622,7 +7630,7 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
 {
 	struct task_struct *task;
 
-	cgroup_taskset_for_each(task, css, tset)
+	cgroup_taskset_for_each(task, tset)
 		sched_move_task(task);
 }
 
@@ -7961,8 +7969,7 @@ static struct cftype cpu_files[] = {
 	{ } /* terminate */
 };
 
-struct cgroup_subsys cpu_cgroup_subsys = {
-	.name = "cpu",
+struct cgroup_subsys cpu_cgrp_subsys = {
 	.css_alloc = cpu_cgroup_css_alloc,
 	.css_free = cpu_cgroup_css_free,
 	.css_online = cpu_cgroup_css_online,
@@ -7970,7 +7977,6 @@ struct cgroup_subsys cpu_cgroup_subsys = {
 	.can_attach = cpu_cgroup_can_attach,
 	.attach = cpu_cgroup_attach,
 	.exit = cpu_cgroup_exit,
-	.subsys_id = cpu_cgroup_subsys_id,
 	.base_cftypes = cpu_files,
 	.early_init = 1,
 };
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 622e0818f905..c143ee380e3a 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -41,7 +41,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
 /* return cpu accounting group to which this task belongs */
 static inline struct cpuacct *task_ca(struct task_struct *tsk)
 {
-	return css_ca(task_css(tsk, cpuacct_subsys_id));
+	return css_ca(task_css(tsk, cpuacct_cgrp_id));
 }
 
 static inline struct cpuacct *parent_ca(struct cpuacct *ca)
@@ -275,11 +275,9 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
 	rcu_read_unlock();
 }
 
-struct cgroup_subsys cpuacct_subsys = {
-	.name = "cpuacct",
+struct cgroup_subsys cpuacct_cgrp_subsys = {
 	.css_alloc = cpuacct_css_alloc,
 	.css_free = cpuacct_css_free,
-	.subsys_id = cpuacct_subsys_id,
 	.base_cftypes = files,
 	.early_init = 1,
 };
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5b8838b56d1c..5b9bb42b2d47 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -70,7 +70,7 @@ static void cpudl_heapify(struct cpudl *cp, int idx)
 
 static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
 {
-	WARN_ON(!cpu_present(idx) || idx == IDX_INVALID);
+	WARN_ON(idx == IDX_INVALID || !cpu_present(idx));
 
 	if (dl_time_before(new_dl, cp->elements[idx].dl)) {
 		cp->elements[idx].dl = new_dl;
@@ -117,7 +117,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
 	}
 
 out:
-	WARN_ON(!cpu_present(best_cpu) && best_cpu != -1);
+	WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
 
 	return best_cpu;
 }
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 99947919e30b..a95097cb4591 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -142,7 +142,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
 	p->utimescaled += cputime_scaled;
 	account_group_user_time(p, cputime);
 
-	index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
+	index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
 
 	/* Add user time to cpustat. */
 	task_group_account_field(p, index, (__force u64) cputime);
@@ -169,7 +169,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
 	p->gtime += cputime;
 
 	/* Add guest time to cpustat. */
-	if (TASK_NICE(p) > 0) {
+	if (task_nice(p) > 0) {
 		cpustat[CPUTIME_NICE] += (__force u64) cputime;
 		cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
 	} else {
@@ -258,16 +258,22 @@ static __always_inline bool steal_account_process_tick(void)
 {
 #ifdef CONFIG_PARAVIRT
 	if (static_key_false(&paravirt_steal_enabled)) {
-		u64 steal, st = 0;
+		u64 steal;
+		cputime_t steal_ct;
 
 		steal = paravirt_steal_clock(smp_processor_id());
 		steal -= this_rq()->prev_steal_time;
 
-		st = steal_ticks(steal);
-		this_rq()->prev_steal_time += st * TICK_NSEC;
+		/*
+		 * cputime_t may be less precise than nsecs (eg: if it's
+		 * based on jiffies). Lets cast the result to cputime
+		 * granularity and account the rest on the next rounds.
+		 */
+		steal_ct = nsecs_to_cputime(steal);
+		this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
 
-		account_steal_time(st);
-		return st;
+		account_steal_time(steal_ct);
+		return steal_ct;
 	}
 #endif
 	return false;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 15cbc17fbf84..27ef40925525 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -135,7 +135,6 @@ static void update_dl_migration(struct dl_rq *dl_rq)
 static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 {
 	struct task_struct *p = dl_task_of(dl_se);
-	dl_rq = &rq_of_dl_rq(dl_rq)->dl;
 
 	if (p->nr_cpus_allowed > 1)
 		dl_rq->dl_nr_migratory++;
@@ -146,7 +145,6 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 {
 	struct task_struct *p = dl_task_of(dl_se);
-	dl_rq = &rq_of_dl_rq(dl_rq)->dl;
 
 	if (p->nr_cpus_allowed > 1)
 		dl_rq->dl_nr_migratory--;
@@ -212,6 +210,16 @@ static inline int has_pushable_dl_tasks(struct rq *rq)
 
 static int push_dl_task(struct rq *rq);
 
+static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
+{
+	return dl_task(prev);
+}
+
+static inline void set_post_schedule(struct rq *rq)
+{
+	rq->post_schedule = has_pushable_dl_tasks(rq);
+}
+
 #else
 
 static inline
@@ -234,6 +242,19 @@ void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 {
 }
 
+static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
+{
+	return false;
+}
+
+static inline int pull_dl_task(struct rq *rq)
+{
+	return 0;
+}
+
+static inline void set_post_schedule(struct rq *rq)
+{
+}
 #endif /* CONFIG_SMP */
 
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
@@ -564,6 +585,8 @@ int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
 	return 1;
 }
 
+extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
+
 /*
  * Update the current task's runtime statistics (provided it is still
  * a -deadline task and has not been removed from the dl_rq).
@@ -586,8 +609,8 @@ static void update_curr_dl(struct rq *rq)
 	 * approach need further study.
 	 */
 	delta_exec = rq_clock_task(rq) - curr->se.exec_start;
-	if (unlikely((s64)delta_exec < 0))
-		delta_exec = 0;
+	if (unlikely((s64)delta_exec <= 0))
+		return;
 
 	schedstat_set(curr->se.statistics.exec_max,
 		      max(curr->se.statistics.exec_max, delta_exec));
@@ -627,11 +650,13 @@ static void update_curr_dl(struct rq *rq)
 		struct rt_rq *rt_rq = &rq->rt;
 
 		raw_spin_lock(&rt_rq->rt_runtime_lock);
-		rt_rq->rt_time += delta_exec;
 		/*
 		 * We'll let actual RT tasks worry about the overflow here, we
-		 * have our own CBS to keep us inline -- see above.
+		 * have our own CBS to keep us inline; only account when RT
+		 * bandwidth is relevant.
 		 */
+		if (sched_rt_bandwidth_account(rt_rq))
+			rt_rq->rt_time += delta_exec;
 		raw_spin_unlock(&rt_rq->rt_runtime_lock);
 	}
 }
@@ -940,6 +965,8 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
 	resched_task(rq->curr);
 }
 
+static int pull_dl_task(struct rq *this_rq);
+
 #endif /* CONFIG_SMP */
 
 /*
@@ -986,7 +1013,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
 	return rb_entry(left, struct sched_dl_entity, rb_node);
 }
 
-struct task_struct *pick_next_task_dl(struct rq *rq)
+struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
 {
 	struct sched_dl_entity *dl_se;
 	struct task_struct *p;
@@ -994,9 +1021,20 @@ struct task_struct *pick_next_task_dl(struct rq *rq)
 
 	dl_rq = &rq->dl;
 
+	if (need_pull_dl_task(rq, prev))
+		pull_dl_task(rq);
+	/*
+	 * When prev is DL, we may throttle it in put_prev_task().
+	 * So, we update time before we check for dl_nr_running.
+	 */
+	if (prev->sched_class == &dl_sched_class)
+		update_curr_dl(rq);
+
 	if (unlikely(!dl_rq->dl_nr_running))
 		return NULL;
 
+	put_prev_task(rq, prev);
+
 	dl_se = pick_next_dl_entity(rq, dl_rq);
 	BUG_ON(!dl_se);
 
@@ -1011,9 +1049,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq)
 	start_hrtick_dl(rq, p);
 #endif
 
-#ifdef CONFIG_SMP
-	rq->post_schedule = has_pushable_dl_tasks(rq);
-#endif /* CONFIG_SMP */
+	set_post_schedule(rq);
 
 	return p;
 }
@@ -1422,13 +1458,6 @@ skip:
 	return ret;
 }
 
-static void pre_schedule_dl(struct rq *rq, struct task_struct *prev)
-{
-	/* Try to pull other tasks here */
-	if (dl_task(prev))
-		pull_dl_task(rq);
-}
-
 static void post_schedule_dl(struct rq *rq)
 {
 	push_dl_tasks(rq);
@@ -1556,7 +1585,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
 	if (unlikely(p->dl.dl_throttled))
 		return;
 
-	if (p->on_rq || rq->curr != p) {
+	if (p->on_rq && rq->curr != p) {
 #ifdef CONFIG_SMP
 		if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
 			/* Only reschedule if pushing failed */
@@ -1621,7 +1650,6 @@ const struct sched_class dl_sched_class = {
 	.set_cpus_allowed = set_cpus_allowed_dl,
 	.rq_online = rq_online_dl,
 	.rq_offline = rq_offline_dl,
-	.pre_schedule = pre_schedule_dl,
 	.post_schedule = post_schedule_dl,
 	.task_woken = task_woken_dl,
 #endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index dd52e7ffb10e..695f9773bb60 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -111,8 +111,7 @@ static char *task_group_path(struct task_group *tg)
 	if (autogroup_path(tg, group_path, PATH_MAX))
 		return group_path;
 
-	cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
-	return group_path;
+	return cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
 }
 #endif
 
@@ -321,6 +320,7 @@ do { \
 	P(sched_goidle);
 #ifdef CONFIG_SMP
 	P64(avg_idle);
+	P64(max_idle_balance_cost);
 #endif
 
 	P(ttwu_count);
@@ -533,15 +533,15 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
 		unsigned long nr_faults = -1;
 		int cpu_current, home_node;
 
-		if (p->numa_faults)
-			nr_faults = p->numa_faults[2*node + i];
+		if (p->numa_faults_memory)
+			nr_faults = p->numa_faults_memory[2*node + i];
 
 		cpu_current = !i ? (task_node(p) == node) :
 				(pol && node_isset(node, pol->v.nodes));
 
 		home_node = (p->numa_preferred_nid == node);
 
-		SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n",
+		SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n",
 			i, node, cpu_current, home_node, nr_faults);
 	}
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 78157099b167..7e9bd0b1fa9e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -322,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
-static inline int
+static inline struct cfs_rq *
 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 {
 	if (se->cfs_rq == pse->cfs_rq)
-		return 1;
+		return se->cfs_rq;
 
-	return 0;
+	return NULL;
 }
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -336,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 	return se->parent;
 }
 
-/* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
-	int depth = 0;
-
-	for_each_sched_entity(se)
-		depth++;
-
-	return depth;
-}
-
 static void
 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 {
@@ -360,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 	 */
 
 	/* First walk up until both entities are at same depth */
-	se_depth = depth_se(*se);
-	pse_depth = depth_se(*pse);
+	se_depth = (*se)->depth;
+	pse_depth = (*pse)->depth;
 
 	while (se_depth > pse_depth) {
 		se_depth--;
@@ -426,12 +415,6 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
-static inline int
-is_same_group(struct sched_entity *se, struct sched_entity *pse)
-{
-	return 1;
-}
-
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 {
 	return NULL;
@@ -819,14 +802,6 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
 unsigned int sysctl_numa_balancing_scan_delay = 1000;
 
-/*
- * After skipping a page migration on a shared page, skip N more numa page
- * migrations unconditionally. This reduces the number of NUMA migrations
- * in shared memory workloads, and has the effect of pulling tasks towards
- * where their memory lives, over pulling the memory towards the task.
- */
-unsigned int sysctl_numa_balancing_migrate_deferred = 16;
-
 static unsigned int task_nr_scan_windows(struct task_struct *p)
 {
 	unsigned long rss = 0;
@@ -893,10 +868,26 @@ struct numa_group {
 	struct list_head task_list;
 
 	struct rcu_head rcu;
+	nodemask_t active_nodes;
 	unsigned long total_faults;
+	/*
+	 * Faults_cpu is used to decide whether memory should move
+	 * towards the CPU. As a consequence, these stats are weighted
+	 * more by CPU use than by memory faults.
+	 */
+	unsigned long *faults_cpu;
 	unsigned long faults[0];
 };
 
+/* Shared or private faults. */
+#define NR_NUMA_HINT_FAULT_TYPES 2
+
+/* Memory and CPU locality */
+#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
+
+/* Averaged statistics, and temporary buffers. */
+#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
+
 pid_t task_numa_group_id(struct task_struct *p)
 {
 	return p->numa_group ? p->numa_group->gid : 0;
| @@ -904,16 +895,16 @@ pid_t task_numa_group_id(struct task_struct *p) | |||
| 904 | 895 | ||
| 905 | static inline int task_faults_idx(int nid, int priv) | 896 | static inline int task_faults_idx(int nid, int priv) |
| 906 | { | 897 | { |
| 907 | return 2 * nid + priv; | 898 | return NR_NUMA_HINT_FAULT_TYPES * nid + priv; |
| 908 | } | 899 | } |
| 909 | 900 | ||
| 910 | static inline unsigned long task_faults(struct task_struct *p, int nid) | 901 | static inline unsigned long task_faults(struct task_struct *p, int nid) |
| 911 | { | 902 | { |
| 912 | if (!p->numa_faults) | 903 | if (!p->numa_faults_memory) |
| 913 | return 0; | 904 | return 0; |
| 914 | 905 | ||
| 915 | return p->numa_faults[task_faults_idx(nid, 0)] + | 906 | return p->numa_faults_memory[task_faults_idx(nid, 0)] + |
| 916 | p->numa_faults[task_faults_idx(nid, 1)]; | 907 | p->numa_faults_memory[task_faults_idx(nid, 1)]; |
| 917 | } | 908 | } |
| 918 | 909 | ||
| 919 | static inline unsigned long group_faults(struct task_struct *p, int nid) | 910 | static inline unsigned long group_faults(struct task_struct *p, int nid) |
| @@ -925,6 +916,12 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) | |||
| 925 | p->numa_group->faults[task_faults_idx(nid, 1)]; | 916 | p->numa_group->faults[task_faults_idx(nid, 1)]; |
| 926 | } | 917 | } |
| 927 | 918 | ||
| 919 | static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) | ||
| 920 | { | ||
| 921 | return group->faults_cpu[task_faults_idx(nid, 0)] + | ||
| 922 | group->faults_cpu[task_faults_idx(nid, 1)]; | ||
| 923 | } | ||
| 924 | |||
| 928 | /* | 925 | /* |
| 929 | * These return the fraction of accesses done by a particular task, or | 926 | * These return the fraction of accesses done by a particular task, or |
| 930 | * task group, on a particular numa node. The group weight is given a | 927 | * task group, on a particular numa node. The group weight is given a |
| @@ -935,7 +932,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid) | |||
| 935 | { | 932 | { |
| 936 | unsigned long total_faults; | 933 | unsigned long total_faults; |
| 937 | 934 | ||
| 938 | if (!p->numa_faults) | 935 | if (!p->numa_faults_memory) |
| 939 | return 0; | 936 | return 0; |
| 940 | 937 | ||
| 941 | total_faults = p->total_numa_faults; | 938 | total_faults = p->total_numa_faults; |
| @@ -954,6 +951,69 @@ static inline unsigned long group_weight(struct task_struct *p, int nid) | |||
| 954 | return 1000 * group_faults(p, nid) / p->numa_group->total_faults; | 951 | return 1000 * group_faults(p, nid) / p->numa_group->total_faults; |
| 955 | } | 952 | } |
| 956 | 953 | ||
| 954 | bool should_numa_migrate_memory(struct task_struct *p, struct page * page, | ||
| 955 | int src_nid, int dst_cpu) | ||
| 956 | { | ||
| 957 | struct numa_group *ng = p->numa_group; | ||
| 958 | int dst_nid = cpu_to_node(dst_cpu); | ||
| 959 | int last_cpupid, this_cpupid; | ||
| 960 | |||
| 961 | this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); | ||
| 962 | |||
| 963 | /* | ||
| 964 | * Multi-stage node selection is used in conjunction with a periodic | ||
| 965 | * migration fault to build a temporal task<->page relation. By using | ||
| 966 | * a two-stage filter we remove short/unlikely relations. | ||
| 967 | * | ||
| 968 | * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate | ||
| 969 | * a task's usage of a particular page (n_p) per total usage of this | ||
| 970 | * page (n_t) (in a given time-span) to a probability. | ||
| 971 | * | ||
| 972 | * Our periodic faults will sample this probability and getting the | ||
| 973 | * same result twice in a row, given these samples are fully | ||
| 974 | * independent, is then given by P(n)^2, provided our sample period | ||
| 975 | * is sufficiently short compared to the usage pattern. | ||
| 976 | * | ||
| 977 | * This quadratic squishes small probabilities, making it less likely we | ||
| 978 | * act on an unlikely task<->page relation. | ||
| 979 | */ | ||
| 980 | last_cpupid = page_cpupid_xchg_last(page, this_cpupid); | ||
| 981 | if (!cpupid_pid_unset(last_cpupid) && | ||
| 982 | cpupid_to_nid(last_cpupid) != dst_nid) | ||
| 983 | return false; | ||
| 984 | |||
| 985 | /* Always allow migrate on private faults */ | ||
| 986 | if (cpupid_match_pid(p, last_cpupid)) | ||
| 987 | return true; | ||
| 988 | |||
| 989 | /* A shared fault, but p->numa_group has not been set up yet. */ | ||
| 990 | if (!ng) | ||
| 991 | return true; | ||
| 992 | |||
| 993 | /* | ||
| 994 | * Do not migrate if the destination is not a node that | ||
| 995 | * is actively used by this numa group. | ||
| 996 | */ | ||
| 997 | if (!node_isset(dst_nid, ng->active_nodes)) | ||
| 998 | return false; | ||
| 999 | |||
| 1000 | /* | ||
| 1001 | * Source is a node that is not actively used by this | ||
| 1002 | * numa group, while the destination is. Migrate. | ||
| 1003 | */ | ||
| 1004 | if (!node_isset(src_nid, ng->active_nodes)) | ||
| 1005 | return true; | ||
| 1006 | |||
| 1007 | /* | ||
| 1008 | * Both source and destination are nodes in active | ||
| 1009 | * use by this numa group. Maximize memory bandwidth | ||
| 1010 | * by migrating from more heavily used groups, to less | ||
| 1011 | * heavily used ones, spreading the load around. | ||
| 1012 | * Use a 1/4 hysteresis to avoid spurious page movement. | ||
| 1013 | */ | ||
| 1014 | return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4); | ||
| 1015 | } | ||
| 1016 | |||
| 957 | static unsigned long weighted_cpuload(const int cpu); | 1017 | static unsigned long weighted_cpuload(const int cpu); |
| 958 | static unsigned long source_load(int cpu, int type); | 1018 | static unsigned long source_load(int cpu, int type); |
| 959 | static unsigned long target_load(int cpu, int type); | 1019 | static unsigned long target_load(int cpu, int type); |
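The decision chain in should_numa_migrate_memory() above is easier to see outside the kernel structures. The userspace sketch below is a hedged re-statement of the same ordering (two-stage cpupid filter, private-fault fast path, active-node checks, 3/4 hysteresis); the toy_group type, the plain bitmask and the boolean inputs are invented for the example and do not exist in the kernel.

    #include <stdbool.h>

    /* Invented, flattened stand-in for the per-group NUMA state. */
    struct toy_group {
        unsigned long faults[64];   /* per-node fault counts               */
        unsigned long active_mask;  /* bit n set => node n actively used   */
    };

    static bool toy_should_migrate(const struct toy_group *ng,
                                   bool second_fault_same_node,
                                   bool private_fault,
                                   int src_nid, int dst_nid)
    {
        /* Two-stage filter: only act once the task<->page relation repeats. */
        if (!second_fault_same_node)
            return false;

        /* Private faults always follow the task. */
        if (private_fault)
            return true;

        /* No group set up yet: let the page follow the task. */
        if (!ng)
            return true;

        /* Never migrate onto a node the group does not actively run on. */
        if (!(ng->active_mask & (1UL << dst_nid)))
            return false;

        /* Migrating away from a non-active node is always fine. */
        if (!(ng->active_mask & (1UL << src_nid)))
            return true;

        /* Both nodes active: spread load, with 1/4 hysteresis. */
        return ng->faults[dst_nid] < ng->faults[src_nid] * 3 / 4;
    }

In the kernel the final comparison uses group_faults(), i.e. the memory-side statistics, and the 25% margin is what keeps pages from ping-ponging between two equally busy nodes.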
| @@ -1267,7 +1327,7 @@ static int task_numa_migrate(struct task_struct *p) | |||
| 1267 | static void numa_migrate_preferred(struct task_struct *p) | 1327 | static void numa_migrate_preferred(struct task_struct *p) |
| 1268 | { | 1328 | { |
| 1269 | /* This task has no NUMA fault statistics yet */ | 1329 | /* This task has no NUMA fault statistics yet */ |
| 1270 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) | 1330 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) |
| 1271 | return; | 1331 | return; |
| 1272 | 1332 | ||
| 1273 | /* Periodically retry migrating the task to the preferred node */ | 1333 | /* Periodically retry migrating the task to the preferred node */ |
| @@ -1282,6 +1342,38 @@ static void numa_migrate_preferred(struct task_struct *p) | |||
| 1282 | } | 1342 | } |
| 1283 | 1343 | ||
| 1284 | /* | 1344 | /* |
| 1345 | * Find the nodes on which the workload is actively running. We do this by | ||
| 1346 | * tracking the nodes from which NUMA hinting faults are triggered. This can | ||
| 1347 | * be different from the set of nodes where the workload's memory is currently | ||
| 1348 | * located. | ||
| 1349 | * | ||
| 1350 | * The bitmask is used to make smarter decisions on when to do NUMA page | ||
| 1351 | * migrations. To prevent flip-flopping and excessive page migrations, nodes | ||
| 1352 | * are added when they cause over 6/16 of the maximum number of faults, but | ||
| 1353 | * only removed when they drop below 3/16. | ||
| 1354 | */ | ||
| 1355 | static void update_numa_active_node_mask(struct numa_group *numa_group) | ||
| 1356 | { | ||
| 1357 | unsigned long faults, max_faults = 0; | ||
| 1358 | int nid; | ||
| 1359 | |||
| 1360 | for_each_online_node(nid) { | ||
| 1361 | faults = group_faults_cpu(numa_group, nid); | ||
| 1362 | if (faults > max_faults) | ||
| 1363 | max_faults = faults; | ||
| 1364 | } | ||
| 1365 | |||
| 1366 | for_each_online_node(nid) { | ||
| 1367 | faults = group_faults_cpu(numa_group, nid); | ||
| 1368 | if (!node_isset(nid, numa_group->active_nodes)) { | ||
| 1369 | if (faults > max_faults * 6 / 16) | ||
| 1370 | node_set(nid, numa_group->active_nodes); | ||
| 1371 | } else if (faults < max_faults * 3 / 16) | ||
| 1372 | node_clear(nid, numa_group->active_nodes); | ||
| 1373 | } | ||
| 1374 | } | ||
| 1375 | |||
| 1376 | /* | ||
| 1285 | * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS | 1377 | * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS |
| 1286 | * increments. The more local the fault statistics are, the higher the scan | 1378 | * increments. The more local the fault statistics are, the higher the scan |
| 1287 | * period will be for the next scan window. If local/remote ratio is below | 1379 | * period will be for the next scan window. If local/remote ratio is below |
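Stripped of the nodemask and for_each_online_node() machinery, the 6/16-add / 3/16-remove hysteresis above reduces to the sketch below; the fixed node count and the plain bitmask are assumptions made for the example.

    #define TOY_NR_NODES 4

    /* faults[] holds per-node CPU-side fault counts; *active is a node bitmask. */
    static void toy_update_active_nodes(const unsigned long faults[TOY_NR_NODES],
                                        unsigned long *active)
    {
        unsigned long max_faults = 0;
        int nid;

        for (nid = 0; nid < TOY_NR_NODES; nid++)
            if (faults[nid] > max_faults)
                max_faults = faults[nid];

        for (nid = 0; nid < TOY_NR_NODES; nid++) {
            if (!(*active & (1UL << nid))) {
                if (faults[nid] > max_faults * 6 / 16)
                    *active |= 1UL << nid;    /* joins above 6/16  */
            } else if (faults[nid] < max_faults * 3 / 16) {
                *active &= ~(1UL << nid);     /* leaves below 3/16 */
            }
        }
    }

The two different thresholds are the point: a node has to be clearly busy to join the set and clearly quiet to leave it, so a node hovering near one threshold does not flap in and out.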
| @@ -1355,11 +1447,41 @@ static void update_task_scan_period(struct task_struct *p, | |||
| 1355 | memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); | 1447 | memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); |
| 1356 | } | 1448 | } |
| 1357 | 1449 | ||
| 1450 | /* | ||
| 1451 | * Get the fraction of time the task has been running since the last | ||
| 1452 | * NUMA placement cycle. The scheduler keeps similar statistics, but | ||
| 1453 | * decays those on a 32ms period, which is orders of magnitude off | ||
| 1454 | * from the dozens-of-seconds NUMA balancing period. Use the scheduler | ||
| 1455 | * stats only if the task is so new there are no NUMA statistics yet. | ||
| 1456 | */ | ||
| 1457 | static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) | ||
| 1458 | { | ||
| 1459 | u64 runtime, delta, now; | ||
| 1460 | /* Use the start of this time slice to avoid calculations. */ | ||
| 1461 | now = p->se.exec_start; | ||
| 1462 | runtime = p->se.sum_exec_runtime; | ||
| 1463 | |||
| 1464 | if (p->last_task_numa_placement) { | ||
| 1465 | delta = runtime - p->last_sum_exec_runtime; | ||
| 1466 | *period = now - p->last_task_numa_placement; | ||
| 1467 | } else { | ||
| 1468 | delta = p->se.avg.runnable_avg_sum; | ||
| 1469 | *period = p->se.avg.runnable_avg_period; | ||
| 1470 | } | ||
| 1471 | |||
| 1472 | p->last_sum_exec_runtime = runtime; | ||
| 1473 | p->last_task_numa_placement = now; | ||
| 1474 | |||
| 1475 | return delta; | ||
| 1476 | } | ||
| 1477 | |||
| 1358 | static void task_numa_placement(struct task_struct *p) | 1478 | static void task_numa_placement(struct task_struct *p) |
| 1359 | { | 1479 | { |
| 1360 | int seq, nid, max_nid = -1, max_group_nid = -1; | 1480 | int seq, nid, max_nid = -1, max_group_nid = -1; |
| 1361 | unsigned long max_faults = 0, max_group_faults = 0; | 1481 | unsigned long max_faults = 0, max_group_faults = 0; |
| 1362 | unsigned long fault_types[2] = { 0, 0 }; | 1482 | unsigned long fault_types[2] = { 0, 0 }; |
| 1483 | unsigned long total_faults; | ||
| 1484 | u64 runtime, period; | ||
| 1363 | spinlock_t *group_lock = NULL; | 1485 | spinlock_t *group_lock = NULL; |
| 1364 | 1486 | ||
| 1365 | seq = ACCESS_ONCE(p->mm->numa_scan_seq); | 1487 | seq = ACCESS_ONCE(p->mm->numa_scan_seq); |
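numa_get_avg_runtime() is a snapshot-and-diff helper: it returns the CPU time consumed since the previous placement pass and reports the wall-clock length of that window through *period. A toy version of the same pattern, with invented field names and a simplistic first-pass fallback (the kernel instead falls back to the 32ms-decayed runnable averages):

    #include <stdint.h>

    struct toy_task {
        uint64_t sum_exec_runtime;  /* total CPU time consumed so far   */
        uint64_t now;               /* current timestamp, nanoseconds   */
        uint64_t last_runtime;      /* snapshot taken at the last pass  */
        uint64_t last_placement;    /* timestamp of the last pass       */
    };

    /* Return CPU time used since the previous call; *period gets wall time. */
    static uint64_t toy_avg_runtime(struct toy_task *p, uint64_t *period)
    {
        uint64_t delta;

        if (p->last_placement) {
            delta = p->sum_exec_runtime - p->last_runtime;
            *period = p->now - p->last_placement;
        } else {
            /* First pass: no history yet, pretend the task was 100% busy. */
            delta = 1;
            *period = 1;
        }

        p->last_runtime = p->sum_exec_runtime;
        p->last_placement = p->now;
        return delta;
    }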
| @@ -1368,6 +1490,10 @@ static void task_numa_placement(struct task_struct *p) | |||
| 1368 | p->numa_scan_seq = seq; | 1490 | p->numa_scan_seq = seq; |
| 1369 | p->numa_scan_period_max = task_scan_max(p); | 1491 | p->numa_scan_period_max = task_scan_max(p); |
| 1370 | 1492 | ||
| 1493 | total_faults = p->numa_faults_locality[0] + | ||
| 1494 | p->numa_faults_locality[1]; | ||
| 1495 | runtime = numa_get_avg_runtime(p, &period); | ||
| 1496 | |||
| 1371 | /* If the task is part of a group prevent parallel updates to group stats */ | 1497 | /* If the task is part of a group prevent parallel updates to group stats */ |
| 1372 | if (p->numa_group) { | 1498 | if (p->numa_group) { |
| 1373 | group_lock = &p->numa_group->lock; | 1499 | group_lock = &p->numa_group->lock; |
| @@ -1379,24 +1505,37 @@ static void task_numa_placement(struct task_struct *p) | |||
| 1379 | unsigned long faults = 0, group_faults = 0; | 1505 | unsigned long faults = 0, group_faults = 0; |
| 1380 | int priv, i; | 1506 | int priv, i; |
| 1381 | 1507 | ||
| 1382 | for (priv = 0; priv < 2; priv++) { | 1508 | for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { |
| 1383 | long diff; | 1509 | long diff, f_diff, f_weight; |
| 1384 | 1510 | ||
| 1385 | i = task_faults_idx(nid, priv); | 1511 | i = task_faults_idx(nid, priv); |
| 1386 | diff = -p->numa_faults[i]; | ||
| 1387 | 1512 | ||
| 1388 | /* Decay existing window, copy faults since last scan */ | 1513 | /* Decay existing window, copy faults since last scan */ |
| 1389 | p->numa_faults[i] >>= 1; | 1514 | diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2; |
| 1390 | p->numa_faults[i] += p->numa_faults_buffer[i]; | 1515 | fault_types[priv] += p->numa_faults_buffer_memory[i]; |
| 1391 | fault_types[priv] += p->numa_faults_buffer[i]; | 1516 | p->numa_faults_buffer_memory[i] = 0; |
| 1392 | p->numa_faults_buffer[i] = 0; | ||
| 1393 | 1517 | ||
| 1394 | faults += p->numa_faults[i]; | 1518 | /* |
| 1395 | diff += p->numa_faults[i]; | 1519 | * Normalize the faults_from, so all tasks in a group |
| 1520 | * count according to CPU use, instead of by the raw | ||
| 1521 | * number of faults. Tasks with little runtime have | ||
| 1522 | * little over-all impact on throughput, and thus their | ||
| 1523 | * faults are less important. | ||
| 1524 | */ | ||
| 1525 | f_weight = div64_u64(runtime << 16, period + 1); | ||
| 1526 | f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / | ||
| 1527 | (total_faults + 1); | ||
| 1528 | f_diff = f_weight - p->numa_faults_cpu[i] / 2; | ||
| 1529 | p->numa_faults_buffer_cpu[i] = 0; | ||
| 1530 | |||
| 1531 | p->numa_faults_memory[i] += diff; | ||
| 1532 | p->numa_faults_cpu[i] += f_diff; | ||
| 1533 | faults += p->numa_faults_memory[i]; | ||
| 1396 | p->total_numa_faults += diff; | 1534 | p->total_numa_faults += diff; |
| 1397 | if (p->numa_group) { | 1535 | if (p->numa_group) { |
| 1398 | /* safe because we can only change our own group */ | 1536 | /* safe because we can only change our own group */ |
| 1399 | p->numa_group->faults[i] += diff; | 1537 | p->numa_group->faults[i] += diff; |
| 1538 | p->numa_group->faults_cpu[i] += f_diff; | ||
| 1400 | p->numa_group->total_faults += diff; | 1539 | p->numa_group->total_faults += diff; |
| 1401 | group_faults += p->numa_group->faults[i]; | 1540 | group_faults += p->numa_group->faults[i]; |
| 1402 | } | 1541 | } |
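The `runtime << 16` in the hunk above is a 16.16 fixed-point trick: the CPU-use ratio runtime/period is scaled by 2^16 so it survives integer division before being multiplied with the buffered CPU-fault count. A tiny standalone program with freely chosen toy numbers (not measurements) shows the arithmetic:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* Toy inputs: 50ms of CPU in a 200ms window; 40 of the task's
         * 100 buffered CPU faults hit this node. */
        uint64_t runtime = 50, period = 200;
        uint64_t node_cpu_faults = 40, total_faults = 100;

        /* ~0.25 in 16.16 fixed point (16302, vs. 16384 for exactly 1/4). */
        uint64_t f_weight = (runtime << 16) / (period + 1);

        /* Faults weighted by how busy the task actually was. */
        uint64_t weighted = f_weight * node_cpu_faults / (total_faults + 1);

        printf("f_weight=%llu weighted=%llu\n",
               (unsigned long long)f_weight, (unsigned long long)weighted);
        return 0;
    }

The +1 on both divisors simply avoids a division by zero for brand-new tasks with no history yet.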
| @@ -1416,6 +1555,7 @@ static void task_numa_placement(struct task_struct *p) | |||
| 1416 | update_task_scan_period(p, fault_types[0], fault_types[1]); | 1555 | update_task_scan_period(p, fault_types[0], fault_types[1]); |
| 1417 | 1556 | ||
| 1418 | if (p->numa_group) { | 1557 | if (p->numa_group) { |
| 1558 | update_numa_active_node_mask(p->numa_group); | ||
| 1419 | /* | 1559 | /* |
| 1420 | * If the preferred task and group nids are different, | 1560 | * If the preferred task and group nids are different, |
| 1421 | * iterate over the nodes again to find the best place. | 1561 | * iterate over the nodes again to find the best place. |
| @@ -1465,7 +1605,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, | |||
| 1465 | 1605 | ||
| 1466 | if (unlikely(!p->numa_group)) { | 1606 | if (unlikely(!p->numa_group)) { |
| 1467 | unsigned int size = sizeof(struct numa_group) + | 1607 | unsigned int size = sizeof(struct numa_group) + |
| 1468 | 2*nr_node_ids*sizeof(unsigned long); | 1608 | 4*nr_node_ids*sizeof(unsigned long); |
| 1469 | 1609 | ||
| 1470 | grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); | 1610 | grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); |
| 1471 | if (!grp) | 1611 | if (!grp) |
| @@ -1475,9 +1615,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, | |||
| 1475 | spin_lock_init(&grp->lock); | 1615 | spin_lock_init(&grp->lock); |
| 1476 | INIT_LIST_HEAD(&grp->task_list); | 1616 | INIT_LIST_HEAD(&grp->task_list); |
| 1477 | grp->gid = p->pid; | 1617 | grp->gid = p->pid; |
| 1618 | /* Second half of the array tracks nids where faults happen */ | ||
| 1619 | grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * | ||
| 1620 | nr_node_ids; | ||
| 1621 | |||
| 1622 | node_set(task_node(current), grp->active_nodes); | ||
| 1478 | 1623 | ||
| 1479 | for (i = 0; i < 2*nr_node_ids; i++) | 1624 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) |
| 1480 | grp->faults[i] = p->numa_faults[i]; | 1625 | grp->faults[i] = p->numa_faults_memory[i]; |
| 1481 | 1626 | ||
| 1482 | grp->total_faults = p->total_numa_faults; | 1627 | grp->total_faults = p->total_numa_faults; |
| 1483 | 1628 | ||
| @@ -1534,9 +1679,9 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, | |||
| 1534 | 1679 | ||
| 1535 | double_lock(&my_grp->lock, &grp->lock); | 1680 | double_lock(&my_grp->lock, &grp->lock); |
| 1536 | 1681 | ||
| 1537 | for (i = 0; i < 2*nr_node_ids; i++) { | 1682 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { |
| 1538 | my_grp->faults[i] -= p->numa_faults[i]; | 1683 | my_grp->faults[i] -= p->numa_faults_memory[i]; |
| 1539 | grp->faults[i] += p->numa_faults[i]; | 1684 | grp->faults[i] += p->numa_faults_memory[i]; |
| 1540 | } | 1685 | } |
| 1541 | my_grp->total_faults -= p->total_numa_faults; | 1686 | my_grp->total_faults -= p->total_numa_faults; |
| 1542 | grp->total_faults += p->total_numa_faults; | 1687 | grp->total_faults += p->total_numa_faults; |
| @@ -1562,12 +1707,12 @@ void task_numa_free(struct task_struct *p) | |||
| 1562 | { | 1707 | { |
| 1563 | struct numa_group *grp = p->numa_group; | 1708 | struct numa_group *grp = p->numa_group; |
| 1564 | int i; | 1709 | int i; |
| 1565 | void *numa_faults = p->numa_faults; | 1710 | void *numa_faults = p->numa_faults_memory; |
| 1566 | 1711 | ||
| 1567 | if (grp) { | 1712 | if (grp) { |
| 1568 | spin_lock(&grp->lock); | 1713 | spin_lock(&grp->lock); |
| 1569 | for (i = 0; i < 2*nr_node_ids; i++) | 1714 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) |
| 1570 | grp->faults[i] -= p->numa_faults[i]; | 1715 | grp->faults[i] -= p->numa_faults_memory[i]; |
| 1571 | grp->total_faults -= p->total_numa_faults; | 1716 | grp->total_faults -= p->total_numa_faults; |
| 1572 | 1717 | ||
| 1573 | list_del(&p->numa_entry); | 1718 | list_del(&p->numa_entry); |
| @@ -1577,18 +1722,21 @@ void task_numa_free(struct task_struct *p) | |||
| 1577 | put_numa_group(grp); | 1722 | put_numa_group(grp); |
| 1578 | } | 1723 | } |
| 1579 | 1724 | ||
| 1580 | p->numa_faults = NULL; | 1725 | p->numa_faults_memory = NULL; |
| 1581 | p->numa_faults_buffer = NULL; | 1726 | p->numa_faults_buffer_memory = NULL; |
| 1727 | p->numa_faults_cpu = NULL; | ||
| 1728 | p->numa_faults_buffer_cpu = NULL; | ||
| 1582 | kfree(numa_faults); | 1729 | kfree(numa_faults); |
| 1583 | } | 1730 | } |
| 1584 | 1731 | ||
| 1585 | /* | 1732 | /* |
| 1586 | * Got a PROT_NONE fault for a page on @node. | 1733 | * Got a PROT_NONE fault for a page on @node. |
| 1587 | */ | 1734 | */ |
| 1588 | void task_numa_fault(int last_cpupid, int node, int pages, int flags) | 1735 | void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) |
| 1589 | { | 1736 | { |
| 1590 | struct task_struct *p = current; | 1737 | struct task_struct *p = current; |
| 1591 | bool migrated = flags & TNF_MIGRATED; | 1738 | bool migrated = flags & TNF_MIGRATED; |
| 1739 | int cpu_node = task_node(current); | ||
| 1592 | int priv; | 1740 | int priv; |
| 1593 | 1741 | ||
| 1594 | if (!numabalancing_enabled) | 1742 | if (!numabalancing_enabled) |
| @@ -1603,16 +1751,24 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) | |||
| 1603 | return; | 1751 | return; |
| 1604 | 1752 | ||
| 1605 | /* Allocate buffer to track faults on a per-node basis */ | 1753 | /* Allocate buffer to track faults on a per-node basis */ |
| 1606 | if (unlikely(!p->numa_faults)) { | 1754 | if (unlikely(!p->numa_faults_memory)) { |
| 1607 | int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; | 1755 | int size = sizeof(*p->numa_faults_memory) * |
| 1756 | NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; | ||
| 1608 | 1757 | ||
| 1609 | /* numa_faults and numa_faults_buffer share the allocation */ | 1758 | p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); |
| 1610 | p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); | 1759 | if (!p->numa_faults_memory) |
| 1611 | if (!p->numa_faults) | ||
| 1612 | return; | 1760 | return; |
| 1613 | 1761 | ||
| 1614 | BUG_ON(p->numa_faults_buffer); | 1762 | BUG_ON(p->numa_faults_buffer_memory); |
| 1615 | p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); | 1763 | /* |
| 1764 | * The averaged statistics, shared & private, memory & cpu, | ||
| 1765 | * occupy the first half of the array. The second half of the | ||
| 1766 | * array is for current counters, which are averaged into the | ||
| 1767 | * first set by task_numa_placement. | ||
| 1768 | */ | ||
| 1769 | p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids); | ||
| 1770 | p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids); | ||
| 1771 | p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids); | ||
| 1616 | p->total_numa_faults = 0; | 1772 | p->total_numa_faults = 0; |
| 1617 | memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); | 1773 | memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); |
| 1618 | } | 1774 | } |
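The allocation above is one block of NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids counters that is carved into four equal views: averaged memory faults, averaged CPU faults, and the two per-scan buffers. A hedged userspace sketch of that carving, with invented names:

    #include <stdlib.h>

    struct toy_faults {
        unsigned long *memory;      /* averaged, 2 counters (priv/shared) per node */
        unsigned long *cpu;         /* averaged CPU-side counters                  */
        unsigned long *buf_memory;  /* raw counters since the last placement pass  */
        unsigned long *buf_cpu;
    };

    static int toy_alloc_faults(struct toy_faults *f, int nr_node_ids)
    {
        /* 4 views x 2 counters per node, one zeroed allocation. */
        unsigned long *base = calloc(8 * (size_t)nr_node_ids, sizeof(*base));

        if (!base)
            return -1;

        f->memory     = base;
        f->cpu        = base + 2 * nr_node_ids;
        f->buf_memory = base + 4 * nr_node_ids;
        f->buf_cpu    = base + 6 * nr_node_ids;
        return 0;
    }

Keeping all four views in a single allocation is what lets task_numa_free() release everything with the single kfree() shown earlier in this diff.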
| @@ -1641,7 +1797,8 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) | |||
| 1641 | if (migrated) | 1797 | if (migrated) |
| 1642 | p->numa_pages_migrated += pages; | 1798 | p->numa_pages_migrated += pages; |
| 1643 | 1799 | ||
| 1644 | p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; | 1800 | p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; |
| 1801 | p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; | ||
| 1645 | p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; | 1802 | p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; |
| 1646 | } | 1803 | } |
| 1647 | 1804 | ||
| @@ -2219,13 +2376,20 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) | |||
| 2219 | se->avg.load_avg_contrib >>= NICE_0_SHIFT; | 2376 | se->avg.load_avg_contrib >>= NICE_0_SHIFT; |
| 2220 | } | 2377 | } |
| 2221 | } | 2378 | } |
| 2222 | #else | 2379 | |
| 2380 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) | ||
| 2381 | { | ||
| 2382 | __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); | ||
| 2383 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); | ||
| 2384 | } | ||
| 2385 | #else /* CONFIG_FAIR_GROUP_SCHED */ | ||
| 2223 | static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, | 2386 | static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, |
| 2224 | int force_update) {} | 2387 | int force_update) {} |
| 2225 | static inline void __update_tg_runnable_avg(struct sched_avg *sa, | 2388 | static inline void __update_tg_runnable_avg(struct sched_avg *sa, |
| 2226 | struct cfs_rq *cfs_rq) {} | 2389 | struct cfs_rq *cfs_rq) {} |
| 2227 | static inline void __update_group_entity_contrib(struct sched_entity *se) {} | 2390 | static inline void __update_group_entity_contrib(struct sched_entity *se) {} |
| 2228 | #endif | 2391 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} |
| 2392 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
| 2229 | 2393 | ||
| 2230 | static inline void __update_task_entity_contrib(struct sched_entity *se) | 2394 | static inline void __update_task_entity_contrib(struct sched_entity *se) |
| 2231 | { | 2395 | { |
| @@ -2323,12 +2487,6 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) | |||
| 2323 | __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); | 2487 | __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); |
| 2324 | } | 2488 | } |
| 2325 | 2489 | ||
| 2326 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) | ||
| 2327 | { | ||
| 2328 | __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); | ||
| 2329 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); | ||
| 2330 | } | ||
| 2331 | |||
| 2332 | /* Add the load generated by se into cfs_rq's child load-average */ | 2490 | /* Add the load generated by se into cfs_rq's child load-average */ |
| 2333 | static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | 2491 | static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, |
| 2334 | struct sched_entity *se, | 2492 | struct sched_entity *se, |
| @@ -2416,7 +2574,10 @@ void idle_exit_fair(struct rq *this_rq) | |||
| 2416 | update_rq_runnable_avg(this_rq, 0); | 2574 | update_rq_runnable_avg(this_rq, 0); |
| 2417 | } | 2575 | } |
| 2418 | 2576 | ||
| 2419 | #else | 2577 | static int idle_balance(struct rq *this_rq); |
| 2578 | |||
| 2579 | #else /* CONFIG_SMP */ | ||
| 2580 | |||
| 2420 | static inline void update_entity_load_avg(struct sched_entity *se, | 2581 | static inline void update_entity_load_avg(struct sched_entity *se, |
| 2421 | int update_cfs_rq) {} | 2582 | int update_cfs_rq) {} |
| 2422 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} | 2583 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} |
| @@ -2428,7 +2589,13 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
| 2428 | int sleep) {} | 2589 | int sleep) {} |
| 2429 | static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, | 2590 | static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, |
| 2430 | int force_update) {} | 2591 | int force_update) {} |
| 2431 | #endif | 2592 | |
| 2593 | static inline int idle_balance(struct rq *rq) | ||
| 2594 | { | ||
| 2595 | return 0; | ||
| 2596 | } | ||
| 2597 | |||
| 2598 | #endif /* CONFIG_SMP */ | ||
| 2432 | 2599 | ||
| 2433 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 2600 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 2434 | { | 2601 | { |
| @@ -2578,10 +2745,10 @@ static void __clear_buddies_last(struct sched_entity *se) | |||
| 2578 | { | 2745 | { |
| 2579 | for_each_sched_entity(se) { | 2746 | for_each_sched_entity(se) { |
| 2580 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 2747 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
| 2581 | if (cfs_rq->last == se) | 2748 | if (cfs_rq->last != se) |
| 2582 | cfs_rq->last = NULL; | ||
| 2583 | else | ||
| 2584 | break; | 2749 | break; |
| 2750 | |||
| 2751 | cfs_rq->last = NULL; | ||
| 2585 | } | 2752 | } |
| 2586 | } | 2753 | } |
| 2587 | 2754 | ||
| @@ -2589,10 +2756,10 @@ static void __clear_buddies_next(struct sched_entity *se) | |||
| 2589 | { | 2756 | { |
| 2590 | for_each_sched_entity(se) { | 2757 | for_each_sched_entity(se) { |
| 2591 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 2758 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
| 2592 | if (cfs_rq->next == se) | 2759 | if (cfs_rq->next != se) |
| 2593 | cfs_rq->next = NULL; | ||
| 2594 | else | ||
| 2595 | break; | 2760 | break; |
| 2761 | |||
| 2762 | cfs_rq->next = NULL; | ||
| 2596 | } | 2763 | } |
| 2597 | } | 2764 | } |
| 2598 | 2765 | ||
| @@ -2600,10 +2767,10 @@ static void __clear_buddies_skip(struct sched_entity *se) | |||
| 2600 | { | 2767 | { |
| 2601 | for_each_sched_entity(se) { | 2768 | for_each_sched_entity(se) { |
| 2602 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 2769 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
| 2603 | if (cfs_rq->skip == se) | 2770 | if (cfs_rq->skip != se) |
| 2604 | cfs_rq->skip = NULL; | ||
| 2605 | else | ||
| 2606 | break; | 2771 | break; |
| 2772 | |||
| 2773 | cfs_rq->skip = NULL; | ||
| 2607 | } | 2774 | } |
| 2608 | } | 2775 | } |
| 2609 | 2776 | ||
| @@ -2746,17 +2913,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); | |||
| 2746 | * 3) pick the "last" process, for cache locality | 2913 | * 3) pick the "last" process, for cache locality |
| 2747 | * 4) do not run the "skip" process, if something else is available | 2914 | * 4) do not run the "skip" process, if something else is available |
| 2748 | */ | 2915 | */ |
| 2749 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | 2916 | static struct sched_entity * |
| 2917 | pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) | ||
| 2750 | { | 2918 | { |
| 2751 | struct sched_entity *se = __pick_first_entity(cfs_rq); | 2919 | struct sched_entity *left = __pick_first_entity(cfs_rq); |
| 2752 | struct sched_entity *left = se; | 2920 | struct sched_entity *se; |
| 2921 | |||
| 2922 | /* | ||
| 2923 | * If curr is set we have to see if its left of the leftmost entity | ||
| 2924 | * still in the tree, provided there was anything in the tree at all. | ||
| 2925 | */ | ||
| 2926 | if (!left || (curr && entity_before(curr, left))) | ||
| 2927 | left = curr; | ||
| 2928 | |||
| 2929 | se = left; /* ideally we run the leftmost entity */ | ||
| 2753 | 2930 | ||
| 2754 | /* | 2931 | /* |
| 2755 | * Avoid running the skip buddy, if running something else can | 2932 | * Avoid running the skip buddy, if running something else can |
| 2756 | * be done without getting too unfair. | 2933 | * be done without getting too unfair. |
| 2757 | */ | 2934 | */ |
| 2758 | if (cfs_rq->skip == se) { | 2935 | if (cfs_rq->skip == se) { |
| 2759 | struct sched_entity *second = __pick_next_entity(se); | 2936 | struct sched_entity *second; |
| 2937 | |||
| 2938 | if (se == curr) { | ||
| 2939 | second = __pick_first_entity(cfs_rq); | ||
| 2940 | } else { | ||
| 2941 | second = __pick_next_entity(se); | ||
| 2942 | if (!second || (curr && entity_before(curr, second))) | ||
| 2943 | second = curr; | ||
| 2944 | } | ||
| 2945 | |||
| 2760 | if (second && wakeup_preempt_entity(second, left) < 1) | 2946 | if (second && wakeup_preempt_entity(second, left) < 1) |
| 2761 | se = second; | 2947 | se = second; |
| 2762 | } | 2948 | } |
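The new `curr` argument turns the pick into a two-way comparison: take the leftmost queued entity unless the still-running curr has an earlier vruntime, or the tree is empty. A minimal sketch of just that comparison (toy type, NULL meaning "absent"):

    #include <stddef.h>

    struct toy_se {
        unsigned long long vruntime;
    };

    /* Choose between the rbtree's leftmost entity and the not-requeued curr. */
    static struct toy_se *toy_pick(struct toy_se *leftmost, struct toy_se *curr)
    {
        if (!leftmost)
            return curr;        /* tree empty: keep running curr        */
        if (curr && curr->vruntime < leftmost->vruntime)
            return curr;        /* curr is still the fairest candidate  */
        return leftmost;
    }

The skip, next and last buddy handling in the real function then operates on top of this candidate.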
| @@ -2778,7 +2964,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | |||
| 2778 | return se; | 2964 | return se; |
| 2779 | } | 2965 | } |
| 2780 | 2966 | ||
| 2781 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); | 2967 | static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); |
| 2782 | 2968 | ||
| 2783 | static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | 2969 | static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) |
| 2784 | { | 2970 | { |
| @@ -3433,22 +3619,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) | |||
| 3433 | } | 3619 | } |
| 3434 | 3620 | ||
| 3435 | /* conditionally throttle active cfs_rq's from put_prev_entity() */ | 3621 | /* conditionally throttle active cfs_rq's from put_prev_entity() */ |
| 3436 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 3622 | static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) |
| 3437 | { | 3623 | { |
| 3438 | if (!cfs_bandwidth_used()) | 3624 | if (!cfs_bandwidth_used()) |
| 3439 | return; | 3625 | return false; |
| 3440 | 3626 | ||
| 3441 | if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) | 3627 | if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) |
| 3442 | return; | 3628 | return false; |
| 3443 | 3629 | ||
| 3444 | /* | 3630 | /* |
| 3445 | * it's possible for a throttled entity to be forced into a running | 3631 | * it's possible for a throttled entity to be forced into a running |
| 3446 | * state (e.g. set_curr_task), in this case we're finished. | 3632 | * state (e.g. set_curr_task), in this case we're finished. |
| 3447 | */ | 3633 | */ |
| 3448 | if (cfs_rq_throttled(cfs_rq)) | 3634 | if (cfs_rq_throttled(cfs_rq)) |
| 3449 | return; | 3635 | return true; |
| 3450 | 3636 | ||
| 3451 | throttle_cfs_rq(cfs_rq); | 3637 | throttle_cfs_rq(cfs_rq); |
| 3638 | return true; | ||
| 3452 | } | 3639 | } |
| 3453 | 3640 | ||
| 3454 | static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) | 3641 | static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) |
| @@ -3558,7 +3745,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) | |||
| 3558 | } | 3745 | } |
| 3559 | 3746 | ||
| 3560 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} | 3747 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} |
| 3561 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | 3748 | static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } |
| 3562 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} | 3749 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} |
| 3563 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | 3750 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} |
| 3564 | 3751 | ||
| @@ -4213,13 +4400,14 @@ done: | |||
| 4213 | } | 4400 | } |
| 4214 | 4401 | ||
| 4215 | /* | 4402 | /* |
| 4216 | * sched_balance_self: balance the current task (running on cpu) in domains | 4403 | * select_task_rq_fair: Select target runqueue for the waking task in domains |
| 4217 | * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and | 4404 | * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, |
| 4218 | * SD_BALANCE_EXEC. | 4405 | * SD_BALANCE_FORK, or SD_BALANCE_EXEC. |
| 4219 | * | 4406 | * |
| 4220 | * Balance, ie. select the least loaded group. | 4407 | * Balances load by selecting the idlest cpu in the idlest group, or under |
| 4408 | * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set. | ||
| 4221 | * | 4409 | * |
| 4222 | * Returns the target CPU number, or the same CPU if no balancing is needed. | 4410 | * Returns the target cpu number. |
| 4223 | * | 4411 | * |
| 4224 | * preempt must be disabled. | 4412 | * preempt must be disabled. |
| 4225 | */ | 4413 | */ |
| @@ -4494,26 +4682,124 @@ preempt: | |||
| 4494 | set_last_buddy(se); | 4682 | set_last_buddy(se); |
| 4495 | } | 4683 | } |
| 4496 | 4684 | ||
| 4497 | static struct task_struct *pick_next_task_fair(struct rq *rq) | 4685 | static struct task_struct * |
| 4686 | pick_next_task_fair(struct rq *rq, struct task_struct *prev) | ||
| 4498 | { | 4687 | { |
| 4499 | struct task_struct *p; | ||
| 4500 | struct cfs_rq *cfs_rq = &rq->cfs; | 4688 | struct cfs_rq *cfs_rq = &rq->cfs; |
| 4501 | struct sched_entity *se; | 4689 | struct sched_entity *se; |
| 4690 | struct task_struct *p; | ||
| 4691 | int new_tasks; | ||
| 4502 | 4692 | ||
| 4693 | again: | ||
| 4694 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 4503 | if (!cfs_rq->nr_running) | 4695 | if (!cfs_rq->nr_running) |
| 4504 | return NULL; | 4696 | goto idle; |
| 4697 | |||
| 4698 | if (prev->sched_class != &fair_sched_class) | ||
| 4699 | goto simple; | ||
| 4700 | |||
| 4701 | /* | ||
| 4702 | * Because of the set_next_buddy() in dequeue_task_fair() it is rather | ||
| 4703 | * likely that a next task is from the same cgroup as the current. | ||
| 4704 | * | ||
| 4705 | * Therefore attempt to avoid putting and setting the entire cgroup | ||
| 4706 | * hierarchy, only change the part that actually changes. | ||
| 4707 | */ | ||
| 4708 | |||
| 4709 | do { | ||
| 4710 | struct sched_entity *curr = cfs_rq->curr; | ||
| 4711 | |||
| 4712 | /* | ||
| 4713 | * Since we got here without doing put_prev_entity() we also | ||
| 4714 | * have to consider cfs_rq->curr. If it is still a runnable | ||
| 4715 | * entity, update_curr() will update its vruntime, otherwise | ||
| 4716 | * forget we've ever seen it. | ||
| 4717 | */ | ||
| 4718 | if (curr && curr->on_rq) | ||
| 4719 | update_curr(cfs_rq); | ||
| 4720 | else | ||
| 4721 | curr = NULL; | ||
| 4722 | |||
| 4723 | /* | ||
| 4724 | * This call to check_cfs_rq_runtime() will do the throttle and | ||
| 4725 | * dequeue its entity in the parent(s). Therefore the 'simple' | ||
| 4726 | * nr_running test will indeed be correct. | ||
| 4727 | */ | ||
| 4728 | if (unlikely(check_cfs_rq_runtime(cfs_rq))) | ||
| 4729 | goto simple; | ||
| 4730 | |||
| 4731 | se = pick_next_entity(cfs_rq, curr); | ||
| 4732 | cfs_rq = group_cfs_rq(se); | ||
| 4733 | } while (cfs_rq); | ||
| 4734 | |||
| 4735 | p = task_of(se); | ||
| 4736 | |||
| 4737 | /* | ||
| 4738 | * Since we haven't yet done put_prev_entity and if the selected task | ||
| 4739 | * is a different task than we started out with, try to touch the | ||
| 4740 | * smallest possible number of cfs_rqs. | ||
| 4741 | */ | ||
| 4742 | if (prev != p) { | ||
| 4743 | struct sched_entity *pse = &prev->se; | ||
| 4744 | |||
| 4745 | while (!(cfs_rq = is_same_group(se, pse))) { | ||
| 4746 | int se_depth = se->depth; | ||
| 4747 | int pse_depth = pse->depth; | ||
| 4748 | |||
| 4749 | if (se_depth <= pse_depth) { | ||
| 4750 | put_prev_entity(cfs_rq_of(pse), pse); | ||
| 4751 | pse = parent_entity(pse); | ||
| 4752 | } | ||
| 4753 | if (se_depth >= pse_depth) { | ||
| 4754 | set_next_entity(cfs_rq_of(se), se); | ||
| 4755 | se = parent_entity(se); | ||
| 4756 | } | ||
| 4757 | } | ||
| 4758 | |||
| 4759 | put_prev_entity(cfs_rq, pse); | ||
| 4760 | set_next_entity(cfs_rq, se); | ||
| 4761 | } | ||
| 4762 | |||
| 4763 | if (hrtick_enabled(rq)) | ||
| 4764 | hrtick_start_fair(rq, p); | ||
| 4765 | |||
| 4766 | return p; | ||
| 4767 | simple: | ||
| 4768 | cfs_rq = &rq->cfs; | ||
| 4769 | #endif | ||
| 4770 | |||
| 4771 | if (!cfs_rq->nr_running) | ||
| 4772 | goto idle; | ||
| 4773 | |||
| 4774 | put_prev_task(rq, prev); | ||
| 4505 | 4775 | ||
| 4506 | do { | 4776 | do { |
| 4507 | se = pick_next_entity(cfs_rq); | 4777 | se = pick_next_entity(cfs_rq, NULL); |
| 4508 | set_next_entity(cfs_rq, se); | 4778 | set_next_entity(cfs_rq, se); |
| 4509 | cfs_rq = group_cfs_rq(se); | 4779 | cfs_rq = group_cfs_rq(se); |
| 4510 | } while (cfs_rq); | 4780 | } while (cfs_rq); |
| 4511 | 4781 | ||
| 4512 | p = task_of(se); | 4782 | p = task_of(se); |
| 4783 | |||
| 4513 | if (hrtick_enabled(rq)) | 4784 | if (hrtick_enabled(rq)) |
| 4514 | hrtick_start_fair(rq, p); | 4785 | hrtick_start_fair(rq, p); |
| 4515 | 4786 | ||
| 4516 | return p; | 4787 | return p; |
| 4788 | |||
| 4789 | idle: | ||
| 4790 | new_tasks = idle_balance(rq); | ||
| 4791 | /* | ||
| 4792 | * Because idle_balance() releases (and re-acquires) rq->lock, it is | ||
| 4793 | * possible for any higher priority task to appear. In that case we | ||
| 4794 | * must re-start the pick_next_entity() loop. | ||
| 4795 | */ | ||
| 4796 | if (new_tasks < 0) | ||
| 4797 | return RETRY_TASK; | ||
| 4798 | |||
| 4799 | if (new_tasks > 0) | ||
| 4800 | goto again; | ||
| 4801 | |||
| 4802 | return NULL; | ||
| 4517 | } | 4803 | } |
| 4518 | 4804 | ||
| 4519 | /* | 4805 | /* |
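The prev != p path above only touches the cfs_rqs where the two hierarchies actually differ: both entities are walked upward, putting prev's entities and setting next's, until they meet in a common cfs_rq. A schematic stand-alone version (the put/set callbacks and the parent/depth fields are stand-ins, and it assumes both chains end in a shared root cfs_rq, as they do in the kernel):

    struct toy_ent {
        int depth;              /* distance from the root cfs_rq  */
        struct toy_ent *parent; /* NULL only above the root       */
        void *cfs_rq;           /* equal pointers => same group   */
    };

    static void toy_put(struct toy_ent *e) { (void)e; /* put_prev_entity() stand-in */ }
    static void toy_set(struct toy_ent *e) { (void)e; /* set_next_entity() stand-in */ }

    static void toy_switch(struct toy_ent *se, struct toy_ent *pse)
    {
        while (se->cfs_rq != pse->cfs_rq) {
            int se_depth = se->depth;
            int pse_depth = pse->depth;

            if (se_depth <= pse_depth) {
                toy_put(pse);
                pse = pse->parent;
            }
            if (se_depth >= pse_depth) {
                toy_set(se);
                se = se->parent;
            }
        }
        toy_put(pse);   /* final swap inside the shared cfs_rq */
        toy_set(se);
    }

When the depths are equal both entities step up together, which is exactly why the cached se->depth field replaces the old depth_se() walk earlier in this patch.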
| @@ -4751,7 +5037,7 @@ static void move_task(struct task_struct *p, struct lb_env *env) | |||
| 4751 | * Is this task likely cache-hot: | 5037 | * Is this task likely cache-hot: |
| 4752 | */ | 5038 | */ |
| 4753 | static int | 5039 | static int |
| 4754 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | 5040 | task_hot(struct task_struct *p, u64 now) |
| 4755 | { | 5041 | { |
| 4756 | s64 delta; | 5042 | s64 delta; |
| 4757 | 5043 | ||
| @@ -4785,7 +5071,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) | |||
| 4785 | { | 5071 | { |
| 4786 | int src_nid, dst_nid; | 5072 | int src_nid, dst_nid; |
| 4787 | 5073 | ||
| 4788 | if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || | 5074 | if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || |
| 4789 | !(env->sd->flags & SD_NUMA)) { | 5075 | !(env->sd->flags & SD_NUMA)) { |
| 4790 | return false; | 5076 | return false; |
| 4791 | } | 5077 | } |
| @@ -4816,7 +5102,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) | |||
| 4816 | if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) | 5102 | if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) |
| 4817 | return false; | 5103 | return false; |
| 4818 | 5104 | ||
| 4819 | if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) | 5105 | if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) |
| 4820 | return false; | 5106 | return false; |
| 4821 | 5107 | ||
| 4822 | src_nid = cpu_to_node(env->src_cpu); | 5108 | src_nid = cpu_to_node(env->src_cpu); |
| @@ -4912,7 +5198,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 4912 | * 2) task is cache cold, or | 5198 | * 2) task is cache cold, or |
| 4913 | * 3) too many balance attempts have failed. | 5199 | * 3) too many balance attempts have failed. |
| 4914 | */ | 5200 | */ |
| 4915 | tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); | 5201 | tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq)); |
| 4916 | if (!tsk_cache_hot) | 5202 | if (!tsk_cache_hot) |
| 4917 | tsk_cache_hot = migrate_degrades_locality(p, env); | 5203 | tsk_cache_hot = migrate_degrades_locality(p, env); |
| 4918 | 5204 | ||
| @@ -5775,12 +6061,10 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | |||
| 5775 | pwr_now /= SCHED_POWER_SCALE; | 6061 | pwr_now /= SCHED_POWER_SCALE; |
| 5776 | 6062 | ||
| 5777 | /* Amount of load we'd subtract */ | 6063 | /* Amount of load we'd subtract */ |
| 5778 | tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / | 6064 | if (busiest->avg_load > scaled_busy_load_per_task) { |
| 5779 | busiest->group_power; | ||
| 5780 | if (busiest->avg_load > tmp) { | ||
| 5781 | pwr_move += busiest->group_power * | 6065 | pwr_move += busiest->group_power * |
| 5782 | min(busiest->load_per_task, | 6066 | min(busiest->load_per_task, |
| 5783 | busiest->avg_load - tmp); | 6067 | busiest->avg_load - scaled_busy_load_per_task); |
| 5784 | } | 6068 | } |
| 5785 | 6069 | ||
| 5786 | /* Amount of load we'd add */ | 6070 | /* Amount of load we'd add */ |
| @@ -6359,17 +6643,23 @@ out: | |||
| 6359 | * idle_balance is called by schedule() if this_cpu is about to become | 6643 | * idle_balance is called by schedule() if this_cpu is about to become |
| 6360 | * idle. Attempts to pull tasks from other CPUs. | 6644 | * idle. Attempts to pull tasks from other CPUs. |
| 6361 | */ | 6645 | */ |
| 6362 | void idle_balance(int this_cpu, struct rq *this_rq) | 6646 | static int idle_balance(struct rq *this_rq) |
| 6363 | { | 6647 | { |
| 6364 | struct sched_domain *sd; | 6648 | struct sched_domain *sd; |
| 6365 | int pulled_task = 0; | 6649 | int pulled_task = 0; |
| 6366 | unsigned long next_balance = jiffies + HZ; | 6650 | unsigned long next_balance = jiffies + HZ; |
| 6367 | u64 curr_cost = 0; | 6651 | u64 curr_cost = 0; |
| 6652 | int this_cpu = this_rq->cpu; | ||
| 6368 | 6653 | ||
| 6654 | idle_enter_fair(this_rq); | ||
| 6655 | /* | ||
| 6656 | * We must set idle_stamp _before_ calling idle_balance(), such that we | ||
| 6657 | * measure the duration of idle_balance() as idle time. | ||
| 6658 | */ | ||
| 6369 | this_rq->idle_stamp = rq_clock(this_rq); | 6659 | this_rq->idle_stamp = rq_clock(this_rq); |
| 6370 | 6660 | ||
| 6371 | if (this_rq->avg_idle < sysctl_sched_migration_cost) | 6661 | if (this_rq->avg_idle < sysctl_sched_migration_cost) |
| 6372 | return; | 6662 | goto out; |
| 6373 | 6663 | ||
| 6374 | /* | 6664 | /* |
| 6375 | * Drop the rq->lock, but keep IRQ/preempt disabled. | 6665 | * Drop the rq->lock, but keep IRQ/preempt disabled. |
| @@ -6407,15 +6697,22 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 6407 | interval = msecs_to_jiffies(sd->balance_interval); | 6697 | interval = msecs_to_jiffies(sd->balance_interval); |
| 6408 | if (time_after(next_balance, sd->last_balance + interval)) | 6698 | if (time_after(next_balance, sd->last_balance + interval)) |
| 6409 | next_balance = sd->last_balance + interval; | 6699 | next_balance = sd->last_balance + interval; |
| 6410 | if (pulled_task) { | 6700 | if (pulled_task) |
| 6411 | this_rq->idle_stamp = 0; | ||
| 6412 | break; | 6701 | break; |
| 6413 | } | ||
| 6414 | } | 6702 | } |
| 6415 | rcu_read_unlock(); | 6703 | rcu_read_unlock(); |
| 6416 | 6704 | ||
| 6417 | raw_spin_lock(&this_rq->lock); | 6705 | raw_spin_lock(&this_rq->lock); |
| 6418 | 6706 | ||
| 6707 | /* | ||
| 6708 | * While browsing the domains, we released the rq lock. | ||
| 6709 | * A task could have been enqueued in the meantime | ||
| 6710 | */ | ||
| 6711 | if (this_rq->cfs.h_nr_running && !pulled_task) { | ||
| 6712 | pulled_task = 1; | ||
| 6713 | goto out; | ||
| 6714 | } | ||
| 6715 | |||
| 6419 | if (pulled_task || time_after(jiffies, this_rq->next_balance)) { | 6716 | if (pulled_task || time_after(jiffies, this_rq->next_balance)) { |
| 6420 | /* | 6717 | /* |
| 6421 | * We are going idle. next_balance may be set based on | 6718 | * We are going idle. next_balance may be set based on |
| @@ -6426,6 +6723,20 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 6426 | 6723 | ||
| 6427 | if (curr_cost > this_rq->max_idle_balance_cost) | 6724 | if (curr_cost > this_rq->max_idle_balance_cost) |
| 6428 | this_rq->max_idle_balance_cost = curr_cost; | 6725 | this_rq->max_idle_balance_cost = curr_cost; |
| 6726 | |||
| 6727 | out: | ||
| 6728 | /* Is there a task of a high priority class? */ | ||
| 6729 | if (this_rq->nr_running != this_rq->cfs.h_nr_running && | ||
| 6730 | (this_rq->dl.dl_nr_running || | ||
| 6731 | (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt)))) | ||
| 6732 | pulled_task = -1; | ||
| 6733 | |||
| 6734 | if (pulled_task) { | ||
| 6735 | idle_exit_fair(this_rq); | ||
| 6736 | this_rq->idle_stamp = 0; | ||
| 6737 | } | ||
| 6738 | |||
| 6739 | return pulled_task; | ||
| 6429 | } | 6740 | } |
| 6430 | 6741 | ||
| 6431 | /* | 6742 | /* |
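pick_next_task_fair() reads the new return value three ways: negative means a deadline/RT task appeared while the lock was dropped and the whole class walk must restart, positive means fair tasks were pulled so the CFS pick is retried, and zero means the CPU genuinely idles. A condensed, hypothetical dispatcher for those outcomes:

    /* Hypothetical driver for the three idle_balance() outcomes. */
    enum toy_pick_result { TOY_PICK_IDLE, TOY_PICK_AGAIN, TOY_PICK_RETRY_CLASS };

    static enum toy_pick_result toy_handle_idle_balance(int new_tasks)
    {
        if (new_tasks < 0)
            return TOY_PICK_RETRY_CLASS; /* RT/DL work appeared: redo the class walk */
        if (new_tasks > 0)
            return TOY_PICK_AGAIN;       /* pulled fair tasks: redo the CFS pick     */
        return TOY_PICK_IDLE;            /* nothing to run: go idle                  */
    }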
| @@ -6496,6 +6807,11 @@ out_unlock: | |||
| 6496 | return 0; | 6807 | return 0; |
| 6497 | } | 6808 | } |
| 6498 | 6809 | ||
| 6810 | static inline int on_null_domain(struct rq *rq) | ||
| 6811 | { | ||
| 6812 | return unlikely(!rcu_dereference_sched(rq->sd)); | ||
| 6813 | } | ||
| 6814 | |||
| 6499 | #ifdef CONFIG_NO_HZ_COMMON | 6815 | #ifdef CONFIG_NO_HZ_COMMON |
| 6500 | /* | 6816 | /* |
| 6501 | * idle load balancing details | 6817 | * idle load balancing details |
| @@ -6550,8 +6866,13 @@ static void nohz_balancer_kick(void) | |||
| 6550 | static inline void nohz_balance_exit_idle(int cpu) | 6866 | static inline void nohz_balance_exit_idle(int cpu) |
| 6551 | { | 6867 | { |
| 6552 | if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { | 6868 | if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { |
| 6553 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); | 6869 | /* |
| 6554 | atomic_dec(&nohz.nr_cpus); | 6870 | * Completely isolated CPUs don't ever set, so we must test. |
| 6871 | */ | ||
| 6872 | if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { | ||
| 6873 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); | ||
| 6874 | atomic_dec(&nohz.nr_cpus); | ||
| 6875 | } | ||
| 6555 | clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); | 6876 | clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); |
| 6556 | } | 6877 | } |
| 6557 | } | 6878 | } |
| @@ -6605,6 +6926,12 @@ void nohz_balance_enter_idle(int cpu) | |||
| 6605 | if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) | 6926 | if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) |
| 6606 | return; | 6927 | return; |
| 6607 | 6928 | ||
| 6929 | /* | ||
| 6930 | * If we're a completely isolated CPU, we don't play. | ||
| 6931 | */ | ||
| 6932 | if (on_null_domain(cpu_rq(cpu))) | ||
| 6933 | return; | ||
| 6934 | |||
| 6608 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); | 6935 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); |
| 6609 | atomic_inc(&nohz.nr_cpus); | 6936 | atomic_inc(&nohz.nr_cpus); |
| 6610 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); | 6937 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); |
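The two hunks above form a pair: isolated (null-domain) CPUs never register in the NOHZ idle mask, so the exit path has to tolerate CPUs that were never added. The toy registry below mirrors that asymmetry; all names and the fixed CPU count are invented for the example.

    #include <stdbool.h>

    #define TOY_NR_CPUS 8

    static bool toy_isolated[TOY_NR_CPUS];  /* stand-in for on_null_domain()    */
    static bool toy_idle_mask[TOY_NR_CPUS]; /* stand-in for nohz.idle_cpus_mask */
    static int  toy_nr_idle;                /* stand-in for nohz.nr_cpus        */

    static void toy_enter_idle(int cpu)
    {
        if (toy_isolated[cpu])          /* isolated CPUs don't play */
            return;
        if (!toy_idle_mask[cpu]) {
            toy_idle_mask[cpu] = true;
            toy_nr_idle++;
        }
    }

    static void toy_exit_idle(int cpu)
    {
        if (toy_idle_mask[cpu]) {       /* may never have been set */
            toy_idle_mask[cpu] = false;
            toy_nr_idle--;
        }
    }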
| @@ -6867,11 +7194,6 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
| 6867 | nohz_idle_balance(this_rq, idle); | 7194 | nohz_idle_balance(this_rq, idle); |
| 6868 | } | 7195 | } |
| 6869 | 7196 | ||
| 6870 | static inline int on_null_domain(struct rq *rq) | ||
| 6871 | { | ||
| 6872 | return !rcu_dereference_sched(rq->sd); | ||
| 6873 | } | ||
| 6874 | |||
| 6875 | /* | 7197 | /* |
| 6876 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. | 7198 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. |
| 6877 | */ | 7199 | */ |
| @@ -7001,15 +7323,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
| 7001 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 7323 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
| 7002 | 7324 | ||
| 7003 | /* | 7325 | /* |
| 7004 | * Ensure the task's vruntime is normalized, so that when its | 7326 | * Ensure the task's vruntime is normalized, so that when it's |
| 7005 | * switched back to the fair class the enqueue_entity(.flags=0) will | 7327 | * switched back to the fair class the enqueue_entity(.flags=0) will |
| 7006 | * do the right thing. | 7328 | * do the right thing. |
| 7007 | * | 7329 | * |
| 7008 | * If it was on_rq, then the dequeue_entity(.flags=0) will already | 7330 | * If it's on_rq, then the dequeue_entity(.flags=0) will already |
| 7009 | * have normalized the vruntime, if it was !on_rq, then only when | 7331 | * have normalized the vruntime, if it's !on_rq, then only when |
| 7010 | * the task is sleeping will it still have non-normalized vruntime. | 7332 | * the task is sleeping will it still have non-normalized vruntime. |
| 7011 | */ | 7333 | */ |
| 7012 | if (!se->on_rq && p->state != TASK_RUNNING) { | 7334 | if (!p->on_rq && p->state != TASK_RUNNING) { |
| 7013 | /* | 7335 | /* |
| 7014 | * Fix up our vruntime so that the current sleep doesn't | 7336 | * Fix up our vruntime so that the current sleep doesn't |
| 7015 | * cause 'unlimited' sleep bonus. | 7337 | * cause 'unlimited' sleep bonus. |
| @@ -7036,7 +7358,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
| 7036 | */ | 7358 | */ |
| 7037 | static void switched_to_fair(struct rq *rq, struct task_struct *p) | 7359 | static void switched_to_fair(struct rq *rq, struct task_struct *p) |
| 7038 | { | 7360 | { |
| 7039 | if (!p->se.on_rq) | 7361 | struct sched_entity *se = &p->se; |
| 7362 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 7363 | /* | ||
| 7364 | * Since the real-depth could have been changed (only FAIR | ||
| 7365 | * class maintain depth value), reset depth properly. | ||
| 7366 | */ | ||
| 7367 | se->depth = se->parent ? se->parent->depth + 1 : 0; | ||
| 7368 | #endif | ||
| 7369 | if (!se->on_rq) | ||
| 7040 | return; | 7370 | return; |
| 7041 | 7371 | ||
| 7042 | /* | 7372 | /* |
| @@ -7084,7 +7414,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) | |||
| 7084 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7414 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7085 | static void task_move_group_fair(struct task_struct *p, int on_rq) | 7415 | static void task_move_group_fair(struct task_struct *p, int on_rq) |
| 7086 | { | 7416 | { |
| 7417 | struct sched_entity *se = &p->se; | ||
| 7087 | struct cfs_rq *cfs_rq; | 7418 | struct cfs_rq *cfs_rq; |
| 7419 | |||
| 7088 | /* | 7420 | /* |
| 7089 | * If the task was not on the rq at the time of this cgroup movement | 7421 | * If the task was not on the rq at the time of this cgroup movement |
| 7090 | * it must have been asleep, sleeping tasks keep their ->vruntime | 7422 | * it must have been asleep, sleeping tasks keep their ->vruntime |
| @@ -7110,23 +7442,24 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) | |||
| 7110 | * To prevent boost or penalty in the new cfs_rq caused by delta | 7442 | * To prevent boost or penalty in the new cfs_rq caused by delta |
| 7111 | * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. | 7443 | * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. |
| 7112 | */ | 7444 | */ |
| 7113 | if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) | 7445 | if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING)) |
| 7114 | on_rq = 1; | 7446 | on_rq = 1; |
| 7115 | 7447 | ||
| 7116 | if (!on_rq) | 7448 | if (!on_rq) |
| 7117 | p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; | 7449 | se->vruntime -= cfs_rq_of(se)->min_vruntime; |
| 7118 | set_task_rq(p, task_cpu(p)); | 7450 | set_task_rq(p, task_cpu(p)); |
| 7451 | se->depth = se->parent ? se->parent->depth + 1 : 0; | ||
| 7119 | if (!on_rq) { | 7452 | if (!on_rq) { |
| 7120 | cfs_rq = cfs_rq_of(&p->se); | 7453 | cfs_rq = cfs_rq_of(se); |
| 7121 | p->se.vruntime += cfs_rq->min_vruntime; | 7454 | se->vruntime += cfs_rq->min_vruntime; |
| 7122 | #ifdef CONFIG_SMP | 7455 | #ifdef CONFIG_SMP |
| 7123 | /* | 7456 | /* |
| 7124 | * migrate_task_rq_fair() will have removed our previous | 7457 | * migrate_task_rq_fair() will have removed our previous |
| 7125 | * contribution, but we must synchronize for ongoing future | 7458 | * contribution, but we must synchronize for ongoing future |
| 7126 | * decay. | 7459 | * decay. |
| 7127 | */ | 7460 | */ |
| 7128 | p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | 7461 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); |
| 7129 | cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; | 7462 | cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; |
| 7130 | #endif | 7463 | #endif |
| 7131 | } | 7464 | } |
| 7132 | } | 7465 | } |
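Editorial note: the vruntime handling in task_move_group_fair() above boils down to re-basing a sleeping task's vruntime from the old queue onto the new one. A minimal sketch of the idea follows, using a hypothetical helper name that is not part of the patch:

	/*
	 * Hypothetical helper, for illustration only: a sleeping task's
	 * vruntime is made relative to the cfs_rq it leaves and re-based
	 * on the cfs_rq it joins, so the difference between the two
	 * queues' min_vruntime neither boosts nor penalises the task.
	 */
	static void renormalize_vruntime(struct sched_entity *se,
					 struct cfs_rq *old_cfs_rq,
					 struct cfs_rq *new_cfs_rq)
	{
		se->vruntime -= old_cfs_rq->min_vruntime;	/* make relative */
		se->vruntime += new_cfs_rq->min_vruntime;	/* re-base */
	}

In the patch itself the subtraction happens before set_task_rq() switches the entity to its new cfs_rq and the addition happens after it, which is why the two halves are split around that call.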
| @@ -7222,10 +7555,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
| 7222 | if (!se) | 7555 | if (!se) |
| 7223 | return; | 7556 | return; |
| 7224 | 7557 | ||
| 7225 | if (!parent) | 7558 | if (!parent) { |
| 7226 | se->cfs_rq = &rq->cfs; | 7559 | se->cfs_rq = &rq->cfs; |
| 7227 | else | 7560 | se->depth = 0; |
| 7561 | } else { | ||
| 7228 | se->cfs_rq = parent->my_q; | 7562 | se->cfs_rq = parent->my_q; |
| 7563 | se->depth = parent->depth + 1; | ||
| 7564 | } | ||
| 7229 | 7565 | ||
| 7230 | se->my_q = cfs_rq; | 7566 | se->my_q = cfs_rq; |
| 7231 | /* guarantee group entities always have weight */ | 7567 | /* guarantee group entities always have weight */ |
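Editorial note on the se->depth field threaded through the fair.c hunks above: with CONFIG_FAIR_GROUP_SCHED the depth caches how far an entity sits below the root, so two entities can be walked up to a common ancestor without recomputing their depths every time. A sketch of that walk, assuming the usual fair.c helpers parent_entity() and is_same_group(); the function name itself is illustrative:

	static void walk_to_common_ancestor(struct sched_entity **se,
					    struct sched_entity **pse)
	{
		int se_depth = (*se)->depth;
		int pse_depth = (*pse)->depth;

		/* Bring the deeper entity up to the shallower one's level. */
		while (se_depth > pse_depth) {
			se_depth--;
			*se = parent_entity(*se);
		}
		while (pse_depth > se_depth) {
			pse_depth--;
			*pse = parent_entity(*pse);
		}

		/* Then climb both together until they share a cfs_rq. */
		while (!is_same_group(*se, *pse)) {
			*se = parent_entity(*se);
			*pse = parent_entity(*pse);
		}
	}

This is why switched_to_fair(), task_move_group_fair() and init_tg_cfs_entry() all have to keep the depth consistent whenever an entity's parent changes.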
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c new file mode 100644 index 000000000000..8f4390a079c7 --- /dev/null +++ b/kernel/sched/idle.c | |||
| @@ -0,0 +1,265 @@ | |||
| 1 | /* | ||
| 2 | * Generic entry point for the idle threads | ||
| 3 | */ | ||
| 4 | #include <linux/sched.h> | ||
| 5 | #include <linux/cpu.h> | ||
| 6 | #include <linux/cpuidle.h> | ||
| 7 | #include <linux/tick.h> | ||
| 8 | #include <linux/mm.h> | ||
| 9 | #include <linux/stackprotector.h> | ||
| 10 | |||
| 11 | #include <asm/tlb.h> | ||
| 12 | |||
| 13 | #include <trace/events/power.h> | ||
| 14 | |||
| 15 | static int __read_mostly cpu_idle_force_poll; | ||
| 16 | |||
| 17 | void cpu_idle_poll_ctrl(bool enable) | ||
| 18 | { | ||
| 19 | if (enable) { | ||
| 20 | cpu_idle_force_poll++; | ||
| 21 | } else { | ||
| 22 | cpu_idle_force_poll--; | ||
| 23 | WARN_ON_ONCE(cpu_idle_force_poll < 0); | ||
| 24 | } | ||
| 25 | } | ||
| 26 | |||
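Usage sketch (the caller below is hypothetical): cpu_idle_force_poll is a plain counter, so cpu_idle_poll_ctrl() calls must be balanced and may nest. The "nohlt"/"hlt" boot parameters handled just below simply preset the same counter.

	cpu_idle_poll_ctrl(true);	/* force the polling idle loop */
	run_latency_critical_section();	/* hypothetical work that cannot
					 * tolerate deep-idle exit latency */
	cpu_idle_poll_ctrl(false);	/* drop back to normal idle */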
| 27 | #ifdef CONFIG_GENERIC_IDLE_POLL_SETUP | ||
| 28 | static int __init cpu_idle_poll_setup(char *__unused) | ||
| 29 | { | ||
| 30 | cpu_idle_force_poll = 1; | ||
| 31 | return 1; | ||
| 32 | } | ||
| 33 | __setup("nohlt", cpu_idle_poll_setup); | ||
| 34 | |||
| 35 | static int __init cpu_idle_nopoll_setup(char *__unused) | ||
| 36 | { | ||
| 37 | cpu_idle_force_poll = 0; | ||
| 38 | return 1; | ||
| 39 | } | ||
| 40 | __setup("hlt", cpu_idle_nopoll_setup); | ||
| 41 | #endif | ||
| 42 | |||
| 43 | static inline int cpu_idle_poll(void) | ||
| 44 | { | ||
| 45 | rcu_idle_enter(); | ||
| 46 | trace_cpu_idle_rcuidle(0, smp_processor_id()); | ||
| 47 | local_irq_enable(); | ||
| 48 | while (!tif_need_resched()) | ||
| 49 | cpu_relax(); | ||
| 50 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); | ||
| 51 | rcu_idle_exit(); | ||
| 52 | return 1; | ||
| 53 | } | ||
| 54 | |||
| 55 | /* Weak implementations for optional arch specific functions */ | ||
| 56 | void __weak arch_cpu_idle_prepare(void) { } | ||
| 57 | void __weak arch_cpu_idle_enter(void) { } | ||
| 58 | void __weak arch_cpu_idle_exit(void) { } | ||
| 59 | void __weak arch_cpu_idle_dead(void) { } | ||
| 60 | void __weak arch_cpu_idle(void) | ||
| 61 | { | ||
| 62 | cpu_idle_force_poll = 1; | ||
| 63 | local_irq_enable(); | ||
| 64 | } | ||
| 65 | |||
| 66 | /** | ||
| 67 | * cpuidle_idle_call - the main idle function | ||
| 68 | * | ||
| 69 | * NOTE: no locks or semaphores should be used here | ||
| 70 | * return non-zero on failure | ||
| 71 | */ | ||
| 72 | static int cpuidle_idle_call(void) | ||
| 73 | { | ||
| 74 | struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); | ||
| 75 | struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); | ||
| 76 | int next_state, entered_state, ret; | ||
| 77 | bool broadcast; | ||
| 78 | |||
| 79 | /* | ||
| 80 | * Check if the idle task must be rescheduled. If it is the | ||
| 81 | * case, exit the function after re-enabling the local irq and | ||
| 82 | * setting the polling flag again | ||
| 83 | */ | ||
| 84 | if (current_clr_polling_and_test()) { | ||
| 85 | local_irq_enable(); | ||
| 86 | __current_set_polling(); | ||
| 87 | return 0; | ||
| 88 | } | ||
| 89 | |||
| 90 | /* | ||
| 91 | * During the idle period, stop measuring the latencies of | ||
| 92 | * irqs-disabled critical sections | ||
| 93 | */ | ||
| 94 | stop_critical_timings(); | ||
| 95 | |||
| 96 | /* | ||
| 97 | * Tell the RCU framework we are entering an idle section, | ||
| 98 | * so it expects no more RCU read-side critical sections, and can | ||
| 99 | * take one more step toward the grace period | ||
| 100 | */ | ||
| 101 | rcu_idle_enter(); | ||
| 102 | |||
| 103 | /* | ||
| 104 | * Check if the cpuidle framework is ready, otherwise fall back | ||
| 105 | * to the default arch-specific idle method | ||
| 106 | */ | ||
| 107 | ret = cpuidle_enabled(drv, dev); | ||
| 108 | |||
| 109 | if (!ret) { | ||
| 110 | /* | ||
| 111 | * Ask the governor to choose an idle state it thinks | ||
| 112 | * it is convenient to go to. There is *always* a | ||
| 113 | * convenient idle state | ||
| 114 | */ | ||
| 115 | next_state = cpuidle_select(drv, dev); | ||
| 116 | |||
| 117 | /* | ||
| 118 | * A reschedule is pending, so it is pointless to | ||
| 119 | * go idle; just record a zero idle residency and get | ||
| 120 | * out of this function | ||
| 121 | */ | ||
| 122 | if (current_clr_polling_and_test()) { | ||
| 123 | dev->last_residency = 0; | ||
| 124 | entered_state = next_state; | ||
| 125 | local_irq_enable(); | ||
| 126 | } else { | ||
| 127 | broadcast = !!(drv->states[next_state].flags & | ||
| 128 | CPUIDLE_FLAG_TIMER_STOP); | ||
| 129 | |||
| 130 | if (broadcast) | ||
| 131 | /* | ||
| 132 | * Tell the time framework to switch | ||
| 133 | * to a broadcast timer because our | ||
| 134 | * local timer will be shut down. If a | ||
| 135 | * local timer is used from another | ||
| 136 | * cpu as a broadcast timer, this call | ||
| 137 | * may fail if it is not available | ||
| 138 | */ | ||
| 139 | ret = clockevents_notify( | ||
| 140 | CLOCK_EVT_NOTIFY_BROADCAST_ENTER, | ||
| 141 | &dev->cpu); | ||
| 142 | |||
| 143 | if (!ret) { | ||
| 144 | trace_cpu_idle_rcuidle(next_state, dev->cpu); | ||
| 145 | |||
| 146 | /* | ||
| 147 | * Enter the idle state previously | ||
| 148 | * returned by the governor | ||
| 149 | * decision. This function will block | ||
| 150 | * until an interrupt occurs and will | ||
| 151 | * take care of re-enabling the local | ||
| 152 | * interrupts | ||
| 153 | */ | ||
| 154 | entered_state = cpuidle_enter(drv, dev, | ||
| 155 | next_state); | ||
| 156 | |||
| 157 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, | ||
| 158 | dev->cpu); | ||
| 159 | |||
| 160 | if (broadcast) | ||
| 161 | clockevents_notify( | ||
| 162 | CLOCK_EVT_NOTIFY_BROADCAST_EXIT, | ||
| 163 | &dev->cpu); | ||
| 164 | |||
| 165 | /* | ||
| 166 | * Give the governor an opportunity to reflect on the | ||
| 167 | * outcome | ||
| 168 | */ | ||
| 169 | cpuidle_reflect(dev, entered_state); | ||
| 170 | } | ||
| 171 | } | ||
| 172 | } | ||
| 173 | |||
| 174 | /* | ||
| 175 | * We can't use the cpuidle framework, so fall back to the default | ||
| 176 | * idle routine | ||
| 177 | */ | ||
| 178 | if (ret) | ||
| 179 | arch_cpu_idle(); | ||
| 180 | |||
| 181 | __current_set_polling(); | ||
| 182 | |||
| 183 | /* | ||
| 184 | * It is up to the idle functions to re-enable local | ||
| 185 | * interrupts | ||
| 186 | */ | ||
| 187 | if (WARN_ON_ONCE(irqs_disabled())) | ||
| 188 | local_irq_enable(); | ||
| 189 | |||
| 190 | rcu_idle_exit(); | ||
| 191 | start_critical_timings(); | ||
| 192 | |||
| 193 | return 0; | ||
| 194 | } | ||
| 195 | |||
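Editorial note: stripped of tracing, critical-timings bookkeeping and the broadcast-timer handling, the control flow of cpuidle_idle_call() above reduces to the sketch below. It is a reading aid only, not a drop-in replacement, and it reuses the same helpers as the real function:

	static int cpuidle_idle_call_sketch(void)
	{
		struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
		struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
		int state;

		if (current_clr_polling_and_test()) {	/* reschedule pending */
			local_irq_enable();
			__current_set_polling();
			return 0;
		}

		rcu_idle_enter();

		if (!cpuidle_enabled(drv, dev)) {
			state = cpuidle_select(drv, dev);	/* governor picks */
			state = cpuidle_enter(drv, dev, state);	/* blocks until an IRQ */
			cpuidle_reflect(dev, state);		/* feedback to governor */
		} else {
			arch_cpu_idle();	/* fallback: default arch idle */
		}

		__current_set_polling();
		rcu_idle_exit();
		return 0;
	}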
| 196 | /* | ||
| 197 | * Generic idle loop implementation | ||
| 198 | */ | ||
| 199 | static void cpu_idle_loop(void) | ||
| 200 | { | ||
| 201 | while (1) { | ||
| 202 | tick_nohz_idle_enter(); | ||
| 203 | |||
| 204 | while (!need_resched()) { | ||
| 205 | check_pgt_cache(); | ||
| 206 | rmb(); | ||
| 207 | |||
| 208 | if (cpu_is_offline(smp_processor_id())) | ||
| 209 | arch_cpu_idle_dead(); | ||
| 210 | |||
| 211 | local_irq_disable(); | ||
| 212 | arch_cpu_idle_enter(); | ||
| 213 | |||
| 214 | /* | ||
| 215 | * In poll mode we reenable interrupts and spin. | ||
| 216 | * | ||
| 217 | * Also if we detected in the wakeup from idle | ||
| 218 | * path that the tick broadcast device expired | ||
| 219 | * for us, we don't want to go deep idle as we | ||
| 220 | * know that the IPI is going to arrive right | ||
| 221 | * away | ||
| 222 | */ | ||
| 223 | if (cpu_idle_force_poll || tick_check_broadcast_expired()) | ||
| 224 | cpu_idle_poll(); | ||
| 225 | else | ||
| 226 | cpuidle_idle_call(); | ||
| 227 | |||
| 228 | arch_cpu_idle_exit(); | ||
| 229 | } | ||
| 230 | |||
| 231 | /* | ||
| 232 | * Since we fell out of the loop above, we know | ||
| 233 | * TIF_NEED_RESCHED must be set, propagate it into | ||
| 234 | * PREEMPT_NEED_RESCHED. | ||
| 235 | * | ||
| 236 | * This is required because for polling idle loops we will | ||
| 237 | * not have had an IPI to fold the state for us. | ||
| 238 | */ | ||
| 239 | preempt_set_need_resched(); | ||
| 240 | tick_nohz_idle_exit(); | ||
| 241 | schedule_preempt_disabled(); | ||
| 242 | } | ||
| 243 | } | ||
| 244 | |||
| 245 | void cpu_startup_entry(enum cpuhp_state state) | ||
| 246 | { | ||
| 247 | /* | ||
| 248 | * This #ifdef needs to die, but it's too late in the cycle to | ||
| 249 | * make this generic (arm and sh have never invoked the canary | ||
| 250 | * init for the non-boot cpus!). Will be fixed in 3.11 | ||
| 251 | */ | ||
| 252 | #ifdef CONFIG_X86 | ||
| 253 | /* | ||
| 254 | * If we're the non-boot CPU, nothing set the stack canary up | ||
| 255 | * for us. The boot CPU already has it initialized but no harm | ||
| 256 | * in doing it again. This is a good place for updating it, as | ||
| 257 | * we won't ever return from this function (so the invalid | ||
| 258 | * canaries already on the stack won't ever trigger). | ||
| 259 | */ | ||
| 260 | boot_init_stack_canary(); | ||
| 261 | #endif | ||
| 262 | __current_set_polling(); | ||
| 263 | arch_cpu_idle_prepare(); | ||
| 264 | cpu_idle_loop(); | ||
| 265 | } | ||
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 516c3d9ceea1..879f2b75266a 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
| @@ -13,18 +13,8 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) | |||
| 13 | { | 13 | { |
| 14 | return task_cpu(p); /* IDLE tasks are never migrated */ | 14 | return task_cpu(p); /* IDLE tasks are never migrated */ |
| 15 | } | 15 | } |
| 16 | |||
| 17 | static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) | ||
| 18 | { | ||
| 19 | idle_exit_fair(rq); | ||
| 20 | rq_last_tick_reset(rq); | ||
| 21 | } | ||
| 22 | |||
| 23 | static void post_schedule_idle(struct rq *rq) | ||
| 24 | { | ||
| 25 | idle_enter_fair(rq); | ||
| 26 | } | ||
| 27 | #endif /* CONFIG_SMP */ | 16 | #endif /* CONFIG_SMP */ |
| 17 | |||
| 28 | /* | 18 | /* |
| 29 | * Idle tasks are unconditionally rescheduled: | 19 | * Idle tasks are unconditionally rescheduled: |
| 30 | */ | 20 | */ |
| @@ -33,13 +23,12 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl | |||
| 33 | resched_task(rq->idle); | 23 | resched_task(rq->idle); |
| 34 | } | 24 | } |
| 35 | 25 | ||
| 36 | static struct task_struct *pick_next_task_idle(struct rq *rq) | 26 | static struct task_struct * |
| 27 | pick_next_task_idle(struct rq *rq, struct task_struct *prev) | ||
| 37 | { | 28 | { |
| 29 | put_prev_task(rq, prev); | ||
| 30 | |||
| 38 | schedstat_inc(rq, sched_goidle); | 31 | schedstat_inc(rq, sched_goidle); |
| 39 | #ifdef CONFIG_SMP | ||
| 40 | /* Trigger the post schedule to do an idle_enter for CFS */ | ||
| 41 | rq->post_schedule = 1; | ||
| 42 | #endif | ||
| 43 | return rq->idle; | 32 | return rq->idle; |
| 44 | } | 33 | } |
| 45 | 34 | ||
| @@ -58,6 +47,8 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) | |||
| 58 | 47 | ||
| 59 | static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) | 48 | static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) |
| 60 | { | 49 | { |
| 50 | idle_exit_fair(rq); | ||
| 51 | rq_last_tick_reset(rq); | ||
| 61 | } | 52 | } |
| 62 | 53 | ||
| 63 | static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) | 54 | static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) |
| @@ -101,8 +92,6 @@ const struct sched_class idle_sched_class = { | |||
| 101 | 92 | ||
| 102 | #ifdef CONFIG_SMP | 93 | #ifdef CONFIG_SMP |
| 103 | .select_task_rq = select_task_rq_idle, | 94 | .select_task_rq = select_task_rq_idle, |
| 104 | .pre_schedule = pre_schedule_idle, | ||
| 105 | .post_schedule = post_schedule_idle, | ||
| 106 | #endif | 95 | #endif |
| 107 | 96 | ||
| 108 | .set_curr_task = set_curr_task_idle, | 97 | .set_curr_task = set_curr_task_idle, |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a2740b775b45..d8cdf1618551 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -229,6 +229,14 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
| 229 | 229 | ||
| 230 | #ifdef CONFIG_SMP | 230 | #ifdef CONFIG_SMP |
| 231 | 231 | ||
| 232 | static int pull_rt_task(struct rq *this_rq); | ||
| 233 | |||
| 234 | static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) | ||
| 235 | { | ||
| 236 | /* Try to pull RT tasks here if we lower this rq's prio */ | ||
| 237 | return rq->rt.highest_prio.curr > prev->prio; | ||
| 238 | } | ||
| 239 | |||
| 232 | static inline int rt_overloaded(struct rq *rq) | 240 | static inline int rt_overloaded(struct rq *rq) |
| 233 | { | 241 | { |
| 234 | return atomic_read(&rq->rd->rto_count); | 242 | return atomic_read(&rq->rd->rto_count); |
| @@ -315,6 +323,15 @@ static inline int has_pushable_tasks(struct rq *rq) | |||
| 315 | return !plist_head_empty(&rq->rt.pushable_tasks); | 323 | return !plist_head_empty(&rq->rt.pushable_tasks); |
| 316 | } | 324 | } |
| 317 | 325 | ||
| 326 | static inline void set_post_schedule(struct rq *rq) | ||
| 327 | { | ||
| 328 | /* | ||
| 329 | * We detect this state here so that we can avoid taking the RQ | ||
| 330 | * lock again later if there is no need to push | ||
| 331 | */ | ||
| 332 | rq->post_schedule = has_pushable_tasks(rq); | ||
| 333 | } | ||
| 334 | |||
| 318 | static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) | 335 | static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) |
| 319 | { | 336 | { |
| 320 | plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); | 337 | plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); |
| @@ -359,6 +376,19 @@ void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
| 359 | { | 376 | { |
| 360 | } | 377 | } |
| 361 | 378 | ||
| 379 | static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) | ||
| 380 | { | ||
| 381 | return false; | ||
| 382 | } | ||
| 383 | |||
| 384 | static inline int pull_rt_task(struct rq *this_rq) | ||
| 385 | { | ||
| 386 | return 0; | ||
| 387 | } | ||
| 388 | |||
| 389 | static inline void set_post_schedule(struct rq *rq) | ||
| 390 | { | ||
| 391 | } | ||
| 362 | #endif /* CONFIG_SMP */ | 392 | #endif /* CONFIG_SMP */ |
| 363 | 393 | ||
| 364 | static inline int on_rt_rq(struct sched_rt_entity *rt_se) | 394 | static inline int on_rt_rq(struct sched_rt_entity *rt_se) |
| @@ -440,11 +470,6 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) | |||
| 440 | dequeue_rt_entity(rt_se); | 470 | dequeue_rt_entity(rt_se); |
| 441 | } | 471 | } |
| 442 | 472 | ||
| 443 | static inline int rt_rq_throttled(struct rt_rq *rt_rq) | ||
| 444 | { | ||
| 445 | return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; | ||
| 446 | } | ||
| 447 | |||
| 448 | static int rt_se_boosted(struct sched_rt_entity *rt_se) | 473 | static int rt_se_boosted(struct sched_rt_entity *rt_se) |
| 449 | { | 474 | { |
| 450 | struct rt_rq *rt_rq = group_rt_rq(rt_se); | 475 | struct rt_rq *rt_rq = group_rt_rq(rt_se); |
| @@ -515,11 +540,6 @@ static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) | |||
| 515 | { | 540 | { |
| 516 | } | 541 | } |
| 517 | 542 | ||
| 518 | static inline int rt_rq_throttled(struct rt_rq *rt_rq) | ||
| 519 | { | ||
| 520 | return rt_rq->rt_throttled; | ||
| 521 | } | ||
| 522 | |||
| 523 | static inline const struct cpumask *sched_rt_period_mask(void) | 543 | static inline const struct cpumask *sched_rt_period_mask(void) |
| 524 | { | 544 | { |
| 525 | return cpu_online_mask; | 545 | return cpu_online_mask; |
| @@ -538,6 +558,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) | |||
| 538 | 558 | ||
| 539 | #endif /* CONFIG_RT_GROUP_SCHED */ | 559 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 540 | 560 | ||
| 561 | bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) | ||
| 562 | { | ||
| 563 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | ||
| 564 | |||
| 565 | return (hrtimer_active(&rt_b->rt_period_timer) || | ||
| 566 | rt_rq->rt_time < rt_b->rt_runtime); | ||
| 567 | } | ||
| 568 | |||
| 541 | #ifdef CONFIG_SMP | 569 | #ifdef CONFIG_SMP |
| 542 | /* | 570 | /* |
| 543 | * We ran out of runtime, see if we can borrow some from our neighbours. | 571 | * We ran out of runtime, see if we can borrow some from our neighbours. |
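Editorial note: sched_rt_bandwidth_account() above answers the question "should this rt_rq still be charged for runtime?", returning true while the period timer is active or while rt_time has not yet reached rt_runtime. A plausible caller is sketched below; treat the surrounding context as an assumption, since the call site is not part of the hunks shown here:

	/* Inside runtime accounting, e.g. when charging delta_exec: */
	if (sched_rt_bandwidth_account(rt_rq))
		rt_rq->rt_time += delta_exec;	/* only account while the
						 * bandwidth mechanism is live */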
| @@ -1310,15 +1338,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) | |||
| 1310 | { | 1338 | { |
| 1311 | struct sched_rt_entity *rt_se; | 1339 | struct sched_rt_entity *rt_se; |
| 1312 | struct task_struct *p; | 1340 | struct task_struct *p; |
| 1313 | struct rt_rq *rt_rq; | 1341 | struct rt_rq *rt_rq = &rq->rt; |
| 1314 | |||
| 1315 | rt_rq = &rq->rt; | ||
| 1316 | |||
| 1317 | if (!rt_rq->rt_nr_running) | ||
| 1318 | return NULL; | ||
| 1319 | |||
| 1320 | if (rt_rq_throttled(rt_rq)) | ||
| 1321 | return NULL; | ||
| 1322 | 1342 | ||
| 1323 | do { | 1343 | do { |
| 1324 | rt_se = pick_next_rt_entity(rq, rt_rq); | 1344 | rt_se = pick_next_rt_entity(rq, rt_rq); |
| @@ -1332,21 +1352,45 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) | |||
| 1332 | return p; | 1352 | return p; |
| 1333 | } | 1353 | } |
| 1334 | 1354 | ||
| 1335 | static struct task_struct *pick_next_task_rt(struct rq *rq) | 1355 | static struct task_struct * |
| 1356 | pick_next_task_rt(struct rq *rq, struct task_struct *prev) | ||
| 1336 | { | 1357 | { |
| 1337 | struct task_struct *p = _pick_next_task_rt(rq); | 1358 | struct task_struct *p; |
| 1359 | struct rt_rq *rt_rq = &rq->rt; | ||
| 1360 | |||
| 1361 | if (need_pull_rt_task(rq, prev)) { | ||
| 1362 | pull_rt_task(rq); | ||
| 1363 | /* | ||
| 1364 | * pull_rt_task() can drop (and re-acquire) rq->lock; this | ||
| 1365 | * means a dl task can slip in, in which case we need to | ||
| 1366 | * re-start task selection. | ||
| 1367 | */ | ||
| 1368 | if (unlikely(rq->dl.dl_nr_running)) | ||
| 1369 | return RETRY_TASK; | ||
| 1370 | } | ||
| 1371 | |||
| 1372 | /* | ||
| 1373 | * We may dequeue prev's rt_rq in put_prev_task(). | ||
| 1374 | * So, we update time before rt_nr_running check. | ||
| 1375 | */ | ||
| 1376 | if (prev->sched_class == &rt_sched_class) | ||
| 1377 | update_curr_rt(rq); | ||
| 1378 | |||
| 1379 | if (!rt_rq->rt_nr_running) | ||
| 1380 | return NULL; | ||
| 1381 | |||
| 1382 | if (rt_rq_throttled(rt_rq)) | ||
| 1383 | return NULL; | ||
| 1384 | |||
| 1385 | put_prev_task(rq, prev); | ||
| 1386 | |||
| 1387 | p = _pick_next_task_rt(rq); | ||
| 1338 | 1388 | ||
| 1339 | /* The running task is never eligible for pushing */ | 1389 | /* The running task is never eligible for pushing */ |
| 1340 | if (p) | 1390 | if (p) |
| 1341 | dequeue_pushable_task(rq, p); | 1391 | dequeue_pushable_task(rq, p); |
| 1342 | 1392 | ||
| 1343 | #ifdef CONFIG_SMP | 1393 | set_post_schedule(rq); |
| 1344 | /* | ||
| 1345 | * We detect this state here so that we can avoid taking the RQ | ||
| 1346 | * lock again later if there is no need to push | ||
| 1347 | */ | ||
| 1348 | rq->post_schedule = has_pushable_tasks(rq); | ||
| 1349 | #endif | ||
| 1350 | 1394 | ||
| 1351 | return p; | 1395 | return p; |
| 1352 | } | 1396 | } |
| @@ -1716,13 +1760,6 @@ skip: | |||
| 1716 | return ret; | 1760 | return ret; |
| 1717 | } | 1761 | } |
| 1718 | 1762 | ||
| 1719 | static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) | ||
| 1720 | { | ||
| 1721 | /* Try to pull RT tasks here if we lower this rq's prio */ | ||
| 1722 | if (rq->rt.highest_prio.curr > prev->prio) | ||
| 1723 | pull_rt_task(rq); | ||
| 1724 | } | ||
| 1725 | |||
| 1726 | static void post_schedule_rt(struct rq *rq) | 1763 | static void post_schedule_rt(struct rq *rq) |
| 1727 | { | 1764 | { |
| 1728 | push_rt_tasks(rq); | 1765 | push_rt_tasks(rq); |
| @@ -1825,7 +1862,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) | |||
| 1825 | resched_task(rq->curr); | 1862 | resched_task(rq->curr); |
| 1826 | } | 1863 | } |
| 1827 | 1864 | ||
| 1828 | void init_sched_rt_class(void) | 1865 | void __init init_sched_rt_class(void) |
| 1829 | { | 1866 | { |
| 1830 | unsigned int i; | 1867 | unsigned int i; |
| 1831 | 1868 | ||
| @@ -1999,7 +2036,6 @@ const struct sched_class rt_sched_class = { | |||
| 1999 | .set_cpus_allowed = set_cpus_allowed_rt, | 2036 | .set_cpus_allowed = set_cpus_allowed_rt, |
| 2000 | .rq_online = rq_online_rt, | 2037 | .rq_online = rq_online_rt, |
| 2001 | .rq_offline = rq_offline_rt, | 2038 | .rq_offline = rq_offline_rt, |
| 2002 | .pre_schedule = pre_schedule_rt, | ||
| 2003 | .post_schedule = post_schedule_rt, | 2039 | .post_schedule = post_schedule_rt, |
| 2004 | .task_woken = task_woken_rt, | 2040 | .task_woken = task_woken_rt, |
| 2005 | .switched_from = switched_from_rt, | 2041 | .switched_from = switched_from_rt, |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f964add50f38..c9007f28d3a2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -24,24 +24,6 @@ extern long calc_load_fold_active(struct rq *this_rq); | |||
| 24 | extern void update_cpu_load_active(struct rq *this_rq); | 24 | extern void update_cpu_load_active(struct rq *this_rq); |
| 25 | 25 | ||
| 26 | /* | 26 | /* |
| 27 | * Convert user-nice values [ -20 ... 0 ... 19 ] | ||
| 28 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], | ||
| 29 | * and back. | ||
| 30 | */ | ||
| 31 | #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) | ||
| 32 | #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) | ||
| 33 | #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) | ||
| 34 | |||
| 35 | /* | ||
| 36 | * 'User priority' is the nice value converted to something we | ||
| 37 | * can work with better when scaling various scheduler parameters, | ||
| 38 | * it's a [ 0 ... 39 ] range. | ||
| 39 | */ | ||
| 40 | #define USER_PRIO(p) ((p)-MAX_RT_PRIO) | ||
| 41 | #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) | ||
| 42 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) | ||
| 43 | |||
| 44 | /* | ||
| 45 | * Helpers for converting nanosecond timing to jiffy resolution | 27 | * Helpers for converting nanosecond timing to jiffy resolution |
| 46 | */ | 28 | */ |
| 47 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) | 29 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) |
| @@ -441,6 +423,18 @@ struct rt_rq { | |||
| 441 | #endif | 423 | #endif |
| 442 | }; | 424 | }; |
| 443 | 425 | ||
| 426 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 427 | static inline int rt_rq_throttled(struct rt_rq *rt_rq) | ||
| 428 | { | ||
| 429 | return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; | ||
| 430 | } | ||
| 431 | #else | ||
| 432 | static inline int rt_rq_throttled(struct rt_rq *rt_rq) | ||
| 433 | { | ||
| 434 | return rt_rq->rt_throttled; | ||
| 435 | } | ||
| 436 | #endif | ||
| 437 | |||
| 444 | /* Deadline class' related fields in a runqueue */ | 438 | /* Deadline class' related fields in a runqueue */ |
| 445 | struct dl_rq { | 439 | struct dl_rq { |
| 446 | /* runqueue is an rbtree, ordered by deadline */ | 440 | /* runqueue is an rbtree, ordered by deadline */ |
| @@ -558,11 +552,9 @@ struct rq { | |||
| 558 | #ifdef CONFIG_FAIR_GROUP_SCHED | 552 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 559 | /* list of leaf cfs_rq on this cpu: */ | 553 | /* list of leaf cfs_rq on this cpu: */ |
| 560 | struct list_head leaf_cfs_rq_list; | 554 | struct list_head leaf_cfs_rq_list; |
| 561 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
| 562 | 555 | ||
| 563 | #ifdef CONFIG_RT_GROUP_SCHED | 556 | struct sched_avg avg; |
| 564 | struct list_head leaf_rt_rq_list; | 557 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 565 | #endif | ||
| 566 | 558 | ||
| 567 | /* | 559 | /* |
| 568 | * This is part of a global counter where only the total sum | 560 | * This is part of a global counter where only the total sum |
| @@ -651,8 +643,6 @@ struct rq { | |||
| 651 | #ifdef CONFIG_SMP | 643 | #ifdef CONFIG_SMP |
| 652 | struct llist_head wake_list; | 644 | struct llist_head wake_list; |
| 653 | #endif | 645 | #endif |
| 654 | |||
| 655 | struct sched_avg avg; | ||
| 656 | }; | 646 | }; |
| 657 | 647 | ||
| 658 | static inline int cpu_of(struct rq *rq) | 648 | static inline int cpu_of(struct rq *rq) |
| @@ -1112,6 +1102,8 @@ static const u32 prio_to_wmult[40] = { | |||
| 1112 | 1102 | ||
| 1113 | #define DEQUEUE_SLEEP 1 | 1103 | #define DEQUEUE_SLEEP 1 |
| 1114 | 1104 | ||
| 1105 | #define RETRY_TASK ((void *)-1UL) | ||
| 1106 | |||
| 1115 | struct sched_class { | 1107 | struct sched_class { |
| 1116 | const struct sched_class *next; | 1108 | const struct sched_class *next; |
| 1117 | 1109 | ||
| @@ -1122,14 +1114,22 @@ struct sched_class { | |||
| 1122 | 1114 | ||
| 1123 | void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); | 1115 | void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); |
| 1124 | 1116 | ||
| 1125 | struct task_struct * (*pick_next_task) (struct rq *rq); | 1117 | /* |
| 1118 | * It is the responsibility of the pick_next_task() method that will | ||
| 1119 | * return the next task to call put_prev_task() on the @prev task or | ||
| 1120 | * something equivalent. | ||
| 1121 | * | ||
| 1122 | * May return RETRY_TASK when it finds a higher prio class has runnable | ||
| 1123 | * tasks. | ||
| 1124 | */ | ||
| 1125 | struct task_struct * (*pick_next_task) (struct rq *rq, | ||
| 1126 | struct task_struct *prev); | ||
| 1126 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); | 1127 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); |
| 1127 | 1128 | ||
| 1128 | #ifdef CONFIG_SMP | 1129 | #ifdef CONFIG_SMP |
| 1129 | int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); | 1130 | int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); |
| 1130 | void (*migrate_task_rq)(struct task_struct *p, int next_cpu); | 1131 | void (*migrate_task_rq)(struct task_struct *p, int next_cpu); |
| 1131 | 1132 | ||
| 1132 | void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); | ||
| 1133 | void (*post_schedule) (struct rq *this_rq); | 1133 | void (*post_schedule) (struct rq *this_rq); |
| 1134 | void (*task_waking) (struct task_struct *task); | 1134 | void (*task_waking) (struct task_struct *task); |
| 1135 | void (*task_woken) (struct rq *this_rq, struct task_struct *task); | 1135 | void (*task_woken) (struct rq *this_rq, struct task_struct *task); |
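To make the new contract concrete, here is a minimal sketch of a pick_next_task() implementation for an imaginary class "foo"; all foo_* names are placeholders, not kernel APIs:

	static struct task_struct *
	pick_next_task_foo(struct rq *rq, struct task_struct *prev)
	{
		if (foo_higher_class_now_runnable(rq))
			return RETRY_TASK;	/* core restarts from the top */

		if (!foo_nr_running(rq))
			return NULL;		/* let a lower class pick */

		put_prev_task(rq, prev);	/* our duty under the new API */

		return foo_pick(rq);
	}

Compare pick_next_task_rt() and pick_next_task_stop() in the hunks elsewhere in this patch, which follow exactly this shape.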
| @@ -1159,6 +1159,11 @@ struct sched_class { | |||
| 1159 | #endif | 1159 | #endif |
| 1160 | }; | 1160 | }; |
| 1161 | 1161 | ||
| 1162 | static inline void put_prev_task(struct rq *rq, struct task_struct *prev) | ||
| 1163 | { | ||
| 1164 | prev->sched_class->put_prev_task(rq, prev); | ||
| 1165 | } | ||
| 1166 | |||
| 1162 | #define sched_class_highest (&stop_sched_class) | 1167 | #define sched_class_highest (&stop_sched_class) |
| 1163 | #define for_each_class(class) \ | 1168 | #define for_each_class(class) \ |
| 1164 | for (class = sched_class_highest; class; class = class->next) | 1169 | for (class = sched_class_highest; class; class = class->next) |
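The counterpart on the core side is the class-iteration loop. A simplified sketch of how for_each_class(), the put_prev_task() delegation and RETRY_TASK fit together is shown below; the real loop lives in kernel/sched/core.c as part of this same series, so treat this as a paraphrase rather than a quote:

	static struct task_struct *
	pick_next_task_sketch(struct rq *rq, struct task_struct *prev)
	{
		const struct sched_class *class;
		struct task_struct *p;

	again:
		for_each_class(class) {
			p = class->pick_next_task(rq, prev);
			if (p) {
				if (unlikely(p == RETRY_TASK))
					goto again;	/* a higher class became runnable */
				return p;
			}
		}

		BUG();	/* the idle class always returns a task */
	}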
| @@ -1175,16 +1180,14 @@ extern const struct sched_class idle_sched_class; | |||
| 1175 | extern void update_group_power(struct sched_domain *sd, int cpu); | 1180 | extern void update_group_power(struct sched_domain *sd, int cpu); |
| 1176 | 1181 | ||
| 1177 | extern void trigger_load_balance(struct rq *rq); | 1182 | extern void trigger_load_balance(struct rq *rq); |
| 1178 | extern void idle_balance(int this_cpu, struct rq *this_rq); | ||
| 1179 | 1183 | ||
| 1180 | extern void idle_enter_fair(struct rq *this_rq); | 1184 | extern void idle_enter_fair(struct rq *this_rq); |
| 1181 | extern void idle_exit_fair(struct rq *this_rq); | 1185 | extern void idle_exit_fair(struct rq *this_rq); |
| 1182 | 1186 | ||
| 1183 | #else /* CONFIG_SMP */ | 1187 | #else |
| 1184 | 1188 | ||
| 1185 | static inline void idle_balance(int cpu, struct rq *rq) | 1189 | static inline void idle_enter_fair(struct rq *rq) { } |
| 1186 | { | 1190 | static inline void idle_exit_fair(struct rq *rq) { } |
| 1187 | } | ||
| 1188 | 1191 | ||
| 1189 | #endif | 1192 | #endif |
| 1190 | 1193 | ||
| @@ -1213,16 +1216,6 @@ extern void update_idle_cpu_load(struct rq *this_rq); | |||
| 1213 | 1216 | ||
| 1214 | extern void init_task_runnable_average(struct task_struct *p); | 1217 | extern void init_task_runnable_average(struct task_struct *p); |
| 1215 | 1218 | ||
| 1216 | #ifdef CONFIG_PARAVIRT | ||
| 1217 | static inline u64 steal_ticks(u64 steal) | ||
| 1218 | { | ||
| 1219 | if (unlikely(steal > NSEC_PER_SEC)) | ||
| 1220 | return div_u64(steal, TICK_NSEC); | ||
| 1221 | |||
| 1222 | return __iter_div_u64_rem(steal, TICK_NSEC, &steal); | ||
| 1223 | } | ||
| 1224 | #endif | ||
| 1225 | |||
| 1226 | static inline void inc_nr_running(struct rq *rq) | 1219 | static inline void inc_nr_running(struct rq *rq) |
| 1227 | { | 1220 | { |
| 1228 | rq->nr_running++; | 1221 | rq->nr_running++; |
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index da98af347e8b..a476bea17fbc 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c | |||
| @@ -142,4 +142,4 @@ static int __init proc_schedstat_init(void) | |||
| 142 | proc_create("schedstat", 0, NULL, &proc_schedstat_operations); | 142 | proc_create("schedstat", 0, NULL, &proc_schedstat_operations); |
| 143 | return 0; | 143 | return 0; |
| 144 | } | 144 | } |
| 145 | module_init(proc_schedstat_init); | 145 | subsys_initcall(proc_schedstat_init); |
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index fdb6bb0b3356..d6ce65dde541 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
| @@ -23,16 +23,19 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) | |||
| 23 | /* we're never preempted */ | 23 | /* we're never preempted */ |
| 24 | } | 24 | } |
| 25 | 25 | ||
| 26 | static struct task_struct *pick_next_task_stop(struct rq *rq) | 26 | static struct task_struct * |
| 27 | pick_next_task_stop(struct rq *rq, struct task_struct *prev) | ||
| 27 | { | 28 | { |
| 28 | struct task_struct *stop = rq->stop; | 29 | struct task_struct *stop = rq->stop; |
| 29 | 30 | ||
| 30 | if (stop && stop->on_rq) { | 31 | if (!stop || !stop->on_rq) |
| 31 | stop->se.exec_start = rq_clock_task(rq); | 32 | return NULL; |
| 32 | return stop; | ||
| 33 | } | ||
| 34 | 33 | ||
| 35 | return NULL; | 34 | put_prev_task(rq, prev); |
| 35 | |||
| 36 | stop->se.exec_start = rq_clock_task(rq); | ||
| 37 | |||
| 38 | return stop; | ||
| 36 | } | 39 | } |
| 37 | 40 | ||
| 38 | static void | 41 | static void |
