Diffstat (limited to 'kernel/sched.c')
| -rw-r--r-- | kernel/sched.c | 530 |
1 file changed, 436 insertions, 94 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 26efa475bdc1..1b59e265273b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
| @@ -39,6 +39,7 @@ | |||
| 39 | #include <linux/completion.h> | 39 | #include <linux/completion.h> |
| 40 | #include <linux/kernel_stat.h> | 40 | #include <linux/kernel_stat.h> |
| 41 | #include <linux/debug_locks.h> | 41 | #include <linux/debug_locks.h> |
| 42 | #include <linux/perf_counter.h> | ||
| 42 | #include <linux/security.h> | 43 | #include <linux/security.h> |
| 43 | #include <linux/notifier.h> | 44 | #include <linux/notifier.h> |
| 44 | #include <linux/profile.h> | 45 | #include <linux/profile.h> |
| @@ -68,17 +69,18 @@ | |||
| 68 | #include <linux/pagemap.h> | 69 | #include <linux/pagemap.h> |
| 69 | #include <linux/hrtimer.h> | 70 | #include <linux/hrtimer.h> |
| 70 | #include <linux/tick.h> | 71 | #include <linux/tick.h> |
| 71 | #include <linux/bootmem.h> | ||
| 72 | #include <linux/debugfs.h> | 72 | #include <linux/debugfs.h> |
| 73 | #include <linux/ctype.h> | 73 | #include <linux/ctype.h> |
| 74 | #include <linux/ftrace.h> | 74 | #include <linux/ftrace.h> |
| 75 | #include <trace/sched.h> | ||
| 76 | 75 | ||
| 77 | #include <asm/tlb.h> | 76 | #include <asm/tlb.h> |
| 78 | #include <asm/irq_regs.h> | 77 | #include <asm/irq_regs.h> |
| 79 | 78 | ||
| 80 | #include "sched_cpupri.h" | 79 | #include "sched_cpupri.h" |
| 81 | 80 | ||
| 81 | #define CREATE_TRACE_POINTS | ||
| 82 | #include <trace/events/sched.h> | ||
| 83 | |||
| 82 | /* | 84 | /* |
| 83 | * Convert user-nice values [ -20 ... 0 ... 19 ] | 85 | * Convert user-nice values [ -20 ... 0 ... 19 ] |
| 84 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], | 86 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], |
| @@ -118,12 +120,6 @@ | |||
| 118 | */ | 120 | */ |
| 119 | #define RUNTIME_INF ((u64)~0ULL) | 121 | #define RUNTIME_INF ((u64)~0ULL) |
| 120 | 122 | ||
| 121 | DEFINE_TRACE(sched_wait_task); | ||
| 122 | DEFINE_TRACE(sched_wakeup); | ||
| 123 | DEFINE_TRACE(sched_wakeup_new); | ||
| 124 | DEFINE_TRACE(sched_switch); | ||
| 125 | DEFINE_TRACE(sched_migrate_task); | ||
| 126 | |||
| 127 | #ifdef CONFIG_SMP | 123 | #ifdef CONFIG_SMP |
| 128 | 124 | ||
| 129 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); | 125 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); |
| @@ -244,7 +240,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
| 244 | hard = hrtimer_get_expires(&rt_b->rt_period_timer); | 240 | hard = hrtimer_get_expires(&rt_b->rt_period_timer); |
| 245 | delta = ktime_to_ns(ktime_sub(hard, soft)); | 241 | delta = ktime_to_ns(ktime_sub(hard, soft)); |
| 246 | __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, | 242 | __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, |
| 247 | HRTIMER_MODE_ABS, 0); | 243 | HRTIMER_MODE_ABS_PINNED, 0); |
| 248 | } | 244 | } |
| 249 | spin_unlock(&rt_b->rt_runtime_lock); | 245 | spin_unlock(&rt_b->rt_runtime_lock); |
| 250 | } | 246 | } |
| @@ -497,6 +493,7 @@ struct rt_rq { | |||
| 497 | #endif | 493 | #endif |
| 498 | #ifdef CONFIG_SMP | 494 | #ifdef CONFIG_SMP |
| 499 | unsigned long rt_nr_migratory; | 495 | unsigned long rt_nr_migratory; |
| 496 | unsigned long rt_nr_total; | ||
| 500 | int overloaded; | 497 | int overloaded; |
| 501 | struct plist_head pushable_tasks; | 498 | struct plist_head pushable_tasks; |
| 502 | #endif | 499 | #endif |
| @@ -584,6 +581,7 @@ struct rq { | |||
| 584 | struct load_weight load; | 581 | struct load_weight load; |
| 585 | unsigned long nr_load_updates; | 582 | unsigned long nr_load_updates; |
| 586 | u64 nr_switches; | 583 | u64 nr_switches; |
| 584 | u64 nr_migrations_in; | ||
| 587 | 585 | ||
| 588 | struct cfs_rq cfs; | 586 | struct cfs_rq cfs; |
| 589 | struct rt_rq rt; | 587 | struct rt_rq rt; |
| @@ -630,6 +628,10 @@ struct rq { | |||
| 630 | struct list_head migration_queue; | 628 | struct list_head migration_queue; |
| 631 | #endif | 629 | #endif |
| 632 | 630 | ||
| 631 | /* calc_load related fields */ | ||
| 632 | unsigned long calc_load_update; | ||
| 633 | long calc_load_active; | ||
| 634 | |||
| 633 | #ifdef CONFIG_SCHED_HRTICK | 635 | #ifdef CONFIG_SCHED_HRTICK |
| 634 | #ifdef CONFIG_SMP | 636 | #ifdef CONFIG_SMP |
| 635 | int hrtick_csd_pending; | 637 | int hrtick_csd_pending; |
| @@ -692,7 +694,7 @@ static inline int cpu_of(struct rq *rq) | |||
| 692 | #define task_rq(p) cpu_rq(task_cpu(p)) | 694 | #define task_rq(p) cpu_rq(task_cpu(p)) |
| 693 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 695 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
| 694 | 696 | ||
| 695 | static inline void update_rq_clock(struct rq *rq) | 697 | inline void update_rq_clock(struct rq *rq) |
| 696 | { | 698 | { |
| 697 | rq->clock = sched_clock_cpu(cpu_of(rq)); | 699 | rq->clock = sched_clock_cpu(cpu_of(rq)); |
| 698 | } | 700 | } |
| @@ -1154,7 +1156,7 @@ static __init void init_hrtick(void) | |||
| 1154 | static void hrtick_start(struct rq *rq, u64 delay) | 1156 | static void hrtick_start(struct rq *rq, u64 delay) |
| 1155 | { | 1157 | { |
| 1156 | __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, | 1158 | __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, |
| 1157 | HRTIMER_MODE_REL, 0); | 1159 | HRTIMER_MODE_REL_PINNED, 0); |
| 1158 | } | 1160 | } |
| 1159 | 1161 | ||
| 1160 | static inline void init_hrtick(void) | 1162 | static inline void init_hrtick(void) |
| @@ -1728,6 +1730,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | |||
| 1728 | } | 1730 | } |
| 1729 | #endif | 1731 | #endif |
| 1730 | 1732 | ||
| 1733 | static void calc_load_account_active(struct rq *this_rq); | ||
| 1734 | |||
| 1731 | #include "sched_stats.h" | 1735 | #include "sched_stats.h" |
| 1732 | #include "sched_idletask.c" | 1736 | #include "sched_idletask.c" |
| 1733 | #include "sched_fair.c" | 1737 | #include "sched_fair.c" |
| @@ -1958,7 +1962,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
| 1958 | 1962 | ||
| 1959 | clock_offset = old_rq->clock - new_rq->clock; | 1963 | clock_offset = old_rq->clock - new_rq->clock; |
| 1960 | 1964 | ||
| 1961 | trace_sched_migrate_task(p, task_cpu(p), new_cpu); | 1965 | trace_sched_migrate_task(p, new_cpu); |
| 1962 | 1966 | ||
| 1963 | #ifdef CONFIG_SCHEDSTATS | 1967 | #ifdef CONFIG_SCHEDSTATS |
| 1964 | if (p->se.wait_start) | 1968 | if (p->se.wait_start) |
| @@ -1967,12 +1971,17 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
| 1967 | p->se.sleep_start -= clock_offset; | 1971 | p->se.sleep_start -= clock_offset; |
| 1968 | if (p->se.block_start) | 1972 | if (p->se.block_start) |
| 1969 | p->se.block_start -= clock_offset; | 1973 | p->se.block_start -= clock_offset; |
| 1974 | #endif | ||
| 1970 | if (old_cpu != new_cpu) { | 1975 | if (old_cpu != new_cpu) { |
| 1971 | schedstat_inc(p, se.nr_migrations); | 1976 | p->se.nr_migrations++; |
| 1977 | new_rq->nr_migrations_in++; | ||
| 1978 | #ifdef CONFIG_SCHEDSTATS | ||
| 1972 | if (task_hot(p, old_rq->clock, NULL)) | 1979 | if (task_hot(p, old_rq->clock, NULL)) |
| 1973 | schedstat_inc(p, se.nr_forced2_migrations); | 1980 | schedstat_inc(p, se.nr_forced2_migrations); |
| 1974 | } | ||
| 1975 | #endif | 1981 | #endif |
| 1982 | perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, | ||
| 1983 | 1, 1, NULL, 0); | ||
| 1984 | } | ||
| 1976 | p->se.vruntime -= old_cfsrq->min_vruntime - | 1985 | p->se.vruntime -= old_cfsrq->min_vruntime - |
| 1977 | new_cfsrq->min_vruntime; | 1986 | new_cfsrq->min_vruntime; |
| 1978 | 1987 | ||
| @@ -2015,6 +2024,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) | |||
| 2015 | } | 2024 | } |
| 2016 | 2025 | ||
| 2017 | /* | 2026 | /* |
| 2027 | * wait_task_context_switch - wait for a thread to complete at least one | ||
| 2028 | * context switch. | ||
| 2029 | * | ||
| 2030 | * @p must not be current. | ||
| 2031 | */ | ||
| 2032 | void wait_task_context_switch(struct task_struct *p) | ||
| 2033 | { | ||
| 2034 | unsigned long nvcsw, nivcsw, flags; | ||
| 2035 | int running; | ||
| 2036 | struct rq *rq; | ||
| 2037 | |||
| 2038 | nvcsw = p->nvcsw; | ||
| 2039 | nivcsw = p->nivcsw; | ||
| 2040 | for (;;) { | ||
| 2041 | /* | ||
| 2042 | * The runqueue is assigned before the actual context | ||
| 2043 | * switch. We need to take the runqueue lock. | ||
| 2044 | * | ||
| 2045 | * We could check initially without the lock but it is | ||
| 2046 | * very likely that we need to take the lock in every | ||
| 2047 | * iteration. | ||
| 2048 | */ | ||
| 2049 | rq = task_rq_lock(p, &flags); | ||
| 2050 | running = task_running(rq, p); | ||
| 2051 | task_rq_unlock(rq, &flags); | ||
| 2052 | |||
| 2053 | if (likely(!running)) | ||
| 2054 | break; | ||
| 2055 | /* | ||
| 2056 | * The switch count is incremented before the actual | ||
| 2057 | * context switch. We thus wait for two switches to be | ||
| 2058 | * sure at least one completed. | ||
| 2059 | */ | ||
| 2060 | if ((p->nvcsw - nvcsw) > 1) | ||
| 2061 | break; | ||
| 2062 | if ((p->nivcsw - nivcsw) > 1) | ||
| 2063 | break; | ||
| 2064 | |||
| 2065 | cpu_relax(); | ||
| 2066 | } | ||
| 2067 | } | ||
| 2068 | |||
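A minimal usage sketch for the new helper (the caller shown here is illustrative, not part of this patch). The documented contract is only that @p must not be current; the helper returns once @p has been observed off-CPU or its switch counters have advanced enough to guarantee a completed switch.

static void example_wait_for_switch(struct task_struct *p)
{
        /* precondition from the comment above: p != current */
        WARN_ON(p == current);

        /*
         * Returns once p is seen off-CPU, or once p->nvcsw/p->nivcsw
         * have advanced far enough that a full switch must have
         * completed in between.
         */
        wait_task_context_switch(p);
}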
| 2069 | /* | ||
| 2018 | * wait_task_inactive - wait for a thread to unschedule. | 2070 | * wait_task_inactive - wait for a thread to unschedule. |
| 2019 | * | 2071 | * |
| 2020 | * If @match_state is nonzero, it's the @p->state value just checked and | 2072 | * If @match_state is nonzero, it's the @p->state value just checked and |
| @@ -2142,6 +2194,7 @@ void kick_process(struct task_struct *p) | |||
| 2142 | smp_send_reschedule(cpu); | 2194 | smp_send_reschedule(cpu); |
| 2143 | preempt_enable(); | 2195 | preempt_enable(); |
| 2144 | } | 2196 | } |
| 2197 | EXPORT_SYMBOL_GPL(kick_process); | ||
| 2145 | 2198 | ||
| 2146 | /* | 2199 | /* |
| 2147 | * Return a low guess at the load of a migration-source cpu weighted | 2200 | * Return a low guess at the load of a migration-source cpu weighted |
| @@ -2324,6 +2377,27 @@ static int sched_balance_self(int cpu, int flag) | |||
| 2324 | 2377 | ||
| 2325 | #endif /* CONFIG_SMP */ | 2378 | #endif /* CONFIG_SMP */ |
| 2326 | 2379 | ||
| 2380 | /** | ||
| 2381 | * task_oncpu_function_call - call a function on the cpu on which a task runs | ||
| 2382 | * @p: the task to evaluate | ||
| 2383 | * @func: the function to be called | ||
| 2384 | * @info: the function call argument | ||
| 2385 | * | ||
| 2386 | * Calls the function @func when the task is currently running. This might | ||
| 2387 | * be on the current CPU, which just calls the function directly | ||
| 2388 | */ | ||
| 2389 | void task_oncpu_function_call(struct task_struct *p, | ||
| 2390 | void (*func) (void *info), void *info) | ||
| 2391 | { | ||
| 2392 | int cpu; | ||
| 2393 | |||
| 2394 | preempt_disable(); | ||
| 2395 | cpu = task_cpu(p); | ||
| 2396 | if (task_curr(p)) | ||
| 2397 | smp_call_function_single(cpu, func, info, 1); | ||
| 2398 | preempt_enable(); | ||
| 2399 | } | ||
| 2400 | |||
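A short usage sketch (the callback and payload names are made up for illustration). Note that @func is only invoked while @p is actually on a CPU, and it may run from a smp_call_function IPI on that CPU, so it must not sleep.

struct remote_sample {                          /* hypothetical payload */
        u64 now;
};

static void sample_on_task_cpu(void *info)      /* hypothetical callback */
{
        struct remote_sample *s = info;

        /* Runs on the CPU where the task is executing, possibly in
         * IPI context: keep it short and atomic. */
        s->now = sched_clock();
}

/* ... caller ... */
        struct remote_sample s = { 0 };

        task_oncpu_function_call(p, sample_on_task_cpu, &s);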
| 2327 | /*** | 2401 | /*** |
| 2328 | * try_to_wake_up - wake up a thread | 2402 | * try_to_wake_up - wake up a thread |
| 2329 | * @p: the to-be-woken-up thread | 2403 | * @p: the to-be-woken-up thread |
| @@ -2458,6 +2532,17 @@ out: | |||
| 2458 | return success; | 2532 | return success; |
| 2459 | } | 2533 | } |
| 2460 | 2534 | ||
| 2535 | /** | ||
| 2536 | * wake_up_process - Wake up a specific process | ||
| 2537 | * @p: The process to be woken up. | ||
| 2538 | * | ||
| 2539 | * Attempt to wake up the nominated process and move it to the set of runnable | ||
| 2540 | * processes. Returns 1 if the process was woken up, 0 if it was already | ||
| 2541 | * running. | ||
| 2542 | * | ||
| 2543 | * It may be assumed that this function implies a write memory barrier before | ||
| 2544 | * changing the task state if and only if any tasks are woken up. | ||
| 2545 | */ | ||
| 2461 | int wake_up_process(struct task_struct *p) | 2546 | int wake_up_process(struct task_struct *p) |
| 2462 | { | 2547 | { |
| 2463 | return try_to_wake_up(p, TASK_ALL, 0); | 2548 | return try_to_wake_up(p, TASK_ALL, 0); |
| @@ -2480,21 +2565,44 @@ static void __sched_fork(struct task_struct *p) | |||
| 2480 | p->se.exec_start = 0; | 2565 | p->se.exec_start = 0; |
| 2481 | p->se.sum_exec_runtime = 0; | 2566 | p->se.sum_exec_runtime = 0; |
| 2482 | p->se.prev_sum_exec_runtime = 0; | 2567 | p->se.prev_sum_exec_runtime = 0; |
| 2568 | p->se.nr_migrations = 0; | ||
| 2483 | p->se.last_wakeup = 0; | 2569 | p->se.last_wakeup = 0; |
| 2484 | p->se.avg_overlap = 0; | 2570 | p->se.avg_overlap = 0; |
| 2485 | p->se.start_runtime = 0; | 2571 | p->se.start_runtime = 0; |
| 2486 | p->se.avg_wakeup = sysctl_sched_wakeup_granularity; | 2572 | p->se.avg_wakeup = sysctl_sched_wakeup_granularity; |
| 2487 | 2573 | ||
| 2488 | #ifdef CONFIG_SCHEDSTATS | 2574 | #ifdef CONFIG_SCHEDSTATS |
| 2489 | p->se.wait_start = 0; | 2575 | p->se.wait_start = 0; |
| 2490 | p->se.sum_sleep_runtime = 0; | 2576 | p->se.wait_max = 0; |
| 2491 | p->se.sleep_start = 0; | 2577 | p->se.wait_count = 0; |
| 2492 | p->se.block_start = 0; | 2578 | p->se.wait_sum = 0; |
| 2493 | p->se.sleep_max = 0; | 2579 | |
| 2494 | p->se.block_max = 0; | 2580 | p->se.sleep_start = 0; |
| 2495 | p->se.exec_max = 0; | 2581 | p->se.sleep_max = 0; |
| 2496 | p->se.slice_max = 0; | 2582 | p->se.sum_sleep_runtime = 0; |
| 2497 | p->se.wait_max = 0; | 2583 | |
| 2584 | p->se.block_start = 0; | ||
| 2585 | p->se.block_max = 0; | ||
| 2586 | p->se.exec_max = 0; | ||
| 2587 | p->se.slice_max = 0; | ||
| 2588 | |||
| 2589 | p->se.nr_migrations_cold = 0; | ||
| 2590 | p->se.nr_failed_migrations_affine = 0; | ||
| 2591 | p->se.nr_failed_migrations_running = 0; | ||
| 2592 | p->se.nr_failed_migrations_hot = 0; | ||
| 2593 | p->se.nr_forced_migrations = 0; | ||
| 2594 | p->se.nr_forced2_migrations = 0; | ||
| 2595 | |||
| 2596 | p->se.nr_wakeups = 0; | ||
| 2597 | p->se.nr_wakeups_sync = 0; | ||
| 2598 | p->se.nr_wakeups_migrate = 0; | ||
| 2599 | p->se.nr_wakeups_local = 0; | ||
| 2600 | p->se.nr_wakeups_remote = 0; | ||
| 2601 | p->se.nr_wakeups_affine = 0; | ||
| 2602 | p->se.nr_wakeups_affine_attempts = 0; | ||
| 2603 | p->se.nr_wakeups_passive = 0; | ||
| 2604 | p->se.nr_wakeups_idle = 0; | ||
| 2605 | |||
| 2498 | #endif | 2606 | #endif |
| 2499 | 2607 | ||
| 2500 | INIT_LIST_HEAD(&p->rt.run_list); | 2608 | INIT_LIST_HEAD(&p->rt.run_list); |
| @@ -2710,6 +2818,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
| 2710 | */ | 2818 | */ |
| 2711 | prev_state = prev->state; | 2819 | prev_state = prev->state; |
| 2712 | finish_arch_switch(prev); | 2820 | finish_arch_switch(prev); |
| 2821 | perf_counter_task_sched_in(current, cpu_of(rq)); | ||
| 2713 | finish_lock_switch(rq, prev); | 2822 | finish_lock_switch(rq, prev); |
| 2714 | #ifdef CONFIG_SMP | 2823 | #ifdef CONFIG_SMP |
| 2715 | if (post_schedule) | 2824 | if (post_schedule) |
| @@ -2766,7 +2875,7 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
| 2766 | * combine the page table reload and the switch backend into | 2875 | * combine the page table reload and the switch backend into |
| 2767 | * one hypercall. | 2876 | * one hypercall. |
| 2768 | */ | 2877 | */ |
| 2769 | arch_enter_lazy_cpu_mode(); | 2878 | arch_start_context_switch(prev); |
| 2770 | 2879 | ||
| 2771 | if (unlikely(!mm)) { | 2880 | if (unlikely(!mm)) { |
| 2772 | next->active_mm = oldmm; | 2881 | next->active_mm = oldmm; |
| @@ -2856,19 +2965,81 @@ unsigned long nr_iowait(void) | |||
| 2856 | return sum; | 2965 | return sum; |
| 2857 | } | 2966 | } |
| 2858 | 2967 | ||
| 2859 | unsigned long nr_active(void) | 2968 | /* Variables and functions for calc_load */ |
| 2969 | static atomic_long_t calc_load_tasks; | ||
| 2970 | static unsigned long calc_load_update; | ||
| 2971 | unsigned long avenrun[3]; | ||
| 2972 | EXPORT_SYMBOL(avenrun); | ||
| 2973 | |||
| 2974 | /** | ||
| 2975 | * get_avenrun - get the load average array | ||
| 2976 | * @loads: pointer to dest load array | ||
| 2977 | * @offset: offset to add | ||
| 2978 | * @shift: shift count to shift the result left | ||
| 2979 | * | ||
| 2980 | * These values are estimates at best, so no need for locking. | ||
| 2981 | */ | ||
| 2982 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | ||
| 2860 | { | 2983 | { |
| 2861 | unsigned long i, running = 0, uninterruptible = 0; | 2984 | loads[0] = (avenrun[0] + offset) << shift; |
| 2985 | loads[1] = (avenrun[1] + offset) << shift; | ||
| 2986 | loads[2] = (avenrun[2] + offset) << shift; | ||
| 2987 | } | ||
| 2862 | 2988 | ||
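avenrun[] is kept in FSHIFT fixed point; a consumer such as fs/proc/loadavg.c turns it into the familiar two-decimal form roughly as sketched below, passing the same FIXED_1/200 offset for rounding.

#define LOAD_INT(x)  ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

        unsigned long loads[3];

        get_avenrun(loads, FIXED_1/200, 0);     /* ~ +0.005 for rounding */
        printk(KERN_INFO "load: %lu.%02lu %lu.%02lu %lu.%02lu\n",
               LOAD_INT(loads[0]), LOAD_FRAC(loads[0]),
               LOAD_INT(loads[1]), LOAD_FRAC(loads[1]),
               LOAD_INT(loads[2]), LOAD_FRAC(loads[2]));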
| 2863 | for_each_online_cpu(i) { | 2989 | static unsigned long |
| 2864 | running += cpu_rq(i)->nr_running; | 2990 | calc_load(unsigned long load, unsigned long exp, unsigned long active) |
| 2865 | uninterruptible += cpu_rq(i)->nr_uninterruptible; | 2991 | { |
| 2866 | } | 2992 | load *= exp; |
| 2993 | load += active * (FIXED_1 - exp); | ||
| 2994 | return load >> FSHIFT; | ||
| 2995 | } | ||
| 2996 | |||
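The helper above is the usual fixed-point exponential decay; a worked example with the constants from <linux/sched.h> (FSHIFT = 11, FIXED_1 = 2048, EXP_1 = 1884, LOAD_FREQ = 5*HZ+1):

/*
 *   old 1-minute average of 1.00   ->  load   = 2048
 *   3 runnable/uninterruptible     ->  active = 3 * FIXED_1 = 6144
 *
 *   calc_load(2048, 1884, 6144)
 *       = (2048 * 1884 + 6144 * (2048 - 1884)) >> 11
 *       = (3858432 + 1007616) >> 11
 *       = 2376                        i.e. roughly 1.16
 *
 * so each 5-second interval moves the 1-minute figure about 8% of the
 * way towards the current task count.
 */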
| 2997 | /* | ||
| 2998 | * calc_load - update the avenrun load estimates 10 ticks after the | ||
| 2999 | * CPUs have updated calc_load_tasks. | ||
| 3000 | */ | ||
| 3001 | void calc_global_load(void) | ||
| 3002 | { | ||
| 3003 | unsigned long upd = calc_load_update + 10; | ||
| 3004 | long active; | ||
| 3005 | |||
| 3006 | if (time_before(jiffies, upd)) | ||
| 3007 | return; | ||
| 3008 | |||
| 3009 | active = atomic_long_read(&calc_load_tasks); | ||
| 3010 | active = active > 0 ? active * FIXED_1 : 0; | ||
| 2867 | 3011 | ||
| 2868 | if (unlikely((long)uninterruptible < 0)) | 3012 | avenrun[0] = calc_load(avenrun[0], EXP_1, active); |
| 2869 | uninterruptible = 0; | 3013 | avenrun[1] = calc_load(avenrun[1], EXP_5, active); |
| 3014 | avenrun[2] = calc_load(avenrun[2], EXP_15, active); | ||
| 2870 | 3015 | ||
| 2871 | return running + uninterruptible; | 3016 | calc_load_update += LOAD_FREQ; |
| 3017 | } | ||
| 3018 | |||
| 3019 | /* | ||
| 3020 | * Either called from update_cpu_load() or from a cpu going idle | ||
| 3021 | */ | ||
| 3022 | static void calc_load_account_active(struct rq *this_rq) | ||
| 3023 | { | ||
| 3024 | long nr_active, delta; | ||
| 3025 | |||
| 3026 | nr_active = this_rq->nr_running; | ||
| 3027 | nr_active += (long) this_rq->nr_uninterruptible; | ||
| 3028 | |||
| 3029 | if (nr_active != this_rq->calc_load_active) { | ||
| 3030 | delta = nr_active - this_rq->calc_load_active; | ||
| 3031 | this_rq->calc_load_active = nr_active; | ||
| 3032 | atomic_long_add(delta, &calc_load_tasks); | ||
| 3033 | } | ||
| 3034 | } | ||
| 3035 | |||
| 3036 | /* | ||
| 3037 | * Externally visible per-cpu scheduler statistics: | ||
| 3038 | * cpu_nr_migrations(cpu) - number of migrations into that cpu | ||
| 3039 | */ | ||
| 3040 | u64 cpu_nr_migrations(int cpu) | ||
| 3041 | { | ||
| 3042 | return cpu_rq(cpu)->nr_migrations_in; | ||
| 2872 | } | 3043 | } |
| 2873 | 3044 | ||
| 2874 | /* | 3045 | /* |
| @@ -2899,6 +3070,11 @@ static void update_cpu_load(struct rq *this_rq) | |||
| 2899 | new_load += scale-1; | 3070 | new_load += scale-1; |
| 2900 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; | 3071 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; |
| 2901 | } | 3072 | } |
| 3073 | |||
| 3074 | if (time_after_eq(jiffies, this_rq->calc_load_update)) { | ||
| 3075 | this_rq->calc_load_update += LOAD_FREQ; | ||
| 3076 | calc_load_account_active(this_rq); | ||
| 3077 | } | ||
| 2902 | } | 3078 | } |
| 2903 | 3079 | ||
| 2904 | #ifdef CONFIG_SMP | 3080 | #ifdef CONFIG_SMP |
| @@ -4240,10 +4416,131 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | |||
| 4240 | static struct { | 4416 | static struct { |
| 4241 | atomic_t load_balancer; | 4417 | atomic_t load_balancer; |
| 4242 | cpumask_var_t cpu_mask; | 4418 | cpumask_var_t cpu_mask; |
| 4419 | cpumask_var_t ilb_grp_nohz_mask; | ||
| 4243 | } nohz ____cacheline_aligned = { | 4420 | } nohz ____cacheline_aligned = { |
| 4244 | .load_balancer = ATOMIC_INIT(-1), | 4421 | .load_balancer = ATOMIC_INIT(-1), |
| 4245 | }; | 4422 | }; |
| 4246 | 4423 | ||
| 4424 | int get_nohz_load_balancer(void) | ||
| 4425 | { | ||
| 4426 | return atomic_read(&nohz.load_balancer); | ||
| 4427 | } | ||
| 4428 | |||
| 4429 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
| 4430 | /** | ||
| 4431 | * lowest_flag_domain - Return lowest sched_domain containing flag. | ||
| 4432 | * @cpu: The cpu whose lowest level of sched domain is to | ||
| 4433 | * be returned. | ||
| 4434 | * @flag: The flag to check for the lowest sched_domain | ||
| 4435 | * for the given cpu. | ||
| 4436 | * | ||
| 4437 | * Returns the lowest sched_domain of a cpu which contains the given flag. | ||
| 4438 | */ | ||
| 4439 | static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | ||
| 4440 | { | ||
| 4441 | struct sched_domain *sd; | ||
| 4442 | |||
| 4443 | for_each_domain(cpu, sd) | ||
| 4444 | if (sd && (sd->flags & flag)) | ||
| 4445 | break; | ||
| 4446 | |||
| 4447 | return sd; | ||
| 4448 | } | ||
| 4449 | |||
| 4450 | /** | ||
| 4451 | * for_each_flag_domain - Iterates over sched_domains containing the flag. | ||
| 4452 | * @cpu: The cpu whose domains we're iterating over. | ||
| 4453 | * @sd: variable holding the value of the power_savings_sd | ||
| 4454 | * for cpu. | ||
| 4455 | * @flag: The flag to filter the sched_domains to be iterated. | ||
| 4456 | * | ||
| 4457 | * Iterates over all the scheduler domains for a given cpu that has the 'flag' | ||
| 4458 | * set, starting from the lowest sched_domain to the highest. | ||
| 4459 | */ | ||
| 4460 | #define for_each_flag_domain(cpu, sd, flag) \ | ||
| 4461 | for (sd = lowest_flag_domain(cpu, flag); \ | ||
| 4462 | (sd && (sd->flags & flag)); sd = sd->parent) | ||
| 4463 | |||
| 4464 | /** | ||
| 4465 | * is_semi_idle_group - Checks if the given sched_group is semi-idle. | ||
| 4466 | * @ilb_group: group to be checked for semi-idleness | ||
| 4467 | * | ||
| 4468 | * Returns: 1 if the group is semi-idle. 0 otherwise. | ||
| 4469 | * | ||
| 4470 | * We define a sched_group to be semi idle if it has atleast one idle-CPU | ||
| 4471 | * and atleast one non-idle CPU. This helper function checks if the given | ||
| 4472 | * sched_group is semi-idle or not. | ||
| 4473 | */ | ||
| 4474 | static inline int is_semi_idle_group(struct sched_group *ilb_group) | ||
| 4475 | { | ||
| 4476 | cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, | ||
| 4477 | sched_group_cpus(ilb_group)); | ||
| 4478 | |||
| 4479 | /* | ||
| 4480 | * A sched_group is semi-idle when it has atleast one busy cpu | ||
| 4481 | * and atleast one idle cpu. | ||
| 4482 | */ | ||
| 4483 | if (cpumask_empty(nohz.ilb_grp_nohz_mask)) | ||
| 4484 | return 0; | ||
| 4485 | |||
| 4486 | if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) | ||
| 4487 | return 0; | ||
| 4488 | |||
| 4489 | return 1; | ||
| 4490 | } | ||
| 4491 | /** | ||
| 4492 | * find_new_ilb - Finds the optimum idle load balancer for nomination. | ||
| 4493 | * @cpu: The cpu which is nominating a new idle_load_balancer. | ||
| 4494 | * | ||
| 4495 | * Returns: Returns the id of the idle load balancer if it exists, | ||
| 4496 | * Else, returns >= nr_cpu_ids. | ||
| 4497 | * | ||
| 4498 | * This algorithm picks the idle load balancer such that it belongs to a | ||
| 4499 | * semi-idle powersavings sched_domain. The idea is to try and avoid | ||
| 4500 | * completely idle packages/cores just for the purpose of idle load balancing | ||
| 4501 | * when there are other idle cpu's which are better suited for that job. | ||
| 4502 | */ | ||
| 4503 | static int find_new_ilb(int cpu) | ||
| 4504 | { | ||
| 4505 | struct sched_domain *sd; | ||
| 4506 | struct sched_group *ilb_group; | ||
| 4507 | |||
| 4508 | /* | ||
| 4509 | * Have idle load balancer selection from semi-idle packages only | ||
| 4510 | * when power-aware load balancing is enabled | ||
| 4511 | */ | ||
| 4512 | if (!(sched_smt_power_savings || sched_mc_power_savings)) | ||
| 4513 | goto out_done; | ||
| 4514 | |||
| 4515 | /* | ||
| 4516 | * Optimize for the case when we have no idle CPUs or only one | ||
| 4517 | * idle CPU. Don't walk the sched_domain hierarchy in such cases | ||
| 4518 | */ | ||
| 4519 | if (cpumask_weight(nohz.cpu_mask) < 2) | ||
| 4520 | goto out_done; | ||
| 4521 | |||
| 4522 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | ||
| 4523 | ilb_group = sd->groups; | ||
| 4524 | |||
| 4525 | do { | ||
| 4526 | if (is_semi_idle_group(ilb_group)) | ||
| 4527 | return cpumask_first(nohz.ilb_grp_nohz_mask); | ||
| 4528 | |||
| 4529 | ilb_group = ilb_group->next; | ||
| 4530 | |||
| 4531 | } while (ilb_group != sd->groups); | ||
| 4532 | } | ||
| 4533 | |||
| 4534 | out_done: | ||
| 4535 | return cpumask_first(nohz.cpu_mask); | ||
| 4536 | } | ||
| 4537 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | ||
| 4538 | static inline int find_new_ilb(int call_cpu) | ||
| 4539 | { | ||
| 4540 | return cpumask_first(nohz.cpu_mask); | ||
| 4541 | } | ||
| 4542 | #endif | ||
| 4543 | |||
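A concrete illustration of the selection above, assuming two dual-CPU packages with SD_POWERSAVINGS_BALANCE set at the package level (the topology is made up for illustration):

/*
 *   package A: cpu0 busy, cpu1 tickless-idle    -> semi-idle
 *   package B: cpu2 and cpu3 tickless-idle      -> fully idle
 *
 * is_semi_idle_group() rejects B (its intersection with nohz.cpu_mask
 * equals the whole group) and accepts A, so find_new_ilb() returns
 * cpu1.  The idle-load-balancing work stays on the package that is
 * already awake, and package B can remain in a deep idle state.
 */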
| 4247 | /* | 4544 | /* |
| 4248 | * This routine will try to nominate the ilb (idle load balancing) | 4545 | * This routine will try to nominate the ilb (idle load balancing) |
| 4249 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle | 4546 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle |
| @@ -4298,8 +4595,24 @@ int select_nohz_load_balancer(int stop_tick) | |||
| 4298 | /* make me the ilb owner */ | 4595 | /* make me the ilb owner */ |
| 4299 | if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) | 4596 | if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) |
| 4300 | return 1; | 4597 | return 1; |
| 4301 | } else if (atomic_read(&nohz.load_balancer) == cpu) | 4598 | } else if (atomic_read(&nohz.load_balancer) == cpu) { |
| 4599 | int new_ilb; | ||
| 4600 | |||
| 4601 | if (!(sched_smt_power_savings || | ||
| 4602 | sched_mc_power_savings)) | ||
| 4603 | return 1; | ||
| 4604 | /* | ||
| 4605 | * Check to see if there is a more power-efficient | ||
| 4606 | * ilb. | ||
| 4607 | */ | ||
| 4608 | new_ilb = find_new_ilb(cpu); | ||
| 4609 | if (new_ilb < nr_cpu_ids && new_ilb != cpu) { | ||
| 4610 | atomic_set(&nohz.load_balancer, -1); | ||
| 4611 | resched_cpu(new_ilb); | ||
| 4612 | return 0; | ||
| 4613 | } | ||
| 4302 | return 1; | 4614 | return 1; |
| 4615 | } | ||
| 4303 | } else { | 4616 | } else { |
| 4304 | if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) | 4617 | if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) |
| 4305 | return 0; | 4618 | return 0; |
| @@ -4468,15 +4781,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) | |||
| 4468 | } | 4781 | } |
| 4469 | 4782 | ||
| 4470 | if (atomic_read(&nohz.load_balancer) == -1) { | 4783 | if (atomic_read(&nohz.load_balancer) == -1) { |
| 4471 | /* | 4784 | int ilb = find_new_ilb(cpu); |
| 4472 | * simple selection for now: Nominate the | ||
| 4473 | * first cpu in the nohz list to be the next | ||
| 4474 | * ilb owner. | ||
| 4475 | * | ||
| 4476 | * TBD: Traverse the sched domains and nominate | ||
| 4477 | * the nearest cpu in the nohz.cpu_mask. | ||
| 4478 | */ | ||
| 4479 | int ilb = cpumask_first(nohz.cpu_mask); | ||
| 4480 | 4785 | ||
| 4481 | if (ilb < nr_cpu_ids) | 4786 | if (ilb < nr_cpu_ids) |
| 4482 | resched_cpu(ilb); | 4787 | resched_cpu(ilb); |
| @@ -4840,6 +5145,8 @@ void scheduler_tick(void) | |||
| 4840 | curr->sched_class->task_tick(rq, curr, 0); | 5145 | curr->sched_class->task_tick(rq, curr, 0); |
| 4841 | spin_unlock(&rq->lock); | 5146 | spin_unlock(&rq->lock); |
| 4842 | 5147 | ||
| 5148 | perf_counter_task_tick(curr, cpu); | ||
| 5149 | |||
| 4843 | #ifdef CONFIG_SMP | 5150 | #ifdef CONFIG_SMP |
| 4844 | rq->idle_at_tick = idle_cpu(cpu); | 5151 | rq->idle_at_tick = idle_cpu(cpu); |
| 4845 | trigger_load_balance(rq, cpu); | 5152 | trigger_load_balance(rq, cpu); |
| @@ -5007,13 +5314,15 @@ pick_next_task(struct rq *rq) | |||
| 5007 | /* | 5314 | /* |
| 5008 | * schedule() is the main scheduler function. | 5315 | * schedule() is the main scheduler function. |
| 5009 | */ | 5316 | */ |
| 5010 | asmlinkage void __sched __schedule(void) | 5317 | asmlinkage void __sched schedule(void) |
| 5011 | { | 5318 | { |
| 5012 | struct task_struct *prev, *next; | 5319 | struct task_struct *prev, *next; |
| 5013 | unsigned long *switch_count; | 5320 | unsigned long *switch_count; |
| 5014 | struct rq *rq; | 5321 | struct rq *rq; |
| 5015 | int cpu; | 5322 | int cpu; |
| 5016 | 5323 | ||
| 5324 | need_resched: | ||
| 5325 | preempt_disable(); | ||
| 5017 | cpu = smp_processor_id(); | 5326 | cpu = smp_processor_id(); |
| 5018 | rq = cpu_rq(cpu); | 5327 | rq = cpu_rq(cpu); |
| 5019 | rcu_qsctr_inc(cpu); | 5328 | rcu_qsctr_inc(cpu); |
| @@ -5053,6 +5362,7 @@ need_resched_nonpreemptible: | |||
| 5053 | 5362 | ||
| 5054 | if (likely(prev != next)) { | 5363 | if (likely(prev != next)) { |
| 5055 | sched_info_switch(prev, next); | 5364 | sched_info_switch(prev, next); |
| 5365 | perf_counter_task_sched_out(prev, next, cpu); | ||
| 5056 | 5366 | ||
| 5057 | rq->nr_switches++; | 5367 | rq->nr_switches++; |
| 5058 | rq->curr = next; | 5368 | rq->curr = next; |
| @@ -5070,15 +5380,9 @@ need_resched_nonpreemptible: | |||
| 5070 | 5380 | ||
| 5071 | if (unlikely(reacquire_kernel_lock(current) < 0)) | 5381 | if (unlikely(reacquire_kernel_lock(current) < 0)) |
| 5072 | goto need_resched_nonpreemptible; | 5382 | goto need_resched_nonpreemptible; |
| 5073 | } | ||
| 5074 | 5383 | ||
| 5075 | asmlinkage void __sched schedule(void) | ||
| 5076 | { | ||
| 5077 | need_resched: | ||
| 5078 | preempt_disable(); | ||
| 5079 | __schedule(); | ||
| 5080 | preempt_enable_no_resched(); | 5384 | preempt_enable_no_resched(); |
| 5081 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 5385 | if (need_resched()) |
| 5082 | goto need_resched; | 5386 | goto need_resched; |
| 5083 | } | 5387 | } |
| 5084 | EXPORT_SYMBOL(schedule); | 5388 | EXPORT_SYMBOL(schedule); |
| @@ -5221,7 +5525,7 @@ EXPORT_SYMBOL(default_wake_function); | |||
| 5221 | * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns | 5525 | * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns |
| 5222 | * zero in this (rare) case, and we handle it by continuing to scan the queue. | 5526 | * zero in this (rare) case, and we handle it by continuing to scan the queue. |
| 5223 | */ | 5527 | */ |
| 5224 | void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | 5528 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, |
| 5225 | int nr_exclusive, int sync, void *key) | 5529 | int nr_exclusive, int sync, void *key) |
| 5226 | { | 5530 | { |
| 5227 | wait_queue_t *curr, *next; | 5531 | wait_queue_t *curr, *next; |
| @@ -5241,6 +5545,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | |||
| 5241 | * @mode: which threads | 5545 | * @mode: which threads |
| 5242 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | 5546 | * @nr_exclusive: how many wake-one or wake-many threads to wake up |
| 5243 | * @key: is directly passed to the wakeup function | 5547 | * @key: is directly passed to the wakeup function |
| 5548 | * | ||
| 5549 | * It may be assumed that this function implies a write memory barrier before | ||
| 5550 | * changing the task state if and only if any tasks are woken up. | ||
| 5244 | */ | 5551 | */ |
| 5245 | void __wake_up(wait_queue_head_t *q, unsigned int mode, | 5552 | void __wake_up(wait_queue_head_t *q, unsigned int mode, |
| 5246 | int nr_exclusive, void *key) | 5553 | int nr_exclusive, void *key) |
| @@ -5279,6 +5586,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) | |||
| 5279 | * with each other. This can prevent needless bouncing between CPUs. | 5586 | * with each other. This can prevent needless bouncing between CPUs. |
| 5280 | * | 5587 | * |
| 5281 | * On UP it can prevent extra preemption. | 5588 | * On UP it can prevent extra preemption. |
| 5589 | * | ||
| 5590 | * It may be assumed that this function implies a write memory barrier before | ||
| 5591 | * changing the task state if and only if any tasks are woken up. | ||
| 5282 | */ | 5592 | */ |
| 5283 | void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, | 5593 | void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, |
| 5284 | int nr_exclusive, void *key) | 5594 | int nr_exclusive, void *key) |
| @@ -5315,6 +5625,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ | |||
| 5315 | * awakened in the same order in which they were queued. | 5625 | * awakened in the same order in which they were queued. |
| 5316 | * | 5626 | * |
| 5317 | * See also complete_all(), wait_for_completion() and related routines. | 5627 | * See also complete_all(), wait_for_completion() and related routines. |
| 5628 | * | ||
| 5629 | * It may be assumed that this function implies a write memory barrier before | ||
| 5630 | * changing the task state if and only if any tasks are woken up. | ||
| 5318 | */ | 5631 | */ |
| 5319 | void complete(struct completion *x) | 5632 | void complete(struct completion *x) |
| 5320 | { | 5633 | { |
| @@ -5332,6 +5645,9 @@ EXPORT_SYMBOL(complete); | |||
| 5332 | * @x: holds the state of this particular completion | 5645 | * @x: holds the state of this particular completion |
| 5333 | * | 5646 | * |
| 5334 | * This will wake up all threads waiting on this particular completion event. | 5647 | * This will wake up all threads waiting on this particular completion event. |
| 5648 | * | ||
| 5649 | * It may be assumed that this function implies a write memory barrier before | ||
| 5650 | * changing the task state if and only if any tasks are woken up. | ||
| 5335 | */ | 5651 | */ |
| 5336 | void complete_all(struct completion *x) | 5652 | void complete_all(struct completion *x) |
| 5337 | { | 5653 | { |
| @@ -6248,6 +6564,11 @@ SYSCALL_DEFINE0(sched_yield) | |||
| 6248 | return 0; | 6564 | return 0; |
| 6249 | } | 6565 | } |
| 6250 | 6566 | ||
| 6567 | static inline int should_resched(void) | ||
| 6568 | { | ||
| 6569 | return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); | ||
| 6570 | } | ||
| 6571 | |||
| 6251 | static void __cond_resched(void) | 6572 | static void __cond_resched(void) |
| 6252 | { | 6573 | { |
| 6253 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 6574 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
| @@ -6267,8 +6588,7 @@ static void __cond_resched(void) | |||
| 6267 | 6588 | ||
| 6268 | int __sched _cond_resched(void) | 6589 | int __sched _cond_resched(void) |
| 6269 | { | 6590 | { |
| 6270 | if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && | 6591 | if (should_resched()) { |
| 6271 | system_state == SYSTEM_RUNNING) { | ||
| 6272 | __cond_resched(); | 6592 | __cond_resched(); |
| 6273 | return 1; | 6593 | return 1; |
| 6274 | } | 6594 | } |
| @@ -6286,12 +6606,12 @@ EXPORT_SYMBOL(_cond_resched); | |||
| 6286 | */ | 6606 | */ |
| 6287 | int cond_resched_lock(spinlock_t *lock) | 6607 | int cond_resched_lock(spinlock_t *lock) |
| 6288 | { | 6608 | { |
| 6289 | int resched = need_resched() && system_state == SYSTEM_RUNNING; | 6609 | int resched = should_resched(); |
| 6290 | int ret = 0; | 6610 | int ret = 0; |
| 6291 | 6611 | ||
| 6292 | if (spin_needbreak(lock) || resched) { | 6612 | if (spin_needbreak(lock) || resched) { |
| 6293 | spin_unlock(lock); | 6613 | spin_unlock(lock); |
| 6294 | if (resched && need_resched()) | 6614 | if (resched) |
| 6295 | __cond_resched(); | 6615 | __cond_resched(); |
| 6296 | else | 6616 | else |
| 6297 | cpu_relax(); | 6617 | cpu_relax(); |
| @@ -6306,7 +6626,7 @@ int __sched cond_resched_softirq(void) | |||
| 6306 | { | 6626 | { |
| 6307 | BUG_ON(!in_softirq()); | 6627 | BUG_ON(!in_softirq()); |
| 6308 | 6628 | ||
| 6309 | if (need_resched() && system_state == SYSTEM_RUNNING) { | 6629 | if (should_resched()) { |
| 6310 | local_bh_enable(); | 6630 | local_bh_enable(); |
| 6311 | __cond_resched(); | 6631 | __cond_resched(); |
| 6312 | local_bh_disable(); | 6632 | local_bh_disable(); |
| @@ -6490,8 +6810,9 @@ void sched_show_task(struct task_struct *p) | |||
| 6490 | #ifdef CONFIG_DEBUG_STACK_USAGE | 6810 | #ifdef CONFIG_DEBUG_STACK_USAGE |
| 6491 | free = stack_not_used(p); | 6811 | free = stack_not_used(p); |
| 6492 | #endif | 6812 | #endif |
| 6493 | printk(KERN_CONT "%5lu %5d %6d\n", free, | 6813 | printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, |
| 6494 | task_pid_nr(p), task_pid_nr(p->real_parent)); | 6814 | task_pid_nr(p), task_pid_nr(p->real_parent), |
| 6815 | (unsigned long)task_thread_info(p)->flags); | ||
| 6495 | 6816 | ||
| 6496 | show_stack(p, NULL); | 6817 | show_stack(p, NULL); |
| 6497 | } | 6818 | } |
| @@ -6752,7 +7073,7 @@ static int migration_thread(void *data) | |||
| 6752 | 7073 | ||
| 6753 | if (cpu_is_offline(cpu)) { | 7074 | if (cpu_is_offline(cpu)) { |
| 6754 | spin_unlock_irq(&rq->lock); | 7075 | spin_unlock_irq(&rq->lock); |
| 6755 | goto wait_to_die; | 7076 | break; |
| 6756 | } | 7077 | } |
| 6757 | 7078 | ||
| 6758 | if (rq->active_balance) { | 7079 | if (rq->active_balance) { |
| @@ -6778,16 +7099,7 @@ static int migration_thread(void *data) | |||
| 6778 | complete(&req->done); | 7099 | complete(&req->done); |
| 6779 | } | 7100 | } |
| 6780 | __set_current_state(TASK_RUNNING); | 7101 | __set_current_state(TASK_RUNNING); |
| 6781 | return 0; | ||
| 6782 | 7102 | ||
| 6783 | wait_to_die: | ||
| 6784 | /* Wait for kthread_stop */ | ||
| 6785 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 6786 | while (!kthread_should_stop()) { | ||
| 6787 | schedule(); | ||
| 6788 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 6789 | } | ||
| 6790 | __set_current_state(TASK_RUNNING); | ||
| 6791 | return 0; | 7103 | return 0; |
| 6792 | } | 7104 | } |
| 6793 | 7105 | ||
| @@ -6970,6 +7282,15 @@ static void migrate_dead_tasks(unsigned int dead_cpu) | |||
| 6970 | 7282 | ||
| 6971 | } | 7283 | } |
| 6972 | } | 7284 | } |
| 7285 | |||
| 7286 | /* | ||
| 7287 | * remove the tasks which were accounted by rq from calc_load_tasks. | ||
| 7288 | */ | ||
| 7289 | static void calc_global_load_remove(struct rq *rq) | ||
| 7290 | { | ||
| 7291 | atomic_long_sub(rq->calc_load_active, &calc_load_tasks); | ||
| 7292 | rq->calc_load_active = 0; | ||
| 7293 | } | ||
| 6973 | #endif /* CONFIG_HOTPLUG_CPU */ | 7294 | #endif /* CONFIG_HOTPLUG_CPU */ |
| 6974 | 7295 | ||
| 6975 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) | 7296 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) |
| @@ -7193,7 +7514,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 7193 | rq = task_rq_lock(p, &flags); | 7514 | rq = task_rq_lock(p, &flags); |
| 7194 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); | 7515 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); |
| 7195 | task_rq_unlock(rq, &flags); | 7516 | task_rq_unlock(rq, &flags); |
| 7517 | get_task_struct(p); | ||
| 7196 | cpu_rq(cpu)->migration_thread = p; | 7518 | cpu_rq(cpu)->migration_thread = p; |
| 7519 | rq->calc_load_update = calc_load_update; | ||
| 7197 | break; | 7520 | break; |
| 7198 | 7521 | ||
| 7199 | case CPU_ONLINE: | 7522 | case CPU_ONLINE: |
| @@ -7221,6 +7544,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 7221 | kthread_bind(cpu_rq(cpu)->migration_thread, | 7544 | kthread_bind(cpu_rq(cpu)->migration_thread, |
| 7222 | cpumask_any(cpu_online_mask)); | 7545 | cpumask_any(cpu_online_mask)); |
| 7223 | kthread_stop(cpu_rq(cpu)->migration_thread); | 7546 | kthread_stop(cpu_rq(cpu)->migration_thread); |
| 7547 | put_task_struct(cpu_rq(cpu)->migration_thread); | ||
| 7224 | cpu_rq(cpu)->migration_thread = NULL; | 7548 | cpu_rq(cpu)->migration_thread = NULL; |
| 7225 | break; | 7549 | break; |
| 7226 | 7550 | ||
| @@ -7230,6 +7554,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 7230 | migrate_live_tasks(cpu); | 7554 | migrate_live_tasks(cpu); |
| 7231 | rq = cpu_rq(cpu); | 7555 | rq = cpu_rq(cpu); |
| 7232 | kthread_stop(rq->migration_thread); | 7556 | kthread_stop(rq->migration_thread); |
| 7557 | put_task_struct(rq->migration_thread); | ||
| 7233 | rq->migration_thread = NULL; | 7558 | rq->migration_thread = NULL; |
| 7234 | /* Idle task back to normal (off runqueue, low prio) */ | 7559 | /* Idle task back to normal (off runqueue, low prio) */ |
| 7235 | spin_lock_irq(&rq->lock); | 7560 | spin_lock_irq(&rq->lock); |
| @@ -7243,7 +7568,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 7243 | cpuset_unlock(); | 7568 | cpuset_unlock(); |
| 7244 | migrate_nr_uninterruptible(rq); | 7569 | migrate_nr_uninterruptible(rq); |
| 7245 | BUG_ON(rq->nr_running != 0); | 7570 | BUG_ON(rq->nr_running != 0); |
| 7246 | 7571 | calc_global_load_remove(rq); | |
| 7247 | /* | 7572 | /* |
| 7248 | * No need to migrate the tasks: it was best-effort if | 7573 | * No need to migrate the tasks: it was best-effort if |
| 7249 | * they didn't take sched_hotcpu_mutex. Just wake up | 7574 | * they didn't take sched_hotcpu_mutex. Just wake up |
| @@ -7279,8 +7604,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 7279 | return NOTIFY_OK; | 7604 | return NOTIFY_OK; |
| 7280 | } | 7605 | } |
| 7281 | 7606 | ||
| 7282 | /* Register at highest priority so that task migration (migrate_all_tasks) | 7607 | /* |
| 7283 | * happens before everything else. | 7608 | * Register at high priority so that task migration (migrate_all_tasks) |
| 7609 | * happens before everything else. This has to be lower priority than | ||
| 7610 | * the notifier in the perf_counter subsystem, though. | ||
| 7284 | */ | 7611 | */ |
| 7285 | static struct notifier_block __cpuinitdata migration_notifier = { | 7612 | static struct notifier_block __cpuinitdata migration_notifier = { |
| 7286 | .notifier_call = migration_call, | 7613 | .notifier_call = migration_call, |
| @@ -7523,26 +7850,23 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
| 7523 | free_rootdomain(old_rd); | 7850 | free_rootdomain(old_rd); |
| 7524 | } | 7851 | } |
| 7525 | 7852 | ||
| 7526 | static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) | 7853 | static int init_rootdomain(struct root_domain *rd, bool bootmem) |
| 7527 | { | 7854 | { |
| 7855 | gfp_t gfp = GFP_KERNEL; | ||
| 7856 | |||
| 7528 | memset(rd, 0, sizeof(*rd)); | 7857 | memset(rd, 0, sizeof(*rd)); |
| 7529 | 7858 | ||
| 7530 | if (bootmem) { | 7859 | if (bootmem) |
| 7531 | alloc_bootmem_cpumask_var(&def_root_domain.span); | 7860 | gfp = GFP_NOWAIT; |
| 7532 | alloc_bootmem_cpumask_var(&def_root_domain.online); | ||
| 7533 | alloc_bootmem_cpumask_var(&def_root_domain.rto_mask); | ||
| 7534 | cpupri_init(&rd->cpupri, true); | ||
| 7535 | return 0; | ||
| 7536 | } | ||
| 7537 | 7861 | ||
| 7538 | if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) | 7862 | if (!alloc_cpumask_var(&rd->span, gfp)) |
| 7539 | goto out; | 7863 | goto out; |
| 7540 | if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) | 7864 | if (!alloc_cpumask_var(&rd->online, gfp)) |
| 7541 | goto free_span; | 7865 | goto free_span; |
| 7542 | if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) | 7866 | if (!alloc_cpumask_var(&rd->rto_mask, gfp)) |
| 7543 | goto free_online; | 7867 | goto free_online; |
| 7544 | 7868 | ||
| 7545 | if (cpupri_init(&rd->cpupri, false) != 0) | 7869 | if (cpupri_init(&rd->cpupri, bootmem) != 0) |
| 7546 | goto free_rto_mask; | 7870 | goto free_rto_mask; |
| 7547 | return 0; | 7871 | return 0; |
| 7548 | 7872 | ||
| @@ -7753,8 +8077,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | |||
| 7753 | 8077 | ||
| 7754 | /* | 8078 | /* |
| 7755 | * The cpus mask in sched_group and sched_domain hangs off the end. | 8079 | * The cpus mask in sched_group and sched_domain hangs off the end. |
| 7756 | * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space | 8080 | * |
| 7757 | * for nr_cpu_ids < CONFIG_NR_CPUS. | 8081 | * ( See the the comments in include/linux/sched.h:struct sched_group |
| 8082 | * and struct sched_domain. ) | ||
| 7758 | */ | 8083 | */ |
| 7759 | struct static_sched_group { | 8084 | struct static_sched_group { |
| 7760 | struct sched_group sg; | 8085 | struct sched_group sg; |
| @@ -7875,7 +8200,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) | |||
| 7875 | struct sched_domain *sd; | 8200 | struct sched_domain *sd; |
| 7876 | 8201 | ||
| 7877 | sd = &per_cpu(phys_domains, j).sd; | 8202 | sd = &per_cpu(phys_domains, j).sd; |
| 7878 | if (j != cpumask_first(sched_group_cpus(sd->groups))) { | 8203 | if (j != group_first_cpu(sd->groups)) { |
| 7879 | /* | 8204 | /* |
| 7880 | * Only add "power" once for each | 8205 | * Only add "power" once for each |
| 7881 | * physical package. | 8206 | * physical package. |
| @@ -7953,7 +8278,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
| 7953 | 8278 | ||
| 7954 | WARN_ON(!sd || !sd->groups); | 8279 | WARN_ON(!sd || !sd->groups); |
| 7955 | 8280 | ||
| 7956 | if (cpu != cpumask_first(sched_group_cpus(sd->groups))) | 8281 | if (cpu != group_first_cpu(sd->groups)) |
| 7957 | return; | 8282 | return; |
| 7958 | 8283 | ||
| 7959 | child = sd->child; | 8284 | child = sd->child; |
| @@ -8731,6 +9056,8 @@ void __init sched_init_smp(void) | |||
| 8731 | } | 9056 | } |
| 8732 | #endif /* CONFIG_SMP */ | 9057 | #endif /* CONFIG_SMP */ |
| 8733 | 9058 | ||
| 9059 | const_debug unsigned int sysctl_timer_migration = 1; | ||
| 9060 | |||
| 8734 | int in_sched_functions(unsigned long addr) | 9061 | int in_sched_functions(unsigned long addr) |
| 8735 | { | 9062 | { |
| 8736 | return in_lock_functions(addr) || | 9063 | return in_lock_functions(addr) || |
| @@ -8770,7 +9097,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
| 8770 | #ifdef CONFIG_SMP | 9097 | #ifdef CONFIG_SMP |
| 8771 | rt_rq->rt_nr_migratory = 0; | 9098 | rt_rq->rt_nr_migratory = 0; |
| 8772 | rt_rq->overloaded = 0; | 9099 | rt_rq->overloaded = 0; |
| 8773 | plist_head_init(&rq->rt.pushable_tasks, &rq->lock); | 9100 | plist_head_init(&rt_rq->pushable_tasks, &rq->lock); |
| 8774 | #endif | 9101 | #endif |
| 8775 | 9102 | ||
| 8776 | rt_rq->rt_time = 0; | 9103 | rt_rq->rt_time = 0; |
| @@ -8865,7 +9192,7 @@ void __init sched_init(void) | |||
| 8865 | * we use alloc_bootmem(). | 9192 | * we use alloc_bootmem(). |
| 8866 | */ | 9193 | */ |
| 8867 | if (alloc_size) { | 9194 | if (alloc_size) { |
| 8868 | ptr = (unsigned long)alloc_bootmem(alloc_size); | 9195 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); |
| 8869 | 9196 | ||
| 8870 | #ifdef CONFIG_FAIR_GROUP_SCHED | 9197 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 8871 | init_task_group.se = (struct sched_entity **)ptr; | 9198 | init_task_group.se = (struct sched_entity **)ptr; |
| @@ -8938,6 +9265,8 @@ void __init sched_init(void) | |||
| 8938 | rq = cpu_rq(i); | 9265 | rq = cpu_rq(i); |
| 8939 | spin_lock_init(&rq->lock); | 9266 | spin_lock_init(&rq->lock); |
| 8940 | rq->nr_running = 0; | 9267 | rq->nr_running = 0; |
| 9268 | rq->calc_load_active = 0; | ||
| 9269 | rq->calc_load_update = jiffies + LOAD_FREQ; | ||
| 8941 | init_cfs_rq(&rq->cfs, rq); | 9270 | init_cfs_rq(&rq->cfs, rq); |
| 8942 | init_rt_rq(&rq->rt, rq); | 9271 | init_rt_rq(&rq->rt, rq); |
| 8943 | #ifdef CONFIG_FAIR_GROUP_SCHED | 9272 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| @@ -8958,7 +9287,7 @@ void __init sched_init(void) | |||
| 8958 | * 1024) and two child groups A0 and A1 (of weight 1024 each), | 9287 | * 1024) and two child groups A0 and A1 (of weight 1024 each), |
| 8959 | * then A0's share of the cpu resource is: | 9288 | * then A0's share of the cpu resource is: |
| 8960 | * | 9289 | * |
| 8961 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% | 9290 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% |
| 8962 | * | 9291 | * |
| 8963 | * We achieve this by letting init_task_group's tasks sit | 9292 | * We achieve this by letting init_task_group's tasks sit |
| 8964 | * directly in rq->cfs (i.e init_task_group->se[] = NULL). | 9293 | * directly in rq->cfs (i.e init_task_group->se[] = NULL). |
| @@ -9045,20 +9374,26 @@ void __init sched_init(void) | |||
| 9045 | * when this runqueue becomes "idle". | 9374 | * when this runqueue becomes "idle". |
| 9046 | */ | 9375 | */ |
| 9047 | init_idle(current, smp_processor_id()); | 9376 | init_idle(current, smp_processor_id()); |
| 9377 | |||
| 9378 | calc_load_update = jiffies + LOAD_FREQ; | ||
| 9379 | |||
| 9048 | /* | 9380 | /* |
| 9049 | * During early bootup we pretend to be a normal task: | 9381 | * During early bootup we pretend to be a normal task: |
| 9050 | */ | 9382 | */ |
| 9051 | current->sched_class = &fair_sched_class; | 9383 | current->sched_class = &fair_sched_class; |
| 9052 | 9384 | ||
| 9053 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ | 9385 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ |
| 9054 | alloc_bootmem_cpumask_var(&nohz_cpu_mask); | 9386 | alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); |
| 9055 | #ifdef CONFIG_SMP | 9387 | #ifdef CONFIG_SMP |
| 9056 | #ifdef CONFIG_NO_HZ | 9388 | #ifdef CONFIG_NO_HZ |
| 9057 | alloc_bootmem_cpumask_var(&nohz.cpu_mask); | 9389 | alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); |
| 9390 | alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); | ||
| 9058 | #endif | 9391 | #endif |
| 9059 | alloc_bootmem_cpumask_var(&cpu_isolated_map); | 9392 | alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
| 9060 | #endif /* SMP */ | 9393 | #endif /* SMP */ |
| 9061 | 9394 | ||
| 9395 | perf_counter_init(); | ||
| 9396 | |||
| 9062 | scheduler_running = 1; | 9397 | scheduler_running = 1; |
| 9063 | } | 9398 | } |
| 9064 | 9399 | ||
| @@ -9800,6 +10135,13 @@ static int sched_rt_global_constraints(void) | |||
| 9800 | if (sysctl_sched_rt_period <= 0) | 10135 | if (sysctl_sched_rt_period <= 0) |
| 9801 | return -EINVAL; | 10136 | return -EINVAL; |
| 9802 | 10137 | ||
| 10138 | /* | ||
| 10139 | * There's always some RT tasks in the root group | ||
| 10140 | * -- migration, kstopmachine etc.. | ||
| 10141 | */ | ||
| 10142 | if (sysctl_sched_rt_runtime == 0) | ||
| 10143 | return -EBUSY; | ||
| 10144 | |||
| 9803 | spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); | 10145 | spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); |
| 9804 | for_each_possible_cpu(i) { | 10146 | for_each_possible_cpu(i) { |
| 9805 | struct rt_rq *rt_rq = &cpu_rq(i)->rt; | 10147 | struct rt_rq *rt_rq = &cpu_rq(i)->rt; |
