Diffstat (limited to 'kernel/sched/core.c')
 -rw-r--r--  kernel/sched/core.c | 694
 1 file changed, 79 insertions(+), 615 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 649c9f876cb1..c17747236438 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -740,126 +740,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 	dequeue_task(rq, p, flags);
 }
 
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-
-/*
- * There are no locks covering percpu hardirq/softirq time.
- * They are only modified in account_system_vtime, on corresponding CPU
- * with interrupts disabled. So, writes are safe.
- * They are read and saved off onto struct rq in update_rq_clock().
- * This may result in other CPU reading this CPU's irq time and can
- * race with irq/account_system_vtime on this CPU. We would either get old
- * or new value with a side effect of accounting a slice of irq time to wrong
- * task when irq is in progress while we read rq->clock. That is a worthy
- * compromise in place of having locks on each irq in account_system_time.
- */
-static DEFINE_PER_CPU(u64, cpu_hardirq_time);
-static DEFINE_PER_CPU(u64, cpu_softirq_time);
-
-static DEFINE_PER_CPU(u64, irq_start_time);
-static int sched_clock_irqtime;
-
-void enable_sched_clock_irqtime(void)
-{
-	sched_clock_irqtime = 1;
-}
-
-void disable_sched_clock_irqtime(void)
-{
-	sched_clock_irqtime = 0;
-}
-
-#ifndef CONFIG_64BIT
-static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
-
-static inline void irq_time_write_begin(void)
-{
-	__this_cpu_inc(irq_time_seq.sequence);
-	smp_wmb();
-}
-
-static inline void irq_time_write_end(void)
-{
-	smp_wmb();
-	__this_cpu_inc(irq_time_seq.sequence);
-}
-
-static inline u64 irq_time_read(int cpu)
-{
-	u64 irq_time;
-	unsigned seq;
-
-	do {
-		seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
-		irq_time = per_cpu(cpu_softirq_time, cpu) +
-			   per_cpu(cpu_hardirq_time, cpu);
-	} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
-
-	return irq_time;
-}
-#else /* CONFIG_64BIT */
-static inline void irq_time_write_begin(void)
-{
-}
-
-static inline void irq_time_write_end(void)
-{
-}
-
-static inline u64 irq_time_read(int cpu)
-{
-	return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
-}
-#endif /* CONFIG_64BIT */
-
-/*
- * Called before incrementing preempt_count on {soft,}irq_enter
- * and before decrementing preempt_count on {soft,}irq_exit.
- */
-void account_system_vtime(struct task_struct *curr)
-{
-	unsigned long flags;
-	s64 delta;
-	int cpu;
-
-	if (!sched_clock_irqtime)
-		return;
-
-	local_irq_save(flags);
-
-	cpu = smp_processor_id();
-	delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
-	__this_cpu_add(irq_start_time, delta);
-
-	irq_time_write_begin();
-	/*
-	 * We do not account for softirq time from ksoftirqd here.
-	 * We want to continue accounting softirq time to ksoftirqd thread
-	 * in that case, so as not to confuse scheduler with a special task
-	 * that do not consume any time, but still wants to run.
-	 */
-	if (hardirq_count())
-		__this_cpu_add(cpu_hardirq_time, delta);
-	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
-		__this_cpu_add(cpu_softirq_time, delta);
-
-	irq_time_write_end();
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(account_system_vtime);
-
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-#ifdef CONFIG_PARAVIRT
-static inline u64 steal_ticks(u64 steal)
-{
-	if (unlikely(steal > NSEC_PER_SEC))
-		return div_u64(steal, TICK_NSEC);
-
-	return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
-}
-#endif
-
 static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
 	/*
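
The 32-bit irq_time_write_*()/irq_time_read() helpers removed in the hunk above (the block moves into kernel/sched/cputime.c elsewhere in this series) open-code a sequence counter so a reader on another CPU never observes a torn 64-bit sum. A minimal sketch of the same publish/retry pattern using the generic <linux/seqlock.h> API; the demo_* names are hypothetical placeholders, not part of this patch:

#include <linux/seqlock.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(seqcount_t, demo_seq);
static DEFINE_PER_CPU(u64, demo_time);

/* Writer: runs on the local CPU with interrupts off, as account_system_vtime() does. */
static void demo_add(u64 delta)
{
	write_seqcount_begin(this_cpu_ptr(&demo_seq));
	__this_cpu_add(demo_time, delta);
	write_seqcount_end(this_cpu_ptr(&demo_seq));
}

/* Reader: may run on any CPU; retries until the sequence is stable, so the
 * 64-bit value is read consistently even on 32-bit machines. */
static u64 demo_read(int cpu)
{
	unsigned int seq;
	u64 val;

	do {
		seq = read_seqcount_begin(&per_cpu(demo_seq, cpu));
		val = per_cpu(demo_time, cpu);
	} while (read_seqcount_retry(&per_cpu(demo_seq, cpu), seq));

	return val;
}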
@@ -920,43 +800,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 #endif
 }
 
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-static int irqtime_account_hi_update(void)
-{
-	u64 *cpustat = kcpustat_this_cpu->cpustat;
-	unsigned long flags;
-	u64 latest_ns;
-	int ret = 0;
-
-	local_irq_save(flags);
-	latest_ns = this_cpu_read(cpu_hardirq_time);
-	if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
-		ret = 1;
-	local_irq_restore(flags);
-	return ret;
-}
-
-static int irqtime_account_si_update(void)
-{
-	u64 *cpustat = kcpustat_this_cpu->cpustat;
-	unsigned long flags;
-	u64 latest_ns;
-	int ret = 0;
-
-	local_irq_save(flags);
-	latest_ns = this_cpu_read(cpu_softirq_time);
-	if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
-		ret = 1;
-	local_irq_restore(flags);
-	return ret;
-}
-
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-#define sched_clock_irqtime	(0)
-
-#endif
-
 void sched_set_stop_task(int cpu, struct task_struct *stop)
 {
 	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -1518,25 +1361,6 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu)
 	smp_send_reschedule(cpu);
 }
 
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
-{
-	struct rq *rq;
-	int ret = 0;
-
-	rq = __task_rq_lock(p);
-	if (p->on_cpu) {
-		ttwu_activate(rq, p, ENQUEUE_WAKEUP);
-		ttwu_do_wakeup(rq, p, wake_flags);
-		ret = 1;
-	}
-	__task_rq_unlock(rq);
-
-	return ret;
-
-}
-#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
-
 bool cpus_share_cache(int this_cpu, int that_cpu)
 {
 	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
@@ -1597,21 +1421,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 * If the owning (remote) cpu is still in the middle of schedule() with
 	 * this task as prev, wait until its done referencing the task.
 	 */
-	while (p->on_cpu) {
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-		/*
-		 * In case the architecture enables interrupts in
-		 * context_switch(), we cannot busy wait, since that
-		 * would lead to deadlocks when an interrupt hits and
-		 * tries to wake up @prev. So bail and do a complete
-		 * remote wakeup.
-		 */
-		if (ttwu_activate_remote(p, wake_flags))
-			goto stat;
-#else
+	while (p->on_cpu)
 		cpu_relax();
-#endif
-	}
 	/*
 	 * Pairs with the smp_wmb() in finish_lock_switch().
 	 */
@@ -1953,14 +1764,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	 * Manfred Spraul <manfred@colorfullife.com>
 	 */
 	prev_state = prev->state;
+	vtime_task_switch(prev);
 	finish_arch_switch(prev);
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-	local_irq_disable();
-#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
 	perf_event_task_sched_in(prev, current);
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-	local_irq_enable();
-#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
 	finish_lock_switch(rq, prev);
 	finish_arch_post_lock_switch();
 
@@ -2081,6 +1887,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 #endif
 
 	/* Here we just switch the register state and the stack. */
+	rcu_switch(prev, next);
 	switch_to(prev, next, prev);
 
 	barrier();
@@ -2809,404 +2616,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	return ns;
 }
 
-#ifdef CONFIG_CGROUP_CPUACCT
-struct cgroup_subsys cpuacct_subsys;
-struct cpuacct root_cpuacct;
-#endif
-
-static inline void task_group_account_field(struct task_struct *p, int index,
-					    u64 tmp)
-{
-#ifdef CONFIG_CGROUP_CPUACCT
-	struct kernel_cpustat *kcpustat;
-	struct cpuacct *ca;
-#endif
-	/*
-	 * Since all updates are sure to touch the root cgroup, we
-	 * get ourselves ahead and touch it first. If the root cgroup
-	 * is the only cgroup, then nothing else should be necessary.
-	 *
-	 */
-	__get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
-
-#ifdef CONFIG_CGROUP_CPUACCT
-	if (unlikely(!cpuacct_subsys.active))
-		return;
-
-	rcu_read_lock();
-	ca = task_ca(p);
-	while (ca && (ca != &root_cpuacct)) {
-		kcpustat = this_cpu_ptr(ca->cpustat);
-		kcpustat->cpustat[index] += tmp;
-		ca = parent_ca(ca);
-	}
-	rcu_read_unlock();
-#endif
-}
-
-
-/*
- * Account user cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in user space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- */
-void account_user_time(struct task_struct *p, cputime_t cputime,
-		       cputime_t cputime_scaled)
-{
-	int index;
-
-	/* Add user time to process. */
-	p->utime += cputime;
-	p->utimescaled += cputime_scaled;
-	account_group_user_time(p, cputime);
-
-	index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
-
-	/* Add user time to cpustat. */
-	task_group_account_field(p, index, (__force u64) cputime);
-
-	/* Account for user time used */
-	acct_update_integrals(p);
-}
-
-/*
- * Account guest cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in virtual machine since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- */
-static void account_guest_time(struct task_struct *p, cputime_t cputime,
-			       cputime_t cputime_scaled)
-{
-	u64 *cpustat = kcpustat_this_cpu->cpustat;
-
-	/* Add guest time to process. */
-	p->utime += cputime;
-	p->utimescaled += cputime_scaled;
-	account_group_user_time(p, cputime);
-	p->gtime += cputime;
-
-	/* Add guest time to cpustat. */
-	if (TASK_NICE(p) > 0) {
-		cpustat[CPUTIME_NICE] += (__force u64) cputime;
-		cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
-	} else {
-		cpustat[CPUTIME_USER] += (__force u64) cputime;
-		cpustat[CPUTIME_GUEST] += (__force u64) cputime;
-	}
-}
-
-/*
- * Account system cpu time to a process and desired cpustat field
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- * @target_cputime64: pointer to cpustat field that has to be updated
- */
-static inline
-void __account_system_time(struct task_struct *p, cputime_t cputime,
-			cputime_t cputime_scaled, int index)
-{
-	/* Add system time to process. */
-	p->stime += cputime;
-	p->stimescaled += cputime_scaled;
-	account_group_system_time(p, cputime);
-
-	/* Add system time to cpustat. */
-	task_group_account_field(p, index, (__force u64) cputime);
-
-	/* Account for system time used */
-	acct_update_integrals(p);
-}
-
-/*
- * Account system cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @hardirq_offset: the offset to subtract from hardirq_count()
- * @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- */
-void account_system_time(struct task_struct *p, int hardirq_offset,
-			 cputime_t cputime, cputime_t cputime_scaled)
-{
-	int index;
-
-	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
-		account_guest_time(p, cputime, cputime_scaled);
-		return;
-	}
-
-	if (hardirq_count() - hardirq_offset)
-		index = CPUTIME_IRQ;
-	else if (in_serving_softirq())
-		index = CPUTIME_SOFTIRQ;
-	else
-		index = CPUTIME_SYSTEM;
-
-	__account_system_time(p, cputime, cputime_scaled, index);
-}
-
-/*
- * Account for involuntary wait time.
- * @cputime: the cpu time spent in involuntary wait
- */
-void account_steal_time(cputime_t cputime)
-{
-	u64 *cpustat = kcpustat_this_cpu->cpustat;
-
-	cpustat[CPUTIME_STEAL] += (__force u64) cputime;
-}
-
-/*
- * Account for idle time.
- * @cputime: the cpu time spent in idle wait
- */
-void account_idle_time(cputime_t cputime)
-{
-	u64 *cpustat = kcpustat_this_cpu->cpustat;
-	struct rq *rq = this_rq();
-
-	if (atomic_read(&rq->nr_iowait) > 0)
-		cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
-	else
-		cpustat[CPUTIME_IDLE] += (__force u64) cputime;
-}
-
-static __always_inline bool steal_account_process_tick(void)
-{
-#ifdef CONFIG_PARAVIRT
-	if (static_key_false(&paravirt_steal_enabled)) {
-		u64 steal, st = 0;
-
-		steal = paravirt_steal_clock(smp_processor_id());
-		steal -= this_rq()->prev_steal_time;
-
-		st = steal_ticks(steal);
-		this_rq()->prev_steal_time += st * TICK_NSEC;
-
-		account_steal_time(st);
-		return st;
-	}
-#endif
-	return false;
-}
-
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
-
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-/*
- * Account a tick to a process and cpustat
- * @p: the process that the cpu time gets accounted to
- * @user_tick: is the tick from userspace
- * @rq: the pointer to rq
- *
- * Tick demultiplexing follows the order
- * - pending hardirq update
- * - pending softirq update
- * - user_time
- * - idle_time
- * - system time
- *   - check for guest_time
- *   - else account as system_time
- *
- * Check for hardirq is done both for system and user time as there is
- * no timer going off while we are on hardirq and hence we may never get an
- * opportunity to update it solely in system time.
- * p->stime and friends are only updated on system time and not on irq
- * softirq as those do not count in task exec_runtime any more.
- */
-static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
-						struct rq *rq)
-{
-	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
-	u64 *cpustat = kcpustat_this_cpu->cpustat;
-
-	if (steal_account_process_tick())
-		return;
-
-	if (irqtime_account_hi_update()) {
-		cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
-	} else if (irqtime_account_si_update()) {
-		cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
-	} else if (this_cpu_ksoftirqd() == p) {
-		/*
-		 * ksoftirqd time do not get accounted in cpu_softirq_time.
-		 * So, we have to handle it separately here.
-		 * Also, p->stime needs to be updated for ksoftirqd.
-		 */
-		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
-					CPUTIME_SOFTIRQ);
-	} else if (user_tick) {
-		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
-	} else if (p == rq->idle) {
-		account_idle_time(cputime_one_jiffy);
-	} else if (p->flags & PF_VCPU) { /* System time or guest time */
-		account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
-	} else {
-		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
-					CPUTIME_SYSTEM);
-	}
-}
-
-static void irqtime_account_idle_ticks(int ticks)
-{
-	int i;
-	struct rq *rq = this_rq();
-
-	for (i = 0; i < ticks; i++)
-		irqtime_account_process_tick(current, 0, rq);
-}
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-static void irqtime_account_idle_ticks(int ticks) {}
-static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
-						struct rq *rq) {}
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-/*
- * Account a single tick of cpu time.
- * @p: the process that the cpu time gets accounted to
- * @user_tick: indicates if the tick is a user or a system tick
- */
-void account_process_tick(struct task_struct *p, int user_tick)
-{
-	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
-	struct rq *rq = this_rq();
-
-	if (sched_clock_irqtime) {
-		irqtime_account_process_tick(p, user_tick, rq);
-		return;
-	}
-
-	if (steal_account_process_tick())
-		return;
-
-	if (user_tick)
-		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
-	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
-		account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
-				    one_jiffy_scaled);
-	else
-		account_idle_time(cputime_one_jiffy);
-}
-
-/*
- * Account multiple ticks of steal time.
- * @p: the process from which the cpu time has been stolen
- * @ticks: number of stolen ticks
- */
-void account_steal_ticks(unsigned long ticks)
-{
-	account_steal_time(jiffies_to_cputime(ticks));
-}
-
-/*
- * Account multiple ticks of idle time.
- * @ticks: number of stolen ticks
- */
-void account_idle_ticks(unsigned long ticks)
-{
-
-	if (sched_clock_irqtime) {
-		irqtime_account_idle_ticks(ticks);
-		return;
-	}
-
-	account_idle_time(jiffies_to_cputime(ticks));
-}
-
-#endif
-
-/*
- * Use precise platform statistics if available:
- */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
-	*ut = p->utime;
-	*st = p->stime;
-}
-
-void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
-	struct task_cputime cputime;
-
-	thread_group_cputime(p, &cputime);
-
-	*ut = cputime.utime;
-	*st = cputime.stime;
-}
-#else
-
-#ifndef nsecs_to_cputime
-# define nsecs_to_cputime(__nsecs)	nsecs_to_jiffies(__nsecs)
-#endif
-
-static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
-{
-	u64 temp = (__force u64) rtime;
-
-	temp *= (__force u64) utime;
-
-	if (sizeof(cputime_t) == 4)
-		temp = div_u64(temp, (__force u32) total);
-	else
-		temp = div64_u64(temp, (__force u64) total);
-
-	return (__force cputime_t) temp;
-}
-
-void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
-	cputime_t rtime, utime = p->utime, total = utime + p->stime;
-
-	/*
-	 * Use CFS's precise accounting:
-	 */
-	rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
-
-	if (total)
-		utime = scale_utime(utime, rtime, total);
-	else
-		utime = rtime;
-
-	/*
-	 * Compare with previous values, to keep monotonicity:
-	 */
-	p->prev_utime = max(p->prev_utime, utime);
-	p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
-
-	*ut = p->prev_utime;
-	*st = p->prev_stime;
-}
-
-/*
- * Must be called with siglock held.
- */
-void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
-	struct signal_struct *sig = p->signal;
-	struct task_cputime cputime;
-	cputime_t rtime, utime, total;
-
-	thread_group_cputime(p, &cputime);
-
-	total = cputime.utime + cputime.stime;
-	rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
-
-	if (total)
-		utime = scale_utime(cputime.utime, rtime, total);
-	else
-		utime = rtime;
-
-	sig->prev_utime = max(sig->prev_utime, utime);
-	sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
-
-	*ut = sig->prev_utime;
-	*st = sig->prev_stime;
-}
-#endif
-
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
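
The bulk of the block removed above is the cputime accounting code that this series relocates out of core.c. Its scale_utime() helper splits the precisely measured rtime (derived from se.sum_exec_runtime) between user and system time in the same proportion as the sampled utime/stime, then clamps against the previously reported values so readings never go backwards. A standalone sketch of that arithmetic, assuming the product fits in 64 bits (the kernel helper additionally chooses div_u64() or div64_u64() based on sizeof(cputime_t)); the demo_* names are hypothetical:

#include <stdint.h>

struct demo_prev {		/* stands in for p->prev_utime / p->prev_stime */
	uint64_t prev_utime;
	uint64_t prev_stime;
};

/* rtime * utime / total, the proportion computed by the removed scale_utime(). */
static uint64_t demo_scale_utime(uint64_t utime, uint64_t rtime, uint64_t total)
{
	return total ? (rtime * utime) / total : rtime;
}

static void demo_task_times(struct demo_prev *t, uint64_t utime, uint64_t stime,
			    uint64_t rtime, uint64_t *ut, uint64_t *st)
{
	uint64_t scaled = demo_scale_utime(utime, rtime, utime + stime);
	uint64_t stime_part;

	/* Keep monotonicity: never report less than what was reported before. */
	if (scaled > t->prev_utime)
		t->prev_utime = scaled;
	stime_part = (rtime > t->prev_utime) ? rtime - t->prev_utime : 0;
	if (stime_part > t->prev_stime)
		t->prev_stime = stime_part;

	*ut = t->prev_utime;
	*st = t->prev_stime;
}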
@@ -3367,6 +2776,40 @@ pick_next_task(struct rq *rq)
 
 /*
  * __schedule() is the main scheduler function.
+ *
+ * The main means of driving the scheduler and thus entering this function are:
+ *
+ *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
+ *
+ *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
+ *      paths. For example, see arch/x86/entry_64.S.
+ *
+ *      To drive preemption between tasks, the scheduler sets the flag in timer
+ *      interrupt handler scheduler_tick().
+ *
+ *   3. Wakeups don't really cause entry into schedule(). They add a
+ *      task to the run-queue and that's it.
+ *
+ *      Now, if the new task added to the run-queue preempts the current
+ *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
+ *      called on the nearest possible occasion:
+ *
+ *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
+ *
+ *         - in syscall or exception context, at the next outmost
+ *           preempt_enable(). (this might be as soon as the wake_up()'s
+ *           spin_unlock()!)
+ *
+ *         - in IRQ context, return from interrupt-handler to
+ *           preemptible context
+ *
+ *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
+ *         then at the next:
+ *
+ *          - cond_resched() call
+ *          - explicit schedule() call
+ *          - return from syscall or exception to user-space
+ *          - return from interrupt-handler to user-space
 */
 static void __sched __schedule(void)
 {
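
Point 1 of the comment added above, "explicit blocking", is the classic wait-loop shape that ends up in __schedule() via schedule(). A sketch using the stock <linux/wait.h> helpers; demo_wq and demo_ready are hypothetical, only the pattern matters:

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static int demo_ready;

static void demo_wait_for_event(void)
{
	DEFINE_WAIT(wait);

	for (;;) {
		/* Mark ourselves sleeping before re-checking the condition. */
		prepare_to_wait(&demo_wq, &wait, TASK_INTERRUPTIBLE);
		if (demo_ready || signal_pending(current))
			break;
		schedule();	/* explicit blocking: enters __schedule() */
	}
	finish_wait(&demo_wq, &wait);
}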
@@ -3468,6 +2911,21 @@ asmlinkage void __sched schedule(void)
 }
 EXPORT_SYMBOL(schedule);
 
+#ifdef CONFIG_RCU_USER_QS
+asmlinkage void __sched schedule_user(void)
+{
+	/*
+	 * If we come here after a random call to set_need_resched(),
+	 * or we have been woken up remotely but the IPI has not yet arrived,
+	 * we haven't yet exited the RCU idle mode. Do it here manually until
+	 * we find a better solution.
+	 */
+	rcu_user_exit();
+	schedule();
+	rcu_user_enter();
+}
+#endif
+
 /**
  * schedule_preempt_disabled - called with preemption disabled
  *
@@ -3569,6 +3027,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
 	/* Catch callers which need to be fixed */
 	BUG_ON(ti->preempt_count || !irqs_disabled());
 
+	rcu_user_exit();
 	do {
 		add_preempt_count(PREEMPT_ACTIVE);
 		local_irq_enable();
@@ -4868,13 +4327,6 @@ again:
 		 */
 		if (preempt && rq != p_rq)
 			resched_task(p_rq->curr);
-	} else {
-		/*
-		 * We might have set it in task_yield_fair(), but are
-		 * not going to schedule(), so don't want to skip
-		 * the next update.
-		 */
-		rq->skip_clock_update = 0;
 	}
 
 out:
@@ -5416,16 +4868,25 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
 	*tablep = NULL;
 }
 
+static int min_load_idx = 0;
+static int max_load_idx = CPU_LOAD_IDX_MAX;
+
 static void
 set_table_entry(struct ctl_table *entry,
 		const char *procname, void *data, int maxlen,
-		umode_t mode, proc_handler *proc_handler)
+		umode_t mode, proc_handler *proc_handler,
+		bool load_idx)
 {
 	entry->procname = procname;
 	entry->data = data;
 	entry->maxlen = maxlen;
 	entry->mode = mode;
 	entry->proc_handler = proc_handler;
+
+	if (load_idx) {
+		entry->extra1 = &min_load_idx;
+		entry->extra2 = &max_load_idx;
+	}
 }
 
 static struct ctl_table *
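
The load_idx flag added above makes set_table_entry() attach extra1/extra2 bounds, which proc_dointvec_minmax() uses to reject writes outside the [min, max] range, so the *_idx sysctls can no longer be set to out-of-range load index values. A sketch of the same mechanism on a stand-alone table; the demo_* names are hypothetical and the literal 5 stands in for CPU_LOAD_IDX_MAX in this tree:

#include <linux/sysctl.h>

static int demo_value;
static int demo_min = 0;
static int demo_max = 5;	/* i.e. CPU_LOAD_IDX_MAX */

static struct ctl_table demo_table[] = {
	{
		.procname	= "demo_load_idx",
		.data		= &demo_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,	/* enforces extra1..extra2 */
		.extra1		= &demo_min,
		.extra2		= &demo_max,
	},
	{ }	/* terminator */
};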
@@ -5437,30 +4898,30 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
 		return NULL;
 
 	set_table_entry(&table[0], "min_interval", &sd->min_interval,
-		sizeof(long), 0644, proc_doulongvec_minmax);
+		sizeof(long), 0644, proc_doulongvec_minmax, false);
 	set_table_entry(&table[1], "max_interval", &sd->max_interval,
-		sizeof(long), 0644, proc_doulongvec_minmax);
+		sizeof(long), 0644, proc_doulongvec_minmax, false);
 	set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
-		sizeof(int), 0644, proc_dointvec_minmax);
+		sizeof(int), 0644, proc_dointvec_minmax, true);
 	set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
-		sizeof(int), 0644, proc_dointvec_minmax);
+		sizeof(int), 0644, proc_dointvec_minmax, true);
 	set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
-		sizeof(int), 0644, proc_dointvec_minmax);
+		sizeof(int), 0644, proc_dointvec_minmax, true);
 	set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
-		sizeof(int), 0644, proc_dointvec_minmax);
+		sizeof(int), 0644, proc_dointvec_minmax, true);
 	set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
-		sizeof(int), 0644, proc_dointvec_minmax);
+		sizeof(int), 0644, proc_dointvec_minmax, true);
 	set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
-		sizeof(int), 0644, proc_dointvec_minmax);
+		sizeof(int), 0644, proc_dointvec_minmax, false);
 	set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
-		sizeof(int), 0644, proc_dointvec_minmax);
+		sizeof(int), 0644, proc_dointvec_minmax, false);
 	set_table_entry(&table[9], "cache_nice_tries",
 		&sd->cache_nice_tries,
-		sizeof(int), 0644, proc_dointvec_minmax);
+		sizeof(int), 0644, proc_dointvec_minmax, false);
 	set_table_entry(&table[10], "flags", &sd->flags,
-		sizeof(int), 0644, proc_dointvec_minmax);
+		sizeof(int), 0644, proc_dointvec_minmax, false);
 	set_table_entry(&table[11], "name", sd->name,
-		CORENAME_MAX_SIZE, 0444, proc_dostring);
+		CORENAME_MAX_SIZE, 0444, proc_dostring, false);
 	/* &table[12] is terminator */
 
 	return table;
@@ -5604,7 +5065,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		migrate_tasks(cpu);
 		BUG_ON(rq->nr_running != 1); /* the migration thread */
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
+		break;
 
+	case CPU_DEAD:
 		calc_load_migrate(rq);
 		break;
 #endif
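
The new break and CPU_DEAD case above split the hotplug teardown work: the CPU_DYING branch runs on the outgoing CPU with interrupts disabled, while CPU_DEAD is delivered later from normal context, which is where calc_load_migrate() now runs. A sketch of that general notifier shape; the callback name and its bodies are hypothetical:

#include <linux/cpu.h>
#include <linux/notifier.h>

static int demo_cpu_callback(struct notifier_block *nfb,
			     unsigned long action, void *hcpu)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DYING:
		/* Runs on the dying CPU, IRQs off: move work off this CPU. */
		break;
	case CPU_DEAD:
		/* Runs afterwards in ordinary context: fold statistics, clean up. */
		break;
	}
	return NOTIFY_OK;
}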
@@ -6537,7 +6000,6 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
 					| 0*SD_BALANCE_FORK
 					| 0*SD_BALANCE_WAKE
 					| 0*SD_WAKE_AFFINE
-					| 0*SD_PREFER_LOCAL
 					| 0*SD_SHARE_CPUPOWER
 					| 0*SD_SHARE_PKG_RESOURCES
 					| 1*SD_SERIALIZE
@@ -8335,6 +7797,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
  * (balbir@in.ibm.com).
  */
 
+struct cpuacct root_cpuacct;
+
 /* create a new cpu accounting group */
 static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
 {
