Diffstat (limited to 'kernel')
-rw-r--r--  kernel/fork.c              4
-rw-r--r--  kernel/sched/Makefile      2
-rw-r--r--  kernel/sched/core.c      675
-rw-r--r--  kernel/sched/cputime.c   530
-rw-r--r--  kernel/sched/fair.c       81
-rw-r--r--  kernel/sched/features.h   10
-rw-r--r--  kernel/sched/rt.c          5
-rw-r--r--  kernel/sched/sched.h      69
-rw-r--r--  kernel/softirq.c           6
-rw-r--r--  kernel/sysctl.c            6
-rw-r--r--  kernel/time/tick-sched.c   3
11 files changed, 686 insertions(+), 705 deletions(-)
diff --git a/kernel/fork.c b/kernel/fork.c
index 2343c9eaaaf4..5a0e74d89a5a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1276,11 +1276,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1276#endif 1276#endif
1277#ifdef CONFIG_TRACE_IRQFLAGS 1277#ifdef CONFIG_TRACE_IRQFLAGS
1278 p->irq_events = 0; 1278 p->irq_events = 0;
1279#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1280 p->hardirqs_enabled = 1;
1281#else
1282 p->hardirqs_enabled = 0; 1279 p->hardirqs_enabled = 0;
1283#endif
1284 p->hardirq_enable_ip = 0; 1280 p->hardirq_enable_ip = 0;
1285 p->hardirq_enable_event = 0; 1281 p->hardirq_enable_event = 0;
1286 p->hardirq_disable_ip = _THIS_IP_; 1282 p->hardirq_disable_ip = _THIS_IP_;
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 173ea52f3af0..f06d249e103b 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer 11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif 12endif
13 13
14obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o 14obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
15obj-$(CONFIG_SMP) += cpupri.o 15obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o 17obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3c4dec0594d6..c17747236438 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -740,126 +740,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
740 dequeue_task(rq, p, flags); 740 dequeue_task(rq, p, flags);
741} 741}
742 742
743#ifdef CONFIG_IRQ_TIME_ACCOUNTING
744
745/*
746 * There are no locks covering percpu hardirq/softirq time.
747 * They are only modified in account_system_vtime, on corresponding CPU
748 * with interrupts disabled. So, writes are safe.
749 * They are read and saved off onto struct rq in update_rq_clock().
750 * This may result in other CPU reading this CPU's irq time and can
751 * race with irq/account_system_vtime on this CPU. We would either get old
752 * or new value with a side effect of accounting a slice of irq time to wrong
753 * task when irq is in progress while we read rq->clock. That is a worthy
754 * compromise in place of having locks on each irq in account_system_time.
755 */
756static DEFINE_PER_CPU(u64, cpu_hardirq_time);
757static DEFINE_PER_CPU(u64, cpu_softirq_time);
758
759static DEFINE_PER_CPU(u64, irq_start_time);
760static int sched_clock_irqtime;
761
762void enable_sched_clock_irqtime(void)
763{
764 sched_clock_irqtime = 1;
765}
766
767void disable_sched_clock_irqtime(void)
768{
769 sched_clock_irqtime = 0;
770}
771
772#ifndef CONFIG_64BIT
773static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
774
775static inline void irq_time_write_begin(void)
776{
777 __this_cpu_inc(irq_time_seq.sequence);
778 smp_wmb();
779}
780
781static inline void irq_time_write_end(void)
782{
783 smp_wmb();
784 __this_cpu_inc(irq_time_seq.sequence);
785}
786
787static inline u64 irq_time_read(int cpu)
788{
789 u64 irq_time;
790 unsigned seq;
791
792 do {
793 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
794 irq_time = per_cpu(cpu_softirq_time, cpu) +
795 per_cpu(cpu_hardirq_time, cpu);
796 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
797
798 return irq_time;
799}
800#else /* CONFIG_64BIT */
801static inline void irq_time_write_begin(void)
802{
803}
804
805static inline void irq_time_write_end(void)
806{
807}
808
809static inline u64 irq_time_read(int cpu)
810{
811 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
812}
813#endif /* CONFIG_64BIT */
814
815/*
816 * Called before incrementing preempt_count on {soft,}irq_enter
817 * and before decrementing preempt_count on {soft,}irq_exit.
818 */
819void account_system_vtime(struct task_struct *curr)
820{
821 unsigned long flags;
822 s64 delta;
823 int cpu;
824
825 if (!sched_clock_irqtime)
826 return;
827
828 local_irq_save(flags);
829
830 cpu = smp_processor_id();
831 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
832 __this_cpu_add(irq_start_time, delta);
833
834 irq_time_write_begin();
835 /*
836 * We do not account for softirq time from ksoftirqd here.
837 * We want to continue accounting softirq time to ksoftirqd thread
838 * in that case, so as not to confuse scheduler with a special task
839 * that do not consume any time, but still wants to run.
840 */
841 if (hardirq_count())
842 __this_cpu_add(cpu_hardirq_time, delta);
843 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
844 __this_cpu_add(cpu_softirq_time, delta);
845
846 irq_time_write_end();
847 local_irq_restore(flags);
848}
849EXPORT_SYMBOL_GPL(account_system_vtime);
850
851#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
852
853#ifdef CONFIG_PARAVIRT
854static inline u64 steal_ticks(u64 steal)
855{
856 if (unlikely(steal > NSEC_PER_SEC))
857 return div_u64(steal, TICK_NSEC);
858
859 return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
860}
861#endif
862
863static void update_rq_clock_task(struct rq *rq, s64 delta) 743static void update_rq_clock_task(struct rq *rq, s64 delta)
864{ 744{
865/* 745/*
@@ -920,43 +800,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
920#endif 800#endif
921} 801}
922 802
923#ifdef CONFIG_IRQ_TIME_ACCOUNTING
924static int irqtime_account_hi_update(void)
925{
926 u64 *cpustat = kcpustat_this_cpu->cpustat;
927 unsigned long flags;
928 u64 latest_ns;
929 int ret = 0;
930
931 local_irq_save(flags);
932 latest_ns = this_cpu_read(cpu_hardirq_time);
933 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
934 ret = 1;
935 local_irq_restore(flags);
936 return ret;
937}
938
939static int irqtime_account_si_update(void)
940{
941 u64 *cpustat = kcpustat_this_cpu->cpustat;
942 unsigned long flags;
943 u64 latest_ns;
944 int ret = 0;
945
946 local_irq_save(flags);
947 latest_ns = this_cpu_read(cpu_softirq_time);
948 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
949 ret = 1;
950 local_irq_restore(flags);
951 return ret;
952}
953
954#else /* CONFIG_IRQ_TIME_ACCOUNTING */
955
956#define sched_clock_irqtime (0)
957
958#endif
959
960void sched_set_stop_task(int cpu, struct task_struct *stop) 803void sched_set_stop_task(int cpu, struct task_struct *stop)
961{ 804{
962 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 805 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -1518,25 +1361,6 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu)
1518 smp_send_reschedule(cpu); 1361 smp_send_reschedule(cpu);
1519} 1362}
1520 1363
1521#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1522static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
1523{
1524 struct rq *rq;
1525 int ret = 0;
1526
1527 rq = __task_rq_lock(p);
1528 if (p->on_cpu) {
1529 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1530 ttwu_do_wakeup(rq, p, wake_flags);
1531 ret = 1;
1532 }
1533 __task_rq_unlock(rq);
1534
1535 return ret;
1536
1537}
1538#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1539
1540bool cpus_share_cache(int this_cpu, int that_cpu) 1364bool cpus_share_cache(int this_cpu, int that_cpu)
1541{ 1365{
1542 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 1366 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
@@ -1597,21 +1421,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1597 * If the owning (remote) cpu is still in the middle of schedule() with 1421 * If the owning (remote) cpu is still in the middle of schedule() with
1598 * this task as prev, wait until its done referencing the task. 1422 * this task as prev, wait until its done referencing the task.
1599 */ 1423 */
1600 while (p->on_cpu) { 1424 while (p->on_cpu)
1601#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1602 /*
1603 * In case the architecture enables interrupts in
1604 * context_switch(), we cannot busy wait, since that
1605 * would lead to deadlocks when an interrupt hits and
1606 * tries to wake up @prev. So bail and do a complete
1607 * remote wakeup.
1608 */
1609 if (ttwu_activate_remote(p, wake_flags))
1610 goto stat;
1611#else
1612 cpu_relax(); 1425 cpu_relax();
1613#endif
1614 }
1615 /* 1426 /*
1616 * Pairs with the smp_wmb() in finish_lock_switch(). 1427 * Pairs with the smp_wmb() in finish_lock_switch().
1617 */ 1428 */
@@ -1953,14 +1764,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1953 * Manfred Spraul <manfred@colorfullife.com> 1764 * Manfred Spraul <manfred@colorfullife.com>
1954 */ 1765 */
1955 prev_state = prev->state; 1766 prev_state = prev->state;
1767 vtime_task_switch(prev);
1956 finish_arch_switch(prev); 1768 finish_arch_switch(prev);
1957#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1958 local_irq_disable();
1959#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1960 perf_event_task_sched_in(prev, current); 1769 perf_event_task_sched_in(prev, current);
1961#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1962 local_irq_enable();
1963#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1964 finish_lock_switch(rq, prev); 1770 finish_lock_switch(rq, prev);
1965 finish_arch_post_lock_switch(); 1771 finish_arch_post_lock_switch();
1966 1772
@@ -2810,404 +2616,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
2810 return ns; 2616 return ns;
2811} 2617}
2812 2618
2813#ifdef CONFIG_CGROUP_CPUACCT
2814struct cgroup_subsys cpuacct_subsys;
2815struct cpuacct root_cpuacct;
2816#endif
2817
2818static inline void task_group_account_field(struct task_struct *p, int index,
2819 u64 tmp)
2820{
2821#ifdef CONFIG_CGROUP_CPUACCT
2822 struct kernel_cpustat *kcpustat;
2823 struct cpuacct *ca;
2824#endif
2825 /*
2826 * Since all updates are sure to touch the root cgroup, we
2827 * get ourselves ahead and touch it first. If the root cgroup
2828 * is the only cgroup, then nothing else should be necessary.
2829 *
2830 */
2831 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
2832
2833#ifdef CONFIG_CGROUP_CPUACCT
2834 if (unlikely(!cpuacct_subsys.active))
2835 return;
2836
2837 rcu_read_lock();
2838 ca = task_ca(p);
2839 while (ca && (ca != &root_cpuacct)) {
2840 kcpustat = this_cpu_ptr(ca->cpustat);
2841 kcpustat->cpustat[index] += tmp;
2842 ca = parent_ca(ca);
2843 }
2844 rcu_read_unlock();
2845#endif
2846}
2847
2848
2849/*
2850 * Account user cpu time to a process.
2851 * @p: the process that the cpu time gets accounted to
2852 * @cputime: the cpu time spent in user space since the last update
2853 * @cputime_scaled: cputime scaled by cpu frequency
2854 */
2855void account_user_time(struct task_struct *p, cputime_t cputime,
2856 cputime_t cputime_scaled)
2857{
2858 int index;
2859
2860 /* Add user time to process. */
2861 p->utime += cputime;
2862 p->utimescaled += cputime_scaled;
2863 account_group_user_time(p, cputime);
2864
2865 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
2866
2867 /* Add user time to cpustat. */
2868 task_group_account_field(p, index, (__force u64) cputime);
2869
2870 /* Account for user time used */
2871 acct_update_integrals(p);
2872}
2873
2874/*
2875 * Account guest cpu time to a process.
2876 * @p: the process that the cpu time gets accounted to
2877 * @cputime: the cpu time spent in virtual machine since the last update
2878 * @cputime_scaled: cputime scaled by cpu frequency
2879 */
2880static void account_guest_time(struct task_struct *p, cputime_t cputime,
2881 cputime_t cputime_scaled)
2882{
2883 u64 *cpustat = kcpustat_this_cpu->cpustat;
2884
2885 /* Add guest time to process. */
2886 p->utime += cputime;
2887 p->utimescaled += cputime_scaled;
2888 account_group_user_time(p, cputime);
2889 p->gtime += cputime;
2890
2891 /* Add guest time to cpustat. */
2892 if (TASK_NICE(p) > 0) {
2893 cpustat[CPUTIME_NICE] += (__force u64) cputime;
2894 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
2895 } else {
2896 cpustat[CPUTIME_USER] += (__force u64) cputime;
2897 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
2898 }
2899}
2900
2901/*
2902 * Account system cpu time to a process and desired cpustat field
2903 * @p: the process that the cpu time gets accounted to
2904 * @cputime: the cpu time spent in kernel space since the last update
2905 * @cputime_scaled: cputime scaled by cpu frequency
2906 * @target_cputime64: pointer to cpustat field that has to be updated
2907 */
2908static inline
2909void __account_system_time(struct task_struct *p, cputime_t cputime,
2910 cputime_t cputime_scaled, int index)
2911{
2912 /* Add system time to process. */
2913 p->stime += cputime;
2914 p->stimescaled += cputime_scaled;
2915 account_group_system_time(p, cputime);
2916
2917 /* Add system time to cpustat. */
2918 task_group_account_field(p, index, (__force u64) cputime);
2919
2920 /* Account for system time used */
2921 acct_update_integrals(p);
2922}
2923
2924/*
2925 * Account system cpu time to a process.
2926 * @p: the process that the cpu time gets accounted to
2927 * @hardirq_offset: the offset to subtract from hardirq_count()
2928 * @cputime: the cpu time spent in kernel space since the last update
2929 * @cputime_scaled: cputime scaled by cpu frequency
2930 */
2931void account_system_time(struct task_struct *p, int hardirq_offset,
2932 cputime_t cputime, cputime_t cputime_scaled)
2933{
2934 int index;
2935
2936 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
2937 account_guest_time(p, cputime, cputime_scaled);
2938 return;
2939 }
2940
2941 if (hardirq_count() - hardirq_offset)
2942 index = CPUTIME_IRQ;
2943 else if (in_serving_softirq())
2944 index = CPUTIME_SOFTIRQ;
2945 else
2946 index = CPUTIME_SYSTEM;
2947
2948 __account_system_time(p, cputime, cputime_scaled, index);
2949}
2950
2951/*
2952 * Account for involuntary wait time.
2953 * @cputime: the cpu time spent in involuntary wait
2954 */
2955void account_steal_time(cputime_t cputime)
2956{
2957 u64 *cpustat = kcpustat_this_cpu->cpustat;
2958
2959 cpustat[CPUTIME_STEAL] += (__force u64) cputime;
2960}
2961
2962/*
2963 * Account for idle time.
2964 * @cputime: the cpu time spent in idle wait
2965 */
2966void account_idle_time(cputime_t cputime)
2967{
2968 u64 *cpustat = kcpustat_this_cpu->cpustat;
2969 struct rq *rq = this_rq();
2970
2971 if (atomic_read(&rq->nr_iowait) > 0)
2972 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
2973 else
2974 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
2975}
2976
2977static __always_inline bool steal_account_process_tick(void)
2978{
2979#ifdef CONFIG_PARAVIRT
2980 if (static_key_false(&paravirt_steal_enabled)) {
2981 u64 steal, st = 0;
2982
2983 steal = paravirt_steal_clock(smp_processor_id());
2984 steal -= this_rq()->prev_steal_time;
2985
2986 st = steal_ticks(steal);
2987 this_rq()->prev_steal_time += st * TICK_NSEC;
2988
2989 account_steal_time(st);
2990 return st;
2991 }
2992#endif
2993 return false;
2994}
2995
2996#ifndef CONFIG_VIRT_CPU_ACCOUNTING
2997
2998#ifdef CONFIG_IRQ_TIME_ACCOUNTING
2999/*
3000 * Account a tick to a process and cpustat
3001 * @p: the process that the cpu time gets accounted to
3002 * @user_tick: is the tick from userspace
3003 * @rq: the pointer to rq
3004 *
3005 * Tick demultiplexing follows the order
3006 * - pending hardirq update
3007 * - pending softirq update
3008 * - user_time
3009 * - idle_time
3010 * - system time
3011 * - check for guest_time
3012 * - else account as system_time
3013 *
3014 * Check for hardirq is done both for system and user time as there is
3015 * no timer going off while we are on hardirq and hence we may never get an
3016 * opportunity to update it solely in system time.
3017 * p->stime and friends are only updated on system time and not on irq
3018 * softirq as those do not count in task exec_runtime any more.
3019 */
3020static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3021 struct rq *rq)
3022{
3023 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3024 u64 *cpustat = kcpustat_this_cpu->cpustat;
3025
3026 if (steal_account_process_tick())
3027 return;
3028
3029 if (irqtime_account_hi_update()) {
3030 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
3031 } else if (irqtime_account_si_update()) {
3032 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
3033 } else if (this_cpu_ksoftirqd() == p) {
3034 /*
3035 * ksoftirqd time do not get accounted in cpu_softirq_time.
3036 * So, we have to handle it separately here.
3037 * Also, p->stime needs to be updated for ksoftirqd.
3038 */
3039 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3040 CPUTIME_SOFTIRQ);
3041 } else if (user_tick) {
3042 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3043 } else if (p == rq->idle) {
3044 account_idle_time(cputime_one_jiffy);
3045 } else if (p->flags & PF_VCPU) { /* System time or guest time */
3046 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
3047 } else {
3048 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3049 CPUTIME_SYSTEM);
3050 }
3051}
3052
3053static void irqtime_account_idle_ticks(int ticks)
3054{
3055 int i;
3056 struct rq *rq = this_rq();
3057
3058 for (i = 0; i < ticks; i++)
3059 irqtime_account_process_tick(current, 0, rq);
3060}
3061#else /* CONFIG_IRQ_TIME_ACCOUNTING */
3062static void irqtime_account_idle_ticks(int ticks) {}
3063static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3064 struct rq *rq) {}
3065#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
3066
3067/*
3068 * Account a single tick of cpu time.
3069 * @p: the process that the cpu time gets accounted to
3070 * @user_tick: indicates if the tick is a user or a system tick
3071 */
3072void account_process_tick(struct task_struct *p, int user_tick)
3073{
3074 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3075 struct rq *rq = this_rq();
3076
3077 if (sched_clock_irqtime) {
3078 irqtime_account_process_tick(p, user_tick, rq);
3079 return;
3080 }
3081
3082 if (steal_account_process_tick())
3083 return;
3084
3085 if (user_tick)
3086 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3087 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
3088 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
3089 one_jiffy_scaled);
3090 else
3091 account_idle_time(cputime_one_jiffy);
3092}
3093
3094/*
3095 * Account multiple ticks of steal time.
3096 * @p: the process from which the cpu time has been stolen
3097 * @ticks: number of stolen ticks
3098 */
3099void account_steal_ticks(unsigned long ticks)
3100{
3101 account_steal_time(jiffies_to_cputime(ticks));
3102}
3103
3104/*
3105 * Account multiple ticks of idle time.
3106 * @ticks: number of stolen ticks
3107 */
3108void account_idle_ticks(unsigned long ticks)
3109{
3110
3111 if (sched_clock_irqtime) {
3112 irqtime_account_idle_ticks(ticks);
3113 return;
3114 }
3115
3116 account_idle_time(jiffies_to_cputime(ticks));
3117}
3118
3119#endif
3120
3121/*
3122 * Use precise platform statistics if available:
3123 */
3124#ifdef CONFIG_VIRT_CPU_ACCOUNTING
3125void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3126{
3127 *ut = p->utime;
3128 *st = p->stime;
3129}
3130
3131void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3132{
3133 struct task_cputime cputime;
3134
3135 thread_group_cputime(p, &cputime);
3136
3137 *ut = cputime.utime;
3138 *st = cputime.stime;
3139}
3140#else
3141
3142#ifndef nsecs_to_cputime
3143# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
3144#endif
3145
3146static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
3147{
3148 u64 temp = (__force u64) rtime;
3149
3150 temp *= (__force u64) utime;
3151
3152 if (sizeof(cputime_t) == 4)
3153 temp = div_u64(temp, (__force u32) total);
3154 else
3155 temp = div64_u64(temp, (__force u64) total);
3156
3157 return (__force cputime_t) temp;
3158}
3159
3160void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3161{
3162 cputime_t rtime, utime = p->utime, total = utime + p->stime;
3163
3164 /*
3165 * Use CFS's precise accounting:
3166 */
3167 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3168
3169 if (total)
3170 utime = scale_utime(utime, rtime, total);
3171 else
3172 utime = rtime;
3173
3174 /*
3175 * Compare with previous values, to keep monotonicity:
3176 */
3177 p->prev_utime = max(p->prev_utime, utime);
3178 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
3179
3180 *ut = p->prev_utime;
3181 *st = p->prev_stime;
3182}
3183
3184/*
3185 * Must be called with siglock held.
3186 */
3187void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3188{
3189 struct signal_struct *sig = p->signal;
3190 struct task_cputime cputime;
3191 cputime_t rtime, utime, total;
3192
3193 thread_group_cputime(p, &cputime);
3194
3195 total = cputime.utime + cputime.stime;
3196 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3197
3198 if (total)
3199 utime = scale_utime(cputime.utime, rtime, total);
3200 else
3201 utime = rtime;
3202
3203 sig->prev_utime = max(sig->prev_utime, utime);
3204 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
3205
3206 *ut = sig->prev_utime;
3207 *st = sig->prev_stime;
3208}
3209#endif
3210
3211/* 2619/*
3212 * This function gets called by the timer code, with HZ frequency. 2620 * This function gets called by the timer code, with HZ frequency.
3213 * We call it with interrupts disabled. 2621 * We call it with interrupts disabled.
@@ -3368,6 +2776,40 @@ pick_next_task(struct rq *rq)
3368 2776
3369/* 2777/*
3370 * __schedule() is the main scheduler function. 2778 * __schedule() is the main scheduler function.
2779 *
2780 * The main means of driving the scheduler and thus entering this function are:
2781 *
2782 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
2783 *
2784 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
2785 * paths. For example, see arch/x86/entry_64.S.
2786 *
2787 * To drive preemption between tasks, the scheduler sets the flag in timer
2788 * interrupt handler scheduler_tick().
2789 *
2790 * 3. Wakeups don't really cause entry into schedule(). They add a
2791 * task to the run-queue and that's it.
2792 *
2793 * Now, if the new task added to the run-queue preempts the current
2794 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
2795 * called on the nearest possible occasion:
2796 *
2797 * - If the kernel is preemptible (CONFIG_PREEMPT=y):
2798 *
2799 * - in syscall or exception context, at the next outmost
2800 * preempt_enable(). (this might be as soon as the wake_up()'s
2801 * spin_unlock()!)
2802 *
2803 * - in IRQ context, return from interrupt-handler to
2804 * preemptible context
2805 *
2806 * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
2807 * then at the next:
2808 *
2809 * - cond_resched() call
2810 * - explicit schedule() call
2811 * - return from syscall or exception to user-space
2812 * - return from interrupt-handler to user-space
3371 */ 2813 */
3372static void __sched __schedule(void) 2814static void __sched __schedule(void)
3373{ 2815{
@@ -4885,13 +4327,6 @@ again:
4885 */ 4327 */
4886 if (preempt && rq != p_rq) 4328 if (preempt && rq != p_rq)
4887 resched_task(p_rq->curr); 4329 resched_task(p_rq->curr);
4888 } else {
4889 /*
4890 * We might have set it in task_yield_fair(), but are
4891 * not going to schedule(), so don't want to skip
4892 * the next update.
4893 */
4894 rq->skip_clock_update = 0;
4895 } 4330 }
4896 4331
4897out: 4332out:
@@ -5433,16 +4868,25 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
5433 *tablep = NULL; 4868 *tablep = NULL;
5434} 4869}
5435 4870
4871static int min_load_idx = 0;
4872static int max_load_idx = CPU_LOAD_IDX_MAX;
4873
5436static void 4874static void
5437set_table_entry(struct ctl_table *entry, 4875set_table_entry(struct ctl_table *entry,
5438 const char *procname, void *data, int maxlen, 4876 const char *procname, void *data, int maxlen,
5439 umode_t mode, proc_handler *proc_handler) 4877 umode_t mode, proc_handler *proc_handler,
4878 bool load_idx)
5440{ 4879{
5441 entry->procname = procname; 4880 entry->procname = procname;
5442 entry->data = data; 4881 entry->data = data;
5443 entry->maxlen = maxlen; 4882 entry->maxlen = maxlen;
5444 entry->mode = mode; 4883 entry->mode = mode;
5445 entry->proc_handler = proc_handler; 4884 entry->proc_handler = proc_handler;
4885
4886 if (load_idx) {
4887 entry->extra1 = &min_load_idx;
4888 entry->extra2 = &max_load_idx;
4889 }
5446} 4890}
5447 4891
5448static struct ctl_table * 4892static struct ctl_table *
@@ -5454,30 +4898,30 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
5454 return NULL; 4898 return NULL;
5455 4899
5456 set_table_entry(&table[0], "min_interval", &sd->min_interval, 4900 set_table_entry(&table[0], "min_interval", &sd->min_interval,
5457 sizeof(long), 0644, proc_doulongvec_minmax); 4901 sizeof(long), 0644, proc_doulongvec_minmax, false);
5458 set_table_entry(&table[1], "max_interval", &sd->max_interval, 4902 set_table_entry(&table[1], "max_interval", &sd->max_interval,
5459 sizeof(long), 0644, proc_doulongvec_minmax); 4903 sizeof(long), 0644, proc_doulongvec_minmax, false);
5460 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 4904 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
5461 sizeof(int), 0644, proc_dointvec_minmax); 4905 sizeof(int), 0644, proc_dointvec_minmax, true);
5462 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 4906 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
5463 sizeof(int), 0644, proc_dointvec_minmax); 4907 sizeof(int), 0644, proc_dointvec_minmax, true);
5464 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 4908 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
5465 sizeof(int), 0644, proc_dointvec_minmax); 4909 sizeof(int), 0644, proc_dointvec_minmax, true);
5466 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 4910 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
5467 sizeof(int), 0644, proc_dointvec_minmax); 4911 sizeof(int), 0644, proc_dointvec_minmax, true);
5468 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 4912 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
5469 sizeof(int), 0644, proc_dointvec_minmax); 4913 sizeof(int), 0644, proc_dointvec_minmax, true);
5470 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 4914 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
5471 sizeof(int), 0644, proc_dointvec_minmax); 4915 sizeof(int), 0644, proc_dointvec_minmax, false);
5472 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 4916 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5473 sizeof(int), 0644, proc_dointvec_minmax); 4917 sizeof(int), 0644, proc_dointvec_minmax, false);
5474 set_table_entry(&table[9], "cache_nice_tries", 4918 set_table_entry(&table[9], "cache_nice_tries",
5475 &sd->cache_nice_tries, 4919 &sd->cache_nice_tries,
5476 sizeof(int), 0644, proc_dointvec_minmax); 4920 sizeof(int), 0644, proc_dointvec_minmax, false);
5477 set_table_entry(&table[10], "flags", &sd->flags, 4921 set_table_entry(&table[10], "flags", &sd->flags,
5478 sizeof(int), 0644, proc_dointvec_minmax); 4922 sizeof(int), 0644, proc_dointvec_minmax, false);
5479 set_table_entry(&table[11], "name", sd->name, 4923 set_table_entry(&table[11], "name", sd->name,
5480 CORENAME_MAX_SIZE, 0444, proc_dostring); 4924 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
5481 /* &table[12] is terminator */ 4925 /* &table[12] is terminator */
5482 4926
5483 return table; 4927 return table;
@@ -6556,7 +6000,6 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
6556 | 0*SD_BALANCE_FORK 6000 | 0*SD_BALANCE_FORK
6557 | 0*SD_BALANCE_WAKE 6001 | 0*SD_BALANCE_WAKE
6558 | 0*SD_WAKE_AFFINE 6002 | 0*SD_WAKE_AFFINE
6559 | 0*SD_PREFER_LOCAL
6560 | 0*SD_SHARE_CPUPOWER 6003 | 0*SD_SHARE_CPUPOWER
6561 | 0*SD_SHARE_PKG_RESOURCES 6004 | 0*SD_SHARE_PKG_RESOURCES
6562 | 1*SD_SERIALIZE 6005 | 1*SD_SERIALIZE
@@ -8354,6 +7797,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8354 * (balbir@in.ibm.com). 7797 * (balbir@in.ibm.com).
8355 */ 7798 */
8356 7799
7800struct cpuacct root_cpuacct;
7801
8357/* create a new cpu accounting group */ 7802/* create a new cpu accounting group */
8358static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) 7803static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
8359{ 7804{
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
new file mode 100644
index 000000000000..81b763ba58a6
--- /dev/null
+++ b/kernel/sched/cputime.c
@@ -0,0 +1,530 @@
1#include <linux/export.h>
2#include <linux/sched.h>
3#include <linux/tsacct_kern.h>
4#include <linux/kernel_stat.h>
5#include <linux/static_key.h>
6#include "sched.h"
7
8
9#ifdef CONFIG_IRQ_TIME_ACCOUNTING
10
11/*
12 * There are no locks covering percpu hardirq/softirq time.
13 * They are only modified in vtime_account, on corresponding CPU
14 * with interrupts disabled. So, writes are safe.
15 * They are read and saved off onto struct rq in update_rq_clock().
16 * This may result in other CPU reading this CPU's irq time and can
17 * race with irq/vtime_account on this CPU. We would either get old
18 * or new value with a side effect of accounting a slice of irq time to wrong
19 * task when irq is in progress while we read rq->clock. That is a worthy
20 * compromise in place of having locks on each irq in account_system_time.
21 */
22DEFINE_PER_CPU(u64, cpu_hardirq_time);
23DEFINE_PER_CPU(u64, cpu_softirq_time);
24
25static DEFINE_PER_CPU(u64, irq_start_time);
26static int sched_clock_irqtime;
27
28void enable_sched_clock_irqtime(void)
29{
30 sched_clock_irqtime = 1;
31}
32
33void disable_sched_clock_irqtime(void)
34{
35 sched_clock_irqtime = 0;
36}
37
38#ifndef CONFIG_64BIT
39DEFINE_PER_CPU(seqcount_t, irq_time_seq);
40#endif /* CONFIG_64BIT */
41
42/*
43 * Called before incrementing preempt_count on {soft,}irq_enter
44 * and before decrementing preempt_count on {soft,}irq_exit.
45 */
46void vtime_account(struct task_struct *curr)
47{
48 unsigned long flags;
49 s64 delta;
50 int cpu;
51
52 if (!sched_clock_irqtime)
53 return;
54
55 local_irq_save(flags);
56
57 cpu = smp_processor_id();
58 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
59 __this_cpu_add(irq_start_time, delta);
60
61 irq_time_write_begin();
62 /*
63 * We do not account for softirq time from ksoftirqd here.
64 * We want to continue accounting softirq time to ksoftirqd thread
65 * in that case, so as not to confuse scheduler with a special task
66 * that do not consume any time, but still wants to run.
67 */
68 if (hardirq_count())
69 __this_cpu_add(cpu_hardirq_time, delta);
70 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
71 __this_cpu_add(cpu_softirq_time, delta);
72
73 irq_time_write_end();
74 local_irq_restore(flags);
75}
76EXPORT_SYMBOL_GPL(vtime_account);
77
78static int irqtime_account_hi_update(void)
79{
80 u64 *cpustat = kcpustat_this_cpu->cpustat;
81 unsigned long flags;
82 u64 latest_ns;
83 int ret = 0;
84
85 local_irq_save(flags);
86 latest_ns = this_cpu_read(cpu_hardirq_time);
87 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
88 ret = 1;
89 local_irq_restore(flags);
90 return ret;
91}
92
93static int irqtime_account_si_update(void)
94{
95 u64 *cpustat = kcpustat_this_cpu->cpustat;
96 unsigned long flags;
97 u64 latest_ns;
98 int ret = 0;
99
100 local_irq_save(flags);
101 latest_ns = this_cpu_read(cpu_softirq_time);
102 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
103 ret = 1;
104 local_irq_restore(flags);
105 return ret;
106}
107
108#else /* CONFIG_IRQ_TIME_ACCOUNTING */
109
110#define sched_clock_irqtime (0)
111
112#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
113
114static inline void task_group_account_field(struct task_struct *p, int index,
115 u64 tmp)
116{
117#ifdef CONFIG_CGROUP_CPUACCT
118 struct kernel_cpustat *kcpustat;
119 struct cpuacct *ca;
120#endif
121 /*
122 * Since all updates are sure to touch the root cgroup, we
123 * get ourselves ahead and touch it first. If the root cgroup
124 * is the only cgroup, then nothing else should be necessary.
125 *
126 */
127 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
128
129#ifdef CONFIG_CGROUP_CPUACCT
130 if (unlikely(!cpuacct_subsys.active))
131 return;
132
133 rcu_read_lock();
134 ca = task_ca(p);
135 while (ca && (ca != &root_cpuacct)) {
136 kcpustat = this_cpu_ptr(ca->cpustat);
137 kcpustat->cpustat[index] += tmp;
138 ca = parent_ca(ca);
139 }
140 rcu_read_unlock();
141#endif
142}
143
144/*
145 * Account user cpu time to a process.
146 * @p: the process that the cpu time gets accounted to
147 * @cputime: the cpu time spent in user space since the last update
148 * @cputime_scaled: cputime scaled by cpu frequency
149 */
150void account_user_time(struct task_struct *p, cputime_t cputime,
151 cputime_t cputime_scaled)
152{
153 int index;
154
155 /* Add user time to process. */
156 p->utime += cputime;
157 p->utimescaled += cputime_scaled;
158 account_group_user_time(p, cputime);
159
160 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
161
162 /* Add user time to cpustat. */
163 task_group_account_field(p, index, (__force u64) cputime);
164
165 /* Account for user time used */
166 acct_update_integrals(p);
167}
168
169/*
170 * Account guest cpu time to a process.
171 * @p: the process that the cpu time gets accounted to
172 * @cputime: the cpu time spent in virtual machine since the last update
173 * @cputime_scaled: cputime scaled by cpu frequency
174 */
175static void account_guest_time(struct task_struct *p, cputime_t cputime,
176 cputime_t cputime_scaled)
177{
178 u64 *cpustat = kcpustat_this_cpu->cpustat;
179
180 /* Add guest time to process. */
181 p->utime += cputime;
182 p->utimescaled += cputime_scaled;
183 account_group_user_time(p, cputime);
184 p->gtime += cputime;
185
186 /* Add guest time to cpustat. */
187 if (TASK_NICE(p) > 0) {
188 cpustat[CPUTIME_NICE] += (__force u64) cputime;
189 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
190 } else {
191 cpustat[CPUTIME_USER] += (__force u64) cputime;
192 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
193 }
194}
195
196/*
197 * Account system cpu time to a process and desired cpustat field
198 * @p: the process that the cpu time gets accounted to
199 * @cputime: the cpu time spent in kernel space since the last update
200 * @cputime_scaled: cputime scaled by cpu frequency
201 * @target_cputime64: pointer to cpustat field that has to be updated
202 */
203static inline
204void __account_system_time(struct task_struct *p, cputime_t cputime,
205 cputime_t cputime_scaled, int index)
206{
207 /* Add system time to process. */
208 p->stime += cputime;
209 p->stimescaled += cputime_scaled;
210 account_group_system_time(p, cputime);
211
212 /* Add system time to cpustat. */
213 task_group_account_field(p, index, (__force u64) cputime);
214
215 /* Account for system time used */
216 acct_update_integrals(p);
217}
218
219/*
220 * Account system cpu time to a process.
221 * @p: the process that the cpu time gets accounted to
222 * @hardirq_offset: the offset to subtract from hardirq_count()
223 * @cputime: the cpu time spent in kernel space since the last update
224 * @cputime_scaled: cputime scaled by cpu frequency
225 */
226void account_system_time(struct task_struct *p, int hardirq_offset,
227 cputime_t cputime, cputime_t cputime_scaled)
228{
229 int index;
230
231 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
232 account_guest_time(p, cputime, cputime_scaled);
233 return;
234 }
235
236 if (hardirq_count() - hardirq_offset)
237 index = CPUTIME_IRQ;
238 else if (in_serving_softirq())
239 index = CPUTIME_SOFTIRQ;
240 else
241 index = CPUTIME_SYSTEM;
242
243 __account_system_time(p, cputime, cputime_scaled, index);
244}
245
246/*
247 * Account for involuntary wait time.
248 * @cputime: the cpu time spent in involuntary wait
249 */
250void account_steal_time(cputime_t cputime)
251{
252 u64 *cpustat = kcpustat_this_cpu->cpustat;
253
254 cpustat[CPUTIME_STEAL] += (__force u64) cputime;
255}
256
257/*
258 * Account for idle time.
259 * @cputime: the cpu time spent in idle wait
260 */
261void account_idle_time(cputime_t cputime)
262{
263 u64 *cpustat = kcpustat_this_cpu->cpustat;
264 struct rq *rq = this_rq();
265
266 if (atomic_read(&rq->nr_iowait) > 0)
267 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
268 else
269 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
270}
271
272static __always_inline bool steal_account_process_tick(void)
273{
274#ifdef CONFIG_PARAVIRT
275 if (static_key_false(&paravirt_steal_enabled)) {
276 u64 steal, st = 0;
277
278 steal = paravirt_steal_clock(smp_processor_id());
279 steal -= this_rq()->prev_steal_time;
280
281 st = steal_ticks(steal);
282 this_rq()->prev_steal_time += st * TICK_NSEC;
283
284 account_steal_time(st);
285 return st;
286 }
287#endif
288 return false;
289}
290
291#ifndef CONFIG_VIRT_CPU_ACCOUNTING
292
293#ifdef CONFIG_IRQ_TIME_ACCOUNTING
294/*
295 * Account a tick to a process and cpustat
296 * @p: the process that the cpu time gets accounted to
297 * @user_tick: is the tick from userspace
298 * @rq: the pointer to rq
299 *
300 * Tick demultiplexing follows the order
301 * - pending hardirq update
302 * - pending softirq update
303 * - user_time
304 * - idle_time
305 * - system time
306 * - check for guest_time
307 * - else account as system_time
308 *
309 * Check for hardirq is done both for system and user time as there is
310 * no timer going off while we are on hardirq and hence we may never get an
311 * opportunity to update it solely in system time.
312 * p->stime and friends are only updated on system time and not on irq
313 * softirq as those do not count in task exec_runtime any more.
314 */
315static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
316 struct rq *rq)
317{
318 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
319 u64 *cpustat = kcpustat_this_cpu->cpustat;
320
321 if (steal_account_process_tick())
322 return;
323
324 if (irqtime_account_hi_update()) {
325 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
326 } else if (irqtime_account_si_update()) {
327 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
328 } else if (this_cpu_ksoftirqd() == p) {
329 /*
330 * ksoftirqd time do not get accounted in cpu_softirq_time.
331 * So, we have to handle it separately here.
332 * Also, p->stime needs to be updated for ksoftirqd.
333 */
334 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
335 CPUTIME_SOFTIRQ);
336 } else if (user_tick) {
337 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
338 } else if (p == rq->idle) {
339 account_idle_time(cputime_one_jiffy);
340 } else if (p->flags & PF_VCPU) { /* System time or guest time */
341 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
342 } else {
343 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
344 CPUTIME_SYSTEM);
345 }
346}
347
348static void irqtime_account_idle_ticks(int ticks)
349{
350 int i;
351 struct rq *rq = this_rq();
352
353 for (i = 0; i < ticks; i++)
354 irqtime_account_process_tick(current, 0, rq);
355}
356#else /* CONFIG_IRQ_TIME_ACCOUNTING */
357static void irqtime_account_idle_ticks(int ticks) {}
358static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
359 struct rq *rq) {}
360#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
361
362/*
363 * Account a single tick of cpu time.
364 * @p: the process that the cpu time gets accounted to
365 * @user_tick: indicates if the tick is a user or a system tick
366 */
367void account_process_tick(struct task_struct *p, int user_tick)
368{
369 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
370 struct rq *rq = this_rq();
371
372 if (sched_clock_irqtime) {
373 irqtime_account_process_tick(p, user_tick, rq);
374 return;
375 }
376
377 if (steal_account_process_tick())
378 return;
379
380 if (user_tick)
381 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
382 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
383 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
384 one_jiffy_scaled);
385 else
386 account_idle_time(cputime_one_jiffy);
387}
388
389/*
390 * Account multiple ticks of steal time.
391 * @p: the process from which the cpu time has been stolen
392 * @ticks: number of stolen ticks
393 */
394void account_steal_ticks(unsigned long ticks)
395{
396 account_steal_time(jiffies_to_cputime(ticks));
397}
398
399/*
400 * Account multiple ticks of idle time.
401 * @ticks: number of stolen ticks
402 */
403void account_idle_ticks(unsigned long ticks)
404{
405
406 if (sched_clock_irqtime) {
407 irqtime_account_idle_ticks(ticks);
408 return;
409 }
410
411 account_idle_time(jiffies_to_cputime(ticks));
412}
413
414#endif
415
416/*
417 * Use precise platform statistics if available:
418 */
419#ifdef CONFIG_VIRT_CPU_ACCOUNTING
420void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
421{
422 *ut = p->utime;
423 *st = p->stime;
424}
425
426void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
427{
428 struct task_cputime cputime;
429
430 thread_group_cputime(p, &cputime);
431
432 *ut = cputime.utime;
433 *st = cputime.stime;
434}
435
436/*
437 * Archs that account the whole time spent in the idle task
438 * (outside irq) as idle time can rely on this and just implement
439 * vtime_account_system() and vtime_account_idle(). Archs that
440 * have other meaning of the idle time (s390 only includes the
441 * time spent by the CPU when it's in low power mode) must override
442 * vtime_account().
443 */
444#ifndef __ARCH_HAS_VTIME_ACCOUNT
445void vtime_account(struct task_struct *tsk)
446{
447 unsigned long flags;
448
449 local_irq_save(flags);
450
451 if (in_interrupt() || !is_idle_task(tsk))
452 vtime_account_system(tsk);
453 else
454 vtime_account_idle(tsk);
455
456 local_irq_restore(flags);
457}
458EXPORT_SYMBOL_GPL(vtime_account);
459#endif /* __ARCH_HAS_VTIME_ACCOUNT */
460
461#else
462
463#ifndef nsecs_to_cputime
464# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
465#endif
466
467static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
468{
469 u64 temp = (__force u64) rtime;
470
471 temp *= (__force u64) utime;
472
473 if (sizeof(cputime_t) == 4)
474 temp = div_u64(temp, (__force u32) total);
475 else
476 temp = div64_u64(temp, (__force u64) total);
477
478 return (__force cputime_t) temp;
479}
480
481void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
482{
483 cputime_t rtime, utime = p->utime, total = utime + p->stime;
484
485 /*
486 * Use CFS's precise accounting:
487 */
488 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
489
490 if (total)
491 utime = scale_utime(utime, rtime, total);
492 else
493 utime = rtime;
494
495 /*
496 * Compare with previous values, to keep monotonicity:
497 */
498 p->prev_utime = max(p->prev_utime, utime);
499 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
500
501 *ut = p->prev_utime;
502 *st = p->prev_stime;
503}
504
505/*
506 * Must be called with siglock held.
507 */
508void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
509{
510 struct signal_struct *sig = p->signal;
511 struct task_cputime cputime;
512 cputime_t rtime, utime, total;
513
514 thread_group_cputime(p, &cputime);
515
516 total = cputime.utime + cputime.stime;
517 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
518
519 if (total)
520 utime = scale_utime(cputime.utime, rtime, total);
521 else
522 utime = rtime;
523
524 sig->prev_utime = max(sig->prev_utime, utime);
525 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
526
527 *ut = sig->prev_utime;
528 *st = sig->prev_stime;
529}
530#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 96e2b18b6283..6b800a14b990 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -597,7 +597,7 @@ calc_delta_fair(unsigned long delta, struct sched_entity *se)
597/* 597/*
598 * The idea is to set a period in which each task runs once. 598 * The idea is to set a period in which each task runs once.
599 * 599 *
600 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch 600 * When there are too many tasks (sched_nr_latency) we have to stretch
601 * this period because otherwise the slices get too small. 601 * this period because otherwise the slices get too small.
602 * 602 *
603 * p = (nr <= nl) ? l : l*nr/nl 603 * p = (nr <= nl) ? l : l*nr/nl
@@ -2700,7 +2700,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2700 int prev_cpu = task_cpu(p); 2700 int prev_cpu = task_cpu(p);
2701 int new_cpu = cpu; 2701 int new_cpu = cpu;
2702 int want_affine = 0; 2702 int want_affine = 0;
2703 int want_sd = 1;
2704 int sync = wake_flags & WF_SYNC; 2703 int sync = wake_flags & WF_SYNC;
2705 2704
2706 if (p->nr_cpus_allowed == 1) 2705 if (p->nr_cpus_allowed == 1)
@@ -2718,48 +2717,21 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2718 continue; 2717 continue;
2719 2718
2720 /* 2719 /*
2721 * If power savings logic is enabled for a domain, see if we
2722 * are not overloaded, if so, don't balance wider.
2723 */
2724 if (tmp->flags & (SD_PREFER_LOCAL)) {
2725 unsigned long power = 0;
2726 unsigned long nr_running = 0;
2727 unsigned long capacity;
2728 int i;
2729
2730 for_each_cpu(i, sched_domain_span(tmp)) {
2731 power += power_of(i);
2732 nr_running += cpu_rq(i)->cfs.nr_running;
2733 }
2734
2735 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
2736
2737 if (nr_running < capacity)
2738 want_sd = 0;
2739 }
2740
2741 /*
2742 * If both cpu and prev_cpu are part of this domain, 2720 * If both cpu and prev_cpu are part of this domain,
2743 * cpu is a valid SD_WAKE_AFFINE target. 2721 * cpu is a valid SD_WAKE_AFFINE target.
2744 */ 2722 */
2745 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && 2723 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
2746 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { 2724 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
2747 affine_sd = tmp; 2725 affine_sd = tmp;
2748 want_affine = 0;
2749 }
2750
2751 if (!want_sd && !want_affine)
2752 break; 2726 break;
2727 }
2753 2728
2754 if (!(tmp->flags & sd_flag)) 2729 if (tmp->flags & sd_flag)
2755 continue;
2756
2757 if (want_sd)
2758 sd = tmp; 2730 sd = tmp;
2759 } 2731 }
2760 2732
2761 if (affine_sd) { 2733 if (affine_sd) {
2762 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) 2734 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
2763 prev_cpu = cpu; 2735 prev_cpu = cpu;
2764 2736
2765 new_cpu = select_idle_sibling(p, prev_cpu); 2737 new_cpu = select_idle_sibling(p, prev_cpu);
@@ -4295,7 +4267,7 @@ redo:
4295 goto out_balanced; 4267 goto out_balanced;
4296 } 4268 }
4297 4269
4298 BUG_ON(busiest == this_rq); 4270 BUG_ON(busiest == env.dst_rq);
4299 4271
4300 schedstat_add(sd, lb_imbalance[idle], env.imbalance); 4272 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
4301 4273
@@ -4316,7 +4288,7 @@ redo:
4316 update_h_load(env.src_cpu); 4288 update_h_load(env.src_cpu);
4317more_balance: 4289more_balance:
4318 local_irq_save(flags); 4290 local_irq_save(flags);
4319 double_rq_lock(this_rq, busiest); 4291 double_rq_lock(env.dst_rq, busiest);
4320 4292
4321 /* 4293 /*
4322 * cur_ld_moved - load moved in current iteration 4294 * cur_ld_moved - load moved in current iteration
@@ -4324,7 +4296,7 @@ more_balance:
4324 */ 4296 */
4325 cur_ld_moved = move_tasks(&env); 4297 cur_ld_moved = move_tasks(&env);
4326 ld_moved += cur_ld_moved; 4298 ld_moved += cur_ld_moved;
4327 double_rq_unlock(this_rq, busiest); 4299 double_rq_unlock(env.dst_rq, busiest);
4328 local_irq_restore(flags); 4300 local_irq_restore(flags);
4329 4301
4330 if (env.flags & LBF_NEED_BREAK) { 4302 if (env.flags & LBF_NEED_BREAK) {
@@ -4360,8 +4332,7 @@ more_balance:
4360 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && 4332 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
4361 lb_iterations++ < max_lb_iterations) { 4333 lb_iterations++ < max_lb_iterations) {
4362 4334
4363 this_rq = cpu_rq(env.new_dst_cpu); 4335 env.dst_rq = cpu_rq(env.new_dst_cpu);
4364 env.dst_rq = this_rq;
4365 env.dst_cpu = env.new_dst_cpu; 4336 env.dst_cpu = env.new_dst_cpu;
4366 env.flags &= ~LBF_SOME_PINNED; 4337 env.flags &= ~LBF_SOME_PINNED;
4367 env.loop = 0; 4338 env.loop = 0;
@@ -4646,7 +4617,7 @@ static void nohz_balancer_kick(int cpu)
4646 return; 4617 return;
4647} 4618}
4648 4619
4649static inline void clear_nohz_tick_stopped(int cpu) 4620static inline void nohz_balance_exit_idle(int cpu)
4650{ 4621{
4651 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 4622 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
4652 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 4623 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
@@ -4686,28 +4657,23 @@ void set_cpu_sd_state_idle(void)
4686} 4657}
4687 4658
4688/* 4659/*
4689 * This routine will record that this cpu is going idle with tick stopped. 4660 * This routine will record that the cpu is going idle with tick stopped.
4690 * This info will be used in performing idle load balancing in the future. 4661 * This info will be used in performing idle load balancing in the future.
4691 */ 4662 */
4692void select_nohz_load_balancer(int stop_tick) 4663void nohz_balance_enter_idle(int cpu)
4693{ 4664{
4694 int cpu = smp_processor_id();
4695
4696 /* 4665 /*
4697 * If this cpu is going down, then nothing needs to be done. 4666 * If this cpu is going down, then nothing needs to be done.
4698 */ 4667 */
4699 if (!cpu_active(cpu)) 4668 if (!cpu_active(cpu))
4700 return; 4669 return;
4701 4670
4702 if (stop_tick) { 4671 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
4703 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) 4672 return;
4704 return;
4705 4673
4706 cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 4674 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
4707 atomic_inc(&nohz.nr_cpus); 4675 atomic_inc(&nohz.nr_cpus);
4708 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 4676 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
4709 }
4710 return;
4711} 4677}
4712 4678
4713static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, 4679static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
@@ -4715,7 +4681,7 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
4715{ 4681{
4716 switch (action & ~CPU_TASKS_FROZEN) { 4682 switch (action & ~CPU_TASKS_FROZEN) {
4717 case CPU_DYING: 4683 case CPU_DYING:
4718 clear_nohz_tick_stopped(smp_processor_id()); 4684 nohz_balance_exit_idle(smp_processor_id());
4719 return NOTIFY_OK; 4685 return NOTIFY_OK;
4720 default: 4686 default:
4721 return NOTIFY_DONE; 4687 return NOTIFY_DONE;
@@ -4837,14 +4803,15 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4837 if (need_resched()) 4803 if (need_resched())
4838 break; 4804 break;
4839 4805
4840 raw_spin_lock_irq(&this_rq->lock); 4806 rq = cpu_rq(balance_cpu);
4841 update_rq_clock(this_rq); 4807
4842 update_idle_cpu_load(this_rq); 4808 raw_spin_lock_irq(&rq->lock);
4843 raw_spin_unlock_irq(&this_rq->lock); 4809 update_rq_clock(rq);
4810 update_idle_cpu_load(rq);
4811 raw_spin_unlock_irq(&rq->lock);
4844 4812
4845 rebalance_domains(balance_cpu, CPU_IDLE); 4813 rebalance_domains(balance_cpu, CPU_IDLE);
4846 4814
4847 rq = cpu_rq(balance_cpu);
4848 if (time_after(this_rq->next_balance, rq->next_balance)) 4815 if (time_after(this_rq->next_balance, rq->next_balance))
4849 this_rq->next_balance = rq->next_balance; 4816 this_rq->next_balance = rq->next_balance;
4850 } 4817 }
@@ -4875,7 +4842,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
4875 * busy tick after returning from idle, we will update the busy stats. 4842 * busy tick after returning from idle, we will update the busy stats.
4876 */ 4843 */
4877 set_cpu_sd_state_busy(); 4844 set_cpu_sd_state_busy();
4878 clear_nohz_tick_stopped(cpu); 4845 nohz_balance_exit_idle(cpu);
4879 4846
4880 /* 4847 /*
4881 * None are in tickless mode and hence no need for NOHZ idle load 4848 * None are in tickless mode and hence no need for NOHZ idle load
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index de00a486c5c6..eebefcad7027 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -12,14 +12,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
12SCHED_FEAT(START_DEBIT, true) 12SCHED_FEAT(START_DEBIT, true)
13 13
14/* 14/*
15 * Based on load and program behaviour, see if it makes sense to place
16 * a newly woken task on the same cpu as the task that woke it --
17 * improve cache locality. Typically used with SYNC wakeups as
18 * generated by pipes and the like, see also SYNC_WAKEUPS.
19 */
20SCHED_FEAT(AFFINE_WAKEUPS, true)
21
22/*
23 * Prefer to schedule the task we woke last (assuming it failed 15 * Prefer to schedule the task we woke last (assuming it failed
24 * wakeup-preemption), since its likely going to consume data we 16 * wakeup-preemption), since its likely going to consume data we
25 * touched, increases cache locality. 17 * touched, increases cache locality.
@@ -42,7 +34,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
42/* 34/*
43 * Use arch dependent cpu power functions 35 * Use arch dependent cpu power functions
44 */ 36 */
45SCHED_FEAT(ARCH_POWER, false) 37SCHED_FEAT(ARCH_POWER, true)
46 38
47SCHED_FEAT(HRTICK, false) 39SCHED_FEAT(HRTICK, false)
48SCHED_FEAT(DOUBLE_TICK, false) 40SCHED_FEAT(DOUBLE_TICK, false)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e0b7ba9c040f..418feb01344e 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1632,11 +1632,6 @@ static int push_rt_task(struct rq *rq)
1632 if (!next_task) 1632 if (!next_task)
1633 return 0; 1633 return 0;
1634 1634
1635#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1636 if (unlikely(task_running(rq, next_task)))
1637 return 0;
1638#endif
1639
1640retry: 1635retry:
1641 if (unlikely(next_task == rq->curr)) { 1636 if (unlikely(next_task == rq->curr)) {
1642 WARN_ON(1); 1637 WARN_ON(1);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0848fa36c383..7a7db09cfabc 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -737,11 +737,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
737 */ 737 */
738 next->on_cpu = 1; 738 next->on_cpu = 1;
739#endif 739#endif
740#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
741 raw_spin_unlock_irq(&rq->lock);
742#else
743 raw_spin_unlock(&rq->lock); 740 raw_spin_unlock(&rq->lock);
744#endif
745} 741}
746 742
747static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 743static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
@@ -755,9 +751,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
755 smp_wmb(); 751 smp_wmb();
756 prev->on_cpu = 0; 752 prev->on_cpu = 0;
757#endif 753#endif
758#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
759 local_irq_enable(); 754 local_irq_enable();
760#endif
761} 755}
762#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 756#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
763 757
@@ -891,6 +885,9 @@ struct cpuacct {
891 struct kernel_cpustat __percpu *cpustat; 885 struct kernel_cpustat __percpu *cpustat;
892}; 886};
893 887
888extern struct cgroup_subsys cpuacct_subsys;
889extern struct cpuacct root_cpuacct;
890
894/* return cpu accounting group corresponding to this container */ 891/* return cpu accounting group corresponding to this container */
895static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) 892static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
896{ 893{
@@ -917,6 +914,16 @@ extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
917static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 914static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
918#endif 915#endif
919 916
917#ifdef CONFIG_PARAVIRT
918static inline u64 steal_ticks(u64 steal)
919{
920 if (unlikely(steal > NSEC_PER_SEC))
921 return div_u64(steal, TICK_NSEC);
922
923 return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
924}
925#endif
926
920static inline void inc_nr_running(struct rq *rq) 927static inline void inc_nr_running(struct rq *rq)
921{ 928{
922 rq->nr_running++; 929 rq->nr_running++;
@@ -1156,3 +1163,53 @@ enum rq_nohz_flag_bits {
1156 1163
1157#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) 1164#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
1158#endif 1165#endif
1166
1167#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1168
1169DECLARE_PER_CPU(u64, cpu_hardirq_time);
1170DECLARE_PER_CPU(u64, cpu_softirq_time);
1171
1172#ifndef CONFIG_64BIT
1173DECLARE_PER_CPU(seqcount_t, irq_time_seq);
1174
1175static inline void irq_time_write_begin(void)
1176{
1177 __this_cpu_inc(irq_time_seq.sequence);
1178 smp_wmb();
1179}
1180
1181static inline void irq_time_write_end(void)
1182{
1183 smp_wmb();
1184 __this_cpu_inc(irq_time_seq.sequence);
1185}
1186
1187static inline u64 irq_time_read(int cpu)
1188{
1189 u64 irq_time;
1190 unsigned seq;
1191
1192 do {
1193 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1194 irq_time = per_cpu(cpu_softirq_time, cpu) +
1195 per_cpu(cpu_hardirq_time, cpu);
1196 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1197
1198 return irq_time;
1199}
1200#else /* CONFIG_64BIT */
1201static inline void irq_time_write_begin(void)
1202{
1203}
1204
1205static inline void irq_time_write_end(void)
1206{
1207}
1208
1209static inline u64 irq_time_read(int cpu)
1210{
1211 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1212}
1213#endif /* CONFIG_64BIT */
1214#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
1215
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 5c6a5bd8462f..cc96bdc0c2c9 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void)
221 current->flags &= ~PF_MEMALLOC; 221 current->flags &= ~PF_MEMALLOC;
222 222
223 pending = local_softirq_pending(); 223 pending = local_softirq_pending();
224 account_system_vtime(current); 224 vtime_account(current);
225 225
226 __local_bh_disable((unsigned long)__builtin_return_address(0), 226 __local_bh_disable((unsigned long)__builtin_return_address(0),
227 SOFTIRQ_OFFSET); 227 SOFTIRQ_OFFSET);
@@ -272,7 +272,7 @@ restart:
272 272
273 lockdep_softirq_exit(); 273 lockdep_softirq_exit();
274 274
275 account_system_vtime(current); 275 vtime_account(current);
276 __local_bh_enable(SOFTIRQ_OFFSET); 276 __local_bh_enable(SOFTIRQ_OFFSET);
277 tsk_restore_flags(current, old_flags, PF_MEMALLOC); 277 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
278} 278}
@@ -341,7 +341,7 @@ static inline void invoke_softirq(void)
341 */ 341 */
342void irq_exit(void) 342void irq_exit(void)
343{ 343{
344 account_system_vtime(current); 344 vtime_account(current);
345 trace_hardirq_exit(); 345 trace_hardirq_exit();
346 sub_preempt_count(IRQ_EXIT_OFFSET); 346 sub_preempt_count(IRQ_EXIT_OFFSET);
347 if (!in_interrupt() && local_softirq_pending()) 347 if (!in_interrupt() && local_softirq_pending())
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 87174ef59161..81c7b1a1a307 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -307,7 +307,7 @@ static struct ctl_table kern_table[] = {
307 .extra2 = &max_sched_tunable_scaling, 307 .extra2 = &max_sched_tunable_scaling,
308 }, 308 },
309 { 309 {
310 .procname = "sched_migration_cost", 310 .procname = "sched_migration_cost_ns",
311 .data = &sysctl_sched_migration_cost, 311 .data = &sysctl_sched_migration_cost,
312 .maxlen = sizeof(unsigned int), 312 .maxlen = sizeof(unsigned int),
313 .mode = 0644, 313 .mode = 0644,
@@ -321,14 +321,14 @@ static struct ctl_table kern_table[] = {
321 .proc_handler = proc_dointvec, 321 .proc_handler = proc_dointvec,
322 }, 322 },
323 { 323 {
324 .procname = "sched_time_avg", 324 .procname = "sched_time_avg_ms",
325 .data = &sysctl_sched_time_avg, 325 .data = &sysctl_sched_time_avg,
326 .maxlen = sizeof(unsigned int), 326 .maxlen = sizeof(unsigned int),
327 .mode = 0644, 327 .mode = 0644,
328 .proc_handler = proc_dointvec, 328 .proc_handler = proc_dointvec,
329 }, 329 },
330 { 330 {
331 .procname = "sched_shares_window", 331 .procname = "sched_shares_window_ns",
332 .data = &sysctl_sched_shares_window, 332 .data = &sysctl_sched_shares_window,
333 .maxlen = sizeof(unsigned int), 333 .maxlen = sizeof(unsigned int),
334 .mode = 0644, 334 .mode = 0644,
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index cf5f6b262673..f423bdd035c2 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -372,7 +372,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
372 * the scheduler tick in nohz_restart_sched_tick. 372 * the scheduler tick in nohz_restart_sched_tick.
373 */ 373 */
374 if (!ts->tick_stopped) { 374 if (!ts->tick_stopped) {
375 select_nohz_load_balancer(1); 375 nohz_balance_enter_idle(cpu);
376 calc_load_enter_idle(); 376 calc_load_enter_idle();
377 377
378 ts->last_tick = hrtimer_get_expires(&ts->sched_timer); 378 ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
@@ -570,7 +570,6 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
570static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) 570static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
571{ 571{
572 /* Update jiffies first */ 572 /* Update jiffies first */
573 select_nohz_load_balancer(0);
574 tick_do_update_jiffies64(now); 573 tick_do_update_jiffies64(now);
575 update_cpu_load_nohz(); 574 update_cpu_load_nohz();
576 575