path: root/kernel
author     Linus Torvalds <torvalds@linux-foundation.org>  2012-10-01 13:43:39 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2012-10-01 13:43:39 -0400
commit     0b981cb94bc63a2d0e5eccccdca75fe57643ffce (patch)
tree       966ad6e6807fd1041d9962c9904e032a5ab07a65 /kernel
parent     4cba3335826cbb36a218c3f5a1387e2c7c7ca9aa (diff)
parent     fdf9c356502ae02238efcdf90cefd7b473a63fd4 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
 "Continued quest to clean up and enhance the cputime code by Frederic
  Weisbecker, in preparation for future tickless kernel features.

  Other than that, smallish changes."

Fix up trivial conflicts due to additions next to each other in arch/{x86/}Kconfig

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (24 commits)
  cputime: Make finegrained irqtime accounting generally available
  cputime: Gather time/stats accounting config options into a single menu
  ia64: Reuse system and user vtime accounting functions on task switch
  ia64: Consolidate user vtime accounting
  vtime: Consolidate system/idle context detection
  cputime: Use a proper subsystem naming for vtime related APIs
  sched: cpu_power: enable ARCH_POWER
  sched/nohz: Clean up select_nohz_load_balancer()
  sched: Fix load avg vs. cpu-hotplug
  sched: Remove __ARCH_WANT_INTERRUPTS_ON_CTXSW
  sched: Fix nohz_idle_balance()
  sched: Remove useless code in yield_to()
  sched: Add time unit suffix to sched sysctl knobs
  sched/debug: Limit sd->*_idx range on sysctl
  sched: Remove AFFINE_WAKEUPS feature flag
  s390: Remove leftover account_tick_vtime() header
  cputime: Consolidate vtime handling on context switch
  sched: Move cputime code to its own file
  cputime: Generalize CONFIG_VIRT_CPU_ACCOUNTING
  tile: Remove SD_PREFER_LOCAL leftover
  ...
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/fork.c               4
-rw-r--r--  kernel/sched/Makefile       2
-rw-r--r--  kernel/sched/core.c       675
-rw-r--r--  kernel/sched/cputime.c    530
-rw-r--r--  kernel/sched/fair.c        81
-rw-r--r--  kernel/sched/features.h    10
-rw-r--r--  kernel/sched/rt.c           5
-rw-r--r--  kernel/sched/sched.h       69
-rw-r--r--  kernel/softirq.c            6
-rw-r--r--  kernel/sysctl.c             6
-rw-r--r--  kernel/time/tick-sched.c    3
11 files changed, 686 insertions(+), 705 deletions(-)
diff --git a/kernel/fork.c b/kernel/fork.c
index 2343c9eaaaf4..5a0e74d89a5a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1276,11 +1276,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1276#endif 1276#endif
1277#ifdef CONFIG_TRACE_IRQFLAGS 1277#ifdef CONFIG_TRACE_IRQFLAGS
1278 p->irq_events = 0; 1278 p->irq_events = 0;
1279#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1280 p->hardirqs_enabled = 1;
1281#else
1282 p->hardirqs_enabled = 0; 1279 p->hardirqs_enabled = 0;
1283#endif
1284 p->hardirq_enable_ip = 0; 1280 p->hardirq_enable_ip = 0;
1285 p->hardirq_enable_event = 0; 1281 p->hardirq_enable_event = 0;
1286 p->hardirq_disable_ip = _THIS_IP_; 1282 p->hardirq_disable_ip = _THIS_IP_;
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 173ea52f3af0..f06d249e103b 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer 11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif 12endif
13 13
14obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o 14obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
15obj-$(CONFIG_SMP) += cpupri.o 15obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o 17obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3c4dec0594d6..c17747236438 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -740,126 +740,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
740 dequeue_task(rq, p, flags); 740 dequeue_task(rq, p, flags);
741} 741}
742 742
743#ifdef CONFIG_IRQ_TIME_ACCOUNTING
744
745/*
746 * There are no locks covering percpu hardirq/softirq time.
747 * They are only modified in account_system_vtime, on corresponding CPU
748 * with interrupts disabled. So, writes are safe.
749 * They are read and saved off onto struct rq in update_rq_clock().
750 * This may result in other CPU reading this CPU's irq time and can
751 * race with irq/account_system_vtime on this CPU. We would either get old
752 * or new value with a side effect of accounting a slice of irq time to wrong
753 * task when irq is in progress while we read rq->clock. That is a worthy
754 * compromise in place of having locks on each irq in account_system_time.
755 */
756static DEFINE_PER_CPU(u64, cpu_hardirq_time);
757static DEFINE_PER_CPU(u64, cpu_softirq_time);
758
759static DEFINE_PER_CPU(u64, irq_start_time);
760static int sched_clock_irqtime;
761
762void enable_sched_clock_irqtime(void)
763{
764 sched_clock_irqtime = 1;
765}
766
767void disable_sched_clock_irqtime(void)
768{
769 sched_clock_irqtime = 0;
770}
771
772#ifndef CONFIG_64BIT
773static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
774
775static inline void irq_time_write_begin(void)
776{
777 __this_cpu_inc(irq_time_seq.sequence);
778 smp_wmb();
779}
780
781static inline void irq_time_write_end(void)
782{
783 smp_wmb();
784 __this_cpu_inc(irq_time_seq.sequence);
785}
786
787static inline u64 irq_time_read(int cpu)
788{
789 u64 irq_time;
790 unsigned seq;
791
792 do {
793 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
794 irq_time = per_cpu(cpu_softirq_time, cpu) +
795 per_cpu(cpu_hardirq_time, cpu);
796 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
797
798 return irq_time;
799}
800#else /* CONFIG_64BIT */
801static inline void irq_time_write_begin(void)
802{
803}
804
805static inline void irq_time_write_end(void)
806{
807}
808
809static inline u64 irq_time_read(int cpu)
810{
811 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
812}
813#endif /* CONFIG_64BIT */
814
815/*
816 * Called before incrementing preempt_count on {soft,}irq_enter
817 * and before decrementing preempt_count on {soft,}irq_exit.
818 */
819void account_system_vtime(struct task_struct *curr)
820{
821 unsigned long flags;
822 s64 delta;
823 int cpu;
824
825 if (!sched_clock_irqtime)
826 return;
827
828 local_irq_save(flags);
829
830 cpu = smp_processor_id();
831 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
832 __this_cpu_add(irq_start_time, delta);
833
834 irq_time_write_begin();
835 /*
836 * We do not account for softirq time from ksoftirqd here.
837 * We want to continue accounting softirq time to ksoftirqd thread
838 * in that case, so as not to confuse scheduler with a special task
839 * that do not consume any time, but still wants to run.
840 */
841 if (hardirq_count())
842 __this_cpu_add(cpu_hardirq_time, delta);
843 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
844 __this_cpu_add(cpu_softirq_time, delta);
845
846 irq_time_write_end();
847 local_irq_restore(flags);
848}
849EXPORT_SYMBOL_GPL(account_system_vtime);
850
851#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
852
853#ifdef CONFIG_PARAVIRT
854static inline u64 steal_ticks(u64 steal)
855{
856 if (unlikely(steal > NSEC_PER_SEC))
857 return div_u64(steal, TICK_NSEC);
858
859 return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
860}
861#endif
862
863static void update_rq_clock_task(struct rq *rq, s64 delta) 743static void update_rq_clock_task(struct rq *rq, s64 delta)
864{ 744{
865/* 745/*
@@ -920,43 +800,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
920#endif 800#endif
921} 801}
922 802
923#ifdef CONFIG_IRQ_TIME_ACCOUNTING
924static int irqtime_account_hi_update(void)
925{
926 u64 *cpustat = kcpustat_this_cpu->cpustat;
927 unsigned long flags;
928 u64 latest_ns;
929 int ret = 0;
930
931 local_irq_save(flags);
932 latest_ns = this_cpu_read(cpu_hardirq_time);
933 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
934 ret = 1;
935 local_irq_restore(flags);
936 return ret;
937}
938
939static int irqtime_account_si_update(void)
940{
941 u64 *cpustat = kcpustat_this_cpu->cpustat;
942 unsigned long flags;
943 u64 latest_ns;
944 int ret = 0;
945
946 local_irq_save(flags);
947 latest_ns = this_cpu_read(cpu_softirq_time);
948 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
949 ret = 1;
950 local_irq_restore(flags);
951 return ret;
952}
953
954#else /* CONFIG_IRQ_TIME_ACCOUNTING */
955
956#define sched_clock_irqtime (0)
957
958#endif
959
960void sched_set_stop_task(int cpu, struct task_struct *stop) 803void sched_set_stop_task(int cpu, struct task_struct *stop)
961{ 804{
962 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 805 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -1518,25 +1361,6 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu)
1518 smp_send_reschedule(cpu); 1361 smp_send_reschedule(cpu);
1519} 1362}
1520 1363
1521#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1522static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
1523{
1524 struct rq *rq;
1525 int ret = 0;
1526
1527 rq = __task_rq_lock(p);
1528 if (p->on_cpu) {
1529 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1530 ttwu_do_wakeup(rq, p, wake_flags);
1531 ret = 1;
1532 }
1533 __task_rq_unlock(rq);
1534
1535 return ret;
1536
1537}
1538#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1539
1540bool cpus_share_cache(int this_cpu, int that_cpu) 1364bool cpus_share_cache(int this_cpu, int that_cpu)
1541{ 1365{
1542 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 1366 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
@@ -1597,21 +1421,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1597 * If the owning (remote) cpu is still in the middle of schedule() with 1421 * If the owning (remote) cpu is still in the middle of schedule() with
1598 * this task as prev, wait until its done referencing the task. 1422 * this task as prev, wait until its done referencing the task.
1599 */ 1423 */
1600 while (p->on_cpu) { 1424 while (p->on_cpu)
1601#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1602 /*
1603 * In case the architecture enables interrupts in
1604 * context_switch(), we cannot busy wait, since that
1605 * would lead to deadlocks when an interrupt hits and
1606 * tries to wake up @prev. So bail and do a complete
1607 * remote wakeup.
1608 */
1609 if (ttwu_activate_remote(p, wake_flags))
1610 goto stat;
1611#else
1612 cpu_relax(); 1425 cpu_relax();
1613#endif
1614 }
1615 /* 1426 /*
1616 * Pairs with the smp_wmb() in finish_lock_switch(). 1427 * Pairs with the smp_wmb() in finish_lock_switch().
1617 */ 1428 */
@@ -1953,14 +1764,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1953 * Manfred Spraul <manfred@colorfullife.com> 1764 * Manfred Spraul <manfred@colorfullife.com>
1954 */ 1765 */
1955 prev_state = prev->state; 1766 prev_state = prev->state;
1767 vtime_task_switch(prev);
1956 finish_arch_switch(prev); 1768 finish_arch_switch(prev);
1957#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1958 local_irq_disable();
1959#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1960 perf_event_task_sched_in(prev, current); 1769 perf_event_task_sched_in(prev, current);
1961#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1962 local_irq_enable();
1963#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1964 finish_lock_switch(rq, prev); 1770 finish_lock_switch(rq, prev);
1965 finish_arch_post_lock_switch(); 1771 finish_arch_post_lock_switch();
1966 1772
@@ -2810,404 +2616,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
2810 return ns; 2616 return ns;
2811} 2617}
2812 2618
2813#ifdef CONFIG_CGROUP_CPUACCT
2814struct cgroup_subsys cpuacct_subsys;
2815struct cpuacct root_cpuacct;
2816#endif
2817
2818static inline void task_group_account_field(struct task_struct *p, int index,
2819 u64 tmp)
2820{
2821#ifdef CONFIG_CGROUP_CPUACCT
2822 struct kernel_cpustat *kcpustat;
2823 struct cpuacct *ca;
2824#endif
2825 /*
2826 * Since all updates are sure to touch the root cgroup, we
2827 * get ourselves ahead and touch it first. If the root cgroup
2828 * is the only cgroup, then nothing else should be necessary.
2829 *
2830 */
2831 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
2832
2833#ifdef CONFIG_CGROUP_CPUACCT
2834 if (unlikely(!cpuacct_subsys.active))
2835 return;
2836
2837 rcu_read_lock();
2838 ca = task_ca(p);
2839 while (ca && (ca != &root_cpuacct)) {
2840 kcpustat = this_cpu_ptr(ca->cpustat);
2841 kcpustat->cpustat[index] += tmp;
2842 ca = parent_ca(ca);
2843 }
2844 rcu_read_unlock();
2845#endif
2846}
2847
2848
2849/*
2850 * Account user cpu time to a process.
2851 * @p: the process that the cpu time gets accounted to
2852 * @cputime: the cpu time spent in user space since the last update
2853 * @cputime_scaled: cputime scaled by cpu frequency
2854 */
2855void account_user_time(struct task_struct *p, cputime_t cputime,
2856 cputime_t cputime_scaled)
2857{
2858 int index;
2859
2860 /* Add user time to process. */
2861 p->utime += cputime;
2862 p->utimescaled += cputime_scaled;
2863 account_group_user_time(p, cputime);
2864
2865 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
2866
2867 /* Add user time to cpustat. */
2868 task_group_account_field(p, index, (__force u64) cputime);
2869
2870 /* Account for user time used */
2871 acct_update_integrals(p);
2872}
2873
2874/*
2875 * Account guest cpu time to a process.
2876 * @p: the process that the cpu time gets accounted to
2877 * @cputime: the cpu time spent in virtual machine since the last update
2878 * @cputime_scaled: cputime scaled by cpu frequency
2879 */
2880static void account_guest_time(struct task_struct *p, cputime_t cputime,
2881 cputime_t cputime_scaled)
2882{
2883 u64 *cpustat = kcpustat_this_cpu->cpustat;
2884
2885 /* Add guest time to process. */
2886 p->utime += cputime;
2887 p->utimescaled += cputime_scaled;
2888 account_group_user_time(p, cputime);
2889 p->gtime += cputime;
2890
2891 /* Add guest time to cpustat. */
2892 if (TASK_NICE(p) > 0) {
2893 cpustat[CPUTIME_NICE] += (__force u64) cputime;
2894 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
2895 } else {
2896 cpustat[CPUTIME_USER] += (__force u64) cputime;
2897 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
2898 }
2899}
2900
2901/*
2902 * Account system cpu time to a process and desired cpustat field
2903 * @p: the process that the cpu time gets accounted to
2904 * @cputime: the cpu time spent in kernel space since the last update
2905 * @cputime_scaled: cputime scaled by cpu frequency
2906 * @target_cputime64: pointer to cpustat field that has to be updated
2907 */
2908static inline
2909void __account_system_time(struct task_struct *p, cputime_t cputime,
2910 cputime_t cputime_scaled, int index)
2911{
2912 /* Add system time to process. */
2913 p->stime += cputime;
2914 p->stimescaled += cputime_scaled;
2915 account_group_system_time(p, cputime);
2916
2917 /* Add system time to cpustat. */
2918 task_group_account_field(p, index, (__force u64) cputime);
2919
2920 /* Account for system time used */
2921 acct_update_integrals(p);
2922}
2923
2924/*
2925 * Account system cpu time to a process.
2926 * @p: the process that the cpu time gets accounted to
2927 * @hardirq_offset: the offset to subtract from hardirq_count()
2928 * @cputime: the cpu time spent in kernel space since the last update
2929 * @cputime_scaled: cputime scaled by cpu frequency
2930 */
2931void account_system_time(struct task_struct *p, int hardirq_offset,
2932 cputime_t cputime, cputime_t cputime_scaled)
2933{
2934 int index;
2935
2936 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
2937 account_guest_time(p, cputime, cputime_scaled);
2938 return;
2939 }
2940
2941 if (hardirq_count() - hardirq_offset)
2942 index = CPUTIME_IRQ;
2943 else if (in_serving_softirq())
2944 index = CPUTIME_SOFTIRQ;
2945 else
2946 index = CPUTIME_SYSTEM;
2947
2948 __account_system_time(p, cputime, cputime_scaled, index);
2949}
2950
2951/*
2952 * Account for involuntary wait time.
2953 * @cputime: the cpu time spent in involuntary wait
2954 */
2955void account_steal_time(cputime_t cputime)
2956{
2957 u64 *cpustat = kcpustat_this_cpu->cpustat;
2958
2959 cpustat[CPUTIME_STEAL] += (__force u64) cputime;
2960}
2961
2962/*
2963 * Account for idle time.
2964 * @cputime: the cpu time spent in idle wait
2965 */
2966void account_idle_time(cputime_t cputime)
2967{
2968 u64 *cpustat = kcpustat_this_cpu->cpustat;
2969 struct rq *rq = this_rq();
2970
2971 if (atomic_read(&rq->nr_iowait) > 0)
2972 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
2973 else
2974 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
2975}
2976
2977static __always_inline bool steal_account_process_tick(void)
2978{
2979#ifdef CONFIG_PARAVIRT
2980 if (static_key_false(&paravirt_steal_enabled)) {
2981 u64 steal, st = 0;
2982
2983 steal = paravirt_steal_clock(smp_processor_id());
2984 steal -= this_rq()->prev_steal_time;
2985
2986 st = steal_ticks(steal);
2987 this_rq()->prev_steal_time += st * TICK_NSEC;
2988
2989 account_steal_time(st);
2990 return st;
2991 }
2992#endif
2993 return false;
2994}
2995
2996#ifndef CONFIG_VIRT_CPU_ACCOUNTING
2997
2998#ifdef CONFIG_IRQ_TIME_ACCOUNTING
2999/*
3000 * Account a tick to a process and cpustat
3001 * @p: the process that the cpu time gets accounted to
3002 * @user_tick: is the tick from userspace
3003 * @rq: the pointer to rq
3004 *
3005 * Tick demultiplexing follows the order
3006 * - pending hardirq update
3007 * - pending softirq update
3008 * - user_time
3009 * - idle_time
3010 * - system time
3011 * - check for guest_time
3012 * - else account as system_time
3013 *
3014 * Check for hardirq is done both for system and user time as there is
3015 * no timer going off while we are on hardirq and hence we may never get an
3016 * opportunity to update it solely in system time.
3017 * p->stime and friends are only updated on system time and not on irq
3018 * softirq as those do not count in task exec_runtime any more.
3019 */
3020static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3021 struct rq *rq)
3022{
3023 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3024 u64 *cpustat = kcpustat_this_cpu->cpustat;
3025
3026 if (steal_account_process_tick())
3027 return;
3028
3029 if (irqtime_account_hi_update()) {
3030 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
3031 } else if (irqtime_account_si_update()) {
3032 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
3033 } else if (this_cpu_ksoftirqd() == p) {
3034 /*
3035 * ksoftirqd time do not get accounted in cpu_softirq_time.
3036 * So, we have to handle it separately here.
3037 * Also, p->stime needs to be updated for ksoftirqd.
3038 */
3039 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3040 CPUTIME_SOFTIRQ);
3041 } else if (user_tick) {
3042 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3043 } else if (p == rq->idle) {
3044 account_idle_time(cputime_one_jiffy);
3045 } else if (p->flags & PF_VCPU) { /* System time or guest time */
3046 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
3047 } else {
3048 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3049 CPUTIME_SYSTEM);
3050 }
3051}
3052
3053static void irqtime_account_idle_ticks(int ticks)
3054{
3055 int i;
3056 struct rq *rq = this_rq();
3057
3058 for (i = 0; i < ticks; i++)
3059 irqtime_account_process_tick(current, 0, rq);
3060}
3061#else /* CONFIG_IRQ_TIME_ACCOUNTING */
3062static void irqtime_account_idle_ticks(int ticks) {}
3063static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3064 struct rq *rq) {}
3065#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
3066
3067/*
3068 * Account a single tick of cpu time.
3069 * @p: the process that the cpu time gets accounted to
3070 * @user_tick: indicates if the tick is a user or a system tick
3071 */
3072void account_process_tick(struct task_struct *p, int user_tick)
3073{
3074 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3075 struct rq *rq = this_rq();
3076
3077 if (sched_clock_irqtime) {
3078 irqtime_account_process_tick(p, user_tick, rq);
3079 return;
3080 }
3081
3082 if (steal_account_process_tick())
3083 return;
3084
3085 if (user_tick)
3086 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3087 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
3088 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
3089 one_jiffy_scaled);
3090 else
3091 account_idle_time(cputime_one_jiffy);
3092}
3093
3094/*
3095 * Account multiple ticks of steal time.
3096 * @p: the process from which the cpu time has been stolen
3097 * @ticks: number of stolen ticks
3098 */
3099void account_steal_ticks(unsigned long ticks)
3100{
3101 account_steal_time(jiffies_to_cputime(ticks));
3102}
3103
3104/*
3105 * Account multiple ticks of idle time.
3106 * @ticks: number of stolen ticks
3107 */
3108void account_idle_ticks(unsigned long ticks)
3109{
3110
3111 if (sched_clock_irqtime) {
3112 irqtime_account_idle_ticks(ticks);
3113 return;
3114 }
3115
3116 account_idle_time(jiffies_to_cputime(ticks));
3117}
3118
3119#endif
3120
3121/*
3122 * Use precise platform statistics if available:
3123 */
3124#ifdef CONFIG_VIRT_CPU_ACCOUNTING
3125void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3126{
3127 *ut = p->utime;
3128 *st = p->stime;
3129}
3130
3131void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3132{
3133 struct task_cputime cputime;
3134
3135 thread_group_cputime(p, &cputime);
3136
3137 *ut = cputime.utime;
3138 *st = cputime.stime;
3139}
3140#else
3141
3142#ifndef nsecs_to_cputime
3143# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
3144#endif
3145
3146static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
3147{
3148 u64 temp = (__force u64) rtime;
3149
3150 temp *= (__force u64) utime;
3151
3152 if (sizeof(cputime_t) == 4)
3153 temp = div_u64(temp, (__force u32) total);
3154 else
3155 temp = div64_u64(temp, (__force u64) total);
3156
3157 return (__force cputime_t) temp;
3158}
3159
3160void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3161{
3162 cputime_t rtime, utime = p->utime, total = utime + p->stime;
3163
3164 /*
3165 * Use CFS's precise accounting:
3166 */
3167 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3168
3169 if (total)
3170 utime = scale_utime(utime, rtime, total);
3171 else
3172 utime = rtime;
3173
3174 /*
3175 * Compare with previous values, to keep monotonicity:
3176 */
3177 p->prev_utime = max(p->prev_utime, utime);
3178 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
3179
3180 *ut = p->prev_utime;
3181 *st = p->prev_stime;
3182}
3183
3184/*
3185 * Must be called with siglock held.
3186 */
3187void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3188{
3189 struct signal_struct *sig = p->signal;
3190 struct task_cputime cputime;
3191 cputime_t rtime, utime, total;
3192
3193 thread_group_cputime(p, &cputime);
3194
3195 total = cputime.utime + cputime.stime;
3196 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3197
3198 if (total)
3199 utime = scale_utime(cputime.utime, rtime, total);
3200 else
3201 utime = rtime;
3202
3203 sig->prev_utime = max(sig->prev_utime, utime);
3204 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
3205
3206 *ut = sig->prev_utime;
3207 *st = sig->prev_stime;
3208}
3209#endif
3210
3211/* 2619/*
3212 * This function gets called by the timer code, with HZ frequency. 2620 * This function gets called by the timer code, with HZ frequency.
3213 * We call it with interrupts disabled. 2621 * We call it with interrupts disabled.
@@ -3368,6 +2776,40 @@ pick_next_task(struct rq *rq)
3368 2776
3369/* 2777/*
3370 * __schedule() is the main scheduler function. 2778 * __schedule() is the main scheduler function.
2779 *
2780 * The main means of driving the scheduler and thus entering this function are:
2781 *
2782 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
2783 *
2784 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
2785 * paths. For example, see arch/x86/entry_64.S.
2786 *
2787 * To drive preemption between tasks, the scheduler sets the flag in timer
2788 * interrupt handler scheduler_tick().
2789 *
2790 * 3. Wakeups don't really cause entry into schedule(). They add a
2791 * task to the run-queue and that's it.
2792 *
2793 * Now, if the new task added to the run-queue preempts the current
2794 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
2795 * called on the nearest possible occasion:
2796 *
2797 * - If the kernel is preemptible (CONFIG_PREEMPT=y):
2798 *
2799 * - in syscall or exception context, at the next outmost
2800 * preempt_enable(). (this might be as soon as the wake_up()'s
2801 * spin_unlock()!)
2802 *
2803 * - in IRQ context, return from interrupt-handler to
2804 * preemptible context
2805 *
2806 * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
2807 * then at the next:
2808 *
2809 * - cond_resched() call
2810 * - explicit schedule() call
2811 * - return from syscall or exception to user-space
2812 * - return from interrupt-handler to user-space
3371 */ 2813 */
3372static void __sched __schedule(void) 2814static void __sched __schedule(void)
3373{ 2815{
@@ -4885,13 +4327,6 @@ again:
4885 */ 4327 */
4886 if (preempt && rq != p_rq) 4328 if (preempt && rq != p_rq)
4887 resched_task(p_rq->curr); 4329 resched_task(p_rq->curr);
4888 } else {
4889 /*
4890 * We might have set it in task_yield_fair(), but are
4891 * not going to schedule(), so don't want to skip
4892 * the next update.
4893 */
4894 rq->skip_clock_update = 0;
4895 } 4330 }
4896 4331
4897out: 4332out:
@@ -5433,16 +4868,25 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
5433 *tablep = NULL; 4868 *tablep = NULL;
5434} 4869}
5435 4870
4871static int min_load_idx = 0;
4872static int max_load_idx = CPU_LOAD_IDX_MAX;
4873
5436static void 4874static void
5437set_table_entry(struct ctl_table *entry, 4875set_table_entry(struct ctl_table *entry,
5438 const char *procname, void *data, int maxlen, 4876 const char *procname, void *data, int maxlen,
5439 umode_t mode, proc_handler *proc_handler) 4877 umode_t mode, proc_handler *proc_handler,
4878 bool load_idx)
5440{ 4879{
5441 entry->procname = procname; 4880 entry->procname = procname;
5442 entry->data = data; 4881 entry->data = data;
5443 entry->maxlen = maxlen; 4882 entry->maxlen = maxlen;
5444 entry->mode = mode; 4883 entry->mode = mode;
5445 entry->proc_handler = proc_handler; 4884 entry->proc_handler = proc_handler;
4885
4886 if (load_idx) {
4887 entry->extra1 = &min_load_idx;
4888 entry->extra2 = &max_load_idx;
4889 }
5446} 4890}
5447 4891
5448static struct ctl_table * 4892static struct ctl_table *
@@ -5454,30 +4898,30 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
5454 return NULL; 4898 return NULL;
5455 4899
5456 set_table_entry(&table[0], "min_interval", &sd->min_interval, 4900 set_table_entry(&table[0], "min_interval", &sd->min_interval,
5457 sizeof(long), 0644, proc_doulongvec_minmax); 4901 sizeof(long), 0644, proc_doulongvec_minmax, false);
5458 set_table_entry(&table[1], "max_interval", &sd->max_interval, 4902 set_table_entry(&table[1], "max_interval", &sd->max_interval,
5459 sizeof(long), 0644, proc_doulongvec_minmax); 4903 sizeof(long), 0644, proc_doulongvec_minmax, false);
5460 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 4904 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
5461 sizeof(int), 0644, proc_dointvec_minmax); 4905 sizeof(int), 0644, proc_dointvec_minmax, true);
5462 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 4906 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
5463 sizeof(int), 0644, proc_dointvec_minmax); 4907 sizeof(int), 0644, proc_dointvec_minmax, true);
5464 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 4908 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
5465 sizeof(int), 0644, proc_dointvec_minmax); 4909 sizeof(int), 0644, proc_dointvec_minmax, true);
5466 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 4910 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
5467 sizeof(int), 0644, proc_dointvec_minmax); 4911 sizeof(int), 0644, proc_dointvec_minmax, true);
5468 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 4912 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
5469 sizeof(int), 0644, proc_dointvec_minmax); 4913 sizeof(int), 0644, proc_dointvec_minmax, true);
5470 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 4914 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
5471 sizeof(int), 0644, proc_dointvec_minmax); 4915 sizeof(int), 0644, proc_dointvec_minmax, false);
5472 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 4916 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5473 sizeof(int), 0644, proc_dointvec_minmax); 4917 sizeof(int), 0644, proc_dointvec_minmax, false);
5474 set_table_entry(&table[9], "cache_nice_tries", 4918 set_table_entry(&table[9], "cache_nice_tries",
5475 &sd->cache_nice_tries, 4919 &sd->cache_nice_tries,
5476 sizeof(int), 0644, proc_dointvec_minmax); 4920 sizeof(int), 0644, proc_dointvec_minmax, false);
5477 set_table_entry(&table[10], "flags", &sd->flags, 4921 set_table_entry(&table[10], "flags", &sd->flags,
5478 sizeof(int), 0644, proc_dointvec_minmax); 4922 sizeof(int), 0644, proc_dointvec_minmax, false);
5479 set_table_entry(&table[11], "name", sd->name, 4923 set_table_entry(&table[11], "name", sd->name,
5480 CORENAME_MAX_SIZE, 0444, proc_dostring); 4924 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
5481 /* &table[12] is terminator */ 4925 /* &table[12] is terminator */
5482 4926
5483 return table; 4927 return table;
@@ -6556,7 +6000,6 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
6556 | 0*SD_BALANCE_FORK 6000 | 0*SD_BALANCE_FORK
6557 | 0*SD_BALANCE_WAKE 6001 | 0*SD_BALANCE_WAKE
6558 | 0*SD_WAKE_AFFINE 6002 | 0*SD_WAKE_AFFINE
6559 | 0*SD_PREFER_LOCAL
6560 | 0*SD_SHARE_CPUPOWER 6003 | 0*SD_SHARE_CPUPOWER
6561 | 0*SD_SHARE_PKG_RESOURCES 6004 | 0*SD_SHARE_PKG_RESOURCES
6562 | 1*SD_SERIALIZE 6005 | 1*SD_SERIALIZE
@@ -8354,6 +7797,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8354 * (balbir@in.ibm.com). 7797 * (balbir@in.ibm.com).
8355 */ 7798 */
8356 7799
7800struct cpuacct root_cpuacct;
7801
8357/* create a new cpu accounting group */ 7802/* create a new cpu accounting group */
8358static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) 7803static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
8359{ 7804{
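For context, steal_ticks() (removed from core.c above and re-added to kernel/sched/sched.h later in this diff) merely converts a nanosecond steal delta into whole scheduler ticks, with the remainder carried forward in rq->prev_steal_time. Below is a minimal user-space sketch of that arithmetic, assuming HZ=1000 so TICK_NSEC is 1,000,000 ns; the kernel helpers themselves are not used here.

#include <stdint.h>
#include <stdio.h>

#define TICK_NSEC 1000000ULL	/* assumes HZ = 1000 */

/* Same result as steal_ticks(); the kernel version only special-cases
 * deltas above one second so the common path can avoid a full 64-bit
 * division on 32-bit machines. */
static uint64_t steal_ticks(uint64_t steal_ns)
{
	return steal_ns / TICK_NSEC;
}

int main(void)
{
	uint64_t steal_ns = 3500000;		/* 3.5 ms stolen by the host */
	uint64_t st = steal_ticks(steal_ns);	/* 3 ticks */

	/* steal_account_process_tick() then advances prev_steal_time by
	 * st * TICK_NSEC, leaving the 0.5 ms remainder for the next tick. */
	printf("ticks=%llu carried=%llu ns\n",
	       (unsigned long long)st,
	       (unsigned long long)(steal_ns - st * TICK_NSEC));
	return 0;
}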
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
new file mode 100644
index 000000000000..81b763ba58a6
--- /dev/null
+++ b/kernel/sched/cputime.c
@@ -0,0 +1,530 @@
1#include <linux/export.h>
2#include <linux/sched.h>
3#include <linux/tsacct_kern.h>
4#include <linux/kernel_stat.h>
5#include <linux/static_key.h>
6#include "sched.h"
7
8
9#ifdef CONFIG_IRQ_TIME_ACCOUNTING
10
11/*
12 * There are no locks covering percpu hardirq/softirq time.
13 * They are only modified in vtime_account, on corresponding CPU
14 * with interrupts disabled. So, writes are safe.
15 * They are read and saved off onto struct rq in update_rq_clock().
16 * This may result in other CPU reading this CPU's irq time and can
17 * race with irq/vtime_account on this CPU. We would either get old
18 * or new value with a side effect of accounting a slice of irq time to wrong
19 * task when irq is in progress while we read rq->clock. That is a worthy
20 * compromise in place of having locks on each irq in account_system_time.
21 */
22DEFINE_PER_CPU(u64, cpu_hardirq_time);
23DEFINE_PER_CPU(u64, cpu_softirq_time);
24
25static DEFINE_PER_CPU(u64, irq_start_time);
26static int sched_clock_irqtime;
27
28void enable_sched_clock_irqtime(void)
29{
30 sched_clock_irqtime = 1;
31}
32
33void disable_sched_clock_irqtime(void)
34{
35 sched_clock_irqtime = 0;
36}
37
38#ifndef CONFIG_64BIT
39DEFINE_PER_CPU(seqcount_t, irq_time_seq);
40#endif /* CONFIG_64BIT */
41
42/*
43 * Called before incrementing preempt_count on {soft,}irq_enter
44 * and before decrementing preempt_count on {soft,}irq_exit.
45 */
46void vtime_account(struct task_struct *curr)
47{
48 unsigned long flags;
49 s64 delta;
50 int cpu;
51
52 if (!sched_clock_irqtime)
53 return;
54
55 local_irq_save(flags);
56
57 cpu = smp_processor_id();
58 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
59 __this_cpu_add(irq_start_time, delta);
60
61 irq_time_write_begin();
62 /*
63 * We do not account for softirq time from ksoftirqd here.
64 * We want to continue accounting softirq time to ksoftirqd thread
65 * in that case, so as not to confuse scheduler with a special task
66 * that do not consume any time, but still wants to run.
67 */
68 if (hardirq_count())
69 __this_cpu_add(cpu_hardirq_time, delta);
70 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
71 __this_cpu_add(cpu_softirq_time, delta);
72
73 irq_time_write_end();
74 local_irq_restore(flags);
75}
76EXPORT_SYMBOL_GPL(vtime_account);
77
78static int irqtime_account_hi_update(void)
79{
80 u64 *cpustat = kcpustat_this_cpu->cpustat;
81 unsigned long flags;
82 u64 latest_ns;
83 int ret = 0;
84
85 local_irq_save(flags);
86 latest_ns = this_cpu_read(cpu_hardirq_time);
87 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
88 ret = 1;
89 local_irq_restore(flags);
90 return ret;
91}
92
93static int irqtime_account_si_update(void)
94{
95 u64 *cpustat = kcpustat_this_cpu->cpustat;
96 unsigned long flags;
97 u64 latest_ns;
98 int ret = 0;
99
100 local_irq_save(flags);
101 latest_ns = this_cpu_read(cpu_softirq_time);
102 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
103 ret = 1;
104 local_irq_restore(flags);
105 return ret;
106}
107
108#else /* CONFIG_IRQ_TIME_ACCOUNTING */
109
110#define sched_clock_irqtime (0)
111
112#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
113
114static inline void task_group_account_field(struct task_struct *p, int index,
115 u64 tmp)
116{
117#ifdef CONFIG_CGROUP_CPUACCT
118 struct kernel_cpustat *kcpustat;
119 struct cpuacct *ca;
120#endif
121 /*
122 * Since all updates are sure to touch the root cgroup, we
123 * get ourselves ahead and touch it first. If the root cgroup
124 * is the only cgroup, then nothing else should be necessary.
125 *
126 */
127 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
128
129#ifdef CONFIG_CGROUP_CPUACCT
130 if (unlikely(!cpuacct_subsys.active))
131 return;
132
133 rcu_read_lock();
134 ca = task_ca(p);
135 while (ca && (ca != &root_cpuacct)) {
136 kcpustat = this_cpu_ptr(ca->cpustat);
137 kcpustat->cpustat[index] += tmp;
138 ca = parent_ca(ca);
139 }
140 rcu_read_unlock();
141#endif
142}
143
144/*
145 * Account user cpu time to a process.
146 * @p: the process that the cpu time gets accounted to
147 * @cputime: the cpu time spent in user space since the last update
148 * @cputime_scaled: cputime scaled by cpu frequency
149 */
150void account_user_time(struct task_struct *p, cputime_t cputime,
151 cputime_t cputime_scaled)
152{
153 int index;
154
155 /* Add user time to process. */
156 p->utime += cputime;
157 p->utimescaled += cputime_scaled;
158 account_group_user_time(p, cputime);
159
160 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
161
162 /* Add user time to cpustat. */
163 task_group_account_field(p, index, (__force u64) cputime);
164
165 /* Account for user time used */
166 acct_update_integrals(p);
167}
168
169/*
170 * Account guest cpu time to a process.
171 * @p: the process that the cpu time gets accounted to
172 * @cputime: the cpu time spent in virtual machine since the last update
173 * @cputime_scaled: cputime scaled by cpu frequency
174 */
175static void account_guest_time(struct task_struct *p, cputime_t cputime,
176 cputime_t cputime_scaled)
177{
178 u64 *cpustat = kcpustat_this_cpu->cpustat;
179
180 /* Add guest time to process. */
181 p->utime += cputime;
182 p->utimescaled += cputime_scaled;
183 account_group_user_time(p, cputime);
184 p->gtime += cputime;
185
186 /* Add guest time to cpustat. */
187 if (TASK_NICE(p) > 0) {
188 cpustat[CPUTIME_NICE] += (__force u64) cputime;
189 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
190 } else {
191 cpustat[CPUTIME_USER] += (__force u64) cputime;
192 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
193 }
194}
195
196/*
197 * Account system cpu time to a process and desired cpustat field
198 * @p: the process that the cpu time gets accounted to
199 * @cputime: the cpu time spent in kernel space since the last update
200 * @cputime_scaled: cputime scaled by cpu frequency
201 * @target_cputime64: pointer to cpustat field that has to be updated
202 */
203static inline
204void __account_system_time(struct task_struct *p, cputime_t cputime,
205 cputime_t cputime_scaled, int index)
206{
207 /* Add system time to process. */
208 p->stime += cputime;
209 p->stimescaled += cputime_scaled;
210 account_group_system_time(p, cputime);
211
212 /* Add system time to cpustat. */
213 task_group_account_field(p, index, (__force u64) cputime);
214
215 /* Account for system time used */
216 acct_update_integrals(p);
217}
218
219/*
220 * Account system cpu time to a process.
221 * @p: the process that the cpu time gets accounted to
222 * @hardirq_offset: the offset to subtract from hardirq_count()
223 * @cputime: the cpu time spent in kernel space since the last update
224 * @cputime_scaled: cputime scaled by cpu frequency
225 */
226void account_system_time(struct task_struct *p, int hardirq_offset,
227 cputime_t cputime, cputime_t cputime_scaled)
228{
229 int index;
230
231 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
232 account_guest_time(p, cputime, cputime_scaled);
233 return;
234 }
235
236 if (hardirq_count() - hardirq_offset)
237 index = CPUTIME_IRQ;
238 else if (in_serving_softirq())
239 index = CPUTIME_SOFTIRQ;
240 else
241 index = CPUTIME_SYSTEM;
242
243 __account_system_time(p, cputime, cputime_scaled, index);
244}
245
246/*
247 * Account for involuntary wait time.
248 * @cputime: the cpu time spent in involuntary wait
249 */
250void account_steal_time(cputime_t cputime)
251{
252 u64 *cpustat = kcpustat_this_cpu->cpustat;
253
254 cpustat[CPUTIME_STEAL] += (__force u64) cputime;
255}
256
257/*
258 * Account for idle time.
259 * @cputime: the cpu time spent in idle wait
260 */
261void account_idle_time(cputime_t cputime)
262{
263 u64 *cpustat = kcpustat_this_cpu->cpustat;
264 struct rq *rq = this_rq();
265
266 if (atomic_read(&rq->nr_iowait) > 0)
267 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
268 else
269 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
270}
271
272static __always_inline bool steal_account_process_tick(void)
273{
274#ifdef CONFIG_PARAVIRT
275 if (static_key_false(&paravirt_steal_enabled)) {
276 u64 steal, st = 0;
277
278 steal = paravirt_steal_clock(smp_processor_id());
279 steal -= this_rq()->prev_steal_time;
280
281 st = steal_ticks(steal);
282 this_rq()->prev_steal_time += st * TICK_NSEC;
283
284 account_steal_time(st);
285 return st;
286 }
287#endif
288 return false;
289}
290
291#ifndef CONFIG_VIRT_CPU_ACCOUNTING
292
293#ifdef CONFIG_IRQ_TIME_ACCOUNTING
294/*
295 * Account a tick to a process and cpustat
296 * @p: the process that the cpu time gets accounted to
297 * @user_tick: is the tick from userspace
298 * @rq: the pointer to rq
299 *
300 * Tick demultiplexing follows the order
301 * - pending hardirq update
302 * - pending softirq update
303 * - user_time
304 * - idle_time
305 * - system time
306 * - check for guest_time
307 * - else account as system_time
308 *
309 * Check for hardirq is done both for system and user time as there is
310 * no timer going off while we are on hardirq and hence we may never get an
311 * opportunity to update it solely in system time.
312 * p->stime and friends are only updated on system time and not on irq
313 * softirq as those do not count in task exec_runtime any more.
314 */
315static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
316 struct rq *rq)
317{
318 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
319 u64 *cpustat = kcpustat_this_cpu->cpustat;
320
321 if (steal_account_process_tick())
322 return;
323
324 if (irqtime_account_hi_update()) {
325 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
326 } else if (irqtime_account_si_update()) {
327 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
328 } else if (this_cpu_ksoftirqd() == p) {
329 /*
330 * ksoftirqd time do not get accounted in cpu_softirq_time.
331 * So, we have to handle it separately here.
332 * Also, p->stime needs to be updated for ksoftirqd.
333 */
334 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
335 CPUTIME_SOFTIRQ);
336 } else if (user_tick) {
337 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
338 } else if (p == rq->idle) {
339 account_idle_time(cputime_one_jiffy);
340 } else if (p->flags & PF_VCPU) { /* System time or guest time */
341 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
342 } else {
343 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
344 CPUTIME_SYSTEM);
345 }
346}
347
348static void irqtime_account_idle_ticks(int ticks)
349{
350 int i;
351 struct rq *rq = this_rq();
352
353 for (i = 0; i < ticks; i++)
354 irqtime_account_process_tick(current, 0, rq);
355}
356#else /* CONFIG_IRQ_TIME_ACCOUNTING */
357static void irqtime_account_idle_ticks(int ticks) {}
358static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
359 struct rq *rq) {}
360#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
361
362/*
363 * Account a single tick of cpu time.
364 * @p: the process that the cpu time gets accounted to
365 * @user_tick: indicates if the tick is a user or a system tick
366 */
367void account_process_tick(struct task_struct *p, int user_tick)
368{
369 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
370 struct rq *rq = this_rq();
371
372 if (sched_clock_irqtime) {
373 irqtime_account_process_tick(p, user_tick, rq);
374 return;
375 }
376
377 if (steal_account_process_tick())
378 return;
379
380 if (user_tick)
381 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
382 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
383 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
384 one_jiffy_scaled);
385 else
386 account_idle_time(cputime_one_jiffy);
387}
388
389/*
390 * Account multiple ticks of steal time.
391 * @p: the process from which the cpu time has been stolen
392 * @ticks: number of stolen ticks
393 */
394void account_steal_ticks(unsigned long ticks)
395{
396 account_steal_time(jiffies_to_cputime(ticks));
397}
398
399/*
400 * Account multiple ticks of idle time.
401 * @ticks: number of stolen ticks
402 */
403void account_idle_ticks(unsigned long ticks)
404{
405
406 if (sched_clock_irqtime) {
407 irqtime_account_idle_ticks(ticks);
408 return;
409 }
410
411 account_idle_time(jiffies_to_cputime(ticks));
412}
413
414#endif
415
416/*
417 * Use precise platform statistics if available:
418 */
419#ifdef CONFIG_VIRT_CPU_ACCOUNTING
420void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
421{
422 *ut = p->utime;
423 *st = p->stime;
424}
425
426void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
427{
428 struct task_cputime cputime;
429
430 thread_group_cputime(p, &cputime);
431
432 *ut = cputime.utime;
433 *st = cputime.stime;
434}
435
436/*
437 * Archs that account the whole time spent in the idle task
438 * (outside irq) as idle time can rely on this and just implement
439 * vtime_account_system() and vtime_account_idle(). Archs that
440 * have other meaning of the idle time (s390 only includes the
441 * time spent by the CPU when it's in low power mode) must override
442 * vtime_account().
443 */
444#ifndef __ARCH_HAS_VTIME_ACCOUNT
445void vtime_account(struct task_struct *tsk)
446{
447 unsigned long flags;
448
449 local_irq_save(flags);
450
451 if (in_interrupt() || !is_idle_task(tsk))
452 vtime_account_system(tsk);
453 else
454 vtime_account_idle(tsk);
455
456 local_irq_restore(flags);
457}
458EXPORT_SYMBOL_GPL(vtime_account);
459#endif /* __ARCH_HAS_VTIME_ACCOUNT */
460
461#else
462
463#ifndef nsecs_to_cputime
464# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
465#endif
466
467static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
468{
469 u64 temp = (__force u64) rtime;
470
471 temp *= (__force u64) utime;
472
473 if (sizeof(cputime_t) == 4)
474 temp = div_u64(temp, (__force u32) total);
475 else
476 temp = div64_u64(temp, (__force u64) total);
477
478 return (__force cputime_t) temp;
479}
480
481void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
482{
483 cputime_t rtime, utime = p->utime, total = utime + p->stime;
484
485 /*
486 * Use CFS's precise accounting:
487 */
488 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
489
490 if (total)
491 utime = scale_utime(utime, rtime, total);
492 else
493 utime = rtime;
494
495 /*
496 * Compare with previous values, to keep monotonicity:
497 */
498 p->prev_utime = max(p->prev_utime, utime);
499 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
500
501 *ut = p->prev_utime;
502 *st = p->prev_stime;
503}
504
505/*
506 * Must be called with siglock held.
507 */
508void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
509{
510 struct signal_struct *sig = p->signal;
511 struct task_cputime cputime;
512 cputime_t rtime, utime, total;
513
514 thread_group_cputime(p, &cputime);
515
516 total = cputime.utime + cputime.stime;
517 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
518
519 if (total)
520 utime = scale_utime(cputime.utime, rtime, total);
521 else
522 utime = rtime;
523
524 sig->prev_utime = max(sig->prev_utime, utime);
525 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
526
527 *ut = sig->prev_utime;
528 *st = sig->prev_stime;
529}
530#endif
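For reference, here is a minimal user-space sketch of the scaling and monotonicity logic implemented by scale_utime() and task_times() above, using made-up jiffies values; the cputime_t conversions and the div_u64()/div64_u64() overflow handling are left out.

#include <stdint.h>
#include <stdio.h>

/* Same arithmetic as scale_utime(): redistribute the precise CFS runtime
 * (rtime) between user and system time in the ratio that the tick-based
 * samples recorded. */
static uint64_t scale_utime(uint64_t utime, uint64_t rtime, uint64_t total)
{
	return total ? (utime * rtime) / total : rtime;
}

int main(void)
{
	/* Tick sampling saw 300 user + 100 system jiffies, but CFS says the
	 * task really accumulated 500 jiffies worth of runtime. */
	uint64_t utime = 300, stime = 100, rtime = 500;
	uint64_t prev_utime = 100, prev_stime = 50;	/* last values reported */

	uint64_t new_utime = scale_utime(utime, rtime, utime + stime);	/* 375 */

	/* Keep the values reported to userspace monotonic, as task_times() does. */
	prev_utime = new_utime > prev_utime ? new_utime : prev_utime;	/* 375 */
	uint64_t new_stime = rtime - prev_utime;			/* 125 */
	prev_stime = new_stime > prev_stime ? new_stime : prev_stime;	/* 125 */

	printf("ut=%llu st=%llu\n",
	       (unsigned long long)prev_utime, (unsigned long long)prev_stime);
	return 0;
}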
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 96e2b18b6283..6b800a14b990 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -597,7 +597,7 @@ calc_delta_fair(unsigned long delta, struct sched_entity *se)
597/* 597/*
598 * The idea is to set a period in which each task runs once. 598 * The idea is to set a period in which each task runs once.
599 * 599 *
600 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch 600 * When there are too many tasks (sched_nr_latency) we have to stretch
601 * this period because otherwise the slices get too small. 601 * this period because otherwise the slices get too small.
602 * 602 *
603 * p = (nr <= nl) ? l : l*nr/nl 603 * p = (nr <= nl) ? l : l*nr/nl
@@ -2700,7 +2700,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2700 int prev_cpu = task_cpu(p); 2700 int prev_cpu = task_cpu(p);
2701 int new_cpu = cpu; 2701 int new_cpu = cpu;
2702 int want_affine = 0; 2702 int want_affine = 0;
2703 int want_sd = 1;
2704 int sync = wake_flags & WF_SYNC; 2703 int sync = wake_flags & WF_SYNC;
2705 2704
2706 if (p->nr_cpus_allowed == 1) 2705 if (p->nr_cpus_allowed == 1)
@@ -2718,48 +2717,21 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2718 continue; 2717 continue;
2719 2718
2720 /* 2719 /*
2721 * If power savings logic is enabled for a domain, see if we
2722 * are not overloaded, if so, don't balance wider.
2723 */
2724 if (tmp->flags & (SD_PREFER_LOCAL)) {
2725 unsigned long power = 0;
2726 unsigned long nr_running = 0;
2727 unsigned long capacity;
2728 int i;
2729
2730 for_each_cpu(i, sched_domain_span(tmp)) {
2731 power += power_of(i);
2732 nr_running += cpu_rq(i)->cfs.nr_running;
2733 }
2734
2735 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
2736
2737 if (nr_running < capacity)
2738 want_sd = 0;
2739 }
2740
2741 /*
2742 * If both cpu and prev_cpu are part of this domain, 2720 * If both cpu and prev_cpu are part of this domain,
2743 * cpu is a valid SD_WAKE_AFFINE target. 2721 * cpu is a valid SD_WAKE_AFFINE target.
2744 */ 2722 */
2745 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && 2723 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
2746 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { 2724 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
2747 affine_sd = tmp; 2725 affine_sd = tmp;
2748 want_affine = 0;
2749 }
2750
2751 if (!want_sd && !want_affine)
2752 break; 2726 break;
2727 }
2753 2728
2754 if (!(tmp->flags & sd_flag)) 2729 if (tmp->flags & sd_flag)
2755 continue;
2756
2757 if (want_sd)
2758 sd = tmp; 2730 sd = tmp;
2759 } 2731 }
2760 2732
2761 if (affine_sd) { 2733 if (affine_sd) {
2762 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) 2734 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
2763 prev_cpu = cpu; 2735 prev_cpu = cpu;
2764 2736
2765 new_cpu = select_idle_sibling(p, prev_cpu); 2737 new_cpu = select_idle_sibling(p, prev_cpu);
@@ -4295,7 +4267,7 @@ redo:
4295 goto out_balanced; 4267 goto out_balanced;
4296 } 4268 }
4297 4269
4298 BUG_ON(busiest == this_rq); 4270 BUG_ON(busiest == env.dst_rq);
4299 4271
4300 schedstat_add(sd, lb_imbalance[idle], env.imbalance); 4272 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
4301 4273
@@ -4316,7 +4288,7 @@ redo:
4316 update_h_load(env.src_cpu); 4288 update_h_load(env.src_cpu);
4317more_balance: 4289more_balance:
4318 local_irq_save(flags); 4290 local_irq_save(flags);
4319 double_rq_lock(this_rq, busiest); 4291 double_rq_lock(env.dst_rq, busiest);
4320 4292
4321 /* 4293 /*
4322 * cur_ld_moved - load moved in current iteration 4294 * cur_ld_moved - load moved in current iteration
@@ -4324,7 +4296,7 @@ more_balance:
4324 */ 4296 */
4325 cur_ld_moved = move_tasks(&env); 4297 cur_ld_moved = move_tasks(&env);
4326 ld_moved += cur_ld_moved; 4298 ld_moved += cur_ld_moved;
4327 double_rq_unlock(this_rq, busiest); 4299 double_rq_unlock(env.dst_rq, busiest);
4328 local_irq_restore(flags); 4300 local_irq_restore(flags);
4329 4301
4330 if (env.flags & LBF_NEED_BREAK) { 4302 if (env.flags & LBF_NEED_BREAK) {
@@ -4360,8 +4332,7 @@ more_balance:
4360 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && 4332 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
4361 lb_iterations++ < max_lb_iterations) { 4333 lb_iterations++ < max_lb_iterations) {
4362 4334
4363 this_rq = cpu_rq(env.new_dst_cpu); 4335 env.dst_rq = cpu_rq(env.new_dst_cpu);
4364 env.dst_rq = this_rq;
4365 env.dst_cpu = env.new_dst_cpu; 4336 env.dst_cpu = env.new_dst_cpu;
4366 env.flags &= ~LBF_SOME_PINNED; 4337 env.flags &= ~LBF_SOME_PINNED;
4367 env.loop = 0; 4338 env.loop = 0;
@@ -4646,7 +4617,7 @@ static void nohz_balancer_kick(int cpu)
4646 return; 4617 return;
4647} 4618}
4648 4619
4649static inline void clear_nohz_tick_stopped(int cpu) 4620static inline void nohz_balance_exit_idle(int cpu)
4650{ 4621{
4651 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 4622 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
4652 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 4623 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
@@ -4686,28 +4657,23 @@ void set_cpu_sd_state_idle(void)
4686} 4657}
4687 4658
4688/* 4659/*
4689 * This routine will record that this cpu is going idle with tick stopped. 4660 * This routine will record that the cpu is going idle with tick stopped.
4690 * This info will be used in performing idle load balancing in the future. 4661 * This info will be used in performing idle load balancing in the future.
4691 */ 4662 */
4692void select_nohz_load_balancer(int stop_tick) 4663void nohz_balance_enter_idle(int cpu)
4693{ 4664{
4694 int cpu = smp_processor_id();
4695
4696 /* 4665 /*
4697 * If this cpu is going down, then nothing needs to be done. 4666 * If this cpu is going down, then nothing needs to be done.
4698 */ 4667 */
4699 if (!cpu_active(cpu)) 4668 if (!cpu_active(cpu))
4700 return; 4669 return;
4701 4670
4702 if (stop_tick) { 4671 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
4703 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) 4672 return;
4704 return;
4705 4673
4706 cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 4674 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
4707 atomic_inc(&nohz.nr_cpus); 4675 atomic_inc(&nohz.nr_cpus);
4708 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 4676 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
4709 }
4710 return;
4711} 4677}
4712 4678
4713static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, 4679static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
@@ -4715,7 +4681,7 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
4715{ 4681{
4716 switch (action & ~CPU_TASKS_FROZEN) { 4682 switch (action & ~CPU_TASKS_FROZEN) {
4717 case CPU_DYING: 4683 case CPU_DYING:
4718 clear_nohz_tick_stopped(smp_processor_id()); 4684 nohz_balance_exit_idle(smp_processor_id());
4719 return NOTIFY_OK; 4685 return NOTIFY_OK;
4720 default: 4686 default:
4721 return NOTIFY_DONE; 4687 return NOTIFY_DONE;
@@ -4837,14 +4803,15 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4837 if (need_resched()) 4803 if (need_resched())
4838 break; 4804 break;
4839 4805
4840 raw_spin_lock_irq(&this_rq->lock); 4806 rq = cpu_rq(balance_cpu);
4841 update_rq_clock(this_rq); 4807
4842 update_idle_cpu_load(this_rq); 4808 raw_spin_lock_irq(&rq->lock);
4843 raw_spin_unlock_irq(&this_rq->lock); 4809 update_rq_clock(rq);
4810 update_idle_cpu_load(rq);
4811 raw_spin_unlock_irq(&rq->lock);
4844 4812
4845 rebalance_domains(balance_cpu, CPU_IDLE); 4813 rebalance_domains(balance_cpu, CPU_IDLE);
4846 4814
4847 rq = cpu_rq(balance_cpu);
4848 if (time_after(this_rq->next_balance, rq->next_balance)) 4815 if (time_after(this_rq->next_balance, rq->next_balance))
4849 this_rq->next_balance = rq->next_balance; 4816 this_rq->next_balance = rq->next_balance;
4850 } 4817 }
@@ -4875,7 +4842,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
4875 * busy tick after returning from idle, we will update the busy stats. 4842 * busy tick after returning from idle, we will update the busy stats.
4876 */ 4843 */
4877 set_cpu_sd_state_busy(); 4844 set_cpu_sd_state_busy();
4878 clear_nohz_tick_stopped(cpu); 4845 nohz_balance_exit_idle(cpu);
4879 4846
4880 /* 4847 /*
4881 * None are in tickless mode and hence no need for NOHZ idle load 4848 * None are in tickless mode and hence no need for NOHZ idle load
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index de00a486c5c6..eebefcad7027 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -12,14 +12,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
12SCHED_FEAT(START_DEBIT, true) 12SCHED_FEAT(START_DEBIT, true)
13 13
14/* 14/*
15 * Based on load and program behaviour, see if it makes sense to place
16 * a newly woken task on the same cpu as the task that woke it --
17 * improve cache locality. Typically used with SYNC wakeups as
18 * generated by pipes and the like, see also SYNC_WAKEUPS.
19 */
20SCHED_FEAT(AFFINE_WAKEUPS, true)
21
22/*
23 * Prefer to schedule the task we woke last (assuming it failed 15 * Prefer to schedule the task we woke last (assuming it failed
24 * wakeup-preemption), since its likely going to consume data we 16 * wakeup-preemption), since its likely going to consume data we
25 * touched, increases cache locality. 17 * touched, increases cache locality.
@@ -42,7 +34,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
42/* 34/*
43 * Use arch dependent cpu power functions 35 * Use arch dependent cpu power functions
44 */ 36 */
45SCHED_FEAT(ARCH_POWER, false) 37SCHED_FEAT(ARCH_POWER, true)
46 38
47SCHED_FEAT(HRTICK, false) 39SCHED_FEAT(HRTICK, false)
48SCHED_FEAT(DOUBLE_TICK, false) 40SCHED_FEAT(DOUBLE_TICK, false)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e0b7ba9c040f..418feb01344e 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1632,11 +1632,6 @@ static int push_rt_task(struct rq *rq)
1632 if (!next_task) 1632 if (!next_task)
1633 return 0; 1633 return 0;
1634 1634
1635#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1636 if (unlikely(task_running(rq, next_task)))
1637 return 0;
1638#endif
1639
1640retry: 1635retry:
1641 if (unlikely(next_task == rq->curr)) { 1636 if (unlikely(next_task == rq->curr)) {
1642 WARN_ON(1); 1637 WARN_ON(1);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0848fa36c383..7a7db09cfabc 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -737,11 +737,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
737 */ 737 */
738 next->on_cpu = 1; 738 next->on_cpu = 1;
739#endif 739#endif
740#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
741 raw_spin_unlock_irq(&rq->lock);
742#else
743 raw_spin_unlock(&rq->lock); 740 raw_spin_unlock(&rq->lock);
744#endif
745} 741}
746 742
747static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 743static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
@@ -755,9 +751,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
755 smp_wmb(); 751 smp_wmb();
756 prev->on_cpu = 0; 752 prev->on_cpu = 0;
757#endif 753#endif
758#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
759 local_irq_enable(); 754 local_irq_enable();
760#endif
761} 755}
762#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 756#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
763 757
@@ -891,6 +885,9 @@ struct cpuacct {
891 struct kernel_cpustat __percpu *cpustat; 885 struct kernel_cpustat __percpu *cpustat;
892}; 886};
893 887
888extern struct cgroup_subsys cpuacct_subsys;
889extern struct cpuacct root_cpuacct;
890
894/* return cpu accounting group corresponding to this container */ 891/* return cpu accounting group corresponding to this container */
895static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) 892static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
896{ 893{
@@ -917,6 +914,16 @@ extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
917static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 914static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
918#endif 915#endif
919 916
917#ifdef CONFIG_PARAVIRT
918static inline u64 steal_ticks(u64 steal)
919{
920 if (unlikely(steal > NSEC_PER_SEC))
921 return div_u64(steal, TICK_NSEC);
922
923 return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
924}
925#endif
926
920static inline void inc_nr_running(struct rq *rq) 927static inline void inc_nr_running(struct rq *rq)
921{ 928{
922 rq->nr_running++; 929 rq->nr_running++;
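The steal_ticks() helper added above converts a nanosecond steal-time delta into ticks. For typical deltas well under one second the quotient is small, so __iter_div_u64_rem() (division by repeated subtraction) is cheaper than a full 64-bit divide, which is kept only for the unlikely case of a backlog larger than a second. A standalone sketch of the same arithmetic, assuming an illustrative HZ of 100 so one tick is 10 ms:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL
#define HZ           100ULL                  /* assumed tick rate */
#define TICK_NSEC    (NSEC_PER_SEC / HZ)     /* 10 ms per tick */

/* Divide by repeated subtraction; cheap when the quotient is small.
 * The remainder is written back through 'rem'. */
static uint64_t iter_div_u64_rem(uint64_t dividend, uint64_t divisor,
                                 uint64_t *rem)
{
        uint64_t quot = 0;

        while (dividend >= divisor) {
                dividend -= divisor;
                quot++;
        }
        *rem = dividend;
        return quot;
}

static uint64_t toy_steal_ticks(uint64_t steal)
{
        if (steal > NSEC_PER_SEC)            /* unusually large backlog */
                return steal / TICK_NSEC;

        return iter_div_u64_rem(steal, TICK_NSEC, &steal);
}

int main(void)
{
        uint64_t steal = 35ULL * 1000 * 1000;   /* 35 ms of stolen time */

        printf("%" PRIu64 " ticks\n", toy_steal_ticks(steal));
        return 0;
}

With 35 ms of steal time and a 10 ms tick the sketch prints 3 ticks; the remaining 5 ms stays below one tick and is not counted yet.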
@@ -1156,3 +1163,53 @@ enum rq_nohz_flag_bits {
1156 1163
1157#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) 1164#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
1158#endif 1165#endif
1166
1167#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1168
1169DECLARE_PER_CPU(u64, cpu_hardirq_time);
1170DECLARE_PER_CPU(u64, cpu_softirq_time);
1171
1172#ifndef CONFIG_64BIT
1173DECLARE_PER_CPU(seqcount_t, irq_time_seq);
1174
1175static inline void irq_time_write_begin(void)
1176{
1177 __this_cpu_inc(irq_time_seq.sequence);
1178 smp_wmb();
1179}
1180
1181static inline void irq_time_write_end(void)
1182{
1183 smp_wmb();
1184 __this_cpu_inc(irq_time_seq.sequence);
1185}
1186
1187static inline u64 irq_time_read(int cpu)
1188{
1189 u64 irq_time;
1190 unsigned seq;
1191
1192 do {
1193 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1194 irq_time = per_cpu(cpu_softirq_time, cpu) +
1195 per_cpu(cpu_hardirq_time, cpu);
1196 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1197
1198 return irq_time;
1199}
1200#else /* CONFIG_64BIT */
1201static inline void irq_time_write_begin(void)
1202{
1203}
1204
1205static inline void irq_time_write_end(void)
1206{
1207}
1208
1209static inline u64 irq_time_read(int cpu)
1210{
1211 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1212}
1213#endif /* CONFIG_64BIT */
1214#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
1215
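The per-cpu cpu_hardirq_time/cpu_softirq_time declarations and their seqcount helpers move into sched.h so the new cputime.c can share them. The protocol matters on 32-bit builds, where a 64-bit counter cannot be read atomically: the writer increments a sequence counter before and after the update (so it is odd while an update is in flight) and the reader retries whenever it sees an odd or changed sequence; on 64-bit the write helpers are empty and irq_time_read() is a plain sum. A simplified single-writer sketch of that retry protocol using C11 atomics (toy names throughout; a production seqlock needs more care with memory ordering and data-race rules):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic unsigned int toy_seq;
static uint64_t toy_irq_time;   /* imagine this is two 32-bit halves */

static void toy_write_begin(void)
{
        atomic_fetch_add_explicit(&toy_seq, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_release); /* roughly smp_wmb() */
}

static void toy_write_end(void)
{
        atomic_thread_fence(memory_order_release);
        atomic_fetch_add_explicit(&toy_seq, 1, memory_order_relaxed);
}

static uint64_t toy_read(void)
{
        unsigned int seq;
        uint64_t val;

        do {
                seq = atomic_load_explicit(&toy_seq, memory_order_acquire);
                val = toy_irq_time;
                atomic_thread_fence(memory_order_acquire);
                /* Retry if a write was in flight or completed meanwhile. */
        } while ((seq & 1) ||
                 seq != atomic_load_explicit(&toy_seq, memory_order_relaxed));

        return val;
}

int main(void)
{
        toy_write_begin();
        toy_irq_time += 123456;        /* charge some irq time */
        toy_write_end();

        printf("irq time: %llu ns\n", (unsigned long long)toy_read());
        return 0;
}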
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 5c6a5bd8462f..cc96bdc0c2c9 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void)
221 current->flags &= ~PF_MEMALLOC; 221 current->flags &= ~PF_MEMALLOC;
222 222
223 pending = local_softirq_pending(); 223 pending = local_softirq_pending();
224 account_system_vtime(current); 224 vtime_account(current);
225 225
226 __local_bh_disable((unsigned long)__builtin_return_address(0), 226 __local_bh_disable((unsigned long)__builtin_return_address(0),
227 SOFTIRQ_OFFSET); 227 SOFTIRQ_OFFSET);
@@ -272,7 +272,7 @@ restart:
272 272
273 lockdep_softirq_exit(); 273 lockdep_softirq_exit();
274 274
275 account_system_vtime(current); 275 vtime_account(current);
276 __local_bh_enable(SOFTIRQ_OFFSET); 276 __local_bh_enable(SOFTIRQ_OFFSET);
277 tsk_restore_flags(current, old_flags, PF_MEMALLOC); 277 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
278} 278}
@@ -341,7 +341,7 @@ static inline void invoke_softirq(void)
341 */ 341 */
342void irq_exit(void) 342void irq_exit(void)
343{ 343{
344 account_system_vtime(current); 344 vtime_account(current);
345 trace_hardirq_exit(); 345 trace_hardirq_exit();
346 sub_preempt_count(IRQ_EXIT_OFFSET); 346 sub_preempt_count(IRQ_EXIT_OFFSET);
347 if (!in_interrupt() && local_softirq_pending()) 347 if (!in_interrupt() && local_softirq_pending())
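In softirq.c the hooks at softirq entry, softirq exit and irq_exit() now call vtime_account() instead of account_system_vtime(), part of moving the virtual cputime hooks under a single vtime_* naming scheme. Conceptually such a hook charges the time elapsed since the previous context boundary to the bucket for the context being left. A toy sketch of that boundary-accounting idea; the contexts, bucket names and now_ns() helper are illustrative, not the kernel's accounting code:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

enum ctx { CTX_SYSTEM, CTX_SOFTIRQ, CTX_NR };

static const char *ctx_name[CTX_NR] = { "system", "softirq" };
static uint64_t bucket_ns[CTX_NR];
static enum ctx cur_ctx = CTX_SYSTEM;
static uint64_t last_stamp;

static uint64_t now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/* Charge the time since the last boundary to the context being left,
 * then switch to the new context; called at softirq entry and exit. */
static void toy_vtime_account(enum ctx next)
{
        uint64_t now = now_ns();

        bucket_ns[cur_ctx] += now - last_stamp;
        last_stamp = now;
        cur_ctx = next;
}

int main(void)
{
        last_stamp = now_ns();

        toy_vtime_account(CTX_SOFTIRQ);   /* entering softirq work */
        for (volatile int i = 0; i < 1000000; i++)
                ;                         /* pretend softirq payload */
        toy_vtime_account(CTX_SYSTEM);    /* leaving softirq work */

        for (int c = 0; c < CTX_NR; c++)
                printf("%s: %llu ns\n", ctx_name[c],
                       (unsigned long long)bucket_ns[c]);
        return 0;
}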
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 87174ef59161..81c7b1a1a307 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -307,7 +307,7 @@ static struct ctl_table kern_table[] = {
307 .extra2 = &max_sched_tunable_scaling, 307 .extra2 = &max_sched_tunable_scaling,
308 }, 308 },
309 { 309 {
310 .procname = "sched_migration_cost", 310 .procname = "sched_migration_cost_ns",
311 .data = &sysctl_sched_migration_cost, 311 .data = &sysctl_sched_migration_cost,
312 .maxlen = sizeof(unsigned int), 312 .maxlen = sizeof(unsigned int),
313 .mode = 0644, 313 .mode = 0644,
@@ -321,14 +321,14 @@ static struct ctl_table kern_table[] = {
321 .proc_handler = proc_dointvec, 321 .proc_handler = proc_dointvec,
322 }, 322 },
323 { 323 {
324 .procname = "sched_time_avg", 324 .procname = "sched_time_avg_ms",
325 .data = &sysctl_sched_time_avg, 325 .data = &sysctl_sched_time_avg,
326 .maxlen = sizeof(unsigned int), 326 .maxlen = sizeof(unsigned int),
327 .mode = 0644, 327 .mode = 0644,
328 .proc_handler = proc_dointvec, 328 .proc_handler = proc_dointvec,
329 }, 329 },
330 { 330 {
331 .procname = "sched_shares_window", 331 .procname = "sched_shares_window_ns",
332 .data = &sysctl_sched_shares_window, 332 .data = &sysctl_sched_shares_window,
333 .maxlen = sizeof(unsigned int), 333 .maxlen = sizeof(unsigned int),
334 .mode = 0644, 334 .mode = 0644,
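The sysctl.c hunk only renames the knobs so each name carries its unit: sched_migration_cost becomes sched_migration_cost_ns, sched_time_avg becomes sched_time_avg_ms and sched_shares_window becomes sched_shares_window_ns; the backing variables and handlers are unchanged. Since the entries live in kern_table they appear under /proc/sys/kernel/. A small reader for one of the renamed knobs (path assumed from the table above, minimal error handling):

#include <stdio.h>

int main(void)
{
        const char *path = "/proc/sys/kernel/sched_migration_cost_ns";
        FILE *f = fopen(path, "r");
        unsigned long cost_ns;

        if (!f) {
                perror(path);
                return 1;
        }
        if (fscanf(f, "%lu", &cost_ns) == 1)
                printf("migration cost: %lu ns\n", cost_ns);
        fclose(f);
        return 0;
}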
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index cf5f6b262673..f423bdd035c2 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -372,7 +372,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
372 * the scheduler tick in nohz_restart_sched_tick. 372 * the scheduler tick in nohz_restart_sched_tick.
373 */ 373 */
374 if (!ts->tick_stopped) { 374 if (!ts->tick_stopped) {
375 select_nohz_load_balancer(1); 375 nohz_balance_enter_idle(cpu);
376 calc_load_enter_idle(); 376 calc_load_enter_idle();
377 377
378 ts->last_tick = hrtimer_get_expires(&ts->sched_timer); 378 ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
@@ -570,7 +570,6 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
570static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) 570static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
571{ 571{
572 /* Update jiffies first */ 572 /* Update jiffies first */
573 select_nohz_load_balancer(0);
574 tick_do_update_jiffies64(now); 573 tick_do_update_jiffies64(now);
575 update_cpu_load_nohz(); 574 update_cpu_load_nohz();
576 575
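The tick-sched.c changes wire up the renamed interface: when a CPU stops its tick it calls nohz_balance_enter_idle(cpu) to mark itself as a candidate for nohz idle balancing, and the old select_nohz_load_balancer(0) call on tick restart is dropped because leaving the idle set is handled by nohz_balance_exit_idle() from the tick path (see the fair.c hunk earlier). In essence the balancer keeps a mask and count of tickless CPUs; a toy sketch of that bookkeeping, using an illustrative bitmask instead of a cpumask:

#include <stdio.h>

static unsigned int nohz_idle_mask;   /* bit set: CPU is tickless idle */
static int nr_nohz_cpus;

static void toy_balance_enter_idle(int cpu)
{
        if (nohz_idle_mask & (1u << cpu))
                return;                /* already marked */
        nohz_idle_mask |= 1u << cpu;
        nr_nohz_cpus++;
}

static void toy_balance_exit_idle(int cpu)
{
        if (!(nohz_idle_mask & (1u << cpu)))
                return;
        nohz_idle_mask &= ~(1u << cpu);
        nr_nohz_cpus--;
}

int main(void)
{
        toy_balance_enter_idle(2);     /* CPU 2 stops its tick */
        toy_balance_enter_idle(5);
        toy_balance_exit_idle(2);      /* CPU 2 takes a tick again */

        printf("tickless CPUs: %d (mask 0x%x)\n",
               nr_nohz_cpus, nohz_idle_mask);
        return 0;
}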