path: root/kernel
author		Ingo Molnar <mingo@kernel.org>	2013-08-14 11:58:56 -0400
committer	Ingo Molnar <mingo@kernel.org>	2013-08-14 11:58:56 -0400
commit		6f1d657668ac3041b65265d3653d7e9172a0d603 (patch)
tree		6e837c683783708637cc4caf9de759166c7469b7 /kernel
parent		d4e4ab86bcba5a72779c43dc1459f71fea3d89c8 (diff)
parent		d13508f9440e46dccac6a2dd48d51a73b2207482 (diff)
Merge branch 'timers/nohz-v3' of git://git.kernel.org/pub/scm/linux/kernel/git/frederic/linux-dynticks into timers/nohz
Pull nohz improvements from Frederic Weisbecker:

 " It mostly contains fixes and full dynticks off-case optimizations. I believe
   that distros want to enable this feature so it seems important to optimize
   the case where the "nohz_full=" parameter is empty. ie: I'm trying to remove
   any performance regression that comes with NO_HZ_FULL=y when the feature is
   not used.

   This patchset improves the current situation a lot (off-case appears to be
   around 11% faster with hackbench, although I guess it may vary depending on
   the configuration but it should be significantly faster in any case) now
   there is still some work to do: I can still observe a remaining loss of 1.6%
   throughput seen with hackbench compared to CONFIG_NO_HZ_FULL=n. "

Signed-off-by: Ingo Molnar <mingo@kernel.org>
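The off-case optimization described above rests on the jump-label machinery visible in the kernel/context_tracking.c diff below: the tracking hooks are guarded by a static key, so with an empty "nohz_full=" the guarded branch is patched out and callers pay almost nothing. A minimal sketch of that pattern follows; the wrapper name is illustrative and not part of this merge.

#include <linux/jump_label.h>

/*
 * Defaults to false: the guarded branch stays patched out until a CPU opts in
 * via context_tracking_cpu_set(), which does static_key_slow_inc().
 */
struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE;

void context_tracking_user_enter(void);		/* defined in the diff below */

/* Illustrative caller-side hook: near-zero cost while the key is false. */
static inline void example_user_enter_hook(void)
{
	if (static_key_false(&context_tracking_enabled))
		context_tracking_user_enter();
}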
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/context_tracking.c	125
-rw-r--r--	kernel/sched/core.c		4
-rw-r--r--	kernel/sched/cputime.c		53
-rw-r--r--	kernel/time/Kconfig		1
-rw-r--r--	kernel/time/sched_clock.c	2
-rw-r--r--	kernel/time/tick-sched.c	59
6 files changed, 116 insertions(+), 128 deletions(-)
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 383f8231e436..247091bf0587 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -20,22 +20,33 @@
 #include <linux/hardirq.h>
 #include <linux/export.h>
 
-DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
-#ifdef CONFIG_CONTEXT_TRACKING_FORCE
-	.active = true,
-#endif
-};
+#define CREATE_TRACE_POINTS
+#include <trace/events/context_tracking.h>
+
+struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL_GPL(context_tracking_enabled);
+
+DEFINE_PER_CPU(struct context_tracking, context_tracking);
+EXPORT_SYMBOL_GPL(context_tracking);
+
+void context_tracking_cpu_set(int cpu)
+{
+	if (!per_cpu(context_tracking.active, cpu)) {
+		per_cpu(context_tracking.active, cpu) = true;
+		static_key_slow_inc(&context_tracking_enabled);
+	}
+}
 
 /**
- * user_enter - Inform the context tracking that the CPU is going to
- *              enter userspace mode.
+ * context_tracking_user_enter - Inform the context tracking that the CPU is going to
+ *                               enter userspace mode.
  *
  * This function must be called right before we switch from the kernel
  * to userspace, when it's guaranteed the remaining kernel instructions
  * to execute won't use any RCU read side critical section because this
  * function sets RCU in extended quiescent state.
  */
-void user_enter(void)
+void context_tracking_user_enter(void)
 {
 	unsigned long flags;
 
@@ -54,17 +65,32 @@ void user_enter(void)
 	WARN_ON_ONCE(!current->mm);
 
 	local_irq_save(flags);
-	if (__this_cpu_read(context_tracking.active) &&
-	    __this_cpu_read(context_tracking.state) != IN_USER) {
+	if ( __this_cpu_read(context_tracking.state) != IN_USER) {
+		if (__this_cpu_read(context_tracking.active)) {
+			trace_user_enter(0);
+			/*
+			 * At this stage, only low level arch entry code remains and
+			 * then we'll run in userspace. We can assume there won't be
+			 * any RCU read-side critical section until the next call to
+			 * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
+			 * on the tick.
+			 */
+			vtime_user_enter(current);
+			rcu_user_enter();
+		}
 		/*
-		 * At this stage, only low level arch entry code remains and
-		 * then we'll run in userspace. We can assume there won't be
-		 * any RCU read-side critical section until the next call to
-		 * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
-		 * on the tick.
+		 * Even if context tracking is disabled on this CPU, because it's outside
+		 * the full dynticks mask for example, we still have to keep track of the
+		 * context transitions and states to prevent inconsistency on those of
+		 * other CPUs.
+		 * If a task triggers an exception in userspace, sleep on the exception
+		 * handler and then migrate to another CPU, that new CPU must know where
+		 * the exception returns by the time we call exception_exit().
+		 * This information can only be provided by the previous CPU when it called
+		 * exception_enter().
+		 * OTOH we can spare the calls to vtime and RCU when context_tracking.active
+		 * is false because we know that CPU is not tickless.
 		 */
-		vtime_user_enter(current);
-		rcu_user_enter();
 		__this_cpu_write(context_tracking.state, IN_USER);
 	}
 	local_irq_restore(flags);
@@ -87,10 +113,9 @@ void user_enter(void)
  */
 void __sched notrace preempt_schedule_context(void)
 {
-	struct thread_info *ti = current_thread_info();
 	enum ctx_state prev_ctx;
 
-	if (likely(ti->preempt_count || irqs_disabled()))
+	if (likely(!preemptible()))
 		return;
 
 	/*
@@ -112,8 +137,8 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context);
 #endif /* CONFIG_PREEMPT */
 
 /**
- * user_exit - Inform the context tracking that the CPU is
- *             exiting userspace mode and entering the kernel.
+ * context_tracking_user_exit - Inform the context tracking that the CPU is
+ *                              exiting userspace mode and entering the kernel.
  *
  * This function must be called after we entered the kernel from userspace
  * before any use of RCU read side critical section. This potentially include
@@ -122,7 +147,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context);
  * This call supports re-entrancy. This way it can be called from any exception
  * handler without needing to know if we came from userspace or not.
  */
-void user_exit(void)
+void context_tracking_user_exit(void)
 {
 	unsigned long flags;
 
@@ -131,38 +156,22 @@ void user_exit(void)
 
 	local_irq_save(flags);
 	if (__this_cpu_read(context_tracking.state) == IN_USER) {
-		/*
-		 * We are going to run code that may use RCU. Inform
-		 * RCU core about that (ie: we may need the tick again).
-		 */
-		rcu_user_exit();
-		vtime_user_exit(current);
+		if (__this_cpu_read(context_tracking.active)) {
+			/*
+			 * We are going to run code that may use RCU. Inform
+			 * RCU core about that (ie: we may need the tick again).
+			 */
+			rcu_user_exit();
+			vtime_user_exit(current);
+			trace_user_exit(0);
+		}
 		__this_cpu_write(context_tracking.state, IN_KERNEL);
 	}
 	local_irq_restore(flags);
 }
 
-void guest_enter(void)
-{
-	if (vtime_accounting_enabled())
-		vtime_guest_enter(current);
-	else
-		__guest_enter();
-}
-EXPORT_SYMBOL_GPL(guest_enter);
-
-void guest_exit(void)
-{
-	if (vtime_accounting_enabled())
-		vtime_guest_exit(current);
-	else
-		__guest_exit();
-}
-EXPORT_SYMBOL_GPL(guest_exit);
-
-
 /**
- * context_tracking_task_switch - context switch the syscall callbacks
+ * __context_tracking_task_switch - context switch the syscall callbacks
  * @prev: the task that is being switched out
  * @next: the task that is being switched in
  *
@@ -174,11 +183,19 @@ EXPORT_SYMBOL_GPL(guest_exit);
  * migrate to some CPU that doesn't do the context tracking. As such the TIF
  * flag may not be desired there.
  */
-void context_tracking_task_switch(struct task_struct *prev,
-				  struct task_struct *next)
+void __context_tracking_task_switch(struct task_struct *prev,
+				    struct task_struct *next)
 {
-	if (__this_cpu_read(context_tracking.active)) {
-		clear_tsk_thread_flag(prev, TIF_NOHZ);
-		set_tsk_thread_flag(next, TIF_NOHZ);
-	}
+	clear_tsk_thread_flag(prev, TIF_NOHZ);
+	set_tsk_thread_flag(next, TIF_NOHZ);
 }
+
+#ifdef CONFIG_CONTEXT_TRACKING_FORCE
+void __init context_tracking_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		context_tracking_cpu_set(cpu);
+}
+#endif
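The long comment added to context_tracking_user_enter() refers to the exception helpers that consume this per-CPU state. A rough sketch of their shape, for orientation only (the real helpers live in include/linux/context_tracking.h; the example_ names and details here are ours):

static inline enum ctx_state example_exception_enter(void)
{
	enum ctx_state prev_ctx;

	/* Record whether the exception interrupted userspace or the kernel. */
	prev_ctx = this_cpu_read(context_tracking.state);
	context_tracking_user_exit();
	return prev_ctx;
}

static inline void example_exception_exit(enum ctx_state prev_ctx)
{
	/* May run on a different CPU than example_exception_enter() did. */
	if (prev_ctx == IN_USER)
		context_tracking_user_enter();
}

This is why the state transition is recorded even when context_tracking.active is false: the return path may land on a CPU that does have tracking enabled.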
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b7c32cb7bfeb..3fb7acee7326 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2510,13 +2510,11 @@ void __sched schedule_preempt_disabled(void)
  */
 asmlinkage void __sched notrace preempt_schedule(void)
 {
-	struct thread_info *ti = current_thread_info();
-
 	/*
 	 * If there is a non-zero preempt_count or interrupts are disabled,
 	 * we do not want to preempt the current task. Just return..
 	 */
-	if (likely(ti->preempt_count || irqs_disabled()))
+	if (likely(!preemptible()))
 		return;
 
 	do {
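Both preempt_schedule() and preempt_schedule_context() now use preemptible() instead of poking at thread_info directly. For reference, preemptible() (include/linux/preempt.h, when preempt counting is configured) amounts to the same test without the explicit thread_info access, roughly:

/* Roughly equivalent to the removed open-coded check. */
#define preemptible()	(preempt_count() == 0 && !irqs_disabled())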
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a7959e05a9d5..c1d7493825ae 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -378,11 +378,8 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
 
 #ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_task_switch(struct task_struct *prev)
+void vtime_common_task_switch(struct task_struct *prev)
 {
-	if (!vtime_accounting_enabled())
-		return;
-
 	if (is_idle_task(prev))
 		vtime_account_idle(prev);
 	else
@@ -404,11 +401,8 @@ void vtime_task_switch(struct task_struct *prev)
  * vtime_account().
  */
 #ifndef __ARCH_HAS_VTIME_ACCOUNT
-void vtime_account_irq_enter(struct task_struct *tsk)
+void vtime_common_account_irq_enter(struct task_struct *tsk)
 {
-	if (!vtime_accounting_enabled())
-		return;
-
 	if (!in_interrupt()) {
 		/*
 		 * If we interrupted user, context_tracking_in_user()
@@ -428,7 +422,7 @@ void vtime_account_irq_enter(struct task_struct *tsk)
 	}
 	vtime_account_system(tsk);
 }
-EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
+EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
 #endif /* __ARCH_HAS_VTIME_ACCOUNT */
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING */
 
@@ -559,12 +553,6 @@ static void cputime_adjust(struct task_cputime *curr,
 {
 	cputime_t rtime, stime, utime, total;
 
-	if (vtime_accounting_enabled()) {
-		*ut = curr->utime;
-		*st = curr->stime;
-		return;
-	}
-
 	stime = curr->stime;
 	total = stime + curr->utime;
 
@@ -664,23 +652,17 @@ static void __vtime_account_system(struct task_struct *tsk)
 
 void vtime_account_system(struct task_struct *tsk)
 {
-	if (!vtime_accounting_enabled())
-		return;
-
 	write_seqlock(&tsk->vtime_seqlock);
 	__vtime_account_system(tsk);
 	write_sequnlock(&tsk->vtime_seqlock);
 }
 
-void vtime_account_irq_exit(struct task_struct *tsk)
+void vtime_gen_account_irq_exit(struct task_struct *tsk)
 {
-	if (!vtime_accounting_enabled())
-		return;
-
 	write_seqlock(&tsk->vtime_seqlock);
+	__vtime_account_system(tsk);
 	if (context_tracking_in_user())
 		tsk->vtime_snap_whence = VTIME_USER;
-	__vtime_account_system(tsk);
 	write_sequnlock(&tsk->vtime_seqlock);
 }
 
@@ -688,12 +670,8 @@ void vtime_account_user(struct task_struct *tsk)
 {
 	cputime_t delta_cpu;
 
-	if (!vtime_accounting_enabled())
-		return;
-
-	delta_cpu = get_vtime_delta(tsk);
-
 	write_seqlock(&tsk->vtime_seqlock);
+	delta_cpu = get_vtime_delta(tsk);
 	tsk->vtime_snap_whence = VTIME_SYS;
 	account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
 	write_sequnlock(&tsk->vtime_seqlock);
@@ -701,22 +679,27 @@ void vtime_account_user(struct task_struct *tsk)
 
 void vtime_user_enter(struct task_struct *tsk)
 {
-	if (!vtime_accounting_enabled())
-		return;
-
 	write_seqlock(&tsk->vtime_seqlock);
-	tsk->vtime_snap_whence = VTIME_USER;
 	__vtime_account_system(tsk);
+	tsk->vtime_snap_whence = VTIME_USER;
 	write_sequnlock(&tsk->vtime_seqlock);
 }
 
 void vtime_guest_enter(struct task_struct *tsk)
 {
+	/*
+	 * The flags must be updated under the lock with
+	 * the vtime_snap flush and update.
+	 * That enforces a right ordering and update sequence
+	 * synchronization against the reader (task_gtime())
+	 * that can thus safely catch up with a tickless delta.
+	 */
 	write_seqlock(&tsk->vtime_seqlock);
 	__vtime_account_system(tsk);
 	current->flags |= PF_VCPU;
 	write_sequnlock(&tsk->vtime_seqlock);
 }
+EXPORT_SYMBOL_GPL(vtime_guest_enter);
 
 void vtime_guest_exit(struct task_struct *tsk)
 {
@@ -725,6 +708,7 @@ void vtime_guest_exit(struct task_struct *tsk)
 	current->flags &= ~PF_VCPU;
 	write_sequnlock(&tsk->vtime_seqlock);
 }
+EXPORT_SYMBOL_GPL(vtime_guest_exit);
 
 void vtime_account_idle(struct task_struct *tsk)
 {
@@ -733,11 +717,6 @@ void vtime_account_idle(struct task_struct *tsk)
 	account_idle_time(delta_cpu);
 }
 
-bool vtime_accounting_enabled(void)
-{
-	return context_tracking_active();
-}
-
 void arch_vtime_task_switch(struct task_struct *prev)
 {
 	write_seqlock(&prev->vtime_seqlock);
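The vtime_accounting_enabled() early returns removed above are not lost: the renames (vtime_task_switch() to vtime_common_task_switch(), vtime_account_irq_enter() to vtime_common_account_irq_enter(), vtime_account_irq_exit() to vtime_gen_account_irq_exit()) indicate the check has moved to caller-side inline wrappers, so the disabled case no longer takes a function call into cputime.c. A hedged sketch of that wrapper shape (the real wrappers live in include/linux/vtime.h and may differ in detail):

/* Sketch of a header-side wrapper; not the verbatim kernel code. */
static inline void example_vtime_task_switch(struct task_struct *prev)
{
	if (vtime_accounting_enabled())
		vtime_common_task_switch(prev);
}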
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 70f27e89012b..747bbc70f53b 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -105,7 +105,6 @@ config NO_HZ_FULL
 	select RCU_USER_QS
 	select RCU_NOCB_CPU
 	select VIRT_CPU_ACCOUNTING_GEN
-	select CONTEXT_TRACKING_FORCE
 	select IRQ_WORK
 	help
 	 Adaptively try to shutdown the tick whenever possible, even when
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index a326f27d7f09..0b479a6a22bb 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -121,7 +121,7 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
 	BUG_ON(bits > 32);
 	WARN_ON(!irqs_disabled());
 	read_sched_clock = read;
-	sched_clock_mask = (1 << bits) - 1;
+	sched_clock_mask = (1ULL << bits) - 1;
 	cd.rate = rate;
 
 	/* calculate the mult/shift to convert counter ticks to ns. */
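The sched_clock.c one-liner fixes a shift overflow: with a 32-bit clocksource, (1 << bits) shifts a plain int by its full width, which is undefined behaviour in C and does not produce the intended all-ones mask; (1ULL << bits) - 1 does. A standalone userspace demo of the corrected expression:

#include <stdio.h>

int main(void)
{
	int bits = 32;

	/*
	 * The removed expression, (1 << bits) - 1, is undefined for bits == 32
	 * because the shift count equals the width of int.  Widening the
	 * constant makes the shift well defined and yields the full mask.
	 */
	unsigned long long mask = (1ULL << bits) - 1;

	printf("mask for %d bits: 0x%llx\n", bits, mask);	/* 0xffffffff */
	return 0;
}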
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index e77edc97e036..adea6fc3ba2a 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -23,6 +23,7 @@
 #include <linux/irq_work.h>
 #include <linux/posix-timers.h>
 #include <linux/perf_event.h>
+#include <linux/context_tracking.h>
 
 #include <asm/irq_regs.h>
 
@@ -148,8 +149,8 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
 }
 
 #ifdef CONFIG_NO_HZ_FULL
-static cpumask_var_t nohz_full_mask;
-bool have_nohz_full_mask;
+cpumask_var_t tick_nohz_full_mask;
+bool tick_nohz_full_running;
 
 static bool can_stop_full_tick(void)
 {
@@ -182,7 +183,8 @@ static bool can_stop_full_tick(void)
 		 * Don't allow the user to think they can get
 		 * full NO_HZ with this machine.
 		 */
-		WARN_ONCE(1, "NO_HZ FULL will not work with unstable sched clock");
+		WARN_ONCE(tick_nohz_full_running,
+			  "NO_HZ FULL will not work with unstable sched clock");
 		return false;
 	}
 #endif
@@ -196,7 +198,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
  * Re-evaluate the need for the tick on the current CPU
  * and restart it if necessary.
  */
-void tick_nohz_full_check(void)
+void __tick_nohz_full_check(void)
 {
 	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
 
@@ -210,7 +212,7 @@ void tick_nohz_full_check(void)
 
 static void nohz_full_kick_work_func(struct irq_work *work)
 {
-	tick_nohz_full_check();
+	__tick_nohz_full_check();
 }
 
 static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
@@ -229,7 +231,7 @@ void tick_nohz_full_kick(void)
 
 static void nohz_full_kick_ipi(void *info)
 {
-	tick_nohz_full_check();
+	__tick_nohz_full_check();
 }
 
 /*
@@ -238,11 +240,11 @@ static void nohz_full_kick_ipi(void *info)
  */
 void tick_nohz_full_kick_all(void)
 {
-	if (!have_nohz_full_mask)
+	if (!tick_nohz_full_running)
 		return;
 
 	preempt_disable();
-	smp_call_function_many(nohz_full_mask,
+	smp_call_function_many(tick_nohz_full_mask,
 			       nohz_full_kick_ipi, NULL, false);
 	preempt_enable();
 }
@@ -252,7 +254,7 @@ void tick_nohz_full_kick_all(void)
  * It might need the tick due to per task/process properties:
  * perf events, posix cpu timers, ...
  */
-void tick_nohz_task_switch(struct task_struct *tsk)
+void __tick_nohz_task_switch(struct task_struct *tsk)
 {
 	unsigned long flags;
 
@@ -268,31 +270,23 @@ out:
 	local_irq_restore(flags);
 }
 
-int tick_nohz_full_cpu(int cpu)
-{
-	if (!have_nohz_full_mask)
-		return 0;
-
-	return cpumask_test_cpu(cpu, nohz_full_mask);
-}
-
 /* Parse the boot-time nohz CPU list from the kernel parameters. */
 static int __init tick_nohz_full_setup(char *str)
 {
 	int cpu;
 
-	alloc_bootmem_cpumask_var(&nohz_full_mask);
-	if (cpulist_parse(str, nohz_full_mask) < 0) {
+	alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
+	if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
 		pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
 		return 1;
 	}
 
 	cpu = smp_processor_id();
-	if (cpumask_test_cpu(cpu, nohz_full_mask)) {
+	if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
 		pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
-		cpumask_clear_cpu(cpu, nohz_full_mask);
+		cpumask_clear_cpu(cpu, tick_nohz_full_mask);
 	}
-	have_nohz_full_mask = true;
+	tick_nohz_full_running = true;
 
 	return 1;
 }
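tick_nohz_full_cpu() disappears from this file while tick_nohz_full_mask and tick_nohz_full_running are given global linkage, and can_stop_idle_tick() further down switches to tick_nohz_full_enabled(): the tests become header-side inlines that can short-circuit through the context tracking static key. A hedged sketch of what such inlines could look like (the real ones live in include/linux/tick.h; the example_ names and exact bodies are ours):

extern cpumask_var_t tick_nohz_full_mask;
extern bool tick_nohz_full_running;

static inline bool example_tick_nohz_full_enabled(void)
{
	if (!static_key_false(&context_tracking_enabled))
		return false;
	return tick_nohz_full_running;
}

static inline bool example_tick_nohz_full_cpu(int cpu)
{
	if (!example_tick_nohz_full_enabled())
		return false;
	return cpumask_test_cpu(cpu, tick_nohz_full_mask);
}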
@@ -310,7 +304,7 @@ static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
 		 * If we handle the timekeeping duty for full dynticks CPUs,
 		 * we can't safely shutdown that CPU.
 		 */
-		if (have_nohz_full_mask && tick_do_timer_cpu == cpu)
+		if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
 			return NOTIFY_BAD;
 		break;
 	}
@@ -329,14 +323,14 @@ static int tick_nohz_init_all(void)
 	int err = -1;
 
 #ifdef CONFIG_NO_HZ_FULL_ALL
-	if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) {
+	if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
 		pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
 		return err;
 	}
 	err = 0;
-	cpumask_setall(nohz_full_mask);
-	cpumask_clear_cpu(smp_processor_id(), nohz_full_mask);
-	have_nohz_full_mask = true;
+	cpumask_setall(tick_nohz_full_mask);
+	cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask);
+	tick_nohz_full_running = true;
 #endif
 	return err;
 }
@@ -345,17 +339,18 @@ void __init tick_nohz_init(void)
 {
 	int cpu;
 
-	if (!have_nohz_full_mask) {
+	if (!tick_nohz_full_running) {
 		if (tick_nohz_init_all() < 0)
 			return;
 	}
 
+	for_each_cpu(cpu, tick_nohz_full_mask)
+		context_tracking_cpu_set(cpu);
+
 	cpu_notifier(tick_nohz_cpu_down_callback, 0);
-	cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask);
+	cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask);
 	pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf);
 }
-#else
-#define have_nohz_full_mask (0)
 #endif
 
 /*
@@ -733,7 +728,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 		return false;
 	}
 
-	if (have_nohz_full_mask) {
+	if (tick_nohz_full_enabled()) {
 		/*
 		 * Keep the tick alive to guarantee timekeeping progression
 		 * if there are full dynticks CPUs around