Merge branch 'x86/spinlocks' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip into stable/for-linus-3.12

* 'x86/spinlocks' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/kvm/guest: Fix sparse warning: "symbol 'klock_waiting' was not declared as static"
  kvm: Paravirtual ticketlocks support for linux guests running on KVM hypervisor
  kvm guest: Add configuration support to enable debug information for KVM Guests
  kvm uapi: Add KICK_CPU and PV_UNHALT definition to uapi
  xen, pvticketlock: Allow interrupts to be enabled while blocking
  x86, ticketlock: Add slowpath logic
  jump_label: Split jumplabel ratelimit
  x86, pvticketlock: When paravirtualizing ticket locks, increment by 2
  x86, pvticketlock: Use callee-save for lock_spinning
  xen, pvticketlocks: Add xen_nopvspin parameter to disable xen pv ticketlocks
  xen, pvticketlock: Xen implementation for PV ticket locks
  xen: Defer spinlock setup until boot CPU setup
  x86, ticketlock: Collapse a layer of functions
  x86, ticketlock: Don't inline _spin_unlock when using paravirt spinlocks
  x86, spinlock: Replace pv spinlocks with pv ticketlocks
author: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> 2013-09-09 12:01:15 -0400
committer: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> 2013-09-09 12:01:15 -0400
commit: c3f31f6a6f68bcb51689c90733282ec263602a9d (patch)
tree: 07c2c7ae966b07d5adabe78215d9c76fa4ec531a /arch/x86
parent: e1a9c16b303725ac900fee2a3ec4dbe2c2f846ab (diff)
parent: 36bd621337c91a1ecda588e5bbbae8dd9698bae7 (diff)
11 files changed, 531 insertions, 353 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b32ebf92b0ce..b1fb846e6dac 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -632,6 +632,7 @@ config PARAVIRT_DEBUG
 config PARAVIRT_SPINLOCKS
        bool "Paravirtualization layer for spinlocks"
        depends on PARAVIRT && SMP
+        select UNINLINE_SPIN_UNLOCK
        ---help---
          Paravirtualized spinlocks allow a pvops backend to replace the
          spinlock implementation with something virtualization-friendly
@@ -656,6 +657,15 @@ config KVM_GUEST
          underlying device model, the host provides the guest with
          timing infrastructure such as time of day, and system time
+config KVM_DEBUG_FS
+        bool "Enable debug information for KVM Guests in debugfs"
+        depends on KVM_GUEST && DEBUG_FS
+        default n
+        ---help---
+          This option enables collection of various statistics for KVM guests.
+          Statistics are displayed in debugfs filesystem. Enabling this option
+          may incur significant overhead.
 source "arch/x86/lguest/Kconfig"
 config PARAVIRT_TIME_ACCOUNTING
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 695399f2d5eb..427afcbf3d55 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -118,10 +118,20 @@ void kvm_async_pf_task_wait(u32 token);
 void kvm_async_pf_task_wake(u32 token);
 u32 kvm_read_and_reset_pf_reason(void);
 extern void kvm_disable_steal_time(void);
-#else
-#define kvm_guest_init() do { } while (0)
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+void __init kvm_spinlock_init(void);
+#else /* !CONFIG_PARAVIRT_SPINLOCKS */
+static inline void kvm_spinlock_init(void)
+{
+}
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
+#else /* CONFIG_KVM_GUEST */
+#define kvm_guest_init() do {} while (0)
 #define kvm_async_pf_task_wait(T) do {} while(0)
 #define kvm_async_pf_task_wake(T) do {} while(0)
 static inline u32 kvm_read_and_reset_pf_reason(void)
 {
        return 0;
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index cfdc9ee4c900..401f350ef71b 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -712,36 +712,16 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
 #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
-static inline int arch_spin_is_locked(struct arch_spinlock *lock)
+static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock,
+                                                        __ticket_t ticket)
 {
-        return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock);
+        PVOP_VCALLEE2(pv_lock_ops.lock_spinning, lock, ticket);
 }
-static inline int arch_spin_is_contended(struct arch_spinlock *lock)
+static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock,
+                                                        __ticket_t ticket)
 {
-        return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock);
+        PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket);
-}
-#define arch_spin_is_contended  arch_spin_is_contended
-static __always_inline void arch_spin_lock(struct arch_spinlock *lock)
-{
-        PVOP_VCALL1(pv_lock_ops.spin_lock, lock);
-}
-static __always_inline void arch_spin_lock_flags(struct arch_spinlock *lock,
-                                                  unsigned long flags)
-{
-        PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags);
-}
-static __always_inline int arch_spin_trylock(struct arch_spinlock *lock)
-{
-        return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock);
-}
-static __always_inline void arch_spin_unlock(struct arch_spinlock *lock)
-{
-        PVOP_VCALL1(pv_lock_ops.spin_unlock, lock);
 }
 #endif
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 0db1fcac668c..04ac40e192eb 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -327,13 +327,15 @@ struct pv_mmu_ops {
 };
 struct arch_spinlock;
+#ifdef CONFIG_SMP
+#include <asm/spinlock_types.h>
+#else
+typedef u16 __ticket_t;
+#endif
 struct pv_lock_ops {
-        int (*spin_is_locked)(struct arch_spinlock *lock);
+        struct paravirt_callee_save lock_spinning;
-        int (*spin_is_contended)(struct arch_spinlock *lock);
+        void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket);
-        void (*spin_lock)(struct arch_spinlock *lock);
-        void (*spin_lock_flags)(struct arch_spinlock *lock, unsigned long flags);
-        int (*spin_trylock)(struct arch_spinlock *lock);
-        void (*spin_unlock)(struct arch_spinlock *lock);
 };
 /* This contains all the paravirt structures: we get a convenient
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 33692eaabab5..d68883dd133c 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -1,11 +1,14 @@
 #ifndef _ASM_X86_SPINLOCK_H
 #define _ASM_X86_SPINLOCK_H
+#include <linux/jump_label.h>
 #include <linux/atomic.h>
 #include <asm/page.h>
 #include <asm/processor.h>
 #include <linux/compiler.h>
 #include <asm/paravirt.h>
+#include <asm/bitops.h>
 /*
 * Your basic SMP spinlocks, allowing only a single CPU anywhere
 *
@@ -34,6 +37,31 @@
 # define UNLOCK_LOCK_PREFIX
 #endif
+/* How long a lock should spin before we consider blocking */
+#define SPIN_THRESHOLD  (1 << 15)
+extern struct static_key paravirt_ticketlocks_enabled;
+static __always_inline bool static_key_false(struct static_key *key);
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+static inline void __ticket_enter_slowpath(arch_spinlock_t *lock)
+{
+        set_bit(0, (volatile unsigned long *)&lock->tickets.tail);
+}
+#else  /* !CONFIG_PARAVIRT_SPINLOCKS */
+static __always_inline void __ticket_lock_spinning(arch_spinlock_t *lock,
+                                                        __ticket_t ticket)
+{
+}
+static inline void __ticket_unlock_kick(arch_spinlock_t *lock,
+                                                        __ticket_t ticket)
+{
+}
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
 /*
 * Ticket locks are conceptually two parts, one indicating the current head of
 * the queue, and the other indicating the current tail. The lock is acquired
@@ -47,81 +75,101 @@
 * in the high part, because a wide xadd increment of the low part would carry
 * up and contaminate the high part.
 */
-static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
+static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
 {
-        register struct __raw_tickets inc = { .tail = 1 };
+        register struct __raw_tickets inc = { .tail = TICKET_LOCK_INC };
        inc = xadd(&lock->tickets, inc);
+        if (likely(inc.head == inc.tail))
+                goto out;
+        inc.tail &= ~TICKET_SLOWPATH_FLAG;
        for (;;) {
-                if (inc.head == inc.tail)
+                unsigned count = SPIN_THRESHOLD;
-                        break;
-                cpu_relax();
+                do {
-                inc.head = ACCESS_ONCE(lock->tickets.head);
+                        if (ACCESS_ONCE(lock->tickets.head) == inc.tail)
+                                goto out;
+                        cpu_relax();
+                } while (--count);
+                __ticket_lock_spinning(lock, inc.tail);
        }
-        barrier();              /* make sure nothing creeps before the lock is taken */
+out:    barrier();      /* make sure nothing creeps before the lock is taken */
 }
-static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
+static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
 {
        arch_spinlock_t old, new;
        old.tickets = ACCESS_ONCE(lock->tickets);
-        if (old.tickets.head != old.tickets.tail)
+        if (old.tickets.head != (old.tickets.tail & ~TICKET_SLOWPATH_FLAG))
                return 0;
-        new.head_tail = old.head_tail + (1 << TICKET_SHIFT);
+        new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT);
        /* cmpxchg is a full barrier, so nothing can move before it */
        return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
 }
-static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
+static inline void __ticket_unlock_slowpath(arch_spinlock_t *lock,
+                                            arch_spinlock_t old)
 {
-        __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
+        arch_spinlock_t new;
+        BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS);
+        /* Perform the unlock on the "before" copy */
+        old.tickets.head += TICKET_LOCK_INC;
+        /* Clear the slowpath flag */
+        new.head_tail = old.head_tail & ~(TICKET_SLOWPATH_FLAG << TICKET_SHIFT);
+        /*
+         * If the lock is uncontended, clear the flag - use cmpxchg in
+         * case it changes behind our back though.
+         */
+        if (new.tickets.head != new.tickets.tail ||
+            cmpxchg(&lock->head_tail, old.head_tail,
+                                        new.head_tail) != old.head_tail) {
+                /*
+                 * Lock still has someone queued for it, so wake up an
+                 * appropriate waiter.
+                 */
+                __ticket_unlock_kick(lock, old.tickets.head);
+        }
 }
-static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
+static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
 {
-        struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
+        if (TICKET_SLOWPATH_FLAG &&
+            static_key_false(&paravirt_ticketlocks_enabled)) {
+                arch_spinlock_t prev;
-        return tmp.tail != tmp.head;
+                prev = *lock;
-}
+                add_smp(&lock->tickets.head, TICKET_LOCK_INC);
-static inline int __ticket_spin_is_contended(arch_spinlock_t *lock)
+                /* add_smp() is a full mb() */
-{
-        struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
-        return (__ticket_t)(tmp.tail - tmp.head) > 1;
+                if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG))
+                        __ticket_unlock_slowpath(lock, prev);
+        } else
+                __add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX);
 }
-#ifndef CONFIG_PARAVIRT_SPINLOCKS
 static inline int arch_spin_is_locked(arch_spinlock_t *lock)
 {
-        return __ticket_spin_is_locked(lock);
+        struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
-}
-static inline int arch_spin_is_contended(arch_spinlock_t *lock)
-{
-        return __ticket_spin_is_contended(lock);
-}
-#define arch_spin_is_contended  arch_spin_is_contended
-static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
+        return tmp.tail != tmp.head;
-{
-        __ticket_spin_lock(lock);
 }
-static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
+static inline int arch_spin_is_contended(arch_spinlock_t *lock)
 {
-        return __ticket_spin_trylock(lock);
+        struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
-}
-static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
+        return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC;
-{
-        __ticket_spin_unlock(lock);
 }
+#define arch_spin_is_contended  arch_spin_is_contended
 static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
                                                  unsigned long flags)
@@ -129,8 +177,6 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
        arch_spin_lock(lock);
 }
-#endif  /* CONFIG_PARAVIRT_SPINLOCKS */
 static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
 {
        while (arch_spin_is_locked(lock))
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index ad0ad07fc006..4f1bea19945b 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -1,13 +1,17 @@
 #ifndef _ASM_X86_SPINLOCK_TYPES_H
 #define _ASM_X86_SPINLOCK_TYPES_H
-#ifndef __LINUX_SPINLOCK_TYPES_H
-# error "please don't include this file directly"
-#endif
 #include <linux/types.h>
-#if (CONFIG_NR_CPUS < 256)
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define __TICKET_LOCK_INC       2
+#define TICKET_SLOWPATH_FLAG    ((__ticket_t)1)
+#else
+#define __TICKET_LOCK_INC       1
+#define TICKET_SLOWPATH_FLAG    ((__ticket_t)0)
+#endif
+#if (CONFIG_NR_CPUS < (256 / __TICKET_LOCK_INC))
 typedef u8  __ticket_t;
 typedef u16 __ticketpair_t;
 #else
@@ -15,6 +19,8 @@ typedef u16 __ticket_t;
 typedef u32 __ticketpair_t;
 #endif
+#define TICKET_LOCK_INC ((__ticket_t)__TICKET_LOCK_INC)
 #define TICKET_SHIFT    (sizeof(__ticket_t) * 8)
 typedef struct arch_spinlock {
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 06fdbd987e97..94dc8ca434e0 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -23,6 +23,7 @@
 #define KVM_FEATURE_ASYNC_PF            4
 #define KVM_FEATURE_STEAL_TIME          5
 #define KVM_FEATURE_PV_EOI              6
+#define KVM_FEATURE_PV_UNHALT           7
 /* The last 8 bits are used to indicate how to interpret the flags field
 * in pvclock structure. If no bits are set, all flags are ignored.
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a96d32cc55b8..56e2fa4a8b13 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -34,6 +34,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/kprobes.h>
+#include <linux/debugfs.h>
 #include <asm/timer.h>
 #include <asm/cpu.h>
 #include <asm/traps.h>
@@ -419,6 +420,7 @@ static void __init kvm_smp_prepare_boot_cpu(void)
        WARN_ON(kvm_register_clock("primary cpu clock"));
        kvm_guest_cpu_init();
        native_smp_prepare_boot_cpu();
+        kvm_spinlock_init();
 }
 static void kvm_guest_cpu_online(void *dummy)
@@ -523,3 +525,263 @@ static __init int activate_jump_labels(void)
        return 0;
 }
 arch_initcall(activate_jump_labels);
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+/* Kick a cpu by its apicid. Used to wake up a halted vcpu */
+static void kvm_kick_cpu(int cpu)
+{
+        int apicid;
+        unsigned long flags = 0;
+        apicid = per_cpu(x86_cpu_to_apicid, cpu);
+        kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
+}
+enum kvm_contention_stat {
+        TAKEN_SLOW,
+        TAKEN_SLOW_PICKUP,
+        RELEASED_SLOW,
+        RELEASED_SLOW_KICKED,
+        NR_CONTENTION_STATS
+};
+#ifdef CONFIG_KVM_DEBUG_FS
+#define HISTO_BUCKETS   30
+static struct kvm_spinlock_stats
+{
+        u32 contention_stats[NR_CONTENTION_STATS];
+        u32 histo_spin_blocked[HISTO_BUCKETS+1];
+        u64 time_blocked;
+} spinlock_stats;
+static u8 zero_stats;
+static inline void check_zero(void)
+{
+        u8 ret;
+        u8 old;
+        old = ACCESS_ONCE(zero_stats);
+        if (unlikely(old)) {
+                ret = cmpxchg(&zero_stats, old, 0);
+                /* This ensures only one fellow resets the stat */
+                if (ret == old)
+                        memset(&spinlock_stats, 0, sizeof(spinlock_stats));
+        }
+}
+static inline void add_stats(enum kvm_contention_stat var, u32 val)
+{
+        check_zero();
+        spinlock_stats.contention_stats[var] += val;
+}
+static inline u64 spin_time_start(void)
+{
+        return sched_clock();
+}
+static void __spin_time_accum(u64 delta, u32 *array)
+{
+        unsigned index;
+        index = ilog2(delta);
+        check_zero();
+        if (index < HISTO_BUCKETS)
+                array[index]++;
+        else
+                array[HISTO_BUCKETS]++;
+}
+static inline void spin_time_accum_blocked(u64 start)
+{
+        u32 delta;
+        delta = sched_clock() - start;
+        __spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
+        spinlock_stats.time_blocked += delta;
+}
+static struct dentry *d_spin_debug;
+static struct dentry *d_kvm_debug;
+struct dentry *kvm_init_debugfs(void)
+{
+        d_kvm_debug = debugfs_create_dir("kvm", NULL);
+        if (!d_kvm_debug)
+                printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n");
+        return d_kvm_debug;
+}
+static int __init kvm_spinlock_debugfs(void)
+{
+        struct dentry *d_kvm;
+        d_kvm = kvm_init_debugfs();
+        if (d_kvm == NULL)
+                return -ENOMEM;
+        d_spin_debug = debugfs_create_dir("spinlocks", d_kvm);
+        debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
+        debugfs_create_u32("taken_slow", 0444, d_spin_debug,
+                   &spinlock_stats.contention_stats[TAKEN_SLOW]);
+        debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
+                   &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]);
+        debugfs_create_u32("released_slow", 0444, d_spin_debug,
+                   &spinlock_stats.contention_stats[RELEASED_SLOW]);
+        debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
+                   &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]);
+        debugfs_create_u64("time_blocked", 0444, d_spin_debug,
+                           &spinlock_stats.time_blocked);
+        debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
+                     spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
+        return 0;
+}
+fs_initcall(kvm_spinlock_debugfs);
+#else  /* !CONFIG_KVM_DEBUG_FS */
+static inline void add_stats(enum kvm_contention_stat var, u32 val)
+{
+}
+static inline u64 spin_time_start(void)
+{
+        return 0;
+}
+static inline void spin_time_accum_blocked(u64 start)
+{
+}
+#endif  /* CONFIG_KVM_DEBUG_FS */
+struct kvm_lock_waiting {
+        struct arch_spinlock *lock;
+        __ticket_t want;
+};
+/* cpus 'waiting' on a spinlock to become available */
+static cpumask_t waiting_cpus;
+/* Track spinlock on which a cpu is waiting */
+static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting);
+static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
+{
+        struct kvm_lock_waiting *w;
+        int cpu;
+        u64 start;
+        unsigned long flags;
+        if (in_nmi())
+                return;
+        w = &__get_cpu_var(klock_waiting);
+        cpu = smp_processor_id();
+        start = spin_time_start();
+        /*
+         * Make sure an interrupt handler can't upset things in a
+         * partially setup state.
+         */
+        local_irq_save(flags);
+        /*
+         * The ordering protocol on this is that the "lock" pointer
+         * may only be set non-NULL if the "want" ticket is correct.
+         * If we're updating "want", we must first clear "lock".
+         */
+        w->lock = NULL;
+        smp_wmb();
+        w->want = want;
+        smp_wmb();
+        w->lock = lock;
+        add_stats(TAKEN_SLOW, 1);
+        /*
+         * This uses set_bit, which is atomic but we should not rely on its
+         * reordering guarantees. So a barrier is needed after this call.
+         */
+        cpumask_set_cpu(cpu, &waiting_cpus);
+        barrier();
+        /*
+         * Mark entry to slowpath before doing the pickup test to make
+         * sure we don't deadlock with an unlocker.
+         */
+        __ticket_enter_slowpath(lock);
+        /*
+         * Check again to make sure it didn't become free while
+         * we weren't looking.
+         */
+        if (ACCESS_ONCE(lock->tickets.head) == want) {
+                add_stats(TAKEN_SLOW_PICKUP, 1);
+                goto out;
+        }
+        /*
+         * Halt until it's our turn and we are kicked. Note that we do a safe
+         * halt for the irq-enabled case to avoid a hang when lock info is
+         * overwritten in the irq spinlock slowpath and no spurious interrupt
+         * occurs to save us.
+         */
+        if (arch_irqs_disabled_flags(flags))
+                halt();
+        else
+                safe_halt();
+out:
+        cpumask_clear_cpu(cpu, &waiting_cpus);
+        w->lock = NULL;
+        local_irq_restore(flags);
+        spin_time_accum_blocked(start);
+}
+PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning);
+/* Kick vcpu waiting on @lock->head to reach value @ticket */
+static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
+{
+        int cpu;
+        add_stats(RELEASED_SLOW, 1);
+        for_each_cpu(cpu, &waiting_cpus) {
+                const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu);
+                if (ACCESS_ONCE(w->lock) == lock &&
+                    ACCESS_ONCE(w->want) == ticket) {
+                        add_stats(RELEASED_SLOW_KICKED, 1);
+                        kvm_kick_cpu(cpu);
+                        break;
+                }
+        }
+}
+/*
+ * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
+ */
+void __init kvm_spinlock_init(void)
+{
+        if (!kvm_para_available())
+                return;
+        /* Does host kernel support KVM_FEATURE_PV_UNHALT? */
+        if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
+                return;
+        printk(KERN_INFO "KVM setup paravirtual spinlock\n");
+        static_key_slow_inc(&paravirt_ticketlocks_enabled);
+        pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning);
+        pv_lock_ops.unlock_kick = kvm_unlock_kick;
+}
+#endif  /* CONFIG_PARAVIRT_SPINLOCKS */
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 676b8c77a976..bbb6c7316341 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -4,25 +4,17 @@
 */
 #include <linux/spinlock.h>
 #include <linux/module.h>
+#include <linux/jump_label.h>
 #include <asm/paravirt.h>
-static inline void
-default_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
-{
-        arch_spin_lock(lock);
-}
 struct pv_lock_ops pv_lock_ops = {
 #ifdef CONFIG_SMP
-        .spin_is_locked = __ticket_spin_is_locked,
+        .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop),
-        .spin_is_contended = __ticket_spin_is_contended,
+        .unlock_kick = paravirt_nop,
-        .spin_lock = __ticket_spin_lock,
-        .spin_lock_flags = default_spin_lock_flags,
-        .spin_trylock = __ticket_spin_trylock,
-        .spin_unlock = __ticket_spin_unlock,
 #endif
 };
 EXPORT_SYMBOL(pv_lock_ops);
+struct static_key paravirt_ticketlocks_enabled = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL(paravirt_ticketlocks_enabled);
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 22759c6d309f..368c290929fe 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -279,6 +279,7 @@ static void __init xen_smp_prepare_boot_cpu(void)
        xen_filter_cpu_maps();
        xen_setup_vcpu_info_placement();
+        xen_init_spinlocks();
 }
 static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
@@ -686,7 +687,6 @@ void __init xen_smp_init(void)
 {
        smp_ops = xen_smp_ops;
        xen_fill_possible_map();
-        xen_init_spinlocks();
 }
 static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index cf3caee356b3..0438b9324a72 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -17,45 +17,44 @@
 #include "xen-ops.h"
 #include "debugfs.h"
-#ifdef CONFIG_XEN_DEBUG_FS
+enum xen_contention_stat {
-static struct xen_spinlock_stats
+        TAKEN_SLOW,
-{
+        TAKEN_SLOW_PICKUP,
-        u64 taken;
+        TAKEN_SLOW_SPURIOUS,
-        u32 taken_slow;
+        RELEASED_SLOW,
-        u32 taken_slow_nested;
+        RELEASED_SLOW_KICKED,
-        u32 taken_slow_pickup;
+        NR_CONTENTION_STATS
-        u32 taken_slow_spurious;
+};
-        u32 taken_slow_irqenable;
-        u64 released;
-        u32 released_slow;
-        u32 released_slow_kicked;
+#ifdef CONFIG_XEN_DEBUG_FS
 #define HISTO_BUCKETS   30
-        u32 histo_spin_total[HISTO_BUCKETS+1];
+static struct xen_spinlock_stats
-        u32 histo_spin_spinning[HISTO_BUCKETS+1];
+{
+        u32 contention_stats[NR_CONTENTION_STATS];
        u32 histo_spin_blocked[HISTO_BUCKETS+1];
-        u64 time_total;
-        u64 time_spinning;
        u64 time_blocked;
 } spinlock_stats;
 static u8 zero_stats;
-static unsigned lock_timeout = 1 << 10;
-#define TIMEOUT lock_timeout
 static inline void check_zero(void)
 {
-        if (unlikely(zero_stats)) {
+        u8 ret;
-                memset(&spinlock_stats, 0, sizeof(spinlock_stats));
+        u8 old = ACCESS_ONCE(zero_stats);
-                zero_stats = 0;
+        if (unlikely(old)) {
+                ret = cmpxchg(&zero_stats, old, 0);
+                /* This ensures only one fellow resets the stat */
+                if (ret == old)
+                        memset(&spinlock_stats, 0, sizeof(spinlock_stats));
        }
 }
-#define ADD_STATS(elem, val)                    \
+static inline void add_stats(enum xen_contention_stat var, u32 val)
-        do { check_zero(); spinlock_stats.elem += (val); } while(0)
+{
+        check_zero();
+        spinlock_stats.contention_stats[var] += val;
+}
 static inline u64 spin_time_start(void)
 {
@@ -74,22 +73,6 @@ static void __spin_time_accum(u64 delta, u32 *array)
                array[HISTO_BUCKETS]++;
 }
-static inline void spin_time_accum_spinning(u64 start)
-{
-        u32 delta = xen_clocksource_read() - start;
-        __spin_time_accum(delta, spinlock_stats.histo_spin_spinning);
-        spinlock_stats.time_spinning += delta;
-}
-static inline void spin_time_accum_total(u64 start)
-{
-        u32 delta = xen_clocksource_read() - start;
-        __spin_time_accum(delta, spinlock_stats.histo_spin_total);
-        spinlock_stats.time_total += delta;
-}
 static inline void spin_time_accum_blocked(u64 start)
 {
        u32 delta = xen_clocksource_read() - start;
@@ -99,19 +82,15 @@ static inline void spin_time_accum_blocked(u64 start)
 }
 #else  /* !CONFIG_XEN_DEBUG_FS */
 #define TIMEOUT                 (1 << 10)
-#define ADD_STATS(elem, val)    do { (void)(val); } while(0)
+static inline void add_stats(enum xen_contention_stat var, u32 val)
+{
+}
 static inline u64 spin_time_start(void)
 {
        return 0;
 }
-static inline void spin_time_accum_total(u64 start)
-{
-}
-static inline void spin_time_accum_spinning(u64 start)
-{
-}
 static inline void spin_time_accum_blocked(u64 start)
 {
 }
@@ -134,227 +113,123 @@ typedef u16 xen_spinners_t;
        asm(LOCK_PREFIX " decw %0" : "+m" ((xl)->spinners) : : "memory");
 #endif
-struct xen_spinlock {
+struct xen_lock_waiting {
-        unsigned char lock;             /* 0 -> free; 1 -> locked */
+        struct arch_spinlock *lock;
-        xen_spinners_t spinners;        /* count of waiting cpus */
+        __ticket_t want;
 };
-static int xen_spin_is_locked(struct arch_spinlock *lock)
-{
-        struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-        return xl->lock != 0;
-}
-static int xen_spin_is_contended(struct arch_spinlock *lock)
-{
-        struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-        /* Not strictly true; this is only the count of contended
-           lock-takers entering the slow path. */
-        return xl->spinners != 0;
-}
-static int xen_spin_trylock(struct arch_spinlock *lock)
-{
-        struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-        u8 old = 1;
-        asm("xchgb %b0,%1"
-            : "+q" (old), "+m" (xl->lock) : : "memory");
-        return old == 0;
-}
-static DEFINE_PER_CPU(char *, irq_name);
 static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
-static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
+static DEFINE_PER_CPU(char *, irq_name);
+static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting);
-/*
+static cpumask_t waiting_cpus;
- * Mark a cpu as interested in a lock.  Returns the CPU's previous
- * lock of interest, in case we got preempted by an interrupt.
- */
-static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
-{
-        struct xen_spinlock *prev;
-        prev = __this_cpu_read(lock_spinners);
-        __this_cpu_write(lock_spinners, xl);
-        wmb();                  /* set lock of interest before count */
-        inc_spinners(xl);
-        return prev;
-}
-/*
- * Mark a cpu as no longer interested in a lock.  Restores previous
- * lock of interest (NULL for none).
- */
-static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
-{
-        dec_spinners(xl);
-        wmb();                  /* decrement count before restoring lock */
-        __this_cpu_write(lock_spinners, prev);
-}
-static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable)
+static void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
 {
-        struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-        struct xen_spinlock *prev;
        int irq = __this_cpu_read(lock_kicker_irq);
-        int ret;
+        struct xen_lock_waiting *w = &__get_cpu_var(lock_waiting);
+        int cpu = smp_processor_id();
        u64 start;
+        unsigned long flags;
        /* If kicker interrupts not initialized yet, just spin */
        if (irq == -1)
-                return 0;
+                return;
        start = spin_time_start();
-        /* announce we're spinning */
+        /*
-        prev = spinning_lock(xl);
+         * Make sure an interrupt handler can't upset things in a
+         * partially setup state.
+         */
+        local_irq_save(flags);
+        /*
+         * We don't really care if we're overwriting some other
+         * (lock,want) pair, as that would mean that we're currently
+         * in an interrupt context, and the outer context had
+         * interrupts enabled.  That has already kicked the VCPU out
+         * of xen_poll_irq(), so it will just return spuriously and
+         * retry with newly setup (lock,want).
+         *
+         * The ordering protocol on this is that the "lock" pointer
+         * may only be set non-NULL if the "want" ticket is correct.
+         * If we're updating "want", we must first clear "lock".
+         */
+        w->lock = NULL;
+        smp_wmb();
+        w->want = want;
+        smp_wmb();
+        w->lock = lock;
-        ADD_STATS(taken_slow, 1);
+        /* This uses set_bit, which is atomic and therefore a barrier */
-        ADD_STATS(taken_slow_nested, prev != NULL);
+        cpumask_set_cpu(cpu, &waiting_cpus);
+        add_stats(TAKEN_SLOW, 1);
-        do {
+        /* clear pending */
-                unsigned long flags;
+        xen_clear_irq_pending(irq);
-                /* clear pending */
+        /* Only check lock once pending cleared */
-                xen_clear_irq_pending(irq);
+        barrier();
-                /* check again make sure it didn't become free while
+        /*
-                   we weren't looking  */
+         * Mark entry to slowpath before doing the pickup test to make
-                ret = xen_spin_trylock(lock);
+         * sure we don't deadlock with an unlocker.
-                if (ret) {
+         */
-                        ADD_STATS(taken_slow_pickup, 1);
+        __ticket_enter_slowpath(lock);
-                        /*
+        /*
-                         * If we interrupted another spinlock while it
+         * Check again to make sure it didn't become free while
-                         * was blocking, make sure it doesn't block
+         * we weren't looking
-                         * without rechecking the lock.
+         */
-                         */
+        if (ACCESS_ONCE(lock->tickets.head) == want) {
-                        if (prev != NULL)
+                add_stats(TAKEN_SLOW_PICKUP, 1);
-                                xen_set_irq_pending(irq);
+                goto out;
-                        goto out;
+        }
-                }
-                flags = arch_local_save_flags();
+        /* Allow interrupts while blocked */
-                if (irq_enable) {
+        local_irq_restore(flags);
-                        ADD_STATS(taken_slow_irqenable, 1);
-                        raw_local_irq_enable();
-                }
-                /*
+        /*
-                 * Block until irq becomes pending.  If we're
+         * If an interrupt happens here, it will leave the wakeup irq
-                 * interrupted at this point (after the trylock but
+         * pending, which will cause xen_poll_irq() to return
-                 * before entering the block), then the nested lock
+         * immediately.
-                 * handler guarantees that the irq will be left
+         */
-                 * pending if there's any chance the lock became free;
-                 * xen_poll_irq() returns immediately if the irq is
-                 * pending.
-                 */
-                xen_poll_irq(irq);
-                raw_local_irq_restore(flags);
+        /* Block until irq becomes pending (or perhaps a spurious wakeup) */
+        xen_poll_irq(irq);
+        add_stats(TAKEN_SLOW_SPURIOUS, !xen_test_irq_pending(irq));
-                ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
+        local_irq_save(flags);
-        } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
        kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
 out:
-        unspinning_lock(xl, prev);
+        cpumask_clear_cpu(cpu, &waiting_cpus);
-        spin_time_accum_blocked(start);
+        w->lock = NULL;
-        return ret;
-}
-static inline void __xen_spin_lock(struct arch_spinlock *lock, bool irq_enable)
-{
-        struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-        unsigned timeout;
-        u8 oldval;
-        u64 start_spin;
-        ADD_STATS(taken, 1);
-        start_spin = spin_time_start();
-        do {
-                u64 start_spin_fast = spin_time_start();
-                timeout = TIMEOUT;
-                asm("1: xchgb %1,%0\n"
-                    "   testb %1,%1\n"
-                    "   jz 3f\n"
-                    "2: rep;nop\n"
-                    "   cmpb $0,%0\n"
-                    "   je 1b\n"
-                    "   dec %2\n"
-                    "   jnz 2b\n"
-                    "3:\n"
-                    : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
-                    : "1" (1)
-                    : "memory");
-                spin_time_accum_spinning(start_spin_fast);
+        local_irq_restore(flags);
-        } while (unlikely(oldval != 0 &&
+        spin_time_accum_blocked(start);
-                          (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable))));
-        spin_time_accum_total(start_spin);
-}
-static void xen_spin_lock(struct arch_spinlock *lock)
-{
-        __xen_spin_lock(lock, false);
-}
-static void xen_spin_lock_flags(struct arch_spinlock *lock, unsigned long flags)
-{
-        __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags));
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_lock_spinning);
-static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
+static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next)
 {
        int cpu;
-        ADD_STATS(released_slow, 1);
+        add_stats(RELEASED_SLOW, 1);
+        for_each_cpu(cpu, &waiting_cpus) {
+                const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu);
-        for_each_online_cpu(cpu) {
+                /* Make sure we read lock before want */
-                /* XXX should mix up next cpu selection */
+                if (ACCESS_ONCE(w->lock) == lock &&
-                if (per_cpu(lock_spinners, cpu) == xl) {
+                    ACCESS_ONCE(w->want) == next) {
-                        ADD_STATS(released_slow_kicked, 1);
+                        add_stats(RELEASED_SLOW_KICKED, 1);
                        xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
+                        break;
                }
        }
 }
-static void xen_spin_unlock(struct arch_spinlock *lock)
-{
-        struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-        ADD_STATS(released, 1);
-        smp_wmb();              /* make sure no writes get moved after unlock */
-        xl->lock = 0;           /* release lock */
-        /*
-         * Make sure unlock happens before checking for waiting
-         * spinners.  We need a strong barrier to enforce the
-         * write-read ordering to different memory locations, as the
-         * CPU makes no implied guarantees about their ordering.
-         */
-        mb();
-        if (unlikely(xl->spinners))
-                xen_spin_unlock_slow(xl);
-}
 static irqreturn_t dummy_handler(int irq, void *dev_id)
 {
        BUG();
@@ -408,6 +283,8 @@ void xen_uninit_lock_cpu(int cpu)
        per_cpu(irq_name, cpu) = NULL;
 }
+static bool xen_pvspin __initdata = true;
 void __init xen_init_spinlocks(void)
 {
        /*
@@ -417,15 +294,23 @@ void __init xen_init_spinlocks(void)
        if (xen_hvm_domain())
                return;
-        BUILD_BUG_ON(sizeof(struct xen_spinlock) > sizeof(arch_spinlock_t));
+        if (!xen_pvspin) {
+                printk(KERN_DEBUG "xen: PV spinlocks disabled\n");
+                return;
+        }
-        pv_lock_ops.spin_is_locked = xen_spin_is_locked;
+        static_key_slow_inc(&paravirt_ticketlocks_enabled);
-        pv_lock_ops.spin_is_contended = xen_spin_is_contended;
-        pv_lock_ops.spin_lock = xen_spin_lock;
+        pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning);
-        pv_lock_ops.spin_lock_flags = xen_spin_lock_flags;
+        pv_lock_ops.unlock_kick = xen_unlock_kick;
-        pv_lock_ops.spin_trylock = xen_spin_trylock;
+}
-        pv_lock_ops.spin_unlock = xen_spin_unlock;
+static __init int xen_parse_nopvspin(char *arg)
+{
+        xen_pvspin = false;
+        return 0;
 }
+early_param("xen_nopvspin", xen_parse_nopvspin);
 #ifdef CONFIG_XEN_DEBUG_FS
@@ -442,37 +327,21 @@ static int __init xen_spinlock_debugfs(void)
        debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
-        debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout);
-        debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken);
        debugfs_create_u32("taken_slow", 0444, d_spin_debug,
-                           &spinlock_stats.taken_slow);
+                           &spinlock_stats.contention_stats[TAKEN_SLOW]);
-        debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug,
-                           &spinlock_stats.taken_slow_nested);
        debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
-                           &spinlock_stats.taken_slow_pickup);
+                           &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]);
        debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug,
-                           &spinlock_stats.taken_slow_spurious);
+                           &spinlock_stats.contention_stats[TAKEN_SLOW_SPURIOUS]);
-        debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug,
-                           &spinlock_stats.taken_slow_irqenable);
-        debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released);
        debugfs_create_u32("released_slow", 0444, d_spin_debug,
-                           &spinlock_stats.released_slow);
+                           &spinlock_stats.contention_stats[RELEASED_SLOW]);
        debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
-                           &spinlock_stats.released_slow_kicked);
+                           &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]);
-        debugfs_create_u64("time_spinning", 0444, d_spin_debug,
-                           &spinlock_stats.time_spinning);
        debugfs_create_u64("time_blocked", 0444, d_spin_debug,
                           &spinlock_stats.time_blocked);
-        debugfs_create_u64("time_total", 0444, d_spin_debug,
-                           &spinlock_stats.time_total);
-        debugfs_create_u32_array("histo_total", 0444, d_spin_debug,
-                                spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1);
-        debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug,
-                                spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1);
        debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
                                spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
author	Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>	2013-09-09 12:01:15 -0400
committer	Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>	2013-09-09 12:01:15 -0400
commit	c3f31f6a6f68bcb51689c90733282ec263602a9d (patch)
tree	07c2c7ae966b07d5adabe78215d9c76fa4ec531a /arch/x86
parent	e1a9c16b303725ac900fee2a3ec4dbe2c2f846ab (diff)
parent	36bd621337c91a1ecda588e5bbbae8dd9698bae7 (diff)