Merge branch 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull locking updates from Ingo Molnar:
 "The main changes are:

   - 'qspinlock' support, enabled on x86: queued spinlocks - these are
     now the spinlock variant used by x86 as they outperform ticket
     spinlocks in every category. (Waiman Long)

   - 'pvqspinlock' support on x86: paravirtualized variant of queued
     spinlocks. (Waiman Long, Peter Zijlstra)

   - 'qrwlock' support, enabled on x86: queued rwlocks. Similar to
     queued spinlocks, they are now the variant used by x86:

       CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y
       CONFIG_QUEUED_SPINLOCKS=y
       CONFIG_ARCH_USE_QUEUED_RWLOCKS=y
       CONFIG_QUEUED_RWLOCKS=y

   - various lockdep fixlets

   - various locking primitives cleanups, further WRITE_ONCE()
     propagation"

* 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (24 commits)
  locking/lockdep: Remove hard coded array size dependency
  locking/qrwlock: Don't contend with readers when setting _QW_WAITING
  lockdep: Do not break user-visible string
  locking/arch: Rename set_mb() to smp_store_mb()
  locking/arch: Add WRITE_ONCE() to set_mb()
  rtmutex: Warn if trylock is called from hard/softirq context
  arch: Remove __ARCH_HAVE_CMPXCHG
  locking/rtmutex: Drop usage of __HAVE_ARCH_CMPXCHG
  locking/qrwlock: Rename QUEUE_RWLOCK to QUEUED_RWLOCKS
  locking/pvqspinlock: Rename QUEUED_SPINLOCK to QUEUED_SPINLOCKS
  locking/pvqspinlock: Replace xchg() by the more descriptive set_mb()
  locking/pvqspinlock, x86: Enable PV qspinlock for Xen
  locking/pvqspinlock, x86: Enable PV qspinlock for KVM
  locking/pvqspinlock, x86: Implement the paravirt qspinlock call patching
  locking/pvqspinlock: Implement simple paravirt support for the qspinlock
  locking/qspinlock: Revert to test-and-set on hypervisors
  locking/qspinlock: Use a simple write to grab the lock
  locking/qspinlock: Optimize for smaller NR_CPUS
  locking/qspinlock: Extract out code snippets for the next patch
  locking/qspinlock: Add pending bit
  ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2015-06-22 17:54:22 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2015-06-22 17:54:22 -0400
commit: 1bf7067c6e173dc10411704db48338ed69c05565 (patch)
tree: 06d731d9647c525fa598d03d7ec957ff9772ff40
parent: fc934d40178ad4e551a17e2733241d9f29fddd70 (diff)
parent: 68722101ec3a0e179408a13708dd020e04f54aab (diff)
61 files changed, 1423 insertions, 102 deletions
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 360841da3744..13feb697271f 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -1673,7 +1673,7 @@ CPU from reordering them.
 There are some more advanced barrier functions:
- (*) set_mb(var, value)
+ (*) smp_store_mb(var, value)
     This assigns the value to the variable and then inserts a full memory
     barrier after it, depending on the function.  It isn't guaranteed to
@@ -1985,7 +1985,7 @@ after it has altered the task state:
        CPU 1
        ===============================
        set_current_state();
-          set_mb();
+          smp_store_mb();
            STORE current->state
            <general barrier>
        LOAD event_indicated
@@ -2026,7 +2026,7 @@ between the STORE to indicate the event and the STORE to set TASK_RUNNING:
        CPU 1                           CPU 2
        =============================== ===============================
        set_current_state();            STORE event_indicated
-          set_mb();                     wake_up();
+          smp_store_mb();               wake_up();
            STORE current->state          <write barrier>
            <general barrier>             STORE current->state
        LOAD event_indicated
diff --git a/arch/alpha/include/asm/cmpxchg.h b/arch/alpha/include/asm/cmpxchg.h
index 429e8cd0d78e..e5117766529e 100644
--- a/arch/alpha/include/asm/cmpxchg.h
+++ b/arch/alpha/include/asm/cmpxchg.h
@@ -66,6 +66,4 @@
 #undef __ASM__MB
 #undef ____cmpxchg
-#define __HAVE_ARCH_CMPXCHG 1
 #endif /* _ALPHA_CMPXCHG_H */
diff --git a/arch/arm/include/asm/barrier.h b/arch/arm/include/asm/barrier.h
index d2f81e6b8c1c..6c2327e1c732 100644
--- a/arch/arm/include/asm/barrier.h
+++ b/arch/arm/include/asm/barrier.h
@@ -81,7 +81,7 @@ do {									\
 #define read_barrier_depends()          do { } while(0)
 #define smp_read_barrier_depends()      do { } while(0)
-#define set_mb(var, value)      do { var = value; smp_mb(); } while (0)
+#define smp_store_mb(var, value)        do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 #define smp_mb__before_atomic() smp_mb()
 #define smp_mb__after_atomic()  smp_mb()
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index 71f19c4dc0de..0fa47c4275cb 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -114,7 +114,7 @@ do {									\
 #define read_barrier_depends()          do { } while(0)
 #define smp_read_barrier_depends()      do { } while(0)
-#define set_mb(var, value)      do { var = value; smp_mb(); } while (0)
+#define smp_store_mb(var, value)        do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 #define nop()           asm volatile("nop");
 #define smp_mb__before_atomic() smp_mb()
diff --git a/arch/avr32/include/asm/cmpxchg.h b/arch/avr32/include/asm/cmpxchg.h
index 962a6aeab787..366bbeaeb405 100644
--- a/arch/avr32/include/asm/cmpxchg.h
+++ b/arch/avr32/include/asm/cmpxchg.h
@@ -70,8 +70,6 @@ extern unsigned long __cmpxchg_u64_unsupported_on_32bit_kernels(
   if something tries to do an invalid cmpxchg().  */
 extern void __cmpxchg_called_with_bad_pointer(void);
-#define __HAVE_ARCH_CMPXCHG 1
 static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
                                      unsigned long new, int size)
 {
diff --git a/arch/hexagon/include/asm/cmpxchg.h b/arch/hexagon/include/asm/cmpxchg.h
index 9e7802911a57..a6e34e2acbba 100644
--- a/arch/hexagon/include/asm/cmpxchg.h
+++ b/arch/hexagon/include/asm/cmpxchg.h
@@ -64,7 +64,6 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
 *  looks just like atomic_cmpxchg on our arch currently with a bunch of
 *  variable casting.
 */
-#define __HAVE_ARCH_CMPXCHG 1
 #define cmpxchg(ptr, old, new)                                  \
 ({                                                              \
diff --git a/arch/ia64/include/asm/barrier.h b/arch/ia64/include/asm/barrier.h
index f6769eb2bbf9..843ba435e43b 100644
--- a/arch/ia64/include/asm/barrier.h
+++ b/arch/ia64/include/asm/barrier.h
@@ -77,12 +77,7 @@ do {									\
        ___p1;                                                          \
 })
-/*
+#define smp_store_mb(var, value)        do { WRITE_ONCE(var, value); mb(); } while (0)
- * XXX check on this ---I suspect what Linus really wants here is
- * acquire vs release semantics but we can't discuss this stuff with
- * Linus just yet.  Grrr...
- */
-#define set_mb(var, value)      do { (var) = (value); mb(); } while (0)
 /*
 * The group barrier in front of the rsm & ssm are necessary to ensure
diff --git a/arch/ia64/include/uapi/asm/cmpxchg.h b/arch/ia64/include/uapi/asm/cmpxchg.h
index f35109b1d907..a0e3620f8f13 100644
--- a/arch/ia64/include/uapi/asm/cmpxchg.h
+++ b/arch/ia64/include/uapi/asm/cmpxchg.h
@@ -61,8 +61,6 @@ extern void ia64_xchg_called_with_bad_pointer(void);
 * indicated by comparing RETURN with OLD.
 */
-#define __HAVE_ARCH_CMPXCHG 1
 /*
 * This function doesn't exist, so you'll get a linker error
 * if something tries to do an invalid cmpxchg().
diff --git a/arch/m32r/include/asm/cmpxchg.h b/arch/m32r/include/asm/cmpxchg.h
index de651db20b43..14bf9b739dd2 100644
--- a/arch/m32r/include/asm/cmpxchg.h
+++ b/arch/m32r/include/asm/cmpxchg.h
@@ -107,8 +107,6 @@ __xchg_local(unsigned long x, volatile void *ptr, int size)
        ((__typeof__(*(ptr)))__xchg_local((unsigned long)(x), (ptr),    \
                        sizeof(*(ptr))))
-#define __HAVE_ARCH_CMPXCHG     1
 static inline unsigned long
 __cmpxchg_u32(volatile unsigned int *p, unsigned int old, unsigned int new)
 {
diff --git a/arch/m68k/include/asm/cmpxchg.h b/arch/m68k/include/asm/cmpxchg.h
index bc755bc620ad..83b1df80f0ac 100644
--- a/arch/m68k/include/asm/cmpxchg.h
+++ b/arch/m68k/include/asm/cmpxchg.h
@@ -90,7 +90,6 @@ extern unsigned long __invalid_cmpxchg_size(volatile void *,
 * indicated by comparing RETURN with OLD.
 */
 #ifdef CONFIG_RMW_INSNS
-#define __HAVE_ARCH_CMPXCHG     1
 static inline unsigned long __cmpxchg(volatile void *p, unsigned long old,
                                      unsigned long new, int size)
diff --git a/arch/metag/include/asm/barrier.h b/arch/metag/include/asm/barrier.h
index d703d8e26a65..5a696e507930 100644
--- a/arch/metag/include/asm/barrier.h
+++ b/arch/metag/include/asm/barrier.h
@@ -84,7 +84,7 @@ static inline void fence(void)
 #define read_barrier_depends()          do { } while (0)
 #define smp_read_barrier_depends()      do { } while (0)
-#define set_mb(var, value) do { var = value; smp_mb(); } while (0)
+#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 #define smp_store_release(p, v)                                         \
 do {                                                                    \
diff --git a/arch/metag/include/asm/cmpxchg.h b/arch/metag/include/asm/cmpxchg.h
index b1bc1be8540f..be29e3e44321 100644
--- a/arch/metag/include/asm/cmpxchg.h
+++ b/arch/metag/include/asm/cmpxchg.h
@@ -51,8 +51,6 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
        return old;
 }
-#define __HAVE_ARCH_CMPXCHG 1
 #define cmpxchg(ptr, o, n)                                              \
        ({                                                              \
                __typeof__(*(ptr)) _o_ = (o);                           \
diff --git a/arch/mips/include/asm/barrier.h b/arch/mips/include/asm/barrier.h
index 2b8bbbcb9be0..7ecba84656d4 100644
--- a/arch/mips/include/asm/barrier.h
+++ b/arch/mips/include/asm/barrier.h
@@ -112,8 +112,8 @@
 #define __WEAK_LLSC_MB          "               \n"
 #endif
-#define set_mb(var, value) \
+#define smp_store_mb(var, value) \
-        do { var = value; smp_mb(); } while (0)
+        do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 #define smp_llsc_mb()   __asm__ __volatile__(__WEAK_LLSC_MB : : :"memory")
diff --git a/arch/mips/include/asm/cmpxchg.h b/arch/mips/include/asm/cmpxchg.h
index 412f945f1f5e..b71ab4a5fd50 100644
--- a/arch/mips/include/asm/cmpxchg.h
+++ b/arch/mips/include/asm/cmpxchg.h
@@ -138,8 +138,6 @@ static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int siz
                __xchg((unsigned long)(x), (ptr), sizeof(*(ptr))));     \
 })
-#define __HAVE_ARCH_CMPXCHG 1
 #define __cmpxchg_asm(ld, st, m, old, new)                              \
 ({                                                                      \
        __typeof(*(m)) __ret;                                           \
diff --git a/arch/parisc/include/asm/cmpxchg.h b/arch/parisc/include/asm/cmpxchg.h
index dbd13354ec41..0a90b965cccb 100644
--- a/arch/parisc/include/asm/cmpxchg.h
+++ b/arch/parisc/include/asm/cmpxchg.h
@@ -46,8 +46,6 @@ __xchg(unsigned long x, __volatile__ void *ptr, int size)
 #define xchg(ptr, x) \
        ((__typeof__(*(ptr)))__xchg((unsigned long)(x), (ptr), sizeof(*(ptr))))
-#define __HAVE_ARCH_CMPXCHG     1
 /* bug catcher for when unsupported size is used - won't link */
 extern void __cmpxchg_called_with_bad_pointer(void);
diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h
index 1124f59b8df4..51ccc7232042 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -34,7 +34,7 @@
 #define rmb()  __asm__ __volatile__ ("sync" : : : "memory")
 #define wmb()  __asm__ __volatile__ ("sync" : : : "memory")
-#define set_mb(var, value)      do { var = value; mb(); } while (0)
+#define smp_store_mb(var, value)        do { WRITE_ONCE(var, value); mb(); } while (0)
 #ifdef __SUBARCH_HAS_LWSYNC
 #    define SMPWMB      LWSYNC
diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
index d463c68fe7f0..ad6263cffb0f 100644
--- a/arch/powerpc/include/asm/cmpxchg.h
+++ b/arch/powerpc/include/asm/cmpxchg.h
@@ -144,7 +144,6 @@ __xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
 * Compare and exchange - if *p == old, set it to new,
 * and return the old value of *p.
 */
-#define __HAVE_ARCH_CMPXCHG     1
 static __always_inline unsigned long
 __cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new)
diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h
index 8d724718ec21..e6f8615a11eb 100644
--- a/arch/s390/include/asm/barrier.h
+++ b/arch/s390/include/asm/barrier.h
@@ -36,7 +36,7 @@
 #define smp_mb__before_atomic()         smp_mb()
 #define smp_mb__after_atomic()          smp_mb()
-#define set_mb(var, value)              do { var = value; mb(); } while (0)
+#define smp_store_mb(var, value)                do { WRITE_ONCE(var, value); mb(); } while (0)
 #define smp_store_release(p, v)                                         \
 do {                                                                    \
diff --git a/arch/s390/include/asm/cmpxchg.h b/arch/s390/include/asm/cmpxchg.h
index 4eadec466b8c..411464f4c97a 100644
--- a/arch/s390/include/asm/cmpxchg.h
+++ b/arch/s390/include/asm/cmpxchg.h
@@ -32,8 +32,6 @@
        __old;                                                          \
 })
-#define __HAVE_ARCH_CMPXCHG
 #define __cmpxchg_double_op(p1, p2, o1, o2, n1, n2, insn)               \
 ({                                                                      \
        register __typeof__(*(p1)) __old1 asm("2") = (o1);              \
diff --git a/arch/score/include/asm/cmpxchg.h b/arch/score/include/asm/cmpxchg.h
index f384839c3ee5..cc3f6420b71c 100644
--- a/arch/score/include/asm/cmpxchg.h
+++ b/arch/score/include/asm/cmpxchg.h
@@ -42,8 +42,6 @@ static inline unsigned long __cmpxchg(volatile unsigned long *m,
                                        (unsigned long)(o),     \
                                        (unsigned long)(n)))
-#define __HAVE_ARCH_CMPXCHG     1
 #include <asm-generic/cmpxchg-local.h>
 #endif /* _ASM_SCORE_CMPXCHG_H */
diff --git a/arch/sh/include/asm/barrier.h b/arch/sh/include/asm/barrier.h
index 43715308b068..bf91037db4e0 100644
--- a/arch/sh/include/asm/barrier.h
+++ b/arch/sh/include/asm/barrier.h
@@ -32,7 +32,7 @@
 #define ctrl_barrier()  __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop")
 #endif
-#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
+#define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0)
 #include <asm-generic/barrier.h>
diff --git a/arch/sh/include/asm/cmpxchg.h b/arch/sh/include/asm/cmpxchg.h
index f6bd1406b897..85c97b188d71 100644
--- a/arch/sh/include/asm/cmpxchg.h
+++ b/arch/sh/include/asm/cmpxchg.h
@@ -46,8 +46,6 @@ extern void __xchg_called_with_bad_pointer(void);
 * if something tries to do an invalid cmpxchg(). */
 extern void __cmpxchg_called_with_bad_pointer(void);
-#define __HAVE_ARCH_CMPXCHG 1
 static inline unsigned long __cmpxchg(volatile void * ptr, unsigned long old,
                unsigned long new, int size)
 {
diff --git a/arch/sparc/include/asm/barrier_64.h b/arch/sparc/include/asm/barrier_64.h
index 76648941fea7..809941e33e12 100644
--- a/arch/sparc/include/asm/barrier_64.h
+++ b/arch/sparc/include/asm/barrier_64.h
@@ -40,8 +40,8 @@ do {	__asm__ __volatile__("ba,pt	%%xcc, 1f\n\t" \
 #define dma_rmb()       rmb()
 #define dma_wmb()       wmb()
-#define set_mb(__var, __value) \
+#define smp_store_mb(__var, __value) \
-        do { __var = __value; membar_safe("#StoreLoad"); } while(0)
+        do { WRITE_ONCE(__var, __value); membar_safe("#StoreLoad"); } while(0)
 #ifdef CONFIG_SMP
 #define smp_mb()        mb()
diff --git a/arch/sparc/include/asm/cmpxchg_32.h b/arch/sparc/include/asm/cmpxchg_32.h
index d38b52dca216..83ffb83c5397 100644
--- a/arch/sparc/include/asm/cmpxchg_32.h
+++ b/arch/sparc/include/asm/cmpxchg_32.h
@@ -34,7 +34,6 @@ static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr, int
 *
 * Cribbed from <asm-parisc/atomic.h>
 */
-#define __HAVE_ARCH_CMPXCHG     1
 /* bug catcher for when unsupported size is used - won't link */
 void __cmpxchg_called_with_bad_pointer(void);
diff --git a/arch/sparc/include/asm/cmpxchg_64.h b/arch/sparc/include/asm/cmpxchg_64.h
index 0e1ed6cfbf68..faa2f61058c2 100644
--- a/arch/sparc/include/asm/cmpxchg_64.h
+++ b/arch/sparc/include/asm/cmpxchg_64.h
@@ -65,8 +65,6 @@ static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr,
 #include <asm-generic/cmpxchg-local.h>
-#define __HAVE_ARCH_CMPXCHG 1
 static inline unsigned long
 __cmpxchg_u32(volatile int *m, int old, int new)
 {
diff --git a/arch/tile/include/asm/atomic_64.h b/arch/tile/include/asm/atomic_64.h
index 7b11c5fadd42..0496970cef82 100644
--- a/arch/tile/include/asm/atomic_64.h
+++ b/arch/tile/include/asm/atomic_64.h
@@ -105,9 +105,6 @@ static inline long atomic64_add_unless(atomic64_t *v, long a, long u)
 #define atomic64_inc_not_zero(v)        atomic64_add_unless((v), 1, 0)
-/* Define this to indicate that cmpxchg is an efficient operation. */
-#define __HAVE_ARCH_CMPXCHG
 #endif /* !__ASSEMBLY__ */
 #endif /* _ASM_TILE_ATOMIC_64_H */
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 226d5696e1d1..4e986e809861 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -127,7 +127,8 @@ config X86
        select MODULES_USE_ELF_RELA if X86_64
        select CLONE_BACKWARDS if X86_32
        select ARCH_USE_BUILTIN_BSWAP
-        select ARCH_USE_QUEUE_RWLOCK
+        select ARCH_USE_QUEUED_SPINLOCKS
+        select ARCH_USE_QUEUED_RWLOCKS
        select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION
        select OLD_SIGACTION if X86_32
        select COMPAT_OLD_SIGACTION if IA32_EMULATION
@@ -666,7 +667,7 @@ config PARAVIRT_DEBUG
 config PARAVIRT_SPINLOCKS
        bool "Paravirtualization layer for spinlocks"
        depends on PARAVIRT && SMP
-        select UNINLINE_SPIN_UNLOCK
+        select UNINLINE_SPIN_UNLOCK if !QUEUED_SPINLOCKS
        ---help---
          Paravirtualized spinlocks allow a pvops backend to replace the
          spinlock implementation with something virtualization-friendly
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 959e45b81fe2..e51a8f803f55 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -35,12 +35,12 @@
 #define smp_mb()        mb()
 #define smp_rmb()       dma_rmb()
 #define smp_wmb()       barrier()
-#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
+#define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0)
 #else /* !SMP */
 #define smp_mb()        barrier()
 #define smp_rmb()       barrier()
 #define smp_wmb()       barrier()
-#define set_mb(var, value) do { var = value; barrier(); } while (0)
+#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0)
 #endif /* SMP */
 #define read_barrier_depends()          do { } while (0)
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index 99c105d78b7e..ad19841eddfe 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -4,8 +4,6 @@
 #include <linux/compiler.h>
 #include <asm/alternative.h> /* Provides LOCK_PREFIX */
-#define __HAVE_ARCH_CMPXCHG 1
 /*
 * Non-existant functions to indicate usage errors at link time
 * (or compile-time if the compiler implements __compiletime_error().
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 8957810ad7d1..d143bfad45d7 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -712,6 +712,31 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
 #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#ifdef CONFIG_QUEUED_SPINLOCKS
+static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock,
+                                                        u32 val)
+{
+        PVOP_VCALL2(pv_lock_ops.queued_spin_lock_slowpath, lock, val);
+}
+static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock)
+{
+        PVOP_VCALLEE1(pv_lock_ops.queued_spin_unlock, lock);
+}
+static __always_inline void pv_wait(u8 *ptr, u8 val)
+{
+        PVOP_VCALL2(pv_lock_ops.wait, ptr, val);
+}
+static __always_inline void pv_kick(int cpu)
+{
+        PVOP_VCALL1(pv_lock_ops.kick, cpu);
+}
+#else /* !CONFIG_QUEUED_SPINLOCKS */
 static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock,
                                                        __ticket_t ticket)
 {
@@ -724,7 +749,9 @@ static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock,
        PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket);
 }
-#endif
+#endif /* CONFIG_QUEUED_SPINLOCKS */
+#endif /* SMP && PARAVIRT_SPINLOCKS */
 #ifdef CONFIG_X86_32
 #define PV_SAVE_REGS "pushl %ecx; pushl %edx;"
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index f7b0b5c112f2..8766c7c395c2 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -333,9 +333,19 @@ struct arch_spinlock;
 typedef u16 __ticket_t;
 #endif
+struct qspinlock;
 struct pv_lock_ops {
+#ifdef CONFIG_QUEUED_SPINLOCKS
+        void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val);
+        struct paravirt_callee_save queued_spin_unlock;
+        void (*wait)(u8 *ptr, u8 val);
+        void (*kick)(int cpu);
+#else /* !CONFIG_QUEUED_SPINLOCKS */
        struct paravirt_callee_save lock_spinning;
        void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket);
+#endif /* !CONFIG_QUEUED_SPINLOCKS */
 };
 /* This contains all the paravirt structures: we get a convenient
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
new file mode 100644
index 000000000000..9d51fae1cba3
--- /dev/null
+++ b/arch/x86/include/asm/qspinlock.h
@@ -0,0 +1,57 @@
+#ifndef _ASM_X86_QSPINLOCK_H
+#define _ASM_X86_QSPINLOCK_H
+#include <asm/cpufeature.h>
+#include <asm-generic/qspinlock_types.h>
+#include <asm/paravirt.h>
+#define queued_spin_unlock queued_spin_unlock
+/**
+ * queued_spin_unlock - release a queued spinlock
+ * @lock : Pointer to queued spinlock structure
+ *
+ * A smp_store_release() on the least-significant byte.
+ */
+static inline void native_queued_spin_unlock(struct qspinlock *lock)
+{
+        smp_store_release((u8 *)lock, 0);
+}
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_init_lock_hash(void);
+extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock);
+static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+        pv_queued_spin_lock_slowpath(lock, val);
+}
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+        pv_queued_spin_unlock(lock);
+}
+#else
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+        native_queued_spin_unlock(lock);
+}
+#endif
+#define virt_queued_spin_lock virt_queued_spin_lock
+static inline bool virt_queued_spin_lock(struct qspinlock *lock)
+{
+        if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
+                return false;
+        while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0)
+                cpu_relax();
+        return true;
+}
+#include <asm-generic/qspinlock.h>
+#endif /* _ASM_X86_QSPINLOCK_H */
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
new file mode 100644
index 000000000000..b002e711ba88
--- /dev/null
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_QSPINLOCK_PARAVIRT_H
+#define __ASM_QSPINLOCK_PARAVIRT_H
+PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock);
+#endif
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 64b611782ef0..be0a05913b91 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -42,6 +42,10 @@
 extern struct static_key paravirt_ticketlocks_enabled;
 static __always_inline bool static_key_false(struct static_key *key);
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm/qspinlock.h>
+#else
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 static inline void __ticket_enter_slowpath(arch_spinlock_t *lock)
@@ -196,6 +200,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
                cpu_relax();
        }
 }
+#endif /* CONFIG_QUEUED_SPINLOCKS */
 /*
 * Read-write spinlocks, allowing multiple readers
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index 5f9d7572d82b..65c3e37f879a 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -23,6 +23,9 @@ typedef u32 __ticketpair_t;
 #define TICKET_SHIFT    (sizeof(__ticket_t) * 8)
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm-generic/qspinlock_types.h>
+#else
 typedef struct arch_spinlock {
        union {
                __ticketpair_t head_tail;
@@ -33,6 +36,7 @@ typedef struct arch_spinlock {
 } arch_spinlock_t;
 #define __ARCH_SPIN_LOCK_UNLOCKED       { { 0 } }
+#endif /* CONFIG_QUEUED_SPINLOCKS */
 #include <asm-generic/qrwlock_types.h>
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 9435620062df..1681504e44a4 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -584,6 +584,39 @@ static void kvm_kick_cpu(int cpu)
        kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
 }
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm/qspinlock.h>
+static void kvm_wait(u8 *ptr, u8 val)
+{
+        unsigned long flags;
+        if (in_nmi())
+                return;
+        local_irq_save(flags);
+        if (READ_ONCE(*ptr) != val)
+                goto out;
+        /*
+         * halt until it's our turn and kicked. Note that we do safe halt
+         * for irq enabled case to avoid hang when lock info is overwritten
+         * in irq spinlock slowpath and no spurious interrupt occur to save us.
+         */
+        if (arch_irqs_disabled_flags(flags))
+                halt();
+        else
+                safe_halt();
+out:
+        local_irq_restore(flags);
+}
+#else /* !CONFIG_QUEUED_SPINLOCKS */
 enum kvm_contention_stat {
        TAKEN_SLOW,
        TAKEN_SLOW_PICKUP,
@@ -817,6 +850,8 @@ static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
        }
 }
+#endif /* !CONFIG_QUEUED_SPINLOCKS */
 /*
 * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
 */
@@ -828,8 +863,16 @@ void __init kvm_spinlock_init(void)
        if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
                return;
+#ifdef CONFIG_QUEUED_SPINLOCKS
+        __pv_init_lock_hash();
+        pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
+        pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
+        pv_lock_ops.wait = kvm_wait;
+        pv_lock_ops.kick = kvm_kick_cpu;
+#else /* !CONFIG_QUEUED_SPINLOCKS */
        pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning);
        pv_lock_ops.unlock_kick = kvm_unlock_kick;
+#endif
 }
 static __init int kvm_spinlock_init_jump(void)
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index bbb6c7316341..33ee3e0efd65 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -8,11 +8,33 @@
 #include <asm/paravirt.h>
+#ifdef CONFIG_QUEUED_SPINLOCKS
+__visible void __native_queued_spin_unlock(struct qspinlock *lock)
+{
+        native_queued_spin_unlock(lock);
+}
+PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock);
+bool pv_is_native_spin_unlock(void)
+{
+        return pv_lock_ops.queued_spin_unlock.func ==
+                __raw_callee_save___native_queued_spin_unlock;
+}
+#endif
 struct pv_lock_ops pv_lock_ops = {
 #ifdef CONFIG_SMP
+#ifdef CONFIG_QUEUED_SPINLOCKS
+        .queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
+        .queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
+        .wait = paravirt_nop,
+        .kick = paravirt_nop,
+#else /* !CONFIG_QUEUED_SPINLOCKS */
        .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop),
        .unlock_kick = paravirt_nop,
-#endif
+#endif /* !CONFIG_QUEUED_SPINLOCKS */
+#endif /* SMP */
 };
 EXPORT_SYMBOL(pv_lock_ops);
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index d9f32e6d6ab6..e1b013696dde 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -12,6 +12,10 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
 DEF_NATIVE(pv_cpu_ops, clts, "clts");
 DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
+#endif
 unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
 {
        /* arg in %eax, return in %eax */
@@ -24,6 +28,8 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
        return 0;
 }
+extern bool pv_is_native_spin_unlock(void);
 unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
                      unsigned long addr, unsigned len)
 {
@@ -47,14 +53,22 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
                PATCH_SITE(pv_mmu_ops, write_cr3);
                PATCH_SITE(pv_cpu_ops, clts);
                PATCH_SITE(pv_cpu_ops, read_tsc);
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
-        patch_site:
+                case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
-                ret = paravirt_patch_insns(ibuf, len, start, end);
+                        if (pv_is_native_spin_unlock()) {
-                break;
+                                start = start_pv_lock_ops_queued_spin_unlock;
+                                end   = end_pv_lock_ops_queued_spin_unlock;
+                                goto patch_site;
+                        }
+#endif
        default:
                ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
                break;
+patch_site:
+                ret = paravirt_patch_insns(ibuf, len, start, end);
+                break;
        }
 #undef PATCH_SITE
        return ret;
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index a1da6737ba5b..a1fa86782186 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -21,6 +21,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
 DEF_NATIVE(, mov32, "mov %edi, %eax");
 DEF_NATIVE(, mov64, "mov %rdi, %rax");
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)");
+#endif
 unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
 {
        return paravirt_patch_insns(insnbuf, len,
@@ -33,6 +37,8 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
                                    start__mov64, end__mov64);
 }
+extern bool pv_is_native_spin_unlock(void);
 unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
                      unsigned long addr, unsigned len)
 {
@@ -59,14 +65,22 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
                PATCH_SITE(pv_cpu_ops, clts);
                PATCH_SITE(pv_mmu_ops, flush_tlb_single);
                PATCH_SITE(pv_cpu_ops, wbinvd);
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
-        patch_site:
+                case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
-                ret = paravirt_patch_insns(ibuf, len, start, end);
+                        if (pv_is_native_spin_unlock()) {
-                break;
+                                start = start_pv_lock_ops_queued_spin_unlock;
+                                end   = end_pv_lock_ops_queued_spin_unlock;
+                                goto patch_site;
+                        }
+#endif
        default:
                ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
                break;
+patch_site:
+                ret = paravirt_patch_insns(ibuf, len, start, end);
+                break;
        }
 #undef PATCH_SITE
        return ret;
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index 7e8a1a650435..b9531d343134 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -39,7 +39,8 @@
 #define smp_mb()        barrier()
 #define smp_rmb()       barrier()
 #define smp_wmb()       barrier()
-#define set_mb(var, value) do { var = value; barrier(); } while (0)
+#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0)
 #define read_barrier_depends()          do { } while (0)
 #define smp_read_barrier_depends()      do { } while (0)
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 956374c1edbc..9e2ba5c6e1dd 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -17,6 +17,56 @@
 #include "xen-ops.h"
 #include "debugfs.h"
+static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
+static DEFINE_PER_CPU(char *, irq_name);
+static bool xen_pvspin = true;
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm/qspinlock.h>
+static void xen_qlock_kick(int cpu)
+{
+        xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
+}
+/*
+ * Halt the current CPU & release it back to the host
+ */
+static void xen_qlock_wait(u8 *byte, u8 val)
+{
+        int irq = __this_cpu_read(lock_kicker_irq);
+        /* If kicker interrupts not initialized yet, just spin */
+        if (irq == -1)
+                return;
+        /* clear pending */
+        xen_clear_irq_pending(irq);
+        barrier();
+        /*
+         * We check the byte value after clearing pending IRQ to make sure
+         * that we won't miss a wakeup event because of the clearing.
+         *
+         * The sync_clear_bit() call in xen_clear_irq_pending() is atomic.
+         * So it is effectively a memory barrier for x86.
+         */
+        if (READ_ONCE(*byte) != val)
+                return;
+        /*
+         * If an interrupt happens here, it will leave the wakeup irq
+         * pending, which will cause xen_poll_irq() to return
+         * immediately.
+         */
+        /* Block until irq becomes pending (or perhaps a spurious wakeup) */
+        xen_poll_irq(irq);
+}
+#else /* CONFIG_QUEUED_SPINLOCKS */
 enum xen_contention_stat {
        TAKEN_SLOW,
        TAKEN_SLOW_PICKUP,
@@ -100,12 +150,9 @@ struct xen_lock_waiting {
        __ticket_t want;
 };
-static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
-static DEFINE_PER_CPU(char *, irq_name);
 static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting);
 static cpumask_t waiting_cpus;
-static bool xen_pvspin = true;
 __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
 {
        int irq = __this_cpu_read(lock_kicker_irq);
@@ -217,6 +264,7 @@ static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next)
                }
        }
 }
+#endif /* CONFIG_QUEUED_SPINLOCKS */
 static irqreturn_t dummy_handler(int irq, void *dev_id)
 {
@@ -280,8 +328,16 @@ void __init xen_init_spinlocks(void)
                return;
        }
        printk(KERN_DEBUG "xen: PV spinlocks enabled\n");
+#ifdef CONFIG_QUEUED_SPINLOCKS
+        __pv_init_lock_hash();
+        pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
+        pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
+        pv_lock_ops.wait = xen_qlock_wait;
+        pv_lock_ops.kick = xen_qlock_kick;
+#else
        pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning);
        pv_lock_ops.unlock_kick = xen_unlock_kick;
+#endif
 }
 /*
@@ -310,7 +366,7 @@ static __init int xen_parse_nopvspin(char *arg)
 }
 early_param("xen_nopvspin", xen_parse_nopvspin);
-#ifdef CONFIG_XEN_DEBUG_FS
+#if defined(CONFIG_XEN_DEBUG_FS) && !defined(CONFIG_QUEUED_SPINLOCKS)
 static struct dentry *d_spin_debug;
diff --git a/fs/select.c b/fs/select.c
index f684c750e08a..015547330e88 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -189,7 +189,7 @@ static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
         * doesn't imply write barrier and the users expect write
         * barrier semantics on wakeup functions.  The following
         * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
-         * and is paired with set_mb() in poll_schedule_timeout.
+         * and is paired with smp_store_mb() in poll_schedule_timeout.
         */
        smp_wmb();
        pwq->triggered = 1;
@@ -244,7 +244,7 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
        /*
         * Prepare for the next iteration.
         *
-         * The following set_mb() serves two purposes.  First, it's
+         * The following smp_store_mb() serves two purposes.  First, it's
         * the counterpart rmb of the wmb in pollwake() such that data
         * written before wake up is always visible after wake up.
         * Second, the full barrier guarantees that triggered clearing
@@ -252,7 +252,7 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
         * this problem doesn't exist for the first iteration as
         * add_wait_queue() has full barrier semantics.
         */
-        set_mb(pwq->triggered, 0);
+        smp_store_mb(pwq->triggered, 0);
        return rc;
 }
diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index f5c40b0fadc2..e6a83d712ef6 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -66,8 +66,8 @@
 #define smp_read_barrier_depends()      do { } while (0)
 #endif
-#ifndef set_mb
+#ifndef smp_store_mb
-#define set_mb(var, value)  do { (var) = (value); mb(); } while (0)
+#define smp_store_mb(var, value)  do { WRITE_ONCE(var, value); mb(); } while (0)
 #endif
 #ifndef smp_mb__before_atomic
diff --git a/include/asm-generic/cmpxchg.h b/include/asm-generic/cmpxchg.h
index 811fb1e9b061..3766ab34aa45 100644
--- a/include/asm-generic/cmpxchg.h
+++ b/include/asm-generic/cmpxchg.h
@@ -86,9 +86,6 @@ unsigned long __xchg(unsigned long x, volatile void *ptr, int size)
 /*
 * Atomic compare and exchange.
- *
- * Do not define __HAVE_ARCH_CMPXCHG because we want to use it to check whether
- * a cmpxchg primitive faster than repeated local irq save/restore exists.
 */
 #include <asm-generic/cmpxchg-local.h>
diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
new file mode 100644
index 000000000000..83bfb87f5bf1
--- /dev/null
+++ b/include/asm-generic/qspinlock.h
@@ -0,0 +1,139 @@
+/*
+ * Queued spinlock
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
+ *
+ * Authors: Waiman Long <waiman.long@hp.com>
+ */
+#ifndef __ASM_GENERIC_QSPINLOCK_H
+#define __ASM_GENERIC_QSPINLOCK_H
+#include <asm-generic/qspinlock_types.h>
+/**
+ * queued_spin_is_locked - is the spinlock locked?
+ * @lock: Pointer to queued spinlock structure
+ * Return: non-zero if it is locked, 0 otherwise
+ */
+static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
+{
+        return atomic_read(&lock->val);
+}
+/**
+ * queued_spin_value_unlocked - is the spinlock structure unlocked?
+ * @lock: queued spinlock structure
+ * Return: 1 if it is unlocked, 0 otherwise
+ *
+ * N.B. Whenever there are tasks waiting for the lock, it is considered
+ *      locked wrt the lockref code, so that the lockref code cannot steal
+ *      the lock and change things underneath it. This also allows some
+ *      optimizations to be applied without conflict with lockref.
+ */
+static __always_inline int queued_spin_value_unlocked(struct qspinlock lock)
+{
+        return !atomic_read(&lock.val);
+}
+/**
+ * queued_spin_is_contended - check if the lock is contended
+ * @lock : Pointer to queued spinlock structure
+ * Return: non-zero if lock contended, 0 otherwise
+ */
+static __always_inline int queued_spin_is_contended(struct qspinlock *lock)
+{
+        return atomic_read(&lock->val) & ~_Q_LOCKED_MASK;
+}
+/**
+ * queued_spin_trylock - make a single attempt to take the queued spinlock
+ * @lock : Pointer to queued spinlock structure
+ * Return: 1 if lock acquired, 0 if failed
+ */
+static __always_inline int queued_spin_trylock(struct qspinlock *lock)
+{
+        /* Only issue the cmpxchg when the lock word currently reads as 0. */
+        if (atomic_read(&lock->val))
+                return 0;
+        return atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) == 0;
+}
+extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+/**
+ * queued_spin_lock - acquire a queued spinlock
+ * @lock: Pointer to queued spinlock structure
+ *
+ * Tries a 0 -> _Q_LOCKED_VAL cmpxchg first and only calls
+ * queued_spin_lock_slowpath(), passing the value it observed, when the
+ * lock word was not 0.
+ */
+static __always_inline void queued_spin_lock(struct qspinlock *lock)
+{
+        u32 old = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL);
+
+        if (unlikely(old != 0))
+                queued_spin_lock_slowpath(lock, old);
+}
+#ifndef queued_spin_unlock
+/**
+ * queued_spin_unlock - release a queued spinlock
+ * @lock : Pointer to queued spinlock structure
+ *
+ * Subtracts _Q_LOCKED_VAL from the lock word. An architecture may supply
+ * its own version by defining queued_spin_unlock before this point.
+ */
+static __always_inline void queued_spin_unlock(struct qspinlock *lock)
+{
+        /*
+         * smp_mb__before_atomic() in order to guarantee release semantics
+         * for the atomic_sub() below. The per-operation
+         * smp_mb__before_atomic_dec() spelling is deprecated.
+         */
+        smp_mb__before_atomic();
+        atomic_sub(_Q_LOCKED_VAL, &lock->val);
+}
+#endif
+/**
+ * queued_spin_unlock_wait - spin until the locked byte is observed clear
+ * @lock : Pointer to queued spinlock structure
+ *
+ * There is a very slight possibility of live-lock if the lockers keep coming
+ * and the waiter is just unfortunate enough to not see any unlock state.
+ */
+static inline void queued_spin_unlock_wait(struct qspinlock *lock)
+{
+        for (;;) {
+                /* Done as soon as no bit of _Q_LOCKED_MASK is set. */
+                if (!(atomic_read(&lock->val) & _Q_LOCKED_MASK))
+                        return;
+                cpu_relax();
+        }
+}
+#ifndef virt_queued_spin_lock
+static __always_inline bool virt_queued_spin_lock(struct qspinlock *lock)
+{
+        return false;
+}
+#endif
+/*
+ * Initializer
+ */
+#define __ARCH_SPIN_LOCK_UNLOCKED       { ATOMIC_INIT(0) }
+/*
+ * Remapping spinlock architecture specific functions to the corresponding
+ * queued spinlock functions.
+ */
+#define arch_spin_is_locked(l)          queued_spin_is_locked(l)
+#define arch_spin_is_contended(l)       queued_spin_is_contended(l)
+#define arch_spin_value_unlocked(l)     queued_spin_value_unlocked(l)
+#define arch_spin_lock(l)               queued_spin_lock(l)
+#define arch_spin_trylock(l)            queued_spin_trylock(l)
+#define arch_spin_unlock(l)             queued_spin_unlock(l)
+#define arch_spin_lock_flags(l, f)      queued_spin_lock(l)
+#define arch_spin_unlock_wait(l)        queued_spin_unlock_wait(l)
+#endif /* __ASM_GENERIC_QSPINLOCK_H */
diff --git a/include/asm-generic/qspinlock_types.h b/include/asm-generic/qspinlock_types.h
new file mode 100644
index 000000000000..85f888e86761
--- /dev/null
+++ b/include/asm-generic/qspinlock_types.h
@@ -0,0 +1,79 @@
+/*
+ * Queued spinlock
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
+ *
+ * Authors: Waiman Long <waiman.long@hp.com>
+ */
+#ifndef __ASM_GENERIC_QSPINLOCK_TYPES_H
+#define __ASM_GENERIC_QSPINLOCK_TYPES_H
+/*
+ * Including atomic.h with PARAVIRT on will cause compilation errors because
+ * of recursive header file inclusion via paravirt_types.h. So don't include
+ * it if PARAVIRT is on.
+ */
+#ifndef CONFIG_PARAVIRT
+#include <linux/types.h>
+#include <linux/atomic.h>
+#endif
+typedef struct qspinlock {
+        atomic_t        val;
+} arch_spinlock_t;
+/*
+ * Bitfields in the atomic value:
+ *
+ * When NR_CPUS < 16K
+ *  0- 7: locked byte
+ *     8: pending
+ *  9-15: not used
+ * 16-17: tail index
+ * 18-31: tail cpu (+1)
+ *
+ * When NR_CPUS >= 16K
+ *  0- 7: locked byte
+ *     8: pending
+ *  9-10: tail index
+ * 11-31: tail cpu (+1)
+ */
+#define _Q_SET_MASK(type)       (((1U << _Q_ ## type ## _BITS) - 1)\
+                                      << _Q_ ## type ## _OFFSET)
+#define _Q_LOCKED_OFFSET        0
+#define _Q_LOCKED_BITS          8
+#define _Q_LOCKED_MASK          _Q_SET_MASK(LOCKED)
+#define _Q_PENDING_OFFSET       (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
+#if CONFIG_NR_CPUS < (1U << 14)
+#define _Q_PENDING_BITS         8
+#else
+#define _Q_PENDING_BITS         1
+#endif
+#define _Q_PENDING_MASK         _Q_SET_MASK(PENDING)
+#define _Q_TAIL_IDX_OFFSET      (_Q_PENDING_OFFSET + _Q_PENDING_BITS)
+#define _Q_TAIL_IDX_BITS        2
+#define _Q_TAIL_IDX_MASK        _Q_SET_MASK(TAIL_IDX)
+#define _Q_TAIL_CPU_OFFSET      (_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS)
+#define _Q_TAIL_CPU_BITS        (32 - _Q_TAIL_CPU_OFFSET)
+#define _Q_TAIL_CPU_MASK        _Q_SET_MASK(TAIL_CPU)
+#define _Q_TAIL_OFFSET          _Q_TAIL_IDX_OFFSET
+#define _Q_TAIL_MASK            (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)
+#define _Q_LOCKED_VAL           (1U << _Q_LOCKED_OFFSET)
+#define _Q_PENDING_VAL          (1U << _Q_PENDING_OFFSET)
+#endif /* __ASM_GENERIC_QSPINLOCK_TYPES_H */
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 5d66777914db..05be2352fef8 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -250,7 +250,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
        ({ union { typeof(x) __val; char __c[1]; } __u; __read_once_size(&(x), __u.__c, sizeof(x)); __u.__val; })
 #define WRITE_ONCE(x, val) \
-        ({ typeof(x) __val = (val); __write_once_size(&(x), &__val, sizeof(__val)); __val; })
+        ({ union { typeof(x) __val; char __c[1]; } __u = { .__val = (val) }; __write_once_size(&(x), __u.__c, sizeof(x)); __u.__val; })
 /**
 * READ_ONCE_CTRL - Read a value heading a control dependency
@@ -466,7 +466,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 * with an explicit memory barrier or atomic instruction that provides the
 * required ordering.
 *
- * If possible use READ_ONCE/ASSIGN_ONCE instead.
+ * If possible use READ_ONCE()/WRITE_ONCE() instead.
 */
 #define __ACCESS_ONCE(x) ({ \
         __maybe_unused typeof(x) __var = (__force typeof(x)) 0; \
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 066ba4157541..2722111591a3 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -130,8 +130,8 @@ enum bounce_type {
 };
 struct lock_class_stats {
-        unsigned long                   contention_point[4];
+        unsigned long                   contention_point[LOCKSTAT_POINTS];
-        unsigned long                   contending_point[4];
+        unsigned long                   contending_point[LOCKSTAT_POINTS];
        struct lock_time                read_waittime;
        struct lock_time                write_waittime;
        struct lock_time                read_holdtime;
diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h
index 3a6490e81b28..703ea5c30a33 100644
--- a/include/linux/osq_lock.h
+++ b/include/linux/osq_lock.h
@@ -32,4 +32,9 @@ static inline void osq_lock_init(struct optimistic_spin_queue *lock)
 extern bool osq_lock(struct optimistic_spin_queue *lock);
 extern void osq_unlock(struct optimistic_spin_queue *lock);
+static inline bool osq_is_locked(struct optimistic_spin_queue *lock)
+{
+        return atomic_read(&lock->tail) != OSQ_UNLOCKED_VAL;
+}
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a1158c954f0f..8ca95f6a9395 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -253,7 +253,7 @@ extern char ___assert_task_state[1 - 2*!!(
 #define set_task_state(tsk, state_value)                        \
        do {                                                    \
                (tsk)->task_state_change = _THIS_IP_;           \
-                set_mb((tsk)->state, (state_value));            \
+                smp_store_mb((tsk)->state, (state_value));              \
        } while (0)
 /*
@@ -275,7 +275,7 @@ extern char ___assert_task_state[1 - 2*!!(
 #define set_current_state(state_value)                          \
        do {                                                    \
                current->task_state_change = _THIS_IP_;         \
-                set_mb(current->state, (state_value));          \
+                smp_store_mb(current->state, (state_value));            \
        } while (0)
 #else
@@ -283,7 +283,7 @@ extern char ___assert_task_state[1 - 2*!!(
 #define __set_task_state(tsk, state_value)              \
        do { (tsk)->state = (state_value); } while (0)
 #define set_task_state(tsk, state_value)                \
-        set_mb((tsk)->state, (state_value))
+        smp_store_mb((tsk)->state, (state_value))
 /*
 * set_current_state() includes a barrier so that the write of current->state
@@ -299,7 +299,7 @@ extern char ___assert_task_state[1 - 2*!!(
 #define __set_current_state(state_value)                \
        do { current->state = (state_value); } while (0)
 #define set_current_state(state_value)                  \
-        set_mb(current->state, (state_value))
+        smp_store_mb(current->state, (state_value))
 #endif
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 08561f1acd13..ebdb0043203a 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -235,9 +235,16 @@ config LOCK_SPIN_ON_OWNER
       def_bool y
       depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER
-config ARCH_USE_QUEUE_RWLOCK
+config ARCH_USE_QUEUED_SPINLOCKS
        bool
-config QUEUE_RWLOCK
+config QUEUED_SPINLOCKS
-        def_bool y if ARCH_USE_QUEUE_RWLOCK
+        def_bool y if ARCH_USE_QUEUED_SPINLOCKS
+        depends on SMP
+config ARCH_USE_QUEUED_RWLOCKS
+        bool
+config QUEUED_RWLOCKS
+        def_bool y if ARCH_USE_QUEUED_RWLOCKS
        depends on SMP
diff --git a/kernel/futex.c b/kernel/futex.c
index 2579e407ff67..55ca63ad9622 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2055,7 +2055,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
 {
        /*
         * The task state is guaranteed to be set before another task can
-         * wake it. set_current_state() is implemented using set_mb() and
+         * wake it. set_current_state() is implemented using smp_store_mb() and
         * queue_me() calls spin_unlock() upon completion, both serializing
         * access to the hash list and forcing another memory barrier.
         */
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index de7a416cca2a..7dd5c9918e4c 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_SMP) += spinlock.o
 obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
 obj-$(CONFIG_SMP) += lglock.o
 obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
+obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
 obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
@@ -25,5 +26,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
 obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
 obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
 obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
-obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o
+obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
 obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index aaeae885d9af..456614136f1a 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -4067,8 +4067,7 @@ void __init lockdep_info(void)
 #ifdef CONFIG_DEBUG_LOCKDEP
        if (lockdep_init_error) {
-                printk("WARNING: lockdep init error! lock-%s was acquired"
+                printk("WARNING: lockdep init error: lock '%s' was acquired before lockdep_init().\n", lock_init_error);
-                        "before lockdep_init\n", lock_init_error);
                printk("Call stack leading to lockdep invocation was:\n");
                print_stack_trace(&lockdep_init_trace, 0);
        }
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 75e114bdf3f2..fd91aaa4554c 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -17,6 +17,7 @@
 struct mcs_spinlock {
        struct mcs_spinlock *next;
        int locked; /* 1 if lock acquired */
+        int count;  /* nesting count, see qspinlock.c */
 };
 #ifndef arch_mcs_spin_lock_contended
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index f956ede7f90d..6c5da483966b 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -1,5 +1,5 @@
 /*
- * Queue read/write lock
+ * Queued read/write locks
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -22,6 +22,26 @@
 #include <linux/hardirq.h>
 #include <asm/qrwlock.h>
+/*
+ * This internal data structure is used for optimizing access to some of
+ * the subfields within the atomic_t cnts.
+ */
+struct __qrwlock {
+        union {
+                atomic_t cnts;
+                struct {
+#ifdef __LITTLE_ENDIAN
+                        u8 wmode;       /* Writer mode   */
+                        u8 rcnts[3];    /* Reader counts */
+#else
+                        u8 rcnts[3];    /* Reader counts */
+                        u8 wmode;       /* Writer mode   */
+#endif
+                };
+        };
+        arch_spinlock_t lock;
+};
 /**
 * rspin_until_writer_unlock - inc reader count & spin until writer is gone
 * @lock  : Pointer to queue rwlock structure
@@ -107,10 +127,10 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
         * or wait for a previous writer to go away.
         */
        for (;;) {
-                cnts = atomic_read(&lock->cnts);
+                struct __qrwlock *l = (struct __qrwlock *)lock;
-                if (!(cnts & _QW_WMASK) &&
-                    (atomic_cmpxchg(&lock->cnts, cnts,
+                if (!READ_ONCE(l->wmode) &&
-                                    cnts | _QW_WAITING) == cnts))
+                   (cmpxchg(&l->wmode, 0, _QW_WAITING) == 0))
                        break;
                cpu_relax_lowlatency();
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
new file mode 100644
index 000000000000..38c49202d532
--- /dev/null
+++ b/kernel/locking/qspinlock.c
@@ -0,0 +1,473 @@
+/*
+ * Queued spinlock
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
+ * (C) Copyright 2013-2014 Red Hat, Inc.
+ * (C) Copyright 2015 Intel Corp.
+ *
+ * Authors: Waiman Long <waiman.long@hp.com>
+ *          Peter Zijlstra <peterz@infradead.org>
+ */
+#ifndef _GEN_PV_LOCK_SLOWPATH
+#include <linux/smp.h>
+#include <linux/bug.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/mutex.h>
+#include <asm/byteorder.h>
+#include <asm/qspinlock.h>
+/*
+ * The basic principle of a queue-based spinlock can best be understood
+ * by studying a classic queue-based spinlock implementation called the
+ * MCS lock. The paper below provides a good description for this kind
+ * of lock.
+ *
+ * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf
+ *
+ * This queued spinlock implementation is based on the MCS lock, however to make
+ * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing
+ * API, we must modify it somehow.
+ *
+ * In particular; where the traditional MCS lock consists of a tail pointer
+ * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
+ * unlock the next pending (next->locked), we compress both these: {tail,
+ * next->locked} into a single u32 value.
+ *
+ * A spinlock disables recursion of its own context, and there is a limit
+ * to the contexts that can nest; namely: task, softirq, hardirq, nmi. As there
+ * are at most 4 nesting levels, it can be encoded by a 2-bit number. Now
+ * we can encode the tail by combining the 2-bit nesting level with the cpu
+ * number. With one byte for the lock value and 3 bytes for the tail, only a
+ * 32-bit word is now needed. Even though we only need 1 bit for the lock,
+ * we extend it to a full byte to achieve better performance for architectures
+ * that support atomic byte write.
+ *
+ * We also change the first spinner to spin on the lock bit instead of its
+ * node; whereby avoiding the need to carry a node from lock to unlock, and
+ * preserving existing lock API. This also makes the unlock code simpler and
+ * faster.
+ *
+ * N.B. The current implementation only supports architectures that allow
+ *      atomic operations on smaller 8-bit and 16-bit data types.
+ *
+ */
+#include "mcs_spinlock.h"
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define MAX_NODES       8
+#else
+#define MAX_NODES       4
+#endif
+/*
+ * Per-CPU queue node structures; we can never have more than 4 nested
+ * contexts: task, softirq, hardirq, nmi.
+ *
+ * Exactly fits one 64-byte cacheline on a 64-bit architecture.
+ *
+ * PV doubles the storage and uses the second cacheline for PV state.
+ */
+static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
+/*
+ * encode_tail - pack a (cpu, idx) pair into the tail bits of the lock word
+ * @cpu: CPU number owning the queue node
+ * @idx: index of the node within that CPU's mcs_nodes[] array (0-3)
+ * Return: tail code word, already shifted into its final bit position
+ *
+ * We must be able to distinguish between no-tail and the tail at 0:0,
+ * therefore increment the cpu number by one.
+ *
+ * The shifts are performed on u32: (cpu + 1) << _Q_TAIL_CPU_OFFSET may
+ * reach bit 31, which would overflow a signed int.
+ */
+static inline u32 encode_tail(int cpu, int idx)
+{
+        u32 tail;
+#ifdef CONFIG_DEBUG_SPINLOCK
+        BUG_ON(idx > 3);
+#endif
+        tail  = (u32)(cpu + 1) << _Q_TAIL_CPU_OFFSET;
+        tail |= (u32)idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
+        return tail;
+}
+/*
+ * decode_tail - map a tail code word back to its per-CPU queue node
+ * @tail: tail code word; the cpu field carries the cpu number plus one
+ * Return: pointer to mcs_nodes[idx] of the decoded cpu
+ */
+static inline struct mcs_spinlock *decode_tail(u32 tail)
+{
+        int node_idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
+        int owner = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
+
+        return per_cpu_ptr(&mcs_nodes[node_idx], owner);
+}
+#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
+/*
+ * By using the whole 2nd least significant byte for the pending bit, we
+ * can allow better optimization of the lock acquisition for the pending
+ * bit holder.
+ *
+ * This internal structure is also used by the set_locked function which
+ * is not restricted to _Q_PENDING_BITS == 8.
+ */
+struct __qspinlock {
+        union {
+                atomic_t val;
+#ifdef __LITTLE_ENDIAN
+                struct {
+                        u8      locked;
+                        u8      pending;
+                };
+                struct {
+                        u16     locked_pending;
+                        u16     tail;
+                };
+#else
+                struct {
+                        u16     tail;
+                        u16     locked_pending;
+                };
+                struct {
+                        u8      reserved[2];
+                        u8      pending;
+                        u8      locked;
+                };
+#endif
+        };
+};
+#if _Q_PENDING_BITS == 8
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ *
+ * Lock stealing is not allowed if this function is used.
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+        struct __qspinlock *l = (void *)lock;
+        WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL);
+}
+/**
+ * xchg_tail - install a new queue tail code word and fetch the previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ *
+ * Only the 16-bit tail half of the lock word is exchanged.
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+        struct __qspinlock *qlock = (void *)lock;
+        u32 prev;
+
+        prev = xchg(&qlock->tail, tail >> _Q_TAIL_OFFSET);
+        return prev << _Q_TAIL_OFFSET;
+}
+#else /* _Q_PENDING_BITS == 8 */
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+        atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
+}
+/**
+ * xchg_tail - install a new queue tail code word and fetch the previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The lock word value that was replaced
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ *
+ * The locked and pending bits are carried over unchanged; the cmpxchg is
+ * retried with the freshly observed value until it succeeds.
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+        u32 seen = atomic_read(&lock->val);
+        u32 expect;
+
+        do {
+                expect = seen;
+                seen = atomic_cmpxchg(&lock->val, expect,
+                                      (expect & _Q_LOCKED_PENDING_MASK) | tail);
+        } while (seen != expect);
+
+        return seen;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+/**
+ * set_locked - Set the lock bit and own the lock
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,*,0 -> *,0,1
+ */
+static __always_inline void set_locked(struct qspinlock *lock)
+{
+        struct __qspinlock *l = (void *)lock;
+        WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
+}
+/*
+ * Generate the native code for queued_spin_lock_slowpath(); provide NOPs for
+ * all the PV callbacks.
+ */
+static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_head(struct qspinlock *lock,
+                                           struct mcs_spinlock *node) { }
+#define pv_enabled()            false
+#define pv_init_node            __pv_init_node
+#define pv_wait_node            __pv_wait_node
+#define pv_kick_node            __pv_kick_node
+#define pv_wait_head            __pv_wait_head
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define queued_spin_lock_slowpath       native_queued_spin_lock_slowpath
+#endif
+#endif /* _GEN_PV_LOCK_SLOWPATH */
+/**
+ * queued_spin_lock_slowpath - acquire the queued spinlock
+ * @lock: Pointer to queued spinlock structure
+ * @val: Current value of the queued spinlock 32-bit word
+ *
+ * (queue tail, pending bit, lock value)
+ *
+ *              fast     :    slow                                  :    unlock
+ *                       :                                          :
+ * uncontended  (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
+ *                       :       | ^--------.------.             /  :
+ *                       :       v           \      \            |  :
+ * pending               :    (0,1,1) +--> (0,1,0)   \           |  :
+ *                       :       | ^--'              |           |  :
+ *                       :       v                   |           |  :
+ * uncontended           :    (n,x,y) +--> (n,0,0) --'           |  :
+ *   queue               :       | ^--'                          |  :
+ *                       :       v                               |  :
+ * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
+ *   queue               :         ^--'                             :
+ *
+ * When pv_enabled() is true the pending-bit stage is skipped entirely and
+ * the caller goes straight to the MCS queue. Otherwise, if
+ * virt_queued_spin_lock() returns true, the function returns immediately.
+ */
+void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+        struct mcs_spinlock *prev, *next, *node;
+        u32 new, old, tail;
+        int idx;
+        BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
+        if (pv_enabled())
+                goto queue;
+        if (virt_queued_spin_lock(lock))
+                return;
+        /*
+         * wait for in-progress pending->locked hand-overs
+         *
+         * 0,1,0 -> 0,0,1
+         */
+        if (val == _Q_PENDING_VAL) {
+                while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL)
+                        cpu_relax();
+        }
+        /*
+         * trylock || pending
+         *
+         * 0,0,0 -> 0,0,1 ; trylock
+         * 0,0,1 -> 0,1,1 ; pending
+         */
+        for (;;) {
+                /*
+                 * If we observe any contention; queue.
+                 */
+                if (val & ~_Q_LOCKED_MASK)
+                        goto queue;
+                new = _Q_LOCKED_VAL;
+                if (val == new)
+                        new |= _Q_PENDING_VAL;
+                old = atomic_cmpxchg(&lock->val, val, new);
+                if (old == val)
+                        break;
+                val = old;
+        }
+        /*
+         * we won the trylock
+         */
+        if (new == _Q_LOCKED_VAL)
+                return;
+        /*
+         * we're pending, wait for the owner to go away.
+         *
+         * *,1,1 -> *,1,0
+         *
+         * this wait loop must be a load-acquire such that we match the
+         * store-release that clears the locked bit and create lock
+         * sequentiality; this is because not all clear_pending_set_locked()
+         * implementations imply full barriers.
+         */
+        while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK)
+                cpu_relax();
+        /*
+         * take ownership and clear the pending bit.
+         *
+         * *,1,0 -> *,0,1
+         */
+        clear_pending_set_locked(lock);
+        return;
+        /*
+         * End of pending bit optimistic spinning and beginning of MCS
+         * queuing.
+         *
+         * Pick this CPU's next unused MCS node: mcs_nodes[0].count is read as
+         * the index of the node to use and incremented here, then decremented
+         * again at the release label. The (cpu, idx) pair is encoded as our
+         * tail code.
+         */
+queue:
+        node = this_cpu_ptr(&mcs_nodes[0]);
+        idx = node->count++;
+        tail = encode_tail(smp_processor_id(), idx);
+        node += idx;
+        node->locked = 0;
+        node->next = NULL;
+        pv_init_node(node);
+        /*
+         * We touched a (possibly) cold cacheline in the per-cpu queue node;
+         * attempt the trylock once more in the hope someone let go while we
+         * weren't watching.
+         */
+        if (queued_spin_trylock(lock))
+                goto release;
+        /*
+         * We have already touched the queueing cacheline; don't bother with
+         * pending stuff.
+         *
+         * p,*,* -> n,*,*
+         */
+        old = xchg_tail(lock, tail);
+        /*
+         * if there was a previous node; link it and wait until reaching the
+         * head of the waitqueue.
+         */
+        if (old & _Q_TAIL_MASK) {
+                prev = decode_tail(old);
+                WRITE_ONCE(prev->next, node);
+                pv_wait_node(node);
+                arch_mcs_spin_lock_contended(&node->locked);
+        }
+        /*
+         * we're at the head of the waitqueue, wait for the owner & pending to
+         * go away.
+         *
+         * *,x,y -> *,0,0
+         *
+         * this wait loop must use a load-acquire such that we match the
+         * store-release that clears the locked bit and create lock
+         * sequentiality; this is because the set_locked() function below
+         * does not imply a full barrier.
+         *
+         */
+        pv_wait_head(lock, node);
+        while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
+                cpu_relax();
+        /*
+         * claim the lock:
+         *
+         * n,0,0 -> 0,0,1 : lock, uncontended
+         * *,0,0 -> *,0,1 : lock, contended
+         *
+         * If the queue head is the only one in the queue (lock value == tail),
+         * clear the tail code and grab the lock. Otherwise, we only need
+         * to grab the lock.
+         */
+        for (;;) {
+                if (val != tail) {
+                        set_locked(lock);
+                        break;
+                }
+                old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
+                if (old == val)
+                        goto release;   /* No contention */
+                val = old;
+        }
+        /*
+         * contended path; wait for next, release.
+         */
+        while (!(next = READ_ONCE(node->next)))
+                cpu_relax();
+        arch_mcs_spin_unlock_contended(&next->locked);
+        pv_kick_node(next);
+release:
+        /*
+         * release the node
+         */
+        this_cpu_dec(mcs_nodes[0].count);
+}
+EXPORT_SYMBOL(queued_spin_lock_slowpath);
+/*
+ * Generate the paravirt code for queued_spin_lock_slowpath().
+ */
+#if !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#define _GEN_PV_LOCK_SLOWPATH
+#undef  pv_enabled
+#define pv_enabled()    true
+#undef pv_init_node
+#undef pv_wait_node
+#undef pv_kick_node
+#undef pv_wait_head
+#undef  queued_spin_lock_slowpath
+#define queued_spin_lock_slowpath       __pv_queued_spin_lock_slowpath
+#include "qspinlock_paravirt.h"
+#include "qspinlock.c"
+#endif
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
new file mode 100644
index 000000000000..04ab18151cc8
--- /dev/null
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -0,0 +1,325 @@
+#ifndef _GEN_PV_LOCK_SLOWPATH
+#error "do not include this file"
+#endif
+#include <linux/hash.h>
+#include <linux/bootmem.h>
+/*
+ * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
+ * of spinning them.
+ *
+ * This relies on the architecture to provide two paravirt hypercalls:
+ *
+ *   pv_wait(u8 *ptr, u8 val) -- suspends the vcpu if *ptr == val
+ *   pv_kick(cpu)             -- wakes a suspended vcpu
+ *
+ * Using these we implement __pv_queued_spin_lock_slowpath() and
+ * __pv_queued_spin_unlock() to replace native_queued_spin_lock_slowpath() and
+ * native_queued_spin_unlock().
+ */
+#define _Q_SLOW_VAL     (3U << _Q_LOCKED_OFFSET)       /* ->locked value set by pv_wait_head() after hashing */
+enum vcpu_state {
+        vcpu_running = 0,       /* initial state; also restored after a wait in pv_wait_node() */
+        vcpu_halted,            /* published right before the vcpu may call pv_wait() */
+};
+struct pv_node {
+        struct mcs_spinlock     mcs;            /* kept first: mcs_spinlock pointers are cast to pv_node */
+        struct mcs_spinlock     __res[3];       /* NOTE(review): presumably skips the other per-cpu nesting nodes - confirm */
+        int                     cpu;            /* cpu passed to pv_kick() */
+        u8                      state;          /* holds an enum vcpu_state value */
+};
+/*
+ * Lock and MCS node addresses hash table for fast lookup
+ *
+ * Each entry maps a lock address to the pv_node that was hashed for it by
+ * pv_hash(); pv_unhash() looks the node up again and frees the entry.
+ *
+ * Hashing is done on a per-cacheline basis to minimize the need to access
+ * more than one cacheline.
+ *
+ * Dynamically allocate a hash table big enough to hold at least 4X the
+ * number of possible cpus in the system. Allocation is done on page
+ * granularity. So the minimum number of hash buckets should be at least
+ * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
+ *
+ * Since we should not be holding locks from NMI context (very rare indeed) the
+ * max load factor is 0.75, which is around the point where open addressing
+ * breaks down.
+ *
+ */
+struct pv_hash_entry {
+        struct qspinlock *lock;         /* key; NULL marks a free entry */
+        struct pv_node   *node;         /* node recorded for @lock by pv_hash() */
+};
+#define PV_HE_PER_LINE  (SMP_CACHE_BYTES / sizeof(struct pv_hash_entry))       /* entries per cacheline */
+#define PV_HE_MIN       (PAGE_SIZE / sizeof(struct pv_hash_entry))             /* entries per page */
+static struct pv_hash_entry *pv_lock_hash;
+static unsigned int pv_lock_hash_bits __read_mostly;
+/*
+ * Allocate memory for the PV qspinlock hash buckets
+ *
+ * This function should be called from the paravirt spinlock initialization
+ * routine.
+ *
+ * The table is sized at 4 entries per possible CPU, rounded up to a multiple
+ * of PV_HE_PER_LINE, and never smaller than PV_HE_MIN. That one size is
+ * handed to alloc_large_system_hash() as the entry count and as both of its
+ * trailing limit arguments; the resulting shift is stored in
+ * pv_lock_hash_bits.
+ */
+void __init __pv_init_lock_hash(void)
+{
+        int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE);
+        if (pv_hash_size < PV_HE_MIN)
+                pv_hash_size = PV_HE_MIN;
+        /*
+         * Allocate space from bootmem which should be page-size aligned
+         * and hence cacheline aligned.
+         */
+        pv_lock_hash = alloc_large_system_hash("PV qspinlock",
+                                               sizeof(struct pv_hash_entry),
+                                               pv_hash_size, 0, HASH_EARLY,
+                                               &pv_lock_hash_bits, NULL,
+                                               pv_hash_size, pv_hash_size);
+}
+#define for_each_hash_entry(he, offset, hash)                                           \
+        for (hash &= ~(PV_HE_PER_LINE - 1), he = &pv_lock_hash[hash], offset = 0;       \
+             offset < (1 << pv_lock_hash_bits);                                         \
+             offset++, he = &pv_lock_hash[(hash + offset) & ((1 << pv_lock_hash_bits) - 1)])
+static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
+{
+        unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
+        struct pv_hash_entry *he;
+        for_each_hash_entry(he, offset, hash) { /* probe entry by entry, wrapping at the table end */
+                if (!cmpxchg(&he->lock, NULL, lock)) { /* atomically claim a free entry */
+                        WRITE_ONCE(he->node, node);
+                        return &he->lock; /* caller can free the entry by storing NULL here */
+                }
+        }
+        /*
+         * Hard assume there is a free entry for us.
+         *
+         * This is guaranteed by ensuring every blocked lock only ever consumes
+         * a single entry, and since we only have 4 nesting levels per CPU
+         * and allocated 4*nr_possible_cpus(), this must be so.
+         *
+         * The single entry is guaranteed by having the lock owner unhash
+         * before it releases.
+         */
+        BUG();
+}
+static struct pv_node *pv_unhash(struct qspinlock *lock)
+{
+        unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
+        struct pv_hash_entry *he;
+        struct pv_node *node;
+        for_each_hash_entry(he, offset, hash) { /* same probe sequence as pv_hash() */
+                if (READ_ONCE(he->lock) == lock) {
+                        node = READ_ONCE(he->node); /* fetch the node before giving the entry up */
+                        WRITE_ONCE(he->lock, NULL); /* entry is free for pv_hash() again */
+                        return node;
+                }
+        }
+        /*
+         * Hard assume we'll find an entry.
+         *
+         * This guarantees a limited lookup time and is itself guaranteed by
+         * having the lock owner do the unhash -- IFF the unlock sees the
+         * SLOW flag, there MUST be a hash entry.
+         */
+        BUG();
+}
+/*
+ * Initialize the PV part of the mcs_spinlock node.
+ *
+ * @node is treated as the leading member of a struct pv_node: the current
+ * cpu is recorded in ->cpu (later used as the pv_kick() target) and ->state
+ * starts out as vcpu_running.
+ */
+static void pv_init_node(struct mcs_spinlock *node)
+{
+        struct pv_node *pn = (struct pv_node *)node;
+        BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock));
+        pn->cpu = smp_processor_id();
+        pn->state = vcpu_running;
+}
+/*
+ * Wait for node->locked to become true, halt the vcpu after a short spin.
+ * pv_kick_node() is used to wake the vcpu again.
+ *
+ * Each round spins for up to SPIN_THRESHOLD iterations; pv_wait() is only
+ * called if ->locked is still clear after vcpu_halted has been published.
+ */
+static void pv_wait_node(struct mcs_spinlock *node)
+{
+        struct pv_node *pn = (struct pv_node *)node;
+        int loop;
+        for (;;) {
+                for (loop = SPIN_THRESHOLD; loop; loop--) {
+                        if (READ_ONCE(node->locked))
+                                return;
+                        cpu_relax();
+                }
+                /*
+                 * Order pn->state vs pn->locked thusly:
+                 *
+                 * [S] pn->state = vcpu_halted    [S] next->locked = 1
+                 *     MB                             MB
+                 * [L] pn->locked               [RmW] pn->state = vcpu_running
+                 *
+                 * Matches the xchg() from pv_kick_node().
+                 */
+                smp_store_mb(pn->state, vcpu_halted);
+                if (!READ_ONCE(node->locked))
+                        pv_wait(&pn->state, vcpu_halted);
+                /*
+                 * Reset the vCPU state to avoid unnecessary CPU kicking
+                 */
+                WRITE_ONCE(pn->state, vcpu_running);
+                /*
+                 * If the locked flag is still not set after wakeup, it is a
+                 * spurious wakeup and the vCPU should wait again. However,
+                 * there is a pretty high overhead for CPU halting and kicking.
+                 * So it is better to spin for a while in the hope that the
+                 * MCS lock will be released soon.
+                 */
+        }
+        /*
+         * By now our node->locked should be 1 and our caller will not actually
+         * spin-wait for it. We do however rely on our caller to do a
+         * load-acquire for us.
+         */
+}
+/*
+ * Called after setting next->locked = 1, used to wake those stuck in
+ * pv_wait_node().
+ *
+ * The node's state is unconditionally swapped to vcpu_running; pv_kick() is
+ * only issued when the previous state was vcpu_halted.
+ */
+static void pv_kick_node(struct mcs_spinlock *node)
+{
+        struct pv_node *pn = (struct pv_node *)node;
+        /*
+         * Note that because node->locked is already set, this actual
+         * mcs_spinlock entry could be re-used already.
+         *
+         * This should be fine however, kicking people for no reason is
+         * harmless.
+         *
+         * See the comment in pv_wait_node().
+         */
+        if (xchg(&pn->state, vcpu_running) == vcpu_halted)
+                pv_kick(pn->cpu);
+}
+/*
+ * Wait for l->locked to become clear; halt the vcpu after a short spin.
+ * __pv_queued_spin_unlock() will wake us.
+ *
+ * Before halting for the first time, @node is hashed under @lock and
+ * l->locked is switched from _Q_LOCKED_VAL to _Q_SLOW_VAL so that the
+ * unlocker takes its slow path and can find the node to kick.
+ */
+static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
+{
+        struct pv_node *pn = (struct pv_node *)node;
+        struct __qspinlock *l = (void *)lock;
+        struct qspinlock **lp = NULL;
+        int loop;
+        for (;;) {
+                for (loop = SPIN_THRESHOLD; loop; loop--) {
+                        if (!READ_ONCE(l->locked))
+                                return;
+                        cpu_relax();
+                }
+                WRITE_ONCE(pn->state, vcpu_halted);
+                if (!lp) { /* ONCE */
+                        lp = pv_hash(lock, pn);
+                        /*
+                         * lp must be set before setting _Q_SLOW_VAL
+                         *
+                         * [S] lp = lock                [RmW] l = l->locked = 0
+                         *     MB                             MB
+                         * [S] l->locked = _Q_SLOW_VAL  [L]   lp
+                         *
+                         * Matches the cmpxchg() in __pv_queued_spin_unlock().
+                         */
+                        if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
+                                /*
+                                 * The lock is free and _Q_SLOW_VAL has never
+                                 * been set. Therefore we need to unhash before
+                                 * getting the lock.
+                                 */
+                                WRITE_ONCE(*lp, NULL);
+                                return;
+                        }
+                }
+                pv_wait(&l->locked, _Q_SLOW_VAL);
+                /*
+                 * The unlocker should have freed the lock before kicking the
+                 * CPU. So if the lock is still not free, it is a spurious
+                 * wakeup and so the vCPU should wait again after spinning for
+                 * a while.
+                 */
+        }
+        /*
+         * Lock is unlocked now; the caller will acquire it without waiting.
+         * As with pv_wait_node() we rely on the caller to do a load-acquire
+         * for us.
+         */
+}
+/*
+ * PV version of the unlock function to be used instead of
+ * queued_spin_unlock().
+ *
+ * Fast path: one cmpxchg() of ->locked from _Q_LOCKED_VAL to 0. If that
+ * fails, the hashed pv_node is looked up and removed, ->locked is cleared
+ * with a store-release, and the node's cpu is kicked if it is halted.
+ */
+__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+{
+        struct __qspinlock *l = (void *)lock;
+        struct pv_node *node;
+        /*
+         * We must not unlock if SLOW, because in that case we must first
+         * unhash. Otherwise it would be possible to have multiple @lock
+         * entries, which would be BAD.
+         */
+        if (likely(cmpxchg(&l->locked, _Q_LOCKED_VAL, 0) == _Q_LOCKED_VAL))
+                return;
+        /*
+         * Since the above failed to release, this must be the SLOW path.
+         * Therefore start by looking up the blocked node and unhashing it.
+         */
+        node = pv_unhash(lock);
+        /*
+         * Now that we have a reference to the (likely) blocked pv_node,
+         * release the lock.
+         */
+        smp_store_release(&l->locked, 0);
+        /*
+         * At this point the memory pointed at by lock can be freed/reused,
+         * however we can still use the pv_node to kick the CPU.
+         */
+        if (READ_ONCE(node->state) == vcpu_halted)
+                pv_kick(node->cpu);
+}
+/*
+ * Include the architecture specific callee-save thunk of the
+ * __pv_queued_spin_unlock(). This thunk is put together with
+ * __pv_queued_spin_unlock() near the top of the file to make sure
+ * that the callee-save thunk and the real unlock function are close
+ * to each other sharing consecutive instruction cachelines.
+ */
+#include <asm/qspinlock_paravirt.h>
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index b025295f4966..30ec5b46cd8c 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -70,10 +70,10 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
 }
 /*
- * We can speed up the acquire/release, if the architecture
+ * We can speed up the acquire/release, if there's no debugging state to be
- * supports cmpxchg and if there's no debugging state to be set up
+ * set up.
 */
-#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
+#ifndef CONFIG_DEBUG_RT_MUTEXES
 # define rt_mutex_cmpxchg(l,c,n)        (cmpxchg(&l->owner, c, n) == c)
 static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
 {
@@ -1443,10 +1443,17 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
 *
 * @lock:       the rt_mutex to be locked
 *
+ * This function can only be called in thread context. It's safe to
+ * call it from atomic regions, but not from hard interrupt or soft
+ * interrupt context.
+ *
 * Returns 1 on success and 0 on contention
 */
 int __sched rt_mutex_trylock(struct rt_mutex *lock)
 {
+        if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq()))
+                return 0;
        return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
 }
 EXPORT_SYMBOL_GPL(rt_mutex_trylock);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 3417d0172a5d..0f189714e457 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -409,11 +409,24 @@ done:
        return taken;
 }
+/*
+ * Return true if the rwsem has an active spinner
+ */
+static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
+{
+        return osq_is_locked(&sem->osq);
+}
 #else
 static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
 {
        return false;
 }
+static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
+{
+        return false;
+}
 #endif
 /*
@@ -496,7 +509,38 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
 {
        unsigned long flags;
+        /*
+         * If a spinner is present, it is not necessary to do the wakeup.
+         * Try to do wakeup only if the trylock succeeds to minimize
+         * spinlock contention which may introduce too much delay in the
+         * unlock operation.
+         *
+         *    spinning writer           up_write/up_read caller
+         *    ---------------           -----------------------
+         * [S]   osq_unlock()           [L]   osq
+         *       MB                           RMB
+         * [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock)
+         *
+         * Here, it is important to make sure that there won't be a missed
+         * wakeup while the rwsem is free and the only spinning writer goes
+         * to sleep without taking the rwsem. Even when the spinning writer
+         * is just going to break out of the waiting loop, it will still do
+         * a trylock in rwsem_down_write_failed() before sleeping. IOW, if
+         * rwsem_has_spinner() is true, it will guarantee at least one
+         * trylock attempt on the rwsem later on.
+         */
+        if (rwsem_has_spinner(sem)) {
+                /*
+                 * The smp_rmb() here is to make sure that the spinner
+                 * state is consulted before reading the wait_lock.
+                 */
+                smp_rmb();
+                if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags))
+                        return sem;
+                goto locked;
+        }
        raw_spin_lock_irqsave(&sem->wait_lock, flags);
+locked:
        /* do nothing if list empty */
        if (!list_empty(&sem->wait_list))
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 852143a79f36..9bc82329eaad 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -341,7 +341,7 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
         * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
         * an event.
         */
-        set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
+        smp_store_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
        return timeout;
 }
@@ -354,7 +354,7 @@ int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
         * doesn't imply write barrier and the users expects write
         * barrier semantics on wakeup functions.  The following
         * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
-         * and is paired with set_mb() in wait_woken().
+         * and is paired with smp_store_mb() in wait_woken().
         */
        smp_wmb(); /* C */
        wait->flags |= WQ_FLAG_WOKEN;
author	Linus Torvalds <torvalds@linux-foundation.org>	2015-06-22 17:54:22 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-06-22 17:54:22 -0400
commit	1bf7067c6e173dc10411704db48338ed69c05565 (patch)
tree	06d731d9647c525fa598d03d7ec957ff9772ff40
parent	fc934d40178ad4e551a17e2733241d9f29fddd70 (diff)
parent	68722101ec3a0e179408a13708dd020e04f54aab (diff)