-rw-r--r--Documentation/atomic_bitops.txt66
-rw-r--r--Documentation/atomic_t.txt242
-rw-r--r--Documentation/locking/crossrelease.txt874
-rw-r--r--Documentation/memory-barriers.txt101
-rw-r--r--Documentation/static-keys.txt20
-rw-r--r--Documentation/translations/ko_KR/memory-barriers.txt5
-rw-r--r--arch/Kconfig12
-rw-r--r--arch/alpha/include/asm/futex.h26
-rw-r--r--arch/arc/include/asm/atomic.h2
-rw-r--r--arch/arc/include/asm/futex.h40
-rw-r--r--arch/arm/include/asm/futex.h26
-rw-r--r--arch/arm64/include/asm/futex.h26
-rw-r--r--arch/arm64/include/asm/spinlock.h11
-rw-r--r--arch/frv/include/asm/futex.h3
-rw-r--r--arch/frv/kernel/futex.c27
-rw-r--r--arch/hexagon/include/asm/atomic.h2
-rw-r--r--arch/hexagon/include/asm/futex.h38
-rw-r--r--arch/ia64/include/asm/futex.h25
-rw-r--r--arch/metag/include/asm/atomic_lock1.h2
-rw-r--r--arch/microblaze/include/asm/futex.h38
-rw-r--r--arch/mips/include/asm/futex.h25
-rw-r--r--arch/mips/kernel/smp.c6
-rw-r--r--arch/openrisc/include/asm/futex.h39
-rw-r--r--arch/parisc/include/asm/atomic.h2
-rw-r--r--arch/parisc/include/asm/futex.h26
-rw-r--r--arch/powerpc/include/asm/barrier.h7
-rw-r--r--arch/powerpc/include/asm/futex.h26
-rw-r--r--arch/powerpc/include/asm/spinlock.h3
-rw-r--r--arch/s390/include/asm/futex.h23
-rw-r--r--arch/sh/include/asm/futex.h26
-rw-r--r--arch/sparc/include/asm/atomic_32.h2
-rw-r--r--arch/sparc/include/asm/futex_64.h26
-rw-r--r--arch/tile/include/asm/atomic_32.h2
-rw-r--r--arch/tile/include/asm/futex.h40
-rw-r--r--arch/x86/Kconfig2
-rw-r--r--arch/x86/include/asm/asm.h6
-rw-r--r--arch/x86/include/asm/atomic.h69
-rw-r--r--arch/x86/include/asm/atomic64_32.h81
-rw-r--r--arch/x86/include/asm/atomic64_64.h73
-rw-r--r--arch/x86/include/asm/cmpxchg.h2
-rw-r--r--arch/x86/include/asm/futex.h40
-rw-r--r--arch/x86/include/asm/refcount.h109
-rw-r--r--arch/x86/mm/extable.c42
-rw-r--r--arch/xtensa/include/asm/futex.h27
-rw-r--r--block/blk-softirq.c2
-rw-r--r--drivers/acpi/nfit/core.c2
-rw-r--r--drivers/block/null_blk.c2
-rw-r--r--drivers/clocksource/arm_arch_timer.c6
-rw-r--r--drivers/cpuidle/coupled.c10
-rw-r--r--drivers/gpu/drm/i915/i915_debugfs.c5
-rw-r--r--drivers/net/ethernet/cavium/liquidio/lio_main.c2
-rw-r--r--drivers/net/ethernet/cavium/liquidio/octeon_droq.h2
-rw-r--r--fs/overlayfs/readdir.c4
-rw-r--r--fs/userfaultfd.c25
-rw-r--r--include/asm-generic/atomic64.h2
-rw-r--r--include/asm-generic/futex.h50
-rw-r--r--include/linux/atomic.h3
-rw-r--r--include/linux/blkdev.h2
-rw-r--r--include/linux/completion.h47
-rw-r--r--include/linux/cpuset.h6
-rw-r--r--include/linux/futex.h7
-rw-r--r--include/linux/irqflags.h24
-rw-r--r--include/linux/jump_label.h33
-rw-r--r--include/linux/kasan-checks.h10
-rw-r--r--include/linux/kernel.h7
-rw-r--r--include/linux/lockdep.h165
-rw-r--r--include/linux/mm_types.h95
-rw-r--r--include/linux/netdevice.h2
-rw-r--r--include/linux/refcount.h4
-rw-r--r--include/linux/rwsem-spinlock.h1
-rw-r--r--include/linux/rwsem.h1
-rw-r--r--include/linux/sched.h12
-rw-r--r--include/linux/sched/mm.h8
-rw-r--r--include/linux/smp.h8
-rw-r--r--include/linux/spinlock.h41
-rw-r--r--init/Kconfig7
-rw-r--r--kernel/cgroup/cpuset.c7
-rw-r--r--kernel/exit.c1
-rw-r--r--kernel/fork.c4
-rw-r--r--kernel/futex.c61
-rw-r--r--kernel/jump_label.c104
-rw-r--r--kernel/locking/lockdep.c1008
-rw-r--r--kernel/locking/lockdep_internals.h2
-rw-r--r--kernel/locking/lockdep_proc.c4
-rw-r--r--kernel/locking/lockdep_states.h1
-rw-r--r--kernel/locking/osq_lock.c13
-rw-r--r--kernel/locking/qspinlock_paravirt.h24
-rw-r--r--kernel/locking/rtmutex_common.h29
-rw-r--r--kernel/locking/rwsem-spinlock.c37
-rw-r--r--kernel/locking/rwsem-xadd.c33
-rw-r--r--kernel/panic.c12
-rw-r--r--kernel/sched/completion.c11
-rw-r--r--kernel/sched/core.c4
-rw-r--r--kernel/sched/sched.h2
-rw-r--r--kernel/sched/swait.c6
-rw-r--r--kernel/smp.c32
-rw-r--r--kernel/up.c2
-rw-r--r--kernel/workqueue.c53
-rw-r--r--lib/Kconfig.debug18
-rw-r--r--lib/locking-selftest.c123
-rw-r--r--mm/huge_memory.c12
-rw-r--r--mm/kasan/kasan.c4
-rw-r--r--mm/page_alloc.c49
-rw-r--r--mm/slab.h6
-rw-r--r--mm/slob.c6
-rw-r--r--mm/vmscan.c13
-rw-r--r--net/ipv4/udp.c3
-rw-r--r--net/ipv6/udp.c3
108 files changed, 3460 insertions, 1112 deletions
diff --git a/Documentation/atomic_bitops.txt b/Documentation/atomic_bitops.txt
new file mode 100644
index 000000000000..5550bfdcce5f
--- /dev/null
+++ b/Documentation/atomic_bitops.txt
@@ -0,0 +1,66 @@
1
2On atomic bitops.
3
4
5While our bitmap_{}() functions are non-atomic, we have a number of operations
6operating on single bits in a bitmap that are atomic.
7
8
9API
10---
11
12The single bit operations are:
13
14Non-RMW ops:
15
16 test_bit()
17
18RMW atomic operations without return value:
19
20 {set,clear,change}_bit()
21 clear_bit_unlock()
22
23RMW atomic operations with return value:
24
25 test_and_{set,clear,change}_bit()
26 test_and_set_bit_lock()
27
28Barriers:
29
30 smp_mb__{before,after}_atomic()
31
32
33All RMW atomic operations have a '__' prefixed variant which is non-atomic.
34
35
36SEMANTICS
37---------
38
39Non-atomic ops:
40
41In particular __clear_bit_unlock() suffers the same issue as atomic_set(),
42which is why the generic version maps to clear_bit_unlock(), see atomic_t.txt.
43
44
45RMW ops:
46
47The test_and_{}_bit() operations return the original value of the bit.
48
49
50ORDERING
51--------
52
53Like with atomic_t, the rule of thumb is:
54
55 - non-RMW operations are unordered;
56
57 - RMW operations that have no return value are unordered;
58
59 - RMW operations that have a return value are fully ordered.
60
61Except for test_and_set_bit_lock() which has ACQUIRE semantics and
62clear_bit_unlock() which has RELEASE semantics.
63
 64Since a platform only has a single means of achieving atomic operations,
 65the same barriers as for atomic_t are used; see atomic_t.txt.
66
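As a quick illustration of the ACQUIRE/RELEASE pair above, the two operations
can be used to build a simple lock out of bit 0 of a word. This is only a
sketch (real code would use bit_spin_lock()/bit_spin_unlock() or a proper
lock type); the names are made up for the example:

  static unsigned long my_flags;	/* bit 0 acts as the lock */

  static void my_lock(void)
  {
	/* ACQUIRE: accesses after this cannot leak before the set */
	while (test_and_set_bit_lock(0, &my_flags))
		cpu_relax();
  }

  static void my_unlock(void)
  {
	/* RELEASE: accesses before this cannot leak past the clear */
	clear_bit_unlock(0, &my_flags);
  }
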
diff --git a/Documentation/atomic_t.txt b/Documentation/atomic_t.txt
new file mode 100644
index 000000000000..913396ac5824
--- /dev/null
+++ b/Documentation/atomic_t.txt
@@ -0,0 +1,242 @@
1
 2On atomic types (atomic_t, atomic64_t and atomic_long_t).
3
4The atomic type provides an interface to the architecture's means of atomic
5RMW operations between CPUs (atomic operations on MMIO are not supported and
6can lead to fatal traps on some platforms).
7
8API
9---
10
11The 'full' API consists of (atomic64_ and atomic_long_ prefixes omitted for
12brevity):
13
14Non-RMW ops:
15
16 atomic_read(), atomic_set()
17 atomic_read_acquire(), atomic_set_release()
18
19
20RMW atomic operations:
21
22Arithmetic:
23
24 atomic_{add,sub,inc,dec}()
25 atomic_{add,sub,inc,dec}_return{,_relaxed,_acquire,_release}()
26 atomic_fetch_{add,sub,inc,dec}{,_relaxed,_acquire,_release}()
27
28
29Bitwise:
30
31 atomic_{and,or,xor,andnot}()
32 atomic_fetch_{and,or,xor,andnot}{,_relaxed,_acquire,_release}()
33
34
35Swap:
36
37 atomic_xchg{,_relaxed,_acquire,_release}()
38 atomic_cmpxchg{,_relaxed,_acquire,_release}()
39 atomic_try_cmpxchg{,_relaxed,_acquire,_release}()
40
41
42Reference count (but please see refcount_t):
43
44 atomic_add_unless(), atomic_inc_not_zero()
45 atomic_sub_and_test(), atomic_dec_and_test()
46
47
48Misc:
49
50 atomic_inc_and_test(), atomic_add_negative()
51 atomic_dec_unless_positive(), atomic_inc_unless_negative()
52
53
54Barriers:
55
56 smp_mb__{before,after}_atomic()
57
58
59
60SEMANTICS
61---------
62
63Non-RMW ops:
64
65The non-RMW ops are (typically) regular LOADs and STOREs and are canonically
66implemented using READ_ONCE(), WRITE_ONCE(), smp_load_acquire() and
67smp_store_release() respectively.
68
69The one detail to this is that atomic_set{}() should be observable to the RMW
70ops. That is:
71
72 C atomic-set
73
74 {
75 atomic_set(v, 1);
76 }
77
78 P1(atomic_t *v)
79 {
80 atomic_add_unless(v, 1, 0);
81 }
82
83 P2(atomic_t *v)
84 {
85 atomic_set(v, 0);
86 }
87
88 exists
89 (v=2)
90
91In this case we would expect the atomic_set() from CPU1 to either happen
92before the atomic_add_unless(), in which case that latter one would no-op, or
93_after_ in which case we'd overwrite its result. In no case is "2" a valid
94outcome.
95
96This is typically true on 'normal' platforms, where a regular competing STORE
97will invalidate a LL/SC or fail a CMPXCHG.
98
99The obvious case where this is not so is when we need to implement atomic ops
100with a lock:
101
102 CPU0 CPU1
103
104 atomic_add_unless(v, 1, 0);
105 lock();
106 ret = READ_ONCE(v->counter); // == 1
107 atomic_set(v, 0);
108 if (ret != u) WRITE_ONCE(v->counter, 0);
109 WRITE_ONCE(v->counter, ret + 1);
110 unlock();
111
    112The typical solution is to then implement atomic_set{}() with atomic_xchg().
113
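One way to do that, sketched here for illustration only, is to route the
plain store through the same serialization as the RMW ops:

  static inline void atomic_set(atomic_t *v, int i)
  {
	/*
	 * Goes through the same lock as the RMW ops above, so a concurrent
	 * atomic_add_unless() can no longer have its result overwritten
	 * half-way and the (v=2) outcome of the litmus test cannot happen.
	 */
	(void)atomic_xchg(v, i);
  }
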
114
115RMW ops:
116
117These come in various forms:
118
119 - plain operations without return value: atomic_{}()
120
121 - operations which return the modified value: atomic_{}_return()
122
123 these are limited to the arithmetic operations because those are
124 reversible. Bitops are irreversible and therefore the modified value
125 is of dubious utility.
126
127 - operations which return the original value: atomic_fetch_{}()
128
129 - swap operations: xchg(), cmpxchg() and try_cmpxchg()
130
131 - misc; the special purpose operations that are commonly used and would,
132 given the interface, normally be implemented using (try_)cmpxchg loops but
133 are time critical and can, (typically) on LL/SC architectures, be more
134 efficiently implemented.
135
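To illustrate that last point, this is roughly what one of the 'misc'
operations would look like if written as a (try_)cmpxchg loop; the function
name is made up and architectures are free to do better:

  static inline bool my_inc_not_zero(atomic_t *v)
  {
	int c = atomic_read(v);

	do {
		if (!c)
			return false;
		/* on failure, atomic_try_cmpxchg() reloads 'c' from *v */
	} while (!atomic_try_cmpxchg(v, &c, c + 1));

	return true;
  }
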
136All these operations are SMP atomic; that is, the operations (for a single
137atomic variable) can be fully ordered and no intermediate state is lost or
138visible.
139
140
141ORDERING (go read memory-barriers.txt first)
142--------
143
144The rule of thumb:
145
146 - non-RMW operations are unordered;
147
148 - RMW operations that have no return value are unordered;
149
150 - RMW operations that have a return value are fully ordered;
151
152 - RMW operations that are conditional are unordered on FAILURE,
153 otherwise the above rules apply.
154
155Except of course when an operation has an explicit ordering like:
156
157 {}_relaxed: unordered
158 {}_acquire: the R of the RMW (or atomic_read) is an ACQUIRE
159 {}_release: the W of the RMW (or atomic_set) is a RELEASE
160
161Where 'unordered' is against other memory locations. Address dependencies are
162not defeated.
163
164Fully ordered primitives are ordered against everything prior and everything
165subsequent. Therefore a fully ordered primitive is like having an smp_mb()
166before and an smp_mb() after the primitive.
167
168
169The barriers:
170
171 smp_mb__{before,after}_atomic()
172
173only apply to the RMW ops and can be used to augment/upgrade the ordering
174inherent to the used atomic op. These barriers provide a full smp_mb().
175
176These helper barriers exist because architectures have varying implicit
    177ordering on their SMP atomic primitives. For example, our TSO architectures
    178provide fully ordered atomics and these barriers are no-ops.
179
180Thus:
181
182 atomic_fetch_add();
183
184is equivalent to:
185
186 smp_mb__before_atomic();
187 atomic_fetch_add_relaxed();
188 smp_mb__after_atomic();
189
190However the atomic_fetch_add() might be implemented more efficiently.
191
192Further, while something like:
193
194 smp_mb__before_atomic();
195 atomic_dec(&X);
196
197is a 'typical' RELEASE pattern, the barrier is strictly stronger than
198a RELEASE. Similarly for something like:
199
200 atomic_inc(&X);
201 smp_mb__after_atomic();
202
203is an ACQUIRE pattern (though very much not typical), but again the barrier is
204strictly stronger than ACQUIRE. As illustrated:
205
206 C strong-acquire
207
208 {
209 }
210
211 P1(int *x, atomic_t *y)
212 {
213 r0 = READ_ONCE(*x);
214 smp_rmb();
215 r1 = atomic_read(y);
216 }
217
218 P2(int *x, atomic_t *y)
219 {
220 atomic_inc(y);
221 smp_mb__after_atomic();
222 WRITE_ONCE(*x, 1);
223 }
224
225 exists
226 (r0=1 /\ r1=0)
227
228This should not happen; but a hypothetical atomic_inc_acquire() --
229(void)atomic_fetch_inc_acquire() for instance -- would allow the outcome,
230since then:
231
232 P1 P2
233
234 t = LL.acq *y (0)
235 t++;
236 *x = 1;
237 r0 = *x (1)
238 RMB
239 r1 = *y (0)
240 SC *y, t;
241
242is allowed.
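
Finally, as a small worked instance of the 'typical RELEASE pattern'
mentioned earlier, consider marking an object dead before dropping a
reference to it (illustrative only; the 'obj' fields are made up, and the
idiom mirrors the death-mark example in memory-barriers.txt):

  obj->dead = 1;
  smp_mb__before_atomic();	/* the death mark is ordered before ... */
  atomic_dec(&obj->ref_count);	/* ... the reference is dropped */

atomic_dec() returns no value and is therefore unordered on its own; the
barrier upgrades it so the store to obj->dead cannot be reordered past the
decrement.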
diff --git a/Documentation/locking/crossrelease.txt b/Documentation/locking/crossrelease.txt
new file mode 100644
index 000000000000..bdf1423d5f99
--- /dev/null
+++ b/Documentation/locking/crossrelease.txt
@@ -0,0 +1,874 @@
1Crossrelease
2============
3
4Started by Byungchul Park <byungchul.park@lge.com>
5
6Contents:
7
8 (*) Background
9
10 - What causes deadlock
11 - How lockdep works
12
13 (*) Limitation
14
15 - Limit lockdep
16 - Pros from the limitation
17 - Cons from the limitation
18 - Relax the limitation
19
20 (*) Crossrelease
21
22 - Introduce crossrelease
23 - Introduce commit
24
25 (*) Implementation
26
27 - Data structures
28 - How crossrelease works
29
30 (*) Optimizations
31
32 - Avoid duplication
33 - Lockless for hot paths
34
 35 (*) APPENDIX A: What lockdep does to work aggressively
36
37 (*) APPENDIX B: How to avoid adding false dependencies
38
39
40==========
41Background
42==========
43
44What causes deadlock
45--------------------
46
 47A deadlock occurs when a context is waiting for an event to happen,
 48which is impossible because another (or the same) context that can trigger
 49the event is also waiting for another (or the same) event to happen, which
 50is also impossible for the same reason.
51
52For example:
53
54 A context going to trigger event C is waiting for event A to happen.
55 A context going to trigger event A is waiting for event B to happen.
56 A context going to trigger event B is waiting for event C to happen.
57
58A deadlock occurs when these three wait operations run at the same time,
59because event C cannot be triggered if event A does not happen, which in
60turn cannot be triggered if event B does not happen, which in turn
61cannot be triggered if event C does not happen. After all, no event can
62be triggered since any of them never meets its condition to wake up.
63
 64A dependency might exist between two waiters and a deadlock might happen
 65due to an incorrect relationship between dependencies. Thus, we must first
 66define what a dependency is. A dependency exists between two waiters if:
67
68 1. There are two waiters waiting for each event at a given time.
69 2. The only way to wake up each waiter is to trigger its event.
70 3. Whether one can be woken up depends on whether the other can.
71
72Each wait in the example creates its dependency like:
73
74 Event C depends on event A.
75 Event A depends on event B.
76 Event B depends on event C.
77
78 NOTE: Precisely speaking, a dependency is one between whether a
79 waiter for an event can be woken up and whether another waiter for
80 another event can be woken up. However from now on, we will describe
81 a dependency as if it's one between an event and another event for
82 simplicity.
83
84And they form circular dependencies like:
85
86 -> C -> A -> B -
87 / \
88 \ /
89 ----------------
90
91 where 'A -> B' means that event A depends on event B.
92
93Such circular dependencies lead to a deadlock since no waiter can meet
94its condition to wake up as described.
95
96CONCLUSION
97
98Circular dependencies cause a deadlock.
99
100
101How lockdep works
102-----------------
103
104Lockdep tries to detect a deadlock by checking dependencies created by
105lock operations, acquire and release. Waiting for a lock corresponds to
106waiting for an event, and releasing a lock corresponds to triggering an
107event in the previous section.
108
109In short, lockdep does:
110
111 1. Detect a new dependency.
112 2. Add the dependency into a global graph.
113 3. Check if that makes dependencies circular.
114 4. Report a deadlock or its possibility if so.
115
116For example, consider a graph built by lockdep that looks like:
117
118 A -> B -
119 \
120 -> E
121 /
122 C -> D -
123
124 where A, B,..., E are different lock classes.
125
126Lockdep will add a dependency into the graph on detection of a new
127dependency. For example, it will add a dependency 'E -> C' when a new
128dependency between lock E and lock C is detected. Then the graph will be:
129
130 A -> B -
131 \
132 -> E -
133 / \
134 -> C -> D - \
135 / /
136 \ /
137 ------------------
138
139 where A, B,..., E are different lock classes.
140
141This graph contains a subgraph which demonstrates circular dependencies:
142
143 -> E -
144 / \
145 -> C -> D - \
146 / /
147 \ /
148 ------------------
149
150 where C, D and E are different lock classes.
151
    152This is the condition under which a deadlock might occur. Lockdep
    153reports it on detection after adding a new dependency. This is how
    154lockdep works.
155
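For instance, the following pair of (made up) code paths is enough for
lockdep to build and then close such a cycle with just two spinlocks:

  static DEFINE_SPINLOCK(a);
  static DEFINE_SPINLOCK(b);

  void path_one(void)
  {
	spin_lock(&a);
	spin_lock(&b);		/* lockdep records 'A -> B' */
	spin_unlock(&b);
	spin_unlock(&a);
  }

  void path_two(void)
  {
	spin_lock(&b);
	spin_lock(&a);		/* records 'B -> A': circular, so it's reported */
	spin_unlock(&a);
	spin_unlock(&b);
  }

Note that the report is produced as soon as both orders have been seen once,
whether or not the two paths ever actually raced.
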
156CONCLUSION
157
158Lockdep detects a deadlock or its possibility by checking if circular
159dependencies were created after adding each new dependency.
160
161
162==========
163Limitation
164==========
165
166Limit lockdep
167-------------
168
    169By limiting lockdep to only typical locks, e.g. spin locks and mutexes,
    170which are released within the acquire context, the implementation becomes
    171simple but its capacity for detection becomes limited. Let's check the
    172pros and cons in the next sections.
173
174
175Pros from the limitation
176------------------------
177
    178Given the limitation, when a context has to wait to acquire a lock, none of
    179the locks in its held_locks can be released, which means all waiters for
    180the locks in held_locks are stuck. This is exactly the case that creates
    181dependencies between each lock in held_locks and the lock being
    182acquired.
183
184For example:
185
186 CONTEXT X
187 ---------
188 acquire A
189 acquire B /* Add a dependency 'A -> B' */
190 release B
191 release A
192
193 where A and B are different lock classes.
194
195When acquiring lock A, the held_locks of CONTEXT X is empty thus no
196dependency is added. But when acquiring lock B, lockdep detects and adds
197a new dependency 'A -> B' between lock A in the held_locks and lock B.
198They can be simply added whenever acquiring each lock.
199
    200And the data required by lockdep exists in a local structure, held_locks,
    201embedded in task_struct. By forcing accesses to the data to happen only
    202within the owning context, lockdep can avoid races without explicit locks
    203while handling this local data.
204
    205Lastly, lockdep only needs to keep the locks currently being held to build
    206a dependency graph. Were the limitation relaxed, it would also need to keep
    207locks already released, because the decision whether they created
    208dependencies might be deferred for a long time.
209
210To sum up, we can expect several advantages from the limitation:
211
212 1. Lockdep can easily identify a dependency when acquiring a lock.
    213 2. Races are avoidable while accessing the local locks in held_locks.
214 3. Lockdep only needs to keep locks currently being held.
215
216CONCLUSION
217
218Given the limitation, the implementation becomes simple and efficient.
219
220
221Cons from the limitation
222------------------------
223
224Given the limitation, lockdep is applicable only to typical locks. For
225example, page locks for page access or completions for synchronization
226cannot work with lockdep.
227
228Can we detect deadlocks below, under the limitation?
229
230Example 1:
231
232 CONTEXT X CONTEXT Y CONTEXT Z
233 --------- --------- ----------
234 mutex_lock A
235 lock_page B
236 lock_page B
237 mutex_lock A /* DEADLOCK */
238 unlock_page B held by X
239 unlock_page B
240 mutex_unlock A
241 mutex_unlock A
242
243 where A and B are different lock classes.
244
245No, we cannot.
246
247Example 2:
248
249 CONTEXT X CONTEXT Y
250 --------- ---------
251 mutex_lock A
252 mutex_lock A
253 wait_for_complete B /* DEADLOCK */
254 complete B
255 mutex_unlock A
256 mutex_unlock A
257
258 where A is a lock class and B is a completion variable.
259
260No, we cannot.
261
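Written out as (made up) C functions, Example 2 is:

  static DEFINE_MUTEX(a);
  static DECLARE_COMPLETION(b);

  void context_x(void)
  {
	mutex_lock(&a);
	wait_for_completion(&b);	/* never completes: Y is stuck on 'a' */
	mutex_unlock(&a);
  }

  void context_y(void)
  {
	mutex_lock(&a);			/* DEADLOCK: blocks behind X forever */
	complete(&b);
	mutex_unlock(&a);
  }

Plain lockdep cannot see this because the completion is not released
(completed) in the context that waits for it, which is exactly the
limitation discussed here.
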
262CONCLUSION
263
264Given the limitation, lockdep cannot detect a deadlock or its
265possibility caused by page locks or completions.
266
267
268Relax the limitation
269--------------------
270
271Under the limitation, things to create dependencies are limited to
272typical locks. However, synchronization primitives like page locks and
273completions, which are allowed to be released in any context, also
274create dependencies and can cause a deadlock. So lockdep should track
275these locks to do a better job. We have to relax the limitation for
276these locks to work with lockdep.
277
    278Detecting dependencies is very important for lockdep to work because
    279adding a dependency means adding an opportunity to check whether it
    280causes a deadlock. The more dependencies lockdep adds, the more
    281thoroughly it works. Thus lockdep has to do its best to detect and add as
    282many true dependencies into the graph as possible.
283
284For example, considering only typical locks, lockdep builds a graph like:
285
286 A -> B -
287 \
288 -> E
289 /
290 C -> D -
291
292 where A, B,..., E are different lock classes.
293
294On the other hand, under the relaxation, additional dependencies might
295be created and added. Assuming additional 'FX -> C' and 'E -> GX' are
296added thanks to the relaxation, the graph will be:
297
298 A -> B -
299 \
300 -> E -> GX
301 /
302 FX -> C -> D -
303
304 where A, B,..., E, FX and GX are different lock classes, and a suffix
305 'X' is added on non-typical locks.
306
    307The latter graph gives us more chances to check circular dependencies
    308than the former. However, it might suffer performance degradation, since
    309relaxing the limitation that keeps lockdep's design and implementation
    310efficient inevitably introduces some inefficiency. So lockdep should
    311provide two options, strong detection and efficient detection.
312
313Choosing efficient detection:
314
315 Lockdep works with only locks restricted to be released within the
316 acquire context. However, lockdep works efficiently.
317
318Choosing strong detection:
319
320 Lockdep works with all synchronization primitives. However, lockdep
321 suffers performance degradation.
322
323CONCLUSION
324
325Relaxing the limitation, lockdep can add additional dependencies giving
326additional opportunities to check circular dependencies.
327
328
329============
330Crossrelease
331============
332
333Introduce crossrelease
334----------------------
335
    336In order to allow lockdep to handle additional dependencies created by
    337locks that might be released in any context, namely 'crosslocks', we have
    338to be able to identify the dependencies they create. The proposed
    339'crossrelease' feature provides a way to do that.
340
341Crossrelease feature has to do:
342
343 1. Identify dependencies created by crosslocks.
344 2. Add the dependencies into a dependency graph.
345
    346That's all. Once a meaningful dependency is added into the graph, lockdep
    347works with the graph as it did before. The most important thing the
    348crossrelease feature has to do is to correctly identify and add true
    349dependencies into the global graph.
350
    351A dependency, e.g. 'A -> B', can be identified only in A's release
    352context, because the decision required to identify the dependency can be
    353made only in the release context: namely, whether A can be released so
    354that a waiter for A can be woken up. That decision cannot be made anywhere
    355other than A's release context.
356
    357This does not matter for typical locks because each acquire context is the
    358same as its release context, so lockdep can decide whether a lock can be
    359released already in the acquire context. However, for crosslocks, lockdep
    360cannot make the decision in the acquire context and has to wait until the
    361release context is identified.
362
    363Therefore, a deadlock caused by crosslocks cannot be detected just when it
    364happens, because its dependencies cannot be identified until the crosslocks
    365are released. However, deadlock possibilities can still be detected, and
    366that is very worthwhile. See the 'APPENDIX A' section for why.
367
368CONCLUSION
369
370Using crossrelease feature, lockdep can work with what might be released
371in any context, namely crosslock.
372
373
374Introduce commit
375----------------
376
377Since crossrelease defers the work adding true dependencies of
378crosslocks until they are actually released, crossrelease has to queue
379all acquisitions which might create dependencies with the crosslocks.
380Then it identifies dependencies using the queued data in batches at a
381proper time. We call it 'commit'.
382
383There are four types of dependencies:
384
3851. TT type: 'typical lock A -> typical lock B'
386
387 Just when acquiring B, lockdep can see it's in the A's release
388 context. So the dependency between A and B can be identified
389 immediately. Commit is unnecessary.
390
3912. TC type: 'typical lock A -> crosslock BX'
392
393 Just when acquiring BX, lockdep can see it's in the A's release
394 context. So the dependency between A and BX can be identified
395 immediately. Commit is unnecessary, too.
396
3973. CT type: 'crosslock AX -> typical lock B'
398
399 When acquiring B, lockdep cannot identify the dependency because
400 there's no way to know if it's in the AX's release context. It has
401 to wait until the decision can be made. Commit is necessary.
402
4034. CC type: 'crosslock AX -> crosslock BX'
404
405 When acquiring BX, lockdep cannot identify the dependency because
406 there's no way to know if it's in the AX's release context. It has
407 to wait until the decision can be made. Commit is necessary.
408 But, handling CC type is not implemented yet. It's a future work.
409
    410Lockdep can work without commit for typical locks, but the commit step is
    411necessary once crosslocks are involved. With commit introduced, lockdep
    412performs three steps. What lockdep does in each step is:
413
4141. Acquisition: For typical locks, lockdep does what it originally did
415 and queues the lock so that CT type dependencies can be checked using
416 it at the commit step. For crosslocks, it saves data which will be
417 used at the commit step and increases a reference count for it.
418
    4192. Commit: No action is required for typical locks. For crosslocks,
420 lockdep adds CT type dependencies using the data saved at the
421 acquisition step.
422
4233. Release: No changes are required for typical locks. When a crosslock
424 is released, it decreases a reference count for it.
425
426CONCLUSION
427
428Crossrelease introduces commit step to handle dependencies of crosslocks
429in batches at a proper time.
430
431
432==============
433Implementation
434==============
435
436Data structures
437---------------
438
439Crossrelease introduces two main data structures.
440
4411. hist_lock
442
443 This is an array embedded in task_struct, for keeping lock history so
444 that dependencies can be added using them at the commit step. Since
445 it's local data, it can be accessed locklessly in the owner context.
446 The array is filled at the acquisition step and consumed at the
    447 commit step. And it's managed in a circular manner.
448
4492. cross_lock
450
451 One per lockdep_map exists. This is for keeping data of crosslocks
452 and used at the commit step.
453
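Schematically, and with illustrative fields only (these are not the actual
declarations added by the patch), the two structures can be pictured as:

  struct hist_lock {
	unsigned int		hist_id;	/* position/generation in the ring */
	struct lockdep_map	*instance;	/* which lock was queued */
	unsigned long		acquire_ip;	/* where it was acquired */
  };

  /* per-task ring buffer, filled at acquisition, consumed at commit */
  struct hist_lock xhlocks[...];

  struct cross_lock {
	int			nr_acquire;	/* reference count of the crosslock */
	struct hist_lock	snapshot;	/* state saved at acquisition */
  };

One cross_lock hangs off each crosslock's lockdep_map, while the hist_lock
ring lives in task_struct.
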
454
455How crossrelease works
456----------------------
457
    458The key to how crossrelease works is to defer the necessary work to an
    459appropriate point in time and perform it at once at the commit step.
460Let's take a look with examples step by step, starting from how lockdep
461works without crossrelease for typical locks.
462
463 acquire A /* Push A onto held_locks */
464 acquire B /* Push B onto held_locks and add 'A -> B' */
465 acquire C /* Push C onto held_locks and add 'B -> C' */
466 release C /* Pop C from held_locks */
467 release B /* Pop B from held_locks */
468 release A /* Pop A from held_locks */
469
470 where A, B and C are different lock classes.
471
    472 NOTE: This document assumes that readers already understand how
    473 lockdep works without crossrelease and thus omits details. But there's
    474 one thing to note. Lockdep pretends to pop a lock from held_locks
    475 when releasing it. But it's subtly different from the original pop
    476 operation because lockdep allows entries other than the top to be popped.
477
    478In this case, lockdep adds a 'the top of held_locks -> the lock to acquire'
    479dependency every time it acquires a lock.
480
481After adding 'A -> B', a dependency graph will be:
482
483 A -> B
484
485 where A and B are different lock classes.
486
487And after adding 'B -> C', the graph will be:
488
489 A -> B -> C
490
491 where A, B and C are different lock classes.
492
    493Let's perform the commit step even for typical locks to add dependencies.
    494Of course, the commit step is not necessary for them; however, it works
    495well because this is a more general way.
496
497 acquire A
498 /*
499 * Queue A into hist_locks
500 *
501 * In hist_locks: A
502 * In graph: Empty
503 */
504
505 acquire B
506 /*
507 * Queue B into hist_locks
508 *
509 * In hist_locks: A, B
510 * In graph: Empty
511 */
512
513 acquire C
514 /*
515 * Queue C into hist_locks
516 *
517 * In hist_locks: A, B, C
518 * In graph: Empty
519 */
520
521 commit C
522 /*
523 * Add 'C -> ?'
524 * Answer the following to decide '?'
525 * What has been queued since acquire C: Nothing
526 *
527 * In hist_locks: A, B, C
528 * In graph: Empty
529 */
530
531 release C
532
533 commit B
534 /*
535 * Add 'B -> ?'
536 * Answer the following to decide '?'
537 * What has been queued since acquire B: C
538 *
539 * In hist_locks: A, B, C
540 * In graph: 'B -> C'
541 */
542
543 release B
544
545 commit A
546 /*
547 * Add 'A -> ?'
548 * Answer the following to decide '?'
549 * What has been queued since acquire A: B, C
550 *
551 * In hist_locks: A, B, C
552 * In graph: 'B -> C', 'A -> B', 'A -> C'
553 */
554
555 release A
556
557 where A, B and C are different lock classes.
558
559In this case, dependencies are added at the commit step as described.
560
561After commits for A, B and C, the graph will be:
562
563 A -> B -> C
564
565 where A, B and C are different lock classes.
566
567 NOTE: A dependency 'A -> C' is optimized out.
568
    569We can see that the former graph, built without the commit step, is the
    570same as the latter graph built using commit steps. Of course the former way
    571finishes building the graph earlier, which means we can detect a
    572deadlock or its possibility sooner. So the former way would be preferred
    573when possible. But we cannot avoid using the latter way for crosslocks.
574
    575Let's look at how commit steps work for crosslocks. In this case, the
    576commit step is actually performed only on the crosslock BX. And it assumes
    577that the BX release context is different from the BX acquire context.
578
579 BX RELEASE CONTEXT BX ACQUIRE CONTEXT
580 ------------------ ------------------
581 acquire A
582 /*
583 * Push A onto held_locks
584 * Queue A into hist_locks
585 *
586 * In held_locks: A
587 * In hist_locks: A
588 * In graph: Empty
589 */
590
591 acquire BX
592 /*
593 * Add 'the top of held_locks -> BX'
594 *
595 * In held_locks: A
596 * In hist_locks: A
597 * In graph: 'A -> BX'
598 */
599
600 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
601 It must be guaranteed that the following operations are seen after
602 acquiring BX globally. It can be done by things like barrier.
603 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
604
605 acquire C
606 /*
607 * Push C onto held_locks
608 * Queue C into hist_locks
609 *
610 * In held_locks: C
611 * In hist_locks: C
612 * In graph: 'A -> BX'
613 */
614
615 release C
616 /*
617 * Pop C from held_locks
618 *
619 * In held_locks: Empty
620 * In hist_locks: C
621 * In graph: 'A -> BX'
622 */
623 acquire D
624 /*
625 * Push D onto held_locks
626 * Queue D into hist_locks
627 * Add 'the top of held_locks -> D'
628 *
629 * In held_locks: A, D
630 * In hist_locks: A, D
631 * In graph: 'A -> BX', 'A -> D'
632 */
633 acquire E
634 /*
635 * Push E onto held_locks
636 * Queue E into hist_locks
637 *
638 * In held_locks: E
639 * In hist_locks: C, E
640 * In graph: 'A -> BX', 'A -> D'
641 */
642
643 release E
644 /*
645 * Pop E from held_locks
646 *
647 * In held_locks: Empty
    648 * In hist_locks: C, E
649 * In graph: 'A -> BX', 'A -> D'
650 */
651 release D
652 /*
653 * Pop D from held_locks
654 *
655 * In held_locks: A
656 * In hist_locks: A, D
657 * In graph: 'A -> BX', 'A -> D'
658 */
659 commit BX
660 /*
661 * Add 'BX -> ?'
662 * What has been queued since acquire BX: C, E
663 *
664 * In held_locks: Empty
    665 * In hist_locks: C, E
666 * In graph: 'A -> BX', 'A -> D',
667 * 'BX -> C', 'BX -> E'
668 */
669
670 release BX
671 /*
672 * In held_locks: Empty
    673 * In hist_locks: C, E
674 * In graph: 'A -> BX', 'A -> D',
675 * 'BX -> C', 'BX -> E'
676 */
677 release A
678 /*
679 * Pop A from held_locks
680 *
681 * In held_locks: Empty
682 * In hist_locks: A, D
683 * In graph: 'A -> BX', 'A -> D',
684 * 'BX -> C', 'BX -> E'
685 */
686
687 where A, BX, C,..., E are different lock classes, and a suffix 'X' is
688 added on crosslocks.
689
    690Crossrelease considers all acquisitions after acquiring BX to be
691candidates which might create dependencies with BX. True dependencies
692will be determined when identifying the release context of BX. Meanwhile,
693all typical locks are queued so that they can be used at the commit step.
694And then two dependencies 'BX -> C' and 'BX -> E' are added at the
695commit step when identifying the release context.
696
697The final graph will be, with crossrelease:
698
699 -> C
700 /
701 -> BX -
702 / \
703 A - -> E
704 \
705 -> D
706
707 where A, BX, C,..., E are different lock classes, and a suffix 'X' is
708 added on crosslocks.
709
710However, the final graph will be, without crossrelease:
711
712 A -> D
713
714 where A and D are different lock classes.
715
716The former graph has three more dependencies, 'A -> BX', 'BX -> C' and
717'BX -> E' giving additional opportunities to check if they cause
718deadlocks. This way lockdep can detect a deadlock or its possibility
719caused by crosslocks.
720
721CONCLUSION
722
723We checked how crossrelease works with several examples.
724
725
726=============
727Optimizations
728=============
729
730Avoid duplication
731-----------------
732
    733The crossrelease feature uses a cache, like the one lockdep already uses
    734for dependency chains, but this time for caching CT type dependencies.
    735Once a dependency is cached, the same one will never be added again.
736
737
738Lockless for hot paths
739----------------------
740
741To keep all locks for later use at the commit step, crossrelease adopts
742a local array embedded in task_struct, which makes access to the data
743lockless by forcing it to happen only within the owner context. It's
    744like how lockdep handles held_locks. A lockless implementation is important
745since typical locks are very frequently acquired and released.
746
747
    748==================================================
    749APPENDIX A: What lockdep does to work aggressively
    750==================================================
751
    752A deadlock actually occurs when all wait operations creating circular
    753dependencies run at the same time. Even when they don't, a potential
    754deadlock exists if the problematic dependencies exist. Thus it's
    755meaningful to detect not only an actual deadlock but also its
    756possibility. The latter is rather valuable: when a deadlock actually
    757occurs, we can identify what happens in the system by some means or other
    758even without lockdep, but there's no way to detect the possibility without
    759lockdep unless the whole code is parsed in one's head, which is terrible.
    760Lockdep does both, and crossrelease focuses only on the latter.
761
762Whether or not a deadlock actually occurs depends on several factors.
763For example, what order contexts are switched in is a factor. Assuming
764circular dependencies exist, a deadlock would occur when contexts are
765switched so that all wait operations creating the dependencies run
766simultaneously. Thus to detect a deadlock possibility even in the case
    767that it has not occurred yet, lockdep should consider all possible
768combinations of dependencies, trying to:
769
7701. Use a global dependency graph.
771
772 Lockdep combines all dependencies into one global graph and uses them,
773 regardless of which context generates them or what order contexts are
    774 switched in. Only the aggregated dependencies are considered, so they
    775 are prone to become circular if a problem exists.
776
7772. Check dependencies between classes instead of instances.
778
    779 What actually causes a deadlock are lock instances. However,
    780 lockdep checks dependencies between classes instead of instances.
    781 This way lockdep can detect a deadlock which has not happened yet but
    782 might happen in the future with other instances of the same class.
783
7843. Assume all acquisitions lead to waiting.
785
    786 Although locks might be acquired without waiting, and waiting is what
    787 is essential to create dependencies, lockdep assumes all acquisitions
    788 lead to waiting since that might be true at some time or another.
789
790CONCLUSION
791
792Lockdep detects not only an actual deadlock but also its possibility,
793and the latter is more valuable.
794
795
796==================================================
797APPENDIX B: How to avoid adding false dependencies
798==================================================
799
800Remind what a dependency is. A dependency exists if:
801
802 1. There are two waiters waiting for each event at a given time.
803 2. The only way to wake up each waiter is to trigger its event.
804 3. Whether one can be woken up depends on whether the other can.
805
806For example:
807
808 acquire A
809 acquire B /* A dependency 'A -> B' exists */
810 release B
811 release A
812
813 where A and B are different lock classes.
814
    815A dependency 'A -> B' exists since:
816
817 1. A waiter for A and a waiter for B might exist when acquiring B.
818 2. Only way to wake up each is to release what it waits for.
819 3. Whether the waiter for A can be woken up depends on whether the
820 other can. IOW, TASK X cannot release A if it fails to acquire B.
821
822For another example:
823
824 TASK X TASK Y
825 ------ ------
826 acquire AX
827 acquire B /* A dependency 'AX -> B' exists */
828 release B
829 release AX held by Y
830
831 where AX and B are different lock classes, and a suffix 'X' is added
832 on crosslocks.
833
834Even in this case involving crosslocks, the same rule can be applied. A
    835dependency 'AX -> B' exists since:
836
837 1. A waiter for AX and a waiter for B might exist when acquiring B.
838 2. Only way to wake up each is to release what it waits for.
839 3. Whether the waiter for AX can be woken up depends on whether the
840 other can. IOW, TASK X cannot release AX if it fails to acquire B.
841
    842Let's take a look at a more complicated example:
843
844 TASK X TASK Y
845 ------ ------
846 acquire B
847 release B
848 fork Y
849 acquire AX
850 acquire C /* A dependency 'AX -> C' exists */
851 release C
852 release AX held by Y
853
854 where AX, B and C are different lock classes, and a suffix 'X' is
855 added on crosslocks.
856
857Does a dependency 'AX -> B' exist? Nope.
858
859Two waiters are essential to create a dependency. However, waiters for
860AX and B to create 'AX -> B' cannot exist at the same time in this
861example. Thus the dependency 'AX -> B' cannot be created.
862
    863It would be ideal if the full set of true dependencies could be
    864considered. But we can be sure of nothing but what actually happened.
    865Relying on what actually happens at runtime, we can at least add only true
    866dependencies, though they might be a subset of all the true ones. It's
    867similar to how lockdep works for typical locks: there might be more true
    868dependencies than what lockdep has detected at runtime, but lockdep has no
    869choice but to rely on what actually happens. Crossrelease also relies on it.
870
871CONCLUSION
872
873Relying on what actually happens, lockdep can avoid adding false
874dependencies.
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index e2ee0a1c299a..b759a60624fd 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -498,11 +498,11 @@ And a couple of implicit varieties:
498 This means that ACQUIRE acts as a minimal "acquire" operation and 498 This means that ACQUIRE acts as a minimal "acquire" operation and
499 RELEASE acts as a minimal "release" operation. 499 RELEASE acts as a minimal "release" operation.
500 500
501A subset of the atomic operations described in core-api/atomic_ops.rst have 501A subset of the atomic operations described in atomic_t.txt have ACQUIRE and
502ACQUIRE and RELEASE variants in addition to fully-ordered and relaxed (no 502RELEASE variants in addition to fully-ordered and relaxed (no barrier
503barrier semantics) definitions. For compound atomics performing both a load 503semantics) definitions. For compound atomics performing both a load and a
504and a store, ACQUIRE semantics apply only to the load and RELEASE semantics 504store, ACQUIRE semantics apply only to the load and RELEASE semantics apply
505apply only to the store portion of the operation. 505only to the store portion of the operation.
506 506
507Memory barriers are only required where there's a possibility of interaction 507Memory barriers are only required where there's a possibility of interaction
508between two CPUs or between a CPU and a device. If it can be guaranteed that 508between two CPUs or between a CPU and a device. If it can be guaranteed that
@@ -1883,8 +1883,7 @@ There are some more advanced barrier functions:
1883 This makes sure that the death mark on the object is perceived to be set 1883 This makes sure that the death mark on the object is perceived to be set
1884 *before* the reference counter is decremented. 1884 *before* the reference counter is decremented.
1885 1885
1886 See Documentation/core-api/atomic_ops.rst for more information. See the 1886 See Documentation/atomic_{t,bitops}.txt for more information.
1887 "Atomic operations" subsection for information on where to use these.
1888 1887
1889 1888
1890 (*) lockless_dereference(); 1889 (*) lockless_dereference();
@@ -1989,10 +1988,7 @@ for each construct. These operations all imply certain barriers:
1989 ACQUIRE operation has completed. 1988 ACQUIRE operation has completed.
1990 1989
1991 Memory operations issued before the ACQUIRE may be completed after 1990 Memory operations issued before the ACQUIRE may be completed after
1992 the ACQUIRE operation has completed. An smp_mb__before_spinlock(), 1991 the ACQUIRE operation has completed.
1993 combined with a following ACQUIRE, orders prior stores against
1994 subsequent loads and stores. Note that this is weaker than smp_mb()!
1995 The smp_mb__before_spinlock() primitive is free on many architectures.
1996 1992
1997 (2) RELEASE operation implication: 1993 (2) RELEASE operation implication:
1998 1994
@@ -2510,88 +2506,7 @@ operations are noted specially as some of them imply full memory barriers and
2510some don't, but they're very heavily relied on as a group throughout the 2506some don't, but they're very heavily relied on as a group throughout the
2511kernel. 2507kernel.
2512 2508
2513Any atomic operation that modifies some state in memory and returns information 2509See Documentation/atomic_t.txt for more information.
2514about the state (old or new) implies an SMP-conditional general memory barrier
2515(smp_mb()) on each side of the actual operation (with the exception of
2516explicit lock operations, described later). These include:
2517
2518 xchg();
2519 atomic_xchg(); atomic_long_xchg();
2520 atomic_inc_return(); atomic_long_inc_return();
2521 atomic_dec_return(); atomic_long_dec_return();
2522 atomic_add_return(); atomic_long_add_return();
2523 atomic_sub_return(); atomic_long_sub_return();
2524 atomic_inc_and_test(); atomic_long_inc_and_test();
2525 atomic_dec_and_test(); atomic_long_dec_and_test();
2526 atomic_sub_and_test(); atomic_long_sub_and_test();
2527 atomic_add_negative(); atomic_long_add_negative();
2528 test_and_set_bit();
2529 test_and_clear_bit();
2530 test_and_change_bit();
2531
2532 /* when succeeds */
2533 cmpxchg();
2534 atomic_cmpxchg(); atomic_long_cmpxchg();
2535 atomic_add_unless(); atomic_long_add_unless();
2536
2537These are used for such things as implementing ACQUIRE-class and RELEASE-class
2538operations and adjusting reference counters towards object destruction, and as
2539such the implicit memory barrier effects are necessary.
2540
2541
2542The following operations are potential problems as they do _not_ imply memory
2543barriers, but might be used for implementing such things as RELEASE-class
2544operations:
2545
2546 atomic_set();
2547 set_bit();
2548 clear_bit();
2549 change_bit();
2550
2551With these the appropriate explicit memory barrier should be used if necessary
2552(smp_mb__before_atomic() for instance).
2553
2554
2555The following also do _not_ imply memory barriers, and so may require explicit
2556memory barriers under some circumstances (smp_mb__before_atomic() for
2557instance):
2558
2559 atomic_add();
2560 atomic_sub();
2561 atomic_inc();
2562 atomic_dec();
2563
2564If they're used for statistics generation, then they probably don't need memory
2565barriers, unless there's a coupling between statistical data.
2566
2567If they're used for reference counting on an object to control its lifetime,
2568they probably don't need memory barriers because either the reference count
2569will be adjusted inside a locked section, or the caller will already hold
2570sufficient references to make the lock, and thus a memory barrier unnecessary.
2571
2572If they're used for constructing a lock of some description, then they probably
2573do need memory barriers as a lock primitive generally has to do things in a
2574specific order.
2575
2576Basically, each usage case has to be carefully considered as to whether memory
2577barriers are needed or not.
2578
2579The following operations are special locking primitives:
2580
2581 test_and_set_bit_lock();
2582 clear_bit_unlock();
2583 __clear_bit_unlock();
2584
2585These implement ACQUIRE-class and RELEASE-class operations. These should be
2586used in preference to other operations when implementing locking primitives,
2587because their implementations can be optimised on many architectures.
2588
2589[!] Note that special memory barrier primitives are available for these
2590situations because on some CPUs the atomic instructions used imply full memory
2591barriers, and so barrier instructions are superfluous in conjunction with them,
2592and in such cases the special barrier primitives will be no-ops.
2593
2594See Documentation/core-api/atomic_ops.rst for more information.
2595 2510
2596 2511
2597ACCESSING DEVICES 2512ACCESSING DEVICES
diff --git a/Documentation/static-keys.txt b/Documentation/static-keys.txt
index b83dfa1c0602..ab16efe0c79d 100644
--- a/Documentation/static-keys.txt
+++ b/Documentation/static-keys.txt
@@ -149,6 +149,26 @@ static_branch_inc(), will change the branch back to true. Likewise, if the
149key is initialized false, a 'static_branch_inc()', will change the branch to 149key is initialized false, a 'static_branch_inc()', will change the branch to
150true. And then a 'static_branch_dec()', will again make the branch false. 150true. And then a 'static_branch_dec()', will again make the branch false.
151 151
152The state and the reference count can be retrieved with 'static_key_enabled()'
153and 'static_key_count()'. In general, if you use these functions, they
154should be protected with the same mutex used around the enable/disable
155or increment/decrement function.
156
157Note that switching branches results in some locks being taken,
158particularly the CPU hotplug lock (in order to avoid races against
    159CPUs being brought into the kernel whilst the kernel is getting
160patched). Calling the static key API from within a hotplug notifier is
161thus a sure deadlock recipe. In order to still allow use of the
    162functionality, the following functions are provided:
163
164 static_key_enable_cpuslocked()
165 static_key_disable_cpuslocked()
166 static_branch_enable_cpuslocked()
167 static_branch_disable_cpuslocked()
168
169These functions are *not* general purpose, and must only be used when
170you really know that you're in the above context, and no other.
171
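For example, a CPU hotplug (cpuhp) online callback that wants to flip a key
could look roughly like this; the key, the callback and cpu_has_my_feature()
are made up for the illustration:

  static DEFINE_STATIC_KEY_FALSE(my_feature_key);

  static int my_feature_cpu_online(unsigned int cpu)
  {
	/*
	 * cpuhp callbacks already run with the CPU hotplug lock held, so
	 * plain static_branch_enable() would deadlock here; the
	 * _cpuslocked variant must be used instead.
	 */
	if (cpu_has_my_feature(cpu))
		static_branch_enable_cpuslocked(&my_feature_key);
	return 0;
  }
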
152Where an array of keys is required, it can be defined as:: 172Where an array of keys is required, it can be defined as::
153 173
154 DEFINE_STATIC_KEY_ARRAY_TRUE(keys, count); 174 DEFINE_STATIC_KEY_ARRAY_TRUE(keys, count);
diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt
index 38310dcd6620..bc80fc0e210f 100644
--- a/Documentation/translations/ko_KR/memory-barriers.txt
+++ b/Documentation/translations/ko_KR/memory-barriers.txt
@@ -1956,10 +1956,7 @@ MMIO 쓰기 배리어
1956 뒤에 완료됩니다. 1956 뒤에 완료됩니다.
1957 1957
1958 ACQUIRE 앞에서 요청된 메모리 오퍼레이션은 ACQUIRE 오퍼레이션이 완료된 후에 1958 ACQUIRE 앞에서 요청된 메모리 오퍼레이션은 ACQUIRE 오퍼레이션이 완료된 후에
1959 완료될 수 있습니다. smp_mb__before_spinlock() 뒤에 ACQUIRE 가 실행되는 1959 완료될 수 있습니다.
1960 코드 블록은 블록 앞의 스토어를 블록 뒤의 로드와 스토어에 대해 순서
1961 맞춥니다. 이건 smp_mb() 보다 완화된 것임을 기억하세요! 많은 아키텍쳐에서
1962 smp_mb__before_spinlock() 은 사실 아무일도 하지 않습니다.
1963 1960
1964 (2) RELEASE 오퍼레이션의 영향: 1961 (2) RELEASE 오퍼레이션의 영향:
1965 1962
diff --git a/arch/Kconfig b/arch/Kconfig
index 21d0089117fe..2520ca5b42eb 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -931,6 +931,18 @@ config STRICT_MODULE_RWX
931config ARCH_WANT_RELAX_ORDER 931config ARCH_WANT_RELAX_ORDER
932 bool 932 bool
933 933
934config ARCH_HAS_REFCOUNT
935 bool
936 help
937 An architecture selects this when it has implemented refcount_t
938 using open coded assembly primitives that provide an optimized
939 refcount_t implementation, possibly at the expense of some full
940 refcount state checks of CONFIG_REFCOUNT_FULL=y.
941
942 The refcount overflow check behavior, however, must be retained.
943 Catching overflows is the primary security concern for protecting
944 against bugs in reference counts.
945
934config REFCOUNT_FULL 946config REFCOUNT_FULL
935 bool "Perform full reference count validation at the expense of speed" 947 bool "Perform full reference count validation at the expense of speed"
936 help 948 help
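For reference, the code this affects is ordinary refcount_t usage such as the
(made up) example below; with ARCH_HAS_REFCOUNT the same helpers are backed by
the architecture's open coded fast path while keeping overflow protection:

  struct foo {
	refcount_t	ref;
	/* ... payload ... */
  };

  static struct foo *foo_get(struct foo *f)
  {
	refcount_inc(&f->ref);		/* saturates and WARNs rather than overflowing */
	return f;
  }

  static void foo_put(struct foo *f)
  {
	if (refcount_dec_and_test(&f->ref))
		kfree(f);
  }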
diff --git a/arch/alpha/include/asm/futex.h b/arch/alpha/include/asm/futex.h
index fb01dfb760c2..05a70edd57b6 100644
--- a/arch/alpha/include/asm/futex.h
+++ b/arch/alpha/include/asm/futex.h
@@ -25,18 +25,10 @@
25 : "r" (uaddr), "r"(oparg) \ 25 : "r" (uaddr), "r"(oparg) \
26 : "memory") 26 : "memory")
27 27
28static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) 28static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
29 u32 __user *uaddr)
29{ 30{
30 int op = (encoded_op >> 28) & 7;
31 int cmp = (encoded_op >> 24) & 15;
32 int oparg = (encoded_op << 8) >> 20;
33 int cmparg = (encoded_op << 20) >> 20;
34 int oldval = 0, ret; 31 int oldval = 0, ret;
35 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
36 oparg = 1 << oparg;
37
38 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
39 return -EFAULT;
40 32
41 pagefault_disable(); 33 pagefault_disable();
42 34
@@ -62,17 +54,9 @@ static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
62 54
63 pagefault_enable(); 55 pagefault_enable();
64 56
65 if (!ret) { 57 if (!ret)
66 switch (cmp) { 58 *oval = oldval;
67 case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; 59
68 case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
69 case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
70 case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
71 case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
72 case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
73 default: ret = -ENOSYS;
74 }
75 }
76 return ret; 60 return ret;
77} 61}
78 62
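The comparison logic stripped from each architecture above is hoisted into a
single generic caller (kernel/futex.c in this series, not shown here). A rough
sketch of that wrapper, not the literal patch text:

  static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
  {
	unsigned int op  = (encoded_op & 0x70000000) >> 28;
	unsigned int cmp = (encoded_op & 0x0f000000) >> 24;
	int oparg  = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
	int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
	int oldval, ret;

	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
		oparg = 1 << oparg;

	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
		return -EFAULT;

	/* the architecture only performs the atomic op and reports oldval */
	ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
	if (ret)
		return ret;

	switch (cmp) {
	case FUTEX_OP_CMP_EQ:	return oldval == cmparg;
	case FUTEX_OP_CMP_NE:	return oldval != cmparg;
	case FUTEX_OP_CMP_LT:	return oldval <  cmparg;
	case FUTEX_OP_CMP_GE:	return oldval >= cmparg;
	case FUTEX_OP_CMP_LE:	return oldval <= cmparg;
	case FUTEX_OP_CMP_GT:	return oldval >  cmparg;
	default:		return -ENOSYS;
	}
  }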
diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h
index 54b54da6384c..11859287c52a 100644
--- a/arch/arc/include/asm/atomic.h
+++ b/arch/arc/include/asm/atomic.h
@@ -123,6 +123,8 @@ static inline void atomic_set(atomic_t *v, int i)
123 atomic_ops_unlock(flags); 123 atomic_ops_unlock(flags);
124} 124}
125 125
126#define atomic_set_release(v, i) atomic_set((v), (i))
127
126#endif 128#endif
127 129
128/* 130/*
diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h
index 11e1b1f3acda..eb887dd13e74 100644
--- a/arch/arc/include/asm/futex.h
+++ b/arch/arc/include/asm/futex.h
@@ -73,20 +73,11 @@
73 73
74#endif 74#endif
75 75
76static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) 76static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
77 u32 __user *uaddr)
77{ 78{
78 int op = (encoded_op >> 28) & 7;
79 int cmp = (encoded_op >> 24) & 15;
80 int oparg = (encoded_op << 8) >> 20;
81 int cmparg = (encoded_op << 20) >> 20;
82 int oldval = 0, ret; 79 int oldval = 0, ret;
83 80
84 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
85 oparg = 1 << oparg;
86
87 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
88 return -EFAULT;
89
90#ifndef CONFIG_ARC_HAS_LLSC 81#ifndef CONFIG_ARC_HAS_LLSC
91 preempt_disable(); /* to guarantee atomic r-m-w of futex op */ 82 preempt_disable(); /* to guarantee atomic r-m-w of futex op */
92#endif 83#endif
@@ -118,30 +109,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
118 preempt_enable(); 109 preempt_enable();
119#endif 110#endif
120 111
121 if (!ret) { 112 if (!ret)
122 switch (cmp) { 113 *oval = oldval;
123 case FUTEX_OP_CMP_EQ: 114
124 ret = (oldval == cmparg);
125 break;
126 case FUTEX_OP_CMP_NE:
127 ret = (oldval != cmparg);
128 break;
129 case FUTEX_OP_CMP_LT:
130 ret = (oldval < cmparg);
131 break;
132 case FUTEX_OP_CMP_GE:
133 ret = (oldval >= cmparg);
134 break;
135 case FUTEX_OP_CMP_LE:
136 ret = (oldval <= cmparg);
137 break;
138 case FUTEX_OP_CMP_GT:
139 ret = (oldval > cmparg);
140 break;
141 default:
142 ret = -ENOSYS;
143 }
144 }
145 return ret; 115 return ret;
146} 116}
147 117
diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h
index 6795368ad023..cc414382dab4 100644
--- a/arch/arm/include/asm/futex.h
+++ b/arch/arm/include/asm/futex.h
@@ -128,20 +128,10 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
128#endif /* !SMP */ 128#endif /* !SMP */
129 129
130static inline int 130static inline int
131futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) 131arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
132{ 132{
133 int op = (encoded_op >> 28) & 7;
134 int cmp = (encoded_op >> 24) & 15;
135 int oparg = (encoded_op << 8) >> 20;
136 int cmparg = (encoded_op << 20) >> 20;
137 int oldval = 0, ret, tmp; 133 int oldval = 0, ret, tmp;
138 134
139 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
140 oparg = 1 << oparg;
141
142 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
143 return -EFAULT;
144
145#ifndef CONFIG_SMP 135#ifndef CONFIG_SMP
146 preempt_disable(); 136 preempt_disable();
147#endif 137#endif
@@ -172,17 +162,9 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
172 preempt_enable(); 162 preempt_enable();
173#endif 163#endif
174 164
175 if (!ret) { 165 if (!ret)
176 switch (cmp) { 166 *oval = oldval;
177 case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; 167
178 case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
179 case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
180 case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
181 case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
182 case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
183 default: ret = -ENOSYS;
184 }
185 }
186 return ret; 168 return ret;
187} 169}
188 170
diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
index f32b42e8725d..5bb2fd4674e7 100644
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -48,20 +48,10 @@ do { \
48} while (0) 48} while (0)
49 49
50static inline int 50static inline int
51futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) 51arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
52{ 52{
53 int op = (encoded_op >> 28) & 7;
54 int cmp = (encoded_op >> 24) & 15;
55 int oparg = (int)(encoded_op << 8) >> 20;
56 int cmparg = (int)(encoded_op << 20) >> 20;
57 int oldval = 0, ret, tmp; 53 int oldval = 0, ret, tmp;
58 54
59 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
60 oparg = 1U << (oparg & 0x1f);
61
62 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
63 return -EFAULT;
64
65 pagefault_disable(); 55 pagefault_disable();
66 56
67 switch (op) { 57 switch (op) {
@@ -91,17 +81,9 @@ futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
91 81
92 pagefault_enable(); 82 pagefault_enable();
93 83
94 if (!ret) { 84 if (!ret)
95 switch (cmp) { 85 *oval = oldval;
96 case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; 86
97 case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
98 case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
99 case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
100 case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
101 case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
102 default: ret = -ENOSYS;
103 }
104 }
105 return ret; 87 return ret;
106} 88}
107 89
diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h
index f445bd7f2b9f..95ad7102b63c 100644
--- a/arch/arm64/include/asm/spinlock.h
+++ b/arch/arm64/include/asm/spinlock.h
@@ -310,14 +310,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw)
310#define arch_read_relax(lock) cpu_relax() 310#define arch_read_relax(lock) cpu_relax()
311#define arch_write_relax(lock) cpu_relax() 311#define arch_write_relax(lock) cpu_relax()
312 312
313/* 313/* See include/linux/spinlock.h */
314 * Accesses appearing in program order before a spin_lock() operation 314#define smp_mb__after_spinlock() smp_mb()
315 * can be reordered with accesses inside the critical section, by virtue
316 * of arch_spin_lock being constructed using acquire semantics.
317 *
318 * In cases where this is problematic (e.g. try_to_wake_up), an
319 * smp_mb__before_spinlock() can restore the required ordering.
320 */
321#define smp_mb__before_spinlock() smp_mb()
322 315
323#endif /* __ASM_SPINLOCK_H */ 316#endif /* __ASM_SPINLOCK_H */
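For context, smp_mb__after_spinlock() is issued right after taking a lock when the acquire must be upgraded to a full barrier, i.e. when accesses before the lock must not be reordered with accesses inside the critical section (the problem the deleted comment described for smp_mb__before_spinlock()). A minimal caller-side sketch, with made-up lock and data (not part of this patch):

	#include <linux/spinlock.h>

	static DEFINE_RAW_SPINLOCK(demo_lock);		/* illustrative */
	static int demo_flag, demo_cond;

	static void demo_publish_then_test(void)
	{
		WRITE_ONCE(demo_flag, 1);		/* store before the lock */

		raw_spin_lock(&demo_lock);
		smp_mb__after_spinlock();		/* full barrier: the store above
							 * is ordered before the load below */
		if (READ_ONCE(demo_cond))
			demo_flag = 2;			/* arbitrary work */
		raw_spin_unlock(&demo_lock);
	}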
diff --git a/arch/frv/include/asm/futex.h b/arch/frv/include/asm/futex.h
index 2e1da71e27a4..ab346f5f8820 100644
--- a/arch/frv/include/asm/futex.h
+++ b/arch/frv/include/asm/futex.h
@@ -7,7 +7,8 @@
7#include <asm/errno.h> 7#include <asm/errno.h>
8#include <linux/uaccess.h> 8#include <linux/uaccess.h>
9 9
10extern int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr); 10extern int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
11 u32 __user *uaddr);
11 12
12static inline int 13static inline int
13futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, 14futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
diff --git a/arch/frv/kernel/futex.c b/arch/frv/kernel/futex.c
index d155ca9e5098..37f7b2bf7f73 100644
--- a/arch/frv/kernel/futex.c
+++ b/arch/frv/kernel/futex.c
@@ -186,20 +186,10 @@ static inline int atomic_futex_op_xchg_xor(int oparg, u32 __user *uaddr, int *_o
186/* 186/*
187 * do the futex operations 187 * do the futex operations
188 */ 188 */
189int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) 189int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
190{ 190{
191 int op = (encoded_op >> 28) & 7;
192 int cmp = (encoded_op >> 24) & 15;
193 int oparg = (encoded_op << 8) >> 20;
194 int cmparg = (encoded_op << 20) >> 20;
195 int oldval = 0, ret; 191 int oldval = 0, ret;
196 192
197 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
198 oparg = 1 << oparg;
199
200 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
201 return -EFAULT;
202
203 pagefault_disable(); 193 pagefault_disable();
204 194
205 switch (op) { 195 switch (op) {
@@ -225,18 +215,9 @@ int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
225 215
226 pagefault_enable(); 216 pagefault_enable();
227 217
228 if (!ret) { 218 if (!ret)
229 switch (cmp) { 219 *oval = oldval;
230 case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
231 case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
232 case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
233 case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
234 case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
235 case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
236 default: ret = -ENOSYS; break;
237 }
238 }
239 220
240 return ret; 221 return ret;
241 222
242} /* end futex_atomic_op_inuser() */ 223} /* end arch_futex_atomic_op_inuser() */
diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h
index a62ba368b27d..fb3dfb2a667e 100644
--- a/arch/hexagon/include/asm/atomic.h
+++ b/arch/hexagon/include/asm/atomic.h
@@ -42,6 +42,8 @@ static inline void atomic_set(atomic_t *v, int new)
42 ); 42 );
43} 43}
44 44
45#define atomic_set_release(v, i) atomic_set((v), (i))
46
45/** 47/**
46 * atomic_read - reads a word, atomically 48 * atomic_read - reads a word, atomically
47 * @v: pointer to atomic value 49 * @v: pointer to atomic value
diff --git a/arch/hexagon/include/asm/futex.h b/arch/hexagon/include/asm/futex.h
index 7e597f8434da..c607b77c8215 100644
--- a/arch/hexagon/include/asm/futex.h
+++ b/arch/hexagon/include/asm/futex.h
@@ -31,18 +31,9 @@
31 31
32 32
33static inline int 33static inline int
34futex_atomic_op_inuser(int encoded_op, int __user *uaddr) 34arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
35{ 35{
36 int op = (encoded_op >> 28) & 7;
37 int cmp = (encoded_op >> 24) & 15;
38 int oparg = (encoded_op << 8) >> 20;
39 int cmparg = (encoded_op << 20) >> 20;
40 int oldval = 0, ret; 36 int oldval = 0, ret;
41 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
42 oparg = 1 << oparg;
43
44 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
45 return -EFAULT;
46 37
47 pagefault_disable(); 38 pagefault_disable();
48 39
@@ -72,30 +63,9 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
72 63
73 pagefault_enable(); 64 pagefault_enable();
74 65
75 if (!ret) { 66 if (!ret)
76 switch (cmp) { 67 *oval = oldval;
77 case FUTEX_OP_CMP_EQ: 68
78 ret = (oldval == cmparg);
79 break;
80 case FUTEX_OP_CMP_NE:
81 ret = (oldval != cmparg);
82 break;
83 case FUTEX_OP_CMP_LT:
84 ret = (oldval < cmparg);
85 break;
86 case FUTEX_OP_CMP_GE:
87 ret = (oldval >= cmparg);
88 break;
89 case FUTEX_OP_CMP_LE:
90 ret = (oldval <= cmparg);
91 break;
92 case FUTEX_OP_CMP_GT:
93 ret = (oldval > cmparg);
94 break;
95 default:
96 ret = -ENOSYS;
97 }
98 }
99 return ret; 69 return ret;
100} 70}
101 71
diff --git a/arch/ia64/include/asm/futex.h b/arch/ia64/include/asm/futex.h
index 76acbcd5c060..6d67dc1eaf2b 100644
--- a/arch/ia64/include/asm/futex.h
+++ b/arch/ia64/include/asm/futex.h
@@ -45,18 +45,9 @@ do { \
45} while (0) 45} while (0)
46 46
47static inline int 47static inline int
48futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) 48arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
49{ 49{
50 int op = (encoded_op >> 28) & 7;
51 int cmp = (encoded_op >> 24) & 15;
52 int oparg = (encoded_op << 8) >> 20;
53 int cmparg = (encoded_op << 20) >> 20;
54 int oldval = 0, ret; 50 int oldval = 0, ret;
55 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
56 oparg = 1 << oparg;
57
58 if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
59 return -EFAULT;
60 51
61 pagefault_disable(); 52 pagefault_disable();
62 53
@@ -84,17 +75,9 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
84 75
85 pagefault_enable(); 76 pagefault_enable();
86 77
87 if (!ret) { 78 if (!ret)
88 switch (cmp) { 79 *oval = oldval;
89 case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; 80
90 case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
91 case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
92 case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
93 case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
94 case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
95 default: ret = -ENOSYS;
96 }
97 }
98 return ret; 81 return ret;
99} 82}
100 83
diff --git a/arch/metag/include/asm/atomic_lock1.h b/arch/metag/include/asm/atomic_lock1.h
index 6c1380a8a0d4..eee779f26cc4 100644
--- a/arch/metag/include/asm/atomic_lock1.h
+++ b/arch/metag/include/asm/atomic_lock1.h
@@ -37,6 +37,8 @@ static inline int atomic_set(atomic_t *v, int i)
37 return i; 37 return i;
38} 38}
39 39
40#define atomic_set_release(v, i) atomic_set((v), (i))
41
40#define ATOMIC_OP(op, c_op) \ 42#define ATOMIC_OP(op, c_op) \
41static inline void atomic_##op(int i, atomic_t *v) \ 43static inline void atomic_##op(int i, atomic_t *v) \
42{ \ 44{ \
diff --git a/arch/microblaze/include/asm/futex.h b/arch/microblaze/include/asm/futex.h
index 01848f056f43..a9dad9e5e132 100644
--- a/arch/microblaze/include/asm/futex.h
+++ b/arch/microblaze/include/asm/futex.h
@@ -29,18 +29,9 @@
29}) 29})
30 30
31static inline int 31static inline int
32futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) 32arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
33{ 33{
34 int op = (encoded_op >> 28) & 7;
35 int cmp = (encoded_op >> 24) & 15;
36 int oparg = (encoded_op << 8) >> 20;
37 int cmparg = (encoded_op << 20) >> 20;
38 int oldval = 0, ret; 34 int oldval = 0, ret;
39 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
40 oparg = 1 << oparg;
41
42 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
43 return -EFAULT;
44 35
45 pagefault_disable(); 36 pagefault_disable();
46 37
@@ -66,30 +57,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
66 57
67 pagefault_enable(); 58 pagefault_enable();
68 59
69 if (!ret) { 60 if (!ret)
70 switch (cmp) { 61 *oval = oldval;
71 case FUTEX_OP_CMP_EQ: 62
72 ret = (oldval == cmparg);
73 break;
74 case FUTEX_OP_CMP_NE:
75 ret = (oldval != cmparg);
76 break;
77 case FUTEX_OP_CMP_LT:
78 ret = (oldval < cmparg);
79 break;
80 case FUTEX_OP_CMP_GE:
81 ret = (oldval >= cmparg);
82 break;
83 case FUTEX_OP_CMP_LE:
84 ret = (oldval <= cmparg);
85 break;
86 case FUTEX_OP_CMP_GT:
87 ret = (oldval > cmparg);
88 break;
89 default:
90 ret = -ENOSYS;
91 }
92 }
93 return ret; 63 return ret;
94} 64}
95 65
diff --git a/arch/mips/include/asm/futex.h b/arch/mips/include/asm/futex.h
index 1de190bdfb9c..a9e61ea54ca9 100644
--- a/arch/mips/include/asm/futex.h
+++ b/arch/mips/include/asm/futex.h
@@ -83,18 +83,9 @@
83} 83}
84 84
85static inline int 85static inline int
86futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) 86arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
87{ 87{
88 int op = (encoded_op >> 28) & 7;
89 int cmp = (encoded_op >> 24) & 15;
90 int oparg = (encoded_op << 8) >> 20;
91 int cmparg = (encoded_op << 20) >> 20;
92 int oldval = 0, ret; 88 int oldval = 0, ret;
93 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
94 oparg = 1 << oparg;
95
96 if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
97 return -EFAULT;
98 89
99 pagefault_disable(); 90 pagefault_disable();
100 91
@@ -125,17 +116,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
125 116
126 pagefault_enable(); 117 pagefault_enable();
127 118
128 if (!ret) { 119 if (!ret)
129 switch (cmp) { 120 *oval = oldval;
130 case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; 121
131 case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
132 case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
133 case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
134 case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
135 case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
136 default: ret = -ENOSYS;
137 }
138 }
139 return ret; 122 return ret;
140} 123}
141 124
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index 6bace7695788..c7cbddfcdc3b 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -648,12 +648,12 @@ EXPORT_SYMBOL(flush_tlb_one);
648#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 648#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
649 649
650static DEFINE_PER_CPU(atomic_t, tick_broadcast_count); 650static DEFINE_PER_CPU(atomic_t, tick_broadcast_count);
651static DEFINE_PER_CPU(struct call_single_data, tick_broadcast_csd); 651static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd);
652 652
653void tick_broadcast(const struct cpumask *mask) 653void tick_broadcast(const struct cpumask *mask)
654{ 654{
655 atomic_t *count; 655 atomic_t *count;
656 struct call_single_data *csd; 656 call_single_data_t *csd;
657 int cpu; 657 int cpu;
658 658
659 for_each_cpu(cpu, mask) { 659 for_each_cpu(cpu, mask) {
@@ -674,7 +674,7 @@ static void tick_broadcast_callee(void *info)
674 674
675static int __init tick_broadcast_init(void) 675static int __init tick_broadcast_init(void)
676{ 676{
677 struct call_single_data *csd; 677 call_single_data_t *csd;
678 int cpu; 678 int cpu;
679 679
680 for (cpu = 0; cpu < NR_CPUS; cpu++) { 680 for (cpu = 0; cpu < NR_CPUS; cpu++) {
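The hunks touching struct call_single_data in this patch are mechanical: they only switch users to the call_single_data_t alias, which elsewhere in the series is given an alignment that keeps each instance within one cache line. A hedged sketch of the usage pattern these call sites share (the per-CPU variable and callback are illustrative):

	#include <linux/smp.h>
	#include <linux/percpu.h>

	static DEFINE_PER_CPU(call_single_data_t, demo_csd);	/* illustrative */

	static void demo_ipi_handler(void *info)
	{
		/* runs on the target CPU */
	}

	static void demo_kick_cpu(int cpu)
	{
		call_single_data_t *csd = &per_cpu(demo_csd, cpu);

		csd->func = demo_ipi_handler;
		csd->info = NULL;
		smp_call_function_single_async(cpu, csd);
	}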
diff --git a/arch/openrisc/include/asm/futex.h b/arch/openrisc/include/asm/futex.h
index 778087341977..8fed278a24b8 100644
--- a/arch/openrisc/include/asm/futex.h
+++ b/arch/openrisc/include/asm/futex.h
@@ -30,20 +30,10 @@
30}) 30})
31 31
32static inline int 32static inline int
33futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) 33arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
34{ 34{
35 int op = (encoded_op >> 28) & 7;
36 int cmp = (encoded_op >> 24) & 15;
37 int oparg = (encoded_op << 8) >> 20;
38 int cmparg = (encoded_op << 20) >> 20;
39 int oldval = 0, ret; 35 int oldval = 0, ret;
40 36
41 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
42 oparg = 1 << oparg;
43
44 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
45 return -EFAULT;
46
47 pagefault_disable(); 37 pagefault_disable();
48 38
49 switch (op) { 39 switch (op) {
@@ -68,30 +58,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
68 58
69 pagefault_enable(); 59 pagefault_enable();
70 60
71 if (!ret) { 61 if (!ret)
72 switch (cmp) { 62 *oval = oldval;
73 case FUTEX_OP_CMP_EQ: 63
74 ret = (oldval == cmparg);
75 break;
76 case FUTEX_OP_CMP_NE:
77 ret = (oldval != cmparg);
78 break;
79 case FUTEX_OP_CMP_LT:
80 ret = (oldval < cmparg);
81 break;
82 case FUTEX_OP_CMP_GE:
83 ret = (oldval >= cmparg);
84 break;
85 case FUTEX_OP_CMP_LE:
86 ret = (oldval <= cmparg);
87 break;
88 case FUTEX_OP_CMP_GT:
89 ret = (oldval > cmparg);
90 break;
91 default:
92 ret = -ENOSYS;
93 }
94 }
95 return ret; 64 return ret;
96} 65}
97 66
diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h
index 5394b9c5f914..17b98a87e5e2 100644
--- a/arch/parisc/include/asm/atomic.h
+++ b/arch/parisc/include/asm/atomic.h
@@ -65,6 +65,8 @@ static __inline__ void atomic_set(atomic_t *v, int i)
65 _atomic_spin_unlock_irqrestore(v, flags); 65 _atomic_spin_unlock_irqrestore(v, flags);
66} 66}
67 67
68#define atomic_set_release(v, i) atomic_set((v), (i))
69
68static __inline__ int atomic_read(const atomic_t *v) 70static __inline__ int atomic_read(const atomic_t *v)
69{ 71{
70 return READ_ONCE((v)->counter); 72 return READ_ONCE((v)->counter);
diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h
index 0ba14300cd8e..c601aab2fb36 100644
--- a/arch/parisc/include/asm/futex.h
+++ b/arch/parisc/include/asm/futex.h
@@ -32,22 +32,12 @@ _futex_spin_unlock_irqrestore(u32 __user *uaddr, unsigned long int *flags)
32} 32}
33 33
34static inline int 34static inline int
35futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) 35arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
36{ 36{
37 unsigned long int flags; 37 unsigned long int flags;
38 int op = (encoded_op >> 28) & 7;
39 int cmp = (encoded_op >> 24) & 15;
40 int oparg = (encoded_op << 8) >> 20;
41 int cmparg = (encoded_op << 20) >> 20;
42 int oldval, ret; 38 int oldval, ret;
43 u32 tmp; 39 u32 tmp;
44 40
45 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
46 oparg = 1 << oparg;
47
48 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(*uaddr)))
49 return -EFAULT;
50
51 _futex_spin_lock_irqsave(uaddr, &flags); 41 _futex_spin_lock_irqsave(uaddr, &flags);
52 pagefault_disable(); 42 pagefault_disable();
53 43
@@ -85,17 +75,9 @@ out_pagefault_enable:
85 pagefault_enable(); 75 pagefault_enable();
86 _futex_spin_unlock_irqrestore(uaddr, &flags); 76 _futex_spin_unlock_irqrestore(uaddr, &flags);
87 77
88 if (ret == 0) { 78 if (!ret)
89 switch (cmp) { 79 *oval = oldval;
90 case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; 80
91 case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
92 case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
93 case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
94 case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
95 case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
96 default: ret = -ENOSYS;
97 }
98 }
99 return ret; 81 return ret;
100} 82}
101 83
diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h
index 25d42bd3f114..9c601adfc500 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -74,13 +74,6 @@ do { \
74 ___p1; \ 74 ___p1; \
75}) 75})
76 76
77/*
78 * This must resolve to hwsync on SMP for the context switch path.
79 * See _switch, and core scheduler context switch memory ordering
80 * comments.
81 */
82#define smp_mb__before_spinlock() smp_mb()
83
84#include <asm-generic/barrier.h> 77#include <asm-generic/barrier.h>
85 78
86#endif /* _ASM_POWERPC_BARRIER_H */ 79#endif /* _ASM_POWERPC_BARRIER_H */
diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h
index eaada6c92344..719ed9b61ea7 100644
--- a/arch/powerpc/include/asm/futex.h
+++ b/arch/powerpc/include/asm/futex.h
@@ -29,18 +29,10 @@
29 : "b" (uaddr), "i" (-EFAULT), "r" (oparg) \ 29 : "b" (uaddr), "i" (-EFAULT), "r" (oparg) \
30 : "cr0", "memory") 30 : "cr0", "memory")
31 31
32static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) 32static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
33 u32 __user *uaddr)
33{ 34{
34 int op = (encoded_op >> 28) & 7;
35 int cmp = (encoded_op >> 24) & 15;
36 int oparg = (encoded_op << 8) >> 20;
37 int cmparg = (encoded_op << 20) >> 20;
38 int oldval = 0, ret; 35 int oldval = 0, ret;
39 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
40 oparg = 1 << oparg;
41
42 if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
43 return -EFAULT;
44 36
45 pagefault_disable(); 37 pagefault_disable();
46 38
@@ -66,17 +58,9 @@ static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
66 58
67 pagefault_enable(); 59 pagefault_enable();
68 60
69 if (!ret) { 61 if (!ret)
70 switch (cmp) { 62 *oval = oldval;
71 case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; 63
72 case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
73 case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
74 case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
75 case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
76 case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
77 default: ret = -ENOSYS;
78 }
79 }
80 return ret; 64 return ret;
81} 65}
82 66
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index d256e448ea49..edbe571bcc54 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -309,5 +309,8 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
309#define arch_read_relax(lock) __rw_yield(lock) 309#define arch_read_relax(lock) __rw_yield(lock)
310#define arch_write_relax(lock) __rw_yield(lock) 310#define arch_write_relax(lock) __rw_yield(lock)
311 311
312/* See include/linux/spinlock.h */
313#define smp_mb__after_spinlock() smp_mb()
314
312#endif /* __KERNEL__ */ 315#endif /* __KERNEL__ */
313#endif /* __ASM_SPINLOCK_H */ 316#endif /* __ASM_SPINLOCK_H */
diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h
index a4811aa0304d..8f8eec9e1198 100644
--- a/arch/s390/include/asm/futex.h
+++ b/arch/s390/include/asm/futex.h
@@ -21,17 +21,12 @@
21 : "0" (-EFAULT), "d" (oparg), "a" (uaddr), \ 21 : "0" (-EFAULT), "d" (oparg), "a" (uaddr), \
22 "m" (*uaddr) : "cc"); 22 "m" (*uaddr) : "cc");
23 23
24static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) 24static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
25 u32 __user *uaddr)
25{ 26{
26 int op = (encoded_op >> 28) & 7;
27 int cmp = (encoded_op >> 24) & 15;
28 int oparg = (encoded_op << 8) >> 20;
29 int cmparg = (encoded_op << 20) >> 20;
30 int oldval = 0, newval, ret; 27 int oldval = 0, newval, ret;
31 28
32 load_kernel_asce(); 29 load_kernel_asce();
33 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
34 oparg = 1 << oparg;
35 30
36 pagefault_disable(); 31 pagefault_disable();
37 switch (op) { 32 switch (op) {
@@ -60,17 +55,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
60 } 55 }
61 pagefault_enable(); 56 pagefault_enable();
62 57
63 if (!ret) { 58 if (!ret)
64 switch (cmp) { 59 *oval = oldval;
65 case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; 60
66 case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
67 case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
68 case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
69 case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
70 case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
71 default: ret = -ENOSYS;
72 }
73 }
74 return ret; 61 return ret;
75} 62}
76 63
diff --git a/arch/sh/include/asm/futex.h b/arch/sh/include/asm/futex.h
index d0078747d308..8f8cf941a8cd 100644
--- a/arch/sh/include/asm/futex.h
+++ b/arch/sh/include/asm/futex.h
@@ -27,21 +27,12 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
27 return atomic_futex_op_cmpxchg_inatomic(uval, uaddr, oldval, newval); 27 return atomic_futex_op_cmpxchg_inatomic(uval, uaddr, oldval, newval);
28} 28}
29 29
30static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) 30static inline int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval,
31 u32 __user *uaddr)
31{ 32{
32 int op = (encoded_op >> 28) & 7;
33 int cmp = (encoded_op >> 24) & 15;
34 u32 oparg = (encoded_op << 8) >> 20;
35 u32 cmparg = (encoded_op << 20) >> 20;
36 u32 oldval, newval, prev; 33 u32 oldval, newval, prev;
37 int ret; 34 int ret;
38 35
39 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
40 oparg = 1 << oparg;
41
42 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
43 return -EFAULT;
44
45 pagefault_disable(); 36 pagefault_disable();
46 37
47 do { 38 do {
@@ -80,17 +71,8 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
80 71
81 pagefault_enable(); 72 pagefault_enable();
82 73
83 if (!ret) { 74 if (!ret)
84 switch (cmp) { 75 *oval = oldval;
85 case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
86 case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
87 case FUTEX_OP_CMP_LT: ret = ((int)oldval < (int)cmparg); break;
88 case FUTEX_OP_CMP_GE: ret = ((int)oldval >= (int)cmparg); break;
89 case FUTEX_OP_CMP_LE: ret = ((int)oldval <= (int)cmparg); break;
90 case FUTEX_OP_CMP_GT: ret = ((int)oldval > (int)cmparg); break;
91 default: ret = -ENOSYS;
92 }
93 }
94 76
95 return ret; 77 return ret;
96} 78}
diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h
index ee3f11c43cda..7643e979e333 100644
--- a/arch/sparc/include/asm/atomic_32.h
+++ b/arch/sparc/include/asm/atomic_32.h
@@ -29,6 +29,8 @@ int atomic_xchg(atomic_t *, int);
29int __atomic_add_unless(atomic_t *, int, int); 29int __atomic_add_unless(atomic_t *, int, int);
30void atomic_set(atomic_t *, int); 30void atomic_set(atomic_t *, int);
31 31
32#define atomic_set_release(v, i) atomic_set((v), (i))
33
32#define atomic_read(v) ACCESS_ONCE((v)->counter) 34#define atomic_read(v) ACCESS_ONCE((v)->counter)
33 35
34#define atomic_add(i, v) ((void)atomic_add_return( (int)(i), (v))) 36#define atomic_add(i, v) ((void)atomic_add_return( (int)(i), (v)))
diff --git a/arch/sparc/include/asm/futex_64.h b/arch/sparc/include/asm/futex_64.h
index 4e899b0dabf7..1cfd89d92208 100644
--- a/arch/sparc/include/asm/futex_64.h
+++ b/arch/sparc/include/asm/futex_64.h
@@ -29,22 +29,14 @@
29 : "r" (uaddr), "r" (oparg), "i" (-EFAULT) \ 29 : "r" (uaddr), "r" (oparg), "i" (-EFAULT) \
30 : "memory") 30 : "memory")
31 31
32static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) 32static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
33 u32 __user *uaddr)
33{ 34{
34 int op = (encoded_op >> 28) & 7;
35 int cmp = (encoded_op >> 24) & 15;
36 int oparg = (encoded_op << 8) >> 20;
37 int cmparg = (encoded_op << 20) >> 20;
38 int oldval = 0, ret, tem; 35 int oldval = 0, ret, tem;
39 36
40 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
41 return -EFAULT;
42 if (unlikely((((unsigned long) uaddr) & 0x3UL))) 37 if (unlikely((((unsigned long) uaddr) & 0x3UL)))
43 return -EINVAL; 38 return -EINVAL;
44 39
45 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
46 oparg = 1 << oparg;
47
48 pagefault_disable(); 40 pagefault_disable();
49 41
50 switch (op) { 42 switch (op) {
@@ -69,17 +61,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
69 61
70 pagefault_enable(); 62 pagefault_enable();
71 63
72 if (!ret) { 64 if (!ret)
73 switch (cmp) { 65 *oval = oldval;
74 case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; 66
75 case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
76 case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
77 case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
78 case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
79 case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
80 default: ret = -ENOSYS;
81 }
82 }
83 return ret; 67 return ret;
84} 68}
85 69
diff --git a/arch/tile/include/asm/atomic_32.h b/arch/tile/include/asm/atomic_32.h
index a93774255136..53a423e7cb92 100644
--- a/arch/tile/include/asm/atomic_32.h
+++ b/arch/tile/include/asm/atomic_32.h
@@ -101,6 +101,8 @@ static inline void atomic_set(atomic_t *v, int n)
101 _atomic_xchg(&v->counter, n); 101 _atomic_xchg(&v->counter, n);
102} 102}
103 103
104#define atomic_set_release(v, i) atomic_set((v), (i))
105
104/* A 64bit atomic type */ 106/* A 64bit atomic type */
105 107
106typedef struct { 108typedef struct {
diff --git a/arch/tile/include/asm/futex.h b/arch/tile/include/asm/futex.h
index e64a1b75fc38..83c1e639b411 100644
--- a/arch/tile/include/asm/futex.h
+++ b/arch/tile/include/asm/futex.h
@@ -106,12 +106,9 @@
106 lock = __atomic_hashed_lock((int __force *)uaddr) 106 lock = __atomic_hashed_lock((int __force *)uaddr)
107#endif 107#endif
108 108
109static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) 109static inline int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval,
110 u32 __user *uaddr)
110{ 111{
111 int op = (encoded_op >> 28) & 7;
112 int cmp = (encoded_op >> 24) & 15;
113 int oparg = (encoded_op << 8) >> 20;
114 int cmparg = (encoded_op << 20) >> 20;
115 int uninitialized_var(val), ret; 112 int uninitialized_var(val), ret;
116 113
117 __futex_prolog(); 114 __futex_prolog();
@@ -119,12 +116,6 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
119 /* The 32-bit futex code makes this assumption, so validate it here. */ 116 /* The 32-bit futex code makes this assumption, so validate it here. */
120 BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int)); 117 BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int));
121 118
122 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
123 oparg = 1 << oparg;
124
125 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
126 return -EFAULT;
127
128 pagefault_disable(); 119 pagefault_disable();
129 switch (op) { 120 switch (op) {
130 case FUTEX_OP_SET: 121 case FUTEX_OP_SET:
@@ -148,30 +139,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
148 } 139 }
149 pagefault_enable(); 140 pagefault_enable();
150 141
151 if (!ret) { 142 if (!ret)
152 switch (cmp) { 143 *oval = val;
153 case FUTEX_OP_CMP_EQ: 144
154 ret = (val == cmparg);
155 break;
156 case FUTEX_OP_CMP_NE:
157 ret = (val != cmparg);
158 break;
159 case FUTEX_OP_CMP_LT:
160 ret = (val < cmparg);
161 break;
162 case FUTEX_OP_CMP_GE:
163 ret = (val >= cmparg);
164 break;
165 case FUTEX_OP_CMP_LE:
166 ret = (val <= cmparg);
167 break;
168 case FUTEX_OP_CMP_GT:
169 ret = (val > cmparg);
170 break;
171 default:
172 ret = -ENOSYS;
173 }
174 }
175 return ret; 145 return ret;
176} 146}
177 147
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9c95aa417e9b..cce15191e9e9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -55,6 +55,8 @@ config X86
55 select ARCH_HAS_KCOV if X86_64 55 select ARCH_HAS_KCOV if X86_64
56 select ARCH_HAS_MMIO_FLUSH 56 select ARCH_HAS_MMIO_FLUSH
57 select ARCH_HAS_PMEM_API if X86_64 57 select ARCH_HAS_PMEM_API if X86_64
58 # Causing hangs/crashes, see the commit that added this change for details.
59 select ARCH_HAS_REFCOUNT if BROKEN
58 select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 60 select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
59 select ARCH_HAS_SET_MEMORY 61 select ARCH_HAS_SET_MEMORY
60 select ARCH_HAS_SG_CHAIN 62 select ARCH_HAS_SG_CHAIN
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 7a9df3beb89b..676ee5807d86 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -74,6 +74,9 @@
74# define _ASM_EXTABLE_EX(from, to) \ 74# define _ASM_EXTABLE_EX(from, to) \
75 _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) 75 _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext)
76 76
77# define _ASM_EXTABLE_REFCOUNT(from, to) \
78 _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount)
79
77# define _ASM_NOKPROBE(entry) \ 80# define _ASM_NOKPROBE(entry) \
78 .pushsection "_kprobe_blacklist","aw" ; \ 81 .pushsection "_kprobe_blacklist","aw" ; \
79 _ASM_ALIGN ; \ 82 _ASM_ALIGN ; \
@@ -123,6 +126,9 @@
123# define _ASM_EXTABLE_EX(from, to) \ 126# define _ASM_EXTABLE_EX(from, to) \
124 _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) 127 _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext)
125 128
129# define _ASM_EXTABLE_REFCOUNT(from, to) \
130 _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount)
131
126/* For C file, we already have NOKPROBE_SYMBOL macro */ 132/* For C file, we already have NOKPROBE_SYMBOL macro */
127#endif 133#endif
128 134
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 33380b871463..0874ebda3069 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -197,35 +197,56 @@ static inline int atomic_xchg(atomic_t *v, int new)
197 return xchg(&v->counter, new); 197 return xchg(&v->counter, new);
198} 198}
199 199
200#define ATOMIC_OP(op) \ 200static inline void atomic_and(int i, atomic_t *v)
201static inline void atomic_##op(int i, atomic_t *v) \ 201{
202{ \ 202 asm volatile(LOCK_PREFIX "andl %1,%0"
203 asm volatile(LOCK_PREFIX #op"l %1,%0" \ 203 : "+m" (v->counter)
204 : "+m" (v->counter) \ 204 : "ir" (i)
205 : "ir" (i) \ 205 : "memory");
206 : "memory"); \ 206}
207
208static inline int atomic_fetch_and(int i, atomic_t *v)
209{
210 int val = atomic_read(v);
211
212 do { } while (!atomic_try_cmpxchg(v, &val, val & i));
213
214 return val;
207} 215}
208 216
209#define ATOMIC_FETCH_OP(op, c_op) \ 217static inline void atomic_or(int i, atomic_t *v)
210static inline int atomic_fetch_##op(int i, atomic_t *v) \ 218{
211{ \ 219 asm volatile(LOCK_PREFIX "orl %1,%0"
212 int val = atomic_read(v); \ 220 : "+m" (v->counter)
213 do { \ 221 : "ir" (i)
214 } while (!atomic_try_cmpxchg(v, &val, val c_op i)); \ 222 : "memory");
215 return val; \
216} 223}
217 224
218#define ATOMIC_OPS(op, c_op) \ 225static inline int atomic_fetch_or(int i, atomic_t *v)
219 ATOMIC_OP(op) \ 226{
220 ATOMIC_FETCH_OP(op, c_op) 227 int val = atomic_read(v);
221 228
222ATOMIC_OPS(and, &) 229 do { } while (!atomic_try_cmpxchg(v, &val, val | i));
223ATOMIC_OPS(or , |)
224ATOMIC_OPS(xor, ^)
225 230
226#undef ATOMIC_OPS 231 return val;
227#undef ATOMIC_FETCH_OP 232}
228#undef ATOMIC_OP 233
234static inline void atomic_xor(int i, atomic_t *v)
235{
236 asm volatile(LOCK_PREFIX "xorl %1,%0"
237 : "+m" (v->counter)
238 : "ir" (i)
239 : "memory");
240}
241
242static inline int atomic_fetch_xor(int i, atomic_t *v)
243{
244 int val = atomic_read(v);
245
246 do { } while (!atomic_try_cmpxchg(v, &val, val ^ i));
247
248 return val;
249}
229 250
230/** 251/**
231 * __atomic_add_unless - add unless the number is already a given value 252 * __atomic_add_unless - add unless the number is already a given value
@@ -239,10 +260,12 @@ ATOMIC_OPS(xor, ^)
239static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) 260static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u)
240{ 261{
241 int c = atomic_read(v); 262 int c = atomic_read(v);
263
242 do { 264 do {
243 if (unlikely(c == u)) 265 if (unlikely(c == u))
244 break; 266 break;
245 } while (!atomic_try_cmpxchg(v, &c, c + a)); 267 } while (!atomic_try_cmpxchg(v, &c, c + a));
268
246 return c; 269 return c;
247} 270}
248 271
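The open-coded atomic_fetch_*() helpers above all rely on atomic_try_cmpxchg() updating the expected value in place when the compare fails, which is why the loop bodies can stay empty. A small illustration of the same idiom with a made-up helper (not part of the patch):

	#include <linux/atomic.h>

	/* Illustrative only: set the low bit and return the previous value. */
	static inline int demo_fetch_set_low_bit(atomic_t *v)
	{
		int old = atomic_read(v);

		do {
			/* on failure, atomic_try_cmpxchg() refreshes 'old' */
		} while (!atomic_try_cmpxchg(v, &old, old | 1));

		return old;
	}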
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 71d7705fb303..9e206f31ce2a 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -312,37 +312,70 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v)
312#undef alternative_atomic64 312#undef alternative_atomic64
313#undef __alternative_atomic64 313#undef __alternative_atomic64
314 314
315#define ATOMIC64_OP(op, c_op) \ 315static inline void atomic64_and(long long i, atomic64_t *v)
316static inline void atomic64_##op(long long i, atomic64_t *v) \ 316{
317{ \ 317 long long old, c = 0;
318 long long old, c = 0; \ 318
319 while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ 319 while ((old = atomic64_cmpxchg(v, c, c & i)) != c)
320 c = old; \ 320 c = old;
321} 321}
322 322
323#define ATOMIC64_FETCH_OP(op, c_op) \ 323static inline long long atomic64_fetch_and(long long i, atomic64_t *v)
324static inline long long atomic64_fetch_##op(long long i, atomic64_t *v) \ 324{
325{ \ 325 long long old, c = 0;
326 long long old, c = 0; \ 326
327 while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ 327 while ((old = atomic64_cmpxchg(v, c, c & i)) != c)
328 c = old; \ 328 c = old;
329 return old; \ 329
330 return old;
330} 331}
331 332
332ATOMIC64_FETCH_OP(add, +) 333static inline void atomic64_or(long long i, atomic64_t *v)
334{
335 long long old, c = 0;
333 336
334#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v)) 337 while ((old = atomic64_cmpxchg(v, c, c | i)) != c)
338 c = old;
339}
340
341static inline long long atomic64_fetch_or(long long i, atomic64_t *v)
342{
343 long long old, c = 0;
344
345 while ((old = atomic64_cmpxchg(v, c, c | i)) != c)
346 c = old;
347
348 return old;
349}
335 350
336#define ATOMIC64_OPS(op, c_op) \ 351static inline void atomic64_xor(long long i, atomic64_t *v)
337 ATOMIC64_OP(op, c_op) \ 352{
338 ATOMIC64_FETCH_OP(op, c_op) 353 long long old, c = 0;
354
355 while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c)
356 c = old;
357}
339 358
340ATOMIC64_OPS(and, &) 359static inline long long atomic64_fetch_xor(long long i, atomic64_t *v)
341ATOMIC64_OPS(or, |) 360{
342ATOMIC64_OPS(xor, ^) 361 long long old, c = 0;
362
363 while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c)
364 c = old;
365
366 return old;
367}
343 368
344#undef ATOMIC64_OPS 369static inline long long atomic64_fetch_add(long long i, atomic64_t *v)
345#undef ATOMIC64_FETCH_OP 370{
346#undef ATOMIC64_OP 371 long long old, c = 0;
372
373 while ((old = atomic64_cmpxchg(v, c, c + i)) != c)
374 c = old;
375
376 return old;
377}
378
379#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v))
347 380
348#endif /* _ASM_X86_ATOMIC64_32_H */ 381#endif /* _ASM_X86_ATOMIC64_32_H */
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 6189a433c9a9..5d9de36a2f04 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -177,7 +177,7 @@ static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
177} 177}
178 178
179#define atomic64_try_cmpxchg atomic64_try_cmpxchg 179#define atomic64_try_cmpxchg atomic64_try_cmpxchg
180static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, long *old, long new) 180static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, long new)
181{ 181{
182 return try_cmpxchg(&v->counter, old, new); 182 return try_cmpxchg(&v->counter, old, new);
183} 183}
@@ -198,7 +198,7 @@ static inline long atomic64_xchg(atomic64_t *v, long new)
198 */ 198 */
199static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) 199static inline bool atomic64_add_unless(atomic64_t *v, long a, long u)
200{ 200{
201 long c = atomic64_read(v); 201 s64 c = atomic64_read(v);
202 do { 202 do {
203 if (unlikely(c == u)) 203 if (unlikely(c == u))
204 return false; 204 return false;
@@ -217,7 +217,7 @@ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u)
217 */ 217 */
218static inline long atomic64_dec_if_positive(atomic64_t *v) 218static inline long atomic64_dec_if_positive(atomic64_t *v)
219{ 219{
220 long dec, c = atomic64_read(v); 220 s64 dec, c = atomic64_read(v);
221 do { 221 do {
222 dec = c - 1; 222 dec = c - 1;
223 if (unlikely(dec < 0)) 223 if (unlikely(dec < 0))
@@ -226,34 +226,55 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
226 return dec; 226 return dec;
227} 227}
228 228
229#define ATOMIC64_OP(op) \ 229static inline void atomic64_and(long i, atomic64_t *v)
230static inline void atomic64_##op(long i, atomic64_t *v) \ 230{
231{ \ 231 asm volatile(LOCK_PREFIX "andq %1,%0"
232 asm volatile(LOCK_PREFIX #op"q %1,%0" \ 232 : "+m" (v->counter)
233 : "+m" (v->counter) \ 233 : "er" (i)
234 : "er" (i) \ 234 : "memory");
235 : "memory"); \
236} 235}
237 236
238#define ATOMIC64_FETCH_OP(op, c_op) \ 237static inline long atomic64_fetch_and(long i, atomic64_t *v)
239static inline long atomic64_fetch_##op(long i, atomic64_t *v) \ 238{
240{ \ 239 s64 val = atomic64_read(v);
241 long val = atomic64_read(v); \ 240
242 do { \ 241 do {
243 } while (!atomic64_try_cmpxchg(v, &val, val c_op i)); \ 242 } while (!atomic64_try_cmpxchg(v, &val, val & i));
244 return val; \ 243 return val;
245} 244}
246 245
247#define ATOMIC64_OPS(op, c_op) \ 246static inline void atomic64_or(long i, atomic64_t *v)
248 ATOMIC64_OP(op) \ 247{
249 ATOMIC64_FETCH_OP(op, c_op) 248 asm volatile(LOCK_PREFIX "orq %1,%0"
249 : "+m" (v->counter)
250 : "er" (i)
251 : "memory");
252}
250 253
251ATOMIC64_OPS(and, &) 254static inline long atomic64_fetch_or(long i, atomic64_t *v)
252ATOMIC64_OPS(or, |) 255{
253ATOMIC64_OPS(xor, ^) 256 s64 val = atomic64_read(v);
254 257
255#undef ATOMIC64_OPS 258 do {
256#undef ATOMIC64_FETCH_OP 259 } while (!atomic64_try_cmpxchg(v, &val, val | i));
257#undef ATOMIC64_OP 260 return val;
261}
262
263static inline void atomic64_xor(long i, atomic64_t *v)
264{
265 asm volatile(LOCK_PREFIX "xorq %1,%0"
266 : "+m" (v->counter)
267 : "er" (i)
268 : "memory");
269}
270
271static inline long atomic64_fetch_xor(long i, atomic64_t *v)
272{
273 s64 val = atomic64_read(v);
274
275 do {
276 } while (!atomic64_try_cmpxchg(v, &val, val ^ i));
277 return val;
278}
258 279
259#endif /* _ASM_X86_ATOMIC64_64_H */ 280#endif /* _ASM_X86_ATOMIC64_64_H */
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index d90296d061e8..b5069e802d5c 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -157,7 +157,7 @@ extern void __add_wrong_size(void)
157#define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock) \ 157#define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock) \
158({ \ 158({ \
159 bool success; \ 159 bool success; \
160 __typeof__(_ptr) _old = (_pold); \ 160 __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \
161 __typeof__(*(_ptr)) __old = *_old; \ 161 __typeof__(*(_ptr)) __old = *_old; \
162 __typeof__(*(_ptr)) __new = (_new); \ 162 __typeof__(*(_ptr)) __new = (_new); \
163 switch (size) { \ 163 switch (size) { \
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index b4c1f5453436..f4dc9b63bdda 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -41,20 +41,11 @@
41 "+m" (*uaddr), "=&r" (tem) \ 41 "+m" (*uaddr), "=&r" (tem) \
42 : "r" (oparg), "i" (-EFAULT), "1" (0)) 42 : "r" (oparg), "i" (-EFAULT), "1" (0))
43 43
44static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) 44static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
45 u32 __user *uaddr)
45{ 46{
46 int op = (encoded_op >> 28) & 7;
47 int cmp = (encoded_op >> 24) & 15;
48 int oparg = (encoded_op << 8) >> 20;
49 int cmparg = (encoded_op << 20) >> 20;
50 int oldval = 0, ret, tem; 47 int oldval = 0, ret, tem;
51 48
52 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
53 oparg = 1 << oparg;
54
55 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
56 return -EFAULT;
57
58 pagefault_disable(); 49 pagefault_disable();
59 50
60 switch (op) { 51 switch (op) {
@@ -80,30 +71,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
80 71
81 pagefault_enable(); 72 pagefault_enable();
82 73
83 if (!ret) { 74 if (!ret)
84 switch (cmp) { 75 *oval = oldval;
85 case FUTEX_OP_CMP_EQ: 76
86 ret = (oldval == cmparg);
87 break;
88 case FUTEX_OP_CMP_NE:
89 ret = (oldval != cmparg);
90 break;
91 case FUTEX_OP_CMP_LT:
92 ret = (oldval < cmparg);
93 break;
94 case FUTEX_OP_CMP_GE:
95 ret = (oldval >= cmparg);
96 break;
97 case FUTEX_OP_CMP_LE:
98 ret = (oldval <= cmparg);
99 break;
100 case FUTEX_OP_CMP_GT:
101 ret = (oldval > cmparg);
102 break;
103 default:
104 ret = -ENOSYS;
105 }
106 }
107 return ret; 77 return ret;
108} 78}
109 79
diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h
new file mode 100644
index 000000000000..ff871210b9f2
--- /dev/null
+++ b/arch/x86/include/asm/refcount.h
@@ -0,0 +1,109 @@
1#ifndef __ASM_X86_REFCOUNT_H
2#define __ASM_X86_REFCOUNT_H
3/*
4 * x86-specific implementation of refcount_t. Based on PAX_REFCOUNT from
5 * PaX/grsecurity.
6 */
7#include <linux/refcount.h>
8
9/*
10 * This is the first portion of the refcount error handling, which lives in
11 * .text.unlikely, and is jumped to from the CPU flag check (in the
12 * following macros). This saves the refcount value location into CX for
13 * the exception handler to use (in mm/extable.c), and then triggers the
14 * central refcount exception. The fixup address for the exception points
15 * back to the regular execution flow in .text.
16 */
17#define _REFCOUNT_EXCEPTION \
18 ".pushsection .text.unlikely\n" \
19 "111:\tlea %[counter], %%" _ASM_CX "\n" \
20 "112:\t" ASM_UD0 "\n" \
21 ASM_UNREACHABLE \
22 ".popsection\n" \
23 "113:\n" \
24 _ASM_EXTABLE_REFCOUNT(112b, 113b)
25
26/* Trigger refcount exception if refcount result is negative. */
27#define REFCOUNT_CHECK_LT_ZERO \
28 "js 111f\n\t" \
29 _REFCOUNT_EXCEPTION
30
31/* Trigger refcount exception if refcount result is zero or negative. */
32#define REFCOUNT_CHECK_LE_ZERO \
33 "jz 111f\n\t" \
34 REFCOUNT_CHECK_LT_ZERO
35
36/* Trigger refcount exception unconditionally. */
37#define REFCOUNT_ERROR \
38 "jmp 111f\n\t" \
39 _REFCOUNT_EXCEPTION
40
41static __always_inline void refcount_add(unsigned int i, refcount_t *r)
42{
43 asm volatile(LOCK_PREFIX "addl %1,%0\n\t"
44 REFCOUNT_CHECK_LT_ZERO
45 : [counter] "+m" (r->refs.counter)
46 : "ir" (i)
47 : "cc", "cx");
48}
49
50static __always_inline void refcount_inc(refcount_t *r)
51{
52 asm volatile(LOCK_PREFIX "incl %0\n\t"
53 REFCOUNT_CHECK_LT_ZERO
54 : [counter] "+m" (r->refs.counter)
55 : : "cc", "cx");
56}
57
58static __always_inline void refcount_dec(refcount_t *r)
59{
60 asm volatile(LOCK_PREFIX "decl %0\n\t"
61 REFCOUNT_CHECK_LE_ZERO
62 : [counter] "+m" (r->refs.counter)
63 : : "cc", "cx");
64}
65
66static __always_inline __must_check
67bool refcount_sub_and_test(unsigned int i, refcount_t *r)
68{
69 GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", REFCOUNT_CHECK_LT_ZERO,
70 r->refs.counter, "er", i, "%0", e);
71}
72
73static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r)
74{
75 GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", REFCOUNT_CHECK_LT_ZERO,
76 r->refs.counter, "%0", e);
77}
78
79static __always_inline __must_check
80bool refcount_add_not_zero(unsigned int i, refcount_t *r)
81{
82 int c, result;
83
84 c = atomic_read(&(r->refs));
85 do {
86 if (unlikely(c == 0))
87 return false;
88
89 result = c + i;
90
91 /* Did we try to increment from/to an undesirable state? */
92 if (unlikely(c < 0 || c == INT_MAX || result < c)) {
93 asm volatile(REFCOUNT_ERROR
94 : : [counter] "m" (r->refs.counter)
95 : "cc", "cx");
96 break;
97 }
98
99 } while (!atomic_try_cmpxchg(&(r->refs), &c, result));
100
101 return c != 0;
102}
103
104static __always_inline __must_check bool refcount_inc_not_zero(refcount_t *r)
105{
106 return refcount_add_not_zero(1, r);
107}
108
109#endif
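The header above gives x86 inline, trapping implementations of the generic refcount_t API. A hedged example of the kind of caller this fast path is meant for (the structure and helpers are made up):

	#include <linux/refcount.h>
	#include <linux/slab.h>

	struct demo_obj {				/* illustrative */
		refcount_t ref;
		/* ... payload ... */
	};

	static struct demo_obj *demo_get(struct demo_obj *obj)
	{
		refcount_inc(&obj->ref);	/* traps via UD0 on overflow here */
		return obj;
	}

	static void demo_put(struct demo_obj *obj)
	{
		if (refcount_dec_and_test(&obj->ref))
			kfree(obj);
	}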
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index fb2ddcdf7c73..c076f710de4c 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -36,6 +36,48 @@ bool ex_handler_fault(const struct exception_table_entry *fixup,
36} 36}
37EXPORT_SYMBOL_GPL(ex_handler_fault); 37EXPORT_SYMBOL_GPL(ex_handler_fault);
38 38
39/*
40 * Handler for UD0 exception following a failed test against the
41 * result of a refcount inc/dec/add/sub.
42 */
43bool ex_handler_refcount(const struct exception_table_entry *fixup,
44 struct pt_regs *regs, int trapnr)
45{
46 /* First unconditionally saturate the refcount. */
47 *(int *)regs->cx = INT_MIN / 2;
48
49 /*
50 * Strictly speaking, this reports the fixup destination, not
51 * the fault location, and not the actually overflowing
52 * instruction, which is the instruction before the "js", but
53 * since that instruction could be a variety of lengths, just
54 * report the location after the overflow, which should be close
55 * enough for finding the overflow, as it's at least back in
56 * the function, having returned from .text.unlikely.
57 */
58 regs->ip = ex_fixup_addr(fixup);
59
60 /*
61 * This function has been called because either a negative refcount
62 * value was seen by any of the refcount functions, or a zero
63 * refcount value was seen by refcount_dec().
64 *
65 * If we crossed from INT_MAX to INT_MIN, OF (Overflow Flag: result
66 * wrapped around) will be set. Additionally, seeing the refcount
67 * reach 0 will set ZF (Zero Flag: result was zero). In each of
68 * these cases we want a report, since it's a boundary condition.
69 *
70 */
71 if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) {
72 bool zero = regs->flags & X86_EFLAGS_ZF;
73
74 refcount_error_report(regs, zero ? "hit zero" : "overflow");
75 }
76
77 return true;
78}
79EXPORT_SYMBOL_GPL(ex_handler_refcount);
80
39bool ex_handler_ext(const struct exception_table_entry *fixup, 81bool ex_handler_ext(const struct exception_table_entry *fixup,
40 struct pt_regs *regs, int trapnr) 82 struct pt_regs *regs, int trapnr)
41{ 83{
diff --git a/arch/xtensa/include/asm/futex.h b/arch/xtensa/include/asm/futex.h
index b39531babec0..eaaf1ebcc7a4 100644
--- a/arch/xtensa/include/asm/futex.h
+++ b/arch/xtensa/include/asm/futex.h
@@ -44,18 +44,10 @@
44 : "r" (uaddr), "I" (-EFAULT), "r" (oparg) \ 44 : "r" (uaddr), "I" (-EFAULT), "r" (oparg) \
45 : "memory") 45 : "memory")
46 46
47static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) 47static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
48 u32 __user *uaddr)
48{ 49{
49 int op = (encoded_op >> 28) & 7;
50 int cmp = (encoded_op >> 24) & 15;
51 int oparg = (encoded_op << 8) >> 20;
52 int cmparg = (encoded_op << 20) >> 20;
53 int oldval = 0, ret; 50 int oldval = 0, ret;
54 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
55 oparg = 1 << oparg;
56
57 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
58 return -EFAULT;
59 51
60#if !XCHAL_HAVE_S32C1I 52#if !XCHAL_HAVE_S32C1I
61 return -ENOSYS; 53 return -ENOSYS;
@@ -89,19 +81,10 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
89 81
90 pagefault_enable(); 82 pagefault_enable();
91 83
92 if (ret) 84 if (!ret)
93 return ret; 85 *oval = oldval;
94 86
95 switch (cmp) { 87 return ret;
96 case FUTEX_OP_CMP_EQ: return (oldval == cmparg);
97 case FUTEX_OP_CMP_NE: return (oldval != cmparg);
98 case FUTEX_OP_CMP_LT: return (oldval < cmparg);
99 case FUTEX_OP_CMP_GE: return (oldval >= cmparg);
100 case FUTEX_OP_CMP_LE: return (oldval <= cmparg);
101 case FUTEX_OP_CMP_GT: return (oldval > cmparg);
102 }
103
104 return -ENOSYS;
105} 88}
106 89
107static inline int 90static inline int
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 87b7df4851bf..07125e7941f4 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -60,7 +60,7 @@ static void trigger_softirq(void *data)
60static int raise_blk_irq(int cpu, struct request *rq) 60static int raise_blk_irq(int cpu, struct request *rq)
61{ 61{
62 if (cpu_online(cpu)) { 62 if (cpu_online(cpu)) {
63 struct call_single_data *data = &rq->csd; 63 call_single_data_t *data = &rq->csd;
64 64
65 data->func = trigger_softirq; 65 data->func = trigger_softirq;
66 data->info = rq; 66 data->info = rq;
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 19182d091587..1893e416e7c0 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -2884,7 +2884,7 @@ static int acpi_nfit_flush_probe(struct nvdimm_bus_descriptor *nd_desc)
2884 * need to be interruptible while waiting. 2884 * need to be interruptible while waiting.
2885 */ 2885 */
2886 INIT_WORK_ONSTACK(&flush.work, flush_probe); 2886 INIT_WORK_ONSTACK(&flush.work, flush_probe);
2887 COMPLETION_INITIALIZER_ONSTACK(flush.cmp); 2887 init_completion(&flush.cmp);
2888 queue_work(nfit_wq, &flush.work); 2888 queue_work(nfit_wq, &flush.work);
2889 mutex_unlock(&acpi_desc->init_mutex); 2889 mutex_unlock(&acpi_desc->init_mutex);
2890 2890
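The nfit hunk replaces a use of COMPLETION_INITIALIZER_ONSTACK() as a bare statement with init_completion(), the explicit-initialization call for completions. A hedged sketch of that idiom (the work that eventually calls complete() is elided):

	#include <linux/completion.h>

	static void demo_wait_for_flush(void)
	{
		struct completion cmp;

		init_completion(&cmp);		/* also fine for on-stack completions */
		/* ... queue work that eventually calls complete(&cmp) ... */
		wait_for_completion(&cmp);
	}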
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 85c24cace973..81142ce781da 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -13,7 +13,7 @@
13struct nullb_cmd { 13struct nullb_cmd {
14 struct list_head list; 14 struct list_head list;
15 struct llist_node ll_list; 15 struct llist_node ll_list;
16 struct call_single_data csd; 16 call_single_data_t csd;
17 struct request *rq; 17 struct request *rq;
18 struct bio *bio; 18 struct bio *bio;
19 unsigned int tag; 19 unsigned int tag;
diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
index 72bbfccef113..fd4b7f684bd0 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -455,7 +455,11 @@ void arch_timer_enable_workaround(const struct arch_timer_erratum_workaround *wa
455 per_cpu(timer_unstable_counter_workaround, i) = wa; 455 per_cpu(timer_unstable_counter_workaround, i) = wa;
456 } 456 }
457 457
458 static_branch_enable(&arch_timer_read_ool_enabled); 458 /*
459 * Use the locked version, as we're called from the CPU
460 * hotplug framework. Otherwise, we end-up in deadlock-land.
461 */
462 static_branch_enable_cpuslocked(&arch_timer_read_ool_enabled);
459 463
460 /* 464 /*
461 * Don't use the vdso fastpath if errata require using the 465 * Don't use the vdso fastpath if errata require using the
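static_branch_enable_cpuslocked(), added elsewhere in this series, is for callers that already hold the CPU hotplug lock, such as the hotplug-framework path above. A hedged sketch of the two call patterns (the key is illustrative):

	#include <linux/jump_label.h>
	#include <linux/cpu.h>

	static DEFINE_STATIC_KEY_FALSE(demo_key);	/* illustrative */

	static void demo_enable_from_hotplug_callback(void)
	{
		/* hotplug lock is already held here */
		static_branch_enable_cpuslocked(&demo_key);
	}

	static void demo_enable_from_process_context(void)
	{
		cpus_read_lock();
		static_branch_enable_cpuslocked(&demo_key);
		cpus_read_unlock();
		/* or simply static_branch_enable(&demo_key), which takes the lock itself */
	}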
diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c
index 71e586d7df71..147f38ea0fcd 100644
--- a/drivers/cpuidle/coupled.c
+++ b/drivers/cpuidle/coupled.c
@@ -119,13 +119,13 @@ struct cpuidle_coupled {
119 119
120#define CPUIDLE_COUPLED_NOT_IDLE (-1) 120#define CPUIDLE_COUPLED_NOT_IDLE (-1)
121 121
122static DEFINE_PER_CPU(struct call_single_data, cpuidle_coupled_poke_cb); 122static DEFINE_PER_CPU(call_single_data_t, cpuidle_coupled_poke_cb);
123 123
124/* 124/*
125 * The cpuidle_coupled_poke_pending mask is used to avoid calling 125 * The cpuidle_coupled_poke_pending mask is used to avoid calling
126 * __smp_call_function_single with the per cpu call_single_data struct already 126 * __smp_call_function_single with the per cpu call_single_data_t struct already
127 * in use. This prevents a deadlock where two cpus are waiting for each others 127 * in use. This prevents a deadlock where two cpus are waiting for each others
128 * call_single_data struct to be available 128 * call_single_data_t struct to be available
129 */ 129 */
130static cpumask_t cpuidle_coupled_poke_pending; 130static cpumask_t cpuidle_coupled_poke_pending;
131 131
@@ -339,7 +339,7 @@ static void cpuidle_coupled_handle_poke(void *info)
339 */ 339 */
340static void cpuidle_coupled_poke(int cpu) 340static void cpuidle_coupled_poke(int cpu)
341{ 341{
342 struct call_single_data *csd = &per_cpu(cpuidle_coupled_poke_cb, cpu); 342 call_single_data_t *csd = &per_cpu(cpuidle_coupled_poke_cb, cpu);
343 343
344 if (!cpumask_test_and_set_cpu(cpu, &cpuidle_coupled_poke_pending)) 344 if (!cpumask_test_and_set_cpu(cpu, &cpuidle_coupled_poke_pending))
345 smp_call_function_single_async(cpu, csd); 345 smp_call_function_single_async(cpu, csd);
@@ -651,7 +651,7 @@ int cpuidle_coupled_register_device(struct cpuidle_device *dev)
651{ 651{
652 int cpu; 652 int cpu;
653 struct cpuidle_device *other_dev; 653 struct cpuidle_device *other_dev;
654 struct call_single_data *csd; 654 call_single_data_t *csd;
655 struct cpuidle_coupled *coupled; 655 struct cpuidle_coupled *coupled;
656 656
657 if (cpumask_empty(&dev->coupled_cpus)) 657 if (cpumask_empty(&dev->coupled_cpus))
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 48572b157222..a36216bd2a84 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -28,6 +28,7 @@
28 28
29#include <linux/debugfs.h> 29#include <linux/debugfs.h>
30#include <linux/sort.h> 30#include <linux/sort.h>
31#include <linux/sched/mm.h>
31#include "intel_drv.h" 32#include "intel_drv.h"
32 33
33static inline struct drm_i915_private *node_to_i915(struct drm_info_node *node) 34static inline struct drm_i915_private *node_to_i915(struct drm_info_node *node)
@@ -4305,7 +4306,7 @@ i915_drop_caches_set(void *data, u64 val)
4305 mutex_unlock(&dev->struct_mutex); 4306 mutex_unlock(&dev->struct_mutex);
4306 } 4307 }
4307 4308
4308 lockdep_set_current_reclaim_state(GFP_KERNEL); 4309 fs_reclaim_acquire(GFP_KERNEL);
4309 if (val & DROP_BOUND) 4310 if (val & DROP_BOUND)
4310 i915_gem_shrink(dev_priv, LONG_MAX, I915_SHRINK_BOUND); 4311 i915_gem_shrink(dev_priv, LONG_MAX, I915_SHRINK_BOUND);
4311 4312
@@ -4314,7 +4315,7 @@ i915_drop_caches_set(void *data, u64 val)
4314 4315
4315 if (val & DROP_SHRINK_ALL) 4316 if (val & DROP_SHRINK_ALL)
4316 i915_gem_shrink_all(dev_priv); 4317 i915_gem_shrink_all(dev_priv);
4317 lockdep_clear_current_reclaim_state(); 4318 fs_reclaim_release(GFP_KERNEL);
4318 4319
4319 if (val & DROP_FREED) { 4320 if (val & DROP_FREED) {
4320 synchronize_rcu(); 4321 synchronize_rcu();
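fs_reclaim_acquire()/fs_reclaim_release() replace the per-task reclaim-state markers used here before; they tell lockdep that the bracketed code may behave like direct reclaim for the given GFP mask. A hedged sketch of the annotation pattern (the shrinking work itself is elided):

	#include <linux/sched/mm.h>
	#include <linux/gfp.h>

	static void demo_forced_shrink(void)
	{
		fs_reclaim_acquire(GFP_KERNEL);	/* pretend to be in reclaim */
		/* ... invoke shrinkers / drop caches ... */
		fs_reclaim_release(GFP_KERNEL);
	}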
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index 51583ae4b1eb..120b6e537b28 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -2468,7 +2468,7 @@ static void liquidio_napi_drv_callback(void *arg)
2468 if (OCTEON_CN23XX_PF(oct) || droq->cpu_id == this_cpu) { 2468 if (OCTEON_CN23XX_PF(oct) || droq->cpu_id == this_cpu) {
2469 napi_schedule_irqoff(&droq->napi); 2469 napi_schedule_irqoff(&droq->napi);
2470 } else { 2470 } else {
2471 struct call_single_data *csd = &droq->csd; 2471 call_single_data_t *csd = &droq->csd;
2472 2472
2473 csd->func = napi_schedule_wrapper; 2473 csd->func = napi_schedule_wrapper;
2474 csd->info = &droq->napi; 2474 csd->info = &droq->napi;
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_droq.h b/drivers/net/ethernet/cavium/liquidio/octeon_droq.h
index 6efd139b894d..f91bc84d1719 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_droq.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_droq.h
@@ -328,7 +328,7 @@ struct octeon_droq {
328 328
329 u32 cpu_id; 329 u32 cpu_id;
330 330
331 struct call_single_data csd; 331 call_single_data_t csd;
332}; 332};
333 333
334#define OCT_DROQ_SIZE (sizeof(struct octeon_droq)) 334#define OCT_DROQ_SIZE (sizeof(struct octeon_droq))
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 3d424a51cabb..f0fd3adb1693 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -446,14 +446,14 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
446 446
447 ovl_path_upper(dentry, &upperpath); 447 ovl_path_upper(dentry, &upperpath);
448 realfile = ovl_path_open(&upperpath, O_RDONLY); 448 realfile = ovl_path_open(&upperpath, O_RDONLY);
449 smp_mb__before_spinlock(); 449
450 inode_lock(inode); 450 inode_lock(inode);
451 if (!od->upperfile) { 451 if (!od->upperfile) {
452 if (IS_ERR(realfile)) { 452 if (IS_ERR(realfile)) {
453 inode_unlock(inode); 453 inode_unlock(inode);
454 return PTR_ERR(realfile); 454 return PTR_ERR(realfile);
455 } 455 }
456 od->upperfile = realfile; 456 smp_store_release(&od->upperfile, realfile);
457 } else { 457 } else {
458 /* somebody has beaten us to it */ 458 /* somebody has beaten us to it */
459 if (!IS_ERR(realfile)) 459 if (!IS_ERR(realfile))
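With the barrier before inode_lock() gone, the store of od->upperfile above now uses smp_store_release() so that anyone who sees the pointer without holding the lock also sees the fully constructed file. A hedged sketch of that publish/consume pairing with illustrative types:

	#include <linux/fs.h>
	#include <asm/barrier.h>

	struct demo_dir_state {				/* illustrative stand-in */
		struct file *upperfile;
	};

	/* Writer: publish only after 'f' is fully set up. */
	static void demo_publish(struct demo_dir_state *s, struct file *f)
	{
		smp_store_release(&s->upperfile, f);
	}

	/* Lockless reader: pairs with the release store above. */
	static struct file *demo_peek(struct demo_dir_state *s)
	{
		return smp_load_acquire(&s->upperfile);
	}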
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index b0d5897bc4e6..886085b47c75 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -109,27 +109,24 @@ static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
109 goto out; 109 goto out;
110 WRITE_ONCE(uwq->waken, true); 110 WRITE_ONCE(uwq->waken, true);
111 /* 111 /*
112 * The implicit smp_mb__before_spinlock in try_to_wake_up() 112 * The Program-Order guarantees provided by the scheduler
113 * renders uwq->waken visible to other CPUs before the task is 113 * ensure uwq->waken is visible before the task is woken.
114 * waken.
115 */ 114 */
116 ret = wake_up_state(wq->private, mode); 115 ret = wake_up_state(wq->private, mode);
117 if (ret) 116 if (ret) {
118 /* 117 /*
119 * Wake only once, autoremove behavior. 118 * Wake only once, autoremove behavior.
120 * 119 *
121 * After the effect of list_del_init is visible to the 120 * After the effect of list_del_init is visible to the other
122 * other CPUs, the waitqueue may disappear from under 121 * CPUs, the waitqueue may disappear from under us, see the
123 * us, see the !list_empty_careful() in 122 * !list_empty_careful() in handle_userfault().
124 * handle_userfault(). try_to_wake_up() has an 123 *
125 * implicit smp_mb__before_spinlock, and the 124 * try_to_wake_up() has an implicit smp_mb(), and the
126 * wq->private is read before calling the extern 125 * wq->private is read before calling the extern function
127 	 * function "wake_up_state" (which in turn calls		126 	 * "wake_up_state" (which in turn calls try_to_wake_up).
128 * try_to_wake_up). While the spin_lock;spin_unlock;
129 * wouldn't be enough, the smp_mb__before_spinlock is
130 * enough to avoid an explicit smp_mb() here.
131 */ 127 */
132 list_del_init(&wq->entry); 128 list_del_init(&wq->entry);
129 }
133out: 130out:
134 return ret; 131 return ret;
135} 132}
diff --git a/include/asm-generic/atomic64.h b/include/asm-generic/atomic64.h
index dad68bf46c77..8d28eb010d0d 100644
--- a/include/asm-generic/atomic64.h
+++ b/include/asm-generic/atomic64.h
@@ -21,6 +21,8 @@ typedef struct {
21extern long long atomic64_read(const atomic64_t *v); 21extern long long atomic64_read(const atomic64_t *v);
22extern void atomic64_set(atomic64_t *v, long long i); 22extern void atomic64_set(atomic64_t *v, long long i);
23 23
24#define atomic64_set_release(v, i) atomic64_set((v), (i))
25
24#define ATOMIC64_OP(op) \ 26#define ATOMIC64_OP(op) \
25extern void atomic64_##op(long long a, atomic64_t *v); 27extern void atomic64_##op(long long a, atomic64_t *v);
26 28
diff --git a/include/asm-generic/futex.h b/include/asm-generic/futex.h
index bf2d34c9d804..f0d8b1c51343 100644
--- a/include/asm-generic/futex.h
+++ b/include/asm-generic/futex.h
@@ -13,7 +13,7 @@
13 */ 13 */
14 14
15/** 15/**
16 * futex_atomic_op_inuser() - Atomic arithmetic operation with constant 16 * arch_futex_atomic_op_inuser() - Atomic arithmetic operation with constant
17 * argument and comparison of the previous 17 * argument and comparison of the previous
18 * futex value with another constant. 18 * futex value with another constant.
19 * 19 *
@@ -25,18 +25,11 @@
25 * <0 - On error 25 * <0 - On error
26 */ 26 */
27static inline int 27static inline int
28futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) 28arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, u32 __user *uaddr)
29{ 29{
30 int op = (encoded_op >> 28) & 7;
31 int cmp = (encoded_op >> 24) & 15;
32 int oparg = (encoded_op << 8) >> 20;
33 int cmparg = (encoded_op << 20) >> 20;
34 int oldval, ret; 30 int oldval, ret;
35 u32 tmp; 31 u32 tmp;
36 32
37 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
38 oparg = 1 << oparg;
39
40 preempt_disable(); 33 preempt_disable();
41 pagefault_disable(); 34 pagefault_disable();
42 35
@@ -74,17 +67,9 @@ out_pagefault_enable:
74 pagefault_enable(); 67 pagefault_enable();
75 preempt_enable(); 68 preempt_enable();
76 69
77 if (ret == 0) { 70 if (ret == 0)
78 switch (cmp) { 71 *oval = oldval;
79 case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; 72
80 case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
81 case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
82 case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
83 case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
84 case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
85 default: ret = -ENOSYS;
86 }
87 }
88 return ret; 73 return ret;
89} 74}
90 75
@@ -126,18 +111,9 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
126 111
127#else 112#else
128static inline int 113static inline int
129futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) 114arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, u32 __user *uaddr)
130{ 115{
131 int op = (encoded_op >> 28) & 7;
132 int cmp = (encoded_op >> 24) & 15;
133 int oparg = (encoded_op << 8) >> 20;
134 int cmparg = (encoded_op << 20) >> 20;
135 int oldval = 0, ret; 116 int oldval = 0, ret;
136 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
137 oparg = 1 << oparg;
138
139 if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
140 return -EFAULT;
141 117
142 pagefault_disable(); 118 pagefault_disable();
143 119
@@ -153,17 +129,9 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
153 129
154 pagefault_enable(); 130 pagefault_enable();
155 131
156 if (!ret) { 132 if (!ret)
157 switch (cmp) { 133 *oval = oldval;
158 case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; 134
159 case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
160 case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
161 case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
162 case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
163 case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
164 default: ret = -ENOSYS;
165 }
166 }
167 return ret; 135 return ret;
168} 136}
169 137
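
With the cmp/cmparg handling gone from the arch helper, the generic code is expected to drive it roughly as follows (a condensed sketch; the full version is the futex_atomic_op_inuser() added to kernel/futex.c further down):

	ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
	if (ret)
		return ret;		/* e.g. -EFAULT from the arch code */
	return oldval == cmparg;	/* comparison done once, in common code
					 * (shown here only for FUTEX_OP_CMP_EQ) */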
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index c56be7410130..40d6bfec0e0d 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -38,6 +38,9 @@
38 * Besides, if an arch has a special barrier for acquire/release, it could 38 * Besides, if an arch has a special barrier for acquire/release, it could
39 * implement its own __atomic_op_* and use the same framework for building 39 * implement its own __atomic_op_* and use the same framework for building
40 * variants 40 * variants
41 *
42 * If an architecture overrides __atomic_op_acquire() it will probably want
43 * to define smp_mb__after_spinlock().
41 */ 44 */
42#ifndef __atomic_op_acquire 45#ifndef __atomic_op_acquire
43#define __atomic_op_acquire(op, args...) \ 46#define __atomic_op_acquire(op, args...) \
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2a5d52fa90f5..4b99b13c7e68 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -134,7 +134,7 @@ typedef __u32 __bitwise req_flags_t;
134struct request { 134struct request {
135 struct list_head queuelist; 135 struct list_head queuelist;
136 union { 136 union {
137 struct call_single_data csd; 137 call_single_data_t csd;
138 u64 fifo_time; 138 u64 fifo_time;
139 }; 139 };
140 140
diff --git a/include/linux/completion.h b/include/linux/completion.h
index 5d5aaae3af43..cae5400022a3 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -9,6 +9,9 @@
9 */ 9 */
10 10
11#include <linux/wait.h> 11#include <linux/wait.h>
12#ifdef CONFIG_LOCKDEP_COMPLETIONS
13#include <linux/lockdep.h>
14#endif
12 15
13/* 16/*
14 * struct completion - structure used to maintain state for a "completion" 17 * struct completion - structure used to maintain state for a "completion"
@@ -25,13 +28,53 @@
25struct completion { 28struct completion {
26 unsigned int done; 29 unsigned int done;
27 wait_queue_head_t wait; 30 wait_queue_head_t wait;
31#ifdef CONFIG_LOCKDEP_COMPLETIONS
32 struct lockdep_map_cross map;
33#endif
28}; 34};
29 35
36#ifdef CONFIG_LOCKDEP_COMPLETIONS
37static inline void complete_acquire(struct completion *x)
38{
39 lock_acquire_exclusive((struct lockdep_map *)&x->map, 0, 0, NULL, _RET_IP_);
40}
41
42static inline void complete_release(struct completion *x)
43{
44 lock_release((struct lockdep_map *)&x->map, 0, _RET_IP_);
45}
46
47static inline void complete_release_commit(struct completion *x)
48{
49 lock_commit_crosslock((struct lockdep_map *)&x->map);
50}
51
52#define init_completion(x) \
53do { \
54 static struct lock_class_key __key; \
55 lockdep_init_map_crosslock((struct lockdep_map *)&(x)->map, \
56 "(complete)" #x, \
57 &__key, 0); \
58 __init_completion(x); \
59} while (0)
60#else
61#define init_completion(x) __init_completion(x)
62static inline void complete_acquire(struct completion *x) {}
63static inline void complete_release(struct completion *x) {}
64static inline void complete_release_commit(struct completion *x) {}
65#endif
66
67#ifdef CONFIG_LOCKDEP_COMPLETIONS
68#define COMPLETION_INITIALIZER(work) \
69 { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait), \
70 STATIC_CROSS_LOCKDEP_MAP_INIT("(complete)" #work, &(work)) }
71#else
30#define COMPLETION_INITIALIZER(work) \ 72#define COMPLETION_INITIALIZER(work) \
31 { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) } 73 { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
74#endif
32 75
33#define COMPLETION_INITIALIZER_ONSTACK(work) \ 76#define COMPLETION_INITIALIZER_ONSTACK(work) \
34 ({ init_completion(&work); work; }) 77 (*({ init_completion(&work); &work; }))
35 78
36/** 79/**
37 * DECLARE_COMPLETION - declare and initialize a completion structure 80 * DECLARE_COMPLETION - declare and initialize a completion structure
@@ -70,7 +113,7 @@ struct completion {
70 * This inline function will initialize a dynamically created completion 113 * This inline function will initialize a dynamically created completion
71 * structure. 114 * structure.
72 */ 115 */
73static inline void init_completion(struct completion *x) 116static inline void __init_completion(struct completion *x)
74{ 117{
75 x->done = 0; 118 x->done = 0;
76 init_waitqueue_head(&x->wait); 119 init_waitqueue_head(&x->wait);
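
Callers keep using init_completion() as before; the split only adds the cross-release map behind CONFIG_LOCKDEP_COMPLETIONS. A brief usage sketch (my_ctx and its work item are placeholders):

static int my_start_and_wait(struct my_ctx *ctx)
{
	init_completion(&ctx->done);		/* with LOCKDEP_COMPLETIONS this also
						 * runs lockdep_init_map_crosslock() */
	queue_work(system_wq, &ctx->work);	/* the worker calls complete(&ctx->done) */
	wait_for_completion(&ctx->done);	/* wrapped by complete_acquire() /
						 * complete_release() for the validator */
	return 0;
}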
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 898cfe2eeb42..e74655d941b7 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -37,12 +37,6 @@ static inline bool cpusets_enabled(void)
37 return static_branch_unlikely(&cpusets_enabled_key); 37 return static_branch_unlikely(&cpusets_enabled_key);
38} 38}
39 39
40static inline int nr_cpusets(void)
41{
42 /* jump label reference count + the top-level cpuset */
43 return static_key_count(&cpusets_enabled_key.key) + 1;
44}
45
46static inline void cpuset_inc(void) 40static inline void cpuset_inc(void)
47{ 41{
48 static_branch_inc(&cpusets_pre_enable_key); 42 static_branch_inc(&cpusets_pre_enable_key);
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 7c5b694864cd..f36bfd26f998 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -54,7 +54,6 @@ union futex_key {
54 54
55#ifdef CONFIG_FUTEX 55#ifdef CONFIG_FUTEX
56extern void exit_robust_list(struct task_struct *curr); 56extern void exit_robust_list(struct task_struct *curr);
57extern void exit_pi_state_list(struct task_struct *curr);
58#ifdef CONFIG_HAVE_FUTEX_CMPXCHG 57#ifdef CONFIG_HAVE_FUTEX_CMPXCHG
59#define futex_cmpxchg_enabled 1 58#define futex_cmpxchg_enabled 1
60#else 59#else
@@ -64,8 +63,14 @@ extern int futex_cmpxchg_enabled;
64static inline void exit_robust_list(struct task_struct *curr) 63static inline void exit_robust_list(struct task_struct *curr)
65{ 64{
66} 65}
66#endif
67
68#ifdef CONFIG_FUTEX_PI
69extern void exit_pi_state_list(struct task_struct *curr);
70#else
67static inline void exit_pi_state_list(struct task_struct *curr) 71static inline void exit_pi_state_list(struct task_struct *curr)
68{ 72{
69} 73}
70#endif 74#endif
75
71#endif 76#endif
diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 5dd1272d1ab2..5fdd93bb9300 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -23,10 +23,26 @@
23# define trace_softirq_context(p) ((p)->softirq_context) 23# define trace_softirq_context(p) ((p)->softirq_context)
24# define trace_hardirqs_enabled(p) ((p)->hardirqs_enabled) 24# define trace_hardirqs_enabled(p) ((p)->hardirqs_enabled)
25# define trace_softirqs_enabled(p) ((p)->softirqs_enabled) 25# define trace_softirqs_enabled(p) ((p)->softirqs_enabled)
26# define trace_hardirq_enter() do { current->hardirq_context++; } while (0) 26# define trace_hardirq_enter() \
27# define trace_hardirq_exit() do { current->hardirq_context--; } while (0) 27do { \
28# define lockdep_softirq_enter() do { current->softirq_context++; } while (0) 28 current->hardirq_context++; \
29# define lockdep_softirq_exit() do { current->softirq_context--; } while (0) 29 crossrelease_hist_start(XHLOCK_HARD); \
30} while (0)
31# define trace_hardirq_exit() \
32do { \
33 current->hardirq_context--; \
34 crossrelease_hist_end(XHLOCK_HARD); \
35} while (0)
36# define lockdep_softirq_enter() \
37do { \
38 current->softirq_context++; \
39 crossrelease_hist_start(XHLOCK_SOFT); \
40} while (0)
41# define lockdep_softirq_exit() \
42do { \
43 current->softirq_context--; \
44 crossrelease_hist_end(XHLOCK_SOFT); \
45} while (0)
30# define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1, 46# define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1,
31#else 47#else
32# define trace_hardirqs_on() do { } while (0) 48# define trace_hardirqs_on() do { } while (0)
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 2afd74b9d844..cd5861651b17 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -163,6 +163,8 @@ extern void jump_label_apply_nops(struct module *mod);
163extern int static_key_count(struct static_key *key); 163extern int static_key_count(struct static_key *key);
164extern void static_key_enable(struct static_key *key); 164extern void static_key_enable(struct static_key *key);
165extern void static_key_disable(struct static_key *key); 165extern void static_key_disable(struct static_key *key);
166extern void static_key_enable_cpuslocked(struct static_key *key);
167extern void static_key_disable_cpuslocked(struct static_key *key);
166 168
167/* 169/*
168 * We should be using ATOMIC_INIT() for initializing .enabled, but 170 * We should be using ATOMIC_INIT() for initializing .enabled, but
@@ -234,24 +236,29 @@ static inline int jump_label_apply_nops(struct module *mod)
234 236
235static inline void static_key_enable(struct static_key *key) 237static inline void static_key_enable(struct static_key *key)
236{ 238{
237 int count = static_key_count(key); 239 STATIC_KEY_CHECK_USE();
238
239 WARN_ON_ONCE(count < 0 || count > 1);
240 240
241 if (!count) 241 if (atomic_read(&key->enabled) != 0) {
242 static_key_slow_inc(key); 242 WARN_ON_ONCE(atomic_read(&key->enabled) != 1);
243 return;
244 }
245 atomic_set(&key->enabled, 1);
243} 246}
244 247
245static inline void static_key_disable(struct static_key *key) 248static inline void static_key_disable(struct static_key *key)
246{ 249{
247 int count = static_key_count(key); 250 STATIC_KEY_CHECK_USE();
248
249 WARN_ON_ONCE(count < 0 || count > 1);
250 251
251 if (count) 252 if (atomic_read(&key->enabled) != 1) {
252 static_key_slow_dec(key); 253 WARN_ON_ONCE(atomic_read(&key->enabled) != 0);
254 return;
255 }
256 atomic_set(&key->enabled, 0);
253} 257}
254 258
259#define static_key_enable_cpuslocked(k) static_key_enable((k))
260#define static_key_disable_cpuslocked(k) static_key_disable((k))
261
255#define STATIC_KEY_INIT_TRUE { .enabled = ATOMIC_INIT(1) } 262#define STATIC_KEY_INIT_TRUE { .enabled = ATOMIC_INIT(1) }
256#define STATIC_KEY_INIT_FALSE { .enabled = ATOMIC_INIT(0) } 263#define STATIC_KEY_INIT_FALSE { .enabled = ATOMIC_INIT(0) }
257 264
@@ -413,8 +420,10 @@ extern bool ____wrong_branch_error(void);
413 * Normal usage; boolean enable/disable. 420 * Normal usage; boolean enable/disable.
414 */ 421 */
415 422
416#define static_branch_enable(x) static_key_enable(&(x)->key) 423#define static_branch_enable(x) static_key_enable(&(x)->key)
417#define static_branch_disable(x) static_key_disable(&(x)->key) 424#define static_branch_disable(x) static_key_disable(&(x)->key)
425#define static_branch_enable_cpuslocked(x) static_key_enable_cpuslocked(&(x)->key)
426#define static_branch_disable_cpuslocked(x) static_key_disable_cpuslocked(&(x)->key)
418 427
419#endif /* __ASSEMBLY__ */ 428#endif /* __ASSEMBLY__ */
420 429
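
The new *_cpuslocked variants are for callers that already hold the CPU hotplug lock and therefore must not let static_key_enable() take cpus_read_lock() again. A sketch with a hypothetical key flipped from a hotplug callback:

static DEFINE_STATIC_KEY_FALSE(my_feature_key);		/* hypothetical key */

static int my_feature_cpu_online(unsigned int cpu)
{
	/* cpuhp callbacks run with the hotplug lock held, so use the
	 * _cpuslocked form instead of static_branch_enable(). */
	static_branch_enable_cpuslocked(&my_feature_key);
	return 0;
}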
diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h
index b7f8aced7870..41960fecf783 100644
--- a/include/linux/kasan-checks.h
+++ b/include/linux/kasan-checks.h
@@ -2,11 +2,13 @@
2#define _LINUX_KASAN_CHECKS_H 2#define _LINUX_KASAN_CHECKS_H
3 3
4#ifdef CONFIG_KASAN 4#ifdef CONFIG_KASAN
5void kasan_check_read(const void *p, unsigned int size); 5void kasan_check_read(const volatile void *p, unsigned int size);
6void kasan_check_write(const void *p, unsigned int size); 6void kasan_check_write(const volatile void *p, unsigned int size);
7#else 7#else
8static inline void kasan_check_read(const void *p, unsigned int size) { } 8static inline void kasan_check_read(const volatile void *p, unsigned int size)
9static inline void kasan_check_write(const void *p, unsigned int size) { } 9{ }
10static inline void kasan_check_write(const volatile void *p, unsigned int size)
11{ }
10#endif 12#endif
11 13
12#endif 14#endif
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index bd6d96cf80b1..6607225d0ea4 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -277,6 +277,13 @@ extern int oops_may_print(void);
277void do_exit(long error_code) __noreturn; 277void do_exit(long error_code) __noreturn;
278void complete_and_exit(struct completion *, long) __noreturn; 278void complete_and_exit(struct completion *, long) __noreturn;
279 279
280#ifdef CONFIG_ARCH_HAS_REFCOUNT
281void refcount_error_report(struct pt_regs *regs, const char *err);
282#else
283static inline void refcount_error_report(struct pt_regs *regs, const char *err)
284{ }
285#endif
286
280/* Internal, do not use. */ 287/* Internal, do not use. */
281int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res); 288int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);
282int __must_check _kstrtol(const char *s, unsigned int base, long *res); 289int __must_check _kstrtol(const char *s, unsigned int base, long *res);
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index fffe49f188e6..bfa8e0b0d6f1 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -18,6 +18,8 @@ extern int lock_stat;
18 18
19#define MAX_LOCKDEP_SUBCLASSES 8UL 19#define MAX_LOCKDEP_SUBCLASSES 8UL
20 20
21#include <linux/types.h>
22
21#ifdef CONFIG_LOCKDEP 23#ifdef CONFIG_LOCKDEP
22 24
23#include <linux/linkage.h> 25#include <linux/linkage.h>
@@ -29,7 +31,7 @@ extern int lock_stat;
29 * We'd rather not expose kernel/lockdep_states.h this wide, but we do need 31 * We'd rather not expose kernel/lockdep_states.h this wide, but we do need
30 * the total number of states... :-( 32 * the total number of states... :-(
31 */ 33 */
32#define XXX_LOCK_USAGE_STATES (1+3*4) 34#define XXX_LOCK_USAGE_STATES (1+2*4)
33 35
34/* 36/*
35 * NR_LOCKDEP_CACHING_CLASSES ... Number of classes 37 * NR_LOCKDEP_CACHING_CLASSES ... Number of classes
@@ -155,6 +157,12 @@ struct lockdep_map {
155 int cpu; 157 int cpu;
156 unsigned long ip; 158 unsigned long ip;
157#endif 159#endif
160#ifdef CONFIG_LOCKDEP_CROSSRELEASE
161 /*
162 * Whether it's a crosslock.
163 */
164 int cross;
165#endif
158}; 166};
159 167
160static inline void lockdep_copy_map(struct lockdep_map *to, 168static inline void lockdep_copy_map(struct lockdep_map *to,
@@ -258,8 +266,95 @@ struct held_lock {
258 unsigned int hardirqs_off:1; 266 unsigned int hardirqs_off:1;
259 unsigned int references:12; /* 32 bits */ 267 unsigned int references:12; /* 32 bits */
260 unsigned int pin_count; 268 unsigned int pin_count;
269#ifdef CONFIG_LOCKDEP_CROSSRELEASE
270 /*
271 * Generation id.
272 *
273 * A value of cross_gen_id will be stored when holding this,
274 * which is globally increased whenever each crosslock is held.
275 */
276 unsigned int gen_id;
277#endif
278};
279
280#ifdef CONFIG_LOCKDEP_CROSSRELEASE
281#define MAX_XHLOCK_TRACE_ENTRIES 5
282
283/*
284 * This is for keeping locks waiting for commit so that true dependencies
285 * can be added at the commit step.
286 */
287struct hist_lock {
288 /*
289 * Id for each entry in the ring buffer. This is used to
290 * decide whether the ring buffer was overwritten or not.
291 *
292 * For example,
293 *
294 * |<----------- hist_lock ring buffer size ------->|
295 * pppppppppppppppppppppiiiiiiiiiiiiiiiiiiiiiiiiiiiii
296 * wrapped > iiiiiiiiiiiiiiiiiiiiiiiiiii.......................
297 *
298 * where 'p' represents an acquisition in process
299 * context, 'i' represents an acquisition in irq
300 * context.
301 *
302 * In this example, the ring buffer was overwritten by
303	 * acquisitions in irq context, which should be detected on
304 * rollback or commit.
305 */
306 unsigned int hist_id;
307
308 /*
309	 * Separate stack_trace data. This will be used at the commit step.
310 */
311 struct stack_trace trace;
312 unsigned long trace_entries[MAX_XHLOCK_TRACE_ENTRIES];
313
314 /*
315	 * Separate hlock instance. This will be used at the commit step.
316 *
317 * TODO: Use a smaller data structure containing only necessary
318 * data. However, we should make lockdep code able to handle the
319 * smaller one first.
320 */
321 struct held_lock hlock;
322};
323
324/*
325 * To initialize a lock as crosslock, lockdep_init_map_crosslock() should
326 * be called instead of lockdep_init_map().
327 */
328struct cross_lock {
329 /*
330 * When more than one acquisition of crosslocks are overlapped,
331 * we have to perform commit for them based on cross_gen_id of
332 * the first acquisition, which allows us to add more true
333 * dependencies.
334 *
335 * Moreover, when no acquisition of a crosslock is in progress,
336 * we should not perform commit because the lock might not exist
337 * any more, which might cause incorrect memory access. So we
338 * have to track the number of acquisitions of a crosslock.
339 */
340 int nr_acquire;
341
342 /*
343	 * Separate hlock instance. This will be used at the commit step.
344 *
345 * TODO: Use a smaller data structure containing only necessary
346 * data. However, we should make lockdep code able to handle the
347 * smaller one first.
348 */
349 struct held_lock hlock;
261}; 350};
262 351
352struct lockdep_map_cross {
353 struct lockdep_map map;
354 struct cross_lock xlock;
355};
356#endif
357
263/* 358/*
264 * Initialization, self-test and debugging-output methods: 359 * Initialization, self-test and debugging-output methods:
265 */ 360 */
@@ -282,13 +377,6 @@ extern void lockdep_init_map(struct lockdep_map *lock, const char *name,
282 struct lock_class_key *key, int subclass); 377 struct lock_class_key *key, int subclass);
283 378
284/* 379/*
285 * To initialize a lockdep_map statically use this macro.
286 * Note that _name must not be NULL.
287 */
288#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \
289 { .name = (_name), .key = (void *)(_key), }
290
291/*
292 * Reinitialize a lock key - for cases where there is special locking or 380 * Reinitialize a lock key - for cases where there is special locking or
293 * special initialization of locks so that the validator gets the scope 381 * special initialization of locks so that the validator gets the scope
294 * of dependencies wrong: they are either too broad (they need a class-split) 382 * of dependencies wrong: they are either too broad (they need a class-split)
@@ -363,10 +451,6 @@ static inline void lock_set_subclass(struct lockdep_map *lock,
363 451
364extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip); 452extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip);
365 453
366extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask);
367extern void lockdep_clear_current_reclaim_state(void);
368extern void lockdep_trace_alloc(gfp_t mask);
369
370struct pin_cookie { unsigned int val; }; 454struct pin_cookie { unsigned int val; };
371 455
372#define NIL_COOKIE (struct pin_cookie){ .val = 0U, } 456#define NIL_COOKIE (struct pin_cookie){ .val = 0U, }
@@ -375,7 +459,7 @@ extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock);
375extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie); 459extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie);
376extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); 460extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);
377 461
378# define INIT_LOCKDEP .lockdep_recursion = 0, .lockdep_reclaim_gfp = 0, 462# define INIT_LOCKDEP .lockdep_recursion = 0,
379 463
380#define lockdep_depth(tsk) (debug_locks ? (tsk)->lockdep_depth : 0) 464#define lockdep_depth(tsk) (debug_locks ? (tsk)->lockdep_depth : 0)
381 465
@@ -416,9 +500,6 @@ static inline void lockdep_on(void)
416# define lock_downgrade(l, i) do { } while (0) 500# define lock_downgrade(l, i) do { } while (0)
417# define lock_set_class(l, n, k, s, i) do { } while (0) 501# define lock_set_class(l, n, k, s, i) do { } while (0)
418# define lock_set_subclass(l, s, i) do { } while (0) 502# define lock_set_subclass(l, s, i) do { } while (0)
419# define lockdep_set_current_reclaim_state(g) do { } while (0)
420# define lockdep_clear_current_reclaim_state() do { } while (0)
421# define lockdep_trace_alloc(g) do { } while (0)
422# define lockdep_info() do { } while (0) 503# define lockdep_info() do { } while (0)
423# define lockdep_init_map(lock, name, key, sub) \ 504# define lockdep_init_map(lock, name, key, sub) \
424 do { (void)(name); (void)(key); } while (0) 505 do { (void)(name); (void)(key); } while (0)
@@ -467,6 +548,58 @@ struct pin_cookie { };
467 548
468#endif /* !LOCKDEP */ 549#endif /* !LOCKDEP */
469 550
551enum xhlock_context_t {
552 XHLOCK_HARD,
553 XHLOCK_SOFT,
554 XHLOCK_CTX_NR,
555};
556
557#ifdef CONFIG_LOCKDEP_CROSSRELEASE
558extern void lockdep_init_map_crosslock(struct lockdep_map *lock,
559 const char *name,
560 struct lock_class_key *key,
561 int subclass);
562extern void lock_commit_crosslock(struct lockdep_map *lock);
563
564/*
565 * What we essentially have to initialize is 'nr_acquire'. Other members
566 * will be initialized in add_xlock().
567 */
568#define STATIC_CROSS_LOCK_INIT() \
569 { .nr_acquire = 0,}
570
571#define STATIC_CROSS_LOCKDEP_MAP_INIT(_name, _key) \
572 { .map.name = (_name), .map.key = (void *)(_key), \
573 .map.cross = 1, .xlock = STATIC_CROSS_LOCK_INIT(), }
574
575/*
576 * To initialize a lockdep_map statically use this macro.
577 * Note that _name must not be NULL.
578 */
579#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \
580 { .name = (_name), .key = (void *)(_key), .cross = 0, }
581
582extern void crossrelease_hist_start(enum xhlock_context_t c);
583extern void crossrelease_hist_end(enum xhlock_context_t c);
584extern void lockdep_invariant_state(bool force);
585extern void lockdep_init_task(struct task_struct *task);
586extern void lockdep_free_task(struct task_struct *task);
587#else /* !CROSSRELEASE */
588#define lockdep_init_map_crosslock(m, n, k, s) do {} while (0)
589/*
590 * To initialize a lockdep_map statically use this macro.
591 * Note that _name must not be NULL.
592 */
593#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \
594 { .name = (_name), .key = (void *)(_key), }
595
596static inline void crossrelease_hist_start(enum xhlock_context_t c) {}
597static inline void crossrelease_hist_end(enum xhlock_context_t c) {}
598static inline void lockdep_invariant_state(bool force) {}
599static inline void lockdep_init_task(struct task_struct *task) {}
600static inline void lockdep_free_task(struct task_struct *task) {}
601#endif /* CROSSRELEASE */
602
470#ifdef CONFIG_LOCK_STAT 603#ifdef CONFIG_LOCK_STAT
471 604
472extern void lock_contended(struct lockdep_map *lock, unsigned long ip); 605extern void lock_contended(struct lockdep_map *lock, unsigned long ip);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3cadee0a3508..57378c7cb5f8 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -526,26 +526,6 @@ extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
526extern void tlb_finish_mmu(struct mmu_gather *tlb, 526extern void tlb_finish_mmu(struct mmu_gather *tlb,
527 unsigned long start, unsigned long end); 527 unsigned long start, unsigned long end);
528 528
529/*
530 * Memory barriers to keep this state in sync are graciously provided by
531 * the page table locks, outside of which no page table modifications happen.
532 * The barriers are used to ensure the order between tlb_flush_pending updates,
533 * which happen while the lock is not taken, and the PTE updates, which happen
534 * while the lock is taken, are serialized.
535 */
536static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
537{
538 return atomic_read(&mm->tlb_flush_pending) > 0;
539}
540
541/*
542 * Returns true if there are two above TLB batching threads in parallel.
543 */
544static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
545{
546 return atomic_read(&mm->tlb_flush_pending) > 1;
547}
548
549static inline void init_tlb_flush_pending(struct mm_struct *mm) 529static inline void init_tlb_flush_pending(struct mm_struct *mm)
550{ 530{
551 atomic_set(&mm->tlb_flush_pending, 0); 531 atomic_set(&mm->tlb_flush_pending, 0);
@@ -554,27 +534,82 @@ static inline void init_tlb_flush_pending(struct mm_struct *mm)
554static inline void inc_tlb_flush_pending(struct mm_struct *mm) 534static inline void inc_tlb_flush_pending(struct mm_struct *mm)
555{ 535{
556 atomic_inc(&mm->tlb_flush_pending); 536 atomic_inc(&mm->tlb_flush_pending);
557
558 /* 537 /*
559 * Guarantee that the tlb_flush_pending increase does not leak into the 538 * The only time this value is relevant is when there are indeed pages
560 * critical section updating the page tables 539 * to flush. And we'll only flush pages after changing them, which
540 * requires the PTL.
541 *
542 * So the ordering here is:
543 *
544 * atomic_inc(&mm->tlb_flush_pending);
545 * spin_lock(&ptl);
546 * ...
547 * set_pte_at();
548 * spin_unlock(&ptl);
549 *
550 * spin_lock(&ptl)
551 * mm_tlb_flush_pending();
552 * ....
553 * spin_unlock(&ptl);
554 *
555 * flush_tlb_range();
556 * atomic_dec(&mm->tlb_flush_pending);
557 *
558	 * Where the increment is constrained by the PTL unlock, it thus
559 * ensures that the increment is visible if the PTE modification is
560 * visible. After all, if there is no PTE modification, nobody cares
561 * about TLB flushes either.
562 *
563 * This very much relies on users (mm_tlb_flush_pending() and
564 * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and
565 * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc
566 * locks (PPC) the unlock of one doesn't order against the lock of
567 * another PTL.
568 *
569 * The decrement is ordered by the flush_tlb_range(), such that
570 * mm_tlb_flush_pending() will not return false unless all flushes have
571 * completed.
561 */ 572 */
562 smp_mb__before_spinlock();
563} 573}
564 574
565/* Clearing is done after a TLB flush, which also provides a barrier. */
566static inline void dec_tlb_flush_pending(struct mm_struct *mm) 575static inline void dec_tlb_flush_pending(struct mm_struct *mm)
567{ 576{
568 /* 577 /*
569 * Guarantee that the tlb_flush_pending does not not leak into the 578 * See inc_tlb_flush_pending().
570 * critical section, since we must order the PTE change and changes to 579 *
571 * the pending TLB flush indication. We could have relied on TLB flush 580 * This cannot be smp_mb__before_atomic() because smp_mb() simply does
572 * as a memory barrier, but this behavior is not clearly documented. 581 * not order against TLB invalidate completion, which is what we need.
582 *
583 * Therefore we must rely on tlb_flush_*() to guarantee order.
573 */ 584 */
574 smp_mb__before_atomic();
575 atomic_dec(&mm->tlb_flush_pending); 585 atomic_dec(&mm->tlb_flush_pending);
576} 586}
577 587
588static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
589{
590 /*
591 * Must be called after having acquired the PTL; orders against that
592 * PTLs release and therefore ensures that if we observe the modified
593 * PTE we must also observe the increment from inc_tlb_flush_pending().
594 *
595 * That is, it only guarantees to return true if there is a flush
596 * pending for _this_ PTL.
597 */
598 return atomic_read(&mm->tlb_flush_pending);
599}
600
601static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
602{
603 /*
604 * Similar to mm_tlb_flush_pending(), we must have acquired the PTL
605 * for which there is a TLB flush pending in order to guarantee
606 * we've seen both that PTE modification and the increment.
607 *
608 * (no requirement on actually still holding the PTL, that is irrelevant)
609 */
610 return atomic_read(&mm->tlb_flush_pending) > 1;
611}
612
578struct vm_fault; 613struct vm_fault;
579 614
580struct vm_special_mapping { 615struct vm_special_mapping {
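
Putting the comment in inc_tlb_flush_pending() into code form, the two sides pair up roughly like this (a simplified sketch; mm, ptl, vma and the address range come from the calling context):

	/* unmap/flush side */
	inc_tlb_flush_pending(mm);

	spin_lock(ptl);
	ptep_get_and_clear(mm, addr, ptep);	/* PTE change under the PTL */
	spin_unlock(ptl);			/* publishes the increment together
						 * with the PTE modification */

	flush_tlb_range(vma, start, end);
	dec_tlb_flush_pending(mm);		/* ordered by the flush above */

	/* concurrent reader of the same PTL */
	spin_lock(ptl);
	if (mm_tlb_flush_pending(mm))
		flush_tlb_range(vma, start, end);	/* or wait for the flush */
	spin_unlock(ptl);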
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c99ba7914c0a..461bd5757af6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2774,7 +2774,7 @@ struct softnet_data {
2774 unsigned int input_queue_head ____cacheline_aligned_in_smp; 2774 unsigned int input_queue_head ____cacheline_aligned_in_smp;
2775 2775
2776 /* Elements below can be accessed between CPUs for RPS/RFS */ 2776 /* Elements below can be accessed between CPUs for RPS/RFS */
2777 struct call_single_data csd ____cacheline_aligned_in_smp; 2777 call_single_data_t csd ____cacheline_aligned_in_smp;
2778 struct softnet_data *rps_ipi_next; 2778 struct softnet_data *rps_ipi_next;
2779 unsigned int cpu; 2779 unsigned int cpu;
2780 unsigned int input_queue_tail; 2780 unsigned int input_queue_tail;
diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index 591792c8e5b0..48b7c9c68c4d 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -53,6 +53,9 @@ extern __must_check bool refcount_sub_and_test(unsigned int i, refcount_t *r);
53extern __must_check bool refcount_dec_and_test(refcount_t *r); 53extern __must_check bool refcount_dec_and_test(refcount_t *r);
54extern void refcount_dec(refcount_t *r); 54extern void refcount_dec(refcount_t *r);
55#else 55#else
56# ifdef CONFIG_ARCH_HAS_REFCOUNT
57# include <asm/refcount.h>
58# else
56static inline __must_check bool refcount_add_not_zero(unsigned int i, refcount_t *r) 59static inline __must_check bool refcount_add_not_zero(unsigned int i, refcount_t *r)
57{ 60{
58 return atomic_add_unless(&r->refs, i, 0); 61 return atomic_add_unless(&r->refs, i, 0);
@@ -87,6 +90,7 @@ static inline void refcount_dec(refcount_t *r)
87{ 90{
88 atomic_dec(&r->refs); 91 atomic_dec(&r->refs);
89} 92}
93# endif /* !CONFIG_ARCH_HAS_REFCOUNT */
90#endif /* CONFIG_REFCOUNT_FULL */ 94#endif /* CONFIG_REFCOUNT_FULL */
91 95
92extern __must_check bool refcount_dec_if_one(refcount_t *r); 96extern __must_check bool refcount_dec_if_one(refcount_t *r);
diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h
index ae0528b834cd..e784761a4443 100644
--- a/include/linux/rwsem-spinlock.h
+++ b/include/linux/rwsem-spinlock.h
@@ -32,6 +32,7 @@ struct rw_semaphore {
32#define RWSEM_UNLOCKED_VALUE 0x00000000 32#define RWSEM_UNLOCKED_VALUE 0x00000000
33 33
34extern void __down_read(struct rw_semaphore *sem); 34extern void __down_read(struct rw_semaphore *sem);
35extern int __must_check __down_read_killable(struct rw_semaphore *sem);
35extern int __down_read_trylock(struct rw_semaphore *sem); 36extern int __down_read_trylock(struct rw_semaphore *sem);
36extern void __down_write(struct rw_semaphore *sem); 37extern void __down_write(struct rw_semaphore *sem);
37extern int __must_check __down_write_killable(struct rw_semaphore *sem); 38extern int __must_check __down_write_killable(struct rw_semaphore *sem);
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index dd1d14250340..0ad7318ff299 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -44,6 +44,7 @@ struct rw_semaphore {
44}; 44};
45 45
46extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); 46extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
47extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem);
47extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); 48extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
48extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem); 49extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem);
49extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); 50extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e5fbce866073..9ba42c663fba 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -847,7 +847,17 @@ struct task_struct {
847 int lockdep_depth; 847 int lockdep_depth;
848 unsigned int lockdep_recursion; 848 unsigned int lockdep_recursion;
849 struct held_lock held_locks[MAX_LOCK_DEPTH]; 849 struct held_lock held_locks[MAX_LOCK_DEPTH];
850 gfp_t lockdep_reclaim_gfp; 850#endif
851
852#ifdef CONFIG_LOCKDEP_CROSSRELEASE
853#define MAX_XHLOCKS_NR 64UL
854 struct hist_lock *xhlocks; /* Crossrelease history locks */
855 unsigned int xhlock_idx;
856 /* For restoring at history boundaries */
857 unsigned int xhlock_idx_hist[XHLOCK_CTX_NR];
858 unsigned int hist_id;
859 /* For overwrite check at each context exit */
860 unsigned int hist_id_save[XHLOCK_CTX_NR];
851#endif 861#endif
852 862
853#ifdef CONFIG_UBSAN 863#ifdef CONFIG_UBSAN
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 2b24a6974847..2b0a281f9d26 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -167,6 +167,14 @@ static inline gfp_t current_gfp_context(gfp_t flags)
167 return flags; 167 return flags;
168} 168}
169 169
170#ifdef CONFIG_LOCKDEP
171extern void fs_reclaim_acquire(gfp_t gfp_mask);
172extern void fs_reclaim_release(gfp_t gfp_mask);
173#else
174static inline void fs_reclaim_acquire(gfp_t gfp_mask) { }
175static inline void fs_reclaim_release(gfp_t gfp_mask) { }
176#endif
177
170static inline unsigned int memalloc_noio_save(void) 178static inline unsigned int memalloc_noio_save(void)
171{ 179{
172 unsigned int flags = current->flags & PF_MEMALLOC_NOIO; 180 unsigned int flags = current->flags & PF_MEMALLOC_NOIO;
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 68123c1fe549..98b1fe027fc9 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -14,13 +14,17 @@
14#include <linux/llist.h> 14#include <linux/llist.h>
15 15
16typedef void (*smp_call_func_t)(void *info); 16typedef void (*smp_call_func_t)(void *info);
17struct call_single_data { 17struct __call_single_data {
18 struct llist_node llist; 18 struct llist_node llist;
19 smp_call_func_t func; 19 smp_call_func_t func;
20 void *info; 20 void *info;
21 unsigned int flags; 21 unsigned int flags;
22}; 22};
23 23
24/* Use __aligned() to avoid using 2 cache lines for 1 csd */
25typedef struct __call_single_data call_single_data_t
26 __aligned(sizeof(struct __call_single_data));
27
24/* total number of cpus in this system (may exceed NR_CPUS) */ 28/* total number of cpus in this system (may exceed NR_CPUS) */
25extern unsigned int total_cpus; 29extern unsigned int total_cpus;
26 30
@@ -48,7 +52,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
48 smp_call_func_t func, void *info, bool wait, 52 smp_call_func_t func, void *info, bool wait,
49 gfp_t gfp_flags); 53 gfp_t gfp_flags);
50 54
51int smp_call_function_single_async(int cpu, struct call_single_data *csd); 55int smp_call_function_single_async(int cpu, call_single_data_t *csd);
52 56
53#ifdef CONFIG_SMP 57#ifdef CONFIG_SMP
54 58
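
The typedef changes only the alignment guarantee, not the API. A sketch of a typical per-CPU user of the async variant (names made up):

static DEFINE_PER_CPU(call_single_data_t, my_csd);

static void my_remote_func(void *info)
{
	/* runs on the target CPU in IPI context */
}

static void my_kick_cpu(int cpu)
{
	call_single_data_t *csd = &per_cpu(my_csd, cpu);

	csd->func = my_remote_func;
	csd->info = NULL;
	smp_call_function_single_async(cpu, csd);
}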
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index ef018a6e4985..69e079c5ff98 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -118,16 +118,39 @@ do { \
118#endif 118#endif
119 119
120/* 120/*
121 * Despite its name it doesn't necessarily has to be a full barrier. 121 * This barrier must provide two things:
122 * It should only guarantee that a STORE before the critical section 122 *
123 * can not be reordered with LOADs and STOREs inside this section. 123 * - it must guarantee a STORE before the spin_lock() is ordered against a
124 * spin_lock() is the one-way barrier, this LOAD can not escape out 124 * LOAD after it, see the comments at its two usage sites.
125 * of the region. So the default implementation simply ensures that 125 *
126 * a STORE can not move into the critical section, smp_wmb() should 126 * - it must ensure the critical section is RCsc.
127 * serialize it with another STORE done by spin_lock(). 127 *
128 * The latter is important for cases where we observe values written by other
129 * CPUs in spin-loops, without barriers, while being subject to scheduling.
130 *
131 * CPU0 CPU1 CPU2
132 *
133 * for (;;) {
134 * if (READ_ONCE(X))
135 * break;
136 * }
137 * X=1
138 * <sched-out>
139 * <sched-in>
140 * r = X;
141 *
142 * Without transitivity it could be that CPU1 observes X!=0 and breaks the loop,
143 * then we get migrated and CPU2 sees X==0.
144 *
145 * Since most load-store architectures implement ACQUIRE with an smp_mb() after
146 * the LL/SC loop, they need no further barriers. Similarly all our TSO
147 * architectures imply an smp_mb() for each atomic instruction and equally don't
148 * need more.
149 *
150 * Architectures that can implement ACQUIRE better need to take care.
128 */ 151 */
129#ifndef smp_mb__before_spinlock 152#ifndef smp_mb__after_spinlock
130#define smp_mb__before_spinlock() smp_wmb() 153#define smp_mb__after_spinlock() do { } while (0)
131#endif 154#endif
132 155
133#ifdef CONFIG_DEBUG_SPINLOCK 156#ifdef CONFIG_DEBUG_SPINLOCK
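
The canonical user of smp_mb__after_spinlock() is the scheduler's wakeup path; condensed from try_to_wake_up(), not a verbatim copy:

	raw_spin_lock_irqsave(&p->pi_lock, flags);
	smp_mb__after_spinlock();	/* orders a prior store of the wakeup
					 * condition against the ->state load
					 * below; a no-op where spin_lock()
					 * already provides full ordering */
	if (!(p->state & state))
		goto out;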
diff --git a/init/Kconfig b/init/Kconfig
index 8514b25db21c..5f0ef850e808 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1275,12 +1275,17 @@ config BASE_FULL
1275config FUTEX 1275config FUTEX
1276 bool "Enable futex support" if EXPERT 1276 bool "Enable futex support" if EXPERT
1277 default y 1277 default y
1278 select RT_MUTEXES 1278 imply RT_MUTEXES
1279 help 1279 help
1280 Disabling this option will cause the kernel to be built without 1280 Disabling this option will cause the kernel to be built without
1281 support for "fast userspace mutexes". The resulting kernel may not 1281 support for "fast userspace mutexes". The resulting kernel may not
1282 run glibc-based applications correctly. 1282 run glibc-based applications correctly.
1283 1283
1284config FUTEX_PI
1285 bool
1286 depends on FUTEX && RT_MUTEXES
1287 default y
1288
1284config HAVE_FUTEX_CMPXCHG 1289config HAVE_FUTEX_CMPXCHG
1285 bool 1290 bool
1286 depends on FUTEX 1291 depends on FUTEX
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index df403e97b073..2f4039bafebb 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -577,6 +577,13 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
577 rcu_read_unlock(); 577 rcu_read_unlock();
578} 578}
579 579
580/* Must be called with cpuset_mutex held. */
581static inline int nr_cpusets(void)
582{
583 /* jump label reference count + the top-level cpuset */
584 return static_key_count(&cpusets_enabled_key.key) + 1;
585}
586
580/* 587/*
581 * generate_sched_domains() 588 * generate_sched_domains()
582 * 589 *
diff --git a/kernel/exit.c b/kernel/exit.c
index f9ef3ecc78c1..a35d8a17e01f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -918,6 +918,7 @@ void __noreturn do_exit(long code)
918 exit_rcu(); 918 exit_rcu();
919 exit_tasks_rcu_finish(); 919 exit_tasks_rcu_finish();
920 920
921 lockdep_free_task(tsk);
921 do_task_dead(); 922 do_task_dead();
922} 923}
923EXPORT_SYMBOL_GPL(do_exit); 924EXPORT_SYMBOL_GPL(do_exit);
diff --git a/kernel/fork.c b/kernel/fork.c
index b7e9e57b71ea..dab73d18bc4d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -484,6 +484,8 @@ void __init fork_init(void)
484 cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", 484 cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
485 NULL, free_vm_stack_cache); 485 NULL, free_vm_stack_cache);
486#endif 486#endif
487
488 lockdep_init_task(&init_task);
487} 489}
488 490
489int __weak arch_dup_task_struct(struct task_struct *dst, 491int __weak arch_dup_task_struct(struct task_struct *dst,
@@ -1700,6 +1702,7 @@ static __latent_entropy struct task_struct *copy_process(
1700 p->lockdep_depth = 0; /* no locks held yet */ 1702 p->lockdep_depth = 0; /* no locks held yet */
1701 p->curr_chain_key = 0; 1703 p->curr_chain_key = 0;
1702 p->lockdep_recursion = 0; 1704 p->lockdep_recursion = 0;
1705 lockdep_init_task(p);
1703#endif 1706#endif
1704 1707
1705#ifdef CONFIG_DEBUG_MUTEXES 1708#ifdef CONFIG_DEBUG_MUTEXES
@@ -1958,6 +1961,7 @@ bad_fork_cleanup_audit:
1958bad_fork_cleanup_perf: 1961bad_fork_cleanup_perf:
1959 perf_event_free_task(p); 1962 perf_event_free_task(p);
1960bad_fork_cleanup_policy: 1963bad_fork_cleanup_policy:
1964 lockdep_free_task(p);
1961#ifdef CONFIG_NUMA 1965#ifdef CONFIG_NUMA
1962 mpol_put(p->mempolicy); 1966 mpol_put(p->mempolicy);
1963bad_fork_cleanup_threadgroup_lock: 1967bad_fork_cleanup_threadgroup_lock:
diff --git a/kernel/futex.c b/kernel/futex.c
index f50b434756c1..3d38eaf05492 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -876,6 +876,8 @@ static struct task_struct *futex_find_get_task(pid_t pid)
876 return p; 876 return p;
877} 877}
878 878
879#ifdef CONFIG_FUTEX_PI
880
879/* 881/*
880 * This task is holding PI mutexes at exit time => bad. 882 * This task is holding PI mutexes at exit time => bad.
881 * Kernel cleans up PI-state, but userspace is likely hosed. 883 * Kernel cleans up PI-state, but userspace is likely hosed.
@@ -933,6 +935,8 @@ void exit_pi_state_list(struct task_struct *curr)
933 raw_spin_unlock_irq(&curr->pi_lock); 935 raw_spin_unlock_irq(&curr->pi_lock);
934} 936}
935 937
938#endif
939
936/* 940/*
937 * We need to check the following states: 941 * We need to check the following states:
938 * 942 *
@@ -1547,6 +1551,45 @@ out:
1547 return ret; 1551 return ret;
1548} 1552}
1549 1553
1554static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
1555{
1556 unsigned int op = (encoded_op & 0x70000000) >> 28;
1557 unsigned int cmp = (encoded_op & 0x0f000000) >> 24;
1558 int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 12);
1559 int cmparg = sign_extend32(encoded_op & 0x00000fff, 12);
1560 int oldval, ret;
1561
1562 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
1563 if (oparg < 0 || oparg > 31)
1564 return -EINVAL;
1565 oparg = 1 << oparg;
1566 }
1567
1568 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
1569 return -EFAULT;
1570
1571 ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
1572 if (ret)
1573 return ret;
1574
1575 switch (cmp) {
1576 case FUTEX_OP_CMP_EQ:
1577 return oldval == cmparg;
1578 case FUTEX_OP_CMP_NE:
1579 return oldval != cmparg;
1580 case FUTEX_OP_CMP_LT:
1581 return oldval < cmparg;
1582 case FUTEX_OP_CMP_GE:
1583 return oldval >= cmparg;
1584 case FUTEX_OP_CMP_LE:
1585 return oldval <= cmparg;
1586 case FUTEX_OP_CMP_GT:
1587 return oldval > cmparg;
1588 default:
1589 return -ENOSYS;
1590 }
1591}
1592
1550/* 1593/*
1551 * Wake up all waiters hashed on the physical page that is mapped 1594 * Wake up all waiters hashed on the physical page that is mapped
1552 * to this virtual address: 1595 * to this virtual address:
@@ -1800,6 +1843,15 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1800 struct futex_q *this, *next; 1843 struct futex_q *this, *next;
1801 DEFINE_WAKE_Q(wake_q); 1844 DEFINE_WAKE_Q(wake_q);
1802 1845
1846 /*
1847	 * When PI is not supported: return -ENOSYS if requeue_pi is true;
1848	 * consequently the compiler knows requeue_pi is always false past
1849	 * this point, which will optimize away all the conditional code
1850 * further down.
1851 */
1852 if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
1853 return -ENOSYS;
1854
1803 if (requeue_pi) { 1855 if (requeue_pi) {
1804 /* 1856 /*
1805 * Requeue PI only works on two distinct uaddrs. This 1857 * Requeue PI only works on two distinct uaddrs. This
@@ -2595,6 +2647,9 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
2595 struct futex_q q = futex_q_init; 2647 struct futex_q q = futex_q_init;
2596 int res, ret; 2648 int res, ret;
2597 2649
2650 if (!IS_ENABLED(CONFIG_FUTEX_PI))
2651 return -ENOSYS;
2652
2598 if (refill_pi_state_cache()) 2653 if (refill_pi_state_cache())
2599 return -ENOMEM; 2654 return -ENOMEM;
2600 2655
@@ -2774,6 +2829,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2774 struct futex_q *top_waiter; 2829 struct futex_q *top_waiter;
2775 int ret; 2830 int ret;
2776 2831
2832 if (!IS_ENABLED(CONFIG_FUTEX_PI))
2833 return -ENOSYS;
2834
2777retry: 2835retry:
2778 if (get_user(uval, uaddr)) 2836 if (get_user(uval, uaddr))
2779 return -EFAULT; 2837 return -EFAULT;
@@ -2984,6 +3042,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2984 struct futex_q q = futex_q_init; 3042 struct futex_q q = futex_q_init;
2985 int res, ret; 3043 int res, ret;
2986 3044
3045 if (!IS_ENABLED(CONFIG_FUTEX_PI))
3046 return -ENOSYS;
3047
2987 if (uaddr == uaddr2) 3048 if (uaddr == uaddr2)
2988 return -EINVAL; 3049 return -EINVAL;
2989 3050
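
For reference, the fields futex_atomic_op_inuser() decodes are the ones userspace packs with FUTEX_OP(). A small userspace sketch of a FUTEX_WAKE_OP call, assuming the raw syscall(2) interface:

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Atomically do "*uaddr2 += 1", wake nr_wake waiters on uaddr and, if the
 * old value of *uaddr2 was > 0, also wake nr_wake2 waiters on uaddr2. */
static long my_futex_wake_op(int *uaddr, int *uaddr2, int nr_wake, int nr_wake2)
{
	unsigned int encoded_op = FUTEX_OP(FUTEX_OP_ADD, 1, FUTEX_OP_CMP_GT, 0);

	return syscall(SYS_futex, uaddr, FUTEX_WAKE_OP, nr_wake,
		       (unsigned long)nr_wake2, uaddr2, encoded_op);
}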
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index d11c506a6ac3..0bf2e8f5244a 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -79,29 +79,7 @@ int static_key_count(struct static_key *key)
79} 79}
80EXPORT_SYMBOL_GPL(static_key_count); 80EXPORT_SYMBOL_GPL(static_key_count);
81 81
82void static_key_enable(struct static_key *key) 82static void static_key_slow_inc_cpuslocked(struct static_key *key)
83{
84 int count = static_key_count(key);
85
86 WARN_ON_ONCE(count < 0 || count > 1);
87
88 if (!count)
89 static_key_slow_inc(key);
90}
91EXPORT_SYMBOL_GPL(static_key_enable);
92
93void static_key_disable(struct static_key *key)
94{
95 int count = static_key_count(key);
96
97 WARN_ON_ONCE(count < 0 || count > 1);
98
99 if (count)
100 static_key_slow_dec(key);
101}
102EXPORT_SYMBOL_GPL(static_key_disable);
103
104void static_key_slow_inc(struct static_key *key)
105{ 83{
106 int v, v1; 84 int v, v1;
107 85
@@ -125,24 +103,87 @@ void static_key_slow_inc(struct static_key *key)
125 return; 103 return;
126 } 104 }
127 105
128 cpus_read_lock();
129 jump_label_lock(); 106 jump_label_lock();
130 if (atomic_read(&key->enabled) == 0) { 107 if (atomic_read(&key->enabled) == 0) {
131 atomic_set(&key->enabled, -1); 108 atomic_set(&key->enabled, -1);
132 jump_label_update(key); 109 jump_label_update(key);
133 atomic_set(&key->enabled, 1); 110 /*
111 * Ensure that if the above cmpxchg loop observes our positive
112 * value, it must also observe all the text changes.
113 */
114 atomic_set_release(&key->enabled, 1);
134 } else { 115 } else {
135 atomic_inc(&key->enabled); 116 atomic_inc(&key->enabled);
136 } 117 }
137 jump_label_unlock(); 118 jump_label_unlock();
119}
120
121void static_key_slow_inc(struct static_key *key)
122{
123 cpus_read_lock();
124 static_key_slow_inc_cpuslocked(key);
138 cpus_read_unlock(); 125 cpus_read_unlock();
139} 126}
140EXPORT_SYMBOL_GPL(static_key_slow_inc); 127EXPORT_SYMBOL_GPL(static_key_slow_inc);
141 128
142static void __static_key_slow_dec(struct static_key *key, 129void static_key_enable_cpuslocked(struct static_key *key)
143 unsigned long rate_limit, struct delayed_work *work) 130{
131 STATIC_KEY_CHECK_USE();
132
133 if (atomic_read(&key->enabled) > 0) {
134 WARN_ON_ONCE(atomic_read(&key->enabled) != 1);
135 return;
136 }
137
138 jump_label_lock();
139 if (atomic_read(&key->enabled) == 0) {
140 atomic_set(&key->enabled, -1);
141 jump_label_update(key);
142 /*
143 * See static_key_slow_inc().
144 */
145 atomic_set_release(&key->enabled, 1);
146 }
147 jump_label_unlock();
148}
149EXPORT_SYMBOL_GPL(static_key_enable_cpuslocked);
150
151void static_key_enable(struct static_key *key)
152{
153 cpus_read_lock();
154 static_key_enable_cpuslocked(key);
155 cpus_read_unlock();
156}
157EXPORT_SYMBOL_GPL(static_key_enable);
158
159void static_key_disable_cpuslocked(struct static_key *key)
160{
161 STATIC_KEY_CHECK_USE();
162
163 if (atomic_read(&key->enabled) != 1) {
164 WARN_ON_ONCE(atomic_read(&key->enabled) != 0);
165 return;
166 }
167
168 jump_label_lock();
169 if (atomic_cmpxchg(&key->enabled, 1, 0))
170 jump_label_update(key);
171 jump_label_unlock();
172}
173EXPORT_SYMBOL_GPL(static_key_disable_cpuslocked);
174
175void static_key_disable(struct static_key *key)
144{ 176{
145 cpus_read_lock(); 177 cpus_read_lock();
178 static_key_disable_cpuslocked(key);
179 cpus_read_unlock();
180}
181EXPORT_SYMBOL_GPL(static_key_disable);
182
183static void static_key_slow_dec_cpuslocked(struct static_key *key,
184 unsigned long rate_limit,
185 struct delayed_work *work)
186{
146 /* 187 /*
147 * The negative count check is valid even when a negative 188 * The negative count check is valid even when a negative
148 * key->enabled is in use by static_key_slow_inc(); a 189 * key->enabled is in use by static_key_slow_inc(); a
@@ -153,7 +194,6 @@ static void __static_key_slow_dec(struct static_key *key,
153 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { 194 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
154 WARN(atomic_read(&key->enabled) < 0, 195 WARN(atomic_read(&key->enabled) < 0,
155 "jump label: negative count!\n"); 196 "jump label: negative count!\n");
156 cpus_read_unlock();
157 return; 197 return;
158 } 198 }
159 199
@@ -164,6 +204,14 @@ static void __static_key_slow_dec(struct static_key *key,
164 jump_label_update(key); 204 jump_label_update(key);
165 } 205 }
166 jump_label_unlock(); 206 jump_label_unlock();
207}
208
209static void __static_key_slow_dec(struct static_key *key,
210 unsigned long rate_limit,
211 struct delayed_work *work)
212{
213 cpus_read_lock();
214 static_key_slow_dec_cpuslocked(key, rate_limit, work);
167 cpus_read_unlock(); 215 cpus_read_unlock();
168} 216}
169 217
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 7d2499bec5fe..44c8d0d17170 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -58,6 +58,10 @@
58#define CREATE_TRACE_POINTS 58#define CREATE_TRACE_POINTS
59#include <trace/events/lock.h> 59#include <trace/events/lock.h>
60 60
61#ifdef CONFIG_LOCKDEP_CROSSRELEASE
62#include <linux/slab.h>
63#endif
64
61#ifdef CONFIG_PROVE_LOCKING 65#ifdef CONFIG_PROVE_LOCKING
62int prove_locking = 1; 66int prove_locking = 1;
63module_param(prove_locking, int, 0644); 67module_param(prove_locking, int, 0644);
@@ -344,14 +348,12 @@ EXPORT_SYMBOL(lockdep_on);
344#if VERBOSE 348#if VERBOSE
345# define HARDIRQ_VERBOSE 1 349# define HARDIRQ_VERBOSE 1
346# define SOFTIRQ_VERBOSE 1 350# define SOFTIRQ_VERBOSE 1
347# define RECLAIM_VERBOSE 1
348#else 351#else
349# define HARDIRQ_VERBOSE 0 352# define HARDIRQ_VERBOSE 0
350# define SOFTIRQ_VERBOSE 0 353# define SOFTIRQ_VERBOSE 0
351# define RECLAIM_VERBOSE 0
352#endif 354#endif
353 355
354#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE 356#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE
355/* 357/*
356 * Quick filtering for interesting events: 358 * Quick filtering for interesting events:
357 */ 359 */
@@ -726,6 +728,18 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
726 return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL); 728 return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
727} 729}
728 730
731#ifdef CONFIG_LOCKDEP_CROSSRELEASE
732static void cross_init(struct lockdep_map *lock, int cross);
733static int cross_lock(struct lockdep_map *lock);
734static int lock_acquire_crosslock(struct held_lock *hlock);
735static int lock_release_crosslock(struct lockdep_map *lock);
736#else
737static inline void cross_init(struct lockdep_map *lock, int cross) {}
738static inline int cross_lock(struct lockdep_map *lock) { return 0; }
739static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; }
740static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; }
741#endif
742
729/* 743/*
730 * Register a lock's class in the hash-table, if the class is not present 744 * Register a lock's class in the hash-table, if the class is not present
731 * yet. Otherwise we look it up. We cache the result in the lock object 745 * yet. Otherwise we look it up. We cache the result in the lock object
@@ -1125,22 +1139,41 @@ print_circular_lock_scenario(struct held_lock *src,
1125 printk(KERN_CONT "\n\n"); 1139 printk(KERN_CONT "\n\n");
1126 } 1140 }
1127 1141
1128 printk(" Possible unsafe locking scenario:\n\n"); 1142 if (cross_lock(tgt->instance)) {
1129 printk(" CPU0 CPU1\n"); 1143 printk(" Possible unsafe locking scenario by crosslock:\n\n");
1130 printk(" ---- ----\n"); 1144 printk(" CPU0 CPU1\n");
1131 printk(" lock("); 1145 printk(" ---- ----\n");
1132 __print_lock_name(target); 1146 printk(" lock(");
1133 printk(KERN_CONT ");\n"); 1147 __print_lock_name(parent);
1134 printk(" lock("); 1148 printk(KERN_CONT ");\n");
1135 __print_lock_name(parent); 1149 printk(" lock(");
1136 printk(KERN_CONT ");\n"); 1150 __print_lock_name(target);
1137 printk(" lock("); 1151 printk(KERN_CONT ");\n");
1138 __print_lock_name(target); 1152 printk(" lock(");
1139 printk(KERN_CONT ");\n"); 1153 __print_lock_name(source);
1140 printk(" lock("); 1154 printk(KERN_CONT ");\n");
1141 __print_lock_name(source); 1155 printk(" unlock(");
1142 printk(KERN_CONT ");\n"); 1156 __print_lock_name(target);
1143 printk("\n *** DEADLOCK ***\n\n"); 1157 printk(KERN_CONT ");\n");
1158 printk("\n *** DEADLOCK ***\n\n");
1159 } else {
1160 printk(" Possible unsafe locking scenario:\n\n");
1161 printk(" CPU0 CPU1\n");
1162 printk(" ---- ----\n");
1163 printk(" lock(");
1164 __print_lock_name(target);
1165 printk(KERN_CONT ");\n");
1166 printk(" lock(");
1167 __print_lock_name(parent);
1168 printk(KERN_CONT ");\n");
1169 printk(" lock(");
1170 __print_lock_name(target);
1171 printk(KERN_CONT ");\n");
1172 printk(" lock(");
1173 __print_lock_name(source);
1174 printk(KERN_CONT ");\n");
1175 printk("\n *** DEADLOCK ***\n\n");
1176 }
1144} 1177}
1145 1178
1146/* 1179/*
@@ -1165,7 +1198,12 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1165 pr_warn("%s/%d is trying to acquire lock:\n", 1198 pr_warn("%s/%d is trying to acquire lock:\n",
1166 curr->comm, task_pid_nr(curr)); 1199 curr->comm, task_pid_nr(curr));
1167 print_lock(check_src); 1200 print_lock(check_src);
1168 pr_warn("\nbut task is already holding lock:\n"); 1201
1202 if (cross_lock(check_tgt->instance))
1203 pr_warn("\nbut now in release context of a crosslock acquired at the following:\n");
1204 else
1205 pr_warn("\nbut task is already holding lock:\n");
1206
1169 print_lock(check_tgt); 1207 print_lock(check_tgt);
1170 pr_warn("\nwhich lock already depends on the new lock.\n\n"); 1208 pr_warn("\nwhich lock already depends on the new lock.\n\n");
1171 pr_warn("\nthe existing dependency chain (in reverse order) is:\n"); 1209 pr_warn("\nthe existing dependency chain (in reverse order) is:\n");
@@ -1183,7 +1221,8 @@ static inline int class_equal(struct lock_list *entry, void *data)
1183static noinline int print_circular_bug(struct lock_list *this, 1221static noinline int print_circular_bug(struct lock_list *this,
1184 struct lock_list *target, 1222 struct lock_list *target,
1185 struct held_lock *check_src, 1223 struct held_lock *check_src,
1186 struct held_lock *check_tgt) 1224 struct held_lock *check_tgt,
1225 struct stack_trace *trace)
1187{ 1226{
1188 struct task_struct *curr = current; 1227 struct task_struct *curr = current;
1189 struct lock_list *parent; 1228 struct lock_list *parent;
@@ -1193,7 +1232,9 @@ static noinline int print_circular_bug(struct lock_list *this,
1193 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1232 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1194 return 0; 1233 return 0;
1195 1234
1196 if (!save_trace(&this->trace)) 1235 if (cross_lock(check_tgt->instance))
1236 this->trace = *trace;
1237 else if (!save_trace(&this->trace))
1197 return 0; 1238 return 0;
1198 1239
1199 depth = get_lock_depth(target); 1240 depth = get_lock_depth(target);
@@ -1309,6 +1350,19 @@ check_noncircular(struct lock_list *root, struct lock_class *target,
1309 return result; 1350 return result;
1310} 1351}
1311 1352
1353static noinline int
1354check_redundant(struct lock_list *root, struct lock_class *target,
1355 struct lock_list **target_entry)
1356{
1357 int result;
1358
1359 debug_atomic_inc(nr_redundant_checks);
1360
1361 result = __bfs_forwards(root, target, class_equal, target_entry);
1362
1363 return result;
1364}
1365
1312#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) 1366#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
1313/* 1367/*
1314 * Forwards and backwards subgraph searching, for the purposes of 1368 * Forwards and backwards subgraph searching, for the purposes of
@@ -1784,6 +1838,9 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
1784 if (nest) 1838 if (nest)
1785 return 2; 1839 return 2;
1786 1840
1841 if (cross_lock(prev->instance))
1842 continue;
1843
1787 return print_deadlock_bug(curr, prev, next); 1844 return print_deadlock_bug(curr, prev, next);
1788 } 1845 }
1789 return 1; 1846 return 1;
@@ -1813,20 +1870,13 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
1813 */ 1870 */
1814static int 1871static int
1815check_prev_add(struct task_struct *curr, struct held_lock *prev, 1872check_prev_add(struct task_struct *curr, struct held_lock *prev,
1816 struct held_lock *next, int distance, int *stack_saved) 1873 struct held_lock *next, int distance, struct stack_trace *trace,
1874 int (*save)(struct stack_trace *trace))
1817{ 1875{
1818 struct lock_list *entry; 1876 struct lock_list *entry;
1819 int ret; 1877 int ret;
1820 struct lock_list this; 1878 struct lock_list this;
1821 struct lock_list *uninitialized_var(target_entry); 1879 struct lock_list *uninitialized_var(target_entry);
1822 /*
1823 * Static variable, serialized by the graph_lock().
1824 *
1825 * We use this static variable to save the stack trace in case
1826 * we call into this function multiple times due to encountering
1827 * trylocks in the held lock stack.
1828 */
1829 static struct stack_trace trace;
1830 1880
1831 /* 1881 /*
1832 * Prove that the new <prev> -> <next> dependency would not 1882 * Prove that the new <prev> -> <next> dependency would not
@@ -1841,7 +1891,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1841 this.parent = NULL; 1891 this.parent = NULL;
1842 ret = check_noncircular(&this, hlock_class(prev), &target_entry); 1892 ret = check_noncircular(&this, hlock_class(prev), &target_entry);
1843 if (unlikely(!ret)) 1893 if (unlikely(!ret))
1844 return print_circular_bug(&this, target_entry, next, prev); 1894 return print_circular_bug(&this, target_entry, next, prev, trace);
1845 else if (unlikely(ret < 0)) 1895 else if (unlikely(ret < 0))
1846 return print_bfs_bug(ret); 1896 return print_bfs_bug(ret);
1847 1897
@@ -1870,15 +1920,26 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1870 if (entry->class == hlock_class(next)) { 1920 if (entry->class == hlock_class(next)) {
1871 if (distance == 1) 1921 if (distance == 1)
1872 entry->distance = 1; 1922 entry->distance = 1;
1873 return 2; 1923 return 1;
1874 } 1924 }
1875 } 1925 }
1876 1926
1877 if (!*stack_saved) { 1927 /*
1878 if (!save_trace(&trace)) 1928 * Is the <prev> -> <next> link redundant?
1879 return 0; 1929 */
1880 *stack_saved = 1; 1930 this.class = hlock_class(prev);
1931 this.parent = NULL;
1932 ret = check_redundant(&this, hlock_class(next), &target_entry);
1933 if (!ret) {
1934 debug_atomic_inc(nr_redundant);
1935 return 2;
1881 } 1936 }
1937 if (ret < 0)
1938 return print_bfs_bug(ret);
1939
1940
1941 if (save && !save(trace))
1942 return 0;
1882 1943
1883 /* 1944 /*
1884 * Ok, all validations passed, add the new lock 1945 * Ok, all validations passed, add the new lock
@@ -1886,14 +1947,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1886 */ 1947 */
1887 ret = add_lock_to_list(hlock_class(next), 1948 ret = add_lock_to_list(hlock_class(next),
1888 &hlock_class(prev)->locks_after, 1949 &hlock_class(prev)->locks_after,
1889 next->acquire_ip, distance, &trace); 1950 next->acquire_ip, distance, trace);
1890 1951
1891 if (!ret) 1952 if (!ret)
1892 return 0; 1953 return 0;
1893 1954
1894 ret = add_lock_to_list(hlock_class(prev), 1955 ret = add_lock_to_list(hlock_class(prev),
1895 &hlock_class(next)->locks_before, 1956 &hlock_class(next)->locks_before,
1896 next->acquire_ip, distance, &trace); 1957 next->acquire_ip, distance, trace);
1897 if (!ret) 1958 if (!ret)
1898 return 0; 1959 return 0;
1899 1960
@@ -1901,8 +1962,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1901 * Debugging printouts: 1962 * Debugging printouts:
1902 */ 1963 */
1903 if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { 1964 if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
1904 /* We drop graph lock, so another thread can overwrite trace. */
1905 *stack_saved = 0;
1906 graph_unlock(); 1965 graph_unlock();
1907 printk("\n new dependency: "); 1966 printk("\n new dependency: ");
1908 print_lock_name(hlock_class(prev)); 1967 print_lock_name(hlock_class(prev));
@@ -1910,9 +1969,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1910 print_lock_name(hlock_class(next)); 1969 print_lock_name(hlock_class(next));
1911 printk(KERN_CONT "\n"); 1970 printk(KERN_CONT "\n");
1912 dump_stack(); 1971 dump_stack();
1913 return graph_lock(); 1972 if (!graph_lock())
1973 return 0;
1914 } 1974 }
1915 return 1; 1975 return 2;
1916} 1976}
1917 1977
1918/* 1978/*
@@ -1925,8 +1985,9 @@ static int
1925check_prevs_add(struct task_struct *curr, struct held_lock *next) 1985check_prevs_add(struct task_struct *curr, struct held_lock *next)
1926{ 1986{
1927 int depth = curr->lockdep_depth; 1987 int depth = curr->lockdep_depth;
1928 int stack_saved = 0;
1929 struct held_lock *hlock; 1988 struct held_lock *hlock;
1989 struct stack_trace trace;
1990 int (*save)(struct stack_trace *trace) = save_trace;
1930 1991
1931 /* 1992 /*
1932 * Debugging checks. 1993 * Debugging checks.
@@ -1947,21 +2008,36 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1947 int distance = curr->lockdep_depth - depth + 1; 2008 int distance = curr->lockdep_depth - depth + 1;
1948 hlock = curr->held_locks + depth - 1; 2009 hlock = curr->held_locks + depth - 1;
1949 /* 2010 /*
1950 * Only non-recursive-read entries get new dependencies 2011 * Only non-crosslock entries get new dependencies added.
1951 * added: 2012 * Crosslock entries will be added by commit later:
1952 */ 2013 */
1953 if (hlock->read != 2 && hlock->check) { 2014 if (!cross_lock(hlock->instance)) {
1954 if (!check_prev_add(curr, hlock, next,
1955 distance, &stack_saved))
1956 return 0;
1957 /* 2015 /*
1958 * Stop after the first non-trylock entry, 2016 * Only non-recursive-read entries get new dependencies
1959 * as non-trylock entries have added their 2017 * added:
1960 * own direct dependencies already, so this
1961 * lock is connected to them indirectly:
1962 */ 2018 */
1963 if (!hlock->trylock) 2019 if (hlock->read != 2 && hlock->check) {
1964 break; 2020 int ret = check_prev_add(curr, hlock, next,
2021 distance, &trace, save);
2022 if (!ret)
2023 return 0;
2024
2025 /*
2026 * Stop saving stack_trace if save_trace() was
2027 * called at least once:
2028 */
2029 if (save && ret == 2)
2030 save = NULL;
2031
2032 /*
2033 * Stop after the first non-trylock entry,
2034 * as non-trylock entries have added their
2035 * own direct dependencies already, so this
2036 * lock is connected to them indirectly:
2037 */
2038 if (!hlock->trylock)
2039 break;
2040 }
1965 } 2041 }
1966 depth--; 2042 depth--;
1967 /* 2043 /*
@@ -2126,19 +2202,26 @@ static int check_no_collision(struct task_struct *curr,
2126} 2202}
2127 2203
2128/* 2204/*
2129 * Look up a dependency chain. If the key is not present yet then 2205 * This is for building a chain between just two different classes,
2130 * add it and return 1 - in this case the new dependency chain is 2206 * instead of adding a new hlock upon current, which is done by
2131 * validated. If the key is already hashed, return 0. 2207 * add_chain_cache().
2132 * (On return with 1 graph_lock is held.) 2208 *
2209 * This can be called in any context with two classes, while
                                 2210 * add_chain_cache() must be done within the lock owner's context
2211 * since it uses hlock which might be racy in another context.
2133 */ 2212 */
2134static inline int lookup_chain_cache(struct task_struct *curr, 2213static inline int add_chain_cache_classes(unsigned int prev,
2135 struct held_lock *hlock, 2214 unsigned int next,
2136 u64 chain_key) 2215 unsigned int irq_context,
2216 u64 chain_key)
2137{ 2217{
2138 struct lock_class *class = hlock_class(hlock);
2139 struct hlist_head *hash_head = chainhashentry(chain_key); 2218 struct hlist_head *hash_head = chainhashentry(chain_key);
2140 struct lock_chain *chain; 2219 struct lock_chain *chain;
2141 int i, j; 2220
2221 /*
2222 * Allocate a new chain entry from the static array, and add
2223 * it to the hash:
2224 */
2142 2225
2143 /* 2226 /*
2144 * We might need to take the graph lock, ensure we've got IRQs 2227 * We might need to take the graph lock, ensure we've got IRQs
@@ -2147,43 +2230,76 @@ static inline int lookup_chain_cache(struct task_struct *curr,
2147 */ 2230 */
2148 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2231 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2149 return 0; 2232 return 0;
2233
2234 if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
2235 if (!debug_locks_off_graph_unlock())
2236 return 0;
2237
2238 print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");
2239 dump_stack();
2240 return 0;
2241 }
2242
2243 chain = lock_chains + nr_lock_chains++;
2244 chain->chain_key = chain_key;
2245 chain->irq_context = irq_context;
2246 chain->depth = 2;
2247 if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
2248 chain->base = nr_chain_hlocks;
2249 nr_chain_hlocks += chain->depth;
2250 chain_hlocks[chain->base] = prev - 1;
                                 2251		chain_hlocks[chain->base + 1] = next - 1;
2252 }
2253#ifdef CONFIG_DEBUG_LOCKDEP
2150 /* 2254 /*
2151 * We can walk it lock-free, because entries only get added 2255 * Important for check_no_collision().
2152 * to the hash:
2153 */ 2256 */
2154 hlist_for_each_entry_rcu(chain, hash_head, entry) { 2257 else {
2155 if (chain->chain_key == chain_key) { 2258 if (!debug_locks_off_graph_unlock())
2156cache_hit:
2157 debug_atomic_inc(chain_lookup_hits);
2158 if (!check_no_collision(curr, hlock, chain))
2159 return 0;
2160
2161 if (very_verbose(class))
2162 printk("\nhash chain already cached, key: "
2163 "%016Lx tail class: [%p] %s\n",
2164 (unsigned long long)chain_key,
2165 class->key, class->name);
2166 return 0; 2259 return 0;
2167 } 2260
2261 print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!");
2262 dump_stack();
2263 return 0;
2168 } 2264 }
2169 if (very_verbose(class)) 2265#endif
2170 printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", 2266
2171 (unsigned long long)chain_key, class->key, class->name); 2267 hlist_add_head_rcu(&chain->entry, hash_head);
2268 debug_atomic_inc(chain_lookup_misses);
2269 inc_chains();
2270
2271 return 1;
2272}
2273
2274/*
2275 * Adds a dependency chain into chain hashtable. And must be called with
2276 * graph_lock held.
2277 *
2278 * Return 0 if fail, and graph_lock is released.
2279 * Return 1 if succeed, with graph_lock held.
2280 */
2281static inline int add_chain_cache(struct task_struct *curr,
2282 struct held_lock *hlock,
2283 u64 chain_key)
2284{
2285 struct lock_class *class = hlock_class(hlock);
2286 struct hlist_head *hash_head = chainhashentry(chain_key);
2287 struct lock_chain *chain;
2288 int i, j;
2289
2172 /* 2290 /*
2173 * Allocate a new chain entry from the static array, and add 2291 * Allocate a new chain entry from the static array, and add
2174 * it to the hash: 2292 * it to the hash:
2175 */ 2293 */
2176 if (!graph_lock()) 2294
2177 return 0;
2178 /* 2295 /*
2179 * We have to walk the chain again locked - to avoid duplicates: 2296 * We might need to take the graph lock, ensure we've got IRQs
2297 * disabled to make this an IRQ-safe lock.. for recursion reasons
2298 * lockdep won't complain about its own locking errors.
2180 */ 2299 */
2181 hlist_for_each_entry(chain, hash_head, entry) { 2300 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2182 if (chain->chain_key == chain_key) { 2301 return 0;
2183 graph_unlock(); 2302
2184 goto cache_hit;
2185 }
2186 }
2187 if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { 2303 if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
2188 if (!debug_locks_off_graph_unlock()) 2304 if (!debug_locks_off_graph_unlock())
2189 return 0; 2305 return 0;
@@ -2235,6 +2351,78 @@ cache_hit:
2235 return 1; 2351 return 1;
2236} 2352}
2237 2353
2354/*
2355 * Look up a dependency chain.
2356 */
2357static inline struct lock_chain *lookup_chain_cache(u64 chain_key)
2358{
2359 struct hlist_head *hash_head = chainhashentry(chain_key);
2360 struct lock_chain *chain;
2361
2362 /*
2363 * We can walk it lock-free, because entries only get added
2364 * to the hash:
2365 */
2366 hlist_for_each_entry_rcu(chain, hash_head, entry) {
2367 if (chain->chain_key == chain_key) {
2368 debug_atomic_inc(chain_lookup_hits);
2369 return chain;
2370 }
2371 }
2372 return NULL;
2373}
2374
2375/*
2376 * If the key is not present yet in dependency chain cache then
2377 * add it and return 1 - in this case the new dependency chain is
2378 * validated. If the key is already hashed, return 0.
2379 * (On return with 1 graph_lock is held.)
2380 */
2381static inline int lookup_chain_cache_add(struct task_struct *curr,
2382 struct held_lock *hlock,
2383 u64 chain_key)
2384{
2385 struct lock_class *class = hlock_class(hlock);
2386 struct lock_chain *chain = lookup_chain_cache(chain_key);
2387
2388 if (chain) {
2389cache_hit:
2390 if (!check_no_collision(curr, hlock, chain))
2391 return 0;
2392
2393 if (very_verbose(class)) {
2394 printk("\nhash chain already cached, key: "
2395 "%016Lx tail class: [%p] %s\n",
2396 (unsigned long long)chain_key,
2397 class->key, class->name);
2398 }
2399
2400 return 0;
2401 }
2402
2403 if (very_verbose(class)) {
2404 printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n",
2405 (unsigned long long)chain_key, class->key, class->name);
2406 }
2407
2408 if (!graph_lock())
2409 return 0;
2410
2411 /*
2412 * We have to walk the chain again locked - to avoid duplicates:
2413 */
2414 chain = lookup_chain_cache(chain_key);
2415 if (chain) {
2416 graph_unlock();
2417 goto cache_hit;
2418 }
2419
2420 if (!add_chain_cache(curr, hlock, chain_key))
2421 return 0;
2422
2423 return 1;
2424}
2425
2238static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, 2426static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
2239 struct held_lock *hlock, int chain_head, u64 chain_key) 2427 struct held_lock *hlock, int chain_head, u64 chain_key)
2240{ 2428{
@@ -2245,11 +2433,11 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
2245 * 2433 *
2246 * We look up the chain_key and do the O(N^2) check and update of 2434 * We look up the chain_key and do the O(N^2) check and update of
2247 * the dependencies only if this is a new dependency chain. 2435 * the dependencies only if this is a new dependency chain.
2248 * (If lookup_chain_cache() returns with 1 it acquires 2436 * (If lookup_chain_cache_add() returns with 1 it acquires
2249 * graph_lock for us) 2437 * graph_lock for us)
2250 */ 2438 */
2251 if (!hlock->trylock && hlock->check && 2439 if (!hlock->trylock && hlock->check &&
2252 lookup_chain_cache(curr, hlock, chain_key)) { 2440 lookup_chain_cache_add(curr, hlock, chain_key)) {
2253 /* 2441 /*
2254 * Check whether last held lock: 2442 * Check whether last held lock:
2255 * 2443 *
@@ -2277,14 +2465,17 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
2277 * Add dependency only if this lock is not the head 2465 * Add dependency only if this lock is not the head
2278 * of the chain, and if it's not a secondary read-lock: 2466 * of the chain, and if it's not a secondary read-lock:
2279 */ 2467 */
2280 if (!chain_head && ret != 2) 2468 if (!chain_head && ret != 2) {
2281 if (!check_prevs_add(curr, hlock)) 2469 if (!check_prevs_add(curr, hlock))
2282 return 0; 2470 return 0;
2471 }
2472
2283 graph_unlock(); 2473 graph_unlock();
2284 } else 2474 } else {
2285 /* after lookup_chain_cache(): */ 2475 /* after lookup_chain_cache_add(): */
2286 if (unlikely(!debug_locks)) 2476 if (unlikely(!debug_locks))
2287 return 0; 2477 return 0;
2478 }
2288 2479
2289 return 1; 2480 return 1;
2290} 2481}
@@ -2567,14 +2758,6 @@ static int SOFTIRQ_verbose(struct lock_class *class)
2567 return 0; 2758 return 0;
2568} 2759}
2569 2760
2570static int RECLAIM_FS_verbose(struct lock_class *class)
2571{
2572#if RECLAIM_VERBOSE
2573 return class_filter(class);
2574#endif
2575 return 0;
2576}
2577
2578#define STRICT_READ_CHECKS 1 2761#define STRICT_READ_CHECKS 1
2579 2762
2580static int (*state_verbose_f[])(struct lock_class *class) = { 2763static int (*state_verbose_f[])(struct lock_class *class) = {
@@ -2870,57 +3053,6 @@ void trace_softirqs_off(unsigned long ip)
2870 debug_atomic_inc(redundant_softirqs_off); 3053 debug_atomic_inc(redundant_softirqs_off);
2871} 3054}
2872 3055
2873static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
2874{
2875 struct task_struct *curr = current;
2876
2877 if (unlikely(!debug_locks))
2878 return;
2879
2880 gfp_mask = current_gfp_context(gfp_mask);
2881
2882 /* no reclaim without waiting on it */
2883 if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
2884 return;
2885
2886 /* this guy won't enter reclaim */
2887 if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
2888 return;
2889
2890 /* We're only interested __GFP_FS allocations for now */
2891 if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS))
2892 return;
2893
2894 /*
2895 * Oi! Can't be having __GFP_FS allocations with IRQs disabled.
2896 */
2897 if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
2898 return;
2899
2900 /* Disable lockdep if explicitly requested */
2901 if (gfp_mask & __GFP_NOLOCKDEP)
2902 return;
2903
2904 mark_held_locks(curr, RECLAIM_FS);
2905}
2906
2907static void check_flags(unsigned long flags);
2908
2909void lockdep_trace_alloc(gfp_t gfp_mask)
2910{
2911 unsigned long flags;
2912
2913 if (unlikely(current->lockdep_recursion))
2914 return;
2915
2916 raw_local_irq_save(flags);
2917 check_flags(flags);
2918 current->lockdep_recursion = 1;
2919 __lockdep_trace_alloc(gfp_mask, flags);
2920 current->lockdep_recursion = 0;
2921 raw_local_irq_restore(flags);
2922}
2923
2924static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) 3056static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
2925{ 3057{
2926 /* 3058 /*
@@ -2966,22 +3098,6 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
2966 } 3098 }
2967 } 3099 }
2968 3100
2969 /*
2970 * We reuse the irq context infrastructure more broadly as a general
2971 * context checking code. This tests GFP_FS recursion (a lock taken
2972 * during reclaim for a GFP_FS allocation is held over a GFP_FS
2973 * allocation).
2974 */
2975 if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) {
2976 if (hlock->read) {
2977 if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ))
2978 return 0;
2979 } else {
2980 if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS))
2981 return 0;
2982 }
2983 }
2984
2985 return 1; 3101 return 1;
2986} 3102}
2987 3103
@@ -3040,10 +3156,6 @@ static inline int separate_irq_context(struct task_struct *curr,
3040 return 0; 3156 return 0;
3041} 3157}
3042 3158
3043void lockdep_trace_alloc(gfp_t gfp_mask)
3044{
3045}
3046
3047#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ 3159#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
3048 3160
3049/* 3161/*
@@ -3116,7 +3228,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
3116/* 3228/*
3117 * Initialize a lock instance's lock-class mapping info: 3229 * Initialize a lock instance's lock-class mapping info:
3118 */ 3230 */
3119void lockdep_init_map(struct lockdep_map *lock, const char *name, 3231static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
3120 struct lock_class_key *key, int subclass) 3232 struct lock_class_key *key, int subclass)
3121{ 3233{
3122 int i; 3234 int i;
@@ -3174,8 +3286,25 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
3174 raw_local_irq_restore(flags); 3286 raw_local_irq_restore(flags);
3175 } 3287 }
3176} 3288}
3289
3290void lockdep_init_map(struct lockdep_map *lock, const char *name,
3291 struct lock_class_key *key, int subclass)
3292{
3293 cross_init(lock, 0);
3294 __lockdep_init_map(lock, name, key, subclass);
3295}
3177EXPORT_SYMBOL_GPL(lockdep_init_map); 3296EXPORT_SYMBOL_GPL(lockdep_init_map);
3178 3297
3298#ifdef CONFIG_LOCKDEP_CROSSRELEASE
3299void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name,
3300 struct lock_class_key *key, int subclass)
3301{
3302 cross_init(lock, 1);
3303 __lockdep_init_map(lock, name, key, subclass);
3304}
3305EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock);
3306#endif
3307
3179struct lock_class_key __lockdep_no_validate__; 3308struct lock_class_key __lockdep_no_validate__;
3180EXPORT_SYMBOL_GPL(__lockdep_no_validate__); 3309EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
3181 3310
@@ -3231,6 +3360,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3231 int chain_head = 0; 3360 int chain_head = 0;
3232 int class_idx; 3361 int class_idx;
3233 u64 chain_key; 3362 u64 chain_key;
3363 int ret;
3234 3364
3235 if (unlikely(!debug_locks)) 3365 if (unlikely(!debug_locks))
3236 return 0; 3366 return 0;
@@ -3279,7 +3409,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3279 3409
3280 class_idx = class - lock_classes + 1; 3410 class_idx = class - lock_classes + 1;
3281 3411
3282 if (depth) { 3412 /* TODO: nest_lock is not implemented for crosslock yet. */
3413 if (depth && !cross_lock(lock)) {
3283 hlock = curr->held_locks + depth - 1; 3414 hlock = curr->held_locks + depth - 1;
3284 if (hlock->class_idx == class_idx && nest_lock) { 3415 if (hlock->class_idx == class_idx && nest_lock) {
3285 if (hlock->references) { 3416 if (hlock->references) {
@@ -3367,6 +3498,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3367 if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) 3498 if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
3368 return 0; 3499 return 0;
3369 3500
3501 ret = lock_acquire_crosslock(hlock);
3502 /*
3503 * 2 means normal acquire operations are needed. Otherwise, it's
3504 * ok just to return with '0:fail, 1:success'.
3505 */
3506 if (ret != 2)
3507 return ret;
3508
3370 curr->curr_chain_key = chain_key; 3509 curr->curr_chain_key = chain_key;
3371 curr->lockdep_depth++; 3510 curr->lockdep_depth++;
3372 check_chain_key(curr); 3511 check_chain_key(curr);
@@ -3604,11 +3743,19 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
3604 struct task_struct *curr = current; 3743 struct task_struct *curr = current;
3605 struct held_lock *hlock; 3744 struct held_lock *hlock;
3606 unsigned int depth; 3745 unsigned int depth;
3607 int i; 3746 int ret, i;
3608 3747
3609 if (unlikely(!debug_locks)) 3748 if (unlikely(!debug_locks))
3610 return 0; 3749 return 0;
3611 3750
3751 ret = lock_release_crosslock(lock);
3752 /*
3753 * 2 means normal release operations are needed. Otherwise, it's
3754 * ok just to return with '0:fail, 1:success'.
3755 */
3756 if (ret != 2)
3757 return ret;
3758
3612 depth = curr->lockdep_depth; 3759 depth = curr->lockdep_depth;
3613 /* 3760 /*
3614 * So we're all set to release this lock.. wait what lock? We don't 3761 * So we're all set to release this lock.. wait what lock? We don't
@@ -3952,18 +4099,6 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
3952} 4099}
3953EXPORT_SYMBOL_GPL(lock_unpin_lock); 4100EXPORT_SYMBOL_GPL(lock_unpin_lock);
3954 4101
3955void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
3956{
3957 current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask);
3958}
3959EXPORT_SYMBOL_GPL(lockdep_set_current_reclaim_state);
3960
3961void lockdep_clear_current_reclaim_state(void)
3962{
3963 current->lockdep_reclaim_gfp = 0;
3964}
3965EXPORT_SYMBOL_GPL(lockdep_clear_current_reclaim_state);
3966
3967#ifdef CONFIG_LOCK_STAT 4102#ifdef CONFIG_LOCK_STAT
3968static int 4103static int
3969print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, 4104print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
@@ -4484,6 +4619,12 @@ asmlinkage __visible void lockdep_sys_exit(void)
4484 curr->comm, curr->pid); 4619 curr->comm, curr->pid);
4485 lockdep_print_held_locks(curr); 4620 lockdep_print_held_locks(curr);
4486 } 4621 }
4622
4623 /*
4624 * The lock history for each syscall should be independent. So wipe the
4625 * slate clean on return to userspace.
4626 */
4627 lockdep_invariant_state(false);
4487} 4628}
4488 4629
4489void lockdep_rcu_suspicious(const char *file, const int line, const char *s) 4630void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
@@ -4532,3 +4673,488 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4532 dump_stack(); 4673 dump_stack();
4533} 4674}
4534EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); 4675EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
4676
4677#ifdef CONFIG_LOCKDEP_CROSSRELEASE
4678
4679/*
4680 * Crossrelease works by recording a lock history for each thread and
4681 * connecting those historic locks that were taken after the
4682 * wait_for_completion() in the complete() context.
4683 *
4684 * Task-A Task-B
4685 *
4686 * mutex_lock(&A);
4687 * mutex_unlock(&A);
4688 *
4689 * wait_for_completion(&C);
4690 * lock_acquire_crosslock();
4691 * atomic_inc_return(&cross_gen_id);
4692 * |
4693 * | mutex_lock(&B);
4694 * | mutex_unlock(&B);
4695 * |
4696 * | complete(&C);
4697 * `-- lock_commit_crosslock();
4698 *
4699 * Which will then add a dependency between B and C.
4700 */
4701
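
To make the diagram above concrete, here is a minimal, purely illustrative sketch of the kind of code crossrelease is meant to analyse; the lock, completion and function names are invented for this example and are not part of the patch:

	#include <linux/mutex.h>
	#include <linux/completion.h>

	static DEFINE_MUTEX(A);
	static DEFINE_MUTEX(B);
	static DECLARE_COMPLETION(C);

	/* Task-A: the waiter. wait_for_completion() acquires the crosslock C. */
	static void task_a(void)
	{
		mutex_lock(&A);
		mutex_unlock(&A);
		wait_for_completion(&C);
	}

	/* Task-B: the completer. complete() commits and records the C -> B edge. */
	static void task_b(void)
	{
		mutex_lock(&B);
		mutex_unlock(&B);
		complete(&C);
	}

If any other context were to wait for C while holding B, that B -> C ordering combined with the recorded C -> B edge would let lockdep report the circle, even though C is released from a different task than the one waiting on it.
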
4702#define xhlock(i) (current->xhlocks[(i) % MAX_XHLOCKS_NR])
4703
4704/*
4705 * Whenever a crosslock is held, cross_gen_id will be increased.
4706 */
4707static atomic_t cross_gen_id; /* Can be wrapped */
4708
4709/*
4710 * Make an entry of the ring buffer invalid.
4711 */
4712static inline void invalidate_xhlock(struct hist_lock *xhlock)
4713{
4714 /*
4715 * Normally, xhlock->hlock.instance must be !NULL.
4716 */
4717 xhlock->hlock.instance = NULL;
4718}
4719
4720/*
4721 * Lock history stacks; we have 2 nested lock history stacks:
4722 *
4723 * HARD(IRQ)
4724 * SOFT(IRQ)
4725 *
4726 * The thing is that once we complete a HARD/SOFT IRQ the future task locks
4727 * should not depend on any of the locks observed while running the IRQ. So
4728 * what we do is rewind the history buffer and erase all our knowledge of that
4729 * temporal event.
4730 */
4731
4732void crossrelease_hist_start(enum xhlock_context_t c)
4733{
4734 struct task_struct *cur = current;
4735
4736 if (!cur->xhlocks)
4737 return;
4738
4739 cur->xhlock_idx_hist[c] = cur->xhlock_idx;
4740 cur->hist_id_save[c] = cur->hist_id;
4741}
4742
4743void crossrelease_hist_end(enum xhlock_context_t c)
4744{
4745 struct task_struct *cur = current;
4746
4747 if (cur->xhlocks) {
4748 unsigned int idx = cur->xhlock_idx_hist[c];
4749 struct hist_lock *h = &xhlock(idx);
4750
4751 cur->xhlock_idx = idx;
4752
4753 /* Check if the ring was overwritten. */
4754 if (h->hist_id != cur->hist_id_save[c])
4755 invalidate_xhlock(h);
4756 }
4757}
4758
4759/*
4760 * lockdep_invariant_state() is used to annotate independence inside a task, to
4761 * make one task look like multiple independent 'tasks'.
4762 *
4763 * Take for instance workqueues; each work is independent of the last. The
4764 * completion of a future work does not depend on the completion of a past work
4765 * (in general). Therefore we must not carry that (lock) dependency across
4766 * works.
4767 *
4768 * This is true for many things; pretty much all kthreads fall into this
4769 * pattern, where they have an invariant state and future completions do not
                                 4770 * depend on past completions. It's just that since they all have the 'same'
4771 * form -- the kthread does the same over and over -- it doesn't typically
4772 * matter.
4773 *
4774 * The same is true for system-calls, once a system call is completed (we've
4775 * returned to userspace) the next system call does not depend on the lock
4776 * history of the previous system call.
4777 *
                                 4778 * The key property for independence, this invariant state, is that it must be
4779 * a point where we hold no locks and have no history. Because if we were to
                                 4780 * hold locks, the restore at _end() would not necessarily recover its history
4781 * entry. Similarly, independence per-definition means it does not depend on
4782 * prior state.
4783 */
4784void lockdep_invariant_state(bool force)
4785{
4786 /*
4787 * We call this at an invariant point, no current state, no history.
4788 * Verify the former, enforce the latter.
4789 */
4790 WARN_ON_ONCE(!force && current->lockdep_depth);
4791 invalidate_xhlock(&xhlock(current->xhlock_idx));
4792}
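
As a hedged illustration of how such an invariant point might be annotated, consider a work-processing loop; the loop and pick_next_work() below are invented for this sketch, and the real call sites are the workqueue and syscall-exit hooks touched elsewhere in this series:

	#include <linux/workqueue.h>
	#include <linux/lockdep.h>

	extern struct work_struct *pick_next_work(void);	/* hypothetical */

	static void worker_loop(void)
	{
		struct work_struct *work;

		while ((work = pick_next_work()) != NULL) {
			/* Each item is independent: drop the prior lock history. */
			lockdep_invariant_state(true);
			work->func(work);
		}
	}
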
4793
4794static int cross_lock(struct lockdep_map *lock)
4795{
4796 return lock ? lock->cross : 0;
4797}
4798
4799/*
4800 * This is needed to decide the relationship between wrapable variables.
4801 */
4802static inline int before(unsigned int a, unsigned int b)
4803{
4804 return (int)(a - b) < 0;
4805}
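
A short worked example of the wrap-safe comparison above, with illustrative values:

	before(5, 10)        -> (int)(5 - 10)       == -5 -> true
	before(10, 5)        -> (int)(10 - 5)       ==  5 -> false
	before(UINT_MAX, 0)  -> (int)(UINT_MAX - 0) == -1 -> true

so a gen_id recorded just before the counter wraps still compares as earlier than values generated just after the wrap.
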
4806
4807static inline struct lock_class *xhlock_class(struct hist_lock *xhlock)
4808{
4809 return hlock_class(&xhlock->hlock);
4810}
4811
4812static inline struct lock_class *xlock_class(struct cross_lock *xlock)
4813{
4814 return hlock_class(&xlock->hlock);
4815}
4816
4817/*
4818 * Should we check a dependency with previous one?
4819 */
4820static inline int depend_before(struct held_lock *hlock)
4821{
4822 return hlock->read != 2 && hlock->check && !hlock->trylock;
4823}
4824
4825/*
4826 * Should we check a dependency with next one?
4827 */
4828static inline int depend_after(struct held_lock *hlock)
4829{
4830 return hlock->read != 2 && hlock->check;
4831}
4832
4833/*
4834 * Check if the xhlock is valid, which would be false if,
4835 *
                                 4836 * 1. Has not been used after initialization yet.
4837 * 2. Got invalidated.
4838 *
                                 4839 * Remember that hist_lock is implemented as a ring buffer.
4840 */
4841static inline int xhlock_valid(struct hist_lock *xhlock)
4842{
4843 /*
4844 * xhlock->hlock.instance must be !NULL.
4845 */
4846 return !!xhlock->hlock.instance;
4847}
4848
4849/*
4850 * Record a hist_lock entry.
4851 *
                                 4852 * The only requirement is that IRQs are disabled.
4853 */
4854static void add_xhlock(struct held_lock *hlock)
4855{
4856 unsigned int idx = ++current->xhlock_idx;
4857 struct hist_lock *xhlock = &xhlock(idx);
4858
4859#ifdef CONFIG_DEBUG_LOCKDEP
4860 /*
4861 * This can be done locklessly because they are all task-local
4862 * state, we must however ensure IRQs are disabled.
4863 */
4864 WARN_ON_ONCE(!irqs_disabled());
4865#endif
4866
4867 /* Initialize hist_lock's members */
4868 xhlock->hlock = *hlock;
4869 xhlock->hist_id = ++current->hist_id;
4870
4871 xhlock->trace.nr_entries = 0;
4872 xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES;
4873 xhlock->trace.entries = xhlock->trace_entries;
4874 xhlock->trace.skip = 3;
4875 save_stack_trace(&xhlock->trace);
4876}
4877
4878static inline int same_context_xhlock(struct hist_lock *xhlock)
4879{
4880 return xhlock->hlock.irq_context == task_irq_context(current);
4881}
4882
4883/*
4884 * This should be lockless as far as possible because this would be
4885 * called very frequently.
4886 */
4887static void check_add_xhlock(struct held_lock *hlock)
4888{
4889 /*
4890 * Record a hist_lock, only in case that acquisitions ahead
4891 * could depend on the held_lock. For example, if the held_lock
                                 4892 * is trylock then acquisitions ahead never depend on that.
4893 * In that case, we don't need to record it. Just return.
4894 */
4895 if (!current->xhlocks || !depend_before(hlock))
4896 return;
4897
4898 add_xhlock(hlock);
4899}
4900
4901/*
4902 * For crosslock.
4903 */
4904static int add_xlock(struct held_lock *hlock)
4905{
4906 struct cross_lock *xlock;
4907 unsigned int gen_id;
4908
4909 if (!graph_lock())
4910 return 0;
4911
4912 xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock;
4913
4914 /*
4915 * When acquisitions for a crosslock are overlapped, we use
4916 * nr_acquire to perform commit for them, based on cross_gen_id
                                 4917 * of the first acquisition, which allows us to add additional
4918 * dependencies.
4919 *
4920 * Moreover, when no acquisition of a crosslock is in progress,
4921 * we should not perform commit because the lock might not exist
4922 * any more, which might cause incorrect memory access. So we
4923 * have to track the number of acquisitions of a crosslock.
4924 *
4925 * depend_after() is necessary to initialize only the first
4926 * valid xlock so that the xlock can be used on its commit.
4927 */
4928 if (xlock->nr_acquire++ && depend_after(&xlock->hlock))
4929 goto unlock;
4930
4931 gen_id = (unsigned int)atomic_inc_return(&cross_gen_id);
4932 xlock->hlock = *hlock;
4933 xlock->hlock.gen_id = gen_id;
4934unlock:
4935 graph_unlock();
4936 return 1;
4937}
4938
4939/*
4940 * Called for both normal and crosslock acquires. Normal locks will be
4941 * pushed on the hist_lock queue. Cross locks will record state and
4942 * stop regular lock_acquire() to avoid being placed on the held_lock
4943 * stack.
4944 *
4945 * Return: 0 - failure;
4946 * 1 - crosslock, done;
4947 * 2 - normal lock, continue to held_lock[] ops.
4948 */
4949static int lock_acquire_crosslock(struct held_lock *hlock)
4950{
4951 /*
4952 * CONTEXT 1 CONTEXT 2
4953 * --------- ---------
4954 * lock A (cross)
4955 * X = atomic_inc_return(&cross_gen_id)
4956 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4957 * Y = atomic_read_acquire(&cross_gen_id)
4958 * lock B
4959 *
4960 * atomic_read_acquire() is for ordering between A and B,
                                 4961 * IOW, A happens before B, when CONTEXT 2 sees Y >= X.
4962 *
4963 * Pairs with atomic_inc_return() in add_xlock().
4964 */
4965 hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id);
4966
4967 if (cross_lock(hlock->instance))
4968 return add_xlock(hlock);
4969
4970 check_add_xhlock(hlock);
4971 return 2;
4972}
4973
4974static int copy_trace(struct stack_trace *trace)
4975{
4976 unsigned long *buf = stack_trace + nr_stack_trace_entries;
4977 unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
4978 unsigned int nr = min(max_nr, trace->nr_entries);
4979
4980 trace->nr_entries = nr;
4981 memcpy(buf, trace->entries, nr * sizeof(trace->entries[0]));
4982 trace->entries = buf;
4983 nr_stack_trace_entries += nr;
4984
4985 if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
4986 if (!debug_locks_off_graph_unlock())
4987 return 0;
4988
4989 print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
4990 dump_stack();
4991
4992 return 0;
4993 }
4994
4995 return 1;
4996}
4997
4998static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock)
4999{
5000 unsigned int xid, pid;
5001 u64 chain_key;
5002
5003 xid = xlock_class(xlock) - lock_classes;
5004 chain_key = iterate_chain_key((u64)0, xid);
5005 pid = xhlock_class(xhlock) - lock_classes;
5006 chain_key = iterate_chain_key(chain_key, pid);
5007
5008 if (lookup_chain_cache(chain_key))
5009 return 1;
5010
5011 if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context,
5012 chain_key))
5013 return 0;
5014
5015 if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1,
5016 &xhlock->trace, copy_trace))
5017 return 0;
5018
5019 return 1;
5020}
5021
5022static void commit_xhlocks(struct cross_lock *xlock)
5023{
5024 unsigned int cur = current->xhlock_idx;
5025 unsigned int prev_hist_id = xhlock(cur).hist_id;
5026 unsigned int i;
5027
5028 if (!graph_lock())
5029 return;
5030
5031 if (xlock->nr_acquire) {
5032 for (i = 0; i < MAX_XHLOCKS_NR; i++) {
5033 struct hist_lock *xhlock = &xhlock(cur - i);
5034
5035 if (!xhlock_valid(xhlock))
5036 break;
5037
5038 if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id))
5039 break;
5040
5041 if (!same_context_xhlock(xhlock))
5042 break;
5043
5044 /*
5045 * Filter out the cases where the ring buffer was
5046 * overwritten and the current entry has a bigger
5047 * hist_id than the previous one, which is impossible
5048 * otherwise:
5049 */
5050 if (unlikely(before(prev_hist_id, xhlock->hist_id)))
5051 break;
5052
5053 prev_hist_id = xhlock->hist_id;
5054
5055 /*
5056 * commit_xhlock() returns 0 with graph_lock already
5057 * released if fail.
5058 */
5059 if (!commit_xhlock(xlock, xhlock))
5060 return;
5061 }
5062 }
5063
5064 graph_unlock();
5065}
5066
5067void lock_commit_crosslock(struct lockdep_map *lock)
5068{
5069 struct cross_lock *xlock;
5070 unsigned long flags;
5071
5072 if (unlikely(!debug_locks || current->lockdep_recursion))
5073 return;
5074
5075 if (!current->xhlocks)
5076 return;
5077
5078 /*
                                 5079 * Commit hist_locks against the cross_lock only if the cross_lock
                                 5080 * could depend on acquisitions made after it.
5081 *
5082 * For example, if the cross_lock does not have the 'check' flag
5083 * then we don't need to check dependencies and commit for that.
5084 * Just skip it. In that case, of course, the cross_lock does
5085 * not depend on acquisitions ahead, either.
5086 *
5087 * WARNING: Don't do that in add_xlock() in advance. When an
5088 * acquisition context is different from the commit context,
5089 * invalid(skipped) cross_lock might be accessed.
5090 */
5091 if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock))
5092 return;
5093
5094 raw_local_irq_save(flags);
5095 check_flags(flags);
5096 current->lockdep_recursion = 1;
5097 xlock = &((struct lockdep_map_cross *)lock)->xlock;
5098 commit_xhlocks(xlock);
5099 current->lockdep_recursion = 0;
5100 raw_local_irq_restore(flags);
5101}
5102EXPORT_SYMBOL_GPL(lock_commit_crosslock);
5103
5104/*
5105 * Return: 0 - failure;
5106 * 1 - crosslock, done;
5107 * 2 - normal lock, continue to held_lock[] ops.
5108 */
5109static int lock_release_crosslock(struct lockdep_map *lock)
5110{
5111 if (cross_lock(lock)) {
5112 if (!graph_lock())
5113 return 0;
5114 ((struct lockdep_map_cross *)lock)->xlock.nr_acquire--;
5115 graph_unlock();
5116 return 1;
5117 }
5118 return 2;
5119}
5120
5121static void cross_init(struct lockdep_map *lock, int cross)
5122{
5123 if (cross)
5124 ((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0;
5125
5126 lock->cross = cross;
5127
5128 /*
5129 * Crossrelease assumes that the ring buffer size of xhlocks
5130 * is aligned with power of 2. So force it on build.
5131 */
5132 BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1));
5133}
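
A brief note on why the build-time check above matters for the xhlock() indexing; the sizes below are illustrative:

	/*
	 * xhlock_idx is an unsigned int that is only ever incremented and is
	 * allowed to wrap. With MAX_XHLOCKS_NR a power of two, the
	 * (idx % MAX_XHLOCKS_NR) mapping stays contiguous across the wrap,
	 * e.g. for a size of 8:
	 *
	 *	0xfffffffe % 8 == 6,  0xffffffff % 8 == 7,  0x0 % 8 == 0
	 *
	 * whereas a non-power-of-two size (say 10) would jump from slot 5 to
	 * slot 0 at the wrap and silently skip ring entries.
	 */
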
5134
5135void lockdep_init_task(struct task_struct *task)
5136{
5137 int i;
5138
5139 task->xhlock_idx = UINT_MAX;
5140 task->hist_id = 0;
5141
5142 for (i = 0; i < XHLOCK_CTX_NR; i++) {
5143 task->xhlock_idx_hist[i] = UINT_MAX;
5144 task->hist_id_save[i] = 0;
5145 }
5146
5147 task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR,
5148 GFP_KERNEL);
5149}
5150
5151void lockdep_free_task(struct task_struct *task)
5152{
5153 if (task->xhlocks) {
5154 void *tmp = task->xhlocks;
                                 5155		/* Disable crossrelease for current */
5156 task->xhlocks = NULL;
5157 kfree(tmp);
5158 }
5159}
5160#endif
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index c08fbd2f5ba9..1da4669d57a7 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -143,6 +143,8 @@ struct lockdep_stats {
143 int redundant_softirqs_on; 143 int redundant_softirqs_on;
144 int redundant_softirqs_off; 144 int redundant_softirqs_off;
145 int nr_unused_locks; 145 int nr_unused_locks;
146 int nr_redundant_checks;
147 int nr_redundant;
146 int nr_cyclic_checks; 148 int nr_cyclic_checks;
147 int nr_cyclic_check_recursions; 149 int nr_cyclic_check_recursions;
148 int nr_find_usage_forwards_checks; 150 int nr_find_usage_forwards_checks;
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 6d1fcc786081..68d9e267ccd4 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -201,6 +201,10 @@ static void lockdep_stats_debug_show(struct seq_file *m)
201 debug_atomic_read(chain_lookup_hits)); 201 debug_atomic_read(chain_lookup_hits));
202 seq_printf(m, " cyclic checks: %11llu\n", 202 seq_printf(m, " cyclic checks: %11llu\n",
203 debug_atomic_read(nr_cyclic_checks)); 203 debug_atomic_read(nr_cyclic_checks));
204 seq_printf(m, " redundant checks: %11llu\n",
205 debug_atomic_read(nr_redundant_checks));
206 seq_printf(m, " redundant links: %11llu\n",
207 debug_atomic_read(nr_redundant));
204 seq_printf(m, " find-mask forwards checks: %11llu\n", 208 seq_printf(m, " find-mask forwards checks: %11llu\n",
205 debug_atomic_read(nr_find_usage_forwards_checks)); 209 debug_atomic_read(nr_find_usage_forwards_checks));
206 seq_printf(m, " find-mask backwards checks: %11llu\n", 210 seq_printf(m, " find-mask backwards checks: %11llu\n",
diff --git a/kernel/locking/lockdep_states.h b/kernel/locking/lockdep_states.h
index 995b0cc2b84c..35ca09f2ed0b 100644
--- a/kernel/locking/lockdep_states.h
+++ b/kernel/locking/lockdep_states.h
@@ -6,4 +6,3 @@
6 */ 6 */
7LOCKDEP_STATE(HARDIRQ) 7LOCKDEP_STATE(HARDIRQ)
8LOCKDEP_STATE(SOFTIRQ) 8LOCKDEP_STATE(SOFTIRQ)
9LOCKDEP_STATE(RECLAIM_FS)
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index a3167941093b..a74ee6abd039 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -109,6 +109,19 @@ bool osq_lock(struct optimistic_spin_queue *lock)
109 109
110 prev = decode_cpu(old); 110 prev = decode_cpu(old);
111 node->prev = prev; 111 node->prev = prev;
112
113 /*
114 * osq_lock() unqueue
115 *
116 * node->prev = prev osq_wait_next()
117 * WMB MB
118 * prev->next = node next->prev = prev // unqueue-C
119 *
120 * Here 'node->prev' and 'next->prev' are the same variable and we need
121 * to ensure these stores happen in-order to avoid corrupting the list.
122 */
123 smp_wmb();
124
112 WRITE_ONCE(prev->next, node); 125 WRITE_ONCE(prev->next, node);
113 126
114 /* 127 /*
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 4ccfcaae5b89..43555681c40b 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -72,7 +72,7 @@ static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
72 struct __qspinlock *l = (void *)lock; 72 struct __qspinlock *l = (void *)lock;
73 73
74 if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && 74 if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
75 (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) { 75 (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
76 qstat_inc(qstat_pv_lock_stealing, true); 76 qstat_inc(qstat_pv_lock_stealing, true);
77 return true; 77 return true;
78 } 78 }
@@ -101,16 +101,16 @@ static __always_inline void clear_pending(struct qspinlock *lock)
101 101
102/* 102/*
103 * The pending bit check in pv_queued_spin_steal_lock() isn't a memory 103 * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
104 * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock 104 * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the
105 * just to be sure that it will get it. 105 * lock just to be sure that it will get it.
106 */ 106 */
107static __always_inline int trylock_clear_pending(struct qspinlock *lock) 107static __always_inline int trylock_clear_pending(struct qspinlock *lock)
108{ 108{
109 struct __qspinlock *l = (void *)lock; 109 struct __qspinlock *l = (void *)lock;
110 110
111 return !READ_ONCE(l->locked) && 111 return !READ_ONCE(l->locked) &&
112 (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL) 112 (cmpxchg_acquire(&l->locked_pending, _Q_PENDING_VAL,
113 == _Q_PENDING_VAL); 113 _Q_LOCKED_VAL) == _Q_PENDING_VAL);
114} 114}
115#else /* _Q_PENDING_BITS == 8 */ 115#else /* _Q_PENDING_BITS == 8 */
116static __always_inline void set_pending(struct qspinlock *lock) 116static __always_inline void set_pending(struct qspinlock *lock)
@@ -138,7 +138,7 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
138 */ 138 */
139 old = val; 139 old = val;
140 new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL; 140 new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
141 val = atomic_cmpxchg(&lock->val, old, new); 141 val = atomic_cmpxchg_acquire(&lock->val, old, new);
142 142
143 if (val == old) 143 if (val == old)
144 return 1; 144 return 1;
@@ -362,8 +362,18 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
362 * observe its next->locked value and advance itself. 362 * observe its next->locked value and advance itself.
363 * 363 *
364 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node() 364 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
365 *
366 * The write to next->locked in arch_mcs_spin_unlock_contended()
367 * must be ordered before the read of pn->state in the cmpxchg()
368 * below for the code to work correctly. To guarantee full ordering
369 * irrespective of the success or failure of the cmpxchg(),
370 * a relaxed version with explicit barrier is used. The control
371 * dependency will order the reading of pn->state before any
372 * subsequent writes.
365 */ 373 */
366 if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted) 374 smp_mb__before_atomic();
375 if (cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_hashed)
376 != vcpu_halted)
367 return; 377 return;
368 378
369 /* 379 /*
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 72ad45a9a794..8d039b928d61 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -40,6 +40,9 @@ struct rt_mutex_waiter {
40/* 40/*
41 * Various helpers to access the waiters-tree: 41 * Various helpers to access the waiters-tree:
42 */ 42 */
43
44#ifdef CONFIG_RT_MUTEXES
45
43static inline int rt_mutex_has_waiters(struct rt_mutex *lock) 46static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
44{ 47{
45 return !RB_EMPTY_ROOT(&lock->waiters); 48 return !RB_EMPTY_ROOT(&lock->waiters);
@@ -69,6 +72,32 @@ task_top_pi_waiter(struct task_struct *p)
69 pi_tree_entry); 72 pi_tree_entry);
70} 73}
71 74
75#else
76
77static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
78{
79 return false;
80}
81
82static inline struct rt_mutex_waiter *
83rt_mutex_top_waiter(struct rt_mutex *lock)
84{
85 return NULL;
86}
87
88static inline int task_has_pi_waiters(struct task_struct *p)
89{
90 return false;
91}
92
93static inline struct rt_mutex_waiter *
94task_top_pi_waiter(struct task_struct *p)
95{
96 return NULL;
97}
98
99#endif
100
72/* 101/*
73 * lock->owner state tracking: 102 * lock->owner state tracking:
74 */ 103 */
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 20819df98125..0848634c5512 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -126,7 +126,7 @@ __rwsem_wake_one_writer(struct rw_semaphore *sem)
126/* 126/*
127 * get a read lock on the semaphore 127 * get a read lock on the semaphore
128 */ 128 */
129void __sched __down_read(struct rw_semaphore *sem) 129int __sched __down_read_common(struct rw_semaphore *sem, int state)
130{ 130{
131 struct rwsem_waiter waiter; 131 struct rwsem_waiter waiter;
132 unsigned long flags; 132 unsigned long flags;
@@ -140,8 +140,6 @@ void __sched __down_read(struct rw_semaphore *sem)
140 goto out; 140 goto out;
141 } 141 }
142 142
143 set_current_state(TASK_UNINTERRUPTIBLE);
144
145 /* set up my own style of waitqueue */ 143 /* set up my own style of waitqueue */
146 waiter.task = current; 144 waiter.task = current;
147 waiter.type = RWSEM_WAITING_FOR_READ; 145 waiter.type = RWSEM_WAITING_FOR_READ;
@@ -149,20 +147,41 @@ void __sched __down_read(struct rw_semaphore *sem)
149 147
150 list_add_tail(&waiter.list, &sem->wait_list); 148 list_add_tail(&waiter.list, &sem->wait_list);
151 149
152 /* we don't need to touch the semaphore struct anymore */
153 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
154
155 /* wait to be given the lock */ 150 /* wait to be given the lock */
156 for (;;) { 151 for (;;) {
157 if (!waiter.task) 152 if (!waiter.task)
158 break; 153 break;
154 if (signal_pending_state(state, current))
155 goto out_nolock;
156 set_current_state(state);
157 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
159 schedule(); 158 schedule();
160 set_current_state(TASK_UNINTERRUPTIBLE); 159 raw_spin_lock_irqsave(&sem->wait_lock, flags);
161 } 160 }
162 161
163 __set_current_state(TASK_RUNNING); 162 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
164 out: 163 out:
165 ; 164 return 0;
165
166out_nolock:
167 /*
                                 168	 * We didn't take the lock, so there must be a writer, which is
                                 169	 * either the owner or the first waiter of the sem. If it's a
                                 170	 * waiter, it will be woken by the current owner. No need to wake anybody.
171 */
172 list_del(&waiter.list);
173 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
174 return -EINTR;
175}
176
177void __sched __down_read(struct rw_semaphore *sem)
178{
179 __down_read_common(sem, TASK_UNINTERRUPTIBLE);
180}
181
182int __sched __down_read_killable(struct rw_semaphore *sem)
183{
184 return __down_read_common(sem, TASK_KILLABLE);
166} 185}
167 186
168/* 187/*
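
A hedged sketch of how the killable read path is consumed one layer up; down_read_killable() is the expected wrapper over the new primitive but is not part of the hunks shown here, and the caller is invented:

	#include <linux/errno.h>
	#include <linux/rwsem.h>

	/* Hypothetical caller: back out cleanly if a fatal signal arrives. */
	static int do_read_side_work(struct rw_semaphore *sem)
	{
		if (down_read_killable(sem))
			return -EINTR;

		/* ... read-side work under the semaphore ... */

		up_read(sem);
		return 0;
	}
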
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 34e727f18e49..02f660666ab8 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -221,8 +221,8 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
221/* 221/*
222 * Wait for the read lock to be granted 222 * Wait for the read lock to be granted
223 */ 223 */
224__visible 224static inline struct rw_semaphore __sched *
225struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) 225__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state)
226{ 226{
227 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; 227 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
228 struct rwsem_waiter waiter; 228 struct rwsem_waiter waiter;
@@ -255,17 +255,44 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
255 255
256 /* wait to be given the lock */ 256 /* wait to be given the lock */
257 while (true) { 257 while (true) {
258 set_current_state(TASK_UNINTERRUPTIBLE); 258 set_current_state(state);
259 if (!waiter.task) 259 if (!waiter.task)
260 break; 260 break;
261 if (signal_pending_state(state, current)) {
262 raw_spin_lock_irq(&sem->wait_lock);
263 if (waiter.task)
264 goto out_nolock;
265 raw_spin_unlock_irq(&sem->wait_lock);
266 break;
267 }
261 schedule(); 268 schedule();
262 } 269 }
263 270
264 __set_current_state(TASK_RUNNING); 271 __set_current_state(TASK_RUNNING);
265 return sem; 272 return sem;
273out_nolock:
274 list_del(&waiter.list);
275 if (list_empty(&sem->wait_list))
276 atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
277 raw_spin_unlock_irq(&sem->wait_lock);
278 __set_current_state(TASK_RUNNING);
279 return ERR_PTR(-EINTR);
280}
281
282__visible struct rw_semaphore * __sched
283rwsem_down_read_failed(struct rw_semaphore *sem)
284{
285 return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE);
266} 286}
267EXPORT_SYMBOL(rwsem_down_read_failed); 287EXPORT_SYMBOL(rwsem_down_read_failed);
268 288
289__visible struct rw_semaphore * __sched
290rwsem_down_read_failed_killable(struct rw_semaphore *sem)
291{
292 return __rwsem_down_read_failed_common(sem, TASK_KILLABLE);
293}
294EXPORT_SYMBOL(rwsem_down_read_failed_killable);
295
269/* 296/*
270 * This function must be called with the sem->wait_lock held to prevent 297 * This function must be called with the sem->wait_lock held to prevent
271 * race conditions between checking the rwsem wait list and setting the 298 * race conditions between checking the rwsem wait list and setting the
diff --git a/kernel/panic.c b/kernel/panic.c
index a58932b41700..bdd18afa19a4 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -26,6 +26,7 @@
26#include <linux/nmi.h> 26#include <linux/nmi.h>
27#include <linux/console.h> 27#include <linux/console.h>
28#include <linux/bug.h> 28#include <linux/bug.h>
29#include <linux/ratelimit.h>
29 30
30#define PANIC_TIMER_STEP 100 31#define PANIC_TIMER_STEP 100
31#define PANIC_BLINK_SPD 18 32#define PANIC_BLINK_SPD 18
@@ -601,6 +602,17 @@ EXPORT_SYMBOL(__stack_chk_fail);
601 602
602#endif 603#endif
603 604
605#ifdef CONFIG_ARCH_HAS_REFCOUNT
606void refcount_error_report(struct pt_regs *regs, const char *err)
607{
608 WARN_RATELIMIT(1, "refcount_t %s at %pB in %s[%d], uid/euid: %u/%u\n",
609 err, (void *)instruction_pointer(regs),
610 current->comm, task_pid_nr(current),
611 from_kuid_munged(&init_user_ns, current_uid()),
612 from_kuid_munged(&init_user_ns, current_euid()));
613}
614#endif
615
604core_param(panic, panic_timeout, int, 0644); 616core_param(panic, panic_timeout, int, 0644);
605core_param(pause_on_oops, pause_on_oops, int, 0644); 617core_param(pause_on_oops, pause_on_oops, int, 0644);
606core_param(panic_on_warn, panic_on_warn, int, 0644); 618core_param(panic_on_warn, panic_on_warn, int, 0644);
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 5d9131aa846f..cc873075c3bd 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -32,6 +32,12 @@ void complete(struct completion *x)
32 unsigned long flags; 32 unsigned long flags;
33 33
34 spin_lock_irqsave(&x->wait.lock, flags); 34 spin_lock_irqsave(&x->wait.lock, flags);
35
36 /*
37 * Perform commit of crossrelease here.
38 */
39 complete_release_commit(x);
40
35 if (x->done != UINT_MAX) 41 if (x->done != UINT_MAX)
36 x->done++; 42 x->done++;
37 __wake_up_locked(&x->wait, TASK_NORMAL, 1); 43 __wake_up_locked(&x->wait, TASK_NORMAL, 1);
@@ -99,9 +105,14 @@ __wait_for_common(struct completion *x,
99{ 105{
100 might_sleep(); 106 might_sleep();
101 107
108 complete_acquire(x);
109
102 spin_lock_irq(&x->wait.lock); 110 spin_lock_irq(&x->wait.lock);
103 timeout = do_wait_for_common(x, action, timeout, state); 111 timeout = do_wait_for_common(x, action, timeout, state);
104 spin_unlock_irq(&x->wait.lock); 112 spin_unlock_irq(&x->wait.lock);
113
114 complete_release(x);
115
105 return timeout; 116 return timeout;
106} 117}
107 118
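complete_acquire()/complete_release() bracket the waiting side and complete_release_commit() marks the completing side, so lockdep's crossrelease machinery can tie a wait_for_completion() in one context to the complete() that satisfies it in another. A minimal sketch of the dependency cycle this now makes reportable; the mutex, the completion and both thread functions are made up for illustration.

#include <linux/completion.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(example_lock);
static DECLARE_COMPLETION(example_done);

/* Thread A: waits for the completion while holding a mutex. */
static void example_waiter(void)
{
	mutex_lock(&example_lock);
	wait_for_completion(&example_done);	/* crossrelease acquire */
	mutex_unlock(&example_lock);
}

/* Thread B: needs the same mutex before it can complete().  With
 * LOCKDEP_COMPLETIONS the lock -> completion -> lock cycle is reported
 * instead of silently hanging both threads.
 */
static void example_completer(void)
{
	mutex_lock(&example_lock);
	complete(&example_done);		/* crossrelease commit */
	mutex_unlock(&example_lock);
}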
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c1fcd96cf432..6d2c7ff9ba98 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1972,8 +1972,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1972 * reordered with p->state check below. This pairs with mb() in 1972 * reordered with p->state check below. This pairs with mb() in
1973 * set_current_state() the waiting thread does. 1973 * set_current_state() the waiting thread does.
1974 */ 1974 */
1975 smp_mb__before_spinlock();
1976 raw_spin_lock_irqsave(&p->pi_lock, flags); 1975 raw_spin_lock_irqsave(&p->pi_lock, flags);
1976 smp_mb__after_spinlock();
1977 if (!(p->state & state)) 1977 if (!(p->state & state))
1978 goto out; 1978 goto out;
1979 1979
@@ -3296,8 +3296,8 @@ static void __sched notrace __schedule(bool preempt)
3296 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) 3296 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
3297 * done by the caller to avoid the race with signal_wake_up(). 3297 * done by the caller to avoid the race with signal_wake_up().
3298 */ 3298 */
3299 smp_mb__before_spinlock();
3300 rq_lock(rq, &rf); 3299 rq_lock(rq, &rf);
3300 smp_mb__after_spinlock();
3301 3301
3302 /* Promote REQ to ACT */ 3302 /* Promote REQ to ACT */
3303 rq->clock_update_flags <<= 1; 3303 rq->clock_update_flags <<= 1;
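Both scheduler hunks replace smp_mb__before_spinlock() with smp_mb__after_spinlock(): the full barrier is now issued after the lock is held rather than before it is taken. A sketch of the waker-side pattern and the ordering it provides; the condition pointer stands in for whatever the wakee tests before sleeping and is not from the patch.

#include <linux/spinlock.h>

/* The store to the wait condition must not be reordered with the later
 * p->state read; ACQUIRE semantics alone do not forbid that, hence the
 * explicit full barrier once the lock is held.
 */
static void example_wake_path(raw_spinlock_t *pi_lock, int *condition)
{
	unsigned long flags;

	WRITE_ONCE(*condition, 1);		/* [S] publish the condition */
	raw_spin_lock_irqsave(pi_lock, flags);
	smp_mb__after_spinlock();		/* order [S] before [L]      */
	/* [L] ... read p->state and decide whether a wakeup is needed ... */
	raw_spin_unlock_irqrestore(pi_lock, flags);
}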
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 25e5cb1107f3..ab1c7f5409a0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -769,7 +769,7 @@ struct rq {
769#ifdef CONFIG_SCHED_HRTICK 769#ifdef CONFIG_SCHED_HRTICK
770#ifdef CONFIG_SMP 770#ifdef CONFIG_SMP
771 int hrtick_csd_pending; 771 int hrtick_csd_pending;
772 struct call_single_data hrtick_csd; 772 call_single_data_t hrtick_csd;
773#endif 773#endif
774 struct hrtimer hrtick_timer; 774 struct hrtimer hrtick_timer;
775#endif 775#endif
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 3d5610dcce11..2227e183e202 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -33,9 +33,6 @@ void swake_up(struct swait_queue_head *q)
33{ 33{
34 unsigned long flags; 34 unsigned long flags;
35 35
36 if (!swait_active(q))
37 return;
38
39 raw_spin_lock_irqsave(&q->lock, flags); 36 raw_spin_lock_irqsave(&q->lock, flags);
40 swake_up_locked(q); 37 swake_up_locked(q);
41 raw_spin_unlock_irqrestore(&q->lock, flags); 38 raw_spin_unlock_irqrestore(&q->lock, flags);
@@ -51,9 +48,6 @@ void swake_up_all(struct swait_queue_head *q)
51 struct swait_queue *curr; 48 struct swait_queue *curr;
52 LIST_HEAD(tmp); 49 LIST_HEAD(tmp);
53 50
54 if (!swait_active(q))
55 return;
56
57 raw_spin_lock_irq(&q->lock); 51 raw_spin_lock_irq(&q->lock);
58 list_splice_init(&q->task_list, &tmp); 52 list_splice_init(&q->task_list, &tmp);
59 while (!list_empty(&tmp)) { 53 while (!list_empty(&tmp)) {
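Removing the lockless swait_active() checks means swake_up() and swake_up_all() always take q->lock, closing the window where a waiter enqueues itself after the check but before its list addition is visible. A caller that still wants the cheap early-out has to provide the ordering itself, roughly as below; the condition flag and function name are illustrative only.

#include <linux/swait.h>

/* Sketch: skip the lock only when the caller supplies the full barrier
 * that pairs with the waiter's prepare_to_swait()/set_current_state().
 */
static void example_swake_if_needed(struct swait_queue_head *q, int *cond)
{
	WRITE_ONCE(*cond, 1);
	smp_mb();		/* order the cond store against the waiter check */
	if (swait_active(q))
		swake_up(q);
}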
diff --git a/kernel/smp.c b/kernel/smp.c
index 3061483cb3ad..81cfca9b4cc3 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -28,7 +28,7 @@ enum {
28}; 28};
29 29
30struct call_function_data { 30struct call_function_data {
31 struct call_single_data __percpu *csd; 31 call_single_data_t __percpu *csd;
32 cpumask_var_t cpumask; 32 cpumask_var_t cpumask;
33 cpumask_var_t cpumask_ipi; 33 cpumask_var_t cpumask_ipi;
34}; 34};
@@ -51,7 +51,7 @@ int smpcfd_prepare_cpu(unsigned int cpu)
51 free_cpumask_var(cfd->cpumask); 51 free_cpumask_var(cfd->cpumask);
52 return -ENOMEM; 52 return -ENOMEM;
53 } 53 }
54 cfd->csd = alloc_percpu(struct call_single_data); 54 cfd->csd = alloc_percpu(call_single_data_t);
55 if (!cfd->csd) { 55 if (!cfd->csd) {
56 free_cpumask_var(cfd->cpumask); 56 free_cpumask_var(cfd->cpumask);
57 free_cpumask_var(cfd->cpumask_ipi); 57 free_cpumask_var(cfd->cpumask_ipi);
@@ -103,12 +103,12 @@ void __init call_function_init(void)
 103 * previous function call. For multi-cpu calls it's even more interesting 103 * previous function call. For multi-cpu calls it's even more interesting
104 * as we'll have to ensure no other cpu is observing our csd. 104 * as we'll have to ensure no other cpu is observing our csd.
105 */ 105 */
106static __always_inline void csd_lock_wait(struct call_single_data *csd) 106static __always_inline void csd_lock_wait(call_single_data_t *csd)
107{ 107{
108 smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK)); 108 smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
109} 109}
110 110
111static __always_inline void csd_lock(struct call_single_data *csd) 111static __always_inline void csd_lock(call_single_data_t *csd)
112{ 112{
113 csd_lock_wait(csd); 113 csd_lock_wait(csd);
114 csd->flags |= CSD_FLAG_LOCK; 114 csd->flags |= CSD_FLAG_LOCK;
@@ -116,12 +116,12 @@ static __always_inline void csd_lock(struct call_single_data *csd)
116 /* 116 /*
117 * prevent CPU from reordering the above assignment 117 * prevent CPU from reordering the above assignment
118 * to ->flags with any subsequent assignments to other 118 * to ->flags with any subsequent assignments to other
119 * fields of the specified call_single_data structure: 119 * fields of the specified call_single_data_t structure:
120 */ 120 */
121 smp_wmb(); 121 smp_wmb();
122} 122}
123 123
124static __always_inline void csd_unlock(struct call_single_data *csd) 124static __always_inline void csd_unlock(call_single_data_t *csd)
125{ 125{
126 WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); 126 WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
127 127
@@ -131,14 +131,14 @@ static __always_inline void csd_unlock(struct call_single_data *csd)
131 smp_store_release(&csd->flags, 0); 131 smp_store_release(&csd->flags, 0);
132} 132}
133 133
134static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); 134static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
135 135
136/* 136/*
137 * Insert a previously allocated call_single_data element 137 * Insert a previously allocated call_single_data_t element
138 * for execution on the given CPU. data must already have 138 * for execution on the given CPU. data must already have
139 * ->func, ->info, and ->flags set. 139 * ->func, ->info, and ->flags set.
140 */ 140 */
141static int generic_exec_single(int cpu, struct call_single_data *csd, 141static int generic_exec_single(int cpu, call_single_data_t *csd,
142 smp_call_func_t func, void *info) 142 smp_call_func_t func, void *info)
143{ 143{
144 if (cpu == smp_processor_id()) { 144 if (cpu == smp_processor_id()) {
@@ -210,7 +210,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
210{ 210{
211 struct llist_head *head; 211 struct llist_head *head;
212 struct llist_node *entry; 212 struct llist_node *entry;
213 struct call_single_data *csd, *csd_next; 213 call_single_data_t *csd, *csd_next;
214 static bool warned; 214 static bool warned;
215 215
216 WARN_ON(!irqs_disabled()); 216 WARN_ON(!irqs_disabled());
@@ -268,8 +268,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
268int smp_call_function_single(int cpu, smp_call_func_t func, void *info, 268int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
269 int wait) 269 int wait)
270{ 270{
271 struct call_single_data *csd; 271 call_single_data_t *csd;
272 struct call_single_data csd_stack = { .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS }; 272 call_single_data_t csd_stack = {
273 .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS,
274 };
273 int this_cpu; 275 int this_cpu;
274 int err; 276 int err;
275 277
@@ -321,7 +323,7 @@ EXPORT_SYMBOL(smp_call_function_single);
321 * NOTE: Be careful, there is unfortunately no current debugging facility to 323 * NOTE: Be careful, there is unfortunately no current debugging facility to
322 * validate the correctness of this serialization. 324 * validate the correctness of this serialization.
323 */ 325 */
324int smp_call_function_single_async(int cpu, struct call_single_data *csd) 326int smp_call_function_single_async(int cpu, call_single_data_t *csd)
325{ 327{
326 int err = 0; 328 int err = 0;
327 329
@@ -444,7 +446,7 @@ void smp_call_function_many(const struct cpumask *mask,
444 446
445 cpumask_clear(cfd->cpumask_ipi); 447 cpumask_clear(cfd->cpumask_ipi);
446 for_each_cpu(cpu, cfd->cpumask) { 448 for_each_cpu(cpu, cfd->cpumask) {
447 struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); 449 call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);
448 450
449 csd_lock(csd); 451 csd_lock(csd);
450 if (wait) 452 if (wait)
@@ -460,7 +462,7 @@ void smp_call_function_many(const struct cpumask *mask,
460 462
461 if (wait) { 463 if (wait) {
462 for_each_cpu(cpu, cfd->cpumask) { 464 for_each_cpu(cpu, cfd->cpumask) {
463 struct call_single_data *csd; 465 call_single_data_t *csd;
464 466
465 csd = per_cpu_ptr(cfd->csd, cpu); 467 csd = per_cpu_ptr(cfd->csd, cpu);
466 csd_lock_wait(csd); 468 csd_lock_wait(csd);
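The smp.c conversion is mechanical: call_single_data_t is the same structure as struct call_single_data but carries an alignment attribute so a csd can never straddle a cache line. A sketch of asynchronous usage with the new type; the callback, the static csd and the wrapper are invented for illustration.

#include <linux/printk.h>
#include <linux/smp.h>

static void example_csd_func(void *info)
{
	pr_info("csd ran on cpu %d\n", smp_processor_id());
}

/* Must not live on the stack for the async variant: the caller returns
 * before the target CPU unlocks the csd.
 */
static call_single_data_t example_csd = {
	.func = example_csd_func,
};

static int example_kick_cpu(int cpu)
{
	return smp_call_function_single_async(cpu, &example_csd);
}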
diff --git a/kernel/up.c b/kernel/up.c
index ee81ac9af4ca..42c46bf3e0a5 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -23,7 +23,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
23} 23}
24EXPORT_SYMBOL(smp_call_function_single); 24EXPORT_SYMBOL(smp_call_function_single);
25 25
26int smp_call_function_single_async(int cpu, struct call_single_data *csd) 26int smp_call_function_single_async(int cpu, call_single_data_t *csd)
27{ 27{
28 unsigned long flags; 28 unsigned long flags;
29 29
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ca937b0c3a96..ab3c0dc8c7ed 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2091,8 +2091,30 @@ __acquires(&pool->lock)
2091 2091
2092 spin_unlock_irq(&pool->lock); 2092 spin_unlock_irq(&pool->lock);
2093 2093
2094 lock_map_acquire_read(&pwq->wq->lockdep_map); 2094 lock_map_acquire(&pwq->wq->lockdep_map);
2095 lock_map_acquire(&lockdep_map); 2095 lock_map_acquire(&lockdep_map);
2096 /*
2097 * Strictly speaking we should mark the invariant state without holding
2098 * any locks, that is, before these two lock_map_acquire()'s.
2099 *
2100 * However, that would result in:
2101 *
2102 * A(W1)
2103 * WFC(C)
2104 * A(W1)
2105 * C(C)
2106 *
2107 * Which would create W1->C->W1 dependencies, even though there is no
 2108 * actual deadlock possible. There are two solutions: use a
 2109 * read-recursive acquire on the work(queue) 'locks', which then hits
 2110 * the lockdep limitation on recursive locks, or simply discard
 2111 * these locks.
2112 *
2113 * AFAICT there is no possible deadlock scenario between the
2114 * flush_work() and complete() primitives (except for single-threaded
2115 * workqueues), so hiding them isn't a problem.
2116 */
2117 lockdep_invariant_state(true);
2096 trace_workqueue_execute_start(work); 2118 trace_workqueue_execute_start(work);
2097 worker->current_func(work); 2119 worker->current_func(work);
2098 /* 2120 /*
@@ -2474,7 +2496,16 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
2474 */ 2496 */
2475 INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); 2497 INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
2476 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); 2498 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
2477 init_completion(&barr->done); 2499
2500 /*
2501 * Explicitly init the crosslock for wq_barrier::done, make its lock
2502 * key a subkey of the corresponding work. As a result we won't
2503 * build a dependency between wq_barrier::done and unrelated work.
2504 */
2505 lockdep_init_map_crosslock((struct lockdep_map *)&barr->done.map,
2506 "(complete)wq_barr::done",
2507 target->lockdep_map.key, 1);
2508 __init_completion(&barr->done);
2478 barr->task = current; 2509 barr->task = current;
2479 2510
2480 /* 2511 /*
@@ -2815,16 +2846,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2815 spin_unlock_irq(&pool->lock); 2846 spin_unlock_irq(&pool->lock);
2816 2847
2817 /* 2848 /*
2818 * If @max_active is 1 or rescuer is in use, flushing another work 2849 * Force a lock recursion deadlock when using flush_work() inside a
2819 * item on the same workqueue may lead to deadlock. Make sure the 2850 * single-threaded or rescuer equipped workqueue.
2820 * flusher is not running on the same workqueue by verifying write 2851 *
 2821 access. 2852 * For single threaded workqueues the deadlock happens when the flushed
 2853 * work is queued after the work issuing the flush_work(). For rescuer
 2854 * equipped workqueues the deadlock happens when the rescuer stalls,
 2855 * blocking forward progress.
2822 */ 2856 */
2823 if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) 2857 if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) {
2824 lock_map_acquire(&pwq->wq->lockdep_map); 2858 lock_map_acquire(&pwq->wq->lockdep_map);
2825 else 2859 lock_map_release(&pwq->wq->lockdep_map);
2826 lock_map_acquire_read(&pwq->wq->lockdep_map); 2860 }
2827 lock_map_release(&pwq->wq->lockdep_map);
2828 2861
2829 return true; 2862 return true;
2830already_gone: 2863already_gone:
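start_flush_work() now acquires and immediately releases the workqueue's lockdep_map whenever the workqueue is single-threaded or has a rescuer, so lockdep records that flush_work() ran under that workqueue even on runs where nothing actually deadlocks. A sketch of the self-flush pattern this is designed to flag; the workqueue and both work items are hypothetical.

#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;	/* e.g. alloc_ordered_workqueue(), max_active == 1 */
static struct work_struct example_other_work;	/* also queued on example_wq */

/* A work item on a single-threaded workqueue flushing another item queued
 * on the same workqueue: the flushed item cannot start until this one
 * finishes, so this flush_work() can never return.  With the hunk above,
 * lockdep reports it as recursion on example_wq's lockdep_map.
 */
static void example_work_fn(struct work_struct *work)
{
	flush_work(&example_other_work);
}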
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 2a9a8759752b..7396f5044397 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1091,6 +1091,8 @@ config PROVE_LOCKING
1091 select DEBUG_MUTEXES 1091 select DEBUG_MUTEXES
1092 select DEBUG_RT_MUTEXES if RT_MUTEXES 1092 select DEBUG_RT_MUTEXES if RT_MUTEXES
1093 select DEBUG_LOCK_ALLOC 1093 select DEBUG_LOCK_ALLOC
1094 select LOCKDEP_CROSSRELEASE
1095 select LOCKDEP_COMPLETIONS
1094 select TRACE_IRQFLAGS 1096 select TRACE_IRQFLAGS
1095 default n 1097 default n
1096 help 1098 help
@@ -1160,6 +1162,22 @@ config LOCK_STAT
1160 CONFIG_LOCK_STAT defines "contended" and "acquired" lock events. 1162 CONFIG_LOCK_STAT defines "contended" and "acquired" lock events.
1161 (CONFIG_LOCKDEP defines "acquire" and "release" events.) 1163 (CONFIG_LOCKDEP defines "acquire" and "release" events.)
1162 1164
1165config LOCKDEP_CROSSRELEASE
1166 bool
1167 help
 1168 This makes lockdep work for crosslocks, which are locks allowed to
 1169 be released in a different context from the acquisition context.
 1170 Normally a lock must be released in the context that acquired it.
 1171 However, relaxing this constraint lets synchronization primitives
 1172 such as page locks or completions use the lock correctness
 1173 detector, lockdep.
1174
1175config LOCKDEP_COMPLETIONS
1176 bool
1177 help
1178 A deadlock caused by wait_for_completion() and complete() can be
 1179 detected by lockdep using the crossrelease feature.
1180
1163config DEBUG_LOCKDEP 1181config DEBUG_LOCKDEP
1164 bool "Lock dependency engine debugging" 1182 bool "Lock dependency engine debugging"
1165 depends on DEBUG_KERNEL && LOCKDEP 1183 depends on DEBUG_KERNEL && LOCKDEP
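Both new options are select-only and get turned on by PROVE_LOCKING above. The kind of lock the help text describes, acquired in one context and released in another, looks roughly like the page-lock pattern sketched here; the function names are invented and the bio handling is heavily simplified, this is only meant to show why classic lockdep, which expects release in the acquiring context, cannot model it.

#include <linux/bio.h>
#include <linux/pagemap.h>

/* Acquisition context: lock the page and kick off the read. */
static void example_start_read(struct page *page, struct bio *bio)
{
	lock_page(page);	/* acquired in process context        */
	submit_bio(bio);	/* the release happens somewhere else */
}

/* Release context: bio completion, typically interrupt/softirq. */
static void example_end_read(struct bio *bio)
{
	struct page *page = bio->bi_io_vec[0].bv_page;

	unlock_page(page);	/* released in a different context */
	bio_put(bio);
}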
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index 6f2b135dc5e8..cd0b5c964bd0 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -363,6 +363,103 @@ static void rsem_AA3(void)
363} 363}
364 364
365/* 365/*
366 * read_lock(A)
367 * spin_lock(B)
368 * spin_lock(B)
369 * write_lock(A)
370 */
371static void rlock_ABBA1(void)
372{
373 RL(X1);
374 L(Y1);
375 U(Y1);
376 RU(X1);
377
378 L(Y1);
379 WL(X1);
380 WU(X1);
381 U(Y1); // should fail
382}
383
384static void rwsem_ABBA1(void)
385{
386 RSL(X1);
387 ML(Y1);
388 MU(Y1);
389 RSU(X1);
390
391 ML(Y1);
392 WSL(X1);
393 WSU(X1);
394 MU(Y1); // should fail
395}
396
397/*
398 * read_lock(A)
399 * spin_lock(B)
400 * spin_lock(B)
401 * read_lock(A)
402 */
403static void rlock_ABBA2(void)
404{
405 RL(X1);
406 L(Y1);
407 U(Y1);
408 RU(X1);
409
410 L(Y1);
411 RL(X1);
412 RU(X1);
413 U(Y1); // should NOT fail
414}
415
416static void rwsem_ABBA2(void)
417{
418 RSL(X1);
419 ML(Y1);
420 MU(Y1);
421 RSU(X1);
422
423 ML(Y1);
424 RSL(X1);
425 RSU(X1);
426 MU(Y1); // should fail
427}
428
429
430/*
431 * write_lock(A)
432 * spin_lock(B)
433 * spin_lock(B)
434 * write_lock(A)
435 */
436static void rlock_ABBA3(void)
437{
438 WL(X1);
439 L(Y1);
440 U(Y1);
441 WU(X1);
442
443 L(Y1);
444 WL(X1);
445 WU(X1);
446 U(Y1); // should fail
447}
448
449static void rwsem_ABBA3(void)
450{
451 WSL(X1);
452 ML(Y1);
453 MU(Y1);
454 WSU(X1);
455
456 ML(Y1);
457 WSL(X1);
458 WSU(X1);
459 MU(Y1); // should fail
460}
461
462/*
366 * ABBA deadlock: 463 * ABBA deadlock:
367 */ 464 */
368 465
@@ -1056,8 +1153,6 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask)
1056 if (debug_locks != expected) { 1153 if (debug_locks != expected) {
1057 unexpected_testcase_failures++; 1154 unexpected_testcase_failures++;
1058 pr_cont("FAILED|"); 1155 pr_cont("FAILED|");
1059
1060 dump_stack();
1061 } else { 1156 } else {
1062 testcase_successes++; 1157 testcase_successes++;
1063 pr_cont(" ok |"); 1158 pr_cont(" ok |");
@@ -1933,6 +2028,30 @@ void locking_selftest(void)
1933 dotest(rsem_AA3, FAILURE, LOCKTYPE_RWSEM); 2028 dotest(rsem_AA3, FAILURE, LOCKTYPE_RWSEM);
1934 pr_cont("\n"); 2029 pr_cont("\n");
1935 2030
2031 print_testname("mixed read-lock/lock-write ABBA");
2032 pr_cont(" |");
2033 dotest(rlock_ABBA1, FAILURE, LOCKTYPE_RWLOCK);
2034 /*
2035 * Lockdep does indeed fail here, but there's nothing we can do about
2036 * that now. Don't kill lockdep for it.
2037 */
2038 unexpected_testcase_failures--;
2039
2040 pr_cont(" |");
2041 dotest(rwsem_ABBA1, FAILURE, LOCKTYPE_RWSEM);
2042
2043 print_testname("mixed read-lock/lock-read ABBA");
2044 pr_cont(" |");
2045 dotest(rlock_ABBA2, SUCCESS, LOCKTYPE_RWLOCK);
2046 pr_cont(" |");
2047 dotest(rwsem_ABBA2, FAILURE, LOCKTYPE_RWSEM);
2048
2049 print_testname("mixed write-lock/lock-write ABBA");
2050 pr_cont(" |");
2051 dotest(rlock_ABBA3, FAILURE, LOCKTYPE_RWLOCK);
2052 pr_cont(" |");
2053 dotest(rwsem_ABBA3, FAILURE, LOCKTYPE_RWSEM);
2054
1936 printk(" --------------------------------------------------------------------------\n"); 2055 printk(" --------------------------------------------------------------------------\n");
1937 2056
1938 /* 2057 /*
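The selftest shorthand above maps to the kernel locking API as RL/RU = read_lock/read_unlock, WL/WU = write_lock/write_unlock, L/U = spin_lock/spin_unlock, with RSL/WSL/ML the rwsem and mutex counterparts, and each test function encodes two task histories back to back. Expanded into plain calls, rlock_ABBA1 corresponds to the two tasks below; task2's write_lock() must wait for task1's read lock while task1 waits for the spinlock task2 holds, so it is a genuine ABBA. The names are illustrative, and as the comment in locking_selftest() notes, current lockdep still misses this rwlock case.

#include <linux/spinlock.h>

static DEFINE_RWLOCK(example_A);
static DEFINE_SPINLOCK(example_B);

/* History 1: read_lock(A) then spin_lock(B). */
static void example_task1(void)
{
	read_lock(&example_A);
	spin_lock(&example_B);
	spin_unlock(&example_B);
	read_unlock(&example_A);
}

/* History 2: spin_lock(B) then write_lock(A).  Combined with history 1
 * this can deadlock: the writer waits for the reader, the reader waits
 * for B.
 */
static void example_task2(void)
{
	spin_lock(&example_B);
	write_lock(&example_A);
	write_unlock(&example_A);
	spin_unlock(&example_B);
}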
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 90731e3b7e58..3644ff918434 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1510,8 +1510,15 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
1510 } 1510 }
1511 1511
1512 /* 1512 /*
1513 * The page_table_lock above provides a memory barrier 1513 * Since we took the NUMA fault, we must have observed the !accessible
1514 * with change_protection_range. 1514 * bit. Make sure all other CPUs agree with that, to avoid them
1515 * modifying the page we're about to migrate.
1516 *
1517 * Must be done under PTL such that we'll observe the relevant
1518 * inc_tlb_flush_pending().
1519 *
 1520 * We are not sure whether a pending tlb flush here is for a huge page
 1521 * mapping or not; hence use the tlb range variant.
1515 */ 1522 */
1516 if (mm_tlb_flush_pending(vma->vm_mm)) 1523 if (mm_tlb_flush_pending(vma->vm_mm))
1517 flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); 1524 flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
@@ -1521,6 +1528,7 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
1521 * and access rights restored. 1528 * and access rights restored.
1522 */ 1529 */
1523 spin_unlock(vmf->ptl); 1530 spin_unlock(vmf->ptl);
1531
1524 migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, 1532 migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
1525 vmf->pmd, pmd, vmf->address, page, target_nid); 1533 vmf->pmd, pmd, vmf->address, page, target_nid);
1526 if (migrated) { 1534 if (migrated) {
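The rewritten comment documents why the flush has to happen under the page-table lock: taking the PTL is what guarantees this CPU observes a concurrent change_protection()'s inc_tlb_flush_pending(). For context, the updater side of that contract looks roughly like the sketch below; it is a simplification of what mm/mprotect.c does, not a copy, and assumes the inc/dec_tlb_flush_pending() helpers from this series.

#include <linux/mm_types.h>

/* Updater-side sketch: raise the pending count before touching PTEs,
 * flush, then lower it, so PTL holders can tell a flush is in flight.
 */
static void example_change_protection(struct mm_struct *mm)
{
	inc_tlb_flush_pending(mm);	/* visible to anyone taking the PTL */
	/* ... modify PTEs under the page table lock ...                   */
	/* ... flush_tlb_range() over the affected range ...               */
	dec_tlb_flush_pending(mm);
}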
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index ca11bc4ce205..6f319fb81718 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -267,13 +267,13 @@ static void check_memory_region(unsigned long addr,
267 check_memory_region_inline(addr, size, write, ret_ip); 267 check_memory_region_inline(addr, size, write, ret_ip);
268} 268}
269 269
270void kasan_check_read(const void *p, unsigned int size) 270void kasan_check_read(const volatile void *p, unsigned int size)
271{ 271{
272 check_memory_region((unsigned long)p, size, false, _RET_IP_); 272 check_memory_region((unsigned long)p, size, false, _RET_IP_);
273} 273}
274EXPORT_SYMBOL(kasan_check_read); 274EXPORT_SYMBOL(kasan_check_read);
275 275
276void kasan_check_write(const void *p, unsigned int size) 276void kasan_check_write(const volatile void *p, unsigned int size)
277{ 277{
278 check_memory_region((unsigned long)p, size, true, _RET_IP_); 278 check_memory_region((unsigned long)p, size, true, _RET_IP_);
279} 279}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1423da8dd16f..9327a940e373 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -66,6 +66,7 @@
66#include <linux/kthread.h> 66#include <linux/kthread.h>
67#include <linux/memcontrol.h> 67#include <linux/memcontrol.h>
68#include <linux/ftrace.h> 68#include <linux/ftrace.h>
69#include <linux/lockdep.h>
69#include <linux/nmi.h> 70#include <linux/nmi.h>
70 71
71#include <asm/sections.h> 72#include <asm/sections.h>
@@ -3513,6 +3514,47 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla
3513} 3514}
3514#endif /* CONFIG_COMPACTION */ 3515#endif /* CONFIG_COMPACTION */
3515 3516
3517#ifdef CONFIG_LOCKDEP
3518struct lockdep_map __fs_reclaim_map =
3519 STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
3520
3521static bool __need_fs_reclaim(gfp_t gfp_mask)
3522{
3523 gfp_mask = current_gfp_context(gfp_mask);
3524
3525 /* no reclaim without waiting on it */
3526 if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
3527 return false;
3528
3529 /* this guy won't enter reclaim */
3530 if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
3531 return false;
3532
 3533 /* We're only interested in __GFP_FS allocations for now */
3534 if (!(gfp_mask & __GFP_FS))
3535 return false;
3536
3537 if (gfp_mask & __GFP_NOLOCKDEP)
3538 return false;
3539
3540 return true;
3541}
3542
3543void fs_reclaim_acquire(gfp_t gfp_mask)
3544{
3545 if (__need_fs_reclaim(gfp_mask))
3546 lock_map_acquire(&__fs_reclaim_map);
3547}
3548EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
3549
3550void fs_reclaim_release(gfp_t gfp_mask)
3551{
3552 if (__need_fs_reclaim(gfp_mask))
3553 lock_map_release(&__fs_reclaim_map);
3554}
3555EXPORT_SYMBOL_GPL(fs_reclaim_release);
3556#endif
3557
3516/* Perform direct synchronous page reclaim */ 3558/* Perform direct synchronous page reclaim */
3517static int 3559static int
3518__perform_reclaim(gfp_t gfp_mask, unsigned int order, 3560__perform_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -3527,7 +3569,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
3527 /* We now go into synchronous reclaim */ 3569 /* We now go into synchronous reclaim */
3528 cpuset_memory_pressure_bump(); 3570 cpuset_memory_pressure_bump();
3529 noreclaim_flag = memalloc_noreclaim_save(); 3571 noreclaim_flag = memalloc_noreclaim_save();
3530 lockdep_set_current_reclaim_state(gfp_mask); 3572 fs_reclaim_acquire(gfp_mask);
3531 reclaim_state.reclaimed_slab = 0; 3573 reclaim_state.reclaimed_slab = 0;
3532 current->reclaim_state = &reclaim_state; 3574 current->reclaim_state = &reclaim_state;
3533 3575
@@ -3535,7 +3577,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
3535 ac->nodemask); 3577 ac->nodemask);
3536 3578
3537 current->reclaim_state = NULL; 3579 current->reclaim_state = NULL;
3538 lockdep_clear_current_reclaim_state(); 3580 fs_reclaim_release(gfp_mask);
3539 memalloc_noreclaim_restore(noreclaim_flag); 3581 memalloc_noreclaim_restore(noreclaim_flag);
3540 3582
3541 cond_resched(); 3583 cond_resched();
@@ -4064,7 +4106,8 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
4064 *alloc_flags |= ALLOC_CPUSET; 4106 *alloc_flags |= ALLOC_CPUSET;
4065 } 4107 }
4066 4108
4067 lockdep_trace_alloc(gfp_mask); 4109 fs_reclaim_acquire(gfp_mask);
4110 fs_reclaim_release(gfp_mask);
4068 4111
4069 might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); 4112 might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
4070 4113
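fs_reclaim_acquire()/fs_reclaim_release() turn "this allocation may enter __GFP_FS reclaim" into a fake lock, and the acquire immediately followed by a release in prepare_alloc_pages() exists purely to record that dependency up front. A sketch of the inversion this lets lockdep report; the mutex stands in for some filesystem lock that is both held around allocations and taken again from the reclaim path, and everything here is hypothetical.

#include <linux/gfp.h>
#include <linux/mutex.h>
#include <linux/slab.h>

static DEFINE_MUTEX(example_fs_lock);

/* Path 1: allocate with __GFP_FS while holding the fs lock; this records
 * example_fs_lock -> fs_reclaim.
 */
static void *example_alloc_under_fs_lock(void)
{
	void *p;

	mutex_lock(&example_fs_lock);
	p = kmalloc(128, GFP_KERNEL);	/* GFP_NOFS here would break the cycle */
	mutex_unlock(&example_fs_lock);
	return p;
}

/* Path 2: the shrinker/writeback side takes the same lock from within
 * reclaim, i.e. under fs_reclaim, recording fs_reclaim -> example_fs_lock
 * and completing the cycle lockdep will now complain about.
 */
static void example_shrinker_path(void)
{
	mutex_lock(&example_fs_lock);
	/* ... write back or free objects ... */
	mutex_unlock(&example_fs_lock);
}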
diff --git a/mm/slab.h b/mm/slab.h
index 6885e1192ec5..073362816acc 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -43,6 +43,7 @@ struct kmem_cache {
43#include <linux/kasan.h> 43#include <linux/kasan.h>
44#include <linux/kmemleak.h> 44#include <linux/kmemleak.h>
45#include <linux/random.h> 45#include <linux/random.h>
46#include <linux/sched/mm.h>
46 47
47/* 48/*
48 * State of the slab allocator. 49 * State of the slab allocator.
@@ -412,7 +413,10 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
412 gfp_t flags) 413 gfp_t flags)
413{ 414{
414 flags &= gfp_allowed_mask; 415 flags &= gfp_allowed_mask;
415 lockdep_trace_alloc(flags); 416
417 fs_reclaim_acquire(flags);
418 fs_reclaim_release(flags);
419
416 might_sleep_if(gfpflags_allow_blocking(flags)); 420 might_sleep_if(gfpflags_allow_blocking(flags));
417 421
418 if (should_failslab(s, flags)) 422 if (should_failslab(s, flags))
diff --git a/mm/slob.c b/mm/slob.c
index 1bae78d71096..a8bd6fa11a66 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -432,7 +432,8 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
432 432
433 gfp &= gfp_allowed_mask; 433 gfp &= gfp_allowed_mask;
434 434
435 lockdep_trace_alloc(gfp); 435 fs_reclaim_acquire(gfp);
436 fs_reclaim_release(gfp);
436 437
437 if (size < PAGE_SIZE - align) { 438 if (size < PAGE_SIZE - align) {
438 if (!size) 439 if (!size)
@@ -538,7 +539,8 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
538 539
539 flags &= gfp_allowed_mask; 540 flags &= gfp_allowed_mask;
540 541
541 lockdep_trace_alloc(flags); 542 fs_reclaim_acquire(flags);
543 fs_reclaim_release(flags);
542 544
543 if (c->size < PAGE_SIZE) { 545 if (c->size < PAGE_SIZE) {
544 b = slob_alloc(c->size, flags, c->align, node); 546 b = slob_alloc(c->size, flags, c->align, node);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a1af041930a6..f957afe900ec 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3525,8 +3525,6 @@ static int kswapd(void *p)
3525 }; 3525 };
3526 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); 3526 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
3527 3527
3528 lockdep_set_current_reclaim_state(GFP_KERNEL);
3529
3530 if (!cpumask_empty(cpumask)) 3528 if (!cpumask_empty(cpumask))
3531 set_cpus_allowed_ptr(tsk, cpumask); 3529 set_cpus_allowed_ptr(tsk, cpumask);
3532 current->reclaim_state = &reclaim_state; 3530 current->reclaim_state = &reclaim_state;
@@ -3585,14 +3583,15 @@ kswapd_try_sleep:
3585 */ 3583 */
3586 trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx, 3584 trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
3587 alloc_order); 3585 alloc_order);
3586 fs_reclaim_acquire(GFP_KERNEL);
3588 reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); 3587 reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
3588 fs_reclaim_release(GFP_KERNEL);
3589 if (reclaim_order < alloc_order) 3589 if (reclaim_order < alloc_order)
3590 goto kswapd_try_sleep; 3590 goto kswapd_try_sleep;
3591 } 3591 }
3592 3592
3593 tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); 3593 tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
3594 current->reclaim_state = NULL; 3594 current->reclaim_state = NULL;
3595 lockdep_clear_current_reclaim_state();
3596 3595
3597 return 0; 3596 return 0;
3598} 3597}
@@ -3655,14 +3654,14 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
3655 unsigned int noreclaim_flag; 3654 unsigned int noreclaim_flag;
3656 3655
3657 noreclaim_flag = memalloc_noreclaim_save(); 3656 noreclaim_flag = memalloc_noreclaim_save();
3658 lockdep_set_current_reclaim_state(sc.gfp_mask); 3657 fs_reclaim_acquire(sc.gfp_mask);
3659 reclaim_state.reclaimed_slab = 0; 3658 reclaim_state.reclaimed_slab = 0;
3660 p->reclaim_state = &reclaim_state; 3659 p->reclaim_state = &reclaim_state;
3661 3660
3662 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 3661 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3663 3662
3664 p->reclaim_state = NULL; 3663 p->reclaim_state = NULL;
3665 lockdep_clear_current_reclaim_state(); 3664 fs_reclaim_release(sc.gfp_mask);
3666 memalloc_noreclaim_restore(noreclaim_flag); 3665 memalloc_noreclaim_restore(noreclaim_flag);
3667 3666
3668 return nr_reclaimed; 3667 return nr_reclaimed;
@@ -3847,7 +3846,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
3847 */ 3846 */
3848 noreclaim_flag = memalloc_noreclaim_save(); 3847 noreclaim_flag = memalloc_noreclaim_save();
3849 p->flags |= PF_SWAPWRITE; 3848 p->flags |= PF_SWAPWRITE;
3850 lockdep_set_current_reclaim_state(sc.gfp_mask); 3849 fs_reclaim_acquire(sc.gfp_mask);
3851 reclaim_state.reclaimed_slab = 0; 3850 reclaim_state.reclaimed_slab = 0;
3852 p->reclaim_state = &reclaim_state; 3851 p->reclaim_state = &reclaim_state;
3853 3852
@@ -3862,9 +3861,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
3862 } 3861 }
3863 3862
3864 p->reclaim_state = NULL; 3863 p->reclaim_state = NULL;
3864 fs_reclaim_release(gfp_mask);
3865 current->flags &= ~PF_SWAPWRITE; 3865 current->flags &= ~PF_SWAPWRITE;
3866 memalloc_noreclaim_restore(noreclaim_flag); 3866 memalloc_noreclaim_restore(noreclaim_flag);
3867 lockdep_clear_current_reclaim_state();
3868 return sc.nr_reclaimed >= nr_pages; 3867 return sc.nr_reclaimed >= nr_pages;
3869} 3868}
3870 3869
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 62344804baae..38e795e0c4bf 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1810,8 +1810,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1810static struct static_key udp_encap_needed __read_mostly; 1810static struct static_key udp_encap_needed __read_mostly;
1811void udp_encap_enable(void) 1811void udp_encap_enable(void)
1812{ 1812{
1813 if (!static_key_enabled(&udp_encap_needed)) 1813 static_key_enable(&udp_encap_needed);
1814 static_key_slow_inc(&udp_encap_needed);
1815} 1814}
1816EXPORT_SYMBOL(udp_encap_enable); 1815EXPORT_SYMBOL(udp_encap_enable);
1817 1816
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index d6886228e1d0..56030d45823a 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -575,8 +575,7 @@ static __inline__ void udpv6_err(struct sk_buff *skb,
575static struct static_key udpv6_encap_needed __read_mostly; 575static struct static_key udpv6_encap_needed __read_mostly;
576void udpv6_encap_enable(void) 576void udpv6_encap_enable(void)
577{ 577{
578 if (!static_key_enabled(&udpv6_encap_needed)) 578 static_key_enable(&udpv6_encap_needed);
579 static_key_slow_inc(&udpv6_encap_needed);
580} 579}
581EXPORT_SYMBOL(udpv6_encap_enable); 580EXPORT_SYMBOL(udpv6_encap_enable);
582 581
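Both udp hunks replace the open-coded "if (!static_key_enabled()) static_key_slow_inc()" idiom with static_key_enable(), which performs the enabled check and the increment atomically under the jump-label machinery, so two racing callers can no longer bump the key twice. The resulting usage pattern, with an invented key name:

#include <linux/jump_label.h>

static struct static_key example_encap_needed __read_mostly;

/* Idempotent: safe to call from several places or concurrently, the key
 * ends up enabled exactly once, unlike the old check-then-inc sequence.
 */
void example_encap_enable(void)
{
	static_key_enable(&example_encap_needed);
}

static bool example_encap_active(void)
{
	return static_key_false(&example_encap_needed);
}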