author	Linus Torvalds <torvalds@linux-foundation.org>	2013-09-03 11:08:21 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-09-03 11:08:21 -0400
commit	fc6d0b037678f50014ef409c92c5bedc01208fcd (patch)
tree	b7de25e97b03c31ea6c5f2540f641b0be2c91832
parent	6e4664525b1db28f8c4e1130957f70a94c19213e (diff)
parent	bc08b449ee14ace4d869adaa1bb35a44ce68d775 (diff)
Merge branch 'lockref' (locked reference counts)

Merge lockref infrastructure code by me and Waiman Long.

I already merged some of the preparatory patches that didn't actually do
any semantic changes earlier, but this merges the actual _reason_ for those
preparatory patches.

The "lockref" structure is a combination "spinlock and reference count"
that allows optimized reference count accesses.  In particular, it
guarantees that the reference count will be updated AS IF the spinlock was
held, but using atomic accesses that cover both the reference count and the
spinlock words, we can often do the update without actually having to take
the lock.

This allows us to avoid the nastiest cases of spinlock contention on large
machines under heavy pathname lookup loads.  When updating the dentry
reference counts on a large system, we'll still end up with the cache line
bouncing around, but that's much less noticeable than actually having to
spin waiting for the lock.

* lockref:
  lockref: implement lockless reference count updates using cmpxchg()
  lockref: uninline lockref helper functions
  vfs: reimplement d_rcu_to_refcount() using lockref_get_or_lock()
  vfs: use lockref_get_not_zero() for optimistic lockless dget_parent()
  lockref: add 'lockref_get_or_lock()' helper
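[Editorial note] The trick is easier to see in isolation: pack the lock word and the count into a single 64-bit quantity, and update the count with a compare-and-swap only while the lock half of that quantity reads as unlocked. Below is a minimal userspace model of the idea using C11 atomics; the toy_lockref type and toy_get_if_unlocked() name are made up for illustration and are not the kernel implementation (that follows in lib/lockref.c below).

	#include <stdatomic.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Toy model: low 32 bits = "spinlock" (0 == unlocked), high 32 bits = count. */
	typedef _Atomic uint64_t toy_lockref;

	static int toy_get_if_unlocked(toy_lockref *ref)
	{
		uint64_t old = atomic_load(ref);

		while ((uint32_t)old == 0) {			/* lock half looks unlocked */
			uint64_t new = old + (1ULL << 32);	/* bump the count half */
			if (atomic_compare_exchange_weak(ref, &old, new))
				return 1;	/* count updated, lock never touched */
			/* CAS failed: 'old' was reloaded, retry while still unlocked */
		}
		return 0;	/* lock half is held; caller must fall back to locking */
	}

	int main(void)
	{
		toy_lockref ref = (uint64_t)3 << 32;	/* unlocked, count == 3 */

		if (toy_get_if_unlocked(&ref))
			printf("count is now %llu\n",
			       (unsigned long long)(atomic_load(&ref) >> 32));
		return 0;
	}

The kernel version does the same thing with the real arch_spinlock_t layout, and falls back to actually taking the spinlock whenever the lock half of the word is not free.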
-rw-r--r--	arch/x86/Kconfig                 |   1
-rw-r--r--	arch/x86/include/asm/spinlock.h  |   5
-rw-r--r--	fs/dcache.c                      |  17
-rw-r--r--	fs/namei.c                       |  90
-rw-r--r--	include/linux/dcache.h           |  22
-rw-r--r--	include/linux/lockref.h          |  61
-rw-r--r--	lib/Kconfig                      |  10
-rw-r--r--	lib/Makefile                     |   1
-rw-r--r--	lib/lockref.c                    | 127
9 files changed, 237 insertions(+), 97 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b32ebf92b0ce..67e00740531c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -16,6 +16,7 @@ config X86_64
 	def_bool y
 	depends on 64BIT
 	select X86_DEV_DMA_OPS
+	select ARCH_USE_CMPXCHG_LOCKREF
 
 ### Arch settings
 config X86
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index e3ddd7db723f..e0e668422c75 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -34,6 +34,11 @@
 # define UNLOCK_LOCK_PREFIX
 #endif
 
+static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
+{
+	return lock.tickets.head == lock.tickets.tail;
+}
+
 /*
  * Ticket locks are conceptually two parts, one indicating the current head of
  * the queue, and the other indicating the current tail. The lock is acquired
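[Editorial note] The new helper deliberately takes an arch_spinlock_t by value: it inspects a snapshot of the lock rather than the live lock in memory, which is what lets lib/lockref.c (added later in this series) ask "was this copy of the word unlocked?" inside its cmpxchg loop. For a ticket lock that test is simply "has the ticket being served caught up with the last ticket handed out?" - a toy restatement with made-up names:

	/* Toy restatement of the check above (illustrative, not kernel code). */
	struct toy_tickets {
		unsigned short head;	/* ticket number currently being served */
		unsigned short tail;	/* next ticket number to hand out */
	};

	static int toy_value_unlocked(struct toy_tickets snapshot)
	{
		/* Nobody holds the lock and nobody is queued for it. */
		return snapshot.head == snapshot.tail;
	}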
diff --git a/fs/dcache.c b/fs/dcache.c
index b949af850cd6..96655f4f4574 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -611,8 +611,23 @@ static inline void __dget(struct dentry *dentry)
 
 struct dentry *dget_parent(struct dentry *dentry)
 {
+	int gotref;
 	struct dentry *ret;
 
+	/*
+	 * Do optimistic parent lookup without any
+	 * locking.
+	 */
+	rcu_read_lock();
+	ret = ACCESS_ONCE(dentry->d_parent);
+	gotref = lockref_get_not_zero(&ret->d_lockref);
+	rcu_read_unlock();
+	if (likely(gotref)) {
+		if (likely(ret == ACCESS_ONCE(dentry->d_parent)))
+			return ret;
+		dput(ret);
+	}
+
 repeat:
 	/*
 	 * Don't need rcu_dereference because we re-check it was correct under
@@ -1771,7 +1786,7 @@ static noinline enum slow_d_compare slow_dentry_cmp(
  * without taking d_lock and checking d_seq sequence count against @seq
  * returned here.
  *
- * A refcount may be taken on the found dentry with the __d_rcu_to_refcount
+ * A refcount may be taken on the found dentry with the d_rcu_to_refcount
  * function.
  *
  * Alternatively, __d_lookup_rcu may be called again to look up the child of
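[Editorial note] The dget_parent() fast path above is an instance of a general idiom: read a pointer under rcu_read_lock(), take a reference only if the count is already non-zero (so the object cannot have gone through its final teardown), then re-check that the pointer is still current before trusting the reference. A sketch of the same idiom on a hypothetical refcounted object - the toy_node type and toy_node_put() helper are made up for illustration, only the lockref and RCU calls are the real API:

	#include <linux/lockref.h>
	#include <linux/rcupdate.h>

	struct toy_node {
		struct lockref	ref;
		struct toy_node	*parent;	/* may be changed by a "rename" elsewhere */
	};

	void toy_node_put(struct toy_node *node);	/* assumed: drops one reference */

	static struct toy_node *toy_get_parent_fast(struct toy_node *node)
	{
		struct toy_node *parent;
		int gotref;

		rcu_read_lock();
		parent = ACCESS_ONCE(node->parent);
		gotref = lockref_get_not_zero(&parent->ref);	/* no spinlock taken */
		rcu_read_unlock();

		if (gotref) {
			/* Did the parent pointer change while we took the reference? */
			if (likely(parent == ACCESS_ONCE(node->parent)))
				return parent;
			toy_node_put(parent);	/* lost the race; drop the stale ref */
		}
		return NULL;	/* caller falls back to a locked, re-checking loop */
	}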
diff --git a/fs/namei.c b/fs/namei.c
index 7720fbd5277b..2c30c84d4ea1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -494,6 +494,50 @@ static inline void unlock_rcu_walk(void)
 	br_read_unlock(&vfsmount_lock);
 }
 
+/*
+ * When we move over from the RCU domain to properly refcounted
+ * long-lived dentries, we need to check the sequence numbers
+ * we got before lookup very carefully.
+ *
+ * We cannot blindly increment a dentry refcount - even if it
+ * is not locked - if it is zero, because it may have gone
+ * through the final d_kill() logic already.
+ *
+ * So for a zero refcount, we need to get the spinlock (which is
+ * safe even for a dead dentry because the de-allocation is
+ * RCU-delayed), and check the sequence count under the lock.
+ *
+ * Once we have checked the sequence count, we know it is live,
+ * and since we hold the spinlock it cannot die from under us.
+ *
+ * In contrast, if the reference count wasn't zero, we can just
+ * increment the lockref without having to take the spinlock.
+ * Even if the sequence number ends up being stale, we haven't
+ * gone through the final dput() and killed the dentry yet.
+ */
+static inline int d_rcu_to_refcount(struct dentry *dentry, seqcount_t *validate, unsigned seq)
+{
+	int gotref;
+
+	gotref = lockref_get_or_lock(&dentry->d_lockref);
+
+	/* Does the sequence number still match? */
+	if (read_seqcount_retry(validate, seq)) {
+		if (gotref)
+			dput(dentry);
+		else
+			spin_unlock(&dentry->d_lock);
+		return -ECHILD;
+	}
+
+	/* Get the ref now, if we couldn't get it originally */
+	if (!gotref) {
+		dentry->d_lockref.count++;
+		spin_unlock(&dentry->d_lock);
+	}
+	return 0;
+}
+
 /**
  * unlazy_walk - try to switch to ref-walk mode.
  * @nd: nameidata pathwalk data
@@ -518,29 +562,28 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
 		    nd->root.dentry != fs->root.dentry)
 			goto err_root;
 	}
-	spin_lock(&parent->d_lock);
+
+	/*
+	 * For a negative lookup, the lookup sequence point is the parents
+	 * sequence point, and it only needs to revalidate the parent dentry.
+	 *
+	 * For a positive lookup, we need to move both the parent and the
+	 * dentry from the RCU domain to be properly refcounted. And the
+	 * sequence number in the dentry validates *both* dentry counters,
+	 * since we checked the sequence number of the parent after we got
+	 * the child sequence number. So we know the parent must still
+	 * be valid if the child sequence number is still valid.
+	 */
 	if (!dentry) {
-		if (!__d_rcu_to_refcount(parent, nd->seq))
-			goto err_parent;
+		if (d_rcu_to_refcount(parent, &parent->d_seq, nd->seq) < 0)
+			goto err_root;
 		BUG_ON(nd->inode != parent->d_inode);
 	} else {
-		if (dentry->d_parent != parent)
+		if (d_rcu_to_refcount(dentry, &dentry->d_seq, nd->seq) < 0)
+			goto err_root;
+		if (d_rcu_to_refcount(parent, &dentry->d_seq, nd->seq) < 0)
 			goto err_parent;
-		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-		if (!__d_rcu_to_refcount(dentry, nd->seq))
-			goto err_child;
-		/*
-		 * If the sequence check on the child dentry passed, then
-		 * the child has not been removed from its parent. This
-		 * means the parent dentry must be valid and able to take
-		 * a reference at this point.
-		 */
-		BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
-		BUG_ON(!parent->d_lockref.count);
-		parent->d_lockref.count++;
-		spin_unlock(&dentry->d_lock);
 	}
-	spin_unlock(&parent->d_lock);
 	if (want_root) {
 		path_get(&nd->root);
 		spin_unlock(&fs->lock);
@@ -551,10 +594,8 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
 	nd->flags &= ~LOOKUP_RCU;
 	return 0;
 
-err_child:
-	spin_unlock(&dentry->d_lock);
 err_parent:
-	spin_unlock(&parent->d_lock);
+	dput(dentry);
 err_root:
 	if (want_root)
 		spin_unlock(&fs->lock);
@@ -585,14 +626,11 @@ static int complete_walk(struct nameidata *nd)
 		nd->flags &= ~LOOKUP_RCU;
 		if (!(nd->flags & LOOKUP_ROOT))
 			nd->root.mnt = NULL;
-		spin_lock(&dentry->d_lock);
-		if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
-			spin_unlock(&dentry->d_lock);
+
+		if (d_rcu_to_refcount(dentry, &dentry->d_seq, nd->seq) < 0) {
 			unlock_rcu_walk();
 			return -ECHILD;
 		}
-		BUG_ON(nd->inode != dentry->d_inode);
-		spin_unlock(&dentry->d_lock);
 		mntget(nd->path.mnt);
 		unlock_rcu_walk();
 	}
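[Editorial note] Worth spelling out: the new d_rcu_to_refcount() either returns 0 with a reference held and no locks, or returns -ECHILD having already released whatever it took. That one-sided contract is what lets the err_child unlock path and the extra spin_unlock() calls above disappear. A hedged caller-side sketch (illustrative only - the helper is static to fs/namei.c):

	static int example_pin_dentry(struct nameidata *nd, struct dentry *dentry)
	{
		if (d_rcu_to_refcount(dentry, &dentry->d_seq, nd->seq) < 0)
			return -ECHILD;	/* nothing to unlock and nothing to dput */

		/* Success: dentry is pinned and d_lock is not held. */
		return 0;
	}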
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index efdc94434c30..9169b91ea2d2 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -304,28 +304,6 @@ extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *);
 extern struct dentry *__d_lookup_rcu(const struct dentry *parent,
 				const struct qstr *name, unsigned *seq);
 
-/**
- * __d_rcu_to_refcount - take a refcount on dentry if sequence check is ok
- * @dentry: dentry to take a ref on
- * @seq: seqcount to verify against
- * Returns: 0 on failure, else 1.
- *
- * __d_rcu_to_refcount operates on a dentry,seq pair that was returned
- * by __d_lookup_rcu, to get a reference on an rcu-walk dentry.
- */
-static inline int __d_rcu_to_refcount(struct dentry *dentry, unsigned seq)
-{
-	int ret = 0;
-
-	assert_spin_locked(&dentry->d_lock);
-	if (!read_seqcount_retry(&dentry->d_seq, seq)) {
-		ret = 1;
-		dentry->d_lockref.count++;
-	}
-
-	return ret;
-}
-
 static inline unsigned d_count(const struct dentry *dentry)
 {
 	return dentry->d_lockref.count;
diff --git a/include/linux/lockref.h b/include/linux/lockref.h
index 01233e01627a..ca07b5028b01 100644
--- a/include/linux/lockref.h
+++ b/include/linux/lockref.h
@@ -17,55 +17,20 @@
 #include <linux/spinlock.h>
 
 struct lockref {
-	spinlock_t lock;
-	unsigned int count;
+	union {
+#ifdef CONFIG_CMPXCHG_LOCKREF
+		aligned_u64 lock_count;
+#endif
+		struct {
+			spinlock_t lock;
+			unsigned int count;
+		};
+	};
 };
 
-/**
- * lockref_get - Increments reference count unconditionally
- * @lockcnt: pointer to lockref structure
- *
- * This operation is only valid if you already hold a reference
- * to the object, so you know the count cannot be zero.
- */
-static inline void lockref_get(struct lockref *lockref)
-{
-	spin_lock(&lockref->lock);
-	lockref->count++;
-	spin_unlock(&lockref->lock);
-}
-
-/**
- * lockref_get_not_zero - Increments count unless the count is 0
- * @lockcnt: pointer to lockref structure
- * Return: 1 if count updated successfully or 0 if count is 0
- */
-static inline int lockref_get_not_zero(struct lockref *lockref)
-{
-	int retval = 0;
-
-	spin_lock(&lockref->lock);
-	if (lockref->count) {
-		lockref->count++;
-		retval = 1;
-	}
-	spin_unlock(&lockref->lock);
-	return retval;
-}
-
-/**
- * lockref_put_or_lock - decrements count unless count <= 1 before decrement
- * @lockcnt: pointer to lockref structure
- * Return: 1 if count updated successfully or 0 if count <= 1 and lock taken
- */
-static inline int lockref_put_or_lock(struct lockref *lockref)
-{
-	spin_lock(&lockref->lock);
-	if (lockref->count <= 1)
-		return 0;
-	lockref->count--;
-	spin_unlock(&lockref->lock);
-	return 1;
-}
+extern void lockref_get(struct lockref *);
+extern int lockref_get_not_zero(struct lockref *);
+extern int lockref_get_or_lock(struct lockref *);
+extern int lockref_put_or_lock(struct lockref *);
 
 #endif /* __LINUX_LOCKREF_H */
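[Editorial note] With the helpers moved out of line and the lock/count pair sharing one 64-bit word, a lockref user just embeds the structure and calls the four exported functions; only the slow paths ever touch the embedded spinlock directly. A sketch of a hypothetical user - struct thing, thing_alloc() and thing_put() are invented names, while the lockref calls are the real API declared above:

	#include <linux/lockref.h>
	#include <linux/slab.h>

	struct thing {
		struct lockref	ref;
		/* ... payload protected by ref.lock ... */
	};

	static struct thing *thing_alloc(void)
	{
		struct thing *t = kzalloc(sizeof(*t), GFP_KERNEL);

		if (t) {
			spin_lock_init(&t->ref.lock);
			t->ref.count = 1;	/* caller's initial reference */
		}
		return t;
	}

	static void thing_put(struct thing *t)
	{
		if (lockref_put_or_lock(&t->ref))
			return;			/* fast path: count was > 1 */

		/* Slow path: count would hit zero; we now hold t->ref.lock. */
		t->ref.count--;
		spin_unlock(&t->ref.lock);
		kfree(t);	/* assumes no RCU walkers for this toy example */
	}

The dcache code uses the same shape in dput(): try lockref_put_or_lock() first, and only fall into the locked teardown path when the count would actually reach zero.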
diff --git a/lib/Kconfig b/lib/Kconfig
index 71d9f81f6eed..65561716c16c 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -48,6 +48,16 @@ config STMP_DEVICE
48config PERCPU_RWSEM 48config PERCPU_RWSEM
49 boolean 49 boolean
50 50
51config ARCH_USE_CMPXCHG_LOCKREF
52 bool
53
54config CMPXCHG_LOCKREF
55 def_bool y if ARCH_USE_CMPXCHG_LOCKREF
56 depends on SMP
57 depends on !GENERIC_LOCKBREAK
58 depends on !DEBUG_SPINLOCK
59 depends on !DEBUG_LOCK_ALLOC
60
51config CRC_CCITT 61config CRC_CCITT
52 tristate "CRC-CCITT functions" 62 tristate "CRC-CCITT functions"
53 help 63 help
diff --git a/lib/Makefile b/lib/Makefile
index 7baccfd8a4e9..f2cb3082697c 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -20,6 +20,7 @@ lib-$(CONFIG_MMU) += ioremap.o
20lib-$(CONFIG_SMP) += cpumask.o 20lib-$(CONFIG_SMP) += cpumask.o
21 21
22lib-y += kobject.o klist.o 22lib-y += kobject.o klist.o
23obj-y += lockref.o
23 24
24obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ 25obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
25 bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ 26 bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
diff --git a/lib/lockref.c b/lib/lockref.c
new file mode 100644
index 000000000000..7819c2d1d315
--- /dev/null
+++ b/lib/lockref.c
@@ -0,0 +1,127 @@
+#include <linux/export.h>
+#include <linux/lockref.h>
+
+#ifdef CONFIG_CMPXCHG_LOCKREF
+
+/*
+ * Note that the "cmpxchg()" reloads the "old" value for the
+ * failure case.
+ */
+#define CMPXCHG_LOOP(CODE, SUCCESS) do {					\
+	struct lockref old;							\
+	BUILD_BUG_ON(sizeof(old) != 8);						\
+	old.lock_count = ACCESS_ONCE(lockref->lock_count);			\
+	while (likely(arch_spin_value_unlocked(old.lock.rlock.raw_lock))) {	\
+		struct lockref new = old, prev = old;				\
+		CODE								\
+		old.lock_count = cmpxchg(&lockref->lock_count,			\
+					 old.lock_count, new.lock_count);	\
+		if (likely(old.lock_count == prev.lock_count)) {		\
+			SUCCESS;						\
+		}								\
+	}									\
+} while (0)
+
+#else
+
+#define CMPXCHG_LOOP(CODE, SUCCESS) do { } while (0)
+
+#endif
+
+/**
+ * lockref_get - Increments reference count unconditionally
+ * @lockcnt: pointer to lockref structure
+ *
+ * This operation is only valid if you already hold a reference
+ * to the object, so you know the count cannot be zero.
+ */
+void lockref_get(struct lockref *lockref)
+{
+	CMPXCHG_LOOP(
+		new.count++;
+	,
+		return;
+	);
+
+	spin_lock(&lockref->lock);
+	lockref->count++;
+	spin_unlock(&lockref->lock);
+}
+EXPORT_SYMBOL(lockref_get);
+
+/**
+ * lockref_get_not_zero - Increments count unless the count is 0
+ * @lockcnt: pointer to lockref structure
+ * Return: 1 if count updated successfully or 0 if count was zero
+ */
+int lockref_get_not_zero(struct lockref *lockref)
+{
+	int retval;
+
+	CMPXCHG_LOOP(
+		new.count++;
+		if (!old.count)
+			return 0;
+	,
+		return 1;
+	);
+
+	spin_lock(&lockref->lock);
+	retval = 0;
+	if (lockref->count) {
+		lockref->count++;
+		retval = 1;
+	}
+	spin_unlock(&lockref->lock);
+	return retval;
+}
+EXPORT_SYMBOL(lockref_get_not_zero);
+
+/**
+ * lockref_get_or_lock - Increments count unless the count is 0
+ * @lockcnt: pointer to lockref structure
+ * Return: 1 if count updated successfully or 0 if count was zero
+ * and we got the lock instead.
+ */
+int lockref_get_or_lock(struct lockref *lockref)
+{
+	CMPXCHG_LOOP(
+		new.count++;
+		if (!old.count)
+			break;
+	,
+		return 1;
+	);
+
+	spin_lock(&lockref->lock);
+	if (!lockref->count)
+		return 0;
+	lockref->count++;
+	spin_unlock(&lockref->lock);
+	return 1;
+}
+EXPORT_SYMBOL(lockref_get_or_lock);
+
+/**
+ * lockref_put_or_lock - decrements count unless count <= 1 before decrement
+ * @lockcnt: pointer to lockref structure
+ * Return: 1 if count updated successfully or 0 if count <= 1 and lock taken
+ */
+int lockref_put_or_lock(struct lockref *lockref)
+{
+	CMPXCHG_LOOP(
+		new.count--;
+		if (old.count <= 1)
+			break;
+	,
+		return 1;
+	);
+
+	spin_lock(&lockref->lock);
+	if (lockref->count <= 1)
+		return 0;
+	lockref->count--;
+	spin_unlock(&lockref->lock);
+	return 1;
+}
+EXPORT_SYMBOL(lockref_put_or_lock);
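[Editorial note] To make the macro less opaque, here is roughly what the CMPXCHG_LOOP() invocation in lockref_get_not_zero() comes out to on a CONFIG_CMPXCHG_LOCKREF kernel - hand-expanded and lightly reformatted, so treat it as an illustrative sketch rather than exact preprocessor output:

	int lockref_get_not_zero(struct lockref *lockref)
	{
		int retval;

		/* CMPXCHG_LOOP(new.count++; if (!old.count) return 0;, return 1;) */
		do {
			struct lockref old;
			BUILD_BUG_ON(sizeof(old) != 8);
			old.lock_count = ACCESS_ONCE(lockref->lock_count);
			while (likely(arch_spin_value_unlocked(old.lock.rlock.raw_lock))) {
				struct lockref new = old, prev = old;
				new.count++;			/* CODE */
				if (!old.count)
					return 0;
				old.lock_count = cmpxchg(&lockref->lock_count,
							 old.lock_count, new.lock_count);
				if (likely(old.lock_count == prev.lock_count))
					return 1;		/* SUCCESS */
				/* cmpxchg failed: 'old' now holds the fresh value, retry */
			}
		} while (0);

		/* Fall back to the spinlock when the lock is held (or without cmpxchg support). */
		spin_lock(&lockref->lock);
		retval = 0;
		if (lockref->count) {
			lockref->count++;
			retval = 1;
		}
		spin_unlock(&lockref->lock);
		return retval;
	}

The key property is that the loop only runs while the snapshot of the lock word looks unlocked; the moment someone actually holds the spinlock, every caller drops down to the plain spin_lock() path, so the lock's mutual exclusion over the count is preserved.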