Diffstat (limited to 'fs/btrfs/locking.c')

 -rw-r--r--  fs/btrfs/locking.c | 208
 1 file changed, 190 insertions, 18 deletions
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 39bae7761db6..68fd9ccf1805 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -26,45 +26,215 @@
 #include "locking.h"
 
 /*
- * locks the per buffer mutex in an extent buffer. This uses adaptive locks
- * and the spin is not tuned very extensively. The spinning does make a big
- * difference in almost every workload, but spinning for the right amount of
- * time needs some help.
- *
- * In general, we want to spin as long as the lock holder is doing btree
- * searches, and we should give up if they are in more expensive code.
+ * btrfs_header_level() isn't free, so don't call it when lockdep isn't
+ * on
  */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static inline void spin_nested(struct extent_buffer *eb)
+{
+	spin_lock_nested(&eb->lock, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
+}
+#else
+static inline void spin_nested(struct extent_buffer *eb)
+{
+	spin_lock(&eb->lock);
+}
+#endif
 
-int btrfs_tree_lock(struct extent_buffer *eb)
+/*
+ * Setting a lock to blocking will drop the spinlock and set the
+ * flag that forces other procs who want the lock to wait. After
+ * this you can safely schedule with the lock held.
+ */
+void btrfs_set_lock_blocking(struct extent_buffer *eb)
 {
-	int i;
+	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+		set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
+		spin_unlock(&eb->lock);
+	}
+	/* exit with the spin lock released and the bit set */
+}
 
-	if (mutex_trylock(&eb->mutex))
-		return 0;
+/*
+ * clearing the blocking flag will take the spinlock again.
+ * After this you can't safely schedule
+ */
+void btrfs_clear_lock_blocking(struct extent_buffer *eb)
+{
+	if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+		spin_nested(eb);
+		clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
+		smp_mb__after_clear_bit();
+	}
+	/* exit with the spin lock held */
+}
+
+/*
+ * unfortunately, many of the places that currently set a lock to blocking
+ * don't end up blocking for very long, and often they don't block
+ * at all. For a dbench 50 run, if we don't spin on the blocking bit
+ * at all, the context switch rate can jump up to 400,000/sec or more.
+ *
+ * So, we're still stuck with this crummy spin on the blocking bit,
+ * at least until the most common causes of the short blocks
+ * can be dealt with.
+ */
+static int btrfs_spin_on_block(struct extent_buffer *eb)
+{
+	int i;
 	for (i = 0; i < 512; i++) {
 		cpu_relax();
-		if (mutex_trylock(&eb->mutex))
+		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+			return 1;
+		if (need_resched())
+			break;
+	}
+	return 0;
+}
+
+/*
+ * This is somewhat different from trylock. It will take the
+ * spinlock but if it finds the lock is set to blocking, it will
+ * return without the lock held.
+ *
+ * returns 1 if it was able to take the lock and zero otherwise
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
+ */
+int btrfs_try_spin_lock(struct extent_buffer *eb)
+{
+	int i;
+
+	spin_nested(eb);
+	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+		return 1;
+	spin_unlock(&eb->lock);
+
+	/* spin for a bit on the BLOCKING flag */
+	for (i = 0; i < 2; i++) {
+		if (!btrfs_spin_on_block(eb))
+			break;
+
+		spin_nested(eb);
+		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+			return 1;
+		spin_unlock(&eb->lock);
+	}
+	return 0;
+}
+
+/*
+ * the autoremove wake function will return 0 if it tried to wake up
+ * a process that was already awake, which means that process won't
+ * count as an exclusive wakeup. The waitq code will continue waking
+ * procs until it finds one that was actually sleeping.
+ *
+ * For btrfs, this isn't quite what we want. We want a single proc
+ * to be notified that the lock is ready for taking. If that proc
+ * already happens to be awake, great, it will loop around and try for
+ * the lock.
+ *
+ * So, btrfs_wake_function always returns 1, even when the proc that we
+ * tried to wake up was already awake.
+ */
+static int btrfs_wake_function(wait_queue_t *wait, unsigned mode,
+			       int sync, void *key)
+{
+	autoremove_wake_function(wait, mode, sync, key);
+	return 1;
+}
+
+/*
+ * returns with the extent buffer spinlocked.
+ *
+ * This will spin and/or wait as required to take the lock, and then
+ * return with the spinlock held.
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
+ */
+int btrfs_tree_lock(struct extent_buffer *eb)
+{
+	DEFINE_WAIT(wait);
+	wait.func = btrfs_wake_function;
+
+	while(1) {
+		spin_nested(eb);
+
+		/* nobody is blocking, exit with the spinlock held */
+		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
 			return 0;
+
+		/*
+		 * we have the spinlock, but the real owner is blocking.
+		 * wait for them
+		 */
+		spin_unlock(&eb->lock);
+
+		/*
+		 * spin for a bit, and if the blocking flag goes away,
+		 * loop around
+		 */
+		if (btrfs_spin_on_block(eb))
+			continue;
+
+		prepare_to_wait_exclusive(&eb->lock_wq, &wait,
+					  TASK_UNINTERRUPTIBLE);
+
+		if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+			schedule();
+
+		finish_wait(&eb->lock_wq, &wait);
 	}
-	cpu_relax();
-	mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
 	return 0;
 }
 
+/*
+ * Very quick trylock, this does not spin or schedule. It returns
+ * 1 with the spinlock held if it was able to take the lock, or it
+ * returns zero if it was unable to take the lock.
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
+ */
 int btrfs_try_tree_lock(struct extent_buffer *eb)
 {
-	return mutex_trylock(&eb->mutex);
+	if (spin_trylock(&eb->lock)) {
+		if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+			/*
+			 * we've got the spinlock, but the real owner is
+			 * blocking. Drop the spinlock and return failure
+			 */
+			spin_unlock(&eb->lock);
+			return 0;
+		}
+		return 1;
+	}
+	/* someone else has the spinlock, give up */
+	return 0;
 }
 
 int btrfs_tree_unlock(struct extent_buffer *eb)
 {
-	mutex_unlock(&eb->mutex);
+	/*
+	 * if we were a blocking owner, we don't have the spinlock held,
+	 * just clear the bit and look for waiters
+	 */
+	if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+		smp_mb__after_clear_bit();
+	else
+		spin_unlock(&eb->lock);
+
+	if (waitqueue_active(&eb->lock_wq))
+		wake_up(&eb->lock_wq);
 	return 0;
 }
 
 int btrfs_tree_locked(struct extent_buffer *eb)
 {
-	return mutex_is_locked(&eb->mutex);
+	return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) ||
+			spin_is_locked(&eb->lock);
 }
 
 /*
@@ -75,12 +245,14 @@ int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
 {
 	int i;
 	struct extent_buffer *eb;
+
 	for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) {
 		eb = path->nodes[i];
 		if (!eb)
 			break;
 		smp_mb();
-		if (!list_empty(&eb->mutex.wait_list))
+		if (spin_is_contended(&eb->lock) ||
+		    waitqueue_active(&eb->lock_wq))
 			return 1;
 	}
 	return 0;
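
The API above splits btree locking into a spinning phase and an explicit blocking
phase. As a rough, hypothetical caller sketch (not part of this commit; the helper
do_something_that_may_sleep() is only a placeholder for any step that can schedule,
such as reading a child block from disk), the expected pattern looks like this:

/*
 * Illustrative only: take the lock as a spinlock, switch it to blocking
 * mode around work that may sleep, then switch back and unlock.
 */
static int example_walk_node(struct extent_buffer *eb)
{
	int ret;

	/* returns with eb->lock held as a spinlock; sleeping is not allowed */
	btrfs_tree_lock(eb);

	/*
	 * About to schedule: drop the spinlock but keep logical ownership
	 * by setting EXTENT_BUFFER_BLOCKING, so other tasks wait on lock_wq
	 * instead of spinning on eb->lock.
	 */
	btrfs_set_lock_blocking(eb);

	ret = do_something_that_may_sleep(eb);	/* hypothetical placeholder */

	/* retake the spinlock; scheduling is unsafe again after this */
	btrfs_clear_lock_blocking(eb);

	btrfs_tree_unlock(eb);
	return ret;
}

Note that btrfs_tree_unlock() works in either mode: it clears the blocking bit if
it is set, otherwise it drops the spinlock, and then wakes a waiter on lock_wq.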

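On the lockdep side, spin_nested() passes BTRFS_MAX_LEVEL - btrfs_header_level(eb)
as the subclass (just as the old code did for mutex_lock_nested()) because a btree
descent holds a parent's lock while taking a child's; with a single class, lockdep
would flag that as recursive locking. A minimal, hypothetical sketch of the pattern
(not from this commit):

/*
 * Illustrative only: parent and child sit at different tree levels, so
 * their locks land in different lockdep subclasses and holding both at
 * once does not trigger a false recursive-locking report.
 */
static void example_lock_parent_and_child(struct extent_buffer *parent,
					  struct extent_buffer *child)
{
	btrfs_tree_lock(parent);	/* subclass BTRFS_MAX_LEVEL - level(parent) */
	btrfs_tree_lock(child);		/* one level lower, different subclass */

	/* ... search or rebalance using both nodes ... */

	btrfs_tree_unlock(child);
	btrfs_tree_unlock(parent);
}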