Btrfs: Change btree locking to use explicit blocking points

Most of the btrfs metadata operations can be protected by a spinlock, but some operations still need to schedule.

So far, btrfs has been using a mutex along with a trylock loop. Most of the time it is able to avoid going for the full mutex, so the trylock loop is a big performance gain.

This commit is step one for getting rid of the blocking locks entirely. btrfs_tree_lock takes a spinlock, and the code explicitly switches to a blocking lock when it starts an operation that can schedule.

We'll be able to get rid of the blocking locks in smaller pieces over time. Tracing allows us to find the most common cause of blocking, so we can start with the hot spots first.

The basic idea is:

btrfs_tree_lock() returns with the spin lock held.

btrfs_set_lock_blocking() sets the EXTENT_BUFFER_BLOCKING bit in the extent buffer flags, and then drops the spin lock. The buffer is still considered locked by all of the btrfs code.

If btrfs_tree_lock gets the spinlock but finds the blocking bit set, it drops the spin lock and waits on a wait queue for the blocking bit to go away.

Much of the code that needs to set the blocking bit finishes without actually blocking a good percentage of the time. So, an adaptive spin is still used against the blocking bit to avoid very high context switch rates.

btrfs_clear_lock_blocking() clears the blocking bit and returns with the spinlock held again.

btrfs_tree_unlock() can be called on either blocking or spinning locks; it does the right thing based on the blocking bit.

ctree.c has a helper function to set/clear all the locked buffers in a path as blocking.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
author: Chris Mason <chris.mason@oracle.com> 2009-02-04 09:25:08 -0500
committer: Chris Mason <chris.mason@oracle.com> 2009-02-04 09:25:08 -0500
commit: b4ce94de9b4d64e8ab3cf155d13653c666e22b9b (patch)
tree: ebc44a9554a50b495b091cb0979d79fd29e50fe7 /fs/btrfs/locking.c
parent: c487685d7c18a8481900755aa5c56a7a74193101 (diff)
1 files changed, 190 insertions, 18 deletions
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 39bae7761db6..68fd9ccf1805 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -26,45 +26,215 @@
 #include "locking.h"
 /*
- * locks the per buffer mutex in an extent buffer.  This uses adaptive locks
+ * btrfs_header_level() isn't free, so don't call it when lockdep isn't
- * and the spin is not tuned very extensively.  The spinning does make a big
+ * on
- * difference in almost every workload, but spinning for the right amount of
- * time needs some help.
- *
- * In general, we want to spin as long as the lock holder is doing btree
- * searches, and we should give up if they are in more expensive code.
 */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static inline void spin_nested(struct extent_buffer *eb)
+{
+        spin_lock_nested(&eb->lock, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
+}
+#else
+static inline void spin_nested(struct extent_buffer *eb)
+{
+        spin_lock(&eb->lock);
+}
+#endif
-int btrfs_tree_lock(struct extent_buffer *eb)
+/*
+ * Setting a lock to blocking will drop the spinlock and set the
+ * flag that forces other procs who want the lock to wait.  After
+ * this you can safely schedule with the lock held.
+ */
+void btrfs_set_lock_blocking(struct extent_buffer *eb)
 {
-        int i;
+        if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+                set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
+                spin_unlock(&eb->lock);
+        }
+        /* exit with the spin lock released and the bit set */
+}
-        if (mutex_trylock(&eb->mutex))
+/*
-                return 0;
+ * clearing the blocking flag will take the spinlock again.
+ * After this you can't safely schedule
+ */
+void btrfs_clear_lock_blocking(struct extent_buffer *eb)
+{
+        if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+                spin_nested(eb);
+                clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
+                smp_mb__after_clear_bit();
+        }
+        /* exit with the spin lock held */
+}
+/*
+ * unfortunately, many of the places that currently set a lock to blocking
+ * don't end up blocking for every long, and often they don't block
+ * at all.  For a dbench 50 run, if we don't spin one the blocking bit
+ * at all, the context switch rate can jump up to 400,000/sec or more.
+ *
+ * So, we're still stuck with this crummy spin on the blocking bit,
+ * at least until the most common causes of the short blocks
+ * can be dealt with.
+ */
+static int btrfs_spin_on_block(struct extent_buffer *eb)
+{
+        int i;
        for (i = 0; i < 512; i++) {
                cpu_relax();
-                if (mutex_trylock(&eb->mutex))
+                if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+                        return 1;
+                if (need_resched())
+                        break;
+        }
+        return 0;
+}
+/*
+ * This is somewhat different from trylock.  It will take the
+ * spinlock but if it finds the lock is set to blocking, it will
+ * return without the lock held.
+ *
+ * returns 1 if it was able to take the lock and zero otherwise
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
+ */
+int btrfs_try_spin_lock(struct extent_buffer *eb)
+{
+        int i;
+        spin_nested(eb);
+        if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+                return 1;
+        spin_unlock(&eb->lock);
+        /* spin for a bit on the BLOCKING flag */
+        for (i = 0; i < 2; i++) {
+                if (!btrfs_spin_on_block(eb))
+                        break;
+                spin_nested(eb);
+                if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+                        return 1;
+                spin_unlock(&eb->lock);
+        }
+        return 0;
+}
+/*
+ * the autoremove wake function will return 0 if it tried to wake up
+ * a process that was already awake, which means that process won't
+ * count as an exclusive wakeup.  The waitq code will continue waking
+ * procs until it finds one that was actually sleeping.
+ *
+ * For btrfs, this isn't quite what we want.  We want a single proc
+ * to be notified that the lock is ready for taking.  If that proc
+ * already happen to be awake, great, it will loop around and try for
+ * the lock.
+ *
+ * So, btrfs_wake_function always returns 1, even when the proc that we
+ * tried to wake up was already awake.
+ */
+static int btrfs_wake_function(wait_queue_t *wait, unsigned mode,
+                               int sync, void *key)
+{
+        autoremove_wake_function(wait, mode, sync, key);
+        return 1;
+}
+/*
+ * returns with the extent buffer spinlocked.
+ *
+ * This will spin and/or wait as required to take the lock, and then
+ * return with the spinlock held.
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
+ */
+int btrfs_tree_lock(struct extent_buffer *eb)
+{
+        DEFINE_WAIT(wait);
+        wait.func = btrfs_wake_function;
+        while(1) {
+                spin_nested(eb);
+                /* nobody is blocking, exit with the spinlock held */
+                if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
                        return 0;
+                /*
+                 * we have the spinlock, but the real owner is blocking.
+                 * wait for them
+                 */
+                spin_unlock(&eb->lock);
+                /*
+                 * spin for a bit, and if the blocking flag goes away,
+                 * loop around
+                 */
+                if (btrfs_spin_on_block(eb))
+                        continue;
+                prepare_to_wait_exclusive(&eb->lock_wq, &wait,
+                                          TASK_UNINTERRUPTIBLE);
+                if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+                        schedule();
+                finish_wait(&eb->lock_wq, &wait);
        }
-        cpu_relax();
-        mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
        return 0;
 }
+/*
+ * Very quick trylock, this does not spin or schedule.  It returns
+ * 1 with the spinlock held if it was able to take the lock, or it
+ * returns zero if it was unable to take the lock.
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
+ */
 int btrfs_try_tree_lock(struct extent_buffer *eb)
 {
-        return mutex_trylock(&eb->mutex);
+        if (spin_trylock(&eb->lock)) {
+                if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+                        /*
+                         * we've got the spinlock, but the real owner is
+                         * blocking.  Drop the spinlock and return failure
+                         */
+                        spin_unlock(&eb->lock);
+                        return 0;
+                }
+                return 1;
+        }
+        /* someone else has the spinlock giveup */
+        return 0;
 }
 int btrfs_tree_unlock(struct extent_buffer *eb)
 {
-        mutex_unlock(&eb->mutex);
+        /*
+         * if we were a blocking owner, we don't have the spinlock held
+         * just clear the bit and look for waiters
+         */
+        if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+                smp_mb__after_clear_bit();
+        else
+                spin_unlock(&eb->lock);
+        if (waitqueue_active(&eb->lock_wq))
+                wake_up(&eb->lock_wq);
        return 0;
 }
 int btrfs_tree_locked(struct extent_buffer *eb)
 {
-        return mutex_is_locked(&eb->mutex);
+        return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) ||
+                        spin_is_locked(&eb->lock);
 }
 /*
@@ -75,12 +245,14 @@ int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
 {
        int i;
        struct extent_buffer *eb;
        for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) {
                eb = path->nodes[i];
                if (!eb)
                        break;
                smp_mb();
-                if (!list_empty(&eb->mutex.wait_list))
+                if (spin_is_contended(&eb->lock) ||
+                    waitqueue_active(&eb->lock_wq))
                        return 1;
        }
        return 0;
author	Chris Mason <chris.mason@oracle.com>	2009-02-04 09:25:08 -0500
committer	Chris Mason <chris.mason@oracle.com>	2009-02-04 09:25:08 -0500
commit	b4ce94de9b4d64e8ab3cf155d13653c666e22b9b (patch)
tree	ebc44a9554a50b495b091cb0979d79fd29e50fe7 /fs/btrfs/locking.c
parent	c487685d7c18a8481900755aa5c56a7a74193101 (diff)

diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 39bae7761db6..68fd9ccf1805 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -26,45 +26,215 @@
26	#include "locking.h"	26	#include "locking.h"
27		27
28	/*	28	/*
29	* locks the per buffer mutex in an extent buffer. This uses adaptive locks	29	* btrfs_header_level() isn't free, so don't call it when lockdep isn't
30	* and the spin is not tuned very extensively. The spinning does make a big	30	* on
31	* difference in almost every workload, but spinning for the right amount of
32	* time needs some help.
33	*
34	* In general, we want to spin as long as the lock holder is doing btree
35	* searches, and we should give up if they are in more expensive code.
36	*/	31	*/
		32	#ifdef CONFIG_DEBUG_LOCK_ALLOC
		33	static inline void spin_nested(struct extent_buffer *eb)
		34	{
		35	spin_lock_nested(&eb->lock, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
		36	}
		37	#else
		38	static inline void spin_nested(struct extent_buffer *eb)
		39	{
		40	spin_lock(&eb->lock);
		41	}
		42	#endif
37		43
38	int btrfs_tree_lock(struct extent_buffer *eb)	44	/*
		45	* Setting a lock to blocking will drop the spinlock and set the
		46	* flag that forces other procs who want the lock to wait. After
		47	* this you can safely schedule with the lock held.
		48	*/
		49	void btrfs_set_lock_blocking(struct extent_buffer *eb)
39	{	50	{
40	int i;	51	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
		52	set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
		53	spin_unlock(&eb->lock);
		54	}
		55	/* exit with the spin lock released and the bit set */
		56	}
41		57
42	if (mutex_trylock(&eb->mutex))	58	/*
43	return 0;	59	* clearing the blocking flag will take the spinlock again.
		60	* After this you can't safely schedule
		61	*/
		62	void btrfs_clear_lock_blocking(struct extent_buffer *eb)
		63	{
		64	if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
		65	spin_nested(eb);
		66	clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
		67	smp_mb__after_clear_bit();
		68	}
		69	/* exit with the spin lock held */
		70	}
		71
		72	/*
		73	* unfortunately, many of the places that currently set a lock to blocking
		74	* don't end up blocking for every long, and often they don't block
		75	* at all. For a dbench 50 run, if we don't spin one the blocking bit
		76	* at all, the context switch rate can jump up to 400,000/sec or more.
		77	*
		78	* So, we're still stuck with this crummy spin on the blocking bit,
		79	* at least until the most common causes of the short blocks
		80	* can be dealt with.
		81	*/
		82	static int btrfs_spin_on_block(struct extent_buffer *eb)
		83	{
		84	int i;
44	for (i = 0; i < 512; i++) {	85	for (i = 0; i < 512; i++) {
45	cpu_relax();	86	cpu_relax();
46	if (mutex_trylock(&eb->mutex))	87	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
		88	return 1;
		89	if (need_resched())
		90	break;
		91	}
		92	return 0;
		93	}
		94
		95	/*
		96	* This is somewhat different from trylock. It will take the
		97	* spinlock but if it finds the lock is set to blocking, it will
		98	* return without the lock held.
		99	*
		100	* returns 1 if it was able to take the lock and zero otherwise
		101	*
		102	* After this call, scheduling is not safe without first calling
		103	* btrfs_set_lock_blocking()
		104	*/
		105	int btrfs_try_spin_lock(struct extent_buffer *eb)
		106	{
		107	int i;
		108
		109	spin_nested(eb);
		110	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
		111	return 1;
		112	spin_unlock(&eb->lock);
		113
		114	/* spin for a bit on the BLOCKING flag */
		115	for (i = 0; i < 2; i++) {
		116	if (!btrfs_spin_on_block(eb))
		117	break;
		118
		119	spin_nested(eb);
		120	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
		121	return 1;
		122	spin_unlock(&eb->lock);
		123	}
		124	return 0;
		125	}
		126
		127	/*
		128	* the autoremove wake function will return 0 if it tried to wake up
		129	* a process that was already awake, which means that process won't
		130	* count as an exclusive wakeup. The waitq code will continue waking
		131	* procs until it finds one that was actually sleeping.
		132	*
		133	* For btrfs, this isn't quite what we want. We want a single proc
		134	* to be notified that the lock is ready for taking. If that proc
		135	* already happen to be awake, great, it will loop around and try for
		136	* the lock.
		137	*
		138	* So, btrfs_wake_function always returns 1, even when the proc that we
		139	* tried to wake up was already awake.
		140	*/
		141	static int btrfs_wake_function(wait_queue_t *wait, unsigned mode,
		142	int sync, void *key)
		143	{
		144	autoremove_wake_function(wait, mode, sync, key);
		145	return 1;
		146	}
		147
		148	/*
		149	* returns with the extent buffer spinlocked.
		150	*
		151	* This will spin and/or wait as required to take the lock, and then
		152	* return with the spinlock held.
		153	*
		154	* After this call, scheduling is not safe without first calling
		155	* btrfs_set_lock_blocking()
		156	*/
		157	int btrfs_tree_lock(struct extent_buffer *eb)
		158	{
		159	DEFINE_WAIT(wait);
		160	wait.func = btrfs_wake_function;
		161
		162	while(1) {
		163	spin_nested(eb);
		164
		165	/* nobody is blocking, exit with the spinlock held */
		166	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
47	return 0;	167	return 0;
		168
		169	/*
		170	* we have the spinlock, but the real owner is blocking.
		171	* wait for them
		172	*/
		173	spin_unlock(&eb->lock);
		174
		175	/*
		176	* spin for a bit, and if the blocking flag goes away,
		177	* loop around
		178	*/
		179	if (btrfs_spin_on_block(eb))
		180	continue;
		181
		182	prepare_to_wait_exclusive(&eb->lock_wq, &wait,
		183	TASK_UNINTERRUPTIBLE);
		184
		185	if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
		186	schedule();
		187
		188	finish_wait(&eb->lock_wq, &wait);
48	}	189	}
49	cpu_relax();
50	mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
51	return 0;	190	return 0;
52	}	191	}
53		192
		193	/*
		194	* Very quick trylock, this does not spin or schedule. It returns
		195	* 1 with the spinlock held if it was able to take the lock, or it
		196	* returns zero if it was unable to take the lock.
		197	*
		198	* After this call, scheduling is not safe without first calling
		199	* btrfs_set_lock_blocking()
		200	*/
54	int btrfs_try_tree_lock(struct extent_buffer *eb)	201	int btrfs_try_tree_lock(struct extent_buffer *eb)
55	{	202	{
56	return mutex_trylock(&eb->mutex);	203	if (spin_trylock(&eb->lock)) {
		204	if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
		205	/*
		206	* we've got the spinlock, but the real owner is
		207	* blocking. Drop the spinlock and return failure
		208	*/
		209	spin_unlock(&eb->lock);
		210	return 0;
		211	}
		212	return 1;
		213	}
		214	/* someone else has the spinlock giveup */
		215	return 0;
57	}	216	}
58		217
59	int btrfs_tree_unlock(struct extent_buffer *eb)	218	int btrfs_tree_unlock(struct extent_buffer *eb)
60	{	219	{
61	mutex_unlock(&eb->mutex);	220	/*
		221	* if we were a blocking owner, we don't have the spinlock held
		222	* just clear the bit and look for waiters
		223	*/
		224	if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
		225	smp_mb__after_clear_bit();
		226	else
		227	spin_unlock(&eb->lock);
		228
		229	if (waitqueue_active(&eb->lock_wq))
		230	wake_up(&eb->lock_wq);
62	return 0;	231	return 0;
63	}	232	}
64		233
65	int btrfs_tree_locked(struct extent_buffer *eb)	234	int btrfs_tree_locked(struct extent_buffer *eb)
66	{	235	{
67	return mutex_is_locked(&eb->mutex);	236	return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) ||
		237	spin_is_locked(&eb->lock);
68	}	238	}
69		239
70	/*	240	/*
@@ -75,12 +245,14 @@ int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
75	{	245	{
76	int i;	246	int i;
77	struct extent_buffer *eb;	247	struct extent_buffer *eb;
		248
78	for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) {	249	for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) {
79	eb = path->nodes[i];	250	eb = path->nodes[i];
80	if (!eb)	251	if (!eb)
81	break;	252	break;
82	smp_mb();	253	smp_mb();
83	if (!list_empty(&eb->mutex.wait_list))	254	if (spin_is_contended(&eb->lock) ||
		255	waitqueue_active(&eb->lock_wq))
84	return 1;	256	return 1;
85	}	257	}
86	return 0;	258	return 0;