author    NeilBrown <neilb@suse.com>  2018-11-29 18:04:08 -0500
committer Jeff Layton <jlayton@kernel.org>  2018-12-07 06:49:24 -0500
commit    fd7732e033e30b3a586923b57e338c859e17858a (patch)
tree      da099dfbe65b8fe9f5251b4fab2152c454373172 /fs
parent    c0e15908979d269a8263b0c0a222b894b9f403e9 (diff)
fs/locks: create a tree of dependent requests.
When we find an existing lock which conflicts with a request, and the
request wants to wait, we currently add the request to a list.  When the
lock is removed, the whole list is woken.  This can cause the
thundering-herd problem.

To reduce the problem, we make use of the (new) fact that a pending
request can itself have a list of blocked requests.  When we find a
conflict, we look through the existing blocked requests.  If any one of
them blocks the new request, the new request is attached below that
request; otherwise it is added to the list of blocked requests, which
are now known to be mutually non-conflicting.

This way, when the lock is released, only a set of non-conflicting
requests will be woken; the rest can stay asleep.  If the lock request
cannot be granted and needs to be requeued, all the other requests it
blocks will then be woken.

To make this more concrete: if you have a many-core machine, and have
many threads all wanting to briefly lock a given file (udev is known to
do this), you can get quite poor performance.  When one thread releases
a lock, it wakes up all other threads that are waiting (classic
thundering herd) - one will get the lock and the others go back to
sleep.  When you have few cores, this is not very noticeable: by the
time the 4th or 5th thread gets enough CPU time to try to claim the
lock, the earlier threads have claimed it, done what was needed, and
released.  So with few cores, many of the threads don't end up
contending.  With 50+ cores, lots of threads can get the CPU at the
same time, and the contention can easily be measured.

This patchset creates a tree of pending lock requests in which siblings
don't conflict and each lock request does conflict with its parent.
When a lock is released, only requests which don't conflict with each
other are woken.

Testing shows that lock acquisitions per second are now fairly stable
even as the number of contending processes goes to 1000.  Without this
patch, locks per second drop off steeply after a few tens of processes.

There is a small cost to this extra complexity.  At 20 processes
running a particular test on 72 cores, lock acquisitions per second
drop from 1.8 million to 1.4 million with this patch.  For 100
processes, this patch still provides 1.4 million, while without it
there are about 700,000.

Reported-and-tested-by: Martin Wilck <mwilck@suse.de>
Signed-off-by: NeilBrown <neilb@suse.com>
Reviewed-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
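The insertion rule is easy to model outside the kernel.  Below is a
minimal user-space sketch in C of the descent loop that the patch adds
to __locks_insert_block() (see the diff below), assuming a simplified
request type with only a byte range and an exclusive flag; the names
(struct req, req_conflicts, insert_blocked) are illustrative and are
not the kernel's file_lock API:

/*
 * Minimal user-space model of the wait tree.  "struct req",
 * req_conflicts() and insert_blocked() are illustrative names,
 * not the kernel API.
 */
#include <stdbool.h>
#include <stddef.h>

struct req {
	long start, end;           /* byte range of the request            */
	bool exclusive;            /* write lock?                          */
	struct req *blocker;       /* request we wait behind (tree parent) */
	struct req *children;      /* head of requests blocked on us       */
	struct req *next_sibling;  /* sibling list under one blocker       */
};

/* Overlapping ranges conflict unless both requests are read locks. */
static bool req_conflicts(const struct req *a, const struct req *b)
{
	if (a->end < b->start || b->end < a->start)
		return false;
	return a->exclusive || b->exclusive;
}

/*
 * The caller has already found that "waiter" conflicts with "blocker".
 * Descend beneath any existing waiter that also blocks it, so the
 * children of every node stay mutually non-conflicting.
 */
static void insert_blocked(struct req *blocker, struct req *waiter)
{
	struct req *fl;
again:
	for (fl = blocker->children; fl; fl = fl->next_sibling)
		if (req_conflicts(fl, waiter)) {
			blocker = fl;
			goto again;
		}
	waiter->blocker = blocker;
	waiter->next_sibling = blocker->children;
	blocker->children = waiter;
}

Because a request is attached beneath the first existing waiter it
conflicts with, the children of any node never conflict with one
another, so waking them together cannot cause a stampede for the same
range.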
Diffstat (limited to 'fs')
-rw-r--r--  fs/locks.c | 69
1 file changed, 63 insertions(+), 6 deletions(-)
diff --git a/fs/locks.c b/fs/locks.c
index c5f35910c57a..4d6a5a3f903a 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -112,6 +112,46 @@
  * Leases and LOCK_MAND
  * Matthew Wilcox <willy@debian.org>, June, 2000.
  * Stephen Rothwell <sfr@canb.auug.org.au>, June, 2000.
+ *
+ * Locking conflicts and dependencies:
+ * If multiple threads attempt to lock the same byte (or flock the same file)
+ * only one can be granted the lock, and the others must wait their turn.
+ * The first lock has been "applied" or "granted", the others are "waiting"
+ * and are "blocked" by the "applied" lock.
+ *
+ * Waiting and applied locks are all kept in trees whose properties are:
+ *
+ *	- the root of a tree may be an applied or waiting lock.
+ *	- every other node in the tree is a waiting lock that
+ *	  conflicts with every ancestor of that node.
+ *
+ * Every such tree begins life as a waiting singleton which obviously
+ * satisfies the above properties.
+ *
+ * The only ways we modify trees preserve these properties:
+ *
+ *	1. We may add a new leaf node, but only after first verifying that it
+ *	   conflicts with all of its ancestors.
+ *	2. We may remove the root of a tree, creating a new singleton
+ *	   tree from the root and N new trees rooted in the immediate
+ *	   children.
+ *	3. If the root of a tree is not currently an applied lock, we may
+ *	   apply it (if possible).
+ *	4. We may upgrade the root of the tree (either extend its range,
+ *	   or upgrade its entire range from read to write).
+ *
+ * When an applied lock is modified in a way that reduces or downgrades any
+ * part of its range, we remove all its children (2 above).  This particularly
+ * happens when a lock is unlocked.
+ *
+ * For each of those child trees we "wake up" the thread which is
+ * waiting for the lock so it can continue handling as follows: if the
+ * root of the tree applies, we do so (3).  If it doesn't, it must
+ * conflict with some applied lock.  We remove (wake up) all of its children
+ * (2), and add it as a new leaf to the tree rooted in the applied
+ * lock (1).  We then repeat the process recursively with those
+ * children.
+ *
  */
 
 #include <linux/capability.h>
@@ -740,11 +780,25 @@ static void locks_delete_block(struct file_lock *waiter)
  * but by ensuring that the flc_lock is also held on insertions we can avoid
  * taking the blocked_lock_lock in some cases when we see that the
  * fl_blocked_requests list is empty.
+ *
+ * Rather than just adding to the list, we check for conflicts with any existing
+ * waiters, and add beneath any waiter that blocks the new waiter.
+ * Thus wakeups don't happen until needed.
  */
 static void __locks_insert_block(struct file_lock *blocker,
-				 struct file_lock *waiter)
+				 struct file_lock *waiter,
+				 bool conflict(struct file_lock *,
+					       struct file_lock *))
 {
+	struct file_lock *fl;
 	BUG_ON(!list_empty(&waiter->fl_blocked_member));
+
+new_blocker:
+	list_for_each_entry(fl, &blocker->fl_blocked_requests, fl_blocked_member)
+		if (conflict(fl, waiter)) {
+			blocker = fl;
+			goto new_blocker;
+		}
 	waiter->fl_blocker = blocker;
 	list_add_tail(&waiter->fl_blocked_member, &blocker->fl_blocked_requests);
 	if (IS_POSIX(blocker) && !IS_OFDLCK(blocker))
@@ -759,10 +813,12 @@ static void __locks_insert_block(struct file_lock *blocker,
 
 /* Must be called with flc_lock held. */
 static void locks_insert_block(struct file_lock *blocker,
-			       struct file_lock *waiter)
+			       struct file_lock *waiter,
+			       bool conflict(struct file_lock *,
+					     struct file_lock *))
 {
 	spin_lock(&blocked_lock_lock);
-	__locks_insert_block(blocker, waiter);
+	__locks_insert_block(blocker, waiter, conflict);
 	spin_unlock(&blocked_lock_lock);
 }
 
@@ -1021,7 +1077,7 @@ find_conflict:
 		if (!(request->fl_flags & FL_SLEEP))
 			goto out;
 		error = FILE_LOCK_DEFERRED;
-		locks_insert_block(fl, request);
+		locks_insert_block(fl, request, flock_locks_conflict);
 		goto out;
 	}
 	if (request->fl_flags & FL_ACCESS)
@@ -1096,7 +1152,8 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
 		spin_lock(&blocked_lock_lock);
 		if (likely(!posix_locks_deadlock(request, fl))) {
 			error = FILE_LOCK_DEFERRED;
-			__locks_insert_block(fl, request);
+			__locks_insert_block(fl, request,
+					     posix_locks_conflict);
 		}
 		spin_unlock(&blocked_lock_lock);
 		goto out;
@@ -1567,7 +1624,7 @@ restart:
 		break_time -= jiffies;
 		if (break_time == 0)
 			break_time++;
-		locks_insert_block(fl, new_fl);
+		locks_insert_block(fl, new_fl, leases_conflict);
 		trace_break_lease_block(inode, new_fl);
 		spin_unlock(&ctx->flc_lock);
 		percpu_up_read_preempt_enable(&file_rwsem);
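To round out the user-space sketch from above, the release side might
look as follows.  Only the immediate children of the released request
are detached and handed back to their threads; each child keeps its own
subtree until its thread retries.  retry() is a stand-in for the
kernel's wake-up of the blocked waiter, not a real kernel call:

/*
 * Release side of the same model: detach only the immediate children
 * and let each one retry.  A child that still conflicts somewhere is
 * re-inserted as a new leaf via insert_blocked(), after its own
 * subtree has been dissolved the same way.
 */
static void wake_children(struct req *blocker, void (*retry)(struct req *))
{
	struct req *fl = blocker->children;

	blocker->children = NULL;
	while (fl) {
		struct req *next = fl->next_sibling;

		fl->blocker = NULL;
		fl->next_sibling = NULL;
		retry(fl);	/* in the kernel: wake the waiting thread */
		fl = next;
	}
}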