fix the broken lockdep logic in __sb_start_write()

1. wait_event(frozen < level) without rwsem_acquire_read() is just wrong from lockdep perspective. If we are going to deadlock because the caller is buggy, lockdep can't detect this problem. 2. __sb_start_write() can race with thaw_super() + freeze_super(), and after "goto retry" the 2nd acquire_freeze_lock() is wrong. 3. The "tell lockdep we are doing trylock" hack doesn't look nice. I think this is correct, but this logic should be more explicit. Yes, the recursive read_lock() is fine if we hold the lock on a higher level. But we do not need to fool lockdep. If we can not deadlock in this case then try-lock must not fail and we can use use wait == F throughout this code. Note: as Dave Chinner explains, the "trylock" hack and the fat comment can be probably removed. But this needs a separate change and it will be trivial: just kill __sb_start_write() and rename do_sb_start_write() back to __sb_start_write(). Signed-off-by: Oleg Nesterov <oleg@redhat.com> Reviewed-by: Jan Kara <jack@suse.com>
author: Oleg Nesterov <oleg@redhat.com> 2015-07-19 18:50:55 -0400
committer: Oleg Nesterov <oleg@redhat.com> 2015-08-15 07:52:09 -0400
commit: f4b554af9931585174d4913b482eacab75858964 (patch)
tree: e01977e85b9568688a28058ebacd82f34be6ae14
parent: bee9182d955227f01ff3b80c4cb6acca9bb40b11 (diff)
1 files changed, 40 insertions, 33 deletions
diff --git a/fs/super.c b/fs/super.c
index b61372354f2b..24a76bcd62a5 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1158,38 +1158,11 @@ void __sb_end_write(struct super_block *sb, int level)
 }
 EXPORT_SYMBOL(__sb_end_write);
-#ifdef CONFIG_LOCKDEP
+static int do_sb_start_write(struct super_block *sb, int level, bool wait,
-/*
- * We want lockdep to tell us about possible deadlocks with freezing but
- * it's it bit tricky to properly instrument it. Getting a freeze protection
- * works as getting a read lock but there are subtle problems. XFS for example
- * gets freeze protection on internal level twice in some cases, which is OK
- * only because we already hold a freeze protection also on higher level. Due
- * to these cases we have to tell lockdep we are doing trylock when we
- * already hold a freeze protection for a higher freeze level.
- */
-static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock,
                                unsigned long ip)
 {
-        int i;
+        if (wait)
+                rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 0, ip);
-        if (!trylock) {
-                for (i = 0; i < level - 1; i++)
-                        if (lock_is_held(&sb->s_writers.lock_map[i])) {
-                                trylock = true;
-                                break;
-                        }
-        }
-        rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip);
-}
-#endif
-/*
- * This is an internal function, please use sb_start_{write,pagefault,intwrite}
- * instead.
- */
-int __sb_start_write(struct super_block *sb, int level, bool wait)
-{
 retry:
        if (unlikely(sb->s_writers.frozen >= level)) {
                if (!wait)
@@ -1198,9 +1171,6 @@ retry:
                           sb->s_writers.frozen < level);
        }
-#ifdef CONFIG_LOCKDEP
-        acquire_freeze_lock(sb, level, !wait, _RET_IP_);
-#endif
        percpu_counter_inc(&sb->s_writers.counter[level-1]);
        /*
         * Make sure counter is updated before we check for frozen.
@@ -1211,8 +1181,45 @@ retry:
                __sb_end_write(sb, level);
                goto retry;
        }
+        if (!wait)
+                rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 1, ip);
        return 1;
 }
+/*
+ * This is an internal function, please use sb_start_{write,pagefault,intwrite}
+ * instead.
+ */
+int __sb_start_write(struct super_block *sb, int level, bool wait)
+{
+        bool force_trylock = false;
+        int ret;
+#ifdef CONFIG_LOCKDEP
+        /*
+         * We want lockdep to tell us about possible deadlocks with freezing
+         * but it's it bit tricky to properly instrument it. Getting a freeze
+         * protection works as getting a read lock but there are subtle
+         * problems. XFS for example gets freeze protection on internal level
+         * twice in some cases, which is OK only because we already hold a
+         * freeze protection also on higher level. Due to these cases we have
+         * to use wait == F (trylock mode) which must not fail.
+         */
+        if (wait) {
+                int i;
+                for (i = 0; i < level - 1; i++)
+                        if (lock_is_held(&sb->s_writers.lock_map[i])) {
+                                force_trylock = true;
+                                break;
+                        }
+        }
+#endif
+        ret = do_sb_start_write(sb, level, wait && !force_trylock, _RET_IP_);
+        WARN_ON(force_trylock & !ret);
+        return ret;
+}
 EXPORT_SYMBOL(__sb_start_write);
 /**
author	Oleg Nesterov <oleg@redhat.com>	2015-07-19 18:50:55 -0400
committer	Oleg Nesterov <oleg@redhat.com>	2015-08-15 07:52:09 -0400
commit	f4b554af9931585174d4913b482eacab75858964 (patch)
tree	e01977e85b9568688a28058ebacd82f34be6ae14
parent	bee9182d955227f01ff3b80c4cb6acca9bb40b11 (diff)

diff --git a/fs/super.c b/fs/super.c index b61372354f2b..24a76bcd62a5 100644 --- a/fs/super.c +++ b/fs/super.c
@@ -1158,38 +1158,11 @@ void __sb_end_write(struct super_block *sb, int level)
1158	}	1158	}
1159	EXPORT_SYMBOL(__sb_end_write);	1159	EXPORT_SYMBOL(__sb_end_write);
1160		1160
1161	#ifdef CONFIG_LOCKDEP	1161	static int do_sb_start_write(struct super_block *sb, int level, bool wait,
1162	/*
1163	* We want lockdep to tell us about possible deadlocks with freezing but
1164	* it's it bit tricky to properly instrument it. Getting a freeze protection
1165	* works as getting a read lock but there are subtle problems. XFS for example
1166	* gets freeze protection on internal level twice in some cases, which is OK
1167	* only because we already hold a freeze protection also on higher level. Due
1168	* to these cases we have to tell lockdep we are doing trylock when we
1169	* already hold a freeze protection for a higher freeze level.
1170	*/
1171	static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock,
1172	unsigned long ip)	1162	unsigned long ip)
1173	{	1163	{
1174	int i;	1164	if (wait)
1175		1165	rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 0, ip);
1176	if (!trylock) {
1177	for (i = 0; i < level - 1; i++)
1178	if (lock_is_held(&sb->s_writers.lock_map[i])) {
1179	trylock = true;
1180	break;
1181	}
1182	}
1183	rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip);
1184	}
1185	#endif
1186
1187	/*
1188	* This is an internal function, please use sb_start_{write,pagefault,intwrite}
1189	* instead.
1190	*/
1191	int __sb_start_write(struct super_block *sb, int level, bool wait)
1192	{
1193	retry:	1166	retry:
1194	if (unlikely(sb->s_writers.frozen >= level)) {	1167	if (unlikely(sb->s_writers.frozen >= level)) {
1195	if (!wait)	1168	if (!wait)
@@ -1198,9 +1171,6 @@ retry:
1198	sb->s_writers.frozen < level);	1171	sb->s_writers.frozen < level);
1199	}	1172	}
1200		1173
1201	#ifdef CONFIG_LOCKDEP
1202	acquire_freeze_lock(sb, level, !wait, _RET_IP_);
1203	#endif
1204	percpu_counter_inc(&sb->s_writers.counter[level-1]);	1174	percpu_counter_inc(&sb->s_writers.counter[level-1]);
1205	/*	1175	/*
1206	* Make sure counter is updated before we check for frozen.	1176	* Make sure counter is updated before we check for frozen.
@@ -1211,8 +1181,45 @@ retry:
1211	__sb_end_write(sb, level);	1181	__sb_end_write(sb, level);
1212	goto retry;	1182	goto retry;
1213	}	1183	}
		1184
		1185	if (!wait)
		1186	rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 1, ip);
1214	return 1;	1187	return 1;
1215	}	1188	}
		1189
		1190	/*
		1191	* This is an internal function, please use sb_start_{write,pagefault,intwrite}
		1192	* instead.
		1193	*/
		1194	int __sb_start_write(struct super_block *sb, int level, bool wait)
		1195	{
		1196	bool force_trylock = false;
		1197	int ret;
		1198
		1199	#ifdef CONFIG_LOCKDEP
		1200	/*
		1201	* We want lockdep to tell us about possible deadlocks with freezing
		1202	* but it's it bit tricky to properly instrument it. Getting a freeze
		1203	* protection works as getting a read lock but there are subtle
		1204	* problems. XFS for example gets freeze protection on internal level
		1205	* twice in some cases, which is OK only because we already hold a
		1206	* freeze protection also on higher level. Due to these cases we have
		1207	* to use wait == F (trylock mode) which must not fail.
		1208	*/
		1209	if (wait) {
		1210	int i;
		1211
		1212	for (i = 0; i < level - 1; i++)
		1213	if (lock_is_held(&sb->s_writers.lock_map[i])) {
		1214	force_trylock = true;
		1215	break;
		1216	}
		1217	}
		1218	#endif
		1219	ret = do_sb_start_write(sb, level, wait && !force_trylock, _RET_IP_);
		1220	WARN_ON(force_trylock & !ret);
		1221	return ret;
		1222	}
1216	EXPORT_SYMBOL(__sb_start_write);	1223	EXPORT_SYMBOL(__sb_start_write);
1217		1224
1218	/**	1225	/**