diff options
author | Oleg Nesterov <oleg@redhat.com> | 2015-08-11 11:05:04 -0400 |
---|---|---|
committer | Oleg Nesterov <oleg@redhat.com> | 2015-08-15 07:52:13 -0400 |
commit | 8129ed29644bf56ed17ec1bbbeed5c568b43d6a0 (patch) | |
tree | 14a01f7077300477f7f1f7e7c3ad1b9b6589dd32 /fs/super.c | |
parent | 853b39a7c82826b8413048feec7bf08e98ce7a84 (diff) |
change sb_writers to use percpu_rw_semaphore
We can remove everything from struct sb_writers except frozen
and add the array of percpu_rw_semaphore's instead.
This patch doesn't remove sb_writers->wait_unfrozen yet, we keep
it for get_super_thawed(). We will probably remove it later.
This change tries to address the following problems:
- Firstly, __sb_start_write() looks simply buggy. It does
__sb_end_write() if it sees ->frozen, but if it migrates
to another CPU before percpu_counter_dec(), sb_wait_write()
can wrongly succeed if there is another task which holds
the same "semaphore": sb_wait_write() can miss the result
of the previous percpu_counter_inc() but see the result
of this percpu_counter_dec().
- As Dave Hansen reports, it is suboptimal. The trivial
microbenchmark that writes to a tmpfs file in a loop runs
12% faster if we change this code to rely on RCU and kill
the memory barriers.
- This code doesn't look simple. It would be better to rely
on the generic locking code.
According to Dave, this change adds the same performance
improvement.
Note: with this change both freeze_super() and thaw_super() will do
synchronize_sched_expedited() 3 times. This is just ugly. But:
- This will be "fixed" by the rcu_sync changes we are going
to merge. After that freeze_super()->percpu_down_write()
will use synchronize_sched(), and thaw_super() won't use
synchronize() at all.
This doesn't need any changes in fs/super.c.
- Once we merge rcu_sync changes, we can also change super.c
so that all wb_write->rw_sem's will share the single ->rss
in struct sb_writes, then freeze_super() will need only one
synchronize_sched().
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Jan Kara <jack@suse.com>
Diffstat (limited to 'fs/super.c')
-rw-r--r-- | fs/super.c | 111 |
1 files changed, 30 insertions, 81 deletions
diff --git a/fs/super.c b/fs/super.c index c937bd7b4d33..767b1e10f6ad 100644 --- a/fs/super.c +++ b/fs/super.c | |||
@@ -142,7 +142,7 @@ static void destroy_super_work(struct work_struct *work) | |||
142 | int i; | 142 | int i; |
143 | 143 | ||
144 | for (i = 0; i < SB_FREEZE_LEVELS; i++) | 144 | for (i = 0; i < SB_FREEZE_LEVELS; i++) |
145 | percpu_counter_destroy(&s->s_writers.counter[i]); | 145 | percpu_free_rwsem(&s->s_writers.rw_sem[i]); |
146 | kfree(s); | 146 | kfree(s); |
147 | } | 147 | } |
148 | 148 | ||
@@ -193,13 +193,11 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) | |||
193 | goto fail; | 193 | goto fail; |
194 | 194 | ||
195 | for (i = 0; i < SB_FREEZE_LEVELS; i++) { | 195 | for (i = 0; i < SB_FREEZE_LEVELS; i++) { |
196 | if (percpu_counter_init(&s->s_writers.counter[i], 0, | 196 | if (__percpu_init_rwsem(&s->s_writers.rw_sem[i], |
197 | GFP_KERNEL) < 0) | 197 | sb_writers_name[i], |
198 | &type->s_writers_key[i])) | ||
198 | goto fail; | 199 | goto fail; |
199 | lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], | ||
200 | &type->s_writers_key[i], 0); | ||
201 | } | 200 | } |
202 | init_waitqueue_head(&s->s_writers.wait); | ||
203 | init_waitqueue_head(&s->s_writers.wait_unfrozen); | 201 | init_waitqueue_head(&s->s_writers.wait_unfrozen); |
204 | s->s_bdi = &noop_backing_dev_info; | 202 | s->s_bdi = &noop_backing_dev_info; |
205 | s->s_flags = flags; | 203 | s->s_flags = flags; |
@@ -1161,47 +1159,10 @@ out: | |||
1161 | */ | 1159 | */ |
1162 | void __sb_end_write(struct super_block *sb, int level) | 1160 | void __sb_end_write(struct super_block *sb, int level) |
1163 | { | 1161 | { |
1164 | percpu_counter_dec(&sb->s_writers.counter[level-1]); | 1162 | percpu_up_read(sb->s_writers.rw_sem + level-1); |
1165 | /* | ||
1166 | * Make sure s_writers are updated before we wake up waiters in | ||
1167 | * freeze_super(). | ||
1168 | */ | ||
1169 | smp_mb(); | ||
1170 | if (waitqueue_active(&sb->s_writers.wait)) | ||
1171 | wake_up(&sb->s_writers.wait); | ||
1172 | rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_); | ||
1173 | } | 1163 | } |
1174 | EXPORT_SYMBOL(__sb_end_write); | 1164 | EXPORT_SYMBOL(__sb_end_write); |
1175 | 1165 | ||
1176 | static int do_sb_start_write(struct super_block *sb, int level, bool wait, | ||
1177 | unsigned long ip) | ||
1178 | { | ||
1179 | if (wait) | ||
1180 | rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 0, ip); | ||
1181 | retry: | ||
1182 | if (unlikely(sb->s_writers.frozen >= level)) { | ||
1183 | if (!wait) | ||
1184 | return 0; | ||
1185 | wait_event(sb->s_writers.wait_unfrozen, | ||
1186 | sb->s_writers.frozen < level); | ||
1187 | } | ||
1188 | |||
1189 | percpu_counter_inc(&sb->s_writers.counter[level-1]); | ||
1190 | /* | ||
1191 | * Make sure counter is updated before we check for frozen. | ||
1192 | * freeze_super() first sets frozen and then checks the counter. | ||
1193 | */ | ||
1194 | smp_mb(); | ||
1195 | if (unlikely(sb->s_writers.frozen >= level)) { | ||
1196 | __sb_end_write(sb, level); | ||
1197 | goto retry; | ||
1198 | } | ||
1199 | |||
1200 | if (!wait) | ||
1201 | rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 1, ip); | ||
1202 | return 1; | ||
1203 | } | ||
1204 | |||
1205 | /* | 1166 | /* |
1206 | * This is an internal function, please use sb_start_{write,pagefault,intwrite} | 1167 | * This is an internal function, please use sb_start_{write,pagefault,intwrite} |
1207 | * instead. | 1168 | * instead. |
@@ -1209,7 +1170,7 @@ retry: | |||
1209 | int __sb_start_write(struct super_block *sb, int level, bool wait) | 1170 | int __sb_start_write(struct super_block *sb, int level, bool wait) |
1210 | { | 1171 | { |
1211 | bool force_trylock = false; | 1172 | bool force_trylock = false; |
1212 | int ret; | 1173 | int ret = 1; |
1213 | 1174 | ||
1214 | #ifdef CONFIG_LOCKDEP | 1175 | #ifdef CONFIG_LOCKDEP |
1215 | /* | 1176 | /* |
@@ -1225,13 +1186,17 @@ int __sb_start_write(struct super_block *sb, int level, bool wait) | |||
1225 | int i; | 1186 | int i; |
1226 | 1187 | ||
1227 | for (i = 0; i < level - 1; i++) | 1188 | for (i = 0; i < level - 1; i++) |
1228 | if (lock_is_held(&sb->s_writers.lock_map[i])) { | 1189 | if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) { |
1229 | force_trylock = true; | 1190 | force_trylock = true; |
1230 | break; | 1191 | break; |
1231 | } | 1192 | } |
1232 | } | 1193 | } |
1233 | #endif | 1194 | #endif |
1234 | ret = do_sb_start_write(sb, level, wait && !force_trylock, _RET_IP_); | 1195 | if (wait && !force_trylock) |
1196 | percpu_down_read(sb->s_writers.rw_sem + level-1); | ||
1197 | else | ||
1198 | ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1); | ||
1199 | |||
1235 | WARN_ON(force_trylock & !ret); | 1200 | WARN_ON(force_trylock & !ret); |
1236 | return ret; | 1201 | return ret; |
1237 | } | 1202 | } |
@@ -1243,15 +1208,11 @@ EXPORT_SYMBOL(__sb_start_write); | |||
1243 | * @level: type of writers we wait for (normal vs page fault) | 1208 | * @level: type of writers we wait for (normal vs page fault) |
1244 | * | 1209 | * |
1245 | * This function waits until there are no writers of given type to given file | 1210 | * This function waits until there are no writers of given type to given file |
1246 | * system. Caller of this function should make sure there can be no new writers | 1211 | * system. |
1247 | * of type @level before calling this function. Otherwise this function can | ||
1248 | * livelock. | ||
1249 | */ | 1212 | */ |
1250 | static void sb_wait_write(struct super_block *sb, int level) | 1213 | static void sb_wait_write(struct super_block *sb, int level) |
1251 | { | 1214 | { |
1252 | s64 writers; | 1215 | percpu_down_write(sb->s_writers.rw_sem + level-1); |
1253 | |||
1254 | rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_); | ||
1255 | /* | 1216 | /* |
1256 | * We are going to return to userspace and forget about this lock, the | 1217 | * We are going to return to userspace and forget about this lock, the |
1257 | * ownership goes to the caller of thaw_super() which does unlock. | 1218 | * ownership goes to the caller of thaw_super() which does unlock. |
@@ -1262,24 +1223,18 @@ static void sb_wait_write(struct super_block *sb, int level) | |||
1262 | * this leads to lockdep false-positives, so currently we do the early | 1223 | * this leads to lockdep false-positives, so currently we do the early |
1263 | * release right after acquire. | 1224 | * release right after acquire. |
1264 | */ | 1225 | */ |
1265 | rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_); | 1226 | percpu_rwsem_release(sb->s_writers.rw_sem + level-1, 0, _THIS_IP_); |
1266 | 1227 | } | |
1267 | do { | ||
1268 | DEFINE_WAIT(wait); | ||
1269 | 1228 | ||
1270 | /* | 1229 | static void sb_freeze_unlock(struct super_block *sb) |
1271 | * We use a barrier in prepare_to_wait() to separate setting | 1230 | { |
1272 | * of frozen and checking of the counter | 1231 | int level; |
1273 | */ | ||
1274 | prepare_to_wait(&sb->s_writers.wait, &wait, | ||
1275 | TASK_UNINTERRUPTIBLE); | ||
1276 | 1232 | ||
1277 | writers = percpu_counter_sum(&sb->s_writers.counter[level-1]); | 1233 | for (level = 0; level < SB_FREEZE_LEVELS; ++level) |
1278 | if (writers) | 1234 | percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); |
1279 | schedule(); | ||
1280 | 1235 | ||
1281 | finish_wait(&sb->s_writers.wait, &wait); | 1236 | for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) |
1282 | } while (writers); | 1237 | percpu_up_write(sb->s_writers.rw_sem + level); |
1283 | } | 1238 | } |
1284 | 1239 | ||
1285 | /** | 1240 | /** |
@@ -1338,20 +1293,14 @@ int freeze_super(struct super_block *sb) | |||
1338 | return 0; | 1293 | return 0; |
1339 | } | 1294 | } |
1340 | 1295 | ||
1341 | /* From now on, no new normal writers can start */ | ||
1342 | sb->s_writers.frozen = SB_FREEZE_WRITE; | 1296 | sb->s_writers.frozen = SB_FREEZE_WRITE; |
1343 | smp_wmb(); | ||
1344 | |||
1345 | /* Release s_umount to preserve sb_start_write -> s_umount ordering */ | 1297 | /* Release s_umount to preserve sb_start_write -> s_umount ordering */ |
1346 | up_write(&sb->s_umount); | 1298 | up_write(&sb->s_umount); |
1347 | |||
1348 | sb_wait_write(sb, SB_FREEZE_WRITE); | 1299 | sb_wait_write(sb, SB_FREEZE_WRITE); |
1300 | down_write(&sb->s_umount); | ||
1349 | 1301 | ||
1350 | /* Now we go and block page faults... */ | 1302 | /* Now we go and block page faults... */ |
1351 | down_write(&sb->s_umount); | ||
1352 | sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; | 1303 | sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; |
1353 | smp_wmb(); | ||
1354 | |||
1355 | sb_wait_write(sb, SB_FREEZE_PAGEFAULT); | 1304 | sb_wait_write(sb, SB_FREEZE_PAGEFAULT); |
1356 | 1305 | ||
1357 | /* All writers are done so after syncing there won't be dirty data */ | 1306 | /* All writers are done so after syncing there won't be dirty data */ |
@@ -1359,7 +1308,6 @@ int freeze_super(struct super_block *sb) | |||
1359 | 1308 | ||
1360 | /* Now wait for internal filesystem counter */ | 1309 | /* Now wait for internal filesystem counter */ |
1361 | sb->s_writers.frozen = SB_FREEZE_FS; | 1310 | sb->s_writers.frozen = SB_FREEZE_FS; |
1362 | smp_wmb(); | ||
1363 | sb_wait_write(sb, SB_FREEZE_FS); | 1311 | sb_wait_write(sb, SB_FREEZE_FS); |
1364 | 1312 | ||
1365 | if (sb->s_op->freeze_fs) { | 1313 | if (sb->s_op->freeze_fs) { |
@@ -1368,7 +1316,7 @@ int freeze_super(struct super_block *sb) | |||
1368 | printk(KERN_ERR | 1316 | printk(KERN_ERR |
1369 | "VFS:Filesystem freeze failed\n"); | 1317 | "VFS:Filesystem freeze failed\n"); |
1370 | sb->s_writers.frozen = SB_UNFROZEN; | 1318 | sb->s_writers.frozen = SB_UNFROZEN; |
1371 | smp_wmb(); | 1319 | sb_freeze_unlock(sb); |
1372 | wake_up(&sb->s_writers.wait_unfrozen); | 1320 | wake_up(&sb->s_writers.wait_unfrozen); |
1373 | deactivate_locked_super(sb); | 1321 | deactivate_locked_super(sb); |
1374 | return ret; | 1322 | return ret; |
@@ -1400,8 +1348,10 @@ int thaw_super(struct super_block *sb) | |||
1400 | return -EINVAL; | 1348 | return -EINVAL; |
1401 | } | 1349 | } |
1402 | 1350 | ||
1403 | if (sb->s_flags & MS_RDONLY) | 1351 | if (sb->s_flags & MS_RDONLY) { |
1352 | sb->s_writers.frozen = SB_UNFROZEN; | ||
1404 | goto out; | 1353 | goto out; |
1354 | } | ||
1405 | 1355 | ||
1406 | if (sb->s_op->unfreeze_fs) { | 1356 | if (sb->s_op->unfreeze_fs) { |
1407 | error = sb->s_op->unfreeze_fs(sb); | 1357 | error = sb->s_op->unfreeze_fs(sb); |
@@ -1413,12 +1363,11 @@ int thaw_super(struct super_block *sb) | |||
1413 | } | 1363 | } |
1414 | } | 1364 | } |
1415 | 1365 | ||
1416 | out: | ||
1417 | sb->s_writers.frozen = SB_UNFROZEN; | 1366 | sb->s_writers.frozen = SB_UNFROZEN; |
1418 | smp_wmb(); | 1367 | sb_freeze_unlock(sb); |
1368 | out: | ||
1419 | wake_up(&sb->s_writers.wait_unfrozen); | 1369 | wake_up(&sb->s_writers.wait_unfrozen); |
1420 | deactivate_locked_super(sb); | 1370 | deactivate_locked_super(sb); |
1421 | |||
1422 | return 0; | 1371 | return 0; |
1423 | } | 1372 | } |
1424 | EXPORT_SYMBOL(thaw_super); | 1373 | EXPORT_SYMBOL(thaw_super); |