author		Oleg Nesterov <oleg@redhat.com>	2015-08-11 11:05:04 -0400
committer	Oleg Nesterov <oleg@redhat.com>	2015-08-15 07:52:13 -0400
commit		8129ed29644bf56ed17ec1bbbeed5c568b43d6a0 (patch)
tree		14a01f7077300477f7f1f7e7c3ad1b9b6589dd32 /fs/super.c
parent		853b39a7c82826b8413048feec7bf08e98ce7a84 (diff)
change sb_writers to use percpu_rw_semaphore
We can remove everything from struct sb_writers except frozen and add the
array of percpu_rw_semaphore's instead.

This patch doesn't remove sb_writers->wait_unfrozen yet, we keep it for
get_super_thawed(). We will probably remove it later.

This change tries to address the following problems:

	- Firstly, __sb_start_write() looks simply buggy. It does
	  __sb_end_write() if it sees ->frozen, but if it migrates to
	  another CPU before percpu_counter_dec(), sb_wait_write() can
	  wrongly succeed if there is another task which holds the same
	  "semaphore": sb_wait_write() can miss the result of the previous
	  percpu_counter_inc() but see the result of this
	  percpu_counter_dec().

	- As Dave Hansen reports, it is suboptimal. The trivial
	  microbenchmark that writes to a tmpfs file in a loop runs 12%
	  faster if we change this code to rely on RCU and kill the memory
	  barriers.

	- This code doesn't look simple. It would be better to rely on the
	  generic locking code. According to Dave, this change adds the
	  same performance improvement.

Note: with this change both freeze_super() and thaw_super() will do
synchronize_sched_expedited() 3 times. This is just ugly. But:

	- This will be "fixed" by the rcu_sync changes we are going to
	  merge. After that freeze_super()->percpu_down_write() will use
	  synchronize_sched(), and thaw_super() won't use synchronize() at
	  all. This doesn't need any changes in fs/super.c.

	- Once we merge rcu_sync changes, we can also change super.c so
	  that all sb_writers->rw_sem's will share the single ->rss in
	  struct sb_writers, then freeze_super() will need only one
	  synchronize_sched().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Jan Kara <jack@suse.com>
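As a rough, standalone illustration of the locking model this patch switches to (not the kernel code itself), the three freeze levels can be thought of as one reader-writer lock each: tasks writing to the filesystem take the read side of their level, the freezer takes the write side of every level in order, and the thaw path releases them in reverse, as sb_freeze_unlock() does in the diff below. A minimal userspace sketch, with pthread_rwlock_t standing in for percpu_rw_semaphore and every name (fake_sb, fake_freeze, ...) invented purely for illustration:

/*
 * Userspace analogue of the sb_writers layout after this patch.
 * pthread_rwlock_t stands in for percpu_rw_semaphore; none of the
 * names here are kernel API.  Build with: gcc -pthread sketch.c
 */
#include <pthread.h>
#include <stdio.h>

enum { FREEZE_WRITE, FREEZE_PAGEFAULT, FREEZE_FS, FREEZE_LEVELS };

struct fake_sb {
	pthread_rwlock_t rw_sem[FREEZE_LEVELS];	/* one lock per freeze level */
};

/* Writers to the filesystem take the read side of their level,
 * like __sb_start_write() -> percpu_down_read() in the patch. */
static void fake_sb_start_write(struct fake_sb *sb, int level)
{
	pthread_rwlock_rdlock(&sb->rw_sem[level]);
}

/* ... and drop it again, like __sb_end_write() -> percpu_up_read(). */
static void fake_sb_end_write(struct fake_sb *sb, int level)
{
	pthread_rwlock_unlock(&sb->rw_sem[level]);
}

/* The freezer takes the write side of each level in turn; once a level
 * is write-locked, no new writer of that level can enter and all
 * existing ones have finished (the role of sb_wait_write()). */
static void fake_freeze(struct fake_sb *sb)
{
	for (int level = 0; level < FREEZE_LEVELS; level++)
		pthread_rwlock_wrlock(&sb->rw_sem[level]);
}

/* Thaw releases the levels in reverse, mirroring sb_freeze_unlock(). */
static void fake_thaw(struct fake_sb *sb)
{
	for (int level = FREEZE_LEVELS - 1; level >= 0; level--)
		pthread_rwlock_unlock(&sb->rw_sem[level]);
}

int main(void)
{
	struct fake_sb sb;

	for (int i = 0; i < FREEZE_LEVELS; i++)
		pthread_rwlock_init(&sb.rw_sem[i], NULL);

	fake_sb_start_write(&sb, FREEZE_WRITE);	/* ordinary write */
	fake_sb_end_write(&sb, FREEZE_WRITE);

	fake_freeze(&sb);			/* all levels blocked */
	fake_thaw(&sb);

	for (int i = 0; i < FREEZE_LEVELS; i++)
		pthread_rwlock_destroy(&sb.rw_sem[i]);
	puts("freeze/thaw cycle done");
	return 0;
}

The per-level write side is also why, as noted in the message above, both freeze_super() and thaw_super() currently end up doing synchronize_sched_expedited() once per level until the rcu_sync work lands.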
Diffstat (limited to 'fs/super.c')
-rw-r--r--	fs/super.c	111
1 file changed, 30 insertions(+), 81 deletions(-)
diff --git a/fs/super.c b/fs/super.c
index c937bd7b4d33..767b1e10f6ad 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -142,7 +142,7 @@ static void destroy_super_work(struct work_struct *work)
 	int i;
 
 	for (i = 0; i < SB_FREEZE_LEVELS; i++)
-		percpu_counter_destroy(&s->s_writers.counter[i]);
+		percpu_free_rwsem(&s->s_writers.rw_sem[i]);
 	kfree(s);
 }
 
@@ -193,13 +193,11 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 		goto fail;
 
 	for (i = 0; i < SB_FREEZE_LEVELS; i++) {
-		if (percpu_counter_init(&s->s_writers.counter[i], 0,
-					GFP_KERNEL) < 0)
+		if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
+					sb_writers_name[i],
+					&type->s_writers_key[i]))
 			goto fail;
-		lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
-				 &type->s_writers_key[i], 0);
 	}
-	init_waitqueue_head(&s->s_writers.wait);
 	init_waitqueue_head(&s->s_writers.wait_unfrozen);
 	s->s_bdi = &noop_backing_dev_info;
 	s->s_flags = flags;
@@ -1161,47 +1159,10 @@ out:
  */
 void __sb_end_write(struct super_block *sb, int level)
 {
-	percpu_counter_dec(&sb->s_writers.counter[level-1]);
-	/*
-	 * Make sure s_writers are updated before we wake up waiters in
-	 * freeze_super().
-	 */
-	smp_mb();
-	if (waitqueue_active(&sb->s_writers.wait))
-		wake_up(&sb->s_writers.wait);
-	rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
+	percpu_up_read(sb->s_writers.rw_sem + level-1);
 }
 EXPORT_SYMBOL(__sb_end_write);
 
-static int do_sb_start_write(struct super_block *sb, int level, bool wait,
-				unsigned long ip)
-{
-	if (wait)
-		rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 0, ip);
-retry:
-	if (unlikely(sb->s_writers.frozen >= level)) {
-		if (!wait)
-			return 0;
-		wait_event(sb->s_writers.wait_unfrozen,
-			   sb->s_writers.frozen < level);
-	}
-
-	percpu_counter_inc(&sb->s_writers.counter[level-1]);
-	/*
-	 * Make sure counter is updated before we check for frozen.
-	 * freeze_super() first sets frozen and then checks the counter.
-	 */
-	smp_mb();
-	if (unlikely(sb->s_writers.frozen >= level)) {
-		__sb_end_write(sb, level);
-		goto retry;
-	}
-
-	if (!wait)
-		rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 1, ip);
-	return 1;
-}
-
 /*
  * This is an internal function, please use sb_start_{write,pagefault,intwrite}
  * instead.
@@ -1209,7 +1170,7 @@ retry:
 int __sb_start_write(struct super_block *sb, int level, bool wait)
 {
 	bool force_trylock = false;
-	int ret;
+	int ret = 1;
 
 #ifdef CONFIG_LOCKDEP
 	/*
@@ -1225,13 +1186,17 @@ int __sb_start_write(struct super_block *sb, int level, bool wait)
 		int i;
 
 		for (i = 0; i < level - 1; i++)
-			if (lock_is_held(&sb->s_writers.lock_map[i])) {
+			if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) {
 				force_trylock = true;
 				break;
 			}
 	}
 #endif
-	ret = do_sb_start_write(sb, level, wait && !force_trylock, _RET_IP_);
+	if (wait && !force_trylock)
+		percpu_down_read(sb->s_writers.rw_sem + level-1);
+	else
+		ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1);
+
 	WARN_ON(force_trylock & !ret);
 	return ret;
 }
@@ -1243,15 +1208,11 @@ EXPORT_SYMBOL(__sb_start_write);
  * @level: type of writers we wait for (normal vs page fault)
  *
  * This function waits until there are no writers of given type to given file
- * system. Caller of this function should make sure there can be no new writers
- * of type @level before calling this function. Otherwise this function can
- * livelock.
+ * system.
  */
 static void sb_wait_write(struct super_block *sb, int level)
 {
-	s64 writers;
-
-	rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_);
+	percpu_down_write(sb->s_writers.rw_sem + level-1);
 	/*
 	 * We are going to return to userspace and forget about this lock, the
 	 * ownership goes to the caller of thaw_super() which does unlock.
@@ -1262,24 +1223,18 @@ static void sb_wait_write(struct super_block *sb, int level)
 	 * this leads to lockdep false-positives, so currently we do the early
 	 * release right after acquire.
 	 */
-	rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_);
-
-	do {
-		DEFINE_WAIT(wait);
-
-		/*
-		 * We use a barrier in prepare_to_wait() to separate setting
-		 * of frozen and checking of the counter
-		 */
-		prepare_to_wait(&sb->s_writers.wait, &wait,
-				TASK_UNINTERRUPTIBLE);
-
-		writers = percpu_counter_sum(&sb->s_writers.counter[level-1]);
-		if (writers)
-			schedule();
-
-		finish_wait(&sb->s_writers.wait, &wait);
-	} while (writers);
+	percpu_rwsem_release(sb->s_writers.rw_sem + level-1, 0, _THIS_IP_);
+}
+
+static void sb_freeze_unlock(struct super_block *sb)
+{
+	int level;
+
+	for (level = 0; level < SB_FREEZE_LEVELS; ++level)
+		percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
+
+	for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
+		percpu_up_write(sb->s_writers.rw_sem + level);
 }
 
 /**
@@ -1338,20 +1293,14 @@ int freeze_super(struct super_block *sb)
 		return 0;
 	}
 
-	/* From now on, no new normal writers can start */
 	sb->s_writers.frozen = SB_FREEZE_WRITE;
-	smp_wmb();
-
 	/* Release s_umount to preserve sb_start_write -> s_umount ordering */
 	up_write(&sb->s_umount);
-
 	sb_wait_write(sb, SB_FREEZE_WRITE);
+	down_write(&sb->s_umount);
 
 	/* Now we go and block page faults... */
-	down_write(&sb->s_umount);
 	sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
-	smp_wmb();
-
 	sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
 
 	/* All writers are done so after syncing there won't be dirty data */
@@ -1359,7 +1308,6 @@ int freeze_super(struct super_block *sb)
 
 	/* Now wait for internal filesystem counter */
 	sb->s_writers.frozen = SB_FREEZE_FS;
-	smp_wmb();
 	sb_wait_write(sb, SB_FREEZE_FS);
 
 	if (sb->s_op->freeze_fs) {
@@ -1368,7 +1316,7 @@ int freeze_super(struct super_block *sb)
 			printk(KERN_ERR
 				"VFS:Filesystem freeze failed\n");
 			sb->s_writers.frozen = SB_UNFROZEN;
-			smp_wmb();
+			sb_freeze_unlock(sb);
 			wake_up(&sb->s_writers.wait_unfrozen);
 			deactivate_locked_super(sb);
 			return ret;
@@ -1400,8 +1348,10 @@ int thaw_super(struct super_block *sb)
 		return -EINVAL;
 	}
 
-	if (sb->s_flags & MS_RDONLY)
+	if (sb->s_flags & MS_RDONLY) {
+		sb->s_writers.frozen = SB_UNFROZEN;
 		goto out;
+	}
 
 	if (sb->s_op->unfreeze_fs) {
 		error = sb->s_op->unfreeze_fs(sb);
@@ -1413,12 +1363,11 @@ int thaw_super(struct super_block *sb)
 		}
 	}
 
-out:
 	sb->s_writers.frozen = SB_UNFROZEN;
-	smp_wmb();
+	sb_freeze_unlock(sb);
+out:
 	wake_up(&sb->s_writers.wait_unfrozen);
 	deactivate_locked_super(sb);
-
 	return 0;
 }
 EXPORT_SYMBOL(thaw_super);
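For context on how these entry points are normally exercised: userspace typically reaches freeze_super() and thaw_super() through the FIFREEZE and FITHAW ioctls on a file descriptor opened inside the mount (this is what fsfreeze(8) does). A minimal sketch; the mount point path is illustrative and error handling is kept short:

/*
 * Freeze and immediately thaw a mounted filesystem from userspace.
 * Requires CAP_SYS_ADMIN; "/mnt/test" is only an example mount point.
 * The FIFREEZE/FITHAW ioctls normally end up in freeze_super()/thaw_super().
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FIFREEZE, FITHAW */

int main(void)
{
	int fd = open("/mnt/test", O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	if (ioctl(fd, FIFREEZE, 0) < 0) {
		perror("FIFREEZE");
	} else {
		/* filesystem is frozen here: all sb_writers levels are held */
		if (ioctl(fd, FITHAW, 0) < 0)
			perror("FITHAW");
	}

	close(fd);
	return 0;
}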