aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs/xfs_mount.c
diff options
context:
space:
mode:
authorDavid Chinner <dgc@sgi.com>2007-05-24 01:26:31 -0400
committerTim Shimmin <tes@chook.melbourne.sgi.com>2007-07-14 01:28:50 -0400
commit92821e2ba4ae26887223326fb0b95cdab963b768 (patch)
treea40a2ef10e5b0791df3e522f3139193d39bf2454 /fs/xfs/xfs_mount.c
parent3260f78ad6d5b788e78ea709d377f58e569bee41 (diff)
[XFS] Lazy Superblock Counters
When we have a couple of hundred transactions on the fly at once, they all typically modify the on disk superblock in some way. create/unlink/mkdir/rmdir modify inode counts, allocation/freeing modify free block counts. When these counts are modified in a transaction, they must eventually lock the superblock buffer and apply the mods. The buffer then remains locked until the transaction is committed into the incore log buffer. The result of this is that with enough transactions on the fly the incore superblock buffer becomes a bottleneck. The result of contention on the incore superblock buffer is that transaction rates fall - the more pressure that is put on the superblock buffer, the slower things go. The key to removing the contention is to not require the superblock fields in question to be locked. We do that by not marking the superblock dirty in the transaction. IOWs, we modify the incore superblock but do not modify the cached superblock buffer. In short, we do not log superblock modifications to critical fields in the superblock on every transaction. In fact we only do it just before we write the superblock to disk every sync period or just before unmount. This creates an interesting problem - if we don't log or write out the fields in every transaction, then how do the values get recovered after a crash? The answer is simple - we keep enough duplicate, logged information in other structures that we can reconstruct the correct count after log recovery has been performed. It is the AGF and AGI structures that contain the duplicate information; after recovery, we walk every AGI and AGF and sum their individual counters to get the correct value, and we do a transaction into the log to correct them. An optimisation of this is that if we have a clean unmount record, we know the value in the superblock is correct, so we can avoid the summation walk under normal conditions and so mount/recovery times do not change under normal operation. 
One wrinkle that was discovered during development was that the blocks used in the freespace btrees are never accounted for in the AGF counters. This was once a valid optimisation to make; when the filesystem is full, the free space btrees are empty and consume no space. Hence when it matters, the "accounting" is correct. But that means that when we do the AGF summations, we would not have a correct count and xfs_check would complain. Hence a new counter was added to track the number of blocks used by the free space btrees. This is an *on-disk format change*. As a result of this, lazy superblock counters are a mkfs option and at the moment on linux there is no way to convert an old filesystem. This is possible - xfs_db can be used to twiddle the right bits and then xfs_repair will do the format conversion for you. Similarly, you can convert backwards as well. At some point we'll add functionality to xfs_admin to do the bit twiddling easily.... SGI-PV: 964999 SGI-Modid: xfs-linux-melb:xfs-kern:28652a Signed-off-by: David Chinner <dgc@sgi.com> Signed-off-by: Christoph Hellwig <hch@infradead.org> Signed-off-by: Tim Shimmin <tes@sgi.com>
Diffstat (limited to 'fs/xfs/xfs_mount.c')
-rw-r--r--fs/xfs/xfs_mount.c154
1 files changed, 148 insertions, 6 deletions
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 5de1f392e632..f6fe47d8c4dc 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -643,6 +643,64 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
643 sbp->sb_inopblock); 643 sbp->sb_inopblock);
644 mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; 644 mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
645} 645}
646
647/*
648 * xfs_initialize_perag_data
649 *
650 * Read in each per-ag structure so we can count up the number of
651 * allocated inodes, free inodes and used filesystem blocks as this
652 * information is no longer persistent in the superblock. Once we have
653 * this information, write it into the in-core superblock structure.
654 */
655STATIC int
656xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
657{
658 xfs_agnumber_t index;
659 xfs_perag_t *pag;
660 xfs_sb_t *sbp = &mp->m_sb;
661 uint64_t ifree = 0;
662 uint64_t ialloc = 0;
663 uint64_t bfree = 0;
664 uint64_t bfreelst = 0;
665 uint64_t btree = 0;
666 int error;
667 int s;
668
669 for (index = 0; index < agcount; index++) {
670 /*
671 * read the agf, then the agi. This gets us
672 * all the inforamtion we need and populates the
673 * per-ag structures for us.
674 */
675 error = xfs_alloc_pagf_init(mp, NULL, index, 0);
676 if (error)
677 return error;
678
679 error = xfs_ialloc_pagi_init(mp, NULL, index);
680 if (error)
681 return error;
682 pag = &mp->m_perag[index];
683 ifree += pag->pagi_freecount;
684 ialloc += pag->pagi_count;
685 bfree += pag->pagf_freeblks;
686 bfreelst += pag->pagf_flcount;
687 btree += pag->pagf_btreeblks;
688 }
689 /*
690 * Overwrite incore superblock counters with just-read data
691 */
692 s = XFS_SB_LOCK(mp);
693 sbp->sb_ifree = ifree;
694 sbp->sb_icount = ialloc;
695 sbp->sb_fdblocks = bfree + bfreelst + btree;
696 XFS_SB_UNLOCK(mp, s);
697
698 /* Fixup the per-cpu counters as well. */
699 xfs_icsb_reinit_counters(mp);
700
701 return 0;
702}
703
646/* 704/*
647 * xfs_mountfs 705 * xfs_mountfs
648 * 706 *
@@ -987,6 +1045,34 @@ xfs_mountfs(
987 } 1045 }
988 1046
989 /* 1047 /*
1048 * Now the log is mounted, we know if it was an unclean shutdown or
1049 * not. If it was, now that the first phase of recovery has completed, we
1050 * have consistent AG blocks on disk. We have not recovered EFIs yet,
1051 * but they are recovered transactionally in the second recovery phase
1052 * later.
1053 *
1054 * Hence we can safely re-initialise incore superblock counters from
1055 * the per-ag data. These may not be correct if the filesystem was not
1056 * cleanly unmounted, so we need to wait for recovery to finish before
1057 * doing this.
1058 *
1059 * If the filesystem was cleanly unmounted, then we can trust the
1060 * values in the superblock to be correct and we don't need to do
1061 * anything here.
1062 *
1063 * If we are currently making the filesystem, the initialisation will
1064 * fail as the perag data is in an undefined state.
1065 */
1066
1067 if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
1068 !XFS_LAST_UNMOUNT_WAS_CLEAN(mp) &&
1069 !mp->m_sb.sb_inprogress) {
1070 error = xfs_initialize_perag_data(mp, sbp->sb_agcount);
1071 if (error) {
1072 goto error2;
1073 }
1074 }
1075 /*
990 * Get and sanity-check the root inode. 1076 * Get and sanity-check the root inode.
991 * Save the pointer to it in the mount structure. 1077 * Save the pointer to it in the mount structure.
992 */ 1078 */
@@ -1049,6 +1135,7 @@ xfs_mountfs(
1049 goto error4; 1135 goto error4;
1050 } 1136 }
1051 1137
1138
1052 /* 1139 /*
1053 * Complete the quota initialisation, post-log-replay component. 1140 * Complete the quota initialisation, post-log-replay component.
1054 */ 1141 */
@@ -1111,10 +1198,9 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
1111 xfs_binval(mp->m_rtdev_targp); 1198 xfs_binval(mp->m_rtdev_targp);
1112 } 1199 }
1113 1200
1201 xfs_log_sbcount(mp, 1);
1114 xfs_unmountfs_writesb(mp); 1202 xfs_unmountfs_writesb(mp);
1115
1116 xfs_unmountfs_wait(mp); /* wait for async bufs */ 1203 xfs_unmountfs_wait(mp); /* wait for async bufs */
1117
1118 xfs_log_unmount(mp); /* Done! No more fs ops. */ 1204 xfs_log_unmount(mp); /* Done! No more fs ops. */
1119 1205
1120 xfs_freesb(mp); 1206 xfs_freesb(mp);
@@ -1161,6 +1247,62 @@ xfs_unmountfs_wait(xfs_mount_t *mp)
1161} 1247}
1162 1248
1163int 1249int
1250xfs_fs_writable(xfs_mount_t *mp)
1251{
1252 bhv_vfs_t *vfsp = XFS_MTOVFS(mp);
1253
1254 return !(vfs_test_for_freeze(vfsp) || XFS_FORCED_SHUTDOWN(mp) ||
1255 (vfsp->vfs_flag & VFS_RDONLY));
1256}
1257
1258/*
1259 * xfs_log_sbcount
1260 *
1261 * Called either periodically to keep the on disk superblock values
1262 * roughly up to date or from unmount to make sure the values are
1263 * correct on a clean unmount.
1264 *
1265 * Note this code can be called during the process of freezing, so
1266 * we may need to use the transaction allocator which does not not
1267 * block when the transaction subsystem is in its frozen state.
1268 */
1269int
1270xfs_log_sbcount(
1271 xfs_mount_t *mp,
1272 uint sync)
1273{
1274 xfs_trans_t *tp;
1275 int error;
1276
1277 if (!xfs_fs_writable(mp))
1278 return 0;
1279
1280 xfs_icsb_sync_counters(mp);
1281
1282 /*
1283 * we don't need to do this if we are updating the superblock
1284 * counters on every modification.
1285 */
1286 if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
1287 return 0;
1288
1289 tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT);
1290 error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
1291 XFS_DEFAULT_LOG_COUNT);
1292 if (error) {
1293 xfs_trans_cancel(tp, 0);
1294 return error;
1295 }
1296
1297 xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS);
1298 if (sync)
1299 xfs_trans_set_sync(tp);
1300 xfs_trans_commit(tp, 0);
1301
1302 return 0;
1303}
1304
1305int
1164xfs_unmountfs_writesb(xfs_mount_t *mp) 1306xfs_unmountfs_writesb(xfs_mount_t *mp)
1165{ 1307{
1166 xfs_buf_t *sbp; 1308 xfs_buf_t *sbp;
@@ -1171,16 +1313,15 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
1171 * skip superblock write if fs is read-only, or 1313 * skip superblock write if fs is read-only, or
1172 * if we are doing a forced umount. 1314 * if we are doing a forced umount.
1173 */ 1315 */
1174 sbp = xfs_getsb(mp, 0);
1175 if (!(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY || 1316 if (!(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY ||
1176 XFS_FORCED_SHUTDOWN(mp))) { 1317 XFS_FORCED_SHUTDOWN(mp))) {
1177 1318
1178 xfs_icsb_sync_counters(mp); 1319 sbp = xfs_getsb(mp, 0);
1320 sb = XFS_BUF_TO_SBP(sbp);
1179 1321
1180 /* 1322 /*
1181 * mark shared-readonly if desired 1323 * mark shared-readonly if desired
1182 */ 1324 */
1183 sb = XFS_BUF_TO_SBP(sbp);
1184 if (mp->m_mk_sharedro) { 1325 if (mp->m_mk_sharedro) {
1185 if (!(sb->sb_flags & XFS_SBF_READONLY)) 1326 if (!(sb->sb_flags & XFS_SBF_READONLY))
1186 sb->sb_flags |= XFS_SBF_READONLY; 1327 sb->sb_flags |= XFS_SBF_READONLY;
@@ -1189,6 +1330,7 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
1189 xfs_fs_cmn_err(CE_NOTE, mp, 1330 xfs_fs_cmn_err(CE_NOTE, mp,
1190 "Unmounting, marking shared read-only"); 1331 "Unmounting, marking shared read-only");
1191 } 1332 }
1333
1192 XFS_BUF_UNDONE(sbp); 1334 XFS_BUF_UNDONE(sbp);
1193 XFS_BUF_UNREAD(sbp); 1335 XFS_BUF_UNREAD(sbp);
1194 XFS_BUF_UNDELAYWRITE(sbp); 1336 XFS_BUF_UNDELAYWRITE(sbp);
@@ -1203,8 +1345,8 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
1203 mp, sbp, XFS_BUF_ADDR(sbp)); 1345 mp, sbp, XFS_BUF_ADDR(sbp));
1204 if (error && mp->m_mk_sharedro) 1346 if (error && mp->m_mk_sharedro)
1205 xfs_fs_cmn_err(CE_ALERT, mp, "Superblock write error detected while unmounting. Filesystem may not be marked shared readonly"); 1347 xfs_fs_cmn_err(CE_ALERT, mp, "Superblock write error detected while unmounting. Filesystem may not be marked shared readonly");
1348 xfs_buf_relse(sbp);
1206 } 1349 }
1207 xfs_buf_relse(sbp);
1208 return error; 1350 return error;
1209} 1351}
1210 1352