aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDave Chinner <dchinner@redhat.com>2015-07-28 21:48:01 -0400
committerDave Chinner <david@fromorbit.com>2015-07-28 21:48:01 -0400
commite3c32ee9e3e747fec01eb38e6610a9157d44c3ea (patch)
treee9e87e9ecbe30a63c223c5b505d0d5eecd320a4e
parentb2442c5a7fe92cca08437070c8a45a7aa0d1703e (diff)
xfs: remote attribute headers contain an invalid LSN
In recent testing, a system that crashed failed log recovery on restart with a bad symlink buffer magic number: XFS (vda): Starting recovery (logdev: internal) XFS (vda): Bad symlink block magic! XFS: Assertion failed: 0, file: fs/xfs/xfs_log_recover.c, line: 2060 On examination of the log via xfs_logprint, none of the symlink buffers in the log had a bad magic number, nor were any other types of buffer log format headers mis-identified as symlink buffers. Tracing was used to find the buffer the kernel was tripping over, and xfs_db identified it's contents as: 000: 5841524d 00000000 00000346 64d82b48 8983e692 d71e4680 a5f49e2c b317576e 020: 00000000 00602038 00000000 006034ce d0020000 00000000 4d4d4d4d 4d4d4d4d 040: 4d4d4d4d 4d4d4d4d 4d4d4d4d 4d4d4d4d 4d4d4d4d 4d4d4d4d 4d4d4d4d 4d4d4d4d 060: 4d4d4d4d 4d4d4d4d 4d4d4d4d 4d4d4d4d 4d4d4d4d 4d4d4d4d 4d4d4d4d 4d4d4d4d ..... This is a remote attribute buffer, which are notable in that they are not logged but are instead written synchronously by the remote attribute code so that they exist on disk before the attribute transactions are committed to the journal. The above remote attribute block has an invalid LSN in it - cycle 0xd002000, block 0 - which means when log recovery comes along to determine if the transaction that writes to the underlying block should be replayed, it sees a block that has a future LSN and so does not replay the buffer data in the transaction. Instead, it validates the buffer magic number and attaches the buffer verifier to it. It is this buffer magic number check that is failing in the above assert, indicating that we skipped replay due to the LSN of the underlying buffer. The problem here is that the remote attribute buffers cannot have a valid LSN placed into them, because the transaction that contains the attribute tree pointer changes and the block allocation that the attribute data is being written to hasn't yet been committed. Hence the LSN field in the attribute block is completely unwritten, thereby leaving the underlying contents of the block in the LSN field. It could have any value, and hence a future overwrite of the block by log recovery may or may not work correctly. Fix this by always writing an invalid LSN to the remote attribute block, as any buffer in log recovery that needs to write over the remote attribute should occur. We are protected from having old data written over the attribute by the fact that freeing the block before the remote attribute is written will result in the buffer being marked stale in the log and so all changes prior to the buffer stale transaction will be cancelled by log recovery. Hence it is safe to ignore the LSN in the case or synchronously written, unlogged metadata such as remote attribute blocks, and to ensure we do that correctly, we need to write an invalid LSN to all remote attribute blocks to trigger immediate recovery of metadata that is written over the top. As a further protection for filesystems that may already have remote attribute blocks with bad LSNs on disk, change the log recovery code to always trigger immediate recovery of metadata over remote attribute blocks. cc: <stable@vger.kernel.org> Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c29
-rw-r--r--fs/xfs/xfs_log_recover.c11
2 files changed, 31 insertions, 9 deletions
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 20de88d1bf86..2faec26962e8 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -159,11 +159,10 @@ xfs_attr3_rmt_write_verify(
159 struct xfs_buf *bp) 159 struct xfs_buf *bp)
160{ 160{
161 struct xfs_mount *mp = bp->b_target->bt_mount; 161 struct xfs_mount *mp = bp->b_target->bt_mount;
162 struct xfs_buf_log_item *bip = bp->b_fspriv; 162 int blksize = mp->m_attr_geo->blksize;
163 char *ptr; 163 char *ptr;
164 int len; 164 int len;
165 xfs_daddr_t bno; 165 xfs_daddr_t bno;
166 int blksize = mp->m_attr_geo->blksize;
167 166
168 /* no verification of non-crc buffers */ 167 /* no verification of non-crc buffers */
169 if (!xfs_sb_version_hascrc(&mp->m_sb)) 168 if (!xfs_sb_version_hascrc(&mp->m_sb))
@@ -175,16 +174,22 @@ xfs_attr3_rmt_write_verify(
175 ASSERT(len >= blksize); 174 ASSERT(len >= blksize);
176 175
177 while (len > 0) { 176 while (len > 0) {
177 struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr;
178
178 if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { 179 if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
179 xfs_buf_ioerror(bp, -EFSCORRUPTED); 180 xfs_buf_ioerror(bp, -EFSCORRUPTED);
180 xfs_verifier_error(bp); 181 xfs_verifier_error(bp);
181 return; 182 return;
182 } 183 }
183 if (bip) {
184 struct xfs_attr3_rmt_hdr *rmt;
185 184
186 rmt = (struct xfs_attr3_rmt_hdr *)ptr; 185 /*
187 rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn); 186 * Ensure we aren't writing bogus LSNs to disk. See
187 * xfs_attr3_rmt_hdr_set() for the explanation.
188 */
189 if (rmt->rm_lsn != cpu_to_be64(NULLCOMMITLSN)) {
190 xfs_buf_ioerror(bp, -EFSCORRUPTED);
191 xfs_verifier_error(bp);
192 return;
188 } 193 }
189 xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF); 194 xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF);
190 195
@@ -221,6 +226,18 @@ xfs_attr3_rmt_hdr_set(
221 rmt->rm_owner = cpu_to_be64(ino); 226 rmt->rm_owner = cpu_to_be64(ino);
222 rmt->rm_blkno = cpu_to_be64(bno); 227 rmt->rm_blkno = cpu_to_be64(bno);
223 228
229 /*
230 * Remote attribute blocks are written synchronously, so we don't
231 * have an LSN that we can stamp in them that makes any sense to log
232 * recovery. To ensure that log recovery handles overwrites of these
233 * blocks sanely (i.e. once they've been freed and reallocated as some
234 * other type of metadata) we need to ensure that the LSN has a value
235 * that tells log recovery to ignore the LSN and overwrite the buffer
236 * with whatever is in it's log. To do this, we use the magic
237 * NULLCOMMITLSN to indicate that the LSN is invalid.
238 */
239 rmt->rm_lsn = cpu_to_be64(NULLCOMMITLSN);
240
224 return sizeof(struct xfs_attr3_rmt_hdr); 241 return sizeof(struct xfs_attr3_rmt_hdr);
225} 242}
226 243
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 01dd228ca05e..480ebba8464f 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1886,9 +1886,14 @@ xlog_recover_get_buf_lsn(
1886 uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid; 1886 uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
1887 break; 1887 break;
1888 case XFS_ATTR3_RMT_MAGIC: 1888 case XFS_ATTR3_RMT_MAGIC:
1889 lsn = be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn); 1889 /*
1890 uuid = &((struct xfs_attr3_rmt_hdr *)blk)->rm_uuid; 1890 * Remote attr blocks are written synchronously, rather than
1891 break; 1891 * being logged. That means they do not contain a valid LSN
1892 * (i.e. transactionally ordered) in them, and hence any time we
1893 * see a buffer to replay over the top of a remote attribute
1894 * block we should simply do so.
1895 */
1896 goto recover_immediately;
1892 case XFS_SB_MAGIC: 1897 case XFS_SB_MAGIC:
1893 lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); 1898 lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
1894 uuid = &((struct xfs_dsb *)blk)->sb_uuid; 1899 uuid = &((struct xfs_dsb *)blk)->sb_uuid;