aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs
diff options
context:
space:
mode:
author Christoph Hellwig <hch@lst.de> 2013-04-03 01:11:17 -0400
committer Ben Myers <bpm@sgi.com> 2013-04-21 16:03:33 -0400
commit 93848a999cf9b9e4f4f77dba843a48c393f33c59 (patch)
tree 92c2fb4e741a8d70e70f9d31df308d1a30ecaef6 /fs/xfs
parent 3fe58f30b4fc3f8a9084b035a02bc0c67bee8d00 (diff)
xfs: add version 3 inode format with CRCs
Add a new inode version with a larger core. The primary objective is to allow for a crc of the inode, and location information (uuid and ino) to verify it was written in the right place. We also extend it by: a creation time (for Samba); a changecount (for NFSv4); a flush sequence (in LSN format for recovery); an additional inode flags field; and some additional padding. These additional fields are not implemented yet, but already laid out in the structure. [dchinner@redhat.com] Added LSN and flags field, some factoring and rework to capture all the necessary information in the crc calculation. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Ben Myers <bpm@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com>
Diffstat (limited to 'fs/xfs')
-rw-r--r-- fs/xfs/xfs_buf_item.h 4
-rw-r--r-- fs/xfs/xfs_dinode.h 33
-rw-r--r-- fs/xfs/xfs_ialloc.c 50
-rw-r--r-- fs/xfs/xfs_inode.c 179
-rw-r--r-- fs/xfs/xfs_inode.h 26
-rw-r--r-- fs/xfs/xfs_inode_item.c 2
-rw-r--r-- fs/xfs/xfs_log_recover.c 32
-rw-r--r-- fs/xfs/xfs_trans_buf.c 5
8 files changed, 254 insertions, 77 deletions
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index c25660691e08..abae8c8c4ec4 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -48,6 +48,7 @@ extern kmem_zone_t *xfs_buf_item_zone;
48#define XFS_BLF_AGF_BUF (1<<6) 48#define XFS_BLF_AGF_BUF (1<<6)
49#define XFS_BLF_AGFL_BUF (1<<7) 49#define XFS_BLF_AGFL_BUF (1<<7)
50#define XFS_BLF_AGI_BUF (1<<8) 50#define XFS_BLF_AGI_BUF (1<<8)
51#define XFS_BLF_DINO_BUF (1<<9)
51 52
52#define XFS_BLF_TYPE_MASK \ 53#define XFS_BLF_TYPE_MASK \
53 (XFS_BLF_UDQUOT_BUF | \ 54 (XFS_BLF_UDQUOT_BUF | \
@@ -56,7 +57,8 @@ extern kmem_zone_t *xfs_buf_item_zone;
56 XFS_BLF_BTREE_BUF | \ 57 XFS_BLF_BTREE_BUF | \
57 XFS_BLF_AGF_BUF | \ 58 XFS_BLF_AGF_BUF | \
58 XFS_BLF_AGFL_BUF | \ 59 XFS_BLF_AGFL_BUF | \
59 XFS_BLF_AGI_BUF) 60 XFS_BLF_AGI_BUF | \
61 XFS_BLF_DINO_BUF)
60 62
61#define XFS_BLF_CHUNK 128 63#define XFS_BLF_CHUNK 128
62#define XFS_BLF_SHIFT 7 64#define XFS_BLF_SHIFT 7
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index 6b5bd1745dbe..f7a0e95d197a 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -19,7 +19,7 @@
19#define __XFS_DINODE_H__ 19#define __XFS_DINODE_H__
20 20
21#define XFS_DINODE_MAGIC 0x494e /* 'IN' */ 21#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
22#define XFS_DINODE_GOOD_VERSION(v) (((v) == 1 || (v) == 2)) 22#define XFS_DINODE_GOOD_VERSION(v) ((v) >= 1 && (v) <= 3)
23 23
24typedef struct xfs_timestamp { 24typedef struct xfs_timestamp {
25 __be32 t_sec; /* timestamp seconds */ 25 __be32 t_sec; /* timestamp seconds */
@@ -70,11 +70,36 @@ typedef struct xfs_dinode {
70 70
71 /* di_next_unlinked is the only non-core field in the old dinode */ 71 /* di_next_unlinked is the only non-core field in the old dinode */
72 __be32 di_next_unlinked;/* agi unlinked list ptr */ 72 __be32 di_next_unlinked;/* agi unlinked list ptr */
73} __attribute__((packed)) xfs_dinode_t; 73
74 /* start of the extended dinode, writable fields */
75 __le32 di_crc; /* CRC of the inode */
76 __be64 di_changecount; /* number of attribute changes */
77 __be64 di_lsn; /* flush sequence */
78 __be64 di_flags2; /* more random flags */
79 __u8 di_pad2[16]; /* more padding for future expansion */
80
81 /* fields only written to during inode creation */
82 xfs_timestamp_t di_crtime; /* time created */
83 __be64 di_ino; /* inode number */
84 uuid_t di_uuid; /* UUID of the filesystem */
85
86 /* structure must be padded to 64 bit alignment */
87} xfs_dinode_t;
74 88
75#define DI_MAX_FLUSH 0xffff 89#define DI_MAX_FLUSH 0xffff
76 90
77/* 91/*
92 * Size of the core inode on disk. Version 1 and 2 inodes have
93 * the same size, but version 3 has grown a few additional fields.
94 */
95static inline uint xfs_dinode_size(int version)
96{
97 if (version == 3)
98 return sizeof(struct xfs_dinode);
99 return offsetof(struct xfs_dinode, di_crc);
100}
101
102/*
78 * The 32 bit link count in the inode theoretically maxes out at UINT_MAX. 103 * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
79 * Since the pathconf interface is signed, we use 2^31 - 1 instead. 104 * Since the pathconf interface is signed, we use 2^31 - 1 instead.
80 * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX. 105 * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX.
@@ -105,7 +130,7 @@ typedef enum xfs_dinode_fmt {
105 * Inode size for given fs. 130 * Inode size for given fs.
106 */ 131 */
107#define XFS_LITINO(mp, version) \ 132#define XFS_LITINO(mp, version) \
108 ((int)(((mp)->m_sb.sb_inodesize) - sizeof(struct xfs_dinode))) 133 ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version)))
109 134
110#define XFS_BROOT_SIZE_ADJ(ip) \ 135#define XFS_BROOT_SIZE_ADJ(ip) \
111 (XFS_BMBT_BLOCK_LEN((ip)->i_mount) - sizeof(xfs_bmdr_block_t)) 136 (XFS_BMBT_BLOCK_LEN((ip)->i_mount) - sizeof(xfs_bmdr_block_t))
@@ -133,7 +158,7 @@ typedef enum xfs_dinode_fmt {
133 * Return pointers to the data or attribute forks. 158 * Return pointers to the data or attribute forks.
134 */ 159 */
135#define XFS_DFORK_DPTR(dip) \ 160#define XFS_DFORK_DPTR(dip) \
136 ((char *)(dip) + sizeof(struct xfs_dinode)) 161 ((char *)dip + xfs_dinode_size(dip->di_version))
137#define XFS_DFORK_APTR(dip) \ 162#define XFS_DFORK_APTR(dip) \
138 (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip)) 163 (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
139#define XFS_DFORK_PTR(dip,w) \ 164#define XFS_DFORK_PTR(dip,w) \
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 6d0a4954aa8c..3039f829c96a 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -167,6 +167,7 @@ xfs_ialloc_inode_init(
167 int version; 167 int version;
168 int i, j; 168 int i, j;
169 xfs_daddr_t d; 169 xfs_daddr_t d;
170 xfs_ino_t ino = 0;
170 171
171 /* 172 /*
172 * Loop over the new block(s), filling in the inodes. 173 * Loop over the new block(s), filling in the inodes.
@@ -185,13 +186,29 @@ xfs_ialloc_inode_init(
185 } 186 }
186 187
187 /* 188 /*
188 * Figure out what version number to use in the inodes we create. 189 * Figure out what version number to use in the inodes we create. If
189 * If the superblock version has caught up to the one that supports 190 * the superblock version has caught up to the one that supports the new
190 * the new inode format, then use the new inode version. Otherwise 191 * inode format, then use the new inode version. Otherwise use the old
191 * use the old version so that old kernels will continue to be 192 * version so that old kernels will continue to be able to use the file
192 * able to use the file system. 193 * system.
194 *
195 * For v3 inodes, we also need to write the inode number into the inode,
196 * so calculate the first inode number of the chunk here as
197 * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not
198 * across multiple filesystem blocks (such as a cluster) and so cannot
199 * be used in the cluster buffer loop below.
200 *
201 * Further, because we are writing the inode directly into the buffer
202 * and calculating a CRC on the entire inode, we have to log the entire
203 * inode so that the entire range the CRC covers is present in the log.
204 * That means for v3 inode we log the entire buffer rather than just the
205 * inode cores.
193 */ 206 */
194 if (xfs_sb_version_hasnlink(&mp->m_sb)) 207 if (xfs_sb_version_hascrc(&mp->m_sb)) {
208 version = 3;
209 ino = XFS_AGINO_TO_INO(mp, agno,
210 XFS_OFFBNO_TO_AGINO(mp, agbno, 0));
211 } else if (xfs_sb_version_hasnlink(&mp->m_sb))
195 version = 2; 212 version = 2;
196 else 213 else
197 version = 1; 214 version = 1;
@@ -214,17 +231,32 @@ xfs_ialloc_inode_init(
214 * individual transactions causing a lot of log traffic. 231 * individual transactions causing a lot of log traffic.
215 */ 232 */
216 fbuf->b_ops = &xfs_inode_buf_ops; 233 fbuf->b_ops = &xfs_inode_buf_ops;
217 xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); 234 xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
218 for (i = 0; i < ninodes; i++) { 235 for (i = 0; i < ninodes; i++) {
219 int ioffset = i << mp->m_sb.sb_inodelog; 236 int ioffset = i << mp->m_sb.sb_inodelog;
220 uint isize = sizeof(struct xfs_dinode); 237 uint isize = xfs_dinode_size(version);
221 238
222 free = xfs_make_iptr(mp, fbuf, i); 239 free = xfs_make_iptr(mp, fbuf, i);
223 free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); 240 free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
224 free->di_version = version; 241 free->di_version = version;
225 free->di_gen = cpu_to_be32(gen); 242 free->di_gen = cpu_to_be32(gen);
226 free->di_next_unlinked = cpu_to_be32(NULLAGINO); 243 free->di_next_unlinked = cpu_to_be32(NULLAGINO);
227 xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1); 244
245 if (version == 3) {
246 free->di_ino = cpu_to_be64(ino);
247 ino++;
248 uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid);
249 xfs_dinode_calc_crc(mp, free);
250 } else {
251 /* just log the inode core */
252 xfs_trans_log_buf(tp, fbuf, ioffset,
253 ioffset + isize - 1);
254 }
255 }
256 if (version == 3) {
257 /* need to log the entire buffer */
258 xfs_trans_log_buf(tp, fbuf, 0,
259 BBTOB(fbuf->b_length) - 1);
228 } 260 }
229 xfs_trans_inode_alloc_buf(tp, fbuf); 261 xfs_trans_inode_alloc_buf(tp, fbuf);
230 } 262 }
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 202ce37e66cb..558ef4947206 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -44,6 +44,7 @@
44#include "xfs_quota.h" 44#include "xfs_quota.h"
45#include "xfs_filestream.h" 45#include "xfs_filestream.h"
46#include "xfs_vnodeops.h" 46#include "xfs_vnodeops.h"
47#include "xfs_cksum.h"
47#include "xfs_trace.h" 48#include "xfs_trace.h"
48#include "xfs_icache.h" 49#include "xfs_icache.h"
49 50
@@ -866,6 +867,17 @@ xfs_dinode_from_disk(
866 to->di_dmstate = be16_to_cpu(from->di_dmstate); 867 to->di_dmstate = be16_to_cpu(from->di_dmstate);
867 to->di_flags = be16_to_cpu(from->di_flags); 868 to->di_flags = be16_to_cpu(from->di_flags);
868 to->di_gen = be32_to_cpu(from->di_gen); 869 to->di_gen = be32_to_cpu(from->di_gen);
870
871 if (to->di_version == 3) {
872 to->di_changecount = be64_to_cpu(from->di_changecount);
873 to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
874 to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
875 to->di_flags2 = be64_to_cpu(from->di_flags2);
876 to->di_ino = be64_to_cpu(from->di_ino);
877 to->di_lsn = be64_to_cpu(from->di_lsn);
878 memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
879 uuid_copy(&to->di_uuid, &from->di_uuid);
880 }
869} 881}
870 882
871void 883void
@@ -902,6 +914,17 @@ xfs_dinode_to_disk(
902 to->di_dmstate = cpu_to_be16(from->di_dmstate); 914 to->di_dmstate = cpu_to_be16(from->di_dmstate);
903 to->di_flags = cpu_to_be16(from->di_flags); 915 to->di_flags = cpu_to_be16(from->di_flags);
904 to->di_gen = cpu_to_be32(from->di_gen); 916 to->di_gen = cpu_to_be32(from->di_gen);
917
918 if (from->di_version == 3) {
919 to->di_changecount = cpu_to_be64(from->di_changecount);
920 to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
921 to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
922 to->di_flags2 = cpu_to_be64(from->di_flags2);
923 to->di_ino = cpu_to_be64(from->di_ino);
924 to->di_lsn = cpu_to_be64(from->di_lsn);
925 memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
926 uuid_copy(&to->di_uuid, &from->di_uuid);
927 }
905} 928}
906 929
907STATIC uint 930STATIC uint
@@ -962,6 +985,47 @@ xfs_dic2xflags(
962 (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0); 985 (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
963} 986}
964 987
988static bool
989xfs_dinode_verify(
990 struct xfs_mount *mp,
991 struct xfs_inode *ip,
992 struct xfs_dinode *dip)
993{
994 if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
995 return false;
996
997 /* only version 3 or greater inodes are extensively verified here */
998 if (dip->di_version < 3)
999 return true;
1000
1001 if (!xfs_sb_version_hascrc(&mp->m_sb))
1002 return false;
1003 if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
1004 offsetof(struct xfs_dinode, di_crc)))
1005 return false;
1006 if (be64_to_cpu(dip->di_ino) != ip->i_ino)
1007 return false;
1008 if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid))
1009 return false;
1010 return true;
1011}
1012
1013void
1014xfs_dinode_calc_crc(
1015 struct xfs_mount *mp,
1016 struct xfs_dinode *dip)
1017{
1018 __uint32_t crc;
1019
1020 if (dip->di_version < 3)
1021 return;
1022
1023 ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
1024 crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
1025 offsetof(struct xfs_dinode, di_crc));
1026 dip->di_crc = xfs_end_cksum(crc);
1027}
1028
965/* 1029/*
966 * Read the disk inode attributes into the in-core inode structure. 1030 * Read the disk inode attributes into the in-core inode structure.
967 */ 1031 */
@@ -990,17 +1054,13 @@ xfs_iread(
990 if (error) 1054 if (error)
991 return error; 1055 return error;
992 1056
993 /* 1057 /* even unallocated inodes are verified */
994 * If we got something that isn't an inode it means someone 1058 if (!xfs_dinode_verify(mp, ip, dip)) {
995 * (nfs or dmi) has a stale handle. 1059 xfs_alert(mp, "%s: validation failed for inode %lld failed",
996 */ 1060 __func__, ip->i_ino);
997 if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) { 1061
998#ifdef DEBUG 1062 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
999 xfs_alert(mp, 1063 error = XFS_ERROR(EFSCORRUPTED);
1000 "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)",
1001 __func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC);
1002#endif /* DEBUG */
1003 error = XFS_ERROR(EINVAL);
1004 goto out_brelse; 1064 goto out_brelse;
1005 } 1065 }
1006 1066
@@ -1022,10 +1082,20 @@ xfs_iread(
1022 goto out_brelse; 1082 goto out_brelse;
1023 } 1083 }
1024 } else { 1084 } else {
1085 /*
1086 * Partial initialisation of the in-core inode. Just the bits
1087 * that xfs_ialloc won't overwrite or relies on being correct.
1088 */
1025 ip->i_d.di_magic = be16_to_cpu(dip->di_magic); 1089 ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
1026 ip->i_d.di_version = dip->di_version; 1090 ip->i_d.di_version = dip->di_version;
1027 ip->i_d.di_gen = be32_to_cpu(dip->di_gen); 1091 ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
1028 ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); 1092 ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
1093
1094 if (dip->di_version == 3) {
1095 ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
1096 uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
1097 }
1098
1029 /* 1099 /*
1030 * Make sure to pull in the mode here as well in 1100 * Make sure to pull in the mode here as well in
1031 * case the inode is released without being used. 1101 * case the inode is released without being used.
@@ -1161,6 +1231,7 @@ xfs_ialloc(
1161 xfs_buf_t **ialloc_context, 1231 xfs_buf_t **ialloc_context,
1162 xfs_inode_t **ipp) 1232 xfs_inode_t **ipp)
1163{ 1233{
1234 struct xfs_mount *mp = tp->t_mountp;
1164 xfs_ino_t ino; 1235 xfs_ino_t ino;
1165 xfs_inode_t *ip; 1236 xfs_inode_t *ip;
1166 uint flags; 1237 uint flags;
@@ -1187,7 +1258,7 @@ xfs_ialloc(
1187 * This is because we're setting fields here we need 1258 * This is because we're setting fields here we need
1188 * to prevent others from looking at until we're done. 1259 * to prevent others from looking at until we're done.
1189 */ 1260 */
1190 error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE, 1261 error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE,
1191 XFS_ILOCK_EXCL, &ip); 1262 XFS_ILOCK_EXCL, &ip);
1192 if (error) 1263 if (error)
1193 return error; 1264 return error;
@@ -1208,7 +1279,7 @@ xfs_ialloc(
1208 * the inode version number now. This way we only do the conversion 1279 * the inode version number now. This way we only do the conversion
1209 * here rather than here and in the flush/logging code. 1280 * here rather than here and in the flush/logging code.
1210 */ 1281 */
1211 if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) && 1282 if (xfs_sb_version_hasnlink(&mp->m_sb) &&
1212 ip->i_d.di_version == 1) { 1283 ip->i_d.di_version == 1) {
1213 ip->i_d.di_version = 2; 1284 ip->i_d.di_version = 2;
1214 /* 1285 /*
@@ -1258,6 +1329,19 @@ xfs_ialloc(
1258 ip->i_d.di_dmevmask = 0; 1329 ip->i_d.di_dmevmask = 0;
1259 ip->i_d.di_dmstate = 0; 1330 ip->i_d.di_dmstate = 0;
1260 ip->i_d.di_flags = 0; 1331 ip->i_d.di_flags = 0;
1332
1333 if (ip->i_d.di_version == 3) {
1334 ASSERT(ip->i_d.di_ino == ino);
1335 ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid));
1336 ip->i_d.di_crc = 0;
1337 ip->i_d.di_changecount = 1;
1338 ip->i_d.di_lsn = 0;
1339 ip->i_d.di_flags2 = 0;
1340 memset(&(ip->i_d.di_pad2[0]), 0, sizeof(ip->i_d.di_pad2));
1341 ip->i_d.di_crtime = ip->i_d.di_mtime;
1342 }
1343
1344
1261 flags = XFS_ILOG_CORE; 1345 flags = XFS_ILOG_CORE;
1262 switch (mode & S_IFMT) { 1346 switch (mode & S_IFMT) {
1263 case S_IFIFO: 1347 case S_IFIFO:
@@ -2716,20 +2800,18 @@ abort_out:
2716 2800
2717STATIC int 2801STATIC int
2718xfs_iflush_int( 2802xfs_iflush_int(
2719 xfs_inode_t *ip, 2803 struct xfs_inode *ip,
2720 xfs_buf_t *bp) 2804 struct xfs_buf *bp)
2721{ 2805{
2722 xfs_inode_log_item_t *iip; 2806 struct xfs_inode_log_item *iip = ip->i_itemp;
2723 xfs_dinode_t *dip; 2807 struct xfs_dinode *dip;
2724 xfs_mount_t *mp; 2808 struct xfs_mount *mp = ip->i_mount;
2725 2809
2726 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2810 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2727 ASSERT(xfs_isiflocked(ip)); 2811 ASSERT(xfs_isiflocked(ip));
2728 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 2812 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
2729 ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); 2813 ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
2730 2814 ASSERT(iip != NULL && iip->ili_fields != 0);
2731 iip = ip->i_itemp;
2732 mp = ip->i_mount;
2733 2815
2734 /* set *dip = inode's place in the buffer */ 2816 /* set *dip = inode's place in the buffer */
2735 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); 2817 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
@@ -2790,9 +2872,9 @@ xfs_iflush_int(
2790 } 2872 }
2791 /* 2873 /*
2792 * bump the flush iteration count, used to detect flushes which 2874 * bump the flush iteration count, used to detect flushes which
2793 * postdate a log record during recovery. 2875 * postdate a log record during recovery. This is redundant as we now
2876 * log every change and hence this can't happen. Still, it doesn't hurt.
2794 */ 2877 */
2795
2796 ip->i_d.di_flushiter++; 2878 ip->i_d.di_flushiter++;
2797 2879
2798 /* 2880 /*
@@ -2868,41 +2950,30 @@ xfs_iflush_int(
2868 * need the AIL lock, because it is a 64 bit value that cannot be read 2950 * need the AIL lock, because it is a 64 bit value that cannot be read
2869 * atomically. 2951 * atomically.
2870 */ 2952 */
2871 if (iip != NULL && iip->ili_fields != 0) { 2953 iip->ili_last_fields = iip->ili_fields;
2872 iip->ili_last_fields = iip->ili_fields; 2954 iip->ili_fields = 0;
2873 iip->ili_fields = 0; 2955 iip->ili_logged = 1;
2874 iip->ili_logged = 1;
2875 2956
2876 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, 2957 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
2877 &iip->ili_item.li_lsn); 2958 &iip->ili_item.li_lsn);
2878 2959
2879 /* 2960 /*
2880 * Attach the function xfs_iflush_done to the inode's 2961 * Attach the function xfs_iflush_done to the inode's
2881 * buffer. This will remove the inode from the AIL 2962 * buffer. This will remove the inode from the AIL
2882 * and unlock the inode's flush lock when the inode is 2963 * and unlock the inode's flush lock when the inode is
2883 * completely written to disk. 2964 * completely written to disk.
2884 */ 2965 */
2885 xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); 2966 xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
2886 2967
2887 ASSERT(bp->b_fspriv != NULL); 2968 /* update the lsn in the on disk inode if required */
2888 ASSERT(bp->b_iodone != NULL); 2969 if (ip->i_d.di_version == 3)
2889 } else { 2970 dip->di_lsn = cpu_to_be64(iip->ili_item.li_lsn);
2890 /* 2971
2891 * We're flushing an inode which is not in the AIL and has 2972 /* generate the checksum. */
2892 * not been logged. For this case we can immediately drop 2973 xfs_dinode_calc_crc(mp, dip);
2893 * the inode flush lock because we can avoid the whole
2894 * AIL state thing. It's OK to drop the flush lock now,
2895 * because we've already locked the buffer and to do anything
2896 * you really need both.
2897 */
2898 if (iip != NULL) {
2899 ASSERT(iip->ili_logged == 0);
2900 ASSERT(iip->ili_last_fields == 0);
2901 ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0);
2902 }
2903 xfs_ifunlock(ip);
2904 }
2905 2974
2975 ASSERT(bp->b_fspriv != NULL);
2976 ASSERT(bp->b_iodone != NULL);
2906 return 0; 2977 return 0;
2907 2978
2908corrupt_out: 2979corrupt_out:
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index b8520b5c3a18..91129794aaec 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -150,13 +150,38 @@ typedef struct xfs_icdinode {
150 __uint16_t di_dmstate; /* DMIG state info */ 150 __uint16_t di_dmstate; /* DMIG state info */
151 __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ 151 __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
152 __uint32_t di_gen; /* generation number */ 152 __uint32_t di_gen; /* generation number */
153
154 /* di_next_unlinked is the only non-core field in the old dinode */
155 xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */
156
157 /* start of the extended dinode, writable fields */
158 __uint32_t di_crc; /* CRC of the inode */
159 __uint64_t di_changecount; /* number of attribute changes */
160 xfs_lsn_t di_lsn; /* flush sequence */
161 __uint64_t di_flags2; /* more random flags */
162 __uint8_t di_pad2[16]; /* more padding for future expansion */
163
164 /* fields only written to during inode creation */
165 xfs_ictimestamp_t di_crtime; /* time created */
166 xfs_ino_t di_ino; /* inode number */
167 uuid_t di_uuid; /* UUID of the filesystem */
168
169 /* structure must be padded to 64 bit alignment */
153} xfs_icdinode_t; 170} xfs_icdinode_t;
154 171
172static inline uint xfs_icdinode_size(int version)
173{
174 if (version == 3)
175 return sizeof(struct xfs_icdinode);
176 return offsetof(struct xfs_icdinode, di_next_unlinked);
177}
178
155/* 179/*
156 * Flags for xfs_ichgtime(). 180 * Flags for xfs_ichgtime().
157 */ 181 */
158#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ 182#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
159#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ 183#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
184#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */
160 185
161/* 186/*
162 * Per-fork incore inode flags. 187 * Per-fork incore inode flags.
@@ -556,6 +581,7 @@ int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
556 struct xfs_buf **, uint, uint); 581 struct xfs_buf **, uint, uint);
557int xfs_iread(struct xfs_mount *, struct xfs_trans *, 582int xfs_iread(struct xfs_mount *, struct xfs_trans *,
558 struct xfs_inode *, uint); 583 struct xfs_inode *, uint);
584void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
559void xfs_dinode_to_disk(struct xfs_dinode *, 585void xfs_dinode_to_disk(struct xfs_dinode *,
560 struct xfs_icdinode *); 586 struct xfs_icdinode *);
561void xfs_idestroy_fork(struct xfs_inode *, int); 587void xfs_idestroy_fork(struct xfs_inode *, int);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index f034bd1652f0..f76ff52e43c0 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -179,7 +179,7 @@ xfs_inode_item_format(
179 nvecs = 1; 179 nvecs = 1;
180 180
181 vecp->i_addr = &ip->i_d; 181 vecp->i_addr = &ip->i_d;
182 vecp->i_len = sizeof(struct xfs_icdinode); 182 vecp->i_len = xfs_icdinode_size(ip->i_d.di_version);
183 vecp->i_type = XLOG_REG_TYPE_ICORE; 183 vecp->i_type = XLOG_REG_TYPE_ICORE;
184 vecp++; 184 vecp++;
185 nvecs++; 185 nvecs++;
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 27b3ec214a67..287878219af7 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1786,6 +1786,7 @@ xlog_recover_do_inode_buffer(
1786 xfs_agino_t *buffer_nextp; 1786 xfs_agino_t *buffer_nextp;
1787 1787
1788 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); 1788 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1789 bp->b_ops = &xfs_inode_buf_ops;
1789 1790
1790 inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog; 1791 inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
1791 for (i = 0; i < inodes_per_buf; i++) { 1792 for (i = 0; i < inodes_per_buf; i++) {
@@ -1989,6 +1990,18 @@ xlog_recover_do_reg_buffer(
1989 } 1990 }
1990 bp->b_ops = &xfs_dquot_buf_ops; 1991 bp->b_ops = &xfs_dquot_buf_ops;
1991 break; 1992 break;
1993 case XFS_BLF_DINO_BUF:
1994 /*
1995 * we get here with inode allocation buffers, not buffers that
1996 * track unlinked list changes.
1997 */
1998 if (*(__be16 *)bp->b_addr != cpu_to_be16(XFS_DINODE_MAGIC)) {
1999 xfs_warn(mp, "Bad INODE block magic!");
2000 ASSERT(0);
2001 break;
2002 }
2003 bp->b_ops = &xfs_inode_buf_ops;
2004 break;
1992 default: 2005 default:
1993 break; 2006 break;
1994 } 2007 }
@@ -2277,6 +2290,7 @@ xlog_recover_inode_pass2(
2277 int attr_index; 2290 int attr_index;
2278 uint fields; 2291 uint fields;
2279 xfs_icdinode_t *dicp; 2292 xfs_icdinode_t *dicp;
2293 uint isize;
2280 int need_free = 0; 2294 int need_free = 0;
2281 2295
2282 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { 2296 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
@@ -2302,7 +2316,7 @@ xlog_recover_inode_pass2(
2302 trace_xfs_log_recover_inode_recover(log, in_f); 2316 trace_xfs_log_recover_inode_recover(log, in_f);
2303 2317
2304 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, 2318 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
2305 NULL); 2319 &xfs_inode_buf_ops);
2306 if (!bp) { 2320 if (!bp) {
2307 error = ENOMEM; 2321 error = ENOMEM;
2308 goto error; 2322 goto error;
@@ -2413,7 +2427,8 @@ xlog_recover_inode_pass2(
2413 error = EFSCORRUPTED; 2427 error = EFSCORRUPTED;
2414 goto error; 2428 goto error;
2415 } 2429 }
2416 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { 2430 isize = xfs_icdinode_size(dicp->di_version);
2431 if (unlikely(item->ri_buf[1].i_len > isize)) {
2417 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", 2432 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
2418 XFS_ERRLEVEL_LOW, mp, dicp); 2433 XFS_ERRLEVEL_LOW, mp, dicp);
2419 xfs_buf_relse(bp); 2434 xfs_buf_relse(bp);
@@ -2425,13 +2440,13 @@ xlog_recover_inode_pass2(
2425 } 2440 }
2426 2441
2427 /* The core is in in-core format */ 2442 /* The core is in in-core format */
2428 xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr); 2443 xfs_dinode_to_disk(dip, dicp);
2429 2444
2430 /* the rest is in on-disk format */ 2445 /* the rest is in on-disk format */
2431 if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) { 2446 if (item->ri_buf[1].i_len > isize) {
2432 memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode), 2447 memcpy((char *)dip + isize,
2433 item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode), 2448 item->ri_buf[1].i_addr + isize,
2434 item->ri_buf[1].i_len - sizeof(struct xfs_icdinode)); 2449 item->ri_buf[1].i_len - isize);
2435 } 2450 }
2436 2451
2437 fields = in_f->ilf_fields; 2452 fields = in_f->ilf_fields;
@@ -2515,6 +2530,9 @@ xlog_recover_inode_pass2(
2515 } 2530 }
2516 2531
2517write_inode_buffer: 2532write_inode_buffer:
2533 /* re-generate the checksum. */
2534 xfs_dinode_calc_crc(log->l_mp, dip);
2535
2518 ASSERT(bp->b_target->bt_mount == mp); 2536 ASSERT(bp->b_target->bt_mount == mp);
2519 bp->b_iodone = xlog_recover_iodone; 2537 bp->b_iodone = xlog_recover_iodone;
2520 xfs_buf_delwri_queue(bp, buffer_list); 2538 xfs_buf_delwri_queue(bp, buffer_list);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index f950edd0d537..8a0f6af51206 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -704,12 +704,13 @@ xfs_trans_inode_buf(
704 ASSERT(atomic_read(&bip->bli_refcount) > 0); 704 ASSERT(atomic_read(&bip->bli_refcount) > 0);
705 705
706 bip->bli_flags |= XFS_BLI_INODE_BUF; 706 bip->bli_flags |= XFS_BLI_INODE_BUF;
707 xfs_trans_buf_set_type(tp, bp, XFS_BLF_DINO_BUF);
707} 708}
708 709
709/* 710/*
710 * This call is used to indicate that the buffer is going to 711 * This call is used to indicate that the buffer is going to
711 * be staled and was an inode buffer. This means it gets 712 * be staled and was an inode buffer. This means it gets
712 * special processing during unpin - where any inodes 713 * special processing during unpin - where any inodes
713 * associated with the buffer should be removed from ail. 714 * associated with the buffer should be removed from ail.
714 * There is also special processing during recovery, 715 * There is also special processing during recovery,
715 * any replay of the inodes in the buffer needs to be 716 * any replay of the inodes in the buffer needs to be
@@ -728,6 +729,7 @@ xfs_trans_stale_inode_buf(
728 729
729 bip->bli_flags |= XFS_BLI_STALE_INODE; 730 bip->bli_flags |= XFS_BLI_STALE_INODE;
730 bip->bli_item.li_cb = xfs_buf_iodone; 731 bip->bli_item.li_cb = xfs_buf_iodone;
732 xfs_trans_buf_set_type(tp, bp, XFS_BLF_DINO_BUF);
731} 733}
732 734
733/* 735/*
@@ -751,6 +753,7 @@ xfs_trans_inode_alloc_buf(
751 ASSERT(atomic_read(&bip->bli_refcount) > 0); 753 ASSERT(atomic_read(&bip->bli_refcount) > 0);
752 754
753 bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF; 755 bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF;
756 xfs_trans_buf_set_type(tp, bp, XFS_BLF_DINO_BUF);
754} 757}
755 758
756/* 759/*