author		Linus Torvalds <torvalds@linux-foundation.org>	2012-12-12 12:19:45 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-12-12 12:19:45 -0500
commit		3f1c64f410e4394ecefadd7a597a7c20368a65fc (patch)
tree		10f15d6a222b15a34831f2d7d1e3ac26f1436638
parent		22a40fd9a60388aec8106b0baffc8f59f83bb1b4 (diff)
parent		f9668a09e32ac6d2aa22f44cc310e430a8f4a40f (diff)
Merge tag 'for-linus-v3.8-rc1' of git://oss.sgi.com/xfs/xfs

Pull xfs update from Ben Myers:
 "There is plenty going on, including the cleanup of xfssyncd, metadata
  verifiers, CRC infrastructure for the log, tracking of inodes with
  speculative allocation, a cleanup of xfs_fs_subr.c, fixes for
  XFS_IOC_ZERO_RANGE, an important fix related to log replay (only
  update the last_sync_lsn when a transaction completes), a fix for
  deadlock on AGF buffers, documentation and comment updates, and a few
  more cleanups and fixes.

  Details:
   - remove the xfssyncd mess
   - only update the last_sync_lsn when a transaction completes
   - zero allocation_args on the kernel stack
   - fix AGF/alloc workqueue deadlock
   - silence uninitialised f.file warning
   - Update inode alloc comments
   - Update mount options documentation
   - report projid32bit feature in geometry call
   - speculative preallocation inode tracking
   - fix attr tree double split corruption
   - fix broken error handling in xfs_vm_writepage
   - drop buffer io reference when a bad bio is built
   - add more attribute tree trace points
   - growfs infrastructure changes for 3.8
   - fs/xfs/xfs_fs_subr.c die die die
   - add CRC infrastructure
   - add CRC checks to the log
   - Remove description of nodelaylog mount option from xfs.txt
   - inode allocation should use unmapped buffers
   - byte range granularity for XFS_IOC_ZERO_RANGE
   - fix direct IO nested transaction deadlock
   - fix stray dquot unlock when reclaiming dquots
   - fix sparse reported log CRC endian issue"

Fix up trivial conflict in fs/xfs/xfs_fsops.c due to the same patch
having been applied twice (commits eaef854335ce and 1375cb65e87b: "xfs:
growfs: don't read garbage for new secondary superblocks") with later
updates to the affected code in the XFS tree.

* tag 'for-linus-v3.8-rc1' of git://oss.sgi.com/xfs/xfs: (78 commits)
  xfs: fix sparse reported log CRC endian issue
  xfs: fix stray dquot unlock when reclaiming dquots
  xfs: fix direct IO nested transaction deadlock.
  xfs: byte range granularity for XFS_IOC_ZERO_RANGE
  xfs: inode allocation should use unmapped buffers.
  xfs: Remove the description of nodelaylog mount option from xfs.txt
  xfs: add CRC checks to the log
  xfs: add CRC infrastructure
  xfs: convert buffer verifiers to an ops structure.
  xfs: connect up write verifiers to new buffers
  xfs: add pre-write metadata buffer verifier callbacks
  xfs: add buffer pre-write callback
  xfs: Add verifiers to dir2 data readahead.
  xfs: add xfs_da_node verification
  xfs: factor and verify attr leaf reads
  xfs: factor dir2 leaf read
  xfs: factor out dir2 data block reading
  xfs: factor dir2 free block reading
  xfs: verify dir2 block format buffers
  xfs: factor dir2 block read operations
  ...
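[Editor's note] The recurring theme of this pull is the buffer verifier infrastructure: each metadata buffer type gains a read-verify and a write-verify callback that funnel into one shared structural check, wired up as a const ops table. The ops structure itself is declared in fs/xfs/xfs_buf.h, which is not quoted on this page, so the following sketch of its shape is inferred from the usage visible in the hunks below (xfs_agf_buf_ops, xfs_agfl_buf_ops, xfs_allocbt_buf_ops, xfs_attr_leaf_buf_ops):

	/* Inferred shape of the new verifier hooks; the real definition
	 * lives in fs/xfs/xfs_buf.h, outside the hunks quoted here. */
	struct xfs_buf;

	struct xfs_buf_ops {
		void (*verify_read)(struct xfs_buf *bp);   /* run at read I/O completion */
		void (*verify_write)(struct xfs_buf *bp);  /* run before the buffer is written */
	};

Callers pass a table such as &xfs_agf_buf_ops as a new trailing argument to xfs_trans_read_buf(), and code that initialises fresh buffers assigns bp->b_ops directly, as the diff below shows.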
-rw-r--r--	Documentation/filesystems/xfs.txt	|  13
-rw-r--r--	fs/xfs/Kconfig	|  1
-rw-r--r--	fs/xfs/Makefile	|  4
-rw-r--r--	fs/xfs/uuid.h	|  6
-rw-r--r--	fs/xfs/xfs_ag.h	|  5
-rw-r--r--	fs/xfs/xfs_alloc.c	|  140
-rw-r--r--	fs/xfs/xfs_alloc.h	|  3
-rw-r--r--	fs/xfs/xfs_alloc_btree.c	|  77
-rw-r--r--	fs/xfs/xfs_alloc_btree.h	|  2
-rw-r--r--	fs/xfs/xfs_aops.c	|  83
-rw-r--r--	fs/xfs/xfs_attr.c	|  103
-rw-r--r--	fs/xfs/xfs_attr_leaf.c	|  143
-rw-r--r--	fs/xfs/xfs_attr_leaf.h	|  6
-rw-r--r--	fs/xfs/xfs_bmap.c	|  64
-rw-r--r--	fs/xfs/xfs_bmap_btree.c	|  63
-rw-r--r--	fs/xfs/xfs_bmap_btree.h	|  1
-rw-r--r--	fs/xfs/xfs_btree.c	|  111
-rw-r--r--	fs/xfs/xfs_btree.h	|  22
-rw-r--r--	fs/xfs/xfs_buf.c	|  59
-rw-r--r--	fs/xfs/xfs_buf.h	|  27
-rw-r--r--	fs/xfs/xfs_cksum.h	|  63
-rw-r--r--	fs/xfs/xfs_da_btree.c	|  141
-rw-r--r--	fs/xfs/xfs_da_btree.h	|  10
-rw-r--r--	fs/xfs/xfs_dfrag.c	|  13
-rw-r--r--	fs/xfs/xfs_dir2_block.c	|  436
-rw-r--r--	fs/xfs/xfs_dir2_data.c	|  170
-rw-r--r--	fs/xfs/xfs_dir2_leaf.c	|  172
-rw-r--r--	fs/xfs/xfs_dir2_node.c	|  288
-rw-r--r--	fs/xfs/xfs_dir2_priv.h	|  19
-rw-r--r--	fs/xfs/xfs_dquot.c	|  134
-rw-r--r--	fs/xfs/xfs_dquot.h	|  2
-rw-r--r--	fs/xfs/xfs_export.c	|  1
-rw-r--r--	fs/xfs/xfs_file.c	|  42
-rw-r--r--	fs/xfs/xfs_fs.h	|  33
-rw-r--r--	fs/xfs/xfs_fs_subr.c	|  96
-rw-r--r--	fs/xfs/xfs_fsops.c	|  141
-rw-r--r--	fs/xfs/xfs_globals.c	|  4
-rw-r--r--	fs/xfs/xfs_ialloc.c	|  83
-rw-r--r--	fs/xfs/xfs_ialloc.h	|  4
-rw-r--r--	fs/xfs/xfs_ialloc_btree.c	|  55
-rw-r--r--	fs/xfs/xfs_ialloc_btree.h	|  2
-rw-r--r--	fs/xfs/xfs_icache.c (renamed from fs/xfs/xfs_sync.c)	|  914
-rw-r--r--	fs/xfs/xfs_icache.h (renamed from fs/xfs/xfs_sync.h)	|  28
-rw-r--r--	fs/xfs/xfs_iget.c	|  705
-rw-r--r--	fs/xfs/xfs_inode.c	|  437
-rw-r--r--	fs/xfs/xfs_inode.h	|  12
-rw-r--r--	fs/xfs/xfs_ioctl.c	|  21
-rw-r--r--	fs/xfs/xfs_iomap.c	|  31
-rw-r--r--	fs/xfs/xfs_iops.c	|  8
-rw-r--r--	fs/xfs/xfs_itable.c	|  4
-rw-r--r--	fs/xfs/xfs_linux.h	|  2
-rw-r--r--	fs/xfs/xfs_log.c	|  241
-rw-r--r--	fs/xfs/xfs_log.h	|  4
-rw-r--r--	fs/xfs/xfs_log_priv.h	|  12
-rw-r--r--	fs/xfs/xfs_log_recover.c	|  146
-rw-r--r--	fs/xfs/xfs_mount.c	|  163
-rw-r--r--	fs/xfs/xfs_mount.h	|  13
-rw-r--r--	fs/xfs/xfs_qm.c	|  22
-rw-r--r--	fs/xfs/xfs_qm_syscalls.c	|  6
-rw-r--r--	fs/xfs/xfs_rtalloc.c	|  16
-rw-r--r--	fs/xfs/xfs_sb.h	|  7
-rw-r--r--	fs/xfs/xfs_super.c	|  148
-rw-r--r--	fs/xfs/xfs_super.h	|  1
-rw-r--r--	fs/xfs/xfs_sysctl.c	|  9
-rw-r--r--	fs/xfs/xfs_sysctl.h	|  1
-rw-r--r--	fs/xfs/xfs_trace.h	|  60
-rw-r--r--	fs/xfs/xfs_trans.h	|  19
-rw-r--r--	fs/xfs/xfs_trans_buf.c	|  9
-rw-r--r--	fs/xfs/xfs_vnodeops.c	|  168
-rw-r--r--	fs/xfs/xfs_vnodeops.h	|  9
70 files changed, 3747 insertions, 2311 deletions
diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
index 3fc0c31a6f5d..3e4b3dd1e046 100644
--- a/Documentation/filesystems/xfs.txt
+++ b/Documentation/filesystems/xfs.txt
@@ -43,7 +43,7 @@ When mounting an XFS filesystem, the following options are accepted.
 	Issue command to let the block device reclaim space freed by the
 	filesystem.  This is useful for SSD devices, thinly provisioned
 	LUNs and virtual machine images, but may have a performance
-	impact.  This option is incompatible with the nodelaylog option.
+	impact.
 
   dmapi
 	Enable the DMAPI (Data Management API) event callouts.
@@ -72,8 +72,15 @@ When mounting an XFS filesystem, the following options are accepted.
 	Indicates that XFS is allowed to create inodes at any location
 	in the filesystem, including those which will result in inode
 	numbers occupying more than 32 bits of significance.  This is
-	provided for backwards compatibility, but causes problems for
-	backup applications that cannot handle large inode numbers.
+	the default allocation option. Applications which do not handle
+	inode numbers bigger than 32 bits, should use inode32 option.
+
+  inode32
+	Indicates that XFS is limited to create inodes at locations which
+	will not result in inode numbers with more than 32 bits of
+	significance. This is provided for backwards compatibility, since
+	64 bits inode numbers might cause problems for some applications
+	that cannot handle large inode numbers.
 
   largeio/nolargeio
 	If "nolargeio" is specified, the optimal I/O reported in
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 6100ec0fa1d4..5a7ffe54f5d5 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -2,6 +2,7 @@ config XFS_FS
 	tristate "XFS filesystem support"
 	depends on BLOCK
 	select EXPORTFS
+	select LIBCRC32C
 	help
 	  XFS is a high performance journaling filesystem which originated
 	  on the SGI IRIX platform.  It is completely multi-threaded, can
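[Editor's note] LIBCRC32C is selected for the new checksumming helpers (fs/xfs/xfs_cksum.h in the diffstat above, not quoted on this page), which build on the kernel's crc32c() primitive. A minimal sketch of the idea, assuming the standard <linux/crc32c.h> API; the helper name and the ~0U seed below are illustrative assumptions, not taken from the diff:

	#include <linux/types.h>
	#include <linux/crc32c.h>

	/*
	 * Illustrative only: checksum a metadata block whose on-disk CRC
	 * field starts at cksum_offset. The field itself is hashed as zero
	 * so a read verifier can recompute the CRC and compare it against
	 * the stored value.
	 */
	static inline u32
	example_metadata_cksum(const char *buf, size_t length, size_t cksum_offset)
	{
		u32	zero = 0;
		u32	crc;

		crc = crc32c(~0U, buf, cksum_offset);	/* bytes before the CRC field */
		crc = crc32c(crc, &zero, sizeof(zero));	/* the field itself, as zero */
		return crc32c(crc, buf + cksum_offset + sizeof(zero),
			      length - cksum_offset - sizeof(zero));
	}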
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index d2bf974b1a2f..d02201df855b 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -37,9 +37,8 @@ xfs-y += xfs_aops.o \
 				   xfs_file.o \
 				   xfs_filestream.o \
 				   xfs_fsops.o \
-				   xfs_fs_subr.o \
 				   xfs_globals.o \
-				   xfs_iget.o \
+				   xfs_icache.o \
 				   xfs_ioctl.o \
 				   xfs_iomap.o \
 				   xfs_iops.o \
@@ -47,7 +46,6 @@ xfs-y += xfs_aops.o \
 				   xfs_message.o \
 				   xfs_mru_cache.o \
 				   xfs_super.o \
-				   xfs_sync.o \
 				   xfs_xattr.o \
 				   xfs_rename.o \
 				   xfs_utils.o \
diff --git a/fs/xfs/uuid.h b/fs/xfs/uuid.h
index 4732d71262cc..104db0f3bed6 100644
--- a/fs/xfs/uuid.h
+++ b/fs/xfs/uuid.h
@@ -26,4 +26,10 @@ extern int uuid_is_nil(uuid_t *uuid);
 extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
 extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
 
+static inline void
+uuid_copy(uuid_t *dst, uuid_t *src)
+{
+	memcpy(dst, src, sizeof(uuid_t));
+}
+
 #endif	/* __XFS_SUPPORT_UUID_H__ */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 44d65c1533c0..f2aeedb6a579 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -108,6 +108,8 @@ typedef struct xfs_agf {
 extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
 			xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
 
+extern const struct xfs_buf_ops xfs_agf_buf_ops;
+
 /*
  * Size of the unlinked inode hash table in the agi.
  */
@@ -161,6 +163,8 @@ typedef struct xfs_agi {
 extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
 			xfs_agnumber_t agno, struct xfs_buf **bpp);
 
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
+
 /*
  * The third a.g. block contains the a.g. freelist, an array
  * of block pointers to blocks owned by the allocation btree code.
@@ -233,6 +237,7 @@ typedef struct xfs_perag {
 #define XFS_ICI_NO_TAG		(-1)	/* special flag for an untagged lookup
 					   in xfs_inode_ag_iterator */
 #define XFS_ICI_RECLAIM_TAG	0	/* inode is to be reclaimed */
+#define XFS_ICI_EOFBLOCKS_TAG	1	/* inode has blocks beyond EOF */
 
 #define	XFS_AG_MAXLEVELS(mp)		((mp)->m_ag_maxlevels)
 #define	XFS_MIN_FREELIST_RAW(bl,cl,mp)	\
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 335206a9c698..393055fe3aef 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -430,6 +430,60 @@ xfs_alloc_fixup_trees(
 	return 0;
 }
 
+static void
+xfs_agfl_verify(
+	struct xfs_buf	*bp)
+{
+#ifdef WHEN_CRCS_COME_ALONG
+	/*
+	 * we cannot actually do any verification of the AGFL because mkfs does
+	 * not initialise the AGFL to zero or NULL. Hence the only valid part of
+	 * the AGFL is what the AGF says is active. We can't get to the AGF, so
+	 * we can't verify just those entries are valid.
+	 *
+	 * This problem goes away when the CRC format change comes along as that
+	 * requires the AGFL to be initialised by mkfs. At that point, we can
+	 * verify the blocks in the agfl -active or not- lie within the bounds
+	 * of the AG. Until then, just leave this check ifdef'd out.
+	 */
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_agfl	*agfl = XFS_BUF_TO_AGFL(bp);
+	int		agfl_ok = 1;
+
+	int		i;
+
+	for (i = 0; i < XFS_AGFL_SIZE(mp); i++) {
+		if (be32_to_cpu(agfl->agfl_bno[i]) == NULLAGBLOCK ||
+		    be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
+			agfl_ok = 0;
+	}
+
+	if (!agfl_ok) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agfl);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+#endif
+}
+
+static void
+xfs_agfl_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_agfl_verify(bp);
+}
+
+static void
+xfs_agfl_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_agfl_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_agfl_buf_ops = {
+	.verify_read = xfs_agfl_read_verify,
+	.verify_write = xfs_agfl_write_verify,
+};
+
 /*
  * Read in the allocation group free block array.
  */
@@ -447,7 +501,7 @@ xfs_alloc_read_agfl(
 	error = xfs_trans_read_buf(
 			mp, tp, mp->m_ddev_targp,
 			XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
-			XFS_FSS_TO_BB(mp, 1), 0, &bp);
+			XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
 	if (error)
 		return error;
 	ASSERT(!xfs_buf_geterror(bp));
@@ -2091,6 +2145,63 @@ xfs_alloc_put_freelist(
 	return 0;
 }
 
+static void
+xfs_agf_verify(
+	struct xfs_buf	*bp)
+ {
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_agf	*agf;
+	int		agf_ok;
+
+	agf = XFS_BUF_TO_AGF(bp);
+
+	agf_ok = agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
+		XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+		be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
+		be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
+		be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
+		be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp);
+
+	/*
+	 * during growfs operations, the perag is not fully initialised,
+	 * so we can't use it for any useful checking. growfs ensures we can't
+	 * use it by using uncached buffers that don't have the perag attached
+	 * so we can detect and avoid this problem.
+	 */
+	if (bp->b_pag)
+		agf_ok = agf_ok && be32_to_cpu(agf->agf_seqno) ==
+						bp->b_pag->pag_agno;
+
+	if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+		agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
+						be32_to_cpu(agf->agf_length);
+
+	if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
+			XFS_RANDOM_ALLOC_READ_AGF))) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+}
+
+static void
+xfs_agf_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_agf_verify(bp);
+}
+
+static void
+xfs_agf_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_agf_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_agf_buf_ops = {
+	.verify_read = xfs_agf_read_verify,
+	.verify_write = xfs_agf_write_verify,
+};
+
 /*
  * Read in the allocation group header (free/alloc section).
  */
@@ -2102,44 +2213,19 @@ xfs_read_agf(
 	int			flags,	/* XFS_BUF_ */
 	struct xfs_buf		**bpp)	/* buffer for the ag freelist header */
 {
-	struct xfs_agf	*agf;		/* ag freelist header */
-	int		agf_ok;		/* set if agf is consistent */
 	int		error;
 
 	ASSERT(agno != NULLAGNUMBER);
 	error = xfs_trans_read_buf(
 			mp, tp, mp->m_ddev_targp,
 			XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
-			XFS_FSS_TO_BB(mp, 1), flags, bpp);
+			XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
 	if (error)
 		return error;
 	if (!*bpp)
 		return 0;
 
 	ASSERT(!(*bpp)->b_error);
-	agf = XFS_BUF_TO_AGF(*bpp);
-
-	/*
-	 * Validate the magic number of the agf block.
-	 */
-	agf_ok =
-		agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
-		XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
-		be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
-		be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
-		be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
-		be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) &&
-		be32_to_cpu(agf->agf_seqno) == agno;
-	if (xfs_sb_version_haslazysbcount(&mp->m_sb))
-		agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
-				be32_to_cpu(agf->agf_length);
-	if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
-			XFS_RANDOM_ALLOC_READ_AGF))) {
-		XFS_CORRUPTION_ERROR("xfs_alloc_read_agf",
-				     XFS_ERRLEVEL_LOW, mp, agf);
-		xfs_trans_brelse(tp, *bpp);
-		return XFS_ERROR(EFSCORRUPTED);
-	}
 	xfs_buf_set_ref(*bpp, XFS_AGF_REF);
 	return 0;
 }
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index feacb061bab7..99d0a6101558 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -231,4 +231,7 @@ xfs_alloc_get_rec(
 	xfs_extlen_t		*len,	/* output: length of extent */
 	int			*stat);	/* output: success/failure */
 
+extern const struct xfs_buf_ops xfs_agf_buf_ops;
+extern const struct xfs_buf_ops xfs_agfl_buf_ops;
+
 #endif	/* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index f7876c6d6165..b1ddef6b2689 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -272,6 +272,82 @@ xfs_allocbt_key_diff(
 	return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
 }
 
+static void
+xfs_allocbt_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	struct xfs_perag	*pag = bp->b_pag;
+	unsigned int		level;
+	int			sblock_ok; /* block passes checks */
+
+	/*
+	 * magic number and level verification
+	 *
+	 * During growfs operations, we can't verify the exact level as the
+	 * perag is not fully initialised and hence not attached to the buffer.
+	 * In this case, check against the maximum tree depth.
+	 */
+	level = be16_to_cpu(block->bb_level);
+	switch (block->bb_magic) {
+	case cpu_to_be32(XFS_ABTB_MAGIC):
+		if (pag)
+			sblock_ok = level < pag->pagf_levels[XFS_BTNUM_BNOi];
+		else
+			sblock_ok = level < mp->m_ag_maxlevels;
+		break;
+	case cpu_to_be32(XFS_ABTC_MAGIC):
+		if (pag)
+			sblock_ok = level < pag->pagf_levels[XFS_BTNUM_CNTi];
+		else
+			sblock_ok = level < mp->m_ag_maxlevels;
+		break;
+	default:
+		sblock_ok = 0;
+		break;
+	}
+
+	/* numrecs verification */
+	sblock_ok = sblock_ok &&
+		be16_to_cpu(block->bb_numrecs) <= mp->m_alloc_mxr[level != 0];
+
+	/* sibling pointer verification */
+	sblock_ok = sblock_ok &&
+		(block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
+		 be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) &&
+		block->bb_u.s.bb_leftsib &&
+		(block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
+		 be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) &&
+		block->bb_u.s.bb_rightsib;
+
+	if (!sblock_ok) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+}
+
+static void
+xfs_allocbt_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_allocbt_verify(bp);
+}
+
+static void
+xfs_allocbt_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_allocbt_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_allocbt_buf_ops = {
+	.verify_read = xfs_allocbt_read_verify,
+	.verify_write = xfs_allocbt_write_verify,
+};
+
+
 #ifdef DEBUG
 STATIC int
 xfs_allocbt_keys_inorder(
@@ -327,6 +403,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
 	.init_rec_from_cur	= xfs_allocbt_init_rec_from_cur,
 	.init_ptr_from_cur	= xfs_allocbt_init_ptr_from_cur,
 	.key_diff		= xfs_allocbt_key_diff,
+	.buf_ops		= &xfs_allocbt_buf_ops,
 #ifdef DEBUG
 	.keys_inorder		= xfs_allocbt_keys_inorder,
 	.recs_inorder		= xfs_allocbt_recs_inorder,
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
index 359fb86ed876..7e89a2b429dd 100644
--- a/fs/xfs/xfs_alloc_btree.h
+++ b/fs/xfs/xfs_alloc_btree.h
@@ -93,4 +93,6 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
 		xfs_agnumber_t, xfs_btnum_t);
 extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
 
+extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
+
 #endif	/* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e57e2daa357c..4111a40ebe1a 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -124,7 +124,7 @@ xfs_setfilesize_trans_alloc(
 	ioend->io_append_trans = tp;
 
 	/*
-	 * We will pass freeze protection with a transaction.  So tell lockdep
+	 * We may pass freeze protection with a transaction.  So tell lockdep
 	 * we released it.
 	 */
 	rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
@@ -149,11 +149,13 @@ xfs_setfilesize(
 	xfs_fsize_t		isize;
 
 	/*
-	 * The transaction was allocated in the I/O submission thread,
-	 * thus we need to mark ourselves as beeing in a transaction
-	 * manually.
+	 * The transaction may have been allocated in the I/O submission thread,
+	 * thus we need to mark ourselves as beeing in a transaction manually.
+	 * Similarly for freeze protection.
 	 */
 	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
+			   0, 1, _THIS_IP_);
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
@@ -187,7 +189,8 @@ xfs_finish_ioend(
 
 	if (ioend->io_type == XFS_IO_UNWRITTEN)
 		queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
-	else if (ioend->io_append_trans)
+	else if (ioend->io_append_trans ||
+		 (ioend->io_isdirect && xfs_ioend_is_append(ioend)))
 		queue_work(mp->m_data_workqueue, &ioend->io_work);
 	else
 		xfs_destroy_ioend(ioend);
@@ -205,15 +208,6 @@ xfs_end_io(
 	struct xfs_inode *ip = XFS_I(ioend->io_inode);
 	int		error = 0;
 
-	if (ioend->io_append_trans) {
-		/*
-		 * We've got freeze protection passed with the transaction.
-		 * Tell lockdep about it.
-		 */
-		rwsem_acquire_read(
-			&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
-			0, 1, _THIS_IP_);
-	}
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 		ioend->io_error = -EIO;
 		goto done;
@@ -226,35 +220,31 @@ xfs_end_io(
 	 * range to normal written extens after the data I/O has finished.
 	 */
 	if (ioend->io_type == XFS_IO_UNWRITTEN) {
+		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
+						  ioend->io_size);
+	} else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) {
 		/*
-		 * For buffered I/O we never preallocate a transaction when
-		 * doing the unwritten extent conversion, but for direct I/O
-		 * we do not know if we are converting an unwritten extent
-		 * or not at the point where we preallocate the transaction.
+		 * For direct I/O we do not know if we need to allocate blocks
+		 * or not so we can't preallocate an append transaction as that
+		 * results in nested reservations and log space deadlocks. Hence
+		 * allocate the transaction here. While this is sub-optimal and
+		 * can block IO completion for some time, we're stuck with doing
+		 * it this way until we can pass the ioend to the direct IO
+		 * allocation callbacks and avoid nesting that way.
 		 */
-		if (ioend->io_append_trans) {
-			ASSERT(ioend->io_isdirect);
-
-			current_set_flags_nested(
-				&ioend->io_append_trans->t_pflags, PF_FSTRANS);
-			xfs_trans_cancel(ioend->io_append_trans, 0);
-		}
-
-		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
-						  ioend->io_size);
-		if (error) {
-			ioend->io_error = -error;
+		error = xfs_setfilesize_trans_alloc(ioend);
+		if (error)
 			goto done;
-		}
+		error = xfs_setfilesize(ioend);
 	} else if (ioend->io_append_trans) {
 		error = xfs_setfilesize(ioend);
-		if (error)
-			ioend->io_error = -error;
 	} else {
 		ASSERT(!xfs_ioend_is_append(ioend));
 	}
 
 done:
+	if (error)
+		ioend->io_error = -error;
 	xfs_destroy_ioend(ioend);
 }
 
@@ -1432,25 +1422,21 @@ xfs_vm_direct_IO(
 		size_t size = iov_length(iov, nr_segs);
 
 		/*
-		 * We need to preallocate a transaction for a size update
-		 * here.  In the case that this write both updates the size
-		 * and converts at least on unwritten extent we will cancel
-		 * the still clean transaction after the I/O has finished.
+		 * We cannot preallocate a size update transaction here as we
+		 * don't know whether allocation is necessary or not. Hence we
+		 * can only tell IO completion that one is necessary if we are
+		 * not doing unwritten extent conversion.
 		 */
 		iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT);
-		if (offset + size > XFS_I(inode)->i_d.di_size) {
-			ret = xfs_setfilesize_trans_alloc(ioend);
-			if (ret)
-				goto out_destroy_ioend;
+		if (offset + size > XFS_I(inode)->i_d.di_size)
 			ioend->io_isdirect = 1;
-		}
 
 		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
 					   offset, nr_segs,
 					   xfs_get_blocks_direct,
 					   xfs_end_io_direct_write, NULL, 0);
 		if (ret != -EIOCBQUEUED && iocb->private)
-			goto out_trans_cancel;
+			goto out_destroy_ioend;
 	} else {
 		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
 					   offset, nr_segs,
@@ -1460,15 +1446,6 @@ xfs_vm_direct_IO(
 
 	return ret;
 
-out_trans_cancel:
-	if (ioend->io_append_trans) {
-		current_set_flags_nested(&ioend->io_append_trans->t_pflags,
-					 PF_FSTRANS);
-		rwsem_acquire_read(
-			&inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
-			0, 1, _THIS_IP_);
-		xfs_trans_cancel(ioend->io_append_trans, 0);
-	}
 out_destroy_ioend:
 	xfs_destroy_ioend(ioend);
 	return ret;
@@ -1641,7 +1618,7 @@ xfs_vm_bmap(
 
 	trace_xfs_vm_bmap(XFS_I(inode));
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
+	filemap_write_and_wait(mapping);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 	return generic_block_bmap(mapping, block, xfs_get_blocks);
 }
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 0ca1f0be62d2..aaf472532b3c 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -903,11 +903,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 	 */
 	dp = args->dp;
 	args->blkno = 0;
-	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
-					     XFS_ATTR_FORK);
+	error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
 	if (error)
-		return(error);
-	ASSERT(bp != NULL);
+		return error;
 
 	/*
 	 * Look up the given attribute in the leaf block.  Figure out if
@@ -1031,12 +1029,12 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 		 * Read in the block containing the "old" attr, then
 		 * remove the "old" attr from that block (neat, huh!)
 		 */
-		error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1,
-					&bp, XFS_ATTR_FORK);
+		error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno,
+					   -1, &bp);
 		if (error)
-			return(error);
-		ASSERT(bp != NULL);
-		(void)xfs_attr_leaf_remove(bp, args);
+			return error;
+
+		xfs_attr_leaf_remove(bp, args);
 
 		/*
 		 * If the result is small enough, shrink it all into the inode.
@@ -1100,20 +1098,17 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
 	 */
 	dp = args->dp;
 	args->blkno = 0;
-	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
-					     XFS_ATTR_FORK);
-	if (error) {
-		return(error);
-	}
+	error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+	if (error)
+		return error;
 
-	ASSERT(bp != NULL);
 	error = xfs_attr_leaf_lookup_int(bp, args);
 	if (error == ENOATTR) {
 		xfs_trans_brelse(args->trans, bp);
 		return(error);
 	}
 
-	(void)xfs_attr_leaf_remove(bp, args);
+	xfs_attr_leaf_remove(bp, args);
 
 	/*
 	 * If the result is small enough, shrink it all into the inode.
@@ -1155,12 +1150,12 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
 	struct xfs_buf *bp;
 	int error;
 
+	trace_xfs_attr_leaf_get(args);
+
 	args->blkno = 0;
-	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
-					     XFS_ATTR_FORK);
+	error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
 	if (error)
-		return(error);
-	ASSERT(bp != NULL);
+		return error;
 
 	error = xfs_attr_leaf_lookup_int(bp, args);
 	if (error != EEXIST) {
@@ -1181,22 +1176,15 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
 STATIC int
 xfs_attr_leaf_list(xfs_attr_list_context_t *context)
 {
-	xfs_attr_leafblock_t *leaf;
 	int error;
 	struct xfs_buf *bp;
 
+	trace_xfs_attr_leaf_list(context);
+
 	context->cursor->blkno = 0;
-	error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK);
+	error = xfs_attr_leaf_read(NULL, context->dp, 0, -1, &bp);
 	if (error)
 		return XFS_ERROR(error);
-	ASSERT(bp != NULL);
-	leaf = bp->b_addr;
-	if (unlikely(leaf->hdr.info.magic != cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) {
-		XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW,
-				     context->dp->i_mount, leaf);
-		xfs_trans_brelse(NULL, bp);
-		return XFS_ERROR(EFSCORRUPTED);
-	}
 
 	error = xfs_attr_leaf_list_int(bp, context);
 	xfs_trans_brelse(NULL, bp);
@@ -1600,12 +1588,9 @@ xfs_attr_node_removename(xfs_da_args_t *args)
 	ASSERT(state->path.blk[0].bp);
 	state->path.blk[0].bp = NULL;
 
-	error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
-					     XFS_ATTR_FORK);
+	error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp);
 	if (error)
 		goto out;
-	ASSERT((((xfs_attr_leafblock_t *)bp->b_addr)->hdr.info.magic) ==
-	       cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
 
 	if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
 		xfs_bmap_init(args->flist, args->firstblock);
@@ -1653,6 +1638,8 @@ xfs_attr_fillstate(xfs_da_state_t *state)
 	xfs_da_state_blk_t *blk;
 	int level;
 
+	trace_xfs_attr_fillstate(state->args);
+
 	/*
 	 * Roll down the "path" in the state structure, storing the on-disk
 	 * block number for those buffers in the "path".
@@ -1699,6 +1686,8 @@ xfs_attr_refillstate(xfs_da_state_t *state)
 	xfs_da_state_blk_t *blk;
 	int level, error;
 
+	trace_xfs_attr_refillstate(state->args);
+
 	/*
 	 * Roll down the "path" in the state structure, storing the on-disk
 	 * block number for those buffers in the "path".
@@ -1707,7 +1696,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
 	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
 	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
 		if (blk->disk_blkno) {
-			error = xfs_da_read_buf(state->args->trans,
+			error = xfs_da_node_read(state->args->trans,
 						state->args->dp,
 						blk->blkno, blk->disk_blkno,
 						&blk->bp, XFS_ATTR_FORK);
@@ -1726,7 +1715,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
 	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
 	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
 		if (blk->disk_blkno) {
-			error = xfs_da_read_buf(state->args->trans,
+			error = xfs_da_node_read(state->args->trans,
 						state->args->dp,
 						blk->blkno, blk->disk_blkno,
 						&blk->bp, XFS_ATTR_FORK);
@@ -1755,6 +1744,8 @@ xfs_attr_node_get(xfs_da_args_t *args)
 	int error, retval;
 	int i;
 
+	trace_xfs_attr_node_get(args);
+
 	state = xfs_da_state_alloc();
 	state->args = args;
 	state->mp = args->dp->i_mount;
@@ -1804,6 +1795,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 	int error, i;
 	struct xfs_buf *bp;
 
+	trace_xfs_attr_node_list(context);
+
 	cursor = context->cursor;
 	cursor->initted = 1;
 
@@ -1814,7 +1807,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 	 */
 	bp = NULL;
 	if (cursor->blkno > 0) {
-		error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
+		error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1,
 					      &bp, XFS_ATTR_FORK);
 		if ((error != 0) && (error != EFSCORRUPTED))
 			return(error);
@@ -1856,17 +1849,11 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 	if (bp == NULL) {
 		cursor->blkno = 0;
 		for (;;) {
-			error = xfs_da_read_buf(NULL, context->dp,
+			error = xfs_da_node_read(NULL, context->dp,
 						      cursor->blkno, -1, &bp,
 						      XFS_ATTR_FORK);
 			if (error)
 				return(error);
-			if (unlikely(bp == NULL)) {
-				XFS_ERROR_REPORT("xfs_attr_node_list(2)",
-						 XFS_ERRLEVEL_LOW,
-						 context->dp->i_mount);
-				return(XFS_ERROR(EFSCORRUPTED));
-			}
 			node = bp->b_addr;
 			if (node->hdr.info.magic ==
 			    cpu_to_be16(XFS_ATTR_LEAF_MAGIC))
@@ -1907,14 +1894,6 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 	 */
 	for (;;) {
 		leaf = bp->b_addr;
-		if (unlikely(leaf->hdr.info.magic !=
-			     cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) {
-			XFS_CORRUPTION_ERROR("xfs_attr_node_list(4)",
-					     XFS_ERRLEVEL_LOW,
-					     context->dp->i_mount, leaf);
-			xfs_trans_brelse(NULL, bp);
-			return(XFS_ERROR(EFSCORRUPTED));
-		}
 		error = xfs_attr_leaf_list_int(bp, context);
 		if (error) {
 			xfs_trans_brelse(NULL, bp);
@@ -1924,16 +1903,10 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 			break;
 		cursor->blkno = be32_to_cpu(leaf->hdr.info.forw);
 		xfs_trans_brelse(NULL, bp);
-		error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
-					      &bp, XFS_ATTR_FORK);
+		error = xfs_attr_leaf_read(NULL, context->dp, cursor->blkno, -1,
+					   &bp);
 		if (error)
-			return(error);
-		if (unlikely((bp == NULL))) {
-			XFS_ERROR_REPORT("xfs_attr_node_list(5)",
-					 XFS_ERRLEVEL_LOW,
-					 context->dp->i_mount);
-			return(XFS_ERROR(EFSCORRUPTED));
-		}
+			return error;
 	}
 	xfs_trans_brelse(NULL, bp);
 	return(0);
@@ -1959,6 +1932,8 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 	int nmap, error, tmp, valuelen, blkcnt, i;
 	xfs_dablk_t lblkno;
 
+	trace_xfs_attr_rmtval_get(args);
+
 	ASSERT(!(args->flags & ATTR_KERNOVAL));
 
 	mp = args->dp->i_mount;
@@ -1980,7 +1955,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 		dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
 		blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
 		error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
-					   dblkno, blkcnt, 0, &bp);
+					   dblkno, blkcnt, 0, &bp, NULL);
 		if (error)
 			return(error);
 
@@ -2014,6 +1989,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 	xfs_dablk_t lblkno;
 	int blkcnt, valuelen, nmap, error, tmp, committed;
 
+	trace_xfs_attr_rmtval_set(args);
+
 	dp = args->dp;
 	mp = dp->i_mount;
 	src = args->value;
@@ -2143,6 +2120,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 	xfs_dablk_t lblkno;
 	int valuelen, blkcnt, nmap, error, done, committed;
 
+	trace_xfs_attr_rmtval_remove(args);
+
 	mp = args->dp->i_mount;
 
 	/*
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 70eec1829776..ee24993c7d12 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -57,7 +57,8 @@ STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block,
 					 struct xfs_buf **bpp);
 STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer,
 				  xfs_da_args_t *args, int freemap_index);
-STATIC void xfs_attr_leaf_compact(xfs_trans_t *tp, struct xfs_buf *leaf_buffer);
+STATIC void xfs_attr_leaf_compact(struct xfs_da_args *args,
+				  struct xfs_buf *leaf_buffer);
 STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state,
 						   xfs_da_state_blk_t *blk1,
 						   xfs_da_state_blk_t *blk2);
@@ -87,6 +88,52 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf,
 						 xfs_mount_t *mp);
 STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
 
+static void
+xfs_attr_leaf_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_attr_leaf_hdr *hdr = bp->b_addr;
+	int			block_ok = 0;
+
+	block_ok = hdr->info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC);
+	if (!block_ok) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+}
+
+static void
+xfs_attr_leaf_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_attr_leaf_verify(bp);
+}
+
+static void
+xfs_attr_leaf_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_attr_leaf_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_attr_leaf_buf_ops = {
+	.verify_read = xfs_attr_leaf_read_verify,
+	.verify_write = xfs_attr_leaf_write_verify,
+};
+
+int
+xfs_attr_leaf_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		bno,
+	xfs_daddr_t		mappedbno,
+	struct xfs_buf		**bpp)
+{
+	return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+				XFS_ATTR_FORK, &xfs_attr_leaf_buf_ops);
+}
+
 /*========================================================================
  * Namespace helper routines
  *========================================================================*/
@@ -869,17 +916,16 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args)
 	error = xfs_da_grow_inode(args, &blkno);
 	if (error)
 		goto out;
-	error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1,
-					     XFS_ATTR_FORK);
+	error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp1);
 	if (error)
 		goto out;
-	ASSERT(bp1 != NULL);
+
 	bp2 = NULL;
 	error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2,
 					    XFS_ATTR_FORK);
 	if (error)
 		goto out;
-	ASSERT(bp2 != NULL);
+	bp2->b_ops = bp1->b_ops;
 	memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount));
 	bp1 = NULL;
 	xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
@@ -933,7 +979,7 @@ xfs_attr_leaf_create(
 					    XFS_ATTR_FORK);
 	if (error)
 		return(error);
-	ASSERT(bp != NULL);
+	bp->b_ops = &xfs_attr_leaf_buf_ops;
 	leaf = bp->b_addr;
 	memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount));
 	hdr = &leaf->hdr;
@@ -1071,7 +1117,7 @@ xfs_attr_leaf_add(
 	 * Compact the entries to coalesce free space.
 	 * This may change the hdr->count via dropping INCOMPLETE entries.
 	 */
-	xfs_attr_leaf_compact(args->trans, bp);
+	xfs_attr_leaf_compact(args, bp);
 
 	/*
 	 * After compaction, the block is guaranteed to have only one
@@ -1102,6 +1148,8 @@ xfs_attr_leaf_add_work(
 	xfs_mount_t *mp;
 	int tmp, i;
 
+	trace_xfs_attr_leaf_add_work(args);
+
 	leaf = bp->b_addr;
 	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
 	hdr = &leaf->hdr;
@@ -1214,15 +1262,17 @@ xfs_attr_leaf_add_work(
  */
 STATIC void
 xfs_attr_leaf_compact(
-	struct xfs_trans	*trans,
+	struct xfs_da_args	*args,
 	struct xfs_buf		*bp)
 {
 	xfs_attr_leafblock_t	*leaf_s, *leaf_d;
 	xfs_attr_leaf_hdr_t	*hdr_s, *hdr_d;
-	xfs_mount_t		*mp;
-	char			*tmpbuffer;
+	struct xfs_trans	*trans = args->trans;
+	struct xfs_mount	*mp = trans->t_mountp;
+	char			*tmpbuffer;
+
+	trace_xfs_attr_leaf_compact(args);
 
-	mp = trans->t_mountp;
 	tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP);
 	ASSERT(tmpbuffer != NULL);
 	memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp));
@@ -1345,9 +1395,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 		max  = be16_to_cpu(hdr2->firstused)
 						- sizeof(xfs_attr_leaf_hdr_t);
 		max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t);
-		if (space > max) {
-			xfs_attr_leaf_compact(args->trans, blk2->bp);
-		}
+		if (space > max)
+			xfs_attr_leaf_compact(args, blk2->bp);
 
 		/*
 		 * Move high entries from leaf1 to low end of leaf2.
@@ -1378,9 +1427,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 		max  = be16_to_cpu(hdr1->firstused)
 						- sizeof(xfs_attr_leaf_hdr_t);
 		max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t);
-		if (space > max) {
-			xfs_attr_leaf_compact(args->trans, blk1->bp);
-		}
+		if (space > max)
+			xfs_attr_leaf_compact(args, blk1->bp);
 
 		/*
 		 * Move low entries from leaf2 to high end of leaf1.
@@ -1577,6 +1625,8 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
 	xfs_dablk_t blkno;
 	struct xfs_buf *bp;
 
+	trace_xfs_attr_leaf_toosmall(state->args);
+
 	/*
 	 * Check for the degenerate case of the block being over 50% full.
 	 * If so, it's not worth even looking to see if we might be able
@@ -1636,18 +1686,16 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
 		blkno = be32_to_cpu(info->back);
 		if (blkno == 0)
 			continue;
-		error = xfs_da_read_buf(state->args->trans, state->args->dp,
-					blkno, -1, &bp, XFS_ATTR_FORK);
+		error = xfs_attr_leaf_read(state->args->trans, state->args->dp,
+					blkno, -1, &bp);
 		if (error)
 			return(error);
-		ASSERT(bp != NULL);
 
 		leaf = (xfs_attr_leafblock_t *)info;
 		count = be16_to_cpu(leaf->hdr.count);
 		bytes = state->blocksize - (state->blocksize>>2);
 		bytes -= be16_to_cpu(leaf->hdr.usedbytes);
 		leaf = bp->b_addr;
-		ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
 		count += be16_to_cpu(leaf->hdr.count);
 		bytes -= be16_to_cpu(leaf->hdr.usedbytes);
 		bytes -= count * sizeof(xfs_attr_leaf_entry_t);
@@ -1702,6 +1750,8 @@ xfs_attr_leaf_remove(
 	int tablesize, tmp, i;
 	xfs_mount_t *mp;
 
+	trace_xfs_attr_leaf_remove(args);
+
 	leaf = bp->b_addr;
 	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
 	hdr = &leaf->hdr;
@@ -2511,15 +2561,11 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
 	/*
 	 * Set up the operation.
 	 */
-	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
-					     XFS_ATTR_FORK);
-	if (error) {
+	error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+	if (error)
 		return(error);
-	}
-	ASSERT(bp != NULL);
 
 	leaf = bp->b_addr;
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
 	ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
 	ASSERT(args->index >= 0);
 	entry = &leaf->entries[ args->index ];
@@ -2576,15 +2622,11 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
 	/*
 	 * Set up the operation.
 	 */
-	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
-					     XFS_ATTR_FORK);
-	if (error) {
+	error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+	if (error)
 		return(error);
-	}
-	ASSERT(bp != NULL);
 
 	leaf = bp->b_addr;
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
 	ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
 	ASSERT(args->index >= 0);
 	entry = &leaf->entries[ args->index ];
@@ -2633,35 +2675,28 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
 	/*
 	 * Read the block containing the "old" attr
 	 */
-	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1,
-					     XFS_ATTR_FORK);
-	if (error) {
-		return(error);
-	}
-	ASSERT(bp1 != NULL);
+	error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1);
+	if (error)
+		return error;
 
 	/*
 	 * Read the block containing the "new" attr, if it is different
 	 */
 	if (args->blkno2 != args->blkno) {
-		error = xfs_da_read_buf(args->trans, args->dp, args->blkno2,
-					-1, &bp2, XFS_ATTR_FORK);
-		if (error) {
-			return(error);
-		}
-		ASSERT(bp2 != NULL);
+		error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno2,
+					   -1, &bp2);
+		if (error)
+			return error;
 	} else {
 		bp2 = bp1;
 	}
 
 	leaf1 = bp1->b_addr;
-	ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
 	ASSERT(args->index < be16_to_cpu(leaf1->hdr.count));
 	ASSERT(args->index >= 0);
 	entry1 = &leaf1->entries[ args->index ];
 
 	leaf2 = bp2->b_addr;
-	ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
 	ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count));
 	ASSERT(args->index2 >= 0);
 	entry2 = &leaf2->entries[ args->index2 ];
@@ -2746,7 +2781,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
 	 * the extents in reverse order the extent containing
 	 * block 0 must still be there.
 	 */
-	error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
+	error = xfs_da_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
 	if (error)
 		return(error);
 	blkno = XFS_BUF_ADDR(bp);
@@ -2831,7 +2866,7 @@ xfs_attr_node_inactive(
 		 * traversal of the tree so we may deal with many blocks
 		 * before we come back to this one.
 		 */
-		error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp,
+		error = xfs_da_node_read(*trans, dp, child_fsb, -2, &child_bp,
 						XFS_ATTR_FORK);
 		if (error)
 			return(error);
@@ -2872,8 +2907,8 @@ xfs_attr_node_inactive(
2872 * child block number. 2907 * child block number.
2873 */ 2908 */
2874 if ((i+1) < count) { 2909 if ((i+1) < count) {
2875 error = xfs_da_read_buf(*trans, dp, 0, parent_blkno, 2910 error = xfs_da_node_read(*trans, dp, 0, parent_blkno,
2876 &bp, XFS_ATTR_FORK); 2911 &bp, XFS_ATTR_FORK);
2877 if (error) 2912 if (error)
2878 return(error); 2913 return(error);
2879 child_fsb = be32_to_cpu(node->btree[i+1].before); 2914 child_fsb = be32_to_cpu(node->btree[i+1].before);
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index dea17722945e..77de139a58f0 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -261,4 +261,10 @@ int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp,
261 struct xfs_buf *leaf2_bp); 261 struct xfs_buf *leaf2_bp);
262int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, 262int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize,
263 int *local); 263 int *local);
264int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
265 xfs_dablk_t bno, xfs_daddr_t mappedbno,
266 struct xfs_buf **bpp);
267
268extern const struct xfs_buf_ops xfs_attr_leaf_buf_ops;
269
264#endif /* __XFS_ATTR_LEAF_H__ */ 270#endif /* __XFS_ATTR_LEAF_H__ */
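
The xfs_attr_leaf_read() declaration added above gives the attr code a single entry point that pairs xfs_da_read_buf() with the attr-leaf verifier, which is what lets the flipflags and remove paths earlier in this diff drop their open-coded magic-number ASSERTs. The body is not shown in this hunk; going by the reworked xfs_da_read_buf() signature later in this pull, the wrapper is likely just a sketch of this shape:

        int
        xfs_attr_leaf_read(
                struct xfs_trans        *tp,
                struct xfs_inode        *dp,
                xfs_dablk_t             bno,
                xfs_daddr_t             mappedbno,
                struct xfs_buf          **bpp)
        {
                /*
                 * Route the read through the generic dabtree path; attaching
                 * the attr-leaf verifier means corruption is caught at I/O
                 * completion rather than by ASSERTs at every call site.
                 */
                return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
                                       XFS_ATTR_FORK, &xfs_attr_leaf_buf_ops);
        }
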
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 83d0cf3df930..0e92d12765d2 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2662,8 +2662,9 @@ xfs_bmap_btree_to_extents(
2662 if ((error = xfs_btree_check_lptr(cur, cbno, 1))) 2662 if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
2663 return error; 2663 return error;
2664#endif 2664#endif
2665 if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, 2665 error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
2666 XFS_BMAP_BTREE_REF))) 2666 &xfs_bmbt_buf_ops);
2667 if (error)
2667 return error; 2668 return error;
2668 cblock = XFS_BUF_TO_BLOCK(cbp); 2669 cblock = XFS_BUF_TO_BLOCK(cbp);
2669 if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) 2670 if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
@@ -3123,6 +3124,7 @@ xfs_bmap_extents_to_btree(
3123 /* 3124 /*
3124 * Fill in the child block. 3125 * Fill in the child block.
3125 */ 3126 */
3127 abp->b_ops = &xfs_bmbt_buf_ops;
3126 ablock = XFS_BUF_TO_BLOCK(abp); 3128 ablock = XFS_BUF_TO_BLOCK(abp);
3127 ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); 3129 ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
3128 ablock->bb_level = 0; 3130 ablock->bb_level = 0;
@@ -3269,6 +3271,7 @@ xfs_bmap_local_to_extents(
3269 ASSERT(args.len == 1); 3271 ASSERT(args.len == 1);
3270 *firstblock = args.fsbno; 3272 *firstblock = args.fsbno;
3271 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); 3273 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
3274 bp->b_ops = &xfs_bmbt_buf_ops;
3272 memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); 3275 memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
3273 xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); 3276 xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
3274 xfs_bmap_forkoff_reset(args.mp, ip, whichfork); 3277 xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
@@ -4078,8 +4081,9 @@ xfs_bmap_read_extents(
4078 * pointer (leftmost) at each level. 4081 * pointer (leftmost) at each level.
4079 */ 4082 */
4080 while (level-- > 0) { 4083 while (level-- > 0) {
4081 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, 4084 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
4082 XFS_BMAP_BTREE_REF))) 4085 XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
4086 if (error)
4083 return error; 4087 return error;
4084 block = XFS_BUF_TO_BLOCK(bp); 4088 block = XFS_BUF_TO_BLOCK(bp);
4085 XFS_WANT_CORRUPTED_GOTO( 4089 XFS_WANT_CORRUPTED_GOTO(
@@ -4124,7 +4128,8 @@ xfs_bmap_read_extents(
4124 */ 4128 */
4125 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); 4129 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
4126 if (nextbno != NULLFSBLOCK) 4130 if (nextbno != NULLFSBLOCK)
4127 xfs_btree_reada_bufl(mp, nextbno, 1); 4131 xfs_btree_reada_bufl(mp, nextbno, 1,
4132 &xfs_bmbt_buf_ops);
4128 /* 4133 /*
4129 * Copy records into the extent records. 4134 * Copy records into the extent records.
4130 */ 4135 */
@@ -4156,8 +4161,9 @@ xfs_bmap_read_extents(
4156 */ 4161 */
4157 if (bno == NULLFSBLOCK) 4162 if (bno == NULLFSBLOCK)
4158 break; 4163 break;
4159 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, 4164 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
4160 XFS_BMAP_BTREE_REF))) 4165 XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
4166 if (error)
4161 return error; 4167 return error;
4162 block = XFS_BUF_TO_BLOCK(bp); 4168 block = XFS_BUF_TO_BLOCK(bp);
4163 } 4169 }
@@ -5599,7 +5605,7 @@ xfs_getbmap(
5599 xfs_ilock(ip, XFS_IOLOCK_SHARED); 5605 xfs_ilock(ip, XFS_IOLOCK_SHARED);
5600 if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { 5606 if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
5601 if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { 5607 if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
5602 error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); 5608 error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
5603 if (error) 5609 if (error)
5604 goto out_unlock_iolock; 5610 goto out_unlock_iolock;
5605 } 5611 }
@@ -5868,15 +5874,16 @@ xfs_bmap_check_leaf_extents(
5868 */ 5874 */
5869 while (level-- > 0) { 5875 while (level-- > 0) {
5870 /* See if buf is in cur first */ 5876 /* See if buf is in cur first */
5877 bp_release = 0;
5871 bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); 5878 bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
5872 if (bp) { 5879 if (!bp) {
5873 bp_release = 0;
5874 } else {
5875 bp_release = 1; 5880 bp_release = 1;
5881 error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
5882 XFS_BMAP_BTREE_REF,
5883 &xfs_bmbt_buf_ops);
5884 if (error)
5885 goto error_norelse;
5876 } 5886 }
5877 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
5878 XFS_BMAP_BTREE_REF)))
5879 goto error_norelse;
5880 block = XFS_BUF_TO_BLOCK(bp); 5887 block = XFS_BUF_TO_BLOCK(bp);
5881 XFS_WANT_CORRUPTED_GOTO( 5888 XFS_WANT_CORRUPTED_GOTO(
5882 xfs_bmap_sanity_check(mp, bp, level), 5889 xfs_bmap_sanity_check(mp, bp, level),
@@ -5953,15 +5960,16 @@ xfs_bmap_check_leaf_extents(
5953 if (bno == NULLFSBLOCK) 5960 if (bno == NULLFSBLOCK)
5954 break; 5961 break;
5955 5962
5963 bp_release = 0;
5956 bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); 5964 bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
5957 if (bp) { 5965 if (!bp) {
5958 bp_release = 0;
5959 } else {
5960 bp_release = 1; 5966 bp_release = 1;
5967 error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
5968 XFS_BMAP_BTREE_REF,
5969 &xfs_bmbt_buf_ops);
5970 if (error)
5971 goto error_norelse;
5961 } 5972 }
5962 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
5963 XFS_BMAP_BTREE_REF)))
5964 goto error_norelse;
5965 block = XFS_BUF_TO_BLOCK(bp); 5973 block = XFS_BUF_TO_BLOCK(bp);
5966 } 5974 }
5967 if (bp_release) { 5975 if (bp_release) {
@@ -6052,7 +6060,9 @@ xfs_bmap_count_tree(
6052 struct xfs_btree_block *block, *nextblock; 6060 struct xfs_btree_block *block, *nextblock;
6053 int numrecs; 6061 int numrecs;
6054 6062
6055 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF))) 6063 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
6064 &xfs_bmbt_buf_ops);
6065 if (error)
6056 return error; 6066 return error;
6057 *count += 1; 6067 *count += 1;
6058 block = XFS_BUF_TO_BLOCK(bp); 6068 block = XFS_BUF_TO_BLOCK(bp);
@@ -6061,8 +6071,10 @@ xfs_bmap_count_tree(
6061 /* Not at node above leaves, count this level of nodes */ 6071 /* Not at node above leaves, count this level of nodes */
6062 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); 6072 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
6063 while (nextbno != NULLFSBLOCK) { 6073 while (nextbno != NULLFSBLOCK) {
6064 if ((error = xfs_btree_read_bufl(mp, tp, nextbno, 6074 error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
6065 0, &nbp, XFS_BMAP_BTREE_REF))) 6075 XFS_BMAP_BTREE_REF,
6076 &xfs_bmbt_buf_ops);
6077 if (error)
6066 return error; 6078 return error;
6067 *count += 1; 6079 *count += 1;
6068 nextblock = XFS_BUF_TO_BLOCK(nbp); 6080 nextblock = XFS_BUF_TO_BLOCK(nbp);
@@ -6091,8 +6103,10 @@ xfs_bmap_count_tree(
6091 if (nextbno == NULLFSBLOCK) 6103 if (nextbno == NULLFSBLOCK)
6092 break; 6104 break;
6093 bno = nextbno; 6105 bno = nextbno;
6094 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, 6106 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
6095 XFS_BMAP_BTREE_REF))) 6107 XFS_BMAP_BTREE_REF,
6108 &xfs_bmbt_buf_ops);
6109 if (error)
6096 return error; 6110 return error;
6097 *count += 1; 6111 *count += 1;
6098 block = XFS_BUF_TO_BLOCK(bp); 6112 block = XFS_BUF_TO_BLOCK(bp);
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 862084a47a7e..061b45cbe614 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -36,6 +36,7 @@
36#include "xfs_bmap.h" 36#include "xfs_bmap.h"
37#include "xfs_error.h" 37#include "xfs_error.h"
38#include "xfs_quota.h" 38#include "xfs_quota.h"
39#include "xfs_trace.h"
39 40
40/* 41/*
41 * Determine the extent state. 42 * Determine the extent state.
@@ -707,6 +708,67 @@ xfs_bmbt_key_diff(
707 cur->bc_rec.b.br_startoff; 708 cur->bc_rec.b.br_startoff;
708} 709}
709 710
711static void
712xfs_bmbt_verify(
713 struct xfs_buf *bp)
714{
715 struct xfs_mount *mp = bp->b_target->bt_mount;
716 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
717 unsigned int level;
718 int lblock_ok; /* block passes checks */
719
720 /* magic number and level verification.
721 *
722 * We don't know what fork we belong to, so just verify that the level
723 * is less than the maximum of the two. Later checks will be more
724 * precise.
725 */
726 level = be16_to_cpu(block->bb_level);
727 lblock_ok = block->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC) &&
728 level < max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]);
729
730 /* numrecs verification */
731 lblock_ok = lblock_ok &&
732 be16_to_cpu(block->bb_numrecs) <= mp->m_bmap_dmxr[level != 0];
733
734 /* sibling pointer verification */
735 lblock_ok = lblock_ok &&
736 block->bb_u.l.bb_leftsib &&
737 (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
738 XFS_FSB_SANITY_CHECK(mp,
739 be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
740 block->bb_u.l.bb_rightsib &&
741 (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
742 XFS_FSB_SANITY_CHECK(mp,
743 be64_to_cpu(block->bb_u.l.bb_rightsib)));
744
745 if (!lblock_ok) {
746 trace_xfs_btree_corrupt(bp, _RET_IP_);
747 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
748 xfs_buf_ioerror(bp, EFSCORRUPTED);
749 }
750}
751
752static void
753xfs_bmbt_read_verify(
754 struct xfs_buf *bp)
755{
756 xfs_bmbt_verify(bp);
757}
758
759static void
760xfs_bmbt_write_verify(
761 struct xfs_buf *bp)
762{
763 xfs_bmbt_verify(bp);
764}
765
766const struct xfs_buf_ops xfs_bmbt_buf_ops = {
767 .verify_read = xfs_bmbt_read_verify,
768 .verify_write = xfs_bmbt_write_verify,
769};
770
771
710#ifdef DEBUG 772#ifdef DEBUG
711STATIC int 773STATIC int
712xfs_bmbt_keys_inorder( 774xfs_bmbt_keys_inorder(
@@ -746,6 +808,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
746 .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, 808 .init_rec_from_cur = xfs_bmbt_init_rec_from_cur,
747 .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, 809 .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur,
748 .key_diff = xfs_bmbt_key_diff, 810 .key_diff = xfs_bmbt_key_diff,
811 .buf_ops = &xfs_bmbt_buf_ops,
749#ifdef DEBUG 812#ifdef DEBUG
750 .keys_inorder = xfs_bmbt_keys_inorder, 813 .keys_inorder = xfs_bmbt_keys_inorder,
751 .recs_inorder = xfs_bmbt_recs_inorder, 814 .recs_inorder = xfs_bmbt_recs_inorder,
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 0e66c4ea0f85..88469ca08696 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -236,5 +236,6 @@ extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
236extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, 236extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
237 struct xfs_trans *, struct xfs_inode *, int); 237 struct xfs_trans *, struct xfs_inode *, int);
238 238
239extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
239 240
240#endif /* __XFS_BMAP_BTREE_H__ */ 241#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index e53e317b1582..db010408d701 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -266,9 +266,13 @@ xfs_btree_dup_cursor(
266 for (i = 0; i < new->bc_nlevels; i++) { 266 for (i = 0; i < new->bc_nlevels; i++) {
267 new->bc_ptrs[i] = cur->bc_ptrs[i]; 267 new->bc_ptrs[i] = cur->bc_ptrs[i];
268 new->bc_ra[i] = cur->bc_ra[i]; 268 new->bc_ra[i] = cur->bc_ra[i];
269 if ((bp = cur->bc_bufs[i])) { 269 bp = cur->bc_bufs[i];
270 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 270 if (bp) {
271 XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp))) { 271 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
272 XFS_BUF_ADDR(bp), mp->m_bsize,
273 0, &bp,
274 cur->bc_ops->buf_ops);
275 if (error) {
272 xfs_btree_del_cursor(new, error); 276 xfs_btree_del_cursor(new, error);
273 *ncur = NULL; 277 *ncur = NULL;
274 return error; 278 return error;
@@ -609,25 +613,26 @@ xfs_btree_offsets(
609 * Get a buffer for the block, return it read in. 613 * Get a buffer for the block, return it read in.
610 * Long-form addressing. 614 * Long-form addressing.
611 */ 615 */
612int /* error */ 616int
613xfs_btree_read_bufl( 617xfs_btree_read_bufl(
614 xfs_mount_t *mp, /* file system mount point */ 618 struct xfs_mount *mp, /* file system mount point */
615 xfs_trans_t *tp, /* transaction pointer */ 619 struct xfs_trans *tp, /* transaction pointer */
616 xfs_fsblock_t fsbno, /* file system block number */ 620 xfs_fsblock_t fsbno, /* file system block number */
617 uint lock, /* lock flags for read_buf */ 621 uint lock, /* lock flags for read_buf */
618 xfs_buf_t **bpp, /* buffer for fsbno */ 622 struct xfs_buf **bpp, /* buffer for fsbno */
619 int refval) /* ref count value for buffer */ 623 int refval, /* ref count value for buffer */
620{ 624 const struct xfs_buf_ops *ops)
621 xfs_buf_t *bp; /* return value */ 625{
626 struct xfs_buf *bp; /* return value */
622 xfs_daddr_t d; /* real disk block address */ 627 xfs_daddr_t d; /* real disk block address */
623 int error; 628 int error;
624 629
625 ASSERT(fsbno != NULLFSBLOCK); 630 ASSERT(fsbno != NULLFSBLOCK);
626 d = XFS_FSB_TO_DADDR(mp, fsbno); 631 d = XFS_FSB_TO_DADDR(mp, fsbno);
627 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, 632 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
628 mp->m_bsize, lock, &bp))) { 633 mp->m_bsize, lock, &bp, ops);
634 if (error)
629 return error; 635 return error;
630 }
631 ASSERT(!xfs_buf_geterror(bp)); 636 ASSERT(!xfs_buf_geterror(bp));
632 if (bp) 637 if (bp)
633 xfs_buf_set_ref(bp, refval); 638 xfs_buf_set_ref(bp, refval);
@@ -642,15 +647,16 @@ xfs_btree_read_bufl(
642/* ARGSUSED */ 647/* ARGSUSED */
643void 648void
644xfs_btree_reada_bufl( 649xfs_btree_reada_bufl(
645 xfs_mount_t *mp, /* file system mount point */ 650 struct xfs_mount *mp, /* file system mount point */
646 xfs_fsblock_t fsbno, /* file system block number */ 651 xfs_fsblock_t fsbno, /* file system block number */
647 xfs_extlen_t count) /* count of filesystem blocks */ 652 xfs_extlen_t count, /* count of filesystem blocks */
653 const struct xfs_buf_ops *ops)
648{ 654{
649 xfs_daddr_t d; 655 xfs_daddr_t d;
650 656
651 ASSERT(fsbno != NULLFSBLOCK); 657 ASSERT(fsbno != NULLFSBLOCK);
652 d = XFS_FSB_TO_DADDR(mp, fsbno); 658 d = XFS_FSB_TO_DADDR(mp, fsbno);
653 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); 659 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
654} 660}
655 661
656/* 662/*
@@ -660,17 +666,18 @@ xfs_btree_reada_bufl(
660/* ARGSUSED */ 666/* ARGSUSED */
661void 667void
662xfs_btree_reada_bufs( 668xfs_btree_reada_bufs(
663 xfs_mount_t *mp, /* file system mount point */ 669 struct xfs_mount *mp, /* file system mount point */
664 xfs_agnumber_t agno, /* allocation group number */ 670 xfs_agnumber_t agno, /* allocation group number */
665 xfs_agblock_t agbno, /* allocation group block number */ 671 xfs_agblock_t agbno, /* allocation group block number */
666 xfs_extlen_t count) /* count of filesystem blocks */ 672 xfs_extlen_t count, /* count of filesystem blocks */
673 const struct xfs_buf_ops *ops)
667{ 674{
668 xfs_daddr_t d; 675 xfs_daddr_t d;
669 676
670 ASSERT(agno != NULLAGNUMBER); 677 ASSERT(agno != NULLAGNUMBER);
671 ASSERT(agbno != NULLAGBLOCK); 678 ASSERT(agbno != NULLAGBLOCK);
672 d = XFS_AGB_TO_DADDR(mp, agno, agbno); 679 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
673 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); 680 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
674} 681}
675 682
676STATIC int 683STATIC int
@@ -684,12 +691,14 @@ xfs_btree_readahead_lblock(
684 xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); 691 xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
685 692
686 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) { 693 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
687 xfs_btree_reada_bufl(cur->bc_mp, left, 1); 694 xfs_btree_reada_bufl(cur->bc_mp, left, 1,
695 cur->bc_ops->buf_ops);
688 rval++; 696 rval++;
689 } 697 }
690 698
691 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) { 699 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
692 xfs_btree_reada_bufl(cur->bc_mp, right, 1); 700 xfs_btree_reada_bufl(cur->bc_mp, right, 1,
701 cur->bc_ops->buf_ops);
693 rval++; 702 rval++;
694 } 703 }
695 704
@@ -709,13 +718,13 @@ xfs_btree_readahead_sblock(
709 718
710 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { 719 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
711 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, 720 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
712 left, 1); 721 left, 1, cur->bc_ops->buf_ops);
713 rval++; 722 rval++;
714 } 723 }
715 724
716 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { 725 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
717 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, 726 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
718 right, 1); 727 right, 1, cur->bc_ops->buf_ops);
719 rval++; 728 rval++;
720 } 729 }
721 730
@@ -853,18 +862,22 @@ xfs_btree_set_sibling(
853 } 862 }
854} 863}
855 864
856STATIC void 865void
857xfs_btree_init_block( 866xfs_btree_init_block(
858 struct xfs_btree_cur *cur, 867 struct xfs_mount *mp,
859 int level, 868 struct xfs_buf *bp,
860 int numrecs, 869 __u32 magic,
861 struct xfs_btree_block *new) /* new block */ 870 __u16 level,
871 __u16 numrecs,
872 unsigned int flags)
862{ 873{
863 new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]); 874 struct xfs_btree_block *new = XFS_BUF_TO_BLOCK(bp);
875
876 new->bb_magic = cpu_to_be32(magic);
864 new->bb_level = cpu_to_be16(level); 877 new->bb_level = cpu_to_be16(level);
865 new->bb_numrecs = cpu_to_be16(numrecs); 878 new->bb_numrecs = cpu_to_be16(numrecs);
866 879
867 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 880 if (flags & XFS_BTREE_LONG_PTRS) {
868 new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); 881 new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
869 new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); 882 new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
870 } else { 883 } else {
@@ -873,6 +886,17 @@ xfs_btree_init_block(
873 } 886 }
874} 887}
875 888
889STATIC void
890xfs_btree_init_block_cur(
891 struct xfs_btree_cur *cur,
892 int level,
893 int numrecs,
894 struct xfs_buf *bp)
895{
896 xfs_btree_init_block(cur->bc_mp, bp, xfs_magics[cur->bc_btnum],
897 level, numrecs, cur->bc_flags);
898}
899
876/* 900/*
877 * Return true if ptr is the last record in the btree and 901 * Return true if ptr is the last record in the btree and
878 * we need to track updates to this record. The decision 902 * we need to track updates to this record. The decision
@@ -972,6 +996,7 @@ xfs_btree_get_buf_block(
972 if (!*bpp) 996 if (!*bpp)
973 return ENOMEM; 997 return ENOMEM;
974 998
999 (*bpp)->b_ops = cur->bc_ops->buf_ops;
975 *block = XFS_BUF_TO_BLOCK(*bpp); 1000 *block = XFS_BUF_TO_BLOCK(*bpp);
976 return 0; 1001 return 0;
977} 1002}
@@ -998,19 +1023,15 @@ xfs_btree_read_buf_block(
998 1023
999 d = xfs_btree_ptr_to_daddr(cur, ptr); 1024 d = xfs_btree_ptr_to_daddr(cur, ptr);
1000 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, 1025 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1001 mp->m_bsize, flags, bpp); 1026 mp->m_bsize, flags, bpp,
1027 cur->bc_ops->buf_ops);
1002 if (error) 1028 if (error)
1003 return error; 1029 return error;
1004 1030
1005 ASSERT(!xfs_buf_geterror(*bpp)); 1031 ASSERT(!xfs_buf_geterror(*bpp));
1006
1007 xfs_btree_set_refs(cur, *bpp); 1032 xfs_btree_set_refs(cur, *bpp);
1008 *block = XFS_BUF_TO_BLOCK(*bpp); 1033 *block = XFS_BUF_TO_BLOCK(*bpp);
1009 1034 return 0;
1010 error = xfs_btree_check_block(cur, *block, level, *bpp);
1011 if (error)
1012 xfs_trans_brelse(cur->bc_tp, *bpp);
1013 return error;
1014} 1035}
1015 1036
1016/* 1037/*
@@ -2183,7 +2204,7 @@ xfs_btree_split(
2183 goto error0; 2204 goto error0;
2184 2205
2185 /* Fill in the btree header for the new right block. */ 2206 /* Fill in the btree header for the new right block. */
2186 xfs_btree_init_block(cur, xfs_btree_get_level(left), 0, right); 2207 xfs_btree_init_block_cur(cur, xfs_btree_get_level(left), 0, rbp);
2187 2208
2188 /* 2209 /*
2189 * Split the entries between the old and the new block evenly. 2210 * Split the entries between the old and the new block evenly.
@@ -2492,7 +2513,7 @@ xfs_btree_new_root(
2492 nptr = 2; 2513 nptr = 2;
2493 } 2514 }
2494 /* Fill in the new block's btree header and log it. */ 2515 /* Fill in the new block's btree header and log it. */
2495 xfs_btree_init_block(cur, cur->bc_nlevels, 2, new); 2516 xfs_btree_init_block_cur(cur, cur->bc_nlevels, 2, nbp);
2496 xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); 2517 xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
2497 ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && 2518 ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
2498 !xfs_btree_ptr_is_null(cur, &rptr)); 2519 !xfs_btree_ptr_is_null(cur, &rptr));
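
xfs_btree_init_block() loses its STATIC and its cursor argument here: the mount, buffer, magic number and geometry are now passed explicitly, with the old behaviour kept by the new xfs_btree_init_block_cur() wrapper. The point of the export is that block initialisation no longer requires a btree cursor, which the growfs infrastructure changes in this pull need when stamping headers on brand-new AG btree blocks. A hedged sketch of a cursor-less call; mp and bp are assumed to be in scope, and the by-block-number allocation btree is just an illustrative choice:

        /*
         * Initialise an empty short-form (AG-relative pointer) btree block:
         * leaf level, one record, no long-pointer flag.
         */
        xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0);
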
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 5b240de104c0..f932897194eb 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -188,6 +188,8 @@ struct xfs_btree_ops {
188 __int64_t (*key_diff)(struct xfs_btree_cur *cur, 188 __int64_t (*key_diff)(struct xfs_btree_cur *cur,
189 union xfs_btree_key *key); 189 union xfs_btree_key *key);
190 190
191 const struct xfs_buf_ops *buf_ops;
192
191#ifdef DEBUG 193#ifdef DEBUG
192 /* check that k1 is lower than k2 */ 194 /* check that k1 is lower than k2 */
193 int (*keys_inorder)(struct xfs_btree_cur *cur, 195 int (*keys_inorder)(struct xfs_btree_cur *cur,
@@ -355,7 +357,8 @@ xfs_btree_read_bufl(
355 xfs_fsblock_t fsbno, /* file system block number */ 357 xfs_fsblock_t fsbno, /* file system block number */
356 uint lock, /* lock flags for read_buf */ 358 uint lock, /* lock flags for read_buf */
357 struct xfs_buf **bpp, /* buffer for fsbno */ 359 struct xfs_buf **bpp, /* buffer for fsbno */
358 int refval);/* ref count value for buffer */ 360 int refval, /* ref count value for buffer */
361 const struct xfs_buf_ops *ops);
359 362
360/* 363/*
361 * Read-ahead the block, don't wait for it, don't return a buffer. 364 * Read-ahead the block, don't wait for it, don't return a buffer.
@@ -365,7 +368,8 @@ void /* error */
365xfs_btree_reada_bufl( 368xfs_btree_reada_bufl(
366 struct xfs_mount *mp, /* file system mount point */ 369 struct xfs_mount *mp, /* file system mount point */
367 xfs_fsblock_t fsbno, /* file system block number */ 370 xfs_fsblock_t fsbno, /* file system block number */
368 xfs_extlen_t count); /* count of filesystem blocks */ 371 xfs_extlen_t count, /* count of filesystem blocks */
372 const struct xfs_buf_ops *ops);
369 373
370/* 374/*
371 * Read-ahead the block, don't wait for it, don't return a buffer. 375 * Read-ahead the block, don't wait for it, don't return a buffer.
@@ -376,8 +380,20 @@ xfs_btree_reada_bufs(
376 struct xfs_mount *mp, /* file system mount point */ 380 struct xfs_mount *mp, /* file system mount point */
377 xfs_agnumber_t agno, /* allocation group number */ 381 xfs_agnumber_t agno, /* allocation group number */
378 xfs_agblock_t agbno, /* allocation group block number */ 382 xfs_agblock_t agbno, /* allocation group block number */
379 xfs_extlen_t count); /* count of filesystem blocks */ 383 xfs_extlen_t count, /* count of filesystem blocks */
384 const struct xfs_buf_ops *ops);
380 385
386/*
387 * Initialise a new btree block header
388 */
389void
390xfs_btree_init_block(
391 struct xfs_mount *mp,
392 struct xfs_buf *bp,
393 __u32 magic,
394 __u16 level,
395 __u16 numrecs,
396 unsigned int flags);
381 397
382/* 398/*
383 * Common btree core entry points. 399 * Common btree core entry points.
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 4b0b8dd1b7b0..26673a0b20e7 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -569,7 +569,9 @@ found:
569 */ 569 */
570 if (bp->b_flags & XBF_STALE) { 570 if (bp->b_flags & XBF_STALE) {
571 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 571 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
572 ASSERT(bp->b_iodone == NULL);
572 bp->b_flags &= _XBF_KMEM | _XBF_PAGES; 573 bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
574 bp->b_ops = NULL;
573 } 575 }
574 576
575 trace_xfs_buf_find(bp, flags, _RET_IP_); 577 trace_xfs_buf_find(bp, flags, _RET_IP_);
@@ -654,7 +656,8 @@ xfs_buf_read_map(
654 struct xfs_buftarg *target, 656 struct xfs_buftarg *target,
655 struct xfs_buf_map *map, 657 struct xfs_buf_map *map,
656 int nmaps, 658 int nmaps,
657 xfs_buf_flags_t flags) 659 xfs_buf_flags_t flags,
660 const struct xfs_buf_ops *ops)
658{ 661{
659 struct xfs_buf *bp; 662 struct xfs_buf *bp;
660 663
@@ -666,6 +669,7 @@ xfs_buf_read_map(
666 669
667 if (!XFS_BUF_ISDONE(bp)) { 670 if (!XFS_BUF_ISDONE(bp)) {
668 XFS_STATS_INC(xb_get_read); 671 XFS_STATS_INC(xb_get_read);
672 bp->b_ops = ops;
669 _xfs_buf_read(bp, flags); 673 _xfs_buf_read(bp, flags);
670 } else if (flags & XBF_ASYNC) { 674 } else if (flags & XBF_ASYNC) {
671 /* 675 /*
@@ -691,13 +695,14 @@ void
691xfs_buf_readahead_map( 695xfs_buf_readahead_map(
692 struct xfs_buftarg *target, 696 struct xfs_buftarg *target,
693 struct xfs_buf_map *map, 697 struct xfs_buf_map *map,
694 int nmaps) 698 int nmaps,
699 const struct xfs_buf_ops *ops)
695{ 700{
696 if (bdi_read_congested(target->bt_bdi)) 701 if (bdi_read_congested(target->bt_bdi))
697 return; 702 return;
698 703
699 xfs_buf_read_map(target, map, nmaps, 704 xfs_buf_read_map(target, map, nmaps,
700 XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); 705 XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops);
701} 706}
702 707
703/* 708/*
@@ -709,10 +714,10 @@ xfs_buf_read_uncached(
709 struct xfs_buftarg *target, 714 struct xfs_buftarg *target,
710 xfs_daddr_t daddr, 715 xfs_daddr_t daddr,
711 size_t numblks, 716 size_t numblks,
712 int flags) 717 int flags,
718 const struct xfs_buf_ops *ops)
713{ 719{
714 xfs_buf_t *bp; 720 struct xfs_buf *bp;
715 int error;
716 721
717 bp = xfs_buf_get_uncached(target, numblks, flags); 722 bp = xfs_buf_get_uncached(target, numblks, flags);
718 if (!bp) 723 if (!bp)
@@ -723,13 +728,10 @@ xfs_buf_read_uncached(
723 bp->b_bn = daddr; 728 bp->b_bn = daddr;
724 bp->b_maps[0].bm_bn = daddr; 729 bp->b_maps[0].bm_bn = daddr;
725 bp->b_flags |= XBF_READ; 730 bp->b_flags |= XBF_READ;
731 bp->b_ops = ops;
726 732
727 xfsbdstrat(target->bt_mount, bp); 733 xfsbdstrat(target->bt_mount, bp);
728 error = xfs_buf_iowait(bp); 734 xfs_buf_iowait(bp);
729 if (error) {
730 xfs_buf_relse(bp);
731 return NULL;
732 }
733 return bp; 735 return bp;
734} 736}
735 737
@@ -999,27 +1001,37 @@ STATIC void
999xfs_buf_iodone_work( 1001xfs_buf_iodone_work(
1000 struct work_struct *work) 1002 struct work_struct *work)
1001{ 1003{
1002 xfs_buf_t *bp = 1004 struct xfs_buf *bp =
1003 container_of(work, xfs_buf_t, b_iodone_work); 1005 container_of(work, xfs_buf_t, b_iodone_work);
1006 bool read = !!(bp->b_flags & XBF_READ);
1007
1008 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1009 if (read && bp->b_ops)
1010 bp->b_ops->verify_read(bp);
1004 1011
1005 if (bp->b_iodone) 1012 if (bp->b_iodone)
1006 (*(bp->b_iodone))(bp); 1013 (*(bp->b_iodone))(bp);
1007 else if (bp->b_flags & XBF_ASYNC) 1014 else if (bp->b_flags & XBF_ASYNC)
1008 xfs_buf_relse(bp); 1015 xfs_buf_relse(bp);
1016 else {
1017 ASSERT(read && bp->b_ops);
1018 complete(&bp->b_iowait);
1019 }
1009} 1020}
1010 1021
1011void 1022void
1012xfs_buf_ioend( 1023xfs_buf_ioend(
1013 xfs_buf_t *bp, 1024 struct xfs_buf *bp,
1014 int schedule) 1025 int schedule)
1015{ 1026{
1027 bool read = !!(bp->b_flags & XBF_READ);
1028
1016 trace_xfs_buf_iodone(bp, _RET_IP_); 1029 trace_xfs_buf_iodone(bp, _RET_IP_);
1017 1030
1018 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1019 if (bp->b_error == 0) 1031 if (bp->b_error == 0)
1020 bp->b_flags |= XBF_DONE; 1032 bp->b_flags |= XBF_DONE;
1021 1033
1022 if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) { 1034 if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) {
1023 if (schedule) { 1035 if (schedule) {
1024 INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); 1036 INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
1025 queue_work(xfslogd_workqueue, &bp->b_iodone_work); 1037 queue_work(xfslogd_workqueue, &bp->b_iodone_work);
@@ -1027,6 +1039,7 @@ xfs_buf_ioend(
1027 xfs_buf_iodone_work(&bp->b_iodone_work); 1039 xfs_buf_iodone_work(&bp->b_iodone_work);
1028 } 1040 }
1029 } else { 1041 } else {
1042 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1030 complete(&bp->b_iowait); 1043 complete(&bp->b_iowait);
1031 } 1044 }
1032} 1045}
@@ -1314,6 +1327,20 @@ _xfs_buf_ioapply(
1314 rw |= REQ_FUA; 1327 rw |= REQ_FUA;
1315 if (bp->b_flags & XBF_FLUSH) 1328 if (bp->b_flags & XBF_FLUSH)
1316 rw |= REQ_FLUSH; 1329 rw |= REQ_FLUSH;
1330
1331 /*
1332 * Run the write verifier callback function if it exists. If
1333 * this function fails it will mark the buffer with an error and
1334 * the IO should not be dispatched.
1335 */
1336 if (bp->b_ops) {
1337 bp->b_ops->verify_write(bp);
1338 if (bp->b_error) {
1339 xfs_force_shutdown(bp->b_target->bt_mount,
1340 SHUTDOWN_CORRUPT_INCORE);
1341 return;
1342 }
1343 }
1317 } else if (bp->b_flags & XBF_READ_AHEAD) { 1344 } else if (bp->b_flags & XBF_READ_AHEAD) {
1318 rw = READA; 1345 rw = READA;
1319 } else { 1346 } else {
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 7c0b6a0a1557..23f5642480bb 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -100,6 +100,7 @@ typedef struct xfs_buftarg {
100struct xfs_buf; 100struct xfs_buf;
101typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); 101typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
102 102
103
103#define XB_PAGES 2 104#define XB_PAGES 2
104 105
105struct xfs_buf_map { 106struct xfs_buf_map {
@@ -110,6 +111,11 @@ struct xfs_buf_map {
110#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \ 111#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \
111 struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; 112 struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) };
112 113
114struct xfs_buf_ops {
115 void (*verify_read)(struct xfs_buf *);
116 void (*verify_write)(struct xfs_buf *);
117};
118
113typedef struct xfs_buf { 119typedef struct xfs_buf {
114 /* 120 /*
115 * first cacheline holds all the fields needed for an uncontended cache 121 * first cacheline holds all the fields needed for an uncontended cache
@@ -153,13 +159,13 @@ typedef struct xfs_buf {
153 unsigned int b_page_count; /* size of page array */ 159 unsigned int b_page_count; /* size of page array */
154 unsigned int b_offset; /* page offset in first page */ 160 unsigned int b_offset; /* page offset in first page */
155 unsigned short b_error; /* error code on I/O */ 161 unsigned short b_error; /* error code on I/O */
162 const struct xfs_buf_ops *b_ops;
156 163
157#ifdef XFS_BUF_LOCK_TRACKING 164#ifdef XFS_BUF_LOCK_TRACKING
158 int b_last_holder; 165 int b_last_holder;
159#endif 166#endif
160} xfs_buf_t; 167} xfs_buf_t;
161 168
162
163/* Finding and Reading Buffers */ 169/* Finding and Reading Buffers */
164struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, 170struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target,
165 struct xfs_buf_map *map, int nmaps, 171 struct xfs_buf_map *map, int nmaps,
@@ -196,9 +202,11 @@ struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target,
196 xfs_buf_flags_t flags); 202 xfs_buf_flags_t flags);
197struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target, 203struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target,
198 struct xfs_buf_map *map, int nmaps, 204 struct xfs_buf_map *map, int nmaps,
199 xfs_buf_flags_t flags); 205 xfs_buf_flags_t flags,
206 const struct xfs_buf_ops *ops);
200void xfs_buf_readahead_map(struct xfs_buftarg *target, 207void xfs_buf_readahead_map(struct xfs_buftarg *target,
201 struct xfs_buf_map *map, int nmaps); 208 struct xfs_buf_map *map, int nmaps,
209 const struct xfs_buf_ops *ops);
202 210
203static inline struct xfs_buf * 211static inline struct xfs_buf *
204xfs_buf_get( 212xfs_buf_get(
@@ -216,20 +224,22 @@ xfs_buf_read(
216 struct xfs_buftarg *target, 224 struct xfs_buftarg *target,
217 xfs_daddr_t blkno, 225 xfs_daddr_t blkno,
218 size_t numblks, 226 size_t numblks,
219 xfs_buf_flags_t flags) 227 xfs_buf_flags_t flags,
228 const struct xfs_buf_ops *ops)
220{ 229{
221 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); 230 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
222 return xfs_buf_read_map(target, &map, 1, flags); 231 return xfs_buf_read_map(target, &map, 1, flags, ops);
223} 232}
224 233
225static inline void 234static inline void
226xfs_buf_readahead( 235xfs_buf_readahead(
227 struct xfs_buftarg *target, 236 struct xfs_buftarg *target,
228 xfs_daddr_t blkno, 237 xfs_daddr_t blkno,
229 size_t numblks) 238 size_t numblks,
239 const struct xfs_buf_ops *ops)
230{ 240{
231 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); 241 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
232 return xfs_buf_readahead_map(target, &map, 1); 242 return xfs_buf_readahead_map(target, &map, 1, ops);
233} 243}
234 244
235struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); 245struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks);
@@ -239,7 +249,8 @@ int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length);
239struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, 249struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
240 int flags); 250 int flags);
241struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target, 251struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target,
242 xfs_daddr_t daddr, size_t numblks, int flags); 252 xfs_daddr_t daddr, size_t numblks, int flags,
253 const struct xfs_buf_ops *ops);
243void xfs_buf_hold(struct xfs_buf *bp); 254void xfs_buf_hold(struct xfs_buf *bp);
244 255
245/* Releasing Buffers */ 256/* Releasing Buffers */
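
The two-function struct xfs_buf_ops above is the core of the verifier series: verify_read runs when I/O completes, verify_write runs just before the buffer is dispatched, and both ride on the buffer via the new b_ops field. The hooks stay separate even when they share a checking routine, because the write side is also the natural place to stamp CRCs once the buffer contents are final. A hedged sketch of how a metadata type opts in; xfs_foo_verify(), struct xfs_foo_hdr and XFS_FOO_MAGIC are illustrative placeholders, not real XFS symbols:

        static void
        xfs_foo_verify(
                struct xfs_buf  *bp)
        {
                struct xfs_foo_hdr      *hdr = bp->b_addr;

                /* flag corruption; the generic code checks b_error and reacts */
                if (hdr->magic != cpu_to_be32(XFS_FOO_MAGIC))
                        xfs_buf_ioerror(bp, EFSCORRUPTED);
        }

        const struct xfs_buf_ops xfs_foo_buf_ops = {
                .verify_read    = xfs_foo_verify,       /* at I/O completion */
                .verify_write   = xfs_foo_verify,       /* before dispatch */
        };

        /* readers pass the ops so a cache miss is verified when it lands */
        bp = xfs_buf_read(target, blkno, numblks, 0, &xfs_foo_buf_ops);
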
diff --git a/fs/xfs/xfs_cksum.h b/fs/xfs/xfs_cksum.h
new file mode 100644
index 000000000000..fad1676ad8cd
--- /dev/null
+++ b/fs/xfs/xfs_cksum.h
@@ -0,0 +1,63 @@
1#ifndef _XFS_CKSUM_H
2#define _XFS_CKSUM_H 1
3
4#define XFS_CRC_SEED (~(__uint32_t)0)
5
6/*
7 * Calculate the intermediate checksum for a buffer that has the CRC field
8 * inside it. The offset of the 32bit crc field is passed as the
9 * cksum_offset parameter.
10 */
11static inline __uint32_t
12xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset)
13{
14 __uint32_t zero = 0;
15 __uint32_t crc;
16
17 /* Calculate CRC up to the checksum. */
18 crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset);
19
20 /* Skip checksum field */
21 crc = crc32c(crc, &zero, sizeof(__u32));
22
23 /* Calculate the rest of the CRC. */
24 return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)],
25 length - (cksum_offset + sizeof(__be32)));
26}
27
28/*
29 * Convert the intermediate checksum to the final ondisk format.
30 *
31 * The CRC32c calculation uses LE format even on BE machines, but returns the
32 * result in host endian format. Hence we need to byte swap it back to LE format
33 * so that it is consistent on disk.
34 */
35static inline __le32
36xfs_end_cksum(__uint32_t crc)
37{
38 return ~cpu_to_le32(crc);
39}
40
41/*
42 * Helper to generate the checksum for a buffer.
43 */
44static inline void
45xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
46{
47 __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
48
49 *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc);
50}
51
52/*
53 * Helper to verify the checksum for a buffer.
54 */
55static inline int
56xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset)
57{
58 __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
59
60 return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc);
61}
62
63#endif /* _XFS_CKSUM_H */
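
The helpers above compute the CRC in two hops so the checksum word itself never contributes: crc32c up to the field, a zeroed 32-bit stand-in for the field, then the remainder. That is what lets one routine serve both the stamp and verify paths. In this pull only the log CRC code drives xfs_start_cksum()/xfs_end_cksum() directly; as a hedged sketch of the intended pattern for an on-disk structure, with struct xfs_foo (and foo pointing at one) as illustrative assumptions:

        struct xfs_foo {
                __be32  magic;
                __le32  crc;    /* excluded from the checksummed data */
                /* ... rest of the on-disk structure ... */
        };

        /* write side: stamp the LE on-disk checksum in place */
        xfs_update_cksum((char *)foo, sizeof(*foo),
                         offsetof(struct xfs_foo, crc));

        /* read side: non-zero means stored and computed CRCs agree */
        if (!xfs_verify_cksum((char *)foo, sizeof(*foo),
                              offsetof(struct xfs_foo, crc)))
                return EFSCORRUPTED;
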
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 7bfb7dd334fc..4d7696a02418 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -91,6 +91,84 @@ STATIC int xfs_da_blk_unlink(xfs_da_state_t *state,
91 xfs_da_state_blk_t *save_blk); 91 xfs_da_state_blk_t *save_blk);
92STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state); 92STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state);
93 93
94static void
95xfs_da_node_verify(
96 struct xfs_buf *bp)
97{
98 struct xfs_mount *mp = bp->b_target->bt_mount;
99 struct xfs_da_node_hdr *hdr = bp->b_addr;
100 int block_ok = 0;
101
102 block_ok = hdr->info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC);
103 block_ok = block_ok &&
104 be16_to_cpu(hdr->level) > 0 &&
105 be16_to_cpu(hdr->count) > 0;
106 if (!block_ok) {
107 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
108 xfs_buf_ioerror(bp, EFSCORRUPTED);
109 }
110
111}
112
113static void
114xfs_da_node_write_verify(
115 struct xfs_buf *bp)
116{
117 xfs_da_node_verify(bp);
118}
119
120/*
121 * leaf/node format detection on trees is sketchy, so a node read can be done on
122 * leaf level blocks when detection identifies the tree as a node format tree
123 * incorrectly. In this case, we need to swap the verifier to match the correct
124 * format of the block being read.
125 */
126static void
127xfs_da_node_read_verify(
128 struct xfs_buf *bp)
129{
130 struct xfs_mount *mp = bp->b_target->bt_mount;
131 struct xfs_da_blkinfo *info = bp->b_addr;
132
133 switch (be16_to_cpu(info->magic)) {
134 case XFS_DA_NODE_MAGIC:
135 xfs_da_node_verify(bp);
136 break;
137 case XFS_ATTR_LEAF_MAGIC:
138 bp->b_ops = &xfs_attr_leaf_buf_ops;
139 bp->b_ops->verify_read(bp);
140 return;
141 case XFS_DIR2_LEAFN_MAGIC:
142 bp->b_ops = &xfs_dir2_leafn_buf_ops;
143 bp->b_ops->verify_read(bp);
144 return;
145 default:
146 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
147 mp, info);
148 xfs_buf_ioerror(bp, EFSCORRUPTED);
149 break;
150 }
151}
152
153const struct xfs_buf_ops xfs_da_node_buf_ops = {
154 .verify_read = xfs_da_node_read_verify,
155 .verify_write = xfs_da_node_write_verify,
156};
157
158
159int
160xfs_da_node_read(
161 struct xfs_trans *tp,
162 struct xfs_inode *dp,
163 xfs_dablk_t bno,
164 xfs_daddr_t mappedbno,
165 struct xfs_buf **bpp,
166 int which_fork)
167{
168 return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
169 which_fork, &xfs_da_node_buf_ops);
170}
171
94/*======================================================================== 172/*========================================================================
95 * Routines used for growing the Btree. 173 * Routines used for growing the Btree.
96 *========================================================================*/ 174 *========================================================================*/
@@ -125,6 +203,7 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
125 xfs_trans_log_buf(tp, bp, 203 xfs_trans_log_buf(tp, bp,
126 XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); 204 XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
127 205
206 bp->b_ops = &xfs_da_node_buf_ops;
128 *bpp = bp; 207 *bpp = bp;
129 return(0); 208 return(0);
130} 209}
@@ -324,6 +403,8 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
324 } 403 }
325 memcpy(node, oldroot, size); 404 memcpy(node, oldroot, size);
326 xfs_trans_log_buf(tp, bp, 0, size - 1); 405 xfs_trans_log_buf(tp, bp, 0, size - 1);
406
407 bp->b_ops = blk1->bp->b_ops;
327 blk1->bp = bp; 408 blk1->bp = bp;
328 blk1->blkno = blkno; 409 blk1->blkno = blkno;
329 410
@@ -746,7 +827,7 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
746 */ 827 */
747 child = be32_to_cpu(oldroot->btree[0].before); 828 child = be32_to_cpu(oldroot->btree[0].before);
748 ASSERT(child != 0); 829 ASSERT(child != 0);
749 error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp, 830 error = xfs_da_node_read(args->trans, args->dp, child, -1, &bp,
750 args->whichfork); 831 args->whichfork);
751 if (error) 832 if (error)
752 return(error); 833 return(error);
@@ -754,7 +835,14 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
754 xfs_da_blkinfo_onlychild_validate(bp->b_addr, 835 xfs_da_blkinfo_onlychild_validate(bp->b_addr,
755 be16_to_cpu(oldroot->hdr.level)); 836 be16_to_cpu(oldroot->hdr.level));
756 837
838 /*
839 * This could be copying a leaf back into the root block when only a
840 * single leaf block is left in the tree. Hence we have
841 * to update the b_ops pointer as well to match the buffer type change
842 * that could occur.
843 */
757 memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize); 844 memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize);
845 root_blk->bp->b_ops = bp->b_ops;
758 xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); 846 xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
759 error = xfs_da_shrink_inode(args, child, bp); 847 error = xfs_da_shrink_inode(args, child, bp);
760 return(error); 848 return(error);
@@ -779,6 +867,8 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
779 xfs_dablk_t blkno; 867 xfs_dablk_t blkno;
780 struct xfs_buf *bp; 868 struct xfs_buf *bp;
781 869
870 trace_xfs_da_node_toosmall(state->args);
871
782 /* 872 /*
783 * Check for the degenerate case of the block being over 50% full. 873 * Check for the degenerate case of the block being over 50% full.
784 * If so, it's not worth even looking to see if we might be able 874 * If so, it's not worth even looking to see if we might be able
@@ -835,7 +925,7 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
835 blkno = be32_to_cpu(info->back); 925 blkno = be32_to_cpu(info->back);
836 if (blkno == 0) 926 if (blkno == 0)
837 continue; 927 continue;
838 error = xfs_da_read_buf(state->args->trans, state->args->dp, 928 error = xfs_da_node_read(state->args->trans, state->args->dp,
839 blkno, -1, &bp, state->args->whichfork); 929 blkno, -1, &bp, state->args->whichfork);
840 if (error) 930 if (error)
841 return(error); 931 return(error);
@@ -900,6 +990,8 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
900 xfs_dahash_t lasthash=0; 990 xfs_dahash_t lasthash=0;
901 int level, count; 991 int level, count;
902 992
993 trace_xfs_da_fixhashpath(state->args);
994
903 level = path->active-1; 995 level = path->active-1;
904 blk = &path->blk[ level ]; 996 blk = &path->blk[ level ];
905 switch (blk->magic) { 997 switch (blk->magic) {
@@ -1079,7 +1171,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
1079 * Read the next node down in the tree. 1171 * Read the next node down in the tree.
1080 */ 1172 */
1081 blk->blkno = blkno; 1173 blk->blkno = blkno;
1082 error = xfs_da_read_buf(args->trans, args->dp, blkno, 1174 error = xfs_da_node_read(args->trans, args->dp, blkno,
1083 -1, &blk->bp, args->whichfork); 1175 -1, &blk->bp, args->whichfork);
1084 if (error) { 1176 if (error) {
1085 blk->blkno = 0; 1177 blk->blkno = 0;
@@ -1241,7 +1333,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
1241 new_info->forw = cpu_to_be32(old_blk->blkno); 1333 new_info->forw = cpu_to_be32(old_blk->blkno);
1242 new_info->back = old_info->back; 1334 new_info->back = old_info->back;
1243 if (old_info->back) { 1335 if (old_info->back) {
1244 error = xfs_da_read_buf(args->trans, args->dp, 1336 error = xfs_da_node_read(args->trans, args->dp,
1245 be32_to_cpu(old_info->back), 1337 be32_to_cpu(old_info->back),
1246 -1, &bp, args->whichfork); 1338 -1, &bp, args->whichfork);
1247 if (error) 1339 if (error)
@@ -1262,7 +1354,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
1262 new_info->forw = old_info->forw; 1354 new_info->forw = old_info->forw;
1263 new_info->back = cpu_to_be32(old_blk->blkno); 1355 new_info->back = cpu_to_be32(old_blk->blkno);
1264 if (old_info->forw) { 1356 if (old_info->forw) {
1265 error = xfs_da_read_buf(args->trans, args->dp, 1357 error = xfs_da_node_read(args->trans, args->dp,
1266 be32_to_cpu(old_info->forw), 1358 be32_to_cpu(old_info->forw),
1267 -1, &bp, args->whichfork); 1359 -1, &bp, args->whichfork);
1268 if (error) 1360 if (error)
@@ -1362,7 +1454,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
1362 trace_xfs_da_unlink_back(args); 1454 trace_xfs_da_unlink_back(args);
1363 save_info->back = drop_info->back; 1455 save_info->back = drop_info->back;
1364 if (drop_info->back) { 1456 if (drop_info->back) {
1365 error = xfs_da_read_buf(args->trans, args->dp, 1457 error = xfs_da_node_read(args->trans, args->dp,
1366 be32_to_cpu(drop_info->back), 1458 be32_to_cpu(drop_info->back),
1367 -1, &bp, args->whichfork); 1459 -1, &bp, args->whichfork);
1368 if (error) 1460 if (error)
@@ -1379,7 +1471,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
1379 trace_xfs_da_unlink_forward(args); 1471 trace_xfs_da_unlink_forward(args);
1380 save_info->forw = drop_info->forw; 1472 save_info->forw = drop_info->forw;
1381 if (drop_info->forw) { 1473 if (drop_info->forw) {
1382 error = xfs_da_read_buf(args->trans, args->dp, 1474 error = xfs_da_node_read(args->trans, args->dp,
1383 be32_to_cpu(drop_info->forw), 1475 be32_to_cpu(drop_info->forw),
1384 -1, &bp, args->whichfork); 1476 -1, &bp, args->whichfork);
1385 if (error) 1477 if (error)
@@ -1417,6 +1509,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
1417 xfs_dablk_t blkno=0; 1509 xfs_dablk_t blkno=0;
1418 int level, error; 1510 int level, error;
1419 1511
1512 trace_xfs_da_path_shift(state->args);
1513
1420 /* 1514 /*
1421 * Roll up the Btree looking for the first block where our 1515 * Roll up the Btree looking for the first block where our
1422 * current index is not at the edge of the block. Note that 1516 * current index is not at the edge of the block. Note that
@@ -1463,8 +1557,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
1463 * Read the next child block. 1557 * Read the next child block.
1464 */ 1558 */
1465 blk->blkno = blkno; 1559 blk->blkno = blkno;
1466 error = xfs_da_read_buf(args->trans, args->dp, blkno, -1, 1560 error = xfs_da_node_read(args->trans, args->dp, blkno, -1,
1467 &blk->bp, args->whichfork); 1561 &blk->bp, args->whichfork);
1468 if (error) 1562 if (error)
1469 return(error); 1563 return(error);
1470 ASSERT(blk->bp != NULL); 1564 ASSERT(blk->bp != NULL);
@@ -1727,7 +1821,8 @@ xfs_da_swap_lastblock(
1727 * Read the last block in the btree space. 1821 * Read the last block in the btree space.
1728 */ 1822 */
1729 last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs; 1823 last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs;
1730 if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w))) 1824 error = xfs_da_node_read(tp, ip, last_blkno, -1, &last_buf, w);
1825 if (error)
1731 return error; 1826 return error;
1732 /* 1827 /*
1733 * Copy the last block into the dead buffer and log it. 1828 * Copy the last block into the dead buffer and log it.
@@ -1753,7 +1848,8 @@ xfs_da_swap_lastblock(
1753 * If the moved block has a left sibling, fix up the pointers. 1848 * If the moved block has a left sibling, fix up the pointers.
1754 */ 1849 */
1755 if ((sib_blkno = be32_to_cpu(dead_info->back))) { 1850 if ((sib_blkno = be32_to_cpu(dead_info->back))) {
1756 if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) 1851 error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
1852 if (error)
1757 goto done; 1853 goto done;
1758 sib_info = sib_buf->b_addr; 1854 sib_info = sib_buf->b_addr;
1759 if (unlikely( 1855 if (unlikely(
@@ -1774,7 +1870,8 @@ xfs_da_swap_lastblock(
1774 * If the moved block has a right sibling, fix up the pointers. 1870 * If the moved block has a right sibling, fix up the pointers.
1775 */ 1871 */
1776 if ((sib_blkno = be32_to_cpu(dead_info->forw))) { 1872 if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
1777 if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) 1873 error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
1874 if (error)
1778 goto done; 1875 goto done;
1779 sib_info = sib_buf->b_addr; 1876 sib_info = sib_buf->b_addr;
1780 if (unlikely( 1877 if (unlikely(
@@ -1797,7 +1894,8 @@ xfs_da_swap_lastblock(
1797 * Walk down the tree looking for the parent of the moved block. 1894 * Walk down the tree looking for the parent of the moved block.
1798 */ 1895 */
1799 for (;;) { 1896 for (;;) {
1800 if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) 1897 error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w);
1898 if (error)
1801 goto done; 1899 goto done;
1802 par_node = par_buf->b_addr; 1900 par_node = par_buf->b_addr;
1803 if (unlikely(par_node->hdr.info.magic != 1901 if (unlikely(par_node->hdr.info.magic !=
@@ -1847,7 +1945,8 @@ xfs_da_swap_lastblock(
1847 error = XFS_ERROR(EFSCORRUPTED); 1945 error = XFS_ERROR(EFSCORRUPTED);
1848 goto done; 1946 goto done;
1849 } 1947 }
1850 if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) 1948 error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w);
1949 if (error)
1851 goto done; 1950 goto done;
1852 par_node = par_buf->b_addr; 1951 par_node = par_buf->b_addr;
1853 if (unlikely( 1952 if (unlikely(
@@ -2133,7 +2232,8 @@ xfs_da_read_buf(
2133 xfs_dablk_t bno, 2232 xfs_dablk_t bno,
2134 xfs_daddr_t mappedbno, 2233 xfs_daddr_t mappedbno,
2135 struct xfs_buf **bpp, 2234 struct xfs_buf **bpp,
2136 int whichfork) 2235 int whichfork,
2236 const struct xfs_buf_ops *ops)
2137{ 2237{
2138 struct xfs_buf *bp; 2238 struct xfs_buf *bp;
2139 struct xfs_buf_map map; 2239 struct xfs_buf_map map;
@@ -2155,7 +2255,7 @@ xfs_da_read_buf(
2155 2255
2156 error = xfs_trans_read_buf_map(dp->i_mount, trans, 2256 error = xfs_trans_read_buf_map(dp->i_mount, trans,
2157 dp->i_mount->m_ddev_targp, 2257 dp->i_mount->m_ddev_targp,
2158 mapp, nmap, 0, &bp); 2258 mapp, nmap, 0, &bp, ops);
2159 if (error) 2259 if (error)
2160 goto out_free; 2260 goto out_free;
2161 2261
@@ -2211,9 +2311,10 @@ xfs_da_reada_buf(
2211 struct xfs_trans *trans, 2311 struct xfs_trans *trans,
2212 struct xfs_inode *dp, 2312 struct xfs_inode *dp,
2213 xfs_dablk_t bno, 2313 xfs_dablk_t bno,
2214 int whichfork) 2314 xfs_daddr_t mappedbno,
2315 int whichfork,
2316 const struct xfs_buf_ops *ops)
2215{ 2317{
2216 xfs_daddr_t mappedbno = -1;
2217 struct xfs_buf_map map; 2318 struct xfs_buf_map map;
2218 struct xfs_buf_map *mapp; 2319 struct xfs_buf_map *mapp;
2219 int nmap; 2320 int nmap;
@@ -2221,7 +2322,7 @@ xfs_da_reada_buf(
2221 2322
2222 mapp = &map; 2323 mapp = &map;
2223 nmap = 1; 2324 nmap = 1;
2224 error = xfs_dabuf_map(trans, dp, bno, -1, whichfork, 2325 error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork,
2225 &mapp, &nmap); 2326 &mapp, &nmap);
2226 if (error) { 2327 if (error) {
2227 /* mapping a hole is not an error, but we don't continue */ 2328 /* mapping a hole is not an error, but we don't continue */
@@ -2231,7 +2332,7 @@ xfs_da_reada_buf(
2231 } 2332 }
2232 2333
2233 mappedbno = mapp[0].bm_bn; 2334 mappedbno = mapp[0].bm_bn;
2234 xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap); 2335 xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops);
2235 2336
2236out_free: 2337out_free:
2237 if (mapp != &map) 2338 if (mapp != &map)
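
xfs_da_reada_buf() gets the same treatment as the read path: the previously hardcoded mappedbno = -1 becomes a caller-supplied parameter, and the verifier ops ride along into xfs_buf_readahead_map(), so even speculative reads are checked when they complete. A hedged sketch of a call under the new signature, with dp and bno assumed in scope; passing -1 keeps the old map-it-for-me behaviour, and the node ops are just one possible choice:

        /* kick off verified, non-blocking readahead of a dabtree block */
        (void)xfs_da_reada_buf(NULL, dp, bno, -1, XFS_ATTR_FORK,
                               &xfs_da_node_buf_ops);
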
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 132adafb041e..ee5170c46ae1 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -18,7 +18,6 @@
18#ifndef __XFS_DA_BTREE_H__ 18#ifndef __XFS_DA_BTREE_H__
19#define __XFS_DA_BTREE_H__ 19#define __XFS_DA_BTREE_H__
20 20
21struct xfs_buf;
22struct xfs_bmap_free; 21struct xfs_bmap_free;
23struct xfs_inode; 22struct xfs_inode;
24struct xfs_mount; 23struct xfs_mount;
@@ -214,6 +213,9 @@ int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
214 */ 213 */
215int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, 214int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
216 xfs_da_state_blk_t *new_blk); 215 xfs_da_state_blk_t *new_blk);
216int xfs_da_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
217 xfs_dablk_t bno, xfs_daddr_t mappedbno,
218 struct xfs_buf **bpp, int which_fork);
217 219
218/* 220/*
219 * Utility routines. 221 * Utility routines.
@@ -226,9 +228,11 @@ int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
226 struct xfs_buf **bp, int whichfork); 228 struct xfs_buf **bp, int whichfork);
227int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, 229int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
228 xfs_dablk_t bno, xfs_daddr_t mappedbno, 230 xfs_dablk_t bno, xfs_daddr_t mappedbno,
229 struct xfs_buf **bpp, int whichfork); 231 struct xfs_buf **bpp, int whichfork,
232 const struct xfs_buf_ops *ops);
230xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, 233xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
231 xfs_dablk_t bno, int whichfork); 234 xfs_dablk_t bno, xfs_daddr_t mapped_bno,
235 int whichfork, const struct xfs_buf_ops *ops);
232int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, 236int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
233 struct xfs_buf *dead_buf); 237 struct xfs_buf *dead_buf);
234 238
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index b9b8646e62db..d0e9c74d3d96 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -246,12 +246,10 @@ xfs_swap_extents(
246 goto out_unlock; 246 goto out_unlock;
247 } 247 }
248 248
249 if (VN_CACHED(VFS_I(tip)) != 0) { 249 error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
250 error = xfs_flushinval_pages(tip, 0, -1, 250 if (error)
251 FI_REMAPF_LOCKED); 251 goto out_unlock;
252 if (error) 252 truncate_pagecache_range(VFS_I(ip), 0, -1);
253 goto out_unlock;
254 }
255 253
256 /* Verify O_DIRECT for ftmp */ 254 /* Verify O_DIRECT for ftmp */
257 if (VN_CACHED(VFS_I(tip)) != 0) { 255 if (VN_CACHED(VFS_I(tip)) != 0) {
@@ -315,8 +313,7 @@ xfs_swap_extents(
315 * are safe. We don't really care if non-io related 313 * are safe. We don't really care if non-io related
316 * fields change. 314 * fields change.
317 */ 315 */
318 316 truncate_pagecache_range(VFS_I(ip), 0, -1);
319 xfs_tosspages(ip, 0, -1, FI_REMAPF);
320 317
321 tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); 318 tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
322 if ((error = xfs_trans_reserve(tp, 0, 319 if ((error = xfs_trans_reserve(tp, 0,
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index e93ca8f054f4..7536faaa61e7 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -56,6 +56,214 @@ xfs_dir_startup(void)
56 xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); 56 xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
57} 57}
58 58
59static void
60xfs_dir2_block_verify(
61 struct xfs_buf *bp)
62{
63 struct xfs_mount *mp = bp->b_target->bt_mount;
64 struct xfs_dir2_data_hdr *hdr = bp->b_addr;
65 int block_ok = 0;
66
67 block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
68 block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0;
69
70 if (!block_ok) {
71 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
72 xfs_buf_ioerror(bp, EFSCORRUPTED);
73 }
74}
75
76static void
77xfs_dir2_block_read_verify(
78 struct xfs_buf *bp)
79{
80 xfs_dir2_block_verify(bp);
81}
82
83static void
84xfs_dir2_block_write_verify(
85 struct xfs_buf *bp)
86{
87 xfs_dir2_block_verify(bp);
88}
89
90const struct xfs_buf_ops xfs_dir2_block_buf_ops = {
91 .verify_read = xfs_dir2_block_read_verify,
92 .verify_write = xfs_dir2_block_write_verify,
93};
94
95static int
96xfs_dir2_block_read(
97 struct xfs_trans *tp,
98 struct xfs_inode *dp,
99 struct xfs_buf **bpp)
100{
101 struct xfs_mount *mp = dp->i_mount;
102
103 return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp,
104 XFS_DATA_FORK, &xfs_dir2_block_buf_ops);
105}
106
107static void
108xfs_dir2_block_need_space(
109 struct xfs_dir2_data_hdr *hdr,
110 struct xfs_dir2_block_tail *btp,
111 struct xfs_dir2_leaf_entry *blp,
112 __be16 **tagpp,
113 struct xfs_dir2_data_unused **dupp,
114 struct xfs_dir2_data_unused **enddupp,
115 int *compact,
116 int len)
117{
118 struct xfs_dir2_data_free *bf;
119 __be16 *tagp = NULL;
120 struct xfs_dir2_data_unused *dup = NULL;
121 struct xfs_dir2_data_unused *enddup = NULL;
122
123 *compact = 0;
124 bf = hdr->bestfree;
125
126 /*
127 * If there are stale entries we'll use one for the leaf.
128 */
129 if (btp->stale) {
130 if (be16_to_cpu(bf[0].length) >= len) {
131 /*
132 * The biggest entry is big enough to avoid compaction.
133 */
134 dup = (xfs_dir2_data_unused_t *)
135 ((char *)hdr + be16_to_cpu(bf[0].offset));
136 goto out;
137 }
138
139 /*
140 * Will need to compact to make this work.
141 * Tag just before the first leaf entry.
142 */
143 *compact = 1;
144 tagp = (__be16 *)blp - 1;
145
146 /* Data object just before the first leaf entry. */
147 dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
148
149 /*
150 * If it's not free then the data will go where the
151 * leaf data starts now, if it works at all.
152 */
153 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
154 if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
155 (uint)sizeof(*blp) < len)
156 dup = NULL;
157 } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
158 dup = NULL;
159 else
160 dup = (xfs_dir2_data_unused_t *)blp;
161 goto out;
162 }
163
164 /*
165 * No stale entries, so just use free space.
166 * Tag just before the first leaf entry.
167 */
168 tagp = (__be16 *)blp - 1;
169
170 /* Data object just before the first leaf entry. */
171 enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
172
173 /*
174 * If it's not free then we can't do this add without cleaning up:
175 * the space before the first leaf entry needs to be free so it
176 * can be expanded to hold the pointer to the new entry.
177 */
178 if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
179 /*
180 * Check out the biggest freespace and see if it's the same one.
181 */
182 dup = (xfs_dir2_data_unused_t *)
183 ((char *)hdr + be16_to_cpu(bf[0].offset));
184 if (dup != enddup) {
185 /*
186 * Not the same free entry, just check its length.
187 */
188 if (be16_to_cpu(dup->length) < len)
189 dup = NULL;
190 goto out;
191 }
192
193 /*
194 * It is the biggest freespace, can it hold the leaf too?
195 */
196 if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
197 /*
198 * Yes, use the second-largest entry instead if it works.
199 */
200 if (be16_to_cpu(bf[1].length) >= len)
201 dup = (xfs_dir2_data_unused_t *)
202 ((char *)hdr + be16_to_cpu(bf[1].offset));
203 else
204 dup = NULL;
205 }
206 }
207out:
208 *tagpp = tagp;
209 *dupp = dup;
210 *enddupp = enddup;
211}
212
213/*
214 * Compact the leaf entries.
215 * Leave the highest-numbered stale entry stale.
216 * XXX should be the one closest to mid but mid is not yet computed.
217 */
218static void
219xfs_dir2_block_compact(
220 struct xfs_trans *tp,
221 struct xfs_buf *bp,
222 struct xfs_dir2_data_hdr *hdr,
223 struct xfs_dir2_block_tail *btp,
224 struct xfs_dir2_leaf_entry *blp,
225 int *needlog,
226 int *lfloghigh,
227 int *lfloglow)
228{
229 int fromidx; /* source leaf index */
230 int toidx; /* target leaf index */
231 int needscan = 0;
232 int highstale; /* high stale index */
233
234 fromidx = toidx = be32_to_cpu(btp->count) - 1;
235 highstale = *lfloghigh = -1;
236 for (; fromidx >= 0; fromidx--) {
237 if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
238 if (highstale == -1)
239 highstale = toidx;
240 else {
241 if (*lfloghigh == -1)
242 *lfloghigh = toidx;
243 continue;
244 }
245 }
246 if (fromidx < toidx)
247 blp[toidx] = blp[fromidx];
248 toidx--;
249 }
250 *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
251 *lfloghigh -= be32_to_cpu(btp->stale) - 1;
252 be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
253 xfs_dir2_data_make_free(tp, bp,
254 (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
255 (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
256 needlog, &needscan);
257 blp += be32_to_cpu(btp->stale) - 1;
258 btp->stale = cpu_to_be32(1);
259 /*
260 * If we now need to rebuild the bestfree map, do so.
261 * This needs to happen before the next call to use_free.
262 */
263 if (needscan)
264 xfs_dir2_data_freescan(tp->t_mountp, hdr, needlog);
265}
266
59/* 267/*
60 * Add an entry to a block directory. 268 * Add an entry to a block directory.
61 */ 269 */
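
xfs_dir2_block_compact(), factored out above, walks the leaf array from the top, keeps the highest-numbered stale entry, and slides the live entries up over the rest. A standalone model of that loop, using a plain array of unsigned values in place of the leaf entries (the null marker is a stand-in for XFS_DIR2_NULL_DATAPTR):

#include <stdio.h>

#define NULLPTR 0xffffffffu	/* stands in for XFS_DIR2_NULL_DATAPTR */

/*
 * Model of xfs_dir2_block_compact(): walk the leaf array from the top,
 * keep the highest-numbered stale entry, and drop the rest by sliding
 * live entries toward the high end of the array.
 */
static int compact(unsigned int *blp, int count)
{
	int fromidx, toidx, highstale = -1;

	for (fromidx = toidx = count - 1; fromidx >= 0; fromidx--) {
		if (blp[fromidx] == NULLPTR) {
			if (highstale == -1)
				highstale = toidx;	/* keep this one */
			else
				continue;		/* drop the duplicate */
		}
		if (fromidx < toidx)
			blp[toidx] = blp[fromidx];
		toidx--;
	}
	return toidx + 1;	/* first valid index after compaction */
}

int main(void)
{
	unsigned int blp[] = { 10, NULLPTR, 20, NULLPTR, 30, NULLPTR, 40 };
	int count = sizeof(blp) / sizeof(blp[0]);
	int first = compact(blp, count);
	int i;

	/* prints 10, 20, 30, one remaining NULLPTR, 40 */
	for (i = first; i < count; i++)
		printf("%d: %#x\n", i, blp[i]);
	return 0;
}
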
@@ -63,7 +271,6 @@ int /* error */
63xfs_dir2_block_addname( 271xfs_dir2_block_addname(
64 xfs_da_args_t *args) /* directory op arguments */ 272 xfs_da_args_t *args) /* directory op arguments */
65{ 273{
66 xfs_dir2_data_free_t *bf; /* bestfree table in block */
67 xfs_dir2_data_hdr_t *hdr; /* block header */ 274 xfs_dir2_data_hdr_t *hdr; /* block header */
68 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ 275 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
69 struct xfs_buf *bp; /* buffer for block */ 276 struct xfs_buf *bp; /* buffer for block */
@@ -94,134 +301,44 @@ xfs_dir2_block_addname(
94 dp = args->dp; 301 dp = args->dp;
95 tp = args->trans; 302 tp = args->trans;
96 mp = dp->i_mount; 303 mp = dp->i_mount;
97 /* 304
98 * Read the (one and only) directory block into dabuf bp. 305 /* Read the (one and only) directory block into bp. */
99 */ 306 error = xfs_dir2_block_read(tp, dp, &bp);
100 if ((error = 307 if (error)
101 xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
102 return error; 308 return error;
103 } 309
104 ASSERT(bp != NULL);
105 hdr = bp->b_addr;
106 /*
107 * Check the magic number, corrupted if wrong.
108 */
109 if (unlikely(hdr->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))) {
110 XFS_CORRUPTION_ERROR("xfs_dir2_block_addname",
111 XFS_ERRLEVEL_LOW, mp, hdr);
112 xfs_trans_brelse(tp, bp);
113 return XFS_ERROR(EFSCORRUPTED);
114 }
115 len = xfs_dir2_data_entsize(args->namelen); 310 len = xfs_dir2_data_entsize(args->namelen);
311
116 /* 312 /*
117 * Set up pointers to parts of the block. 313 * Set up pointers to parts of the block.
118 */ 314 */
119 bf = hdr->bestfree; 315 hdr = bp->b_addr;
120 btp = xfs_dir2_block_tail_p(mp, hdr); 316 btp = xfs_dir2_block_tail_p(mp, hdr);
121 blp = xfs_dir2_block_leaf_p(btp); 317 blp = xfs_dir2_block_leaf_p(btp);
318
122 /* 319 /*
123 * No stale entries? Need space for entry and new leaf. 320 * Find out if we can reuse stale entries or whether we need extra
124 */ 321 * space for entry and new leaf.
125 if (!btp->stale) {
126 /*
127 * Tag just before the first leaf entry.
128 */
129 tagp = (__be16 *)blp - 1;
130 /*
131 * Data object just before the first leaf entry.
132 */
133 enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
134 /*
135 * If it's not free then can't do this add without cleaning up:
136 * the space before the first leaf entry needs to be free so it
137 * can be expanded to hold the pointer to the new entry.
138 */
139 if (be16_to_cpu(enddup->freetag) != XFS_DIR2_DATA_FREE_TAG)
140 dup = enddup = NULL;
141 /*
142 * Check out the biggest freespace and see if it's the same one.
143 */
144 else {
145 dup = (xfs_dir2_data_unused_t *)
146 ((char *)hdr + be16_to_cpu(bf[0].offset));
147 if (dup == enddup) {
148 /*
149 * It is the biggest freespace, is it too small
150 * to hold the new leaf too?
151 */
152 if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
153 /*
154 * Yes, we use the second-largest
155 * entry instead if it works.
156 */
157 if (be16_to_cpu(bf[1].length) >= len)
158 dup = (xfs_dir2_data_unused_t *)
159 ((char *)hdr +
160 be16_to_cpu(bf[1].offset));
161 else
162 dup = NULL;
163 }
164 } else {
165 /*
166 * Not the same free entry,
167 * just check its length.
168 */
169 if (be16_to_cpu(dup->length) < len) {
170 dup = NULL;
171 }
172 }
173 }
174 compact = 0;
175 }
176 /*
177 * If there are stale entries we'll use one for the leaf.
178 * Is the biggest entry enough to avoid compaction?
179 */
180 else if (be16_to_cpu(bf[0].length) >= len) {
181 dup = (xfs_dir2_data_unused_t *)
182 ((char *)hdr + be16_to_cpu(bf[0].offset));
183 compact = 0;
184 }
185 /*
186 * Will need to compact to make this work.
187 */ 322 */
188 else { 323 xfs_dir2_block_need_space(hdr, btp, blp, &tagp, &dup,
189 /* 324 &enddup, &compact, len);
190 * Tag just before the first leaf entry. 325
191 */
192 tagp = (__be16 *)blp - 1;
193 /*
194 * Data object just before the first leaf entry.
195 */
196 dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
197 /*
198 * If it's not free then the data will go where the
199 * leaf data starts now, if it works at all.
200 */
201 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
202 if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
203 (uint)sizeof(*blp) < len)
204 dup = NULL;
205 } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
206 dup = NULL;
207 else
208 dup = (xfs_dir2_data_unused_t *)blp;
209 compact = 1;
210 }
211 /* 326 /*
212 * If this isn't a real add, we're done with the buffer. 327 * Done everything we need for a space check now.
213 */ 328 */
214 if (args->op_flags & XFS_DA_OP_JUSTCHECK) 329 if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
215 xfs_trans_brelse(tp, bp); 330 xfs_trans_brelse(tp, bp);
331 if (!dup)
332 return XFS_ERROR(ENOSPC);
333 return 0;
334 }
335
216 /* 336 /*
217 * If we don't have space for the new entry & leaf ... 337 * If we don't have space for the new entry & leaf ...
218 */ 338 */
219 if (!dup) { 339 if (!dup) {
220 /* 340 /* Don't have a space reservation: return no-space. */
221 * Not trying to actually do anything, or don't have 341 if (args->total == 0)
222 * a space reservation: return no-space.
223 */
224 if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
225 return XFS_ERROR(ENOSPC); 342 return XFS_ERROR(ENOSPC);
226 /* 343 /*
227 * Convert to the next larger format. 344 * Convert to the next larger format.
@@ -232,65 +349,24 @@ xfs_dir2_block_addname(
232 return error; 349 return error;
233 return xfs_dir2_leaf_addname(args); 350 return xfs_dir2_leaf_addname(args);
234 } 351 }
235 /* 352
236 * Just checking, and it would work, so say so.
237 */
238 if (args->op_flags & XFS_DA_OP_JUSTCHECK)
239 return 0;
240 needlog = needscan = 0; 353 needlog = needscan = 0;
354
241 /* 355 /*
242 * If need to compact the leaf entries, do it now. 356 * If need to compact the leaf entries, do it now.
243 * Leave the highest-numbered stale entry stale.
244 * XXX should be the one closest to mid but mid is not yet computed.
245 */
246 if (compact) {
247 int fromidx; /* source leaf index */
248 int toidx; /* target leaf index */
249
250 for (fromidx = toidx = be32_to_cpu(btp->count) - 1,
251 highstale = lfloghigh = -1;
252 fromidx >= 0;
253 fromidx--) {
254 if (blp[fromidx].address ==
255 cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
256 if (highstale == -1)
257 highstale = toidx;
258 else {
259 if (lfloghigh == -1)
260 lfloghigh = toidx;
261 continue;
262 }
263 }
264 if (fromidx < toidx)
265 blp[toidx] = blp[fromidx];
266 toidx--;
267 }
268 lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
269 lfloghigh -= be32_to_cpu(btp->stale) - 1;
270 be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
271 xfs_dir2_data_make_free(tp, bp,
272 (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
273 (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
274 &needlog, &needscan);
275 blp += be32_to_cpu(btp->stale) - 1;
276 btp->stale = cpu_to_be32(1);
277 /*
278 * If we now need to rebuild the bestfree map, do so.
279 * This needs to happen before the next call to use_free.
280 */
281 if (needscan) {
282 xfs_dir2_data_freescan(mp, hdr, &needlog);
283 needscan = 0;
284 }
285 }
286 /*
287 * Set leaf logging boundaries to impossible state.
288 * For the no-stale case they're set explicitly.
289 */ 357 */
358 if (compact)
359 xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog,
360 &lfloghigh, &lfloglow);
290 else if (btp->stale) { 361 else if (btp->stale) {
362 /*
363 * Set leaf logging boundaries to impossible state.
364 * For the no-stale case they're set explicitly.
365 */
291 lfloglow = be32_to_cpu(btp->count); 366 lfloglow = be32_to_cpu(btp->count);
292 lfloghigh = -1; 367 lfloghigh = -1;
293 } 368 }
369
294 /* 370 /*
295 * Find the slot that's first lower than our hash value, -1 if none. 371 * Find the slot that's first lower than our hash value, -1 if none.
296 */ 372 */
@@ -450,18 +526,13 @@ xfs_dir2_block_getdents(
450 /* 526 /*
451 * If the block number in the offset is out of range, we're done. 527 * If the block number in the offset is out of range, we're done.
452 */ 528 */
453 if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) { 529 if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk)
454 return 0; 530 return 0;
455 } 531
456 /* 532 error = xfs_dir2_block_read(NULL, dp, &bp);
457 * Can't read the block, give up, else get dabuf in bp.
458 */
459 error = xfs_da_read_buf(NULL, dp, mp->m_dirdatablk, -1,
460 &bp, XFS_DATA_FORK);
461 if (error) 533 if (error)
462 return error; 534 return error;
463 535
464 ASSERT(bp != NULL);
465 /* 536 /*
466 * Extract the byte offset we start at from the seek pointer. 537 * Extract the byte offset we start at from the seek pointer.
467 * We'll skip entries before this. 538 * We'll skip entries before this.
@@ -637,14 +708,11 @@ xfs_dir2_block_lookup_int(
637 dp = args->dp; 708 dp = args->dp;
638 tp = args->trans; 709 tp = args->trans;
639 mp = dp->i_mount; 710 mp = dp->i_mount;
640 /* 711
641 * Read the buffer, return error if we can't get it. 712 error = xfs_dir2_block_read(tp, dp, &bp);
642 */ 713 if (error)
643 if ((error =
644 xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
645 return error; 714 return error;
646 } 715
647 ASSERT(bp != NULL);
648 hdr = bp->b_addr; 716 hdr = bp->b_addr;
649 xfs_dir2_data_check(dp, bp); 717 xfs_dir2_data_check(dp, bp);
650 btp = xfs_dir2_block_tail_p(mp, hdr); 718 btp = xfs_dir2_block_tail_p(mp, hdr);
@@ -917,10 +985,10 @@ xfs_dir2_leaf_to_block(
917 /* 985 /*
918 * Read the data block if we don't already have it, give up if it fails. 986 * Read the data block if we don't already have it, give up if it fails.
919 */ 987 */
920 if (dbp == NULL && 988 if (!dbp) {
921 (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp, 989 error = xfs_dir2_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp);
922 XFS_DATA_FORK))) { 990 if (error)
923 return error; 991 return error;
924 } 992 }
925 hdr = dbp->b_addr; 993 hdr = dbp->b_addr;
926 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); 994 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC));
@@ -944,6 +1012,7 @@ xfs_dir2_leaf_to_block(
944 /* 1012 /*
945 * Start converting it to block form. 1013 * Start converting it to block form.
946 */ 1014 */
1015 dbp->b_ops = &xfs_dir2_block_buf_ops;
947 hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); 1016 hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
948 needlog = 1; 1017 needlog = 1;
949 needscan = 0; 1018 needscan = 0;
@@ -1073,6 +1142,7 @@ xfs_dir2_sf_to_block(
1073 kmem_free(sfp); 1142 kmem_free(sfp);
1074 return error; 1143 return error;
1075 } 1144 }
1145 bp->b_ops = &xfs_dir2_block_buf_ops;
1076 hdr = bp->b_addr; 1146 hdr = bp->b_addr;
1077 hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); 1147 hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
1078 /* 1148 /*
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 44ffd4d6bc91..ffcf1774152e 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -34,14 +34,13 @@
34STATIC xfs_dir2_data_free_t * 34STATIC xfs_dir2_data_free_t *
35xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup); 35xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup);
36 36
37#ifdef DEBUG
38/* 37/*
39 * Check the consistency of the data block. 38 * Check the consistency of the data block.
40 * The input can also be a block-format directory. 39 * The input can also be a block-format directory.
41 * Pop an assert if we find anything bad. 40 * Return 0 if the buffer is good, otherwise an error.
42 */ 41 */
43void 42int
44xfs_dir2_data_check( 43__xfs_dir2_data_check(
45 struct xfs_inode *dp, /* incore inode pointer */ 44 struct xfs_inode *dp, /* incore inode pointer */
46 struct xfs_buf *bp) /* data block's buffer */ 45 struct xfs_buf *bp) /* data block's buffer */
47{ 46{
@@ -64,18 +63,23 @@ xfs_dir2_data_check(
64 int stale; /* count of stale leaves */ 63 int stale; /* count of stale leaves */
65 struct xfs_name name; 64 struct xfs_name name;
66 65
67 mp = dp->i_mount; 66 mp = bp->b_target->bt_mount;
68 hdr = bp->b_addr; 67 hdr = bp->b_addr;
69 bf = hdr->bestfree; 68 bf = hdr->bestfree;
70 p = (char *)(hdr + 1); 69 p = (char *)(hdr + 1);
71 70
72 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { 71 switch (hdr->magic) {
72 case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
73 btp = xfs_dir2_block_tail_p(mp, hdr); 73 btp = xfs_dir2_block_tail_p(mp, hdr);
74 lep = xfs_dir2_block_leaf_p(btp); 74 lep = xfs_dir2_block_leaf_p(btp);
75 endp = (char *)lep; 75 endp = (char *)lep;
76 } else { 76 break;
77 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); 77 case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
78 endp = (char *)hdr + mp->m_dirblksize; 78 endp = (char *)hdr + mp->m_dirblksize;
79 break;
80 default:
81 XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
82 return EFSCORRUPTED;
79 } 83 }
80 84
81 count = lastfree = freeseen = 0; 85 count = lastfree = freeseen = 0;
@@ -83,19 +87,22 @@ xfs_dir2_data_check(
83 * Account for zero bestfree entries. 87 * Account for zero bestfree entries.
84 */ 88 */
85 if (!bf[0].length) { 89 if (!bf[0].length) {
86 ASSERT(!bf[0].offset); 90 XFS_WANT_CORRUPTED_RETURN(!bf[0].offset);
87 freeseen |= 1 << 0; 91 freeseen |= 1 << 0;
88 } 92 }
89 if (!bf[1].length) { 93 if (!bf[1].length) {
90 ASSERT(!bf[1].offset); 94 XFS_WANT_CORRUPTED_RETURN(!bf[1].offset);
91 freeseen |= 1 << 1; 95 freeseen |= 1 << 1;
92 } 96 }
93 if (!bf[2].length) { 97 if (!bf[2].length) {
94 ASSERT(!bf[2].offset); 98 XFS_WANT_CORRUPTED_RETURN(!bf[2].offset);
95 freeseen |= 1 << 2; 99 freeseen |= 1 << 2;
96 } 100 }
97 ASSERT(be16_to_cpu(bf[0].length) >= be16_to_cpu(bf[1].length)); 101
98 ASSERT(be16_to_cpu(bf[1].length) >= be16_to_cpu(bf[2].length)); 102 XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >=
103 be16_to_cpu(bf[1].length));
104 XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >=
105 be16_to_cpu(bf[2].length));
99 /* 106 /*
100 * Loop over the data/unused entries. 107 * Loop over the data/unused entries.
101 */ 108 */
@@ -107,17 +114,20 @@ xfs_dir2_data_check(
107 * doesn't need to be there. 114 * doesn't need to be there.
108 */ 115 */
109 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { 116 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
110 ASSERT(lastfree == 0); 117 XFS_WANT_CORRUPTED_RETURN(lastfree == 0);
111 ASSERT(be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == 118 XFS_WANT_CORRUPTED_RETURN(
112 (char *)dup - (char *)hdr); 119 be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
120 (char *)dup - (char *)hdr);
113 dfp = xfs_dir2_data_freefind(hdr, dup); 121 dfp = xfs_dir2_data_freefind(hdr, dup);
114 if (dfp) { 122 if (dfp) {
115 i = (int)(dfp - bf); 123 i = (int)(dfp - bf);
116 ASSERT((freeseen & (1 << i)) == 0); 124 XFS_WANT_CORRUPTED_RETURN(
125 (freeseen & (1 << i)) == 0);
117 freeseen |= 1 << i; 126 freeseen |= 1 << i;
118 } else { 127 } else {
119 ASSERT(be16_to_cpu(dup->length) <= 128 XFS_WANT_CORRUPTED_RETURN(
120 be16_to_cpu(bf[2].length)); 129 be16_to_cpu(dup->length) <=
130 be16_to_cpu(bf[2].length));
121 } 131 }
122 p += be16_to_cpu(dup->length); 132 p += be16_to_cpu(dup->length);
123 lastfree = 1; 133 lastfree = 1;
@@ -130,10 +140,12 @@ xfs_dir2_data_check(
130 * The linear search is crude but this is DEBUG code. 140 * The linear search is crude but this is DEBUG code.
131 */ 141 */
132 dep = (xfs_dir2_data_entry_t *)p; 142 dep = (xfs_dir2_data_entry_t *)p;
133 ASSERT(dep->namelen != 0); 143 XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0);
134 ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0); 144 XFS_WANT_CORRUPTED_RETURN(
135 ASSERT(be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) == 145 !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
136 (char *)dep - (char *)hdr); 146 XFS_WANT_CORRUPTED_RETURN(
147 be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) ==
148 (char *)dep - (char *)hdr);
137 count++; 149 count++;
138 lastfree = 0; 150 lastfree = 0;
139 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { 151 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
@@ -148,27 +160,122 @@ xfs_dir2_data_check(
148 be32_to_cpu(lep[i].hashval) == hash) 160 be32_to_cpu(lep[i].hashval) == hash)
149 break; 161 break;
150 } 162 }
151 ASSERT(i < be32_to_cpu(btp->count)); 163 XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count));
152 } 164 }
153 p += xfs_dir2_data_entsize(dep->namelen); 165 p += xfs_dir2_data_entsize(dep->namelen);
154 } 166 }
155 /* 167 /*
156 * Need to have seen all the entries and all the bestfree slots. 168 * Need to have seen all the entries and all the bestfree slots.
157 */ 169 */
158 ASSERT(freeseen == 7); 170 XFS_WANT_CORRUPTED_RETURN(freeseen == 7);
159 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { 171 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
160 for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { 172 for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
161 if (lep[i].address == 173 if (lep[i].address ==
162 cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) 174 cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
163 stale++; 175 stale++;
164 if (i > 0) 176 if (i > 0)
165 ASSERT(be32_to_cpu(lep[i].hashval) >= be32_to_cpu(lep[i - 1].hashval)); 177 XFS_WANT_CORRUPTED_RETURN(
178 be32_to_cpu(lep[i].hashval) >=
179 be32_to_cpu(lep[i - 1].hashval));
166 } 180 }
167 ASSERT(count == be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); 181 XFS_WANT_CORRUPTED_RETURN(count ==
168 ASSERT(stale == be32_to_cpu(btp->stale)); 182 be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
183 XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale));
169 } 184 }
185 return 0;
186}
187
188static void
189xfs_dir2_data_verify(
190 struct xfs_buf *bp)
191{
192 struct xfs_mount *mp = bp->b_target->bt_mount;
193 struct xfs_dir2_data_hdr *hdr = bp->b_addr;
194 int block_ok = 0;
195
196 block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC);
197 block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0;
198
199 if (!block_ok) {
200 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
201 xfs_buf_ioerror(bp, EFSCORRUPTED);
202 }
203}
204
205/*
206 * Readahead of the first block of the directory when it is opened is completely
207 * oblivious to the format of the directory. Hence we can either get a block
208 * format buffer or a data format buffer on readahead.
209 */
210static void
211xfs_dir2_data_reada_verify(
212 struct xfs_buf *bp)
213{
214 struct xfs_mount *mp = bp->b_target->bt_mount;
215 struct xfs_dir2_data_hdr *hdr = bp->b_addr;
216
217 switch (hdr->magic) {
218 case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
219 bp->b_ops = &xfs_dir2_block_buf_ops;
220 bp->b_ops->verify_read(bp);
221 return;
222 case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
223 xfs_dir2_data_verify(bp);
224 return;
225 default:
226 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
227 xfs_buf_ioerror(bp, EFSCORRUPTED);
228 break;
229 }
230}
231
232static void
233xfs_dir2_data_read_verify(
234 struct xfs_buf *bp)
235{
236 xfs_dir2_data_verify(bp);
237}
238
239static void
240xfs_dir2_data_write_verify(
241 struct xfs_buf *bp)
242{
243 xfs_dir2_data_verify(bp);
244}
245
246const struct xfs_buf_ops xfs_dir2_data_buf_ops = {
247 .verify_read = xfs_dir2_data_read_verify,
248 .verify_write = xfs_dir2_data_write_verify,
249};
250
251static const struct xfs_buf_ops xfs_dir2_data_reada_buf_ops = {
252 .verify_read = xfs_dir2_data_reada_verify,
253 .verify_write = xfs_dir2_data_write_verify,
254};
255
256
257int
258xfs_dir2_data_read(
259 struct xfs_trans *tp,
260 struct xfs_inode *dp,
261 xfs_dablk_t bno,
262 xfs_daddr_t mapped_bno,
263 struct xfs_buf **bpp)
264{
265 return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
266 XFS_DATA_FORK, &xfs_dir2_data_buf_ops);
267}
268
269int
270xfs_dir2_data_readahead(
271 struct xfs_trans *tp,
272 struct xfs_inode *dp,
273 xfs_dablk_t bno,
274 xfs_daddr_t mapped_bno)
275{
276 return xfs_da_reada_buf(tp, dp, bno, mapped_bno,
277 XFS_DATA_FORK, &xfs_dir2_data_reada_buf_ops);
170} 278}
171#endif
172 279
173/* 280/*
174 * Given a data block and an unused entry from that block, 281 * Given a data block and an unused entry from that block,
@@ -409,10 +516,9 @@ xfs_dir2_data_init(
409 */ 516 */
410 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp, 517 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp,
411 XFS_DATA_FORK); 518 XFS_DATA_FORK);
412 if (error) { 519 if (error)
413 return error; 520 return error;
414 } 521 bp->b_ops = &xfs_dir2_data_buf_ops;
415 ASSERT(bp != NULL);
416 522
417 /* 523 /*
418 * Initialize the header. 524 * Initialize the header.
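
The readahead verifier above is the one genuinely format-oblivious hook: block 0 of a directory may carry either the block-format or the data-format magic, so xfs_dir2_data_reada_verify() dispatches on what it finds and re-points b_ops at the matching ops table before verifying. In miniature (magic values and names for illustration):

#include <stdio.h>
#include <stdint.h>

#define EFSCORRUPTED 117	/* illustrative stand-in */
#define BLOCK_MAGIC 0x58443242u	/* "XD2B"; for illustration */
#define DATA_MAGIC  0x58443244u	/* "XD2D"; for illustration */

struct buf;
struct buf_ops { void (*verify_read)(struct buf *bp); };

struct buf {
	const struct buf_ops *ops;
	uint32_t magic;
	int error;
};

static void block_verify(struct buf *bp)
{
	if (bp->magic != BLOCK_MAGIC)
		bp->error = EFSCORRUPTED;
}

static void data_verify(struct buf *bp)
{
	if (bp->magic != DATA_MAGIC)
		bp->error = EFSCORRUPTED;
}

static const struct buf_ops block_ops = { .verify_read = block_verify };
static const struct buf_ops data_ops  = { .verify_read = data_verify };

/* Format-oblivious readahead verifier: bind ops once the magic is known. */
static void reada_verify(struct buf *bp)
{
	switch (bp->magic) {
	case BLOCK_MAGIC:
		bp->ops = &block_ops;
		bp->ops->verify_read(bp);
		return;
	case DATA_MAGIC:
		bp->ops = &data_ops;
		bp->ops->verify_read(bp);
		return;
	default:
		bp->error = EFSCORRUPTED;
	}
}

int main(void)
{
	struct buf bp = { .magic = DATA_MAGIC };

	reada_verify(&bp);
	printf("error=%d ops=%s\n", bp.error,
	       bp.ops == &data_ops ? "data" : "other");
	return 0;
}
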
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 0b296253bd01..60cd2fa4e047 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -48,6 +48,83 @@ static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp,
48 int first, int last); 48 int first, int last);
49static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp); 49static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp);
50 50
51static void
52xfs_dir2_leaf_verify(
53 struct xfs_buf *bp,
54 __be16 magic)
55{
56 struct xfs_mount *mp = bp->b_target->bt_mount;
57 struct xfs_dir2_leaf_hdr *hdr = bp->b_addr;
58 int block_ok = 0;
59
60 block_ok = hdr->info.magic == magic;
61 if (!block_ok) {
62 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
63 xfs_buf_ioerror(bp, EFSCORRUPTED);
64 }
65}
66
67static void
68xfs_dir2_leaf1_read_verify(
69 struct xfs_buf *bp)
70{
71 xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
72}
73
74static void
75xfs_dir2_leaf1_write_verify(
76 struct xfs_buf *bp)
77{
78 xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
79}
80
81void
82xfs_dir2_leafn_read_verify(
83 struct xfs_buf *bp)
84{
85 xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
86}
87
88void
89xfs_dir2_leafn_write_verify(
90 struct xfs_buf *bp)
91{
92 xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
93}
94
95static const struct xfs_buf_ops xfs_dir2_leaf1_buf_ops = {
96 .verify_read = xfs_dir2_leaf1_read_verify,
97 .verify_write = xfs_dir2_leaf1_write_verify,
98};
99
100const struct xfs_buf_ops xfs_dir2_leafn_buf_ops = {
101 .verify_read = xfs_dir2_leafn_read_verify,
102 .verify_write = xfs_dir2_leafn_write_verify,
103};
104
105static int
106xfs_dir2_leaf_read(
107 struct xfs_trans *tp,
108 struct xfs_inode *dp,
109 xfs_dablk_t fbno,
110 xfs_daddr_t mappedbno,
111 struct xfs_buf **bpp)
112{
113 return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
114 XFS_DATA_FORK, &xfs_dir2_leaf1_buf_ops);
115}
116
117int
118xfs_dir2_leafn_read(
119 struct xfs_trans *tp,
120 struct xfs_inode *dp,
121 xfs_dablk_t fbno,
122 xfs_daddr_t mappedbno,
123 struct xfs_buf **bpp)
124{
125 return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
126 XFS_DATA_FORK, &xfs_dir2_leafn_buf_ops);
127}
51 128
52/* 129/*
53 * Convert a block form directory to a leaf form directory. 130 * Convert a block form directory to a leaf form directory.
@@ -125,6 +202,7 @@ xfs_dir2_block_to_leaf(
125 /* 202 /*
126 * Fix up the block header, make it a data block. 203 * Fix up the block header, make it a data block.
127 */ 204 */
205 dbp->b_ops = &xfs_dir2_data_buf_ops;
128 hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); 206 hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
129 if (needscan) 207 if (needscan)
130 xfs_dir2_data_freescan(mp, hdr, &needlog); 208 xfs_dir2_data_freescan(mp, hdr, &needlog);
@@ -311,15 +389,11 @@ xfs_dir2_leaf_addname(
311 dp = args->dp; 389 dp = args->dp;
312 tp = args->trans; 390 tp = args->trans;
313 mp = dp->i_mount; 391 mp = dp->i_mount;
314 /* 392
315 * Read the leaf block. 393 error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
316 */ 394 if (error)
317 error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
318 XFS_DATA_FORK);
319 if (error) {
320 return error; 395 return error;
321 } 396
322 ASSERT(lbp != NULL);
323 /* 397 /*
324 * Look up the entry by hash value and name. 398 * Look up the entry by hash value and name.
325 * We know it's not there, our caller has already done a lookup. 399 * We know it's not there, our caller has already done a lookup.
@@ -494,22 +568,21 @@ xfs_dir2_leaf_addname(
494 hdr = dbp->b_addr; 568 hdr = dbp->b_addr;
495 bestsp[use_block] = hdr->bestfree[0].length; 569 bestsp[use_block] = hdr->bestfree[0].length;
496 grown = 1; 570 grown = 1;
497 } 571 } else {
498 /* 572 /*
499 * Already had space in some data block. 573 * Already had space in some data block.
500 * Just read that one in. 574 * Just read that one in.
501 */ 575 */
502 else { 576 error = xfs_dir2_data_read(tp, dp,
503 if ((error = 577 xfs_dir2_db_to_da(mp, use_block),
504 xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block), 578 -1, &dbp);
505 -1, &dbp, XFS_DATA_FORK))) { 579 if (error) {
506 xfs_trans_brelse(tp, lbp); 580 xfs_trans_brelse(tp, lbp);
507 return error; 581 return error;
508 } 582 }
509 hdr = dbp->b_addr; 583 hdr = dbp->b_addr;
510 grown = 0; 584 grown = 0;
511 } 585 }
512 xfs_dir2_data_check(dp, dbp);
513 /* 586 /*
514 * Point to the biggest freespace in our data block. 587 * Point to the biggest freespace in our data block.
515 */ 588 */
@@ -892,10 +965,9 @@ xfs_dir2_leaf_readbuf(
892 * Read the directory block starting at the first mapping. 965 * Read the directory block starting at the first mapping.
893 */ 966 */
894 mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff); 967 mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff);
895 error = xfs_da_read_buf(NULL, dp, map->br_startoff, 968 error = xfs_dir2_data_read(NULL, dp, map->br_startoff,
896 map->br_blockcount >= mp->m_dirblkfsbs ? 969 map->br_blockcount >= mp->m_dirblkfsbs ?
897 XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, 970 XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp);
898 &bp, XFS_DATA_FORK);
899 971
900 /* 972 /*
901 * Should just skip over the data block instead of giving up. 973 * Should just skip over the data block instead of giving up.
@@ -922,11 +994,11 @@ xfs_dir2_leaf_readbuf(
922 */ 994 */
923 if (i > mip->ra_current && 995 if (i > mip->ra_current &&
924 map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) { 996 map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) {
925 xfs_buf_readahead(mp->m_ddev_targp, 997 xfs_dir2_data_readahead(NULL, dp,
998 map[mip->ra_index].br_startoff + mip->ra_offset,
926 XFS_FSB_TO_DADDR(mp, 999 XFS_FSB_TO_DADDR(mp,
927 map[mip->ra_index].br_startblock + 1000 map[mip->ra_index].br_startblock +
928 mip->ra_offset), 1001 mip->ra_offset));
929 (int)BTOBB(mp->m_dirblksize));
930 mip->ra_current = i; 1002 mip->ra_current = i;
931 } 1003 }
932 1004
@@ -935,10 +1007,9 @@ xfs_dir2_leaf_readbuf(
935 * use our mapping, but this is a very rare case. 1007 * use our mapping, but this is a very rare case.
936 */ 1008 */
937 else if (i > mip->ra_current) { 1009 else if (i > mip->ra_current) {
938 xfs_da_reada_buf(NULL, dp, 1010 xfs_dir2_data_readahead(NULL, dp,
939 map[mip->ra_index].br_startoff + 1011 map[mip->ra_index].br_startoff +
940 mip->ra_offset, 1012 mip->ra_offset, -1);
941 XFS_DATA_FORK);
942 mip->ra_current = i; 1013 mip->ra_current = i;
943 } 1014 }
944 1015
@@ -1177,15 +1248,14 @@ xfs_dir2_leaf_init(
1177 * Get the buffer for the block. 1248 * Get the buffer for the block.
1178 */ 1249 */
1179 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp, 1250 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp,
1180 XFS_DATA_FORK); 1251 XFS_DATA_FORK);
1181 if (error) { 1252 if (error)
1182 return error; 1253 return error;
1183 } 1254
1184 ASSERT(bp != NULL);
1185 leaf = bp->b_addr;
1186 /* 1255 /*
1187 * Initialize the header. 1256 * Initialize the header.
1188 */ 1257 */
1258 leaf = bp->b_addr;
1189 leaf->hdr.info.magic = cpu_to_be16(magic); 1259 leaf->hdr.info.magic = cpu_to_be16(magic);
1190 leaf->hdr.info.forw = 0; 1260 leaf->hdr.info.forw = 0;
1191 leaf->hdr.info.back = 0; 1261 leaf->hdr.info.back = 0;
@@ -1198,10 +1268,12 @@ xfs_dir2_leaf_init(
1198 * the block. 1268 * the block.
1199 */ 1269 */
1200 if (magic == XFS_DIR2_LEAF1_MAGIC) { 1270 if (magic == XFS_DIR2_LEAF1_MAGIC) {
1271 bp->b_ops = &xfs_dir2_leaf1_buf_ops;
1201 ltp = xfs_dir2_leaf_tail_p(mp, leaf); 1272 ltp = xfs_dir2_leaf_tail_p(mp, leaf);
1202 ltp->bestcount = 0; 1273 ltp->bestcount = 0;
1203 xfs_dir2_leaf_log_tail(tp, bp); 1274 xfs_dir2_leaf_log_tail(tp, bp);
1204 } 1275 } else
1276 bp->b_ops = &xfs_dir2_leafn_buf_ops;
1205 *bpp = bp; 1277 *bpp = bp;
1206 return 0; 1278 return 0;
1207} 1279}
@@ -1372,13 +1444,11 @@ xfs_dir2_leaf_lookup_int(
1372 dp = args->dp; 1444 dp = args->dp;
1373 tp = args->trans; 1445 tp = args->trans;
1374 mp = dp->i_mount; 1446 mp = dp->i_mount;
1375 /* 1447
1376 * Read the leaf block into the buffer. 1448 error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
1377 */
1378 error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
1379 XFS_DATA_FORK);
1380 if (error) 1449 if (error)
1381 return error; 1450 return error;
1451
1382 *lbpp = lbp; 1452 *lbpp = lbp;
1383 leaf = lbp->b_addr; 1453 leaf = lbp->b_addr;
1384 xfs_dir2_leaf_check(dp, lbp); 1454 xfs_dir2_leaf_check(dp, lbp);
@@ -1409,14 +1479,13 @@ xfs_dir2_leaf_lookup_int(
1409 if (newdb != curdb) { 1479 if (newdb != curdb) {
1410 if (dbp) 1480 if (dbp)
1411 xfs_trans_brelse(tp, dbp); 1481 xfs_trans_brelse(tp, dbp);
1412 error = xfs_da_read_buf(tp, dp, 1482 error = xfs_dir2_data_read(tp, dp,
1413 xfs_dir2_db_to_da(mp, newdb), 1483 xfs_dir2_db_to_da(mp, newdb),
1414 -1, &dbp, XFS_DATA_FORK); 1484 -1, &dbp);
1415 if (error) { 1485 if (error) {
1416 xfs_trans_brelse(tp, lbp); 1486 xfs_trans_brelse(tp, lbp);
1417 return error; 1487 return error;
1418 } 1488 }
1419 xfs_dir2_data_check(dp, dbp);
1420 curdb = newdb; 1489 curdb = newdb;
1421 } 1490 }
1422 /* 1491 /*
@@ -1451,9 +1520,9 @@ xfs_dir2_leaf_lookup_int(
1451 ASSERT(cidb != -1); 1520 ASSERT(cidb != -1);
1452 if (cidb != curdb) { 1521 if (cidb != curdb) {
1453 xfs_trans_brelse(tp, dbp); 1522 xfs_trans_brelse(tp, dbp);
1454 error = xfs_da_read_buf(tp, dp, 1523 error = xfs_dir2_data_read(tp, dp,
1455 xfs_dir2_db_to_da(mp, cidb), 1524 xfs_dir2_db_to_da(mp, cidb),
1456 -1, &dbp, XFS_DATA_FORK); 1525 -1, &dbp);
1457 if (error) { 1526 if (error) {
1458 xfs_trans_brelse(tp, lbp); 1527 xfs_trans_brelse(tp, lbp);
1459 return error; 1528 return error;
@@ -1738,10 +1807,9 @@ xfs_dir2_leaf_trim_data(
1738 /* 1807 /*
1739 * Read the offending data block. We need its buffer. 1808 * Read the offending data block. We need its buffer.
1740 */ 1809 */
1741 if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp, 1810 error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp);
1742 XFS_DATA_FORK))) { 1811 if (error)
1743 return error; 1812 return error;
1744 }
1745 1813
1746 leaf = lbp->b_addr; 1814 leaf = lbp->b_addr;
1747 ltp = xfs_dir2_leaf_tail_p(mp, leaf); 1815 ltp = xfs_dir2_leaf_tail_p(mp, leaf);
@@ -1864,10 +1932,9 @@ xfs_dir2_node_to_leaf(
1864 /* 1932 /*
1865 * Read the freespace block. 1933 * Read the freespace block.
1866 */ 1934 */
1867 if ((error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp, 1935 error = xfs_dir2_free_read(tp, dp, mp->m_dirfreeblk, &fbp);
1868 XFS_DATA_FORK))) { 1936 if (error)
1869 return error; 1937 return error;
1870 }
1871 free = fbp->b_addr; 1938 free = fbp->b_addr;
1872 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); 1939 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
1873 ASSERT(!free->hdr.firstdb); 1940 ASSERT(!free->hdr.firstdb);
@@ -1890,7 +1957,10 @@ xfs_dir2_node_to_leaf(
1890 xfs_dir2_leaf_compact(args, lbp); 1957 xfs_dir2_leaf_compact(args, lbp);
1891 else 1958 else
1892 xfs_dir2_leaf_log_header(tp, lbp); 1959 xfs_dir2_leaf_log_header(tp, lbp);
1960
1961 lbp->b_ops = &xfs_dir2_leaf1_buf_ops;
1893 leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC); 1962 leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC);
1963
1894 /* 1964 /*
1895 * Set up the leaf tail from the freespace block. 1965 * Set up the leaf tail from the freespace block.
1896 */ 1966 */
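
Both leaf formats share one verifier body in xfs_dir2_leaf.c: xfs_dir2_leaf_verify() takes the expected magic as a parameter, and the LEAF1/LEAFN read and write hooks are one-line wrappers, each bound to its own ops table. The same shape in miniature (magic values are illustrative):

#include <stdio.h>
#include <stdint.h>

#define EFSCORRUPTED 117	/* illustrative stand-in */
#define LEAF1_MAGIC 0xd2f1u	/* values for illustration only */
#define LEAFN_MAGIC 0xd2ffu

struct buf { uint16_t magic; int error; };

/* One body, parametrized by the magic the caller expects. */
static void leaf_verify(struct buf *bp, uint16_t magic)
{
	if (bp->magic != magic)
		bp->error = EFSCORRUPTED;
}

static void leaf1_verify_read(struct buf *bp)
{
	leaf_verify(bp, LEAF1_MAGIC);
}

static void leafn_verify_read(struct buf *bp)
{
	leaf_verify(bp, LEAFN_MAGIC);
}

int main(void)
{
	struct buf bp = { .magic = LEAFN_MAGIC, .error = 0 };

	leaf1_verify_read(&bp);	/* wrong format for this reader */
	printf("as leaf1: error=%d\n", bp.error);
	bp.error = 0;
	leafn_verify_read(&bp);
	printf("as leafn: error=%d\n", bp.error);
	return 0;
}
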
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 6c7052406605..5980f9b7fa9b 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -55,6 +55,74 @@ static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
55static int xfs_dir2_node_addname_int(xfs_da_args_t *args, 55static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
56 xfs_da_state_blk_t *fblk); 56 xfs_da_state_blk_t *fblk);
57 57
58static void
59xfs_dir2_free_verify(
60 struct xfs_buf *bp)
61{
62 struct xfs_mount *mp = bp->b_target->bt_mount;
63 struct xfs_dir2_free_hdr *hdr = bp->b_addr;
64 int block_ok = 0;
65
66 block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC);
67 if (!block_ok) {
68 XFS_CORRUPTION_ERROR("xfs_dir2_free_verify magic",
69 XFS_ERRLEVEL_LOW, mp, hdr);
70 xfs_buf_ioerror(bp, EFSCORRUPTED);
71 }
72}
73
74static void
75xfs_dir2_free_read_verify(
76 struct xfs_buf *bp)
77{
78 xfs_dir2_free_verify(bp);
79}
80
81static void
82xfs_dir2_free_write_verify(
83 struct xfs_buf *bp)
84{
85 xfs_dir2_free_verify(bp);
86}
87
88static const struct xfs_buf_ops xfs_dir2_free_buf_ops = {
89 .verify_read = xfs_dir2_free_read_verify,
90 .verify_write = xfs_dir2_free_write_verify,
91};
92
93
94static int
95__xfs_dir2_free_read(
96 struct xfs_trans *tp,
97 struct xfs_inode *dp,
98 xfs_dablk_t fbno,
99 xfs_daddr_t mappedbno,
100 struct xfs_buf **bpp)
101{
102 return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
103 XFS_DATA_FORK, &xfs_dir2_free_buf_ops);
104}
105
106int
107xfs_dir2_free_read(
108 struct xfs_trans *tp,
109 struct xfs_inode *dp,
110 xfs_dablk_t fbno,
111 struct xfs_buf **bpp)
112{
113 return __xfs_dir2_free_read(tp, dp, fbno, -1, bpp);
114}
115
116static int
117xfs_dir2_free_try_read(
118 struct xfs_trans *tp,
119 struct xfs_inode *dp,
120 xfs_dablk_t fbno,
121 struct xfs_buf **bpp)
122{
123 return __xfs_dir2_free_read(tp, dp, fbno, -2, bpp);
124}
125
58/* 126/*
59 * Log entries from a freespace block. 127 * Log entries from a freespace block.
60 */ 128 */
@@ -131,11 +199,12 @@ xfs_dir2_leaf_to_node(
131 /* 199 /*
132 * Get the buffer for the new freespace block. 200 * Get the buffer for the new freespace block.
133 */ 201 */
134 if ((error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp, 202 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp,
135 XFS_DATA_FORK))) { 203 XFS_DATA_FORK);
204 if (error)
136 return error; 205 return error;
137 } 206 fbp->b_ops = &xfs_dir2_free_buf_ops;
138 ASSERT(fbp != NULL); 207
139 free = fbp->b_addr; 208 free = fbp->b_addr;
140 leaf = lbp->b_addr; 209 leaf = lbp->b_addr;
141 ltp = xfs_dir2_leaf_tail_p(mp, leaf); 210 ltp = xfs_dir2_leaf_tail_p(mp, leaf);
@@ -157,7 +226,10 @@ xfs_dir2_leaf_to_node(
157 *to = cpu_to_be16(off); 226 *to = cpu_to_be16(off);
158 } 227 }
159 free->hdr.nused = cpu_to_be32(n); 228 free->hdr.nused = cpu_to_be32(n);
229
230 lbp->b_ops = &xfs_dir2_leafn_buf_ops;
160 leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); 231 leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
232
161 /* 233 /*
162 * Log everything. 234 * Log everything.
163 */ 235 */
@@ -394,12 +466,10 @@ xfs_dir2_leafn_lookup_for_addname(
394 */ 466 */
395 if (curbp) 467 if (curbp)
396 xfs_trans_brelse(tp, curbp); 468 xfs_trans_brelse(tp, curbp);
397 /* 469
398 * Read the free block. 470 error = xfs_dir2_free_read(tp, dp,
399 */
400 error = xfs_da_read_buf(tp, dp,
401 xfs_dir2_db_to_da(mp, newfdb), 471 xfs_dir2_db_to_da(mp, newfdb),
402 -1, &curbp, XFS_DATA_FORK); 472 &curbp);
403 if (error) 473 if (error)
404 return error; 474 return error;
405 free = curbp->b_addr; 475 free = curbp->b_addr;
@@ -534,9 +604,9 @@ xfs_dir2_leafn_lookup_for_entry(
534 ASSERT(state->extravalid); 604 ASSERT(state->extravalid);
535 curbp = state->extrablk.bp; 605 curbp = state->extrablk.bp;
536 } else { 606 } else {
537 error = xfs_da_read_buf(tp, dp, 607 error = xfs_dir2_data_read(tp, dp,
538 xfs_dir2_db_to_da(mp, newdb), 608 xfs_dir2_db_to_da(mp, newdb),
539 -1, &curbp, XFS_DATA_FORK); 609 -1, &curbp);
540 if (error) 610 if (error)
541 return error; 611 return error;
542 } 612 }
@@ -568,6 +638,7 @@ xfs_dir2_leafn_lookup_for_entry(
568 state->extrablk.index = (int)((char *)dep - 638 state->extrablk.index = (int)((char *)dep -
569 (char *)curbp->b_addr); 639 (char *)curbp->b_addr);
570 state->extrablk.magic = XFS_DIR2_DATA_MAGIC; 640 state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
641 curbp->b_ops = &xfs_dir2_data_buf_ops;
571 if (cmp == XFS_CMP_EXACT) 642 if (cmp == XFS_CMP_EXACT)
572 return XFS_ERROR(EEXIST); 643 return XFS_ERROR(EEXIST);
573 } 644 }
@@ -582,6 +653,7 @@ xfs_dir2_leafn_lookup_for_entry(
582 state->extrablk.index = -1; 653 state->extrablk.index = -1;
583 state->extrablk.blkno = curdb; 654 state->extrablk.blkno = curdb;
584 state->extrablk.magic = XFS_DIR2_DATA_MAGIC; 655 state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
656 curbp->b_ops = &xfs_dir2_data_buf_ops;
585 } else { 657 } else {
586 /* If the curbp is not the CI match block, drop it */ 658 /* If the curbp is not the CI match block, drop it */
587 if (state->extrablk.bp != curbp) 659 if (state->extrablk.bp != curbp)
@@ -825,6 +897,77 @@ xfs_dir2_leafn_rebalance(
825 } 897 }
826} 898}
827 899
900static int
901xfs_dir2_data_block_free(
902 xfs_da_args_t *args,
903 struct xfs_dir2_data_hdr *hdr,
904 struct xfs_dir2_free *free,
905 xfs_dir2_db_t fdb,
906 int findex,
907 struct xfs_buf *fbp,
908 int longest)
909{
910 struct xfs_trans *tp = args->trans;
911 int logfree = 0;
912
913 if (!hdr) {
914 /* One less used entry in the free table. */
915 be32_add_cpu(&free->hdr.nused, -1);
916 xfs_dir2_free_log_header(tp, fbp);
917
918 /*
919 * If this was the last entry in the table, we can trim the
920 * table size back. There might be other entries at the end
921 * referring to non-existent data blocks, get those too.
922 */
923 if (findex == be32_to_cpu(free->hdr.nvalid) - 1) {
924 int i; /* free entry index */
925
926 for (i = findex - 1; i >= 0; i--) {
927 if (free->bests[i] != cpu_to_be16(NULLDATAOFF))
928 break;
929 }
930 free->hdr.nvalid = cpu_to_be32(i + 1);
931 logfree = 0;
932 } else {
933 /* Not the last entry, just punch it out. */
934 free->bests[findex] = cpu_to_be16(NULLDATAOFF);
935 logfree = 1;
936 }
937 /*
938 * If there are no useful entries left in the block,
939 * get rid of the block if we can.
940 */
941 if (!free->hdr.nused) {
942 int error;
943
944 error = xfs_dir2_shrink_inode(args, fdb, fbp);
945 if (error == 0) {
946 fbp = NULL;
947 logfree = 0;
948 } else if (error != ENOSPC || args->total != 0)
949 return error;
950 /*
951 * It's possible to get ENOSPC if there is no
952 * space reservation. In this case someone
953 * else will eventually get rid of this block.
954 */
955 }
956 } else {
957 /*
958 * Data block is not empty, just set the free entry to the new
959 * value.
960 */
961 free->bests[findex] = cpu_to_be16(longest);
962 logfree = 1;
963 }
964
965 /* Log the free entry that changed, unless we got rid of it. */
966 if (logfree)
967 xfs_dir2_free_log_bests(tp, fbp, findex, findex);
968 return 0;
969}
970
828/* 971/*
829 * Remove an entry from a node directory. 972 * Remove an entry from a node directory.
830 * This removes the leaf entry and the data entry, 973 * This removes the leaf entry and the data entry,
@@ -908,17 +1051,16 @@ xfs_dir2_leafn_remove(
908 xfs_dir2_db_t fdb; /* freeblock block number */ 1051 xfs_dir2_db_t fdb; /* freeblock block number */
909 int findex; /* index in freeblock entries */ 1052 int findex; /* index in freeblock entries */
910 xfs_dir2_free_t *free; /* freeblock structure */ 1053 xfs_dir2_free_t *free; /* freeblock structure */
911 int logfree; /* need to log free entry */
912 1054
913 /* 1055 /*
914 * Convert the data block number to a free block, 1056 * Convert the data block number to a free block,
915 * read in the free block. 1057 * read in the free block.
916 */ 1058 */
917 fdb = xfs_dir2_db_to_fdb(mp, db); 1059 fdb = xfs_dir2_db_to_fdb(mp, db);
918 if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), 1060 error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, fdb),
919 -1, &fbp, XFS_DATA_FORK))) { 1061 &fbp);
1062 if (error)
920 return error; 1063 return error;
921 }
922 free = fbp->b_addr; 1064 free = fbp->b_addr;
923 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); 1065 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
924 ASSERT(be32_to_cpu(free->hdr.firstdb) == 1066 ASSERT(be32_to_cpu(free->hdr.firstdb) ==
@@ -954,68 +1096,12 @@ xfs_dir2_leafn_remove(
954 * If we got rid of the data block, we can eliminate that entry 1096 * If we got rid of the data block, we can eliminate that entry
955 * in the free block. 1097 * in the free block.
956 */ 1098 */
957 if (hdr == NULL) { 1099 error = xfs_dir2_data_block_free(args, hdr, free,
958 /* 1100 fdb, findex, fbp, longest);
959 * One less used entry in the free table. 1101 if (error)
960 */ 1102 return error;
961 be32_add_cpu(&free->hdr.nused, -1);
962 xfs_dir2_free_log_header(tp, fbp);
963 /*
964 * If this was the last entry in the table, we can
965 * trim the table size back. There might be other
966 * entries at the end referring to non-existent
967 * data blocks, get those too.
968 */
969 if (findex == be32_to_cpu(free->hdr.nvalid) - 1) {
970 int i; /* free entry index */
971
972 for (i = findex - 1;
973 i >= 0 &&
974 free->bests[i] == cpu_to_be16(NULLDATAOFF);
975 i--)
976 continue;
977 free->hdr.nvalid = cpu_to_be32(i + 1);
978 logfree = 0;
979 }
980 /*
981 * Not the last entry, just punch it out.
982 */
983 else {
984 free->bests[findex] = cpu_to_be16(NULLDATAOFF);
985 logfree = 1;
986 }
987 /*
988 * If there are no useful entries left in the block,
989 * get rid of the block if we can.
990 */
991 if (!free->hdr.nused) {
992 error = xfs_dir2_shrink_inode(args, fdb, fbp);
993 if (error == 0) {
994 fbp = NULL;
995 logfree = 0;
996 } else if (error != ENOSPC || args->total != 0)
997 return error;
998 /*
999 * It's possible to get ENOSPC if there is no
1000 * space reservation. In this case some one
1001 * else will eventually get rid of this block.
1002 */
1003 }
1004 }
1005 /*
1006 * Data block is not empty, just set the free entry to
1007 * the new value.
1008 */
1009 else {
1010 free->bests[findex] = cpu_to_be16(longest);
1011 logfree = 1;
1012 }
1013 /*
1014 * Log the free entry that changed, unless we got rid of it.
1015 */
1016 if (logfree)
1017 xfs_dir2_free_log_bests(tp, fbp, findex, findex);
1018 } 1103 }
1104
1019 xfs_dir2_leafn_check(dp, bp); 1105 xfs_dir2_leafn_check(dp, bp);
1020 /* 1106 /*
1021 * Return indication of whether this leaf block is empty enough 1107 * Return indication of whether this leaf block is empty enough
@@ -1169,12 +1255,11 @@ xfs_dir2_leafn_toosmall(
1169 /* 1255 /*
1170 * Read the sibling leaf block. 1256 * Read the sibling leaf block.
1171 */ 1257 */
1172 if ((error = 1258 error = xfs_dir2_leafn_read(state->args->trans, state->args->dp,
1173 xfs_da_read_buf(state->args->trans, state->args->dp, blkno, 1259 blkno, -1, &bp);
1174 -1, &bp, XFS_DATA_FORK))) { 1260 if (error)
1175 return error; 1261 return error;
1176 } 1262
1177 ASSERT(bp != NULL);
1178 /* 1263 /*
1179 * Count bytes in the two blocks combined. 1264 * Count bytes in the two blocks combined.
1180 */ 1265 */
@@ -1454,14 +1539,13 @@ xfs_dir2_node_addname_int(
1454 * This should be really rare, so there's no reason 1539 * This should be really rare, so there's no reason
1455 * to avoid it. 1540 * to avoid it.
1456 */ 1541 */
1457 if ((error = xfs_da_read_buf(tp, dp, 1542 error = xfs_dir2_free_try_read(tp, dp,
1458 xfs_dir2_db_to_da(mp, fbno), -2, &fbp, 1543 xfs_dir2_db_to_da(mp, fbno),
1459 XFS_DATA_FORK))) { 1544 &fbp);
1545 if (error)
1460 return error; 1546 return error;
1461 } 1547 if (!fbp)
1462 if (unlikely(fbp == NULL)) {
1463 continue; 1548 continue;
1464 }
1465 free = fbp->b_addr; 1549 free = fbp->b_addr;
1466 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); 1550 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
1467 findex = 0; 1551 findex = 0;
@@ -1520,9 +1604,10 @@ xfs_dir2_node_addname_int(
1520 * that was just allocated. 1604 * that was just allocated.
1521 */ 1605 */
1522 fbno = xfs_dir2_db_to_fdb(mp, dbno); 1606 fbno = xfs_dir2_db_to_fdb(mp, dbno);
1523 if (unlikely(error = xfs_da_read_buf(tp, dp, 1607 error = xfs_dir2_free_try_read(tp, dp,
1524 xfs_dir2_db_to_da(mp, fbno), -2, &fbp, 1608 xfs_dir2_db_to_da(mp, fbno),
1525 XFS_DATA_FORK))) 1609 &fbp);
1610 if (error)
1526 return error; 1611 return error;
1527 1612
1528 /* 1613 /*
@@ -1561,12 +1646,12 @@ xfs_dir2_node_addname_int(
1561 /* 1646 /*
1562 * Get a buffer for the new block. 1647 * Get a buffer for the new block.
1563 */ 1648 */
1564 if ((error = xfs_da_get_buf(tp, dp, 1649 error = xfs_da_get_buf(tp, dp,
1565 xfs_dir2_db_to_da(mp, fbno), 1650 xfs_dir2_db_to_da(mp, fbno),
1566 -1, &fbp, XFS_DATA_FORK))) { 1651 -1, &fbp, XFS_DATA_FORK);
1652 if (error)
1567 return error; 1653 return error;
1568 } 1654 fbp->b_ops = &xfs_dir2_free_buf_ops;
1569 ASSERT(fbp != NULL);
1570 1655
1571 /* 1656 /*
1572 * Initialize the new block to be empty, and remember 1657 * Initialize the new block to be empty, and remember
@@ -1630,8 +1715,8 @@ xfs_dir2_node_addname_int(
1630 /* 1715 /*
1631 * Read the data block in. 1716 * Read the data block in.
1632 */ 1717 */
1633 error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno), 1718 error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno),
1634 -1, &dbp, XFS_DATA_FORK); 1719 -1, &dbp);
1635 if (error) 1720 if (error)
1636 return error; 1721 return error;
1637 hdr = dbp->b_addr; 1722 hdr = dbp->b_addr;
@@ -1917,18 +2002,15 @@ xfs_dir2_node_trim_free(
1917 /* 2002 /*
1918 * Read the freespace block. 2003 * Read the freespace block.
1919 */ 2004 */
1920 if (unlikely(error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp, 2005 error = xfs_dir2_free_try_read(tp, dp, fo, &bp);
1921 XFS_DATA_FORK))) { 2006 if (error)
1922 return error; 2007 return error;
1923 }
1924
1925 /* 2008 /*
1926 * There can be holes in freespace. If fo is a hole, there's 2009 * There can be holes in freespace. If fo is a hole, there's
1927 * nothing to do. 2010 * nothing to do.
1928 */ 2011 */
1929 if (bp == NULL) { 2012 if (!bp)
1930 return 0; 2013 return 0;
1931 }
1932 free = bp->b_addr; 2014 free = bp->b_addr;
1933 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); 2015 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
1934 /* 2016 /*
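
The paired freespace readers differ only in the mapped block number handed to xfs_da_read_buf(): -1 demands that the mapping exist, while -2 treats a hole as success and hands back a NULL buffer, which callers such as xfs_dir2_node_trim_free() read as "nothing to do". A small model of that convention (hypothetical names; ENOENT stands in for whatever the mapping layer would return on a missing block):

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

struct buf { int blkno; };

/*
 * Model of the mappedbno convention: -1 = mapping must exist (hole is
 * an error), -2 = hole is fine, report it as success with *bpp == NULL.
 */
static int read_free_block(int blkno, int mappedbno, int is_hole,
			   struct buf **bpp)
{
	*bpp = NULL;
	if (is_hole) {
		if (mappedbno == -2)
			return 0;	/* caller sees NULL and moves on */
		return ENOENT;		/* illustrative error for a hole */
	}
	*bpp = malloc(sizeof(**bpp));
	if (!*bpp)
		return ENOMEM;
	(*bpp)->blkno = blkno;
	return 0;
}

static int free_read(int blkno, int is_hole, struct buf **bpp)
{
	return read_free_block(blkno, -1, is_hole, bpp);
}

static int free_try_read(int blkno, int is_hole, struct buf **bpp)
{
	return read_free_block(blkno, -2, is_hole, bpp);
}

int main(void)
{
	struct buf *bp;
	int error;

	error = free_try_read(7, 1, &bp);	/* hole: ok, bp == NULL */
	printf("try_read: error=%d bp=%p\n", error, (void *)bp);
	error = free_read(7, 1, &bp);		/* hole: hard error */
	printf("read:     error=%d bp=%p\n", error, (void *)bp);
	free(bp);
	return 0;
}
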
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 3523d3e15aa8..7da79f6515fd 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -30,6 +30,8 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
30 const unsigned char *name, int len); 30 const unsigned char *name, int len);
31 31
32/* xfs_dir2_block.c */ 32/* xfs_dir2_block.c */
33extern const struct xfs_buf_ops xfs_dir2_block_buf_ops;
34
33extern int xfs_dir2_block_addname(struct xfs_da_args *args); 35extern int xfs_dir2_block_addname(struct xfs_da_args *args);
34extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent, 36extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent,
35 xfs_off_t *offset, filldir_t filldir); 37 xfs_off_t *offset, filldir_t filldir);
@@ -41,10 +43,19 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
41 43
42/* xfs_dir2_data.c */ 44/* xfs_dir2_data.c */
43#ifdef DEBUG 45#ifdef DEBUG
44extern void xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); 46#define xfs_dir2_data_check(dp,bp) __xfs_dir2_data_check(dp, bp)
45#else 47#else
46#define xfs_dir2_data_check(dp,bp) 48#define xfs_dir2_data_check(dp,bp)
47#endif 49#endif
50
51extern const struct xfs_buf_ops xfs_dir2_data_buf_ops;
52
53extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
54extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
55 xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
56extern int xfs_dir2_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp,
57 xfs_dablk_t bno, xfs_daddr_t mapped_bno);
58
48extern struct xfs_dir2_data_free * 59extern struct xfs_dir2_data_free *
49xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, 60xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
50 struct xfs_dir2_data_unused *dup, int *loghead); 61 struct xfs_dir2_data_unused *dup, int *loghead);
@@ -66,6 +77,10 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp,
66 xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); 77 xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
67 78
68/* xfs_dir2_leaf.c */ 79/* xfs_dir2_leaf.c */
80extern const struct xfs_buf_ops xfs_dir2_leafn_buf_ops;
81
82extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
83 xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
69extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, 84extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
70 struct xfs_buf *dbp); 85 struct xfs_buf *dbp);
71extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); 86extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
@@ -115,6 +130,8 @@ extern int xfs_dir2_node_removename(struct xfs_da_args *args);
115extern int xfs_dir2_node_replace(struct xfs_da_args *args); 130extern int xfs_dir2_node_replace(struct xfs_da_args *args);
116extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo, 131extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
117 int *rvalp); 132 int *rvalp);
133extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
134 xfs_dablk_t fbno, struct xfs_buf **bpp);
118 135
119/* xfs_dir2_sf.c */ 136/* xfs_dir2_sf.c */
120extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp); 137extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp);
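
The header change captures the verifier split: __xfs_dir2_data_check() now returns an error and is built unconditionally so the verifiers can use it, while the old DEBUG-only hook shrinks to a macro over it. The shape of that split, sketched (names and values illustrative):

#include <stdio.h>

/* Always built: returns 0 if the structure is sane, nonzero if not. */
static int __data_check(int magic)
{
	return magic == 0x58443244 ? 0 : 117;	/* values illustrative */
}

/* DEBUG-only convenience hook; compiles away in production builds. */
#ifdef DEBUG
#define data_check(magic)	__data_check(magic)
#else
#define data_check(magic)	((void)0)
#endif

int main(void)
{
	printf("verifier path: %d\n", __data_check(0));
	data_check(0);	/* no-op unless built with -DDEBUG */
	return 0;
}
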
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index bf27fcca4843..9e1bf5294c91 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -248,7 +248,59 @@ xfs_qm_init_dquot_blk(
248 xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); 248 xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
249} 249}
250 250
251static void
252xfs_dquot_buf_verify(
253 struct xfs_buf *bp)
254{
255 struct xfs_mount *mp = bp->b_target->bt_mount;
256 struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
257 struct xfs_disk_dquot *ddq;
258 xfs_dqid_t id = 0;
259 int i;
260
261 /*
262 * On the first read of the buffer, verify that each dquot is valid.
263 * We don't know what the id of the dquot is supposed to be, just that
264 * they should be increasing monotonically within the buffer. If the
265 * first id is corrupt, then it will fail on the second dquot in the
266 * buffer so corruptions could point to the wrong dquot in this case.
267 */
268 for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
269 int error;
270
271 ddq = &d[i].dd_diskdq;
272
273 if (i == 0)
274 id = be32_to_cpu(ddq->d_id);
275
276 error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
277 "xfs_dquot_read_verify");
278 if (error) {
279 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d);
280 xfs_buf_ioerror(bp, EFSCORRUPTED);
281 break;
282 }
283 }
284}
285
286static void
287xfs_dquot_buf_read_verify(
288 struct xfs_buf *bp)
289{
290 xfs_dquot_buf_verify(bp);
291}
292
293void
294xfs_dquot_buf_write_verify(
295 struct xfs_buf *bp)
296{
297 xfs_dquot_buf_verify(bp);
298}
251 299
300const struct xfs_buf_ops xfs_dquot_buf_ops = {
301 .verify_read = xfs_dquot_buf_read_verify,
302 .verify_write = xfs_dquot_buf_write_verify,
303};
252 304
253/* 305/*
254 * Allocate a block and fill it with dquots. 306 * Allocate a block and fill it with dquots.
@@ -315,6 +367,7 @@ xfs_qm_dqalloc(
315 error = xfs_buf_geterror(bp); 367 error = xfs_buf_geterror(bp);
316 if (error) 368 if (error)
317 goto error1; 369 goto error1;
370 bp->b_ops = &xfs_dquot_buf_ops;
318 371
319 /* 372 /*
320 * Make a chunk of dquots out of this buffer and log 373 * Make a chunk of dquots out of this buffer and log
@@ -359,6 +412,51 @@ xfs_qm_dqalloc(
359 412
360 return (error); 413 return (error);
361} 414}
415STATIC int
416xfs_qm_dqrepair(
417 struct xfs_mount *mp,
418 struct xfs_trans *tp,
419 struct xfs_dquot *dqp,
420 xfs_dqid_t firstid,
421 struct xfs_buf **bpp)
422{
423 int error;
424 struct xfs_disk_dquot *ddq;
425 struct xfs_dqblk *d;
426 int i;
427
428 /*
429 * Read the buffer without verification so we get the corrupted
430 * buffer returned to us. make sure we verify it on write, though.
431 */
432 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno,
433 mp->m_quotainfo->qi_dqchunklen,
434 0, bpp, NULL);
435
436 if (error) {
437 ASSERT(*bpp == NULL);
438 return XFS_ERROR(error);
439 }
440 (*bpp)->b_ops = &xfs_dquot_buf_ops;
441
442 ASSERT(xfs_buf_islocked(*bpp));
443 d = (struct xfs_dqblk *)(*bpp)->b_addr;
444
445 /* Do the actual repair of dquots in this buffer */
446 for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
447 ddq = &d[i].dd_diskdq;
448 error = xfs_qm_dqcheck(mp, ddq, firstid + i,
449 dqp->dq_flags & XFS_DQ_ALLTYPES,
450 XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair");
451 if (error) {
452 /* repair failed, we're screwed */
453 xfs_trans_brelse(tp, *bpp);
454 return XFS_ERROR(EIO);
455 }
456 }
457
458 return 0;
459}
362 460
363/* 461/*
364 * Maps a dquot to the buffer containing its on-disk version. 462 * Maps a dquot to the buffer containing its on-disk version.
@@ -378,7 +476,6 @@ xfs_qm_dqtobp(
378 xfs_buf_t *bp; 476 xfs_buf_t *bp;
379 xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp); 477 xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp);
380 xfs_mount_t *mp = dqp->q_mount; 478 xfs_mount_t *mp = dqp->q_mount;
381 xfs_disk_dquot_t *ddq;
382 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); 479 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
383 xfs_trans_t *tp = (tpp ? *tpp : NULL); 480 xfs_trans_t *tp = (tpp ? *tpp : NULL);
384 481
@@ -439,33 +536,24 @@ xfs_qm_dqtobp(
439 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 536 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
440 dqp->q_blkno, 537 dqp->q_blkno,
441 mp->m_quotainfo->qi_dqchunklen, 538 mp->m_quotainfo->qi_dqchunklen,
442 0, &bp); 539 0, &bp, &xfs_dquot_buf_ops);
443 if (error || !bp)
444 return XFS_ERROR(error);
445 }
446
447 ASSERT(xfs_buf_islocked(bp));
448 540
449 /* 541 if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) {
450 * calculate the location of the dquot inside the buffer. 542 xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff *
451 */ 543 mp->m_quotainfo->qi_dqperchunk;
452 ddq = bp->b_addr + dqp->q_bufoffset; 544 ASSERT(bp == NULL);
545 error = xfs_qm_dqrepair(mp, tp, dqp, firstid, &bp);
546 }
453 547
454 /* 548 if (error) {
455 * A simple sanity check in case we got a corrupted dquot... 549 ASSERT(bp == NULL);
456 */ 550 return XFS_ERROR(error);
457 error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES,
458 flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN),
459 "dqtobp");
460 if (error) {
461 if (!(flags & XFS_QMOPT_DQREPAIR)) {
462 xfs_trans_brelse(tp, bp);
463 return XFS_ERROR(EIO);
464 } 551 }
465 } 552 }
466 553
554 ASSERT(xfs_buf_islocked(bp));
467 *O_bpp = bp; 555 *O_bpp = bp;
468 *O_ddpp = ddq; 556 *O_ddpp = bp->b_addr + dqp->q_bufoffset;
469 557
470 return (0); 558 return (0);
471} 559}
@@ -920,7 +1008,7 @@ xfs_qm_dqflush(
920 * Get the buffer containing the on-disk dquot 1008 * Get the buffer containing the on-disk dquot
921 */ 1009 */
922 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, 1010 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
923 mp->m_quotainfo->qi_dqchunklen, 0, &bp); 1011 mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL);
924 if (error) 1012 if (error)
925 goto out_unlock; 1013 goto out_unlock;
926 1014
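
The xfs_dquot.c changes above follow the verifier pattern used throughout this series: one shared structural check routine, thin read/write wrappers, and a const ops table that the buffer I/O paths invoke after a read completes or before a write is issued. As a reading aid, here is a minimal standalone sketch of that shape; the names, magic value, and error code are illustrative, not the kernel API:

#include <stdint.h>

#define EFSCORRUPTED	990		/* illustrative error code */
#define FOO_MAGIC	0x464f4f31u	/* hypothetical on-disk magic */

struct buf_ops;

struct buf {
	const struct buf_ops	*b_ops;
	int			b_error;	/* verifier flags errors here */
	void			*b_addr;	/* mapped buffer data */
};

struct buf_ops {
	void (*verify_read)(struct buf *bp);	/* run after a media read */
	void (*verify_write)(struct buf *bp);	/* run before a media write */
};

static void foo_buf_verify(struct buf *bp)
{
	/* structural checks shared by the read and write paths */
	uint32_t magic = *(uint32_t *)bp->b_addr;

	if (magic != FOO_MAGIC)
		bp->b_error = EFSCORRUPTED;
}

static void foo_buf_read_verify(struct buf *bp)  { foo_buf_verify(bp); }
static void foo_buf_write_verify(struct buf *bp) { foo_buf_verify(bp); }

const struct buf_ops foo_buf_ops = {
	.verify_read	= foo_buf_read_verify,
	.verify_write	= foo_buf_write_verify,
};

Attaching the table to a buffer (as bp->b_ops above) is what lets corruption surface as an error on the buffer itself rather than requiring ad hoc checks at every call site.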
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 7d20af27346d..c694a8469c4a 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -161,4 +161,6 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
 	return dqp;
 }
 
+extern const struct xfs_buf_ops xfs_dquot_buf_ops;
+
 #endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 8c6d1d70278c..a83611849cee 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -29,6 +29,7 @@
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 /*
  * Note that we only accept fileids which are long enough rather than allow
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index aa473fa640a2..67284edb84d7 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -31,6 +31,8 @@
 #include "xfs_error.h"
 #include "xfs_vnodeops.h"
 #include "xfs_da_btree.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2_priv.h"
 #include "xfs_ioctl.h"
 #include "xfs_trace.h"
 
@@ -84,7 +86,7 @@ xfs_rw_ilock_demote(
  * valid before the operation, it will be read from disk before
  * being partially zeroed.
  */
-STATIC int
+int
 xfs_iozero(
 	struct xfs_inode	*ip,	/* inode			*/
 	loff_t			pos,	/* offset in file		*/
@@ -255,15 +257,14 @@ xfs_file_aio_read(
 		xfs_buftarg_t	*target =
 			XFS_IS_REALTIME_INODE(ip) ?
 				mp->m_rtdev_targp : mp->m_ddev_targp;
-		if ((iocb->ki_pos & target->bt_smask) ||
-		    (size & target->bt_smask)) {
-			if (iocb->ki_pos == i_size_read(inode))
+		if ((pos & target->bt_smask) || (size & target->bt_smask)) {
+			if (pos == i_size_read(inode))
 				return 0;
 			return -XFS_ERROR(EINVAL);
 		}
 	}
 
-	n = mp->m_super->s_maxbytes - iocb->ki_pos;
+	n = mp->m_super->s_maxbytes - pos;
 	if (n <= 0 || size == 0)
 		return 0;
 
@@ -289,20 +290,21 @@ xfs_file_aio_read(
 		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
 
 		if (inode->i_mapping->nrpages) {
-			ret = -xfs_flushinval_pages(ip,
-					(iocb->ki_pos & PAGE_CACHE_MASK),
-					-1, FI_REMAPF_LOCKED);
+			ret = -filemap_write_and_wait_range(
+					VFS_I(ip)->i_mapping,
+					pos, -1);
 			if (ret) {
 				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
 				return ret;
 			}
+			truncate_pagecache_range(VFS_I(ip), pos, -1);
 		}
 		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
 	}
 
-	trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
+	trace_xfs_file_read(ip, size, pos, ioflags);
 
-	ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos);
+	ret = generic_file_aio_read(iocb, iovp, nr_segs, pos);
 	if (ret > 0)
 		XFS_STATS_ADD(xs_read_bytes, ret);
 
@@ -670,10 +672,11 @@ xfs_file_dio_aio_write(
 		goto out;
 
 	if (mapping->nrpages) {
-		ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
-						FI_REMAPF_LOCKED);
+		ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+						    pos, -1);
 		if (ret)
 			goto out;
+		truncate_pagecache_range(VFS_I(ip), pos, -1);
 	}
 
 	/*
@@ -728,16 +731,17 @@ xfs_file_buffered_aio_write(
 write_retry:
 	trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
 	ret = generic_file_buffered_write(iocb, iovp, nr_segs,
-			pos, &iocb->ki_pos, count, ret);
+			pos, &iocb->ki_pos, count, 0);
+
 	/*
-	 * if we just got an ENOSPC, flush the inode now we aren't holding any
-	 * page locks and retry *once*
+	 * If we just got an ENOSPC, try to write back all dirty inodes to
+	 * convert delalloc space to free up some of the excess reserved
+	 * metadata space.
 	 */
 	if (ret == -ENOSPC && !enospc) {
 		enospc = 1;
-		ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
-		if (!ret)
-			goto write_retry;
+		xfs_flush_inodes(ip->i_mount);
+		goto write_retry;
 	}
 
 	current->backing_dev_info = NULL;
@@ -889,7 +893,7 @@ xfs_dir_open(
 	 */
 	mode = xfs_ilock_map_shared(ip);
 	if (ip->i_d.di_nextents > 0)
-		xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
+		xfs_dir2_data_readahead(NULL, ip, 0, -1);
 	xfs_iunlock(ip, mode);
 	return 0;
 }
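
The buffered-write hunk above replaces the old page flush on ENOSPC with xfs_flush_inodes() and an unconditional retry, still bounded to a single attempt by the enospc flag. Reduced to its control flow, the pattern looks like the sketch below (plain C, not kernel code; write_fn and flush_all stand in for the real calls):

#include <errno.h>

/* Retry a write exactly once after ENOSPC, flushing dirty state in
 * between so delayed-allocation reservations can be freed (sketch). */
static long write_with_enospc_retry(long (*write_fn)(void *), void *arg,
				    void (*flush_all)(void))
{
	int enospc = 0;
	long ret;

retry:
	ret = write_fn(arg);
	if (ret == -ENOSPC && !enospc) {
		enospc = 1;
		flush_all();	/* was: xfs_flush_inodes(ip->i_mount) */
		goto retry;
	}
	return ret;
}

Bounding the retry to one pass keeps the worst case cheap: if the flush freed no space, the second ENOSPC is simply returned to the caller.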
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index c13fed8c394a..6dda3f949b04 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -233,7 +233,8 @@ typedef struct xfs_fsop_resblks {
 #define XFS_FSOP_GEOM_FLAGS_LOGV2	0x0100	/* log format version 2	*/
 #define XFS_FSOP_GEOM_FLAGS_SECTOR	0x0200	/* sector sizes >1BB	*/
 #define XFS_FSOP_GEOM_FLAGS_ATTR2	0x0400	/* inline attributes rework */
+#define XFS_FSOP_GEOM_FLAGS_PROJID32	0x0800	/* 32-bit project IDs	*/
 #define XFS_FSOP_GEOM_FLAGS_DIRV2CI	0x1000	/* ASCII only CI names	*/
 #define XFS_FSOP_GEOM_FLAGS_LAZYSB	0x4000	/* lazy superblock counters */
 
 
@@ -339,6 +340,35 @@ typedef struct xfs_error_injection {
 
 
 /*
+ * Speculative preallocation trimming.
+ */
+#define XFS_EOFBLOCKS_VERSION		1
+struct xfs_eofblocks {
+	__u32		eof_version;
+	__u32		eof_flags;
+	uid_t		eof_uid;
+	gid_t		eof_gid;
+	prid_t		eof_prid;
+	__u32		pad32;
+	__u64		eof_min_file_size;
+	__u64		pad64[12];
+};
+
+/* eof_flags values */
+#define XFS_EOF_FLAGS_SYNC		(1 << 0) /* sync/wait mode scan */
+#define XFS_EOF_FLAGS_UID		(1 << 1) /* filter by uid */
+#define XFS_EOF_FLAGS_GID		(1 << 2) /* filter by gid */
+#define XFS_EOF_FLAGS_PRID		(1 << 3) /* filter by project id */
+#define XFS_EOF_FLAGS_MINFILESIZE	(1 << 4) /* filter by min file size */
+#define XFS_EOF_FLAGS_VALID	\
+	(XFS_EOF_FLAGS_SYNC |	\
+	 XFS_EOF_FLAGS_UID |	\
+	 XFS_EOF_FLAGS_GID |	\
+	 XFS_EOF_FLAGS_PRID |	\
+	 XFS_EOF_FLAGS_MINFILESIZE)
+
+
+/*
  * The user-level Handle Request interface structure.
  */
 typedef struct xfs_fsop_handlereq {
@@ -456,6 +486,7 @@ typedef struct xfs_handle {
 /*	XFS_IOC_GETBIOSIZE ---- deprecated 47	   */
 #define XFS_IOC_GETBMAPX	_IOWR('X', 56, struct getbmap)
 #define XFS_IOC_ZERO_RANGE	_IOW ('X', 57, struct xfs_flock64)
+#define XFS_IOC_FREE_EOFBLOCKS	_IOR ('X', 58, struct xfs_eofblocks)
 
 /*
  * ioctl commands that replace IRIX syssgi()'s
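
The new XFS_IOC_FREE_EOFBLOCKS ioctl takes the xfs_eofblocks structure defined above; the handler itself lands elsewhere in this series. A hypothetical userspace caller could look like the following sketch (it assumes the updated XFS headers are installed and visible via <xfs/xfs.h>; the uid and size filter values are made up for illustration):

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>	/* assumed to carry the definitions added above */

/* Trim speculative post-EOF preallocation for files owned by uid 1000
 * that are at least 1 MiB in size, on the filesystem at mntpt. */
static int trim_eofblocks(const char *mntpt)
{
	struct xfs_eofblocks	eofb;
	int			fd, ret;

	fd = open(mntpt, O_RDONLY | O_DIRECTORY);
	if (fd < 0)
		return -1;

	memset(&eofb, 0, sizeof(eofb));
	eofb.eof_version = XFS_EOFBLOCKS_VERSION;
	eofb.eof_flags = XFS_EOF_FLAGS_UID | XFS_EOF_FLAGS_MINFILESIZE;
	eofb.eof_uid = 1000;
	eofb.eof_min_file_size = 1024 * 1024;

	ret = ioctl(fd, XFS_IOC_FREE_EOFBLOCKS, &eofb);
	close(fd);
	return ret;
}

The versioned, padded layout (eof_version plus pad32/pad64) is what allows the filter set to grow later without breaking the ioctl ABI.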
diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
deleted file mode 100644
index 652b875a9d4c..000000000000
--- a/fs/xfs/xfs_fs_subr.c
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_vnodeops.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_trace.h"
-
-/*
- * note: all filemap functions return negative error codes. These
- * need to be inverted before returning to the xfs core functions.
- */
-void
-xfs_tosspages(
-	xfs_inode_t	*ip,
-	xfs_off_t	first,
-	xfs_off_t	last,
-	int		fiopt)
-{
-	/* can't toss partial tail pages, so mask them out */
-	last &= ~(PAGE_SIZE - 1);
-	truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
-}
-
-int
-xfs_flushinval_pages(
-	xfs_inode_t	*ip,
-	xfs_off_t	first,
-	xfs_off_t	last,
-	int		fiopt)
-{
-	struct address_space *mapping = VFS_I(ip)->i_mapping;
-	int		ret = 0;
-
-	trace_xfs_pagecache_inval(ip, first, last);
-
-	xfs_iflags_clear(ip, XFS_ITRUNCATED);
-	ret = filemap_write_and_wait_range(mapping, first,
-				last == -1 ? LLONG_MAX : last);
-	if (!ret)
-		truncate_inode_pages_range(mapping, first, last);
-	return -ret;
-}
-
-int
-xfs_flush_pages(
-	xfs_inode_t	*ip,
-	xfs_off_t	first,
-	xfs_off_t	last,
-	uint64_t	flags,
-	int		fiopt)
-{
-	struct address_space *mapping = VFS_I(ip)->i_mapping;
-	int		ret = 0;
-	int		ret2;
-
-	xfs_iflags_clear(ip, XFS_ITRUNCATED);
-	ret = -filemap_fdatawrite_range(mapping, first,
-				last == -1 ? LLONG_MAX : last);
-	if (flags & XBF_ASYNC)
-		return ret;
-	ret2 = xfs_wait_on_pages(ip, first, last);
-	if (!ret)
-		ret = ret2;
-	return ret;
-}
-
-int
-xfs_wait_on_pages(
-	xfs_inode_t	*ip,
-	xfs_off_t	first,
-	xfs_off_t	last)
-{
-	struct address_space *mapping = VFS_I(ip)->i_mapping;
-
-	if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
-		return -filemap_fdatawait_range(mapping, first,
-			last == -1 ? XFS_ISIZE(ip) - 1 : last);
-	}
-	return 0;
-}
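
With these wrappers removed, the callers (see the xfs_file.c hunks earlier) use the generic pagecache helpers directly. The idiom they reduce to is sketched below in kernel-context C; this is an illustration of the shape, not a drop-in helper:

#include <linux/fs.h>
#include <linux/mm.h>

/* Write back and wait on dirty pages from pos to EOF, then drop the
 * now-clean cached pages in the same range (sketch of the idiom that
 * replaces xfs_flushinval_pages() at its call sites). */
static int flush_and_invalidate(struct inode *inode, loff_t pos)
{
	int error;

	error = filemap_write_and_wait_range(inode->i_mapping, pos, -1);
	if (error)
		return error;

	truncate_pagecache_range(inode, pos, -1);
	return 0;
}

Calling the generic helpers directly also drops the sign-inversion dance the old wrappers needed, since the filemap functions already return negative errnos.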
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 4beaede43277..94eaeedc5498 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -97,7 +97,9 @@ xfs_fs_geometry(
 			(xfs_sb_version_haslazysbcount(&mp->m_sb) ?
 				XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) |
 			(xfs_sb_version_hasattr2(&mp->m_sb) ?
-				XFS_FSOP_GEOM_FLAGS_ATTR2 : 0);
+				XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) |
+			(xfs_sb_version_hasprojid32bit(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_PROJID32 : 0);
 		geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
 			mp->m_sb.sb_logsectsize : BBSIZE;
 		geo->rtsectsize = mp->m_sb.sb_blocksize;
@@ -112,18 +114,40 @@ xfs_fs_geometry(
 	return 0;
 }
 
+static struct xfs_buf *
+xfs_growfs_get_hdr_buf(
+	struct xfs_mount	*mp,
+	xfs_daddr_t		blkno,
+	size_t			numblks,
+	int			flags,
+	const struct xfs_buf_ops *ops)
+{
+	struct xfs_buf		*bp;
+
+	bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags);
+	if (!bp)
+		return NULL;
+
+	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+	bp->b_bn = blkno;
+	bp->b_maps[0].bm_bn = blkno;
+	bp->b_ops = ops;
+
+	return bp;
+}
+
 static int
 xfs_growfs_data_private(
 	xfs_mount_t		*mp,		/* mount point for filesystem */
 	xfs_growfs_data_t	*in)		/* growfs data input struct */
 {
 	xfs_agf_t		*agf;
+	struct xfs_agfl		*agfl;
 	xfs_agi_t		*agi;
 	xfs_agnumber_t		agno;
 	xfs_extlen_t		agsize;
 	xfs_extlen_t		tmpsize;
 	xfs_alloc_rec_t		*arec;
-	struct xfs_btree_block	*block;
 	xfs_buf_t		*bp;
 	int			bucket;
 	int			dpct;
@@ -146,9 +170,14 @@ xfs_growfs_data_private(
 	dpct = pct - mp->m_sb.sb_imax_pct;
 	bp = xfs_buf_read_uncached(mp->m_ddev_targp,
 				XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
-				XFS_FSS_TO_BB(mp, 1), 0);
+				XFS_FSS_TO_BB(mp, 1), 0, NULL);
 	if (!bp)
 		return EIO;
+	if (bp->b_error) {
+		int	error = bp->b_error;
+		xfs_buf_relse(bp);
+		return error;
+	}
 	xfs_buf_relse(bp);
 
 	new = nb;	/* use new as a temporary here */
@@ -186,17 +215,18 @@
 	nfree = 0;
 	for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
 		/*
-		 * AG freelist header block
+		 * AG freespace header block
 		 */
-		bp = xfs_buf_get(mp->m_ddev_targp,
-			  XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
-			  XFS_FSS_TO_BB(mp, 1), 0);
+		bp = xfs_growfs_get_hdr_buf(mp,
+			  XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
+			  XFS_FSS_TO_BB(mp, 1), 0,
+			  &xfs_agf_buf_ops);
 		if (!bp) {
 			error = ENOMEM;
 			goto error0;
 		}
+
 		agf = XFS_BUF_TO_AGF(bp);
-		memset(agf, 0, mp->m_sb.sb_sectsize);
 		agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
 		agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
 		agf->agf_seqno = cpu_to_be32(agno);
@@ -223,17 +253,39 @@
 			goto error0;
 
 		/*
+		 * AG freelist header block
+		 */
+		bp = xfs_growfs_get_hdr_buf(mp,
+			  XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
+			  XFS_FSS_TO_BB(mp, 1), 0,
+			  &xfs_agfl_buf_ops);
+		if (!bp) {
+			error = ENOMEM;
+			goto error0;
+		}
+
+		agfl = XFS_BUF_TO_AGFL(bp);
+		for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++)
+			agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
+
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
+			goto error0;
+
+		/*
 		 * AG inode header block
 		 */
-		bp = xfs_buf_get(mp->m_ddev_targp,
+		bp = xfs_growfs_get_hdr_buf(mp,
 			  XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
-			  XFS_FSS_TO_BB(mp, 1), 0);
+			  XFS_FSS_TO_BB(mp, 1), 0,
+			  &xfs_agi_buf_ops);
 		if (!bp) {
 			error = ENOMEM;
 			goto error0;
 		}
+
 		agi = XFS_BUF_TO_AGI(bp);
-		memset(agi, 0, mp->m_sb.sb_sectsize);
 		agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
 		agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
 		agi->agi_seqno = cpu_to_be32(agno);
@@ -254,24 +306,22 @@
 		/*
 		 * BNO btree root block
 		 */
-		bp = xfs_buf_get(mp->m_ddev_targp,
-			XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
-			BTOBB(mp->m_sb.sb_blocksize), 0);
+		bp = xfs_growfs_get_hdr_buf(mp,
+			XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
+			BTOBB(mp->m_sb.sb_blocksize), 0,
+			&xfs_allocbt_buf_ops);
+
 		if (!bp) {
 			error = ENOMEM;
 			goto error0;
 		}
-		block = XFS_BUF_TO_BLOCK(bp);
-		memset(block, 0, mp->m_sb.sb_blocksize);
-		block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC);
-		block->bb_level = 0;
-		block->bb_numrecs = cpu_to_be16(1);
-		block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-		block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-		arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
+
+		xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0);
+		arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
 		arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
 		arec->ar_blockcount = cpu_to_be32(
 			agsize - be32_to_cpu(arec->ar_startblock));
+
 		error = xfs_bwrite(bp);
 		xfs_buf_relse(bp);
 		if (error)
@@ -280,25 +330,22 @@
 		/*
 		 * CNT btree root block
 		 */
-		bp = xfs_buf_get(mp->m_ddev_targp,
-			XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
-			BTOBB(mp->m_sb.sb_blocksize), 0);
+		bp = xfs_growfs_get_hdr_buf(mp,
+			XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
+			BTOBB(mp->m_sb.sb_blocksize), 0,
+			&xfs_allocbt_buf_ops);
 		if (!bp) {
 			error = ENOMEM;
 			goto error0;
 		}
-		block = XFS_BUF_TO_BLOCK(bp);
-		memset(block, 0, mp->m_sb.sb_blocksize);
-		block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC);
-		block->bb_level = 0;
-		block->bb_numrecs = cpu_to_be16(1);
-		block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-		block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-		arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
+
+		xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0);
+		arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
 		arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
 		arec->ar_blockcount = cpu_to_be32(
 			agsize - be32_to_cpu(arec->ar_startblock));
 		nfree += be32_to_cpu(arec->ar_blockcount);
+
 		error = xfs_bwrite(bp);
 		xfs_buf_relse(bp);
 		if (error)
@@ -307,20 +354,17 @@
 		/*
 		 * INO btree root block
 		 */
-		bp = xfs_buf_get(mp->m_ddev_targp,
-			XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
-			BTOBB(mp->m_sb.sb_blocksize), 0);
+		bp = xfs_growfs_get_hdr_buf(mp,
+			XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
+			BTOBB(mp->m_sb.sb_blocksize), 0,
+			&xfs_inobt_buf_ops);
 		if (!bp) {
 			error = ENOMEM;
 			goto error0;
 		}
-		block = XFS_BUF_TO_BLOCK(bp);
-		memset(block, 0, mp->m_sb.sb_blocksize);
-		block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC);
-		block->bb_level = 0;
-		block->bb_numrecs = 0;
-		block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-		block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+
+		xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0);
+
 		error = xfs_bwrite(bp);
 		xfs_buf_relse(bp);
 		if (error)
@@ -408,14 +452,16 @@
 		if (agno < oagcount) {
 			error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
 				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
-				  XFS_FSS_TO_BB(mp, 1), 0, &bp);
+				  XFS_FSS_TO_BB(mp, 1), 0, &bp,
+				  &xfs_sb_buf_ops);
 		} else {
 			bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp,
 				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
 				  XFS_FSS_TO_BB(mp, 1), 0);
-			if (bp)
+			if (bp) {
+				bp->b_ops = &xfs_sb_buf_ops;
 				xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
-			else
+			} else
 				error = ENOMEM;
 		}
 
@@ -426,6 +472,7 @@
 			break;
 		}
 		xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS);
+
 		/*
 		 * If we get an error writing out the alternate superblocks,
 		 * just issue a warning and continue.  The real work is
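
One behavioural detail in the hunks above: the initial uncached read of the last sector is now checked for an error flagged on the buffer, not just for a NULL return. A sketch of that consuming pattern (XFS-internal types; this mirrors the hunk rather than adding new API):

/* A NULL return means no buffer could be set up at all, while
 * bp->b_error carries I/O (and, where ops are attached, verifier)
 * failures that previously went unchecked. */
static int
read_check_release_uncached(
	struct xfs_mount	*mp,
	xfs_daddr_t		daddr,
	size_t			numblks)
{
	struct xfs_buf		*bp;

	bp = xfs_buf_read_uncached(mp->m_ddev_targp, daddr, numblks, 0, NULL);
	if (!bp)
		return EIO;		/* no buffer at all */
	if (bp->b_error) {		/* error flagged on the buffer */
		int error = bp->b_error;

		xfs_buf_relse(bp);
		return error;
	}
	xfs_buf_relse(bp);
	return 0;
}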
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 76e81cff70b9..5399ef222dd7 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -21,7 +21,8 @@
 /*
  * Tunable XFS parameters.  xfs_params is required even when CONFIG_SYSCTL=n,
  * other XFS code uses these values.  Times are measured in centisecs (i.e.
- * 100ths of a second).
+ * 100ths of a second) with the exception of eofb_timer, which is measured in
+ * seconds.
  */
 xfs_param_t xfs_params = {
 			  /* MIN		DFLT		MAX */
@@ -40,4 +41,5 @@ xfs_param_t xfs_params = {
 	.rotorstep	= {	1,		1,		255	},
 	.inherit_nodfrg	= {	0,		1,		1	},
 	.fstrm_timer	= {	1,		30*100,		3600*100},
+	.eofb_timer	= {	1,		300,		3600*24},
 };
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index c5c4ef4f2bdb..a815412eab80 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -200,7 +200,8 @@ xfs_ialloc_inode_init(
 		 */
 		d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
 		fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
-					 mp->m_bsize * blks_per_cluster, 0);
+					 mp->m_bsize * blks_per_cluster,
+					 XBF_UNMAPPED);
 		if (!fbuf)
 			return ENOMEM;
 		/*
@@ -210,6 +211,7 @@
 		 * to log a whole cluster of inodes instead of all the
 		 * individual transactions causing a lot of log traffic.
 		 */
+		fbuf->b_ops = &xfs_inode_buf_ops;
 		xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
 		for (i = 0; i < ninodes; i++) {
 			int	ioffset = i << mp->m_sb.sb_inodelog;
@@ -877,9 +879,9 @@ error0:
 * This function is designed to be called twice if it has to do an allocation
 * to make more free inodes.  On the first call, *IO_agbp should be set to NULL.
 * If an inode is available without having to perform an allocation, an inode
- * number is returned.  In this case, *IO_agbp would be NULL.  If an allocation
- * needes to be done, xfs_dialloc would return the current AGI buffer in
- * *IO_agbp.  The caller should then commit the current transaction, allocate a
+ * number is returned.  In this case, *IO_agbp is set to NULL.  If an allocation
+ * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp.
+ * The caller should then commit the current transaction, allocate a
 * new transaction, and call xfs_dialloc() again, passing in the previous value
 * of *IO_agbp.  IO_agbp should be held across the transactions. Since the AGI
 * buffer is locked across the two calls, the second call is guaranteed to have
@@ -1472,6 +1474,57 @@ xfs_check_agi_unlinked(
 #define xfs_check_agi_unlinked(agi)
 #endif
 
+static void
+xfs_agi_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_agi	*agi = XFS_BUF_TO_AGI(bp);
+	int		agi_ok;
+
+	/*
+	 * Validate the magic number of the agi block.
+	 */
+	agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) &&
+		XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
+
+	/*
+	 * During growfs operations, the perag is not fully initialised,
+	 * so we can't use it for any useful checking. growfs ensures we can't
+	 * use it by using uncached buffers that don't have the perag attached
+	 * so we can detect and avoid this problem.
+	 */
+	if (bp->b_pag)
+		agi_ok = agi_ok && be32_to_cpu(agi->agi_seqno) ==
+						bp->b_pag->pag_agno;
+
+	if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
+			XFS_RANDOM_IALLOC_READ_AGI))) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+	xfs_check_agi_unlinked(agi);
+}
+
+static void
+xfs_agi_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_agi_verify(bp);
+}
+
+static void
+xfs_agi_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_agi_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_agi_buf_ops = {
+	.verify_read = xfs_agi_read_verify,
+	.verify_write = xfs_agi_write_verify,
+};
+
 /*
 * Read in the allocation group header (inode allocation section)
 */
@@ -1482,38 +1535,18 @@ xfs_read_agi(
 	xfs_agnumber_t		agno,	/* allocation group number */
 	struct xfs_buf		**bpp)	/* allocation group hdr buf */
 {
-	struct xfs_agi		*agi;	/* allocation group header */
-	int			agi_ok;	/* agi is consistent */
 	int			error;
 
 	ASSERT(agno != NULLAGNUMBER);
 
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 			XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
-			XFS_FSS_TO_BB(mp, 1), 0, bpp);
+			XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
 	if (error)
 		return error;
 
 	ASSERT(!xfs_buf_geterror(*bpp));
-	agi = XFS_BUF_TO_AGI(*bpp);
-
-	/*
-	 * Validate the magic number of the agi block.
-	 */
-	agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) &&
-		XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) &&
-		be32_to_cpu(agi->agi_seqno) == agno;
-	if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
-			XFS_RANDOM_IALLOC_READ_AGI))) {
-		XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW,
-				     mp, agi);
-		xfs_trans_brelse(tp, *bpp);
-		return XFS_ERROR(EFSCORRUPTED);
-	}
-
 	xfs_buf_set_ref(*bpp, XFS_AGI_REF);
-
-	xfs_check_agi_unlinked(agi);
 	return 0;
 }
 
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 1fd6ea4e9c91..c8da3df271e6 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -147,7 +147,9 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
 /*
  * Get the data from the pointed-to record.
  */
-extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
+int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
 		xfs_inobt_rec_incore_t *rec, int *stat);
 
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
+
 #endif	/* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 2b8b7a37aa18..bec344b36507 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -33,6 +33,7 @@
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
+#include "xfs_trace.h"
 
 
 STATIC int
@@ -181,6 +182,59 @@ xfs_inobt_key_diff(
 		cur->bc_rec.i.ir_startino;
 }
 
+void
+xfs_inobt_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	unsigned int		level;
+	int			sblock_ok; /* block passes checks */
+
+	/* magic number and level verification */
+	level = be16_to_cpu(block->bb_level);
+	sblock_ok = block->bb_magic == cpu_to_be32(XFS_IBT_MAGIC) &&
+		    level < mp->m_in_maxlevels;
+
+	/* numrecs verification */
+	sblock_ok = sblock_ok &&
+		be16_to_cpu(block->bb_numrecs) <= mp->m_inobt_mxr[level != 0];
+
+	/* sibling pointer verification */
+	sblock_ok = sblock_ok &&
+		(block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
+		 be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) &&
+		block->bb_u.s.bb_leftsib &&
+		(block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
+		 be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) &&
+		block->bb_u.s.bb_rightsib;
+
+	if (!sblock_ok) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+}
+
+static void
+xfs_inobt_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_inobt_verify(bp);
+}
+
+static void
+xfs_inobt_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_inobt_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_inobt_buf_ops = {
+	.verify_read = xfs_inobt_read_verify,
+	.verify_write = xfs_inobt_write_verify,
+};
+
 #ifdef DEBUG
 STATIC int
 xfs_inobt_keys_inorder(
@@ -218,6 +272,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
 	.init_rec_from_cur	= xfs_inobt_init_rec_from_cur,
 	.init_ptr_from_cur	= xfs_inobt_init_ptr_from_cur,
 	.key_diff		= xfs_inobt_key_diff,
+	.buf_ops		= &xfs_inobt_buf_ops,
 #ifdef DEBUG
 	.keys_inorder		= xfs_inobt_keys_inorder,
 	.recs_inorder		= xfs_inobt_recs_inorder,
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index f782ad0c4769..25c0239a8eab 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -109,4 +109,6 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
 		struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t);
 extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
 
+extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+
 #endif	/* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_icache.c
index 9500caf15acf..96e344e3e927 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_icache.c
@@ -19,6 +19,7 @@
 #include "xfs_fs.h"
 #include "xfs_types.h"
 #include "xfs_log.h"
+#include "xfs_log_priv.h"
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
@@ -35,11 +36,425 @@
 #include "xfs_quota.h"
 #include "xfs_trace.h"
 #include "xfs_fsops.h"
+#include "xfs_icache.h"
 
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 
-struct workqueue_struct *xfs_syncd_wq;	/* sync workqueue */
+STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
+				struct xfs_perag *pag, struct xfs_inode *ip);
+
+/*
+ * Allocate and initialise an xfs_inode.
+ */
+STATIC struct xfs_inode *
+xfs_inode_alloc(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino)
+{
+	struct xfs_inode	*ip;
+
+	/*
+	 * If this didn't occur in transactions, we could use
+	 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
+	 * code up to do this anyway.
+	 */
+	ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
+	if (!ip)
+		return NULL;
+	if (inode_init_always(mp->m_super, VFS_I(ip))) {
+		kmem_zone_free(xfs_inode_zone, ip);
+		return NULL;
+	}
+
+	ASSERT(atomic_read(&ip->i_pincount) == 0);
+	ASSERT(!spin_is_locked(&ip->i_flags_lock));
+	ASSERT(!xfs_isiflocked(ip));
+	ASSERT(ip->i_ino == 0);
+
+	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+
+	/* initialise the xfs inode */
+	ip->i_ino = ino;
+	ip->i_mount = mp;
+	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
+	ip->i_afp = NULL;
+	memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
+	ip->i_flags = 0;
+	ip->i_delayed_blks = 0;
+	memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
+
+	return ip;
+}
+
+STATIC void
+xfs_inode_free_callback(
+	struct rcu_head		*head)
+{
+	struct inode		*inode = container_of(head, struct inode, i_rcu);
+	struct xfs_inode	*ip = XFS_I(inode);
+
+	kmem_zone_free(xfs_inode_zone, ip);
+}
+
+STATIC void
+xfs_inode_free(
+	struct xfs_inode	*ip)
+{
+	switch (ip->i_d.di_mode & S_IFMT) {
+	case S_IFREG:
+	case S_IFDIR:
+	case S_IFLNK:
+		xfs_idestroy_fork(ip, XFS_DATA_FORK);
+		break;
+	}
+
+	if (ip->i_afp)
+		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+
+	if (ip->i_itemp) {
+		ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
+		xfs_inode_item_destroy(ip);
+		ip->i_itemp = NULL;
+	}
+
+	/* asserts to verify all state is correct here */
+	ASSERT(atomic_read(&ip->i_pincount) == 0);
+	ASSERT(!spin_is_locked(&ip->i_flags_lock));
+	ASSERT(!xfs_isiflocked(ip));
+
+	/*
+	 * Because we use RCU freeing we need to ensure the inode always
+	 * appears to be reclaimed with an invalid inode number when in the
+	 * free state. The ip->i_flags_lock provides the barrier against lookup
+	 * races.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	ip->i_flags = XFS_IRECLAIM;
+	ip->i_ino = 0;
+	spin_unlock(&ip->i_flags_lock);
+
+	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+}
+
+/*
+ * Check the validity of the inode we just found in the cache.
+ */
+static int
+xfs_iget_cache_hit(
+	struct xfs_perag	*pag,
+	struct xfs_inode	*ip,
+	xfs_ino_t		ino,
+	int			flags,
+	int			lock_flags) __releases(RCU)
+{
+	struct inode		*inode = VFS_I(ip);
+	struct xfs_mount	*mp = ip->i_mount;
+	int			error;
+
+	/*
+	 * Check for re-use of an inode within an RCU grace period due to the
+	 * radix tree nodes not being updated yet. We monitor for this by
+	 * setting the inode number to zero before freeing the inode structure.
+	 * If the inode has been reallocated and set up, then the inode number
+	 * will not match, so check for that, too.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (ip->i_ino != ino) {
+		trace_xfs_iget_skip(ip);
+		XFS_STATS_INC(xs_ig_frecycle);
+		error = EAGAIN;
+		goto out_error;
+	}
+
+
+	/*
+	 * If we are racing with another cache hit that is currently
+	 * instantiating this inode or currently recycling it out of
+	 * reclaimable state, wait for the initialisation to complete
+	 * before continuing.
+	 *
+	 * XXX(hch): eventually we should do something equivalent to
+	 *	     wait_on_inode to wait for these flags to be cleared
+	 *	     instead of polling for it.
+	 */
+	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
+		trace_xfs_iget_skip(ip);
+		XFS_STATS_INC(xs_ig_frecycle);
+		error = EAGAIN;
+		goto out_error;
+	}
+
+	/*
+	 * If lookup is racing with unlink return an error immediately.
+	 */
+	if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+		error = ENOENT;
+		goto out_error;
+	}
+
+	/*
+	 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
+	 * Need to carefully get it back into useable state.
+	 */
+	if (ip->i_flags & XFS_IRECLAIMABLE) {
+		trace_xfs_iget_reclaim(ip);
+
+		/*
+		 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
+		 * from stomping over us while we recycle the inode. We can't
+		 * clear the radix tree reclaimable tag yet as it requires
+		 * pag_ici_lock to be held exclusive.
+		 */
+		ip->i_flags |= XFS_IRECLAIM;
+
+		spin_unlock(&ip->i_flags_lock);
+		rcu_read_unlock();
+
+		error = -inode_init_always(mp->m_super, inode);
+		if (error) {
+			/*
+			 * Re-initializing the inode failed, and we are in deep
+			 * trouble.  Try to re-add it to the reclaim list.
+			 */
+			rcu_read_lock();
+			spin_lock(&ip->i_flags_lock);
+
+			ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
+			ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
+			trace_xfs_iget_reclaim_fail(ip);
+			goto out_error;
+		}
+
+		spin_lock(&pag->pag_ici_lock);
+		spin_lock(&ip->i_flags_lock);
+
+		/*
+		 * Clear the per-lifetime state in the inode as we are now
+		 * effectively a new inode and need to return to the initial
+		 * state before reuse occurs.
+		 */
+		ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
+		ip->i_flags |= XFS_INEW;
+		__xfs_inode_clear_reclaim_tag(mp, pag, ip);
+		inode->i_state = I_NEW;
+
+		ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+		mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+
+		spin_unlock(&ip->i_flags_lock);
+		spin_unlock(&pag->pag_ici_lock);
+	} else {
+		/* If the VFS inode is being torn down, pause and try again. */
+		if (!igrab(inode)) {
+			trace_xfs_iget_skip(ip);
+			error = EAGAIN;
+			goto out_error;
+		}
+
+		/* We've got a live one. */
+		spin_unlock(&ip->i_flags_lock);
+		rcu_read_unlock();
+		trace_xfs_iget_hit(ip);
+	}
+
+	if (lock_flags != 0)
+		xfs_ilock(ip, lock_flags);
+
+	xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
+	XFS_STATS_INC(xs_ig_found);
+
+	return 0;
+
+out_error:
+	spin_unlock(&ip->i_flags_lock);
+	rcu_read_unlock();
+	return error;
+}
+
+
+static int
+xfs_iget_cache_miss(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
+	xfs_trans_t		*tp,
+	xfs_ino_t		ino,
+	struct xfs_inode	**ipp,
+	int			flags,
+	int			lock_flags)
+{
+	struct xfs_inode	*ip;
+	int			error;
+	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
+	int			iflags;
+
+	ip = xfs_inode_alloc(mp, ino);
+	if (!ip)
+		return ENOMEM;
+
+	error = xfs_iread(mp, tp, ip, flags);
+	if (error)
+		goto out_destroy;
+
+	trace_xfs_iget_miss(ip);
+
+	if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
+		error = ENOENT;
+		goto out_destroy;
+	}
+
+	/*
+	 * Preload the radix tree so we can insert safely under the
+	 * write spinlock. Note that we cannot sleep inside the preload
+	 * region. Since we can be called from transaction context, don't
+	 * recurse into the file system.
+	 */
+	if (radix_tree_preload(GFP_NOFS)) {
+		error = EAGAIN;
+		goto out_destroy;
+	}
+
+	/*
+	 * Because the inode hasn't been added to the radix-tree yet it can't
+	 * be found by another thread, so we can do the non-sleeping lock here.
+	 */
+	if (lock_flags) {
+		if (!xfs_ilock_nowait(ip, lock_flags))
+			BUG();
+	}
+
+	/*
+	 * These values must be set before inserting the inode into the radix
+	 * tree as the moment it is inserted a concurrent lookup (allowed by the
+	 * RCU locking mechanism) can find it and that lookup must see that this
+	 * is an inode currently under construction (i.e. that XFS_INEW is set).
+	 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
+	 * memory barrier that ensures this detection works correctly at lookup
+	 * time.
+	 */
+	iflags = XFS_INEW;
+	if (flags & XFS_IGET_DONTCACHE)
+		iflags |= XFS_IDONTCACHE;
+	ip->i_udquot = ip->i_gdquot = NULL;
+	xfs_iflags_set(ip, iflags);
+
+	/* insert the new inode */
+	spin_lock(&pag->pag_ici_lock);
+	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
+	if (unlikely(error)) {
+		WARN_ON(error != -EEXIST);
+		XFS_STATS_INC(xs_ig_dup);
+		error = EAGAIN;
+		goto out_preload_end;
+	}
+	spin_unlock(&pag->pag_ici_lock);
+	radix_tree_preload_end();
+
+	*ipp = ip;
+	return 0;
+
+out_preload_end:
+	spin_unlock(&pag->pag_ici_lock);
+	radix_tree_preload_end();
+	if (lock_flags)
+		xfs_iunlock(ip, lock_flags);
+out_destroy:
+	__destroy_inode(VFS_I(ip));
+	xfs_inode_free(ip);
+	return error;
+}
+
+/*
+ * Look up an inode by number in the given file system.
+ * The inode is looked up in the cache held in each AG.
+ * If the inode is found in the cache, initialise the vfs inode
+ * if necessary.
+ *
+ * If it is not in core, read it in from the file system's device,
+ * add it to the cache and initialise the vfs inode.
+ *
+ * The inode is locked according to the value of the lock_flags parameter.
+ * This flag parameter indicates how and if the inode's IO lock and inode lock
+ * should be taken.
+ *
+ * mp -- the mount point structure for the current file system.  It points
+ *       to the inode hash table.
+ * tp -- a pointer to the current transaction if there is one.  This is
+ *       simply passed through to the xfs_iread() call.
+ * ino -- the number of the inode desired.  This is the unique identifier
+ *        within the file system for the inode being requested.
+ * lock_flags -- flags indicating how to lock the inode.  See the comment
+ *		 for xfs_ilock() for a list of valid values.
+ */
+int
+xfs_iget(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_ino_t	ino,
+	uint		flags,
+	uint		lock_flags,
+	xfs_inode_t	**ipp)
+{
+	xfs_inode_t	*ip;
+	int		error;
+	xfs_perag_t	*pag;
+	xfs_agino_t	agino;
+
+	/*
+	 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
+	 * doesn't get freed while it's being referenced during a
+	 * radix tree traversal here.  It assumes this function
+	 * acquires only the ILOCK (and therefore it has no need to
+	 * involve the IOLOCK in this synchronization).
+	 */
+	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
+
+	/* reject inode numbers outside existing AGs */
+	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
+		return EINVAL;
+
+	/* get the perag structure and ensure that it's inode capable */
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
+	agino = XFS_INO_TO_AGINO(mp, ino);
+
+again:
+	error = 0;
+	rcu_read_lock();
+	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+
+	if (ip) {
+		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
+		if (error)
+			goto out_error_or_again;
+	} else {
+		rcu_read_unlock();
+		XFS_STATS_INC(xs_ig_missed);
+
+		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
+							flags, lock_flags);
+		if (error)
+			goto out_error_or_again;
+	}
+	xfs_perag_put(pag);
+
+	*ipp = ip;
+
+	/*
+	 * If we have a real type for an on-disk inode, we can set ops(&unlock)
+	 * now.  If it's a new inode being created, xfs_ialloc will handle it.
+	 */
+	if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
+		xfs_setup_inode(ip);
+	return 0;
+
+out_error_or_again:
+	if (error == EAGAIN) {
+		delay(1);
+		goto again;
+	}
+	xfs_perag_put(pag);
+	return error;
+}
 
 /*
  * The inode lookup is done in batches to keep the amount of lock traffic and
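
xfs_iget(), moved here from xfs_iget.c, funnels every transient race it can hit (an inode being recycled, a duplicate radix-tree insert, a VFS inode being torn down) into EAGAIN and restarts the whole lookup after a short delay. The retry skeleton, as a self-contained sketch with illustrative names rather than the XFS functions:

#include <errno.h>

struct lookup_ops {
	int (*try_lookup)(unsigned long key, void **objp); /* one full attempt */
	void (*backoff)(void);		/* e.g. delay(1) in the kernel code */
};

/* Every transient race maps to EAGAIN inside try_lookup(); the caller
 * just backs off briefly and runs the whole lookup again. */
static int lookup_with_retry(const struct lookup_ops *ops,
			     unsigned long key, void **objp)
{
	int error;

	do {
		error = ops->try_lookup(key, objp);
		if (error == EAGAIN)
			ops->backoff();
	} while (error == EAGAIN);

	return error;
}

Restarting from the top is what makes the RCU-protected fast path safe: the retry re-runs the radix-tree lookup, so a recycled or freed inode is simply seen (or not seen) afresh.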
@@ -101,8 +516,11 @@ xfs_inode_ag_walk(
 	struct xfs_mount	*mp,
 	struct xfs_perag	*pag,
 	int			(*execute)(struct xfs_inode *ip,
-					   struct xfs_perag *pag, int flags),
-	int			flags)
+					   struct xfs_perag *pag, int flags,
+					   void *args),
+	int			flags,
+	void			*args,
+	int			tag)
 {
 	uint32_t		first_index;
 	int			last_error = 0;
@@ -121,9 +539,17 @@ restart:
 		int		i;
 
 		rcu_read_lock();
-		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+
+		if (tag == -1)
+			nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
 					(void **)batch, first_index,
 					XFS_LOOKUP_BATCH);
+		else
+			nr_found = radix_tree_gang_lookup_tag(
+					&pag->pag_ici_root,
+					(void **) batch, first_index,
+					XFS_LOOKUP_BATCH, tag);
+
 		if (!nr_found) {
 			rcu_read_unlock();
 			break;
@@ -164,7 +590,7 @@ restart:
 		for (i = 0; i < nr_found; i++) {
 			if (!batch[i])
 				continue;
-			error = execute(batch[i], pag, flags);
+			error = execute(batch[i], pag, flags, args);
 			IRELE(batch[i]);
 			if (error == EAGAIN) {
 				skipped++;
@@ -189,12 +615,40 @@ restart:
 	return last_error;
 }
 
+/*
+ * Background scanning to trim post-EOF preallocated space. This is queued
+ * based on the 'background_prealloc_discard_period' tunable (5m by default).
+ */
+STATIC void
+xfs_queue_eofblocks(
+	struct xfs_mount *mp)
+{
+	rcu_read_lock();
+	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG))
+		queue_delayed_work(mp->m_eofblocks_workqueue,
+				   &mp->m_eofblocks_work,
+				   msecs_to_jiffies(xfs_eofb_secs * 1000));
+	rcu_read_unlock();
+}
+
+void
+xfs_eofblocks_worker(
+	struct work_struct *work)
+{
+	struct xfs_mount *mp = container_of(to_delayed_work(work),
+				struct xfs_mount, m_eofblocks_work);
+	xfs_icache_free_eofblocks(mp, NULL);
+	xfs_queue_eofblocks(mp);
+}
+
 int
 xfs_inode_ag_iterator(
 	struct xfs_mount	*mp,
 	int			(*execute)(struct xfs_inode *ip,
-					   struct xfs_perag *pag, int flags),
-	int			flags)
+					   struct xfs_perag *pag, int flags,
+					   void *args),
+	int			flags,
+	void			*args)
 {
 	struct xfs_perag	*pag;
 	int			error = 0;
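
xfs_queue_eofblocks() and xfs_eofblocks_worker() above form a self-rearming delayed work item: each pass re-queues itself, and queueing is skipped entirely while no AG carries the EOFBLOCKS tag. The same shape as a kernel-context sketch (the my_* names are illustrative; the workqueue calls are the real API):

#include <linux/workqueue.h>
#include <linux/jiffies.h>

struct my_state {
	struct workqueue_struct	*wq;
	struct delayed_work	work;
	unsigned int		period_secs;
};

static void my_queue(struct my_state *st)
{
	/* in the XFS code this is gated on radix_tree_tagged() so an
	 * idle filesystem stops rearming the scan entirely */
	queue_delayed_work(st->wq, &st->work,
			   msecs_to_jiffies(st->period_secs * 1000));
}

static void my_worker(struct work_struct *work)
{
	struct my_state *st = container_of(to_delayed_work(work),
					   struct my_state, work);

	/* ... do one scan pass here ... */
	my_queue(st);	/* rearm for the next period */
}

Because the worker only requeues when the tag check passes, the scan costs nothing once every AG has been cleaned of tagged inodes.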
@@ -204,7 +658,7 @@ xfs_inode_ag_iterator(
204 ag = 0; 658 ag = 0;
205 while ((pag = xfs_perag_get(mp, ag))) { 659 while ((pag = xfs_perag_get(mp, ag))) {
206 ag = pag->pag_agno + 1; 660 ag = pag->pag_agno + 1;
207 error = xfs_inode_ag_walk(mp, pag, execute, flags); 661 error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1);
208 xfs_perag_put(pag); 662 xfs_perag_put(pag);
209 if (error) { 663 if (error) {
210 last_error = error; 664 last_error = error;
@@ -215,224 +669,50 @@ xfs_inode_ag_iterator(
215 return XFS_ERROR(last_error); 669 return XFS_ERROR(last_error);
216} 670}
217 671
218STATIC int
219xfs_sync_inode_data(
220 struct xfs_inode *ip,
221 struct xfs_perag *pag,
222 int flags)
223{
224 struct inode *inode = VFS_I(ip);
225 struct address_space *mapping = inode->i_mapping;
226 int error = 0;
227
228 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
229 return 0;
230
231 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
232 if (flags & SYNC_TRYLOCK)
233 return 0;
234 xfs_ilock(ip, XFS_IOLOCK_SHARED);
235 }
236
237 error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
238 0 : XBF_ASYNC, FI_NONE);
239 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
240 return error;
241}
242
243/*
244 * Write out pagecache data for the whole filesystem.
245 */
246STATIC int
247xfs_sync_data(
248 struct xfs_mount *mp,
249 int flags)
250{
251 int error;
252
253 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
254
255 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
256 if (error)
257 return XFS_ERROR(error);
258
259 xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
260 return 0;
261}
262
263STATIC int
264xfs_sync_fsdata(
265 struct xfs_mount *mp)
266{
267 struct xfs_buf *bp;
268 int error;
269
270 /*
271 * If the buffer is pinned then push on the log so we won't get stuck
272 * waiting in the write for someone, maybe ourselves, to flush the log.
273 *
274 * Even though we just pushed the log above, we did not have the
275 * superblock buffer locked at that point so it can become pinned in
276 * between there and here.
277 */
278 bp = xfs_getsb(mp, 0);
279 if (xfs_buf_ispinned(bp))
280 xfs_log_force(mp, 0);
281 error = xfs_bwrite(bp);
282 xfs_buf_relse(bp);
283 return error;
284}
285
286/*
287 * When remounting a filesystem read-only or freezing the filesystem, we have
288 * two phases to execute. This first phase is syncing the data before we
289 * quiesce the filesystem, and the second is flushing all the inodes out after
290 * we've waited for all the transactions created by the first phase to
291 * complete. The second phase ensures that the inodes are written to their
292 * location on disk rather than just existing in transactions in the log. This
293 * means after a quiesce there is no log replay required to write the inodes to
294 * disk (this is the main difference between a sync and a quiesce).
295 */
296/*
297 * First stage of freeze - no writers will make progress now we are here,
298 * so we flush delwri and delalloc buffers here, then wait for all I/O to
299 * complete. Data is frozen at that point. Metadata is not frozen,
300 * transactions can still occur here so don't bother emptying the AIL
301 * because it'll just get dirty again.
302 */
303int 672int
304xfs_quiesce_data( 673xfs_inode_ag_iterator_tag(
305 struct xfs_mount *mp) 674 struct xfs_mount *mp,
306{ 675 int (*execute)(struct xfs_inode *ip,
307 int error, error2 = 0; 676 struct xfs_perag *pag, int flags,
308 677 void *args),
309 /* force out the log */ 678 int flags,
310 xfs_log_force(mp, XFS_LOG_SYNC); 679 void *args,
311 680 int tag)
312 /* write superblock and hoover up shutdown errors */
313 error = xfs_sync_fsdata(mp);
314
315 /* mark the log as covered if needed */
316 if (xfs_log_need_covered(mp))
317 error2 = xfs_fs_log_dummy(mp);
318
319 return error ? error : error2;
320}
321
322/*
323 * Second stage of a quiesce. The data is already synced, now we have to take
324 * care of the metadata. New transactions are already blocked, so we need to
325 * wait for any remaining transactions to drain out before proceeding.
326 */
327void
328xfs_quiesce_attr(
329 struct xfs_mount *mp)
330{
331 int error = 0;
332
333 /* wait for all modifications to complete */
334 while (atomic_read(&mp->m_active_trans) > 0)
335 delay(100);
336
337 /* reclaim inodes to do any IO before the freeze completes */
338 xfs_reclaim_inodes(mp, 0);
339 xfs_reclaim_inodes(mp, SYNC_WAIT);
340
341 /* flush all pending changes from the AIL */
342 xfs_ail_push_all_sync(mp->m_ail);
343
344 /*
345 * Just warn here till VFS can correctly support
346 * read-only remount without racing.
347 */
348 WARN_ON(atomic_read(&mp->m_active_trans) != 0);
349
350 /* Push the superblock and write an unmount record */
351 error = xfs_log_sbcount(mp);
352 if (error)
 353 		xfs_warn(mp, "xfs_quiesce_attr: failed to log sb changes. "
354 "Frozen image may not be consistent.");
355 xfs_log_unmount_write(mp);
356
357 /*
358 * At this point we might have modified the superblock again and thus
359 * added an item to the AIL, thus flush it again.
360 */
361 xfs_ail_push_all_sync(mp->m_ail);
362
363 /*
364 * The superblock buffer is uncached and xfsaild_push() will lock and
365 * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
366 * here but a lock on the superblock buffer will block until iodone()
367 * has completed.
368 */
369 xfs_buf_lock(mp->m_sb_bp);
370 xfs_buf_unlock(mp->m_sb_bp);
371}
372
373static void
374xfs_syncd_queue_sync(
375 struct xfs_mount *mp)
376{
377 queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
378 msecs_to_jiffies(xfs_syncd_centisecs * 10));
379}
380
381/*
382 * Every sync period we need to unpin all items, reclaim inodes and sync
383 * disk quotas. We might need to cover the log to indicate that the
384 * filesystem is idle and not frozen.
385 */
386STATIC void
387xfs_sync_worker(
388 struct work_struct *work)
389{ 681{
390 struct xfs_mount *mp = container_of(to_delayed_work(work), 682 struct xfs_perag *pag;
391 struct xfs_mount, m_sync_work); 683 int error = 0;
392 int error; 684 int last_error = 0;
393 685 xfs_agnumber_t ag;
394 /*
395 * We shouldn't write/force the log if we are in the mount/unmount
396 * process or on a read only filesystem. The workqueue still needs to be
397 * active in both cases, however, because it is used for inode reclaim
398 * during these times. Use the MS_ACTIVE flag to avoid doing anything
399 * during mount. Doing work during unmount is avoided by calling
400 * cancel_delayed_work_sync on this work queue before tearing down
401 * the ail and the log in xfs_log_unmount.
402 */
403 if (!(mp->m_super->s_flags & MS_ACTIVE) &&
404 !(mp->m_flags & XFS_MOUNT_RDONLY)) {
405 /* dgc: errors ignored here */
406 if (mp->m_super->s_writers.frozen == SB_UNFROZEN &&
407 xfs_log_need_covered(mp))
408 error = xfs_fs_log_dummy(mp);
409 else
410 xfs_log_force(mp, 0);
411 686
412 /* start pushing all the metadata that is currently 687 ag = 0;
413 * dirty */ 688 while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
414 xfs_ail_push_all(mp->m_ail); 689 ag = pag->pag_agno + 1;
690 error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag);
691 xfs_perag_put(pag);
692 if (error) {
693 last_error = error;
694 if (error == EFSCORRUPTED)
695 break;
696 }
415 } 697 }
416 698 return XFS_ERROR(last_error);
417 /* queue us up again */
418 xfs_syncd_queue_sync(mp);
419} 699}
420 700
421/* 701/*
422 * Queue a new inode reclaim pass if there are reclaimable inodes and there 702 * Queue a new inode reclaim pass if there are reclaimable inodes and there
423 * isn't a reclaim pass already in progress. By default it runs every 5s based 703 * isn't a reclaim pass already in progress. By default it runs every 5s based
 424 * on the xfs syncd work default of 30s. Perhaps this should have its own       704 * on the xfs periodic sync default of 30s. Perhaps this should have its own
425 * tunable, but that can be done if this method proves to be ineffective or too 705 * tunable, but that can be done if this method proves to be ineffective or too
426 * aggressive. 706 * aggressive.
427 */ 707 */
428static void 708static void
429xfs_syncd_queue_reclaim( 709xfs_reclaim_work_queue(
430 struct xfs_mount *mp) 710 struct xfs_mount *mp)
431{ 711{
432 712
433 rcu_read_lock(); 713 rcu_read_lock();
434 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { 714 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
435 queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work, 715 queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
436 msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); 716 msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
437 } 717 }
438 rcu_read_unlock(); 718 rcu_read_unlock();
@@ -445,7 +725,7 @@ xfs_syncd_queue_reclaim(
445 * goes low. It scans as quickly as possible avoiding locked inodes or those 725 * goes low. It scans as quickly as possible avoiding locked inodes or those
446 * already being flushed, and once done schedules a future pass. 726 * already being flushed, and once done schedules a future pass.
447 */ 727 */
448STATIC void 728void
449xfs_reclaim_worker( 729xfs_reclaim_worker(
450 struct work_struct *work) 730 struct work_struct *work)
451{ 731{
@@ -453,65 +733,10 @@ xfs_reclaim_worker(
453 struct xfs_mount, m_reclaim_work); 733 struct xfs_mount, m_reclaim_work);
454 734
455 xfs_reclaim_inodes(mp, SYNC_TRYLOCK); 735 xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
456 xfs_syncd_queue_reclaim(mp); 736 xfs_reclaim_work_queue(mp);
457} 737}
458 738
459/* 739static void
460 * Flush delayed allocate data, attempting to free up reserved space
461 * from existing allocations. At this point a new allocation attempt
462 * has failed with ENOSPC and we are in the process of scratching our
463 * heads, looking about for more room.
464 *
465 * Queue a new data flush if there isn't one already in progress and
466 * wait for completion of the flush. This means that we only ever have one
467 * inode flush in progress no matter how many ENOSPC events are occurring and
468 * so will prevent the system from bogging down due to every concurrent
469 * ENOSPC event scanning all the active inodes in the system for writeback.
470 */
471void
472xfs_flush_inodes(
473 struct xfs_inode *ip)
474{
475 struct xfs_mount *mp = ip->i_mount;
476
477 queue_work(xfs_syncd_wq, &mp->m_flush_work);
478 flush_work(&mp->m_flush_work);
479}
480
481STATIC void
482xfs_flush_worker(
483 struct work_struct *work)
484{
485 struct xfs_mount *mp = container_of(work,
486 struct xfs_mount, m_flush_work);
487
488 xfs_sync_data(mp, SYNC_TRYLOCK);
489 xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
490}
491
492int
493xfs_syncd_init(
494 struct xfs_mount *mp)
495{
496 INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
497 INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
498 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
499
500 xfs_syncd_queue_sync(mp);
501
502 return 0;
503}
504
505void
506xfs_syncd_stop(
507 struct xfs_mount *mp)
508{
509 cancel_delayed_work_sync(&mp->m_sync_work);
510 cancel_delayed_work_sync(&mp->m_reclaim_work);
511 cancel_work_sync(&mp->m_flush_work);
512}
513
514void
515__xfs_inode_set_reclaim_tag( 740__xfs_inode_set_reclaim_tag(
516 struct xfs_perag *pag, 741 struct xfs_perag *pag,
517 struct xfs_inode *ip) 742 struct xfs_inode *ip)
@@ -529,7 +754,7 @@ __xfs_inode_set_reclaim_tag(
529 spin_unlock(&ip->i_mount->m_perag_lock); 754 spin_unlock(&ip->i_mount->m_perag_lock);
530 755
531 /* schedule periodic background inode reclaim */ 756 /* schedule periodic background inode reclaim */
532 xfs_syncd_queue_reclaim(ip->i_mount); 757 xfs_reclaim_work_queue(ip->i_mount);
533 758
534 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, 759 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
535 -1, _RET_IP_); 760 -1, _RET_IP_);
@@ -577,7 +802,7 @@ __xfs_inode_clear_reclaim(
577 } 802 }
578} 803}
579 804
580void 805STATIC void
581__xfs_inode_clear_reclaim_tag( 806__xfs_inode_clear_reclaim_tag(
582 xfs_mount_t *mp, 807 xfs_mount_t *mp,
583 xfs_perag_t *pag, 808 xfs_perag_t *pag,
@@ -787,9 +1012,9 @@ out:
787 /* 1012 /*
788 * We could return EAGAIN here to make reclaim rescan the inode tree in 1013 * We could return EAGAIN here to make reclaim rescan the inode tree in
789 * a short while. However, this just burns CPU time scanning the tree 1014 * a short while. However, this just burns CPU time scanning the tree
790 * waiting for IO to complete and xfssyncd never goes back to the idle 1015 * waiting for IO to complete and the reclaim work never goes back to
791 * state. Instead, return 0 to let the next scheduled background reclaim 1016 * the idle state. Instead, return 0 to let the next scheduled
792 * attempt to reclaim the inode again. 1017 * background reclaim attempt to reclaim the inode again.
793 */ 1018 */
794 return 0; 1019 return 0;
795} 1020}
@@ -800,7 +1025,7 @@ out:
800 * then a shut down during filesystem unmount reclaim walk leak all the 1025 * then a shut down during filesystem unmount reclaim walk leak all the
801 * unreclaimed inodes. 1026 * unreclaimed inodes.
802 */ 1027 */
803int 1028STATIC int
804xfs_reclaim_inodes_ag( 1029xfs_reclaim_inodes_ag(
805 struct xfs_mount *mp, 1030 struct xfs_mount *mp,
806 int flags, 1031 int flags,
@@ -945,7 +1170,7 @@ xfs_reclaim_inodes_nr(
945 int nr_to_scan) 1170 int nr_to_scan)
946{ 1171{
947 /* kick background reclaimer and push the AIL */ 1172 /* kick background reclaimer and push the AIL */
948 xfs_syncd_queue_reclaim(mp); 1173 xfs_reclaim_work_queue(mp);
949 xfs_ail_push_all(mp->m_ail); 1174 xfs_ail_push_all(mp->m_ail);
950 1175
951 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); 1176 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
@@ -971,3 +1196,146 @@ xfs_reclaim_inodes_count(
971 return reclaimable; 1196 return reclaimable;
972} 1197}
973 1198
1199STATIC int
1200xfs_inode_match_id(
1201 struct xfs_inode *ip,
1202 struct xfs_eofblocks *eofb)
1203{
1204 if (eofb->eof_flags & XFS_EOF_FLAGS_UID &&
1205 ip->i_d.di_uid != eofb->eof_uid)
1206 return 0;
1207
1208 if (eofb->eof_flags & XFS_EOF_FLAGS_GID &&
1209 ip->i_d.di_gid != eofb->eof_gid)
1210 return 0;
1211
1212 if (eofb->eof_flags & XFS_EOF_FLAGS_PRID &&
1213 xfs_get_projid(ip) != eofb->eof_prid)
1214 return 0;
1215
1216 return 1;
1217}
1218
1219STATIC int
1220xfs_inode_free_eofblocks(
1221 struct xfs_inode *ip,
1222 struct xfs_perag *pag,
1223 int flags,
1224 void *args)
1225{
1226 int ret;
1227 struct xfs_eofblocks *eofb = args;
1228
1229 if (!xfs_can_free_eofblocks(ip, false)) {
1230 /* inode could be preallocated or append-only */
1231 trace_xfs_inode_free_eofblocks_invalid(ip);
1232 xfs_inode_clear_eofblocks_tag(ip);
1233 return 0;
1234 }
1235
1236 /*
1237 * If the mapping is dirty the operation can block and wait for some
1238 * time. Unless we are waiting, skip it.
1239 */
1240 if (!(flags & SYNC_WAIT) &&
1241 mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
1242 return 0;
1243
1244 if (eofb) {
1245 if (!xfs_inode_match_id(ip, eofb))
1246 return 0;
1247
1248 /* skip the inode if the file size is too small */
1249 if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
1250 XFS_ISIZE(ip) < eofb->eof_min_file_size)
1251 return 0;
1252 }
1253
1254 ret = xfs_free_eofblocks(ip->i_mount, ip, true);
1255
1256 /* don't revisit the inode if we're not waiting */
1257 if (ret == EAGAIN && !(flags & SYNC_WAIT))
1258 ret = 0;
1259
1260 return ret;
1261}
1262
1263int
1264xfs_icache_free_eofblocks(
1265 struct xfs_mount *mp,
1266 struct xfs_eofblocks *eofb)
1267{
1268 int flags = SYNC_TRYLOCK;
1269
1270 if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))
1271 flags = SYNC_WAIT;
1272
1273 return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags,
1274 eofb, XFS_ICI_EOFBLOCKS_TAG);
1275}
1276
1277void
1278xfs_inode_set_eofblocks_tag(
1279 xfs_inode_t *ip)
1280{
1281 struct xfs_mount *mp = ip->i_mount;
1282 struct xfs_perag *pag;
1283 int tagged;
1284
1285 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1286 spin_lock(&pag->pag_ici_lock);
1287 trace_xfs_inode_set_eofblocks_tag(ip);
1288
1289 tagged = radix_tree_tagged(&pag->pag_ici_root,
1290 XFS_ICI_EOFBLOCKS_TAG);
1291 radix_tree_tag_set(&pag->pag_ici_root,
1292 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
1293 XFS_ICI_EOFBLOCKS_TAG);
1294 if (!tagged) {
1295 /* propagate the eofblocks tag up into the perag radix tree */
1296 spin_lock(&ip->i_mount->m_perag_lock);
1297 radix_tree_tag_set(&ip->i_mount->m_perag_tree,
1298 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
1299 XFS_ICI_EOFBLOCKS_TAG);
1300 spin_unlock(&ip->i_mount->m_perag_lock);
1301
1302 /* kick off background trimming */
1303 xfs_queue_eofblocks(ip->i_mount);
1304
1305 trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno,
1306 -1, _RET_IP_);
1307 }
1308
1309 spin_unlock(&pag->pag_ici_lock);
1310 xfs_perag_put(pag);
1311}
1312
1313void
1314xfs_inode_clear_eofblocks_tag(
1315 xfs_inode_t *ip)
1316{
1317 struct xfs_mount *mp = ip->i_mount;
1318 struct xfs_perag *pag;
1319
1320 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1321 spin_lock(&pag->pag_ici_lock);
1322 trace_xfs_inode_clear_eofblocks_tag(ip);
1323
1324 radix_tree_tag_clear(&pag->pag_ici_root,
1325 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
1326 XFS_ICI_EOFBLOCKS_TAG);
1327 if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) {
1328 /* clear the eofblocks tag from the perag radix tree */
1329 spin_lock(&ip->i_mount->m_perag_lock);
1330 radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
1331 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
1332 XFS_ICI_EOFBLOCKS_TAG);
1333 spin_unlock(&ip->i_mount->m_perag_lock);
1334 trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno,
1335 -1, _RET_IP_);
1336 }
1337
1338 spin_unlock(&pag->pag_ici_lock);
1339 xfs_perag_put(pag);
1340}
1341
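
The set and clear helpers above keep two radix trees in step: the tag on an inode lives in its AG's pag_ici_root, and m_perag_tree carries the same tag for as long as at least one inode in that AG is tagged, which is what lets xfs_perag_get_tag() skip untagged AGs entirely. Condensed from xfs_inode_set_eofblocks_tag() above, the propagation step is (a sketch, not a complete function):

	spin_lock(&pag->pag_ici_lock);
	tagged = radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG);
	radix_tree_tag_set(&pag->pag_ici_root, agino, XFS_ICI_EOFBLOCKS_TAG);
	if (!tagged) {
		/* first tagged inode in this AG: tag the AG itself */
		spin_lock(&mp->m_perag_lock);
		radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno,
				   XFS_ICI_EOFBLOCKS_TAG);
		spin_unlock(&mp->m_perag_lock);
	}
	spin_unlock(&pag->pag_ici_lock);
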
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_icache.h
index 941202e7ac6e..e0f138c70a2f 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_icache.h
@@ -24,28 +24,30 @@ struct xfs_perag;
24#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ 24#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
25#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ 25#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
26 26
27extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ 27int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
28 uint flags, uint lock_flags, xfs_inode_t **ipp);
28 29
29int xfs_syncd_init(struct xfs_mount *mp); 30void xfs_reclaim_worker(struct work_struct *work);
30void xfs_syncd_stop(struct xfs_mount *mp);
31
32int xfs_quiesce_data(struct xfs_mount *mp);
33void xfs_quiesce_attr(struct xfs_mount *mp);
34
35void xfs_flush_inodes(struct xfs_inode *ip);
36 31
37int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); 32int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
38int xfs_reclaim_inodes_count(struct xfs_mount *mp); 33int xfs_reclaim_inodes_count(struct xfs_mount *mp);
39void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); 34void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
40 35
41void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); 36void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
42void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); 37
43void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, 38void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
44 struct xfs_inode *ip); 39void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
40int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
41void xfs_eofblocks_worker(struct work_struct *);
45 42
46int xfs_sync_inode_grab(struct xfs_inode *ip); 43int xfs_sync_inode_grab(struct xfs_inode *ip);
47int xfs_inode_ag_iterator(struct xfs_mount *mp, 44int xfs_inode_ag_iterator(struct xfs_mount *mp,
48 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), 45 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
49 int flags); 46 int flags, void *args),
47 int flags, void *args);
48int xfs_inode_ag_iterator_tag(struct xfs_mount *mp,
49 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
50 int flags, void *args),
51 int flags, void *args, int tag);
50 52
51#endif 53#endif
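
The iterator prototypes above show the new calling convention: execute() now takes an opaque args pointer, and the _tag variant only visits AGs carrying the requested radix-tree tag. A hedged sketch of a caller follows; the callback name and body are hypothetical, only the signatures come from the header:

STATIC int
xfs_example_inode_walk(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
	int			flags,
	void			*args)
{
	/* args is caller-private state; xfs_inode_free_eofblocks()
	 * receives a struct xfs_eofblocks here, and NULL is legal */
	return 0;
}

	/* visit only inodes tagged for EOF-block trimming */
	error = xfs_inode_ag_iterator_tag(mp, xfs_example_inode_walk,
			SYNC_TRYLOCK, NULL, XFS_ICI_EOFBLOCKS_TAG);
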
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
deleted file mode 100644
index 784a803383ec..000000000000
--- a/fs/xfs/xfs_iget.c
+++ /dev/null
@@ -1,705 +0,0 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_acl.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h"
28#include "xfs_bmap_btree.h"
29#include "xfs_alloc_btree.h"
30#include "xfs_ialloc_btree.h"
31#include "xfs_dinode.h"
32#include "xfs_inode.h"
33#include "xfs_btree.h"
34#include "xfs_ialloc.h"
35#include "xfs_quota.h"
36#include "xfs_utils.h"
37#include "xfs_trans_priv.h"
38#include "xfs_inode_item.h"
39#include "xfs_bmap.h"
40#include "xfs_trace.h"
41
42
43/*
44 * Allocate and initialise an xfs_inode.
45 */
46STATIC struct xfs_inode *
47xfs_inode_alloc(
48 struct xfs_mount *mp,
49 xfs_ino_t ino)
50{
51 struct xfs_inode *ip;
52
53 /*
54 * if this didn't occur in transactions, we could use
55 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
56 * code up to do this anyway.
57 */
58 ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
59 if (!ip)
60 return NULL;
61 if (inode_init_always(mp->m_super, VFS_I(ip))) {
62 kmem_zone_free(xfs_inode_zone, ip);
63 return NULL;
64 }
65
66 ASSERT(atomic_read(&ip->i_pincount) == 0);
67 ASSERT(!spin_is_locked(&ip->i_flags_lock));
68 ASSERT(!xfs_isiflocked(ip));
69 ASSERT(ip->i_ino == 0);
70
71 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
72
73 /* initialise the xfs inode */
74 ip->i_ino = ino;
75 ip->i_mount = mp;
76 memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
77 ip->i_afp = NULL;
78 memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
79 ip->i_flags = 0;
80 ip->i_delayed_blks = 0;
81 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
82
83 return ip;
84}
85
86STATIC void
87xfs_inode_free_callback(
88 struct rcu_head *head)
89{
90 struct inode *inode = container_of(head, struct inode, i_rcu);
91 struct xfs_inode *ip = XFS_I(inode);
92
93 kmem_zone_free(xfs_inode_zone, ip);
94}
95
96void
97xfs_inode_free(
98 struct xfs_inode *ip)
99{
100 switch (ip->i_d.di_mode & S_IFMT) {
101 case S_IFREG:
102 case S_IFDIR:
103 case S_IFLNK:
104 xfs_idestroy_fork(ip, XFS_DATA_FORK);
105 break;
106 }
107
108 if (ip->i_afp)
109 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
110
111 if (ip->i_itemp) {
112 ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
113 xfs_inode_item_destroy(ip);
114 ip->i_itemp = NULL;
115 }
116
117 /* asserts to verify all state is correct here */
118 ASSERT(atomic_read(&ip->i_pincount) == 0);
119 ASSERT(!spin_is_locked(&ip->i_flags_lock));
120 ASSERT(!xfs_isiflocked(ip));
121
122 /*
123 * Because we use RCU freeing we need to ensure the inode always
124 * appears to be reclaimed with an invalid inode number when in the
125 * free state. The ip->i_flags_lock provides the barrier against lookup
126 * races.
127 */
128 spin_lock(&ip->i_flags_lock);
129 ip->i_flags = XFS_IRECLAIM;
130 ip->i_ino = 0;
131 spin_unlock(&ip->i_flags_lock);
132
133 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
134}
135
136/*
 137 * Check the validity of the inode we just found in the cache
138 */
139static int
140xfs_iget_cache_hit(
141 struct xfs_perag *pag,
142 struct xfs_inode *ip,
143 xfs_ino_t ino,
144 int flags,
145 int lock_flags) __releases(RCU)
146{
147 struct inode *inode = VFS_I(ip);
148 struct xfs_mount *mp = ip->i_mount;
149 int error;
150
151 /*
152 * check for re-use of an inode within an RCU grace period due to the
153 * radix tree nodes not being updated yet. We monitor for this by
154 * setting the inode number to zero before freeing the inode structure.
155 * If the inode has been reallocated and set up, then the inode number
156 * will not match, so check for that, too.
157 */
158 spin_lock(&ip->i_flags_lock);
159 if (ip->i_ino != ino) {
160 trace_xfs_iget_skip(ip);
161 XFS_STATS_INC(xs_ig_frecycle);
162 error = EAGAIN;
163 goto out_error;
164 }
165
166
167 /*
168 * If we are racing with another cache hit that is currently
169 * instantiating this inode or currently recycling it out of
 170	 * reclaimable state, wait for the initialisation to complete
171 * before continuing.
172 *
173 * XXX(hch): eventually we should do something equivalent to
174 * wait_on_inode to wait for these flags to be cleared
175 * instead of polling for it.
176 */
177 if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
178 trace_xfs_iget_skip(ip);
179 XFS_STATS_INC(xs_ig_frecycle);
180 error = EAGAIN;
181 goto out_error;
182 }
183
184 /*
185 * If lookup is racing with unlink return an error immediately.
186 */
187 if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
188 error = ENOENT;
189 goto out_error;
190 }
191
192 /*
193 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
 194	 * Need to carefully get it back into a usable state.
195 */
196 if (ip->i_flags & XFS_IRECLAIMABLE) {
197 trace_xfs_iget_reclaim(ip);
198
199 /*
200 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
201 * from stomping over us while we recycle the inode. We can't
202 * clear the radix tree reclaimable tag yet as it requires
203 * pag_ici_lock to be held exclusive.
204 */
205 ip->i_flags |= XFS_IRECLAIM;
206
207 spin_unlock(&ip->i_flags_lock);
208 rcu_read_unlock();
209
210 error = -inode_init_always(mp->m_super, inode);
211 if (error) {
212 /*
213 * Re-initializing the inode failed, and we are in deep
214 * trouble. Try to re-add it to the reclaim list.
215 */
216 rcu_read_lock();
217 spin_lock(&ip->i_flags_lock);
218
219 ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
220 ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
221 trace_xfs_iget_reclaim_fail(ip);
222 goto out_error;
223 }
224
225 spin_lock(&pag->pag_ici_lock);
226 spin_lock(&ip->i_flags_lock);
227
228 /*
229 * Clear the per-lifetime state in the inode as we are now
230 * effectively a new inode and need to return to the initial
231 * state before reuse occurs.
232 */
233 ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
234 ip->i_flags |= XFS_INEW;
235 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
236 inode->i_state = I_NEW;
237
238 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
239 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
240
241 spin_unlock(&ip->i_flags_lock);
242 spin_unlock(&pag->pag_ici_lock);
243 } else {
244 /* If the VFS inode is being torn down, pause and try again. */
245 if (!igrab(inode)) {
246 trace_xfs_iget_skip(ip);
247 error = EAGAIN;
248 goto out_error;
249 }
250
251 /* We've got a live one. */
252 spin_unlock(&ip->i_flags_lock);
253 rcu_read_unlock();
254 trace_xfs_iget_hit(ip);
255 }
256
257 if (lock_flags != 0)
258 xfs_ilock(ip, lock_flags);
259
260 xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
261 XFS_STATS_INC(xs_ig_found);
262
263 return 0;
264
265out_error:
266 spin_unlock(&ip->i_flags_lock);
267 rcu_read_unlock();
268 return error;
269}
270
271
272static int
273xfs_iget_cache_miss(
274 struct xfs_mount *mp,
275 struct xfs_perag *pag,
276 xfs_trans_t *tp,
277 xfs_ino_t ino,
278 struct xfs_inode **ipp,
279 int flags,
280 int lock_flags)
281{
282 struct xfs_inode *ip;
283 int error;
284 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
285 int iflags;
286
287 ip = xfs_inode_alloc(mp, ino);
288 if (!ip)
289 return ENOMEM;
290
291 error = xfs_iread(mp, tp, ip, flags);
292 if (error)
293 goto out_destroy;
294
295 trace_xfs_iget_miss(ip);
296
297 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
298 error = ENOENT;
299 goto out_destroy;
300 }
301
302 /*
303 * Preload the radix tree so we can insert safely under the
304 * write spinlock. Note that we cannot sleep inside the preload
305 * region. Since we can be called from transaction context, don't
306 * recurse into the file system.
307 */
308 if (radix_tree_preload(GFP_NOFS)) {
309 error = EAGAIN;
310 goto out_destroy;
311 }
312
313 /*
314 * Because the inode hasn't been added to the radix-tree yet it can't
315 * be found by another thread, so we can do the non-sleeping lock here.
316 */
317 if (lock_flags) {
318 if (!xfs_ilock_nowait(ip, lock_flags))
319 BUG();
320 }
321
322 /*
323 * These values must be set before inserting the inode into the radix
324 * tree as the moment it is inserted a concurrent lookup (allowed by the
325 * RCU locking mechanism) can find it and that lookup must see that this
326 * is an inode currently under construction (i.e. that XFS_INEW is set).
327 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
328 * memory barrier that ensures this detection works correctly at lookup
329 * time.
330 */
331 iflags = XFS_INEW;
332 if (flags & XFS_IGET_DONTCACHE)
333 iflags |= XFS_IDONTCACHE;
334 ip->i_udquot = ip->i_gdquot = NULL;
335 xfs_iflags_set(ip, iflags);
336
337 /* insert the new inode */
338 spin_lock(&pag->pag_ici_lock);
339 error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
340 if (unlikely(error)) {
341 WARN_ON(error != -EEXIST);
342 XFS_STATS_INC(xs_ig_dup);
343 error = EAGAIN;
344 goto out_preload_end;
345 }
346 spin_unlock(&pag->pag_ici_lock);
347 radix_tree_preload_end();
348
349 *ipp = ip;
350 return 0;
351
352out_preload_end:
353 spin_unlock(&pag->pag_ici_lock);
354 radix_tree_preload_end();
355 if (lock_flags)
356 xfs_iunlock(ip, lock_flags);
357out_destroy:
358 __destroy_inode(VFS_I(ip));
359 xfs_inode_free(ip);
360 return error;
361}
362
363/*
364 * Look up an inode by number in the given file system.
365 * The inode is looked up in the cache held in each AG.
366 * If the inode is found in the cache, initialise the vfs inode
367 * if necessary.
368 *
369 * If it is not in core, read it in from the file system's device,
370 * add it to the cache and initialise the vfs inode.
371 *
372 * The inode is locked according to the value of the lock_flags parameter.
373 * This flag parameter indicates how and if the inode's IO lock and inode lock
374 * should be taken.
375 *
376 * mp -- the mount point structure for the current file system. It points
377 * to the inode hash table.
378 * tp -- a pointer to the current transaction if there is one. This is
379 * simply passed through to the xfs_iread() call.
380 * ino -- the number of the inode desired. This is the unique identifier
381 * within the file system for the inode being requested.
382 * lock_flags -- flags indicating how to lock the inode. See the comment
383 * for xfs_ilock() for a list of valid values.
384 */
385int
386xfs_iget(
387 xfs_mount_t *mp,
388 xfs_trans_t *tp,
389 xfs_ino_t ino,
390 uint flags,
391 uint lock_flags,
392 xfs_inode_t **ipp)
393{
394 xfs_inode_t *ip;
395 int error;
396 xfs_perag_t *pag;
397 xfs_agino_t agino;
398
399 /*
400 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
401 * doesn't get freed while it's being referenced during a
402 * radix tree traversal here. It assumes this function
 403	 * acquires only the ILOCK (and therefore it has no need to
404 * involve the IOLOCK in this synchronization).
405 */
406 ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
407
408 /* reject inode numbers outside existing AGs */
409 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
410 return EINVAL;
411
412 /* get the perag structure and ensure that it's inode capable */
413 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
414 agino = XFS_INO_TO_AGINO(mp, ino);
415
416again:
417 error = 0;
418 rcu_read_lock();
419 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
420
421 if (ip) {
422 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
423 if (error)
424 goto out_error_or_again;
425 } else {
426 rcu_read_unlock();
427 XFS_STATS_INC(xs_ig_missed);
428
429 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
430 flags, lock_flags);
431 if (error)
432 goto out_error_or_again;
433 }
434 xfs_perag_put(pag);
435
436 *ipp = ip;
437
438 /*
439 * If we have a real type for an on-disk inode, we can set ops(&unlock)
440 * now. If it's a new inode being created, xfs_ialloc will handle it.
441 */
442 if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
443 xfs_setup_inode(ip);
444 return 0;
445
446out_error_or_again:
447 if (error == EAGAIN) {
448 delay(1);
449 goto again;
450 }
451 xfs_perag_put(pag);
452 return error;
453}
454
455/*
456 * This is a wrapper routine around the xfs_ilock() routine
457 * used to centralize some grungy code. It is used in places
458 * that wish to lock the inode solely for reading the extents.
459 * The reason these places can't just call xfs_ilock(SHARED)
 460 * is that the inode lock also guards the bringing in of the
461 * extents from disk for a file in b-tree format. If the inode
462 * is in b-tree format, then we need to lock the inode exclusively
463 * until the extents are read in. Locking it exclusively all
464 * the time would limit our parallelism unnecessarily, though.
465 * What we do instead is check to see if the extents have been
466 * read in yet, and only lock the inode exclusively if they
467 * have not.
468 *
469 * The function returns a value which should be given to the
470 * corresponding xfs_iunlock_map_shared(). This value is
471 * the mode in which the lock was actually taken.
472 */
473uint
474xfs_ilock_map_shared(
475 xfs_inode_t *ip)
476{
477 uint lock_mode;
478
479 if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
480 ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
481 lock_mode = XFS_ILOCK_EXCL;
482 } else {
483 lock_mode = XFS_ILOCK_SHARED;
484 }
485
486 xfs_ilock(ip, lock_mode);
487
488 return lock_mode;
489}
490
491/*
492 * This is simply the unlock routine to go with xfs_ilock_map_shared().
493 * All it does is call xfs_iunlock() with the given lock_mode.
494 */
495void
496xfs_iunlock_map_shared(
497 xfs_inode_t *ip,
498 unsigned int lock_mode)
499{
500 xfs_iunlock(ip, lock_mode);
501}
502
503/*
504 * The xfs inode contains 2 locks: a multi-reader lock called the
505 * i_iolock and a multi-reader lock called the i_lock. This routine
506 * allows either or both of the locks to be obtained.
507 *
508 * The 2 locks should always be ordered so that the IO lock is
509 * obtained first in order to prevent deadlock.
510 *
511 * ip -- the inode being locked
512 * lock_flags -- this parameter indicates the inode's locks
513 * to be locked. It can be:
514 * XFS_IOLOCK_SHARED,
515 * XFS_IOLOCK_EXCL,
516 * XFS_ILOCK_SHARED,
517 * XFS_ILOCK_EXCL,
518 * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
519 * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
520 * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
521 * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
522 */
523void
524xfs_ilock(
525 xfs_inode_t *ip,
526 uint lock_flags)
527{
528 /*
529 * You can't set both SHARED and EXCL for the same lock,
530 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
531 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
532 */
533 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
534 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
535 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
536 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
537 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
538
539 if (lock_flags & XFS_IOLOCK_EXCL)
540 mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
541 else if (lock_flags & XFS_IOLOCK_SHARED)
542 mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
543
544 if (lock_flags & XFS_ILOCK_EXCL)
545 mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
546 else if (lock_flags & XFS_ILOCK_SHARED)
547 mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
548
549 trace_xfs_ilock(ip, lock_flags, _RET_IP_);
550}
551
552/*
553 * This is just like xfs_ilock(), except that the caller
554 * is guaranteed not to sleep. It returns 1 if it gets
555 * the requested locks and 0 otherwise. If the IO lock is
556 * obtained but the inode lock cannot be, then the IO lock
557 * is dropped before returning.
558 *
559 * ip -- the inode being locked
 560 *       lock_flags -- this parameter indicates the inode's locks
561 * to be locked. See the comment for xfs_ilock() for a list
562 * of valid values.
563 */
564int
565xfs_ilock_nowait(
566 xfs_inode_t *ip,
567 uint lock_flags)
568{
569 /*
570 * You can't set both SHARED and EXCL for the same lock,
571 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
572 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
573 */
574 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
575 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
576 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
577 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
578 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
579
580 if (lock_flags & XFS_IOLOCK_EXCL) {
581 if (!mrtryupdate(&ip->i_iolock))
582 goto out;
583 } else if (lock_flags & XFS_IOLOCK_SHARED) {
584 if (!mrtryaccess(&ip->i_iolock))
585 goto out;
586 }
587 if (lock_flags & XFS_ILOCK_EXCL) {
588 if (!mrtryupdate(&ip->i_lock))
589 goto out_undo_iolock;
590 } else if (lock_flags & XFS_ILOCK_SHARED) {
591 if (!mrtryaccess(&ip->i_lock))
592 goto out_undo_iolock;
593 }
594 trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
595 return 1;
596
597 out_undo_iolock:
598 if (lock_flags & XFS_IOLOCK_EXCL)
599 mrunlock_excl(&ip->i_iolock);
600 else if (lock_flags & XFS_IOLOCK_SHARED)
601 mrunlock_shared(&ip->i_iolock);
602 out:
603 return 0;
604}
605
606/*
607 * xfs_iunlock() is used to drop the inode locks acquired with
608 * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
609 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
610 * that we know which locks to drop.
611 *
612 * ip -- the inode being unlocked
 613 * lock_flags -- this parameter indicates the inode's locks
614 * to be unlocked. See the comment for xfs_ilock() for a list
615 * of valid values for this parameter.
616 *
617 */
618void
619xfs_iunlock(
620 xfs_inode_t *ip,
621 uint lock_flags)
622{
623 /*
624 * You can't set both SHARED and EXCL for the same lock,
625 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
626 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
627 */
628 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
629 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
630 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
631 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
632 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
633 ASSERT(lock_flags != 0);
634
635 if (lock_flags & XFS_IOLOCK_EXCL)
636 mrunlock_excl(&ip->i_iolock);
637 else if (lock_flags & XFS_IOLOCK_SHARED)
638 mrunlock_shared(&ip->i_iolock);
639
640 if (lock_flags & XFS_ILOCK_EXCL)
641 mrunlock_excl(&ip->i_lock);
642 else if (lock_flags & XFS_ILOCK_SHARED)
643 mrunlock_shared(&ip->i_lock);
644
645 trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
646}
647
648/*
649 * give up write locks. the i/o lock cannot be held nested
650 * if it is being demoted.
651 */
652void
653xfs_ilock_demote(
654 xfs_inode_t *ip,
655 uint lock_flags)
656{
657 ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
658 ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
659
660 if (lock_flags & XFS_ILOCK_EXCL)
661 mrdemote(&ip->i_lock);
662 if (lock_flags & XFS_IOLOCK_EXCL)
663 mrdemote(&ip->i_iolock);
664
665 trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
666}
667
668#ifdef DEBUG
669int
670xfs_isilocked(
671 xfs_inode_t *ip,
672 uint lock_flags)
673{
674 if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
675 if (!(lock_flags & XFS_ILOCK_SHARED))
676 return !!ip->i_lock.mr_writer;
677 return rwsem_is_locked(&ip->i_lock.mr_lock);
678 }
679
680 if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
681 if (!(lock_flags & XFS_IOLOCK_SHARED))
682 return !!ip->i_iolock.mr_writer;
683 return rwsem_is_locked(&ip->i_iolock.mr_lock);
684 }
685
686 ASSERT(0);
687 return 0;
688}
689#endif
690
691void
692__xfs_iflock(
693 struct xfs_inode *ip)
694{
695 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
696 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
697
698 do {
699 prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
700 if (xfs_isiflocked(ip))
701 io_schedule();
702 } while (!xfs_iflock_nowait(ip));
703
704 finish_wait(wq, &wait.wait);
705}
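
The deleted file above still documents the RCU lookup protocol that now lives in xfs_icache.c: the freeing side zeroes ip->i_ino under ip->i_flags_lock before handing the inode to call_rcu(), and the lookup side re-checks the inode number under the same lock, returning EAGAIN so the caller delays and retries. Condensed from the two functions above:

	/* free side (xfs_inode_free): look dead before the RCU free */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	spin_unlock(&ip->i_flags_lock);
	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);

	/* lookup side (xfs_iget_cache_hit): same lock orders the check */
	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino) {
		/* freed or reused within this RCU grace period */
		error = EAGAIN;		/* xfs_iget() delays and retries */
		goto out_error;
	}
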
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 1938b41ee9f5..66282dcb821b 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -45,6 +45,7 @@
45#include "xfs_filestream.h" 45#include "xfs_filestream.h"
46#include "xfs_vnodeops.h" 46#include "xfs_vnodeops.h"
47#include "xfs_trace.h" 47#include "xfs_trace.h"
48#include "xfs_icache.h"
48 49
49kmem_zone_t *xfs_ifork_zone; 50kmem_zone_t *xfs_ifork_zone;
50kmem_zone_t *xfs_inode_zone; 51kmem_zone_t *xfs_inode_zone;
@@ -74,6 +75,256 @@ xfs_get_extsz_hint(
74 return 0; 75 return 0;
75} 76}
76 77
78/*
79 * This is a wrapper routine around the xfs_ilock() routine used to centralize
80 * some grungy code. It is used in places that wish to lock the inode solely
81 * for reading the extents. The reason these places can't just call
 82 * xfs_ilock(SHARED) is that the inode lock also guards the bringing in of the
83 * extents from disk for a file in b-tree format. If the inode is in b-tree
84 * format, then we need to lock the inode exclusively until the extents are read
85 * in. Locking it exclusively all the time would limit our parallelism
86 * unnecessarily, though. What we do instead is check to see if the extents
87 * have been read in yet, and only lock the inode exclusively if they have not.
88 *
89 * The function returns a value which should be given to the corresponding
90 * xfs_iunlock_map_shared(). This value is the mode in which the lock was
91 * actually taken.
92 */
93uint
94xfs_ilock_map_shared(
95 xfs_inode_t *ip)
96{
97 uint lock_mode;
98
99 if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
100 ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
101 lock_mode = XFS_ILOCK_EXCL;
102 } else {
103 lock_mode = XFS_ILOCK_SHARED;
104 }
105
106 xfs_ilock(ip, lock_mode);
107
108 return lock_mode;
109}
110
111/*
112 * This is simply the unlock routine to go with xfs_ilock_map_shared().
113 * All it does is call xfs_iunlock() with the given lock_mode.
114 */
115void
116xfs_iunlock_map_shared(
117 xfs_inode_t *ip,
118 unsigned int lock_mode)
119{
120 xfs_iunlock(ip, lock_mode);
121}
122
123/*
124 * The xfs inode contains 2 locks: a multi-reader lock called the
125 * i_iolock and a multi-reader lock called the i_lock. This routine
126 * allows either or both of the locks to be obtained.
127 *
128 * The 2 locks should always be ordered so that the IO lock is
129 * obtained first in order to prevent deadlock.
130 *
131 * ip -- the inode being locked
132 * lock_flags -- this parameter indicates the inode's locks
133 * to be locked. It can be:
134 * XFS_IOLOCK_SHARED,
135 * XFS_IOLOCK_EXCL,
136 * XFS_ILOCK_SHARED,
137 * XFS_ILOCK_EXCL,
138 * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
139 * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
140 * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
141 * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
142 */
143void
144xfs_ilock(
145 xfs_inode_t *ip,
146 uint lock_flags)
147{
148 trace_xfs_ilock(ip, lock_flags, _RET_IP_);
149
150 /*
151 * You can't set both SHARED and EXCL for the same lock,
152 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
153 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
154 */
155 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
156 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
157 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
158 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
159 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
160
161 if (lock_flags & XFS_IOLOCK_EXCL)
162 mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
163 else if (lock_flags & XFS_IOLOCK_SHARED)
164 mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
165
166 if (lock_flags & XFS_ILOCK_EXCL)
167 mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
168 else if (lock_flags & XFS_ILOCK_SHARED)
169 mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
170}
171
172/*
173 * This is just like xfs_ilock(), except that the caller
174 * is guaranteed not to sleep. It returns 1 if it gets
175 * the requested locks and 0 otherwise. If the IO lock is
176 * obtained but the inode lock cannot be, then the IO lock
177 * is dropped before returning.
178 *
179 * ip -- the inode being locked
 180 *       lock_flags -- this parameter indicates the inode's locks
181 * to be locked. See the comment for xfs_ilock() for a list
182 * of valid values.
183 */
184int
185xfs_ilock_nowait(
186 xfs_inode_t *ip,
187 uint lock_flags)
188{
189 trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
190
191 /*
192 * You can't set both SHARED and EXCL for the same lock,
193 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
194 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
195 */
196 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
197 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
198 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
199 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
200 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
201
202 if (lock_flags & XFS_IOLOCK_EXCL) {
203 if (!mrtryupdate(&ip->i_iolock))
204 goto out;
205 } else if (lock_flags & XFS_IOLOCK_SHARED) {
206 if (!mrtryaccess(&ip->i_iolock))
207 goto out;
208 }
209 if (lock_flags & XFS_ILOCK_EXCL) {
210 if (!mrtryupdate(&ip->i_lock))
211 goto out_undo_iolock;
212 } else if (lock_flags & XFS_ILOCK_SHARED) {
213 if (!mrtryaccess(&ip->i_lock))
214 goto out_undo_iolock;
215 }
216 return 1;
217
218 out_undo_iolock:
219 if (lock_flags & XFS_IOLOCK_EXCL)
220 mrunlock_excl(&ip->i_iolock);
221 else if (lock_flags & XFS_IOLOCK_SHARED)
222 mrunlock_shared(&ip->i_iolock);
223 out:
224 return 0;
225}
226
227/*
228 * xfs_iunlock() is used to drop the inode locks acquired with
229 * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
230 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
231 * that we know which locks to drop.
232 *
233 * ip -- the inode being unlocked
 234 * lock_flags -- this parameter indicates the inode's locks
235 * to be unlocked. See the comment for xfs_ilock() for a list
236 * of valid values for this parameter.
237 *
238 */
239void
240xfs_iunlock(
241 xfs_inode_t *ip,
242 uint lock_flags)
243{
244 /*
245 * You can't set both SHARED and EXCL for the same lock,
246 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
247 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
248 */
249 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
250 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
251 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
252 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
253 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
254 ASSERT(lock_flags != 0);
255
256 if (lock_flags & XFS_IOLOCK_EXCL)
257 mrunlock_excl(&ip->i_iolock);
258 else if (lock_flags & XFS_IOLOCK_SHARED)
259 mrunlock_shared(&ip->i_iolock);
260
261 if (lock_flags & XFS_ILOCK_EXCL)
262 mrunlock_excl(&ip->i_lock);
263 else if (lock_flags & XFS_ILOCK_SHARED)
264 mrunlock_shared(&ip->i_lock);
265
266 trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
267}
268
269/*
270 * give up write locks. the i/o lock cannot be held nested
271 * if it is being demoted.
272 */
273void
274xfs_ilock_demote(
275 xfs_inode_t *ip,
276 uint lock_flags)
277{
278 ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
279 ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
280
281 if (lock_flags & XFS_ILOCK_EXCL)
282 mrdemote(&ip->i_lock);
283 if (lock_flags & XFS_IOLOCK_EXCL)
284 mrdemote(&ip->i_iolock);
285
286 trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
287}
288
289#ifdef DEBUG
290int
291xfs_isilocked(
292 xfs_inode_t *ip,
293 uint lock_flags)
294{
295 if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
296 if (!(lock_flags & XFS_ILOCK_SHARED))
297 return !!ip->i_lock.mr_writer;
298 return rwsem_is_locked(&ip->i_lock.mr_lock);
299 }
300
301 if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
302 if (!(lock_flags & XFS_IOLOCK_SHARED))
303 return !!ip->i_iolock.mr_writer;
304 return rwsem_is_locked(&ip->i_iolock.mr_lock);
305 }
306
307 ASSERT(0);
308 return 0;
309}
310#endif
311
312void
313__xfs_iflock(
314 struct xfs_inode *ip)
315{
316 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
317 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
318
319 do {
320 prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
321 if (xfs_isiflocked(ip))
322 io_schedule();
323 } while (!xfs_iflock_nowait(ip));
324
325 finish_wait(wq, &wait.wait);
326}
327
77#ifdef DEBUG 328#ifdef DEBUG
78/* 329/*
79 * Make sure that the extents in the given memory buffer 330 * Make sure that the extents in the given memory buffer
@@ -131,6 +382,65 @@ xfs_inobp_check(
131} 382}
132#endif 383#endif
133 384
385static void
386xfs_inode_buf_verify(
387 struct xfs_buf *bp)
388{
389 struct xfs_mount *mp = bp->b_target->bt_mount;
390 int i;
391 int ni;
392
393 /*
394 * Validate the magic number and version of every inode in the buffer
395 */
396 ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
397 for (i = 0; i < ni; i++) {
398 int di_ok;
399 xfs_dinode_t *dip;
400
401 dip = (struct xfs_dinode *)xfs_buf_offset(bp,
402 (i << mp->m_sb.sb_inodelog));
403 di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
404 XFS_DINODE_GOOD_VERSION(dip->di_version);
405 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
406 XFS_ERRTAG_ITOBP_INOTOBP,
407 XFS_RANDOM_ITOBP_INOTOBP))) {
408 xfs_buf_ioerror(bp, EFSCORRUPTED);
409 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
410 mp, dip);
411#ifdef DEBUG
412 xfs_emerg(mp,
413 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
414 (unsigned long long)bp->b_bn, i,
415 be16_to_cpu(dip->di_magic));
416 ASSERT(0);
417#endif
418 }
419 }
420 xfs_inobp_check(mp, bp);
421}
422
423
424static void
425xfs_inode_buf_read_verify(
426 struct xfs_buf *bp)
427{
428 xfs_inode_buf_verify(bp);
429}
430
431static void
432xfs_inode_buf_write_verify(
433 struct xfs_buf *bp)
434{
435 xfs_inode_buf_verify(bp);
436}
437
438const struct xfs_buf_ops xfs_inode_buf_ops = {
439 .verify_read = xfs_inode_buf_read_verify,
440 .verify_write = xfs_inode_buf_write_verify,
441};
442
443
134/* 444/*
135 * This routine is called to map an inode to the buffer containing the on-disk 445 * This routine is called to map an inode to the buffer containing the on-disk
136 * version of the inode. It returns a pointer to the buffer containing the 446 * version of the inode. It returns a pointer to the buffer containing the
@@ -145,71 +455,33 @@ xfs_imap_to_bp(
145 struct xfs_mount *mp, 455 struct xfs_mount *mp,
146 struct xfs_trans *tp, 456 struct xfs_trans *tp,
147 struct xfs_imap *imap, 457 struct xfs_imap *imap,
148 struct xfs_dinode **dipp, 458 struct xfs_dinode **dipp,
149 struct xfs_buf **bpp, 459 struct xfs_buf **bpp,
150 uint buf_flags, 460 uint buf_flags,
151 uint iget_flags) 461 uint iget_flags)
152{ 462{
153 struct xfs_buf *bp; 463 struct xfs_buf *bp;
154 int error; 464 int error;
155 int i;
156 int ni;
157 465
158 buf_flags |= XBF_UNMAPPED; 466 buf_flags |= XBF_UNMAPPED;
159 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, 467 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
160 (int)imap->im_len, buf_flags, &bp); 468 (int)imap->im_len, buf_flags, &bp,
469 &xfs_inode_buf_ops);
161 if (error) { 470 if (error) {
162 if (error != EAGAIN) { 471 if (error == EAGAIN) {
163 xfs_warn(mp,
164 "%s: xfs_trans_read_buf() returned error %d.",
165 __func__, error);
166 } else {
167 ASSERT(buf_flags & XBF_TRYLOCK); 472 ASSERT(buf_flags & XBF_TRYLOCK);
473 return error;
168 } 474 }
169 return error;
170 }
171
172 /*
173 * Validate the magic number and version of every inode in the buffer
174 * (if DEBUG kernel) or the first inode in the buffer, otherwise.
175 */
176#ifdef DEBUG
177 ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog;
178#else /* usual case */
179 ni = 1;
180#endif
181 475
182 for (i = 0; i < ni; i++) { 476 if (error == EFSCORRUPTED &&
183 int di_ok; 477 (iget_flags & XFS_IGET_UNTRUSTED))
184 xfs_dinode_t *dip; 478 return XFS_ERROR(EINVAL);
185 479
186 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 480 xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
187 (i << mp->m_sb.sb_inodelog)); 481 __func__, error);
188 di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && 482 return error;
189 XFS_DINODE_GOOD_VERSION(dip->di_version);
190 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
191 XFS_ERRTAG_ITOBP_INOTOBP,
192 XFS_RANDOM_ITOBP_INOTOBP))) {
193 if (iget_flags & XFS_IGET_UNTRUSTED) {
194 xfs_trans_brelse(tp, bp);
195 return XFS_ERROR(EINVAL);
196 }
197 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
198 mp, dip);
199#ifdef DEBUG
200 xfs_emerg(mp,
201 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
202 (unsigned long long)imap->im_blkno, i,
203 be16_to_cpu(dip->di_magic));
204 ASSERT(0);
205#endif
206 xfs_trans_brelse(tp, bp);
207 return XFS_ERROR(EFSCORRUPTED);
208 }
209 } 483 }
210 484
211 xfs_inobp_check(mp, bp);
212
213 *bpp = bp; 485 *bpp = bp;
214 *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset); 486 *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
215 return 0; 487 return 0;
@@ -853,16 +1125,16 @@ xfs_iread_extents(
853 * set according to the contents of the given cred structure. 1125 * set according to the contents of the given cred structure.
854 * 1126 *
855 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc() 1127 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
856 * has a free inode available, call xfs_iget() 1128 * has a free inode available, call xfs_iget() to obtain the in-core
857 * to obtain the in-core version of the allocated inode. Finally, 1129 * version of the allocated inode. Finally, fill in the inode and
858 * fill in the inode and log its initial contents. In this case, 1130 * log its initial contents. In this case, ialloc_context would be
859 * ialloc_context would be set to NULL and call_again set to false. 1131 * set to NULL.
860 * 1132 *
861 * If xfs_dialloc() does not have an available inode, 1133 * If xfs_dialloc() does not have an available inode, it will replenish
862 * it will replenish its supply by doing an allocation. Since we can 1134 * its supply by doing an allocation. Since we can only do one
863 * only do one allocation within a transaction without deadlocks, we 1135 * allocation within a transaction without deadlocks, we must commit
864 * must commit the current transaction before returning the inode itself. 1136 * the current transaction before returning the inode itself.
865 * In this case, therefore, we will set call_again to true and return. 1137 * In this case, therefore, we will set ialloc_context and return.
866 * The caller should then commit the current transaction, start a new 1138 * The caller should then commit the current transaction, start a new
867 * transaction, and call xfs_ialloc() again to actually get the inode. 1139 * transaction, and call xfs_ialloc() again to actually get the inode.
868 * 1140 *
@@ -1514,6 +1786,18 @@ xfs_ifree_cluster(
1514 1786
1515 if (!bp) 1787 if (!bp)
1516 return ENOMEM; 1788 return ENOMEM;
1789
1790 /*
1791 * This buffer may not have been correctly initialised as we
1792 * didn't read it from disk. That's not important because we are
 1793		 * only using it to mark the buffer as stale in the log, and to
 1794		 * attach stale cached inodes to it. That means it will never be
 1795		 * dispatched for IO. If it is, we want to know about it, and we
 1796		 * want it to fail. We can achieve this by adding a write
1797 * verifier to the buffer.
1798 */
1799 bp->b_ops = &xfs_inode_buf_ops;
1800
1517 /* 1801 /*
1518 * Walk the inodes already attached to the buffer and mark them 1802 * Walk the inodes already attached to the buffer and mark them
1519 * stale. These will all have the flush locks held, so an 1803 * stale. These will all have the flush locks held, so an
@@ -3661,3 +3945,40 @@ xfs_iext_irec_update_extoffs(
3661 ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; 3945 ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
3662 } 3946 }
3663} 3947}
3948
3949/*
 3950 * Test whether it is appropriate to check an inode for, and free, post-EOF
3951 * blocks. The 'force' parameter determines whether we should also consider
3952 * regular files that are marked preallocated or append-only.
3953 */
3954bool
3955xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
3956{
3957 /* prealloc/delalloc exists only on regular files */
3958 if (!S_ISREG(ip->i_d.di_mode))
3959 return false;
3960
3961 /*
 3962	 * Zero sized files with no cached pages and no delalloc blocks will not
3963 * have speculative prealloc/delalloc blocks to remove.
3964 */
3965 if (VFS_I(ip)->i_size == 0 &&
3966 VN_CACHED(VFS_I(ip)) == 0 &&
3967 ip->i_delayed_blks == 0)
3968 return false;
3969
3970 /* If we haven't read in the extent list, then don't do it now. */
3971 if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
3972 return false;
3973
3974 /*
3975 * Do not free real preallocated or append-only files unless the file
3976 * has delalloc blocks and we are forced to remove them.
3977 */
3978 if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
3979 if (!force || ip->i_delayed_blks == 0)
3980 return false;
3981
3982 return true;
3983}
3984
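
xfs_inode_buf_ops above is one instance of the new per-buffer verifier pattern: a const ops table with read and write callbacks, passed to xfs_trans_read_buf() at read time or attached via bp->b_ops for buffers that are initialised in memory (as xfs_ifree_cluster() now does). A hedged sketch of the shape for some other buffer type follows; the header struct, magic value, and names are hypothetical:

static void
xfs_foo_buf_verify(
	struct xfs_buf		*bp)
{
	struct xfs_foo_hdr	*hdr = bp->b_addr;	/* hypothetical */

	/* flag corruption on the buffer; callers see EFSCORRUPTED */
	if (hdr->magic != cpu_to_be32(XFS_FOO_MAGIC))
		xfs_buf_ioerror(bp, EFSCORRUPTED);
}

const struct xfs_buf_ops xfs_foo_buf_ops = {
	.verify_read = xfs_foo_buf_verify,
	.verify_write = xfs_foo_buf_verify,
};
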
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 94b32f906e79..22baf6ea4fac 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -496,11 +496,10 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
496 (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ 496 (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \
497 ((pip)->i_d.di_mode & S_ISGID)) 497 ((pip)->i_d.di_mode & S_ISGID))
498 498
499
499/* 500/*
500 * xfs_iget.c prototypes. 501 * xfs_inode.c prototypes.
501 */ 502 */
502int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
503 uint, uint, xfs_inode_t **);
504void xfs_ilock(xfs_inode_t *, uint); 503void xfs_ilock(xfs_inode_t *, uint);
505int xfs_ilock_nowait(xfs_inode_t *, uint); 504int xfs_ilock_nowait(xfs_inode_t *, uint);
506void xfs_iunlock(xfs_inode_t *, uint); 505void xfs_iunlock(xfs_inode_t *, uint);
@@ -508,11 +507,6 @@ void xfs_ilock_demote(xfs_inode_t *, uint);
508int xfs_isilocked(xfs_inode_t *, uint); 507int xfs_isilocked(xfs_inode_t *, uint);
509uint xfs_ilock_map_shared(xfs_inode_t *); 508uint xfs_ilock_map_shared(xfs_inode_t *);
510void xfs_iunlock_map_shared(xfs_inode_t *, uint); 509void xfs_iunlock_map_shared(xfs_inode_t *, uint);
511void xfs_inode_free(struct xfs_inode *ip);
512
513/*
514 * xfs_inode.c prototypes.
515 */
516int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, 510int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
517 xfs_nlink_t, xfs_dev_t, prid_t, int, 511 xfs_nlink_t, xfs_dev_t, prid_t, int,
518 struct xfs_buf **, xfs_inode_t **); 512 struct xfs_buf **, xfs_inode_t **);
@@ -591,6 +585,7 @@ void xfs_iext_irec_compact(xfs_ifork_t *);
591void xfs_iext_irec_compact_pages(xfs_ifork_t *); 585void xfs_iext_irec_compact_pages(xfs_ifork_t *);
592void xfs_iext_irec_compact_full(xfs_ifork_t *); 586void xfs_iext_irec_compact_full(xfs_ifork_t *);
593void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int); 587void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int);
588bool xfs_can_free_eofblocks(struct xfs_inode *, bool);
594 589
595#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) 590#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
596 591
@@ -603,5 +598,6 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
603extern struct kmem_zone *xfs_ifork_zone; 598extern struct kmem_zone *xfs_ifork_zone;
604extern struct kmem_zone *xfs_inode_zone; 599extern struct kmem_zone *xfs_inode_zone;
605extern struct kmem_zone *xfs_ili_zone; 600extern struct kmem_zone *xfs_ili_zone;
601extern const struct xfs_buf_ops xfs_inode_buf_ops;
606 602
607#endif /* __XFS_INODE_H__ */ 603#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index c1df3c623de2..c1c3ef88a260 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -42,6 +42,7 @@
42#include "xfs_inode_item.h" 42#include "xfs_inode_item.h"
43#include "xfs_export.h" 43#include "xfs_export.h"
44#include "xfs_trace.h" 44#include "xfs_trace.h"
45#include "xfs_icache.h"
45 46
46#include <linux/capability.h> 47#include <linux/capability.h>
47#include <linux/dcache.h> 48#include <linux/dcache.h>
@@ -1602,6 +1603,26 @@ xfs_file_ioctl(
1602 error = xfs_errortag_clearall(mp, 1); 1603 error = xfs_errortag_clearall(mp, 1);
1603 return -error; 1604 return -error;
1604 1605
1606 case XFS_IOC_FREE_EOFBLOCKS: {
1607 struct xfs_eofblocks eofb;
1608
1609 if (copy_from_user(&eofb, arg, sizeof(eofb)))
1610 return -XFS_ERROR(EFAULT);
1611
1612 if (eofb.eof_version != XFS_EOFBLOCKS_VERSION)
1613 return -XFS_ERROR(EINVAL);
1614
1615 if (eofb.eof_flags & ~XFS_EOF_FLAGS_VALID)
1616 return -XFS_ERROR(EINVAL);
1617
1618 if (memchr_inv(&eofb.pad32, 0, sizeof(eofb.pad32)) ||
1619 memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64)))
1620 return -XFS_ERROR(EINVAL);
1621
1622 error = xfs_icache_free_eofblocks(mp, &eofb);
1623 return -error;
1624 }
1625
1605 default: 1626 default:
1606 return -ENOTTY; 1627 return -ENOTTY;
1607 } 1628 }
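The validation above (version match, flag mask, zeroed padding) implies the following minimal userspace invocation. This is a sketch only: it assumes the struct and constants are exported through the XFS uapi headers, and that fd is a descriptor on a file in the target filesystem.

	#include <string.h>
	#include <sys/ioctl.h>
	#include <xfs/xfs.h>		/* assumed header exporting the ioctl */

	struct xfs_eofblocks eofb;

	memset(&eofb, 0, sizeof(eofb));	/* pad32/pad64 must be zero */
	eofb.eof_version = XFS_EOFBLOCKS_VERSION;
	eofb.eof_flags = 0;		/* only bits within XFS_EOF_FLAGS_VALID */
	if (ioctl(fd, XFS_IOC_FREE_EOFBLOCKS, &eofb) < 0)
		perror("XFS_IOC_FREE_EOFBLOCKS");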
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 7f537663365b..add06b4e9a63 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -41,6 +41,7 @@
41#include "xfs_utils.h" 41#include "xfs_utils.h"
42#include "xfs_iomap.h" 42#include "xfs_iomap.h"
43#include "xfs_trace.h" 43#include "xfs_trace.h"
44#include "xfs_icache.h"
44 45
45 46
46#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ 47#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
@@ -373,7 +374,7 @@ xfs_iomap_write_delay(
373 xfs_extlen_t extsz; 374 xfs_extlen_t extsz;
374 int nimaps; 375 int nimaps;
375 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; 376 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
376 int prealloc, flushed = 0; 377 int prealloc;
377 int error; 378 int error;
378 379
379 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 380 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -434,31 +435,29 @@ retry:
434 } 435 }
435 436
436 /* 437 /*
437 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For 438 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
438 * ENOSPC, * flush all other inodes with delalloc blocks to free up
439 * some of the excess reserved metadata space. For both cases, retry
440 * without EOF preallocation. 439 * without EOF preallocation.
441 */ 440 */
442 if (nimaps == 0) { 441 if (nimaps == 0) {
443 trace_xfs_delalloc_enospc(ip, offset, count); 442 trace_xfs_delalloc_enospc(ip, offset, count);
444 if (flushed) 443 if (prealloc) {
445 return XFS_ERROR(error ? error : ENOSPC); 444 prealloc = 0;
446 445 error = 0;
447 if (error == ENOSPC) { 446 goto retry;
448 xfs_iunlock(ip, XFS_ILOCK_EXCL);
449 xfs_flush_inodes(ip);
450 xfs_ilock(ip, XFS_ILOCK_EXCL);
451 } 447 }
452 448 return XFS_ERROR(error ? error : ENOSPC);
453 flushed = 1;
454 error = 0;
455 prealloc = 0;
456 goto retry;
457 } 449 }
458 450
459 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) 451 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
460 return xfs_alert_fsblock_zero(ip, &imap[0]); 452 return xfs_alert_fsblock_zero(ip, &imap[0]);
461 453
454 /*
455 * Tag the inode as speculatively preallocated so we can reclaim this
456 * space on demand, if necessary.
457 */
458 if (prealloc)
459 xfs_inode_set_eofblocks_tag(ip);
460
462 *ret_imap = imap[0]; 461 *ret_imap = imap[0];
463 return 0; 462 return 0;
464} 463}
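The rewritten error path above replaces the old flush-all-inodes loop with a single retry without EOF preallocation. Restated as a standalone fragment for clarity, where allocate_delalloc() is a hypothetical stand-in for the underlying bmapi delayed-allocation call:

	int prealloc = 1;	/* start with speculative preallocation */
retry:
	error = allocate_delalloc(ip, offset, count, prealloc, &nimaps);
	if (nimaps == 0) {
		if (prealloc) {	/* drop the speculation, try once more */
			prealloc = 0;
			error = 0;
			goto retry;
		}
		return error ? error : ENOSPC;
	}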
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 4e00cf091d2c..d82efaa2ac73 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -38,6 +38,7 @@
38#include "xfs_vnodeops.h" 38#include "xfs_vnodeops.h"
39#include "xfs_inode_item.h" 39#include "xfs_inode_item.h"
40#include "xfs_trace.h" 40#include "xfs_trace.h"
41#include "xfs_icache.h"
41 42
42#include <linux/capability.h> 43#include <linux/capability.h>
43#include <linux/xattr.h> 44#include <linux/xattr.h>
@@ -779,8 +780,8 @@ xfs_setattr_size(
779 * care about here. 780 * care about here.
780 */ 781 */
781 if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { 782 if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
782 error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0, 783 error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
783 FI_NONE); 784 ip->i_d.di_size, newsize);
784 if (error) 785 if (error)
785 goto out_unlock; 786 goto out_unlock;
786 } 787 }
@@ -854,6 +855,9 @@ xfs_setattr_size(
854 * and do not wait the usual (long) time for writeout. 855 * and do not wait the usual (long) time for writeout.
855 */ 856 */
856 xfs_iflags_set(ip, XFS_ITRUNCATED); 857 xfs_iflags_set(ip, XFS_ITRUNCATED);
858
859 /* A truncate down always removes post-EOF blocks. */
860 xfs_inode_clear_eofblocks_tag(ip);
857 } 861 }
858 862
859 if (mask & ATTR_CTIME) { 863 if (mask & ATTR_CTIME) {
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 01d10a66e302..2ea7d402188d 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -34,6 +34,7 @@
34#include "xfs_error.h" 34#include "xfs_error.h"
35#include "xfs_btree.h" 35#include "xfs_btree.h"
36#include "xfs_trace.h" 36#include "xfs_trace.h"
37#include "xfs_icache.h"
37 38
38STATIC int 39STATIC int
39xfs_internal_inum( 40xfs_internal_inum(
@@ -395,7 +396,8 @@ xfs_bulkstat(
395 if (xfs_inobt_maskn(chunkidx, nicluster) 396 if (xfs_inobt_maskn(chunkidx, nicluster)
396 & ~r.ir_free) 397 & ~r.ir_free)
397 xfs_btree_reada_bufs(mp, agno, 398 xfs_btree_reada_bufs(mp, agno,
398 agbno, nbcluster); 399 agbno, nbcluster,
400 &xfs_inode_buf_ops);
399 } 401 }
400 irbp->ir_startino = r.ir_startino; 402 irbp->ir_startino = r.ir_startino;
401 irbp->ir_freecount = r.ir_freecount; 403 irbp->ir_freecount = r.ir_freecount;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 828662f70d64..fe7e4df85a7b 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -44,6 +44,7 @@
44#include <linux/kernel.h> 44#include <linux/kernel.h>
45#include <linux/blkdev.h> 45#include <linux/blkdev.h>
46#include <linux/slab.h> 46#include <linux/slab.h>
47#include <linux/crc32c.h>
47#include <linux/module.h> 48#include <linux/module.h>
48#include <linux/mutex.h> 49#include <linux/mutex.h>
49#include <linux/file.h> 50#include <linux/file.h>
@@ -118,6 +119,7 @@
118#define xfs_rotorstep xfs_params.rotorstep.val 119#define xfs_rotorstep xfs_params.rotorstep.val
119#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val 120#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val
120#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val 121#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val
122#define xfs_eofb_secs xfs_params.eofb_timer.val
121 123
122#define current_cpu() (raw_smp_processor_id()) 124#define current_cpu() (raw_smp_processor_id())
123#define current_pid() (current->pid) 125#define current_pid() (current->pid)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 4dad756962d0..46bd9d52ab51 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -34,6 +34,8 @@
34#include "xfs_dinode.h" 34#include "xfs_dinode.h"
35#include "xfs_inode.h" 35#include "xfs_inode.h"
36#include "xfs_trace.h" 36#include "xfs_trace.h"
37#include "xfs_fsops.h"
38#include "xfs_cksum.h"
37 39
38kmem_zone_t *xfs_log_ticket_zone; 40kmem_zone_t *xfs_log_ticket_zone;
39 41
@@ -458,7 +460,8 @@ xfs_log_reserve(
458 tic->t_trans_type = t_type; 460 tic->t_trans_type = t_type;
459 *ticp = tic; 461 *ticp = tic;
460 462
461 xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt); 463 xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
464 : tic->t_unit_res);
462 465
463 trace_xfs_log_reserve(log, tic); 466 trace_xfs_log_reserve(log, tic);
464 467
@@ -679,25 +682,29 @@ out:
679} 682}
680 683
681/* 684/*
682 * Finish the recovery of the file system. This is separate from 685 * Finish the recovery of the file system. This is separate from the
683 * the xfs_log_mount() call, because it depends on the code in 686 * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read
684 * xfs_mountfs() to read in the root and real-time bitmap inodes 687 * in the root and real-time bitmap inodes between calling xfs_log_mount() and
685 * between calling xfs_log_mount() and here. 688 * here.
686 * 689 *
687 * mp - ubiquitous xfs mount point structure 690 * If we finish recovery successfully, start the background log work. If we are
691 * not doing recovery, then we have a RO filesystem and we don't need to start
692 * it.
688 */ 693 */
689int 694int
690xfs_log_mount_finish(xfs_mount_t *mp) 695xfs_log_mount_finish(xfs_mount_t *mp)
691{ 696{
692 int error; 697 int error = 0;
693 698
694 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) 699 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
695 error = xlog_recover_finish(mp->m_log); 700 error = xlog_recover_finish(mp->m_log);
696 else { 701 if (!error)
697 error = 0; 702 xfs_log_work_queue(mp);
703 } else {
698 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 704 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
699 } 705 }
700 706
707
701 return error; 708 return error;
702} 709}
703 710
@@ -850,15 +857,49 @@ xfs_log_unmount_write(xfs_mount_t *mp)
850} /* xfs_log_unmount_write */ 857} /* xfs_log_unmount_write */
851 858
852/* 859/*
853 * Deallocate log structures for unmount/relocation. 860 * Empty the log for unmount/freeze.
861 *
862 * To do this, we first need to shut down the background log work so it is not
863 * trying to cover the log as we clean up. We then need to unpin all objects in
864 * the log so we can then flush them out. Once they have completed their IO and
865 * run the callbacks removing themselves from the AIL, we can write the unmount
866 * record.
867 */
868void
869xfs_log_quiesce(
870 struct xfs_mount *mp)
871{
872 cancel_delayed_work_sync(&mp->m_log->l_work);
873 xfs_log_force(mp, XFS_LOG_SYNC);
874
875 /*
876 * The superblock buffer is uncached and while xfs_ail_push_all_sync()
877 * will push it, xfs_wait_buftarg() will not wait for it. Further,
878 * xfs_buf_iowait() cannot be used because it was pushed with the
879 * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for
880 * the IO to complete.
881 */
882 xfs_ail_push_all_sync(mp->m_ail);
883 xfs_wait_buftarg(mp->m_ddev_targp);
884 xfs_buf_lock(mp->m_sb_bp);
885 xfs_buf_unlock(mp->m_sb_bp);
886
887 xfs_log_unmount_write(mp);
888}
889
890/*
891 * Shut down and release the AIL and Log.
854 * 892 *
855 * We need to stop the aild from running before we destroy 893 * During unmount, we need to ensure we flush all the dirty metadata objects
856 * and deallocate the log as the aild references the log. 894 * from the AIL so that the log is empty before we write the unmount record to
895 * the log. Once this is done, we can tear down the AIL and the log.
857 */ 896 */
858void 897void
859xfs_log_unmount(xfs_mount_t *mp) 898xfs_log_unmount(
899 struct xfs_mount *mp)
860{ 900{
861 cancel_delayed_work_sync(&mp->m_sync_work); 901 xfs_log_quiesce(mp);
902
862 xfs_trans_ail_destroy(mp); 903 xfs_trans_ail_destroy(mp);
863 xlog_dealloc_log(mp->m_log); 904 xlog_dealloc_log(mp->m_log);
864} 905}
@@ -1090,8 +1131,7 @@ xlog_iodone(xfs_buf_t *bp)
1090 * with it being freed after writing the unmount record to the 1131 * with it being freed after writing the unmount record to the
1091 * log. 1132 * log.
1092 */ 1133 */
1093 1134}
1094} /* xlog_iodone */
1095 1135
1096/* 1136/*
1097 * Return size of each in-core log record buffer. 1137 * Return size of each in-core log record buffer.
@@ -1161,6 +1201,40 @@ done:
1161} /* xlog_get_iclog_buffer_size */ 1201} /* xlog_get_iclog_buffer_size */
1162 1202
1163 1203
1204void
1205xfs_log_work_queue(
1206 struct xfs_mount *mp)
1207{
1208 queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work,
1209 msecs_to_jiffies(xfs_syncd_centisecs * 10));
1210}
1211
1212/*
1213 * Every sync period we need to unpin all items in the AIL and push them to
1214 * disk. If there is nothing dirty, then we might need to cover the log to
1215 * indicate that the filesystem is idle.
1216 */
1217void
1218xfs_log_worker(
1219 struct work_struct *work)
1220{
1221 struct xlog *log = container_of(to_delayed_work(work),
1222 struct xlog, l_work);
1223 struct xfs_mount *mp = log->l_mp;
1224
1225 /* dgc: errors ignored - not fatal and nowhere to report them */
1226 if (xfs_log_need_covered(mp))
1227 xfs_fs_log_dummy(mp);
1228 else
1229 xfs_log_force(mp, 0);
1230
1231 /* start pushing all the metadata that is currently dirty */
1232 xfs_ail_push_all(mp->m_ail);
1233
1234 /* queue us up again */
1235 xfs_log_work_queue(mp);
1236}
1237
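xfs_log_worker() above is the standard self-rearming delayed-work pattern: the work function re-queues itself at the end of each pass. A generic sketch of the same pattern (names are illustrative, not from this series):

	#include <linux/workqueue.h>

	static struct workqueue_struct *wq;	/* e.g. mp->m_log_workqueue */
	static struct delayed_work dwork;	/* e.g. log->l_work */

	static void worker(struct work_struct *work)
	{
		/* ... periodic cover/force/push work ... */
		queue_delayed_work(wq, &dwork, msecs_to_jiffies(30 * 1000));
	}

	/* setup: INIT_DELAYED_WORK(&dwork, worker); then queue it once. */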
1164/* 1238/*
1165 * This routine initializes some of the log structure for a given mount point. 1239 * This routine initializes some of the log structure for a given mount point.
1166 * Its primary purpose is to fill in enough, so recovery can occur. However, 1240 * Its primary purpose is to fill in enough, so recovery can occur. However,
@@ -1195,6 +1269,7 @@ xlog_alloc_log(
1195 log->l_logBBsize = num_bblks; 1269 log->l_logBBsize = num_bblks;
1196 log->l_covered_state = XLOG_STATE_COVER_IDLE; 1270 log->l_covered_state = XLOG_STATE_COVER_IDLE;
1197 log->l_flags |= XLOG_ACTIVE_RECOVERY; 1271 log->l_flags |= XLOG_ACTIVE_RECOVERY;
1272 INIT_DELAYED_WORK(&log->l_work, xfs_log_worker);
1198 1273
1199 log->l_prev_block = -1; 1274 log->l_prev_block = -1;
1200 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ 1275 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
@@ -1417,6 +1492,84 @@ xlog_grant_push_ail(
1417} 1492}
1418 1493
1419/* 1494/*
1495 * Stamp cycle number in every block
1496 */
1497STATIC void
1498xlog_pack_data(
1499 struct xlog *log,
1500 struct xlog_in_core *iclog,
1501 int roundoff)
1502{
1503 int i, j, k;
1504 int size = iclog->ic_offset + roundoff;
1505 __be32 cycle_lsn;
1506 xfs_caddr_t dp;
1507
1508 cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
1509
1510 dp = iclog->ic_datap;
1511 for (i = 0; i < BTOBB(size); i++) {
1512 if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE))
1513 break;
1514 iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
1515 *(__be32 *)dp = cycle_lsn;
1516 dp += BBSIZE;
1517 }
1518
1519 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1520 xlog_in_core_2_t *xhdr = iclog->ic_data;
1521
1522 for ( ; i < BTOBB(size); i++) {
1523 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
1524 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
1525 xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
1526 *(__be32 *)dp = cycle_lsn;
1527 dp += BBSIZE;
1528 }
1529
1530 for (i = 1; i < log->l_iclog_heads; i++)
1531 xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
1532 }
1533}
1534
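The stamping above pairs with xlog_unpack_data() in recovery: pack saves the first word of each 512-byte block into h_cycle_data[] (or the extended headers for v2 logs) and overwrites it with the cycle LSN, so recovery can tell which blocks belong to the current cycle; unpack restores the saved words. As a standalone illustration, not kernel code:

	/* pack: save each block's first word, stamp the cycle */
	for (i = 0; i < nblocks; i++) {
		saved[i] = first_word(blk[i]);	/* -> h_cycle_data[i] */
		first_word(blk[i]) = cycle_lsn;
	}
	/* unpack (recovery): put the original words back */
	for (i = 0; i < nblocks; i++)
		first_word(blk[i]) = saved[i];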
1535/*
1536 * Calculate the checksum for a log buffer.
1537 *
1538 * This is a little more complicated than it should be because the various
1539 * headers and the actual data are non-contiguous.
1540 */
1541__le32
1542xlog_cksum(
1543 struct xlog *log,
1544 struct xlog_rec_header *rhead,
1545 char *dp,
1546 int size)
1547{
1548 __uint32_t crc;
1549
1550 /* first generate the crc for the record header ... */
1551 crc = xfs_start_cksum((char *)rhead,
1552 sizeof(struct xlog_rec_header),
1553 offsetof(struct xlog_rec_header, h_crc));
1554
1555 /* ... then for additional cycle data for v2 logs ... */
1556 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1557 union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead;
1558 int i;
1559
1560 for (i = 1; i < log->l_iclog_heads; i++) {
1561 crc = crc32c(crc, &xhdr[i].hic_xheader,
1562 sizeof(struct xlog_rec_ext_header));
1563 }
1564 }
1565
1566 /* ... and finally for the payload */
1567 crc = crc32c(crc, dp, size);
1568
1569 return xfs_end_cksum(crc);
1570}
1571
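xfs_start_cksum() and xfs_end_cksum() come from the "add CRC infrastructure" patch in this series. Their likely shape, assuming the conventional crc32c seed-and-invert scheme with the CRC field itself treated as zero (an assumption; the authoritative versions live in xfs_cksum.h):

	#define XFS_CRC_SEED	(~(__uint32_t)0)

	static inline __uint32_t
	xfs_start_cksum(char *buffer, size_t length, unsigned long crc_offset)
	{
		__uint32_t zero = 0;
		__uint32_t crc;

		/* CRC up to the checksum field, the field as if zero, the rest */
		crc = crc32c(XFS_CRC_SEED, buffer, crc_offset);
		crc = crc32c(crc, &zero, sizeof(__u32));
		return crc32c(crc, &buffer[crc_offset + sizeof(__u32)],
			      length - crc_offset - sizeof(__u32));
	}

	static inline __le32 xfs_end_cksum(__uint32_t crc)
	{
		return ~cpu_to_le32(crc);
	}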
1572/*
1420 * The bdstrat callback function for log bufs. This gives us a central 1573 * The bdstrat callback function for log bufs. This gives us a central
1421 * place to trap bufs in case we get hit by a log I/O error and need to 1574 * place to trap bufs in case we get hit by a log I/O error and need to
1422 * shutdown. Actually, in practice, even when we didn't get a log error, 1575 * shutdown. Actually, in practice, even when we didn't get a log error,
@@ -1476,7 +1629,6 @@ xlog_sync(
1476 struct xlog *log, 1629 struct xlog *log,
1477 struct xlog_in_core *iclog) 1630 struct xlog_in_core *iclog)
1478{ 1631{
1479 xfs_caddr_t dptr; /* pointer to byte sized element */
1480 xfs_buf_t *bp; 1632 xfs_buf_t *bp;
1481 int i; 1633 int i;
1482 uint count; /* byte count of bwrite */ 1634 uint count; /* byte count of bwrite */
@@ -1485,6 +1637,7 @@ xlog_sync(
1485 int split = 0; /* split write into two regions */ 1637 int split = 0; /* split write into two regions */
1486 int error; 1638 int error;
1487 int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); 1639 int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
1640 int size;
1488 1641
1489 XFS_STATS_INC(xs_log_writes); 1642 XFS_STATS_INC(xs_log_writes);
1490 ASSERT(atomic_read(&iclog->ic_refcnt) == 0); 1643 ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
@@ -1515,13 +1668,10 @@ xlog_sync(
1515 xlog_pack_data(log, iclog, roundoff); 1668 xlog_pack_data(log, iclog, roundoff);
1516 1669
1517 /* real byte length */ 1670 /* real byte length */
1518 if (v2) { 1671 size = iclog->ic_offset;
1519 iclog->ic_header.h_len = 1672 if (v2)
1520 cpu_to_be32(iclog->ic_offset + roundoff); 1673 size += roundoff;
1521 } else { 1674 iclog->ic_header.h_len = cpu_to_be32(size);
1522 iclog->ic_header.h_len =
1523 cpu_to_be32(iclog->ic_offset);
1524 }
1525 1675
1526 bp = iclog->ic_bp; 1676 bp = iclog->ic_bp;
1527 XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); 1677 XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
@@ -1530,12 +1680,36 @@ xlog_sync(
1530 1680
1531 /* Do we need to split this write into 2 parts? */ 1681 /* Do we need to split this write into 2 parts? */
1532 if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { 1682 if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
1683 char *dptr;
1684
1533 split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp))); 1685 split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)));
1534 count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)); 1686 count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp));
1535 iclog->ic_bwritecnt = 2; /* split into 2 writes */ 1687 iclog->ic_bwritecnt = 2;
1688
1689 /*
1690 * Bump the cycle numbers at the start of each block in the
1691 * part of the iclog that ends up in the buffer that gets
1692 * written to the start of the log.
1693 *
1694 * Watch out for the header magic number case, though.
1695 */
1696 dptr = (char *)&iclog->ic_header + count;
1697 for (i = 0; i < split; i += BBSIZE) {
1698 __uint32_t cycle = be32_to_cpu(*(__be32 *)dptr);
1699 if (++cycle == XLOG_HEADER_MAGIC_NUM)
1700 cycle++;
1701 *(__be32 *)dptr = cpu_to_be32(cycle);
1702
1703 dptr += BBSIZE;
1704 }
1536 } else { 1705 } else {
1537 iclog->ic_bwritecnt = 1; 1706 iclog->ic_bwritecnt = 1;
1538 } 1707 }
1708
 1709 /* calculate the checksum */
1710 iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
1711 iclog->ic_datap, size);
1712
1539 bp->b_io_length = BTOBB(count); 1713 bp->b_io_length = BTOBB(count);
1540 bp->b_fspriv = iclog; 1714 bp->b_fspriv = iclog;
1541 XFS_BUF_ZEROFLAGS(bp); 1715 XFS_BUF_ZEROFLAGS(bp);
@@ -1589,19 +1763,6 @@ xlog_sync(
1589 bp->b_flags |= XBF_SYNCIO; 1763 bp->b_flags |= XBF_SYNCIO;
1590 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) 1764 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
1591 bp->b_flags |= XBF_FUA; 1765 bp->b_flags |= XBF_FUA;
1592 dptr = bp->b_addr;
1593 /*
1594 * Bump the cycle numbers at the start of each block
1595 * since this part of the buffer is at the start of
1596 * a new cycle. Watch out for the header magic number
1597 * case, though.
1598 */
1599 for (i = 0; i < split; i += BBSIZE) {
1600 be32_add_cpu((__be32 *)dptr, 1);
1601 if (be32_to_cpu(*(__be32 *)dptr) == XLOG_HEADER_MAGIC_NUM)
1602 be32_add_cpu((__be32 *)dptr, 1);
1603 dptr += BBSIZE;
1604 }
1605 1766
1606 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); 1767 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
1607 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); 1768 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
@@ -1618,7 +1779,6 @@ xlog_sync(
1618 return 0; 1779 return 0;
1619} /* xlog_sync */ 1780} /* xlog_sync */
1620 1781
1621
1622/* 1782/*
1623 * Deallocate a log structure 1783 * Deallocate a log structure
1624 */ 1784 */
@@ -3713,3 +3873,4 @@ xlog_iclogs_empty(
3713 } while (iclog != log->l_iclog); 3873 } while (iclog != log->l_iclog);
3714 return 1; 3874 return 1;
3715} 3875}
3876
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 748d312850e2..5caee96059df 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -181,5 +181,9 @@ int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
181 xfs_lsn_t *commit_lsn, int flags); 181 xfs_lsn_t *commit_lsn, int flags);
182bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); 182bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
183 183
184void xfs_log_work_queue(struct xfs_mount *mp);
185void xfs_log_worker(struct work_struct *work);
186void xfs_log_quiesce(struct xfs_mount *mp);
187
184#endif 188#endif
185#endif /* __XFS_LOG_H__ */ 189#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 18a801d76a42..16d8d12ea3b4 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -139,7 +139,6 @@ static inline uint xlog_get_client_id(__be32 i)
139/* 139/*
140 * Flags for log structure 140 * Flags for log structure
141 */ 141 */
142#define XLOG_CHKSUM_MISMATCH 0x1 /* used only during recovery */
143#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ 142#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */
144#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ 143#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
145#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being 144#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
@@ -291,7 +290,7 @@ typedef struct xlog_rec_header {
291 __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */ 290 __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */
292 __be64 h_lsn; /* lsn of this LR : 8 */ 291 __be64 h_lsn; /* lsn of this LR : 8 */
293 __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ 292 __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */
294 __be32 h_chksum; /* may not be used; non-zero if used : 4 */ 293 __le32 h_crc; /* crc of log record : 4 */
295 __be32 h_prev_block; /* block number to previous LR : 4 */ 294 __be32 h_prev_block; /* block number to previous LR : 4 */
296 __be32 h_num_logops; /* number of log operations in this LR : 4 */ 295 __be32 h_num_logops; /* number of log operations in this LR : 4 */
297 __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; 296 __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
@@ -495,6 +494,7 @@ struct xlog {
495 struct xfs_buf *l_xbuf; /* extra buffer for log 494 struct xfs_buf *l_xbuf; /* extra buffer for log
496 * wrapping */ 495 * wrapping */
497 struct xfs_buftarg *l_targ; /* buftarg of log */ 496 struct xfs_buftarg *l_targ; /* buftarg of log */
497 struct delayed_work l_work; /* background flush work */
498 uint l_flags; 498 uint l_flags;
499 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ 499 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
500 struct list_head *l_buf_cancel_table; 500 struct list_head *l_buf_cancel_table;
@@ -554,11 +554,9 @@ xlog_recover(
554extern int 554extern int
555xlog_recover_finish( 555xlog_recover_finish(
556 struct xlog *log); 556 struct xlog *log);
557extern void 557
558xlog_pack_data( 558extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
559 struct xlog *log, 559 char *dp, int size);
560 struct xlog_in_core *iclog,
561 int);
562 560
563extern kmem_zone_t *xfs_log_ticket_zone; 561extern kmem_zone_t *xfs_log_ticket_zone;
564struct xlog_ticket * 562struct xlog_ticket *
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index d308749fabf1..96fcbb85ff83 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -41,7 +41,9 @@
41#include "xfs_trans_priv.h" 41#include "xfs_trans_priv.h"
42#include "xfs_quota.h" 42#include "xfs_quota.h"
43#include "xfs_utils.h" 43#include "xfs_utils.h"
44#include "xfs_cksum.h"
44#include "xfs_trace.h" 45#include "xfs_trace.h"
46#include "xfs_icache.h"
45 47
46STATIC int 48STATIC int
47xlog_find_zeroed( 49xlog_find_zeroed(
@@ -2143,7 +2145,7 @@ xlog_recover_buffer_pass2(
2143 buf_flags |= XBF_UNMAPPED; 2145 buf_flags |= XBF_UNMAPPED;
2144 2146
2145 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, 2147 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2146 buf_flags); 2148 buf_flags, NULL);
2147 if (!bp) 2149 if (!bp)
2148 return XFS_ERROR(ENOMEM); 2150 return XFS_ERROR(ENOMEM);
2149 error = bp->b_error; 2151 error = bp->b_error;
@@ -2236,7 +2238,8 @@ xlog_recover_inode_pass2(
2236 } 2238 }
2237 trace_xfs_log_recover_inode_recover(log, in_f); 2239 trace_xfs_log_recover_inode_recover(log, in_f);
2238 2240
2239 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0); 2241 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
2242 NULL);
2240 if (!bp) { 2243 if (!bp) {
2241 error = ENOMEM; 2244 error = ENOMEM;
2242 goto error; 2245 goto error;
@@ -2547,7 +2550,8 @@ xlog_recover_dquot_pass2(
2547 ASSERT(dq_f->qlf_len == 1); 2550 ASSERT(dq_f->qlf_len == 1);
2548 2551
2549 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, 2552 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
2550 XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp); 2553 XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
2554 NULL);
2551 if (error) 2555 if (error)
2552 return error; 2556 return error;
2553 2557
@@ -3213,80 +3217,58 @@ xlog_recover_process_iunlinks(
3213 mp->m_dmevmask = mp_dmevmask; 3217 mp->m_dmevmask = mp_dmevmask;
3214} 3218}
3215 3219
3216
3217#ifdef DEBUG
3218STATIC void
3219xlog_pack_data_checksum(
3220 struct xlog *log,
3221 struct xlog_in_core *iclog,
3222 int size)
3223{
3224 int i;
3225 __be32 *up;
3226 uint chksum = 0;
3227
3228 up = (__be32 *)iclog->ic_datap;
3229 /* divide length by 4 to get # words */
3230 for (i = 0; i < (size >> 2); i++) {
3231 chksum ^= be32_to_cpu(*up);
3232 up++;
3233 }
3234 iclog->ic_header.h_chksum = cpu_to_be32(chksum);
3235}
3236#else
3237#define xlog_pack_data_checksum(log, iclog, size)
3238#endif
3239
3240/* 3220/*
3241 * Stamp cycle number in every block 3221 * Unpack the log buffer data and crc check it. If the check fails, issue a
3222 * warning if and only if the CRC in the header is non-zero. This makes the
3223 * check an advisory warning, and the zero CRC check will prevent failure
3224 * warnings from being emitted when upgrading the kernel from one that does not
3225 * add CRCs by default.
3226 *
3227 * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
 3228 * corruption failure.
3242 */ 3229 */
3243void 3230STATIC int
3244xlog_pack_data( 3231xlog_unpack_data_crc(
3245 struct xlog *log, 3232 struct xlog_rec_header *rhead,
3246 struct xlog_in_core *iclog, 3233 xfs_caddr_t dp,
3247 int roundoff) 3234 struct xlog *log)
3248{ 3235{
3249 int i, j, k; 3236 __le32 crc;
3250 int size = iclog->ic_offset + roundoff; 3237
3251 __be32 cycle_lsn; 3238 crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
3252 xfs_caddr_t dp; 3239 if (crc != rhead->h_crc) {
3253 3240 if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
3254 xlog_pack_data_checksum(log, iclog, size); 3241 xfs_alert(log->l_mp,
3255 3242 "log record CRC mismatch: found 0x%x, expected 0x%x.\n",
3256 cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); 3243 le32_to_cpu(rhead->h_crc),
3257 3244 le32_to_cpu(crc));
3258 dp = iclog->ic_datap; 3245 xfs_hex_dump(dp, 32);
3259 for (i = 0; i < BTOBB(size) &&
3260 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3261 iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
3262 *(__be32 *)dp = cycle_lsn;
3263 dp += BBSIZE;
3264 }
3265
3266 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3267 xlog_in_core_2_t *xhdr = iclog->ic_data;
3268
3269 for ( ; i < BTOBB(size); i++) {
3270 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3271 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3272 xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
3273 *(__be32 *)dp = cycle_lsn;
3274 dp += BBSIZE;
3275 } 3246 }
3276 3247
3277 for (i = 1; i < log->l_iclog_heads; i++) { 3248 /*
3278 xhdr[i].hic_xheader.xh_cycle = cycle_lsn; 3249 * If we've detected a log record corruption, then we can't
3279 } 3250 * recover past this point. Abort recovery if we are enforcing
3251 * CRC protection by punting an error back up the stack.
3252 */
3253 if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
3254 return EFSCORRUPTED;
3280 } 3255 }
3256
3257 return 0;
3281} 3258}
3282 3259
3283STATIC void 3260STATIC int
3284xlog_unpack_data( 3261xlog_unpack_data(
3285 struct xlog_rec_header *rhead, 3262 struct xlog_rec_header *rhead,
3286 xfs_caddr_t dp, 3263 xfs_caddr_t dp,
3287 struct xlog *log) 3264 struct xlog *log)
3288{ 3265{
3289 int i, j, k; 3266 int i, j, k;
3267 int error;
3268
3269 error = xlog_unpack_data_crc(rhead, dp, log);
3270 if (error)
3271 return error;
3290 3272
3291 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && 3273 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
3292 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { 3274 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
@@ -3303,6 +3285,8 @@ xlog_unpack_data(
3303 dp += BBSIZE; 3285 dp += BBSIZE;
3304 } 3286 }
3305 } 3287 }
3288
3289 return 0;
3306} 3290}
3307 3291
3308STATIC int 3292STATIC int
@@ -3434,9 +3418,13 @@ xlog_do_recovery_pass(
3434 if (error) 3418 if (error)
3435 goto bread_err2; 3419 goto bread_err2;
3436 3420
3437 xlog_unpack_data(rhead, offset, log); 3421 error = xlog_unpack_data(rhead, offset, log);
3438 if ((error = xlog_recover_process_data(log, 3422 if (error)
3439 rhash, rhead, offset, pass))) 3423 goto bread_err2;
3424
3425 error = xlog_recover_process_data(log,
3426 rhash, rhead, offset, pass);
3427 if (error)
3440 goto bread_err2; 3428 goto bread_err2;
3441 blk_no += bblks + hblks; 3429 blk_no += bblks + hblks;
3442 } 3430 }
@@ -3546,9 +3534,14 @@ xlog_do_recovery_pass(
3546 if (error) 3534 if (error)
3547 goto bread_err2; 3535 goto bread_err2;
3548 } 3536 }
3549 xlog_unpack_data(rhead, offset, log); 3537
3550 if ((error = xlog_recover_process_data(log, rhash, 3538 error = xlog_unpack_data(rhead, offset, log);
3551 rhead, offset, pass))) 3539 if (error)
3540 goto bread_err2;
3541
3542 error = xlog_recover_process_data(log, rhash,
3543 rhead, offset, pass);
3544 if (error)
3552 goto bread_err2; 3545 goto bread_err2;
3553 blk_no += bblks; 3546 blk_no += bblks;
3554 } 3547 }
@@ -3573,9 +3566,13 @@ xlog_do_recovery_pass(
3573 if (error) 3566 if (error)
3574 goto bread_err2; 3567 goto bread_err2;
3575 3568
3576 xlog_unpack_data(rhead, offset, log); 3569 error = xlog_unpack_data(rhead, offset, log);
3577 if ((error = xlog_recover_process_data(log, rhash, 3570 if (error)
3578 rhead, offset, pass))) 3571 goto bread_err2;
3572
3573 error = xlog_recover_process_data(log, rhash,
3574 rhead, offset, pass);
3575 if (error)
3579 goto bread_err2; 3576 goto bread_err2;
3580 blk_no += bblks + hblks; 3577 blk_no += bblks + hblks;
3581 } 3578 }
@@ -3689,13 +3686,14 @@ xlog_do_recover(
3689 3686
3690 /* 3687 /*
3691 * Now that we've finished replaying all buffer and inode 3688 * Now that we've finished replaying all buffer and inode
3692 * updates, re-read in the superblock. 3689 * updates, re-read in the superblock and reverify it.
3693 */ 3690 */
3694 bp = xfs_getsb(log->l_mp, 0); 3691 bp = xfs_getsb(log->l_mp, 0);
3695 XFS_BUF_UNDONE(bp); 3692 XFS_BUF_UNDONE(bp);
3696 ASSERT(!(XFS_BUF_ISWRITE(bp))); 3693 ASSERT(!(XFS_BUF_ISWRITE(bp)));
3697 XFS_BUF_READ(bp); 3694 XFS_BUF_READ(bp);
3698 XFS_BUF_UNASYNC(bp); 3695 XFS_BUF_UNASYNC(bp);
3696 bp->b_ops = &xfs_sb_buf_ops;
3699 xfsbdstrat(log->l_mp, bp); 3697 xfsbdstrat(log->l_mp, bp);
3700 error = xfs_buf_iowait(bp); 3698 error = xfs_buf_iowait(bp);
3701 if (error) { 3699 if (error) {
@@ -3707,7 +3705,7 @@ xlog_do_recover(
3707 3705
3708 /* Convert superblock from on-disk format */ 3706 /* Convert superblock from on-disk format */
3709 sbp = &log->l_mp->m_sb; 3707 sbp = &log->l_mp->m_sb;
3710 xfs_sb_from_disk(log->l_mp, XFS_BUF_TO_SBP(bp)); 3708 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
3711 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); 3709 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
3712 ASSERT(xfs_sb_good_version(sbp)); 3710 ASSERT(xfs_sb_good_version(sbp));
3713 xfs_buf_relse(bp); 3711 xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b2bd3a0e6376..da508463ff10 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -42,6 +42,7 @@
42#include "xfs_fsops.h" 42#include "xfs_fsops.h"
43#include "xfs_utils.h" 43#include "xfs_utils.h"
44#include "xfs_trace.h" 44#include "xfs_trace.h"
45#include "xfs_icache.h"
45 46
46 47
47#ifdef HAVE_PERCPU_SB 48#ifdef HAVE_PERCPU_SB
@@ -303,9 +304,8 @@ STATIC int
303xfs_mount_validate_sb( 304xfs_mount_validate_sb(
304 xfs_mount_t *mp, 305 xfs_mount_t *mp,
305 xfs_sb_t *sbp, 306 xfs_sb_t *sbp,
306 int flags) 307 bool check_inprogress)
307{ 308{
308 int loud = !(flags & XFS_MFSI_QUIET);
309 309
310 /* 310 /*
311 * If the log device and data device have the 311 * If the log device and data device have the
@@ -315,21 +315,18 @@ xfs_mount_validate_sb(
315 * a volume filesystem in a non-volume manner. 315 * a volume filesystem in a non-volume manner.
316 */ 316 */
317 if (sbp->sb_magicnum != XFS_SB_MAGIC) { 317 if (sbp->sb_magicnum != XFS_SB_MAGIC) {
318 if (loud) 318 xfs_warn(mp, "bad magic number");
319 xfs_warn(mp, "bad magic number");
320 return XFS_ERROR(EWRONGFS); 319 return XFS_ERROR(EWRONGFS);
321 } 320 }
322 321
323 if (!xfs_sb_good_version(sbp)) { 322 if (!xfs_sb_good_version(sbp)) {
324 if (loud) 323 xfs_warn(mp, "bad version");
325 xfs_warn(mp, "bad version");
326 return XFS_ERROR(EWRONGFS); 324 return XFS_ERROR(EWRONGFS);
327 } 325 }
328 326
329 if (unlikely( 327 if (unlikely(
330 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { 328 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
331 if (loud) 329 xfs_warn(mp,
332 xfs_warn(mp,
333 "filesystem is marked as having an external log; " 330 "filesystem is marked as having an external log; "
334 "specify logdev on the mount command line."); 331 "specify logdev on the mount command line.");
335 return XFS_ERROR(EINVAL); 332 return XFS_ERROR(EINVAL);
@@ -337,8 +334,7 @@ xfs_mount_validate_sb(
337 334
338 if (unlikely( 335 if (unlikely(
339 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { 336 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
340 if (loud) 337 xfs_warn(mp,
341 xfs_warn(mp,
342 "filesystem is marked as having an internal log; " 338 "filesystem is marked as having an internal log; "
343 "do not specify logdev on the mount command line."); 339 "do not specify logdev on the mount command line.");
344 return XFS_ERROR(EINVAL); 340 return XFS_ERROR(EINVAL);
@@ -372,8 +368,7 @@ xfs_mount_validate_sb(
372 sbp->sb_dblocks == 0 || 368 sbp->sb_dblocks == 0 ||
373 sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) || 369 sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) ||
374 sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) { 370 sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) {
375 if (loud) 371 XFS_CORRUPTION_ERROR("SB sanity check failed",
376 XFS_CORRUPTION_ERROR("SB sanity check failed",
377 XFS_ERRLEVEL_LOW, mp, sbp); 372 XFS_ERRLEVEL_LOW, mp, sbp);
378 return XFS_ERROR(EFSCORRUPTED); 373 return XFS_ERROR(EFSCORRUPTED);
379 } 374 }
@@ -382,12 +377,10 @@ xfs_mount_validate_sb(
382 * Until this is fixed only page-sized or smaller data blocks work. 377 * Until this is fixed only page-sized or smaller data blocks work.
383 */ 378 */
384 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { 379 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
385 if (loud) { 380 xfs_warn(mp,
386 xfs_warn(mp,
387 "File system with blocksize %d bytes. " 381 "File system with blocksize %d bytes. "
388 "Only pagesize (%ld) or less will currently work.", 382 "Only pagesize (%ld) or less will currently work.",
389 sbp->sb_blocksize, PAGE_SIZE); 383 sbp->sb_blocksize, PAGE_SIZE);
390 }
391 return XFS_ERROR(ENOSYS); 384 return XFS_ERROR(ENOSYS);
392 } 385 }
393 386
@@ -401,23 +394,20 @@ xfs_mount_validate_sb(
401 case 2048: 394 case 2048:
402 break; 395 break;
403 default: 396 default:
404 if (loud) 397 xfs_warn(mp, "inode size of %d bytes not supported",
405 xfs_warn(mp, "inode size of %d bytes not supported",
406 sbp->sb_inodesize); 398 sbp->sb_inodesize);
407 return XFS_ERROR(ENOSYS); 399 return XFS_ERROR(ENOSYS);
408 } 400 }
409 401
410 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || 402 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
411 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { 403 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
412 if (loud) 404 xfs_warn(mp,
413 xfs_warn(mp,
414 "file system too large to be mounted on this system."); 405 "file system too large to be mounted on this system.");
415 return XFS_ERROR(EFBIG); 406 return XFS_ERROR(EFBIG);
416 } 407 }
417 408
418 if (unlikely(sbp->sb_inprogress)) { 409 if (check_inprogress && sbp->sb_inprogress) {
419 if (loud) 410 xfs_warn(mp, "Offline file system operation in progress!");
420 xfs_warn(mp, "file system busy");
421 return XFS_ERROR(EFSCORRUPTED); 411 return XFS_ERROR(EFSCORRUPTED);
422 } 412 }
423 413
@@ -425,9 +415,7 @@ xfs_mount_validate_sb(
425 * Version 1 directory format has never worked on Linux. 415 * Version 1 directory format has never worked on Linux.
426 */ 416 */
427 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { 417 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
428 if (loud) 418 xfs_warn(mp, "file system using version 1 directory format");
429 xfs_warn(mp,
430 "file system using version 1 directory format");
431 return XFS_ERROR(ENOSYS); 419 return XFS_ERROR(ENOSYS);
432 } 420 }
433 421
@@ -520,11 +508,9 @@ out_unwind:
520 508
521void 509void
522xfs_sb_from_disk( 510xfs_sb_from_disk(
523 struct xfs_mount *mp, 511 struct xfs_sb *to,
524 xfs_dsb_t *from) 512 xfs_dsb_t *from)
525{ 513{
526 struct xfs_sb *to = &mp->m_sb;
527
528 to->sb_magicnum = be32_to_cpu(from->sb_magicnum); 514 to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
529 to->sb_blocksize = be32_to_cpu(from->sb_blocksize); 515 to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
530 to->sb_dblocks = be64_to_cpu(from->sb_dblocks); 516 to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
@@ -626,6 +612,72 @@ xfs_sb_to_disk(
626 } 612 }
627} 613}
628 614
615static void
616xfs_sb_verify(
617 struct xfs_buf *bp)
618{
619 struct xfs_mount *mp = bp->b_target->bt_mount;
620 struct xfs_sb sb;
621 int error;
622
623 xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
624
625 /*
626 * Only check the in progress field for the primary superblock as
627 * mkfs.xfs doesn't clear it from secondary superblocks.
628 */
629 error = xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR);
630 if (error)
631 xfs_buf_ioerror(bp, error);
632}
633
634static void
635xfs_sb_read_verify(
636 struct xfs_buf *bp)
637{
638 xfs_sb_verify(bp);
639}
640
641/*
642 * We may be probed for a filesystem match, so we may not want to emit
643 * messages when the superblock buffer is not actually an XFS superblock.
 644 * If we find an XFS superblock, then run a normal, noisy mount because we are
645 * really going to mount it and want to know about errors.
646 */
647static void
648xfs_sb_quiet_read_verify(
649 struct xfs_buf *bp)
650{
651 struct xfs_sb sb;
652
653 xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
654
655 if (sb.sb_magicnum == XFS_SB_MAGIC) {
656 /* XFS filesystem, verify noisily! */
657 xfs_sb_read_verify(bp);
658 return;
659 }
660 /* quietly fail */
661 xfs_buf_ioerror(bp, EFSCORRUPTED);
662}
663
664static void
665xfs_sb_write_verify(
666 struct xfs_buf *bp)
667{
668 xfs_sb_verify(bp);
669}
670
671const struct xfs_buf_ops xfs_sb_buf_ops = {
672 .verify_read = xfs_sb_read_verify,
673 .verify_write = xfs_sb_write_verify,
674};
675
676static const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
677 .verify_read = xfs_sb_quiet_read_verify,
678 .verify_write = xfs_sb_write_verify,
679};
680
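These ops tables plug into the buffer-cache interfaces extended throughout this series: callers pass a table at read time, and the read verifier reports failure through xfs_buf_ioerror(). A one-line usage sketch matching the new xfs_buf_read() signature seen elsewhere in this diff:

	bp = xfs_buf_read(mp->m_ddev_targp, blkno, numblks, flags,
			  &xfs_sb_buf_ops);	/* NULL for no verification */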
629/* 681/*
630 * xfs_readsb 682 * xfs_readsb
631 * 683 *
@@ -651,26 +703,27 @@ xfs_readsb(xfs_mount_t *mp, int flags)
651 703
652reread: 704reread:
653 bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, 705 bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
654 BTOBB(sector_size), 0); 706 BTOBB(sector_size), 0,
707 loud ? &xfs_sb_buf_ops
708 : &xfs_sb_quiet_buf_ops);
655 if (!bp) { 709 if (!bp) {
656 if (loud) 710 if (loud)
657 xfs_warn(mp, "SB buffer read failed"); 711 xfs_warn(mp, "SB buffer read failed");
658 return EIO; 712 return EIO;
659 } 713 }
660 714 if (bp->b_error) {
661 /* 715 error = bp->b_error;
662 * Initialize the mount structure from the superblock.
663 * But first do some basic consistency checking.
664 */
665 xfs_sb_from_disk(mp, XFS_BUF_TO_SBP(bp));
666 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
667 if (error) {
668 if (loud) 716 if (loud)
669 xfs_warn(mp, "SB validate failed"); 717 xfs_warn(mp, "SB validate failed");
670 goto release_buf; 718 goto release_buf;
671 } 719 }
672 720
673 /* 721 /*
722 * Initialize the mount structure from the superblock.
723 */
724 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
725
726 /*
674 * We must be able to do sector-sized and sector-aligned IO. 727 * We must be able to do sector-sized and sector-aligned IO.
675 */ 728 */
676 if (sector_size > mp->m_sb.sb_sectsize) { 729 if (sector_size > mp->m_sb.sb_sectsize) {
@@ -1001,7 +1054,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1001 } 1054 }
1002 bp = xfs_buf_read_uncached(mp->m_ddev_targp, 1055 bp = xfs_buf_read_uncached(mp->m_ddev_targp,
1003 d - XFS_FSS_TO_BB(mp, 1), 1056 d - XFS_FSS_TO_BB(mp, 1),
1004 XFS_FSS_TO_BB(mp, 1), 0); 1057 XFS_FSS_TO_BB(mp, 1), 0, NULL);
1005 if (!bp) { 1058 if (!bp) {
1006 xfs_warn(mp, "last sector read failed"); 1059 xfs_warn(mp, "last sector read failed");
1007 return EIO; 1060 return EIO;
@@ -1016,7 +1069,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1016 } 1069 }
1017 bp = xfs_buf_read_uncached(mp->m_logdev_targp, 1070 bp = xfs_buf_read_uncached(mp->m_logdev_targp,
1018 d - XFS_FSB_TO_BB(mp, 1), 1071 d - XFS_FSB_TO_BB(mp, 1),
1019 XFS_FSB_TO_BB(mp, 1), 0); 1072 XFS_FSB_TO_BB(mp, 1), 0, NULL);
1020 if (!bp) { 1073 if (!bp) {
1021 xfs_warn(mp, "log device read failed"); 1074 xfs_warn(mp, "log device read failed");
1022 return EIO; 1075 return EIO;
@@ -1427,6 +1480,8 @@ xfs_unmountfs(
1427 __uint64_t resblks; 1480 __uint64_t resblks;
1428 int error; 1481 int error;
1429 1482
1483 cancel_delayed_work_sync(&mp->m_eofblocks_work);
1484
1430 xfs_qm_unmount_quotas(mp); 1485 xfs_qm_unmount_quotas(mp);
1431 xfs_rtunmount_inodes(mp); 1486 xfs_rtunmount_inodes(mp);
1432 IRELE(mp->m_rootip); 1487 IRELE(mp->m_rootip);
@@ -1450,21 +1505,16 @@ xfs_unmountfs(
1450 1505
1451 /* 1506 /*
1452 * And reclaim all inodes. At this point there should be no dirty 1507 * And reclaim all inodes. At this point there should be no dirty
1453 * inode, and none should be pinned or locked, but use synchronous 1508 * inodes and none should be pinned or locked, but use synchronous
1454 * reclaim just to be sure. 1509 * reclaim just to be sure. We can stop background inode reclaim
1510 * here as well if it is still running.
1455 */ 1511 */
1512 cancel_delayed_work_sync(&mp->m_reclaim_work);
1456 xfs_reclaim_inodes(mp, SYNC_WAIT); 1513 xfs_reclaim_inodes(mp, SYNC_WAIT);
1457 1514
1458 xfs_qm_unmount(mp); 1515 xfs_qm_unmount(mp);
1459 1516
1460 /* 1517 /*
1461 * Flush out the log synchronously so that we know for sure
1462 * that nothing is pinned. This is important because bflush()
1463 * will skip pinned buffers.
1464 */
1465 xfs_log_force(mp, XFS_LOG_SYNC);
1466
1467 /*
1468 * Unreserve any blocks we have so that when we unmount we don't account 1518 * Unreserve any blocks we have so that when we unmount we don't account
1469 * the reserved free space as used. This is really only necessary for 1519 * the reserved free space as used. This is really only necessary for
1470 * lazy superblock counting because it trusts the incore superblock 1520 * lazy superblock counting because it trusts the incore superblock
@@ -1489,23 +1539,6 @@ xfs_unmountfs(
1489 xfs_warn(mp, "Unable to update superblock counters. " 1539 xfs_warn(mp, "Unable to update superblock counters. "
1490 "Freespace may not be correct on next mount."); 1540 "Freespace may not be correct on next mount.");
1491 1541
1492 /*
1493 * At this point we might have modified the superblock again and thus
1494 * added an item to the AIL, thus flush it again.
1495 */
1496 xfs_ail_push_all_sync(mp->m_ail);
1497 xfs_wait_buftarg(mp->m_ddev_targp);
1498
1499 /*
1500 * The superblock buffer is uncached and xfsaild_push() will lock and
1501 * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
1502 * here but a lock on the superblock buffer will block until iodone()
1503 * has completed.
1504 */
1505 xfs_buf_lock(mp->m_sb_bp);
1506 xfs_buf_unlock(mp->m_sb_bp);
1507
1508 xfs_log_unmount_write(mp);
1509 xfs_log_unmount(mp); 1542 xfs_log_unmount(mp);
1510 xfs_uuid_unmount(mp); 1543 xfs_uuid_unmount(mp);
1511 1544
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index deee09e534dc..bab8314507e4 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -51,8 +51,6 @@ typedef struct xfs_trans_reservations {
51 51
52#else /* __KERNEL__ */ 52#else /* __KERNEL__ */
53 53
54#include "xfs_sync.h"
55
56struct xlog; 54struct xlog;
57struct xfs_inode; 55struct xfs_inode;
58struct xfs_mru_cache; 56struct xfs_mru_cache;
@@ -197,9 +195,9 @@ typedef struct xfs_mount {
197 struct mutex m_icsb_mutex; /* balancer sync lock */ 195 struct mutex m_icsb_mutex; /* balancer sync lock */
198#endif 196#endif
199 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ 197 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
200 struct delayed_work m_sync_work; /* background sync work */
201 struct delayed_work m_reclaim_work; /* background inode reclaim */ 198 struct delayed_work m_reclaim_work; /* background inode reclaim */
202 struct work_struct m_flush_work; /* background inode flush */ 199 struct delayed_work m_eofblocks_work; /* background eof blocks
200 trimming */
203 __int64_t m_update_flags; /* sb flags we need to update 201 __int64_t m_update_flags; /* sb flags we need to update
204 on the next remount,rw */ 202 on the next remount,rw */
205 struct shrinker m_inode_shrink; /* inode reclaim shrinker */ 203 struct shrinker m_inode_shrink; /* inode reclaim shrinker */
@@ -209,6 +207,9 @@ typedef struct xfs_mount {
209 struct workqueue_struct *m_data_workqueue; 207 struct workqueue_struct *m_data_workqueue;
210 struct workqueue_struct *m_unwritten_workqueue; 208 struct workqueue_struct *m_unwritten_workqueue;
211 struct workqueue_struct *m_cil_workqueue; 209 struct workqueue_struct *m_cil_workqueue;
210 struct workqueue_struct *m_reclaim_workqueue;
211 struct workqueue_struct *m_log_workqueue;
212 struct workqueue_struct *m_eofblocks_workqueue;
212} xfs_mount_t; 213} xfs_mount_t;
213 214
214/* 215/*
@@ -387,7 +388,9 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *);
387extern void xfs_mod_sb(struct xfs_trans *, __int64_t); 388extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
388extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, 389extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
389 xfs_agnumber_t *); 390 xfs_agnumber_t *);
390extern void xfs_sb_from_disk(struct xfs_mount *, struct xfs_dsb *); 391extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
391extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); 392extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
392 393
394extern const struct xfs_buf_ops xfs_sb_buf_ops;
395
393#endif /* __XFS_MOUNT_H__ */ 396#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 2e86fa0cfc0d..60eff4763156 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -40,6 +40,7 @@
40#include "xfs_utils.h" 40#include "xfs_utils.h"
41#include "xfs_qm.h" 41#include "xfs_qm.h"
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43#include "xfs_icache.h"
43 44
44/* 45/*
45 * The global quota manager. There is only one of these for the entire 46 * The global quota manager. There is only one of these for the entire
@@ -891,7 +892,8 @@ xfs_qm_dqiter_bufs(
891 while (blkcnt--) { 892 while (blkcnt--) {
892 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, 893 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
893 XFS_FSB_TO_DADDR(mp, bno), 894 XFS_FSB_TO_DADDR(mp, bno),
894 mp->m_quotainfo->qi_dqchunklen, 0, &bp); 895 mp->m_quotainfo->qi_dqchunklen, 0, &bp,
896 &xfs_dquot_buf_ops);
895 if (error) 897 if (error)
896 break; 898 break;
897 899
@@ -978,7 +980,8 @@ xfs_qm_dqiterate(
978 while (rablkcnt--) { 980 while (rablkcnt--) {
979 xfs_buf_readahead(mp->m_ddev_targp, 981 xfs_buf_readahead(mp->m_ddev_targp,
980 XFS_FSB_TO_DADDR(mp, rablkno), 982 XFS_FSB_TO_DADDR(mp, rablkno),
981 mp->m_quotainfo->qi_dqchunklen); 983 mp->m_quotainfo->qi_dqchunklen,
984 NULL);
982 rablkno++; 985 rablkno++;
983 } 986 }
984 } 987 }
@@ -1453,7 +1456,7 @@ xfs_qm_dqreclaim_one(
1453 int error; 1456 int error;
1454 1457
1455 if (!xfs_dqlock_nowait(dqp)) 1458 if (!xfs_dqlock_nowait(dqp))
1456 goto out_busy; 1459 goto out_move_tail;
1457 1460
1458 /* 1461 /*
1459 * This dquot has acquired a reference in the meantime remove it from 1462 * This dquot has acquired a reference in the meantime remove it from
@@ -1476,7 +1479,7 @@ xfs_qm_dqreclaim_one(
1476 * getting flushed to disk, we don't want to reclaim it. 1479 * getting flushed to disk, we don't want to reclaim it.
1477 */ 1480 */
1478 if (!xfs_dqflock_nowait(dqp)) 1481 if (!xfs_dqflock_nowait(dqp))
1479 goto out_busy; 1482 goto out_unlock_move_tail;
1480 1483
1481 if (XFS_DQ_IS_DIRTY(dqp)) { 1484 if (XFS_DQ_IS_DIRTY(dqp)) {
1482 struct xfs_buf *bp = NULL; 1485 struct xfs_buf *bp = NULL;
@@ -1487,7 +1490,7 @@ xfs_qm_dqreclaim_one(
1487 if (error) { 1490 if (error) {
1488 xfs_warn(mp, "%s: dquot %p flush failed", 1491 xfs_warn(mp, "%s: dquot %p flush failed",
1489 __func__, dqp); 1492 __func__, dqp);
1490 goto out_busy; 1493 goto out_unlock_move_tail;
1491 } 1494 }
1492 1495
1493 xfs_buf_delwri_queue(bp, buffer_list); 1496 xfs_buf_delwri_queue(bp, buffer_list);
@@ -1496,7 +1499,7 @@ xfs_qm_dqreclaim_one(
1496 * Give the dquot another try on the freelist, as the 1499 * Give the dquot another try on the freelist, as the
1497 * flushing will take some time. 1500 * flushing will take some time.
1498 */ 1501 */
1499 goto out_busy; 1502 goto out_unlock_move_tail;
1500 } 1503 }
1501 xfs_dqfunlock(dqp); 1504 xfs_dqfunlock(dqp);
1502 1505
@@ -1515,14 +1518,13 @@ xfs_qm_dqreclaim_one(
1515 XFS_STATS_INC(xs_qm_dqreclaims); 1518 XFS_STATS_INC(xs_qm_dqreclaims);
1516 return; 1519 return;
1517 1520
1518out_busy:
1519 xfs_dqunlock(dqp);
1520
1521 /* 1521 /*
1522 * Move the dquot to the tail of the list so that we don't spin on it. 1522 * Move the dquot to the tail of the list so that we don't spin on it.
1523 */ 1523 */
1524out_unlock_move_tail:
1525 xfs_dqunlock(dqp);
1526out_move_tail:
1524 list_move_tail(&dqp->q_lru, &qi->qi_lru_list); 1527 list_move_tail(&dqp->q_lru, &qi->qi_lru_list);
1525
1526 trace_xfs_dqreclaim_busy(dqp); 1528 trace_xfs_dqreclaim_busy(dqp);
1527 XFS_STATS_INC(xs_qm_dqreclaim_misses); 1529 XFS_STATS_INC(xs_qm_dqreclaim_misses);
1528} 1530}
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 858a3b186110..5f53e75409b8 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -40,6 +40,7 @@
40#include "xfs_utils.h" 40#include "xfs_utils.h"
41#include "xfs_qm.h" 41#include "xfs_qm.h"
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43#include "xfs_icache.h"
43 44
44STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); 45STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
45STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, 46STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
@@ -845,7 +846,8 @@ STATIC int
845xfs_dqrele_inode( 846xfs_dqrele_inode(
846 struct xfs_inode *ip, 847 struct xfs_inode *ip,
847 struct xfs_perag *pag, 848 struct xfs_perag *pag,
848 int flags) 849 int flags,
850 void *args)
849{ 851{
850 /* skip quota inodes */ 852 /* skip quota inodes */
851 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || 853 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
@@ -881,5 +883,5 @@ xfs_qm_dqrele_all_inodes(
881 uint flags) 883 uint flags)
882{ 884{
883 ASSERT(mp->m_quotainfo); 885 ASSERT(mp->m_quotainfo);
884 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags); 886 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL);
885} 887}
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index ca28a4ba4b54..98dc670d3ee0 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -38,6 +38,7 @@
38#include "xfs_utils.h" 38#include "xfs_utils.h"
39#include "xfs_trace.h" 39#include "xfs_trace.h"
40#include "xfs_buf.h" 40#include "xfs_buf.h"
41#include "xfs_icache.h"
41 42
42 43
43/* 44/*
@@ -869,7 +870,7 @@ xfs_rtbuf_get(
869 ASSERT(map.br_startblock != NULLFSBLOCK); 870 ASSERT(map.br_startblock != NULLFSBLOCK);
870 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 871 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
871 XFS_FSB_TO_DADDR(mp, map.br_startblock), 872 XFS_FSB_TO_DADDR(mp, map.br_startblock),
872 mp->m_bsize, 0, &bp); 873 mp->m_bsize, 0, &bp, NULL);
873 if (error) 874 if (error)
874 return error; 875 return error;
875 ASSERT(!xfs_buf_geterror(bp)); 876 ASSERT(!xfs_buf_geterror(bp));
@@ -1872,9 +1873,14 @@ xfs_growfs_rt(
1872 */ 1873 */
1873 bp = xfs_buf_read_uncached(mp->m_rtdev_targp, 1874 bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
1874 XFS_FSB_TO_BB(mp, nrblocks - 1), 1875 XFS_FSB_TO_BB(mp, nrblocks - 1),
1875 XFS_FSB_TO_BB(mp, 1), 0); 1876 XFS_FSB_TO_BB(mp, 1), 0, NULL);
1876 if (!bp) 1877 if (!bp)
1877 return EIO; 1878 return EIO;
1879 if (bp->b_error) {
1880 error = bp->b_error;
1881 xfs_buf_relse(bp);
1882 return error;
1883 }
1878 xfs_buf_relse(bp); 1884 xfs_buf_relse(bp);
1879 1885
1880 /* 1886 /*
@@ -2219,9 +2225,11 @@ xfs_rtmount_init(
2219 } 2225 }
2220 bp = xfs_buf_read_uncached(mp->m_rtdev_targp, 2226 bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
2221 d - XFS_FSB_TO_BB(mp, 1), 2227 d - XFS_FSB_TO_BB(mp, 1),
2222 XFS_FSB_TO_BB(mp, 1), 0); 2228 XFS_FSB_TO_BB(mp, 1), 0, NULL);
2223 if (!bp) { 2229 if (!bp || bp->b_error) {
2224 xfs_warn(mp, "realtime device size check failed"); 2230 xfs_warn(mp, "realtime device size check failed");
2231 if (bp)
2232 xfs_buf_relse(bp);
2225 return EIO; 2233 return EIO;
2226 } 2234 }
2227 xfs_buf_relse(bp); 2235 xfs_buf_relse(bp);
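The check-and-release pattern above now appears at both uncached-read sites in this file; a small hypothetical helper (not part of this series) would capture it:

	/* Hypothetical: return the buffer's error (or EIO) and release it. */
	static int
	xfs_buf_error_relse(struct xfs_buf *bp)
	{
		int error;

		if (!bp)
			return EIO;
		error = bp->b_error;
		xfs_buf_relse(bp);
		return error;
	}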
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index f429d9d5d325..a05b45175fb0 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -81,6 +81,7 @@ struct xfs_mount;
81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ 81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ 82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
83#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ 83#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */
84#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */
84 85
85#define XFS_SB_VERSION2_OKREALFBITS \ 86#define XFS_SB_VERSION2_OKREALFBITS \
86 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ 87 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
@@ -503,6 +504,12 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
503 (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT); 504 (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT);
504} 505}
505 506
507static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp)
508{
509 return (xfs_sb_version_hasmorebits(sbp) &&
510 (sbp->sb_features2 & XFS_SB_VERSION2_CRCBIT));
511}
512
506/* 513/*
507 * end of superblock version macros 514 * end of superblock version macros
508 */ 515 */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 26a09bd7f975..ab8839b26272 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -49,7 +49,7 @@
49#include "xfs_extfree_item.h" 49#include "xfs_extfree_item.h"
50#include "xfs_mru_cache.h" 50#include "xfs_mru_cache.h"
51#include "xfs_inode_item.h" 51#include "xfs_inode_item.h"
52#include "xfs_sync.h" 52#include "xfs_icache.h"
53#include "xfs_trace.h" 53#include "xfs_trace.h"
54 54
55#include <linux/namei.h> 55#include <linux/namei.h>
@@ -863,8 +863,30 @@ xfs_init_mount_workqueues(
863 WQ_MEM_RECLAIM, 0, mp->m_fsname); 863 WQ_MEM_RECLAIM, 0, mp->m_fsname);
864 if (!mp->m_cil_workqueue) 864 if (!mp->m_cil_workqueue)
865 goto out_destroy_unwritten; 865 goto out_destroy_unwritten;
866
867 mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
868 WQ_NON_REENTRANT, 0, mp->m_fsname);
869 if (!mp->m_reclaim_workqueue)
870 goto out_destroy_cil;
871
872 mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
873 WQ_NON_REENTRANT, 0, mp->m_fsname);
874 if (!mp->m_log_workqueue)
875 goto out_destroy_reclaim;
876
877 mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
878 WQ_NON_REENTRANT, 0, mp->m_fsname);
879 if (!mp->m_eofblocks_workqueue)
880 goto out_destroy_log;
881
866 return 0; 882 return 0;
867 883
884out_destroy_log:
885 destroy_workqueue(mp->m_log_workqueue);
886out_destroy_reclaim:
887 destroy_workqueue(mp->m_reclaim_workqueue);
888out_destroy_cil:
889 destroy_workqueue(mp->m_cil_workqueue);
868out_destroy_unwritten: 890out_destroy_unwritten:
869 destroy_workqueue(mp->m_unwritten_workqueue); 891 destroy_workqueue(mp->m_unwritten_workqueue);
870out_destroy_data_iodone_queue: 892out_destroy_data_iodone_queue:
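The enlarged init function keeps to the standard kernel unwind idiom: each allocation that fails jumps to a label that destroys exactly the queues allocated before it, in reverse order, so both the success path and the error path read top to bottom. Reduced to two queues with illustrative names:

	struct example_mount {
		struct workqueue_struct	*wq_a;
		struct workqueue_struct	*wq_b;
	};

	static int
	example_init_workqueues(
		struct example_mount	*em)
	{
		em->wq_a = alloc_workqueue("example-a", WQ_NON_REENTRANT, 0);
		if (!em->wq_a)
			return -ENOMEM;

		em->wq_b = alloc_workqueue("example-b", WQ_NON_REENTRANT, 0);
		if (!em->wq_b)
			goto out_destroy_a;	/* unwind in reverse order */

		return 0;

	out_destroy_a:
		destroy_workqueue(em->wq_a);
		return -ENOMEM;
	}

The matching destroy function, in the next hunk, tears the queues down in the opposite order from allocation.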
@@ -877,11 +899,32 @@ STATIC void
877xfs_destroy_mount_workqueues( 899xfs_destroy_mount_workqueues(
878 struct xfs_mount *mp) 900 struct xfs_mount *mp)
879{ 901{
902 destroy_workqueue(mp->m_eofblocks_workqueue);
903 destroy_workqueue(mp->m_log_workqueue);
904 destroy_workqueue(mp->m_reclaim_workqueue);
880 destroy_workqueue(mp->m_cil_workqueue); 905 destroy_workqueue(mp->m_cil_workqueue);
881 destroy_workqueue(mp->m_data_workqueue); 906 destroy_workqueue(mp->m_data_workqueue);
882 destroy_workqueue(mp->m_unwritten_workqueue); 907 destroy_workqueue(mp->m_unwritten_workqueue);
883} 908}
884 909
910/*
911 * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK
912 * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting
913 * for IO to complete so that we effectively throttle multiple callers to the
914 * rate at which IO is completing.
915 */
916void
917xfs_flush_inodes(
918 struct xfs_mount *mp)
919{
920 struct super_block *sb = mp->m_super;
921
922 if (down_read_trylock(&sb->s_umount)) {
923 sync_inodes_sb(sb);
924 up_read(&sb->s_umount);
925 }
926}
927
885/* Catch misguided souls that try to use this interface on XFS */ 928/* Catch misguided souls that try to use this interface on XFS */
886STATIC struct inode * 929STATIC struct inode *
887xfs_fs_alloc_inode( 930xfs_fs_alloc_inode(
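Because xfs_flush_inodes() only trylocks s_umount, a flush racing with unmount (or with another holder of s_umount) is silently skipped rather than deadlocked, so callers must treat it as best-effort. The ENOSPC retry in xfs_create() later in this diff is the intended shape: reserve, flush, reserve once more, and only then fail. Sketched with an illustrative wrapper name:

	static int
	example_reserve_retrying(
		struct xfs_mount	*mp,
		struct xfs_trans	*tp,
		uint			resblks,
		uint			log_res,
		uint			log_count)
	{
		int	error;

		error = xfs_trans_reserve(tp, resblks, log_res, 0,
					  XFS_TRANS_PERM_LOG_RES, log_count);
		if (error == ENOSPC) {
			/* writeback converts delalloc extents, freeing space */
			xfs_flush_inodes(mp);
			error = xfs_trans_reserve(tp, resblks, log_res, 0,
					  XFS_TRANS_PERM_LOG_RES, log_count);
		}
		return error;
	}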
@@ -1006,9 +1049,8 @@ xfs_fs_put_super(
1006 struct xfs_mount *mp = XFS_M(sb); 1049 struct xfs_mount *mp = XFS_M(sb);
1007 1050
1008 xfs_filestream_unmount(mp); 1051 xfs_filestream_unmount(mp);
1009 cancel_delayed_work_sync(&mp->m_sync_work);
1010 xfs_unmountfs(mp); 1052 xfs_unmountfs(mp);
1011 xfs_syncd_stop(mp); 1053
1012 xfs_freesb(mp); 1054 xfs_freesb(mp);
1013 xfs_icsb_destroy_counters(mp); 1055 xfs_icsb_destroy_counters(mp);
1014 xfs_destroy_mount_workqueues(mp); 1056 xfs_destroy_mount_workqueues(mp);
@@ -1023,7 +1065,6 @@ xfs_fs_sync_fs(
1023 int wait) 1065 int wait)
1024{ 1066{
1025 struct xfs_mount *mp = XFS_M(sb); 1067 struct xfs_mount *mp = XFS_M(sb);
1026 int error;
1027 1068
1028 /* 1069 /*
1029 * Doing anything during the async pass would be counterproductive. 1070 * Doing anything during the async pass would be counterproductive.
@@ -1031,17 +1072,14 @@ xfs_fs_sync_fs(
1031 if (!wait) 1072 if (!wait)
1032 return 0; 1073 return 0;
1033 1074
1034 error = xfs_quiesce_data(mp); 1075 xfs_log_force(mp, XFS_LOG_SYNC);
1035 if (error)
1036 return -error;
1037
1038 if (laptop_mode) { 1076 if (laptop_mode) {
1039 /* 1077 /*
1040 * The disk must be active because we're syncing. 1078 * The disk must be active because we're syncing.
1041 * We schedule xfssyncd now (now that the disk is 1079 * We schedule log work now (now that the disk is
1042 * active) instead of later (when it might not be). 1080 * active) instead of later (when it might not be).
1043 */ 1081 */
1044 flush_delayed_work(&mp->m_sync_work); 1082 flush_delayed_work(&mp->m_log->l_work);
1045 } 1083 }
1046 1084
1047 return 0; 1085 return 0;
@@ -1118,6 +1156,48 @@ xfs_restore_resvblks(struct xfs_mount *mp)
1118 xfs_reserve_blocks(mp, &resblks, NULL); 1156 xfs_reserve_blocks(mp, &resblks, NULL);
1119} 1157}
1120 1158
1159/*
1160 * Trigger writeback of all the dirty metadata in the file system.
1161 *
 1162 * This ensures that the metadata is written to its location on disk
1163 * than just existing in transactions in the log. This means after a quiesce
1164 * there is no log replay required to write the inodes to disk - this is the
1165 * primary difference between a sync and a quiesce.
1166 *
1167 * Note: xfs_log_quiesce() stops background log work - the callers must ensure
1168 * it is started again when appropriate.
1169 */
1170void
1171xfs_quiesce_attr(
1172 struct xfs_mount *mp)
1173{
1174 int error = 0;
1175
1176 /* wait for all modifications to complete */
1177 while (atomic_read(&mp->m_active_trans) > 0)
1178 delay(100);
1179
1180 /* force the log to unpin objects from the now complete transactions */
1181 xfs_log_force(mp, XFS_LOG_SYNC);
1182
1183 /* reclaim inodes to do any IO before the freeze completes */
1184 xfs_reclaim_inodes(mp, 0);
1185 xfs_reclaim_inodes(mp, SYNC_WAIT);
1186
1187 /* Push the superblock and write an unmount record */
1188 error = xfs_log_sbcount(mp);
1189 if (error)
 1190 xfs_warn(mp, "xfs_quiesce_attr: failed to log sb changes. "
1191 "Frozen image may not be consistent.");
1192 /*
1193 * Just warn here till VFS can correctly support
1194 * read-only remount without racing.
1195 */
1196 WARN_ON(atomic_read(&mp->m_active_trans) != 0);
1197
1198 xfs_log_quiesce(mp);
1199}
1200
1121STATIC int 1201STATIC int
1122xfs_fs_remount( 1202xfs_fs_remount(
1123 struct super_block *sb, 1203 struct super_block *sb,
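The ordering inside xfs_quiesce_attr() is the point: drain active transactions first, then force the log so the completed transactions unpin their objects, then reclaim inodes so their final writeback happens before the unmount record is written. The rw->ro remount hunk below shows the expected pairing with the reserve pool handling; compressed, with editorial comments:

	/* rw -> ro: leave an on-disk image that needs no log replay */
	xfs_save_resvblks(mp);		/* free the reserve pool, stash its size */
	xfs_quiesce_attr(mp);		/* drain, force log, reclaim, unmount record */
	mp->m_flags |= XFS_MOUNT_RDONLY;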
@@ -1198,20 +1278,18 @@ xfs_fs_remount(
1198 * value if it is non-zero, otherwise go with the default. 1278 * value if it is non-zero, otherwise go with the default.
1199 */ 1279 */
1200 xfs_restore_resvblks(mp); 1280 xfs_restore_resvblks(mp);
1281 xfs_log_work_queue(mp);
1201 } 1282 }
1202 1283
1203 /* rw -> ro */ 1284 /* rw -> ro */
1204 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { 1285 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
1205 /* 1286 /*
1206 * After we have synced the data but before we sync the 1287 * Before we sync the metadata, we need to free up the reserve
1207 * metadata, we need to free up the reserve block pool so that 1288 * block pool so that the used block count in the superblock on
1208 * the used block count in the superblock on disk is correct at 1289 * disk is correct at the end of the remount. Stash the current
1209 * the end of the remount. Stash the current reserve pool size 1290 * reserve pool size so that if we get remounted rw, we can
1210 * so that if we get remounted rw, we can return it to the same 1291 * return it to the same size.
1211 * size.
1212 */ 1292 */
1213
1214 xfs_quiesce_data(mp);
1215 xfs_save_resvblks(mp); 1293 xfs_save_resvblks(mp);
1216 xfs_quiesce_attr(mp); 1294 xfs_quiesce_attr(mp);
1217 mp->m_flags |= XFS_MOUNT_RDONLY; 1295 mp->m_flags |= XFS_MOUNT_RDONLY;
@@ -1243,6 +1321,7 @@ xfs_fs_unfreeze(
1243 struct xfs_mount *mp = XFS_M(sb); 1321 struct xfs_mount *mp = XFS_M(sb);
1244 1322
1245 xfs_restore_resvblks(mp); 1323 xfs_restore_resvblks(mp);
1324 xfs_log_work_queue(mp);
1246 return 0; 1325 return 0;
1247} 1326}
1248 1327
@@ -1321,6 +1400,8 @@ xfs_fs_fill_super(
1321 spin_lock_init(&mp->m_sb_lock); 1400 spin_lock_init(&mp->m_sb_lock);
1322 mutex_init(&mp->m_growlock); 1401 mutex_init(&mp->m_growlock);
1323 atomic_set(&mp->m_active_trans, 0); 1402 atomic_set(&mp->m_active_trans, 0);
1403 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
1404 INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
1324 1405
1325 mp->m_super = sb; 1406 mp->m_super = sb;
1326 sb->s_fs_info = mp; 1407 sb->s_fs_info = mp;
@@ -1371,10 +1452,6 @@ xfs_fs_fill_super(
1371 /* 1452 /*
1372 * we must configure the block size in the superblock before we run the 1453 * we must configure the block size in the superblock before we run the
1373 * full mount process as the mount process can lookup and cache inodes. 1454 * full mount process as the mount process can lookup and cache inodes.
1374 * For the same reason we must also initialise the syncd and register
1375 * the inode cache shrinker so that inodes can be reclaimed during
1376 * operations like a quotacheck that iterate all inodes in the
1377 * filesystem.
1378 */ 1455 */
1379 sb->s_magic = XFS_SB_MAGIC; 1456 sb->s_magic = XFS_SB_MAGIC;
1380 sb->s_blocksize = mp->m_sb.sb_blocksize; 1457 sb->s_blocksize = mp->m_sb.sb_blocksize;
@@ -1384,13 +1461,9 @@ xfs_fs_fill_super(
1384 sb->s_time_gran = 1; 1461 sb->s_time_gran = 1;
1385 set_posix_acl_flag(sb); 1462 set_posix_acl_flag(sb);
1386 1463
1387 error = xfs_syncd_init(mp);
1388 if (error)
1389 goto out_filestream_unmount;
1390
1391 error = xfs_mountfs(mp); 1464 error = xfs_mountfs(mp);
1392 if (error) 1465 if (error)
1393 goto out_syncd_stop; 1466 goto out_filestream_unmount;
1394 1467
1395 root = igrab(VFS_I(mp->m_rootip)); 1468 root = igrab(VFS_I(mp->m_rootip));
1396 if (!root) { 1469 if (!root) {
@@ -1408,8 +1481,7 @@ xfs_fs_fill_super(
1408 } 1481 }
1409 1482
1410 return 0; 1483 return 0;
1411 out_syncd_stop: 1484
1412 xfs_syncd_stop(mp);
1413 out_filestream_unmount: 1485 out_filestream_unmount:
1414 xfs_filestream_unmount(mp); 1486 xfs_filestream_unmount(mp);
1415 out_free_sb: 1487 out_free_sb:
@@ -1429,7 +1501,6 @@ out_destroy_workqueues:
1429 out_unmount: 1501 out_unmount:
1430 xfs_filestream_unmount(mp); 1502 xfs_filestream_unmount(mp);
1431 xfs_unmountfs(mp); 1503 xfs_unmountfs(mp);
1432 xfs_syncd_stop(mp);
1433 goto out_free_sb; 1504 goto out_free_sb;
1434} 1505}
1435 1506
@@ -1625,16 +1696,6 @@ STATIC int __init
1625xfs_init_workqueues(void) 1696xfs_init_workqueues(void)
1626{ 1697{
1627 /* 1698 /*
1628	 * We never want the same work item to run twice; reclaiming inodes
1629	 * or idling the log is not going to get any faster with multiple CPUs
1630	 * competing for resources. Use the default large max_active value
1631	 * so that even lots of filesystems can perform these tasks in parallel.
1632 */
1633 xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0);
1634 if (!xfs_syncd_wq)
1635 return -ENOMEM;
1636
1637 /*
1638 * The allocation workqueue can be used in memory reclaim situations 1699 * The allocation workqueue can be used in memory reclaim situations
1639 * (writepage path), and parallelism is only limited by the number of 1700 * (writepage path), and parallelism is only limited by the number of
1640 * AGs in all the filesystems mounted. Hence use the default large 1701 * AGs in all the filesystems mounted. Hence use the default large
@@ -1642,20 +1703,15 @@ xfs_init_workqueues(void)
1642 */ 1703 */
1643 xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0); 1704 xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0);
1644 if (!xfs_alloc_wq) 1705 if (!xfs_alloc_wq)
1645 goto out_destroy_syncd; 1706 return -ENOMEM;
1646 1707
1647 return 0; 1708 return 0;
1648
1649out_destroy_syncd:
1650 destroy_workqueue(xfs_syncd_wq);
1651 return -ENOMEM;
1652} 1709}
1653 1710
1654STATIC void 1711STATIC void
1655xfs_destroy_workqueues(void) 1712xfs_destroy_workqueues(void)
1656{ 1713{
1657 destroy_workqueue(xfs_alloc_wq); 1714 destroy_workqueue(xfs_alloc_wq);
1658 destroy_workqueue(xfs_syncd_wq);
1659} 1715}
1660 1716
1661STATIC int __init 1717STATIC int __init
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 9de4a920ba05..bbe3d15a7904 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -74,6 +74,7 @@ struct block_device;
74 74
75extern __uint64_t xfs_max_file_offset(unsigned int); 75extern __uint64_t xfs_max_file_offset(unsigned int);
76 76
77extern void xfs_flush_inodes(struct xfs_mount *mp);
77extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); 78extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
78extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *); 79extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *);
79extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *); 80extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *);
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index ee2d2adaa438..2801b5ce6cdb 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -202,6 +202,15 @@ static ctl_table xfs_table[] = {
202 .extra1 = &xfs_params.fstrm_timer.min, 202 .extra1 = &xfs_params.fstrm_timer.min,
203 .extra2 = &xfs_params.fstrm_timer.max, 203 .extra2 = &xfs_params.fstrm_timer.max,
204 }, 204 },
205 {
206 .procname = "speculative_prealloc_lifetime",
207 .data = &xfs_params.eofb_timer.val,
208 .maxlen = sizeof(int),
209 .mode = 0644,
210 .proc_handler = proc_dointvec_minmax,
211 .extra1 = &xfs_params.eofb_timer.min,
212 .extra2 = &xfs_params.eofb_timer.max,
213 },
205 /* please keep this the last entry */ 214 /* please keep this the last entry */
206#ifdef CONFIG_PROC_FS 215#ifdef CONFIG_PROC_FS
207 { 216 {
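This knob exposes the interval of the background scan added by the speculative-preallocation tracking patches. The patch here only adds the table entry; presumably the eofblocks worker re-arms itself from the value along these lines (the seconds-to-jiffies conversion is an assumption, not quoted from the series):

	/* re-arm the background eofblocks scan; the tunable is in seconds */
	queue_delayed_work(mp->m_eofblocks_workqueue, &mp->m_eofblocks_work,
			   msecs_to_jiffies(xfs_params.eofb_timer.val * 1000));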
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index b9937d450f8e..bd8e157c20ef 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -47,6 +47,7 @@ typedef struct xfs_param {
47 xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ 47 xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */
48 xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ 48 xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
49 xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ 49 xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */
50 xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */
50} xfs_param_t; 51} xfs_param_t;
51 52
52/* 53/*
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 7d36ccf57f93..2e137d4a85ae 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -96,6 +96,8 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full);
96DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add); 96DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
97DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk); 97DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
98DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); 98DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
99DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list);
100DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list);
99 101
100DECLARE_EVENT_CLASS(xfs_perag_class, 102DECLARE_EVENT_CLASS(xfs_perag_class,
101 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, 103 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
@@ -130,6 +132,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
130DEFINE_PERAG_REF_EVENT(xfs_perag_put); 132DEFINE_PERAG_REF_EVENT(xfs_perag_put);
131DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); 133DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
132DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); 134DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
135DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks);
136DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks);
133 137
134TRACE_EVENT(xfs_attr_list_node_descend, 138TRACE_EVENT(xfs_attr_list_node_descend,
135 TP_PROTO(struct xfs_attr_list_context *ctx, 139 TP_PROTO(struct xfs_attr_list_context *ctx,
@@ -585,6 +589,10 @@ DEFINE_INODE_EVENT(xfs_update_time);
585DEFINE_INODE_EVENT(xfs_dquot_dqalloc); 589DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
586DEFINE_INODE_EVENT(xfs_dquot_dqdetach); 590DEFINE_INODE_EVENT(xfs_dquot_dqdetach);
587 591
592DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag);
593DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
594DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
595
588DECLARE_EVENT_CLASS(xfs_iref_class, 596DECLARE_EVENT_CLASS(xfs_iref_class,
589 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), 597 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
590 TP_ARGS(ip, caller_ip), 598 TP_ARGS(ip, caller_ip),
@@ -1496,8 +1504,42 @@ DEFINE_DIR2_EVENT(xfs_dir2_node_replace);
1496DEFINE_DIR2_EVENT(xfs_dir2_node_removename); 1504DEFINE_DIR2_EVENT(xfs_dir2_node_removename);
1497DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf); 1505DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf);
1498 1506
1507DECLARE_EVENT_CLASS(xfs_attr_class,
1508 TP_PROTO(struct xfs_da_args *args),
1509 TP_ARGS(args),
1510 TP_STRUCT__entry(
1511 __field(dev_t, dev)
1512 __field(xfs_ino_t, ino)
1513 __dynamic_array(char, name, args->namelen)
1514 __field(int, namelen)
1515 __field(int, valuelen)
1516 __field(xfs_dahash_t, hashval)
1517 __field(int, op_flags)
1518 ),
1519 TP_fast_assign(
1520 __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
1521 __entry->ino = args->dp->i_ino;
1522 if (args->namelen)
1523 memcpy(__get_str(name), args->name, args->namelen);
1524 __entry->namelen = args->namelen;
1525 __entry->valuelen = args->valuelen;
1526 __entry->hashval = args->hashval;
1527 __entry->op_flags = args->op_flags;
1528 ),
1529 TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d "
1530 "hashval 0x%x op_flags %s",
1531 MAJOR(__entry->dev), MINOR(__entry->dev),
1532 __entry->ino,
1533 __entry->namelen,
1534 __entry->namelen ? __get_str(name) : NULL,
1535 __entry->namelen,
1536 __entry->valuelen,
1537 __entry->hashval,
1538 __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
1539)
1540
1499#define DEFINE_ATTR_EVENT(name) \ 1541#define DEFINE_ATTR_EVENT(name) \
1500DEFINE_EVENT(xfs_da_class, name, \ 1542DEFINE_EVENT(xfs_attr_class, name, \
1501 TP_PROTO(struct xfs_da_args *args), \ 1543 TP_PROTO(struct xfs_da_args *args), \
1502 TP_ARGS(args)) 1544 TP_ARGS(args))
1503DEFINE_ATTR_EVENT(xfs_attr_sf_add); 1545DEFINE_ATTR_EVENT(xfs_attr_sf_add);
@@ -1511,10 +1553,14 @@ DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf);
1511DEFINE_ATTR_EVENT(xfs_attr_leaf_add); 1553DEFINE_ATTR_EVENT(xfs_attr_leaf_add);
1512DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old); 1554DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old);
1513DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new); 1555DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new);
1556DEFINE_ATTR_EVENT(xfs_attr_leaf_add_work);
1514DEFINE_ATTR_EVENT(xfs_attr_leaf_addname); 1557DEFINE_ATTR_EVENT(xfs_attr_leaf_addname);
1515DEFINE_ATTR_EVENT(xfs_attr_leaf_create); 1558DEFINE_ATTR_EVENT(xfs_attr_leaf_create);
1559DEFINE_ATTR_EVENT(xfs_attr_leaf_compact);
1560DEFINE_ATTR_EVENT(xfs_attr_leaf_get);
1516DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup); 1561DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup);
1517DEFINE_ATTR_EVENT(xfs_attr_leaf_replace); 1562DEFINE_ATTR_EVENT(xfs_attr_leaf_replace);
1563DEFINE_ATTR_EVENT(xfs_attr_leaf_remove);
1518DEFINE_ATTR_EVENT(xfs_attr_leaf_removename); 1564DEFINE_ATTR_EVENT(xfs_attr_leaf_removename);
1519DEFINE_ATTR_EVENT(xfs_attr_leaf_split); 1565DEFINE_ATTR_EVENT(xfs_attr_leaf_split);
1520DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before); 1566DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before);
@@ -1526,12 +1572,21 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_to_sf);
1526DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node); 1572DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node);
1527DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance); 1573DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance);
1528DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance); 1574DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance);
1575DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall);
1529 1576
1530DEFINE_ATTR_EVENT(xfs_attr_node_addname); 1577DEFINE_ATTR_EVENT(xfs_attr_node_addname);
1578DEFINE_ATTR_EVENT(xfs_attr_node_get);
1531DEFINE_ATTR_EVENT(xfs_attr_node_lookup); 1579DEFINE_ATTR_EVENT(xfs_attr_node_lookup);
1532DEFINE_ATTR_EVENT(xfs_attr_node_replace); 1580DEFINE_ATTR_EVENT(xfs_attr_node_replace);
1533DEFINE_ATTR_EVENT(xfs_attr_node_removename); 1581DEFINE_ATTR_EVENT(xfs_attr_node_removename);
1534 1582
1583DEFINE_ATTR_EVENT(xfs_attr_fillstate);
1584DEFINE_ATTR_EVENT(xfs_attr_refillstate);
1585
1586DEFINE_ATTR_EVENT(xfs_attr_rmtval_get);
1587DEFINE_ATTR_EVENT(xfs_attr_rmtval_set);
1588DEFINE_ATTR_EVENT(xfs_attr_rmtval_remove);
1589
1535#define DEFINE_DA_EVENT(name) \ 1590#define DEFINE_DA_EVENT(name) \
1536DEFINE_EVENT(xfs_da_class, name, \ 1591DEFINE_EVENT(xfs_da_class, name, \
1537 TP_PROTO(struct xfs_da_args *args), \ 1592 TP_PROTO(struct xfs_da_args *args), \
@@ -1550,9 +1605,12 @@ DEFINE_DA_EVENT(xfs_da_node_split);
1550DEFINE_DA_EVENT(xfs_da_node_remove); 1605DEFINE_DA_EVENT(xfs_da_node_remove);
1551DEFINE_DA_EVENT(xfs_da_node_rebalance); 1606DEFINE_DA_EVENT(xfs_da_node_rebalance);
1552DEFINE_DA_EVENT(xfs_da_node_unbalance); 1607DEFINE_DA_EVENT(xfs_da_node_unbalance);
1608DEFINE_DA_EVENT(xfs_da_node_toosmall);
1553DEFINE_DA_EVENT(xfs_da_swap_lastblock); 1609DEFINE_DA_EVENT(xfs_da_swap_lastblock);
1554DEFINE_DA_EVENT(xfs_da_grow_inode); 1610DEFINE_DA_EVENT(xfs_da_grow_inode);
1555DEFINE_DA_EVENT(xfs_da_shrink_inode); 1611DEFINE_DA_EVENT(xfs_da_shrink_inode);
1612DEFINE_DA_EVENT(xfs_da_fixhashpath);
1613DEFINE_DA_EVENT(xfs_da_path_shift);
1556 1614
1557DECLARE_EVENT_CLASS(xfs_dir2_space_class, 1615DECLARE_EVENT_CLASS(xfs_dir2_space_class,
1558 TP_PROTO(struct xfs_da_args *args, int idx), 1616 TP_PROTO(struct xfs_da_args *args, int idx),
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index db056544cbb5..c6c0601abd7a 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -464,10 +464,7 @@ xfs_trans_get_buf(
464 int numblks, 464 int numblks,
465 uint flags) 465 uint flags)
466{ 466{
467 struct xfs_buf_map map = { 467 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
468 .bm_bn = blkno,
469 .bm_len = numblks,
470 };
471 return xfs_trans_get_buf_map(tp, target, &map, 1, flags); 468 return xfs_trans_get_buf_map(tp, target, &map, 1, flags);
472} 469}
473 470
@@ -476,7 +473,8 @@ int xfs_trans_read_buf_map(struct xfs_mount *mp,
476 struct xfs_buftarg *target, 473 struct xfs_buftarg *target,
477 struct xfs_buf_map *map, int nmaps, 474 struct xfs_buf_map *map, int nmaps,
478 xfs_buf_flags_t flags, 475 xfs_buf_flags_t flags,
479 struct xfs_buf **bpp); 476 struct xfs_buf **bpp,
477 const struct xfs_buf_ops *ops);
480 478
481static inline int 479static inline int
482xfs_trans_read_buf( 480xfs_trans_read_buf(
@@ -486,13 +484,12 @@ xfs_trans_read_buf(
486 xfs_daddr_t blkno, 484 xfs_daddr_t blkno,
487 int numblks, 485 int numblks,
488 xfs_buf_flags_t flags, 486 xfs_buf_flags_t flags,
489 struct xfs_buf **bpp) 487 struct xfs_buf **bpp,
488 const struct xfs_buf_ops *ops)
490{ 489{
491 struct xfs_buf_map map = { 490 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
492 .bm_bn = blkno, 491 return xfs_trans_read_buf_map(mp, tp, target, &map, 1,
493 .bm_len = numblks, 492 flags, bpp, ops);
494 };
495 return xfs_trans_read_buf_map(mp, tp, target, &map, 1, flags, bpp);
496} 493}
497 494
498struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int); 495struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int);
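Both inline helpers now use DEFINE_SINGLE_BUF_MAP instead of an open-coded designated initializer. Judging from the initializers it replaces, the macro presumably expands to something like:

	#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblks)	\
		struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblks) };

which keeps the single-extent wrappers to one line while the _map variants accept a caller-built array for discontiguous buffers.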
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 6311b99c267f..4fc17d479d42 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -257,7 +257,8 @@ xfs_trans_read_buf_map(
257 struct xfs_buf_map *map, 257 struct xfs_buf_map *map,
258 int nmaps, 258 int nmaps,
259 xfs_buf_flags_t flags, 259 xfs_buf_flags_t flags,
260 struct xfs_buf **bpp) 260 struct xfs_buf **bpp,
261 const struct xfs_buf_ops *ops)
261{ 262{
262 xfs_buf_t *bp; 263 xfs_buf_t *bp;
263 xfs_buf_log_item_t *bip; 264 xfs_buf_log_item_t *bip;
@@ -265,7 +266,7 @@ xfs_trans_read_buf_map(
265 266
266 *bpp = NULL; 267 *bpp = NULL;
267 if (!tp) { 268 if (!tp) {
268 bp = xfs_buf_read_map(target, map, nmaps, flags); 269 bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
269 if (!bp) 270 if (!bp)
270 return (flags & XBF_TRYLOCK) ? 271 return (flags & XBF_TRYLOCK) ?
271 EAGAIN : XFS_ERROR(ENOMEM); 272 EAGAIN : XFS_ERROR(ENOMEM);
@@ -312,7 +313,9 @@ xfs_trans_read_buf_map(
312 if (!(XFS_BUF_ISDONE(bp))) { 313 if (!(XFS_BUF_ISDONE(bp))) {
313 trace_xfs_trans_read_buf_io(bp, _RET_IP_); 314 trace_xfs_trans_read_buf_io(bp, _RET_IP_);
314 ASSERT(!XFS_BUF_ISASYNC(bp)); 315 ASSERT(!XFS_BUF_ISASYNC(bp));
316 ASSERT(bp->b_iodone == NULL);
315 XFS_BUF_READ(bp); 317 XFS_BUF_READ(bp);
318 bp->b_ops = ops;
316 xfsbdstrat(tp->t_mountp, bp); 319 xfsbdstrat(tp->t_mountp, bp);
317 error = xfs_buf_iowait(bp); 320 error = xfs_buf_iowait(bp);
318 if (error) { 321 if (error) {
@@ -349,7 +352,7 @@ xfs_trans_read_buf_map(
349 return 0; 352 return 0;
350 } 353 }
351 354
352 bp = xfs_buf_read_map(target, map, nmaps, flags); 355 bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
353 if (bp == NULL) { 356 if (bp == NULL) {
354 *bpp = NULL; 357 *bpp = NULL;
355 return (flags & XBF_TRYLOCK) ? 358 return (flags & XBF_TRYLOCK) ?
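The ops pointer threaded through here is stored in bp->b_ops before the read is issued, so verification can run at I/O completion. Given how it is attached and the verifier patches named in this merge, a per-type table plausibly has this shape (member and callback names are an inference, not quoted from the patch):

	/* per-metadata-type verify callbacks, carried on the buffer */
	struct xfs_buf_ops {
		void	(*verify_read)(struct xfs_buf *bp);
		void	(*verify_write)(struct xfs_buf *bp);
	};

	static const struct xfs_buf_ops xfs_example_buf_ops = {
		.verify_read	= xfs_example_read_verify,
		.verify_write	= xfs_example_write_verify,
	};

Passing NULL, as the converted callers above do, simply opts a read out of verification.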
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 2a5c637344b4..d95f565a390e 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -47,6 +47,7 @@
47#include "xfs_filestream.h" 47#include "xfs_filestream.h"
48#include "xfs_vnodeops.h" 48#include "xfs_vnodeops.h"
49#include "xfs_trace.h" 49#include "xfs_trace.h"
50#include "xfs_icache.h"
50 51
51/* 52/*
52 * The maximum pathlen is 1024 bytes. Since the minimum file system 53 * The maximum pathlen is 1024 bytes. Since the minimum file system
@@ -79,7 +80,7 @@ xfs_readlink_bmap(
79 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); 80 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
80 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); 81 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
81 82
82 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); 83 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, NULL);
83 if (!bp) 84 if (!bp)
84 return XFS_ERROR(ENOMEM); 85 return XFS_ERROR(ENOMEM);
85 error = bp->b_error; 86 error = bp->b_error;
@@ -150,7 +151,7 @@ xfs_readlink(
150 * when the link count isn't zero and by xfs_dm_punch_hole() when 151 * when the link count isn't zero and by xfs_dm_punch_hole() when
151 * punching a hole to EOF. 152 * punching a hole to EOF.
152 */ 153 */
153STATIC int 154int
154xfs_free_eofblocks( 155xfs_free_eofblocks(
155 xfs_mount_t *mp, 156 xfs_mount_t *mp,
156 xfs_inode_t *ip, 157 xfs_inode_t *ip,
@@ -199,7 +200,7 @@ xfs_free_eofblocks(
199 if (need_iolock) { 200 if (need_iolock) {
200 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { 201 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
201 xfs_trans_cancel(tp, 0); 202 xfs_trans_cancel(tp, 0);
202 return 0; 203 return EAGAIN;
203 } 204 }
204 } 205 }
205 206
@@ -237,6 +238,8 @@ xfs_free_eofblocks(
237 } else { 238 } else {
238 error = xfs_trans_commit(tp, 239 error = xfs_trans_commit(tp,
239 XFS_TRANS_RELEASE_LOG_RES); 240 XFS_TRANS_RELEASE_LOG_RES);
241 if (!error)
242 xfs_inode_clear_eofblocks_tag(ip);
240 } 243 }
241 244
242 xfs_iunlock(ip, XFS_ILOCK_EXCL); 245 xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -425,19 +428,18 @@ xfs_release(
425 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); 428 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
426 if (truncated) { 429 if (truncated) {
427 xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); 430 xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
428 if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) 431 if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
429 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); 432 error = -filemap_flush(VFS_I(ip)->i_mapping);
433 if (error)
434 return error;
435 }
430 } 436 }
431 } 437 }
432 438
433 if (ip->i_d.di_nlink == 0) 439 if (ip->i_d.di_nlink == 0)
434 return 0; 440 return 0;
435 441
436 if ((S_ISREG(ip->i_d.di_mode) && 442 if (xfs_can_free_eofblocks(ip, false)) {
437 (VFS_I(ip)->i_size > 0 ||
438 (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
439 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
440 (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
441 443
442 /* 444 /*
443 * If we can't get the iolock just skip truncating the blocks 445 * If we can't get the iolock just skip truncating the blocks
@@ -464,7 +466,7 @@ xfs_release(
464 return 0; 466 return 0;
465 467
466 error = xfs_free_eofblocks(mp, ip, true); 468 error = xfs_free_eofblocks(mp, ip, true);
467 if (error) 469 if (error && error != EAGAIN)
468 return error; 470 return error;
469 471
470 /* delalloc blocks after truncation means it really is dirty */ 472 /* delalloc blocks after truncation means it really is dirty */
@@ -513,13 +515,12 @@ xfs_inactive(
513 goto out; 515 goto out;
514 516
515 if (ip->i_d.di_nlink != 0) { 517 if (ip->i_d.di_nlink != 0) {
516 if ((S_ISREG(ip->i_d.di_mode) && 518 /*
517 (VFS_I(ip)->i_size > 0 || 519 * force is true because we are evicting an inode from the
518 (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && 520 * cache. Post-eof blocks must be freed, lest we end up with
519 (ip->i_df.if_flags & XFS_IFEXTENTS) && 521 * broken free space accounting.
520 (!(ip->i_d.di_flags & 522 */
521 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || 523 if (xfs_can_free_eofblocks(ip, true)) {
522 ip->i_delayed_blks != 0))) {
523 error = xfs_free_eofblocks(mp, ip, false); 524 error = xfs_free_eofblocks(mp, ip, false);
524 if (error) 525 if (error)
525 return VN_INACTIVE_CACHE; 526 return VN_INACTIVE_CACHE;
@@ -777,7 +778,7 @@ xfs_create(
777 XFS_TRANS_PERM_LOG_RES, log_count); 778 XFS_TRANS_PERM_LOG_RES, log_count);
778 if (error == ENOSPC) { 779 if (error == ENOSPC) {
779 /* flush outstanding delalloc blocks and retry */ 780 /* flush outstanding delalloc blocks and retry */
780 xfs_flush_inodes(dp); 781 xfs_flush_inodes(mp);
781 error = xfs_trans_reserve(tp, resblks, log_res, 0, 782 error = xfs_trans_reserve(tp, resblks, log_res, 0,
782 XFS_TRANS_PERM_LOG_RES, log_count); 783 XFS_TRANS_PERM_LOG_RES, log_count);
783 } 784 }
@@ -1957,12 +1958,11 @@ xfs_free_file_space(
1957 1958
1958 rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); 1959 rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
1959 ioffset = offset & ~(rounding - 1); 1960 ioffset = offset & ~(rounding - 1);
1960 1961 error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
1961 if (VN_CACHED(VFS_I(ip)) != 0) { 1962 ioffset, -1);
1962 error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED); 1963 if (error)
1963 if (error) 1964 goto out_unlock_iolock;
1964 goto out_unlock_iolock; 1965 truncate_pagecache_range(VFS_I(ip), ioffset, -1);
1965 }
1966 1966
1967 /* 1967 /*
1968 * Need to zero the stuff we're not freeing, on disk. 1968 * Need to zero the stuff we're not freeing, on disk.
@@ -2095,6 +2095,73 @@ xfs_free_file_space(
2095 return error; 2095 return error;
2096} 2096}
2097 2097
2098
2099STATIC int
2100xfs_zero_file_space(
2101 struct xfs_inode *ip,
2102 xfs_off_t offset,
2103 xfs_off_t len,
2104 int attr_flags)
2105{
2106 struct xfs_mount *mp = ip->i_mount;
2107 uint granularity;
2108 xfs_off_t start_boundary;
2109 xfs_off_t end_boundary;
2110 int error;
2111
2112 granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
2113
2114 /*
2115 * Round the range of extents we are going to convert inwards. If the
2116 * offset is aligned, then it doesn't get changed so we zero from the
2117 * start of the block offset points to.
2118 */
2119 start_boundary = round_up(offset, granularity);
2120 end_boundary = round_down(offset + len, granularity);
2121
2122 ASSERT(start_boundary >= offset);
2123 ASSERT(end_boundary <= offset + len);
2124
2125 if (!(attr_flags & XFS_ATTR_NOLOCK))
2126 xfs_ilock(ip, XFS_IOLOCK_EXCL);
2127
2128 if (start_boundary < end_boundary - 1) {
2129 /* punch out the page cache over the conversion range */
2130 truncate_pagecache_range(VFS_I(ip), start_boundary,
2131 end_boundary - 1);
2132 /* convert the blocks */
2133 error = xfs_alloc_file_space(ip, start_boundary,
2134 end_boundary - start_boundary - 1,
2135 XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT,
2136 attr_flags);
2137 if (error)
2138 goto out_unlock;
2139
2140 /* We've handled the interior of the range, now for the edges */
2141 if (start_boundary != offset)
2142 error = xfs_iozero(ip, offset, start_boundary - offset);
2143 if (error)
2144 goto out_unlock;
2145
2146 if (end_boundary != offset + len)
2147 error = xfs_iozero(ip, end_boundary,
2148 offset + len - end_boundary);
2149
2150 } else {
2151 /*
2152	 * It's either a sub-granularity range, or the range spans parts
2153	 * of two adjacent blocks.
2154 */
2155 error = xfs_iozero(ip, offset, len);
2156 }
2157
2158out_unlock:
2159 if (!(attr_flags & XFS_ATTR_NOLOCK))
2160 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
2161 return error;
2162
2163}
2164
2098/* 2165/*
2099 * xfs_change_file_space() 2166 * xfs_change_file_space()
2100 * This routine allocates or frees disk space for the given file. 2167 * This routine allocates or frees disk space for the given file.
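A worked example makes the boundary handling in xfs_zero_file_space() concrete. Assume 4k blocks, so granularity = 4096, and a request of offset = 5000, len = 20000, i.e. the byte range [5000, 25000):

	start_boundary = round_up(5000, 4096);		/* =  8192 */
	end_boundary   = round_down(25000, 4096);	/* = 24576 */

	/* interior [8192, 24576): page cache truncated, blocks converted to
	 * preallocated/unwritten extents, which read back as zeroes */

	/* unaligned edges are zeroed through the page cache instead */
	error = xfs_iozero(ip, 5000, 8192 - 5000);	/* leading 3192 bytes */
	error = xfs_iozero(ip, 24576, 25000 - 24576);	/* trailing 424 bytes */

When the request fits inside a single granule, the start_boundary < end_boundary - 1 test fails and the else branch zeroes the whole range with one xfs_iozero() call.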
@@ -2120,10 +2187,8 @@ xfs_change_file_space(
2120 xfs_fsize_t fsize; 2187 xfs_fsize_t fsize;
2121 int setprealloc; 2188 int setprealloc;
2122 xfs_off_t startoffset; 2189 xfs_off_t startoffset;
2123 xfs_off_t llen;
2124 xfs_trans_t *tp; 2190 xfs_trans_t *tp;
2125 struct iattr iattr; 2191 struct iattr iattr;
2126 int prealloc_type;
2127 2192
2128 if (!S_ISREG(ip->i_d.di_mode)) 2193 if (!S_ISREG(ip->i_d.di_mode))
2129 return XFS_ERROR(EINVAL); 2194 return XFS_ERROR(EINVAL);
@@ -2141,12 +2206,30 @@ xfs_change_file_space(
2141 return XFS_ERROR(EINVAL); 2206 return XFS_ERROR(EINVAL);
2142 } 2207 }
2143 2208
2144 llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len; 2209 /*
2210	 * A length of <= 0 for resv/unresv/zero is invalid. The length for
2211 * alloc/free is ignored completely and we have no idea what userspace
2212 * might have set it to, so set it to zero to allow range
2213 * checks to pass.
2214 */
2215 switch (cmd) {
2216 case XFS_IOC_ZERO_RANGE:
2217 case XFS_IOC_RESVSP:
2218 case XFS_IOC_RESVSP64:
2219 case XFS_IOC_UNRESVSP:
2220 case XFS_IOC_UNRESVSP64:
2221 if (bf->l_len <= 0)
2222 return XFS_ERROR(EINVAL);
2223 break;
2224 default:
2225 bf->l_len = 0;
2226 break;
2227 }
2145 2228
2146 if (bf->l_start < 0 || 2229 if (bf->l_start < 0 ||
2147 bf->l_start > mp->m_super->s_maxbytes || 2230 bf->l_start > mp->m_super->s_maxbytes ||
2148 bf->l_start + llen < 0 || 2231 bf->l_start + bf->l_len < 0 ||
2149 bf->l_start + llen > mp->m_super->s_maxbytes) 2232 bf->l_start + bf->l_len >= mp->m_super->s_maxbytes)
2150 return XFS_ERROR(EINVAL); 2233 return XFS_ERROR(EINVAL);
2151 2234
2152 bf->l_whence = 0; 2235 bf->l_whence = 0;
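The range check can now use bf->l_len directly because the switch above guarantees it is either a validated positive length or zero. Restated as a single predicate, with illustrative naming:

	/* mirrors the new check; relies on l_len == 0 for ALLOCSP/FREESP */
	static bool
	example_range_ok(
		long long	start,
		long long	len,
		long long	maxbytes)
	{
		return !(start < 0 ||
			 start > maxbytes ||
			 start + len < 0 ||		/* signed wraparound */
			 start + len >= maxbytes);
	}

The end test also becomes '>= maxbytes', since l_len is the full byte count where the old llen was length - 1.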
@@ -2154,29 +2237,20 @@ xfs_change_file_space(
2154 startoffset = bf->l_start; 2237 startoffset = bf->l_start;
2155 fsize = XFS_ISIZE(ip); 2238 fsize = XFS_ISIZE(ip);
2156 2239
2157 /*
2158 * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
2159 * file space.
2160 * These calls do NOT zero the data space allocated to the file,
2161 * nor do they change the file size.
2162 *
2163 * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
2164 * space.
2165 * These calls cause the new file data to be zeroed and the file
2166 * size to be changed.
2167 */
2168 setprealloc = clrprealloc = 0; 2240 setprealloc = clrprealloc = 0;
2169 prealloc_type = XFS_BMAPI_PREALLOC;
2170
2171 switch (cmd) { 2241 switch (cmd) {
2172 case XFS_IOC_ZERO_RANGE: 2242 case XFS_IOC_ZERO_RANGE:
2173 prealloc_type |= XFS_BMAPI_CONVERT; 2243 error = xfs_zero_file_space(ip, startoffset, bf->l_len,
2174 xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0); 2244 attr_flags);
2175 /* FALLTHRU */ 2245 if (error)
2246 return error;
2247 setprealloc = 1;
2248 break;
2249
2176 case XFS_IOC_RESVSP: 2250 case XFS_IOC_RESVSP:
2177 case XFS_IOC_RESVSP64: 2251 case XFS_IOC_RESVSP64:
2178 error = xfs_alloc_file_space(ip, startoffset, bf->l_len, 2252 error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
2179 prealloc_type, attr_flags); 2253 XFS_BMAPI_PREALLOC, attr_flags);
2180 if (error) 2254 if (error)
2181 return error; 2255 return error;
2182 setprealloc = 1; 2256 setprealloc = 1;
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 447e146b2ba6..5163022d9808 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -48,14 +48,9 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
48int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); 48int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
49int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, 49int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
50 int flags, struct attrlist_cursor_kern *cursor); 50 int flags, struct attrlist_cursor_kern *cursor);
51void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first,
52 xfs_off_t last, int fiopt);
53int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first,
54 xfs_off_t last, int fiopt);
55int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first,
56 xfs_off_t last, uint64_t flags, int fiopt);
57int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last);
58 51
52int xfs_iozero(struct xfs_inode *, loff_t, size_t);
59int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); 53int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
54int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool);
60 55
61#endif /* _XFS_VNODEOPS_H */ 56#endif /* _XFS_VNODEOPS_H */