30 files changed, 681 insertions, 332 deletions
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 1d32f1d52763..306d883d89bc 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -21,6 +21,8 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
 #include "xfs_vnodeops.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
 #include "xfs_trace.h"
 #include <linux/slab.h>
 #include <linux/xattr.h>
@@ -34,7 +36,9 @@
 */
 STATIC struct posix_acl *
-xfs_acl_from_disk(struct xfs_acl *aclp)
+xfs_acl_from_disk(
+        struct xfs_acl  *aclp,
+        int             max_entries)
 {
        struct posix_acl_entry *acl_e;
        struct posix_acl *acl;
@@ -42,7 +46,7 @@ xfs_acl_from_disk(struct xfs_acl *aclp)
        unsigned int count, i;
        count = be32_to_cpu(aclp->acl_cnt);
-        if (count > XFS_ACL_MAX_ENTRIES)
+        if (count > max_entries)
                return ERR_PTR(-EFSCORRUPTED);
        acl = posix_acl_alloc(count, GFP_KERNEL);
@@ -108,9 +112,9 @@ xfs_get_acl(struct inode *inode, int type)
        struct xfs_inode *ip = XFS_I(inode);
        struct posix_acl *acl;
        struct xfs_acl *xfs_acl;
-        int len = sizeof(struct xfs_acl);
        unsigned char *ea_name;
        int error;
+        int len;
        acl = get_cached_acl(inode, type);
        if (acl != ACL_NOT_CACHED)
@@ -133,8 +137,8 @@ xfs_get_acl(struct inode *inode, int type)
         * If we have a cached ACLs value just return it, not need to
         * go out to the disk.
         */
+        len = XFS_ACL_MAX_SIZE(ip->i_mount);
-        xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
+        xfs_acl = kzalloc(len, GFP_KERNEL);
        if (!xfs_acl)
                return ERR_PTR(-ENOMEM);
@@ -153,7 +157,7 @@ xfs_get_acl(struct inode *inode, int type)
                goto out;
        }
-        acl = xfs_acl_from_disk(xfs_acl);
+        acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount));
        if (IS_ERR(acl))
                goto out;
@@ -189,16 +193,17 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
        if (acl) {
                struct xfs_acl *xfs_acl;
-                int len;
+                int len = XFS_ACL_MAX_SIZE(ip->i_mount);
-                xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
+                xfs_acl = kzalloc(len, GFP_KERNEL);
                if (!xfs_acl)
                        return -ENOMEM;
                xfs_acl_to_disk(xfs_acl, acl);
-                len = sizeof(struct xfs_acl) -
-                        (sizeof(struct xfs_acl_entry) *
+                /* subtract away the unused acl entries */
-                         (XFS_ACL_MAX_ENTRIES - acl->a_count));
+                len -= sizeof(struct xfs_acl_entry) *
+                         (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count);
                error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
                                len, ATTR_ROOT);
@@ -243,7 +248,7 @@ xfs_set_mode(struct inode *inode, umode_t mode)
 static int
 xfs_acl_exists(struct inode *inode, unsigned char *name)
 {
-        int len = sizeof(struct xfs_acl);
+        int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb));
        return (xfs_attr_get(XFS_I(inode), name, NULL, &len,
                            ATTR_ROOT|ATTR_KERNOVAL) == 0);
@@ -379,7 +384,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
                goto out_release;
        error = -EINVAL;
-        if (acl->a_count > XFS_ACL_MAX_ENTRIES)
+        if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb)))
                goto out_release;
        if (type == ACL_TYPE_ACCESS) {
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 39632d941354..4016a567b83c 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -22,19 +22,36 @@ struct inode;
 struct posix_acl;
 struct xfs_inode;
-#define XFS_ACL_MAX_ENTRIES 25
 #define XFS_ACL_NOT_PRESENT (-1)
 /* On-disk XFS access control list structure */
+struct xfs_acl_entry {
+        __be32  ae_tag;
+        __be32  ae_id;
+        __be16  ae_perm;
+        __be16  ae_pad;         /* fill the implicit hole in the structure */
+};
 struct xfs_acl {
-        __be32          acl_cnt;
+        __be32                  acl_cnt;
-        struct xfs_acl_entry {
+        struct xfs_acl_entry    acl_entry[0];
-                __be32  ae_tag;
-                __be32  ae_id;
-                __be16  ae_perm;
-        } acl_entry[XFS_ACL_MAX_ENTRIES];
 };
+/*
+ * The number of ACL entries allowed is defined by the on-disk format.
+ * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is
+ * limited only by the maximum size of the xattr that stores the information.
+ */
+#define XFS_ACL_MAX_ENTRIES(mp) \
+        (xfs_sb_version_hascrc(&(mp)->m_sb) \
+                ?  (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
+                                                sizeof(struct xfs_acl_entry) \
+                : 25)
+#define XFS_ACL_MAX_SIZE(mp) \
+        (sizeof(struct xfs_acl) + \
+                sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
 /* On-disk XFS extended attribute names */
 #define SGI_ACL_FILE            (unsigned char *)"SGI_ACL_FILE"
 #define SGI_ACL_DEFAULT         (unsigned char *)"SGI_ACL_DEFAULT"
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 2b2691b73428..41a695048be7 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -725,6 +725,25 @@ xfs_convert_page(
                        (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
                        i_size_read(inode));
+        /*
+         * If the current map does not span the entire page we are about to try
+         * to write, then give up. The only way we can write a page that spans
+         * multiple mappings in a single writeback iteration is via the
+         * xfs_vm_writepage() function. Data integrity writeback requires the
+         * entire page to be written in a single attempt, otherwise the part of
+         * the page we don't write here doesn't get written as part of the data
+         * integrity sync.
+         *
+         * For normal writeback, we also don't attempt to write partial pages
+         * here as it simply means that write_cache_pages() will see it under
+         * writeback and ignore the page until some point in the future, at
+         * which time this will be the only page in the file that needs
+         * writeback.  Hence for more optimal IO patterns, we should always
+         * avoid partial page writeback due to multiple mappings on a page here.
+         */
+        if (!xfs_imap_valid(inode, imap, end_offset))
+                goto fail_unlock_page;
        len = 1 << inode->i_blkbits;
        p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
                                        PAGE_CACHE_SIZE);
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 08d5457c948e..31d3cd129269 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -931,20 +931,22 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 */
 int
 xfs_attr_shortform_allfit(
-        struct xfs_buf  *bp,
+        struct xfs_buf          *bp,
-        struct xfs_inode *dp)
+        struct xfs_inode        *dp)
 {
-        xfs_attr_leafblock_t *leaf;
+        struct xfs_attr_leafblock *leaf;
-        xfs_attr_leaf_entry_t *entry;
+        struct xfs_attr_leaf_entry *entry;
        xfs_attr_leaf_name_local_t *name_loc;
-        int bytes, i;
+        struct xfs_attr3_icleaf_hdr leafhdr;
+        int                     bytes;
+        int                     i;
        leaf = bp->b_addr;
-        ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
+        xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+        entry = xfs_attr3_leaf_entryp(leaf);
-        entry = &leaf->entries[0];
        bytes = sizeof(struct xfs_attr_sf_hdr);
-        for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
+        for (i = 0; i < leafhdr.count; entry++, i++) {
                if (entry->flags & XFS_ATTR_INCOMPLETE)
                        continue;               /* don't copy partial entries */
                if (!(entry->flags & XFS_ATTR_LOCAL))
@@ -954,15 +956,15 @@ xfs_attr_shortform_allfit(
                        return(0);
                if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
                        return(0);
-                bytes += sizeof(struct xfs_attr_sf_entry)-1
+                bytes += sizeof(struct xfs_attr_sf_entry) - 1
                                + name_loc->namelen
                                + be16_to_cpu(name_loc->valuelen);
        }
        if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) &&
            (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
            (bytes == sizeof(struct xfs_attr_sf_hdr)))
-                return(-1);
+                return -1;
-        return(xfs_attr_shortform_bytesfit(dp, bytes));
+        return xfs_attr_shortform_bytesfit(dp, bytes);
 }
 /*
@@ -1410,7 +1412,7 @@ xfs_attr3_leaf_add_work(
                name_rmt->valuelen = 0;
                name_rmt->valueblk = 0;
                args->rmtblkno = 1;
-                args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen);
+                args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
        }
        xfs_trans_log_buf(args->trans, bp,
             XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
@@ -1443,11 +1445,12 @@ xfs_attr3_leaf_add_work(
 STATIC void
 xfs_attr3_leaf_compact(
        struct xfs_da_args      *args,
-        struct xfs_attr3_icleaf_hdr *ichdr_d,
+        struct xfs_attr3_icleaf_hdr *ichdr_dst,
        struct xfs_buf          *bp)
 {
-        xfs_attr_leafblock_t    *leaf_s, *leaf_d;
+        struct xfs_attr_leafblock *leaf_src;
-        struct xfs_attr3_icleaf_hdr ichdr_s;
+        struct xfs_attr_leafblock *leaf_dst;
+        struct xfs_attr3_icleaf_hdr ichdr_src;
        struct xfs_trans        *trans = args->trans;
        struct xfs_mount        *mp = trans->t_mountp;
        char                    *tmpbuffer;
@@ -1455,29 +1458,38 @@ xfs_attr3_leaf_compact(
        trace_xfs_attr_leaf_compact(args);
        tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP);
-        ASSERT(tmpbuffer != NULL);
        memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp));
        memset(bp->b_addr, 0, XFS_LBSIZE(mp));
+        leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
+        leaf_dst = bp->b_addr;
        /*
-         * Copy basic information
+         * Copy the on-disk header back into the destination buffer to ensure
+         * all the information in the header that is not part of the incore
+         * header structure is preserved.
         */
-        leaf_s = (xfs_attr_leafblock_t *)tmpbuffer;
+        memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src));
-        leaf_d = bp->b_addr;
-        ichdr_s = *ichdr_d;     /* struct copy */
+        /* Initialise the incore headers */
-        ichdr_d->firstused = XFS_LBSIZE(mp);
+        ichdr_src = *ichdr_dst; /* struct copy */
-        ichdr_d->usedbytes = 0;
+        ichdr_dst->firstused = XFS_LBSIZE(mp);
-        ichdr_d->count = 0;
+        ichdr_dst->usedbytes = 0;
-        ichdr_d->holes = 0;
+        ichdr_dst->count = 0;
-        ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_s);
+        ichdr_dst->holes = 0;
-        ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base;
+        ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src);
+        ichdr_dst->freemap[0].size = ichdr_dst->firstused -
+                                                ichdr_dst->freemap[0].base;
+        /* write the header back to initialise the underlying buffer */
+        xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst);
        /*
         * Copy all entry's in the same (sorted) order,
         * but allocate name/value pairs packed and in sequence.
         */
-        xfs_attr3_leaf_moveents(leaf_s, &ichdr_s, 0, leaf_d, ichdr_d, 0,
+        xfs_attr3_leaf_moveents(leaf_src, &ichdr_src, 0, leaf_dst, ichdr_dst, 0,
-                                ichdr_s.count, mp);
+                                ichdr_src.count, mp);
        /*
         * this logs the entire buffer, but the caller must write the header
         * back to the buffer when it is finished modifying it.
@@ -2179,14 +2191,24 @@ xfs_attr3_leaf_unbalance(
                struct xfs_attr_leafblock *tmp_leaf;
                struct xfs_attr3_icleaf_hdr tmphdr;
-                tmp_leaf = kmem_alloc(state->blocksize, KM_SLEEP);
+                tmp_leaf = kmem_zalloc(state->blocksize, KM_SLEEP);
-                memset(tmp_leaf, 0, state->blocksize);
-                memset(&tmphdr, 0, sizeof(tmphdr));
+                /*
+                 * Copy the header into the temp leaf so that all the stuff
+                 * not in the incore header is present and gets copied back in
+                 * once we've moved all the entries.
+                 */
+                memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf));
+                memset(&tmphdr, 0, sizeof(tmphdr));
                tmphdr.magic = savehdr.magic;
                tmphdr.forw = savehdr.forw;
                tmphdr.back = savehdr.back;
                tmphdr.firstused = state->blocksize;
+                /* write the header to the temp buffer to initialise it */
+                xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr);
                if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
                                         drop_blk->bp, &drophdr)) {
                        xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0,
@@ -2330,9 +2352,11 @@ xfs_attr3_leaf_lookup_int(
                        if (!xfs_attr_namesp_match(args->flags, entry->flags))
                                continue;
                        args->index = probe;
+                        args->valuelen = be32_to_cpu(name_rmt->valuelen);
                        args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
-                        args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount,
+                        args->rmtblkcnt = xfs_attr3_rmt_blocks(
-                                                   be32_to_cpu(name_rmt->valuelen));
+                                                        args->dp->i_mount,
+                                                        args->valuelen);
                        return XFS_ERROR(EEXIST);
                }
        }
@@ -2383,7 +2407,8 @@ xfs_attr3_leaf_getvalue(
                ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
                valuelen = be32_to_cpu(name_rmt->valuelen);
                args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
-                args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, valuelen);
+                args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
+                                                       valuelen);
                if (args->flags & ATTR_KERNOVAL) {
                        args->valuelen = valuelen;
                        return 0;
@@ -2709,7 +2734,8 @@ xfs_attr3_leaf_list_int(
                                args.valuelen = valuelen;
                                args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
                                args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
-                                args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen);
+                                args.rmtblkcnt = xfs_attr3_rmt_blocks(
+                                                        args.dp->i_mount, valuelen);
                                retval = xfs_attr_rmtval_get(&args);
                                if (retval)
                                        return retval;
@@ -3232,7 +3258,7 @@ xfs_attr3_leaf_inactive(
                        name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
                        if (name_rmt->valueblk) {
                                lp->valueblk = be32_to_cpu(name_rmt->valueblk);
-                                lp->valuelen = XFS_B_TO_FSB(dp->i_mount,
+                                lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount,
                                                    be32_to_cpu(name_rmt->valuelen));
                                lp++;
                        }
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index f9d7846097e2..444a7704596c 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -128,6 +128,7 @@ struct xfs_attr3_leaf_hdr {
        __u8                    holes;
        __u8                    pad1;
        struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE];
+        __be32                  pad2;           /* 64 bit alignment */
 };
 #define XFS_ATTR3_LEAF_CRC_OFF  (offsetof(struct xfs_attr3_leaf_hdr, info.crc))
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
index dee84466dcc9..ef6b0c124528 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/xfs_attr_remote.c
@@ -47,22 +47,55 @@
 * Each contiguous block has a header, so it is not just a simple attribute
 * length to FSB conversion.
 */
-static int
+int
 xfs_attr3_rmt_blocks(
        struct xfs_mount *mp,
        int             attrlen)
 {
-        int             buflen = XFS_ATTR3_RMT_BUF_SPACE(mp,
+        if (xfs_sb_version_hascrc(&mp->m_sb)) {
-                                                         mp->m_sb.sb_blocksize);
+                int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
-        return (attrlen + buflen - 1) / buflen;
+                return (attrlen + buflen - 1) / buflen;
+        }
+        return XFS_B_TO_FSB(mp, attrlen);
+}
+/*
+ * Checking of the remote attribute header is split into two parts. The verifier
+ * does CRC, location and bounds checking, the unpacking function checks the
+ * attribute parameters and owner.
+ */
+static bool
+xfs_attr3_rmt_hdr_ok(
+        struct xfs_mount        *mp,
+        void                    *ptr,
+        xfs_ino_t               ino,
+        uint32_t                offset,
+        uint32_t                size,
+        xfs_daddr_t             bno)
+{
+        struct xfs_attr3_rmt_hdr *rmt = ptr;
+        if (bno != be64_to_cpu(rmt->rm_blkno))
+                return false;
+        if (offset != be32_to_cpu(rmt->rm_offset))
+                return false;
+        if (size != be32_to_cpu(rmt->rm_bytes))
+                return false;
+        if (ino != be64_to_cpu(rmt->rm_owner))
+                return false;
+        /* ok */
+        return true;
 }
 static bool
 xfs_attr3_rmt_verify(
-        struct xfs_buf          *bp)
+        struct xfs_mount        *mp,
+        void                    *ptr,
+        int                     fsbsize,
+        xfs_daddr_t             bno)
 {
-        struct xfs_mount        *mp = bp->b_target->bt_mount;
+        struct xfs_attr3_rmt_hdr *rmt = ptr;
-        struct xfs_attr3_rmt_hdr *rmt = bp->b_addr;
        if (!xfs_sb_version_hascrc(&mp->m_sb))
                return false;
@@ -70,7 +103,9 @@ xfs_attr3_rmt_verify(
                return false;
        if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid))
                return false;
-        if (bp->b_bn != be64_to_cpu(rmt->rm_blkno))
+        if (be64_to_cpu(rmt->rm_blkno) != bno)
+                return false;
+        if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
                return false;
        if (be32_to_cpu(rmt->rm_offset) +
                                be32_to_cpu(rmt->rm_bytes) >= XATTR_SIZE_MAX)
@@ -86,17 +121,40 @@ xfs_attr3_rmt_read_verify(
        struct xfs_buf  *bp)
 {
        struct xfs_mount *mp = bp->b_target->bt_mount;
+        char            *ptr;
+        int             len;
+        bool            corrupt = false;
+        xfs_daddr_t     bno;
        /* no verification of non-crc buffers */
        if (!xfs_sb_version_hascrc(&mp->m_sb))
                return;
-        if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+        ptr = bp->b_addr;
-                              XFS_ATTR3_RMT_CRC_OFF) ||
+        bno = bp->b_bn;
-            !xfs_attr3_rmt_verify(bp)) {
+        len = BBTOB(bp->b_length);
+        ASSERT(len >= XFS_LBSIZE(mp));
+        while (len > 0) {
+                if (!xfs_verify_cksum(ptr, XFS_LBSIZE(mp),
+                                      XFS_ATTR3_RMT_CRC_OFF)) {
+                        corrupt = true;
+                        break;
+                }
+                if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) {
+                        corrupt = true;
+                        break;
+                }
+                len -= XFS_LBSIZE(mp);
+                ptr += XFS_LBSIZE(mp);
+                bno += mp->m_bsize;
+        }
+        if (corrupt) {
                XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
                xfs_buf_ioerror(bp, EFSCORRUPTED);
-        }
+        } else
+                ASSERT(len == 0);
 }
 static void
@@ -105,23 +163,39 @@ xfs_attr3_rmt_write_verify(
 {
        struct xfs_mount *mp = bp->b_target->bt_mount;
        struct xfs_buf_log_item *bip = bp->b_fspriv;
+        char            *ptr;
+        int             len;
+        xfs_daddr_t     bno;
        /* no verification of non-crc buffers */
        if (!xfs_sb_version_hascrc(&mp->m_sb))
                return;
-        if (!xfs_attr3_rmt_verify(bp)) {
+        ptr = bp->b_addr;
-                XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+        bno = bp->b_bn;
-                xfs_buf_ioerror(bp, EFSCORRUPTED);
+        len = BBTOB(bp->b_length);
-                return;
+        ASSERT(len >= XFS_LBSIZE(mp));
-        }
+        while (len > 0) {
+                if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) {
+                        XFS_CORRUPTION_ERROR(__func__,
+                                            XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+                        xfs_buf_ioerror(bp, EFSCORRUPTED);
+                        return;
+                }
+                if (bip) {
+                        struct xfs_attr3_rmt_hdr *rmt;
+                        rmt = (struct xfs_attr3_rmt_hdr *)ptr;
+                        rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+                }
+                xfs_update_cksum(ptr, XFS_LBSIZE(mp), XFS_ATTR3_RMT_CRC_OFF);
-        if (bip) {
+                len -= XFS_LBSIZE(mp);
-                struct xfs_attr3_rmt_hdr *rmt = bp->b_addr;
+                ptr += XFS_LBSIZE(mp);
-                rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+                bno += mp->m_bsize;
        }
-        xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
+        ASSERT(len == 0);
-                         XFS_ATTR3_RMT_CRC_OFF);
 }
 const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
@@ -129,15 +203,16 @@ const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
        .verify_write = xfs_attr3_rmt_write_verify,
 };
-static int
+STATIC int
 xfs_attr3_rmt_hdr_set(
        struct xfs_mount        *mp,
+        void                    *ptr,
        xfs_ino_t               ino,
        uint32_t                offset,
        uint32_t                size,
-        struct xfs_buf          *bp)
+        xfs_daddr_t             bno)
 {
-        struct xfs_attr3_rmt_hdr *rmt = bp->b_addr;
+        struct xfs_attr3_rmt_hdr *rmt = ptr;
        if (!xfs_sb_version_hascrc(&mp->m_sb))
                return 0;
@@ -147,36 +222,107 @@ xfs_attr3_rmt_hdr_set(
        rmt->rm_bytes = cpu_to_be32(size);
        uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid);
        rmt->rm_owner = cpu_to_be64(ino);
-        rmt->rm_blkno = cpu_to_be64(bp->b_bn);
+        rmt->rm_blkno = cpu_to_be64(bno);
-        bp->b_ops = &xfs_attr3_rmt_buf_ops;
        return sizeof(struct xfs_attr3_rmt_hdr);
 }
 /*
- * Checking of the remote attribute header is split into two parts. the verifier
+ * Helper functions to copy attribute data in and out of the on-disk extents
- * does CRC, location and bounds checking, the unpacking function checks the
- * attribute parameters and owner.
 */
-static bool
+STATIC int
-xfs_attr3_rmt_hdr_ok(
+xfs_attr_rmtval_copyout(
-        struct xfs_mount        *mp,
+        struct xfs_mount *mp,
-        xfs_ino_t               ino,
+        struct xfs_buf  *bp,
-        uint32_t                offset,
+        xfs_ino_t       ino,
-        uint32_t                size,
+        int             *offset,
-        struct xfs_buf          *bp)
+        int             *valuelen,
+        char            **dst)
 {
-        struct xfs_attr3_rmt_hdr *rmt = bp->b_addr;
+        char            *src = bp->b_addr;
+        xfs_daddr_t     bno = bp->b_bn;
+        int             len = BBTOB(bp->b_length);
-        if (offset != be32_to_cpu(rmt->rm_offset))
+        ASSERT(len >= XFS_LBSIZE(mp));
-                return false;
-        if (size != be32_to_cpu(rmt->rm_bytes))
-                return false;
-        if (ino != be64_to_cpu(rmt->rm_owner))
-                return false;
-        /* ok */
+        while (len > 0 && *valuelen > 0) {
-        return true;
+                int hdr_size = 0;
+                int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp));
+                byte_cnt = min_t(int, *valuelen, byte_cnt);
+                if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                        if (!xfs_attr3_rmt_hdr_ok(mp, src, ino, *offset,
+                                                  byte_cnt, bno)) {
+                                xfs_alert(mp,
+"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/0x%x/0x%llx)",
+                                        bno, *offset, byte_cnt, ino);
+                                return EFSCORRUPTED;
+                        }
+                        hdr_size = sizeof(struct xfs_attr3_rmt_hdr);
+                }
+                memcpy(*dst, src + hdr_size, byte_cnt);
+                /* roll buffer forwards by one XFS_LBSIZE(mp) block */
+                len -= XFS_LBSIZE(mp);
+                src += XFS_LBSIZE(mp);
+                bno += mp->m_bsize;
+                /* roll attribute data forwards by the bytes just copied */
+                *valuelen -= byte_cnt;
+                *dst += byte_cnt;
+                *offset += byte_cnt;
+        }
+        return 0;
+}
+STATIC void
+xfs_attr_rmtval_copyin(
+        struct xfs_mount *mp,
+        struct xfs_buf  *bp,
+        xfs_ino_t       ino,
+        int             *offset,
+        int             *valuelen,
+        char            **src)
+{
+        char            *dst = bp->b_addr;
+        xfs_daddr_t     bno = bp->b_bn;
+        int             len = BBTOB(bp->b_length);
+        ASSERT(len >= XFS_LBSIZE(mp));
+        while (len > 0 && *valuelen > 0) {
+                int hdr_size;
+                int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp));
+                byte_cnt = min(*valuelen, byte_cnt);
+                hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset,
+                                                 byte_cnt, bno);
+                memcpy(dst + hdr_size, *src, byte_cnt);
+                /*
+                 * If this is the last block, zero the remainder of it.
+                 * Check that we are actually the last block, too.
+                 */
+                if (byte_cnt + hdr_size < XFS_LBSIZE(mp)) {
+                        ASSERT(*valuelen - byte_cnt == 0);
+                        ASSERT(len == XFS_LBSIZE(mp));
+                        memset(dst + hdr_size + byte_cnt, 0,
+                                        XFS_LBSIZE(mp) - hdr_size - byte_cnt);
+                }
+                /* roll buffer forwards */
+                len -= XFS_LBSIZE(mp);
+                dst += XFS_LBSIZE(mp);
+                bno += mp->m_bsize;
+                /* roll attribute data forwards */
+                *valuelen -= byte_cnt;
+                *src += byte_cnt;
+                *offset += byte_cnt;
+        }
 }
 /*
@@ -190,13 +336,12 @@ xfs_attr_rmtval_get(
        struct xfs_bmbt_irec    map[ATTR_RMTVALUE_MAPSIZE];
        struct xfs_mount        *mp = args->dp->i_mount;
        struct xfs_buf          *bp;
-        xfs_daddr_t             dblkno;
        xfs_dablk_t             lblkno = args->rmtblkno;
-        void                    *dst = args->value;
+        char                    *dst = args->value;
        int                     valuelen = args->valuelen;
        int                     nmap;
        int                     error;
-        int                     blkcnt;
+        int                     blkcnt = args->rmtblkcnt;
        int                     i;
        int                     offset = 0;
@@ -207,52 +352,36 @@ xfs_attr_rmtval_get(
        while (valuelen > 0) {
                nmap = ATTR_RMTVALUE_MAPSIZE;
                error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
-                                       args->rmtblkcnt, map, &nmap,
+                                       blkcnt, map, &nmap,
                                       XFS_BMAPI_ATTRFORK);
                if (error)
                        return error;
                ASSERT(nmap >= 1);
                for (i = 0; (i < nmap) && (valuelen > 0); i++) {
-                        int     byte_cnt;
+                        xfs_daddr_t     dblkno;
-                        char    *src;
+                        int             dblkcnt;
                        ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
                               (map[i].br_startblock != HOLESTARTBLOCK));
                        dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
-                        blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+                        dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
                        error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
-                                                   dblkno, blkcnt, 0, &bp,
+                                                   dblkno, dblkcnt, 0, &bp,
                                                   &xfs_attr3_rmt_buf_ops);
                        if (error)
                                return error;
-                        byte_cnt = min_t(int, valuelen, BBTOB(bp->b_length));
+                        error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
-                        byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, byte_cnt);
+                                                        &offset, &valuelen,
+                                                        &dst);
-                        src = bp->b_addr;
-                        if (xfs_sb_version_hascrc(&mp->m_sb)) {
-                                if (!xfs_attr3_rmt_hdr_ok(mp, args->dp->i_ino,
-                                                        offset, byte_cnt, bp)) {
-                                        xfs_alert(mp,
-"remote attribute header does not match required off/len/owner (0x%x/Ox%x,0x%llx)",
-                                                offset, byte_cnt, args->dp->i_ino);
-                                        xfs_buf_relse(bp);
-                                        return EFSCORRUPTED;
-                                }
-                                src += sizeof(struct xfs_attr3_rmt_hdr);
-                        }
-                        memcpy(dst, src, byte_cnt);
                        xfs_buf_relse(bp);
+                        if (error)
+                                return error;
-                        offset += byte_cnt;
+                        /* roll attribute extent map forwards */
-                        dst += byte_cnt;
-                        valuelen -= byte_cnt;
                        lblkno += map[i].br_blockcount;
+                        blkcnt -= map[i].br_blockcount;
                }
        }
        ASSERT(valuelen == 0);
@@ -270,17 +399,13 @@ xfs_attr_rmtval_set(
        struct xfs_inode        *dp = args->dp;
        struct xfs_mount        *mp = dp->i_mount;
        struct xfs_bmbt_irec    map;
-        struct xfs_buf          *bp;
-        xfs_daddr_t             dblkno;
        xfs_dablk_t             lblkno;
        xfs_fileoff_t           lfileoff = 0;
-        void                    *src = args->value;
+        char                    *src = args->value;
        int                     blkcnt;
        int                     valuelen;
        int                     nmap;
        int                     error;
-        int                     hdrcnt = 0;
-        bool                    crcs = xfs_sb_version_hascrc(&mp->m_sb);
        int                     offset = 0;
        trace_xfs_attr_rmtval_set(args);
@@ -289,24 +414,14 @@ xfs_attr_rmtval_set(
         * Find a "hole" in the attribute address space large enough for
         * us to drop the new attribute's value into. Because CRC enable
         * attributes have headers, we can't just do a straight byte to FSB
-         * conversion. We calculate the worst case block count in this case
+         * conversion and have to take the header space into account.
-         * and we may not need that many, so we have to handle this when
-         * allocating the blocks below. 
         */
-        if (!crcs)
+        blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
-                blkcnt = XFS_B_TO_FSB(mp, args->valuelen);
-        else
-                blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
        error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
                                                   XFS_ATTR_FORK);
        if (error)
                return error;
-        /* Start with the attribute data. We'll allocate the rest afterwards. */
-        if (crcs)
-                blkcnt = XFS_B_TO_FSB(mp, args->valuelen);
        args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
        args->rmtblkcnt = blkcnt;
@@ -349,26 +464,6 @@ xfs_attr_rmtval_set(
                       (map.br_startblock != HOLESTARTBLOCK));
                lblkno += map.br_blockcount;
                blkcnt -= map.br_blockcount;
-                hdrcnt++;
-                /*
-                 * If we have enough blocks for the attribute data, calculate
-                 * how many extra blocks we need for headers. We might run
-                 * through this multiple times in the case that the additional
-                 * headers in the blocks needed for the data fragments spills
-                 * into requiring more blocks. e.g. for 512 byte blocks, we'll
-                 * spill for another block every 9 headers we require in this
-                 * loop.
-                 */
-                if (crcs && blkcnt == 0) {
-                        int total_len;
-                        total_len = args->valuelen +
-                                    hdrcnt * sizeof(struct xfs_attr3_rmt_hdr);
-                        blkcnt = XFS_B_TO_FSB(mp, total_len);
-                        blkcnt -= args->rmtblkcnt;
-                        args->rmtblkcnt += blkcnt;
-                }
                /*
                 * Start the next trans in the chain.
@@ -385,18 +480,19 @@ xfs_attr_rmtval_set(
         * the INCOMPLETE flag.
         */
        lblkno = args->rmtblkno;
+        blkcnt = args->rmtblkcnt;
        valuelen = args->valuelen;
        while (valuelen > 0) {
-                int     byte_cnt;
+                struct xfs_buf  *bp;
-                char    *buf;
+                xfs_daddr_t     dblkno;
+                int             dblkcnt;
+                ASSERT(blkcnt > 0);
-                /*
-                 * Try to remember where we decided to put the value.
-                 */
                xfs_bmap_init(args->flist, args->firstblock);
                nmap = 1;
                error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
-                                       args->rmtblkcnt, &map, &nmap,
+                                       blkcnt, &map, &nmap,
                                       XFS_BMAPI_ATTRFORK);
                if (error)
                        return(error);
@@ -405,41 +501,27 @@ xfs_attr_rmtval_set(
                       (map.br_startblock != HOLESTARTBLOCK));
                dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
-                blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+                dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
-                bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 0);
+                bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0);
                if (!bp)
                        return ENOMEM;
                bp->b_ops = &xfs_attr3_rmt_buf_ops;
-                byte_cnt = BBTOB(bp->b_length);
+                xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset,
-                byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, byte_cnt);
+                                       &valuelen, &src);
-                if (valuelen < byte_cnt)
-                        byte_cnt = valuelen;
-                buf = bp->b_addr;
-                buf += xfs_attr3_rmt_hdr_set(mp, dp->i_ino, offset,
-                                             byte_cnt, bp);
-                memcpy(buf, src, byte_cnt);
-                if (byte_cnt < BBTOB(bp->b_length))
-                        xfs_buf_zero(bp, byte_cnt,
-                                     BBTOB(bp->b_length) - byte_cnt);
                error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */
                xfs_buf_relse(bp);
                if (error)
                        return error;
-                src += byte_cnt;
-                valuelen -= byte_cnt;
-                offset += byte_cnt;
-                hdrcnt--;
+                /* roll attribute extent map forwards */
                lblkno += map.br_blockcount;
+                blkcnt -= map.br_blockcount;
        }
        ASSERT(valuelen == 0);
-        ASSERT(hdrcnt == 0);
        return 0;
 }
@@ -448,33 +530,40 @@ xfs_attr_rmtval_set(
 * out-of-line buffer that it is stored on.
 */
 int
-xfs_attr_rmtval_remove(xfs_da_args_t *args)
+xfs_attr_rmtval_remove(
+        struct xfs_da_args      *args)
 {
-        xfs_mount_t *mp;
+        struct xfs_mount        *mp = args->dp->i_mount;
-        xfs_bmbt_irec_t map;
+        xfs_dablk_t             lblkno;
-        xfs_buf_t *bp;
+        int                     blkcnt;
-        xfs_daddr_t dblkno;
+        int                     error;
-        xfs_dablk_t lblkno;
+        int                     done;
-        int valuelen, blkcnt, nmap, error, done, committed;
        trace_xfs_attr_rmtval_remove(args);
-        mp = args->dp->i_mount;
        /*
-         * Roll through the "value", invalidating the attribute value's
+         * Roll through the "value", invalidating the attribute value's blocks.
-         * blocks.
+         * Note that args->rmtblkcnt is the minimum number of data blocks we'll
+         * see for a CRC enabled remote attribute. Each extent will have a
+         * header, and so we may have more blocks than we realise here.  If we
+         * fail to map the blocks correctly, we'll have problems with the buffer
+         * lookups.
         */
        lblkno = args->rmtblkno;
-        valuelen = args->rmtblkcnt;
+        blkcnt = args->rmtblkcnt;
-        while (valuelen > 0) {
+        while (blkcnt > 0) {
+                struct xfs_bmbt_irec    map;
+                struct xfs_buf          *bp;
+                xfs_daddr_t             dblkno;
+                int                     dblkcnt;
+                int                     nmap;
                /*
                 * Try to remember where we decided to put the value.
                 */
                nmap = 1;
                error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
-                                       args->rmtblkcnt, &map, &nmap,
+                                       blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK);
-                                       XFS_BMAPI_ATTRFORK);
                if (error)
                        return(error);
                ASSERT(nmap == 1);
@@ -482,21 +571,20 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
                       (map.br_startblock != HOLESTARTBLOCK));
                dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
-                blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+                dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
                /*
                 * If the "remote" value is in the cache, remove it.
                 */
-                bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK);
+                bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK);
                if (bp) {
                        xfs_buf_stale(bp);
                        xfs_buf_relse(bp);
                        bp = NULL;
                }
-                valuelen -= map.br_blockcount;
                lblkno += map.br_blockcount;
+                blkcnt -= map.br_blockcount;
        }
        /*
@@ -506,6 +594,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
        blkcnt = args->rmtblkcnt;
        done = 0;
        while (!done) {
+                int committed;
                xfs_bmap_init(args->flist, args->firstblock);
                error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
                                    XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
diff --git a/fs/xfs/xfs_attr_remote.h b/fs/xfs/xfs_attr_remote.h
index c7cca60a062a..92a8fd7977cc 100644
--- a/fs/xfs/xfs_attr_remote.h
+++ b/fs/xfs/xfs_attr_remote.h
@@ -20,6 +20,14 @@
 #define XFS_ATTR3_RMT_MAGIC     0x5841524d      /* XARM */
+/*
+ * There is one of these headers per filesystem block in a remote attribute.
+ * This is done to ensure there is a 1:1 mapping between the attribute value
+ * length and the number of blocks needed to store the attribute. This makes the
+ * verification of a buffer a little more complex, but greatly simplifies the
+ * allocation, reading and writing of these attributes as we don't have to guess
+ * the number of blocks needed to store the attribute data.
+ */
 struct xfs_attr3_rmt_hdr {
        __be32  rm_magic;
        __be32  rm_offset;
@@ -39,6 +47,8 @@ struct xfs_attr3_rmt_hdr {
 extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;
+int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen);
 int xfs_attr_rmtval_get(struct xfs_da_args *args);
 int xfs_attr_rmtval_set(struct xfs_da_args *args);
 int xfs_attr_rmtval_remove(struct xfs_da_args *args);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 8804b8a3c310..0903960410a2 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -2544,7 +2544,17 @@ xfs_btree_new_iroot(
        if (error)
                goto error0;
+        /*
+         * we can't just memcpy() the root in for CRC enabled btree blocks.
+         * In that case we also have to ensure the blkno remains correct.
+         */
        memcpy(cblock, block, xfs_btree_block_len(cur));
+        if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+                if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+                        cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
+                else
+                        cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
+        }
        be16_add_cpu(&block->bb_level, 1);
        xfs_btree_set_numrecs(block, 1);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 82b70bda9f47..1b2472a46e46 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -513,6 +513,7 @@ _xfs_buf_find(
                xfs_alert(btp->bt_mount,
                          "%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
                          __func__, blkno, eofs);
+                WARN_ON(1);
                return NULL;
        }
@@ -1649,7 +1650,7 @@ xfs_alloc_buftarg(
 {
        xfs_buftarg_t           *btp;
-        btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
+        btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS);
        btp->bt_mount = mp;
        btp->bt_dev =  bdev->bd_dev;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index cf263476d6b4..4ec431777048 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -262,12 +262,7 @@ xfs_buf_item_format_segment(
                        vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
                        vecp->i_len = nbits * XFS_BLF_CHUNK;
                        vecp->i_type = XLOG_REG_TYPE_BCHUNK;
-/*
+                        nvecs++;
- * You would think we need to bump the nvecs here too, but we do not
- * this number is used by recovery, and it gets confused by the boundary
- * split here
- *                      nvecs++;
- */
                        vecp++;
                        first_bit = next_bit;
                        last_bit = next_bit;
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 9b26a99ebfe9..0b8b2a13cd24 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -270,6 +270,7 @@ xfs_da3_node_read_verify(
                                break;
                        return;
                case XFS_ATTR_LEAF_MAGIC:
+                case XFS_ATTR3_LEAF_MAGIC:
                        bp->b_ops = &xfs_attr3_leaf_buf_ops;
                        bp->b_ops->verify_read(bp);
                        return;
@@ -2464,7 +2465,8 @@ xfs_buf_map_from_irec(
        ASSERT(nirecs >= 1);
        if (nirecs > 1) {
-                map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_SLEEP);
+                map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
+                                  KM_SLEEP | KM_NOFS);
                if (!map)
                        return ENOMEM;
                *mapp = map;
@@ -2520,7 +2522,8 @@ xfs_dabuf_map(
                 * Optimize the one-block case.
                 */
                if (nfsb != 1)
-                        irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_SLEEP);
+                        irecs = kmem_zalloc(sizeof(irec) * nfsb,
+                                            KM_SLEEP | KM_NOFS);
                nirecs = nfsb;
                error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index f852b082a084..c407e1ccff43 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -219,6 +219,14 @@ xfs_swap_extents(
        int             taforkblks = 0;
        __uint64_t      tmp;
+        /*
+         * We have no way of updating owner information in the BMBT blocks for
+         * each inode on CRC enabled filesystems, so to avoid corrupting the
+         * metadata we simply don't allow extent swaps to occur.
+         */
+        if (xfs_sb_version_hascrc(&mp->m_sb))
+                return XFS_ERROR(EINVAL);
        tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
        if (!tempifp) {
                error = XFS_ERROR(ENOMEM);
diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h
index a3b1bd841a80..7826782b8d78 100644
--- a/fs/xfs/xfs_dir2_format.h
+++ b/fs/xfs/xfs_dir2_format.h
@@ -266,6 +266,7 @@ struct xfs_dir3_blk_hdr {
 struct xfs_dir3_data_hdr {
        struct xfs_dir3_blk_hdr hdr;
        xfs_dir2_data_free_t    best_free[XFS_DIR2_DATA_FD_COUNT];
+        __be32                  pad;    /* 64 bit alignment */
 };
 #define XFS_DIR3_DATA_CRC_OFF  offsetof(struct xfs_dir3_data_hdr, hdr.crc)
@@ -477,7 +478,7 @@ struct xfs_dir3_leaf_hdr {
        struct xfs_da3_blkinfo  info;           /* header for da routines */
        __be16                  count;          /* count of entries */
        __be16                  stale;          /* count of stale entries */
-        __be32                  pad;
+        __be32                  pad;            /* 64 bit alignment */
 };
 struct xfs_dir3_icleaf_hdr {
@@ -715,6 +716,7 @@ struct xfs_dir3_free_hdr {
        __be32                  firstdb;        /* db of first entry */
        __be32                  nvalid;         /* count of valid entries */
        __be32                  nused;          /* count of used entries */
+        __be32                  pad;            /* 64 bit alignment */
 };
 struct xfs_dir3_free {
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 721ba2fe8e54..da71a1819d78 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1336,7 +1336,7 @@ xfs_dir2_leaf_getdents(
                                     mp->m_sb.sb_blocksize);
        map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
                                (length * sizeof(struct xfs_bmbt_irec)),
-                               KM_SLEEP);
+                               KM_SLEEP | KM_NOFS);
        map_info->map_size = length;
        /*
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 5246de4912d4..2226a00acd15 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -263,18 +263,19 @@ xfs_dir3_free_get_buf(
         * Initialize the new block to be empty, and remember
         * its first slot as our empty slot.
         */
-        hdr.magic = XFS_DIR2_FREE_MAGIC;
+        memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr));
-        hdr.firstdb = 0;
+        memset(&hdr, 0, sizeof(hdr));
-        hdr.nused = 0;
-        hdr.nvalid = 0;
        if (xfs_sb_version_hascrc(&mp->m_sb)) {
                struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
                hdr.magic = XFS_DIR3_FREE_MAGIC;
                hdr3->hdr.blkno = cpu_to_be64(bp->b_bn);
                hdr3->hdr.owner = cpu_to_be64(dp->i_ino);
                uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid);
-        }
+        } else
+                hdr.magic = XFS_DIR2_FREE_MAGIC;
        xfs_dir3_free_hdr_to_disk(bp->b_addr, &hdr);
        *bpp = bp;
        return 0;
@@ -1921,8 +1922,6 @@ xfs_dir2_node_addname_int(
                         */
                        freehdr.firstdb = (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) *
                                        xfs_dir3_free_max_bests(mp);
-                        free->hdr.nvalid = 0;
-                        free->hdr.nused = 0;
                } else {
                        free = fbp->b_addr;
                        bests = xfs_dir3_free_bests_p(mp, free);
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index a41f8bf1da37..044e97a33c8d 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -249,8 +249,11 @@ xfs_qm_init_dquot_blk(
                d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
                d->dd_diskdq.d_id = cpu_to_be32(curid);
                d->dd_diskdq.d_flags = type;
-                if (xfs_sb_version_hascrc(&mp->m_sb))
+                if (xfs_sb_version_hascrc(&mp->m_sb)) {
                        uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
+                        xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
+                                         XFS_DQUOT_CRC_OFF);
+                }
        }
        xfs_trans_dquot_buf(tp, bp,
@@ -286,23 +289,6 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp)
        dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5;
 }
-STATIC void
-xfs_dquot_buf_calc_crc(
-        struct xfs_mount        *mp,
-        struct xfs_buf          *bp)
-{
-        struct xfs_dqblk        *d = (struct xfs_dqblk *)bp->b_addr;
-        int                     i;
-        if (!xfs_sb_version_hascrc(&mp->m_sb))
-                return;
-        for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++, d++) {
-                xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
-                                 offsetof(struct xfs_dqblk, dd_crc));
-        }
-}
 STATIC bool
 xfs_dquot_buf_verify_crc(
        struct xfs_mount        *mp,
@@ -328,12 +314,11 @@ xfs_dquot_buf_verify_crc(
        for (i = 0; i < ndquots; i++, d++) {
                if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
-                                 offsetof(struct xfs_dqblk, dd_crc)))
+                                 XFS_DQUOT_CRC_OFF))
                        return false;
                if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid))
                        return false;
        }
        return true;
 }
@@ -393,6 +378,11 @@ xfs_dquot_buf_read_verify(
        }
 }
+/*
+ * we don't calculate the CRC here as that is done when the dquot is flushed to
+ * the buffer after the update is done. This ensures that the dquot in the
+ * buffer always has an up-to-date CRC value.
+ */
 void
 xfs_dquot_buf_write_verify(
        struct xfs_buf  *bp)
@@ -404,7 +394,6 @@ xfs_dquot_buf_write_verify(
                xfs_buf_ioerror(bp, EFSCORRUPTED);
                return;
        }
-        xfs_dquot_buf_calc_crc(mp, bp);
 }
 const struct xfs_buf_ops xfs_dquot_buf_ops = {
@@ -1151,11 +1140,17 @@ xfs_qm_dqflush(
         * copy the lsn into the on-disk dquot now while we have the in memory
         * dquot here. This can't be done later in the write verifier as we
         * can't get access to the log item at that point in time.
+         *
+         * We also calculate the CRC here so that the on-disk dquot in the
+         * buffer always has a valid CRC. This ensures there is no possibility
+         * of a dquot without an up-to-date CRC getting to disk.
         */
        if (xfs_sb_version_hascrc(&mp->m_sb)) {
                struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp;
                dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn);
+                xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
+                                 XFS_DQUOT_CRC_OFF);
        }
        /*
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index c0f375087efc..452920a3f03f 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -305,11 +305,12 @@ xfs_efi_release(xfs_efi_log_item_t	*efip,
 {
        ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
        if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) {
-                __xfs_efi_release(efip);
                /* recovery needs us to drop the EFI reference, too */
                if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags))
                        __xfs_efi_release(efip);
+                __xfs_efi_release(efip);
+                /* efip may now have been freed, do not reference it again. */
        }
 }
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 6dda3f949b04..d04695545397 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -236,6 +236,7 @@ typedef struct xfs_fsop_resblks {
 #define XFS_FSOP_GEOM_FLAGS_PROJID32    0x0800  /* 32-bit project IDs   */
 #define XFS_FSOP_GEOM_FLAGS_DIRV2CI     0x1000  /* ASCII only CI names  */
 #define XFS_FSOP_GEOM_FLAGS_LAZYSB      0x4000  /* lazy superblock counters */
+#define XFS_FSOP_GEOM_FLAGS_V5SB        0x8000  /* version 5 superblock */
 /*
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 87595b211da1..3c3644ea825b 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -99,7 +99,9 @@ xfs_fs_geometry(
                        (xfs_sb_version_hasattr2(&mp->m_sb) ?
                                XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) |
                        (xfs_sb_version_hasprojid32bit(&mp->m_sb) ?
-                                XFS_FSOP_GEOM_FLAGS_PROJID32 : 0);
+                                XFS_FSOP_GEOM_FLAGS_PROJID32 : 0) |
+                        (xfs_sb_version_hascrc(&mp->m_sb) ?
+                                XFS_FSOP_GEOM_FLAGS_V5SB : 0);
                geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
                                mp->m_sb.sb_logsectsize : BBSIZE;
                geo->rtsectsize = mp->m_sb.sb_blocksize;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index efbe1accb6ca..7f7be5f98f52 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1638,6 +1638,10 @@ xfs_iunlink(
                dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
                offset = ip->i_imap.im_boffset +
                        offsetof(xfs_dinode_t, di_next_unlinked);
+                /* need to recalc the inode CRC if appropriate */
+                xfs_dinode_calc_crc(mp, dip);
                xfs_trans_inode_buf(tp, ibp);
                xfs_trans_log_buf(tp, ibp, offset,
                                  (offset + sizeof(xfs_agino_t) - 1));
@@ -1723,6 +1727,10 @@ xfs_iunlink_remove(
                        dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
                        offset = ip->i_imap.im_boffset +
                                offsetof(xfs_dinode_t, di_next_unlinked);
+                        /* need to recalc the inode CRC if appropriate */
+                        xfs_dinode_calc_crc(mp, dip);
                        xfs_trans_inode_buf(tp, ibp);
                        xfs_trans_log_buf(tp, ibp, offset,
                                          (offset + sizeof(xfs_agino_t) - 1));
@@ -1796,6 +1804,10 @@ xfs_iunlink_remove(
                        dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
                        offset = ip->i_imap.im_boffset +
                                offsetof(xfs_dinode_t, di_next_unlinked);
+                        /* need to recalc the inode CRC if appropriate */
+                        xfs_dinode_calc_crc(mp, dip);
                        xfs_trans_inode_buf(tp, ibp);
                        xfs_trans_log_buf(tp, ibp, offset,
                                          (offset + sizeof(xfs_agino_t) - 1));
@@ -1809,6 +1821,10 @@ xfs_iunlink_remove(
                last_dip->di_next_unlinked = cpu_to_be32(next_agino);
                ASSERT(next_agino != 0);
                offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
+                /* need to recalc the inode CRC if appropriate */
+                xfs_dinode_calc_crc(mp, last_dip);
                xfs_trans_inode_buf(tp, last_ibp);
                xfs_trans_log_buf(tp, last_ibp, offset,
                                  (offset + sizeof(xfs_agino_t) - 1));
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index d82efaa2ac73..ca9ecaa81112 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -455,6 +455,28 @@ xfs_vn_getattr(
        return 0;
 }
+static void
+xfs_setattr_mode(
+        struct xfs_trans        *tp,
+        struct xfs_inode        *ip,
+        struct iattr            *iattr)
+{
+        struct inode    *inode = VFS_I(ip);
+        umode_t         mode = iattr->ia_mode;
+        ASSERT(tp);
+        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+        if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+                mode &= ~S_ISGID;
+        ip->i_d.di_mode &= S_IFMT;
+        ip->i_d.di_mode |= mode & ~S_IFMT;
+        inode->i_mode &= S_IFMT;
+        inode->i_mode |= mode & ~S_IFMT;
+}
 int
 xfs_setattr_nonsize(
        struct xfs_inode        *ip,
@@ -606,18 +628,8 @@ xfs_setattr_nonsize(
        /*
         * Change file access modes.
         */
-        if (mask & ATTR_MODE) {
+        if (mask & ATTR_MODE)
-                umode_t mode = iattr->ia_mode;
+                xfs_setattr_mode(tp, ip, iattr);
-                if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
-                        mode &= ~S_ISGID;
-                ip->i_d.di_mode &= S_IFMT;
-                ip->i_d.di_mode |= mode & ~S_IFMT;
-                inode->i_mode &= S_IFMT;
-                inode->i_mode |= mode & ~S_IFMT;
-        }
        /*
         * Change file access or modified times.
@@ -714,9 +726,8 @@ xfs_setattr_size(
                return XFS_ERROR(error);
        ASSERT(S_ISREG(ip->i_d.di_mode));
-        ASSERT((mask & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
+        ASSERT((mask & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
-                        ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID|
+                        ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
-                        ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
        if (!(flags & XFS_ATTR_NOLOCK)) {
                lock_flags |= XFS_IOLOCK_EXCL;
@@ -860,6 +871,12 @@ xfs_setattr_size(
                xfs_inode_clear_eofblocks_tag(ip);
        }
+        /*
+         * Change file access modes.
+         */
+        if (mask & ATTR_MODE)
+                xfs_setattr_mode(tp, ip, iattr);
        if (mask & ATTR_CTIME) {
                inode->i_ctime = iattr->ia_ctime;
                ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index e3d0b85d852b..d0833b54e55d 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -139,7 +139,7 @@ xlog_cil_prepare_log_vecs(
                new_lv = kmem_zalloc(sizeof(*new_lv) +
                                niovecs * sizeof(struct xfs_log_iovec),
-                                KM_SLEEP);
+                                KM_SLEEP|KM_NOFS);
                /* The allocated iovec region lies beyond the log vector. */
                new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 93f03ec17eec..7cf5e4eafe28 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1599,10 +1599,43 @@ xlog_recover_add_to_trans(
 }
 /*
- * Sort the log items in the transaction. Cancelled buffers need
+ * Sort the log items in the transaction.
- * to be put first so they are processed before any items that might
+ *
- * modify the buffers. If they are cancelled, then the modifications
+ * The ordering constraints are defined by the inode allocation and unlink
- * don't need to be replayed.
+ * behaviour. The rules are:
+ *
+ *      1. Every item is only logged once in a given transaction. Hence it
+ *         represents the last logged state of the item. Hence ordering is
+ *         dependent on the order in which operations need to be performed so
+ *         required initial conditions are always met.
+ *
+ *      2. Cancelled buffers are recorded in pass 1 in a separate table and
+ *         there's nothing to replay from them so we can simply cull them
+ *         from the transaction. However, we can't do that until after we've
+ *         replayed all the other items because they may be dependent on the
+ *         cancelled buffer and replaying the cancelled buffer can remove it
+ *         from the cancelled buffer table. Hence they have to be done last.
+ *
+ *      3. Inode allocation buffers must be replayed before inode items that
+ *         read the buffer and replay changes into it.
+ *
+ *      4. Inode unlink buffers must be replayed after inode items are replayed.
+ *         This ensures that inodes are completely flushed to the inode buffer
+ *         in a "free" state before we remove the unlinked inode list pointer.
+ *
+ * Hence the ordering needs to be inode allocation buffers first, inode items
+ * second, inode unlink buffers third and cancelled buffers last.
+ *
+ * But there's a problem with that - we can't tell an inode allocation buffer
+ * apart from a regular buffer, so we can't separate them. We can, however,
+ * tell an inode unlink buffer from the others, and so we can separate them out
+ * from all the other buffers and move them to last.
+ *
+ * Hence, 4 lists, in order from head to tail:
+ *      - buffer_list for all buffers except cancelled/inode unlink buffers
+ *      - inode_list for all non-buffer items
+ *      - inode_buffer_list for inode unlink buffers
+ *      - cancel_list for the cancelled buffers
 */
 STATIC int
 xlog_recover_reorder_trans(
@@ -1612,6 +1645,10 @@ xlog_recover_reorder_trans(
 {
        xlog_recover_item_t     *item, *n;
        LIST_HEAD(sort_list);
+        LIST_HEAD(cancel_list);
+        LIST_HEAD(buffer_list);
+        LIST_HEAD(inode_buffer_list);
+        LIST_HEAD(inode_list);
        list_splice_init(&trans->r_itemq, &sort_list);
        list_for_each_entry_safe(item, n, &sort_list, ri_list) {
@@ -1619,12 +1656,18 @@ xlog_recover_reorder_trans(
                switch (ITEM_TYPE(item)) {
                case XFS_LI_BUF:
-                        if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
+                        if (buf_f->blf_flags & XFS_BLF_CANCEL) {
                                trace_xfs_log_recover_item_reorder_head(log,
                                                        trans, item, pass);
-                                list_move(&item->ri_list, &trans->r_itemq);
+                                list_move(&item->ri_list, &cancel_list);
+                                break;
+                        }
+                        if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
+                                list_move(&item->ri_list, &inode_buffer_list);
                                break;
                        }
+                        list_move_tail(&item->ri_list, &buffer_list);
+                        break;
                case XFS_LI_INODE:
                case XFS_LI_DQUOT:
                case XFS_LI_QUOTAOFF:
@@ -1632,7 +1675,7 @@ xlog_recover_reorder_trans(
                case XFS_LI_EFI:
                        trace_xfs_log_recover_item_reorder_tail(log,
                                                        trans, item, pass);
-                        list_move_tail(&item->ri_list, &trans->r_itemq);
+                        list_move_tail(&item->ri_list, &inode_list);
                        break;
                default:
                        xfs_warn(log->l_mp,
@@ -1643,6 +1686,14 @@ xlog_recover_reorder_trans(
                }
        }
        ASSERT(list_empty(&sort_list));
+        if (!list_empty(&buffer_list))
+                list_splice(&buffer_list, &trans->r_itemq);
+        if (!list_empty(&inode_list))
+                list_splice_tail(&inode_list, &trans->r_itemq);
+        if (!list_empty(&inode_buffer_list))
+                list_splice_tail(&inode_buffer_list, &trans->r_itemq);
+        if (!list_empty(&cancel_list))
+                list_splice_tail(&cancel_list, &trans->r_itemq);
        return 0;
 }
@@ -1794,7 +1845,13 @@ xlog_recover_do_inode_buffer(
        xfs_agino_t             *buffer_nextp;
        trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
-        bp->b_ops = &xfs_inode_buf_ops;
+        /*
+         * Post recovery validation only works properly on CRC enabled
+         * filesystems.
+         */
+        if (xfs_sb_version_hascrc(&mp->m_sb))
+                bp->b_ops = &xfs_inode_buf_ops;
        inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
        for (i = 0; i < inodes_per_buf; i++) {
@@ -1861,6 +1918,15 @@ xlog_recover_do_inode_buffer(
                buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
                                              next_unlinked_offset);
                *buffer_nextp = *logged_nextp;
+                /*
+                 * If necessary, recalculate the CRC in the on-disk inode. We
+                 * have to leave the inode in a consistent state for whoever
+                 * reads it next....
+                 */
+                xfs_dinode_calc_crc(mp, (struct xfs_dinode *)
+                                xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
        }
        return 0;
@@ -2097,6 +2163,17 @@ xlog_recover_do_reg_buffer(
                       ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
                /*
+                 * The dirty regions logged in the buffer, even though
+                 * contiguous, may span multiple chunks. This is because the
+                 * dirty region may span a physical page boundary in a buffer
+                 * and hence be split into two separate vectors for writing into
+                 * the log. Hence we need to trim nbits back to the length of
+                 * the current region being copied out of the log.
+                 */
+                if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
+                        nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
+                /*
                 * Do a sanity check if this is a dquot buffer. Just checking
                 * the first dquot in the buffer should do. XXXThis is
                 * probably a good thing to do for other buf types also.
@@ -2134,7 +2211,16 @@ xlog_recover_do_reg_buffer(
        /* Shouldn't be any more regions */
        ASSERT(i == item->ri_total);
-        xlog_recovery_validate_buf_type(mp, bp, buf_f);
+        /*
+         * We can only do post recovery validation on items on CRC enabled
+         * filesystems as we need to know when the buffer was written to be able
+         * to determine if we should have replayed the item. If we replay old
+         * metadata over a newer buffer, then it will enter a temporarily
+         * inconsistent state resulting in verification failures. Hence for now
+         * just avoid the verification stage for non-CRC filesystems.
+         */
+        if (xfs_sb_version_hascrc(&mp->m_sb))
+                xlog_recovery_validate_buf_type(mp, bp, buf_f);
 }
 /*
@@ -2255,6 +2341,12 @@ xfs_qm_dqcheck(
        d->dd_diskdq.d_flags = type;
        d->dd_diskdq.d_id = cpu_to_be32(id);
+        if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
+                xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
+                                 XFS_DQUOT_CRC_OFF);
+        }
        return errs;
 }
@@ -2782,6 +2874,10 @@ xlog_recover_dquot_pass2(
        }
        memcpy(ddq, recddq, item->ri_buf[1].i_len);
+        if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
+                                 XFS_DQUOT_CRC_OFF);
+        }
        ASSERT(dq_f->qlf_size == 2);
        ASSERT(bp->b_target->bt_mount == mp);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index f6bfbd734669..e8e310c05097 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -314,7 +314,8 @@ STATIC int
 xfs_mount_validate_sb(
        xfs_mount_t     *mp,
        xfs_sb_t        *sbp,
-        bool            check_inprogress)
+        bool            check_inprogress,
+        bool            check_version)
 {
        /*
@@ -337,9 +338,10 @@ xfs_mount_validate_sb(
        /*
         * Version 5 superblock feature mask validation. Reject combinations the
-         * kernel cannot support up front before checking anything else.
+         * kernel cannot support up front before checking anything else. For
+         * write validation, we don't need to check feature masks.
         */
-        if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) {
+        if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) {
                xfs_alert(mp,
 "Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n"
 "Use of these features in this kernel is at your own risk!");
@@ -675,7 +677,8 @@ xfs_sb_to_disk(
 static int
 xfs_sb_verify(
-        struct xfs_buf  *bp)
+        struct xfs_buf  *bp,
+        bool            check_version)
 {
        struct xfs_mount *mp = bp->b_target->bt_mount;
        struct xfs_sb   sb;
@@ -686,7 +689,8 @@ xfs_sb_verify(
         * Only check the in progress field for the primary superblock as
         * mkfs.xfs doesn't clear it from secondary superblocks.
         */
-        return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR);
+        return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR,
+                                     check_version);
 }
 /*
@@ -719,7 +723,7 @@ xfs_sb_read_verify(
                        goto out_error;
                }
        }
-        error = xfs_sb_verify(bp);
+        error = xfs_sb_verify(bp, true);
 out_error:
        if (error) {
@@ -758,7 +762,7 @@ xfs_sb_write_verify(
        struct xfs_buf_log_item *bip = bp->b_fspriv;
        int                     error;
-        error = xfs_sb_verify(bp);
+        error = xfs_sb_verify(bp, false);
        if (error) {
                XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
                xfs_buf_ioerror(bp, error);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index f41702b43003..b75c9bb6e71e 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -41,6 +41,7 @@
 #include "xfs_qm.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
+#include "xfs_cksum.h"
 /*
 * The global quota manager. There is only one of these for the entire
@@ -839,7 +840,7 @@ xfs_qm_reset_dqcounts(
        xfs_dqid_t      id,
        uint            type)
 {
-        xfs_disk_dquot_t        *ddq;
+        struct xfs_dqblk        *dqb;
        int                     j;
        trace_xfs_reset_dqcounts(bp, _RET_IP_);
@@ -853,8 +854,12 @@ xfs_qm_reset_dqcounts(
        do_div(j, sizeof(xfs_dqblk_t));
        ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
 #endif
-        ddq = bp->b_addr;
+        dqb = bp->b_addr;
        for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) {
+                struct xfs_disk_dquot   *ddq;
+                ddq = (struct xfs_disk_dquot *)&dqb[j];
                /*
                 * Do a sanity check, and if needed, repair the dqblk. Don't
                 * output any warnings because it's perfectly possible to
@@ -871,7 +876,12 @@ xfs_qm_reset_dqcounts(
                ddq->d_bwarns = 0;
                ddq->d_iwarns = 0;
                ddq->d_rtbwarns = 0;
-                ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1);
+                if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                        xfs_update_cksum((char *)&dqb[j],
+                                         sizeof(struct xfs_dqblk),
+                                         XFS_DQUOT_CRC_OFF);
+                }
        }
 }
@@ -907,19 +917,29 @@ xfs_qm_dqiter_bufs(
                              XFS_FSB_TO_DADDR(mp, bno),
                              mp->m_quotainfo->qi_dqchunklen, 0, &bp,
                              &xfs_dquot_buf_ops);
-                if (error)
-                        break;
                /*
-                 * XXX(hch): need to figure out if it makes sense to validate
+                 * CRC and validation errors will return an EFSCORRUPTED here. If
-                 *           the CRC here.
+                 * this occurs, re-read without CRC validation so that we can
+                 * repair the damage via xfs_qm_reset_dqcounts(). This process
+                 * will leave a trace in the log indicating corruption has
+                 * been detected.
                 */
+                if (error == EFSCORRUPTED) {
+                        error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+                                      XFS_FSB_TO_DADDR(mp, bno),
+                                      mp->m_quotainfo->qi_dqchunklen, 0, &bp,
+                                      NULL);
+                }
+                if (error)
+                        break;
                xfs_qm_reset_dqcounts(mp, bp, firstid, type);
                xfs_buf_delwri_queue(bp, buffer_list);
                xfs_buf_relse(bp);
-                /*
-                 * goto the next block.
+                /* goto the next block. */
-                 */
                bno++;
                firstid += mp->m_quotainfo->qi_dqperchunk;
        }
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index c41190cad6e9..6cdf6ffc36a1 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -489,31 +489,36 @@ xfs_qm_scall_setqlim(
        if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0)
                return 0;
-        tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
-        error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp),
-                                  0, 0, XFS_DEFAULT_LOG_COUNT);
-        if (error) {
-                xfs_trans_cancel(tp, 0);
-                return (error);
-        }
        /*
         * We don't want to race with a quotaoff so take the quotaoff lock.
-         * (We don't hold an inode lock, so there's nothing else to stop
+         * We don't hold an inode lock, so there's nothing else to stop
-         * a quotaoff from happening). (XXXThis doesn't currently happen
+         * a quotaoff from happening.
-         * because we take the vfslock before calling xfs_qm_sysent).
         */
        mutex_lock(&q->qi_quotaofflock);
        /*
-         * Get the dquot (locked), and join it to the transaction.
+         * Get the dquot (locked) before we start, as we need to do a
-         * Allocate the dquot if this doesn't exist.
+         * transaction to allocate it if it doesn't exist. Once we have the
+         * dquot, unlock it so we can start the next transaction safely. We hold
+         * a reference to the dquot, so it's safe to do this unlock/lock without
+         * it being reclaimed in the meantime.
         */
-        if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) {
+        error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp);
-                xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+        if (error) {
                ASSERT(error != ENOENT);
                goto out_unlock;
        }
+        xfs_dqunlock(dqp);
+        tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
+        error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp),
+                                  0, 0, XFS_DEFAULT_LOG_COUNT);
+        if (error) {
+                xfs_trans_cancel(tp, 0);
+                goto out_rele;
+        }
+        xfs_dqlock(dqp);
        xfs_trans_dqjoin(tp, dqp);
        ddq = &dqp->q_core;
@@ -621,9 +626,10 @@ xfs_qm_scall_setqlim(
        xfs_trans_log_dquot(tp, dqp);
        error = xfs_trans_commit(tp, 0);
-        xfs_qm_dqrele(dqp);
- out_unlock:
+out_rele:
+        xfs_qm_dqrele(dqp);
+out_unlock:
        mutex_unlock(&q->qi_quotaofflock);
        return error;
 }
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index c61e31c7d997..c38068f26c55 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -87,6 +87,8 @@ typedef struct xfs_dqblk {
        uuid_t            dd_uuid;      /* location information */
 } xfs_dqblk_t;
+#define XFS_DQUOT_CRC_OFF       offsetof(struct xfs_dqblk, dd_crc)
 /*
 * flags for q_flags field in the dquot.
 */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index ea341cea68cb..3033ba5e9762 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1373,6 +1373,17 @@ xfs_finish_flags(
        }
        /*
+         * V5 filesystems always use attr2 format for attributes.
+         */
+        if (xfs_sb_version_hascrc(&mp->m_sb) &&
+            (mp->m_flags & XFS_MOUNT_NOATTR2)) {
+                xfs_warn(mp,
+"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.",
+                        MNTOPT_NOATTR2, MNTOPT_ATTR2);
+                return XFS_ERROR(EINVAL);
+        }
+        /*
         * mkfs'ed attr2 will turn on attr2 mount unless explicitly
         * told by noattr2 to turn it off
         */
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 5f234389327c..195a403e1522 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -56,16 +56,9 @@ xfs_symlink_blocks(
        struct xfs_mount *mp,
        int             pathlen)
 {
-        int             fsblocks = 0;
+        int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
-        int             len = pathlen;
-        do {
+        return (pathlen + buflen - 1) / buflen;
-                fsblocks++;
-                len -= XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
-        } while (len > 0);
-        ASSERT(fsblocks <= XFS_SYMLINK_MAPS);
-        return fsblocks;
 }
 static int
@@ -405,7 +398,7 @@ xfs_symlink(
        if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version))
                fs_blocks = 0;
        else
-                fs_blocks = XFS_B_TO_FSB(mp, pathlen);
+                fs_blocks = xfs_symlink_blocks(mp, pathlen);
        resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
        error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
                        XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
@@ -512,7 +505,7 @@ xfs_symlink(
                cur_chunk = target_path;
                offset = 0;
                for (n = 0; n < nmaps; n++) {
-                        char *buf;
+                        char    *buf;
                        d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
                        byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
@@ -525,9 +518,7 @@ xfs_symlink(
                        bp->b_ops = &xfs_symlink_buf_ops;
                        byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
-                        if (pathlen < byte_cnt) {
+                        byte_cnt = min(byte_cnt, pathlen);
-                                byte_cnt = pathlen;
-                        }
                        buf = bp->b_addr;
                        buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset,
@@ -542,6 +533,7 @@ xfs_symlink(
                        xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) -
                                                        (char *)bp->b_addr);
                }
+                ASSERT(pathlen == 0);
        }
        /*
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 1501f4fa51a6..0176bb21f09a 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1453,7 +1453,7 @@ xfs_free_file_space(
        xfs_mount_t             *mp;
        int                     nimap;
        uint                    resblks;
-        uint                    rounding;
+        xfs_off_t               rounding;
        int                     rt;
        xfs_fileoff_t           startoffset_fsb;
        xfs_trans_t             *tp;
@@ -1482,7 +1482,7 @@ xfs_free_file_space(
                inode_dio_wait(VFS_I(ip));
        }
-        rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
+        rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
        ioffset = offset & ~(rounding - 1);
        error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
                                              ioffset, -1);