Merge tag 'xfs-for-linus-4.7-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs

Pull xfs updates from Dave Chinner:
 "A pretty average collection of fixes, cleanups and improvements in
  this request.

  Summary:
   - fixes for mount line parsing, sparse warnings, read-only compat
     feature remount behaviour
   - allow fast path symlink lookups for inline symlinks.
   - attribute listing cleanups
   - writeback goes direct to bios rather than indirecting through
     bufferheads
   - transaction allocation cleanup
   - optimised kmem_realloc
   - added configurable error handling for metadata write errors,
     changed default error handling behaviour from "retry forever" to
     "retry until unmount then fail"
   - fixed several inode cluster writeback lookup vs reclaim race
     conditions
   - fixed inode cluster writeback checking wrong inode after lookup
   - fixed bugs where struct xfs_inode freeing wasn't actually RCU safe
   - cleaned up inode reclaim tagging"

* tag 'xfs-for-linus-4.7-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs: (39 commits)
  xfs: fix warning in xfs_finish_page_writeback for non-debug builds
  xfs: move reclaim tagging functions
  xfs: simplify inode reclaim tagging interfaces
  xfs: rename variables in xfs_iflush_cluster for clarity
  xfs: xfs_iflush_cluster has range issues
  xfs: mark reclaimed inodes invalid earlier
  xfs: xfs_inode_free() isn't RCU safe
  xfs: optimise xfs_iext_destroy
  xfs: skip stale inodes in xfs_iflush_cluster
  xfs: fix inode validity check in xfs_iflush_cluster
  xfs: xfs_iflush_cluster fails to abort on error
  xfs: remove xfs_fs_evict_inode()
  xfs: add "fail at unmount" error handling configuration
  xfs: add configuration handlers for specific errors
  xfs: add configuration of error failure speed
  xfs: introduce table-based init for error behaviors
  xfs: add configurable error support to metadata buffers
  xfs: introduce metadata IO error class
  xfs: configurable error behavior via sysfs
  xfs: buffer ->bi_end_io function requires irq-safe lock
  ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2016-05-26 13:13:40 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2016-05-26 13:13:40 -0400
commit: 0b9210c9c86e46a7a62bbc7b69b84001315072ff (patch)
tree: 0a0872c6b998c6fa3de29f1929be025f6060e749
parent: c5436731de860b3a3cff70c62d99242418aab1d1 (diff)
parent: 555b67e4e729ca544bb4028ab12e532c68b70ddb (diff)
49 files changed, 1293 insertions, 1135 deletions
diff --git a/fs/namei.c b/fs/namei.c
index 5375571cf6e1..15b124c18ed8 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4542,7 +4542,6 @@ int readlink_copy(char __user *buffer, int buflen, const char *link)
 out:
        return len;
 }
-EXPORT_SYMBOL(readlink_copy);
 /*
 * A helper for ->readlink().  This should be used *ONLY* for symlinks that
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 686ba6fb20dd..339c696bbc01 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -93,19 +93,23 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
 }
 void *
-kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
+kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags)
-             xfs_km_flags_t flags)
 {
-        void    *new;
+        int     retries = 0;
+        gfp_t   lflags = kmem_flags_convert(flags);
+        void    *ptr;
-        new = kmem_alloc(newsize, flags);
+        do {
-        if (ptr) {
+                ptr = krealloc(old, newsize, lflags);
-                if (new)
+                if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
-                        memcpy(new, ptr,
+                        return ptr;
-                                ((oldsize < newsize) ? oldsize : newsize));
+                if (!(++retries % 100))
-                kmem_free(ptr);
+                        xfs_err(NULL,
-        }
+        "%s(%u) possible memory allocation deadlock size %zu in %s (mode:0x%x)",
-        return new;
+                                current->comm, current->pid,
+                                newsize, __func__, lflags);
+                congestion_wait(BLK_RW_ASYNC, HZ/50);
+        } while (1);
 }
 void *
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index d1c66e465ca5..689f746224e7 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -62,7 +62,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
 extern void *kmem_alloc(size_t, xfs_km_flags_t);
 extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t);
-extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t);
+extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t);
 static inline void  kmem_free(const void *ptr)
 {
        kvfree(ptr);
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index fa3b948ef9c2..4e126f41a0aa 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -242,37 +242,21 @@ xfs_attr_set(
                        return error;
        }
-        /*
+        tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
-         * Start our first transaction of the day.
+                         M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
-         *
+        tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
-         * All future transactions during this code must be "chained" off
+        tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
-         * this one via the trans_dup() call.  All transactions will contain
-         * the inode, and the inode will always be marked with trans_ihold().
-         * Since the inode will be locked in all transactions, we must log
-         * the inode in every transaction to let it float upward through
-         * the log.
-         */
-        args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET);
        /*
         * Root fork attributes can use reserved data blocks for this
         * operation if necessary
         */
+        error = xfs_trans_alloc(mp, &tres, args.total, 0,
-        if (rsvd)
+                        rsvd ? XFS_TRANS_RESERVE : 0, &args.trans);
-                args.trans->t_flags |= XFS_TRANS_RESERVE;
+        if (error)
-        tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
-                         M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
-        tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
-        tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
-        error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
-        if (error) {
-                xfs_trans_cancel(args.trans);
                return error;
-        }
-        xfs_ilock(dp, XFS_ILOCK_EXCL);
+        xfs_ilock(dp, XFS_ILOCK_EXCL);
        error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
                                rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
                                       XFS_QMOPT_RES_REGBLKS);
@@ -429,31 +413,15 @@ xfs_attr_remove(
                return error;
        /*
-         * Start our first transaction of the day.
-         *
-         * All future transactions during this code must be "chained" off
-         * this one via the trans_dup() call.  All transactions will contain
-         * the inode, and the inode will always be marked with trans_ihold().
-         * Since the inode will be locked in all transactions, we must log
-         * the inode in every transaction to let it float upward through
-         * the log.
-         */
-        args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM);
-        /*
         * Root fork attributes can use reserved data blocks for this
         * operation if necessary
         */
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrrm,
-        if (flags & ATTR_ROOT)
+                        XFS_ATTRRM_SPACE_RES(mp), 0,
-                args.trans->t_flags |= XFS_TRANS_RESERVE;
+                        (flags & ATTR_ROOT) ? XFS_TRANS_RESERVE : 0,
+                        &args.trans);
-        error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm,
+        if (error)
-                                  XFS_ATTRRM_SPACE_RES(mp), 0);
-        if (error) {
-                xfs_trans_cancel(args.trans);
                return error;
-        }
        xfs_ilock(dp, XFS_ILOCK_EXCL);
        /*
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index ce41d7fe753c..932381caef1b 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -1121,15 +1121,14 @@ xfs_bmap_add_attrfork(
        mp = ip->i_mount;
        ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
-        tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
        blks = XFS_ADDAFORK_SPACE_RES(mp);
-        if (rsvd)
-                tp->t_flags |= XFS_TRANS_RESERVE;
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_addafork, blks, 0,
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
+                        rsvd ? XFS_TRANS_RESERVE : 0, &tp);
-        if (error) {
+        if (error)
-                xfs_trans_cancel(tp);
                return error;
-        }
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
                        XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
@@ -6026,13 +6025,10 @@ xfs_bmap_split_extent(
        xfs_fsblock_t           firstfsb;
        int                     error;
-        tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+                        XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
-                        XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
+        if (error)
-        if (error) {
-                xfs_trans_cancel(tp);
                return error;
-        }
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 974d62e677f4..e5bb9cc3b243 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -257,15 +257,12 @@ xfs_dir2_block_to_sf(
         *
         * Convert the inode to local format and copy the data in.
         */
-        dp->i_df.if_flags &= ~XFS_IFEXTENTS;
-        dp->i_df.if_flags |= XFS_IFINLINE;
-        dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
        ASSERT(dp->i_df.if_bytes == 0);
-        xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+        xfs_init_local_fork(dp, XFS_DATA_FORK, dst, size);
+        dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+        dp->i_d.di_size = size;
        logflags |= XFS_ILOG_DDATA;
-        memcpy(dp->i_df.if_u1.if_data, dst, size);
-        dp->i_d.di_size = size;
        xfs_dir2_sf_check(args);
 out:
        xfs_trans_log_inode(args->trans, dp, logflags);
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 11faf7df14c8..bbcc8c7a44b3 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -231,6 +231,48 @@ xfs_iformat_fork(
        return error;
 }
+void
+xfs_init_local_fork(
+        struct xfs_inode        *ip,
+        int                     whichfork,
+        const void              *data,
+        int                     size)
+{
+        struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
+        int                     mem_size = size, real_size = 0;
+        bool                    zero_terminate;
+        /*
+         * If we are using the local fork to store a symlink body we need to
+         * zero-terminate it so that we can pass it back to the VFS directly.
+         * Overallocate the in-memory fork by one for that and add a zero
+         * to terminate it below.
+         */
+        zero_terminate = S_ISLNK(VFS_I(ip)->i_mode);
+        if (zero_terminate)
+                mem_size++;
+        if (size == 0)
+                ifp->if_u1.if_data = NULL;
+        else if (mem_size <= sizeof(ifp->if_u2.if_inline_data))
+                ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+        else {
+                real_size = roundup(mem_size, 4);
+                ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
+        }
+        if (size) {
+                memcpy(ifp->if_u1.if_data, data, size);
+                if (zero_terminate)
+                        ifp->if_u1.if_data[size] = '\0';
+        }
+        ifp->if_bytes = size;
+        ifp->if_real_bytes = real_size;
+        ifp->if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
+        ifp->if_flags |= XFS_IFINLINE;
+}
 /*
 * The file is in-lined in the on-disk inode.
 * If it fits into if_inline_data, then copy
@@ -248,8 +290,6 @@ xfs_iformat_local(
        int             whichfork,
        int             size)
 {
-        xfs_ifork_t     *ifp;
-        int             real_size;
        /*
         * If the size is unreasonable, then something
@@ -265,22 +305,8 @@ xfs_iformat_local(
                                     ip->i_mount, dip);
                return -EFSCORRUPTED;
        }
-        ifp = XFS_IFORK_PTR(ip, whichfork);
-        real_size = 0;
+        xfs_init_local_fork(ip, whichfork, XFS_DFORK_PTR(dip, whichfork), size);
-        if (size == 0)
-                ifp->if_u1.if_data = NULL;
-        else if (size <= sizeof(ifp->if_u2.if_inline_data))
-                ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
-        else {
-                real_size = roundup(size, 4);
-                ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
-        }
-        ifp->if_bytes = size;
-        ifp->if_real_bytes = real_size;
-        if (size)
-                memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
-        ifp->if_flags &= ~XFS_IFEXTENTS;
-        ifp->if_flags |= XFS_IFINLINE;
        return 0;
 }
@@ -516,7 +542,6 @@ xfs_iroot_realloc(
                new_max = cur_max + rec_diff;
                new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
                ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
-                                XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max),
                                KM_SLEEP | KM_NOFS);
                op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
                                                     ifp->if_broot_bytes);
@@ -660,7 +685,6 @@ xfs_idata_realloc(
                                ifp->if_u1.if_data =
                                        kmem_realloc(ifp->if_u1.if_data,
                                                        real_size,
-                                                        ifp->if_real_bytes,
                                                        KM_SLEEP | KM_NOFS);
                        }
                } else {
@@ -1376,8 +1400,7 @@ xfs_iext_realloc_direct(
                if (rnew_size != ifp->if_real_bytes) {
                        ifp->if_u1.if_extents =
                                kmem_realloc(ifp->if_u1.if_extents,
-                                                rnew_size,
+                                                rnew_size, KM_NOFS);
-                                                ifp->if_real_bytes, KM_NOFS);
                }
                if (rnew_size > ifp->if_real_bytes) {
                        memset(&ifp->if_u1.if_extents[ifp->if_bytes /
@@ -1461,9 +1484,8 @@ xfs_iext_realloc_indirect(
        if (new_size == 0) {
                xfs_iext_destroy(ifp);
        } else {
-                ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
+                ifp->if_u1.if_ext_irec =
-                        kmem_realloc(ifp->if_u1.if_ext_irec,
+                        kmem_realloc(ifp->if_u1.if_ext_irec, new_size, KM_NOFS);
-                                new_size, size, KM_NOFS);
        }
 }
@@ -1497,6 +1519,24 @@ xfs_iext_indirect_to_direct(
 }
 /*
+ * Remove all records from the indirection array.
+ */
+STATIC void
+xfs_iext_irec_remove_all(
+        struct xfs_ifork *ifp)
+{
+        int             nlists;
+        int             i;
+        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+        for (i = 0; i < nlists; i++)
+                kmem_free(ifp->if_u1.if_ext_irec[i].er_extbuf);
+        kmem_free(ifp->if_u1.if_ext_irec);
+        ifp->if_flags &= ~XFS_IFEXTIREC;
+}
+/*
 * Free incore file extents.
 */
 void
@@ -1504,14 +1544,7 @@ xfs_iext_destroy(
        xfs_ifork_t     *ifp)           /* inode fork pointer */
 {
        if (ifp->if_flags & XFS_IFEXTIREC) {
-                int     erp_idx;
+                xfs_iext_irec_remove_all(ifp);
-                int     nlists;
-                nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-                for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
-                        xfs_iext_irec_remove(ifp, erp_idx);
-                }
-                ifp->if_flags &= ~XFS_IFEXTIREC;
        } else if (ifp->if_real_bytes) {
                kmem_free(ifp->if_u1.if_extents);
        } else if (ifp->if_bytes) {
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 7d3b1ed6dcbe..f95e072ae646 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -134,6 +134,7 @@ void		xfs_iroot_realloc(struct xfs_inode *, int, int);
 int             xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
 int             xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,
                                  int);
+void            xfs_init_local_fork(struct xfs_inode *, int, const void *, int);
 struct xfs_bmbt_rec_host *
                xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t);
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index d54a8018b079..e8f49c029ff0 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -212,6 +212,11 @@ typedef struct xfs_trans_header {
 #define XFS_TRANS_HEADER_MAGIC  0x5452414e      /* TRAN */
 /*
+ * The only type valid for th_type in CIL-enabled file system logs:
+ */
+#define XFS_TRANS_CHECKPOINT    40
+/*
 * Log item types.
 */
 #define XFS_LI_EFI              0x1236
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 8a53eaa349f4..12ca86778e02 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -838,12 +838,10 @@ xfs_sync_sb(
        struct xfs_trans        *tp;
        int                     error;
-        tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP);
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0,
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
+                        XFS_TRANS_NO_WRITECOUNT, &tp);
-        if (error) {
+        if (error)
-                xfs_trans_cancel(tp);
                return error;
-        }
        xfs_log_sb(tp);
        if (wait)
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 81ac870834da..16002b5ec4eb 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -56,103 +56,6 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
 extern const struct xfs_buf_ops xfs_rtbuf_ops;
 /*
- * Transaction types.  Used to distinguish types of buffers. These never reach
- * the log.
- */
-#define XFS_TRANS_SETATTR_NOT_SIZE      1
-#define XFS_TRANS_SETATTR_SIZE          2
-#define XFS_TRANS_INACTIVE              3
-#define XFS_TRANS_CREATE                4
-#define XFS_TRANS_CREATE_TRUNC          5
-#define XFS_TRANS_TRUNCATE_FILE         6
-#define XFS_TRANS_REMOVE                7
-#define XFS_TRANS_LINK                  8
-#define XFS_TRANS_RENAME                9
-#define XFS_TRANS_MKDIR                 10
-#define XFS_TRANS_RMDIR                 11
-#define XFS_TRANS_SYMLINK               12
-#define XFS_TRANS_SET_DMATTRS           13
-#define XFS_TRANS_GROWFS                14
-#define XFS_TRANS_STRAT_WRITE           15
-#define XFS_TRANS_DIOSTRAT              16
-/* 17 was XFS_TRANS_WRITE_SYNC */
-#define XFS_TRANS_WRITEID               18
-#define XFS_TRANS_ADDAFORK              19
-#define XFS_TRANS_ATTRINVAL             20
-#define XFS_TRANS_ATRUNCATE             21
-#define XFS_TRANS_ATTR_SET              22
-#define XFS_TRANS_ATTR_RM               23
-#define XFS_TRANS_ATTR_FLAG             24
-#define XFS_TRANS_CLEAR_AGI_BUCKET      25
-#define XFS_TRANS_SB_CHANGE             26
-/*
- * Dummy entries since we use the transaction type to index into the
- * trans_type[] in xlog_recover_print_trans_head()
- */
-#define XFS_TRANS_DUMMY1                27
-#define XFS_TRANS_DUMMY2                28
-#define XFS_TRANS_QM_QUOTAOFF           29
-#define XFS_TRANS_QM_DQALLOC            30
-#define XFS_TRANS_QM_SETQLIM            31
-#define XFS_TRANS_QM_DQCLUSTER          32
-#define XFS_TRANS_QM_QINOCREATE         33
-#define XFS_TRANS_QM_QUOTAOFF_END       34
-#define XFS_TRANS_FSYNC_TS              35
-#define XFS_TRANS_GROWFSRT_ALLOC        36
-#define XFS_TRANS_GROWFSRT_ZERO         37
-#define XFS_TRANS_GROWFSRT_FREE         38
-#define XFS_TRANS_SWAPEXT               39
-#define XFS_TRANS_CHECKPOINT            40
-#define XFS_TRANS_ICREATE               41
-#define XFS_TRANS_CREATE_TMPFILE        42
-#define XFS_TRANS_TYPE_MAX              43
-/* new transaction types need to be reflected in xfs_logprint(8) */
-#define XFS_TRANS_TYPES \
-        { XFS_TRANS_SETATTR_NOT_SIZE,   "SETATTR_NOT_SIZE" }, \
-        { XFS_TRANS_SETATTR_SIZE,       "SETATTR_SIZE" }, \
-        { XFS_TRANS_INACTIVE,           "INACTIVE" }, \
-        { XFS_TRANS_CREATE,             "CREATE" }, \
-        { XFS_TRANS_CREATE_TRUNC,       "CREATE_TRUNC" }, \
-        { XFS_TRANS_TRUNCATE_FILE,      "TRUNCATE_FILE" }, \
-        { XFS_TRANS_REMOVE,             "REMOVE" }, \
-        { XFS_TRANS_LINK,               "LINK" }, \
-        { XFS_TRANS_RENAME,             "RENAME" }, \
-        { XFS_TRANS_MKDIR,              "MKDIR" }, \
-        { XFS_TRANS_RMDIR,              "RMDIR" }, \
-        { XFS_TRANS_SYMLINK,            "SYMLINK" }, \
-        { XFS_TRANS_SET_DMATTRS,        "SET_DMATTRS" }, \
-        { XFS_TRANS_GROWFS,             "GROWFS" }, \
-        { XFS_TRANS_STRAT_WRITE,        "STRAT_WRITE" }, \
-        { XFS_TRANS_DIOSTRAT,           "DIOSTRAT" }, \
-        { XFS_TRANS_WRITEID,            "WRITEID" }, \
-        { XFS_TRANS_ADDAFORK,           "ADDAFORK" }, \
-        { XFS_TRANS_ATTRINVAL,          "ATTRINVAL" }, \
-        { XFS_TRANS_ATRUNCATE,          "ATRUNCATE" }, \
-        { XFS_TRANS_ATTR_SET,           "ATTR_SET" }, \
-        { XFS_TRANS_ATTR_RM,            "ATTR_RM" }, \
-        { XFS_TRANS_ATTR_FLAG,          "ATTR_FLAG" }, \
-        { XFS_TRANS_CLEAR_AGI_BUCKET,   "CLEAR_AGI_BUCKET" }, \
-        { XFS_TRANS_SB_CHANGE,          "SBCHANGE" }, \
-        { XFS_TRANS_DUMMY1,             "DUMMY1" }, \
-        { XFS_TRANS_DUMMY2,             "DUMMY2" }, \
-        { XFS_TRANS_QM_QUOTAOFF,        "QM_QUOTAOFF" }, \
-        { XFS_TRANS_QM_DQALLOC,         "QM_DQALLOC" }, \
-        { XFS_TRANS_QM_SETQLIM,         "QM_SETQLIM" }, \
-        { XFS_TRANS_QM_DQCLUSTER,       "QM_DQCLUSTER" }, \
-        { XFS_TRANS_QM_QINOCREATE,      "QM_QINOCREATE" }, \
-        { XFS_TRANS_QM_QUOTAOFF_END,    "QM_QOFF_END" }, \
-        { XFS_TRANS_FSYNC_TS,           "FSYNC_TS" }, \
-        { XFS_TRANS_GROWFSRT_ALLOC,     "GROWFSRT_ALLOC" }, \
-        { XFS_TRANS_GROWFSRT_ZERO,      "GROWFSRT_ZERO" }, \
-        { XFS_TRANS_GROWFSRT_FREE,      "GROWFSRT_FREE" }, \
-        { XFS_TRANS_SWAPEXT,            "SWAPEXT" }, \
-        { XFS_TRANS_CHECKPOINT,         "CHECKPOINT" }, \
-        { XFS_TRANS_ICREATE,            "ICREATE" }, \
-        { XFS_TRANS_CREATE_TMPFILE,     "CREATE_TMPFILE" }, \
-        { XLOG_UNMOUNT_REC_TYPE,        "UNMOUNT" }
-/*
 * This structure is used to track log items associated with
 * a transaction.  It points to the log item and keeps some
 * flags to track the state of the log item.  It also tracks
@@ -181,8 +84,9 @@ int	xfs_log_calc_minimum_size(struct xfs_mount *);
 #define XFS_TRANS_SYNC          0x08    /* make commit synchronous */
 #define XFS_TRANS_DQ_DIRTY      0x10    /* at least one dquot in trx dirty */
 #define XFS_TRANS_RESERVE       0x20    /* OK to use reserved data blocks */
-#define XFS_TRANS_FREEZE_PROT   0x40    /* Transaction has elevated writer
+#define XFS_TRANS_NO_WRITECOUNT 0x40    /* do not elevate SB writecount */
-                                           count in superblock */
+#define XFS_TRANS_NOFS          0x80    /* pass KM_NOFS to kmem_alloc */
 /*
 * Field values for xfs_trans_mod_sb.
 */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index c535887c60a8..4c463b99fe57 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -84,23 +84,71 @@ xfs_find_bdev_for_inode(
 }
 /*
- * We're now finished for good with this ioend structure.
+ * We're now finished for good with this page.  Update the page state via the
- * Update the page state via the associated buffer_heads,
+ * associated buffer_heads, paying attention to the start and end offsets that
- * release holds on the inode and bio, and finally free
+ * we need to process on the page.
- * up memory.  Do not use the ioend after this.
+ */
+static void
+xfs_finish_page_writeback(
+        struct inode            *inode,
+        struct bio_vec          *bvec,
+        int                     error)
+{
+        unsigned int            end = bvec->bv_offset + bvec->bv_len - 1;
+        struct buffer_head      *head, *bh;
+        unsigned int            off = 0;
+        ASSERT(bvec->bv_offset < PAGE_SIZE);
+        ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
+        ASSERT(end < PAGE_SIZE);
+        ASSERT((bvec->bv_len & ((1 << inode->i_blkbits) - 1)) == 0);
+        bh = head = page_buffers(bvec->bv_page);
+        do {
+                if (off < bvec->bv_offset)
+                        goto next_bh;
+                if (off > end)
+                        break;
+                bh->b_end_io(bh, !error);
+next_bh:
+                off += bh->b_size;
+        } while ((bh = bh->b_this_page) != head);
+}
+/*
+ * We're now finished for good with this ioend structure.  Update the page
+ * state, release holds on bios, and finally free up memory.  Do not use the
+ * ioend after this.
 */
 STATIC void
 xfs_destroy_ioend(
-        xfs_ioend_t             *ioend)
+        struct xfs_ioend        *ioend,
+        int                     error)
 {
-        struct buffer_head      *bh, *next;
+        struct inode            *inode = ioend->io_inode;
+        struct bio              *last = ioend->io_bio;
+        struct bio              *bio, *next;
-        for (bh = ioend->io_buffer_head; bh; bh = next) {
+        for (bio = &ioend->io_inline_bio; bio; bio = next) {
-                next = bh->b_private;
+                struct bio_vec  *bvec;
-                bh->b_end_io(bh, !ioend->io_error);
+                int             i;
-        }
+                /*
+                 * For the last bio, bi_private points to the ioend, so we
+                 * need to explicitly end the iteration here.
+                 */
+                if (bio == last)
+                        next = NULL;
+                else
+                        next = bio->bi_private;
-        mempool_free(ioend, xfs_ioend_pool);
+                /* walk each page on bio, ending page IO on them */
+                bio_for_each_segment_all(bvec, bio, i)
+                        xfs_finish_page_writeback(inode, bvec, error);
+                bio_put(bio);
+        }
 }
 /*
@@ -120,13 +168,9 @@ xfs_setfilesize_trans_alloc(
        struct xfs_trans        *tp;
        int                     error;
-        tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
+        if (error)
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
-        if (error) {
-                xfs_trans_cancel(tp);
                return error;
-        }
        ioend->io_append_trans = tp;
@@ -174,7 +218,8 @@ xfs_setfilesize(
 STATIC int
 xfs_setfilesize_ioend(
-        struct xfs_ioend        *ioend)
+        struct xfs_ioend        *ioend,
+        int                     error)
 {
        struct xfs_inode        *ip = XFS_I(ioend->io_inode);
        struct xfs_trans        *tp = ioend->io_append_trans;
@@ -188,53 +233,32 @@ xfs_setfilesize_ioend(
        __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
        /* we abort the update if there was an IO error */
-        if (ioend->io_error) {
+        if (error) {
                xfs_trans_cancel(tp);
-                return ioend->io_error;
+                return error;
        }
        return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
 }
 /*
- * Schedule IO completion handling on the final put of an ioend.
- *
- * If there is no work to do we might as well call it a day and free the
- * ioend right now.
- */
-STATIC void
-xfs_finish_ioend(
-        struct xfs_ioend        *ioend)
-{
-        if (atomic_dec_and_test(&ioend->io_remaining)) {
-                struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
-                if (ioend->io_type == XFS_IO_UNWRITTEN)
-                        queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
-                else if (ioend->io_append_trans)
-                        queue_work(mp->m_data_workqueue, &ioend->io_work);
-                else
-                        xfs_destroy_ioend(ioend);
-        }
-}
-/*
 * IO write completion.
 */
 STATIC void
 xfs_end_io(
        struct work_struct *work)
 {
-        xfs_ioend_t     *ioend = container_of(work, xfs_ioend_t, io_work);
+        struct xfs_ioend        *ioend =
-        struct xfs_inode *ip = XFS_I(ioend->io_inode);
+                container_of(work, struct xfs_ioend, io_work);
-        int             error = 0;
+        struct xfs_inode        *ip = XFS_I(ioend->io_inode);
+        int                     error = ioend->io_bio->bi_error;
        /*
         * Set an error if the mount has shut down and proceed with end I/O
         * processing so it can perform whatever cleanups are necessary.
         */
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-                ioend->io_error = -EIO;
+                error = -EIO;
        /*
         * For unwritten extents we need to issue transactions to convert a
@@ -244,55 +268,33 @@ xfs_end_io(
         * on error.
         */
        if (ioend->io_type == XFS_IO_UNWRITTEN) {
-                if (ioend->io_error)
+                if (error)
                        goto done;
                error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
                                                  ioend->io_size);
        } else if (ioend->io_append_trans) {
-                error = xfs_setfilesize_ioend(ioend);
+                error = xfs_setfilesize_ioend(ioend, error);
        } else {
                ASSERT(!xfs_ioend_is_append(ioend));
        }
 done:
-        if (error)
+        xfs_destroy_ioend(ioend, error);
-                ioend->io_error = error;
-        xfs_destroy_ioend(ioend);
 }
-/*
+STATIC void
- * Allocate and initialise an IO completion structure.
+xfs_end_bio(
- * We need to track unwritten extent write completion here initially.
+        struct bio              *bio)
- * We'll need to extend this for updating the ondisk inode size later
- * (vs. incore size).
- */
-STATIC xfs_ioend_t *
-xfs_alloc_ioend(
-        struct inode            *inode,
-        unsigned int            type)
 {
-        xfs_ioend_t             *ioend;
+        struct xfs_ioend        *ioend = bio->bi_private;
+        struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
-        ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
-        /*
-         * Set the count to 1 initially, which will prevent an I/O
-         * completion callback from happening before we have started
-         * all the I/O from calling the completion routine too early.
-         */
-        atomic_set(&ioend->io_remaining, 1);
-        ioend->io_error = 0;
-        INIT_LIST_HEAD(&ioend->io_list);
-        ioend->io_type = type;
-        ioend->io_inode = inode;
-        ioend->io_buffer_head = NULL;
-        ioend->io_buffer_tail = NULL;
-        ioend->io_offset = 0;
-        ioend->io_size = 0;
-        ioend->io_append_trans = NULL;
-        INIT_WORK(&ioend->io_work, xfs_end_io);
+        if (ioend->io_type == XFS_IO_UNWRITTEN)
-        return ioend;
+                queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
+        else if (ioend->io_append_trans)
+                queue_work(mp->m_data_workqueue, &ioend->io_work);
+        else
+                xfs_destroy_ioend(ioend, bio->bi_error);
 }
 STATIC int
@@ -364,50 +366,6 @@ xfs_imap_valid(
                offset < imap->br_startoff + imap->br_blockcount;
 }
-/*
- * BIO completion handler for buffered IO.
- */
-STATIC void
-xfs_end_bio(
-        struct bio              *bio)
-{
-        xfs_ioend_t             *ioend = bio->bi_private;
-        if (!ioend->io_error)
-                ioend->io_error = bio->bi_error;
-        /* Toss bio and pass work off to an xfsdatad thread */
-        bio->bi_private = NULL;
-        bio->bi_end_io = NULL;
-        bio_put(bio);
-        xfs_finish_ioend(ioend);
-}
-STATIC void
-xfs_submit_ioend_bio(
-        struct writeback_control *wbc,
-        xfs_ioend_t             *ioend,
-        struct bio              *bio)
-{
-        atomic_inc(&ioend->io_remaining);
-        bio->bi_private = ioend;
-        bio->bi_end_io = xfs_end_bio;
-        submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
-}
-STATIC struct bio *
-xfs_alloc_ioend_bio(
-        struct buffer_head      *bh)
-{
-        struct bio              *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
-        ASSERT(bio->bi_private == NULL);
-        bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
-        bio->bi_bdev = bh->b_bdev;
-        return bio;
-}
 STATIC void
 xfs_start_buffer_writeback(
        struct buffer_head      *bh)
@@ -452,28 +410,35 @@ static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
 }
 /*
- * Submit all of the bios for an ioend. We are only passed a single ioend at a
+ * Submit the bio for an ioend. We are passed an ioend with a bio attached to
- * time; the caller is responsible for chaining prior to submission.
+ * it, and we submit that bio. The ioend may be used for multiple bio
+ * submissions, so we only want to allocate an append transaction for the ioend
+ * once. In the case of multiple bio submission, each bio will take an IO
+ * reference to the ioend to ensure that the ioend completion is only done once
+ * all bios have been submitted and the ioend is really done.
 *
 * If @fail is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we have marked paged for writeback
- * and unlocked them. In this situation, we need to fail the ioend chain rather
+ * and unlocked them. In this situation, we need to fail the bio and ioend
- * than submit it to IO. This typically only happens on a filesystem shutdown.
+ * rather than submit it to IO. This typically only happens on a filesystem
+ * shutdown.
 */
 STATIC int
 xfs_submit_ioend(
        struct writeback_control *wbc,
-        xfs_ioend_t             *ioend,
+        struct xfs_ioend        *ioend,
        int                     status)
 {
-        struct buffer_head      *bh;
-        struct bio              *bio;
-        sector_t                lastblock = 0;
        /* Reserve log space if we might write beyond the on-disk inode size. */
        if (!status &&
-             ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
+            ioend->io_type != XFS_IO_UNWRITTEN &&
+            xfs_ioend_is_append(ioend) &&
+            !ioend->io_append_trans)
                status = xfs_setfilesize_trans_alloc(ioend);
+        ioend->io_bio->bi_private = ioend;
+        ioend->io_bio->bi_end_io = xfs_end_bio;
        /*
         * If we are failing the IO now, just mark the ioend with an
         * error and finish it. This will run IO completion immediately
@@ -481,33 +446,73 @@ xfs_submit_ioend(
         * time.
         */
        if (status) {
-                ioend->io_error = status;
+                ioend->io_bio->bi_error = status;
-                xfs_finish_ioend(ioend);
+                bio_endio(ioend->io_bio);
                return status;
        }
-        bio = NULL;
+        submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE,
-        for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
+                   ioend->io_bio);
+        return 0;
+}
-                if (!bio) {
+static void
-retry:
+xfs_init_bio_from_bh(
-                        bio = xfs_alloc_ioend_bio(bh);
+        struct bio              *bio,
-                } else if (bh->b_blocknr != lastblock + 1) {
+        struct buffer_head      *bh)
-                        xfs_submit_ioend_bio(wbc, ioend, bio);
+{
-                        goto retry;
+        bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
-                }
+        bio->bi_bdev = bh->b_bdev;
+}
-                if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
+static struct xfs_ioend *
-                        xfs_submit_ioend_bio(wbc, ioend, bio);
+xfs_alloc_ioend(
-                        goto retry;
+        struct inode            *inode,
-                }
+        unsigned int            type,
+        xfs_off_t               offset,
+        struct buffer_head      *bh)
+{
+        struct xfs_ioend        *ioend;
+        struct bio              *bio;
-                lastblock = bh->b_blocknr;
+        bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset);
-        }
+        xfs_init_bio_from_bh(bio, bh);
-        if (bio)
-                xfs_submit_ioend_bio(wbc, ioend, bio);
+        ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
-        xfs_finish_ioend(ioend);
+        INIT_LIST_HEAD(&ioend->io_list);
-        return 0;
+        ioend->io_type = type;
+        ioend->io_inode = inode;
+        ioend->io_size = 0;
+        ioend->io_offset = offset;
+        INIT_WORK(&ioend->io_work, xfs_end_io);
+        ioend->io_append_trans = NULL;
+        ioend->io_bio = bio;
+        return ioend;
+}
+/*
+ * Allocate a new bio, and chain the old bio to the new one.
+ *
+ * Note that we have to perform the chaining in this unintuitive order
+ * so that the bi_private linkage is set up in the right direction for the
+ * traversal in xfs_destroy_ioend().
+ */
+static void
+xfs_chain_bio(
+        struct xfs_ioend        *ioend,
+        struct writeback_control *wbc,
+        struct buffer_head      *bh)
+{
+        struct bio *new;
+        new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
+        xfs_init_bio_from_bh(new, bh);
+        bio_chain(ioend->io_bio, new);
+        bio_get(ioend->io_bio);         /* for xfs_destroy_ioend */
+        submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE,
+                   ioend->io_bio);
+        ioend->io_bio = new;
 }
 /*
@@ -523,27 +528,24 @@ xfs_add_to_ioend(
        struct buffer_head      *bh,
        xfs_off_t               offset,
        struct xfs_writepage_ctx *wpc,
+        struct writeback_control *wbc,
        struct list_head        *iolist)
 {
        if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
            bh->b_blocknr != wpc->last_block + 1 ||
            offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
-                struct xfs_ioend        *new;
                if (wpc->ioend)
                        list_add(&wpc->ioend->io_list, iolist);
+                wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, bh);
-                new = xfs_alloc_ioend(inode, wpc->io_type);
-                new->io_offset = offset;
-                new->io_buffer_head = bh;
-                new->io_buffer_tail = bh;
-                wpc->ioend = new;
-        } else {
-                wpc->ioend->io_buffer_tail->b_private = bh;
-                wpc->ioend->io_buffer_tail = bh;
        }
-        bh->b_private = NULL;
+        /*
+         * If the buffer doesn't fit into the bio we need to allocate a new
+         * one.  This shouldn't happen more than once for a given buffer.
+         */
+        while (xfs_bio_add_buffer(wpc->ioend->io_bio, bh) != bh->b_size)
+                xfs_chain_bio(wpc->ioend, wbc, bh);
        wpc->ioend->io_size += bh->b_size;
        wpc->last_block = bh->b_blocknr;
        xfs_start_buffer_writeback(bh);
@@ -803,7 +805,7 @@ xfs_writepage_map(
                        lock_buffer(bh);
                        if (wpc->io_type != XFS_IO_OVERWRITE)
                                xfs_map_at_offset(inode, bh, &wpc->imap, offset);
-                        xfs_add_to_ioend(inode, bh, offset, wpc, &submit_list);
+                        xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list);
                        count++;
                }
@@ -1391,13 +1393,10 @@ xfs_end_io_direct_write(
                trace_xfs_end_io_direct_write_append(ip, offset, size);
-                tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0,
-                error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
+                                &tp);
-                if (error) {
+                if (!error)
-                        xfs_trans_cancel(tp);
+                        error = xfs_setfilesize(ip, tp, offset, size);
-                        return error;
-                }
-                error = xfs_setfilesize(ip, tp, offset, size);
        }
        return error;
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index b4421177b68d..814aab790713 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -18,7 +18,7 @@
 #ifndef __XFS_AOPS_H__
 #define __XFS_AOPS_H__
-extern mempool_t *xfs_ioend_pool;
+extern struct bio_set *xfs_ioend_bioset;
 /*
 * Types of I/O for bmap clustering and I/O completion tracking.
@@ -37,22 +37,19 @@ enum {
        { XFS_IO_OVERWRITE,             "overwrite" }
 /*
- * xfs_ioend struct manages large extent writes for XFS.
+ * Structure for buffered I/O completions.
- * It can manage several multi-page bio's at once.
 */
-typedef struct xfs_ioend {
+struct xfs_ioend {
        struct list_head        io_list;        /* next ioend in chain */
        unsigned int            io_type;        /* delalloc / unwritten */
-        int                     io_error;       /* I/O error code */
-        atomic_t                io_remaining;   /* hold count */
        struct inode            *io_inode;      /* file being written to */
-        struct buffer_head      *io_buffer_head;/* buffer linked list head */
-        struct buffer_head      *io_buffer_tail;/* buffer linked list tail */
        size_t                  io_size;        /* size of the extent */
        xfs_off_t               io_offset;      /* offset in the file */
        struct work_struct      io_work;        /* xfsdatad work queue */
        struct xfs_trans        *io_append_trans;/* xact. for size update */
-} xfs_ioend_t;
+        struct bio              *io_bio;        /* bio being built */
+        struct bio              io_inline_bio;  /* MUST BE LAST! */
+};
 extern const struct address_space_operations xfs_address_space_operations;
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index dd4824589470..e3da5d448bcf 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -112,8 +112,9 @@ typedef struct attrlist_cursor_kern {
 *========================================================================*/
+/* Return 0 on success, or -errno; other state communicated via *context */
 typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int,
-                              unsigned char *, int, int, unsigned char *);
+                              unsigned char *, int, int);
 typedef struct xfs_attr_list_context {
        struct xfs_inode                *dp;            /* inode */
@@ -126,7 +127,6 @@ typedef struct xfs_attr_list_context {
        int                             firstu;         /* first used byte in buffer */
        int                             flags;          /* from VOP call */
        int                             resynch;        /* T/F: resynch with cursor */
-        int                             put_value;      /* T/F: need value for listent */
        put_listent_func_t              put_listent;    /* list output fmt function */
        int                             index;          /* index into output buffer */
 } xfs_attr_list_context_t;
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 2bb959ada45b..55d214981ed2 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -405,21 +405,11 @@ xfs_attr_inactive(
                goto out_destroy_fork;
        xfs_iunlock(dp, lock_mode);
-        /*
-         * Start our first transaction of the day.
-         *
-         * All future transactions during this code must be "chained" off
-         * this one via the trans_dup() call.  All transactions will contain
-         * the inode, and the inode will always be marked with trans_ihold().
-         * Since the inode will be locked in all transactions, we must log
-         * the inode in every transaction to let it float upward through
-         * the log.
-         */
        lock_mode = 0;
-        trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL);
-        error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0);
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrinval, 0, 0, 0, &trans);
        if (error)
-                goto out_cancel;
+                goto out_destroy_fork;
        lock_mode = XFS_ILOCK_EXCL;
        xfs_ilock(dp, lock_mode);
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 4fa14820e2e2..d25f26b22ac9 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -106,18 +106,15 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
                                           sfe->flags,
                                           sfe->nameval,
                                           (int)sfe->namelen,
-                                           (int)sfe->valuelen,
+                                           (int)sfe->valuelen);
-                                           &sfe->nameval[sfe->namelen]);
+                        if (error)
+                                return error;
                        /*
                         * Either search callback finished early or
                         * didn't fit it all in the buffer after all.
                         */
                        if (context->seen_enough)
                                break;
-                        if (error)
-                                return error;
                        sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
                }
                trace_xfs_attr_list_sf_all(context);
@@ -200,8 +197,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
                                        sbp->flags,
                                        sbp->name,
                                        sbp->namelen,
-                                        sbp->valuelen,
+                                        sbp->valuelen);
-                                        &sbp->name[sbp->namelen]);
                if (error) {
                        kmem_free(sbuf);
                        return error;
@@ -416,6 +412,9 @@ xfs_attr3_leaf_list_int(
         */
        retval = 0;
        for (; i < ichdr.count; entry++, i++) {
+                char *name;
+                int namelen, valuelen;
                if (be32_to_cpu(entry->hashval) != cursor->hashval) {
                        cursor->hashval = be32_to_cpu(entry->hashval);
                        cursor->offset = 0;
@@ -425,56 +424,25 @@ xfs_attr3_leaf_list_int(
                        continue;               /* skip incomplete entries */
                if (entry->flags & XFS_ATTR_LOCAL) {
-                        xfs_attr_leaf_name_local_t *name_loc =
+                        xfs_attr_leaf_name_local_t *name_loc;
-                                xfs_attr3_leaf_name_local(leaf, i);
+                        name_loc = xfs_attr3_leaf_name_local(leaf, i);
-                        retval = context->put_listent(context,
+                        name = name_loc->nameval;
-                                                entry->flags,
+                        namelen = name_loc->namelen;
-                                                name_loc->nameval,
+                        valuelen = be16_to_cpu(name_loc->valuelen);
-                                                (int)name_loc->namelen,
-                                                be16_to_cpu(name_loc->valuelen),
-                                                &name_loc->nameval[name_loc->namelen]);
-                        if (retval)
-                                return retval;
                } else {
-                        xfs_attr_leaf_name_remote_t *name_rmt =
+                        xfs_attr_leaf_name_remote_t *name_rmt;
-                                xfs_attr3_leaf_name_remote(leaf, i);
+                        name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
-                        int valuelen = be32_to_cpu(name_rmt->valuelen);
+                        name = name_rmt->name;
+                        namelen = name_rmt->namelen;
-                        if (context->put_value) {
+                        valuelen = be32_to_cpu(name_rmt->valuelen);
-                                xfs_da_args_t args;
-                                memset((char *)&args, 0, sizeof(args));
-                                args.geo = context->dp->i_mount->m_attr_geo;
-                                args.dp = context->dp;
-                                args.whichfork = XFS_ATTR_FORK;
-                                args.valuelen = valuelen;
-                                args.rmtvaluelen = valuelen;
-                                args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
-                                args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
-                                args.rmtblkcnt = xfs_attr3_rmt_blocks(
-                                                        args.dp->i_mount, valuelen);
-                                retval = xfs_attr_rmtval_get(&args);
-                                if (!retval)
-                                        retval = context->put_listent(context,
-                                                        entry->flags,
-                                                        name_rmt->name,
-                                                        (int)name_rmt->namelen,
-                                                        valuelen,
-                                                        args.value);
-                                kmem_free(args.value);
-                        } else {
-                                retval = context->put_listent(context,
-                                                entry->flags,
-                                                name_rmt->name,
-                                                (int)name_rmt->namelen,
-                                                valuelen,
-                                                NULL);
-                        }
-                        if (retval)
-                                return retval;
                }
+                retval = context->put_listent(context, entry->flags,
+                                              name, namelen, valuelen);
+                if (retval)
+                        break;
                if (context->seen_enough)
                        break;
                cursor->offset++;
@@ -551,8 +519,7 @@ xfs_attr_put_listent(
        int             flags,
        unsigned char   *name,
        int             namelen,
-        int             valuelen,
+        int             valuelen)
-        unsigned char   *value)
 {
        struct attrlist *alist = (struct attrlist *)context->alist;
        attrlist_ent_t *aep;
@@ -581,7 +548,7 @@ xfs_attr_put_listent(
                trace_xfs_attr_list_full(context);
                alist->al_more = 1;
                context->seen_enough = 1;
-                return 1;
+                return 0;
        }
        aep = (attrlist_ent_t *)&context->alist[context->firstu];
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 3b6309865c65..613ea2d7ac19 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -900,19 +900,15 @@ xfs_free_eofblocks(
                 * Free them up now by truncating the file to
                 * its current size.
                 */
-                tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
                if (need_iolock) {
-                        if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+                        if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
-                                xfs_trans_cancel(tp);
                                return -EAGAIN;
-                        }
                }
-                error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0,
+                                &tp);
                if (error) {
                        ASSERT(XFS_FORCED_SHUTDOWN(mp));
-                        xfs_trans_cancel(tp);
                        if (need_iolock)
                                xfs_iunlock(ip, XFS_IOLOCK_EXCL);
                        return error;
@@ -1037,9 +1033,9 @@ xfs_alloc_file_space(
                /*
                 * Allocate and setup the transaction.
                 */
-                tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
-                error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+                                resrtextents, 0, &tp);
-                                          resblks, resrtextents);
                /*
                 * Check for running out of space
                 */
@@ -1048,7 +1044,6 @@ xfs_alloc_file_space(
                         * Free the transaction structure.
                         */
                        ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
-                        xfs_trans_cancel(tp);
                        break;
                }
                xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1311,18 +1306,10 @@ xfs_free_file_space(
                 * transaction to dip into the reserve blocks to ensure
                 * the freeing of the space succeeds at ENOSPC.
                 */
-                tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0,
-                error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);
+                                &tp);
-                /*
-                 * check for running out of space
-                 */
                if (error) {
-                        /*
-                         * Free the transaction structure.
-                         */
                        ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
-                        xfs_trans_cancel(tp);
                        break;
                }
                xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1482,19 +1469,16 @@ xfs_shift_file_space(
        }
        while (!error && !done) {
-                tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
                /*
                 * We would need to reserve permanent block for transaction.
                 * This will come into picture when after shifting extent into
                 * hole we found that adjacent extents can be merged which
                 * may lead to freeing of a block during record update.
                 */
-                error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
-                                XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
+                                XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
-                if (error) {
+                if (error)
-                        xfs_trans_cancel(tp);
                        break;
-                }
                xfs_ilock(ip, XFS_ILOCK_EXCL);
                error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
@@ -1747,12 +1731,9 @@ xfs_swap_extents(
        if (error)
                goto out_unlock;
-        tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+        if (error)
-        if (error) {
-                xfs_trans_cancel(tp);
                goto out_unlock;
-        }
        /*
         * Lock and join the inodes to the tansaction so that transaction commit
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 9a2191b91137..e71cfbd5acb3 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1100,22 +1100,18 @@ xfs_bwrite(
        return error;
 }
-STATIC void
+static void
 xfs_buf_bio_end_io(
        struct bio              *bio)
 {
-        xfs_buf_t               *bp = (xfs_buf_t *)bio->bi_private;
+        struct xfs_buf          *bp = (struct xfs_buf *)bio->bi_private;
        /*
         * don't overwrite existing errors - otherwise we can lose errors on
         * buffers that require multiple bios to complete.
         */
-        if (bio->bi_error) {
+        if (bio->bi_error)
-                spin_lock(&bp->b_lock);
+                cmpxchg(&bp->b_io_error, 0, bio->bi_error);
-                if (!bp->b_io_error)
-                        bp->b_io_error = bio->bi_error;
-                spin_unlock(&bp->b_lock);
-        }
        if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
                invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 4eb89bd4ee73..8bfb974f0772 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -183,6 +183,26 @@ typedef struct xfs_buf {
        unsigned int            b_page_count;   /* size of page array */
        unsigned int            b_offset;       /* page offset in first page */
        int                     b_error;        /* error code on I/O */
+        /*
+         * async write failure retry count. Initialised to zero on the first
+         * failure, then when it exceeds the maximum configured without a
+         * success the write is considered to be failed permanently and the
+         * iodone handler will take appropriate action.
+         *
+         * For retry timeouts, we record the jiffies value of the first failure. This
+         * means that we can change the retry timeout for buffers already under
+         * I/O and thus avoid getting stuck in a retry loop with a long timeout.
+         *
+         * last_error is used to ensure that we are getting repeated errors, not
+         * different errors. e.g. a block device might change ENOSPC to EIO when
+         * a failure timeout occurs, so we want to re-initialise the error
+         * retry behaviour appropriately when that happens.
+         */
+        int                     b_retries;
+        unsigned long           b_first_retry_time; /* in jiffies */
+        int                     b_last_error;
        const struct xfs_buf_ops        *b_ops;
 #ifdef XFS_BUF_LOCK_TRACKING
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 99e91a0e554e..34257992934c 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -1042,35 +1042,22 @@ xfs_buf_do_callbacks(
        }
 }
-/*
+static bool
- * This is the iodone() function for buffers which have had callbacks
+xfs_buf_iodone_callback_error(
- * attached to them by xfs_buf_attach_iodone().  It should remove each
- * log item from the buffer's list and call the callback of each in turn.
- * When done, the buffer's fsprivate field is set to NULL and the buffer
- * is unlocked with a call to iodone().
- */
-void
-xfs_buf_iodone_callbacks(
        struct xfs_buf          *bp)
 {
        struct xfs_log_item     *lip = bp->b_fspriv;
        struct xfs_mount        *mp = lip->li_mountp;
        static ulong            lasttime;
        static xfs_buftarg_t    *lasttarg;
+        struct xfs_error_cfg    *cfg;
-        if (likely(!bp->b_error))
-                goto do_callbacks;
        /*
         * If we've already decided to shutdown the filesystem because of
         * I/O errors, there's no point in giving this a retry.
         */
-        if (XFS_FORCED_SHUTDOWN(mp)) {
+        if (XFS_FORCED_SHUTDOWN(mp))
-                xfs_buf_stale(bp);
+                goto out_stale;
-                bp->b_flags |= XBF_DONE;
-                trace_xfs_buf_item_iodone(bp, _RET_IP_);
-                goto do_callbacks;
-        }
        if (bp->b_target != lasttarg ||
            time_after(jiffies, (lasttime + 5*HZ))) {
@@ -1079,45 +1066,93 @@ xfs_buf_iodone_callbacks(
        }
        lasttarg = bp->b_target;
+        /* synchronous writes will have callers process the error */
+        if (!(bp->b_flags & XBF_ASYNC))
+                goto out_stale;
+        trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
+        ASSERT(bp->b_iodone != NULL);
        /*
         * If the write was asynchronous then no one will be looking for the
-         * error.  Clear the error state and write the buffer out again.
+         * error.  If this is the first failure of this type, clear the error
-         *
+         * state and write the buffer out again. This means we always retry an
-         * XXX: This helps against transient write errors, but we need to find
+         * async write failure at least once, but we also need to set the buffer
-         * a way to shut the filesystem down if the writes keep failing.
+         * up to behave correctly now for repeated failures.
-         *
-         * In practice we'll shut the filesystem down soon as non-transient
-         * errors tend to affect the whole device and a failing log write
-         * will make us give up.  But we really ought to do better here.
         */
-        if (bp->b_flags & XBF_ASYNC) {
+        if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL)) ||
-                ASSERT(bp->b_iodone != NULL);
+             bp->b_last_error != bp->b_error) {
+                bp->b_flags |= (XBF_WRITE | XBF_ASYNC |
+                                XBF_DONE | XBF_WRITE_FAIL);
+                bp->b_last_error = bp->b_error;
+                bp->b_retries = 0;
+                bp->b_first_retry_time = jiffies;
+                xfs_buf_ioerror(bp, 0);
+                xfs_buf_submit(bp);
+                return true;
+        }
-                trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
+        /*
+         * Repeated failure on an async write. Take action according to the
+         * error configuration we have been set up to use.
+         */
+        cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
-                xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */
+        if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
+            ++bp->b_retries > cfg->max_retries)
+                        goto permanent_error;
+        if (cfg->retry_timeout &&
+            time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
+                        goto permanent_error;
-                if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) {
+        /* At unmount we may treat errors differently */
-                        bp->b_flags |= XBF_WRITE | XBF_ASYNC |
+        if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
-                                       XBF_DONE | XBF_WRITE_FAIL;
+                goto permanent_error;
-                        xfs_buf_submit(bp);
-                } else {
-                        xfs_buf_relse(bp);
-                }
-                return;
+        /* still a transient error, higher layers will retry */
-        }
+        xfs_buf_ioerror(bp, 0);
+        xfs_buf_relse(bp);
+        return true;
        /*
-         * If the write of the buffer was synchronous, we want to make
+         * Permanent error - we need to trigger a shutdown if we haven't already
-         * sure to return the error to the caller of xfs_bwrite().
+         * to indicate that inconsistency will result from this action.
         */
+permanent_error:
+        xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+out_stale:
        xfs_buf_stale(bp);
        bp->b_flags |= XBF_DONE;
        trace_xfs_buf_error_relse(bp, _RET_IP_);
+        return false;
+}
+/*
+ * This is the iodone() function for buffers which have had callbacks attached
+ * to them by xfs_buf_attach_iodone(). We need to iterate the items on the
+ * callback list, mark the buffer as having no more callbacks and then push the
+ * buffer through IO completion processing.
+ */
+void
+xfs_buf_iodone_callbacks(
+        struct xfs_buf          *bp)
+{
+        /*
+         * If there is an error, process it. Some errors require us
+         * to run callbacks after failure processing is done so we
+         * detect that and take appropriate action.
+         */
+        if (bp->b_error && xfs_buf_iodone_callback_error(bp))
+                return;
+        /*
+         * Successful IO or permanent error. Either way, we can clear the
+         * retry state here in preparation for the next error that may occur.
+         */
+        bp->b_last_error = 0;
+        bp->b_retries = 0;
-do_callbacks:
        xfs_buf_do_callbacks(bp);
        bp->b_fspriv = NULL;
        bp->b_iodone = NULL;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 316b2a1bdba5..e0646659ce16 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -614,11 +614,10 @@ xfs_qm_dqread(
        trace_xfs_dqread(dqp);
        if (flags & XFS_QMOPT_DQALLOC) {
-                tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
+                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc,
-                error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_dqalloc,
+                                XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp);
-                                          XFS_QM_DQALLOC_SPACE_RES(mp), 0);
                if (error)
-                        goto error1;
+                        goto error0;
        }
        /*
@@ -692,7 +691,7 @@ error0:
 * end of the chunk, skip ahead to first id in next allocated chunk
 * using the SEEK_DATA interface.
 */
-int
+static int
 xfs_dq_get_next_id(
        xfs_mount_t             *mp,
        uint                    type,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 85ce3032f815..44af22897c8b 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -145,12 +145,10 @@ xfs_update_prealloc_flags(
        struct xfs_trans        *tp;
        int                     error;
-        tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
+        error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
-        error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
+                        0, 0, 0, &tp);
-        if (error) {
+        if (error)
-                xfs_trans_cancel(tp);
                return error;
-        }
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index ee3aaa0a5317..b4d75825ae37 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -198,14 +198,10 @@ xfs_growfs_data_private(
                        return error;
        }
-        tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
-        tp->t_flags |= XFS_TRANS_RESERVE;
+                        XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
+        if (error)
-                                  XFS_GROWFS_SPACE_RES(mp), 0);
-        if (error) {
-                xfs_trans_cancel(tp);
                return error;
-        }
        /*
         * Write new AG headers to disk. Non-transactional, but written
@@ -243,8 +239,8 @@ xfs_growfs_data_private(
                agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
                agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
                agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
-                agf->agf_flfirst = 0;
+                agf->agf_flfirst = cpu_to_be32(1);
-                agf->agf_fllast = cpu_to_be32(XFS_AGFL_SIZE(mp) - 1);
+                agf->agf_fllast = 0;
                agf->agf_flcount = 0;
                tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp);
                agf->agf_freeblks = cpu_to_be32(tmpsize);
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index bf2d60749278..99ee6eee5e0b 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -37,9 +37,6 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
-STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
-                                struct xfs_perag *pag, struct xfs_inode *ip);
 /*
 * Allocate and initialise an xfs_inode.
 */
@@ -94,13 +91,6 @@ xfs_inode_free_callback(
        struct inode            *inode = container_of(head, struct inode, i_rcu);
        struct xfs_inode        *ip = XFS_I(inode);
-        kmem_zone_free(xfs_inode_zone, ip);
-}
-void
-xfs_inode_free(
-        struct xfs_inode        *ip)
-{
        switch (VFS_I(ip)->i_mode & S_IFMT) {
        case S_IFREG:
        case S_IFDIR:
@@ -118,6 +108,25 @@ xfs_inode_free(
                ip->i_itemp = NULL;
        }
+        kmem_zone_free(xfs_inode_zone, ip);
+}
+static void
+__xfs_inode_free(
+        struct xfs_inode        *ip)
+{
+        /* asserts to verify all state is correct here */
+        ASSERT(atomic_read(&ip->i_pincount) == 0);
+        ASSERT(!xfs_isiflocked(ip));
+        XFS_STATS_DEC(ip->i_mount, vn_active);
+        call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+}
+void
+xfs_inode_free(
+        struct xfs_inode        *ip)
+{
        /*
         * Because we use RCU freeing we need to ensure the inode always
         * appears to be reclaimed with an invalid inode number when in the
@@ -129,12 +138,123 @@ xfs_inode_free(
        ip->i_ino = 0;
        spin_unlock(&ip->i_flags_lock);
-        /* asserts to verify all state is correct here */
+        __xfs_inode_free(ip);
-        ASSERT(atomic_read(&ip->i_pincount) == 0);
+}
-        ASSERT(!xfs_isiflocked(ip));
-        XFS_STATS_DEC(ip->i_mount, vn_active);
-        call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+/*
+ * Queue a new inode reclaim pass if there are reclaimable inodes and there
+ * isn't a reclaim pass already in progress. By default it runs every 5s based
+ * on the xfs periodic sync default of 30s. Perhaps this should have its own
+ * tunable, but that can be done if this method proves to be ineffective or too
+ * aggressive.
+ */
+static void
+xfs_reclaim_work_queue(
+        struct xfs_mount        *mp)
+{
+        rcu_read_lock();
+        if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+                queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
+                        msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
+        }
+        rcu_read_unlock();
+}
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low. It scans as quickly as possible avoiding locked inodes or those
+ * already being flushed, and once done schedules a future pass.
+ */
+void
+xfs_reclaim_worker(
+        struct work_struct *work)
+{
+        struct xfs_mount *mp = container_of(to_delayed_work(work),
+                                        struct xfs_mount, m_reclaim_work);
+        xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
+        xfs_reclaim_work_queue(mp);
+}
+static void
+xfs_perag_set_reclaim_tag(
+        struct xfs_perag        *pag)
+{
+        struct xfs_mount        *mp = pag->pag_mount;
+        ASSERT(spin_is_locked(&pag->pag_ici_lock));
+        if (pag->pag_ici_reclaimable++)
+                return;
+        /* propagate the reclaim tag up into the perag radix tree */
+        spin_lock(&mp->m_perag_lock);
+        radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno,
+                           XFS_ICI_RECLAIM_TAG);
+        spin_unlock(&mp->m_perag_lock);
+        /* schedule periodic background inode reclaim */
+        xfs_reclaim_work_queue(mp);
+        trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
+}
+static void
+xfs_perag_clear_reclaim_tag(
+        struct xfs_perag        *pag)
+{
+        struct xfs_mount        *mp = pag->pag_mount;
+        ASSERT(spin_is_locked(&pag->pag_ici_lock));
+        if (--pag->pag_ici_reclaimable)
+                return;
+        /* clear the reclaim tag from the perag radix tree */
+        spin_lock(&mp->m_perag_lock);
+        radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno,
+                             XFS_ICI_RECLAIM_TAG);
+        spin_unlock(&mp->m_perag_lock);
+        trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
+}
+/*
+ * We set the inode flag atomically with the radix tree tag.
+ * Once we get tag lookups on the radix tree, this inode flag
+ * can go away.
+ */
+void
+xfs_inode_set_reclaim_tag(
+        struct xfs_inode        *ip)
+{
+        struct xfs_mount        *mp = ip->i_mount;
+        struct xfs_perag        *pag;
+        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+        spin_lock(&pag->pag_ici_lock);
+        spin_lock(&ip->i_flags_lock);
+        radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino),
+                           XFS_ICI_RECLAIM_TAG);
+        xfs_perag_set_reclaim_tag(pag);
+        __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+        spin_unlock(&ip->i_flags_lock);
+        spin_unlock(&pag->pag_ici_lock);
+        xfs_perag_put(pag);
+}
+STATIC void
+xfs_inode_clear_reclaim_tag(
+        struct xfs_perag        *pag,
+        xfs_ino_t               ino)
+{
+        radix_tree_tag_clear(&pag->pag_ici_root,
+                             XFS_INO_TO_AGINO(pag->pag_mount, ino),
+                             XFS_ICI_RECLAIM_TAG);
+        xfs_perag_clear_reclaim_tag(pag);
 }
 /*
@@ -264,7 +384,7 @@ xfs_iget_cache_hit(
                 */
                ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
                ip->i_flags |= XFS_INEW;
-                __xfs_inode_clear_reclaim_tag(mp, pag, ip);
+                xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
                inode->i_state = I_NEW;
                ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
@@ -723,121 +843,6 @@ xfs_inode_ag_iterator_tag(
 }
 /*
- * Queue a new inode reclaim pass if there are reclaimable inodes and there
- * isn't a reclaim pass already in progress. By default it runs every 5s based
- * on the xfs periodic sync default of 30s. Perhaps this should have it's own
- * tunable, but that can be done if this method proves to be ineffective or too
- * aggressive.
- */
-static void
-xfs_reclaim_work_queue(
-        struct xfs_mount        *mp)
-{
-        rcu_read_lock();
-        if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
-                queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
-                        msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
-        }
-        rcu_read_unlock();
-}
-/*
- * This is a fast pass over the inode cache to try to get reclaim moving on as
- * many inodes as possible in a short period of time. It kicks itself every few
- * seconds, as well as being kicked by the inode cache shrinker when memory
- * goes low. It scans as quickly as possible avoiding locked inodes or those
- * already being flushed, and once done schedules a future pass.
- */
-void
-xfs_reclaim_worker(
-        struct work_struct *work)
-{
-        struct xfs_mount *mp = container_of(to_delayed_work(work),
-                                        struct xfs_mount, m_reclaim_work);
-        xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
-        xfs_reclaim_work_queue(mp);
-}
-static void
-__xfs_inode_set_reclaim_tag(
-        struct xfs_perag        *pag,
-        struct xfs_inode        *ip)
-{
-        radix_tree_tag_set(&pag->pag_ici_root,
-                           XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
-                           XFS_ICI_RECLAIM_TAG);
-        if (!pag->pag_ici_reclaimable) {
-                /* propagate the reclaim tag up into the perag radix tree */
-                spin_lock(&ip->i_mount->m_perag_lock);
-                radix_tree_tag_set(&ip->i_mount->m_perag_tree,
-                                XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
-                                XFS_ICI_RECLAIM_TAG);
-                spin_unlock(&ip->i_mount->m_perag_lock);
-                /* schedule periodic background inode reclaim */
-                xfs_reclaim_work_queue(ip->i_mount);
-                trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
-                                                        -1, _RET_IP_);
-        }
-        pag->pag_ici_reclaimable++;
-}
-/*
- * We set the inode flag atomically with the radix tree tag.
- * Once we get tag lookups on the radix tree, this inode flag
- * can go away.
- */
-void
-xfs_inode_set_reclaim_tag(
-        xfs_inode_t     *ip)
-{
-        struct xfs_mount *mp = ip->i_mount;
-        struct xfs_perag *pag;
-        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-        spin_lock(&pag->pag_ici_lock);
-        spin_lock(&ip->i_flags_lock);
-        __xfs_inode_set_reclaim_tag(pag, ip);
-        __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
-        spin_unlock(&ip->i_flags_lock);
-        spin_unlock(&pag->pag_ici_lock);
-        xfs_perag_put(pag);
-}
-STATIC void
-__xfs_inode_clear_reclaim(
-        xfs_perag_t     *pag,
-        xfs_inode_t     *ip)
-{
-        pag->pag_ici_reclaimable--;
-        if (!pag->pag_ici_reclaimable) {
-                /* clear the reclaim tag from the perag radix tree */
-                spin_lock(&ip->i_mount->m_perag_lock);
-                radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
-                                XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
-                                XFS_ICI_RECLAIM_TAG);
-                spin_unlock(&ip->i_mount->m_perag_lock);
-                trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
-                                                        -1, _RET_IP_);
-        }
-}
-STATIC void
-__xfs_inode_clear_reclaim_tag(
-        xfs_mount_t     *mp,
-        xfs_perag_t     *pag,
-        xfs_inode_t     *ip)
-{
-        radix_tree_tag_clear(&pag->pag_ici_root,
-                        XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
-        __xfs_inode_clear_reclaim(pag, ip);
-}
-/*
 * Grab the inode for reclaim exclusively.
 * Return 0 if we grabbed it, non-zero otherwise.
 */
@@ -929,6 +934,7 @@ xfs_reclaim_inode(
        int                     sync_mode)
 {
        struct xfs_buf          *bp = NULL;
+        xfs_ino_t               ino = ip->i_ino; /* for radix_tree_delete */
        int                     error;
 restart:
@@ -993,6 +999,22 @@ restart:
        xfs_iflock(ip);
 reclaim:
+        /*
+         * Because we use RCU freeing we need to ensure the inode always appears
+         * to be reclaimed with an invalid inode number when in the free state.
+         * We do this as early as possible under the ILOCK and flush lock so
+         * that xfs_iflush_cluster() can be guaranteed to detect races with us
+         * here. By doing this, we guarantee that once xfs_iflush_cluster has
+         * locked both the XFS_ILOCK and the flush lock that it will see either
+         * a valid, flushable inode that will serialise correctly against the
+         * locks below, or it will see a clean (and invalid) inode that it can
+         * skip.
+         */
+        spin_lock(&ip->i_flags_lock);
+        ip->i_flags = XFS_IRECLAIM;
+        ip->i_ino = 0;
+        spin_unlock(&ip->i_flags_lock);
        xfs_ifunlock(ip);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -1006,9 +1028,9 @@ reclaim:
         */
        spin_lock(&pag->pag_ici_lock);
        if (!radix_tree_delete(&pag->pag_ici_root,
-                                XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
+                                XFS_INO_TO_AGINO(ip->i_mount, ino)))
                ASSERT(0);
-        __xfs_inode_clear_reclaim(pag, ip);
+        xfs_perag_clear_reclaim_tag(pag);
        spin_unlock(&pag->pag_ici_lock);
        /*
@@ -1023,7 +1045,7 @@ reclaim:
        xfs_qm_dqdetach(ip);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-        xfs_inode_free(ip);
+        __xfs_inode_free(ip);
        return error;
 out_ifunlock:
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 96f606deee31..ee6799e0476f 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1030,7 +1030,7 @@ xfs_dir_ialloc(
                        tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
                }
-                code = xfs_trans_roll(&tp, 0);
+                code = xfs_trans_roll(&tp, NULL);
                if (committed != NULL)
                        *committed = 1;
@@ -1161,11 +1161,9 @@ xfs_create(
                rdev = 0;
                resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
                tres = &M_RES(mp)->tr_mkdir;
-                tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
        } else {
                resblks = XFS_CREATE_SPACE_RES(mp, name->len);
                tres = &M_RES(mp)->tr_create;
-                tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
        }
        /*
@@ -1174,20 +1172,19 @@ xfs_create(
         * the case we'll drop the one we have and get a more
         * appropriate transaction later.
         */
-        error = xfs_trans_reserve(tp, tres, resblks, 0);
+        error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
        if (error == -ENOSPC) {
                /* flush outstanding delalloc blocks and retry */
                xfs_flush_inodes(mp);
-                error = xfs_trans_reserve(tp, tres, resblks, 0);
+                error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
        }
        if (error == -ENOSPC) {
                /* No space at all so try a "no-allocation" reservation */
                resblks = 0;
-                error = xfs_trans_reserve(tp, tres, 0, 0);
+                error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp);
        }
        if (error)
-                goto out_trans_cancel;
+                goto out_release_inode;
        xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
                      XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
@@ -1337,17 +1334,16 @@ xfs_create_tmpfile(
                return error;
        resblks = XFS_IALLOC_SPACE_RES(mp);
-        tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TMPFILE);
        tres = &M_RES(mp)->tr_create_tmpfile;
-        error = xfs_trans_reserve(tp, tres, resblks, 0);
+        error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
        if (error == -ENOSPC) {
                /* No space at all so try a "no-allocation" reservation */
                resblks = 0;
-                error = xfs_trans_reserve(tp, tres, 0, 0);
+                error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp);
        }
        if (error)
-                goto out_trans_cancel;
+                goto out_release_inode;
        error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
                                                pdqp, resblks, 1, 0);
@@ -1432,15 +1428,14 @@ xfs_link(
        if (error)
                goto std_return;
-        tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
        resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp);
        if (error == -ENOSPC) {
                resblks = 0;
-                error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
+                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp);
        }
        if (error)
-                goto error_return;
+                goto std_return;
        xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
        xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
@@ -1710,11 +1705,9 @@ xfs_inactive_truncate(
        struct xfs_trans        *tp;
        int                     error;
-        tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
        if (error) {
                ASSERT(XFS_FORCED_SHUTDOWN(mp));
-                xfs_trans_cancel(tp);
                return error;
        }
@@ -1764,8 +1757,6 @@ xfs_inactive_ifree(
        struct xfs_trans        *tp;
        int                     error;
-        tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
        /*
         * The ifree transaction might need to allocate blocks for record
         * insertion to the finobt. We don't want to fail here at ENOSPC, so
@@ -1781,9 +1772,8 @@ xfs_inactive_ifree(
         * now remains allocated and sits on the unlinked list until the fs is
         * repaired.
         */
-        tp->t_flags |= XFS_TRANS_RESERVE;
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree,
+                        XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
-                                  XFS_IFREE_SPACE_RES(mp), 0);
        if (error) {
                if (error == -ENOSPC) {
                        xfs_warn_ratelimited(mp,
@@ -1792,7 +1782,6 @@ xfs_inactive_ifree(
                } else {
                        ASSERT(XFS_FORCED_SHUTDOWN(mp));
                }
-                xfs_trans_cancel(tp);
                return error;
        }
@@ -2525,11 +2514,6 @@ xfs_remove(
        if (error)
                goto std_return;
-        if (is_dir)
-                tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
-        else
-                tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
        /*
         * We try to get the real space reservation first,
         * allowing for directory btree deletion(s) implying
@@ -2540,14 +2524,15 @@ xfs_remove(
         * block from the directory.
         */
        resblks = XFS_REMOVE_SPACE_RES(mp);
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0);
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp);
        if (error == -ENOSPC) {
                resblks = 0;
-                error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0);
+                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0,
+                                &tp);
        }
        if (error) {
                ASSERT(error != -ENOSPC);
-                goto out_trans_cancel;
+                goto std_return;
        }
        xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
@@ -2855,6 +2840,7 @@ xfs_rename_alloc_whiteout(
         * and flag it as linkable.
         */
        drop_nlink(VFS_I(tmpfile));
+        xfs_setup_iops(tmpfile);
        xfs_finish_inode_setup(tmpfile);
        VFS_I(tmpfile)->i_state |= I_LINKABLE;
@@ -2910,15 +2896,15 @@ xfs_rename(
        xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
                                inodes, &num_inodes);
-        tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
        spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0);
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
        if (error == -ENOSPC) {
                spaceres = 0;
-                error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0);
+                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0,
+                                &tp);
        }
        if (error)
-                goto out_trans_cancel;
+                goto out_release_wip;
        /*
         * Attach the dquots to the inodes
@@ -3155,6 +3141,7 @@ out_bmap_cancel:
        xfs_bmap_cancel(&free_list);
 out_trans_cancel:
        xfs_trans_cancel(tp);
+out_release_wip:
        if (wip)
                IRELE(wip);
        return error;
@@ -3162,16 +3149,16 @@ out_trans_cancel:
 STATIC int
 xfs_iflush_cluster(
-        xfs_inode_t     *ip,
+        struct xfs_inode        *ip,
-        xfs_buf_t       *bp)
+        struct xfs_buf          *bp)
 {
-        xfs_mount_t             *mp = ip->i_mount;
+        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_perag        *pag;
        unsigned long           first_index, mask;
        unsigned long           inodes_per_cluster;
-        int                     ilist_size;
+        int                     cilist_size;
-        xfs_inode_t             **ilist;
+        struct xfs_inode        **cilist;
-        xfs_inode_t             *iq;
+        struct xfs_inode        *cip;
        int                     nr_found;
        int                     clcount = 0;
        int                     bufwasdelwri;
@@ -3180,23 +3167,23 @@ xfs_iflush_cluster(
        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
        inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
-        ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
+        cilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
-        ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
+        cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS);
-        if (!ilist)
+        if (!cilist)
                goto out_put;
        mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1);
        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
        rcu_read_lock();
        /* really need a gang lookup range call here */
-        nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
+        nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
                                        first_index, inodes_per_cluster);
        if (nr_found == 0)
                goto out_free;
        for (i = 0; i < nr_found; i++) {
-                iq = ilist[i];
+                cip = cilist[i];
-                if (iq == ip)
+                if (cip == ip)
                        continue;
                /*
@@ -3205,20 +3192,30 @@ xfs_iflush_cluster(
                 * We need to check under the i_flags_lock for a valid inode
                 * here. Skip it if it is not valid or the wrong inode.
                 */
-                spin_lock(&ip->i_flags_lock);
+                spin_lock(&cip->i_flags_lock);
-                if (!ip->i_ino ||
+                if (!cip->i_ino ||
-                    (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
+                    __xfs_iflags_test(cip, XFS_ISTALE)) {
-                        spin_unlock(&ip->i_flags_lock);
+                        spin_unlock(&cip->i_flags_lock);
                        continue;
                }
-                spin_unlock(&ip->i_flags_lock);
+                /*
+                 * Once we fall off the end of the cluster, no point checking
+                 * any more inodes in the list because they will also all be
+                 * outside the cluster.
+                 */
+                if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
+                        spin_unlock(&cip->i_flags_lock);
+                        break;
+                }
+                spin_unlock(&cip->i_flags_lock);
                /*
                 * Do an un-protected check to see if the inode is dirty and
                 * is a candidate for flushing.  These checks will be repeated
                 * later after the appropriate locks are acquired.
                 */
-                if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
+                if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
                        continue;
                /*
@@ -3226,15 +3223,28 @@ xfs_iflush_cluster(
                 * then this inode cannot be flushed and is skipped.
                 */
-                if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
+                if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
+                        continue;
+                if (!xfs_iflock_nowait(cip)) {
+                        xfs_iunlock(cip, XFS_ILOCK_SHARED);
                        continue;
-                if (!xfs_iflock_nowait(iq)) {
+                }
-                        xfs_iunlock(iq, XFS_ILOCK_SHARED);
+                if (xfs_ipincount(cip)) {
+                        xfs_ifunlock(cip);
+                        xfs_iunlock(cip, XFS_ILOCK_SHARED);
                        continue;
                }
-                if (xfs_ipincount(iq)) {
-                        xfs_ifunlock(iq);
-                        xfs_iunlock(iq, XFS_ILOCK_SHARED);
+                /*
+                 * Check the inode number again, just to be certain we are not
+                 * racing with freeing in xfs_reclaim_inode(). See the comments
+                 * in that function for more information as to why the initial
+                 * check is not sufficient.
+                 */
+                if (!cip->i_ino) {
+                        xfs_ifunlock(cip);
+                        xfs_iunlock(cip, XFS_ILOCK_SHARED);
                        continue;
                }
@@ -3242,18 +3252,18 @@ xfs_iflush_cluster(
                 * arriving here means that this inode can be flushed.  First
                 * re-check that it's dirty before flushing.
                 */
-                if (!xfs_inode_clean(iq)) {
+                if (!xfs_inode_clean(cip)) {
                        int     error;
-                        error = xfs_iflush_int(iq, bp);
+                        error = xfs_iflush_int(cip, bp);
                        if (error) {
-                                xfs_iunlock(iq, XFS_ILOCK_SHARED);
+                                xfs_iunlock(cip, XFS_ILOCK_SHARED);
                                goto cluster_corrupt_out;
                        }
                        clcount++;
                } else {
-                        xfs_ifunlock(iq);
+                        xfs_ifunlock(cip);
                }
-                xfs_iunlock(iq, XFS_ILOCK_SHARED);
+                xfs_iunlock(cip, XFS_ILOCK_SHARED);
        }
        if (clcount) {
@@ -3263,7 +3273,7 @@ xfs_iflush_cluster(
 out_free:
        rcu_read_unlock();
-        kmem_free(ilist);
+        kmem_free(cilist);
 out_put:
        xfs_perag_put(pag);
        return 0;
@@ -3306,8 +3316,8 @@ cluster_corrupt_out:
        /*
         * Unlocks the flush lock
         */
-        xfs_iflush_abort(iq, false);
+        xfs_iflush_abort(cip, false);
-        kmem_free(ilist);
+        kmem_free(cilist);
        xfs_perag_put(pag);
        return -EFSCORRUPTED;
 }
@@ -3327,7 +3337,7 @@ xfs_iflush(
        struct xfs_buf          **bpp)
 {
        struct xfs_mount        *mp = ip->i_mount;
-        struct xfs_buf          *bp;
+        struct xfs_buf          *bp = NULL;
        struct xfs_dinode       *dip;
        int                     error;
@@ -3369,14 +3379,22 @@ xfs_iflush(
        }
        /*
-         * Get the buffer containing the on-disk inode.
+         * Get the buffer containing the on-disk inode. We are doing a try-lock
+         * operation here, so we may get an EAGAIN error. In that case, we
+         * simply want to return with the inode still dirty.
+         *
+         * If we get any other error, we effectively have a corruption situation
+         * and we cannot flush the inode, so we treat it the same as failing
+         * xfs_iflush_int().
         */
        error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
                               0);
-        if (error || !bp) {
+        if (error == -EAGAIN) {
                xfs_ifunlock(ip);
                return error;
        }
+        if (error)
+                goto corrupt_out;
        /*
         * First flush out the inode that xfs_iflush was called with.
@@ -3404,7 +3422,8 @@ xfs_iflush(
        return 0;
 corrupt_out:
-        xfs_buf_relse(bp);
+        if (bp)
+                xfs_buf_relse(bp);
        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 cluster_corrupt_out:
        error = -EFSCORRUPTED;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 43e1d51b15eb..e52d7c7aeb5b 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -440,6 +440,9 @@ loff_t	__xfs_seek_hole_data(struct inode *inode, loff_t start,
 /* from xfs_iops.c */
+extern void xfs_setup_inode(struct xfs_inode *ip);
+extern void xfs_setup_iops(struct xfs_inode *ip);
 /*
 * When setting up a newly allocated inode, we need to call
 * xfs_finish_inode_setup() once the inode is fully instantiated at
@@ -447,7 +450,6 @@ loff_t	__xfs_seek_hole_data(struct inode *inode, loff_t start,
 * before we've completed instantiation. Otherwise we can do it
 * the moment the inode lookup is complete.
 */
-extern void xfs_setup_inode(struct xfs_inode *ip);
 static inline void xfs_finish_inode_setup(struct xfs_inode *ip)
 {
        xfs_iflags_clear(ip, XFS_INEW);
@@ -458,6 +460,7 @@ static inline void xfs_finish_inode_setup(struct xfs_inode *ip)
 static inline void xfs_setup_existing_inode(struct xfs_inode *ip)
 {
        xfs_setup_inode(ip);
+        xfs_setup_iops(ip);
        xfs_finish_inode_setup(ip);
 }
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index c48b5b18d771..a1b07612224c 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -210,7 +210,7 @@ xfs_inode_item_format_data_fork(
                         */
                        data_bytes = roundup(ip->i_df.if_bytes, 4);
                        ASSERT(ip->i_df.if_real_bytes == 0 ||
-                               ip->i_df.if_real_bytes == data_bytes);
+                               ip->i_df.if_real_bytes >= data_bytes);
                        ASSERT(ip->i_df.if_u1.if_data != NULL);
                        ASSERT(ip->i_d.di_size > 0);
                        xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL,
@@ -305,7 +305,7 @@ xfs_inode_item_format_attr_fork(
                         */
                        data_bytes = roundup(ip->i_afp->if_bytes, 4);
                        ASSERT(ip->i_afp->if_real_bytes == 0 ||
-                               ip->i_afp->if_real_bytes == data_bytes);
+                               ip->i_afp->if_real_bytes >= data_bytes);
                        ASSERT(ip->i_afp->if_u1.if_data != NULL);
                        xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
                                        ip->i_afp->if_u1.if_data,
@@ -479,6 +479,8 @@ STATIC uint
 xfs_inode_item_push(
        struct xfs_log_item     *lip,
        struct list_head        *buffer_list)
+                __releases(&lip->li_ailp->xa_lock)
+                __acquires(&lip->li_ailp->xa_lock)
 {
        struct xfs_inode_log_item *iip = INODE_ITEM(lip);
        struct xfs_inode        *ip = iip->ili_inode;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index bcb6c19ce3ea..dbca7375deef 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -277,7 +277,6 @@ xfs_readlink_by_handle(
 {
        struct dentry           *dentry;
        __u32                   olen;
-        void                    *link;
        int                     error;
        if (!capable(CAP_SYS_ADMIN))
@@ -288,7 +287,7 @@ xfs_readlink_by_handle(
                return PTR_ERR(dentry);
        /* Restrict this handle operation to symlinks only. */
-        if (!d_is_symlink(dentry)) {
+        if (!d_inode(dentry)->i_op->readlink) {
                error = -EINVAL;
                goto out_dput;
        }
@@ -298,21 +297,8 @@ xfs_readlink_by_handle(
                goto out_dput;
        }
-        link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
+        error = d_inode(dentry)->i_op->readlink(dentry, hreq->ohandle, olen);
-        if (!link) {
-                error = -ENOMEM;
-                goto out_dput;
-        }
-        error = xfs_readlink(XFS_I(d_inode(dentry)), link);
-        if (error)
-                goto out_kfree;
-        error = readlink_copy(hreq->ohandle, olen, link);
-        if (error)
-                goto out_kfree;
- out_kfree:
-        kfree(link);
 out_dput:
        dput(dentry);
        return error;
@@ -334,12 +320,10 @@ xfs_set_dmattrs(
        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;
-        tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+        if (error)
-        if (error) {
-                xfs_trans_cancel(tp);
                return error;
-        }
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
@@ -1141,10 +1125,10 @@ xfs_ioctl_setattr_get_trans(
        if (XFS_FORCED_SHUTDOWN(mp))
                goto out_unlock;
-        tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+        /* Exit via out_unlock, like the checks above, so its unwind runs. */
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
        if (error)
-                goto out_cancel;
+                goto out_unlock;
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index d81bdc080370..58391355a44d 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -132,6 +132,7 @@ xfs_iomap_write_direct(
        int             error;
        int             lockmode;
        int             bmapi_flags = XFS_BMAPI_PREALLOC;
+        uint            tflags = 0;
        rt = XFS_IS_REALTIME_INODE(ip);
        extsz = xfs_get_extsz_hint(ip);
@@ -192,11 +193,6 @@ xfs_iomap_write_direct(
                return error;
        /*
-         * Allocate and setup the transaction
-         */
-        tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-        /*
         * For DAX, we do not allocate unwritten extents, but instead we zero
         * the block before we commit the transaction.  Ideally we'd like to do
         * this outside the transaction context, but if we commit and then crash
@@ -209,23 +205,17 @@ xfs_iomap_write_direct(
         * the reserve block pool for bmbt block allocation if there is no space
         * left but we need to do unwritten extent conversion.
         */
        if (IS_DAX(VFS_I(ip))) {
                bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
                if (ISUNWRITTEN(imap)) {
-                        tp->t_flags |= XFS_TRANS_RESERVE;
+                        tflags |= XFS_TRANS_RESERVE;
                        resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
                }
        }
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, resrtextents,
-                                  resblks, resrtextents);
+                        tflags, &tp);
-        /*
+        if (error)
-         * Check for running out of space, note: need lock to return
-         */
-        if (error) {
-                xfs_trans_cancel(tp);
                return error;
-        }
        lockmode = XFS_ILOCK_EXCL;
        xfs_ilock(ip, lockmode);
@@ -726,15 +716,13 @@ xfs_iomap_write_allocate(
                nimaps = 0;
                while (nimaps == 0) {
-                        tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
-                        tp->t_flags |= XFS_TRANS_RESERVE;
                        nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
-                        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
-                                                  nres, 0);
+                        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, nres,
-                        if (error) {
+                                        0, XFS_TRANS_RESERVE, &tp);
-                                xfs_trans_cancel(tp);
+                        if (error)
                                return error;
-                        }
                        xfs_ilock(ip, XFS_ILOCK_EXCL);
                        xfs_trans_ijoin(tp, ip, 0);
@@ -878,25 +866,18 @@ xfs_iomap_write_unwritten(
        do {
                /*
-                 * set up a transaction to convert the range of extents
+                 * Set up a transaction to convert the range of extents
                 * from unwritten to real. Do allocations in a loop until
                 * we have covered the range passed in.
                 *
-                 * Note that we open code the transaction allocation here
+                 * Note that we can't risk recursing back into the filesystem
-                 * to pass KM_NOFS--we can't risk to recursing back into
+                 * here as we might be asked to write out the same inode that we
-                 * the filesystem here as we might be asked to write out
+                 * complete here and might deadlock on the iolock.
-                 * the same inode that we complete here and might deadlock
-                 * on the iolock.
                 */
-                sb_start_intwrite(mp->m_super);
+                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
-                tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
+                                XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp);
-                tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT;
+                if (error)
-                error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
-                                          resblks, 0);
-                if (error) {
-                        xfs_trans_cancel(tp);
                        return error;
-                }
                xfs_ilock(ip, XFS_ILOCK_EXCL);
                xfs_trans_ijoin(tp, ip, 0);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index fb7dc61f4a29..c5d4eba6972e 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -181,6 +181,8 @@ xfs_generic_create(
        }
 #endif
+        xfs_setup_iops(ip);
        if (tmpfile)
                d_tmpfile(dentry, inode);
        else
@@ -368,6 +370,8 @@ xfs_vn_symlink(
        if (unlikely(error))
                goto out_cleanup_inode;
+        xfs_setup_iops(cip);
        d_instantiate(dentry, inode);
        xfs_finish_inode_setup(cip);
        return 0;
@@ -442,6 +446,16 @@ xfs_vn_get_link(
        return ERR_PTR(error);
 }
+STATIC const char *
+xfs_vn_get_link_inline(
+        struct dentry           *dentry,
+        struct inode            *inode,
+        struct delayed_call     *done)
+{
+        ASSERT(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE);
+        return XFS_I(inode)->i_df.if_u1.if_data;
+}
 STATIC int
 xfs_vn_getattr(
        struct vfsmount         *mnt,
@@ -599,12 +613,12 @@ xfs_setattr_nonsize(
                        return error;
        }
-        tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
        if (error)
-                goto out_trans_cancel;
+                goto out_dqrele;
        xfs_ilock(ip, XFS_ILOCK_EXCL);
+        xfs_trans_ijoin(tp, ip, 0);
        /*
         * Change file ownership.  Must be the owner or privileged.
@@ -633,12 +647,10 @@ xfs_setattr_nonsize(
                                                NULL, capable(CAP_FOWNER) ?
                                                XFS_QMOPT_FORCE_RES : 0);
                        if (error)      /* out of quota */
-                                goto out_unlock;
+                                goto out_cancel;
                }
        }
-        xfs_trans_ijoin(tp, ip, 0);
        /*
         * Change file ownership.  Must be the owner or privileged.
         */
@@ -722,10 +734,11 @@ xfs_setattr_nonsize(
        return 0;
-out_unlock:
+out_cancel:
-        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-out_trans_cancel:
        xfs_trans_cancel(tp);
+        /* Joined with lock flags of 0, so cancel leaves the ILOCK held. */
+        xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out_dqrele:
        xfs_qm_dqrele(udqp);
        xfs_qm_dqrele(gdqp);
        return error;
@@ -834,7 +845,7 @@ xfs_setattr_size(
         * We have to do all the page cache truncate work outside the
         * transaction context as the "lock" order is page lock->log space
         * reservation as defined by extent allocation in the writeback path.
-         * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but
+         * Hence a truncate can fail with ENOMEM from xfs_trans_alloc(), but
         * having already truncated the in-memory version of the file (i.e. made
         * user visible changes). There's not much we can do about this, except
         * to hope that the caller sees ENOMEM and retries the truncate
@@ -849,10 +860,9 @@ xfs_setattr_size(
                return error;
        truncate_setsize(inode, newsize);
-        tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
        if (error)
-                goto out_trans_cancel;
+                return error;
        lock_flags |= XFS_ILOCK_EXCL;
        xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -971,12 +981,9 @@ xfs_vn_update_time(
        trace_xfs_update_time(ip);
-        tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
+        if (error)
-        if (error) {
-                xfs_trans_cancel(tp);
                return error;
-        }
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        if (flags & S_CTIME)
@@ -1167,6 +1174,18 @@ static const struct inode_operations xfs_symlink_inode_operations = {
        .update_time            = xfs_vn_update_time,
 };
+static const struct inode_operations xfs_inline_symlink_inode_operations = {
+        .readlink               = generic_readlink,
+        .get_link               = xfs_vn_get_link_inline,
+        .getattr                = xfs_vn_getattr,
+        .setattr                = xfs_vn_setattr,
+        .setxattr               = generic_setxattr,
+        .getxattr               = generic_getxattr,
+        .removexattr            = generic_removexattr,
+        .listxattr              = xfs_vn_listxattr,
+        .update_time            = xfs_vn_update_time,
+};
 STATIC void
 xfs_diflags_to_iflags(
        struct inode            *inode,
@@ -1193,7 +1212,7 @@ xfs_diflags_to_iflags(
 }
 /*
- * Initialize the Linux inode and set up the operation vectors.
+ * Initialize the Linux inode.
 *
 * When reading existing inodes from disk this is called directly from xfs_iget,
 * when creating a new inode it is called from xfs_ialloc after setting up the
@@ -1232,32 +1251,12 @@ xfs_setup_inode(
        i_size_write(inode, ip->i_d.di_size);
        xfs_diflags_to_iflags(inode, ip);
-        ip->d_ops = ip->i_mount->m_nondir_inode_ops;
+        if (S_ISDIR(inode->i_mode)) {
-        lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
-        switch (inode->i_mode & S_IFMT) {
-        case S_IFREG:
-                inode->i_op = &xfs_inode_operations;
-                inode->i_fop = &xfs_file_operations;
-                inode->i_mapping->a_ops = &xfs_address_space_operations;
-                break;
-        case S_IFDIR:
                lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class);
-                if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
-                        inode->i_op = &xfs_dir_ci_inode_operations;
-                else
-                        inode->i_op = &xfs_dir_inode_operations;
-                inode->i_fop = &xfs_dir_file_operations;
                ip->d_ops = ip->i_mount->m_dir_inode_ops;
-                break;
+        } else {
-        case S_IFLNK:
+                ip->d_ops = ip->i_mount->m_nondir_inode_ops;
-                inode->i_op = &xfs_symlink_inode_operations;
+                lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
-                if (!(ip->i_df.if_flags & XFS_IFINLINE))
-                        inode->i_mapping->a_ops = &xfs_address_space_operations;
-                break;
-        default:
-                inode->i_op = &xfs_inode_operations;
-                init_special_inode(inode, inode->i_mode, inode->i_rdev);
-                break;
        }
        /*
@@ -1277,3 +1276,35 @@ xfs_setup_inode(
                cache_no_acl(inode);
        }
 }
+void
+xfs_setup_iops(
+        struct xfs_inode        *ip)
+{
+        struct inode            *inode = &ip->i_vnode;
+        switch (inode->i_mode & S_IFMT) {
+        case S_IFREG:
+                inode->i_op = &xfs_inode_operations;
+                inode->i_fop = &xfs_file_operations;
+                inode->i_mapping->a_ops = &xfs_address_space_operations;
+                break;
+        case S_IFDIR:
+                if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
+                        inode->i_op = &xfs_dir_ci_inode_operations;
+                else
+                        inode->i_op = &xfs_dir_inode_operations;
+                inode->i_fop = &xfs_dir_file_operations;
+                break;
+        case S_IFLNK:
+                if (ip->i_df.if_flags & XFS_IFINLINE)
+                        inode->i_op = &xfs_inline_symlink_inode_operations;
+                else
+                        inode->i_op = &xfs_symlink_inode_operations;
+                break;
+        default:
+                inode->i_op = &xfs_inode_operations;
+                init_special_inode(inode, inode->i_mode, inode->i_rdev);
+                break;
+        }
+}
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index b49ccf5c1d75..bde02f1fba73 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -435,8 +435,7 @@ xfs_log_reserve(
        int                     cnt,
        struct xlog_ticket      **ticp,
        __uint8_t               client,
-        bool                    permanent,
+        bool                    permanent)
-        uint                    t_type)
 {
        struct xlog             *log = mp->m_log;
        struct xlog_ticket      *tic;
@@ -456,7 +455,6 @@ xfs_log_reserve(
        if (!tic)
                return -ENOMEM;
-        tic->t_trans_type = t_type;
        *ticp = tic;
        xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
@@ -823,8 +821,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
        } while (iclog != first_iclog);
 #endif
        if (! (XLOG_FORCED_SHUTDOWN(log))) {
-                error = xfs_log_reserve(mp, 600, 1, &tic,
+                error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0);
-                                        XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
                if (!error) {
                        /* the data section must be 32 bit size aligned */
                        struct {
@@ -2032,58 +2029,8 @@ xlog_print_tic_res(
            REG_TYPE_STR(ICREATE, "inode create")
        };
 #undef REG_TYPE_STR
-#define TRANS_TYPE_STR(type)    [XFS_TRANS_##type] = #type
-        static char *trans_type_str[XFS_TRANS_TYPE_MAX] = {
-            TRANS_TYPE_STR(SETATTR_NOT_SIZE),
-            TRANS_TYPE_STR(SETATTR_SIZE),
-            TRANS_TYPE_STR(INACTIVE),
-            TRANS_TYPE_STR(CREATE),
-            TRANS_TYPE_STR(CREATE_TRUNC),
-            TRANS_TYPE_STR(TRUNCATE_FILE),
-            TRANS_TYPE_STR(REMOVE),
-            TRANS_TYPE_STR(LINK),
-            TRANS_TYPE_STR(RENAME),
-            TRANS_TYPE_STR(MKDIR),
-            TRANS_TYPE_STR(RMDIR),
-            TRANS_TYPE_STR(SYMLINK),
-            TRANS_TYPE_STR(SET_DMATTRS),
-            TRANS_TYPE_STR(GROWFS),
-            TRANS_TYPE_STR(STRAT_WRITE),
-            TRANS_TYPE_STR(DIOSTRAT),
-            TRANS_TYPE_STR(WRITEID),
-            TRANS_TYPE_STR(ADDAFORK),
-            TRANS_TYPE_STR(ATTRINVAL),
-            TRANS_TYPE_STR(ATRUNCATE),
-            TRANS_TYPE_STR(ATTR_SET),
-            TRANS_TYPE_STR(ATTR_RM),
-            TRANS_TYPE_STR(ATTR_FLAG),
-            TRANS_TYPE_STR(CLEAR_AGI_BUCKET),
-            TRANS_TYPE_STR(SB_CHANGE),
-            TRANS_TYPE_STR(DUMMY1),
-            TRANS_TYPE_STR(DUMMY2),
-            TRANS_TYPE_STR(QM_QUOTAOFF),
-            TRANS_TYPE_STR(QM_DQALLOC),
-            TRANS_TYPE_STR(QM_SETQLIM),
-            TRANS_TYPE_STR(QM_DQCLUSTER),
-            TRANS_TYPE_STR(QM_QINOCREATE),
-            TRANS_TYPE_STR(QM_QUOTAOFF_END),
-            TRANS_TYPE_STR(FSYNC_TS),
-            TRANS_TYPE_STR(GROWFSRT_ALLOC),
-            TRANS_TYPE_STR(GROWFSRT_ZERO),
-            TRANS_TYPE_STR(GROWFSRT_FREE),
-            TRANS_TYPE_STR(SWAPEXT),
-            TRANS_TYPE_STR(CHECKPOINT),
-            TRANS_TYPE_STR(ICREATE),
-            TRANS_TYPE_STR(CREATE_TMPFILE)
-        };
-#undef TRANS_TYPE_STR
        xfs_warn(mp, "xlog_write: reservation summary:");
-        xfs_warn(mp, "  trans type  = %s (%u)",
-                 ((ticket->t_trans_type <= 0 ||
-                   ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
-                  "bad-trans-type" : trans_type_str[ticket->t_trans_type]),
-                 ticket->t_trans_type);
        xfs_warn(mp, "  unit res    = %d bytes",
                 ticket->t_unit_res);
        xfs_warn(mp, "  current res = %d bytes",
@@ -3378,7 +3325,7 @@ xfs_log_force(
 {
        int     error;
-        trace_xfs_log_force(mp, 0);
+        trace_xfs_log_force(mp, 0, _RET_IP_);
        error = _xfs_log_force(mp, flags, NULL);
        if (error)
                xfs_warn(mp, "%s: error %d returned.", __func__, error);
@@ -3527,7 +3474,7 @@ xfs_log_force_lsn(
 {
        int     error;
-        trace_xfs_log_force(mp, lsn);
+        trace_xfs_log_force(mp, lsn, _RET_IP_);
        error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
        if (error)
                xfs_warn(mp, "%s: error %d returned.", __func__, error);
@@ -3709,7 +3656,6 @@ xlog_ticket_alloc(
        tic->t_tid              = prandom_u32();
        tic->t_clientid         = client;
        tic->t_flags            = XLOG_TIC_INITED;
-        tic->t_trans_type       = 0;
        if (permanent)
                tic->t_flags |= XLOG_TIC_PERM_RESERV;
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index aa533a7d50f2..80ba0c047090 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -161,8 +161,7 @@ int	  xfs_log_reserve(struct xfs_mount *mp,
                          int              count,
                          struct xlog_ticket **ticket,
                          __uint8_t        clientid,
-                          bool             permanent,
+                          bool             permanent);
-                          uint             t_type);
 int       xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
 int       xfs_log_unmount_write(struct xfs_mount *mp);
 void      xfs_log_unmount(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 4e7649351f5a..5e54e7955ea6 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -51,7 +51,6 @@ xlog_cil_ticket_alloc(
        tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
                                KM_SLEEP|KM_NOFS);
-        tic->t_trans_type = XFS_TRANS_CHECKPOINT;
        /*
         * set the current reservation to zero so we know to steal the basic
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index ed8896310c00..765f084759b5 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -175,7 +175,6 @@ typedef struct xlog_ticket {
        char               t_cnt;        /* current count                : 1  */
        char               t_clientid;   /* who does this belong to;     : 1  */
        char               t_flags;      /* properties of reservation    : 1  */
-        uint               t_trans_type; /* transaction type             : 4  */
        /* reservation array fields */
        uint               t_res_num;                    /* num in array : 4 */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 396565f43247..835997843846 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3843,7 +3843,7 @@ xlog_recover_add_to_cont_trans(
        old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
        old_len = item->ri_buf[item->ri_cnt-1].i_len;
-        ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
+        ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP);
        memcpy(&ptr[old_len], dp, len);
        item->ri_buf[item->ri_cnt-1].i_len += len;
        item->ri_buf[item->ri_cnt-1].i_addr = ptr;
@@ -4205,10 +4205,9 @@ xlog_recover_process_efi(
                }
        }
-        tp = xfs_trans_alloc(mp, 0);
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
        if (error)
-                goto abort_error;
+                return error;
        efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
        for (i = 0; i < efip->efi_format.efi_nextents; i++) {
@@ -4355,10 +4354,9 @@ xlog_recover_clear_agi_bucket(
        int             offset;
        int             error;
-        tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp);
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0);
        if (error)
-                goto out_abort;
+                goto out_error;
        error = xfs_read_agi(mp, tp, agno, &agibp);
        if (error)
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index cfd4210dd015..e39b02351b4a 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -89,7 +89,6 @@ xfs_uuid_mount(
        if (hole < 0) {
                xfs_uuid_table = kmem_realloc(xfs_uuid_table,
                        (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
-                        xfs_uuid_table_size  * sizeof(*xfs_uuid_table),
                        KM_SLEEP);
                hole = xfs_uuid_table_size++;
        }
@@ -681,6 +680,9 @@ xfs_mountfs(
        xfs_set_maxicount(mp);
+        /* enable fail_at_unmount as default */
+        mp->m_fail_unmount = 1;
        error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname);
        if (error)
                goto out;
@@ -690,10 +692,15 @@ xfs_mountfs(
        if (error)
                goto out_remove_sysfs;
-        error = xfs_uuid_mount(mp);
+        error = xfs_error_sysfs_init(mp);
        if (error)
                goto out_del_stats;
+        error = xfs_uuid_mount(mp);
+        if (error)
+                goto out_remove_error_sysfs;
        /*
         * Set the minimum read and write sizes
         */
@@ -957,6 +964,7 @@ xfs_mountfs(
        cancel_delayed_work_sync(&mp->m_reclaim_work);
        xfs_reclaim_inodes(mp, SYNC_WAIT);
 out_log_dealloc:
+        mp->m_flags |= XFS_MOUNT_UNMOUNTING;
        xfs_log_mount_cancel(mp);
 out_fail_wait:
        if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
@@ -968,6 +976,8 @@ xfs_mountfs(
        xfs_da_unmount(mp);
 out_remove_uuid:
        xfs_uuid_unmount(mp);
+ out_remove_error_sysfs:
+        xfs_error_sysfs_del(mp);
 out_del_stats:
        xfs_sysfs_del(&mp->m_stats.xs_kobj);
 out_remove_sysfs:
@@ -1006,6 +1016,14 @@ xfs_unmountfs(
        xfs_log_force(mp, XFS_LOG_SYNC);
        /*
+         * We now need to tell the world we are unmounting. This will allow
+         * us to detect that the filesystem is going away and we should error
+         * out anything that we have been retrying in the background. This will
+         * prevent neverending retries in AIL pushing from hanging the unmount.
+         */
+        mp->m_flags |= XFS_MOUNT_UNMOUNTING;
+        /*
         * Flush all pending changes from the AIL.
         */
        xfs_ail_push_all_sync(mp->m_ail);
@@ -1056,6 +1074,7 @@ xfs_unmountfs(
 #endif
        xfs_free_perag(mp);
+        xfs_error_sysfs_del(mp);
        xfs_sysfs_del(&mp->m_stats.xs_kobj);
        xfs_sysfs_del(&mp->m_kobj);
 }
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index eafe257b357a..c1b798c72126 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -37,6 +37,32 @@ enum {
        XFS_LOWSP_MAX,
 };
+/*
+ * Error Configuration
+ *
+ * Error classes define the subsystem the configuration belongs to.
+ * Error numbers define the errors that are configurable.
+ */
+enum {
+        XFS_ERR_METADATA,
+        XFS_ERR_CLASS_MAX,
+};
+enum {
+        XFS_ERR_DEFAULT,
+        XFS_ERR_EIO,
+        XFS_ERR_ENOSPC,
+        XFS_ERR_ENODEV,
+        XFS_ERR_ERRNO_MAX,
+};
+#define XFS_ERR_RETRY_FOREVER   -1
+struct xfs_error_cfg {
+        struct xfs_kobj kobj;
+        int             max_retries;
+        unsigned long   retry_timeout;  /* in jiffies, 0 = no timeout */
+};
 typedef struct xfs_mount {
        struct super_block      *m_super;
        xfs_tid_t               m_tid;          /* next unused tid for fs */
@@ -127,6 +153,9 @@ typedef struct xfs_mount {
        int64_t                 m_low_space[XFS_LOWSP_MAX];
                                                /* low free space thresholds */
        struct xfs_kobj         m_kobj;
+        struct xfs_kobj         m_error_kobj;
+        struct xfs_kobj         m_error_meta_kobj;
+        struct xfs_error_cfg    m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX];
        struct xstats           m_stats;        /* per-fs stats */
        struct workqueue_struct *m_buf_workqueue;
@@ -148,6 +177,7 @@ typedef struct xfs_mount {
         */
        __uint32_t              m_generation;
+        bool                    m_fail_unmount;
 #ifdef DEBUG
        /*
         * DEBUG mode instrumentation to test and/or trigger delayed allocation
@@ -166,6 +196,7 @@ typedef struct xfs_mount {
 #define XFS_MOUNT_WSYNC         (1ULL << 0)     /* for nfs - all metadata ops
                                                   must be synchronous except
                                                   for space allocations */
+#define XFS_MOUNT_UNMOUNTING    (1ULL << 1)     /* filesystem is unmounting */
 #define XFS_MOUNT_WAS_CLEAN     (1ULL << 3)
 #define XFS_MOUNT_FS_SHUTDOWN   (1ULL << 4)     /* atomic stop of all filesystem
                                                   operations, typically for
@@ -364,4 +395,7 @@ extern void	xfs_set_low_space_thresholds(struct xfs_mount *);
 int     xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb,
                        xfs_off_t count_fsb);
+struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp,
+                int error_class, int error);
 #endif  /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 51ddaf2c2b8c..d5b756669fb5 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -308,12 +308,9 @@ xfs_fs_commit_blocks(
                        goto out_drop_iolock;
        }
-        tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+        if (error)
-        if (error) {
-                xfs_trans_cancel(tp);
                goto out_drop_iolock;
-        }
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index be125e1758c1..a60d9e2739d1 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -783,13 +783,10 @@ xfs_qm_qino_alloc(
                }
        }
-        tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE);
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create,
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create,
+                        XFS_QM_QINOCREATE_SPACE_RES(mp), 0, 0, &tp);
-                                  XFS_QM_QINOCREATE_SPACE_RES(mp), 0);
+        if (error)
-        if (error) {
-                xfs_trans_cancel(tp);
                return error;
-        }
        if (need_alloc) {
                error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip,
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index f4d0e0a8f517..475a3882a81f 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -236,10 +236,8 @@ xfs_qm_scall_trunc_qfile(
        xfs_ilock(ip, XFS_IOLOCK_EXCL);
-        tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
        if (error) {
-                xfs_trans_cancel(tp);
                xfs_iunlock(ip, XFS_IOLOCK_EXCL);
                goto out_put;
        }
@@ -436,12 +434,9 @@ xfs_qm_scall_setqlim(
        defq = xfs_get_defquota(dqp, q);
        xfs_dqunlock(dqp);
-        tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_setqlim, 0, 0, 0, &tp);
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0);
+        if (error)
-        if (error) {
-                xfs_trans_cancel(tp);
                goto out_rele;
-        }
        xfs_dqlock(dqp);
        xfs_trans_dqjoin(tp, dqp);
@@ -569,13 +564,9 @@ xfs_qm_log_quotaoff_end(
        int                     error;
        xfs_qoff_logitem_t      *qoffi;
-        tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END);
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp);
+        if (error)
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0);
-        if (error) {
-                xfs_trans_cancel(tp);
                return error;
-        }
        qoffi = xfs_trans_get_qoff_item(tp, startqoff,
                                        flags & XFS_ALL_QUOTA_ACCT);
@@ -603,12 +594,9 @@ xfs_qm_log_quotaoff(
        *qoffstartp = NULL;
-        tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp);
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0);
+        if (error)
-        if (error) {
-                xfs_trans_cancel(tp);
                goto out;
-        }
        qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
        xfs_trans_log_quotaoff_item(tp, qoffi);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index abf44435d04a..3938b37d1043 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -780,15 +780,14 @@ xfs_growfs_rt_alloc(
         * Allocate space to the file, as necessary.
         */
        while (oblocks < nblocks) {
-                tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
                resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
                /*
                 * Reserve space & log for one extent added to the file.
                 */
-                error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc,
+                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtalloc, resblks,
-                                          resblks, 0);
+                                0, 0, &tp);
                if (error)
-                        goto out_trans_cancel;
+                        return error;
                /*
                 * Lock the inode.
                 */
@@ -823,14 +822,13 @@ xfs_growfs_rt_alloc(
                for (bno = map.br_startoff, fsbno = map.br_startblock;
                     bno < map.br_startoff + map.br_blockcount;
                     bno++, fsbno++) {
-                        tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO);
                        /*
                         * Reserve log for one block zeroing.
                         */
-                        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero,
+                        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtzero,
-                                                  0, 0);
+                                        0, 0, 0, &tp);
                        if (error)
-                                goto out_trans_cancel;
+                                return error;
                        /*
                         * Lock the bitmap inode.
                         */
@@ -994,11 +992,10 @@ xfs_growfs_rt(
                /*
                 * Start a transaction, get the log reservation.
                 */
-                tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE);
+                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtfree, 0, 0, 0,
-                error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtfree,
+                                &tp);
-                                          0, 0);
                if (error)
-                        goto error_cancel;
+                        break;
                /*
                 * Lock out other callers by grabbing the bitmap inode lock.
                 */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 187e14b696c2..416421d7ff10 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -58,8 +58,7 @@
 #include <linux/parser.h>
 static const struct super_operations xfs_super_operations;
-static kmem_zone_t *xfs_ioend_zone;
+struct bio_set *xfs_ioend_bioset;
-mempool_t *xfs_ioend_pool;
 static struct kset *xfs_kset;           /* top-level xfs sysfs dir */
 #ifdef DEBUG
@@ -350,6 +349,7 @@ xfs_parseargs(
                case Opt_pqnoenforce:
                        mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
                        mp->m_qflags &= ~XFS_PQUOTA_ENFD;
+                        break;
                case Opt_gquota:
                case Opt_grpquota:
                        mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
@@ -928,7 +928,7 @@ xfs_fs_alloc_inode(
 /*
 * Now that the generic code is guaranteed not to be accessing
- * the linux inode, we can reclaim the inode.
+ * the linux inode, we can inactivate and reclaim the inode.
 */
 STATIC void
 xfs_fs_destroy_inode(
@@ -938,9 +938,14 @@ xfs_fs_destroy_inode(
        trace_xfs_destroy_inode(ip);
-        XFS_STATS_INC(ip->i_mount, vn_reclaim);
+        ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+        XFS_STATS_INC(ip->i_mount, vn_rele);
+        XFS_STATS_INC(ip->i_mount, vn_remove);
+        xfs_inactive(ip);
        ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
+        XFS_STATS_INC(ip->i_mount, vn_reclaim);
        /*
         * We should never get here with one of the reclaim flags already set.
@@ -987,24 +992,6 @@ xfs_fs_inode_init_once(
                     "xfsino", ip->i_ino);
 }
-STATIC void
-xfs_fs_evict_inode(
-        struct inode            *inode)
-{
-        xfs_inode_t             *ip = XFS_I(inode);
-        ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
-        trace_xfs_evict_inode(ip);
-        truncate_inode_pages_final(&inode->i_data);
-        clear_inode(inode);
-        XFS_STATS_INC(ip->i_mount, vn_rele);
-        XFS_STATS_INC(ip->i_mount, vn_remove);
-        xfs_inactive(ip);
-}
 /*
 * We do an unlocked check for XFS_IDONTCACHE here because we are already
 * serialised against cache hits here via the inode->i_lock and igrab() in
@@ -1276,6 +1263,16 @@ xfs_fs_remount(
                        return -EINVAL;
                }
+                if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+                    xfs_sb_has_ro_compat_feature(sbp,
+                                        XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+                        xfs_warn(mp,
+"ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem",
+                                (sbp->sb_features_ro_compat &
+                                        XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
+                        return -EINVAL;
+                }
                mp->m_flags &= ~XFS_MOUNT_RDONLY;
                /*
@@ -1663,7 +1660,6 @@ xfs_fs_free_cached_objects(
 static const struct super_operations xfs_super_operations = {
        .alloc_inode            = xfs_fs_alloc_inode,
        .destroy_inode          = xfs_fs_destroy_inode,
-        .evict_inode            = xfs_fs_evict_inode,
        .drop_inode             = xfs_fs_drop_inode,
        .put_super              = xfs_fs_put_super,
        .sync_fs                = xfs_fs_sync_fs,
@@ -1688,20 +1684,15 @@ MODULE_ALIAS_FS("xfs");
 STATIC int __init
 xfs_init_zones(void)
 {
+        xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE,
-        xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
+                        offsetof(struct xfs_ioend, io_inline_bio));
-        if (!xfs_ioend_zone)
+        if (!xfs_ioend_bioset)
                goto out;
-        xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
-                                                  xfs_ioend_zone);
-        if (!xfs_ioend_pool)
-                goto out_destroy_ioend_zone;
        xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
                                                "xfs_log_ticket");
        if (!xfs_log_ticket_zone)
-                goto out_destroy_ioend_pool;
+                goto out_free_ioend_bioset;
        xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
                                                "xfs_bmap_free_item");
@@ -1797,10 +1788,8 @@ xfs_init_zones(void)
        kmem_zone_destroy(xfs_bmap_free_item_zone);
 out_destroy_log_ticket_zone:
        kmem_zone_destroy(xfs_log_ticket_zone);
- out_destroy_ioend_pool:
+ out_free_ioend_bioset:
-        mempool_destroy(xfs_ioend_pool);
+        bioset_free(xfs_ioend_bioset);
- out_destroy_ioend_zone:
-        kmem_zone_destroy(xfs_ioend_zone);
 out:
        return -ENOMEM;
 }
@@ -1826,9 +1815,7 @@ xfs_destroy_zones(void)
        kmem_zone_destroy(xfs_btree_cur_zone);
        kmem_zone_destroy(xfs_bmap_free_item_zone);
        kmem_zone_destroy(xfs_log_ticket_zone);
-        mempool_destroy(xfs_ioend_pool);
+        bioset_free(xfs_ioend_bioset);
-        kmem_zone_destroy(xfs_ioend_zone);
 }
 STATIC int __init
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index b44284c1adda..08a46c6181fd 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -131,6 +131,8 @@ xfs_readlink(
        trace_xfs_readlink(ip);
+        ASSERT(!(ip->i_df.if_flags & XFS_IFINLINE));
        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;
@@ -150,12 +152,7 @@ xfs_readlink(
        }
-        if (ip->i_df.if_flags & XFS_IFINLINE) {
+        error = xfs_readlink_bmap(ip, link);
-                memcpy(link, ip->i_df.if_u1.if_data, pathlen);
-                link[pathlen] = '\0';
-        } else {
-                error = xfs_readlink_bmap(ip, link);
-        }
 out:
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -221,7 +218,6 @@ xfs_symlink(
        if (error)
                return error;
-        tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
        /*
         * The symlink will fit into the inode data fork?
         * There can't be any attributes so we get the whole variable part.
@@ -231,13 +227,15 @@ xfs_symlink(
        else
                fs_blocks = xfs_symlink_blocks(mp, pathlen);
        resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0);
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, resblks, 0, 0, &tp);
        if (error == -ENOSPC && fs_blocks == 0) {
                resblks = 0;
-                error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0);
+                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, 0, 0, 0,
+                                &tp);
        }
        if (error)
-                goto out_trans_cancel;
+                goto out_release_inode;
        xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
                      XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
@@ -302,19 +300,11 @@ xfs_symlink(
         * If the symlink will fit into the inode, write it inline.
         */
        if (pathlen <= XFS_IFORK_DSIZE(ip)) {
-                xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
+                xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen);
-                memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
-                ip->i_d.di_size = pathlen;
-                /*
-                 * The inode was initially created in extent format.
-                 */
-                ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
-                ip->i_df.if_flags |= XFS_IFINLINE;
+                ip->i_d.di_size = pathlen;
                ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
                xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
        } else {
                int     offset;
@@ -455,12 +445,9 @@ xfs_inactive_symlink_rmt(
         */
        ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
-        tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+        if (error)
-        if (error) {
-                xfs_trans_cancel(tp);
                return error;
-        }
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, 0);
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 6ced4f143494..4c2c55086208 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -17,10 +17,11 @@
 */
 #include "xfs.h"
-#include "xfs_sysfs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
+#include "xfs_sysfs.h"
 #include "xfs_log.h"
 #include "xfs_log_priv.h"
 #include "xfs_stats.h"
@@ -362,3 +363,291 @@ struct kobj_type xfs_log_ktype = {
        .sysfs_ops = &xfs_sysfs_ops,
        .default_attrs = xfs_log_attrs,
 };
+/*
+ * Metadata IO error configuration
+ *
+ * The sysfs structure here is:
+ *      ...xfs/<dev>/error/<class>/<errno>/<error_attrs>
+ *
+ * where <class> allows us to discriminate between data IO and metadata IO,
+ * and any other future type of IO (e.g. special inode or directory error
+ * handling) we care to support.
+ */
+/* Map a sysfs kobject back to the struct xfs_error_cfg that embeds it. */
+static inline struct xfs_error_cfg *
+to_error_cfg(struct kobject *kobject)
+{
+        /* kobject -> struct xfs_kobj wrapper -> enclosing error config */
+        return container_of(to_kobj(kobject), struct xfs_error_cfg, kobj);
+}
+static inline struct xfs_mount *
+err_to_mp(struct kobject *kobject)
+{
+        struct xfs_kobj *kobj = to_kobj(kobject);
+        return container_of(kobj, struct xfs_mount, m_error_kobj);
+}
+/*
+ * sysfs show handler for "max_retries": print the retry limit of the error
+ * config behind @kobject into @buf as a decimal number followed by a newline.
+ * Output is bounded to PAGE_SIZE; the snprintf() result is returned.
+ */
+static ssize_t
+max_retries_show(
+        struct kobject  *kobject,
+        char            *buf)
+{
+        return snprintf(buf, PAGE_SIZE, "%d\n",
+                        to_error_cfg(kobject)->max_retries);
+}
+/*
+ * sysfs store handler for "max_retries": parse @buf as an integer (base
+ * auto-detected by kstrtoint()) and record it in the error config behind
+ * @kobject.
+ *
+ * Returns @count on success, the kstrtoint() error if @buf does not parse,
+ * or -EINVAL if the value is below -1.
+ */
+static ssize_t
+max_retries_store(
+        struct kobject  *kobject,
+        const char      *buf,
+        size_t          count)
+{
+        int             limit;
+        int             error;
+        error = kstrtoint(buf, 0, &limit);
+        if (error)
+                return error;
+        /* -1 is the smallest value accepted */
+        if (limit < -1)
+                return -EINVAL;
+        to_error_cfg(kobject)->max_retries = limit;
+        return count;
+}
+XFS_SYSFS_ATTR_RW(max_retries);
+/*
+ * sysfs show handler for "retry_timeout_seconds": the timeout is kept in
+ * jiffies in the error config, so convert it to whole seconds before
+ * printing it into @buf. Output is bounded to PAGE_SIZE.
+ */
+static ssize_t
+retry_timeout_seconds_show(
+        struct kobject  *kobject,
+        char            *buf)
+{
+        return snprintf(buf, PAGE_SIZE, "%ld\n",
+                        jiffies_to_msecs(to_error_cfg(kobject)->retry_timeout) /
+                                MSEC_PER_SEC);
+}
+/*
+ * sysfs store handler for "retry_timeout_seconds": parse @buf as a number of
+ * seconds and store it, converted to jiffies, in the error config behind
+ * @kobject.
+ *
+ * Returns @count on success, the kstrtoint() error if @buf does not parse,
+ * or -EINVAL if the value lies outside 0..86400.
+ */
+static ssize_t
+retry_timeout_seconds_store(
+        struct kobject  *kobject,
+        const char      *buf,
+        size_t          count)
+{
+        int             secs;
+        int             error;
+        error = kstrtoint(buf, 0, &secs);
+        if (error)
+                return error;
+        /* 1 day timeout maximum */
+        if (secs < 0 || secs > 86400)
+                return -EINVAL;
+        to_error_cfg(kobject)->retry_timeout =
+                        msecs_to_jiffies(secs * MSEC_PER_SEC);
+        return count;
+}
+XFS_SYSFS_ATTR_RW(retry_timeout_seconds);
+/*
+ * sysfs show handler for "fail_at_unmount": print the mount's m_fail_unmount
+ * setting into @buf as a decimal number followed by a newline.
+ */
+static ssize_t
+fail_at_unmount_show(
+        struct kobject  *kobject,
+        char            *buf)
+{
+        return snprintf(buf, PAGE_SIZE, "%d\n",
+                        err_to_mp(kobject)->m_fail_unmount);
+}
+/*
+ * sysfs store handler for "fail_at_unmount": parse @buf as an integer and
+ * store it in the mount's m_fail_unmount.
+ *
+ * Returns @count on success, the kstrtoint() error if @buf does not parse,
+ * or -EINVAL for any value other than 0 or 1.
+ */
+static ssize_t
+fail_at_unmount_store(
+        struct kobject  *kobject,
+        const char      *buf,
+        size_t          count)
+{
+        int             enable;
+        int             error;
+        error = kstrtoint(buf, 0, &enable);
+        if (error)
+                return error;
+        /* strictly boolean: only 0 and 1 are accepted */
+        if (enable != 0 && enable != 1)
+                return -EINVAL;
+        err_to_mp(kobject)->m_fail_unmount = enable;
+        return count;
+}
+XFS_SYSFS_ATTR_RW(fail_at_unmount);
+static struct attribute *xfs_error_attrs[] = {  /* for xfs_error_cfg_ktype */
+        ATTR_LIST(max_retries),
+        ATTR_LIST(retry_timeout_seconds),
+        NULL,                                   /* list terminator */
+};
+struct kobj_type xfs_error_cfg_ktype = {        /* per-errno config kobjects */
+        .release = xfs_sysfs_release,
+        .sysfs_ops = &xfs_sysfs_ops,
+        .default_attrs = xfs_error_attrs,       /* max_retries + timeout */
+};
+struct kobj_type xfs_error_ktype = {    /* "error" and per-class kobjects */
+        .release = xfs_sysfs_release,
+        .sysfs_ops = &xfs_sysfs_ops,    /* note: no default_attrs set here */
+};
+/*
+ * Error initialization tables. These need to be ordered in the same
+ * order as the enums used to index the array. All class init tables need to
+ * define a "default" behaviour as the first entry, all other entries can be
+ * empty.
+ *
+ * Each struct xfs_error_init entry supplies the name and the initial
+ * max_retries/retry_timeout values that xfs_error_sysfs_init_class() uses
+ * when it sets up the matching struct xfs_error_cfg.
+ */
+struct xfs_error_init {
+        char            *name;          /* name passed to xfs_sysfs_init() */
+        int             max_retries;    /* initial cfg->max_retries */
+        int             retry_timeout;  /* in seconds */
+};
+/*
+ * Initial retry behaviour for the metadata IO error class.
+ *
+ * Every entry is indexed explicitly by its XFS_ERR_* slot (the same indices
+ * xfs_error_get_cfg() uses) so the table cannot silently fall out of step
+ * with the index ordering. Slots that are not listed are zero-filled.
+ */
+static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = {
+        [XFS_ERR_DEFAULT] = {
+                .name = "default",
+                .max_retries = XFS_ERR_RETRY_FOREVER,
+                .retry_timeout = 0,
+        },
+        [XFS_ERR_EIO] = {
+                .name = "EIO",
+                .max_retries = XFS_ERR_RETRY_FOREVER,
+                .retry_timeout = 0,
+        },
+        [XFS_ERR_ENOSPC] = {
+                .name = "ENOSPC",
+                .max_retries = XFS_ERR_RETRY_FOREVER,
+                .retry_timeout = 0,
+        },
+        [XFS_ERR_ENODEV] = {
+                .name = "ENODEV",
+                .max_retries = 0,       /* the only slot with a zero limit */
+                .retry_timeout = 0,     /* was implicit in the original */
+        },
+};
+/*
+ * Set up the sysfs kobjects for one error class.
+ *
+ * The class kobject @parent_kobj is created under mp->m_error_kobj with the
+ * name @parent_name. One child kobject per errno slot is then created below
+ * it, named after init[i].name, and the slot's max_retries/retry_timeout are
+ * seeded from @init (the timeout is converted from seconds to jiffies).
+ * @class selects which row of mp->m_error_cfg is populated.
+ *
+ * Returns 0 on success. On failure every kobject created by this call is
+ * deleted again and the xfs_sysfs_init() error is returned.
+ */
+static int
+xfs_error_sysfs_init_class(
+        struct xfs_mount        *mp,
+        int                     class,
+        const char              *parent_name,
+        struct xfs_kobj         *parent_kobj,
+        const struct xfs_error_init init[])
+{
+        struct xfs_error_cfg    *cfg;
+        int                     error;
+        int                     i;
+        ASSERT(class < XFS_ERR_CLASS_MAX);
+        error = xfs_sysfs_init(parent_kobj, &xfs_error_ktype,
+                                &mp->m_error_kobj, parent_name);
+        if (error)
+                return error;
+        for (i = 0; i < XFS_ERR_ERRNO_MAX; i++) {
+                cfg = &mp->m_error_cfg[class][i];
+                error = xfs_sysfs_init(&cfg->kobj, &xfs_error_cfg_ktype,
+                                        parent_kobj, init[i].name);
+                if (error)
+                        break;
+                cfg->max_retries = init[i].max_retries;
+                cfg->retry_timeout = msecs_to_jiffies(
+                                        init[i].retry_timeout * MSEC_PER_SEC);
+        }
+        if (!error)
+                return 0;
+        /* slot i failed: unwind slots i-1 .. 0, then the class kobject */
+        while (i-- > 0)
+                xfs_sysfs_del(&mp->m_error_cfg[class][i].kobj);
+        xfs_sysfs_del(parent_kobj);
+        return error;
+}
+/*
+ * Create the per-mount error configuration sysfs hierarchy: the "error"
+ * kobject under mp->m_kobj, its fail_at_unmount attribute file, and the
+ * "metadata" error class below it.
+ *
+ * Returns 0 on success. If anything after the "error" kobject fails, that
+ * kobject is deleted again and the error is returned.
+ */
+int
+xfs_error_sysfs_init(
+        struct xfs_mount        *mp)
+{
+        int                     error;
+        /* .../xfs/<dev>/error/ */
+        error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype,
+                                &mp->m_kobj, "error");
+        if (error)
+                return error;
+        error = sysfs_create_file(&mp->m_error_kobj.kobject,
+                                  ATTR_LIST(fail_at_unmount));
+        /* .../xfs/<dev>/error/metadata/ */
+        if (!error)
+                error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA,
+                                        "metadata", &mp->m_error_meta_kobj,
+                                        xfs_error_meta_init);
+        if (error)
+                xfs_sysfs_del(&mp->m_error_kobj);
+        return error;
+}
+/*
+ * Tear down the error configuration sysfs hierarchy of @mp: first every
+ * per-errno kobject of every class, then the metadata class kobject, and
+ * finally the top-level error kobject.
+ */
+void
+xfs_error_sysfs_del(
+        struct xfs_mount        *mp)
+{
+        int                     cls;
+        int                     slot;
+        for (cls = 0; cls < XFS_ERR_CLASS_MAX; cls++)
+                for (slot = 0; slot < XFS_ERR_ERRNO_MAX; slot++)
+                        xfs_sysfs_del(&mp->m_error_cfg[cls][slot].kobj);
+        xfs_sysfs_del(&mp->m_error_meta_kobj);
+        xfs_sysfs_del(&mp->m_error_kobj);
+}
+/*
+ * Look up the error configuration that applies to @error within
+ * @error_class for this mount.
+ *
+ * @error may be passed as either a positive or a negative errno. Kernel code
+ * conventionally carries negative errnos (e.g. -EIO), so the sign is
+ * normalised before matching; otherwise a negative value would never match a
+ * case label and the lookup would always return the default entry.
+ *
+ * Errnos without a dedicated slot resolve to the XFS_ERR_DEFAULT entry.
+ */
+struct xfs_error_cfg *
+xfs_error_get_cfg(
+        struct xfs_mount        *mp,
+        int                     error_class,
+        int                     error)
+{
+        struct xfs_error_cfg    *cfg;
+        /* accept -EIO as well as EIO */
+        if (error < 0)
+                error = -error;
+        switch (error) {
+        case EIO:
+                cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO];
+                break;
+        case ENOSPC:
+                cfg = &mp->m_error_cfg[error_class][XFS_ERR_ENOSPC];
+                break;
+        case ENODEV:
+                cfg = &mp->m_error_cfg[error_class][XFS_ERR_ENODEV];
+                break;
+        default:
+                cfg = &mp->m_error_cfg[error_class][XFS_ERR_DEFAULT];
+                break;
+        }
+        return cfg;
+}
diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h
index be692e59938d..d04637181ef2 100644
--- a/fs/xfs/xfs_sysfs.h
+++ b/fs/xfs/xfs_sysfs.h
@@ -58,4 +58,7 @@ xfs_sysfs_del(
        wait_for_completion(&kobj->complete);
 }
+int     xfs_error_sysfs_init(struct xfs_mount *mp);
+void    xfs_error_sysfs_del(struct xfs_mount *mp);
 #endif  /* __XFS_SYSFS_H__ */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index c8d58426008e..ea94ee0fe5ea 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -364,7 +364,6 @@ DEFINE_BUF_EVENT(xfs_buf_delwri_split);
 DEFINE_BUF_EVENT(xfs_buf_get_uncached);
 DEFINE_BUF_EVENT(xfs_bdstrat_shut);
 DEFINE_BUF_EVENT(xfs_buf_item_relse);
-DEFINE_BUF_EVENT(xfs_buf_item_iodone);
 DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
 DEFINE_BUF_EVENT(xfs_buf_error_relse);
 DEFINE_BUF_EVENT(xfs_buf_wait_buftarg);
@@ -944,7 +943,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
        TP_ARGS(log, tic),
        TP_STRUCT__entry(
                __field(dev_t, dev)
-                __field(unsigned, trans_type)
                __field(char, ocnt)
                __field(char, cnt)
                __field(int, curr_res)
@@ -962,7 +960,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
        ),
        TP_fast_assign(
                __entry->dev = log->l_mp->m_super->s_dev;
-                __entry->trans_type = tic->t_trans_type;
                __entry->ocnt = tic->t_ocnt;
                __entry->cnt = tic->t_cnt;
                __entry->curr_res = tic->t_curr_res;
@@ -980,14 +977,13 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
                __entry->curr_block = log->l_curr_block;
                __entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
        ),
-        TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
+        TP_printk("dev %d:%d t_ocnt %u t_cnt %u t_curr_res %u "
                  "t_unit_res %u t_flags %s reserveq %s "
                  "writeq %s grant_reserve_cycle %d "
                  "grant_reserve_bytes %d grant_write_cycle %d "
                  "grant_write_bytes %d curr_cycle %d curr_block %d "
                  "tail_cycle %d tail_block %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
-                  __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES),
                  __entry->ocnt,
                  __entry->cnt,
                  __entry->curr_res,
@@ -1053,19 +1049,21 @@ DECLARE_EVENT_CLASS(xfs_log_item_class,
 )
 TRACE_EVENT(xfs_log_force,
-        TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn),
+        TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn, unsigned long caller_ip),
-        TP_ARGS(mp, lsn),
+        TP_ARGS(mp, lsn, caller_ip),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(xfs_lsn_t, lsn)
+                __field(unsigned long, caller_ip)
        ),
        TP_fast_assign(
                __entry->dev = mp->m_super->s_dev;
                __entry->lsn = lsn;
+                __entry->caller_ip = caller_ip;
        ),
-        TP_printk("dev %d:%d lsn 0x%llx",
+        TP_printk("dev %d:%d lsn 0x%llx caller %ps",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
-                  __entry->lsn)
+                  __entry->lsn, (void *)__entry->caller_ip)
 )
 #define DEFINE_LOG_ITEM_EVENT(name) \
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 20c53666cb4b..5f3d33d16e67 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -47,47 +47,6 @@ xfs_trans_init(
 }
 /*
- * This routine is called to allocate a transaction structure.
- * The type parameter indicates the type of the transaction.  These
- * are enumerated in xfs_trans.h.
- *
- * Dynamically allocate the transaction structure from the transaction
- * zone, initialize it, and return it to the caller.
- */
-xfs_trans_t *
-xfs_trans_alloc(
-        xfs_mount_t     *mp,
-        uint            type)
-{
-        xfs_trans_t     *tp;
-        sb_start_intwrite(mp->m_super);
-        tp = _xfs_trans_alloc(mp, type, KM_SLEEP);
-        tp->t_flags |= XFS_TRANS_FREEZE_PROT;
-        return tp;
-}
-xfs_trans_t *
-_xfs_trans_alloc(
-        xfs_mount_t     *mp,
-        uint            type,
-        xfs_km_flags_t  memflags)
-{
-        xfs_trans_t     *tp;
-        WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
-        atomic_inc(&mp->m_active_trans);
-        tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
-        tp->t_magic = XFS_TRANS_HEADER_MAGIC;
-        tp->t_type = type;
-        tp->t_mountp = mp;
-        INIT_LIST_HEAD(&tp->t_items);
-        INIT_LIST_HEAD(&tp->t_busy);
-        return tp;
-}
-/*
 * Free the transaction structure.  If there is more clean up
 * to do when the structure is freed, add it here.
 */
@@ -99,7 +58,7 @@ xfs_trans_free(
        xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
        atomic_dec(&tp->t_mountp->m_active_trans);
-        if (tp->t_flags & XFS_TRANS_FREEZE_PROT)
+        if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
                sb_end_intwrite(tp->t_mountp->m_super);
        xfs_trans_free_dqinfo(tp);
        kmem_zone_free(xfs_trans_zone, tp);
@@ -125,7 +84,6 @@ xfs_trans_dup(
         * Initialize the new transaction structure.
         */
        ntp->t_magic = XFS_TRANS_HEADER_MAGIC;
-        ntp->t_type = tp->t_type;
        ntp->t_mountp = tp->t_mountp;
        INIT_LIST_HEAD(&ntp->t_items);
        INIT_LIST_HEAD(&ntp->t_busy);
@@ -135,9 +93,9 @@ xfs_trans_dup(
        ntp->t_flags = XFS_TRANS_PERM_LOG_RES |
                       (tp->t_flags & XFS_TRANS_RESERVE) |
-                       (tp->t_flags & XFS_TRANS_FREEZE_PROT);
+                       (tp->t_flags & XFS_TRANS_NO_WRITECOUNT);
        /* We gave our writer reference to the new transaction */
-        tp->t_flags &= ~XFS_TRANS_FREEZE_PROT;
+        tp->t_flags |= XFS_TRANS_NO_WRITECOUNT;
        ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);
        ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;
        tp->t_blk_res = tp->t_blk_res_used;
@@ -165,7 +123,7 @@ xfs_trans_dup(
 * This does not do quota reservations. That typically is done by the
 * caller afterwards.
 */
-int
+static int
 xfs_trans_reserve(
        struct xfs_trans        *tp,
        struct xfs_trans_res    *resp,
@@ -219,7 +177,7 @@ xfs_trans_reserve(
                                                resp->tr_logres,
                                                resp->tr_logcount,
                                                &tp->t_ticket, XFS_TRANSACTION,
-                                                permanent, tp->t_type);
+                                                permanent);
                }
                if (error)
@@ -268,6 +226,42 @@ undo_blocks:
        return error;
 }
+/*
+ * Allocate a transaction for @mp and make its reservation in one step.
+ *
+ * Unless XFS_TRANS_NO_WRITECOUNT is set in @flags, sb_start_intwrite() is
+ * called on the superblock first. The transaction is allocated with KM_NOFS
+ * when XFS_TRANS_NOFS is set and KM_SLEEP otherwise, initialised with
+ * @flags, and then reserved through xfs_trans_reserve() using @resp,
+ * @blocks and @rtextents.
+ *
+ * Returns 0 and stores the new transaction in *@tpp on success. If the
+ * reservation fails the transaction is cancelled, *@tpp is left untouched
+ * and the reservation error is returned.
+ */
+int
+xfs_trans_alloc(
+        struct xfs_mount        *mp,
+        struct xfs_trans_res    *resp,
+        uint                    blocks,
+        uint                    rtextents,
+        uint                    flags,
+        struct xfs_trans        **tpp)
+{
+        struct xfs_trans        *tp;
+        int                     error;
+        if (!(flags & XFS_TRANS_NO_WRITECOUNT))
+                sb_start_intwrite(mp->m_super);
+        WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
+        atomic_inc(&mp->m_active_trans);
+        if (flags & XFS_TRANS_NOFS)
+                tp = kmem_zone_zalloc(xfs_trans_zone, KM_NOFS);
+        else
+                tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
+        tp->t_magic = XFS_TRANS_HEADER_MAGIC;
+        tp->t_mountp = mp;
+        tp->t_flags = flags;
+        INIT_LIST_HEAD(&tp->t_busy);
+        INIT_LIST_HEAD(&tp->t_items);
+        error = xfs_trans_reserve(tp, resp, blocks, rtextents);
+        if (!error) {
+                *tpp = tp;
+                return 0;
+        }
+        /* reservation failed: cancel the half-built transaction */
+        xfs_trans_cancel(tp);
+        return error;
+}
 /*
 * Record the indicated change to the given field for application
 * to the file system's superblock when the transaction commits.
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index e7c49cf43fbc..9a462e892e4f 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -90,7 +90,6 @@ void	xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
 */
 typedef struct xfs_trans {
        unsigned int            t_magic;        /* magic number */
-        unsigned int            t_type;         /* transaction type */
        unsigned int            t_log_res;      /* amt of log space resvd */
        unsigned int            t_log_count;    /* count for perm log res */
        unsigned int            t_blk_res;      /* # of blocks resvd */
@@ -148,10 +147,9 @@ typedef struct xfs_trans {
 /*
 * XFS transaction mechanism exported interfaces.
 */
-xfs_trans_t     *xfs_trans_alloc(struct xfs_mount *, uint);
+int             xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp,
-xfs_trans_t     *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t);
+                        uint blocks, uint rtextents, uint flags,
-int             xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *,
+                        struct xfs_trans **tpp);
-                                  uint, uint);
 void            xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
 struct xfs_buf  *xfs_trans_get_buf_map(struct xfs_trans *tp,
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index d111f691f313..ec58ff094b1d 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -146,7 +146,7 @@ __xfs_xattr_put_listent(
        arraytop = context->count + prefix_len + namelen + 1;
        if (arraytop > context->firstu) {
                context->count = -1;    /* insufficient space */
-                return 1;
+                return 0;
        }
        offset = (char *)context->alist + context->count;
        strncpy(offset, prefix, prefix_len);
@@ -166,8 +166,7 @@ xfs_xattr_put_listent(
        int             flags,
        unsigned char   *name,
        int             namelen,
-        int             valuelen,
+        int             valuelen)
-        unsigned char   *value)
 {
        char *prefix;
        int prefix_len;
@@ -221,11 +220,15 @@ xfs_xattr_put_listent(
 }
 ssize_t
-xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
+xfs_vn_listxattr(
+        struct dentry   *dentry,
+        char            *data,
+        size_t          size)
 {
        struct xfs_attr_list_context context;
        struct attrlist_cursor_kern cursor = { 0 };
-        struct inode            *inode = d_inode(dentry);
+        struct inode    *inode = d_inode(dentry);
+        int             error;
        /*
         * First read the regular on-disk attributes.
@@ -239,7 +242,9 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
        context.firstu = context.bufsize;
        context.put_listent = xfs_xattr_put_listent;
-        xfs_attr_list_int(&context);
+        error = xfs_attr_list_int(&context);
+        if (error)
+                return error;
        if (context.count < 0)
                return -ERANGE;
author	Linus Torvalds <torvalds@linux-foundation.org>	2016-05-26 13:13:40 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-05-26 13:13:40 -0400
commit	0b9210c9c86e46a7a62bbc7b69b84001315072ff (patch)
tree	0a0872c6b998c6fa3de29f1929be025f6060e749
parent	c5436731de860b3a3cff70c62d99242418aab1d1 (diff)
parent	555b67e4e729ca544bb4028ab12e532c68b70ddb (diff)