18 files changed, 736 insertions, 798 deletions
diff --git a/Documentation/filesystems/xfs-delayed-logging-design.txt b/Documentation/filesystems/xfs-delayed-logging-design.txt
index d8119e9d2d60..96d0df28bed3 100644
--- a/Documentation/filesystems/xfs-delayed-logging-design.txt
+++ b/Documentation/filesystems/xfs-delayed-logging-design.txt
@@ -794,11 +794,6 @@ designed.
 Roadmap:
-2.6.35 Inclusion in mainline as an experimental mount option
-        => approximately 2-3 months to merge window
-        => needs to be in xfs-dev tree in 4-6 weeks
-        => code is nearing readiness for review
 2.6.37 Remove experimental tag from mount option
        => should be roughly 6 months after initial merge
        => enough time to:
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 089eaca860b4..a0fa3bf0d1bb 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1333,6 +1333,21 @@ xfs_vm_writepage(
        trace_xfs_writepage(inode, page, 0);
        /*
+         * Refuse to write the page out if we are called from reclaim context.
+         *
+         * This is primarily to avoid stack overflows when called from deep
+         * used stacks in random callers for direct reclaim, but disabling
+         * reclaim for kswapd is a nice side-effect as kswapd causes rather
+         * suboptimal I/O patterns, too.
+         *
+         * This should really be done by the core VM, but until that happens
+         * filesystems like XFS, btrfs and ext4 have to take care of this
+         * by themselves.
+         */
+        if (current->flags & PF_MEMALLOC)
+                goto out_fail;
+        /*
         * We need a transaction if:
         *  1. There are delalloc buffers on the page
         *  2. The page is uptodate and we have unmapped buffers
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 9c8019c78c92..44f0b2de153e 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -585,11 +585,20 @@ xfs_vn_fallocate(
        bf.l_len = len;
        xfs_ilock(ip, XFS_IOLOCK_EXCL);
+        /* check the new inode size is valid before allocating */
+        if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+            offset + len > i_size_read(inode)) {
+                new_size = offset + len;
+                error = inode_newsize_ok(inode, new_size);
+                if (error)
+                        goto out_unlock;
+        }
        error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
                                       0, XFS_ATTR_NOLOCK);
-        if (!error && !(mode & FALLOC_FL_KEEP_SIZE) &&
+        if (error)
-            offset + len > i_size_read(inode))
+                goto out_unlock;
-                new_size = offset + len;
        /* Change file size if needed */
        if (new_size) {
@@ -600,6 +609,7 @@ xfs_vn_fallocate(
                error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
        }
+out_unlock:
        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 out_error:
        return error;
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 9ac8aea91529..067cafbfc635 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -23,7 +23,6 @@
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_quota.h"
-#include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 3884e20bc14e..ef7f0218bccb 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -164,10 +164,6 @@ xfs_inode_ag_iterator(
                struct xfs_perag        *pag;
                pag = xfs_perag_get(mp, ag);
-                if (!pag->pag_ici_init) {
-                        xfs_perag_put(pag);
-                        continue;
-                }
                error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
                                                exclusive, &nr);
                xfs_perag_put(pag);
@@ -867,12 +863,7 @@ xfs_reclaim_inode_shrink(
        down_read(&xfs_mount_list_lock);
        list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
                for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
                        pag = xfs_perag_get(mp, ag);
-                        if (!pag->pag_ici_init) {
-                                xfs_perag_put(pag);
-                                continue;
-                        }
                        reclaimable += pag->pag_ici_reclaimable;
                        xfs_perag_put(pag);
                }
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
index 207fa77f63ae..d12be8470cba 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -50,7 +50,6 @@
 #include "quota/xfs_dquot_item.h"
 #include "quota/xfs_dquot.h"
 #include "xfs_log_recover.h"
-#include "xfs_buf_item.h"
 #include "xfs_inode_item.h"
 /*
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index ff6bc797baf2..73d5aa117384 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -82,33 +82,6 @@ DECLARE_EVENT_CLASS(xfs_attr_list_class,
        )
 )
-#define DEFINE_PERAG_REF_EVENT(name) \
-TRACE_EVENT(name, \
-        TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
-                 unsigned long caller_ip), \
-        TP_ARGS(mp, agno, refcount, caller_ip), \
-        TP_STRUCT__entry( \
-                __field(dev_t, dev) \
-                __field(xfs_agnumber_t, agno) \
-                __field(int, refcount) \
-                __field(unsigned long, caller_ip) \
-        ), \
-        TP_fast_assign( \
-                __entry->dev = mp->m_super->s_dev; \
-                __entry->agno = agno; \
-                __entry->refcount = refcount; \
-                __entry->caller_ip = caller_ip; \
-        ), \
-        TP_printk("dev %d:%d agno %u refcount %d caller %pf", \
-                  MAJOR(__entry->dev), MINOR(__entry->dev), \
-                  __entry->agno, \
-                  __entry->refcount, \
-                  (char *)__entry->caller_ip) \
-);
-DEFINE_PERAG_REF_EVENT(xfs_perag_get)
-DEFINE_PERAG_REF_EVENT(xfs_perag_put)
 #define DEFINE_ATTR_LIST_EVENT(name) \
 DEFINE_EVENT(xfs_attr_list_class, name, \
        TP_PROTO(struct xfs_attr_list_context *ctx), \
@@ -122,6 +95,37 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
 DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
 DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
+DECLARE_EVENT_CLASS(xfs_perag_class,
+        TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
+                 unsigned long caller_ip),
+        TP_ARGS(mp, agno, refcount, caller_ip),
+        TP_STRUCT__entry(
+                __field(dev_t, dev)
+                __field(xfs_agnumber_t, agno)
+                __field(int, refcount)
+                __field(unsigned long, caller_ip)
+        ),
+        TP_fast_assign(
+                __entry->dev = mp->m_super->s_dev;
+                __entry->agno = agno;
+                __entry->refcount = refcount;
+                __entry->caller_ip = caller_ip;
+        ),
+        TP_printk("dev %d:%d agno %u refcount %d caller %pf",
+                  MAJOR(__entry->dev), MINOR(__entry->dev),
+                  __entry->agno,
+                  __entry->refcount,
+                  (char *)__entry->caller_ip)
+);
+#define DEFINE_PERAG_REF_EVENT(name)    \
+DEFINE_EVENT(xfs_perag_class, name,     \
+        TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,       \
+                 unsigned long caller_ip),                                      \
+        TP_ARGS(mp, agno, refcount, caller_ip))
+DEFINE_PERAG_REF_EVENT(xfs_perag_get);
+DEFINE_PERAG_REF_EVENT(xfs_perag_put);
 TRACE_EVENT(xfs_attr_list_node_descend,
        TP_PROTO(struct xfs_attr_list_context *ctx,
                 struct xfs_da_node_entry *btree),
@@ -775,165 +779,181 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
-#define DEFINE_RW_EVENT(name) \
+DECLARE_EVENT_CLASS(xfs_file_class,
-TRACE_EVENT(name, \
+        TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
-        TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
+        TP_ARGS(ip, count, offset, flags),
-        TP_ARGS(ip, count, offset, flags), \
+        TP_STRUCT__entry(
-        TP_STRUCT__entry( \
+                __field(dev_t, dev)
-                __field(dev_t, dev) \
+                __field(xfs_ino_t, ino)
-                __field(xfs_ino_t, ino) \
+                __field(xfs_fsize_t, size)
-                __field(xfs_fsize_t, size) \
+                __field(xfs_fsize_t, new_size)
-                __field(xfs_fsize_t, new_size) \
+                __field(loff_t, offset)
-                __field(loff_t, offset) \
+                __field(size_t, count)
-                __field(size_t, count) \
+                __field(int, flags)
-                __field(int, flags) \
+        ),
-        ), \
+        TP_fast_assign(
-        TP_fast_assign( \
+                __entry->dev = VFS_I(ip)->i_sb->s_dev;
-                __entry->dev = VFS_I(ip)->i_sb->s_dev; \
+                __entry->ino = ip->i_ino;
-                __entry->ino = ip->i_ino; \
+                __entry->size = ip->i_d.di_size;
-                __entry->size = ip->i_d.di_size; \
+                __entry->new_size = ip->i_new_size;
-                __entry->new_size = ip->i_new_size; \
+                __entry->offset = offset;
-                __entry->offset = offset; \
+                __entry->count = count;
-                __entry->count = count; \
+                __entry->flags = flags;
-                __entry->flags = flags; \
+        ),
-        ), \
+        TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
-        TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
+                  "offset 0x%llx count 0x%zx ioflags %s",
-                  "offset 0x%llx count 0x%zx ioflags %s", \
+                  MAJOR(__entry->dev), MINOR(__entry->dev),
-                  MAJOR(__entry->dev), MINOR(__entry->dev), \
+                  __entry->ino,
-                  __entry->ino, \
+                  __entry->size,
-                  __entry->size, \
+                  __entry->new_size,
-                  __entry->new_size, \
+                  __entry->offset,
-                  __entry->offset, \
+                  __entry->count,
-                  __entry->count, \
+                  __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
-                  __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) \
 )
+#define DEFINE_RW_EVENT(name)           \
+DEFINE_EVENT(xfs_file_class, name,      \
+        TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
+        TP_ARGS(ip, count, offset, flags))
 DEFINE_RW_EVENT(xfs_file_read);
 DEFINE_RW_EVENT(xfs_file_buffered_write);
 DEFINE_RW_EVENT(xfs_file_direct_write);
 DEFINE_RW_EVENT(xfs_file_splice_read);
 DEFINE_RW_EVENT(xfs_file_splice_write);
+DECLARE_EVENT_CLASS(xfs_page_class,
-#define DEFINE_PAGE_EVENT(name) \
+        TP_PROTO(struct inode *inode, struct page *page, unsigned long off),
-TRACE_EVENT(name, \
+        TP_ARGS(inode, page, off),
-        TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \
+        TP_STRUCT__entry(
-        TP_ARGS(inode, page, off), \
+                __field(dev_t, dev)
-        TP_STRUCT__entry( \
+                __field(xfs_ino_t, ino)
-                __field(dev_t, dev) \
+                __field(pgoff_t, pgoff)
-                __field(xfs_ino_t, ino) \
+                __field(loff_t, size)
-                __field(pgoff_t, pgoff) \
+                __field(unsigned long, offset)
-                __field(loff_t, size) \
+                __field(int, delalloc)
-                __field(unsigned long, offset) \
+                __field(int, unmapped)
-                __field(int, delalloc) \
+                __field(int, unwritten)
-                __field(int, unmapped) \
+        ),
-                __field(int, unwritten) \
+        TP_fast_assign(
-        ), \
+                int delalloc = -1, unmapped = -1, unwritten = -1;
-        TP_fast_assign( \
-                int delalloc = -1, unmapped = -1, unwritten = -1; \
+                if (page_has_buffers(page))
-        \
+                        xfs_count_page_state(page, &delalloc,
-                if (page_has_buffers(page)) \
+                                             &unmapped, &unwritten);
-                        xfs_count_page_state(page, &delalloc, \
+                __entry->dev = inode->i_sb->s_dev;
-                                             &unmapped, &unwritten); \
+                __entry->ino = XFS_I(inode)->i_ino;
-                __entry->dev = inode->i_sb->s_dev; \
+                __entry->pgoff = page_offset(page);
-                __entry->ino = XFS_I(inode)->i_ino; \
+                __entry->size = i_size_read(inode);
-                __entry->pgoff = page_offset(page); \
+                __entry->offset = off;
-                __entry->size = i_size_read(inode); \
+                __entry->delalloc = delalloc;
-                __entry->offset = off; \
+                __entry->unmapped = unmapped;
-                __entry->delalloc = delalloc; \
+                __entry->unwritten = unwritten;
-                __entry->unmapped = unmapped; \
+        ),
-                __entry->unwritten = unwritten; \
+        TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
-        ), \
+                  "delalloc %d unmapped %d unwritten %d",
-        TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " \
+                  MAJOR(__entry->dev), MINOR(__entry->dev),
-                  "delalloc %d unmapped %d unwritten %d", \
+                  __entry->ino,
-                  MAJOR(__entry->dev), MINOR(__entry->dev), \
+                  __entry->pgoff,
-                  __entry->ino, \
+                  __entry->size,
-                  __entry->pgoff, \
+                  __entry->offset,
-                  __entry->size, \
+                  __entry->delalloc,
-                  __entry->offset, \
+                  __entry->unmapped,
-                  __entry->delalloc, \
+                  __entry->unwritten)
-                  __entry->unmapped, \
-                  __entry->unwritten) \
 )
+#define DEFINE_PAGE_EVENT(name)         \
+DEFINE_EVENT(xfs_page_class, name,      \
+        TP_PROTO(struct inode *inode, struct page *page, unsigned long off),    \
+        TP_ARGS(inode, page, off))
 DEFINE_PAGE_EVENT(xfs_writepage);
 DEFINE_PAGE_EVENT(xfs_releasepage);
 DEFINE_PAGE_EVENT(xfs_invalidatepage);
-#define DEFINE_IOMAP_EVENT(name) \
+DECLARE_EVENT_CLASS(xfs_iomap_class,
-TRACE_EVENT(name, \
+        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
-        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
+                 int flags, struct xfs_bmbt_irec *irec),
-                 int flags, struct xfs_bmbt_irec *irec), \
+        TP_ARGS(ip, offset, count, flags, irec),
-        TP_ARGS(ip, offset, count, flags, irec), \
+        TP_STRUCT__entry(
-        TP_STRUCT__entry( \
+                __field(dev_t, dev)
-                __field(dev_t, dev) \
+                __field(xfs_ino_t, ino)
-                __field(xfs_ino_t, ino) \
+                __field(loff_t, size)
-                __field(loff_t, size) \
+                __field(loff_t, new_size)
-                __field(loff_t, new_size) \
+                __field(loff_t, offset)
-                __field(loff_t, offset) \
+                __field(size_t, count)
-                __field(size_t, count) \
+                __field(int, flags)
-                __field(int, flags) \
+                __field(xfs_fileoff_t, startoff)
-                __field(xfs_fileoff_t, startoff) \
+                __field(xfs_fsblock_t, startblock)
-                __field(xfs_fsblock_t, startblock) \
+                __field(xfs_filblks_t, blockcount)
-                __field(xfs_filblks_t, blockcount) \
+        ),
-        ), \
+        TP_fast_assign(
-        TP_fast_assign( \
+                __entry->dev = VFS_I(ip)->i_sb->s_dev;
-                __entry->dev = VFS_I(ip)->i_sb->s_dev; \
+                __entry->ino = ip->i_ino;
-                __entry->ino = ip->i_ino; \
+                __entry->size = ip->i_d.di_size;
-                __entry->size = ip->i_d.di_size; \
+                __entry->new_size = ip->i_new_size;
-                __entry->new_size = ip->i_new_size; \
+                __entry->offset = offset;
-                __entry->offset = offset; \
+                __entry->count = count;
-                __entry->count = count; \
+                __entry->flags = flags;
-                __entry->flags = flags; \
+                __entry->startoff = irec ? irec->br_startoff : 0;
-                __entry->startoff = irec ? irec->br_startoff : 0; \
+                __entry->startblock = irec ? irec->br_startblock : 0;
-                __entry->startblock = irec ? irec->br_startblock : 0; \
+                __entry->blockcount = irec ? irec->br_blockcount : 0;
-                __entry->blockcount = irec ? irec->br_blockcount : 0; \
+        ),
-        ), \
+        TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
-        TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
+                  "offset 0x%llx count %zd flags %s "
-                  "offset 0x%llx count %zd flags %s " \
+                  "startoff 0x%llx startblock %lld blockcount 0x%llx",
-                  "startoff 0x%llx startblock %lld blockcount 0x%llx", \
+                  MAJOR(__entry->dev), MINOR(__entry->dev),
-                  MAJOR(__entry->dev), MINOR(__entry->dev), \
+                  __entry->ino,
-                  __entry->ino, \
+                  __entry->size,
-                  __entry->size, \
+                  __entry->new_size,
-                  __entry->new_size, \
+                  __entry->offset,
-                  __entry->offset, \
+                  __entry->count,
-                  __entry->count, \
+                  __print_flags(__entry->flags, "|", BMAPI_FLAGS),
-                  __print_flags(__entry->flags, "|", BMAPI_FLAGS), \
+                  __entry->startoff,
-                  __entry->startoff, \
+                  (__int64_t)__entry->startblock,
-                  (__int64_t)__entry->startblock, \
+                  __entry->blockcount)
-                  __entry->blockcount) \
 )
+#define DEFINE_IOMAP_EVENT(name)        \
+DEFINE_EVENT(xfs_iomap_class, name,     \
+        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
+                 int flags, struct xfs_bmbt_irec *irec),                \
+        TP_ARGS(ip, offset, count, flags, irec))
 DEFINE_IOMAP_EVENT(xfs_iomap_enter);
 DEFINE_IOMAP_EVENT(xfs_iomap_found);
 DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
-#define DEFINE_SIMPLE_IO_EVENT(name) \
+DECLARE_EVENT_CLASS(xfs_simple_io_class,
-TRACE_EVENT(name, \
+        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
-        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \
+        TP_ARGS(ip, offset, count),
-        TP_ARGS(ip, offset, count), \
+        TP_STRUCT__entry(
-        TP_STRUCT__entry( \
+                __field(dev_t, dev)
-                __field(dev_t, dev) \
+                __field(xfs_ino_t, ino)
-                __field(xfs_ino_t, ino) \
+                __field(loff_t, size)
-                __field(loff_t, size) \
+                __field(loff_t, new_size)
-                __field(loff_t, new_size) \
+                __field(loff_t, offset)
-                __field(loff_t, offset) \
+                __field(size_t, count)
-                __field(size_t, count) \
+        ),
-        ), \
+        TP_fast_assign(
-        TP_fast_assign( \
+                __entry->dev = VFS_I(ip)->i_sb->s_dev;
-                __entry->dev = VFS_I(ip)->i_sb->s_dev; \
+                __entry->ino = ip->i_ino;
-                __entry->ino = ip->i_ino; \
+                __entry->size = ip->i_d.di_size;
-                __entry->size = ip->i_d.di_size; \
+                __entry->new_size = ip->i_new_size;
-                __entry->new_size = ip->i_new_size; \
+                __entry->offset = offset;
-                __entry->offset = offset; \
+                __entry->count = count;
-                __entry->count = count; \
+        ),
-        ), \
+        TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
-        TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
+                  "offset 0x%llx count %zd",
-                  "offset 0x%llx count %zd", \
+                  MAJOR(__entry->dev), MINOR(__entry->dev),
-                  MAJOR(__entry->dev), MINOR(__entry->dev), \
+                  __entry->ino,
-                  __entry->ino, \
+                  __entry->size,
-                  __entry->size, \
+                  __entry->new_size,
-                  __entry->new_size, \
+                  __entry->offset,
-                  __entry->offset, \
+                  __entry->count)
-                  __entry->count) \
 );
+#define DEFINE_SIMPLE_IO_EVENT(name)    \
+DEFINE_EVENT(xfs_simple_io_class, name, \
+        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),        \
+        TP_ARGS(ip, offset, count))
 DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
 DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 38e764146644..2d8b7bc792c9 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -249,8 +249,10 @@ xfs_qm_hold_quotafs_ref(
        if (!xfs_Gqm) {
                xfs_Gqm = xfs_Gqm_init();
-                if (!xfs_Gqm)
+                if (!xfs_Gqm) {
+                        mutex_unlock(&xfs_Gqm_lock);
                        return ENOMEM;
+                }
        }
        /*
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 401f364ad36c..4917d4eed4ed 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -227,7 +227,6 @@ typedef struct xfs_perag {
        atomic_t        pagf_fstrms;    /* # of filestreams active in this AG */
-        int             pag_ici_init;   /* incore inode cache initialised */
        rwlock_t        pag_ici_lock;   /* incore inode lock */
        struct radix_tree_root pag_ici_root;    /* incore inode cache root */
        int             pag_ici_reclaimable;    /* reclaimable inodes */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 6845db90818f..75df75f43d48 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -382,9 +382,6 @@ xfs_iget(
        /* get the perag structure and ensure that it's inode capable */
        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
-        if (!pag->pagi_inodeok)
-                return EINVAL;
-        ASSERT(pag->pag_ici_init);
        agino = XFS_INO_TO_AGINO(mp, ino);
 again:
@@ -744,30 +741,24 @@ xfs_ilock_demote(
 }
 #ifdef DEBUG
-/*
- * Debug-only routine, without additional rw_semaphore APIs, we can
- * now only answer requests regarding whether we hold the lock for write
- * (reader state is outside our visibility, we only track writer state).
- *
- * Note: this means !xfs_isilocked would give false positives, so don't do that.
- */
 int
 xfs_isilocked(
        xfs_inode_t             *ip,
        uint                    lock_flags)
 {
-        if ((lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) ==
+        if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
-                        XFS_ILOCK_EXCL) {
+                if (!(lock_flags & XFS_ILOCK_SHARED))
-                if (!ip->i_lock.mr_writer)
+                        return !!ip->i_lock.mr_writer;
-                        return 0;
+                return rwsem_is_locked(&ip->i_lock.mr_lock);
        }
-        if ((lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) ==
+        if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
-                        XFS_IOLOCK_EXCL) {
+                if (!(lock_flags & XFS_IOLOCK_SHARED))
-                if (!ip->i_iolock.mr_writer)
+                        return !!ip->i_iolock.mr_writer;
-                        return 0;
+                return rwsem_is_locked(&ip->i_iolock.mr_lock);
        }
-        return 1;
+        ASSERT(0);
+        return 0;
 }
 #endif
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 8cd6e8d8fe9c..d53c39de7d05 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1940,10 +1940,10 @@ xfs_ifree_cluster(
        int                     blks_per_cluster;
        int                     nbufs;
        int                     ninodes;
-        int                     i, j, found, pre_flushed;
+        int                     i, j;
        xfs_daddr_t             blkno;
        xfs_buf_t               *bp;
-        xfs_inode_t             *ip, **ip_found;
+        xfs_inode_t             *ip;
        xfs_inode_log_item_t    *iip;
        xfs_log_item_t          *lip;
        struct xfs_perag        *pag;
@@ -1960,114 +1960,97 @@ xfs_ifree_cluster(
                nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
        }
-        ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS);
        for (j = 0; j < nbufs; j++, inum += ninodes) {
+                int     found = 0;
                blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
                                         XFS_INO_TO_AGBNO(mp, inum));
+                /*
+                 * We obtain and lock the backing buffer first in the process
+                 * here, as we have to ensure that any dirty inode that we
+                 * can't get the flush lock on is attached to the buffer.
+                 * If we scan the in-memory inodes first, then buffer IO can
+                 * complete before we get a lock on it, and hence we may fail
+                 * to mark all the active inodes on the buffer stale.
+                 */
+                bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
+                                        mp->m_bsize * blks_per_cluster,
+                                        XBF_LOCK);
+                /*
+                 * Walk the inodes already attached to the buffer and mark them
+                 * stale. These will all have the flush locks held, so an
+                 * in-memory inode walk can't lock them.
+                 */
+                lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+                while (lip) {
+                        if (lip->li_type == XFS_LI_INODE) {
+                                iip = (xfs_inode_log_item_t *)lip;
+                                ASSERT(iip->ili_logged == 1);
+                                lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
+                                xfs_trans_ail_copy_lsn(mp->m_ail,
+                                                        &iip->ili_flush_lsn,
+                                                        &iip->ili_item.li_lsn);
+                                xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
+                                found++;
+                        }
+                        lip = lip->li_bio_list;
+                }
                /*
-                 * Look for each inode in memory and attempt to lock it,
+                 * For each inode in memory attempt to add it to the inode
-                 * we can be racing with flush and tail pushing here.
+                 * buffer and set it up for being staled on buffer IO
-                 * any inode we get the locks on, add to an array of
+                 * completion.  This is safe as we've locked out tail pushing
-                 * inode items to process later.
+                 * and flushing by locking the buffer.
                 *
-                 * The get the buffer lock, we could beat a flush
+                 * We have already marked every inode that was part of a
-                 * or tail pushing thread to the lock here, in which
+                 * transaction stale above, which means there is no point in
-                 * case they will go looking for the inode buffer
+                 * even trying to lock them.
-                 * and fail, we need some other form of interlock
-                 * here.
                 */
-                found = 0;
                for (i = 0; i < ninodes; i++) {
                        read_lock(&pag->pag_ici_lock);
                        ip = radix_tree_lookup(&pag->pag_ici_root,
                                        XFS_INO_TO_AGINO(mp, (inum + i)));
-                        /* Inode not in memory or we found it already,
+                        /* Inode not in memory or stale, nothing to do */
-                         * nothing to do
-                         */
                        if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
                                read_unlock(&pag->pag_ici_lock);
                                continue;
                        }
-                        if (xfs_inode_clean(ip)) {
+                        /* don't try to lock/unlock the current inode */
-                                read_unlock(&pag->pag_ici_lock);
+                        if (ip != free_ip &&
-                                continue;
+                            !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
-                        }
-                        /* If we can get the locks then add it to the
-                         * list, otherwise by the time we get the bp lock
-                         * below it will already be attached to the
-                         * inode buffer.
-                         */
-                        /* This inode will already be locked - by us, lets
-                         * keep it that way.
-                         */
-                        if (ip == free_ip) {
-                                if (xfs_iflock_nowait(ip)) {
-                                        xfs_iflags_set(ip, XFS_ISTALE);
-                                        if (xfs_inode_clean(ip)) {
-                                                xfs_ifunlock(ip);
-                                        } else {
-                                                ip_found[found++] = ip;
-                                        }
-                                }
                                read_unlock(&pag->pag_ici_lock);
                                continue;
                        }
+                        read_unlock(&pag->pag_ici_lock);
-                        if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+                        if (!xfs_iflock_nowait(ip)) {
-                                if (xfs_iflock_nowait(ip)) {
+                                if (ip != free_ip)
-                                        xfs_iflags_set(ip, XFS_ISTALE);
-                                        if (xfs_inode_clean(ip)) {
-                                                xfs_ifunlock(ip);
-                                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                                        } else {
-                                                ip_found[found++] = ip;
-                                        }
-                                } else {
                                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                                }
+                                continue;
                        }
-                        read_unlock(&pag->pag_ici_lock);
-                }
-                bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 
+                        xfs_iflags_set(ip, XFS_ISTALE);
-                                        mp->m_bsize * blks_per_cluster,
+                        if (xfs_inode_clean(ip)) {
-                                        XBF_LOCK);
+                                ASSERT(ip != free_ip);
+                                xfs_ifunlock(ip);
-                pre_flushed = 0;
+                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+                                continue;
-                while (lip) {
-                        if (lip->li_type == XFS_LI_INODE) {
-                                iip = (xfs_inode_log_item_t *)lip;
-                                ASSERT(iip->ili_logged == 1);
-                                lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
-                                xfs_trans_ail_copy_lsn(mp->m_ail,
-                                                        &iip->ili_flush_lsn,
-                                                        &iip->ili_item.li_lsn);
-                                xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
-                                pre_flushed++;
                        }
-                        lip = lip->li_bio_list;
-                }
-                for (i = 0; i < found; i++) {
-                        ip = ip_found[i];
                        iip = ip->i_itemp;
                        if (!iip) {
+                                /* inode with unlogged changes only */
+                                ASSERT(ip != free_ip);
                                ip->i_update_core = 0;
                                xfs_ifunlock(ip);
                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                                continue;
                        }
+                        found++;
                        iip->ili_last_fields = iip->ili_format.ilf_fields;
                        iip->ili_format.ilf_fields = 0;
@@ -2078,17 +2061,16 @@ xfs_ifree_cluster(
                        xfs_buf_attach_iodone(bp,
                                (void(*)(xfs_buf_t*,xfs_log_item_t*))
                                xfs_istale_done, (xfs_log_item_t *)iip);
-                        if (ip != free_ip) {
+                        if (ip != free_ip)
                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                        }
                }
-                if (found || pre_flushed)
+                if (found)
                        xfs_trans_stale_inode_buf(tp, bp);
                xfs_trans_binval(tp, bp);
        }
-        kmem_free(ip_found);
        xfs_perag_put(pag);
 }
@@ -2649,8 +2631,6 @@ xfs_iflush_cluster(
        int                     i;
        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-        ASSERT(pag->pagi_inodeok);
-        ASSERT(pag->pag_ici_init);
        inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
        ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 14a69aec2c0b..ed0684cc50ee 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -132,15 +132,10 @@ xlog_align(
        int             nbblks,
        xfs_buf_t       *bp)
 {
-        xfs_daddr_t     offset;
+        xfs_daddr_t     offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
-        xfs_caddr_t     ptr;
-        offset = blk_no & ((xfs_daddr_t) log->l_sectBBsize - 1);
+        ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp));
-        ptr = XFS_BUF_PTR(bp) + BBTOB(offset);
+        return XFS_BUF_PTR(bp) + BBTOB(offset);
-        ASSERT(ptr + BBTOB(nbblks) <= XFS_BUF_PTR(bp) + XFS_BUF_SIZE(bp));
-        return ptr;
 }
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d7bf38c8cd1c..d59f4e8bedcf 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -268,10 +268,10 @@ xfs_sb_validate_fsb_count(
 #if XFS_BIG_BLKNOS     /* Limited by ULONG_MAX of page cache index */
        if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
-                return E2BIG;
+                return EFBIG;
 #else                  /* Limited by UINT_MAX of sectors */
        if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX)
-                return E2BIG;
+                return EFBIG;
 #endif
        return 0;
 }
@@ -393,7 +393,7 @@ xfs_mount_validate_sb(
            xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
                xfs_fs_mount_cmn_err(flags,
                        "file system too large to be mounted on this system.");
-                return XFS_ERROR(E2BIG);
+                return XFS_ERROR(EFBIG);
        }
        if (unlikely(sbp->sb_inprogress)) {
@@ -413,17 +413,6 @@ xfs_mount_validate_sb(
        return 0;
 }
-STATIC void
-xfs_initialize_perag_icache(
-        xfs_perag_t     *pag)
-{
-        if (!pag->pag_ici_init) {
-                rwlock_init(&pag->pag_ici_lock);
-                INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
-                pag->pag_ici_init = 1;
-        }
-}
 int
 xfs_initialize_perag(
        xfs_mount_t     *mp,
@@ -436,13 +425,8 @@ xfs_initialize_perag(
        xfs_agino_t     agino;
        xfs_ino_t       ino;
        xfs_sb_t        *sbp = &mp->m_sb;
-        xfs_ino_t       max_inum = XFS_MAXINUMBER_32;
        int             error = -ENOMEM;
-        /* Check to see if the filesystem can overflow 32 bit inodes */
-        agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
-        ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
        /*
         * Walk the current per-ag tree so we don't try to initialise AGs
         * that already exist (growfs case). Allocate and insert all the
@@ -456,11 +440,18 @@ xfs_initialize_perag(
                }
                if (!first_initialised)
                        first_initialised = index;
                pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
                if (!pag)
                        goto out_unwind;
+                pag->pag_agno = index;
+                pag->pag_mount = mp;
+                rwlock_init(&pag->pag_ici_lock);
+                INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
                if (radix_tree_preload(GFP_NOFS))
                        goto out_unwind;
                spin_lock(&mp->m_perag_lock);
                if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
                        BUG();
@@ -469,25 +460,26 @@ xfs_initialize_perag(
                        error = -EEXIST;
                        goto out_unwind;
                }
-                pag->pag_agno = index;
-                pag->pag_mount = mp;
                spin_unlock(&mp->m_perag_lock);
                radix_tree_preload_end();
        }
-        /* Clear the mount flag if no inode can overflow 32 bits
+        /*
-         * on this filesystem, or if specifically requested..
+         * If we mount with the inode64 option, or no inode overflows
+         * the legacy 32-bit address space, clear the inode32 option.
         */
-        if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > max_inum) {
+        agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
+        ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
+        if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
                mp->m_flags |= XFS_MOUNT_32BITINODES;
-        } else {
+        else
                mp->m_flags &= ~XFS_MOUNT_32BITINODES;
-        }
-        /* If we can overflow then setup the ag headers accordingly */
        if (mp->m_flags & XFS_MOUNT_32BITINODES) {
-                /* Calculate how much should be reserved for inodes to
+                /*
-                 * meet the max inode percentage.
+                 * Calculate how much should be reserved for inodes to meet
+                 * the max inode percentage.
                 */
                if (mp->m_maxicount) {
                        __uint64_t      icount;
@@ -500,30 +492,28 @@ xfs_initialize_perag(
                } else {
                        max_metadata = agcount;
                }
                for (index = 0; index < agcount; index++) {
                        ino = XFS_AGINO_TO_INO(mp, index, agino);
-                        if (ino > max_inum) {
+                        if (ino > XFS_MAXINUMBER_32) {
                                index++;
                                break;
                        }
-                        /* This ag is preferred for inodes */
                        pag = xfs_perag_get(mp, index);
                        pag->pagi_inodeok = 1;
                        if (index < max_metadata)
                                pag->pagf_metadata = 1;
-                        xfs_initialize_perag_icache(pag);
                        xfs_perag_put(pag);
                }
        } else {
-                /* Setup default behavior for smaller filesystems */
                for (index = 0; index < agcount; index++) {
                        pag = xfs_perag_get(mp, index);
                        pag->pagi_inodeok = 1;
-                        xfs_initialize_perag_icache(pag);
                        xfs_perag_put(pag);
                }
        }
        if (maxagi)
                *maxagi = index;
        return 0;
@@ -1009,7 +999,7 @@ xfs_check_sizes(xfs_mount_t *mp)
        d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
        if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
                cmn_err(CE_WARN, "XFS: size check 1 failed");
-                return XFS_ERROR(E2BIG);
+                return XFS_ERROR(EFBIG);
        }
        error = xfs_read_buf(mp, mp->m_ddev_targp,
                             d - XFS_FSS_TO_BB(mp, 1),
@@ -1019,7 +1009,7 @@ xfs_check_sizes(xfs_mount_t *mp)
        } else {
                cmn_err(CE_WARN, "XFS: size check 2 failed");
                if (error == ENOSPC)
-                        error = XFS_ERROR(E2BIG);
+                        error = XFS_ERROR(EFBIG);
                return error;
        }
@@ -1027,7 +1017,7 @@ xfs_check_sizes(xfs_mount_t *mp)
                d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
                if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
                        cmn_err(CE_WARN, "XFS: size check 3 failed");
-                        return XFS_ERROR(E2BIG);
+                        return XFS_ERROR(EFBIG);
                }
                error = xfs_read_buf(mp, mp->m_logdev_targp,
                                     d - XFS_FSB_TO_BB(mp, 1),
@@ -1037,7 +1027,7 @@ xfs_check_sizes(xfs_mount_t *mp)
                } else {
                        cmn_err(CE_WARN, "XFS: size check 3 failed");
                        if (error == ENOSPC)
-                                error = XFS_ERROR(E2BIG);
+                                error = XFS_ERROR(EFBIG);
                        return error;
                }
        }
@@ -1254,7 +1244,7 @@ xfs_mountfs(
         * Allocate and initialize the per-ag data.
         */
        spin_lock_init(&mp->m_perag_lock);
-        INIT_RADIX_TREE(&mp->m_perag_tree, GFP_NOFS);
+        INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
        error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
        if (error) {
                cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 6be05f756d59..16445518506d 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -2247,7 +2247,7 @@ xfs_rtmount_init(
                cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu",
                        (unsigned long long) XFS_BB_TO_FSB(mp, d),
                        (unsigned long long) mp->m_sb.sb_rblocks);
-                return XFS_ERROR(E2BIG);
+                return XFS_ERROR(EFBIG);
        }
        error = xfs_read_buf(mp, mp->m_rtdev_targp,
                                d - XFS_FSB_TO_BB(mp, 1),
@@ -2256,7 +2256,7 @@ xfs_rtmount_init(
                cmn_err(CE_WARN,
        "XFS: realtime mount -- xfs_read_buf failed, returned %d", error);
                if (error == ENOSPC)
-                        return XFS_ERROR(E2BIG);
+                        return XFS_ERROR(EFBIG);
                return error;
        }
        xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index b2d67adb6a08..ff614c29b441 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -147,7 +147,16 @@ xfs_growfs_rt(
 # define xfs_rtfree_extent(t,b,l)                       (ENOSYS)
 # define xfs_rtpick_extent(m,t,l,rb)                    (ENOSYS)
 # define xfs_growfs_rt(mp,in)                           (ENOSYS)
-# define xfs_rtmount_init(m)    (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
+static inline int               /* error */
+xfs_rtmount_init(
+        xfs_mount_t     *mp)    /* file system mount structure */
+{
+        if (mp->m_sb.sb_rblocks == 0)
+                return 0;
+        cmn_err(CE_WARN, "XFS: Not built with CONFIG_XFS_RT");
+        return ENOSYS;
+}
 # define xfs_rtmount_inodes(m)  (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
 # define xfs_rtunmount_inodes(m)
 #endif  /* CONFIG_XFS_RT */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index ce558efa2ea0..28547dfce037 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -48,134 +48,489 @@
 kmem_zone_t     *xfs_trans_zone;
 /*
- * Reservation functions here avoid a huge stack in xfs_trans_init
+ * Various log reservation values.
- * due to register overflow from temporaries in the calculations.
+ *
+ * These are based on the size of the file system block because that is what
+ * most transactions manipulate.  Each adds in an additional 128 bytes per
+ * item logged to try to account for the overhead of the transaction mechanism.
+ *
+ * Note:  Most of the reservations underestimate the number of allocation
+ * groups into which they could free extents in the xfs_bmap_finish() call.
+ * This is because the number in the worst case is quite high and quite
+ * unusual.  In order to fix this we need to change xfs_bmap_finish() to free
+ * extents in only a single AG at a time.  This will require changes to the
+ * EFI code as well, however, so that the EFI for the extents not freed is
+ * logged again in each transaction.  See SGI PV #261917.
+ *
+ * Reservation functions here avoid a huge stack in xfs_trans_init due to
+ * register overflow from temporaries in the calculations.
+ */
+/*
+ * In a write transaction we can allocate a maximum of 2
+ * extents.  This gives:
+ *    the inode getting the new extents: inode size
+ *    the inode's bmap btree: max depth * block size
+ *    the agfs of the ags from which the extents are allocated: 2 * sector
+ *    the superblock free block counter: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And the bmap_finish transaction can free bmap blocks in a join:
+ *    the agfs of the ags containing the blocks: 2 * sector size
+ *    the agfls of the ags containing the blocks: 2 * sector size
+ *    the super block free block counter: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
 */
 STATIC uint
-xfs_calc_write_reservation(xfs_mount_t *mp)
+xfs_calc_write_reservation(
+        struct xfs_mount        *mp)
 {
-        return XFS_CALC_WRITE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+        return XFS_DQUOT_LOGRES(mp) +
+                MAX((mp->m_sb.sb_inodesize +
+                     XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
+                     2 * mp->m_sb.sb_sectsize +
+                     mp->m_sb.sb_sectsize +
+                     XFS_ALLOCFREE_LOG_RES(mp, 2) +
+                     128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
+                            XFS_ALLOCFREE_LOG_COUNT(mp, 2))),
+                    (2 * mp->m_sb.sb_sectsize +
+                     2 * mp->m_sb.sb_sectsize +
+                     mp->m_sb.sb_sectsize +
+                     XFS_ALLOCFREE_LOG_RES(mp, 2) +
+                     128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
 }
+/*
+ * In truncating a file we free up to two extents at once.  We can modify:
+ *    the inode being truncated: inode size
+ *    the inode's bmap btree: (max depth + 1) * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ *    the agf for each of the ags: 4 * sector size
+ *    the agfl for each of the ags: 4 * sector size
+ *    the super block to reflect the freed blocks: sector size
+ *    worst case split in allocation btrees per extent assuming 4 extents:
+ *              4 exts * 2 trees * (2 * max depth - 1) * block size
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_itruncate_reservation(xfs_mount_t *mp)
+xfs_calc_itruncate_reservation(
+        struct xfs_mount        *mp)
 {
-        return XFS_CALC_ITRUNCATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+        return XFS_DQUOT_LOGRES(mp) +
+                MAX((mp->m_sb.sb_inodesize +
+                     XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) +
+                     128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
+                    (4 * mp->m_sb.sb_sectsize +
+                     4 * mp->m_sb.sb_sectsize +
+                     mp->m_sb.sb_sectsize +
+                     XFS_ALLOCFREE_LOG_RES(mp, 4) +
+                     128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)) +
+                     128 * 5 +
+                     XFS_ALLOCFREE_LOG_RES(mp, 1) +
+                     128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
+                            XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
 }
+/*
+ * In renaming a file we can modify:
+ *    the four inodes involved: 4 * inode size
+ *    the two directory btrees: 2 * (max depth + v2) * dir block size
+ *    the two directory bmap btrees: 2 * max depth * block size
+ * And the bmap_finish transaction can free dir and bmap blocks (two sets
+ *      of bmap blocks) giving:
+ *    the agf for the ags in which the blocks live: 3 * sector size
+ *    the agfl for the ags in which the blocks live: 3 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_rename_reservation(xfs_mount_t *mp)
+xfs_calc_rename_reservation(
+        struct xfs_mount        *mp)
 {
-        return XFS_CALC_RENAME_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+        return XFS_DQUOT_LOGRES(mp) +
+                MAX((4 * mp->m_sb.sb_inodesize +
+                     2 * XFS_DIROP_LOG_RES(mp) +
+                     128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp))),
+                    (3 * mp->m_sb.sb_sectsize +
+                     3 * mp->m_sb.sb_sectsize +
+                     mp->m_sb.sb_sectsize +
+                     XFS_ALLOCFREE_LOG_RES(mp, 3) +
+                     128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))));
 }
+/*
+ * For creating a link to an inode:
+ *    the parent directory inode: inode size
+ *    the linked inode: inode size
+ *    the directory btree could split: (max depth + v2) * dir block size
+ *    the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free some bmap blocks giving:
+ *    the agf for the ag in which the blocks live: sector size
+ *    the agfl for the ag in which the blocks live: sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_link_reservation(xfs_mount_t *mp)
+xfs_calc_link_reservation(
+        struct xfs_mount        *mp)
 {
-        return XFS_CALC_LINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+        return XFS_DQUOT_LOGRES(mp) +
+                MAX((mp->m_sb.sb_inodesize +
+                     mp->m_sb.sb_inodesize +
+                     XFS_DIROP_LOG_RES(mp) +
+                     128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
+                    (mp->m_sb.sb_sectsize +
+                     mp->m_sb.sb_sectsize +
+                     mp->m_sb.sb_sectsize +
+                     XFS_ALLOCFREE_LOG_RES(mp, 1) +
+                     128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
 }
+/*
+ * For removing a directory entry we can modify:
+ *    the parent directory inode: inode size
+ *    the removed inode: inode size
+ *    the directory btree could join: (max depth + v2) * dir block size
+ *    the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free the dir and bmap blocks giving:
+ *    the agf for the ag in which the blocks live: 2 * sector size
+ *    the agfl for the ag in which the blocks live: 2 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_remove_reservation(xfs_mount_t *mp)
+xfs_calc_remove_reservation(
+        struct xfs_mount        *mp)
 {
-        return XFS_CALC_REMOVE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+        return XFS_DQUOT_LOGRES(mp) +
+                MAX((mp->m_sb.sb_inodesize +
+                     mp->m_sb.sb_inodesize +
+                     XFS_DIROP_LOG_RES(mp) +
+                     128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
+                    (2 * mp->m_sb.sb_sectsize +
+                     2 * mp->m_sb.sb_sectsize +
+                     mp->m_sb.sb_sectsize +
+                     XFS_ALLOCFREE_LOG_RES(mp, 2) +
+                     128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
 }
+/*
+ * For symlink we can modify:
+ *    the parent directory inode: inode size
+ *    the new inode: inode size
+ *    the inode btree entry: 1 block
+ *    the directory btree: (max depth + v2) * dir block size
+ *    the directory inode's bmap btree: (max depth + v2) * block size
+ *    the blocks for the symlink: 1 kB
+ * Or in the first xact we allocate some inodes giving:
+ *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_symlink_reservation(xfs_mount_t *mp)
+xfs_calc_symlink_reservation(
+        struct xfs_mount        *mp)
 {
-        return XFS_CALC_SYMLINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+        return XFS_DQUOT_LOGRES(mp) +
+                MAX((mp->m_sb.sb_inodesize +
+                     mp->m_sb.sb_inodesize +
+                     XFS_FSB_TO_B(mp, 1) +
+                     XFS_DIROP_LOG_RES(mp) +
+                     1024 +
+                     128 * (4 + XFS_DIROP_LOG_COUNT(mp))),
+                    (2 * mp->m_sb.sb_sectsize +
+                     XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
+                     XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
+                     XFS_ALLOCFREE_LOG_RES(mp, 1) +
+                     128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
+                            XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
 }
+/*
+ * For create we can modify:
+ *    the parent directory inode: inode size
+ *    the new inode: inode size
+ *    the inode btree entry: block size
+ *    the superblock for the nlink flag: sector size
+ *    the directory btree: (max depth + v2) * dir block size
+ *    the directory inode's bmap btree: (max depth + v2) * block size
+ * Or in the first xact we allocate some inodes giving:
+ *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ *    the superblock for the nlink flag: sector size
+ *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_create_reservation(xfs_mount_t *mp)
+xfs_calc_create_reservation(
+        struct xfs_mount        *mp)
 {
-        return XFS_CALC_CREATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+        return XFS_DQUOT_LOGRES(mp) +
+                MAX((mp->m_sb.sb_inodesize +
+                     mp->m_sb.sb_inodesize +
+                     mp->m_sb.sb_sectsize +
+                     XFS_FSB_TO_B(mp, 1) +
+                     XFS_DIROP_LOG_RES(mp) +
+                     128 * (3 + XFS_DIROP_LOG_COUNT(mp))),
+                    (3 * mp->m_sb.sb_sectsize +
+                     XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
+                     XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
+                     XFS_ALLOCFREE_LOG_RES(mp, 1) +
+                     128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
+                            XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
 }
+/*
+ * Making a new directory is the same as creating a new file.
+ */
 STATIC uint
-xfs_calc_mkdir_reservation(xfs_mount_t *mp)
+xfs_calc_mkdir_reservation(
+        struct xfs_mount        *mp)
 {
-        return XFS_CALC_MKDIR_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+        return xfs_calc_create_reservation(mp);
 }
+/*
+ * In freeing an inode we can modify:
+ *    the inode being freed: inode size
+ *    the super block free inode counter: sector size
+ *    the agi hash list and counters: sector size
+ *    the inode btree entry: block size
+ *    the on disk inode before ours in the agi hash list: inode cluster size
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_ifree_reservation(xfs_mount_t *mp)
+xfs_calc_ifree_reservation(
+        struct xfs_mount        *mp)
 {
-        return XFS_CALC_IFREE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+        return XFS_DQUOT_LOGRES(mp) +
+                mp->m_sb.sb_inodesize +
+                mp->m_sb.sb_sectsize +
+                mp->m_sb.sb_sectsize +
+                XFS_FSB_TO_B(mp, 1) +
+                MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
+                    XFS_INODE_CLUSTER_SIZE(mp)) +
+                128 * 5 +
+                XFS_ALLOCFREE_LOG_RES(mp, 1) +
+                128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
+                       XFS_ALLOCFREE_LOG_COUNT(mp, 1));
 }
+/*
+ * When only changing the inode we log the inode and possibly the superblock.
+ * We also add a bit of slop for the transaction stuff.
+ */
 STATIC uint
-xfs_calc_ichange_reservation(xfs_mount_t *mp)
+xfs_calc_ichange_reservation(
+        struct xfs_mount        *mp)
 {
-        return XFS_CALC_ICHANGE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+        return XFS_DQUOT_LOGRES(mp) +
+                mp->m_sb.sb_inodesize +
+                mp->m_sb.sb_sectsize +
+                512;
 }
+/*
+ * Growing the data section of the filesystem.
+ *      superblock
+ *      agi and agf
+ *      allocation btrees
+ */
 STATIC uint
-xfs_calc_growdata_reservation(xfs_mount_t *mp)
+xfs_calc_growdata_reservation(
+        struct xfs_mount        *mp)
 {
-        return XFS_CALC_GROWDATA_LOG_RES(mp);
+        return mp->m_sb.sb_sectsize * 3 +
+                XFS_ALLOCFREE_LOG_RES(mp, 1) +
+                128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1));
 }
+/*
+ * Growing the rt section of the filesystem.
+ * In the first set of transactions (ALLOC) we allocate space to the
+ * bitmap or summary files.
+ *      superblock: sector size
+ *      agf of the ag from which the extent is allocated: sector size
+ *      bmap btree for bitmap/summary inode: max depth * blocksize
+ *      bitmap/summary inode: inode size
+ *      allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
+ */
 STATIC uint
-xfs_calc_growrtalloc_reservation(xfs_mount_t *mp)
+xfs_calc_growrtalloc_reservation(
+        struct xfs_mount        *mp)
 {
-        return XFS_CALC_GROWRTALLOC_LOG_RES(mp);
+        return 2 * mp->m_sb.sb_sectsize +
+                XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
+                mp->m_sb.sb_inodesize +
+                XFS_ALLOCFREE_LOG_RES(mp, 1) +
+                128 * (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
+                       XFS_ALLOCFREE_LOG_COUNT(mp, 1));
 }
+/*
+ * Growing the rt section of the filesystem.
+ * In the second set of transactions (ZERO) we zero the new metadata blocks.
+ *      one bitmap/summary block: blocksize
+ */
 STATIC uint
-xfs_calc_growrtzero_reservation(xfs_mount_t *mp)
+xfs_calc_growrtzero_reservation(
+        struct xfs_mount        *mp)
 {
-        return XFS_CALC_GROWRTZERO_LOG_RES(mp);
+        return mp->m_sb.sb_blocksize + 128;
 }
+/*
+ * Growing the rt section of the filesystem.
+ * In the third set of transactions (FREE) we update metadata without
+ * allocating any new blocks.
+ *      superblock: sector size
+ *      bitmap inode: inode size
+ *      summary inode: inode size
+ *      one bitmap block: blocksize
+ *      summary blocks: new summary size
+ */
 STATIC uint
-xfs_calc_growrtfree_reservation(xfs_mount_t *mp)
+xfs_calc_growrtfree_reservation(
+        struct xfs_mount        *mp)
 {
-        return XFS_CALC_GROWRTFREE_LOG_RES(mp);
+        return mp->m_sb.sb_sectsize +
+                2 * mp->m_sb.sb_inodesize +
+                mp->m_sb.sb_blocksize +
+                mp->m_rsumsize +
+                128 * 5;
 }
+/*
+ * Logging the inode modification timestamp on a synchronous write.
+ *      inode
+ */
 STATIC uint
-xfs_calc_swrite_reservation(xfs_mount_t *mp)
+xfs_calc_swrite_reservation(
+        struct xfs_mount        *mp)
 {
-        return XFS_CALC_SWRITE_LOG_RES(mp);
+        return mp->m_sb.sb_inodesize + 128;
 }
+/*
+ * Logging the inode mode bits when writing a setuid/setgid file
+ *      inode
+ */
 STATIC uint
 xfs_calc_writeid_reservation(xfs_mount_t *mp)
 {
-        return XFS_CALC_WRITEID_LOG_RES(mp);
+        return mp->m_sb.sb_inodesize + 128;
 }
+/*
+ * Converting the inode from non-attributed to attributed.
+ *      the inode being converted: inode size
+ *      agf block and superblock (for block allocation)
+ *      the new block (directory sized)
+ *      bmap blocks for the new directory block
+ *      allocation btrees
+ */
 STATIC uint
-xfs_calc_addafork_reservation(xfs_mount_t *mp)
+xfs_calc_addafork_reservation(
+        struct xfs_mount        *mp)
 {
-        return XFS_CALC_ADDAFORK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+        return XFS_DQUOT_LOGRES(mp) +
+                mp->m_sb.sb_inodesize +
+                mp->m_sb.sb_sectsize * 2 +
+                mp->m_dirblksize +
+                XFS_FSB_TO_B(mp, XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) +
+                XFS_ALLOCFREE_LOG_RES(mp, 1) +
+                128 * (4 + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1 +
+                       XFS_ALLOCFREE_LOG_COUNT(mp, 1));
 }
+/*
+ * Removing the attribute fork of a file
+ *    the inode being truncated: inode size
+ *    the inode's bmap btree: max depth * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ *    the agf for each of the ags: 4 * sector size
+ *    the agfl for each of the ags: 4 * sector size
+ *    the super block to reflect the freed blocks: sector size
+ *    worst case split in allocation btrees per extent assuming 4 extents:
+ *              4 exts * 2 trees * (2 * max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_attrinval_reservation(xfs_mount_t *mp)
+xfs_calc_attrinval_reservation(
+        struct xfs_mount        *mp)
 {
-        return XFS_CALC_ATTRINVAL_LOG_RES(mp);
+        return MAX((mp->m_sb.sb_inodesize +
+                    XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
+                    128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))),
+                   (4 * mp->m_sb.sb_sectsize +
+                    4 * mp->m_sb.sb_sectsize +
+                    mp->m_sb.sb_sectsize +
+                    XFS_ALLOCFREE_LOG_RES(mp, 4) +
+                    128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))));
 }
+/*
+ * Setting an attribute.
+ *      the inode getting the attribute
+ *      the superblock for allocations
+ *      the agfs extents are allocated from
+ *      the attribute btree * max depth
+ *      the inode allocation btree
+ * Since attribute transaction space is dependent on the size of the attribute,
+ * the calculation is done partially at mount time and partially at runtime.
+ */
 STATIC uint
-xfs_calc_attrset_reservation(xfs_mount_t *mp)
+xfs_calc_attrset_reservation(
+        struct xfs_mount        *mp)
 {
-        return XFS_CALC_ATTRSET_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+        return XFS_DQUOT_LOGRES(mp) +
+                mp->m_sb.sb_inodesize +
+                mp->m_sb.sb_sectsize +
+                XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
+                128 * (2 + XFS_DA_NODE_MAXDEPTH);
 }
+/*
+ * Removing an attribute.
+ *    the inode: inode size
+ *    the attribute btree could join: max depth * block size
+ *    the inode bmap btree could join or split: max depth * block size
+ * And the bmap_finish transaction can free the attr blocks giving:
+ *    the agf for the ag in which the blocks live: 2 * sector size
+ *    the agfl for the ag in which the blocks live: 2 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_attrrm_reservation(xfs_mount_t *mp)
+xfs_calc_attrrm_reservation(
+        struct xfs_mount        *mp)
 {
-        return XFS_CALC_ATTRRM_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+        return XFS_DQUOT_LOGRES(mp) +
+                MAX((mp->m_sb.sb_inodesize +
+                     XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
+                     XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
+                     128 * (1 + XFS_DA_NODE_MAXDEPTH +
+                            XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
+                    (2 * mp->m_sb.sb_sectsize +
+                     2 * mp->m_sb.sb_sectsize +
+                     mp->m_sb.sb_sectsize +
+                     XFS_ALLOCFREE_LOG_RES(mp, 2) +
+                     128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
 }
+/*
+ * Clearing a bad agino number in an agi hash bucket.
+ */
 STATIC uint
-xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp)
+xfs_calc_clear_agi_bucket_reservation(
+        struct xfs_mount        *mp)
 {
-        return XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp);
+        return mp->m_sb.sb_sectsize + 128;
 }
 /*
@@ -184,11 +539,10 @@ xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp)
 */
 void
 xfs_trans_init(
-        xfs_mount_t     *mp)
+        struct xfs_mount        *mp)
 {
-        xfs_trans_reservations_t        *resp;
+        struct xfs_trans_reservations *resp = &mp->m_reservations;
-        resp = &(mp->m_reservations);
        resp->tr_write = xfs_calc_write_reservation(mp);
        resp->tr_itruncate = xfs_calc_itruncate_reservation(mp);
        resp->tr_rename = xfs_calc_rename_reservation(mp);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 8c69e7824f68..e639e8e9a2a9 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -300,24 +300,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 /*
- * Various log reservation values.
- * These are based on the size of the file system block
- * because that is what most transactions manipulate.
- * Each adds in an additional 128 bytes per item logged to
- * try to account for the overhead of the transaction mechanism.
- *
- * Note:
- * Most of the reservations underestimate the number of allocation
- * groups into which they could free extents in the xfs_bmap_finish()
- * call.  This is because the number in the worst case is quite high
- * and quite unusual.  In order to fix this we need to change
- * xfs_bmap_finish() to free extents in only a single AG at a time.
- * This will require changes to the EFI code as well, however, so that
- * the EFI for the extents not freed is logged again in each transaction.
- * See bug 261917.
- */
-/*
 * Per-extent log reservation for the allocation btree changes
 * involved in freeing or allocating an extent.
 * 2 trees * (2 blocks/level * max depth - 1) * block size
@@ -341,429 +323,36 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
        (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
         XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
-/*
- * In a write transaction we can allocate a maximum of 2
- * extents.  This gives:
- *    the inode getting the new extents: inode size
- *    the inode's bmap btree: max depth * block size
- *    the agfs of the ags from which the extents are allocated: 2 * sector
- *    the superblock free block counter: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- * And the bmap_finish transaction can free bmap blocks in a join:
- *    the agfs of the ags containing the blocks: 2 * sector size
- *    the agfls of the ags containing the blocks: 2 * sector size
- *    the super block free block counter: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-#define XFS_CALC_WRITE_LOG_RES(mp) \
-        (MAX( \
-         ((mp)->m_sb.sb_inodesize + \
-          XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
-          (2 * (mp)->m_sb.sb_sectsize) + \
-          (mp)->m_sb.sb_sectsize + \
-          XFS_ALLOCFREE_LOG_RES(mp, 2) + \
-          (128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))),\
-         ((2 * (mp)->m_sb.sb_sectsize) + \
-          (2 * (mp)->m_sb.sb_sectsize) + \
-          (mp)->m_sb.sb_sectsize + \
-          XFS_ALLOCFREE_LOG_RES(mp, 2) + \
-          (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
 #define XFS_WRITE_LOG_RES(mp)   ((mp)->m_reservations.tr_write)
-/*
- * In truncating a file we free up to two extents at once.  We can modify:
- *    the inode being truncated: inode size
- *    the inode's bmap btree: (max depth + 1) * block size
- * And the bmap_finish transaction can free the blocks and bmap blocks:
- *    the agf for each of the ags: 4 * sector size
- *    the agfl for each of the ags: 4 * sector size
- *    the super block to reflect the freed blocks: sector size
- *    worst case split in allocation btrees per extent assuming 4 extents:
- *              4 exts * 2 trees * (2 * max depth - 1) * block size
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-#define XFS_CALC_ITRUNCATE_LOG_RES(mp) \
-        (MAX( \
-         ((mp)->m_sb.sb_inodesize + \
-          XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + \
-          (128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
-         ((4 * (mp)->m_sb.sb_sectsize) + \
-          (4 * (mp)->m_sb.sb_sectsize) + \
-          (mp)->m_sb.sb_sectsize + \
-          XFS_ALLOCFREE_LOG_RES(mp, 4) + \
-          (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \
-          (128 * 5) + \
-          XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-           (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
-            XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
 #define XFS_ITRUNCATE_LOG_RES(mp)   ((mp)->m_reservations.tr_itruncate)
-/*
- * In renaming a files we can modify:
- *    the four inodes involved: 4 * inode size
- *    the two directory btrees: 2 * (max depth + v2) * dir block size
- *    the two directory bmap btrees: 2 * max depth * block size
- * And the bmap_finish transaction can free dir and bmap blocks (two sets
- *      of bmap blocks) giving:
- *    the agf for the ags in which the blocks live: 3 * sector size
- *    the agfl for the ags in which the blocks live: 3 * sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
- */
-#define XFS_CALC_RENAME_LOG_RES(mp) \
-        (MAX( \
-         ((4 * (mp)->m_sb.sb_inodesize) + \
-          (2 * XFS_DIROP_LOG_RES(mp)) + \
-          (128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp)))), \
-         ((3 * (mp)->m_sb.sb_sectsize) + \
-          (3 * (mp)->m_sb.sb_sectsize) + \
-          (mp)->m_sb.sb_sectsize + \
-          XFS_ALLOCFREE_LOG_RES(mp, 3) + \
-          (128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))))))
 #define XFS_RENAME_LOG_RES(mp)  ((mp)->m_reservations.tr_rename)
-/*
- * For creating a link to an inode:
- *    the parent directory inode: inode size
- *    the linked inode: inode size
- *    the directory btree could split: (max depth + v2) * dir block size
- *    the directory bmap btree could join or split: (max depth + v2) * blocksize
- * And the bmap_finish transaction can free some bmap blocks giving:
- *    the agf for the ag in which the blocks live: sector size
- *    the agfl for the ag in which the blocks live: sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
- */
-#define XFS_CALC_LINK_LOG_RES(mp) \
-        (MAX( \
-         ((mp)->m_sb.sb_inodesize + \
-          (mp)->m_sb.sb_inodesize + \
-          XFS_DIROP_LOG_RES(mp) + \
-          (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
-         ((mp)->m_sb.sb_sectsize + \
-          (mp)->m_sb.sb_sectsize + \
-          (mp)->m_sb.sb_sectsize + \
-          XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-          (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
 #define XFS_LINK_LOG_RES(mp)    ((mp)->m_reservations.tr_link)
-/*
- * For removing a directory entry we can modify:
- *    the parent directory inode: inode size
- *    the removed inode: inode size
- *    the directory btree could join: (max depth + v2) * dir block size
- *    the directory bmap btree could join or split: (max depth + v2) * blocksize
- * And the bmap_finish transaction can free the dir and bmap blocks giving:
- *    the agf for the ag in which the blocks live: 2 * sector size
- *    the agfl for the ag in which the blocks live: 2 * sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-#define XFS_CALC_REMOVE_LOG_RES(mp)     \
-        (MAX( \
-         ((mp)->m_sb.sb_inodesize + \
-          (mp)->m_sb.sb_inodesize + \
-          XFS_DIROP_LOG_RES(mp) + \
-          (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
-         ((2 * (mp)->m_sb.sb_sectsize) + \
-          (2 * (mp)->m_sb.sb_sectsize) + \
-          (mp)->m_sb.sb_sectsize + \
-          XFS_ALLOCFREE_LOG_RES(mp, 2) + \
-          (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
 #define XFS_REMOVE_LOG_RES(mp)  ((mp)->m_reservations.tr_remove)
-/*
- * For symlink we can modify:
- *    the parent directory inode: inode size
- *    the new inode: inode size
- *    the inode btree entry: 1 block
- *    the directory btree: (max depth + v2) * dir block size
- *    the directory inode's bmap btree: (max depth + v2) * block size
- *    the blocks for the symlink: 1 kB
- * Or in the first xact we allocate some inodes giving:
- *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
- *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
- */
-#define XFS_CALC_SYMLINK_LOG_RES(mp)            \
-        (MAX( \
-         ((mp)->m_sb.sb_inodesize + \
-          (mp)->m_sb.sb_inodesize + \
-          XFS_FSB_TO_B(mp, 1) + \
-          XFS_DIROP_LOG_RES(mp) + \
-          1024 + \
-          (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \
-         (2 * (mp)->m_sb.sb_sectsize + \
-          XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
-          XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
-          XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-          (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
-           XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
 #define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink)
-/*
- * For create we can modify:
- *    the parent directory inode: inode size
- *    the new inode: inode size
- *    the inode btree entry: block size
- *    the superblock for the nlink flag: sector size
- *    the directory btree: (max depth + v2) * dir block size
- *    the directory inode's bmap btree: (max depth + v2) * block size
- * Or in the first xact we allocate some inodes giving:
- *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
- *    the superblock for the nlink flag: sector size
- *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-#define XFS_CALC_CREATE_LOG_RES(mp)             \
-        (MAX( \
-         ((mp)->m_sb.sb_inodesize + \
-          (mp)->m_sb.sb_inodesize + \
-          (mp)->m_sb.sb_sectsize + \
-          XFS_FSB_TO_B(mp, 1) + \
-          XFS_DIROP_LOG_RES(mp) + \
-          (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \
-         (3 * (mp)->m_sb.sb_sectsize + \
-          XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
-          XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
-          XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-          (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
-           XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
 #define XFS_CREATE_LOG_RES(mp)  ((mp)->m_reservations.tr_create)
-/*
- * Making a new directory is the same as creating a new file.
- */
-#define XFS_CALC_MKDIR_LOG_RES(mp)      XFS_CALC_CREATE_LOG_RES(mp)
 #define XFS_MKDIR_LOG_RES(mp)   ((mp)->m_reservations.tr_mkdir)
-/*
- * In freeing an inode we can modify:
- *    the inode being freed: inode size
- *    the super block free inode counter: sector size
- *    the agi hash list and counters: sector size
- *    the inode btree entry: block size
- *    the on disk inode before ours in the agi hash list: inode cluster size
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-#define XFS_CALC_IFREE_LOG_RES(mp) \
-        ((mp)->m_sb.sb_inodesize + \
-         (mp)->m_sb.sb_sectsize + \
-         (mp)->m_sb.sb_sectsize + \
-         XFS_FSB_TO_B((mp), 1) + \
-         MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \
-         (128 * 5) + \
-          XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-          (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
-           XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
 #define XFS_IFREE_LOG_RES(mp)   ((mp)->m_reservations.tr_ifree)
-/*
- * When only changing the inode we log the inode and possibly the superblock
- * We also add a bit of slop for the transaction stuff.
- */
-#define XFS_CALC_ICHANGE_LOG_RES(mp)    ((mp)->m_sb.sb_inodesize + \
-                                         (mp)->m_sb.sb_sectsize + 512)
 #define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange)
-/*
- * Growing the data section of the filesystem.
- *      superblock
- *      agi and agf
- *      allocation btrees
- */
-#define XFS_CALC_GROWDATA_LOG_RES(mp) \
-        ((mp)->m_sb.sb_sectsize * 3 + \
-         XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-         (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
 #define XFS_GROWDATA_LOG_RES(mp)    ((mp)->m_reservations.tr_growdata)
-/*
- * Growing the rt section of the filesystem.
- * In the first set of transactions (ALLOC) we allocate space to the
- * bitmap or summary files.
- *      superblock: sector size
- *      agf of the ag from which the extent is allocated: sector size
- *      bmap btree for bitmap/summary inode: max depth * blocksize
- *      bitmap/summary inode: inode size
- *      allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
- */
-#define XFS_CALC_GROWRTALLOC_LOG_RES(mp) \
-        (2 * (mp)->m_sb.sb_sectsize + \
-         XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
-         (mp)->m_sb.sb_inodesize + \
-         XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-         (128 * \
-          (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + \
-           XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
 #define XFS_GROWRTALLOC_LOG_RES(mp)     ((mp)->m_reservations.tr_growrtalloc)
-/*
- * Growing the rt section of the filesystem.
- * In the second set of transactions (ZERO) we zero the new metadata blocks.
- *      one bitmap/summary block: blocksize
- */
-#define XFS_CALC_GROWRTZERO_LOG_RES(mp) \
-        ((mp)->m_sb.sb_blocksize + 128)
 #define XFS_GROWRTZERO_LOG_RES(mp)      ((mp)->m_reservations.tr_growrtzero)
-/*
- * Growing the rt section of the filesystem.
- * In the third set of transactions (FREE) we update metadata without
- * allocating any new blocks.
- *      superblock: sector size
- *      bitmap inode: inode size
- *      summary inode: inode size
- *      one bitmap block: blocksize
- *      summary blocks: new summary size
- */
-#define XFS_CALC_GROWRTFREE_LOG_RES(mp) \
-        ((mp)->m_sb.sb_sectsize + \
-         2 * (mp)->m_sb.sb_inodesize + \
-         (mp)->m_sb.sb_blocksize + \
-         (mp)->m_rsumsize + \
-         (128 * 5))
 #define XFS_GROWRTFREE_LOG_RES(mp)      ((mp)->m_reservations.tr_growrtfree)
-/*
- * Logging the inode modification timestamp on a synchronous write.
- *      inode
- */
-#define XFS_CALC_SWRITE_LOG_RES(mp) \
-        ((mp)->m_sb.sb_inodesize + 128)
 #define XFS_SWRITE_LOG_RES(mp)  ((mp)->m_reservations.tr_swrite)
 /*
 * Logging the inode timestamps on an fsync -- same as SWRITE
 * as long as SWRITE logs the entire inode core
 */
 #define XFS_FSYNC_TS_LOG_RES(mp)        ((mp)->m_reservations.tr_swrite)
-/*
- * Logging the inode mode bits when writing a setuid/setgid file
- *      inode
- */
-#define XFS_CALC_WRITEID_LOG_RES(mp) \
-        ((mp)->m_sb.sb_inodesize + 128)
 #define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
-/*
- * Converting the inode from non-attributed to attributed.
- *      the inode being converted: inode size
- *      agf block and superblock (for block allocation)
- *      the new block (directory sized)
- *      bmap blocks for the new directory block
- *      allocation btrees
- */
-#define XFS_CALC_ADDAFORK_LOG_RES(mp)   \
-        ((mp)->m_sb.sb_inodesize + \
-         (mp)->m_sb.sb_sectsize * 2 + \
-         (mp)->m_dirblksize + \
-         XFS_FSB_TO_B(mp, (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1)) + \
-         XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-         (128 * (4 + (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + \
-                 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
 #define XFS_ADDAFORK_LOG_RES(mp)        ((mp)->m_reservations.tr_addafork)
-/*
- * Removing the attribute fork of a file
- *    the inode being truncated: inode size
- *    the inode's bmap btree: max depth * block size
- * And the bmap_finish transaction can free the blocks and bmap blocks:
- *    the agf for each of the ags: 4 * sector size
- *    the agfl for each of the ags: 4 * sector size
- *    the super block to reflect the freed blocks: sector size
- *    worst case split in allocation btrees per extent assuming 4 extents:
- *              4 exts * 2 trees * (2 * max depth - 1) * block size
- */
-#define XFS_CALC_ATTRINVAL_LOG_RES(mp)  \
-        (MAX( \
-         ((mp)->m_sb.sb_inodesize + \
-          XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
-          (128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))), \
-         ((4 * (mp)->m_sb.sb_sectsize) + \
-          (4 * (mp)->m_sb.sb_sectsize) + \
-          (mp)->m_sb.sb_sectsize + \
-          XFS_ALLOCFREE_LOG_RES(mp, 4) + \
-          (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))))))
 #define XFS_ATTRINVAL_LOG_RES(mp)       ((mp)->m_reservations.tr_attrinval)
-/*
- * Setting an attribute.
- *      the inode getting the attribute
- *      the superblock for allocations
- *      the agfs extents are allocated from
- *      the attribute btree * max depth
- *      the inode allocation btree
- * Since attribute transaction space is dependent on the size of the attribute,
- * the calculation is done partially at mount time and partially at runtime.
- */
-#define XFS_CALC_ATTRSET_LOG_RES(mp)    \
-        ((mp)->m_sb.sb_inodesize + \
-         (mp)->m_sb.sb_sectsize + \
-          XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
-          (128 * (2 + XFS_DA_NODE_MAXDEPTH)))
 #define XFS_ATTRSET_LOG_RES(mp, ext)    \
        ((mp)->m_reservations.tr_attrset + \
         (ext * (mp)->m_sb.sb_sectsize) + \
         (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \
         (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))))
-/*
- * Removing an attribute.
- *    the inode: inode size
- *    the attribute btree could join: max depth * block size
- *    the inode bmap btree could join or split: max depth * block size
- * And the bmap_finish transaction can free the attr blocks freed giving:
- *    the agf for the ag in which the blocks live: 2 * sector size
- *    the agfl for the ag in which the blocks live: 2 * sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-#define XFS_CALC_ATTRRM_LOG_RES(mp)     \
-        (MAX( \
-          ((mp)->m_sb.sb_inodesize + \
-          XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
-          XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
-          (128 * (1 + XFS_DA_NODE_MAXDEPTH + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
-         ((2 * (mp)->m_sb.sb_sectsize) + \
-          (2 * (mp)->m_sb.sb_sectsize) + \
-          (mp)->m_sb.sb_sectsize + \
-          XFS_ALLOCFREE_LOG_RES(mp, 2) + \
-          (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
 #define XFS_ATTRRM_LOG_RES(mp)  ((mp)->m_reservations.tr_attrrm)
-/*
- * Clearing a bad agino number in an agi hash bucket.
- */
-#define XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp) \
-        ((mp)->m_sb.sb_sectsize + 128)
 #define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp)  ((mp)->m_reservations.tr_clearagi)
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 9d376be0ea38..a06bd62504fc 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -267,7 +267,7 @@ xfs_setattr(
                if (code) {
                        ASSERT(tp == NULL);
                        lock_flags &= ~XFS_ILOCK_EXCL;
-                        ASSERT(lock_flags == XFS_IOLOCK_EXCL);
+                        ASSERT(lock_flags == XFS_IOLOCK_EXCL || !need_iolock);
                        goto error_return;
                }
                tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);