Diffstat (limited to 'fs')
-rw-r--r--  fs/afs/server.c | 5
-rw-r--r--  fs/binfmt_elf_fdpic.c | 26
-rw-r--r--  fs/binfmt_flat.c | 25
-rw-r--r--  fs/block_dev.c | 72
-rw-r--r--  fs/btrfs/acl.c | 8
-rw-r--r--  fs/btrfs/disk-io.c | 11
-rw-r--r--  fs/btrfs/extent-tree.c | 3
-rw-r--r--  fs/btrfs/file.c | 12
-rw-r--r--  fs/btrfs/inode.c | 4
-rw-r--r--  fs/btrfs/ioctl.c | 4
-rw-r--r--  fs/btrfs/relocation.c | 7
-rw-r--r--  fs/btrfs/root-tree.c | 3
-rw-r--r--  fs/btrfs/super.c | 6
-rw-r--r--  fs/ceph/caps.c | 93
-rw-r--r--  fs/ceph/inode.c | 2
-rw-r--r--  fs/ceph/mds_client.c | 28
-rw-r--r--  fs/ceph/mds_client.h | 6
-rw-r--r--  fs/ceph/mon_client.c | 2
-rw-r--r--  fs/ceph/super.c | 4
-rw-r--r--  fs/cifs/file.c | 1
-rw-r--r--  fs/compat.c | 2
-rw-r--r--  fs/configfs/inode.c | 9
-rw-r--r--  fs/ext2/inode.c | 2
-rw-r--r--  fs/ext4/inode.c | 40
-rw-r--r--  fs/ext4/move_extent.c | 3
-rw-r--r--  fs/fcntl.c | 7
-rw-r--r--  fs/fs-writeback.c | 64
-rw-r--r--  fs/fscache/page.c | 36
-rw-r--r--  fs/jffs2/acl.c | 3
-rw-r--r--  fs/jffs2/dir.c | 127
-rw-r--r--  fs/jffs2/fs.c | 7
-rw-r--r--  fs/libfs.c | 3
-rw-r--r--  fs/minix/dir.c | 4
-rw-r--r--  fs/nfsd/nfs4state.c | 2
-rw-r--r--  fs/nfsd/vfs.c | 3
-rw-r--r--  fs/nilfs2/btree.h | 2
-rw-r--r--  fs/nilfs2/segbuf.h | 2
-rw-r--r--  fs/nilfs2/segment.h | 2
-rw-r--r--  fs/nilfs2/super.c | 8
-rw-r--r--  fs/pipe.c | 93
-rw-r--r--  fs/splice.c | 2
-rw-r--r--  fs/sync.c | 2
-rw-r--r--  fs/sysfs/inode.c | 6
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 23
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 16
-rw-r--r--  fs/xfs/linux-2.6/xfs_quotaops.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 9
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.h | 356
-rw-r--r--  fs/xfs/quota/xfs_qm.c | 4
-rw-r--r--  fs/xfs/xfs_ag.h | 1
-rw-r--r--  fs/xfs/xfs_iget.c | 29
-rw-r--r--  fs/xfs/xfs_inode.c | 144
-rw-r--r--  fs/xfs/xfs_log_recover.c | 11
-rw-r--r--  fs/xfs/xfs_mount.c | 68
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 4
-rw-r--r--  fs/xfs/xfs_rtalloc.h | 11
-rw-r--r--  fs/xfs/xfs_trans.c | 446
-rw-r--r--  fs/xfs/xfs_trans.h | 411
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 2
60 files changed, 1162 insertions(+), 1126 deletions(-)
diff --git a/fs/afs/server.c b/fs/afs/server.c
index f49099516675..9fdc7fe3a7bc 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -91,9 +91,10 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
 
         memcpy(&server->addr, addr, sizeof(struct in_addr));
         server->addr.s_addr = addr->s_addr;
+        _leave(" = %p{%d}", server, atomic_read(&server->usage));
+    } else {
+        _leave(" = NULL [nomem]");
     }
-
-    _leave(" = %p{%d}", server, atomic_read(&server->usage));
     return server;
 }
 
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 2c5f9a0e5d72..63039ed9576f 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -990,10 +990,9 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
 
    /* clear any space allocated but not loaded */
    if (phdr->p_filesz < phdr->p_memsz) {
-        ret = clear_user((void *) (seg->addr + phdr->p_filesz),
-                 phdr->p_memsz - phdr->p_filesz);
-        if (ret)
-            return ret;
+        if (clear_user((void *) (seg->addr + phdr->p_filesz),
+                   phdr->p_memsz - phdr->p_filesz))
+            return -EFAULT;
    }
 
    if (mm) {
@@ -1027,7 +1026,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
    struct elf32_fdpic_loadseg *seg;
    struct elf32_phdr *phdr;
    unsigned long load_addr, delta_vaddr;
-    int loop, dvset, ret;
+    int loop, dvset;
 
    load_addr = params->load_addr;
    delta_vaddr = 0;
@@ -1127,9 +1126,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
     * PT_LOAD */
    if (prot & PROT_WRITE && disp > 0) {
        kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp);
-        ret = clear_user((void __user *) maddr, disp);
-        if (ret)
-            return ret;
+        if (clear_user((void __user *) maddr, disp))
+            return -EFAULT;
        maddr += disp;
    }
 
@@ -1164,19 +1162,17 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
    if (prot & PROT_WRITE && excess1 > 0) {
        kdebug("clear[%d] ad=%lx sz=%lx",
               loop, maddr + phdr->p_filesz, excess1);
-        ret = clear_user((void __user *) maddr + phdr->p_filesz,
-                 excess1);
-        if (ret)
-            return ret;
+        if (clear_user((void __user *) maddr + phdr->p_filesz,
+                   excess1))
+            return -EFAULT;
    }
 
 #else
    if (excess > 0) {
        kdebug("clear[%d] ad=%lx sz=%lx",
               loop, maddr + phdr->p_filesz, excess);
-        ret = clear_user((void *) maddr + phdr->p_filesz, excess);
-        if (ret)
-            return ret;
+        if (clear_user((void *) maddr + phdr->p_filesz, excess))
+            return -EFAULT;
    }
 #endif
 
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 49566c1687d8..b6ab27ccf214 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -56,15 +56,22 @@
 #endif
 
 /*
- * User data (stack, data section and bss) needs to be aligned
- * for the same reasons as SLAB memory is, and to the same amount.
- * Avoid duplicating architecture specific code by using the same
- * macro as with SLAB allocation:
+ * User data (data section and bss) needs to be aligned.
+ * We pick 0x20 here because it is the max value elf2flt has always
+ * used in producing FLAT files, and because it seems to be large
+ * enough to make all the gcc alignment related tests happy.
+ */
+#define FLAT_DATA_ALIGN (0x20)
+
+/*
+ * User data (stack) also needs to be aligned.
+ * Here we can be a bit looser than the data sections since this
+ * needs to only meet arch ABI requirements.
  */
 #ifdef ARCH_SLAB_MINALIGN
-#define FLAT_DATA_ALIGN (ARCH_SLAB_MINALIGN)
+#define FLAT_STACK_ALIGN (ARCH_SLAB_MINALIGN)
 #else
-#define FLAT_DATA_ALIGN (sizeof(void *))
+#define FLAT_STACK_ALIGN (sizeof(void *))
 #endif
 
 #define RELOC_FAILED 0xff00ff01 /* Relocation incorrect somewhere */
@@ -129,7 +136,7 @@ static unsigned long create_flat_tables(
 
    sp = (unsigned long *)p;
    sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
-    sp = (unsigned long *) ((unsigned long)sp & -FLAT_DATA_ALIGN);
+    sp = (unsigned long *) ((unsigned long)sp & -FLAT_STACK_ALIGN);
    argv = sp + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
    envp = argv + (argc + 1);
 
@@ -589,7 +596,7 @@ static int load_flat_file(struct linux_binprm * bprm,
        if (IS_ERR_VALUE(result)) {
            printk("Unable to read data+bss, errno %d\n", (int)-result);
            do_munmap(current->mm, textpos, text_len);
-            do_munmap(current->mm, realdatastart, data_len + extra);
+            do_munmap(current->mm, realdatastart, len);
            ret = result;
            goto err;
        }
@@ -876,7 +883,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
    stack_len = TOP_OF_ARGS - bprm->p;              /* the strings */
    stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */
    stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */
-    stack_len += FLAT_DATA_ALIGN - 1;  /* reserve for upcoming alignment */
+    stack_len += FLAT_STACK_ALIGN - 1;  /* reserve for upcoming alignment */
 
    res = load_flat_file(bprm, &libinfo, 0, &stack_len);
    if (IS_ERR_VALUE(res))
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7346c96308a5..99d6af811747 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -706,8 +706,13 @@ retry:
  * @bdev is about to be opened exclusively. Check @bdev can be opened
  * exclusively and mark that an exclusive open is in progress. Each
  * successful call to this function must be matched with a call to
- * either bd_claim() or bd_abort_claiming(). If this function
- * succeeds, the matching bd_claim() is guaranteed to succeed.
+ * either bd_finish_claiming() or bd_abort_claiming() (which do not
+ * fail).
+ *
+ * This function is used to gain exclusive access to the block device
+ * without actually causing other exclusive open attempts to fail. It
+ * should be used when the open sequence itself requires exclusive
+ * access but may subsequently fail.
  *
  * CONTEXT:
  * Might sleep.
@@ -734,6 +739,7 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,
        return ERR_PTR(-ENXIO);
 
    whole = bdget_disk(disk, 0);
+    module_put(disk->fops->owner);
    put_disk(disk);
    if (!whole)
        return ERR_PTR(-ENOMEM);
@@ -782,15 +788,46 @@ static void bd_abort_claiming(struct block_device *whole, void *holder)
    __bd_abort_claiming(whole, holder); /* releases bdev_lock */
 }
 
+/* increment holders when we have a legitimate claim. requires bdev_lock */
+static void __bd_claim(struct block_device *bdev, struct block_device *whole,
+            void *holder)
+{
+    /* note that for a whole device bd_holders
+     * will be incremented twice, and bd_holder will
+     * be set to bd_claim before being set to holder
+     */
+    whole->bd_holders++;
+    whole->bd_holder = bd_claim;
+    bdev->bd_holders++;
+    bdev->bd_holder = holder;
+}
+
+/**
+ * bd_finish_claiming - finish claiming a block device
+ * @bdev: block device of interest (passed to bd_start_claiming())
+ * @whole: whole block device returned by bd_start_claiming()
+ * @holder: holder trying to claim @bdev
+ *
+ * Finish a claiming block started by bd_start_claiming().
+ *
+ * CONTEXT:
+ * Grabs and releases bdev_lock.
+ */
+static void bd_finish_claiming(struct block_device *bdev,
+                struct block_device *whole, void *holder)
+{
+    spin_lock(&bdev_lock);
+    BUG_ON(!bd_may_claim(bdev, whole, holder));
+    __bd_claim(bdev, whole, holder);
+    __bd_abort_claiming(whole, holder); /* not actually an abort */
+}
+
 /**
  * bd_claim - claim a block device
  * @bdev: block device to claim
  * @holder: holder trying to claim @bdev
  *
- * Try to claim @bdev which must have been opened successfully. This
- * function may be called with or without preceding
- * blk_start_claiming(). In the former case, this function is always
- * successful and terminates the claiming block.
+ * Try to claim @bdev which must have been opened successfully.
  *
  * CONTEXT:
  * Might sleep.
@@ -806,23 +843,10 @@ int bd_claim(struct block_device *bdev, void *holder)
    might_sleep();
 
    spin_lock(&bdev_lock);
-
    res = bd_prepare_to_claim(bdev, whole, holder);
-    if (res == 0) {
-        /* note that for a whole device bd_holders
-         * will be incremented twice, and bd_holder will
-         * be set to bd_claim before being set to holder
-         */
-        whole->bd_holders++;
-        whole->bd_holder = bd_claim;
-        bdev->bd_holders++;
-        bdev->bd_holder = holder;
-    }
-
-    if (whole->bd_claiming)
-        __bd_abort_claiming(whole, holder); /* releases bdev_lock */
-    else
-        spin_unlock(&bdev_lock);
+    if (res == 0)
+        __bd_claim(bdev, whole, holder);
+    spin_unlock(&bdev_lock);
 
    return res;
 }
@@ -1476,7 +1500,7 @@ static int blkdev_open(struct inode * inode, struct file * filp)
 
    if (whole) {
        if (res == 0)
-            BUG_ON(bd_claim(bdev, filp) != 0);
+            bd_finish_claiming(bdev, whole, filp);
        else
            bd_abort_claiming(whole, filp);
    }
@@ -1712,7 +1736,7 @@ struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *h
    if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
        goto out_blkdev_put;
 
-    BUG_ON(bd_claim(bdev, holder) != 0);
+    bd_finish_claiming(bdev, whole, holder);
    return bdev;
 
 out_blkdev_put:
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 8d432cd9d580..2222d161c7b6 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -60,6 +60,8 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
        size = __btrfs_getxattr(inode, name, value, size);
        if (size > 0) {
            acl = posix_acl_from_xattr(value, size);
+            if (IS_ERR(acl))
+                return acl;
            set_cached_acl(inode, type, acl);
        }
        kfree(value);
@@ -160,6 +162,12 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
    int ret;
    struct posix_acl *acl = NULL;
 
+    if (!is_owner_or_cap(dentry->d_inode))
+        return -EPERM;
+
+    if (!IS_POSIXACL(dentry->d_inode))
+        return -EOPNOTSUPP;
+
    if (value) {
        acl = posix_acl_from_xattr(value, size);
        if (acl == NULL) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f3b287c22caf..34f7c375567e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1941,8 +1941,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
            btrfs_level_size(tree_root,
                     btrfs_super_log_root_level(disk_super));
 
-        log_tree_root = kzalloc(sizeof(struct btrfs_root),
-                    GFP_NOFS);
+        log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+        if (!log_tree_root) {
+            err = -ENOMEM;
+            goto fail_trans_kthread;
+        }
 
        __setup_root(nodesize, leafsize, sectorsize, stripesize,
                 log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
@@ -1982,6 +1985,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
    fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
    if (!fs_info->fs_root)
        goto fail_trans_kthread;
+    if (IS_ERR(fs_info->fs_root)) {
+        err = PTR_ERR(fs_info->fs_root);
+        goto fail_trans_kthread;
+    }
 
    if (!(sb->s_flags & MS_RDONLY)) {
        down_read(&fs_info->cleanup_work_sem);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b9080d71991a..32d094002a57 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4360,7 +4360,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 
    block_rsv = get_block_rsv(trans, root);
    cache = btrfs_lookup_block_group(root->fs_info, buf->start);
-    BUG_ON(block_rsv->space_info != cache->space_info);
+    if (block_rsv->space_info != cache->space_info)
+        goto out;
 
    if (btrfs_header_generation(buf) == trans->transid) {
        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 787b50a16a14..e354c33df082 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1140,7 +1140,7 @@ int btrfs_sync_file(struct file *file, int datasync)
    /*
     * ok we haven't committed the transaction yet, lets do a commit
     */
-    if (file && file->private_data)
+    if (file->private_data)
        btrfs_ioctl_trans_end(file);
 
    trans = btrfs_start_transaction(root, 0);
@@ -1190,14 +1190,22 @@ static const struct vm_operations_struct btrfs_file_vm_ops = {
 
 static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
 {
-    vma->vm_ops = &btrfs_file_vm_ops;
+    struct address_space *mapping = filp->f_mapping;
+
+    if (!mapping->a_ops->readpage)
+        return -ENOEXEC;
+
    file_accessed(filp);
+    vma->vm_ops = &btrfs_file_vm_ops;
+    vma->vm_flags |= VM_CAN_NONLINEAR;
+
    return 0;
 }
 
 const struct file_operations btrfs_file_operations = {
    .llseek = generic_file_llseek,
    .read = do_sync_read,
+    .write = do_sync_write,
    .aio_read = generic_file_aio_read,
    .splice_read = generic_file_splice_read,
    .aio_write = btrfs_file_aio_write,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index fa6ccc1bfe2a..1bff92ad4744 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2673,7 +2673,7 @@ static int check_path_shared(struct btrfs_root *root,
    struct extent_buffer *eb;
    int level;
    int ret;
-    u64 refs;
+    u64 refs = 1;
 
    for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
        if (!path->nodes[level])
@@ -6884,7 +6884,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
        if (em->block_start == EXTENT_MAP_HOLE ||
            (cur_offset >= inode->i_size &&
             !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-            ret = btrfs_prealloc_file_range(inode, 0, cur_offset,
+            ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
                    last_byte - cur_offset,
                    1 << inode->i_blkbits,
                    offset + len,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4cdb98cf26de..4dbaf89b1337 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1280,7 +1280,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
    trans = btrfs_start_transaction(root, 0);
    if (IS_ERR(trans)) {
        err = PTR_ERR(trans);
-        goto out;
+        goto out_up_write;
    }
    trans->block_rsv = &root->fs_info->global_block_rsv;
 
@@ -1845,7 +1845,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
    dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
    di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
                   dir_id, "default", 7, 1);
-    if (!di) {
+    if (IS_ERR_OR_NULL(di)) {
        btrfs_free_path(path);
        btrfs_end_transaction(trans, root);
        printk(KERN_ERR "Umm, you don't have the default dir item, "
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 05d41e569236..b37d723b9d4a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -784,16 +784,17 @@ again:
            struct btrfs_extent_ref_v0 *ref0;
            ref0 = btrfs_item_ptr(eb, path1->slots[0],
                    struct btrfs_extent_ref_v0);
-            root = find_tree_root(rc, eb, ref0);
-            if (!root->ref_cows)
-                cur->cowonly = 1;
            if (key.objectid == key.offset) {
+                root = find_tree_root(rc, eb, ref0);
                if (root && !should_ignore_root(root))
                    cur->root = root;
                else
                    list_add(&cur->list, &useless);
                break;
            }
+            if (is_cowonly_root(btrfs_ref_root_v0(eb,
+                                  ref0)))
+                cur->cowonly = 1;
        }
 #else
        BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index b91ccd972644..2d958be761c8 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -330,7 +330,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 {
    struct btrfs_path *path;
    int ret;
-    u32 refs;
    struct btrfs_root_item *ri;
    struct extent_buffer *leaf;
 
@@ -344,8 +343,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
    leaf = path->nodes[0];
    ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
 
-    refs = btrfs_disk_root_refs(leaf, ri);
-    BUG_ON(refs != 0);
    ret = btrfs_del_item(trans, root, path);
 out:
    btrfs_free_path(path);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d34b2dfc9628..f2393b390318 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -360,6 +360,8 @@ static struct dentry *get_default_root(struct super_block *sb,
     */
    dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
    di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
+    if (IS_ERR(di))
+        return ERR_CAST(di);
    if (!di) {
        /*
         * Ok the default dir item isn't there. This is weird since
@@ -390,8 +392,8 @@ setup_root:
    location.offset = 0;
 
    inode = btrfs_iget(sb, &location, new_root, &new);
-    if (!inode)
-        return ERR_PTR(-ENOMEM);
+    if (IS_ERR(inode))
+        return ERR_CAST(inode);
 
    /*
     * If we're just mounting the root most subvol put the inode and return
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index ae3e3a306445..619b61655ee5 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -981,6 +981,46 @@ static int send_cap_msg(struct ceph_mds_session *session,
    return 0;
 }
 
+static void __queue_cap_release(struct ceph_mds_session *session,
+                u64 ino, u64 cap_id, u32 migrate_seq,
+                u32 issue_seq)
+{
+    struct ceph_msg *msg;
+    struct ceph_mds_cap_release *head;
+    struct ceph_mds_cap_item *item;
+
+    spin_lock(&session->s_cap_lock);
+    BUG_ON(!session->s_num_cap_releases);
+    msg = list_first_entry(&session->s_cap_releases,
+                   struct ceph_msg, list_head);
+
+    dout(" adding %llx release to mds%d msg %p (%d left)\n",
+         ino, session->s_mds, msg, session->s_num_cap_releases);
+
+    BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
+    head = msg->front.iov_base;
+    head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
+    item = msg->front.iov_base + msg->front.iov_len;
+    item->ino = cpu_to_le64(ino);
+    item->cap_id = cpu_to_le64(cap_id);
+    item->migrate_seq = cpu_to_le32(migrate_seq);
+    item->seq = cpu_to_le32(issue_seq);
+
+    session->s_num_cap_releases--;
+
+    msg->front.iov_len += sizeof(*item);
+    if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
+        dout(" release msg %p full\n", msg);
+        list_move_tail(&msg->list_head, &session->s_cap_releases_done);
+    } else {
+        dout(" release msg %p at %d/%d (%d)\n", msg,
+             (int)le32_to_cpu(head->num),
+             (int)CEPH_CAPS_PER_RELEASE,
+             (int)msg->front.iov_len);
+    }
+    spin_unlock(&session->s_cap_lock);
+}
+
 /*
  * Queue cap releases when an inode is dropped from our cache. Since
  * inode is about to be destroyed, there is no need for i_lock.
@@ -994,41 +1034,9 @@ void ceph_queue_caps_release(struct inode *inode)
    while (p) {
        struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
        struct ceph_mds_session *session = cap->session;
-        struct ceph_msg *msg;
-        struct ceph_mds_cap_release *head;
-        struct ceph_mds_cap_item *item;
 
-        spin_lock(&session->s_cap_lock);
-        BUG_ON(!session->s_num_cap_releases);
-        msg = list_first_entry(&session->s_cap_releases,
-                       struct ceph_msg, list_head);
-
-        dout(" adding %p release to mds%d msg %p (%d left)\n",
-             inode, session->s_mds, msg, session->s_num_cap_releases);
-
-        BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
-        head = msg->front.iov_base;
-        head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
-        item = msg->front.iov_base + msg->front.iov_len;
-        item->ino = cpu_to_le64(ceph_ino(inode));
-        item->cap_id = cpu_to_le64(cap->cap_id);
-        item->migrate_seq = cpu_to_le32(cap->mseq);
-        item->seq = cpu_to_le32(cap->issue_seq);
-
-        session->s_num_cap_releases--;
-
-        msg->front.iov_len += sizeof(*item);
-        if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
-            dout(" release msg %p full\n", msg);
-            list_move_tail(&msg->list_head,
-                       &session->s_cap_releases_done);
-        } else {
-            dout(" release msg %p at %d/%d (%d)\n", msg,
-                 (int)le32_to_cpu(head->num),
-                 (int)CEPH_CAPS_PER_RELEASE,
-                 (int)msg->front.iov_len);
-        }
-        spin_unlock(&session->s_cap_lock);
+        __queue_cap_release(session, ceph_ino(inode), cap->cap_id,
+                    cap->mseq, cap->issue_seq);
        p = rb_next(p);
        __ceph_remove_cap(cap);
    }
@@ -2655,7 +2663,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
    struct ceph_mds_caps *h;
    int mds = session->s_mds;
    int op;
-    u32 seq;
+    u32 seq, mseq;
    struct ceph_vino vino;
    u64 cap_id;
    u64 size, max_size;
@@ -2675,6 +2683,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
    vino.snap = CEPH_NOSNAP;
    cap_id = le64_to_cpu(h->cap_id);
    seq = le32_to_cpu(h->seq);
+    mseq = le32_to_cpu(h->migrate_seq);
    size = le64_to_cpu(h->size);
    max_size = le64_to_cpu(h->max_size);
 
@@ -2689,6 +2698,18 @@ void ceph_handle_caps(struct ceph_mds_session *session,
         vino.snap, inode);
    if (!inode) {
        dout(" i don't have ino %llx\n", vino.ino);
+
+        if (op == CEPH_CAP_OP_IMPORT)
+            __queue_cap_release(session, vino.ino, cap_id,
+                        mseq, seq);
+
+        /*
+         * send any full release message to try to move things
+         * along for the mds (who clearly thinks we still have this
+         * cap).
+         */
+        ceph_add_cap_releases(mdsc, session, -1);
+        ceph_send_cap_releases(mdsc, session);
        goto done;
    }
 
@@ -2714,7 +2735,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
    spin_lock(&inode->i_lock);
    cap = __get_cap_for_mds(ceph_inode(inode), mds);
    if (!cap) {
-        dout("no cap on %p ino %llx.%llx from mds%d, releasing\n",
+        dout(" no cap on %p ino %llx.%llx from mds%d\n",
             inode, ceph_ino(inode), ceph_snap(inode), mds);
        spin_unlock(&inode->i_lock);
        goto done;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 226f5a50d362..ab47f46ca282 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -827,7 +827,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)
 
    spin_lock(&dcache_lock);
    spin_lock(&dn->d_lock);
-    list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
+    list_move(&dn->d_u.d_child, &dir->d_subdirs);
    dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
         dn->d_u.d_child.prev, dn->d_u.d_child.next);
    spin_unlock(&dn->d_lock);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index b49f12822cbc..1766947fc07a 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1066,9 +1066,9 @@ static int trim_caps(struct ceph_mds_client *mdsc,
  *
  * Called under s_mutex.
  */
-static int add_cap_releases(struct ceph_mds_client *mdsc,
-                struct ceph_mds_session *session,
-                int extra)
+int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
+              struct ceph_mds_session *session,
+              int extra)
 {
    struct ceph_msg *msg;
    struct ceph_mds_cap_release *head;
@@ -1176,8 +1176,8 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
 /*
  * called under s_mutex
  */
-static void send_cap_releases(struct ceph_mds_client *mdsc,
-                  struct ceph_mds_session *session)
+void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
+                struct ceph_mds_session *session)
 {
    struct ceph_msg *msg;
 
@@ -1980,7 +1980,7 @@ out_err:
    }
    mutex_unlock(&mdsc->mutex);
 
-    add_cap_releases(mdsc, req->r_session, -1);
+    ceph_add_cap_releases(mdsc, req->r_session, -1);
    mutex_unlock(&session->s_mutex);
 
    /* kick calling process */
@@ -2433,6 +2433,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
    struct ceph_dentry_info *di;
    int mds = session->s_mds;
    struct ceph_mds_lease *h = msg->front.iov_base;
+    u32 seq;
    struct ceph_vino vino;
    int mask;
    struct qstr dname;
@@ -2446,6 +2447,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
    vino.ino = le64_to_cpu(h->ino);
    vino.snap = CEPH_NOSNAP;
    mask = le16_to_cpu(h->mask);
+    seq = le32_to_cpu(h->seq);
    dname.name = (void *)h + sizeof(*h) + sizeof(u32);
    dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
    if (dname.len != get_unaligned_le32(h+1))
@@ -2456,8 +2458,9 @@ static void handle_lease(struct ceph_mds_client *mdsc,
 
    /* lookup inode */
    inode = ceph_find_inode(sb, vino);
-    dout("handle_lease '%s', mask %d, ino %llx %p\n",
-         ceph_lease_op_name(h->action), mask, vino.ino, inode);
+    dout("handle_lease %s, mask %d, ino %llx %p %.*s\n",
+         ceph_lease_op_name(h->action), mask, vino.ino, inode,
+         dname.len, dname.name);
    if (inode == NULL) {
        dout("handle_lease no inode %llx\n", vino.ino);
        goto release;
@@ -2482,7 +2485,8 @@ static void handle_lease(struct ceph_mds_client *mdsc,
    switch (h->action) {
    case CEPH_MDS_LEASE_REVOKE:
        if (di && di->lease_session == session) {
-            h->seq = cpu_to_le32(di->lease_seq);
+            if (ceph_seq_cmp(di->lease_seq, seq) > 0)
+                h->seq = cpu_to_le32(di->lease_seq);
            __ceph_mdsc_drop_dentry_lease(dentry);
        }
        release = 1;
@@ -2496,7 +2500,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
            unsigned long duration =
                le32_to_cpu(h->duration_ms) * HZ / 1000;
 
-            di->lease_seq = le32_to_cpu(h->seq);
+            di->lease_seq = seq;
            dentry->d_time = di->lease_renew_from + duration;
            di->lease_renew_after = di->lease_renew_from +
                (duration >> 1);
@@ -2686,10 +2690,10 @@ static void delayed_work(struct work_struct *work)
            send_renew_caps(mdsc, s);
        else
            ceph_con_keepalive(&s->s_con);
-        add_cap_releases(mdsc, s, -1);
+        ceph_add_cap_releases(mdsc, s, -1);
        if (s->s_state == CEPH_MDS_SESSION_OPEN ||
            s->s_state == CEPH_MDS_SESSION_HUNG)
-            send_cap_releases(mdsc, s);
+            ceph_send_cap_releases(mdsc, s);
        mutex_unlock(&s->s_mutex);
        ceph_put_mds_session(s);
 
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index d9936c4f1212..b292fa42a66d 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -322,6 +322,12 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
    kref_put(&req->r_kref, ceph_mdsc_release_request);
 }
 
+extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
+                 struct ceph_mds_session *session,
+                 int extra);
+extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
+                   struct ceph_mds_session *session);
+
 extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
 
 extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 21c62e9b7d1d..07a539906e67 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -400,6 +400,8 @@ static void release_generic_request(struct kref *kref)
        ceph_msg_put(req->reply);
    if (req->request)
        ceph_msg_put(req->request);
+
+    kfree(req);
 }
 
 static void put_generic_request(struct ceph_mon_generic_request *req)
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 4e0bee240b9d..fa87f51e38e1 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -89,7 +89,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 
    buf->f_files = le64_to_cpu(st.num_objects);
    buf->f_ffree = -1;
-    buf->f_namelen = PATH_MAX;
+    buf->f_namelen = NAME_MAX;
    buf->f_frsize = PAGE_CACHE_SIZE;
 
    /* leave fsid little-endian, regardless of host endianness */
@@ -926,7 +926,7 @@ static int ceph_compare_super(struct super_block *sb, void *data)
 /*
  * construct our own bdi so we can control readahead, etc.
  */
-static atomic_long_t bdi_seq = ATOMIC_INIT(0);
+static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
 
 static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
 {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index f1ff785b2292..75541af4b3db 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1952,6 +1952,7 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
            bytes_read -= PAGE_CACHE_SIZE;
            continue;
        }
+        page_cache_release(page);
 
        target = kmap_atomic(page, KM_USER0);
 
diff --git a/fs/compat.c b/fs/compat.c
index f0b391c50552..6490d2134ff3 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -626,7 +626,7 @@ ssize_t compat_rw_copy_check_uvector(int type,
        tot_len += len;
        if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
            goto out;
-        if (!access_ok(vrfy_dir(type), buf, len)) {
+        if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
            ret = -EFAULT;
            goto out;
        }
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 41645142b88b..cf78d44a8d6a 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -72,10 +72,6 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
    if (!sd)
        return -EINVAL;
 
-    error = simple_setattr(dentry, iattr);
-    if (error)
-        return error;
-
    sd_iattr = sd->s_iattr;
    if (!sd_iattr) {
        /* setting attributes for the first time, allocate now */
@@ -89,9 +85,12 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
        sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME;
        sd->s_iattr = sd_iattr;
    }
-
    /* attributes were changed atleast once in past */
 
+    error = simple_setattr(dentry, iattr);
+    if (error)
+        return error;
+
    if (ia_valid & ATTR_UID)
        sd_iattr->ia_uid = iattr->ia_uid;
    if (ia_valid & ATTR_GID)
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 19214435b752..3675088cb88c 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1552,7 +1552,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
        if (error)
            return error;
    }
-    if (iattr->ia_valid & ATTR_SIZE) {
+    if (iattr->ia_valid & ATTR_SIZE && iattr->ia_size != inode->i_size) {
        error = ext2_setsize(inode, iattr->ia_size);
        if (error)
            return error;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 19df61c321fd..42272d67955a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4942,20 +4942,26 @@ void ext4_set_inode_flags(struct inode *inode)
 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
 void ext4_get_inode_flags(struct ext4_inode_info *ei)
 {
-    unsigned int flags = ei->vfs_inode.i_flags;
-
-    ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
-            EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL);
-    if (flags & S_SYNC)
-        ei->i_flags |= EXT4_SYNC_FL;
-    if (flags & S_APPEND)
-        ei->i_flags |= EXT4_APPEND_FL;
-    if (flags & S_IMMUTABLE)
-        ei->i_flags |= EXT4_IMMUTABLE_FL;
-    if (flags & S_NOATIME)
-        ei->i_flags |= EXT4_NOATIME_FL;
-    if (flags & S_DIRSYNC)
-        ei->i_flags |= EXT4_DIRSYNC_FL;
+    unsigned int vfs_fl;
+    unsigned long old_fl, new_fl;
+
+    do {
+        vfs_fl = ei->vfs_inode.i_flags;
+        old_fl = ei->i_flags;
+        new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
+                EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
+                EXT4_DIRSYNC_FL);
+        if (vfs_fl & S_SYNC)
+            new_fl |= EXT4_SYNC_FL;
+        if (vfs_fl & S_APPEND)
+            new_fl |= EXT4_APPEND_FL;
+        if (vfs_fl & S_IMMUTABLE)
+            new_fl |= EXT4_IMMUTABLE_FL;
+        if (vfs_fl & S_NOATIME)
+            new_fl |= EXT4_NOATIME_FL;
+        if (vfs_fl & S_DIRSYNC)
+            new_fl |= EXT4_DIRSYNC_FL;
+    } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
 }
 
 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
@@ -5191,7 +5197,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
         */
        raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
        raw_inode->i_blocks_high = 0;
-        ei->i_flags &= ~EXT4_HUGE_FILE_FL;
+        ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
        return 0;
    }
    if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
@@ -5204,9 +5210,9 @@ static int ext4_inode_blocks_set(handle_t *handle,
         */
        raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
        raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
-        ei->i_flags &= ~EXT4_HUGE_FILE_FL;
+        ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
    } else {
-        ei->i_flags |= EXT4_HUGE_FILE_FL;
+        ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
        /* i_block is stored in file system block size */
        i_blocks = i_blocks >> (inode->i_blkbits - 9);
        raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 3a6c92ac131c..52abfa12762a 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -960,6 +960,9 @@ mext_check_arguments(struct inode *orig_inode,
        return -EINVAL;
    }
 
+    if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
+        return -EPERM;
+
    /* Ext4 move extent does not support swapfile */
    if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
        ext4_debug("ext4 move extent: The argument files should "
diff --git a/fs/fcntl.c b/fs/fcntl.c
index f74d270ba155..51e11bf5708f 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -274,7 +274,7 @@ static int f_setown_ex(struct file *filp, unsigned long arg)
 
    ret = copy_from_user(&owner, owner_p, sizeof(owner));
    if (ret)
-        return ret;
+        return -EFAULT;
 
    switch (owner.type) {
    case F_OWNER_TID:
@@ -332,8 +332,11 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
    }
    read_unlock(&filp->f_owner.lock);
 
-    if (!ret)
+    if (!ret) {
        ret = copy_to_user(owner_p, &owner, sizeof(owner));
+        if (ret)
+            ret = -EFAULT;
+    }
    return ret;
 }
 
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ea8592b90696..1d1088f48bc2 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -45,7 +45,6 @@ struct wb_writeback_args {
    unsigned int for_kupdate:1;
    unsigned int range_cyclic:1;
    unsigned int for_background:1;
-    unsigned int sb_pinned:1;
 };
 
 /*
@@ -193,8 +192,7 @@ static void bdi_wait_on_work_clear(struct bdi_work *work)
 }
 
 static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
-                 struct wb_writeback_args *args,
-                 int wait)
+                 struct wb_writeback_args *args)
 {
    struct bdi_work *work;
 
@@ -206,8 +204,6 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
    if (work) {
        bdi_work_init(work, args);
        bdi_queue_work(bdi, work);
-        if (wait)
-            bdi_wait_on_work_clear(work);
    } else {
        struct bdi_writeback *wb = &bdi->wb;
 
@@ -234,11 +230,6 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
        .sync_mode = WB_SYNC_ALL,
        .nr_pages = LONG_MAX,
        .range_cyclic = 0,
-        /*
-         * Setting sb_pinned is not necessary for WB_SYNC_ALL, but
-         * lets make it explicitly clear.
-         */
-        .sb_pinned = 1,
    };
    struct bdi_work work;
 
@@ -254,23 +245,21 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
  * @bdi: the backing device to write from
  * @sb: write inodes from this super_block
  * @nr_pages: the number of pages to write
- * @sb_locked: caller already holds sb umount sem.
  *
  * Description:
  * This does WB_SYNC_NONE opportunistic writeback. The IO is only
  * started when this function returns, we make no guarentees on
- * completion. Caller specifies whether sb umount sem is held already or not.
+ * completion. Caller need not hold sb s_umount semaphore.
  *
  */
 void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
-             long nr_pages, int sb_locked)
+             long nr_pages)
 {
    struct wb_writeback_args args = {
        .sb = sb,
        .sync_mode = WB_SYNC_NONE,
        .nr_pages = nr_pages,
        .range_cyclic = 1,
-        .sb_pinned = sb_locked,
    };
 
    /*
@@ -282,7 +271,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
        args.for_background = 1;
    }
 
-    bdi_alloc_queue_work(bdi, &args, sb_locked);
+    bdi_alloc_queue_work(bdi, &args);
 }
 
 /*
@@ -595,7 +584,7 @@ static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
    /*
     * Caller must already hold the ref for this
     */
-    if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) {
+    if (wbc->sync_mode == WB_SYNC_ALL) {
        WARN_ON(!rwsem_is_locked(&sb->s_umount));
        return SB_NOT_PINNED;
    }
@@ -769,7 +758,6 @@ static long wb_writeback(struct bdi_writeback *wb,
        .for_kupdate = args->for_kupdate,
        .for_background = args->for_background,
        .range_cyclic = args->range_cyclic,
-        .sb_pinned = args->sb_pinned,
    };
    unsigned long oldest_jif;
    long wrote = 0;
@@ -912,7 +900,6 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 
    while ((work = get_next_work_item(bdi, wb)) != NULL) {
        struct wb_writeback_args args = work->args;
-        int post_clear;
 
        /*
         * Override sync mode, in case we must wait for completion
@@ -920,13 +907,11 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
        if (force_wait)
            work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
 
-        post_clear = WB_SYNC_ALL || args.sb_pinned;
-
        /*
         * If this isn't a data integrity operation, just notify
         * that we have seen this work and we are now starting it.
         */
-        if (!post_clear)
+        if (args.sync_mode == WB_SYNC_NONE)
            wb_clear_pending(wb, work);
 
        wrote += wb_writeback(wb, &args);
@@ -935,7 +920,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
         * This is a data integrity writeback, so only do the
         * notification when we have completed the work.
         */
-        if (post_clear)
+        if (args.sync_mode == WB_SYNC_ALL)
            wb_clear_pending(wb, work);
    }
 
@@ -1011,7 +996,7 @@ static void bdi_writeback_all(struct super_block *sb, long nr_pages)
        if (!bdi_has_dirty_io(bdi))
            continue;
 
-        bdi_alloc_queue_work(bdi, &args, 0);
+        bdi_alloc_queue_work(bdi, &args);
    }
 
    rcu_read_unlock();
@@ -1220,18 +1205,6 @@ static void wait_sb_inodes(struct super_block *sb)
        iput(old_inode);
 }
 
-static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
-{
-    unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-    unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
-    long nr_to_write;
-
-    nr_to_write = nr_dirty + nr_unstable +
-            (inodes_stat.nr_inodes - inodes_stat.nr_unused);
-
-    bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked);
-}
-
 /**
  * writeback_inodes_sb - writeback dirty inodes from given super_block
  * @sb: the superblock
@@ -1243,21 +1216,16 @@ static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
  */
 void writeback_inodes_sb(struct super_block *sb)
 {
-    __writeback_inodes_sb(sb, 0);
-}
-EXPORT_SYMBOL(writeback_inodes_sb);
+    unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+    unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+    long nr_to_write;
 
-/**
- * writeback_inodes_sb_locked - writeback dirty inodes from given super_block
- * @sb: the superblock
- *
- * Like writeback_inodes_sb(), except the caller already holds the
- * sb umount sem.
- */
-void writeback_inodes_sb_locked(struct super_block *sb)
-{
-    __writeback_inodes_sb(sb, 1);
+    nr_to_write = nr_dirty + nr_unstable +
+            (inodes_stat.nr_inodes - inodes_stat.nr_unused);
+
+    bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
 }
+EXPORT_SYMBOL(writeback_inodes_sb);
 
 /**
  * writeback_inodes_sb_if_idle - start writeback if none underway
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 47aefd376e54..723b889fd219 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -710,30 +710,26 @@ static void fscache_write_op(struct fscache_operation *_op)
        goto superseded;
    }
 
-    if (page) {
-        radix_tree_tag_set(&cookie->stores, page->index,
-                   FSCACHE_COOKIE_STORING_TAG);
-        radix_tree_tag_clear(&cookie->stores, page->index,
-                     FSCACHE_COOKIE_PENDING_TAG);
-    }
+    radix_tree_tag_set(&cookie->stores, page->index,
+               FSCACHE_COOKIE_STORING_TAG);
+    radix_tree_tag_clear(&cookie->stores, page->index,
+                 FSCACHE_COOKIE_PENDING_TAG);
 
    spin_unlock(&cookie->stores_lock);
    spin_unlock(&object->lock);
 
-    if (page) {
-        fscache_set_op_state(&op->op, "Store");
-        fscache_stat(&fscache_n_store_pages);
-        fscache_stat(&fscache_n_cop_write_page);
-        ret = object->cache->ops->write_page(op, page);
-        fscache_stat_d(&fscache_n_cop_write_page);
-        fscache_set_op_state(&op->op, "EndWrite");
-        fscache_end_page_write(object, page);
-        if (ret < 0) {
-            fscache_set_op_state(&op->op, "Abort");
-            fscache_abort_object(object);
-        } else {
-            fscache_enqueue_operation(&op->op);
-        }
+    fscache_set_op_state(&op->op, "Store");
+    fscache_stat(&fscache_n_store_pages);
+    fscache_stat(&fscache_n_cop_write_page);
+    ret = object->cache->ops->write_page(op, page);
+    fscache_stat_d(&fscache_n_cop_write_page);
+    fscache_set_op_state(&op->op, "EndWrite");
+    fscache_end_page_write(object, page);
+    if (ret < 0) {
+        fscache_set_op_state(&op->op, "Abort");
+        fscache_abort_object(object);
+    } else {
+        fscache_enqueue_operation(&op->op);
    }
 
    _leave("");
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index a33aab6b5e68..54a92fd02bbd 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -234,8 +234,9 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
234 if (inode->i_mode != mode) { 234 if (inode->i_mode != mode) {
235 struct iattr attr; 235 struct iattr attr;
236 236
237 attr.ia_valid = ATTR_MODE; 237 attr.ia_valid = ATTR_MODE | ATTR_CTIME;
238 attr.ia_mode = mode; 238 attr.ia_mode = mode;
239 attr.ia_ctime = CURRENT_TIME_SEC;
239 rc = jffs2_do_setattr(inode, &attr); 240 rc = jffs2_do_setattr(inode, &attr);
240 if (rc < 0) 241 if (rc < 0)
241 return rc; 242 return rc;
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 7aa4417e085f..166062a68230 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -222,15 +222,18 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode,
222 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime)); 222 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime));
223 223
224 jffs2_free_raw_inode(ri); 224 jffs2_free_raw_inode(ri);
225 d_instantiate(dentry, inode);
226 225
227 D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n", 226 D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n",
228 inode->i_ino, inode->i_mode, inode->i_nlink, 227 inode->i_ino, inode->i_mode, inode->i_nlink,
229 f->inocache->pino_nlink, inode->i_mapping->nrpages)); 228 f->inocache->pino_nlink, inode->i_mapping->nrpages));
229
230 d_instantiate(dentry, inode);
231 unlock_new_inode(inode);
230 return 0; 232 return 0;
231 233
232 fail: 234 fail:
233 make_bad_inode(inode); 235 make_bad_inode(inode);
236 unlock_new_inode(inode);
234 iput(inode); 237 iput(inode);
235 jffs2_free_raw_inode(ri); 238 jffs2_free_raw_inode(ri);
236 return ret; 239 return ret;
@@ -360,8 +363,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
360 /* Eeek. Wave bye bye */ 363 /* Eeek. Wave bye bye */
361 mutex_unlock(&f->sem); 364 mutex_unlock(&f->sem);
362 jffs2_complete_reservation(c); 365 jffs2_complete_reservation(c);
363 jffs2_clear_inode(inode); 366 ret = PTR_ERR(fn);
364 return PTR_ERR(fn); 367 goto fail;
365 } 368 }
366 369
367 /* We use f->target field to store the target path. */ 370 /* We use f->target field to store the target path. */
@@ -370,8 +373,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
370 printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1); 373 printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1);
371 mutex_unlock(&f->sem); 374 mutex_unlock(&f->sem);
372 jffs2_complete_reservation(c); 375 jffs2_complete_reservation(c);
373 jffs2_clear_inode(inode); 376 ret = -ENOMEM;
374 return -ENOMEM; 377 goto fail;
375 } 378 }
376 379
377 memcpy(f->target, target, targetlen + 1); 380 memcpy(f->target, target, targetlen + 1);
@@ -386,30 +389,24 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
386 jffs2_complete_reservation(c); 389 jffs2_complete_reservation(c);
387 390
388 ret = jffs2_init_security(inode, dir_i); 391 ret = jffs2_init_security(inode, dir_i);
389 if (ret) { 392 if (ret)
390 jffs2_clear_inode(inode); 393 goto fail;
391 return ret; 394
392 }
393 ret = jffs2_init_acl_post(inode); 395 ret = jffs2_init_acl_post(inode);
394 if (ret) { 396 if (ret)
395 jffs2_clear_inode(inode); 397 goto fail;
396 return ret;
397 }
398 398
399 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, 399 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
400 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); 400 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
401 if (ret) { 401 if (ret)
402 /* Eep. */ 402 goto fail;
403 jffs2_clear_inode(inode);
404 return ret;
405 }
406 403
407 rd = jffs2_alloc_raw_dirent(); 404 rd = jffs2_alloc_raw_dirent();
408 if (!rd) { 405 if (!rd) {
409 /* Argh. Now we treat it like a normal delete */ 406 /* Argh. Now we treat it like a normal delete */
410 jffs2_complete_reservation(c); 407 jffs2_complete_reservation(c);
411 jffs2_clear_inode(inode); 408 ret = -ENOMEM;
412 return -ENOMEM; 409 goto fail;
413 } 410 }
414 411
415 dir_f = JFFS2_INODE_INFO(dir_i); 412 dir_f = JFFS2_INODE_INFO(dir_i);
@@ -437,8 +434,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
437 jffs2_complete_reservation(c); 434 jffs2_complete_reservation(c);
438 jffs2_free_raw_dirent(rd); 435 jffs2_free_raw_dirent(rd);
439 mutex_unlock(&dir_f->sem); 436 mutex_unlock(&dir_f->sem);
440 jffs2_clear_inode(inode); 437 ret = PTR_ERR(fd);
441 return PTR_ERR(fd); 438 goto fail;
442 } 439 }
443 440
444 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); 441 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -453,7 +450,14 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
453 jffs2_complete_reservation(c); 450 jffs2_complete_reservation(c);
454 451
455 d_instantiate(dentry, inode); 452 d_instantiate(dentry, inode);
453 unlock_new_inode(inode);
456 return 0; 454 return 0;
455
456 fail:
457 make_bad_inode(inode);
458 unlock_new_inode(inode);
459 iput(inode);
460 return ret;
457} 461}
458 462
459 463
@@ -519,8 +523,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
519 /* Eeek. Wave bye bye */ 523 /* Eeek. Wave bye bye */
520 mutex_unlock(&f->sem); 524 mutex_unlock(&f->sem);
521 jffs2_complete_reservation(c); 525 jffs2_complete_reservation(c);
522 jffs2_clear_inode(inode); 526 ret = PTR_ERR(fn);
523 return PTR_ERR(fn); 527 goto fail;
524 } 528 }
525 /* No data here. Only a metadata node, which will be 529 /* No data here. Only a metadata node, which will be
526 obsoleted by the first data write 530 obsoleted by the first data write
@@ -531,30 +535,24 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
531 jffs2_complete_reservation(c); 535 jffs2_complete_reservation(c);
532 536
533 ret = jffs2_init_security(inode, dir_i); 537 ret = jffs2_init_security(inode, dir_i);
534 if (ret) { 538 if (ret)
535 jffs2_clear_inode(inode); 539 goto fail;
536 return ret; 540
537 }
538 ret = jffs2_init_acl_post(inode); 541 ret = jffs2_init_acl_post(inode);
539 if (ret) { 542 if (ret)
540 jffs2_clear_inode(inode); 543 goto fail;
541 return ret;
542 }
543 544
544 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, 545 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
545 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); 546 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
546 if (ret) { 547 if (ret)
547 /* Eep. */ 548 goto fail;
548 jffs2_clear_inode(inode);
549 return ret;
550 }
551 549
552 rd = jffs2_alloc_raw_dirent(); 550 rd = jffs2_alloc_raw_dirent();
553 if (!rd) { 551 if (!rd) {
554 /* Argh. Now we treat it like a normal delete */ 552 /* Argh. Now we treat it like a normal delete */
555 jffs2_complete_reservation(c); 553 jffs2_complete_reservation(c);
556 jffs2_clear_inode(inode); 554 ret = -ENOMEM;
557 return -ENOMEM; 555 goto fail;
558 } 556 }
559 557
560 dir_f = JFFS2_INODE_INFO(dir_i); 558 dir_f = JFFS2_INODE_INFO(dir_i);
@@ -582,8 +580,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
582 jffs2_complete_reservation(c); 580 jffs2_complete_reservation(c);
583 jffs2_free_raw_dirent(rd); 581 jffs2_free_raw_dirent(rd);
584 mutex_unlock(&dir_f->sem); 582 mutex_unlock(&dir_f->sem);
585 jffs2_clear_inode(inode); 583 ret = PTR_ERR(fd);
586 return PTR_ERR(fd); 584 goto fail;
587 } 585 }
588 586
589 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); 587 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -599,7 +597,14 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
599 jffs2_complete_reservation(c); 597 jffs2_complete_reservation(c);
600 598
601 d_instantiate(dentry, inode); 599 d_instantiate(dentry, inode);
600 unlock_new_inode(inode);
602 return 0; 601 return 0;
602
603 fail:
604 make_bad_inode(inode);
605 unlock_new_inode(inode);
606 iput(inode);
607 return ret;
603} 608}
604 609
605static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) 610static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
@@ -693,8 +698,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
693 /* Eeek. Wave bye bye */ 698 /* Eeek. Wave bye bye */
694 mutex_unlock(&f->sem); 699 mutex_unlock(&f->sem);
695 jffs2_complete_reservation(c); 700 jffs2_complete_reservation(c);
696 jffs2_clear_inode(inode); 701 ret = PTR_ERR(fn);
697 return PTR_ERR(fn); 702 goto fail;
698 } 703 }
699 /* No data here. Only a metadata node, which will be 704 /* No data here. Only a metadata node, which will be
700 obsoleted by the first data write 705 obsoleted by the first data write
@@ -705,30 +710,24 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
705 jffs2_complete_reservation(c); 710 jffs2_complete_reservation(c);
706 711
707 ret = jffs2_init_security(inode, dir_i); 712 ret = jffs2_init_security(inode, dir_i);
708 if (ret) { 713 if (ret)
709 jffs2_clear_inode(inode); 714 goto fail;
710 return ret; 715
711 }
712 ret = jffs2_init_acl_post(inode); 716 ret = jffs2_init_acl_post(inode);
713 if (ret) { 717 if (ret)
714 jffs2_clear_inode(inode); 718 goto fail;
715 return ret;
716 }
717 719
718 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, 720 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
719 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); 721 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
720 if (ret) { 722 if (ret)
721 /* Eep. */ 723 goto fail;
722 jffs2_clear_inode(inode);
723 return ret;
724 }
725 724
726 rd = jffs2_alloc_raw_dirent(); 725 rd = jffs2_alloc_raw_dirent();
727 if (!rd) { 726 if (!rd) {
728 /* Argh. Now we treat it like a normal delete */ 727 /* Argh. Now we treat it like a normal delete */
729 jffs2_complete_reservation(c); 728 jffs2_complete_reservation(c);
730 jffs2_clear_inode(inode); 729 ret = -ENOMEM;
731 return -ENOMEM; 730 goto fail;
732 } 731 }
733 732
734 dir_f = JFFS2_INODE_INFO(dir_i); 733 dir_f = JFFS2_INODE_INFO(dir_i);
@@ -759,8 +758,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
759 jffs2_complete_reservation(c); 758 jffs2_complete_reservation(c);
760 jffs2_free_raw_dirent(rd); 759 jffs2_free_raw_dirent(rd);
761 mutex_unlock(&dir_f->sem); 760 mutex_unlock(&dir_f->sem);
762 jffs2_clear_inode(inode); 761 ret = PTR_ERR(fd);
763 return PTR_ERR(fd); 762 goto fail;
764 } 763 }
765 764
766 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); 765 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -775,8 +774,14 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
775 jffs2_complete_reservation(c); 774 jffs2_complete_reservation(c);
776 775
777 d_instantiate(dentry, inode); 776 d_instantiate(dentry, inode);
778 777 unlock_new_inode(inode);
779 return 0; 778 return 0;
779
780 fail:
781 make_bad_inode(inode);
782 unlock_new_inode(inode);
783 iput(inode);
784 return ret;
780} 785}
781 786
782static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, 787static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
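
The recurring change in this file is worth spelling out: every error site in jffs2_create/symlink/mkdir/mknod used to call jffs2_clear_inode() and return directly, and each now branches to a single fail label that runs make_bad_inode(), unlock_new_inode() and iput() exactly once. A minimal user-space sketch of that consolidated unwind pattern (do_create, step_one and step_two are hypothetical stand-ins, not jffs2 functions):

#include <stdio.h>
#include <stdlib.h>

static int step_one(void) { return 0; }
static int step_two(void) { return -1; }	/* simulated failure */

static int do_create(void)
{
	char *buf = malloc(16);
	int ret;

	if (!buf)
		return -1;	/* nothing acquired yet, plain return */

	ret = step_one();
	if (ret)
		goto fail;

	ret = step_two();
	if (ret)
		goto fail;

	free(buf);	/* success path would hand the resource off instead */
	return 0;

fail:
	/* one teardown site, like make_bad_inode()/unlock_new_inode()/iput() */
	free(buf);
	return ret;
}

int main(void)
{
	printf("do_create: %d\n", do_create());
	return 0;
}
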
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 8bc2c80ab159..459d39d1ea0b 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -465,7 +465,12 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i
465 inode->i_blocks = 0; 465 inode->i_blocks = 0;
466 inode->i_size = 0; 466 inode->i_size = 0;
467 467
468 insert_inode_hash(inode); 468 if (insert_inode_locked(inode) < 0) {
469 make_bad_inode(inode);
470 unlock_new_inode(inode);
471 iput(inode);
472 return ERR_PTR(-EINVAL);
473 }
469 474
470 return inode; 475 return inode;
471} 476}
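
The switch from insert_inode_hash() to insert_inode_locked() is what makes the fail labels above safe: the inode is hashed with I_NEW set, so concurrent lookups sleep until the creator calls unlock_new_inode() on every exit path. A condensed kernel-context sketch of the protocol (not buildable stand-alone; field setup and the jffs2-specific parts are elided):

struct inode *inode = new_inode(sb);

if (!inode)
	return ERR_PTR(-ENOMEM);
inode->i_ino = ino;			/* must be set before hashing */

if (insert_inode_locked(inode) < 0) {	/* ino already hashed elsewhere */
	make_bad_inode(inode);
	unlock_new_inode(inode);
	iput(inode);
	return ERR_PTR(-EINVAL);
}

/* ... initialise the rest; every caller must finish with either
 * d_instantiate() + unlock_new_inode() or the fail path above ... */
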
diff --git a/fs/libfs.c b/fs/libfs.c
index 09e1016eb774..dcaf972cbf1b 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -489,7 +489,8 @@ int simple_write_end(struct file *file, struct address_space *mapping,
489 * unique inode values later for this filesystem, then you must take care 489 * unique inode values later for this filesystem, then you must take care
490 * to pass it an appropriate max_reserved value to avoid collisions. 490 * to pass it an appropriate max_reserved value to avoid collisions.
491 */ 491 */
492int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files) 492int simple_fill_super(struct super_block *s, unsigned long magic,
493 struct tree_descr *files)
493{ 494{
494 struct inode *inode; 495 struct inode *inode;
495 struct dentry *root; 496 struct dentry *root;
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 91969589131c..1dbf921ca44b 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -75,10 +75,6 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
75 if (!IS_ERR(page)) 75 if (!IS_ERR(page))
76 kmap(page); 76 kmap(page);
77 return page; 77 return page;
78
79fail:
80 dir_put_page(page);
81 return ERR_PTR(-EIO);
82} 78}
83 79
84static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi) 80static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 12f7109720c2..4a2734758778 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4122,8 +4122,8 @@ nfs4_state_shutdown(void)
4122 nfs4_lock_state(); 4122 nfs4_lock_state();
4123 nfs4_release_reclaim(); 4123 nfs4_release_reclaim();
4124 __nfs4_state_shutdown(); 4124 __nfs4_state_shutdown();
4125 nfsd4_destroy_callback_queue();
4126 nfs4_unlock_state(); 4125 nfs4_unlock_state();
4126 nfsd4_destroy_callback_queue();
4127} 4127}
4128 4128
4129/* 4129/*
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index ebbf3b6b2457..3c111120b619 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -443,8 +443,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
443 if (size_change) 443 if (size_change)
444 put_write_access(inode); 444 put_write_access(inode);
445 if (!err) 445 if (!err)
446 if (EX_ISSYNC(fhp->fh_export)) 446 commit_metadata(fhp);
447 write_inode_now(inode, 1);
448out: 447out:
449 return err; 448 return err;
450 449
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index af638d59e3bf..43c8c5b541fd 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -75,8 +75,6 @@ struct nilfs_btree_path {
75 75
76extern struct kmem_cache *nilfs_btree_path_cache; 76extern struct kmem_cache *nilfs_btree_path_cache;
77 77
78int nilfs_btree_path_cache_init(void);
79void nilfs_btree_path_cache_destroy(void);
80int nilfs_btree_init(struct nilfs_bmap *); 78int nilfs_btree_init(struct nilfs_bmap *);
81int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64, 79int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64,
82 const __u64 *, const __u64 *, int); 80 const __u64 *, const __u64 *, int);
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index fdf1c3b6d673..85fbb66455e2 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -127,8 +127,6 @@ struct nilfs_segment_buffer {
127 127
128extern struct kmem_cache *nilfs_segbuf_cachep; 128extern struct kmem_cache *nilfs_segbuf_cachep;
129 129
130int __init nilfs_init_segbuf_cache(void);
131void nilfs_destroy_segbuf_cache(void);
132struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *); 130struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *);
133void nilfs_segbuf_free(struct nilfs_segment_buffer *); 131void nilfs_segbuf_free(struct nilfs_segment_buffer *);
134void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long, 132void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long,
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index dca142361ccf..01e20dbb217d 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -221,8 +221,6 @@ enum {
221extern struct kmem_cache *nilfs_transaction_cachep; 221extern struct kmem_cache *nilfs_transaction_cachep;
222 222
223/* segment.c */ 223/* segment.c */
224extern int nilfs_init_transaction_cache(void);
225extern void nilfs_destroy_transaction_cache(void);
226extern void nilfs_relax_pressure_in_lock(struct super_block *); 224extern void nilfs_relax_pressure_in_lock(struct super_block *);
227 225
228extern int nilfs_construct_segment(struct super_block *); 226extern int nilfs_construct_segment(struct super_block *);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 03b34b738993..414ef68931cf 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1130,13 +1130,13 @@ static void nilfs_segbuf_init_once(void *obj)
1130 1130
1131static void nilfs_destroy_cachep(void) 1131static void nilfs_destroy_cachep(void)
1132{ 1132{
1133 if (nilfs_inode_cachep) 1133 if (nilfs_inode_cachep)
1134 kmem_cache_destroy(nilfs_inode_cachep); 1134 kmem_cache_destroy(nilfs_inode_cachep);
1135 if (nilfs_transaction_cachep) 1135 if (nilfs_transaction_cachep)
1136 kmem_cache_destroy(nilfs_transaction_cachep); 1136 kmem_cache_destroy(nilfs_transaction_cachep);
1137 if (nilfs_segbuf_cachep) 1137 if (nilfs_segbuf_cachep)
1138 kmem_cache_destroy(nilfs_segbuf_cachep); 1138 kmem_cache_destroy(nilfs_segbuf_cachep);
1139 if (nilfs_btree_path_cache) 1139 if (nilfs_btree_path_cache)
1140 kmem_cache_destroy(nilfs_btree_path_cache); 1140 kmem_cache_destroy(nilfs_btree_path_cache);
1141} 1141}
1142 1142
diff --git a/fs/pipe.c b/fs/pipe.c
index db6eaaba0dd8..279eef96c51c 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -26,9 +26,14 @@
26 26
27/* 27/*
28 * The max size that a non-root user is allowed to grow the pipe. Can 28 * The max size that a non-root user is allowed to grow the pipe. Can
29 * be set by root in /proc/sys/fs/pipe-max-pages 29 * be set by root in /proc/sys/fs/pipe-max-size
30 */ 30 */
31unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16; 31unsigned int pipe_max_size = 1048576;
32
33/*
34 * Minimum pipe size, as required by POSIX
35 */
36unsigned int pipe_min_size = PAGE_SIZE;
32 37
33/* 38/*
34 * We use a start+len construction, which provides full use of the 39 * We use a start+len construction, which provides full use of the
@@ -1118,26 +1123,20 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
1118 * Allocate a new array of pipe buffers and copy the info over. Returns the 1123 * Allocate a new array of pipe buffers and copy the info over. Returns the
1119 * pipe size if successful, or -ERROR on failure. 1124 * pipe size if successful, or -ERROR on failure.
1120 */ 1125 */
1121static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) 1126static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
1122{ 1127{
1123 struct pipe_buffer *bufs; 1128 struct pipe_buffer *bufs;
1124 1129
1125 /* 1130 /*
1126 * Must be a power-of-2 currently
1127 */
1128 if (!is_power_of_2(arg))
1129 return -EINVAL;
1130
1131 /*
1132 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't 1131 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
1133 * expect a lot of shrink+grow operations, just free and allocate 1132 * expect a lot of shrink+grow operations, just free and allocate
1134 * again like we would do for growing. If the pipe currently 1133 * again like we would do for growing. If the pipe currently
1135 * contains more buffers than arg, then return busy. 1134 * contains more buffers than arg, then return busy.
1136 */ 1135 */
1137 if (arg < pipe->nrbufs) 1136 if (nr_pages < pipe->nrbufs)
1138 return -EBUSY; 1137 return -EBUSY;
1139 1138
1140 bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL); 1139 bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL);
1141 if (unlikely(!bufs)) 1140 if (unlikely(!bufs))
1142 return -ENOMEM; 1141 return -ENOMEM;
1143 1142
@@ -1146,20 +1145,56 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
1146 * and adjust the indexes. 1145 * and adjust the indexes.
1147 */ 1146 */
1148 if (pipe->nrbufs) { 1147 if (pipe->nrbufs) {
1149 const unsigned int tail = pipe->nrbufs & (pipe->buffers - 1); 1148 unsigned int tail;
1150 const unsigned int head = pipe->nrbufs - tail; 1149 unsigned int head;
1150
1151 tail = pipe->curbuf + pipe->nrbufs;
1152 if (tail < pipe->buffers)
1153 tail = 0;
1154 else
1155 tail &= (pipe->buffers - 1);
1151 1156
1157 head = pipe->nrbufs - tail;
1152 if (head) 1158 if (head)
1153 memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer)); 1159 memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
1154 if (tail) 1160 if (tail)
1155 memcpy(bufs + head, pipe->bufs + pipe->curbuf, tail * sizeof(struct pipe_buffer)); 1161 memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
1156 } 1162 }
1157 1163
1158 pipe->curbuf = 0; 1164 pipe->curbuf = 0;
1159 kfree(pipe->bufs); 1165 kfree(pipe->bufs);
1160 pipe->bufs = bufs; 1166 pipe->bufs = bufs;
1161 pipe->buffers = arg; 1167 pipe->buffers = nr_pages;
1162 return arg; 1168 return nr_pages * PAGE_SIZE;
1169}
1170
1171/*
1172 * Currently we rely on the pipe array holding a power-of-2 number
1173 * of pages.
1174 */
1175static inline unsigned int round_pipe_size(unsigned int size)
1176{
1177 unsigned long nr_pages;
1178
1179 nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
1180 return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
1181}
1182
1183/*
1184 * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
1185 * will return an error.
1186 */
1187int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
1188 size_t *lenp, loff_t *ppos)
1189{
1190 int ret;
1191
1192 ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
1193 if (ret < 0 || !write)
1194 return ret;
1195
1196 pipe_max_size = round_pipe_size(pipe_max_size);
1197 return ret;
1163} 1198}
1164 1199
1165long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) 1200long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -1174,23 +1209,25 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
1174 mutex_lock(&pipe->inode->i_mutex); 1209 mutex_lock(&pipe->inode->i_mutex);
1175 1210
1176 switch (cmd) { 1211 switch (cmd) {
1177 case F_SETPIPE_SZ: 1212 case F_SETPIPE_SZ: {
1178 if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages) { 1213 unsigned int size, nr_pages;
1179 ret = -EINVAL; 1214
1215 size = round_pipe_size(arg);
1216 nr_pages = size >> PAGE_SHIFT;
1217
1218 ret = -EINVAL;
1219 if (!nr_pages)
1180 goto out; 1220 goto out;
1181 } 1221
1182 /* 1222 if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
1183 * The pipe needs to be at least 2 pages large to 1223 ret = -EPERM;
1184 * guarantee POSIX behaviour.
1185 */
1186 if (arg < 2) {
1187 ret = -EINVAL;
1188 goto out; 1224 goto out;
1189 } 1225 }
1190 ret = pipe_set_size(pipe, arg); 1226 ret = pipe_set_size(pipe, nr_pages);
1191 break; 1227 break;
1228 }
1192 case F_GETPIPE_SZ: 1229 case F_GETPIPE_SZ:
1193 ret = pipe->buffers; 1230 ret = pipe->buffers * PAGE_SIZE;
1194 break; 1231 break;
1195 default: 1232 default:
1196 ret = -EINVAL; 1233 ret = -EINVAL;
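
After this change F_SETPIPE_SZ and F_GETPIPE_SZ speak bytes rather than buffer counts, requests are rounded up to a power-of-two number of pages, and growing past pipe_max_size needs CAP_SYS_RESOURCE instead of CAP_SYS_ADMIN. A small runnable demonstration (glibc needs _GNU_SOURCE for the F_*PIPE_SZ constants; the 131072 figure assumes 4 KiB pages):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fds[2];
	long sz;

	if (pipe(fds) < 0) {
		perror("pipe");
		return 1;
	}

	/* ask for 100000 bytes: 25 pages, rounded up to 32 pages */
	sz = fcntl(fds[0], F_SETPIPE_SZ, 100000);
	if (sz < 0)
		perror("F_SETPIPE_SZ");
	else
		printf("granted %ld bytes\n", sz);	/* expect 131072 */

	sz = fcntl(fds[0], F_GETPIPE_SZ);
	printf("current size %ld bytes\n", sz);
	return 0;
}
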
diff --git a/fs/splice.c b/fs/splice.c
index ac22b00d86c3..740e6b9faf7a 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -354,7 +354,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
354 break; 354 break;
355 355
356 error = add_to_page_cache_lru(page, mapping, index, 356 error = add_to_page_cache_lru(page, mapping, index,
357 mapping_gfp_mask(mapping)); 357 GFP_KERNEL);
358 if (unlikely(error)) { 358 if (unlikely(error)) {
359 page_cache_release(page); 359 page_cache_release(page);
360 if (error == -EEXIST) 360 if (error == -EEXIST)
diff --git a/fs/sync.c b/fs/sync.c
index c9f83f480ec5..15aa6f03b2da 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -42,7 +42,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
42 if (wait) 42 if (wait)
43 sync_inodes_sb(sb); 43 sync_inodes_sb(sb);
44 else 44 else
45 writeback_inodes_sb_locked(sb); 45 writeback_inodes_sb(sb);
46 46
47 if (sb->s_op->sync_fs) 47 if (sb->s_op->sync_fs)
48 sb->s_op->sync_fs(sb, wait); 48 sb->s_op->sync_fs(sb, wait);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index bde1a4c3679a..0835a3b70e03 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -117,11 +117,13 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
117 if (error) 117 if (error)
118 goto out; 118 goto out;
119 119
120 error = sysfs_sd_setattr(sd, iattr);
121 if (error)
122 goto out;
123
120 /* this ignores size changes */ 124 /* this ignores size changes */
121 generic_setattr(inode, iattr); 125 generic_setattr(inode, iattr);
122 126
123 error = sysfs_sd_setattr(sd, iattr);
124
125out: 127out:
126 mutex_unlock(&sysfs_mutex); 128 mutex_unlock(&sysfs_mutex);
127 return error; 129 return error;
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 089eaca860b4..34640d6dbdcb 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1333,6 +1333,21 @@ xfs_vm_writepage(
1333 trace_xfs_writepage(inode, page, 0); 1333 trace_xfs_writepage(inode, page, 0);
1334 1334
1335 /* 1335 /*
1336 * Refuse to write the page out if we are called from reclaim context.
1337 *
1338 * This is primarily to avoid stack overflows when called from deeply
1339 * nested stacks by random callers doing direct reclaim, but disabling
1340 * reclaim for kswapd is a nice side-effect, as kswapd causes rather
1341 * suboptimal I/O patterns, too.
1342 *
1343 * This should really be done by the core VM, but until that happens
1344 * filesystems like XFS, btrfs and ext4 have to take care of this
1345 * by themselves.
1346 */
1347 if (current->flags & PF_MEMALLOC)
1348 goto out_fail;
1349
1350 /*
1336 * We need a transaction if: 1351 * We need a transaction if:
1337 * 1. There are delalloc buffers on the page 1352 * 1. There are delalloc buffers on the page
1338 * 2. The page is uptodate and we have unmapped buffers 1353 * 2. The page is uptodate and we have unmapped buffers
@@ -1366,14 +1381,6 @@ xfs_vm_writepage(
1366 if (!page_has_buffers(page)) 1381 if (!page_has_buffers(page))
1367 create_empty_buffers(page, 1 << inode->i_blkbits, 0); 1382 create_empty_buffers(page, 1 << inode->i_blkbits, 0);
1368 1383
1369
1370 /*
1371 * VM calculation for nr_to_write seems off. Bump it way
1372 * up, this gets simple streaming writes zippy again.
1373 * To be reviewed again after Jens' writeback changes.
1374 */
1375 wbc->nr_to_write *= 4;
1376
1377 /* 1384 /*
1378 * Convert delayed allocate, unwritten or unmapped space 1385 * Convert delayed allocate, unwritten or unmapped space
1379 * to real space and flush out to disk. 1386 * to real space and flush out to disk.
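
The PF_MEMALLOC test added above is a pattern several filesystems use: if the caller is in direct reclaim, refuse the write, redirty the page, and let regular writeback pick it up later. A kernel-context sketch of the guard (not buildable stand-alone; example_writepage is a hypothetical ->writepage, matching the shape of the out_fail path here):

static int example_writepage(struct page *page,
			     struct writeback_control *wbc)
{
	/* called from direct reclaim? don't risk the deep stack */
	if (current->flags & PF_MEMALLOC)
		goto redirty;

	/* ... map blocks and submit the I/O as usual ... */
	return 0;

redirty:
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return 0;
}
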
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 9c8019c78c92..44f0b2de153e 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -585,11 +585,20 @@ xfs_vn_fallocate(
585 bf.l_len = len; 585 bf.l_len = len;
586 586
587 xfs_ilock(ip, XFS_IOLOCK_EXCL); 587 xfs_ilock(ip, XFS_IOLOCK_EXCL);
588
589 /* check the new inode size is valid before allocating */
590 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
591 offset + len > i_size_read(inode)) {
592 new_size = offset + len;
593 error = inode_newsize_ok(inode, new_size);
594 if (error)
595 goto out_unlock;
596 }
597
588 error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, 598 error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
589 0, XFS_ATTR_NOLOCK); 599 0, XFS_ATTR_NOLOCK);
590 if (!error && !(mode & FALLOC_FL_KEEP_SIZE) && 600 if (error)
591 offset + len > i_size_read(inode)) 601 goto out_unlock;
592 new_size = offset + len;
593 602
594 /* Change file size if needed */ 603 /* Change file size if needed */
595 if (new_size) { 604 if (new_size) {
@@ -600,6 +609,7 @@ xfs_vn_fallocate(
600 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); 609 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
601 } 610 }
602 611
612out_unlock:
603 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 613 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
604out_error: 614out_error:
605 return error; 615 return error;
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 9ac8aea91529..067cafbfc635 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -23,7 +23,6 @@
23#include "xfs_ag.h" 23#include "xfs_ag.h"
24#include "xfs_mount.h" 24#include "xfs_mount.h"
25#include "xfs_quota.h" 25#include "xfs_quota.h"
26#include "xfs_log.h"
27#include "xfs_trans.h" 26#include "xfs_trans.h"
28#include "xfs_bmap_btree.h" 27#include "xfs_bmap_btree.h"
29#include "xfs_inode.h" 28#include "xfs_inode.h"
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 3884e20bc14e..ef7f0218bccb 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -164,10 +164,6 @@ xfs_inode_ag_iterator(
164 struct xfs_perag *pag; 164 struct xfs_perag *pag;
165 165
166 pag = xfs_perag_get(mp, ag); 166 pag = xfs_perag_get(mp, ag);
167 if (!pag->pag_ici_init) {
168 xfs_perag_put(pag);
169 continue;
170 }
171 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, 167 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
172 exclusive, &nr); 168 exclusive, &nr);
173 xfs_perag_put(pag); 169 xfs_perag_put(pag);
@@ -867,12 +863,7 @@ xfs_reclaim_inode_shrink(
867 down_read(&xfs_mount_list_lock); 863 down_read(&xfs_mount_list_lock);
868 list_for_each_entry(mp, &xfs_mount_list, m_mplist) { 864 list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
869 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { 865 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
870
871 pag = xfs_perag_get(mp, ag); 866 pag = xfs_perag_get(mp, ag);
872 if (!pag->pag_ici_init) {
873 xfs_perag_put(pag);
874 continue;
875 }
876 reclaimable += pag->pag_ici_reclaimable; 867 reclaimable += pag->pag_ici_reclaimable;
877 xfs_perag_put(pag); 868 xfs_perag_put(pag);
878 } 869 }
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
index 207fa77f63ae..d12be8470cba 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -50,7 +50,6 @@
50#include "quota/xfs_dquot_item.h" 50#include "quota/xfs_dquot_item.h"
51#include "quota/xfs_dquot.h" 51#include "quota/xfs_dquot.h"
52#include "xfs_log_recover.h" 52#include "xfs_log_recover.h"
53#include "xfs_buf_item.h"
54#include "xfs_inode_item.h" 53#include "xfs_inode_item.h"
55 54
56/* 55/*
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index ff6bc797baf2..73d5aa117384 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -82,33 +82,6 @@ DECLARE_EVENT_CLASS(xfs_attr_list_class,
82 ) 82 )
83) 83)
84 84
85#define DEFINE_PERAG_REF_EVENT(name) \
86TRACE_EVENT(name, \
87 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
88 unsigned long caller_ip), \
89 TP_ARGS(mp, agno, refcount, caller_ip), \
90 TP_STRUCT__entry( \
91 __field(dev_t, dev) \
92 __field(xfs_agnumber_t, agno) \
93 __field(int, refcount) \
94 __field(unsigned long, caller_ip) \
95 ), \
96 TP_fast_assign( \
97 __entry->dev = mp->m_super->s_dev; \
98 __entry->agno = agno; \
99 __entry->refcount = refcount; \
100 __entry->caller_ip = caller_ip; \
101 ), \
102 TP_printk("dev %d:%d agno %u refcount %d caller %pf", \
103 MAJOR(__entry->dev), MINOR(__entry->dev), \
104 __entry->agno, \
105 __entry->refcount, \
106 (char *)__entry->caller_ip) \
107);
108
109DEFINE_PERAG_REF_EVENT(xfs_perag_get)
110DEFINE_PERAG_REF_EVENT(xfs_perag_put)
111
112#define DEFINE_ATTR_LIST_EVENT(name) \ 85#define DEFINE_ATTR_LIST_EVENT(name) \
113DEFINE_EVENT(xfs_attr_list_class, name, \ 86DEFINE_EVENT(xfs_attr_list_class, name, \
114 TP_PROTO(struct xfs_attr_list_context *ctx), \ 87 TP_PROTO(struct xfs_attr_list_context *ctx), \
@@ -122,6 +95,37 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
122DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk); 95DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
123DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); 96DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
124 97
98DECLARE_EVENT_CLASS(xfs_perag_class,
99 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
100 unsigned long caller_ip),
101 TP_ARGS(mp, agno, refcount, caller_ip),
102 TP_STRUCT__entry(
103 __field(dev_t, dev)
104 __field(xfs_agnumber_t, agno)
105 __field(int, refcount)
106 __field(unsigned long, caller_ip)
107 ),
108 TP_fast_assign(
109 __entry->dev = mp->m_super->s_dev;
110 __entry->agno = agno;
111 __entry->refcount = refcount;
112 __entry->caller_ip = caller_ip;
113 ),
114 TP_printk("dev %d:%d agno %u refcount %d caller %pf",
115 MAJOR(__entry->dev), MINOR(__entry->dev),
116 __entry->agno,
117 __entry->refcount,
118 (char *)__entry->caller_ip)
119);
120
121#define DEFINE_PERAG_REF_EVENT(name) \
122DEFINE_EVENT(xfs_perag_class, name, \
123 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
124 unsigned long caller_ip), \
125 TP_ARGS(mp, agno, refcount, caller_ip))
126DEFINE_PERAG_REF_EVENT(xfs_perag_get);
127DEFINE_PERAG_REF_EVENT(xfs_perag_put);
128
125TRACE_EVENT(xfs_attr_list_node_descend, 129TRACE_EVENT(xfs_attr_list_node_descend,
126 TP_PROTO(struct xfs_attr_list_context *ctx, 130 TP_PROTO(struct xfs_attr_list_context *ctx,
127 struct xfs_da_node_entry *btree), 131 struct xfs_da_node_entry *btree),
@@ -775,165 +779,181 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
775DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); 779DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
776DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); 780DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
777 781
778#define DEFINE_RW_EVENT(name) \ 782DECLARE_EVENT_CLASS(xfs_file_class,
779TRACE_EVENT(name, \ 783 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
780 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \ 784 TP_ARGS(ip, count, offset, flags),
781 TP_ARGS(ip, count, offset, flags), \ 785 TP_STRUCT__entry(
782 TP_STRUCT__entry( \ 786 __field(dev_t, dev)
783 __field(dev_t, dev) \ 787 __field(xfs_ino_t, ino)
784 __field(xfs_ino_t, ino) \ 788 __field(xfs_fsize_t, size)
785 __field(xfs_fsize_t, size) \ 789 __field(xfs_fsize_t, new_size)
786 __field(xfs_fsize_t, new_size) \ 790 __field(loff_t, offset)
787 __field(loff_t, offset) \ 791 __field(size_t, count)
788 __field(size_t, count) \ 792 __field(int, flags)
789 __field(int, flags) \ 793 ),
790 ), \ 794 TP_fast_assign(
791 TP_fast_assign( \ 795 __entry->dev = VFS_I(ip)->i_sb->s_dev;
792 __entry->dev = VFS_I(ip)->i_sb->s_dev; \ 796 __entry->ino = ip->i_ino;
793 __entry->ino = ip->i_ino; \ 797 __entry->size = ip->i_d.di_size;
794 __entry->size = ip->i_d.di_size; \ 798 __entry->new_size = ip->i_new_size;
795 __entry->new_size = ip->i_new_size; \ 799 __entry->offset = offset;
796 __entry->offset = offset; \ 800 __entry->count = count;
797 __entry->count = count; \ 801 __entry->flags = flags;
798 __entry->flags = flags; \ 802 ),
799 ), \ 803 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
800 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ 804 "offset 0x%llx count 0x%zx ioflags %s",
801 "offset 0x%llx count 0x%zx ioflags %s", \ 805 MAJOR(__entry->dev), MINOR(__entry->dev),
802 MAJOR(__entry->dev), MINOR(__entry->dev), \ 806 __entry->ino,
803 __entry->ino, \ 807 __entry->size,
804 __entry->size, \ 808 __entry->new_size,
805 __entry->new_size, \ 809 __entry->offset,
806 __entry->offset, \ 810 __entry->count,
807 __entry->count, \ 811 __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
808 __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) \
809) 812)
813
814#define DEFINE_RW_EVENT(name) \
815DEFINE_EVENT(xfs_file_class, name, \
816 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
817 TP_ARGS(ip, count, offset, flags))
810DEFINE_RW_EVENT(xfs_file_read); 818DEFINE_RW_EVENT(xfs_file_read);
811DEFINE_RW_EVENT(xfs_file_buffered_write); 819DEFINE_RW_EVENT(xfs_file_buffered_write);
812DEFINE_RW_EVENT(xfs_file_direct_write); 820DEFINE_RW_EVENT(xfs_file_direct_write);
813DEFINE_RW_EVENT(xfs_file_splice_read); 821DEFINE_RW_EVENT(xfs_file_splice_read);
814DEFINE_RW_EVENT(xfs_file_splice_write); 822DEFINE_RW_EVENT(xfs_file_splice_write);
815 823
816 824DECLARE_EVENT_CLASS(xfs_page_class,
817#define DEFINE_PAGE_EVENT(name) \ 825 TP_PROTO(struct inode *inode, struct page *page, unsigned long off),
818TRACE_EVENT(name, \ 826 TP_ARGS(inode, page, off),
819 TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \ 827 TP_STRUCT__entry(
820 TP_ARGS(inode, page, off), \ 828 __field(dev_t, dev)
821 TP_STRUCT__entry( \ 829 __field(xfs_ino_t, ino)
822 __field(dev_t, dev) \ 830 __field(pgoff_t, pgoff)
823 __field(xfs_ino_t, ino) \ 831 __field(loff_t, size)
824 __field(pgoff_t, pgoff) \ 832 __field(unsigned long, offset)
825 __field(loff_t, size) \ 833 __field(int, delalloc)
826 __field(unsigned long, offset) \ 834 __field(int, unmapped)
827 __field(int, delalloc) \ 835 __field(int, unwritten)
828 __field(int, unmapped) \ 836 ),
829 __field(int, unwritten) \ 837 TP_fast_assign(
830 ), \ 838 int delalloc = -1, unmapped = -1, unwritten = -1;
831 TP_fast_assign( \ 839
832 int delalloc = -1, unmapped = -1, unwritten = -1; \ 840 if (page_has_buffers(page))
833 \ 841 xfs_count_page_state(page, &delalloc,
834 if (page_has_buffers(page)) \ 842 &unmapped, &unwritten);
835 xfs_count_page_state(page, &delalloc, \ 843 __entry->dev = inode->i_sb->s_dev;
836 &unmapped, &unwritten); \ 844 __entry->ino = XFS_I(inode)->i_ino;
837 __entry->dev = inode->i_sb->s_dev; \ 845 __entry->pgoff = page_offset(page);
838 __entry->ino = XFS_I(inode)->i_ino; \ 846 __entry->size = i_size_read(inode);
839 __entry->pgoff = page_offset(page); \ 847 __entry->offset = off;
840 __entry->size = i_size_read(inode); \ 848 __entry->delalloc = delalloc;
841 __entry->offset = off; \ 849 __entry->unmapped = unmapped;
842 __entry->delalloc = delalloc; \ 850 __entry->unwritten = unwritten;
843 __entry->unmapped = unmapped; \ 851 ),
844 __entry->unwritten = unwritten; \ 852 TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
845 ), \ 853 "delalloc %d unmapped %d unwritten %d",
846 TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " \ 854 MAJOR(__entry->dev), MINOR(__entry->dev),
847 "delalloc %d unmapped %d unwritten %d", \ 855 __entry->ino,
848 MAJOR(__entry->dev), MINOR(__entry->dev), \ 856 __entry->pgoff,
849 __entry->ino, \ 857 __entry->size,
850 __entry->pgoff, \ 858 __entry->offset,
851 __entry->size, \ 859 __entry->delalloc,
852 __entry->offset, \ 860 __entry->unmapped,
853 __entry->delalloc, \ 861 __entry->unwritten)
854 __entry->unmapped, \
855 __entry->unwritten) \
856) 862)
863
864#define DEFINE_PAGE_EVENT(name) \
865DEFINE_EVENT(xfs_page_class, name, \
866 TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \
867 TP_ARGS(inode, page, off))
857DEFINE_PAGE_EVENT(xfs_writepage); 868DEFINE_PAGE_EVENT(xfs_writepage);
858DEFINE_PAGE_EVENT(xfs_releasepage); 869DEFINE_PAGE_EVENT(xfs_releasepage);
859DEFINE_PAGE_EVENT(xfs_invalidatepage); 870DEFINE_PAGE_EVENT(xfs_invalidatepage);
860 871
861#define DEFINE_IOMAP_EVENT(name) \ 872DECLARE_EVENT_CLASS(xfs_iomap_class,
862TRACE_EVENT(name, \ 873 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
863 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ 874 int flags, struct xfs_bmbt_irec *irec),
864 int flags, struct xfs_bmbt_irec *irec), \ 875 TP_ARGS(ip, offset, count, flags, irec),
865 TP_ARGS(ip, offset, count, flags, irec), \ 876 TP_STRUCT__entry(
866 TP_STRUCT__entry( \ 877 __field(dev_t, dev)
867 __field(dev_t, dev) \ 878 __field(xfs_ino_t, ino)
868 __field(xfs_ino_t, ino) \ 879 __field(loff_t, size)
869 __field(loff_t, size) \ 880 __field(loff_t, new_size)
870 __field(loff_t, new_size) \ 881 __field(loff_t, offset)
871 __field(loff_t, offset) \ 882 __field(size_t, count)
872 __field(size_t, count) \ 883 __field(int, flags)
873 __field(int, flags) \ 884 __field(xfs_fileoff_t, startoff)
874 __field(xfs_fileoff_t, startoff) \ 885 __field(xfs_fsblock_t, startblock)
875 __field(xfs_fsblock_t, startblock) \ 886 __field(xfs_filblks_t, blockcount)
876 __field(xfs_filblks_t, blockcount) \ 887 ),
877 ), \ 888 TP_fast_assign(
878 TP_fast_assign( \ 889 __entry->dev = VFS_I(ip)->i_sb->s_dev;
879 __entry->dev = VFS_I(ip)->i_sb->s_dev; \ 890 __entry->ino = ip->i_ino;
880 __entry->ino = ip->i_ino; \ 891 __entry->size = ip->i_d.di_size;
881 __entry->size = ip->i_d.di_size; \ 892 __entry->new_size = ip->i_new_size;
882 __entry->new_size = ip->i_new_size; \ 893 __entry->offset = offset;
883 __entry->offset = offset; \ 894 __entry->count = count;
884 __entry->count = count; \ 895 __entry->flags = flags;
885 __entry->flags = flags; \ 896 __entry->startoff = irec ? irec->br_startoff : 0;
886 __entry->startoff = irec ? irec->br_startoff : 0; \ 897 __entry->startblock = irec ? irec->br_startblock : 0;
887 __entry->startblock = irec ? irec->br_startblock : 0; \ 898 __entry->blockcount = irec ? irec->br_blockcount : 0;
888 __entry->blockcount = irec ? irec->br_blockcount : 0; \ 899 ),
889 ), \ 900 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
890 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ 901 "offset 0x%llx count %zd flags %s "
891 "offset 0x%llx count %zd flags %s " \ 902 "startoff 0x%llx startblock %lld blockcount 0x%llx",
892 "startoff 0x%llx startblock %lld blockcount 0x%llx", \ 903 MAJOR(__entry->dev), MINOR(__entry->dev),
893 MAJOR(__entry->dev), MINOR(__entry->dev), \ 904 __entry->ino,
894 __entry->ino, \ 905 __entry->size,
895 __entry->size, \ 906 __entry->new_size,
896 __entry->new_size, \ 907 __entry->offset,
897 __entry->offset, \ 908 __entry->count,
898 __entry->count, \ 909 __print_flags(__entry->flags, "|", BMAPI_FLAGS),
899 __print_flags(__entry->flags, "|", BMAPI_FLAGS), \ 910 __entry->startoff,
900 __entry->startoff, \ 911 (__int64_t)__entry->startblock,
901 (__int64_t)__entry->startblock, \ 912 __entry->blockcount)
902 __entry->blockcount) \
903) 913)
914
915#define DEFINE_IOMAP_EVENT(name) \
916DEFINE_EVENT(xfs_iomap_class, name, \
917 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
918 int flags, struct xfs_bmbt_irec *irec), \
919 TP_ARGS(ip, offset, count, flags, irec))
904DEFINE_IOMAP_EVENT(xfs_iomap_enter); 920DEFINE_IOMAP_EVENT(xfs_iomap_enter);
905DEFINE_IOMAP_EVENT(xfs_iomap_found); 921DEFINE_IOMAP_EVENT(xfs_iomap_found);
906DEFINE_IOMAP_EVENT(xfs_iomap_alloc); 922DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
907 923
908#define DEFINE_SIMPLE_IO_EVENT(name) \ 924DECLARE_EVENT_CLASS(xfs_simple_io_class,
909TRACE_EVENT(name, \ 925 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
910 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \ 926 TP_ARGS(ip, offset, count),
911 TP_ARGS(ip, offset, count), \ 927 TP_STRUCT__entry(
912 TP_STRUCT__entry( \ 928 __field(dev_t, dev)
913 __field(dev_t, dev) \ 929 __field(xfs_ino_t, ino)
914 __field(xfs_ino_t, ino) \ 930 __field(loff_t, size)
915 __field(loff_t, size) \ 931 __field(loff_t, new_size)
916 __field(loff_t, new_size) \ 932 __field(loff_t, offset)
917 __field(loff_t, offset) \ 933 __field(size_t, count)
918 __field(size_t, count) \ 934 ),
919 ), \ 935 TP_fast_assign(
920 TP_fast_assign( \ 936 __entry->dev = VFS_I(ip)->i_sb->s_dev;
921 __entry->dev = VFS_I(ip)->i_sb->s_dev; \ 937 __entry->ino = ip->i_ino;
922 __entry->ino = ip->i_ino; \ 938 __entry->size = ip->i_d.di_size;
923 __entry->size = ip->i_d.di_size; \ 939 __entry->new_size = ip->i_new_size;
924 __entry->new_size = ip->i_new_size; \ 940 __entry->offset = offset;
925 __entry->offset = offset; \ 941 __entry->count = count;
926 __entry->count = count; \ 942 ),
927 ), \ 943 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
928 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ 944 "offset 0x%llx count %zd",
929 "offset 0x%llx count %zd", \ 945 MAJOR(__entry->dev), MINOR(__entry->dev),
930 MAJOR(__entry->dev), MINOR(__entry->dev), \ 946 __entry->ino,
931 __entry->ino, \ 947 __entry->size,
932 __entry->size, \ 948 __entry->new_size,
933 __entry->new_size, \ 949 __entry->offset,
934 __entry->offset, \ 950 __entry->count)
935 __entry->count) \
936); 951);
952
953#define DEFINE_SIMPLE_IO_EVENT(name) \
954DEFINE_EVENT(xfs_simple_io_class, name, \
955 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \
956 TP_ARGS(ip, offset, count))
937DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); 957DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
938DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); 958DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
939 959
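
All of the conversions above follow one recipe: hoist the shared TP_STRUCT__entry/TP_fast_assign/TP_printk body into a DECLARE_EVENT_CLASS, then stamp out each tracepoint with a two-line DEFINE_EVENT, which shrinks the generated code per event. A minimal kernel-context sketch of the recipe (hypothetical names; the TRACE_SYSTEM/TRACE_INCLUDE boilerplate a real trace header needs is omitted):

DECLARE_EVENT_CLASS(sample_class,
	TP_PROTO(struct inode *inode),
	TP_ARGS(inode),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(unsigned long, ino)
	),
	TP_fast_assign(
		__entry->dev = inode->i_sb->s_dev;
		__entry->ino = inode->i_ino;
	),
	TP_printk("dev %d:%d ino 0x%lx",
		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino)
);

#define DEFINE_SAMPLE_EVENT(name) \
DEFINE_EVENT(sample_class, name, \
	TP_PROTO(struct inode *inode), \
	TP_ARGS(inode))
DEFINE_SAMPLE_EVENT(sample_enter);
DEFINE_SAMPLE_EVENT(sample_exit);
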
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 38e764146644..2d8b7bc792c9 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -249,8 +249,10 @@ xfs_qm_hold_quotafs_ref(
249 249
250 if (!xfs_Gqm) { 250 if (!xfs_Gqm) {
251 xfs_Gqm = xfs_Gqm_init(); 251 xfs_Gqm = xfs_Gqm_init();
252 if (!xfs_Gqm) 252 if (!xfs_Gqm) {
253 mutex_unlock(&xfs_Gqm_lock);
253 return ENOMEM; 254 return ENOMEM;
255 }
254 } 256 }
255 257
256 /* 258 /*
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 401f364ad36c..4917d4eed4ed 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -227,7 +227,6 @@ typedef struct xfs_perag {
227 227
228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */ 228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
229 229
230 int pag_ici_init; /* incore inode cache initialised */
231 rwlock_t pag_ici_lock; /* incore inode lock */ 230 rwlock_t pag_ici_lock; /* incore inode lock */
232 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 231 struct radix_tree_root pag_ici_root; /* incore inode cache root */
233 int pag_ici_reclaimable; /* reclaimable inodes */ 232 int pag_ici_reclaimable; /* reclaimable inodes */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 6845db90818f..75df75f43d48 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -382,9 +382,6 @@ xfs_iget(
382 382
383 /* get the perag structure and ensure that it's inode capable */ 383 /* get the perag structure and ensure that it's inode capable */
384 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); 384 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
385 if (!pag->pagi_inodeok)
386 return EINVAL;
387 ASSERT(pag->pag_ici_init);
388 agino = XFS_INO_TO_AGINO(mp, ino); 385 agino = XFS_INO_TO_AGINO(mp, ino);
389 386
390again: 387again:
@@ -744,30 +741,24 @@ xfs_ilock_demote(
744} 741}
745 742
746#ifdef DEBUG 743#ifdef DEBUG
747/*
748 * Debug-only routine, without additional rw_semaphore APIs, we can
749 * now only answer requests regarding whether we hold the lock for write
750 * (reader state is outside our visibility, we only track writer state).
751 *
752 * Note: this means !xfs_isilocked would give false positives, so don't do that.
753 */
754int 744int
755xfs_isilocked( 745xfs_isilocked(
756 xfs_inode_t *ip, 746 xfs_inode_t *ip,
757 uint lock_flags) 747 uint lock_flags)
758{ 748{
759 if ((lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) == 749 if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
760 XFS_ILOCK_EXCL) { 750 if (!(lock_flags & XFS_ILOCK_SHARED))
761 if (!ip->i_lock.mr_writer) 751 return !!ip->i_lock.mr_writer;
762 return 0; 752 return rwsem_is_locked(&ip->i_lock.mr_lock);
763 } 753 }
764 754
765 if ((lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) == 755 if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
766 XFS_IOLOCK_EXCL) { 756 if (!(lock_flags & XFS_IOLOCK_SHARED))
767 if (!ip->i_iolock.mr_writer) 757 return !!ip->i_iolock.mr_writer;
768 return 0; 758 return rwsem_is_locked(&ip->i_iolock.mr_lock);
769 } 759 }
770 760
771 return 1; 761 ASSERT(0);
762 return 0;
772} 763}
773#endif 764#endif
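
The rewritten xfs_isilocked() can now answer "is this lock held in any mode?" via rwsem_is_locked(), where the old version could only verify exclusive holds through mr_writer. Typical assertion use, sketched in kernel context:

/* held in either mode is fine */
ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));

/* must be held exclusively */
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
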
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 8cd6e8d8fe9c..d53c39de7d05 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1940,10 +1940,10 @@ xfs_ifree_cluster(
1940 int blks_per_cluster; 1940 int blks_per_cluster;
1941 int nbufs; 1941 int nbufs;
1942 int ninodes; 1942 int ninodes;
1943 int i, j, found, pre_flushed; 1943 int i, j;
1944 xfs_daddr_t blkno; 1944 xfs_daddr_t blkno;
1945 xfs_buf_t *bp; 1945 xfs_buf_t *bp;
1946 xfs_inode_t *ip, **ip_found; 1946 xfs_inode_t *ip;
1947 xfs_inode_log_item_t *iip; 1947 xfs_inode_log_item_t *iip;
1948 xfs_log_item_t *lip; 1948 xfs_log_item_t *lip;
1949 struct xfs_perag *pag; 1949 struct xfs_perag *pag;
@@ -1960,114 +1960,97 @@ xfs_ifree_cluster(
1960 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster; 1960 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
1961 } 1961 }
1962 1962
1963 ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS);
1964
1965 for (j = 0; j < nbufs; j++, inum += ninodes) { 1963 for (j = 0; j < nbufs; j++, inum += ninodes) {
1964 int found = 0;
1965
1966 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 1966 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
1967 XFS_INO_TO_AGBNO(mp, inum)); 1967 XFS_INO_TO_AGBNO(mp, inum));
1968 1968
1969 /*
1970 * We obtain and lock the backing buffer first in the process
1971 * here, as we have to ensure that any dirty inode that we
1972 * can't get the flush lock on is attached to the buffer.
1973 * If we scan the in-memory inodes first, then buffer IO can
1974 * complete before we get a lock on it, and hence we may fail
1975 * to mark all the active inodes on the buffer stale.
1976 */
1977 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
1978 mp->m_bsize * blks_per_cluster,
1979 XBF_LOCK);
1980
1981 /*
1982 * Walk the inodes already attached to the buffer and mark them
1983 * stale. These will all have the flush locks held, so an
1984 * in-memory inode walk can't lock them.
1985 */
1986 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1987 while (lip) {
1988 if (lip->li_type == XFS_LI_INODE) {
1989 iip = (xfs_inode_log_item_t *)lip;
1990 ASSERT(iip->ili_logged == 1);
1991 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
1992 xfs_trans_ail_copy_lsn(mp->m_ail,
1993 &iip->ili_flush_lsn,
1994 &iip->ili_item.li_lsn);
1995 xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
1996 found++;
1997 }
1998 lip = lip->li_bio_list;
1999 }
1969 2000
1970 /* 2001 /*
1971 * Look for each inode in memory and attempt to lock it, 2002 * For each inode in memory attempt to add it to the inode
1972 * we can be racing with flush and tail pushing here. 2003 * buffer and set it up for being staled on buffer IO
1973 * any inode we get the locks on, add to an array of 2004 * completion. This is safe as we've locked out tail pushing
1974 * inode items to process later. 2005 * and flushing by locking the buffer.
1975 * 2006 *
1976 * To get the buffer lock, we could beat a flush 2007 * We have already marked every inode that was part of a
1977 * or tail pushing thread to the lock here, in which 2008 * transaction stale above, which means there is no point in
1978 * case they will go looking for the inode buffer 2009 * even trying to lock them.
1979 * and fail, we need some other form of interlock
1980 * here.
1981 */ 2010 */
1982 found = 0;
1983 for (i = 0; i < ninodes; i++) { 2011 for (i = 0; i < ninodes; i++) {
1984 read_lock(&pag->pag_ici_lock); 2012 read_lock(&pag->pag_ici_lock);
1985 ip = radix_tree_lookup(&pag->pag_ici_root, 2013 ip = radix_tree_lookup(&pag->pag_ici_root,
1986 XFS_INO_TO_AGINO(mp, (inum + i))); 2014 XFS_INO_TO_AGINO(mp, (inum + i)));
1987 2015
1988 /* Inode not in memory or we found it already, 2016 /* Inode not in memory or stale, nothing to do */
1989 * nothing to do
1990 */
1991 if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { 2017 if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
1992 read_unlock(&pag->pag_ici_lock); 2018 read_unlock(&pag->pag_ici_lock);
1993 continue; 2019 continue;
1994 } 2020 }
1995 2021
1996 if (xfs_inode_clean(ip)) { 2022 /* don't try to lock/unlock the current inode */
1997 read_unlock(&pag->pag_ici_lock); 2023 if (ip != free_ip &&
1998 continue; 2024 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
1999 }
2000
2001 /* If we can get the locks then add it to the
2002 * list, otherwise by the time we get the bp lock
2003 * below it will already be attached to the
2004 * inode buffer.
2005 */
2006
2007 /* This inode will already be locked - by us, lets
2008 * keep it that way.
2009 */
2010
2011 if (ip == free_ip) {
2012 if (xfs_iflock_nowait(ip)) {
2013 xfs_iflags_set(ip, XFS_ISTALE);
2014 if (xfs_inode_clean(ip)) {
2015 xfs_ifunlock(ip);
2016 } else {
2017 ip_found[found++] = ip;
2018 }
2019 }
2020 read_unlock(&pag->pag_ici_lock); 2025 read_unlock(&pag->pag_ici_lock);
2021 continue; 2026 continue;
2022 } 2027 }
2028 read_unlock(&pag->pag_ici_lock);
2023 2029
2024 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2030 if (!xfs_iflock_nowait(ip)) {
2025 if (xfs_iflock_nowait(ip)) { 2031 if (ip != free_ip)
2026 xfs_iflags_set(ip, XFS_ISTALE);
2027
2028 if (xfs_inode_clean(ip)) {
2029 xfs_ifunlock(ip);
2030 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2031 } else {
2032 ip_found[found++] = ip;
2033 }
2034 } else {
2035 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2032 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2036 } 2033 continue;
2037 } 2034 }
2038 read_unlock(&pag->pag_ici_lock);
2039 }
2040 2035
2041 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 2036 xfs_iflags_set(ip, XFS_ISTALE);
2042 mp->m_bsize * blks_per_cluster, 2037 if (xfs_inode_clean(ip)) {
2043 XBF_LOCK); 2038 ASSERT(ip != free_ip);
2044 2039 xfs_ifunlock(ip);
2045 pre_flushed = 0; 2040 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2046 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 2041 continue;
2047 while (lip) {
2048 if (lip->li_type == XFS_LI_INODE) {
2049 iip = (xfs_inode_log_item_t *)lip;
2050 ASSERT(iip->ili_logged == 1);
2051 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
2052 xfs_trans_ail_copy_lsn(mp->m_ail,
2053 &iip->ili_flush_lsn,
2054 &iip->ili_item.li_lsn);
2055 xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
2056 pre_flushed++;
2057 } 2042 }
2058 lip = lip->li_bio_list;
2059 }
2060 2043
2061 for (i = 0; i < found; i++) {
2062 ip = ip_found[i];
2063 iip = ip->i_itemp; 2044 iip = ip->i_itemp;
2064
2065 if (!iip) { 2045 if (!iip) {
2046 /* inode with unlogged changes only */
2047 ASSERT(ip != free_ip);
2066 ip->i_update_core = 0; 2048 ip->i_update_core = 0;
2067 xfs_ifunlock(ip); 2049 xfs_ifunlock(ip);
2068 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2050 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2069 continue; 2051 continue;
2070 } 2052 }
2053 found++;
2071 2054
2072 iip->ili_last_fields = iip->ili_format.ilf_fields; 2055 iip->ili_last_fields = iip->ili_format.ilf_fields;
2073 iip->ili_format.ilf_fields = 0; 2056 iip->ili_format.ilf_fields = 0;
@@ -2078,17 +2061,16 @@ xfs_ifree_cluster(
2078 xfs_buf_attach_iodone(bp, 2061 xfs_buf_attach_iodone(bp,
2079 (void(*)(xfs_buf_t*,xfs_log_item_t*)) 2062 (void(*)(xfs_buf_t*,xfs_log_item_t*))
2080 xfs_istale_done, (xfs_log_item_t *)iip); 2063 xfs_istale_done, (xfs_log_item_t *)iip);
2081 if (ip != free_ip) { 2064
2065 if (ip != free_ip)
2082 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2066 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2083 }
2084 } 2067 }
2085 2068
2086 if (found || pre_flushed) 2069 if (found)
2087 xfs_trans_stale_inode_buf(tp, bp); 2070 xfs_trans_stale_inode_buf(tp, bp);
2088 xfs_trans_binval(tp, bp); 2071 xfs_trans_binval(tp, bp);
2089 } 2072 }
2090 2073
2091 kmem_free(ip_found);
2092 xfs_perag_put(pag); 2074 xfs_perag_put(pag);
2093} 2075}
2094 2076
@@ -2649,8 +2631,6 @@ xfs_iflush_cluster(
2649 int i; 2631 int i;
2650 2632
2651 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 2633 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2652 ASSERT(pag->pagi_inodeok);
2653 ASSERT(pag->pag_ici_init);
2654 2634
2655 inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; 2635 inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
2656 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); 2636 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 14a69aec2c0b..ed0684cc50ee 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -132,15 +132,10 @@ xlog_align(
132 int nbblks, 132 int nbblks,
133 xfs_buf_t *bp) 133 xfs_buf_t *bp)
134{ 134{
135 xfs_daddr_t offset; 135 xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
136 xfs_caddr_t ptr;
137 136
138 offset = blk_no & ((xfs_daddr_t) log->l_sectBBsize - 1); 137 ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp));
139 ptr = XFS_BUF_PTR(bp) + BBTOB(offset); 138 return XFS_BUF_PTR(bp) + BBTOB(offset);
140
141 ASSERT(ptr + BBTOB(nbblks) <= XFS_BUF_PTR(bp) + XFS_BUF_SIZE(bp));
142
143 return ptr;
144} 139}
145 140
146 141
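
Editor's note: the tightened xlog_align() above relies on l_sectBBsize being a power of two, so the sector offset of blk_no falls out of a single mask, and BBTOB() converts that offset in 512-byte basic blocks into a byte offset into the buffer. A stand-alone sketch of the arithmetic; BBSHIFT/BBTOB mirror the XFS definitions, and the sector size is an assumed example value:

#include <stdio.h>
#include <stdint.h>

#define BBSHIFT   9				/* 512-byte basic blocks */
#define BBTOB(bb) ((uint64_t)(bb) << BBSHIFT)

int main(void)
{
	uint64_t l_sectBBsize = 8;		/* assumed: 4 KiB log sector = 8 BBs */
	uint64_t blk_no = 1234;

	/* offset of blk_no within its sector, in basic blocks */
	uint64_t offset = blk_no & (l_sectBBsize - 1);

	printf("blk_no %llu -> %llu BBs = %llu bytes into the buffer\n",
	       (unsigned long long)blk_no,
	       (unsigned long long)offset,
	       (unsigned long long)BBTOB(offset));	/* 1234 -> 2 BBs = 1024 bytes */
	return 0;
}
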
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d7bf38c8cd1c..d59f4e8bedcf 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -268,10 +268,10 @@ xfs_sb_validate_fsb_count(
268 268
269#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */ 269#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */
270 if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX) 270 if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
271 return E2BIG; 271 return EFBIG;
272#else /* Limited by UINT_MAX of sectors */ 272#else /* Limited by UINT_MAX of sectors */
273 if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX) 273 if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX)
274 return E2BIG; 274 return EFBIG;
275#endif 275#endif
276 return 0; 276 return 0;
277} 277}
@@ -393,7 +393,7 @@ xfs_mount_validate_sb(
393 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { 393 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
394 xfs_fs_mount_cmn_err(flags, 394 xfs_fs_mount_cmn_err(flags,
395 "file system too large to be mounted on this system."); 395 "file system too large to be mounted on this system.");
396 return XFS_ERROR(E2BIG); 396 return XFS_ERROR(EFBIG);
397 } 397 }
398 398
399 if (unlikely(sbp->sb_inprogress)) { 399 if (unlikely(sbp->sb_inprogress)) {
@@ -413,17 +413,6 @@ xfs_mount_validate_sb(
413 return 0; 413 return 0;
414} 414}
415 415
416STATIC void
417xfs_initialize_perag_icache(
418 xfs_perag_t *pag)
419{
420 if (!pag->pag_ici_init) {
421 rwlock_init(&pag->pag_ici_lock);
422 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
423 pag->pag_ici_init = 1;
424 }
425}
426
427int 416int
428xfs_initialize_perag( 417xfs_initialize_perag(
429 xfs_mount_t *mp, 418 xfs_mount_t *mp,
@@ -436,13 +425,8 @@ xfs_initialize_perag(
436 xfs_agino_t agino; 425 xfs_agino_t agino;
437 xfs_ino_t ino; 426 xfs_ino_t ino;
438 xfs_sb_t *sbp = &mp->m_sb; 427 xfs_sb_t *sbp = &mp->m_sb;
439 xfs_ino_t max_inum = XFS_MAXINUMBER_32;
440 int error = -ENOMEM; 428 int error = -ENOMEM;
441 429
442 /* Check to see if the filesystem can overflow 32 bit inodes */
443 agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
444 ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
445
446 /* 430 /*
447 * Walk the current per-ag tree so we don't try to initialise AGs 431 * Walk the current per-ag tree so we don't try to initialise AGs
448 * that already exist (growfs case). Allocate and insert all the 432 * that already exist (growfs case). Allocate and insert all the
@@ -456,11 +440,18 @@ xfs_initialize_perag(
456 } 440 }
457 if (!first_initialised) 441 if (!first_initialised)
458 first_initialised = index; 442 first_initialised = index;
443
459 pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); 444 pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
460 if (!pag) 445 if (!pag)
461 goto out_unwind; 446 goto out_unwind;
447 pag->pag_agno = index;
448 pag->pag_mount = mp;
449 rwlock_init(&pag->pag_ici_lock);
450 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
451
462 if (radix_tree_preload(GFP_NOFS)) 452 if (radix_tree_preload(GFP_NOFS))
463 goto out_unwind; 453 goto out_unwind;
454
464 spin_lock(&mp->m_perag_lock); 455 spin_lock(&mp->m_perag_lock);
465 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { 456 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
466 BUG(); 457 BUG();
@@ -469,25 +460,26 @@ xfs_initialize_perag(
469 error = -EEXIST; 460 error = -EEXIST;
470 goto out_unwind; 461 goto out_unwind;
471 } 462 }
472 pag->pag_agno = index;
473 pag->pag_mount = mp;
474 spin_unlock(&mp->m_perag_lock); 463 spin_unlock(&mp->m_perag_lock);
475 radix_tree_preload_end(); 464 radix_tree_preload_end();
476 } 465 }
477 466
478 /* Clear the mount flag if no inode can overflow 32 bits 467 /*
 479 * on this filesystem, or if specifically requested.. 468 * If we mount with the inode64 option, or if no inode
 469 * can overflow the legacy 32-bit address space, clear the inode32 flag.
480 */ 470 */
481 if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > max_inum) { 471 agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
472 ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
473
474 if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
482 mp->m_flags |= XFS_MOUNT_32BITINODES; 475 mp->m_flags |= XFS_MOUNT_32BITINODES;
483 } else { 476 else
484 mp->m_flags &= ~XFS_MOUNT_32BITINODES; 477 mp->m_flags &= ~XFS_MOUNT_32BITINODES;
485 }
486 478
487 /* If we can overflow then setup the ag headers accordingly */
488 if (mp->m_flags & XFS_MOUNT_32BITINODES) { 479 if (mp->m_flags & XFS_MOUNT_32BITINODES) {
489 /* Calculate how much should be reserved for inodes to 480 /*
490 * meet the max inode percentage. 481 * Calculate how much should be reserved for inodes to meet
482 * the max inode percentage.
491 */ 483 */
492 if (mp->m_maxicount) { 484 if (mp->m_maxicount) {
493 __uint64_t icount; 485 __uint64_t icount;
@@ -500,30 +492,28 @@ xfs_initialize_perag(
500 } else { 492 } else {
501 max_metadata = agcount; 493 max_metadata = agcount;
502 } 494 }
495
503 for (index = 0; index < agcount; index++) { 496 for (index = 0; index < agcount; index++) {
504 ino = XFS_AGINO_TO_INO(mp, index, agino); 497 ino = XFS_AGINO_TO_INO(mp, index, agino);
505 if (ino > max_inum) { 498 if (ino > XFS_MAXINUMBER_32) {
506 index++; 499 index++;
507 break; 500 break;
508 } 501 }
509 502
510 /* This ag is preferred for inodes */
511 pag = xfs_perag_get(mp, index); 503 pag = xfs_perag_get(mp, index);
512 pag->pagi_inodeok = 1; 504 pag->pagi_inodeok = 1;
513 if (index < max_metadata) 505 if (index < max_metadata)
514 pag->pagf_metadata = 1; 506 pag->pagf_metadata = 1;
515 xfs_initialize_perag_icache(pag);
516 xfs_perag_put(pag); 507 xfs_perag_put(pag);
517 } 508 }
518 } else { 509 } else {
519 /* Setup default behavior for smaller filesystems */
520 for (index = 0; index < agcount; index++) { 510 for (index = 0; index < agcount; index++) {
521 pag = xfs_perag_get(mp, index); 511 pag = xfs_perag_get(mp, index);
522 pag->pagi_inodeok = 1; 512 pag->pagi_inodeok = 1;
523 xfs_initialize_perag_icache(pag);
524 xfs_perag_put(pag); 513 xfs_perag_put(pag);
525 } 514 }
526 } 515 }
516
527 if (maxagi) 517 if (maxagi)
528 *maxagi = index; 518 *maxagi = index;
529 return 0; 519 return 0;
@@ -1009,7 +999,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1009 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); 999 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
1010 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { 1000 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
1011 cmn_err(CE_WARN, "XFS: size check 1 failed"); 1001 cmn_err(CE_WARN, "XFS: size check 1 failed");
1012 return XFS_ERROR(E2BIG); 1002 return XFS_ERROR(EFBIG);
1013 } 1003 }
1014 error = xfs_read_buf(mp, mp->m_ddev_targp, 1004 error = xfs_read_buf(mp, mp->m_ddev_targp,
1015 d - XFS_FSS_TO_BB(mp, 1), 1005 d - XFS_FSS_TO_BB(mp, 1),
@@ -1019,7 +1009,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1019 } else { 1009 } else {
1020 cmn_err(CE_WARN, "XFS: size check 2 failed"); 1010 cmn_err(CE_WARN, "XFS: size check 2 failed");
1021 if (error == ENOSPC) 1011 if (error == ENOSPC)
1022 error = XFS_ERROR(E2BIG); 1012 error = XFS_ERROR(EFBIG);
1023 return error; 1013 return error;
1024 } 1014 }
1025 1015
@@ -1027,7 +1017,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1027 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); 1017 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
1028 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { 1018 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
1029 cmn_err(CE_WARN, "XFS: size check 3 failed"); 1019 cmn_err(CE_WARN, "XFS: size check 3 failed");
1030 return XFS_ERROR(E2BIG); 1020 return XFS_ERROR(EFBIG);
1031 } 1021 }
1032 error = xfs_read_buf(mp, mp->m_logdev_targp, 1022 error = xfs_read_buf(mp, mp->m_logdev_targp,
1033 d - XFS_FSB_TO_BB(mp, 1), 1023 d - XFS_FSB_TO_BB(mp, 1),
@@ -1037,7 +1027,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1037 } else { 1027 } else {
1038 cmn_err(CE_WARN, "XFS: size check 3 failed"); 1028 cmn_err(CE_WARN, "XFS: size check 3 failed");
1039 if (error == ENOSPC) 1029 if (error == ENOSPC)
1040 error = XFS_ERROR(E2BIG); 1030 error = XFS_ERROR(EFBIG);
1041 return error; 1031 return error;
1042 } 1032 }
1043 } 1033 }
@@ -1254,7 +1244,7 @@ xfs_mountfs(
1254 * Allocate and initialize the per-ag data. 1244 * Allocate and initialize the per-ag data.
1255 */ 1245 */
1256 spin_lock_init(&mp->m_perag_lock); 1246 spin_lock_init(&mp->m_perag_lock);
1257 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_NOFS); 1247 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
1258 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); 1248 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
1259 if (error) { 1249 if (error) {
1260 cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error); 1250 cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error);
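
Editor's note: with xfs_initialize_perag_icache() folded away, the per-ag inode cache is now initialised unconditionally when the pag is allocated, and the inode32 decision above reduces to computing the highest inode number the geometry can produce and comparing it against XFS_MAXINUMBER_32. A stand-alone sketch of that overflow test; the geometry values and the simplified shift math are assumptions standing in for XFS_OFFBNO_TO_AGINO()/XFS_AGINO_TO_INO():

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t agblocks = 1048576;	/* assumed: blocks per AG */
	uint64_t inopblog = 4;		/* assumed: log2(inodes per block) */
	uint32_t agcount  = 300;	/* assumed: number of AGs */
	uint64_t max_inum32 = (1ULL << 32) - 1;

	/* bits needed to address any inode within one AG */
	uint64_t agino_log = 0;
	while ((1ULL << agino_log) < (agblocks << inopblog))
		agino_log++;

	/* highest agino: last block of the AG, inode slot 0 */
	uint64_t agino = (agblocks - 1) << inopblog;
	uint64_t ino = ((uint64_t)(agcount - 1) << agino_log) | agino;

	printf("highest ino %llu %s 32 bits\n", (unsigned long long)ino,
	       ino > max_inum32 ? "overflows" : "fits in");
	return 0;
}
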
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 6be05f756d59..16445518506d 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -2247,7 +2247,7 @@ xfs_rtmount_init(
2247 cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu", 2247 cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu",
2248 (unsigned long long) XFS_BB_TO_FSB(mp, d), 2248 (unsigned long long) XFS_BB_TO_FSB(mp, d),
2249 (unsigned long long) mp->m_sb.sb_rblocks); 2249 (unsigned long long) mp->m_sb.sb_rblocks);
2250 return XFS_ERROR(E2BIG); 2250 return XFS_ERROR(EFBIG);
2251 } 2251 }
2252 error = xfs_read_buf(mp, mp->m_rtdev_targp, 2252 error = xfs_read_buf(mp, mp->m_rtdev_targp,
2253 d - XFS_FSB_TO_BB(mp, 1), 2253 d - XFS_FSB_TO_BB(mp, 1),
@@ -2256,7 +2256,7 @@ xfs_rtmount_init(
2256 cmn_err(CE_WARN, 2256 cmn_err(CE_WARN,
2257 "XFS: realtime mount -- xfs_read_buf failed, returned %d", error); 2257 "XFS: realtime mount -- xfs_read_buf failed, returned %d", error);
2258 if (error == ENOSPC) 2258 if (error == ENOSPC)
2259 return XFS_ERROR(E2BIG); 2259 return XFS_ERROR(EFBIG);
2260 return error; 2260 return error;
2261 } 2261 }
2262 xfs_buf_relse(bp); 2262 xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index b2d67adb6a08..ff614c29b441 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -147,7 +147,16 @@ xfs_growfs_rt(
147# define xfs_rtfree_extent(t,b,l) (ENOSYS) 147# define xfs_rtfree_extent(t,b,l) (ENOSYS)
148# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) 148# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS)
149# define xfs_growfs_rt(mp,in) (ENOSYS) 149# define xfs_growfs_rt(mp,in) (ENOSYS)
150# define xfs_rtmount_init(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) 150static inline int /* error */
151xfs_rtmount_init(
152 xfs_mount_t *mp) /* file system mount structure */
153{
154 if (mp->m_sb.sb_rblocks == 0)
155 return 0;
156
157 cmn_err(CE_WARN, "XFS: Not built with CONFIG_XFS_RT");
158 return ENOSYS;
159}
151# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) 160# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
152# define xfs_rtunmount_inodes(m) 161# define xfs_rtunmount_inodes(m)
153#endif /* CONFIG_XFS_RT */ 162#endif /* CONFIG_XFS_RT */
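
Editor's note: turning xfs_rtmount_init() into a static inline buys more than the new warning. The old macro (like the xfs_rtmount_inodes() one left behind) names its parameter (m) but expands (mp), so the argument was never evaluated or type-checked; an inline function gets both. A small user-space illustration of the difference, with invented names and ENOSYS assumed to be 38:

#include <stdio.h>

struct demo_mount { unsigned long long rblocks; };

/* old style: parameter m is never used, the body captures the caller's mp */
#define rtmount_init_macro(m) (((mp)->rblocks == 0) ? 0 : 38 /* ENOSYS */)

static inline int rtmount_init_inline(struct demo_mount *mp)
{
	if (mp->rblocks == 0)
		return 0;
	fprintf(stderr, "not built with realtime support\n");
	return 38;	/* ENOSYS */
}

int main(void)
{
	struct demo_mount m = { 0 }, *mp = &m;

	/* the macro even accepts an undeclared identifier, since its
	 * argument vanishes during expansion; the inline cannot */
	printf("%d %d\n", rtmount_init_macro(whatever), rtmount_init_inline(mp));
	return 0;
}
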
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index ce558efa2ea0..28547dfce037 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -48,134 +48,489 @@
48 48
49kmem_zone_t *xfs_trans_zone; 49kmem_zone_t *xfs_trans_zone;
50 50
51
51/* 52/*
52 * Reservation functions here avoid a huge stack in xfs_trans_init 53 * Various log reservation values.
53 * due to register overflow from temporaries in the calculations. 54 *
55 * These are based on the size of the file system block because that is what
56 * most transactions manipulate. Each adds in an additional 128 bytes per
57 * item logged to try to account for the overhead of the transaction mechanism.
58 *
59 * Note: Most of the reservations underestimate the number of allocation
60 * groups into which they could free extents in the xfs_bmap_finish() call.
61 * This is because the number in the worst case is quite high and quite
62 * unusual. In order to fix this we need to change xfs_bmap_finish() to free
63 * extents in only a single AG at a time. This will require changes to the
64 * EFI code as well, however, so that the EFI for the extents not freed is
65 * logged again in each transaction. See SGI PV #261917.
66 *
67 * Reservation functions here avoid a huge stack in xfs_trans_init due to
68 * register overflow from temporaries in the calculations.
69 */
70
71
72/*
73 * In a write transaction we can allocate a maximum of 2
74 * extents. This gives:
75 * the inode getting the new extents: inode size
76 * the inode's bmap btree: max depth * block size
77 * the agfs of the ags from which the extents are allocated: 2 * sector
78 * the superblock free block counter: sector size
79 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
80 * And the bmap_finish transaction can free bmap blocks in a join:
81 * the agfs of the ags containing the blocks: 2 * sector size
82 * the agfls of the ags containing the blocks: 2 * sector size
83 * the super block free block counter: sector size
84 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
54 */ 85 */
55STATIC uint 86STATIC uint
56xfs_calc_write_reservation(xfs_mount_t *mp) 87xfs_calc_write_reservation(
88 struct xfs_mount *mp)
57{ 89{
58 return XFS_CALC_WRITE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 90 return XFS_DQUOT_LOGRES(mp) +
91 MAX((mp->m_sb.sb_inodesize +
92 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
93 2 * mp->m_sb.sb_sectsize +
94 mp->m_sb.sb_sectsize +
95 XFS_ALLOCFREE_LOG_RES(mp, 2) +
96 128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
97 XFS_ALLOCFREE_LOG_COUNT(mp, 2))),
98 (2 * mp->m_sb.sb_sectsize +
99 2 * mp->m_sb.sb_sectsize +
100 mp->m_sb.sb_sectsize +
101 XFS_ALLOCFREE_LOG_RES(mp, 2) +
102 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
59} 103}
60 104
105/*
106 * In truncating a file we free up to two extents at once. We can modify:
107 * the inode being truncated: inode size
108 * the inode's bmap btree: (max depth + 1) * block size
109 * And the bmap_finish transaction can free the blocks and bmap blocks:
110 * the agf for each of the ags: 4 * sector size
111 * the agfl for each of the ags: 4 * sector size
112 * the super block to reflect the freed blocks: sector size
113 * worst case split in allocation btrees per extent assuming 4 extents:
114 * 4 exts * 2 trees * (2 * max depth - 1) * block size
115 * the inode btree: max depth * blocksize
116 * the allocation btrees: 2 trees * (max depth - 1) * block size
117 */
61STATIC uint 118STATIC uint
62xfs_calc_itruncate_reservation(xfs_mount_t *mp) 119xfs_calc_itruncate_reservation(
120 struct xfs_mount *mp)
63{ 121{
64 return XFS_CALC_ITRUNCATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 122 return XFS_DQUOT_LOGRES(mp) +
123 MAX((mp->m_sb.sb_inodesize +
124 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) +
125 128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
126 (4 * mp->m_sb.sb_sectsize +
127 4 * mp->m_sb.sb_sectsize +
128 mp->m_sb.sb_sectsize +
129 XFS_ALLOCFREE_LOG_RES(mp, 4) +
130 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)) +
131 128 * 5 +
132 XFS_ALLOCFREE_LOG_RES(mp, 1) +
133 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
134 XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
65} 135}
66 136
137/*
 138 * In renaming files we can modify:
139 * the four inodes involved: 4 * inode size
140 * the two directory btrees: 2 * (max depth + v2) * dir block size
141 * the two directory bmap btrees: 2 * max depth * block size
142 * And the bmap_finish transaction can free dir and bmap blocks (two sets
143 * of bmap blocks) giving:
144 * the agf for the ags in which the blocks live: 3 * sector size
145 * the agfl for the ags in which the blocks live: 3 * sector size
146 * the superblock for the free block count: sector size
147 * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
148 */
67STATIC uint 149STATIC uint
68xfs_calc_rename_reservation(xfs_mount_t *mp) 150xfs_calc_rename_reservation(
151 struct xfs_mount *mp)
69{ 152{
70 return XFS_CALC_RENAME_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 153 return XFS_DQUOT_LOGRES(mp) +
154 MAX((4 * mp->m_sb.sb_inodesize +
155 2 * XFS_DIROP_LOG_RES(mp) +
156 128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp))),
157 (3 * mp->m_sb.sb_sectsize +
158 3 * mp->m_sb.sb_sectsize +
159 mp->m_sb.sb_sectsize +
160 XFS_ALLOCFREE_LOG_RES(mp, 3) +
161 128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))));
71} 162}
72 163
164/*
165 * For creating a link to an inode:
166 * the parent directory inode: inode size
167 * the linked inode: inode size
168 * the directory btree could split: (max depth + v2) * dir block size
169 * the directory bmap btree could join or split: (max depth + v2) * blocksize
170 * And the bmap_finish transaction can free some bmap blocks giving:
171 * the agf for the ag in which the blocks live: sector size
172 * the agfl for the ag in which the blocks live: sector size
173 * the superblock for the free block count: sector size
174 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
175 */
73STATIC uint 176STATIC uint
74xfs_calc_link_reservation(xfs_mount_t *mp) 177xfs_calc_link_reservation(
178 struct xfs_mount *mp)
75{ 179{
76 return XFS_CALC_LINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 180 return XFS_DQUOT_LOGRES(mp) +
181 MAX((mp->m_sb.sb_inodesize +
182 mp->m_sb.sb_inodesize +
183 XFS_DIROP_LOG_RES(mp) +
184 128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
185 (mp->m_sb.sb_sectsize +
186 mp->m_sb.sb_sectsize +
187 mp->m_sb.sb_sectsize +
188 XFS_ALLOCFREE_LOG_RES(mp, 1) +
189 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
77} 190}
78 191
192/*
193 * For removing a directory entry we can modify:
194 * the parent directory inode: inode size
195 * the removed inode: inode size
196 * the directory btree could join: (max depth + v2) * dir block size
197 * the directory bmap btree could join or split: (max depth + v2) * blocksize
198 * And the bmap_finish transaction can free the dir and bmap blocks giving:
199 * the agf for the ag in which the blocks live: 2 * sector size
200 * the agfl for the ag in which the blocks live: 2 * sector size
201 * the superblock for the free block count: sector size
202 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
203 */
79STATIC uint 204STATIC uint
80xfs_calc_remove_reservation(xfs_mount_t *mp) 205xfs_calc_remove_reservation(
206 struct xfs_mount *mp)
81{ 207{
82 return XFS_CALC_REMOVE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 208 return XFS_DQUOT_LOGRES(mp) +
209 MAX((mp->m_sb.sb_inodesize +
210 mp->m_sb.sb_inodesize +
211 XFS_DIROP_LOG_RES(mp) +
212 128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
213 (2 * mp->m_sb.sb_sectsize +
214 2 * mp->m_sb.sb_sectsize +
215 mp->m_sb.sb_sectsize +
216 XFS_ALLOCFREE_LOG_RES(mp, 2) +
217 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
83} 218}
84 219
220/*
221 * For symlink we can modify:
222 * the parent directory inode: inode size
223 * the new inode: inode size
224 * the inode btree entry: 1 block
225 * the directory btree: (max depth + v2) * dir block size
226 * the directory inode's bmap btree: (max depth + v2) * block size
227 * the blocks for the symlink: 1 kB
228 * Or in the first xact we allocate some inodes giving:
229 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
230 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
231 * the inode btree: max depth * blocksize
232 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
233 */
85STATIC uint 234STATIC uint
86xfs_calc_symlink_reservation(xfs_mount_t *mp) 235xfs_calc_symlink_reservation(
236 struct xfs_mount *mp)
87{ 237{
88 return XFS_CALC_SYMLINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 238 return XFS_DQUOT_LOGRES(mp) +
239 MAX((mp->m_sb.sb_inodesize +
240 mp->m_sb.sb_inodesize +
241 XFS_FSB_TO_B(mp, 1) +
242 XFS_DIROP_LOG_RES(mp) +
243 1024 +
244 128 * (4 + XFS_DIROP_LOG_COUNT(mp))),
245 (2 * mp->m_sb.sb_sectsize +
246 XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
247 XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
248 XFS_ALLOCFREE_LOG_RES(mp, 1) +
249 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
250 XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
89} 251}
90 252
253/*
254 * For create we can modify:
255 * the parent directory inode: inode size
256 * the new inode: inode size
257 * the inode btree entry: block size
258 * the superblock for the nlink flag: sector size
259 * the directory btree: (max depth + v2) * dir block size
260 * the directory inode's bmap btree: (max depth + v2) * block size
261 * Or in the first xact we allocate some inodes giving:
262 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
263 * the superblock for the nlink flag: sector size
264 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
265 * the inode btree: max depth * blocksize
266 * the allocation btrees: 2 trees * (max depth - 1) * block size
267 */
91STATIC uint 268STATIC uint
92xfs_calc_create_reservation(xfs_mount_t *mp) 269xfs_calc_create_reservation(
270 struct xfs_mount *mp)
93{ 271{
94 return XFS_CALC_CREATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 272 return XFS_DQUOT_LOGRES(mp) +
273 MAX((mp->m_sb.sb_inodesize +
274 mp->m_sb.sb_inodesize +
275 mp->m_sb.sb_sectsize +
276 XFS_FSB_TO_B(mp, 1) +
277 XFS_DIROP_LOG_RES(mp) +
278 128 * (3 + XFS_DIROP_LOG_COUNT(mp))),
279 (3 * mp->m_sb.sb_sectsize +
280 XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
281 XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
282 XFS_ALLOCFREE_LOG_RES(mp, 1) +
283 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
284 XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
95} 285}
96 286
287/*
288 * Making a new directory is the same as creating a new file.
289 */
97STATIC uint 290STATIC uint
98xfs_calc_mkdir_reservation(xfs_mount_t *mp) 291xfs_calc_mkdir_reservation(
292 struct xfs_mount *mp)
99{ 293{
100 return XFS_CALC_MKDIR_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 294 return xfs_calc_create_reservation(mp);
101} 295}
102 296
297/*
298 * In freeing an inode we can modify:
299 * the inode being freed: inode size
300 * the super block free inode counter: sector size
301 * the agi hash list and counters: sector size
302 * the inode btree entry: block size
303 * the on disk inode before ours in the agi hash list: inode cluster size
304 * the inode btree: max depth * blocksize
305 * the allocation btrees: 2 trees * (max depth - 1) * block size
306 */
103STATIC uint 307STATIC uint
104xfs_calc_ifree_reservation(xfs_mount_t *mp) 308xfs_calc_ifree_reservation(
309 struct xfs_mount *mp)
105{ 310{
106 return XFS_CALC_IFREE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 311 return XFS_DQUOT_LOGRES(mp) +
312 mp->m_sb.sb_inodesize +
313 mp->m_sb.sb_sectsize +
314 mp->m_sb.sb_sectsize +
315 XFS_FSB_TO_B(mp, 1) +
316 MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
317 XFS_INODE_CLUSTER_SIZE(mp)) +
318 128 * 5 +
319 XFS_ALLOCFREE_LOG_RES(mp, 1) +
320 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
321 XFS_ALLOCFREE_LOG_COUNT(mp, 1));
107} 322}
108 323
324/*
 325 * When only changing the inode we log the inode and possibly the superblock.
 326 * We also add a bit of slop for the transaction stuff.
327 */
109STATIC uint 328STATIC uint
110xfs_calc_ichange_reservation(xfs_mount_t *mp) 329xfs_calc_ichange_reservation(
330 struct xfs_mount *mp)
111{ 331{
112 return XFS_CALC_ICHANGE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 332 return XFS_DQUOT_LOGRES(mp) +
333 mp->m_sb.sb_inodesize +
334 mp->m_sb.sb_sectsize +
335 512;
336
113} 337}
114 338
339/*
340 * Growing the data section of the filesystem.
341 * superblock
342 * agi and agf
343 * allocation btrees
344 */
115STATIC uint 345STATIC uint
116xfs_calc_growdata_reservation(xfs_mount_t *mp) 346xfs_calc_growdata_reservation(
347 struct xfs_mount *mp)
117{ 348{
118 return XFS_CALC_GROWDATA_LOG_RES(mp); 349 return mp->m_sb.sb_sectsize * 3 +
350 XFS_ALLOCFREE_LOG_RES(mp, 1) +
351 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1));
119} 352}
120 353
354/*
355 * Growing the rt section of the filesystem.
356 * In the first set of transactions (ALLOC) we allocate space to the
357 * bitmap or summary files.
358 * superblock: sector size
359 * agf of the ag from which the extent is allocated: sector size
360 * bmap btree for bitmap/summary inode: max depth * blocksize
361 * bitmap/summary inode: inode size
362 * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
363 */
121STATIC uint 364STATIC uint
122xfs_calc_growrtalloc_reservation(xfs_mount_t *mp) 365xfs_calc_growrtalloc_reservation(
366 struct xfs_mount *mp)
123{ 367{
124 return XFS_CALC_GROWRTALLOC_LOG_RES(mp); 368 return 2 * mp->m_sb.sb_sectsize +
369 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
370 mp->m_sb.sb_inodesize +
371 XFS_ALLOCFREE_LOG_RES(mp, 1) +
372 128 * (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
373 XFS_ALLOCFREE_LOG_COUNT(mp, 1));
125} 374}
126 375
376/*
377 * Growing the rt section of the filesystem.
378 * In the second set of transactions (ZERO) we zero the new metadata blocks.
379 * one bitmap/summary block: blocksize
380 */
127STATIC uint 381STATIC uint
128xfs_calc_growrtzero_reservation(xfs_mount_t *mp) 382xfs_calc_growrtzero_reservation(
383 struct xfs_mount *mp)
129{ 384{
130 return XFS_CALC_GROWRTZERO_LOG_RES(mp); 385 return mp->m_sb.sb_blocksize + 128;
131} 386}
132 387
388/*
389 * Growing the rt section of the filesystem.
390 * In the third set of transactions (FREE) we update metadata without
391 * allocating any new blocks.
392 * superblock: sector size
393 * bitmap inode: inode size
394 * summary inode: inode size
395 * one bitmap block: blocksize
396 * summary blocks: new summary size
397 */
133STATIC uint 398STATIC uint
134xfs_calc_growrtfree_reservation(xfs_mount_t *mp) 399xfs_calc_growrtfree_reservation(
400 struct xfs_mount *mp)
135{ 401{
136 return XFS_CALC_GROWRTFREE_LOG_RES(mp); 402 return mp->m_sb.sb_sectsize +
403 2 * mp->m_sb.sb_inodesize +
404 mp->m_sb.sb_blocksize +
405 mp->m_rsumsize +
406 128 * 5;
137} 407}
138 408
409/*
410 * Logging the inode modification timestamp on a synchronous write.
411 * inode
412 */
139STATIC uint 413STATIC uint
140xfs_calc_swrite_reservation(xfs_mount_t *mp) 414xfs_calc_swrite_reservation(
415 struct xfs_mount *mp)
141{ 416{
142 return XFS_CALC_SWRITE_LOG_RES(mp); 417 return mp->m_sb.sb_inodesize + 128;
143} 418}
144 419
420/*
421 * Logging the inode mode bits when writing a setuid/setgid file
422 * inode
423 */
145STATIC uint 424STATIC uint
146xfs_calc_writeid_reservation(xfs_mount_t *mp) 425xfs_calc_writeid_reservation(xfs_mount_t *mp)
147{ 426{
148 return XFS_CALC_WRITEID_LOG_RES(mp); 427 return mp->m_sb.sb_inodesize + 128;
149} 428}
150 429
430/*
431 * Converting the inode from non-attributed to attributed.
432 * the inode being converted: inode size
433 * agf block and superblock (for block allocation)
434 * the new block (directory sized)
435 * bmap blocks for the new directory block
436 * allocation btrees
437 */
151STATIC uint 438STATIC uint
152xfs_calc_addafork_reservation(xfs_mount_t *mp) 439xfs_calc_addafork_reservation(
440 struct xfs_mount *mp)
153{ 441{
154 return XFS_CALC_ADDAFORK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 442 return XFS_DQUOT_LOGRES(mp) +
443 mp->m_sb.sb_inodesize +
444 mp->m_sb.sb_sectsize * 2 +
445 mp->m_dirblksize +
446 XFS_FSB_TO_B(mp, XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) +
447 XFS_ALLOCFREE_LOG_RES(mp, 1) +
448 128 * (4 + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1 +
449 XFS_ALLOCFREE_LOG_COUNT(mp, 1));
155} 450}
156 451
452/*
453 * Removing the attribute fork of a file
454 * the inode being truncated: inode size
455 * the inode's bmap btree: max depth * block size
456 * And the bmap_finish transaction can free the blocks and bmap blocks:
457 * the agf for each of the ags: 4 * sector size
458 * the agfl for each of the ags: 4 * sector size
459 * the super block to reflect the freed blocks: sector size
460 * worst case split in allocation btrees per extent assuming 4 extents:
461 * 4 exts * 2 trees * (2 * max depth - 1) * block size
462 */
157STATIC uint 463STATIC uint
158xfs_calc_attrinval_reservation(xfs_mount_t *mp) 464xfs_calc_attrinval_reservation(
465 struct xfs_mount *mp)
159{ 466{
160 return XFS_CALC_ATTRINVAL_LOG_RES(mp); 467 return MAX((mp->m_sb.sb_inodesize +
468 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
469 128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))),
470 (4 * mp->m_sb.sb_sectsize +
471 4 * mp->m_sb.sb_sectsize +
472 mp->m_sb.sb_sectsize +
473 XFS_ALLOCFREE_LOG_RES(mp, 4) +
474 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))));
161} 475}
162 476
477/*
478 * Setting an attribute.
479 * the inode getting the attribute
480 * the superblock for allocations
481 * the agfs extents are allocated from
482 * the attribute btree * max depth
483 * the inode allocation btree
484 * Since attribute transaction space is dependent on the size of the attribute,
485 * the calculation is done partially at mount time and partially at runtime.
486 */
163STATIC uint 487STATIC uint
164xfs_calc_attrset_reservation(xfs_mount_t *mp) 488xfs_calc_attrset_reservation(
489 struct xfs_mount *mp)
165{ 490{
166 return XFS_CALC_ATTRSET_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 491 return XFS_DQUOT_LOGRES(mp) +
492 mp->m_sb.sb_inodesize +
493 mp->m_sb.sb_sectsize +
494 XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
495 128 * (2 + XFS_DA_NODE_MAXDEPTH);
167} 496}
168 497
498/*
499 * Removing an attribute.
500 * the inode: inode size
501 * the attribute btree could join: max depth * block size
502 * the inode bmap btree could join or split: max depth * block size
503 * And the bmap_finish transaction can free the attr blocks freed giving:
504 * the agf for the ag in which the blocks live: 2 * sector size
505 * the agfl for the ag in which the blocks live: 2 * sector size
506 * the superblock for the free block count: sector size
507 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
508 */
169STATIC uint 509STATIC uint
170xfs_calc_attrrm_reservation(xfs_mount_t *mp) 510xfs_calc_attrrm_reservation(
511 struct xfs_mount *mp)
171{ 512{
172 return XFS_CALC_ATTRRM_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 513 return XFS_DQUOT_LOGRES(mp) +
514 MAX((mp->m_sb.sb_inodesize +
515 XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
516 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
517 128 * (1 + XFS_DA_NODE_MAXDEPTH +
518 XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
519 (2 * mp->m_sb.sb_sectsize +
520 2 * mp->m_sb.sb_sectsize +
521 mp->m_sb.sb_sectsize +
522 XFS_ALLOCFREE_LOG_RES(mp, 2) +
523 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
173} 524}
174 525
526/*
527 * Clearing a bad agino number in an agi hash bucket.
528 */
175STATIC uint 529STATIC uint
176xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp) 530xfs_calc_clear_agi_bucket_reservation(
531 struct xfs_mount *mp)
177{ 532{
178 return XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp); 533 return mp->m_sb.sb_sectsize + 128;
179} 534}
180 535
181/* 536/*
@@ -184,11 +539,10 @@ xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp)
184 */ 539 */
185void 540void
186xfs_trans_init( 541xfs_trans_init(
187 xfs_mount_t *mp) 542 struct xfs_mount *mp)
188{ 543{
189 xfs_trans_reservations_t *resp; 544 struct xfs_trans_reservations *resp = &mp->m_reservations;
190 545
191 resp = &(mp->m_reservations);
192 resp->tr_write = xfs_calc_write_reservation(mp); 546 resp->tr_write = xfs_calc_write_reservation(mp);
193 resp->tr_itruncate = xfs_calc_itruncate_reservation(mp); 547 resp->tr_itruncate = xfs_calc_itruncate_reservation(mp);
194 resp->tr_rename = xfs_calc_rename_reservation(mp); 548 resp->tr_rename = xfs_calc_rename_reservation(mp);
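
Editor's note: each helper above transliterates the macro it replaces, so the reserved sizes are unchanged; only the readability improves. As a concrete illustration, the growdata reservation works out as follows for an assumed geometry (sector size, block size, and allocation btree depth are made-up example values, and the two ALLOCFREE macros follow the per-extent comment in xfs_trans.h):

#include <stdio.h>

#define MAXDEPTH 5	/* assumed allocation btree depth */
/* 2 trees * (2 blocks/level * max depth - 1) blocks per extent */
#define ALLOCFREE_LOG_COUNT(nx)       ((nx) * 2 * (2 * MAXDEPTH - 1))
#define ALLOCFREE_LOG_RES(bsize, nx)  (ALLOCFREE_LOG_COUNT(nx) * (bsize))

int main(void)
{
	unsigned int sectsize = 512, blocksize = 4096;

	/* superblock + agi + agf, the btree blocks, plus 128 bytes of
	 * overhead for each logged item */
	unsigned int res = sectsize * 3 +
			   ALLOCFREE_LOG_RES(blocksize, 1) +
			   128 * (3 + ALLOCFREE_LOG_COUNT(1));

	printf("growdata log reservation: %u bytes\n", res);	/* 77952 */
	return 0;
}
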
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 8c69e7824f68..e639e8e9a2a9 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -300,24 +300,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
300 300
301 301
302/* 302/*
303 * Various log reservation values.
304 * These are based on the size of the file system block
305 * because that is what most transactions manipulate.
306 * Each adds in an additional 128 bytes per item logged to
307 * try to account for the overhead of the transaction mechanism.
308 *
309 * Note:
310 * Most of the reservations underestimate the number of allocation
311 * groups into which they could free extents in the xfs_bmap_finish()
312 * call. This is because the number in the worst case is quite high
313 * and quite unusual. In order to fix this we need to change
314 * xfs_bmap_finish() to free extents in only a single AG at a time.
315 * This will require changes to the EFI code as well, however, so that
316 * the EFI for the extents not freed is logged again in each transaction.
317 * See bug 261917.
318 */
319
320/*
321 * Per-extent log reservation for the allocation btree changes 303 * Per-extent log reservation for the allocation btree changes
322 * involved in freeing or allocating an extent. 304 * involved in freeing or allocating an extent.
323 * 2 trees * (2 blocks/level * max depth - 1) * block size 305 * 2 trees * (2 blocks/level * max depth - 1) * block size
@@ -341,429 +323,36 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
341 (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \ 323 (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
342 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1) 324 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
343 325
344/*
345 * In a write transaction we can allocate a maximum of 2
346 * extents. This gives:
347 * the inode getting the new extents: inode size
348 * the inode's bmap btree: max depth * block size
349 * the agfs of the ags from which the extents are allocated: 2 * sector
350 * the superblock free block counter: sector size
351 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
352 * And the bmap_finish transaction can free bmap blocks in a join:
353 * the agfs of the ags containing the blocks: 2 * sector size
354 * the agfls of the ags containing the blocks: 2 * sector size
355 * the super block free block counter: sector size
356 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
357 */
358#define XFS_CALC_WRITE_LOG_RES(mp) \
359 (MAX( \
360 ((mp)->m_sb.sb_inodesize + \
361 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
362 (2 * (mp)->m_sb.sb_sectsize) + \
363 (mp)->m_sb.sb_sectsize + \
364 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
365 (128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))),\
366 ((2 * (mp)->m_sb.sb_sectsize) + \
367 (2 * (mp)->m_sb.sb_sectsize) + \
368 (mp)->m_sb.sb_sectsize + \
369 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
370 (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
371 326
372#define XFS_WRITE_LOG_RES(mp) ((mp)->m_reservations.tr_write) 327#define XFS_WRITE_LOG_RES(mp) ((mp)->m_reservations.tr_write)
373
374/*
375 * In truncating a file we free up to two extents at once. We can modify:
376 * the inode being truncated: inode size
377 * the inode's bmap btree: (max depth + 1) * block size
378 * And the bmap_finish transaction can free the blocks and bmap blocks:
379 * the agf for each of the ags: 4 * sector size
380 * the agfl for each of the ags: 4 * sector size
381 * the super block to reflect the freed blocks: sector size
382 * worst case split in allocation btrees per extent assuming 4 extents:
383 * 4 exts * 2 trees * (2 * max depth - 1) * block size
384 * the inode btree: max depth * blocksize
385 * the allocation btrees: 2 trees * (max depth - 1) * block size
386 */
387#define XFS_CALC_ITRUNCATE_LOG_RES(mp) \
388 (MAX( \
389 ((mp)->m_sb.sb_inodesize + \
390 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + \
391 (128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
392 ((4 * (mp)->m_sb.sb_sectsize) + \
393 (4 * (mp)->m_sb.sb_sectsize) + \
394 (mp)->m_sb.sb_sectsize + \
395 XFS_ALLOCFREE_LOG_RES(mp, 4) + \
396 (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \
397 (128 * 5) + \
398 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
399 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
400 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
401
402#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate) 328#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate)
403
404/*
405 * In renaming a files we can modify:
406 * the four inodes involved: 4 * inode size
407 * the two directory btrees: 2 * (max depth + v2) * dir block size
408 * the two directory bmap btrees: 2 * max depth * block size
409 * And the bmap_finish transaction can free dir and bmap blocks (two sets
410 * of bmap blocks) giving:
411 * the agf for the ags in which the blocks live: 3 * sector size
412 * the agfl for the ags in which the blocks live: 3 * sector size
413 * the superblock for the free block count: sector size
414 * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
415 */
416#define XFS_CALC_RENAME_LOG_RES(mp) \
417 (MAX( \
418 ((4 * (mp)->m_sb.sb_inodesize) + \
419 (2 * XFS_DIROP_LOG_RES(mp)) + \
420 (128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp)))), \
421 ((3 * (mp)->m_sb.sb_sectsize) + \
422 (3 * (mp)->m_sb.sb_sectsize) + \
423 (mp)->m_sb.sb_sectsize + \
424 XFS_ALLOCFREE_LOG_RES(mp, 3) + \
425 (128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))))))
426
427#define XFS_RENAME_LOG_RES(mp) ((mp)->m_reservations.tr_rename) 329#define XFS_RENAME_LOG_RES(mp) ((mp)->m_reservations.tr_rename)
428
429/*
430 * For creating a link to an inode:
431 * the parent directory inode: inode size
432 * the linked inode: inode size
433 * the directory btree could split: (max depth + v2) * dir block size
434 * the directory bmap btree could join or split: (max depth + v2) * blocksize
435 * And the bmap_finish transaction can free some bmap blocks giving:
436 * the agf for the ag in which the blocks live: sector size
437 * the agfl for the ag in which the blocks live: sector size
438 * the superblock for the free block count: sector size
439 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
440 */
441#define XFS_CALC_LINK_LOG_RES(mp) \
442 (MAX( \
443 ((mp)->m_sb.sb_inodesize + \
444 (mp)->m_sb.sb_inodesize + \
445 XFS_DIROP_LOG_RES(mp) + \
446 (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
447 ((mp)->m_sb.sb_sectsize + \
448 (mp)->m_sb.sb_sectsize + \
449 (mp)->m_sb.sb_sectsize + \
450 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
451 (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
452
453#define XFS_LINK_LOG_RES(mp) ((mp)->m_reservations.tr_link) 330#define XFS_LINK_LOG_RES(mp) ((mp)->m_reservations.tr_link)
454
455/*
456 * For removing a directory entry we can modify:
457 * the parent directory inode: inode size
458 * the removed inode: inode size
459 * the directory btree could join: (max depth + v2) * dir block size
460 * the directory bmap btree could join or split: (max depth + v2) * blocksize
461 * And the bmap_finish transaction can free the dir and bmap blocks giving:
462 * the agf for the ag in which the blocks live: 2 * sector size
463 * the agfl for the ag in which the blocks live: 2 * sector size
464 * the superblock for the free block count: sector size
465 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
466 */
467#define XFS_CALC_REMOVE_LOG_RES(mp) \
468 (MAX( \
469 ((mp)->m_sb.sb_inodesize + \
470 (mp)->m_sb.sb_inodesize + \
471 XFS_DIROP_LOG_RES(mp) + \
472 (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
473 ((2 * (mp)->m_sb.sb_sectsize) + \
474 (2 * (mp)->m_sb.sb_sectsize) + \
475 (mp)->m_sb.sb_sectsize + \
476 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
477 (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
478
479#define XFS_REMOVE_LOG_RES(mp) ((mp)->m_reservations.tr_remove) 331#define XFS_REMOVE_LOG_RES(mp) ((mp)->m_reservations.tr_remove)
480
481/*
482 * For symlink we can modify:
483 * the parent directory inode: inode size
484 * the new inode: inode size
485 * the inode btree entry: 1 block
486 * the directory btree: (max depth + v2) * dir block size
487 * the directory inode's bmap btree: (max depth + v2) * block size
488 * the blocks for the symlink: 1 kB
489 * Or in the first xact we allocate some inodes giving:
490 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
491 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
492 * the inode btree: max depth * blocksize
493 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
494 */
495#define XFS_CALC_SYMLINK_LOG_RES(mp) \
496 (MAX( \
497 ((mp)->m_sb.sb_inodesize + \
498 (mp)->m_sb.sb_inodesize + \
499 XFS_FSB_TO_B(mp, 1) + \
500 XFS_DIROP_LOG_RES(mp) + \
501 1024 + \
502 (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \
503 (2 * (mp)->m_sb.sb_sectsize + \
504 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
505 XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
506 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
507 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
508 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
509
510#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink) 332#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink)
511
512/*
513 * For create we can modify:
514 * the parent directory inode: inode size
515 * the new inode: inode size
516 * the inode btree entry: block size
517 * the superblock for the nlink flag: sector size
518 * the directory btree: (max depth + v2) * dir block size
519 * the directory inode's bmap btree: (max depth + v2) * block size
520 * Or in the first xact we allocate some inodes giving:
521 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
522 * the superblock for the nlink flag: sector size
523 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
524 * the inode btree: max depth * blocksize
525 * the allocation btrees: 2 trees * (max depth - 1) * block size
526 */
527#define XFS_CALC_CREATE_LOG_RES(mp) \
528 (MAX( \
529 ((mp)->m_sb.sb_inodesize + \
530 (mp)->m_sb.sb_inodesize + \
531 (mp)->m_sb.sb_sectsize + \
532 XFS_FSB_TO_B(mp, 1) + \
533 XFS_DIROP_LOG_RES(mp) + \
534 (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \
535 (3 * (mp)->m_sb.sb_sectsize + \
536 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
537 XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
538 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
539 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
540 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
541
542#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create) 333#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create)
543
544/*
545 * Making a new directory is the same as creating a new file.
546 */
547#define XFS_CALC_MKDIR_LOG_RES(mp) XFS_CALC_CREATE_LOG_RES(mp)
548
549#define XFS_MKDIR_LOG_RES(mp) ((mp)->m_reservations.tr_mkdir) 334#define XFS_MKDIR_LOG_RES(mp) ((mp)->m_reservations.tr_mkdir)
550
551/*
552 * In freeing an inode we can modify:
553 * the inode being freed: inode size
554 * the super block free inode counter: sector size
555 * the agi hash list and counters: sector size
556 * the inode btree entry: block size
557 * the on disk inode before ours in the agi hash list: inode cluster size
558 * the inode btree: max depth * blocksize
559 * the allocation btrees: 2 trees * (max depth - 1) * block size
560 */
561#define XFS_CALC_IFREE_LOG_RES(mp) \
562 ((mp)->m_sb.sb_inodesize + \
563 (mp)->m_sb.sb_sectsize + \
564 (mp)->m_sb.sb_sectsize + \
565 XFS_FSB_TO_B((mp), 1) + \
566 MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \
567 (128 * 5) + \
568 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
569 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
570 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
571
572
573#define XFS_IFREE_LOG_RES(mp) ((mp)->m_reservations.tr_ifree) 335#define XFS_IFREE_LOG_RES(mp) ((mp)->m_reservations.tr_ifree)
574
575/*
576 * When only changing the inode we log the inode and possibly the superblock
577 * We also add a bit of slop for the transaction stuff.
578 */
579#define XFS_CALC_ICHANGE_LOG_RES(mp) ((mp)->m_sb.sb_inodesize + \
580 (mp)->m_sb.sb_sectsize + 512)
581
582#define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange) 336#define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange)
583
584/*
585 * Growing the data section of the filesystem.
586 * superblock
587 * agi and agf
588 * allocation btrees
589 */
590#define XFS_CALC_GROWDATA_LOG_RES(mp) \
591 ((mp)->m_sb.sb_sectsize * 3 + \
592 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
593 (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
594
595#define XFS_GROWDATA_LOG_RES(mp) ((mp)->m_reservations.tr_growdata) 337#define XFS_GROWDATA_LOG_RES(mp) ((mp)->m_reservations.tr_growdata)
596
597/*
598 * Growing the rt section of the filesystem.
599 * In the first set of transactions (ALLOC) we allocate space to the
600 * bitmap or summary files.
601 * superblock: sector size
602 * agf of the ag from which the extent is allocated: sector size
603 * bmap btree for bitmap/summary inode: max depth * blocksize
604 * bitmap/summary inode: inode size
605 * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
606 */
607#define XFS_CALC_GROWRTALLOC_LOG_RES(mp) \
608 (2 * (mp)->m_sb.sb_sectsize + \
609 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
610 (mp)->m_sb.sb_inodesize + \
611 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
612 (128 * \
613 (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + \
614 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
615
616#define XFS_GROWRTALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_growrtalloc) 338#define XFS_GROWRTALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_growrtalloc)
617
618/*
619 * Growing the rt section of the filesystem.
620 * In the second set of transactions (ZERO) we zero the new metadata blocks.
621 * one bitmap/summary block: blocksize
622 */
623#define XFS_CALC_GROWRTZERO_LOG_RES(mp) \
624 ((mp)->m_sb.sb_blocksize + 128)
625
626#define XFS_GROWRTZERO_LOG_RES(mp) ((mp)->m_reservations.tr_growrtzero) 339#define XFS_GROWRTZERO_LOG_RES(mp) ((mp)->m_reservations.tr_growrtzero)
627
628/*
629 * Growing the rt section of the filesystem.
630 * In the third set of transactions (FREE) we update metadata without
631 * allocating any new blocks.
632 * superblock: sector size
633 * bitmap inode: inode size
634 * summary inode: inode size
635 * one bitmap block: blocksize
636 * summary blocks: new summary size
637 */
638#define XFS_CALC_GROWRTFREE_LOG_RES(mp) \
639 ((mp)->m_sb.sb_sectsize + \
640 2 * (mp)->m_sb.sb_inodesize + \
641 (mp)->m_sb.sb_blocksize + \
642 (mp)->m_rsumsize + \
643 (128 * 5))
644
645#define XFS_GROWRTFREE_LOG_RES(mp) ((mp)->m_reservations.tr_growrtfree) 340#define XFS_GROWRTFREE_LOG_RES(mp) ((mp)->m_reservations.tr_growrtfree)
646
647/*
648 * Logging the inode modification timestamp on a synchronous write.
649 * inode
650 */
651#define XFS_CALC_SWRITE_LOG_RES(mp) \
652 ((mp)->m_sb.sb_inodesize + 128)
653
654#define XFS_SWRITE_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) 341#define XFS_SWRITE_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
655
656/* 342/*
657 * Logging the inode timestamps on an fsync -- same as SWRITE 343 * Logging the inode timestamps on an fsync -- same as SWRITE
658 * as long as SWRITE logs the entire inode core 344 * as long as SWRITE logs the entire inode core
659 */ 345 */
660#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) 346#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
661
662/*
663 * Logging the inode mode bits when writing a setuid/setgid file
664 * inode
665 */
666#define XFS_CALC_WRITEID_LOG_RES(mp) \
667 ((mp)->m_sb.sb_inodesize + 128)
668
669#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) 347#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
670
671/*
672 * Converting the inode from non-attributed to attributed.
673 * the inode being converted: inode size
674 * agf block and superblock (for block allocation)
675 * the new block (directory sized)
676 * bmap blocks for the new directory block
677 * allocation btrees
678 */
679#define XFS_CALC_ADDAFORK_LOG_RES(mp) \
680 ((mp)->m_sb.sb_inodesize + \
681 (mp)->m_sb.sb_sectsize * 2 + \
682 (mp)->m_dirblksize + \
683 XFS_FSB_TO_B(mp, (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1)) + \
684 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
685 (128 * (4 + (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + \
686 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
687
688#define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork) 348#define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork)
689
690/*
691 * Removing the attribute fork of a file
692 * the inode being truncated: inode size
693 * the inode's bmap btree: max depth * block size
694 * And the bmap_finish transaction can free the blocks and bmap blocks:
695 * the agf for each of the ags: 4 * sector size
696 * the agfl for each of the ags: 4 * sector size
697 * the super block to reflect the freed blocks: sector size
698 * worst case split in allocation btrees per extent assuming 4 extents:
699 * 4 exts * 2 trees * (2 * max depth - 1) * block size
700 */
701#define XFS_CALC_ATTRINVAL_LOG_RES(mp) \
702 (MAX( \
703 ((mp)->m_sb.sb_inodesize + \
704 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
705 (128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))), \
706 ((4 * (mp)->m_sb.sb_sectsize) + \
707 (4 * (mp)->m_sb.sb_sectsize) + \
708 (mp)->m_sb.sb_sectsize + \
709 XFS_ALLOCFREE_LOG_RES(mp, 4) + \
710 (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))))))
711
712#define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval) 349#define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval)
713
714/*
715 * Setting an attribute.
716 * the inode getting the attribute
717 * the superblock for allocations
718 * the agfs extents are allocated from
719 * the attribute btree * max depth
720 * the inode allocation btree
721 * Since attribute transaction space is dependent on the size of the attribute,
722 * the calculation is done partially at mount time and partially at runtime.
723 */
724#define XFS_CALC_ATTRSET_LOG_RES(mp) \
725 ((mp)->m_sb.sb_inodesize + \
726 (mp)->m_sb.sb_sectsize + \
727 XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
728 (128 * (2 + XFS_DA_NODE_MAXDEPTH)))
729
730#define XFS_ATTRSET_LOG_RES(mp, ext) \ 350#define XFS_ATTRSET_LOG_RES(mp, ext) \
731 ((mp)->m_reservations.tr_attrset + \ 351 ((mp)->m_reservations.tr_attrset + \
732 (ext * (mp)->m_sb.sb_sectsize) + \ 352 (ext * (mp)->m_sb.sb_sectsize) + \
733 (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \ 353 (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \
734 (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))))) 354 (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))))
735
736/*
737 * Removing an attribute.
738 * the inode: inode size
739 * the attribute btree could join: max depth * block size
740 * the inode bmap btree could join or split: max depth * block size
741 * And the bmap_finish transaction can free the attr blocks freed giving:
742 * the agf for the ag in which the blocks live: 2 * sector size
743 * the agfl for the ag in which the blocks live: 2 * sector size
744 * the superblock for the free block count: sector size
745 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
746 */
747#define XFS_CALC_ATTRRM_LOG_RES(mp) \
748 (MAX( \
749 ((mp)->m_sb.sb_inodesize + \
750 XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
751 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
752 (128 * (1 + XFS_DA_NODE_MAXDEPTH + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
753 ((2 * (mp)->m_sb.sb_sectsize) + \
754 (2 * (mp)->m_sb.sb_sectsize) + \
755 (mp)->m_sb.sb_sectsize + \
756 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
757 (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
758
759#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm) 355#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm)
760
761/*
762 * Clearing a bad agino number in an agi hash bucket.
763 */
764#define XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp) \
765 ((mp)->m_sb.sb_sectsize + 128)
766
767#define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi) 356#define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi)
768 357
769 358
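
Editor's note: XFS_ATTRSET_LOG_RES() is the one reservation that keeps a runtime component after this patch: the mount-time tr_attrset value is topped up per extent of the attribute value, exactly as the surviving macro above spells out. A stand-alone sketch of how the total scales with the extent count; every number here is an assumed example, not a value computed from a real superblock:

#include <stdio.h>

int main(void)
{
	unsigned int tr_attrset = 22000;	/* assumed mount-time part */
	unsigned int sectsize = 512, blocksize = 4096;
	unsigned int attr_maxlevels = 5;	/* assumed XFS_BM_MAXLEVELS(ATTR) */

	for (unsigned int ext = 1; ext <= 4; ext++) {
		unsigned int res = tr_attrset +
				   ext * sectsize +
				   ext * attr_maxlevels * blocksize +
				   128 * (ext + ext * attr_maxlevels);
		printf("ext=%u -> reservation %u bytes\n", ext, res);
	}
	return 0;
}
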
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 9d376be0ea38..a06bd62504fc 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -267,7 +267,7 @@ xfs_setattr(
267 if (code) { 267 if (code) {
268 ASSERT(tp == NULL); 268 ASSERT(tp == NULL);
269 lock_flags &= ~XFS_ILOCK_EXCL; 269 lock_flags &= ~XFS_ILOCK_EXCL;
270 ASSERT(lock_flags == XFS_IOLOCK_EXCL); 270 ASSERT(lock_flags == XFS_IOLOCK_EXCL || !need_iolock);
271 goto error_return; 271 goto error_return;
272 } 272 }
273 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); 273 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);