36 files changed, 715 insertions, 415 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 18f74ec4dce9..9d03d1ebca6f 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1001,44 +1001,6 @@ done:
 }
 /**
- * v9fs_vfs_readlink - read a symlink's location
- * @dentry: dentry for symlink
- * @buffer: buffer to load symlink location into
- * @buflen: length of buffer
- *
- */
-static int v9fs_vfs_readlink(struct dentry *dentry, char __user * buffer,
-                             int buflen)
-{
-        int retval;
-        int ret;
-        char *link = __getname();
-        if (unlikely(!link))
-                return -ENOMEM;
-        if (buflen > PATH_MAX)
-                buflen = PATH_MAX;
-        P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
-                                                                        dentry);
-        retval = v9fs_readlink(dentry, link, buflen);
-        if (retval > 0) {
-                if ((ret = copy_to_user(buffer, link, retval)) != 0) {
-                        P9_DPRINTK(P9_DEBUG_ERROR,
-                                        "problem copying to user: %d\n", ret);
-                        retval = ret;
-                }
-        }
-        __putname(link);
-        return retval;
-}
-/**
 * v9fs_vfs_follow_link - follow a symlink path
 * @dentry: dentry for symlink
 * @nd: nameidata
@@ -1230,7 +1192,6 @@ static const struct inode_operations v9fs_dir_inode_operations_ext = {
        .rmdir = v9fs_vfs_rmdir,
        .mknod = v9fs_vfs_mknod,
        .rename = v9fs_vfs_rename,
-        .readlink = v9fs_vfs_readlink,
        .getattr = v9fs_vfs_getattr,
        .setattr = v9fs_vfs_setattr,
 };
@@ -1253,7 +1214,7 @@ static const struct inode_operations v9fs_file_inode_operations = {
 };
 static const struct inode_operations v9fs_symlink_inode_operations = {
-        .readlink = v9fs_vfs_readlink,
+        .readlink = generic_readlink,
        .follow_link = v9fs_vfs_follow_link,
        .put_link = v9fs_vfs_put_link,
        .getattr = v9fs_vfs_getattr,
diff --git a/fs/bio.c b/fs/bio.c
index 76e6713abf94..12429c9553eb 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -78,7 +78,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
        i = 0;
        while (i < bio_slab_nr) {
-                struct bio_slab *bslab = &bio_slabs[i];
+                bslab = &bio_slabs[i];
                if (!bslab->slab && entry == -1)
                        entry = i;
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 2e9e69987a82..54f4798ab46a 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -112,12 +112,14 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
        switch (type) {
        case ACL_TYPE_ACCESS:
                mode = inode->i_mode;
-                ret = posix_acl_equiv_mode(acl, &mode);
-                if (ret < 0)
-                        return ret;
-                ret = 0;
-                inode->i_mode = mode;
                name = POSIX_ACL_XATTR_ACCESS;
+                if (acl) {
+                        ret = posix_acl_equiv_mode(acl, &mode);
+                        if (ret < 0)
+                                return ret;
+                        inode->i_mode = mode;
+                }
+                ret = 0;
                break;
        case ACL_TYPE_DEFAULT:
                if (!S_ISDIR(inode->i_mode))
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 56e50137d0e6..432a2da4641e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -83,6 +83,17 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
        return (cache->flags & bits) == bits;
 }
+void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
+{
+        atomic_inc(&cache->count); /* take one reference on @cache */
+}
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
+{
+        if (atomic_dec_and_test(&cache->count)) /* count reached zero */
+                kfree(cache);
+}
 /*
 * this adds the block group to the fs_info rb tree for the block group
 * cache
@@ -156,7 +167,7 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
                }
        }
        if (ret)
-                atomic_inc(&ret->count);
+                btrfs_get_block_group(ret);
        spin_unlock(&info->block_group_cache_lock);
        return ret;
@@ -407,6 +418,8 @@ err:
        put_caching_control(caching_ctl);
        atomic_dec(&block_group->space_info->caching_threads);
+        btrfs_put_block_group(block_group);
        return 0;
 }
@@ -447,6 +460,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache)
        up_write(&fs_info->extent_commit_sem);
        atomic_inc(&cache->space_info->caching_threads);
+        btrfs_get_block_group(cache);
        tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
                          cache->key.objectid);
@@ -486,12 +500,6 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
        return cache;
 }
-void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
-{
-        if (atomic_dec_and_test(&cache->count))
-                kfree(cache);
-}
 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
                                                  u64 flags)
 {
@@ -2582,7 +2590,7 @@ next_block_group(struct btrfs_root *root,
        if (node) {
                cache = rb_entry(node, struct btrfs_block_group_cache,
                                 cache_node);
-                atomic_inc(&cache->count);
+                btrfs_get_block_group(cache);
        } else
                cache = NULL;
        spin_unlock(&root->fs_info->block_group_cache_lock);
@@ -4227,7 +4235,7 @@ search:
                u64 offset;
                int cached;
-                atomic_inc(&block_group->count);
+                btrfs_get_block_group(block_group);
                search_start = block_group->key.objectid;
 have_block_group:
@@ -4315,7 +4323,7 @@ have_block_group:
                                btrfs_put_block_group(block_group);
                                block_group = last_ptr->block_group;
-                                atomic_inc(&block_group->count);
+                                btrfs_get_block_group(block_group);
                                spin_unlock(&last_ptr->lock);
                                spin_unlock(&last_ptr->refill_lock);
@@ -7395,9 +7403,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
                        wait_block_group_cache_done(block_group);
                btrfs_remove_free_space_cache(block_group);
+                btrfs_put_block_group(block_group);
-                WARN_ON(atomic_read(&block_group->count) != 1);
-                kfree(block_group);
                spin_lock(&info->block_group_cache_lock);
        }
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index feaa13b105d9..c02033596f02 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -506,7 +506,8 @@ next_slot:
 }
 static int extent_mergeable(struct extent_buffer *leaf, int slot,
-                            u64 objectid, u64 bytenr, u64 *start, u64 *end)
+                            u64 objectid, u64 bytenr, u64 orig_offset,
+                            u64 *start, u64 *end)
 {
        struct btrfs_file_extent_item *fi;
        struct btrfs_key key;
@@ -522,6 +523,7 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot,
        fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
        if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
            btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
+            btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
            btrfs_file_extent_compression(leaf, fi) ||
            btrfs_file_extent_encryption(leaf, fi) ||
            btrfs_file_extent_other_encoding(leaf, fi))
@@ -561,6 +563,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
        u64 split;
        int del_nr = 0;
        int del_slot = 0;
+        int recow;
        int ret;
        btrfs_drop_extent_cache(inode, start, end - 1, 0);
@@ -568,6 +571,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
        path = btrfs_alloc_path();
        BUG_ON(!path);
 again:
+        recow = 0;
        split = start;
        key.objectid = inode->i_ino;
        key.type = BTRFS_EXTENT_DATA_KEY;
@@ -591,12 +595,60 @@ again:
        bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
        num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
        orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
+        memcpy(&new_key, &key, sizeof(new_key));
+        if (start == key.offset && end < extent_end) {
+                other_start = 0;
+                other_end = start;
+                if (extent_mergeable(leaf, path->slots[0] - 1,
+                                     inode->i_ino, bytenr, orig_offset,
+                                     &other_start, &other_end)) {
+                        new_key.offset = end;
+                        btrfs_set_item_key_safe(trans, root, path, &new_key);
+                        fi = btrfs_item_ptr(leaf, path->slots[0],
+                                            struct btrfs_file_extent_item);
+                        btrfs_set_file_extent_num_bytes(leaf, fi,
+                                                        extent_end - end);
+                        btrfs_set_file_extent_offset(leaf, fi,
+                                                     end - orig_offset);
+                        fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
+                                            struct btrfs_file_extent_item);
+                        btrfs_set_file_extent_num_bytes(leaf, fi,
+                                                        end - other_start);
+                        btrfs_mark_buffer_dirty(leaf);
+                        goto out;
+                }
+        }
+        if (start > key.offset && end == extent_end) {
+                other_start = end;
+                other_end = 0;
+                if (extent_mergeable(leaf, path->slots[0] + 1,
+                                     inode->i_ino, bytenr, orig_offset,
+                                     &other_start, &other_end)) {
+                        fi = btrfs_item_ptr(leaf, path->slots[0],
+                                            struct btrfs_file_extent_item);
+                        btrfs_set_file_extent_num_bytes(leaf, fi,
+                                                        start - key.offset);
+                        path->slots[0]++;
+                        new_key.offset = start;
+                        btrfs_set_item_key_safe(trans, root, path, &new_key);
+                        fi = btrfs_item_ptr(leaf, path->slots[0],
+                                            struct btrfs_file_extent_item);
+                        btrfs_set_file_extent_num_bytes(leaf, fi,
+                                                        other_end - start);
+                        btrfs_set_file_extent_offset(leaf, fi,
+                                                     start - orig_offset);
+                        btrfs_mark_buffer_dirty(leaf);
+                        goto out;
+                }
+        }
        while (start > key.offset || end < extent_end) {
                if (key.offset == start)
                        split = end;
-                memcpy(&new_key, &key, sizeof(new_key));
                new_key.offset = split;
                ret = btrfs_duplicate_item(trans, root, path, &new_key);
                if (ret == -EAGAIN) {
@@ -631,15 +683,18 @@ again:
                        path->slots[0]--;
                        extent_end = end;
                }
+                recow = 1;
        }
-        fi = btrfs_item_ptr(leaf, path->slots[0],
-                            struct btrfs_file_extent_item);
        other_start = end;
        other_end = 0;
-        if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
+        if (extent_mergeable(leaf, path->slots[0] + 1,
-                             bytenr, &other_start, &other_end)) {
+                             inode->i_ino, bytenr, orig_offset,
+                             &other_start, &other_end)) {
+                if (recow) {
+                        btrfs_release_path(root, path);
+                        goto again;
+                }
                extent_end = other_end;
                del_slot = path->slots[0] + 1;
                del_nr++;
@@ -650,8 +705,13 @@ again:
        }
        other_start = 0;
        other_end = start;
-        if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
+        if (extent_mergeable(leaf, path->slots[0] - 1,
-                             bytenr, &other_start, &other_end)) {
+                             inode->i_ino, bytenr, orig_offset,
+                             &other_start, &other_end)) {
+                if (recow) {
+                        btrfs_release_path(root, path);
+                        goto again;
+                }
                key.offset = other_start;
                del_slot = path->slots[0];
                del_nr++;
@@ -660,22 +720,22 @@ again:
                                        inode->i_ino, orig_offset);
                BUG_ON(ret);
        }
+        fi = btrfs_item_ptr(leaf, path->slots[0],
+                           struct btrfs_file_extent_item);
        if (del_nr == 0) {
                btrfs_set_file_extent_type(leaf, fi,
                                           BTRFS_FILE_EXTENT_REG);
                btrfs_mark_buffer_dirty(leaf);
-                goto out;
+        } else {
-        }
+                btrfs_set_file_extent_type(leaf, fi,
+                                           BTRFS_FILE_EXTENT_REG);
-        fi = btrfs_item_ptr(leaf, del_slot - 1,
+                btrfs_set_file_extent_num_bytes(leaf, fi,
-                            struct btrfs_file_extent_item);
+                                                extent_end - key.offset);
-        btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
+                btrfs_mark_buffer_dirty(leaf);
-        btrfs_set_file_extent_num_bytes(leaf, fi,
-                                        extent_end - key.offset);
-        btrfs_mark_buffer_dirty(leaf);
-        ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+                ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
-        BUG_ON(ret);
+                BUG_ON(ret);
+        }
 out:
        btrfs_free_path(path);
        return 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5440bab23635..b330e27c2d8b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3796,6 +3796,12 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
        if (location.type == BTRFS_INODE_ITEM_KEY) {
                inode = btrfs_iget(dir->i_sb, &location, root);
+                /*
+                 * Run the pending orphan cleanup for this root unless the
+                 * filesystem is read-only.  The read-only flag is read from
+                 * dir->i_sb, the superblock handed to btrfs_iget() above,
+                 * instead of from inode: the result of btrfs_iget() has not
+                 * been checked here, so inode must not be dereferenced.
+                 * NOTE(review): presumably btrfs_iget() can return an error
+                 * pointer - confirm against its definition.
+                 */
+                if (unlikely(root->clean_orphans) &&
+                    !(dir->i_sb->s_flags & MS_RDONLY)) {
+                        down_read(&root->fs_info->cleanup_work_sem);
+                        btrfs_orphan_cleanup(root);
+                        up_read(&root->fs_info->cleanup_work_sem);
+                }
                return inode;
        }
@@ -3995,7 +4001,11 @@ skip:
        /* Reached end of directory/root. Bump pos past the last item. */
        if (key_type == BTRFS_DIR_INDEX_KEY)
-                filp->f_pos = INT_LIMIT(off_t);
+                /*
+                 * 32-bit glibc will use getdents64, but then strtol -
+                 * so the last number we can serve is this.
+                 */
+                filp->f_pos = 0x7fffffff;
        else
                filp->f_pos++;
 nopos:
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b10a49d4bc6a..5c2a9e78a949 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -626,6 +626,8 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
        if (ordered)
                offset = entry_end(ordered);
+        else
+                offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
        mutex_lock(&tree->mutex);
        disk_i_size = BTRFS_I(inode)->disk_i_size;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index a9728680eca8..ed3e4a2ec2c8 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3281,8 +3281,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
                return -ENOMEM;
        path = btrfs_alloc_path();
-        if (!path)
+        if (!path) {
+                kfree(cluster);
                return -ENOMEM;
+        }
        rc->extents_found = 0;
        rc->extents_skipped = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 198cff28766d..220dad5db017 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2649,8 +2649,10 @@ again:
        em = lookup_extent_mapping(em_tree, logical, *length);
        read_unlock(&em_tree->lock);
-        if (!em && unplug_page)
+        if (!em && unplug_page) {
+                kfree(multi);
                return 0;
+        }
        if (!em) {
                printk(KERN_CRIT "unable to find logical %llu len %llu\n",
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index fea9e898c4ba..b44ce0a0711c 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -269,7 +269,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
        int err;
        mntget(newmnt);
-        err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags, mntlist);
+        err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags | MNT_SHRINKABLE, mntlist);
        switch (err) {
        case 0:
                path_put(&nd->path);
@@ -371,7 +371,6 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
        if (IS_ERR(mnt))
                goto out_err;
-        nd->path.mnt->mnt_flags |= MNT_SHRINKABLE;
        rc = add_mount_helper(mnt, nd, &cifs_dfs_automount_list);
 out:
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 332dd00f0894..c5c45de1a2ee 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1005,6 +1005,9 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND)
 COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST)
 COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI)
 #endif
+/* Big V (don't complain on serial console) */
+IGNORE_IOCTL(VT_OPENQRY)
+IGNORE_IOCTL(VT_GETMODE)
 /* Little p (/dev/rtc, /dev/envctrl, etc.) */
 COMPATIBLE_IOCTL(RTC_AIE_ON)
 COMPATIBLE_IOCTL(RTC_AIE_OFF)
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index c8afa6b1d91d..32a5f46b1157 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -121,8 +121,10 @@ static int get_target(const char *symname, struct path *path,
                                ret = -ENOENT;
                                path_put(path);
                        }
-                } else
+                } else {
                        ret = -EPERM;
+                        path_put(path);
+                }
        }
        return ret;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index fbb6e5eed697..7cb0a59f4b9d 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1748,7 +1748,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
                            char *cipher_name, size_t *key_size)
 {
        char dummy_key[ECRYPTFS_MAX_KEY_BYTES];
-        char *full_alg_name;
+        char *full_alg_name = NULL;
        int rc;
        *key_tfm = NULL;
@@ -1763,7 +1763,6 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
        if (rc)
                goto out;
        *key_tfm = crypto_alloc_blkcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC);
-        kfree(full_alg_name);
        if (IS_ERR(*key_tfm)) {
                rc = PTR_ERR(*key_tfm);
                printk(KERN_ERR "Unable to allocate crypto cipher with name "
@@ -1786,6 +1785,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
                goto out;
        }
 out:
+        kfree(full_alg_name);
        return rc;
 }
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 9e944057001b..678172b61be2 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -158,7 +158,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
        struct dentry *ecryptfs_dentry = file->f_path.dentry;
        /* Private value of ecryptfs_dentry allocated in
         * ecryptfs_lookup() */
-        struct dentry *lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
+        struct dentry *lower_dentry;
        struct ecryptfs_file_info *file_info;
        mount_crypt_stat = &ecryptfs_superblock_to_private(
@@ -191,13 +191,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
                                      | ECRYPTFS_ENCRYPTED);
        }
        mutex_unlock(&crypt_stat->cs_mutex);
-        if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY)
-            && !(file->f_flags & O_RDONLY)) {
-                rc = -EPERM;
-                printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
-                       "file must hence be opened RO\n", __func__);
-                goto out;
-        }
        if (!ecryptfs_inode_to_private(inode)->lower_file) {
                rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
                if (rc) {
@@ -208,6 +201,13 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
                        goto out;
                }
        }
+        /*
+         * Refuse a writable open when the lower persistent file is read-only.
+         * O_RDONLY is 0 on Linux, so masking f_flags with it is always false;
+         * the access mode has to be extracted with O_ACCMODE and compared.
+         */
+        if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE)
+            == O_RDONLY && (file->f_flags & O_ACCMODE) != O_RDONLY) {
+                rc = -EPERM;
+                printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
+                       "file must hence be opened RO\n", __func__);
+                goto out;
+        }
        ecryptfs_set_file_lower(
                file, ecryptfs_inode_to_private(inode)->lower_file);
        if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) {
@@ -299,7 +299,6 @@ static int ecryptfs_ioctl(struct inode *inode, struct file *file,
 const struct file_operations ecryptfs_dir_fops = {
        .readdir = ecryptfs_readdir,
        .ioctl = ecryptfs_ioctl,
-        .mmap = generic_file_mmap,
        .open = ecryptfs_open,
        .flush = ecryptfs_flush,
        .release = ecryptfs_release,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 429ca0b3ba08..4a430ab4115c 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -282,7 +282,8 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
                goto out;
        }
        rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
-                                ecryptfs_dir_inode->i_sb, 1);
+                                ecryptfs_dir_inode->i_sb,
+                                ECRYPTFS_INTERPOSE_FLAG_D_ADD);
        if (rc) {
                printk(KERN_ERR "%s: Error interposing; rc = [%d]\n",
                       __func__, rc);
@@ -463,9 +464,6 @@ out_lock:
        unlock_dir(lower_dir_dentry);
        dput(lower_new_dentry);
        dput(lower_old_dentry);
-        d_drop(lower_old_dentry);
-        d_drop(new_dentry);
-        d_drop(old_dentry);
        return rc;
 }
@@ -614,6 +612,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        struct dentry *lower_new_dentry;
        struct dentry *lower_old_dir_dentry;
        struct dentry *lower_new_dir_dentry;
+        struct dentry *trap = NULL;
        lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
        lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
@@ -621,7 +620,17 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        dget(lower_new_dentry);
        lower_old_dir_dentry = dget_parent(lower_old_dentry);
        lower_new_dir_dentry = dget_parent(lower_new_dentry);
-        lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+        trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+        /* source should not be ancestor of target */
+        if (trap == lower_old_dentry) {
+                rc = -EINVAL;
+                goto out_lock;
+        }
+        /* target should not be ancestor of source */
+        if (trap == lower_new_dentry) {
+                rc = -ENOTEMPTY;
+                goto out_lock;
+        }
        rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry,
                        lower_new_dir_dentry->d_inode, lower_new_dentry);
        if (rc)
@@ -715,31 +724,31 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
        /* Released in ecryptfs_put_link(); only release here on error */
        buf = kmalloc(len, GFP_KERNEL);
        if (!buf) {
-                rc = -ENOMEM;
+                buf = ERR_PTR(-ENOMEM);
                goto out;
        }
        old_fs = get_fs();
        set_fs(get_ds());
        rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len);
        set_fs(old_fs);
-        if (rc < 0)
+        if (rc < 0) {
-                goto out_free;
+                kfree(buf);
-        else
+                buf = ERR_PTR(rc);
+        } else
                buf[rc] = '\0';
-        rc = 0;
-        nd_set_link(nd, buf);
-        goto out;
-out_free:
-        kfree(buf);
 out:
-        return ERR_PTR(rc);
+        nd_set_link(nd, buf);
+        return NULL;
 }
 static void
 ecryptfs_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr)
 {
-        /* Free the char* */
+        char *buf = nd_get_link(nd);
-        kfree(nd_get_link(nd));
+        if (!IS_ERR(buf)) { /* follow_link stores an ERR_PTR on failure */
+                /* Free the link buffer set up by ecryptfs_follow_link() */
+                kfree(buf);
+        }
 }
 /**
@@ -772,18 +781,23 @@ upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat,
 }
 /**
- * ecryptfs_truncate
+ * truncate_upper
 * @dentry: The ecryptfs layer dentry
- * @new_length: The length to expand the file to
+ * @ia: Address of the ecryptfs inode's attributes
+ * @lower_ia: Address of the lower inode's attributes
 *
 * Function to handle truncations modifying the size of the file. Note
 * that the file sizes are interpolated. When expanding, we are simply
- * writing strings of 0's out. When truncating, we need to modify the
+ * writing strings of 0's out. When truncating, we truncate the upper
- * underlying file size according to the page index interpolations.
+ * inode and update the lower_ia according to the page index
+ * interpolations. If ATTR_SIZE is set in lower_ia->ia_valid upon return,
+ * the caller must use lower_ia in a call to notify_change() to perform
+ * the truncation of the lower inode.
 *
 * Returns zero on success; non-zero otherwise
 */
-int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
+static int truncate_upper(struct dentry *dentry, struct iattr *ia,
+                          struct iattr *lower_ia)
 {
        int rc = 0;
        struct inode *inode = dentry->d_inode;
@@ -794,8 +808,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
        loff_t lower_size_before_truncate;
        loff_t lower_size_after_truncate;
-        if (unlikely((new_length == i_size)))
+        if (unlikely((ia->ia_size == i_size))) {
+                lower_ia->ia_valid &= ~ATTR_SIZE;
                goto out;
+        }
        crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
        /* Set up a fake ecryptfs file, this is used to interface with
         * the file in the underlying filesystem so that the
@@ -815,28 +831,30 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
                &fake_ecryptfs_file,
                ecryptfs_inode_to_private(dentry->d_inode)->lower_file);
        /* Switch on growing or shrinking file */
-        if (new_length > i_size) {
+        if (ia->ia_size > i_size) {
                char zero[] = { 0x00 };
+                lower_ia->ia_valid &= ~ATTR_SIZE;
                /* Write a single 0 at the last position of the file;
                 * this triggers code that will fill in 0's throughout
                 * the intermediate portion of the previous end of the
                 * file and the new and of the file */
                rc = ecryptfs_write(&fake_ecryptfs_file, zero,
-                                    (new_length - 1), 1);
+                                    (ia->ia_size - 1), 1);
-        } else { /* new_length < i_size_read(inode) */
+        } else { /* ia->ia_size < i_size_read(inode) */
-                /* We're chopping off all the pages down do the page
+                /* We're chopping off all the pages down to the page
-                 * in which new_length is located. Fill in the end of
+                 * in which ia->ia_size is located. Fill in the end of
-                 * that page from (new_length & ~PAGE_CACHE_MASK) to
+                 * that page from (ia->ia_size & ~PAGE_CACHE_MASK) to
                 * PAGE_CACHE_SIZE with zeros. */
                size_t num_zeros = (PAGE_CACHE_SIZE
-                                    - (new_length & ~PAGE_CACHE_MASK));
+                                    - (ia->ia_size & ~PAGE_CACHE_MASK));
                if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
-                        rc = vmtruncate(inode, new_length);
+                        rc = vmtruncate(inode, ia->ia_size);
                        if (rc)
                                goto out_free;
-                        rc = vmtruncate(lower_dentry->d_inode, new_length);
+                        lower_ia->ia_size = ia->ia_size;
+                        lower_ia->ia_valid |= ATTR_SIZE;
                        goto out_free;
                }
                if (num_zeros) {
@@ -848,7 +866,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
                                goto out_free;
                        }
                        rc = ecryptfs_write(&fake_ecryptfs_file, zeros_virt,
-                                            new_length, num_zeros);
+                                            ia->ia_size, num_zeros);
                        kfree(zeros_virt);
                        if (rc) {
                                printk(KERN_ERR "Error attempting to zero out "
@@ -857,7 +875,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
                                goto out_free;
                        }
                }
-                vmtruncate(inode, new_length);
+                vmtruncate(inode, ia->ia_size);
                rc = ecryptfs_write_inode_size_to_metadata(inode);
                if (rc) {
                        printk(KERN_ERR "Problem with "
@@ -870,10 +888,12 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
                lower_size_before_truncate =
                    upper_size_to_lower_size(crypt_stat, i_size);
                lower_size_after_truncate =
-                    upper_size_to_lower_size(crypt_stat, new_length);
+                    upper_size_to_lower_size(crypt_stat, ia->ia_size);
-                if (lower_size_after_truncate < lower_size_before_truncate)
+                if (lower_size_after_truncate < lower_size_before_truncate) {
-                        vmtruncate(lower_dentry->d_inode,
+                        lower_ia->ia_size = lower_size_after_truncate;
-                                   lower_size_after_truncate);
+                        lower_ia->ia_valid |= ATTR_SIZE;
+                } else
+                        lower_ia->ia_valid &= ~ATTR_SIZE;
        }
 out_free:
        if (ecryptfs_file_to_private(&fake_ecryptfs_file))
@@ -883,6 +903,33 @@ out:
        return rc;
 }
+/**
+ * ecryptfs_truncate
+ * @dentry: The ecryptfs layer dentry
+ * @new_length: The new length of the eCryptfs (upper) file
+ *
+ * Resizes the eCryptfs inode through truncate_upper() and, when that call
+ * leaves ATTR_SIZE set in the lower attributes, applies the computed size
+ * to the corresponding lower inode with notify_change().
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
+{
+        struct iattr ia = { .ia_valid = ATTR_SIZE, .ia_size = new_length };
+        struct iattr lower_ia = { .ia_valid = 0 };
+        int rc;
+        /* truncate_upper() reports via lower_ia.ia_valid whether the lower file must be resized */
+        rc = truncate_upper(dentry, &ia, &lower_ia);
+        if (!rc && lower_ia.ia_valid & ATTR_SIZE) {
+                struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+                /* notify_change() is called with the lower inode's i_mutex held */
+                mutex_lock(&lower_dentry->d_inode->i_mutex);
+                rc = notify_change(lower_dentry, &lower_ia);
+                mutex_unlock(&lower_dentry->d_inode->i_mutex);
+        }
+        return rc;
+}
 static int
 ecryptfs_permission(struct inode *inode, int mask)
 {
@@ -905,6 +952,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
 {
        int rc = 0;
        struct dentry *lower_dentry;
+        struct iattr lower_ia;
        struct inode *inode;
        struct inode *lower_inode;
        struct ecryptfs_crypt_stat *crypt_stat;
@@ -943,15 +991,11 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
                }
        }
        mutex_unlock(&crypt_stat->cs_mutex);
+        memcpy(&lower_ia, ia, sizeof(lower_ia));
+        if (ia->ia_valid & ATTR_FILE)
+                lower_ia.ia_file = ecryptfs_file_to_lower(ia->ia_file);
        if (ia->ia_valid & ATTR_SIZE) {
-                ecryptfs_printk(KERN_DEBUG,
+                rc = truncate_upper(dentry, ia, &lower_ia);
-                                "ia->ia_valid = [0x%x] ATTR_SIZE" " = [0x%x]\n",
-                                ia->ia_valid, ATTR_SIZE);
-                rc = ecryptfs_truncate(dentry, ia->ia_size);
-                /* ecryptfs_truncate handles resizing of the lower file */
-                ia->ia_valid &= ~ATTR_SIZE;
-                ecryptfs_printk(KERN_DEBUG, "ia->ia_valid = [%x]\n",
-                                ia->ia_valid);
                if (rc < 0)
                        goto out;
        }
@@ -960,17 +1004,32 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
         * mode change is for clearing setuid/setgid bits. Allow lower fs
         * to interpret this in its own way.
         */
-        if (ia->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
+        if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
-                ia->ia_valid &= ~ATTR_MODE;
+                lower_ia.ia_valid &= ~ATTR_MODE;
        mutex_lock(&lower_dentry->d_inode->i_mutex);
-        rc = notify_change(lower_dentry, ia);
+        rc = notify_change(lower_dentry, &lower_ia);
        mutex_unlock(&lower_dentry->d_inode->i_mutex);
 out:
        fsstack_copy_attr_all(inode, lower_inode);
        return rc;
 }
+/**
+ * ecryptfs_getattr
+ * @mnt: The eCryptfs vfsmount (not referenced in the body)
+ * @dentry: The ecryptfs layer dentry
+ * @stat: Filled in with the attributes to report
+ *
+ * Queries the lower file with vfs_getattr(). On success, @stat is filled
+ * from the eCryptfs inode by generic_fillattr() and its block count is
+ * then replaced with the value reported for the lower file.
+ *
+ * Returns zero on success; otherwise the error from vfs_getattr()
+ */
+int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+                     struct kstat *stat)
+{
+        struct kstat lower_stat;
+        int rc;
+        rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry),
+                         ecryptfs_dentry_to_lower(dentry), &lower_stat);
+        if (!rc) {
+                generic_fillattr(dentry->d_inode, stat);
+                /* only the block count is taken from the lower file */
+                stat->blocks = lower_stat.blocks;
+        }
+        return rc;
+}
 int
 ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
                  size_t size, int flags)
@@ -1100,6 +1159,7 @@ const struct inode_operations ecryptfs_dir_iops = {
 const struct inode_operations ecryptfs_main_iops = {
        .permission = ecryptfs_permission,
        .setattr = ecryptfs_setattr,
+        .getattr = ecryptfs_getattr,
        .setxattr = ecryptfs_setxattr,
        .getxattr = ecryptfs_getxattr,
        .listxattr = ecryptfs_listxattr,
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 567bc4b9f70a..ea2f92101dfe 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -585,8 +585,8 @@ out:
 *                        with as much information as it can before needing
 *                        the lower filesystem.
 * ecryptfs_read_super(): this accesses the lower filesystem and uses
- *                        ecryptfs_interpolate to perform most of the linking
+ *                        ecryptfs_interpose to perform most of the linking
- * ecryptfs_interpolate(): links the lower filesystem into ecryptfs
+ * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
 */
 static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
                        const char *dev_name, void *raw_data,
diff --git a/fs/eventfd.c b/fs/eventfd.c
index d26402ff06ea..7758cc382ef0 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -135,26 +135,71 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait)
        return events;
 }
-static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
-                            loff_t *ppos)
+/*
+ * Take the readable value out of ctx->count and store it in *cnt: one unit
+ * in EFD_SEMAPHORE mode, the whole counter otherwise.  A zero counter yields
+ * zero in both modes, so the subtraction never takes ctx->count below zero.
+ * Both callers in this file invoke it with ctx->wqh.lock held.
+ */
+static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
+{
+        /* semaphore mode hands out a unit only when one is available */
+        *cnt = ((ctx->flags & EFD_SEMAPHORE) && ctx->count) ? 1 : ctx->count;
+        ctx->count -= *cnt;
+}
+/**
+ * eventfd_ctx_remove_wait_queue - Reads the current counter and removes a wait queue entry.
+ * @ctx: [in] Pointer to eventfd context.
+ * @wait: [in] Wait queue entry to be removed.
+ * @cnt: [out] Pointer to the 64-bit counter value.
+ *
+ * Returns zero if successful, or the following error codes:
+ *
+ * -EAGAIN      : Nothing was read (the value stored in @cnt is zero).
+ *
+ * This is used to atomically remove a wait queue entry from the eventfd wait
+ * queue head, and read/reset the counter value. The entry is removed even
+ * when -EAGAIN is returned.
+ */
+int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
+                                  __u64 *cnt)
+{
+        unsigned long flags;
+        /* read and removal happen under one hold of wqh.lock */
+        spin_lock_irqsave(&ctx->wqh.lock, flags);
+        eventfd_ctx_do_read(ctx, cnt);
+        __remove_wait_queue(&ctx->wqh, wait);
+        /* the counter was lowered: notify remaining waiters with POLLOUT */
+        if (*cnt != 0 && waitqueue_active(&ctx->wqh))
+                wake_up_locked_poll(&ctx->wqh, POLLOUT);
+        spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+        return *cnt != 0 ? 0 : -EAGAIN;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
+/**
+ * eventfd_ctx_read - Reads the eventfd counter or waits if it is zero.
+ * @ctx: [in] Pointer to eventfd context.
+ * @no_wait: [in] Different from zero if the operation should not block.
+ * @cnt: [out] Pointer to the 64-bit counter value.
+ *
+ * Returns zero if successful, or the following error codes:
+ *
+ * -EAGAIN      : The operation would have blocked but @no_wait was nonzero.
+ * -ERESTARTSYS : A signal interrupted the wait operation.
+ *
+ * If @no_wait is zero, the function might sleep until the eventfd internal
+ * counter becomes greater than zero.
+ */
+ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt)
 {
-        struct eventfd_ctx *ctx = file->private_data;
        ssize_t res;
-        __u64 ucnt = 0;
        DECLARE_WAITQUEUE(wait, current);
-        if (count < sizeof(ucnt))
-                return -EINVAL;
        spin_lock_irq(&ctx->wqh.lock);
+        *cnt = 0;
        res = -EAGAIN;
        if (ctx->count > 0)
-                res = sizeof(ucnt);
+                res = 0;
-        else if (!(file->f_flags & O_NONBLOCK)) {
+        else if (!no_wait) {
                __add_wait_queue(&ctx->wqh, &wait);
-                for (res = 0;;) {
+                for (;;) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (ctx->count > 0) {
-                                res = sizeof(ucnt);
+                                res = 0;
                                break;
                        }
                        if (signal_pending(current)) {
@@ -168,18 +213,32 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
                __remove_wait_queue(&ctx->wqh, &wait);
                __set_current_state(TASK_RUNNING);
        }
-        if (likely(res > 0)) {
+        if (likely(res == 0)) {
-                ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
+                eventfd_ctx_do_read(ctx, cnt);
-                ctx->count -= ucnt;
                if (waitqueue_active(&ctx->wqh))
                        wake_up_locked_poll(&ctx->wqh, POLLOUT);
        }
        spin_unlock_irq(&ctx->wqh.lock);
-        if (res > 0 && put_user(ucnt, (__u64 __user *) buf))
-                return -EFAULT;
        return res;
 }
+EXPORT_SYMBOL_GPL(eventfd_ctx_read);
+/*
+ * Reads the eventfd counter of @file into the user buffer @buf.
+ *
+ * Buffers smaller than a __u64 are rejected with -EINVAL. The value is
+ * fetched through eventfd_ctx_read(), without blocking when @file has
+ * O_NONBLOCK set. Returns sizeof(__u64) on success, -EFAULT if the copy
+ * to user space fails, or the error from eventfd_ctx_read().
+ */
+static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
+                            loff_t *ppos)
+{
+        struct eventfd_ctx *ctx = file->private_data;
+        ssize_t res;
+        __u64 cnt;
+        if (count < sizeof(cnt))
+                return -EINVAL;
+        res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt);
+        if (res < 0)
+                return res;
+        return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt);
+}
 static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
                             loff_t *ppos)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index af7b62699ea9..874d169a193e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -361,14 +361,11 @@ struct ext4_new_group_data {
           so set the magic i_delalloc_reserve_flag after taking the 
           inode allocation semaphore for */
 #define EXT4_GET_BLOCKS_DELALLOC_RESERVE        0x0004
-        /* Call ext4_da_update_reserve_space() after successfully 
-           allocating the blocks */
-#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE    0x0008
        /* caller is from the direct IO path, request to creation of an
        unitialized extents if not allocated, split the uninitialized
        extent if blocks has been preallocated already*/
-#define EXT4_GET_BLOCKS_DIO                     0x0010
+#define EXT4_GET_BLOCKS_DIO                     0x0008
-#define EXT4_GET_BLOCKS_CONVERT                 0x0020
+#define EXT4_GET_BLOCKS_CONVERT                 0x0010
 #define EXT4_GET_BLOCKS_DIO_CREATE_EXT          (EXT4_GET_BLOCKS_DIO|\
                                         EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
        /* Convert extent to initialized after direct IO complete */
@@ -1443,6 +1440,8 @@ extern int ext4_block_truncate_page(handle_t *handle,
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern int flush_aio_dio_completed_IO(struct inode *inode);
+extern void ext4_da_update_reserve_space(struct inode *inode,
+                                        int used, int quota_claim);
 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
 extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7d7b74e94687..765a4826b118 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3132,7 +3132,19 @@ out:
                unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
                                        newblock + max_blocks,
                                        allocated - max_blocks);
+                allocated = max_blocks;
        }
+        /*
+         * If we have done fallocate with the offset that is already
+         * delayed allocated, we would have block reservation
+         * and quota reservation done in the delayed write path.
+         * But fallocate would have already updated quota and block
+         * count for this offset. So cancel these reservations.
+         */
+        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+                ext4_da_update_reserve_space(inode, allocated, 0);
 map_out:
        set_buffer_mapped(bh_result);
 out1:
@@ -3368,9 +3380,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        /* previous routine could use block we allocated */
        newblock = ext_pblock(&newex);
        allocated = ext4_ext_get_actual_len(&newex);
+        if (allocated > max_blocks)
+                allocated = max_blocks;
        set_buffer_new(bh_result);
        /*
+         * Update reserved blocks/metadata blocks after successful
+         * block allocation which had been deferred till now.
+         */
+        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+                ext4_da_update_reserve_space(inode, allocated, 1);
+        /*
         * Cache the extent and update transaction to commit on fdatasync only
         * when it is _not_ an uninitialized extent.
         */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c818972c8302..e11952404e02 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1053,11 +1053,12 @@ static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
 * Called with i_data_sem down, which is important since we can call
 * ext4_discard_preallocations() from here.
 */
-static void ext4_da_update_reserve_space(struct inode *inode, int used)
+void ext4_da_update_reserve_space(struct inode *inode,
+                                        int used, int quota_claim)
 {
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_inode_info *ei = EXT4_I(inode);
-        int mdb_free = 0;
+        int mdb_free = 0, allocated_meta_blocks = 0;
        spin_lock(&ei->i_block_reservation_lock);
        if (unlikely(used > ei->i_reserved_data_blocks)) {
@@ -1073,6 +1074,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
        ei->i_reserved_data_blocks -= used;
        used += ei->i_allocated_meta_blocks;
        ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
+        allocated_meta_blocks = ei->i_allocated_meta_blocks;
        ei->i_allocated_meta_blocks = 0;
        percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
@@ -1090,9 +1092,23 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
        /* Update quota subsystem */
-        vfs_dq_claim_block(inode, used);
+        if (quota_claim) {
-        if (mdb_free)
+                vfs_dq_claim_block(inode, used);
-                vfs_dq_release_reservation_block(inode, mdb_free);
+                if (mdb_free)
+                        vfs_dq_release_reservation_block(inode, mdb_free);
+        } else {
+                /*
+                 * We did fallocate with an offset that is already delayed
+                 * allocated. So on delayed allocated writeback we should
+                 * not update the quota for allocated blocks. But then
+                 * converting an fallocate region to initialized region would
+                 * have caused a metadata allocation. So claim quota for
+                 * that
+                 */
+                if (allocated_meta_blocks)
+                        vfs_dq_claim_block(inode, allocated_meta_blocks);
+                vfs_dq_release_reservation_block(inode, mdb_free + used);
+        }
        /*
         * If we have done all the pending block allocations and if
@@ -1292,18 +1308,20 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
                         */
                        EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
                }
-        }
+                /*
+                 * Update reserved blocks/metadata blocks after successful
+                 * block allocation which had been deferred till now. We don't
+                 * support fallocate for non extent files. So we can update
+                 * reserve space here.
+                 */
+                if ((retval > 0) &&
+                        (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
+                        ext4_da_update_reserve_space(inode, retval, 1);
+        }
        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                EXT4_I(inode)->i_delalloc_reserved_flag = 0;
-        /*
-         * Update reserved blocks/metadata blocks after successful
-         * block allocation which had been deferred till now.
-         */
-        if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
-                ext4_da_update_reserve_space(inode, retval);
        up_write((&EXT4_I(inode)->i_data_sem));
        if (retval > 0 && buffer_mapped(bh)) {
                int ret = check_block_validity(inode, "file system "
@@ -1835,24 +1853,12 @@ repeat:
         * later. Real quota accounting is done at pages writeout
         * time.
         */
-        if (vfs_dq_reserve_block(inode, md_needed + 1)) {
+        if (vfs_dq_reserve_block(inode, md_needed + 1))
-                /* 
-                 * We tend to badly over-estimate the amount of
-                 * metadata blocks which are needed, so if we have
-                 * reserved any metadata blocks, try to force out the
-                 * inode and see if we have any better luck.
-                 */
-                if (md_reserved && retries++ <= 3)
-                        goto retry;
                return -EDQUOT;
-        }
        if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
                vfs_dq_release_reservation_block(inode, md_needed + 1);
                if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
-                retry:
-                        if (md_reserved)
-                                write_inode_now(inode, (retries == 3));
                        yield();
                        goto repeat;
                }
@@ -2213,10 +2219,10 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
         * variables are updated after the blocks have been allocated.
         */
        new.b_state = 0;
-        get_blocks_flags = (EXT4_GET_BLOCKS_CREATE |
+        get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
-                            EXT4_GET_BLOCKS_DELALLOC_RESERVE);
        if (mpd->b_state & (1 << BH_Delay))
-                get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE;
+                get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
        blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
                               &new, get_blocks_flags);
        if (blks < 0) {
@@ -3032,7 +3038,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
                               loff_t pos, unsigned len, unsigned flags,
                               struct page **pagep, void **fsdata)
 {
-        int ret, retries = 0;
+        int ret, retries = 0, quota_retries = 0;
        struct page *page;
        pgoff_t index;
        unsigned from, to;
@@ -3091,6 +3097,22 @@ retry:
        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
                goto retry;
+        if ((ret == -EDQUOT) &&
+            EXT4_I(inode)->i_reserved_meta_blocks &&
+            (quota_retries++ < 3)) {
+                /*
+                 * Since we often over-estimate the number of meta
+                 * data blocks required, we may sometimes get a
+                 * spurious out of quota error even though there would
+                 * be enough space once we write the data blocks and
+                 * find out how many meta data blocks were _really_
+                 * required.  So try forcing the inode write to see if
+                 * that helps.
+                 */
+                write_inode_now(inode, (quota_retries == 3));
+                goto retry;
+        }
 out:
        return ret;
 }
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 2cf93ec40a67..5ef953e6f908 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -199,7 +199,9 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
                     int force)
 {
-        write_lock_irq(&filp->f_owner.lock);
+        unsigned long flags;
+        write_lock_irqsave(&filp->f_owner.lock, flags);
        if (force || !filp->f_owner.pid) {
                put_pid(filp->f_owner.pid);
                filp->f_owner.pid = get_pid(pid);
@@ -211,7 +213,7 @@ static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
                        filp->f_owner.euid = cred->euid;
                }
        }
-        write_unlock_irq(&filp->f_owner.lock);
+        write_unlock_irqrestore(&filp->f_owner.lock, flags);
 }
 int __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
@@ -618,60 +620,90 @@ static DEFINE_RWLOCK(fasync_lock);
 static struct kmem_cache *fasync_cache __read_mostly;
 /*
- * fasync_helper() is used by almost all character device drivers
+ * Remove a fasync entry. If successfully removed, return
- * to set up the fasync queue. It returns negative on error, 0 if it did
+ * positive and clear the FASYNC flag. If no entry exists,
- * no changes and positive if it added/deleted the entry.
+ * do nothing and return 0.
+ *
+ * NOTE! It is very important that the FASYNC flag always
+ * match the state "is the filp on a fasync list".
+ *
+ * We always take 'filp->f_lock' first, since fasync_lock
+ * needs to be irq-safe.
 */
-int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp)
+static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
 {
        struct fasync_struct *fa, **fp;
-        struct fasync_struct *new = NULL;
        int result = 0;
-        if (on) {
+        spin_lock(&filp->f_lock);
-                new = kmem_cache_alloc(fasync_cache, GFP_KERNEL);
+        write_lock_irq(&fasync_lock);
-                if (!new)
+        for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
-                        return -ENOMEM;
+                if (fa->fa_file != filp)
+                        continue;
+                *fp = fa->fa_next;
+                kmem_cache_free(fasync_cache, fa);
+                filp->f_flags &= ~FASYNC;
+                result = 1;
+                break;
        }
+        write_unlock_irq(&fasync_lock);
+        spin_unlock(&filp->f_lock);
+        return result;
+}
+/*
+ * Add a fasync entry. Return negative on error, positive if
+ * added, and zero if did nothing but change an existing one.
+ *
+ * NOTE! It is very important that the FASYNC flag always
+ * match the state "is the filp on a fasync list".
+ */
+static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
+{
+        struct fasync_struct *new, *fa, **fp;
+        int result = 0;
+        new = kmem_cache_alloc(fasync_cache, GFP_KERNEL);
+        if (!new)
+                return -ENOMEM;
-        /*
-         * We need to take f_lock first since it's not an IRQ-safe
-         * lock.
-         */
        spin_lock(&filp->f_lock);
        write_lock_irq(&fasync_lock);
        for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
-                if (fa->fa_file == filp) {
+                if (fa->fa_file != filp)
-                        if(on) {
+                        continue;
-                                fa->fa_fd = fd;
+                fa->fa_fd = fd;
-                                kmem_cache_free(fasync_cache, new);
+                kmem_cache_free(fasync_cache, new);
-                        } else {
+                goto out;
-                                *fp = fa->fa_next;
-                                kmem_cache_free(fasync_cache, fa);
-                                result = 1;
-                        }
-                        goto out;
-                }
        }
-        if (on) {
+        new->magic = FASYNC_MAGIC;
-                new->magic = FASYNC_MAGIC;
+        new->fa_file = filp;
-                new->fa_file = filp;
+        new->fa_fd = fd;
-                new->fa_fd = fd;
+        new->fa_next = *fapp;
-                new->fa_next = *fapp;
+        *fapp = new;
-                *fapp = new;
+        result = 1;
-                result = 1;
+        filp->f_flags |= FASYNC;
-        }
 out:
-        if (on)
-                filp->f_flags |= FASYNC;
-        else
-                filp->f_flags &= ~FASYNC;
        write_unlock_irq(&fasync_lock);
        spin_unlock(&filp->f_lock);
        return result;
 }
+/*
+ * fasync_helper() is used by almost all character device drivers
+ * to set up the fasync queue, and for regular files by the file
+ * lease code. It returns negative on error, 0 if it made no changes
+ * and positive if it added/deleted the entry.
+ *
+ * A zero @on removes the entry for @filp from the list at @fapp; a
+ * non-zero @on adds one, or updates the fd of an existing entry.
+ */
+int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp)
+{
+        if (!on)
+                return fasync_remove_entry(filp, fapp);
+        return fasync_add_entry(fd, filp, fapp);
+}
 EXPORT_SYMBOL(fasync_helper);
 void __kill_fasync(struct fasync_struct *fa, int sig, int band)
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index a5089a6dd67a..7239efc690d8 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -646,22 +646,27 @@ static const struct super_operations hppfs_sbops = {
 static int hppfs_readlink(struct dentry *dentry, char __user *buffer,
                          int buflen)
 {
-        struct dentry *proc_dentry;
+        struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
-        proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
        return proc_dentry->d_inode->i_op->readlink(proc_dentry, buffer,
                                                    buflen);
 }
 static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
-        struct dentry *proc_dentry;
+        struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
-        proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
        return proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd);
 }
+/*
+ * Forwards put_link to the inode operations of the proc_dentry stored in
+ * the hppfs inode. Does nothing when that inode provides no put_link.
+ */
+static void hppfs_put_link(struct dentry *dentry, struct nameidata *nd,
+                           void *cookie)
+{
+        struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
+        /* put_link is optional, so check before calling through */
+        if (proc_dentry->d_inode->i_op->put_link)
+                proc_dentry->d_inode->i_op->put_link(proc_dentry, nd, cookie);
+}
 static const struct inode_operations hppfs_dir_iops = {
        .lookup         = hppfs_lookup,
 };
@@ -669,6 +674,7 @@ static const struct inode_operations hppfs_dir_iops = {
 static const struct inode_operations hppfs_link_iops = {
        .readlink       = hppfs_readlink,
        .follow_link    = hppfs_follow_link,
+        .put_link       = hppfs_put_link,
 };
 static struct inode *get_inode(struct super_block *sb, struct dentry *dentry)
diff --git a/fs/namei.c b/fs/namei.c
index b55440baf7ab..94a5e60779f9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -561,6 +561,7 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
                dget(dentry);
        }
        mntget(path->mnt);
+        nd->last_type = LAST_BIND;
        cookie = dentry->d_inode->i_op->follow_link(dentry, nd);
        error = PTR_ERR(cookie);
        if (!IS_ERR(cookie)) {
@@ -1603,11 +1604,12 @@ struct file *do_filp_open(int dfd, const char *pathname,
        struct file *filp;
        struct nameidata nd;
        int error;
-        struct path path, save;
+        struct path path;
        struct dentry *dir;
        int count = 0;
        int will_truncate;
        int flag = open_to_namei_flags(open_flag);
+        int force_reval = 0;
        /*
         * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
@@ -1619,7 +1621,7 @@ struct file *do_filp_open(int dfd, const char *pathname,
                open_flag |= O_DSYNC;
        if (!acc_mode)
-                acc_mode = MAY_OPEN | ACC_MODE(flag);
+                acc_mode = MAY_OPEN | ACC_MODE(open_flag);
        /* O_TRUNC implies we need access checks for write permissions */
        if (flag & O_TRUNC)
@@ -1659,9 +1661,12 @@ struct file *do_filp_open(int dfd, const char *pathname,
        /*
         * Create - we need to know the parent.
         */
+reval:
        error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
        if (error)
                return ERR_PTR(error);
+        if (force_reval)
+                nd.flags |= LOOKUP_REVAL;
        error = path_walk(pathname, &nd);
        if (error) {
                if (nd.root.mnt)
@@ -1853,17 +1858,7 @@ do_link:
        error = security_inode_follow_link(path.dentry, &nd);
        if (error)
                goto exit_dput;
-        save = nd.path;
-        path_get(&save);
        error = __do_follow_link(&path, &nd);
-        if (error == -ESTALE) {
-                /* nd.path had been dropped */
-                nd.path = save;
-                path_get(&nd.path);
-                nd.flags |= LOOKUP_REVAL;
-                error = __do_follow_link(&path, &nd);
-        }
-        path_put(&save);
        path_put(&path);
        if (error) {
                /* Does someone understand code flow here? Or it is only
@@ -1873,6 +1868,10 @@ do_link:
                release_open_intent(&nd);
                if (nd.root.mnt)
                        path_put(&nd.root);
+                if (error == -ESTALE && !force_reval) {
+                        force_reval = 1;
+                        goto reval;
+                }
                return ERR_PTR(error);
        }
        nd.flags &= ~LOOKUP_PARENT;
diff --git a/fs/namespace.c b/fs/namespace.c
index 7d70d63ceb29..c768f733c8d6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -965,10 +965,12 @@ EXPORT_SYMBOL(may_umount_tree);
 int may_umount(struct vfsmount *mnt)
 {
        int ret = 1;
+        down_read(&namespace_sem);
        spin_lock(&vfsmount_lock);
        if (propagate_mount_busy(mnt, 2))
                ret = 0;
        spin_unlock(&vfsmount_lock);
+        up_read(&namespace_sem);
        return ret;
 }
@@ -1352,12 +1354,12 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
        if (err)
                goto out_cleanup_ids;
+        spin_lock(&vfsmount_lock);
        if (IS_MNT_SHARED(dest_mnt)) {
                for (p = source_mnt; p; p = next_mnt(p, source_mnt))
                        set_mnt_shared(p);
        }
-        spin_lock(&vfsmount_lock);
        if (parent_path) {
                detach_mnt(source_mnt, parent_path);
                attach_mnt(source_mnt, path);
@@ -1534,8 +1536,12 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
                err = change_mount_flags(path->mnt, flags);
        else
                err = do_remount_sb(sb, flags, data, 0);
-        if (!err)
+        if (!err) {
+                spin_lock(&vfsmount_lock);
+                mnt_flags |= path->mnt->mnt_flags & MNT_PNODE_MASK;
                path->mnt->mnt_flags = mnt_flags;
+                spin_unlock(&vfsmount_lock);
+        }
        up_write(&sb->s_umount);
        if (!err) {
                security_sb_post_remount(path->mnt, flags, data);
@@ -1665,6 +1671,8 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
 {
        int err;
+        mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD);
        down_write(&namespace_sem);
        /* Something was mounted here while we slept */
        while (d_mountpoint(path->dentry) &&
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index c9ee67b442e1..1afb0a10229f 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -121,7 +121,7 @@ static int idr_callback(int id, void *p, void *data)
        if (warned)
                return 0;
-        warned = false;
+        warned = true;
        entry = p;
        ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 8271cf05c957..a94e8bd8eb1f 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -552,7 +552,7 @@ retry:
        spin_lock(&group->inotify_data.idr_lock);
        ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
-                                group->inotify_data.last_wd,
+                                group->inotify_data.last_wd+1,
                                &tmp_ientry->wd);
        spin_unlock(&group->inotify_data.idr_lock);
        if (ret) {
@@ -632,7 +632,7 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign
        spin_lock_init(&group->inotify_data.idr_lock);
        idr_init(&group->inotify_data.idr);
-        group->inotify_data.last_wd = 1;
+        group->inotify_data.last_wd = 0;
        group->inotify_data.user = user;
        group->inotify_data.fa = NULL;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 18d5cc62d8ed..e42bbd843ed1 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1419,7 +1419,6 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
                goto out;
        error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
-        nd->last_type = LAST_BIND;
 out:
        return ERR_PTR(error);
 }
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 2efc57173fd7..1739a4aba25f 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -123,30 +123,6 @@ add_error:
 /*****************************************************************************/
 /*
- * check that file shrinkage doesn't leave any VMAs dangling in midair
- */
-static int ramfs_nommu_check_mappings(struct inode *inode,
-                                      size_t newsize, size_t size)
-{
-        struct vm_area_struct *vma;
-        struct prio_tree_iter iter;
-        /* search for VMAs that fall within the dead zone */
-        vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
-                              newsize >> PAGE_SHIFT,
-                              (size + PAGE_SIZE - 1) >> PAGE_SHIFT
-                              ) {
-                /* found one - only interested if it's shared out of the page
-                 * cache */
-                if (vma->vm_flags & VM_SHARED)
-                        return -ETXTBSY; /* not quite true, but near enough */
-        }
-        return 0;
-}
-/*****************************************************************************/
-/*
 *
 */
 static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
@@ -164,7 +140,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
        /* check that a decrease in size doesn't cut off any shared mappings */
        if (newsize < size) {
-                ret = ramfs_nommu_check_mappings(inode, newsize, size);
+                ret = nommu_shrink_inode_mappings(inode, size, newsize);
                if (ret < 0)
                        return ret;
        }
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 09783cc444ac..77414db10dc2 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -954,16 +954,14 @@ xfs_fs_destroy_inode(
        ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
        /*
-         * If we have nothing to flush with this inode then complete the
+         * We always use background reclaim here because even if the
-         * teardown now, otherwise delay the flush operation.
+         * inode is clean, it still may be under IO and hence we have
+         * to take the flush lock. The background reclaim path handles
+         * this more efficiently than we can here, so simply let background
+         * reclaim tear down all inodes.
         */
-        if (!xfs_inode_clean(ip)) {
-                xfs_inode_set_reclaim_tag(ip);
-                return;
-        }
 out_reclaim:
-        xfs_ireclaim(ip);
+        xfs_inode_set_reclaim_tag(ip);
 }
 /*
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 6fed97a8cd3e..1f5e4bb5e970 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -65,7 +65,6 @@ xfs_inode_ag_lookup(
         * as the tree is sparse and a gang lookup walks to find
         * the number of objects requested.
         */
-        read_lock(&pag->pag_ici_lock);
        if (tag == XFS_ICI_NO_TAG) {
                nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
                                (void **)&ip, *first_index, 1);
@@ -74,7 +73,7 @@ xfs_inode_ag_lookup(
                                (void **)&ip, *first_index, 1, tag);
        }
        if (!nr_found)
-                goto unlock;
+                return NULL;
        /*
         * Update the index for the next lookup. Catch overflows
@@ -84,13 +83,8 @@ xfs_inode_ag_lookup(
         */
        *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
        if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
-                goto unlock;
+                return NULL;
        return ip;
-unlock:
-        read_unlock(&pag->pag_ici_lock);
-        return NULL;
 }
 STATIC int
@@ -100,7 +94,8 @@ xfs_inode_ag_walk(
        int                     (*execute)(struct xfs_inode *ip,
                                           struct xfs_perag *pag, int flags),
        int                     flags,
-        int                     tag)
+        int                     tag,
+        int                     exclusive)
 {
        struct xfs_perag        *pag = &mp->m_perag[ag];
        uint32_t                first_index;
@@ -114,10 +109,20 @@ restart:
                int             error = 0;
                xfs_inode_t     *ip;
+                if (exclusive)
+                        write_lock(&pag->pag_ici_lock);
+                else
+                        read_lock(&pag->pag_ici_lock);
                ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
-                if (!ip)
+                if (!ip) {
+                        if (exclusive)
+                                write_unlock(&pag->pag_ici_lock);
+                        else
+                                read_unlock(&pag->pag_ici_lock);
                        break;
+                }
+                /* execute releases pag->pag_ici_lock */
                error = execute(ip, pag, flags);
                if (error == EAGAIN) {
                        skipped++;
@@ -125,9 +130,8 @@ restart:
                }
                if (error)
                        last_error = error;
-                /*
-                 * bail out if the filesystem is corrupted.
+                /* bail out if the filesystem is corrupted.  */
-                 */
                if (error == EFSCORRUPTED)
                        break;
@@ -148,7 +152,8 @@ xfs_inode_ag_iterator(
        int                     (*execute)(struct xfs_inode *ip,
                                           struct xfs_perag *pag, int flags),
        int                     flags,
-        int                     tag)
+        int                     tag,
+        int                     exclusive)
 {
        int                     error = 0;
        int                     last_error = 0;
@@ -157,7 +162,8 @@ xfs_inode_ag_iterator(
        for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
                if (!mp->m_perag[ag].pag_ici_init)
                        continue;
-                error = xfs_inode_ag_walk(mp, ag, execute, flags, tag);
+                error = xfs_inode_ag_walk(mp, ag, execute, flags, tag,
+                                                exclusive);
                if (error) {
                        last_error = error;
                        if (error == EFSCORRUPTED)
@@ -174,30 +180,31 @@ xfs_sync_inode_valid(
        struct xfs_perag        *pag)
 {
        struct inode            *inode = VFS_I(ip);
+        int                     error = EFSCORRUPTED;
        /* nothing to sync during shutdown */
-        if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-                read_unlock(&pag->pag_ici_lock);
+                goto out_unlock;
-                return EFSCORRUPTED;
-        }
-        /*
+        /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
-         * If we can't get a reference on the inode, it must be in reclaim.
+        error = ENOENT;
-         * Leave it for the reclaim code to flush. Also avoid inodes that
+        if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
-         * haven't been fully initialised.
+                goto out_unlock;
-         */
-        if (!igrab(inode)) {
-                read_unlock(&pag->pag_ici_lock);
-                return ENOENT;
-        }
-        read_unlock(&pag->pag_ici_lock);
-        if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) {
+        /* If we can't grab the inode, it must be on its way to reclaim. */
+        if (!igrab(inode))
+                goto out_unlock;
+        if (is_bad_inode(inode)) {
                IRELE(ip);
-                return ENOENT;
+                goto out_unlock;
        }
-        return 0;
+        /* inode is valid */
+        error = 0;
+out_unlock:
+        read_unlock(&pag->pag_ici_lock);
+        return error;
 }
 STATIC int
@@ -282,7 +289,7 @@ xfs_sync_data(
        ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
        error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
-                                      XFS_ICI_NO_TAG);
+                                      XFS_ICI_NO_TAG, 0);
        if (error)
                return XFS_ERROR(error);
@@ -304,7 +311,7 @@ xfs_sync_attr(
        ASSERT((flags & ~SYNC_WAIT) == 0);
        return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
-                                     XFS_ICI_NO_TAG);
+                                     XFS_ICI_NO_TAG, 0);
 }
 STATIC int
@@ -664,60 +671,6 @@ xfs_syncd_stop(
        kthread_stop(mp->m_sync_task);
 }
-STATIC int
-xfs_reclaim_inode(
-        xfs_inode_t     *ip,
-        int             sync_mode)
-{
-        xfs_perag_t     *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
-        /* The hash lock here protects a thread in xfs_iget_core from
-         * racing with us on linking the inode back with a vnode.
-         * Once we have the XFS_IRECLAIM flag set it will not touch
-         * us.
-         */
-        write_lock(&pag->pag_ici_lock);
-        spin_lock(&ip->i_flags_lock);
-        if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
-            !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
-                spin_unlock(&ip->i_flags_lock);
-                write_unlock(&pag->pag_ici_lock);
-                return -EAGAIN;
-        }
-        __xfs_iflags_set(ip, XFS_IRECLAIM);
-        spin_unlock(&ip->i_flags_lock);
-        write_unlock(&pag->pag_ici_lock);
-        xfs_put_perag(ip->i_mount, pag);
-        /*
-         * If the inode is still dirty, then flush it out.  If the inode
-         * is not in the AIL, then it will be OK to flush it delwri as
-         * long as xfs_iflush() does not keep any references to the inode.
-         * We leave that decision up to xfs_iflush() since it has the
-         * knowledge of whether it's OK to simply do a delwri flush of
-         * the inode or whether we need to wait until the inode is
-         * pulled from the AIL.
-         * We get the flush lock regardless, though, just to make sure
-         * we don't free it while it is being flushed.
-         */
-        xfs_ilock(ip, XFS_ILOCK_EXCL);
-        xfs_iflock(ip);
-        /*
-         * In the case of a forced shutdown we rely on xfs_iflush() to
-         * wait for the inode to be unpinned before returning an error.
-         */
-        if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
-                /* synchronize with xfs_iflush_done */
-                xfs_iflock(ip);
-                xfs_ifunlock(ip);
-        }
-        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-        xfs_ireclaim(ip);
-        return 0;
-}
 void
 __xfs_inode_set_reclaim_tag(
        struct xfs_perag        *pag,
@@ -760,19 +713,55 @@ __xfs_inode_clear_reclaim_tag(
 }
 STATIC int
-xfs_reclaim_inode_now(
+xfs_reclaim_inode(
        struct xfs_inode        *ip,
        struct xfs_perag        *pag,
-        int                     flags)
+        int                     sync_mode)
 {
-        /* ignore if already under reclaim */
+        /*
-        if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
+         * The radix tree lock here protects a thread in xfs_iget from racing
-                read_unlock(&pag->pag_ici_lock);
+         * with us starting reclaim on the inode.  Once we have the
+         * XFS_IRECLAIM flag set it will not touch us.
+         */
+        spin_lock(&ip->i_flags_lock);
+        ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
+        if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
+                /* ignore as it is already under reclaim */
+                spin_unlock(&ip->i_flags_lock);
+                write_unlock(&pag->pag_ici_lock);
                return 0;
        }
-        read_unlock(&pag->pag_ici_lock);
+        __xfs_iflags_set(ip, XFS_IRECLAIM);
+        spin_unlock(&ip->i_flags_lock);
+        write_unlock(&pag->pag_ici_lock);
-        return xfs_reclaim_inode(ip, flags);
+        /*
+         * If the inode is still dirty, then flush it out.  If the inode
+         * is not in the AIL, then it will be OK to flush it delwri as
+         * long as xfs_iflush() does not keep any references to the inode.
+         * We leave that decision up to xfs_iflush() since it has the
+         * knowledge of whether it's OK to simply do a delwri flush of
+         * the inode or whether we need to wait until the inode is
+         * pulled from the AIL.
+         * We get the flush lock regardless, though, just to make sure
+         * we don't free it while it is being flushed.
+         */
+        xfs_ilock(ip, XFS_ILOCK_EXCL);
+        xfs_iflock(ip);
+        /*
+         * In the case of a forced shutdown we rely on xfs_iflush() to
+         * wait for the inode to be unpinned before returning an error.
+         */
+        if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
+                /* synchronize with xfs_iflush_done */
+                xfs_iflock(ip);
+                xfs_ifunlock(ip);
+        }
+        xfs_iunlock(ip, XFS_ILOCK_EXCL);
+        xfs_ireclaim(ip);
+        return 0;
 }
 int
@@ -780,6 +769,6 @@ xfs_reclaim_inodes(
        xfs_mount_t     *mp,
        int             mode)
 {
-        return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode,
+        return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
-                                        XFS_ICI_RECLAIM_TAG);
+                                        XFS_ICI_RECLAIM_TAG, 1);
 }
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index a500b4d91835..ea932b43335d 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -54,6 +54,6 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
 int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
        int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
-        int flags, int tag);
+        int flags, int tag, int write_lock);
 #endif
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 71af76fe8a23..873e07e29074 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -891,7 +891,7 @@ xfs_qm_dqrele_all_inodes(
        uint             flags)
 {
        ASSERT(mp->m_quotainfo);
-        xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG);
+        xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0);
 }
 /*------------------------------------------------------------------------*/
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index d1483a4f71b8..84ca1cf16a1e 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -114,10 +114,82 @@ xfs_swapext(
        return error;
 }
+/*
+ * We need to check that the format of the data fork in the temporary inode is
+ * valid for the target inode before doing the swap. This is not a problem with
+ * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
+ * data fork depending on the space the attribute fork is taking so we can get
+ * invalid formats on the target inode.
+ *
+ * E.g. target has space for 7 extents in extent format, temp inode only has
+ * space for 6.  If we defragment down to 7 extents, then the tmp format is a
+ * btree, but when swapped it needs to be in extent format. Hence we can't just
+ * blindly swap data forks on attr2 filesystems.
+ *
+ * Note that we check the swap in both directions so that we don't end up with
+ * a corrupt temporary inode, either.
+ *
+ * Note that fixing the way xfs_fsr sets up the attribute fork in the source
+ * inode will prevent this situation from occurring, so all we do here is
+ * reject and log the attempt. Basically we are putting the responsibility on
+ * userspace to get this right.
+ */
+static int
+xfs_swap_extents_check_format(
+        xfs_inode_t     *ip,    /* target inode */
+        xfs_inode_t     *tip)   /* tmp inode */
+{
+        /* Should never get a local format */
+        if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
+            tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+                return EINVAL;
+        /*
+         * if the target inode has fewer extents than the temporary inode then
+         * why did userspace call us?
+         */
+        if (ip->i_d.di_nextents < tip->i_d.di_nextents)
+                return EINVAL;
+        /*
+         * if the target inode is in extent form and the temp inode is in btree
+         * form then we will end up with the target inode in the wrong format
+         * as we already know there are fewer extents in the temp inode.
+         */
+        if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+            tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+                return EINVAL;
+        /* Check temp in extent form to max in target */
+        if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+            XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max)
+                return EINVAL;
+        /* Check target in extent form to max in temp */
+        if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+            XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
+                return EINVAL;
+        /* Check root block of temp in btree form to max in target */
+        if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
+            XFS_IFORK_BOFF(ip) &&
+            tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
+                return EINVAL;
+        /* Check root block of target in btree form to max in temp */
+        if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
+            XFS_IFORK_BOFF(tip) &&
+            ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
+                return EINVAL;
+        return 0;
+}
 int
 xfs_swap_extents(
-        xfs_inode_t     *ip,
+        xfs_inode_t     *ip,    /* target inode */
-        xfs_inode_t     *tip,
+        xfs_inode_t     *tip,   /* tmp inode */
        xfs_swapext_t   *sxp)
 {
        xfs_mount_t     *mp;
@@ -161,13 +233,6 @@ xfs_swap_extents(
                goto out_unlock;
        }
-        /* Should never get a local format */
-        if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
-            tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-                error = XFS_ERROR(EINVAL);
-                goto out_unlock;
-        }
        if (VN_CACHED(VFS_I(tip)) != 0) {
                error = xfs_flushinval_pages(tip, 0, -1,
                                FI_REMAPF_LOCKED);
@@ -189,13 +254,12 @@ xfs_swap_extents(
                goto out_unlock;
        }
-        /*
+        /* check inode formats now that data is flushed */
-         * If the target has extended attributes, the tmp file
+        error = xfs_swap_extents_check_format(ip, tip);
-         * must also in order to ensure the correct data fork
+        if (error) {
-         * format.
+                xfs_fs_cmn_err(CE_NOTE, mp,
-         */
+                    "%s: inode 0x%llx format is incompatible for exchanging.",
-        if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) {
+                                __FILE__, ip->i_ino);
-                error = XFS_ERROR(EINVAL);
                goto out_unlock;
        }
@@ -276,6 +340,16 @@ xfs_swap_extents(
        *tifp = *tempifp;       /* struct copy */
        /*
+         * Fix the in-memory data fork values that are dependent on the fork
+         * offset in the inode. We can't assume they remain the same as attr2
+         * has dynamic fork offsets.
+         */
+        ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) /
+                                        (uint)sizeof(xfs_bmbt_rec_t);
+        tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) /
+                                        (uint)sizeof(xfs_bmbt_rec_t);
+        /*
         * Fix the on-disk inode values
         */
        tmp = (__uint64_t)ip->i_d.di_nblocks;
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index fa402a6bbbcf..155e798f30a1 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -73,7 +73,6 @@ xfs_inode_alloc(
        ASSERT(atomic_read(&ip->i_pincount) == 0);
        ASSERT(!spin_is_locked(&ip->i_flags_lock));
        ASSERT(completion_done(&ip->i_flush));
-        ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
        mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 391d36b0e68c..ef77fd88c8e3 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2842,13 +2842,9 @@ xfs_iflush(
        /*
         * If the inode isn't dirty, then just release the inode flush lock and
-         * do nothing. Treat stale inodes the same; we cannot rely on the
+         * do nothing.
-         * backing buffer remaining stale in cache for the remaining life of
-         * the stale inode and so xfs_itobp() below may give us a buffer that
-         * no longer contains inodes below. Doing this stale check here also
-         * avoids forcing the log on pinned, stale inodes.
         */
-        if (xfs_inode_clean(ip) || xfs_iflags_test(ip, XFS_ISTALE)) {
+        if (xfs_inode_clean(ip)) {
                xfs_ifunlock(ip);
                return 0;
        }
@@ -2872,6 +2868,19 @@ xfs_iflush(
        xfs_iunpin_wait(ip);
        /*
+         * For stale inodes we cannot rely on the backing buffer remaining
+         * stale in cache for the remaining life of the stale inode and so
+         * xfs_itobp() below may give us a buffer that no longer contains
+         * inodes below. We have to check this after ensuring the inode is
+         * unpinned so that it is safe to reclaim the stale inode after the
+         * flush call.
+         */
+        if (xfs_iflags_test(ip, XFS_ISTALE)) {
+                xfs_ifunlock(ip);
+                return 0;
+        }
+        /*
         * This may have been unpinned because the filesystem is shutting
         * down forcibly. If that's the case we must not write this inode
         * to disk, because the log record didn't make it to disk!
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 9e15a1185362..6be05f756d59 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1517,6 +1517,8 @@ xfs_rtfree_range(
         */
        error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
                &postblock);
+        if (error)
+                return error;
        /*
         * If there are blocks not being freed at the front of the
         * old extent, add summary data for them to be allocated.