Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6

Conflicts: net/ipv4/fib_frontend.c
author: David S. Miller <davem@davemloft.net> 2010-12-27 01:37:05 -0500
committer: David S. Miller <davem@davemloft.net> 2010-12-27 01:37:05 -0500
commit: 17f7f4d9fcce8f1b75b5f735569309dee7665968 (patch)
tree: 14d7e49ca0053a0fcab3c33b5023bf3f90c5c08a /fs
parent: 041110a439e21cd40709ead4ffbfa8034619ad77 (diff)
parent: d7c1255a3a21e98bdc64df8ccf005a174d7e6289 (diff)
145 files changed, 2171 insertions, 1249 deletions
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index d5c1401f003..d34896cfb19 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -980,19 +980,11 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
        }
 }
-static DEFINE_MUTEX(autofs4_ioctl_mutex);
 static long autofs4_root_ioctl(struct file *filp,
                               unsigned int cmd, unsigned long arg)
 {
-        long ret;
        struct inode *inode = filp->f_dentry->d_inode;
+        return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
-        mutex_lock(&autofs4_ioctl_mutex);
-        ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
-        mutex_unlock(&autofs4_ioctl_mutex);
-        return ret;
 }
 #ifdef CONFIG_COMPAT
@@ -1002,13 +994,11 @@ static long autofs4_root_compat_ioctl(struct file *filp,
        struct inode *inode = filp->f_path.dentry->d_inode;
        int ret;
-        mutex_lock(&autofs4_ioctl_mutex);
        if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
                ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
        else
                ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
                        (unsigned long)compat_ptr(arg));
-        mutex_unlock(&autofs4_ioctl_mutex);
        return ret;
 }
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 06e8ff12b97..4230252fd68 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -11,7 +11,6 @@
 #include <linux/slab.h>
 #include <linux/kmod.h>
 #include <linux/major.h>
-#include <linux/smp_lock.h>
 #include <linux/device_cgroup.h>
 #include <linux/highmem.h>
 #include <linux/blkdev.h>
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 7845d1f7d1d..b50bc4bd5c5 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -91,23 +91,10 @@ static inline int compressed_bio_size(struct btrfs_root *root,
 static struct bio *compressed_bio_alloc(struct block_device *bdev,
                                        u64 first_byte, gfp_t gfp_flags)
 {
-        struct bio *bio;
        int nr_vecs;
        nr_vecs = bio_get_nr_vecs(bdev);
-        bio = bio_alloc(gfp_flags, nr_vecs);
+        return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags);
-        if (bio == NULL && (current->flags & PF_MEMALLOC)) {
-                while (!bio && (nr_vecs /= 2))
-                        bio = bio_alloc(gfp_flags, nr_vecs);
-        }
-        if (bio) {
-                bio->bi_size = 0;
-                bio->bi_bdev = bdev;
-                bio->bi_sector = first_byte >> 9;
-        }
-        return bio;
 }
 static int check_compressed_csum(struct inode *inode,
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8db9234f6b4..af52f6d7a4d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -808,9 +808,9 @@ struct btrfs_block_group_cache {
        int extents_thresh;
        int free_extents;
        int total_bitmaps;
-        int ro:1;
+        unsigned int ro:1;
-        int dirty:1;
+        unsigned int dirty:1;
-        int iref:1;
+        unsigned int iref:1;
        int disk_cache_state;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fb827d0d718..51d2e4de34e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -28,6 +28,7 @@
 #include <linux/freezer.h>
 #include <linux/crc32c.h>
 #include <linux/slab.h>
+#include <linux/migrate.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -355,6 +356,8 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
        ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
                                             btrfs_header_generation(eb));
        BUG_ON(ret);
+        WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN));
        found_start = btrfs_header_bytenr(eb);
        if (found_start != start) {
                WARN_ON(1);
@@ -693,6 +696,27 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
                                   __btree_submit_bio_done);
 }
+#ifdef CONFIG_MIGRATION
+static int btree_migratepage(struct address_space *mapping,
+                        struct page *newpage, struct page *page)
+{
+        /*
+         * we can't safely write a btree page from here,
+         * we haven't done the locking hook
+         */
+        if (PageDirty(page))
+                return -EAGAIN;
+        /*
+         * Buffers may be managed in a filesystem specific way.
+         * We must have no buffers or drop them.
+         */
+        if (page_has_private(page) &&
+            !try_to_release_page(page, GFP_KERNEL))
+                return -EAGAIN;
+        return migrate_page(mapping, newpage, page);
+}
+#endif
 static int btree_writepage(struct page *page, struct writeback_control *wbc)
 {
        struct extent_io_tree *tree;
@@ -707,8 +731,7 @@ static int btree_writepage(struct page *page, struct writeback_control *wbc)
        }
        redirty_page_for_writepage(wbc, page);
-        eb = btrfs_find_tree_block(root, page_offset(page),
+        eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE);
-                                      PAGE_CACHE_SIZE);
        WARN_ON(!eb);
        was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
@@ -799,6 +822,9 @@ static const struct address_space_operations btree_aops = {
        .releasepage    = btree_releasepage,
        .invalidatepage = btree_invalidatepage,
        .sync_page      = block_sync_page,
+#ifdef CONFIG_MIGRATION
+        .migratepage    = btree_migratepage,
+#endif
 };
 int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
@@ -981,7 +1007,10 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
        blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
        root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
                                     blocksize, generation);
-        BUG_ON(!root->node);
+        if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
+                free_extent_buffer(root->node);
+                return -EIO;
+        }
        root->commit_root = btrfs_root_node(root);
        return 0;
 }
@@ -1538,10 +1567,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                                                 GFP_NOFS);
        struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
                                                 GFP_NOFS);
-        struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root),
+        struct btrfs_root *tree_root = btrfs_sb(sb);
-                                               GFP_NOFS);
+        struct btrfs_fs_info *fs_info = tree_root->fs_info;
-        struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
-                                                GFP_NOFS);
        struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
                                                GFP_NOFS);
        struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 951ef09b82f..659f532d26a 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -166,7 +166,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
 static struct dentry *btrfs_get_parent(struct dentry *child)
 {
        struct inode *dir = child->d_inode;
-        static struct dentry *dentry;
+        struct dentry *dentry;
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
@@ -232,9 +232,85 @@ fail:
        return ERR_PTR(ret);
 }
+static int btrfs_get_name(struct dentry *parent, char *name,
+                          struct dentry *child)
+{
+        struct inode *inode = child->d_inode;
+        struct inode *dir = parent->d_inode;
+        struct btrfs_path *path;
+        struct btrfs_root *root = BTRFS_I(dir)->root;
+        struct btrfs_inode_ref *iref;
+        struct btrfs_root_ref *rref;
+        struct extent_buffer *leaf;
+        unsigned long name_ptr;
+        struct btrfs_key key;
+        int name_len;
+        int ret;
+        if (!dir || !inode)
+                return -EINVAL;
+        if (!S_ISDIR(dir->i_mode))
+                return -EINVAL;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        path->leave_spinning = 1;
+        if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+                key.objectid = BTRFS_I(inode)->root->root_key.objectid;
+                key.type = BTRFS_ROOT_BACKREF_KEY;
+                key.offset = (u64)-1;
+                root = root->fs_info->tree_root;
+        } else {
+                key.objectid = inode->i_ino;
+                key.offset = dir->i_ino;
+                key.type = BTRFS_INODE_REF_KEY;
+        }
+        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+        if (ret < 0) {
+                btrfs_free_path(path);
+                return ret;
+        } else if (ret > 0) {
+                if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+                        path->slots[0]--;
+                } else {
+                        btrfs_free_path(path);
+                        return -ENOENT;
+                }
+        }
+        leaf = path->nodes[0];
+        if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               rref = btrfs_item_ptr(leaf, path->slots[0],
+                                     struct btrfs_root_ref);
+               name_ptr = (unsigned long)(rref + 1);
+               name_len = btrfs_root_ref_name_len(leaf, rref);
+        } else {
+                iref = btrfs_item_ptr(leaf, path->slots[0],
+                                      struct btrfs_inode_ref);
+                name_ptr = (unsigned long)(iref + 1);
+                name_len = btrfs_inode_ref_name_len(leaf, iref);
+        }
+        read_extent_buffer(leaf, name, name_ptr, name_len);
+        btrfs_free_path(path);
+        /*
+         * have to add the null termination to make sure that reconnect_path
+         * gets the right len for strlen
+         */
+        name[name_len] = '\0';
+        return 0;
+}
 const struct export_operations btrfs_export_ops = {
        .encode_fh      = btrfs_encode_fh,
        .fh_to_dentry   = btrfs_fh_to_dentry,
        .fh_to_parent   = btrfs_fh_to_parent,
        .get_parent     = btrfs_get_parent,
+        .get_name       = btrfs_get_name,
 };
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0c097f3aec4..227e5815d83 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -429,6 +429,7 @@ err:
 static int cache_block_group(struct btrfs_block_group_cache *cache,
                             struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root,
                             int load_cache_only)
 {
        struct btrfs_fs_info *fs_info = cache->fs_info;
@@ -442,9 +443,12 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
        /*
         * We can't do the read from on-disk cache during a commit since we need
-         * to have the normal tree locking.
+         * to have the normal tree locking.  Also if we are currently trying to
+         * allocate blocks for the tree root we can't do the fast caching since
+         * we likely hold important locks.
         */
-        if (!trans->transaction->in_commit) {
+        if (!trans->transaction->in_commit &&
+            (root && root != root->fs_info->tree_root)) {
                spin_lock(&cache->lock);
                if (cache->cached != BTRFS_CACHE_NO) {
                        spin_unlock(&cache->lock);
@@ -2741,6 +2745,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
        struct btrfs_root *root = block_group->fs_info->tree_root;
        struct inode *inode = NULL;
        u64 alloc_hint = 0;
+        int dcs = BTRFS_DC_ERROR;
        int num_pages = 0;
        int retries = 0;
        int ret = 0;
@@ -2795,6 +2800,8 @@ again:
        spin_lock(&block_group->lock);
        if (block_group->cached != BTRFS_CACHE_FINISHED) {
+                /* We're not cached, don't bother trying to write stuff out */
+                dcs = BTRFS_DC_WRITTEN;
                spin_unlock(&block_group->lock);
                goto out_put;
        }
@@ -2821,6 +2828,8 @@ again:
        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
                                              num_pages, num_pages,
                                              &alloc_hint);
+        if (!ret)
+                dcs = BTRFS_DC_SETUP;
        btrfs_free_reserved_data_space(inode, num_pages);
 out_put:
        iput(inode);
@@ -2828,10 +2837,7 @@ out_free:
        btrfs_release_path(root, path);
 out:
        spin_lock(&block_group->lock);
-        if (ret)
+        block_group->disk_cache_state = dcs;
-                block_group->disk_cache_state = BTRFS_DC_ERROR;
-        else
-                block_group->disk_cache_state = BTRFS_DC_SETUP;
        spin_unlock(&block_group->lock);
        return ret;
@@ -3037,7 +3043,13 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
-        u64 num_devices = root->fs_info->fs_devices->rw_devices;
+        /*
+         * we add in the count of missing devices because we want
+         * to make sure that any RAID levels on a degraded FS
+         * continue to be honored.
+         */
+        u64 num_devices = root->fs_info->fs_devices->rw_devices +
+                root->fs_info->fs_devices->missing_devices;
        if (num_devices == 1)
                flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
@@ -3412,7 +3424,7 @@ again:
         * our reservation.
         */
        if (unused <= space_info->total_bytes) {
-                unused -= space_info->total_bytes;
+                unused = space_info->total_bytes - unused;
                if (unused >= num_bytes) {
                        if (!reserved)
                                space_info->bytes_reserved += orig_bytes;
@@ -4080,7 +4092,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                 * space back to the block group, otherwise we will leak space.
                 */
                if (!alloc && cache->cached == BTRFS_CACHE_NO)
-                        cache_block_group(cache, trans, 1);
+                        cache_block_group(cache, trans, NULL, 1);
                byte_in_group = bytenr - cache->key.objectid;
                WARN_ON(byte_in_group > cache->key.offset);
@@ -4930,11 +4942,31 @@ search:
                btrfs_get_block_group(block_group);
                search_start = block_group->key.objectid;
+                /*
+                 * this can happen if we end up cycling through all the
+                 * raid types, but we want to make sure we only allocate
+                 * for the proper type.
+                 */
+                if (!block_group_bits(block_group, data)) {
+                    u64 extra = BTRFS_BLOCK_GROUP_DUP |
+                                BTRFS_BLOCK_GROUP_RAID1 |
+                                BTRFS_BLOCK_GROUP_RAID10;
+                        /*
+                         * if they asked for extra copies and this block group
+                         * doesn't provide them, bail.  This does allow us to
+                         * fill raid0 from raid1.
+                         */
+                        if ((data & extra) && !(block_group->flags & extra))
+                                goto loop;
+                }
 have_block_group:
                if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
                        u64 free_percent;
-                        ret = cache_block_group(block_group, trans, 1);
+                        ret = cache_block_group(block_group, trans,
+                                                orig_root, 1);
                        if (block_group->cached == BTRFS_CACHE_FINISHED)
                                goto have_block_group;
@@ -4958,7 +4990,8 @@ have_block_group:
                        if (loop > LOOP_CACHING_NOWAIT ||
                            (loop > LOOP_FIND_IDEAL &&
                             atomic_read(&space_info->caching_threads) < 2)) {
-                                ret = cache_block_group(block_group, trans, 0);
+                                ret = cache_block_group(block_group, trans,
+                                                        orig_root, 0);
                                BUG_ON(ret);
                        }
                        found_uncached_bg = true;
@@ -5515,7 +5548,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
        u64 num_bytes = ins->offset;
        block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
-        cache_block_group(block_group, trans, 0);
+        cache_block_group(block_group, trans, NULL, 0);
        caching_ctl = get_caching_control(block_group);
        if (!caching_ctl) {
@@ -6300,9 +6333,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
                                           NULL, NULL);
                BUG_ON(ret < 0);
                if (ret > 0) {
-                        ret = btrfs_del_orphan_item(trans, tree_root,
+                        /* if we fail to delete the orphan item this time
-                                                    root->root_key.objectid);
+                         * around, it'll get picked up the next time.
-                        BUG_ON(ret);
+                         *
+                         * The most common failure here is just -ENOENT.
+                         */
+                        btrfs_del_orphan_item(trans, tree_root,
+                                              root->root_key.objectid);
                }
        }
@@ -7878,7 +7915,14 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
        u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
                BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
-        num_devices = root->fs_info->fs_devices->rw_devices;
+        /*
+         * we add in the count of missing devices because we want
+         * to make sure that any RAID levels on a degraded FS
+         * continue to be honored.
+         */
+        num_devices = root->fs_info->fs_devices->rw_devices +
+                root->fs_info->fs_devices->missing_devices;
        if (num_devices == 1) {
                stripped |= BTRFS_BLOCK_GROUP_DUP;
                stripped = flags & ~stripped;
@@ -8247,7 +8291,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                        break;
                if (ret != 0)
                        goto error;
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                cache = kzalloc(sizeof(*cache), GFP_NOFS);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index eac10e3260a..3e86b9f3650 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1828,9 +1828,9 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err)
        bio_put(bio);
 }
-static struct bio *
+struct bio *
-extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
+btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
-                 gfp_t gfp_flags)
+                gfp_t gfp_flags)
 {
        struct bio *bio;
@@ -1919,7 +1919,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
        else
                nr = bio_get_nr_vecs(bdev);
-        bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
+        bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
        bio_add_page(bio, page, page_size, offset);
        bio->bi_end_io = end_io_func;
@@ -2901,21 +2901,53 @@ out:
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                __u64 start, __u64 len, get_extent_t *get_extent)
 {
-        int ret;
+        int ret = 0;
        u64 off = start;
        u64 max = start + len;
        u32 flags = 0;
+        u32 found_type;
+        u64 last;
        u64 disko = 0;
+        struct btrfs_key found_key;
        struct extent_map *em = NULL;
        struct extent_state *cached_state = NULL;
+        struct btrfs_path *path;
+        struct btrfs_file_extent_item *item;
        int end = 0;
        u64 em_start = 0, em_len = 0;
        unsigned long emflags;
-        ret = 0;
+        int hole = 0;
        if (len == 0)
                return -EINVAL;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        path->leave_spinning = 1;
+        ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
+                                       path, inode->i_ino, -1, 0);
+        if (ret < 0) {
+                btrfs_free_path(path);
+                return ret;
+        }
+        WARN_ON(!ret);
+        path->slots[0]--;
+        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                              struct btrfs_file_extent_item);
+        btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
+        found_type = btrfs_key_type(&found_key);
+        /* No extents, just return */
+        if (found_key.objectid != inode->i_ino ||
+            found_type != BTRFS_EXTENT_DATA_KEY) {
+                btrfs_free_path(path);
+                return 0;
+        }
+        last = found_key.offset;
+        btrfs_free_path(path);
        lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
                         &cached_state, GFP_NOFS);
        em = get_extent(inode, NULL, 0, off, max - off, 0);
@@ -2925,11 +2957,18 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                ret = PTR_ERR(em);
                goto out;
        }
        while (!end) {
+                hole = 0;
                off = em->start + em->len;
                if (off >= max)
                        end = 1;
+                if (em->block_start == EXTENT_MAP_HOLE) {
+                        hole = 1;
+                        goto next;
+                }
                em_start = em->start;
                em_len = em->len;
@@ -2939,8 +2978,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                if (em->block_start == EXTENT_MAP_LAST_BYTE) {
                        end = 1;
                        flags |= FIEMAP_EXTENT_LAST;
-                } else if (em->block_start == EXTENT_MAP_HOLE) {
-                        flags |= FIEMAP_EXTENT_UNWRITTEN;
                } else if (em->block_start == EXTENT_MAP_INLINE) {
                        flags |= (FIEMAP_EXTENT_DATA_INLINE |
                                  FIEMAP_EXTENT_NOT_ALIGNED);
@@ -2953,10 +2990,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
                        flags |= FIEMAP_EXTENT_ENCODED;
+next:
                emflags = em->flags;
                free_extent_map(em);
                em = NULL;
                if (!end) {
                        em = get_extent(inode, NULL, 0, off, max - off, 0);
                        if (!em)
@@ -2967,15 +3004,23 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        }
                        emflags = em->flags;
                }
                if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
                        flags |= FIEMAP_EXTENT_LAST;
                        end = 1;
                }
-                ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
+                if (em_start == last) {
-                                        em_len, flags);
+                        flags |= FIEMAP_EXTENT_LAST;
-                if (ret)
+                        end = 1;
-                        goto out_free;
+                }
+                if (!hole) {
+                        ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
+                                                em_len, flags);
+                        if (ret)
+                                goto out_free;
+                }
        }
 out_free:
        free_extent_map(em);
@@ -3836,8 +3881,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
        spin_lock(&tree->buffer_lock);
        eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
-        if (!eb)
+        if (!eb) {
-                goto out;
+                spin_unlock(&tree->buffer_lock);
+                return ret;
+        }
        if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
                ret = 0;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 1c6d4f342ef..4183c8178f0 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -310,4 +310,7 @@ int extent_clear_unlock_delalloc(struct inode *inode,
                                struct extent_io_tree *tree,
                                u64 start, u64 end, struct page *locked_page,
                                unsigned long op);
+struct bio *
+btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
+                gfp_t gfp_flags);
 #endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e354c33df08..66836d85763 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -48,30 +48,34 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
                                         struct page **prepared_pages,
                                         struct iov_iter *i)
 {
-        size_t copied;
+        size_t copied = 0;
        int pg = 0;
        int offset = pos & (PAGE_CACHE_SIZE - 1);
+        int total_copied = 0;
        while (write_bytes > 0) {
                size_t count = min_t(size_t,
                                     PAGE_CACHE_SIZE - offset, write_bytes);
                struct page *page = prepared_pages[pg];
-again:
+                /*
-                if (unlikely(iov_iter_fault_in_readable(i, count)))
+                 * Copy data from userspace to the current page
-                        return -EFAULT;
+                 *
+                 * Disable pagefault to avoid recursive lock since
-                /* Copy data from userspace to the current page */
+                 * the pages are already locked
-                copied = iov_iter_copy_from_user(page, i, offset, count);
+                 */
+                pagefault_disable();
+                copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
+                pagefault_enable();
                /* Flush processor's dcache for this page */
                flush_dcache_page(page);
                iov_iter_advance(i, copied);
                write_bytes -= copied;
+                total_copied += copied;
+                /* Return to btrfs_file_aio_write to fault page */
                if (unlikely(copied == 0)) {
-                        count = min_t(size_t, PAGE_CACHE_SIZE - offset,
+                        break;
-                                      iov_iter_single_seg_count(i));
-                        goto again;
                }
                if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
@@ -81,7 +85,7 @@ again:
                        offset = 0;
                }
        }
-        return 0;
+        return total_copied;
 }
 /*
@@ -854,6 +858,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
        unsigned long last_index;
        int will_write;
        int buffered = 0;
+        int copied = 0;
+        int dirty_pages = 0;
        will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
                      (file->f_flags & O_DIRECT));
@@ -970,7 +976,17 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
                WARN_ON(num_pages > nrptrs);
                memset(pages, 0, sizeof(struct page *) * nrptrs);
-                ret = btrfs_delalloc_reserve_space(inode, write_bytes);
+                /*
+                 * Fault pages before locking them in prepare_pages
+                 * to avoid recursive lock
+                 */
+                if (unlikely(iov_iter_fault_in_readable(&i, write_bytes))) {
+                        ret = -EFAULT;
+                        goto out;
+                }
+                ret = btrfs_delalloc_reserve_space(inode,
+                                        num_pages << PAGE_CACHE_SHIFT);
                if (ret)
                        goto out;
@@ -978,37 +994,49 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
                                    pos, first_index, last_index,
                                    write_bytes);
                if (ret) {
-                        btrfs_delalloc_release_space(inode, write_bytes);
+                        btrfs_delalloc_release_space(inode,
+                                        num_pages << PAGE_CACHE_SHIFT);
                        goto out;
                }
-                ret = btrfs_copy_from_user(pos, num_pages,
+                copied = btrfs_copy_from_user(pos, num_pages,
                                           write_bytes, pages, &i);
-                if (ret == 0) {
+                dirty_pages = (copied + PAGE_CACHE_SIZE - 1) >>
+                                        PAGE_CACHE_SHIFT;
+                if (num_pages > dirty_pages) {
+                        if (copied > 0)
+                                atomic_inc(
+                                        &BTRFS_I(inode)->outstanding_extents);
+                        btrfs_delalloc_release_space(inode,
+                                        (num_pages - dirty_pages) <<
+                                        PAGE_CACHE_SHIFT);
+                }
+                if (copied > 0) {
                        dirty_and_release_pages(NULL, root, file, pages,
-                                                num_pages, pos, write_bytes);
+                                                dirty_pages, pos, copied);
                }
                btrfs_drop_pages(pages, num_pages);
-                if (ret) {
-                        btrfs_delalloc_release_space(inode, write_bytes);
-                        goto out;
-                }
-                if (will_write) {
+                if (copied > 0) {
-                        filemap_fdatawrite_range(inode->i_mapping, pos,
+                        if (will_write) {
-                                                 pos + write_bytes - 1);
+                                filemap_fdatawrite_range(inode->i_mapping, pos,
-                } else {
+                                                         pos + copied - 1);
-                        balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+                        } else {
-                                                           num_pages);
+                                balance_dirty_pages_ratelimited_nr(
-                        if (num_pages <
+                                                        inode->i_mapping,
-                            (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
+                                                        dirty_pages);
-                                btrfs_btree_balance_dirty(root, 1);
+                                if (dirty_pages <
-                        btrfs_throttle(root);
+                                (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
+                                        btrfs_btree_balance_dirty(root, 1);
+                                btrfs_throttle(root);
+                        }
                }
-                pos += write_bytes;
+                pos += copied;
-                num_written += write_bytes;
+                num_written += copied;
                cond_resched();
        }
@@ -1047,8 +1075,14 @@ out:
                if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
                        trans = btrfs_start_transaction(root, 0);
+                        if (IS_ERR(trans)) {
+                                num_written = PTR_ERR(trans);
+                                goto done;
+                        }
+                        mutex_lock(&inode->i_mutex);
                        ret = btrfs_log_dentry_safe(trans, root,
                                                    file->f_dentry);
+                        mutex_unlock(&inode->i_mutex);
                        if (ret == 0) {
                                ret = btrfs_sync_log(trans, root);
                                if (ret == 0)
@@ -1067,6 +1101,7 @@ out:
                             (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
                }
        }
+done:
        current->backing_dev_info = NULL;
        return num_written ? num_written : err;
 }
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 22ee0dc2e6b..60d68426695 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -290,7 +290,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
                       (unsigned long long)BTRFS_I(inode)->generation,
                       (unsigned long long)generation,
                       (unsigned long long)block_group->key.objectid);
-                goto out;
+                goto free_cache;
        }
        if (!num_entries)
@@ -524,6 +524,12 @@ int btrfs_write_out_cache(struct btrfs_root *root,
                return 0;
        }
+        node = rb_first(&block_group->free_space_offset);
+        if (!node) {
+                iput(inode);
+                return 0;
+        }
        last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
        filemap_write_and_wait(inode->i_mapping);
        btrfs_wait_ordered_range(inode, inode->i_size &
@@ -543,10 +549,6 @@ int btrfs_write_out_cache(struct btrfs_root *root,
         */
        first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
-        node = rb_first(&block_group->free_space_offset);
-        if (!node)
-                goto out_free;
        /*
         * Lock all pages first so we can lock the extent safely.
         *
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 558cac2dfa5..72f31ecb5c9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -495,7 +495,7 @@ again:
                add_async_extent(async_cow, start, num_bytes,
                                 total_compressed, pages, nr_pages_ret);
-                if (start + num_bytes < end && start + num_bytes < actual_end) {
+                if (start + num_bytes < end) {
                        start += num_bytes;
                        pages = NULL;
                        cond_resched();
@@ -4501,6 +4501,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
        BTRFS_I(inode)->index_cnt = 2;
        BTRFS_I(inode)->root = root;
        BTRFS_I(inode)->generation = trans->transid;
+        inode->i_generation = BTRFS_I(inode)->generation;
        btrfs_set_inode_space_info(root, inode);
        if (mode & S_IFDIR)
@@ -4622,12 +4623,12 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
 }
 static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
-                            struct dentry *dentry, struct inode *inode,
+                            struct inode *dir, struct dentry *dentry,
-                            int backref, u64 index)
+                            struct inode *inode, int backref, u64 index)
 {
-        int err = btrfs_add_link(trans, dentry->d_parent->d_inode,
+        int err = btrfs_add_link(trans, dir, inode,
-                                 inode, dentry->d_name.name,
+                                 dentry->d_name.name, dentry->d_name.len,
-                                 dentry->d_name.len, backref, index);
+                                 backref, index);
        if (!err) {
                d_instantiate(dentry, inode);
                return 0;
@@ -4668,8 +4669,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
        btrfs_set_trans_block_group(trans, dir);
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
-                                dentry->d_name.len,
+                                dentry->d_name.len, dir->i_ino, objectid,
-                                dentry->d_parent->d_inode->i_ino, objectid,
                                BTRFS_I(dir)->block_group, mode, &index);
        err = PTR_ERR(inode);
        if (IS_ERR(inode))
@@ -4682,7 +4682,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
        }
        btrfs_set_trans_block_group(trans, inode);
-        err = btrfs_add_nondir(trans, dentry, inode, 0, index);
+        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
        if (err)
                drop_inode = 1;
        else {
@@ -4730,10 +4730,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
        btrfs_set_trans_block_group(trans, dir);
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
-                                dentry->d_name.len,
+                                dentry->d_name.len, dir->i_ino, objectid,
-                                dentry->d_parent->d_inode->i_ino,
+                                BTRFS_I(dir)->block_group, mode, &index);
-                                objectid, BTRFS_I(dir)->block_group, mode,
-                                &index);
        err = PTR_ERR(inode);
        if (IS_ERR(inode))
                goto out_unlock;
@@ -4745,7 +4743,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
        }
        btrfs_set_trans_block_group(trans, inode);
-        err = btrfs_add_nondir(trans, dentry, inode, 0, index);
+        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
        if (err)
                drop_inode = 1;
        else {
@@ -4787,6 +4785,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
                return -EPERM;
        btrfs_inc_nlink(inode);
+        inode->i_ctime = CURRENT_TIME;
        err = btrfs_set_inode_index(dir, &index);
        if (err)
@@ -4805,15 +4804,17 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
        btrfs_set_trans_block_group(trans, dir);
        ihold(inode);
-        err = btrfs_add_nondir(trans, dentry, inode, 1, index);
+        err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
        if (err) {
                drop_inode = 1;
        } else {
+                struct dentry *parent = dget_parent(dentry);
                btrfs_update_inode_block_group(trans, dir);
                err = btrfs_update_inode(trans, root, inode);
                BUG_ON(err);
-                btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
+                btrfs_log_new_name(trans, inode, NULL, parent);
+                dput(parent);
        }
        nr = trans->blocks_used;
@@ -4853,8 +4854,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        btrfs_set_trans_block_group(trans, dir);
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
-                                dentry->d_name.len,
+                                dentry->d_name.len, dir->i_ino, objectid,
-                                dentry->d_parent->d_inode->i_ino, objectid,
                                BTRFS_I(dir)->block_group, S_IFDIR | mode,
                                &index);
        if (IS_ERR(inode)) {
@@ -4877,9 +4877,8 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        if (err)
                goto out_fail;
-        err = btrfs_add_link(trans, dentry->d_parent->d_inode,
+        err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
-                                 inode, dentry->d_name.name,
+                             dentry->d_name.len, 0, index);
-                                 dentry->d_name.len, 0, index);
        if (err)
                goto out_fail;
@@ -5535,13 +5534,21 @@ struct btrfs_dio_private {
        u64 bytes;
        u32 *csums;
        void *private;
+        /* number of bios pending for this dio */
+        atomic_t pending_bios;
+        /* IO errors */
+        int errors;
+        struct bio *orig_bio;
 };
 static void btrfs_endio_direct_read(struct bio *bio, int err)
 {
+        struct btrfs_dio_private *dip = bio->bi_private;
        struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
        struct bio_vec *bvec = bio->bi_io_vec;
-        struct btrfs_dio_private *dip = bio->bi_private;
        struct inode *inode = dip->inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 start;
@@ -5595,15 +5602,18 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
        struct btrfs_trans_handle *trans;
        struct btrfs_ordered_extent *ordered = NULL;
        struct extent_state *cached_state = NULL;
+        u64 ordered_offset = dip->logical_offset;
+        u64 ordered_bytes = dip->bytes;
        int ret;
        if (err)
                goto out_done;
+again:
-        ret = btrfs_dec_test_ordered_pending(inode, &ordered,
+        ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
-                                             dip->logical_offset, dip->bytes);
+                                                   &ordered_offset,
+                                                   ordered_bytes);
        if (!ret)
-                goto out_done;
+                goto out_test;
        BUG_ON(!ordered);
@@ -5663,8 +5673,20 @@ out_unlock:
 out:
        btrfs_delalloc_release_metadata(inode, ordered->len);
        btrfs_end_transaction(trans, root);
+        ordered_offset = ordered->file_offset + ordered->len;
        btrfs_put_ordered_extent(ordered);
        btrfs_put_ordered_extent(ordered);
+out_test:
+        /*
+         * our bio might span multiple ordered extents.  If we haven't
+         * completed the accounting for the whole dio, go back and try again
+         */
+        if (ordered_offset < dip->logical_offset + dip->bytes) {
+                ordered_bytes = dip->logical_offset + dip->bytes -
+                        ordered_offset;
+                goto again;
+        }
 out_done:
        bio->bi_private = dip->private;
@@ -5684,6 +5706,176 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
        return 0;
 }
+static void btrfs_end_dio_bio(struct bio *bio, int err)
+{
+        struct btrfs_dio_private *dip = bio->bi_private;
+        if (err) {
+                printk(KERN_ERR "btrfs direct IO failed ino %lu rw %lu "
+                      "sector %#Lx len %u err no %d\n",
+                      dip->inode->i_ino, bio->bi_rw,
+                      (unsigned long long)bio->bi_sector, bio->bi_size, err);
+                dip->errors = 1;
+                /*
+                 * before the atomic counter drops to zero, we must make
+                 * sure dip->errors is perceived to be set.
+                 */
+                smp_mb__before_atomic_dec();
+        }
+        /* if there are more bios still pending for this dio, just exit */
+        if (!atomic_dec_and_test(&dip->pending_bios))
+                goto out;
+        if (dip->errors)
+                bio_io_error(dip->orig_bio);
+        else {
+                set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags);
+                bio_endio(dip->orig_bio, 0);
+        }
+out:
+        bio_put(bio);
+}
+static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
+                                       u64 first_sector, gfp_t gfp_flags)
+{
+        int nr_vecs = bio_get_nr_vecs(bdev);
+        return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
+}
+static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
+                                         int rw, u64 file_offset, int skip_sum,
+                                         u32 *csums)
+{
+        int write = rw & REQ_WRITE;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        int ret;
+        bio_get(bio);
+        ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+        if (ret)
+                goto err;
+        if (write && !skip_sum) {
+                ret = btrfs_wq_submit_bio(root->fs_info,
+                                   inode, rw, bio, 0, 0,
+                                   file_offset,
+                                   __btrfs_submit_bio_start_direct_io,
+                                   __btrfs_submit_bio_done);
+                goto err;
+        } else if (!skip_sum)
+                btrfs_lookup_bio_sums_dio(root, inode, bio,
+                                          file_offset, csums);
+        ret = btrfs_map_bio(root, rw, bio, 0, 1);
+err:
+        bio_put(bio);
+        return ret;
+}
+static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
+                                    int skip_sum)
+{
+        struct inode *inode = dip->inode;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+        struct bio *bio;
+        struct bio *orig_bio = dip->orig_bio;
+        struct bio_vec *bvec = orig_bio->bi_io_vec;
+        u64 start_sector = orig_bio->bi_sector;
+        u64 file_offset = dip->logical_offset;
+        u64 submit_len = 0;
+        u64 map_length;
+        int nr_pages = 0;
+        u32 *csums = dip->csums;
+        int ret = 0;
+        bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
+        if (!bio)
+                return -ENOMEM;
+        bio->bi_private = dip;
+        bio->bi_end_io = btrfs_end_dio_bio;
+        atomic_inc(&dip->pending_bios);
+        map_length = orig_bio->bi_size;
+        ret = btrfs_map_block(map_tree, READ, start_sector << 9,
+                              &map_length, NULL, 0);
+        if (ret) {
+                bio_put(bio);
+                return -EIO;
+        }
+        while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
+                if (unlikely(map_length < submit_len + bvec->bv_len ||
+                    bio_add_page(bio, bvec->bv_page, bvec->bv_len,
+                                 bvec->bv_offset) < bvec->bv_len)) {
+                        /*
+                         * inc the count before we submit the bio so
+                         * we know the end IO handler won't happen before
+                         * we inc the count. Otherwise, the dip might get freed
+                         * before we're done setting it up
+                         */
+                        atomic_inc(&dip->pending_bios);
+                        ret = __btrfs_submit_dio_bio(bio, inode, rw,
+                                                     file_offset, skip_sum,
+                                                     csums);
+                        if (ret) {
+                                bio_put(bio);
+                                atomic_dec(&dip->pending_bios);
+                                goto out_err;
+                        }
+                        if (!skip_sum)
+                                csums = csums + nr_pages;
+                        start_sector += submit_len >> 9;
+                        file_offset += submit_len;
+                        submit_len = 0;
+                        nr_pages = 0;
+                        bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
+                                                  start_sector, GFP_NOFS);
+                        if (!bio)
+                                goto out_err;
+                        bio->bi_private = dip;
+                        bio->bi_end_io = btrfs_end_dio_bio;
+                        map_length = orig_bio->bi_size;
+                        ret = btrfs_map_block(map_tree, READ, start_sector << 9,
+                                              &map_length, NULL, 0);
+                        if (ret) {
+                                bio_put(bio);
+                                goto out_err;
+                        }
+                } else {
+                        submit_len += bvec->bv_len;
+                        nr_pages ++;
+                        bvec++;
+                }
+        }
+        ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
+                                     csums);
+        if (!ret)
+                return 0;
+        bio_put(bio);
+out_err:
+        dip->errors = 1;
+        /*
+         * before the atomic counter drops to zero, we must
+         * make sure dip->errors is perceived to be set.
+         */
+        smp_mb__before_atomic_dec();
+        if (atomic_dec_and_test(&dip->pending_bios))
+                bio_io_error(dip->orig_bio);
+        /* the bio end_io path reports the error, so we needn't return it */
+        return 0;
+}
 static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
                                loff_t file_offset)
 {
@@ -5723,36 +5915,18 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
        dip->disk_bytenr = (u64)bio->bi_sector << 9;
        bio->bi_private = dip;
+        dip->errors = 0;
+        dip->orig_bio = bio;
+        atomic_set(&dip->pending_bios, 0);
        if (write)
                bio->bi_end_io = btrfs_endio_direct_write;
        else
                bio->bi_end_io = btrfs_endio_direct_read;
-        ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+        ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
-        if (ret)
+        if (!ret)
-                goto out_err;
-        if (write && !skip_sum) {
-                ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
-                                   inode, rw, bio, 0, 0,
-                                   dip->logical_offset,
-                                   __btrfs_submit_bio_start_direct_io,
-                                   __btrfs_submit_bio_done);
-                if (ret)
-                        goto out_err;
                return;
-        } else if (!skip_sum)
-                btrfs_lookup_bio_sums_dio(root, inode, bio,
-                                          dip->logical_offset, dip->csums);
-        ret = btrfs_map_bio(root, rw, bio, 0, 1);
-        if (ret)
-                goto out_err;
-        return;
-out_err:
-        kfree(dip->csums);
-        kfree(dip);
 free_ordered:
        /*
         * If this is a write, we need to clean up the reserved space and kill
@@ -5760,8 +5934,7 @@ free_ordered:
         */
        if (write) {
                struct btrfs_ordered_extent *ordered;
-                ordered = btrfs_lookup_ordered_extent(inode,
+                ordered = btrfs_lookup_ordered_extent(inode, file_offset);
-                                                      dip->logical_offset);
                if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
                    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
                        btrfs_free_reserved_extent(root, ordered->start,
@@ -6607,8 +6780,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        BUG_ON(ret);
        if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
-                btrfs_log_new_name(trans, old_inode, old_dir,
+                struct dentry *parent = dget_parent(new_dentry);
-                                   new_dentry->d_parent);
+                btrfs_log_new_name(trans, old_inode, old_dir, parent);
+                dput(parent);
                btrfs_end_log_trans(root);
        }
 out_fail:
@@ -6758,8 +6932,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        btrfs_set_trans_block_group(trans, dir);
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
-                                dentry->d_name.len,
+                                dentry->d_name.len, dir->i_ino, objectid,
-                                dentry->d_parent->d_inode->i_ino, objectid,
                                BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
                                &index);
        err = PTR_ERR(inode);
@@ -6773,7 +6946,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        }
        btrfs_set_trans_block_group(trans, inode);
-        err = btrfs_add_nondir(trans, dentry, inode, 0, index);
+        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
        if (err)
                drop_inode = 1;
        else {
@@ -6844,6 +7017,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_key ins;
        u64 cur_offset = start;
+        u64 i_size;
        int ret = 0;
        bool own_trans = true;
@@ -6885,11 +7059,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                    (actual_len > inode->i_size) &&
                    (cur_offset > inode->i_size)) {
                        if (cur_offset > actual_len)
-                                i_size_write(inode, actual_len);
+                                i_size = actual_len;
                        else
-                                i_size_write(inode, cur_offset);
+                                i_size = cur_offset;
-                        i_size_write(inode, cur_offset);
+                        i_size_write(inode, i_size);
-                        btrfs_ordered_update_i_size(inode, cur_offset, NULL);
+                        btrfs_ordered_update_i_size(inode, i_size, NULL);
                }
                ret = btrfs_update_inode(trans, root, inode);
@@ -6943,6 +7117,10 @@ static long btrfs_fallocate(struct inode *inode, int mode,
        btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
        mutex_lock(&inode->i_mutex);
+        ret = inode_newsize_ok(inode, alloc_end);
+        if (ret)
+                goto out;
        if (alloc_start > inode->i_size) {
                ret = btrfs_cont_expand(inode, alloc_start);
                if (ret)
@@ -7139,6 +7317,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
        .readlink       = generic_readlink,
        .follow_link    = page_follow_link_light,
        .put_link       = page_put_link,
+        .getattr        = btrfs_getattr,
        .permission     = btrfs_permission,
        .setxattr       = btrfs_setxattr,
        .getxattr       = btrfs_getxattr,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 463d91b4dd3..f87552a1d7e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -233,7 +233,8 @@ static noinline int create_subvol(struct btrfs_root *root,
        struct btrfs_inode_item *inode_item;
        struct extent_buffer *leaf;
        struct btrfs_root *new_root;
-        struct inode *dir = dentry->d_parent->d_inode;
+        struct dentry *parent = dget_parent(dentry);
+        struct inode *dir;
        int ret;
        int err;
        u64 objectid;
@@ -242,8 +243,13 @@ static noinline int create_subvol(struct btrfs_root *root,
        ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
                                       0, &objectid);
-        if (ret)
+        if (ret) {
+                dput(parent);
                return ret;
+        }
+        dir = parent->d_inode;
        /*
         * 1 - inode item
         * 2 - refs
@@ -251,8 +257,10 @@ static noinline int create_subvol(struct btrfs_root *root,
         * 2 - dir items
         */
        trans = btrfs_start_transaction(root, 6);
-        if (IS_ERR(trans))
+        if (IS_ERR(trans)) {
+                dput(parent);
                return PTR_ERR(trans);
+        }
        leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
                                      0, objectid, NULL, 0, 0, 0);
@@ -339,6 +347,7 @@ static noinline int create_subvol(struct btrfs_root *root,
        d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
 fail:
+        dput(parent);
        if (async_transid) {
                *async_transid = trans->transid;
                err = btrfs_commit_transaction_async(trans, root, 1);
@@ -354,6 +363,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
                           char *name, int namelen, u64 *async_transid)
 {
        struct inode *inode;
+        struct dentry *parent;
        struct btrfs_pending_snapshot *pending_snapshot;
        struct btrfs_trans_handle *trans;
        int ret;
@@ -396,7 +406,9 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
        btrfs_orphan_cleanup(pending_snapshot->snap);
-        inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
+        parent = dget_parent(dentry);
+        inode = btrfs_lookup_dentry(parent->d_inode, dentry);
+        dput(parent);
        if (IS_ERR(inode)) {
                ret = PTR_ERR(inode);
                goto fail;
@@ -935,23 +947,42 @@ out:
 static noinline int btrfs_ioctl_snap_create(struct file *file,
                                            void __user *arg, int subvol,
-                                            int async)
+                                            int v2)
 {
        struct btrfs_ioctl_vol_args *vol_args = NULL;
-        struct btrfs_ioctl_async_vol_args *async_vol_args = NULL;
+        struct btrfs_ioctl_vol_args_v2 *vol_args_v2 = NULL;
        char *name;
        u64 fd;
-        u64 transid = 0;
        int ret;
-        if (async) {
+        if (v2) {
-                async_vol_args = memdup_user(arg, sizeof(*async_vol_args));
+                u64 transid = 0;
-                if (IS_ERR(async_vol_args))
+                u64 *ptr = NULL;
-                        return PTR_ERR(async_vol_args);
+                vol_args_v2 = memdup_user(arg, sizeof(*vol_args_v2));
+                if (IS_ERR(vol_args_v2))
+                        return PTR_ERR(vol_args_v2);
+                if (vol_args_v2->flags & ~BTRFS_SUBVOL_CREATE_ASYNC) {
+                        ret = -EINVAL;
+                        goto out;
+                }
+                name = vol_args_v2->name;
+                fd = vol_args_v2->fd;
+                vol_args_v2->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+                if (vol_args_v2->flags & BTRFS_SUBVOL_CREATE_ASYNC)
+                        ptr = &transid;
+                ret = btrfs_ioctl_snap_create_transid(file, name, fd,
+                                                      subvol, ptr);
-                name = async_vol_args->name;
+                if (ret == 0 && ptr &&
-                fd = async_vol_args->fd;
+                    copy_to_user(arg +
-                async_vol_args->name[BTRFS_SNAPSHOT_NAME_MAX] = '\0';
+                                 offsetof(struct btrfs_ioctl_vol_args_v2,
+                                          transid), ptr, sizeof(*ptr)))
+                        ret = -EFAULT;
        } else {
                vol_args = memdup_user(arg, sizeof(*vol_args));
                if (IS_ERR(vol_args))
@@ -959,20 +990,13 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
                name = vol_args->name;
                fd = vol_args->fd;
                vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
-        }
-        ret = btrfs_ioctl_snap_create_transid(file, name, fd,
-                                              subvol, &transid);
-        if (!ret && async) {
+                ret = btrfs_ioctl_snap_create_transid(file, name, fd,
-                if (copy_to_user(arg +
+                                                      subvol, NULL);
-                                offsetof(struct btrfs_ioctl_async_vol_args,
-                                transid), &transid, sizeof(transid)))
-                        return -EFAULT;
        }
+out:
        kfree(vol_args);
-        kfree(async_vol_args);
+        kfree(vol_args_v2);
        return ret;
 }
@@ -1669,12 +1693,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                olen = len = src->i_size - off;
        /* if we extend to eof, continue to block boundary */
        if (off + len == src->i_size)
-                len = ((src->i_size + bs-1) & ~(bs-1))
+                len = ALIGN(src->i_size, bs) - off;
-                        - off;
        /* verify the end result is block aligned */
-        if ((off & (bs-1)) ||
+        if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
-            ((off + len) & (bs-1)))
+            !IS_ALIGNED(destoff, bs))
                goto out_unlock;
        /* do any pending delalloc/csum calc on src, one way or
@@ -1874,8 +1897,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                         * but shouldn't round up the file size
                         */
                        endoff = new_key.offset + datal;
-                        if (endoff > off+olen)
+                        if (endoff > destoff+olen)
-                                endoff = off+olen;
+                                endoff = destoff+olen;
                        if (endoff > inode->i_size)
                                btrfs_i_size_write(inode, endoff);
@@ -2235,7 +2258,7 @@ long btrfs_ioctl(struct file *file, unsigned int
                return btrfs_ioctl_getversion(file, argp);
        case BTRFS_IOC_SNAP_CREATE:
                return btrfs_ioctl_snap_create(file, argp, 0, 0);
-        case BTRFS_IOC_SNAP_CREATE_ASYNC:
+        case BTRFS_IOC_SNAP_CREATE_V2:
                return btrfs_ioctl_snap_create(file, argp, 0, 1);
        case BTRFS_IOC_SUBVOL_CREATE:
                return btrfs_ioctl_snap_create(file, argp, 1, 0);
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 17c99ebdf96..c344d12c646 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,11 +30,15 @@ struct btrfs_ioctl_vol_args {
        char name[BTRFS_PATH_NAME_MAX + 1];
 };
-#define BTRFS_SNAPSHOT_NAME_MAX 4079
+#define BTRFS_SUBVOL_CREATE_ASYNC       (1ULL << 0)
-struct btrfs_ioctl_async_vol_args {
+#define BTRFS_SUBVOL_NAME_MAX 4039
+struct btrfs_ioctl_vol_args_v2 {
        __s64 fd;
        __u64 transid;
-        char name[BTRFS_SNAPSHOT_NAME_MAX + 1];
+        __u64 flags;
+        __u64 unused[4];
+        char name[BTRFS_SUBVOL_NAME_MAX + 1];
 };
 #define BTRFS_INO_LOOKUP_PATH_MAX 4080
@@ -187,6 +191,6 @@ struct btrfs_ioctl_space_args {
                                    struct btrfs_ioctl_space_args)
 #define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
 #define BTRFS_IOC_WAIT_SYNC  _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
-#define BTRFS_IOC_SNAP_CREATE_ASYNC _IOW(BTRFS_IOCTL_MAGIC, 23, \
+#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
-                                   struct btrfs_ioctl_async_vol_args)
+                                   struct btrfs_ioctl_vol_args_v2)
 #endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index f4621f6deca..ae7737e352c 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -250,6 +250,73 @@ int btrfs_add_ordered_sum(struct inode *inode,
 /*
 * this is used to account for finished IO across a given range
+ * of the file.  The IO may span ordered extents.  If
+ * a given ordered_extent is completely done, 1 is returned, otherwise
+ * 0.
+ *
+ * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
+ * to make sure this function only returns 1 once for a given ordered extent.
+ *
+ * file_offset is updated to one byte past the range that is recorded as
+ * complete.  This allows you to walk forward in the file.
+ */
+int btrfs_dec_test_first_ordered_pending(struct inode *inode,
+                                   struct btrfs_ordered_extent **cached,
+                                   u64 *file_offset, u64 io_size)
+{
+        struct btrfs_ordered_inode_tree *tree;
+        struct rb_node *node;
+        struct btrfs_ordered_extent *entry = NULL;
+        int ret;  /* internal sense is inverted: 0 means "this call completed the extent" */
+        u64 dec_end;
+        u64 dec_start;
+        u64 to_dec;
+        tree = &BTRFS_I(inode)->ordered_tree;
+        spin_lock(&tree->lock);  /* held across the lookup, the bytes_left update and the refs bump */
+        node = tree_search(tree, *file_offset);
+        if (!node) {
+                ret = 1;  /* nothing found for *file_offset: caller gets 0 */
+                goto out;
+        }
+        entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+        if (!offset_in_entry(entry, *file_offset)) {
+                ret = 1;  /* entry found does not cover *file_offset: caller gets 0 */
+                goto out;
+        }
+        dec_start = max(*file_offset, entry->file_offset);  /* clamp the IO range to this extent */
+        dec_end = min(*file_offset + io_size, entry->file_offset +
+                      entry->len);
+        *file_offset = dec_end;  /* advance the caller's cursor past the part accounted here */
+        if (dec_start > dec_end) {  /* diagnostic only: execution continues after the printk */
+                printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n",
+                       (unsigned long long)dec_start,
+                       (unsigned long long)dec_end);
+        }
+        to_dec = dec_end - dec_start;
+        if (to_dec > entry->bytes_left) {  /* diagnostic only: the subtraction below still runs */
+                printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
+                       (unsigned long long)entry->bytes_left,
+                       (unsigned long long)to_dec);
+        }
+        entry->bytes_left -= to_dec;
+        if (entry->bytes_left == 0)
+                ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);  /* old bit value: 0 only the first time the extent is seen finished */
+        else
+                ret = 1;  /* extent still has bytes outstanding */
+out:
+        if (!ret && cached && entry) {  /* extent just completed and the caller asked for it */
+                *cached = entry;
+                atomic_inc(&entry->refs);  /* extra reference travels with *cached */
+        }
+        spin_unlock(&tree->lock);
+        return ret == 0;  /* 1 when this call completed the extent, otherwise 0 */
+}
+/*
+ * this is used to account for finished IO across a given range
 * of the file.  The IO should not span ordered extents.  If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 8ac365492a3..61dca83119d 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -141,6 +141,9 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 int btrfs_dec_test_ordered_pending(struct inode *inode,
                                   struct btrfs_ordered_extent **cached,
                                   u64 file_offset, u64 io_size);
+int btrfs_dec_test_first_ordered_pending(struct inode *inode,
+                                   struct btrfs_ordered_extent **cached,
+                                   u64 *file_offset, u64 io_size);
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
                             u64 start, u64 len, u64 disk_len, int type);
 int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 79cba5fbc28..f8be250963a 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -56,8 +56,12 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
                return -ENOMEM;
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-        if (ret)
+        if (ret < 0)
                goto out;
+        if (ret) {
+                ret = -ENOENT;
+                goto out;
+        }
        ret = btrfs_del_item(trans, root, path);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8299a25ffc8..883c6fa1367 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -244,6 +244,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                case Opt_space_cache:
                        printk(KERN_INFO "btrfs: enabling disk space caching\n");
                        btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+                        break;
                case Opt_clear_cache:
                        printk(KERN_INFO "btrfs: force clearing of disk cache\n");
                        btrfs_set_opt(info->mount_opt, CLEAR_CACHE);
@@ -562,12 +563,26 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 static int btrfs_test_super(struct super_block *s, void *data)
 {
-        struct btrfs_fs_devices *test_fs_devices = data;
+        struct btrfs_root *test_root = data;  /* data is now a btrfs_root rather than the fs_devices pointer itself */
        struct btrfs_root *root = btrfs_sb(s);
-        return root->fs_info->fs_devices == test_fs_devices;
+        /*
+         * If this super block is going away, return false as it
+         * can't match as an existing super block.
+         */
+        if (!atomic_read(&s->s_active))
+                return 0;
+        return root->fs_info->fs_devices == test_root->fs_info->fs_devices;  /* nonzero when both roots point at the same fs_devices */
+}
+static int btrfs_set_super(struct super_block *s, void *data)
+{
+        s->s_fs_info = data;  /* store the caller's pointer on the super block; presumably what btrfs_sb() reads back, confirm */
+        return set_anon_super(s, data);  /* hand the rest of the setup to set_anon_super() and return its result */
 }
 /*
 * Find a superblock for the given device / mount point.
 *
@@ -581,6 +596,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
        struct super_block *s;
        struct dentry *root;
        struct btrfs_fs_devices *fs_devices = NULL;
+        struct btrfs_root *tree_root = NULL;
+        struct btrfs_fs_info *fs_info = NULL;
        fmode_t mode = FMODE_READ;
        char *subvol_name = NULL;
        u64 subvol_objectid = 0;
@@ -608,8 +625,24 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
                goto error_close_devices;
        }
+        /*
+         * Setup a dummy root and fs_info for test/set super.  This is because
+         * we don't actually fill this stuff out until open_ctree, but we need
+         * it for searching for existing supers, so this lets us do that and
+         * then open_ctree will properly initialize everything later.
+         */
+        fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
+        tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+        if (!fs_info || !tree_root) {
+                error = -ENOMEM;
+                goto error_close_devices;
+        }
+        fs_info->tree_root = tree_root;
+        fs_info->fs_devices = fs_devices;
+        tree_root->fs_info = fs_info;
        bdev = fs_devices->latest_bdev;
-        s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices);
+        s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root);
        if (IS_ERR(s))
                goto error_s;
@@ -652,9 +685,9 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
                mutex_unlock(&root->d_inode->i_mutex);
                if (IS_ERR(new_root)) {
+                        dput(root);
                        deactivate_locked_super(s);
                        error = PTR_ERR(new_root);
-                        dput(root);
                        goto error_free_subvol_name;
                }
                if (!new_root->d_inode) {
@@ -675,6 +708,8 @@ error_s:
        error = PTR_ERR(s);
 error_close_devices:
        btrfs_close_devices(fs_devices);
+        kfree(fs_info);
+        kfree(tree_root);
 error_free_subvol_name:
        kfree(subvol_name);
        return ERR_PTR(error);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 1fffbc017bd..f50e931fc21 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -902,6 +902,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        struct btrfs_root *root = pending->root;
        struct btrfs_root *parent_root;
        struct inode *parent_inode;
+        struct dentry *parent;
        struct dentry *dentry;
        struct extent_buffer *tmp;
        struct extent_buffer *old;
@@ -941,7 +942,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        trans->block_rsv = &pending->block_rsv;
        dentry = pending->dentry;
-        parent_inode = dentry->d_parent->d_inode;
+        parent = dget_parent(dentry);
+        parent_inode = parent->d_inode;
        parent_root = BTRFS_I(parent_inode)->root;
        record_root_in_trans(trans, parent_root);
@@ -989,6 +991,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                                 parent_inode->i_ino, index,
                                 dentry->d_name.name, dentry->d_name.len);
        BUG_ON(ret);
+        dput(parent);
        key.offset = (u64)-1;
        pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index a29f19384a2..054744ac571 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2869,6 +2869,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
 {
        int ret = 0;
        struct btrfs_root *root;
+        struct dentry *old_parent = NULL;
        /*
         * for regular files, if its inode is already on disk, we don't
@@ -2910,10 +2911,13 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
                if (IS_ROOT(parent))
                        break;
-                parent = parent->d_parent;
+                parent = dget_parent(parent);
+                dput(old_parent);
+                old_parent = parent;
                inode = parent->d_inode;
        }
+        dput(old_parent);
 out:
        return ret;
 }
@@ -2945,6 +2949,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 {
        int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
        struct super_block *sb;
+        struct dentry *old_parent = NULL;
        int ret = 0;
        u64 last_committed = root->fs_info->last_trans_committed;
@@ -3016,10 +3021,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                if (IS_ROOT(parent))
                        break;
-                parent = parent->d_parent;
+                parent = dget_parent(parent);
+                dput(old_parent);
+                old_parent = parent;
        }
        ret = 0;
 end_trans:
+        dput(old_parent);
        if (ret < 0) {
                BUG_ON(ret != -ENOSPC);
                root->fs_info->last_trans_log_full_commit = trans->transid;
@@ -3039,8 +3047,13 @@ end_no_trans:
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, struct dentry *dentry)
 {
-        return btrfs_log_inode_parent(trans, root, dentry->d_inode,
+        struct dentry *parent = dget_parent(dentry);  /* referenced parent, replacing the bare dentry->d_parent read */
-                                      dentry->d_parent, 0);
+        int ret;
+        ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
+        dput(parent);  /* release the reference taken by dget_parent() */
+        return ret;
 }
 /*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index cc04dc1445d..6b988450783 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -412,12 +412,16 @@ static noinline int device_list_add(const char *path,
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
-        } else if (strcmp(device->name, path)) {
+        } else if (!device->name || strcmp(device->name, path)) {
                name = kstrdup(path, GFP_NOFS);
                if (!name)
                        return -ENOMEM;
                kfree(device->name);
                device->name = name;
+                if (device->missing) {
+                        fs_devices->missing_devices--;
+                        device->missing = 0;
+                }
        }
        if (found_transid > fs_devices->latest_trans) {
@@ -1236,6 +1240,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
        device->fs_devices->num_devices--;
+        if (device->missing)
+                root->fs_info->fs_devices->missing_devices--;
        next_device = list_entry(root->fs_info->fs_devices->devices.next,
                                 struct btrfs_device, dev_list);
        if (device->bdev == root->fs_info->sb->s_bdev)
@@ -3080,7 +3087,9 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
        device->devid = devid;
        device->work.func = pending_bios_fn;
        device->fs_devices = fs_devices;
+        device->missing = 1;
        fs_devices->num_devices++;
+        fs_devices->missing_devices++;
        spin_lock_init(&device->io_lock);
        INIT_LIST_HEAD(&device->dev_alloc_list);
        memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
@@ -3278,6 +3287,15 @@ static int read_one_dev(struct btrfs_root *root,
                        device = add_missing_dev(root, devid, dev_uuid);
                        if (!device)
                                return -ENOMEM;
+                } else if (!device->missing) {
+                        /*
+                         * this happens when a device that was properly setup
+                         * in the device info lists suddenly goes bad.
+                         * device->bdev is NULL, and so we have to set
+                         * device->missing to one here
+                         */
+                        root->fs_info->fs_devices->missing_devices++;
+                        device->missing = 1;
                }
        }
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2b638b6e4ee..2740db49eb0 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -44,6 +44,7 @@ struct btrfs_device {
        int writeable;
        int in_fs_metadata;
+        int missing;
        spinlock_t io_lock;
@@ -93,6 +94,7 @@ struct btrfs_fs_devices {
        u64 num_devices;
        u64 open_devices;
        u64 rw_devices;
+        u64 missing_devices;
        u64 total_rw_bytes;
        struct block_device *latest_bdev;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index e9c874abc9e..561438b6a50 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -204,7 +204,7 @@ static int readpage_nounlock(struct file *filp, struct page *page)
        err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
                                  page->index << PAGE_CACHE_SHIFT, &len,
                                  ci->i_truncate_seq, ci->i_truncate_size,
-                                  &page, 1);
+                                  &page, 1, 0);
        if (err == -ENOENT)
                err = 0;
        if (err < 0) {
@@ -287,7 +287,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
        rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
                                 offset, &len,
                                 ci->i_truncate_seq, ci->i_truncate_size,
-                                 pages, nr_pages);
+                                 pages, nr_pages, 0);
        if (rc == -ENOENT)
                rc = 0;
        if (rc < 0)
@@ -774,7 +774,7 @@ get_more_pages:
                                            snapc, do_sync,
                                            ci->i_truncate_seq,
                                            ci->i_truncate_size,
-                                            &inode->i_mtime, true, 1);
+                                            &inode->i_mtime, true, 1, 0);
                                max_pages = req->r_num_pages;
                                alloc_page_vec(fsc, req);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 98ab13e2b71..60d27bc9eb8 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1430,8 +1430,8 @@ static int try_nonblocking_invalidate(struct inode *inode)
            invalidating_gen == ci->i_rdcache_gen) {
                /* success. */
                dout("try_nonblocking_invalidate %p success\n", inode);
-                ci->i_rdcache_gen = 0;
+                /* save any racing async invalidate some trouble */
-                ci->i_rdcache_revoking = 0;
+                ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
                return 0;
        }
        dout("try_nonblocking_invalidate %p failed\n", inode);
@@ -2273,8 +2273,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
        int mds = session->s_mds;
-        unsigned seq = le32_to_cpu(grant->seq);
+        int seq = le32_to_cpu(grant->seq);
-        unsigned issue_seq = le32_to_cpu(grant->issue_seq);
        int newcaps = le32_to_cpu(grant->caps);
        int issued, implemented, used, wanted, dirty;
        u64 size = le64_to_cpu(grant->size);
@@ -2286,8 +2285,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
        int revoked_rdcache = 0;
        int queue_invalidate = 0;
-        dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n",
+        dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
-             inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps));
+             inode, cap, mds, seq, ceph_cap_string(newcaps));
        dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
                inode->i_size);
@@ -2383,7 +2382,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
        }
        cap->seq = seq;
-        cap->issue_seq = issue_seq;
        /* file layout may have changed */
        ci->i_layout = grant->layout;
@@ -2691,6 +2689,11 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
                     NULL /* no caps context */);
        try_flush_caps(inode, session, NULL);
        up_read(&mdsc->snap_rwsem);
+        /* make sure we re-request max_size, if necessary */
+        spin_lock(&inode->i_lock);
+        ci->i_requested_max_size = 0;
+        spin_unlock(&inode->i_lock);
 }
 /*
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index e0a2dc6fcaf..d902948a90d 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -40,7 +40,8 @@ int ceph_init_dentry(struct dentry *dentry)
        if (dentry->d_fsdata)
                return 0;
-        if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
+        if (dentry->d_parent == NULL ||   /* nfs fh_to_dentry */
+            ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
                dentry->d_op = &ceph_dentry_ops;
        else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
                dentry->d_op = &ceph_snapdir_dentry_ops;
@@ -114,8 +115,8 @@ static int __dcache_readdir(struct file *filp,
        spin_lock(&dcache_lock);
        /* start at beginning? */
-        if (filp->f_pos == 2 || (last &&
+        if (filp->f_pos == 2 || last == NULL ||
-                                 filp->f_pos < ceph_dentry(last)->offset)) {
+            filp->f_pos < ceph_dentry(last)->offset) {
                if (list_empty(&parent->d_subdirs))
                        goto out_unlock;
                p = parent->d_subdirs.prev;
@@ -336,7 +337,10 @@ more:
                if (req->r_reply_info.dir_end) {
                        kfree(fi->last_name);
                        fi->last_name = NULL;
-                        fi->next_offset = 2;
+                        if (ceph_frag_is_rightmost(frag))
+                                fi->next_offset = 2;
+                        else
+                                fi->next_offset = 0;
                } else {
                        rinfo = &req->r_reply_info;
                        err = note_last_dentry(fi,
@@ -355,18 +359,22 @@ more:
                u64 pos = ceph_make_fpos(frag, off);
                struct ceph_mds_reply_inode *in =
                        rinfo->dir_in[off - fi->offset].in;
+                struct ceph_vino vino;
+                ino_t ino;
                dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
                     off, off - fi->offset, rinfo->dir_nr, pos,
                     rinfo->dir_dname_len[off - fi->offset],
                     rinfo->dir_dname[off - fi->offset], in);
                BUG_ON(!in);
                ftype = le32_to_cpu(in->mode) >> 12;
+                vino.ino = le64_to_cpu(in->ino);
+                vino.snap = le64_to_cpu(in->snapid);
+                ino = ceph_vino_to_ino(vino);
                if (filldir(dirent,
                            rinfo->dir_dname[off - fi->offset],
                            rinfo->dir_dname_len[off - fi->offset],
-                            pos,
+                            pos, ino, ftype) < 0) {
-                            le64_to_cpu(in->ino),
-                            ftype) < 0) {
                        dout("filldir stopping us...\n");
                        return 0;
                }
@@ -414,6 +422,7 @@ static void reset_readdir(struct ceph_file_info *fi)
                fi->last_readdir = NULL;
        }
        kfree(fi->last_name);
+        fi->last_name = NULL;
        fi->next_offset = 2;  /* compensate for . and .. */
        if (fi->dentry) {
                dput(fi->dentry);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index e77c28cf369..7d0e4a82d89 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -154,11 +154,13 @@ int ceph_open(struct inode *inode, struct file *file)
        }
        /*
-         * No need to block if we have any caps.  Update wanted set
+         * No need to block if we have caps on the auth MDS (for
+         * write) or any MDS (for read).  Update wanted set
         * asynchronously.
         */
        spin_lock(&inode->i_lock);
-        if (__ceph_is_any_real_caps(ci)) {
+        if (__ceph_is_any_real_caps(ci) &&
+            (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
                int mds_wanted = __ceph_caps_mds_wanted(ci);
                int issued = __ceph_caps_issued(ci, NULL);
@@ -280,11 +282,13 @@ int ceph_release(struct inode *inode, struct file *file)
 static int striped_read(struct inode *inode,
                        u64 off, u64 len,
                        struct page **pages, int num_pages,
-                        int *checkeof)
+                        int *checkeof, bool align_to_pages,
+                        unsigned long buf_align)
 {
        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        struct ceph_inode_info *ci = ceph_inode(inode);
        u64 pos, this_len;
+        int io_align, page_align;
        int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
        int left, pages_left;
        int read;
@@ -300,14 +304,19 @@ static int striped_read(struct inode *inode,
        page_pos = pages;
        pages_left = num_pages;
        read = 0;
+        io_align = off & ~PAGE_MASK;
 more:
+        if (align_to_pages)
+                page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
+        else
+                page_align = pos & ~PAGE_MASK;
        this_len = left;
        ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
                                  &ci->i_layout, pos, &this_len,
                                  ci->i_truncate_seq,
                                  ci->i_truncate_size,
-                                  page_pos, pages_left);
+                                  page_pos, pages_left, page_align);
        hit_stripe = this_len < left;
        was_short = ret >= 0 && ret < this_len;
        if (ret == -ENOENT)
@@ -368,32 +377,34 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
        struct inode *inode = file->f_dentry->d_inode;
        struct page **pages;
        u64 off = *poff;
-        int num_pages = calc_pages_for(off, len);
+        int num_pages, ret;
-        int ret;
        dout("sync_read on file %p %llu~%u %s\n", file, off, len,
             (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
        if (file->f_flags & O_DIRECT) {
-                pages = ceph_get_direct_page_vector(data, num_pages, off, len);
+                num_pages = calc_pages_for((unsigned long)data, len);
+                pages = ceph_get_direct_page_vector(data, num_pages, true);
-                /*
-                 * flush any page cache pages in this range.  this
-                 * will make concurrent normal and O_DIRECT io slow,
-                 * but it will at least behave sensibly when they are
-                 * in sequence.
-                 */
        } else {
+                num_pages = calc_pages_for(off, len);
                pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
        }
        if (IS_ERR(pages))
                return PTR_ERR(pages);
+        /*
+         * flush any page cache pages in this range.  this
+         * will make concurrent normal and sync io slow,
+         * but it will at least behave sensibly when they are
+         * in sequence.
+         */
        ret = filemap_write_and_wait(inode->i_mapping);
        if (ret < 0)
                goto done;
-        ret = striped_read(inode, off, len, pages, num_pages, checkeof);
+        ret = striped_read(inode, off, len, pages, num_pages, checkeof,
+                           file->f_flags & O_DIRECT,
+                           (unsigned long)data & ~PAGE_MASK);
        if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
                ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
@@ -402,7 +413,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
 done:
        if (file->f_flags & O_DIRECT)
-                ceph_put_page_vector(pages, num_pages);
+                ceph_put_page_vector(pages, num_pages, true);
        else
                ceph_release_page_vector(pages, num_pages);
        dout("sync_read result %d\n", ret);
@@ -448,6 +459,8 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
        int flags;
        int do_sync = 0;
        int check_caps = 0;
+        int page_align, io_align;
+        unsigned long buf_align;
        int ret;
        struct timespec mtime = CURRENT_TIME;
@@ -462,6 +475,9 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
        else
                pos = *offset;
+        io_align = pos & ~PAGE_MASK;
+        buf_align = (unsigned long)data & ~PAGE_MASK;
        ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
        if (ret < 0)
                return ret;
@@ -486,20 +502,27 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
         */
 more:
        len = left;
+        if (file->f_flags & O_DIRECT) {
+                /* write from beginning of first page, regardless of
+                   io alignment */
+                page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
+                num_pages = calc_pages_for((unsigned long)data, len);
+        } else {
+                page_align = pos & ~PAGE_MASK;
+                num_pages = calc_pages_for(pos, len);
+        }
        req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
                                    ceph_vino(inode), pos, &len,
                                    CEPH_OSD_OP_WRITE, flags,
                                    ci->i_snap_realm->cached_context,
                                    do_sync,
                                    ci->i_truncate_seq, ci->i_truncate_size,
-                                    &mtime, false, 2);
+                                    &mtime, false, 2, page_align);
        if (!req)
                return -ENOMEM;
-        num_pages = calc_pages_for(pos, len);
        if (file->f_flags & O_DIRECT) {
-                pages = ceph_get_direct_page_vector(data, num_pages, pos, len);
+                pages = ceph_get_direct_page_vector(data, num_pages, false);
                if (IS_ERR(pages)) {
                        ret = PTR_ERR(pages);
                        goto out;
@@ -549,7 +572,7 @@ more:
        }
        if (file->f_flags & O_DIRECT)
-                ceph_put_page_vector(pages, num_pages);
+                ceph_put_page_vector(pages, num_pages, false);
        else if (file->f_flags & O_SYNC)
                ceph_release_page_vector(pages, num_pages);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 1d6a45b5a04..bf1286588f2 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2,7 +2,6 @@
 #include <linux/module.h>
 #include <linux/fs.h>
-#include <linux/smp_lock.h>
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/uaccess.h>
@@ -471,7 +470,9 @@ void ceph_fill_file_time(struct inode *inode, int issued,
        if (issued & (CEPH_CAP_FILE_EXCL|
                      CEPH_CAP_FILE_WR|
-                      CEPH_CAP_FILE_BUFFER)) {
+                      CEPH_CAP_FILE_BUFFER|
+                      CEPH_CAP_AUTH_EXCL|
+                      CEPH_CAP_XATTR_EXCL)) {
                if (timespec_compare(ctime, &inode->i_ctime) > 0) {
                        dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
                             inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
@@ -511,7 +512,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
                        warn = 1;
                }
        } else {
-                /* we have no write caps; whatever the MDS says is true */
+                /* we have no write|excl caps; whatever the MDS says is true */
                if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
                        inode->i_ctime = *ctime;
                        inode->i_mtime = *mtime;
@@ -567,12 +568,17 @@ static int fill_inode(struct inode *inode,
        /*
         * provided version will be odd if inode value is projected,
-         * even if stable.  skip the update if we have a newer info
+         * even if stable.  skip the update if we have newer stable
-         * (e.g., due to inode info racing form multiple MDSs), or if
+         * info (ours>=theirs, e.g. due to racing mds replies), unless
-         * we are getting projected (unstable) inode info.
+         * we are getting projected (unstable) info (in which case the
+         * version is odd, and we want ours>theirs).
+         *   us   them
+         *   2    2     skip
+         *   3    2     skip
+         *   3    3     update
         */
        if (le64_to_cpu(info->version) > 0 &&
-            (ci->i_version & ~1) > le64_to_cpu(info->version))
+            (ci->i_version & ~1) >= le64_to_cpu(info->version))
                goto no_change;
        issued = __ceph_caps_issued(ci, &implemented);
@@ -606,7 +612,14 @@ static int fill_inode(struct inode *inode,
                            le32_to_cpu(info->time_warp_seq),
                            &ctime, &mtime, &atime);
-        ci->i_max_size = le64_to_cpu(info->max_size);
+        /* only update max_size on auth cap */
+        if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
+            ci->i_max_size != le64_to_cpu(info->max_size)) {
+                dout("max_size %lld -> %llu\n", ci->i_max_size,
+                     le64_to_cpu(info->max_size));
+                ci->i_max_size = le64_to_cpu(info->max_size);
+        }
        ci->i_layout = info->layout;
        inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
@@ -1055,7 +1068,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
                ininfo = rinfo->targeti.in;
                vino.ino = le64_to_cpu(ininfo->ino);
                vino.snap = le64_to_cpu(ininfo->snapid);
-                if (!dn->d_inode) {
+                in = dn->d_inode;
+                if (!in) {
                        in = ceph_get_inode(sb, vino);
                        if (IS_ERR(in)) {
                                pr_err("fill_trace bad get_inode "
@@ -1386,11 +1400,8 @@ static void ceph_invalidate_work(struct work_struct *work)
        spin_lock(&inode->i_lock);
        dout("invalidate_pages %p gen %d revoking %d\n", inode,
             ci->i_rdcache_gen, ci->i_rdcache_revoking);
-        if (ci->i_rdcache_gen == 0 ||
+        if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
-            ci->i_rdcache_revoking != ci->i_rdcache_gen) {
-                BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
                /* nevermind! */
-                ci->i_rdcache_revoking = 0;
                spin_unlock(&inode->i_lock);
                goto out;
        }
@@ -1400,15 +1411,16 @@ static void ceph_invalidate_work(struct work_struct *work)
        ceph_invalidate_nondirty_pages(inode->i_mapping);
        spin_lock(&inode->i_lock);
-        if (orig_gen == ci->i_rdcache_gen) {
+        if (orig_gen == ci->i_rdcache_gen &&
+            orig_gen == ci->i_rdcache_revoking) {
                dout("invalidate_pages %p gen %d successful\n", inode,
                     ci->i_rdcache_gen);
-                ci->i_rdcache_gen = 0;
+                ci->i_rdcache_revoking--;
-                ci->i_rdcache_revoking = 0;
                check = 1;
        } else {
-                dout("invalidate_pages %p gen %d raced, gen now %d\n",
+                dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
-                     inode, orig_gen, ci->i_rdcache_gen);
+                     inode, orig_gen, ci->i_rdcache_gen,
+                     ci->i_rdcache_revoking);
        }
        spin_unlock(&inode->i_lock);
@@ -1739,7 +1751,7 @@ int ceph_do_getattr(struct inode *inode, int mask)
                return 0;
        }
-        dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask));
+        dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode);
        if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
                return 0;
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index a6ce54e94eb..52e8fd74d45 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -4,7 +4,7 @@
 #include <linux/ioctl.h>
 #include <linux/types.h>
-#define CEPH_IOCTL_MAGIC 0x98
+#define CEPH_IOCTL_MAGIC 0x97
 /* just use u64 to align sanely on all archs */
 struct ceph_ioctl_layout {
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 40abde93c34..476b329867d 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -11,40 +11,68 @@
 * Implement fcntl and flock locking functions.
 */
 static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
-                             u64 pid, u64 pid_ns,
+                             int cmd, u8 wait, struct file_lock *fl)
-                             int cmd, u64 start, u64 length, u8 wait)
 {
        struct inode *inode = file->f_dentry->d_inode;
        struct ceph_mds_client *mdsc =
                ceph_sb_to_client(inode->i_sb)->mdsc;
        struct ceph_mds_request *req;
        int err;
+        u64 length = 0;
        req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
        if (IS_ERR(req))
                return PTR_ERR(req);
        req->r_inode = igrab(inode);
+        /* mds requires start and length rather than start and end */
+        if (LLONG_MAX == fl->fl_end)
+                length = 0;
+        else
+                length = fl->fl_end - fl->fl_start + 1;
        dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
             "length: %llu, wait: %d, type`: %d", (int)lock_type,
-             (int)operation, pid, start, length, wait, cmd);
+             (int)operation, (u64)fl->fl_pid, fl->fl_start,
+             length, wait, fl->fl_type);
        req->r_args.filelock_change.rule = lock_type;
        req->r_args.filelock_change.type = cmd;
-        req->r_args.filelock_change.pid = cpu_to_le64(pid);
+        req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
        /* This should be adjusted, but I'm not sure if
           namespaces actually get id numbers*/
        req->r_args.filelock_change.pid_namespace =
-                cpu_to_le64((u64)pid_ns);
+                cpu_to_le64((u64)(unsigned long)fl->fl_nspid);
-        req->r_args.filelock_change.start = cpu_to_le64(start);
+        req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
        req->r_args.filelock_change.length = cpu_to_le64(length);
        req->r_args.filelock_change.wait = wait;
        err = ceph_mdsc_do_request(mdsc, inode, req);
+        if ( operation == CEPH_MDS_OP_GETFILELOCK){
+                fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid);
+                if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
+                        fl->fl_type = F_RDLCK;
+                else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
+                        fl->fl_type = F_WRLCK;
+                else
+                        fl->fl_type = F_UNLCK;
+                fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
+                length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
+                                                 le64_to_cpu(req->r_reply_info.filelock_reply->length);
+                if (length >= 1)
+                        fl->fl_end = length -1;
+                else
+                        fl->fl_end = 0;
+        }
        ceph_mdsc_put_request(req);
        dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
-             "length: %llu, wait: %d, type`: %d err code %d", (int)lock_type,
+             "length: %llu, wait: %d, type`: %d, err code %d", (int)lock_type,
-             (int)operation, pid, start, length, wait, cmd, err);
+             (int)operation, (u64)fl->fl_pid, fl->fl_start,
+             length, wait, fl->fl_type, err);
        return err;
 }
@@ -54,7 +82,6 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
 */
 int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
 {
-        u64 length;
        u8 lock_cmd;
        int err;
        u8 wait = 0;
@@ -76,29 +103,20 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
        else
                lock_cmd = CEPH_LOCK_UNLOCK;
-        if (LLONG_MAX == fl->fl_end)
+        err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl);
-                length = 0;
-        else
-                length = fl->fl_end - fl->fl_start + 1;
-        err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
-                                (u64)fl->fl_pid,
-                                (u64)(unsigned long)fl->fl_nspid,
-                                lock_cmd, fl->fl_start,
-                                length, wait);
        if (!err) {
-                dout("mds locked, locking locally");
+                if ( op != CEPH_MDS_OP_GETFILELOCK ){
-                err = posix_lock_file(file, fl, NULL);
+                        dout("mds locked, locking locally");
-                if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
+                        err = posix_lock_file(file, fl, NULL);
-                        /* undo! This should only happen if the kernel detects
+                        if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
-                         * local deadlock. */
+                                /* undo! This should only happen if the kernel detects
-                        ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
+                                 * local deadlock. */
-                                          (u64)fl->fl_pid,
+                                ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
-                                          (u64)(unsigned long)fl->fl_nspid,
+                                                  CEPH_LOCK_UNLOCK, 0, fl);
-                                          CEPH_LOCK_UNLOCK, fl->fl_start,
+                                dout("got %d on posix_lock_file, undid lock", err);
-                                          length, 0);
+                        }
-                        dout("got %d on posix_lock_file, undid lock", err);
                }
        } else {
                dout("mds returned error code %d", err);
        }
@@ -107,7 +125,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
 int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
 {
-        u64 length;
        u8 lock_cmd;
        int err;
        u8 wait = 1;
@@ -127,26 +144,15 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
                lock_cmd = CEPH_LOCK_EXCL;
        else
                lock_cmd = CEPH_LOCK_UNLOCK;
-        /* mds requires start and length rather than start and end */
-        if (LLONG_MAX == fl->fl_end)
-                length = 0;
-        else
-                length = fl->fl_end - fl->fl_start + 1;
        err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
-                                file, (u64)fl->fl_pid,
+                                file, lock_cmd, wait, fl);
-                                (u64)(unsigned long)fl->fl_nspid,
-                                lock_cmd, fl->fl_start,
-                                length, wait);
        if (!err) {
                err = flock_lock_file_wait(file, fl);
                if (err) {
                        ceph_lock_message(CEPH_LOCK_FLOCK,
                                          CEPH_MDS_OP_SETFILELOCK,
-                                          file, (u64)fl->fl_pid,
+                                          file, CEPH_LOCK_UNLOCK, 0, fl);
-                                          (u64)(unsigned long)fl->fl_nspid,
-                                          CEPH_LOCK_UNLOCK, fl->fl_start,
-                                          length, 0);
                        dout("got %d on flock_lock_file_wait, undid lock", err);
                }
        } else {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 3142b15940c..38800eaa81d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -6,7 +6,6 @@
 #include <linux/sched.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
-#include <linux/smp_lock.h>
 #include "super.h"
 #include "mds_client.h"
@@ -203,6 +202,38 @@ out_bad:
 }
 /*
+ * parse fcntl F_GETLK results
+ */
+static int parse_reply_info_filelock(void **p, void *end,
+                struct ceph_mds_reply_info_parsed *info)
+{
+        if (*p + sizeof(*info->filelock_reply) > end)
+                goto bad;
+        info->filelock_reply = *p;
+        *p += sizeof(*info->filelock_reply);
+        if (unlikely(*p != end))
+                goto bad;
+        return 0;
+bad:
+        return -EIO;
+}
+/*
+ * parse extra results
+ */
+static int parse_reply_info_extra(void **p, void *end,
+                struct ceph_mds_reply_info_parsed *info)
+{
+        if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
+                return parse_reply_info_filelock(p, end, info);
+        else
+                return parse_reply_info_dir(p, end, info);
+}
+/*
 * parse entire mds reply
 */
 static int parse_reply_info(struct ceph_msg *msg,
@@ -224,10 +255,10 @@ static int parse_reply_info(struct ceph_msg *msg,
                        goto out_bad;
        }
-        /* dir content */
+        /* extra */
        ceph_decode_32_safe(&p, end, len, bad);
        if (len > 0) {
-                err = parse_reply_info_dir(&p, p+len, info);
+                err = parse_reply_info_extra(&p, p+len, info);
                if (err < 0)
                        goto out_bad;
        }
@@ -529,6 +560,9 @@ static void __register_request(struct ceph_mds_client *mdsc,
        ceph_mdsc_get_request(req);
        __insert_request(mdsc, req);
+        req->r_uid = current_fsuid();
+        req->r_gid = current_fsgid();
        if (dir) {
                struct ceph_inode_info *ci = ceph_inode(dir);
@@ -1588,8 +1622,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
        head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
        head->op = cpu_to_le32(req->r_op);
-        head->caller_uid = cpu_to_le32(current_fsuid());
+        head->caller_uid = cpu_to_le32(req->r_uid);
-        head->caller_gid = cpu_to_le32(current_fsgid());
+        head->caller_gid = cpu_to_le32(req->r_gid);
        head->args = req->r_args;
        ceph_encode_filepath(&p, end, ino1, path1);
@@ -2072,7 +2106,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        mutex_lock(&session->s_mutex);
        if (err < 0) {
-                pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds);
+                pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
                ceph_msg_dump(msg);
                goto out_err;
        }
@@ -2092,7 +2126,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        mutex_lock(&req->r_fill_mutex);
        err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
        if (err == 0) {
-                if (result == 0 && rinfo->dir_nr)
+                if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK &&
+                    rinfo->dir_nr)
                        ceph_readdir_prepopulate(req, req->r_session);
                ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
        }
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index d66d63c7235..aabe563b54d 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -42,26 +42,37 @@ struct ceph_mds_reply_info_in {
 };
 /*
- * parsed info about an mds reply, including information about the
+ * parsed info about an mds reply, including information about
- * target inode and/or its parent directory and dentry, and directory
+ * either: 1) the target inode and/or its parent directory and dentry,
- * contents (for readdir results).
+ * and directory contents (for readdir results), or
+ * 2) the file range lock info (for fcntl F_GETLK results).
 */
 struct ceph_mds_reply_info_parsed {
        struct ceph_mds_reply_head    *head;
+        /* trace */
        struct ceph_mds_reply_info_in diri, targeti;
        struct ceph_mds_reply_dirfrag *dirfrag;
        char                          *dname;
        u32                           dname_len;
        struct ceph_mds_reply_lease   *dlease;
-        struct ceph_mds_reply_dirfrag *dir_dir;
+        /* extra */
-        int                           dir_nr;
+        union {
-        char                          **dir_dname;
+                /* for fcntl F_GETLK results */
-        u32                           *dir_dname_len;
+                struct ceph_filelock *filelock_reply;
-        struct ceph_mds_reply_lease   **dir_dlease;
-        struct ceph_mds_reply_info_in *dir_in;
+                /* for readdir results */
-        u8                            dir_complete, dir_end;
+                struct {
+                        struct ceph_mds_reply_dirfrag *dir_dir;
+                        int                           dir_nr;
+                        char                          **dir_dname;
+                        u32                           *dir_dname_len;
+                        struct ceph_mds_reply_lease   **dir_dlease;
+                        struct ceph_mds_reply_info_in *dir_in;
+                        u8                            dir_complete, dir_end;
+                };
+        };
        /* encoded blob describing snapshot contexts for certain
           operations (e.g., open) */
@@ -170,6 +181,8 @@ struct ceph_mds_request {
        union ceph_mds_request_args r_args;
        int r_fmode;        /* file mode, if expecting cap */
+        uid_t r_uid;
+        gid_t r_gid;
        /* for choosing which mds to send this request to */
        int r_direct_mode;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 1886294e12f..7f01728a465 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -293,9 +293,7 @@ struct ceph_inode_info {
        int i_rd_ref, i_rdcache_ref, i_wr_ref;
        int i_wrbuffer_ref, i_wrbuffer_ref_head;
        u32 i_shared_gen;       /* increment each time we get FILE_SHARED */
-        u32 i_rdcache_gen;      /* we increment this each time we get
+        u32 i_rdcache_gen;      /* incremented each time we get FILE_CACHE. */
-                                   FILE_CACHE.  If it's non-zero, we
-                                   _may_ have cached pages. */
        u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
        struct list_head i_unsafe_writes; /* uncommitted sync writes */
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 0ed213970ce..ee45648b0d1 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -4,6 +4,7 @@ config CIFS
        select NLS
        select CRYPTO
        select CRYPTO_MD5
+        select CRYPTO_HMAC
        select CRYPTO_ARC4
        help
          This is the client VFS module for the Common Internet File System
@@ -143,6 +144,13 @@ config CIFS_FSCACHE
            to be cached locally on disk through the general filesystem cache
            manager. If unsure, say N.
+config CIFS_ACL
+          bool "Provide CIFS ACL support (EXPERIMENTAL)"
+          depends on EXPERIMENTAL && CIFS_XATTR
+          help
+            Allows fetching CIFS/NTFS ACLs from the server.  The DACL blob
+            is handed over to the application/caller.
 config CIFS_EXPERIMENTAL
          bool "CIFS Experimental Features (EXPERIMENTAL)"
          depends on CIFS && EXPERIMENTAL
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index adefa60a9bd..43b19dd3919 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -6,7 +6,9 @@ obj-$(CONFIG_CIFS) += cifs.o
 cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
          link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
          md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
-          readdir.o ioctl.o sess.o export.o cifsacl.o
+          readdir.o ioctl.o sess.o export.o
+cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
 cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
diff --git a/fs/cifs/README b/fs/cifs/README
index ee68d103654..46af99ab361 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -337,6 +337,15 @@ A partial list of the supported mount options follows:
  wsize         default write size (default 57344)
                maximum wsize currently allowed by CIFS is 57344 (fourteen
                4096 byte pages)
+  actimeo=n     attribute cache timeout in seconds (default 1 second).
+                After this timeout, the cifs client requests fresh attribute
+                information from the server. This option allows tuning the
+                attribute cache timeout to suit the workload's needs. Shorter
+                timeouts mean better cache coherency, but an increased number
+                of calls to the server. Longer timeouts mean a reduced number
+                of calls to the server at the expense of less strict cache
+                coherency checks (i.e. incorrect attribute cache for a short
+                period of time).
  rw            mount the network share read-write (note that the
                server may still consider the share read-only)
  ro            mount network share read-only
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index e9a393c9c2c..7852cd67705 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -48,6 +48,7 @@ struct cifs_sb_info {
        struct nls_table *local_nls;
        unsigned int rsize;
        unsigned int wsize;
+        unsigned long actimeo; /* attribute cache timeout (jiffies) */
        atomic_t active;
        uid_t   mnt_uid;
        gid_t   mnt_gid;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index c9b4792ae82..a437ec391a0 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -30,8 +30,6 @@
 #include "cifs_debug.h"
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
        {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"},
        {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"},
@@ -560,7 +558,7 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
        struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
        if (IS_ERR(tlink))
-                return NULL;
+                return ERR_CAST(tlink);
        xid = GetXid();
        rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen);
@@ -568,7 +566,9 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
        cifs_put_tlink(tlink);
-        cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
+        cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen);
+        if (rc)
+                return ERR_PTR(rc);
        return pntsd;
 }
@@ -583,7 +583,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
        struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
        if (IS_ERR(tlink))
-                return NULL;
+                return ERR_CAST(tlink);
        tcon = tlink_tcon(tlink);
        xid = GetXid();
@@ -591,23 +591,22 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
        rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0,
                         &fid, &oplock, NULL, cifs_sb->local_nls,
                         cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
-        if (rc) {
+        if (!rc) {
-                cERROR(1, "Unable to open file to get ACL");
+                rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
-                goto out;
+                CIFSSMBClose(xid, tcon, fid);
        }
-        rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
-        cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
-        CIFSSMBClose(xid, tcon, fid);
- out:
        cifs_put_tlink(tlink);
        FreeXid(xid);
+        cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen);
+        if (rc)
+                return ERR_PTR(rc);
        return pntsd;
 }
 /* Retrieve an ACL from the server */
-static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
+struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
                                      struct inode *inode, const char *path,
                                      u32 *pacllen)
 {
@@ -695,7 +694,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 }
 /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
-void
+int
 cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
                  struct inode *inode, const char *path, const __u16 *pfid)
 {
@@ -711,17 +710,21 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
                pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen);
        /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */
-        if (pntsd)
+        if (IS_ERR(pntsd)) {
+                rc = PTR_ERR(pntsd);
+                cERROR(1, "%s: error %d getting sec desc", __func__, rc);
+        } else {
                rc = parse_sec_desc(pntsd, acllen, fattr);
-        if (rc)
+                kfree(pntsd);
-                cFYI(1, "parse sec desc failed rc = %d", rc);
+                if (rc)
+                        cERROR(1, "parse sec desc failed rc = %d", rc);
+        }
-        kfree(pntsd);
+        return rc;
-        return;
 }
 /* Convert mode bits to an ACL so we can update the ACL on the server */
-int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
+int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode)
 {
        int rc = 0;
        __u32 secdesclen = 0;
@@ -736,7 +739,10 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
        /* Add three ACEs for owner, group, everyone getting rid of
           other ACEs as chmod disables ACEs and set the security descriptor */
-        if (pntsd) {
+        if (IS_ERR(pntsd)) {
+                rc = PTR_ERR(pntsd);
+                cERROR(1, "%s: error %d getting sec desc", __func__, rc);
+        } else {
                /* allocate memory for the smb header,
                   set security descriptor request security descriptor
                   parameters, and secuirty descriptor itself */
@@ -766,4 +772,3 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
        return rc;
 }
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index 6c8096cf515..c4ae7d03656 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -74,11 +74,7 @@ struct cifs_wksid {
        char sidname[SIDNAMELENGTH];
 } __attribute__((packed));
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 extern int match_sid(struct cifs_sid *);
 extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *);
-#endif /*  CONFIG_CIFS_EXPERIMENTAL */
 #endif /* _CIFSACL_H */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 9c3789762ab..3936aa7f2c2 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -458,9 +458,13 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
                seq_printf(s, ",acl");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
                seq_printf(s, ",mfsymlinks");
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE)
+                seq_printf(s, ",fsc");
        seq_printf(s, ",rsize=%d", cifs_sb->rsize);
        seq_printf(s, ",wsize=%d", cifs_sb->wsize);
+        /* convert actimeo and display it in seconds */
+                seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ);
        return 0;
 }
@@ -933,7 +937,6 @@ init_cifs(void)
        GlobalCurrentXid = 0;
        GlobalTotalActiveXid = 0;
        GlobalMaxActiveXid = 0;
-        memset(Local_System_Name, 0, 15);
        spin_lock_init(&cifs_tcp_ses_lock);
        spin_lock_init(&cifs_file_list_lock);
        spin_lock_init(&GlobalMid_Lock);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index b577bf0a1bb..7136c0c3e2f 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -45,6 +45,16 @@
 #define CIFS_MIN_RCV_POOL 4
 /*
+ * default attribute cache timeout (jiffies)
+ */
+#define CIFS_DEF_ACTIMEO (1 * HZ)
+/*
+ * max attribute cache timeout (jiffies) - 2^30
+ */
+#define CIFS_MAX_ACTIMEO (1 << 30)
+/*
 * MAX_REQ is the maximum number of requests that WE will send
 * on one socket concurrently. It also matches the most common
 * value of max multiplex returned by servers.  We may
@@ -746,8 +756,6 @@ GLOBAL_EXTERN unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */
 GLOBAL_EXTERN unsigned int GlobalMaxActiveXid;  /* prot by GlobalMid_Sem */
 GLOBAL_EXTERN spinlock_t GlobalMid_Lock;  /* protects above & list operations */
                                          /* on midQ entries */
-GLOBAL_EXTERN char Local_System_Name[15];
 /*
 *  Global counters, updated atomically
 */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 7ed69b6b5fe..e6d1481b16c 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -54,7 +54,8 @@ do {								\
             __func__, curr_xid, (int)rc);                      \
 } while (0)
 extern char *build_path_from_dentry(struct dentry *);
-extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb);
+extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
+                                        struct cifsTconInfo *tcon);
 extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
 extern char *cifs_compose_mount_options(const char *sb_mountdata,
                const char *fullpath, const struct dfs_info3_param *ref,
@@ -79,9 +80,7 @@ extern bool is_valid_oplock_break(struct smb_hdr *smb,
                                  struct TCP_Server_Info *);
 extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
 extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
-#endif
 extern unsigned int smbCalcSize(struct smb_hdr *ptr);
 extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
 extern int decode_negTokenInit(unsigned char *security_blob, int length,
@@ -130,10 +129,12 @@ extern int cifs_get_file_info_unix(struct file *filp);
 extern int cifs_get_inode_info_unix(struct inode **pinode,
                        const unsigned char *search_path,
                        struct super_block *sb, int xid);
-extern void cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
+extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
                              struct cifs_fattr *fattr, struct inode *inode,
                              const char *path, const __u16 *pfid);
-extern int mode_to_acl(struct inode *inode, const char *path, __u64);
+extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64);
+extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
+                                        const char *, u32 *);
 extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
                        const char *);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 2f2632b6df5..67acfb3acad 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -2478,95 +2478,6 @@ querySymLinkRetry:
 }
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-/* Initialize NT TRANSACT SMB into small smb request buffer.
-   This assumes that all NT TRANSACTS that we init here have
-   total parm and data under about 400 bytes (to fit in small cifs
-   buffer size), which is the case so far, it easily fits. NB:
-        Setup words themselves and ByteCount
-        MaxSetupCount (size of returned setup area) and
-        MaxParameterCount (returned parms size) must be set by caller */
-static int
-smb_init_nttransact(const __u16 sub_command, const int setup_count,
-                   const int parm_len, struct cifsTconInfo *tcon,
-                   void **ret_buf)
-{
-        int rc;
-        __u32 temp_offset;
-        struct smb_com_ntransact_req *pSMB;
-        rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon,
-                                (void **)&pSMB);
-        if (rc)
-                return rc;
-        *ret_buf = (void *)pSMB;
-        pSMB->Reserved = 0;
-        pSMB->TotalParameterCount = cpu_to_le32(parm_len);
-        pSMB->TotalDataCount  = 0;
-        pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
-                                          MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
-        pSMB->ParameterCount = pSMB->TotalParameterCount;
-        pSMB->DataCount  = pSMB->TotalDataCount;
-        temp_offset = offsetof(struct smb_com_ntransact_req, Parms) +
-                        (setup_count * 2) - 4 /* for rfc1001 length itself */;
-        pSMB->ParameterOffset = cpu_to_le32(temp_offset);
-        pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len);
-        pSMB->SetupCount = setup_count; /* no need to le convert byte fields */
-        pSMB->SubCommand = cpu_to_le16(sub_command);
-        return 0;
-}
-static int
-validate_ntransact(char *buf, char **ppparm, char **ppdata,
-                   __u32 *pparmlen, __u32 *pdatalen)
-{
-        char *end_of_smb;
-        __u32 data_count, data_offset, parm_count, parm_offset;
-        struct smb_com_ntransact_rsp *pSMBr;
-        *pdatalen = 0;
-        *pparmlen = 0;
-        if (buf == NULL)
-                return -EINVAL;
-        pSMBr = (struct smb_com_ntransact_rsp *)buf;
-        /* ByteCount was converted from little endian in SendReceive */
-        end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount +
-                        (char *)&pSMBr->ByteCount;
-        data_offset = le32_to_cpu(pSMBr->DataOffset);
-        data_count = le32_to_cpu(pSMBr->DataCount);
-        parm_offset = le32_to_cpu(pSMBr->ParameterOffset);
-        parm_count = le32_to_cpu(pSMBr->ParameterCount);
-        *ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset;
-        *ppdata = (char *)&pSMBr->hdr.Protocol + data_offset;
-        /* should we also check that parm and data areas do not overlap? */
-        if (*ppparm > end_of_smb) {
-                cFYI(1, "parms start after end of smb");
-                return -EINVAL;
-        } else if (parm_count + *ppparm > end_of_smb) {
-                cFYI(1, "parm end after end of smb");
-                return -EINVAL;
-        } else if (*ppdata > end_of_smb) {
-                cFYI(1, "data starts after end of smb");
-                return -EINVAL;
-        } else if (data_count + *ppdata > end_of_smb) {
-                cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
-                        *ppdata, data_count, (data_count + *ppdata),
-                        end_of_smb, pSMBr);
-                return -EINVAL;
-        } else if (parm_count + data_count > pSMBr->ByteCount) {
-                cFYI(1, "parm count and data count larger than SMB");
-                return -EINVAL;
-        }
-        *pdatalen = data_count;
-        *pparmlen = parm_count;
-        return 0;
-}
 int
 CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
                        const unsigned char *searchName,
@@ -3056,7 +2967,97 @@ GetExtAttrOut:
 #endif /* CONFIG_POSIX */
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CONFIG_CIFS_ACL
+/*
+ * Build the fixed part of an NT TRANSACT request in a small SMB buffer.
+ *
+ * Requests set up here are expected to stay small (total parameters plus
+ * data under about 400 bytes) so that they fit the small cifs buffer size;
+ * that holds for every NT TRANSACT initialized this way so far.  On success
+ * *ret_buf points at the request.  The caller must still fill in the setup
+ * words themselves, ByteCount, MaxSetupCount (size of returned setup area)
+ * and MaxParameterCount (returned parms size).
+ *
+ * Returns 0 on success or the error reported by small_smb_init().
+ */
+static int
+smb_init_nttransact(const __u16 sub_command, const int setup_count,
+                   const int parm_len, struct cifsTconInfo *tcon,
+                   void **ret_buf)
+{
+        struct smb_com_ntransact_req *req;
+        __u32 parm_off;
+        int err;
+        err = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon,
+                             (void **)&req);
+        if (err != 0)
+                return err;
+        *ret_buf = (void *)req;
+        /* parameters follow the setup words; minus 4 for rfc1001 length */
+        parm_off = offsetof(struct smb_com_ntransact_req, Parms) +
+                        (setup_count * 2) - 4;
+        req->Reserved = 0;
+        /* byte-sized field, no little endian conversion needed */
+        req->SetupCount = setup_count;
+        req->SubCommand = cpu_to_le16(sub_command);
+        /* everything is sent in this one request: counts equal totals */
+        req->TotalParameterCount = cpu_to_le32(parm_len);
+        req->ParameterCount = req->TotalParameterCount;
+        req->TotalDataCount = 0;
+        req->DataCount = req->TotalDataCount;
+        req->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
+                                         MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
+        req->ParameterOffset = cpu_to_le32(parm_off);
+        req->DataOffset = cpu_to_le32(parm_off + parm_len);
+        return 0;
+}
+/*
+ * Sanity check the parameter and data areas of an NT TRANSACT response.
+ *
+ * @buf:      response, interpreted as struct smb_com_ntransact_rsp
+ * @ppparm:   set to the start of the parameter area (untouched if @buf is NULL)
+ * @ppdata:   set to the start of the data area (untouched if @buf is NULL)
+ * @pparmlen: set to ParameterCount on success, 0 otherwise
+ * @pdatalen: set to DataCount on success, 0 otherwise
+ *
+ * The offsets and counts are taken from the response itself, so they are
+ * compared as plain lengths against the size of the SMB instead of being
+ * added to pointers (or to each other) first: a huge value must not be able
+ * to wrap the sum around and slip past the checks.
+ *
+ * Returns 0 when both areas lie inside the SMB, -EINVAL when @buf is NULL or
+ * any of the checks fails.
+ */
+static int
+validate_ntransact(char *buf, char **ppparm, char **ppdata,
+                   __u32 *pparmlen, __u32 *pdatalen)
+{
+        char *start_of_smb;
+        char *end_of_smb;
+        unsigned long smb_len;
+        __u32 data_count, data_offset, parm_count, parm_offset;
+        struct smb_com_ntransact_rsp *pSMBr;
+        *pdatalen = 0;
+        *pparmlen = 0;
+        if (buf == NULL)
+                return -EINVAL;
+        pSMBr = (struct smb_com_ntransact_rsp *)buf;
+        /* parm and data offsets are applied from the Protocol field */
+        start_of_smb = (char *)&pSMBr->hdr.Protocol;
+        /* ByteCount was converted from little endian in SendReceive */
+        end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount +
+                        (char *)&pSMBr->ByteCount;
+        smb_len = end_of_smb - start_of_smb;
+        data_offset = le32_to_cpu(pSMBr->DataOffset);
+        data_count = le32_to_cpu(pSMBr->DataCount);
+        parm_offset = le32_to_cpu(pSMBr->ParameterOffset);
+        parm_count = le32_to_cpu(pSMBr->ParameterCount);
+        *ppparm = start_of_smb + parm_offset;
+        *ppdata = start_of_smb + data_offset;
+        /* should we also check that parm and data areas do not overlap? */
+        if (parm_offset > smb_len) {
+                cFYI(1, "parms start after end of smb");
+                return -EINVAL;
+        } else if (parm_count > smb_len - parm_offset) {
+                /* parm_offset <= smb_len here, the subtraction cannot wrap */
+                cFYI(1, "parm end after end of smb");
+                return -EINVAL;
+        } else if (data_offset > smb_len) {
+                cFYI(1, "data starts after end of smb");
+                return -EINVAL;
+        } else if (data_count > smb_len - data_offset) {
+                /* data_offset <= smb_len here, the subtraction cannot wrap */
+                cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
+                        *ppdata, data_count, (data_count + *ppdata),
+                        end_of_smb, pSMBr);
+                return -EINVAL;
+        } else if (parm_count > pSMBr->ByteCount ||
+                   data_count > pSMBr->ByteCount - parm_count) {
+                /* same test as parm_count + data_count > ByteCount, no wrap */
+                cFYI(1, "parm count and data count larger than SMB");
+                return -EINVAL;
+        }
+        *pdatalen = data_count;
+        *pparmlen = parm_count;
+        return 0;
+}
 /* Get Security Descriptor (by handle) from remote server for a file or dir */
 int
 CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
@@ -3214,7 +3215,7 @@ setCifsAclRetry:
        return (rc);
 }
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
+#endif /* CONFIG_CIFS_ACL */
 /* Legacy Query Path Information call for lookup to old servers such
   as Win9x/WinME */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 251a17c0354..cc1a8604a79 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -105,6 +105,7 @@ struct smb_vol {
        unsigned int wsize;
        bool sockopt_tcp_nodelay:1;
        unsigned short int port;
+        unsigned long actimeo; /* attribute cache timeout (jiffies) */
        char *prepath;
        struct sockaddr_storage srcaddr; /* allow binding to a local IP */
        struct nls_table *local_nls;
@@ -806,23 +807,20 @@ cifs_parse_mount_options(char *options, const char *devname,
        short int override_gid = -1;
        bool uid_specified = false;
        bool gid_specified = false;
+        char *nodename = utsname()->nodename;
        separator[0] = ',';
        separator[1] = 0;
-        if (Local_System_Name[0] != 0)
+        /*
-                memcpy(vol->source_rfc1001_name, Local_System_Name, 15);
+         * does not have to be perfect mapping since field is
-        else {
+         * informational, only used for servers that do not support
-                char *nodename = utsname()->nodename;
+         * port 445 and it can be overridden at mount time
-                int n = strnlen(nodename, 15);
+         */
-                memset(vol->source_rfc1001_name, 0x20, 15);
+        memset(vol->source_rfc1001_name, 0x20, 15);
-                for (i = 0; i < n; i++) {
+        for (i = 0; i < strnlen(nodename, 15); i++)
-                        /* does not have to be perfect mapping since field is
+                vol->source_rfc1001_name[i] = toupper(nodename[i]);
-                        informational, only used for servers that do not support
-                        port 445 and it can be overridden at mount time */
-                        vol->source_rfc1001_name[i] = toupper(nodename[i]);
-                }
-        }
        vol->source_rfc1001_name[15] = 0;
        /* null target name indicates to use *SMBSERVR default called name
           if we end up sending RFC1001 session initialize */
@@ -840,6 +838,8 @@ cifs_parse_mount_options(char *options, const char *devname,
        /* default to using server inode numbers where available */
        vol->server_ino = 1;
+        vol->actimeo = CIFS_DEF_ACTIMEO;
        if (!options)
                return 1;
@@ -1214,6 +1214,16 @@ cifs_parse_mount_options(char *options, const char *devname,
                                        printk(KERN_WARNING "CIFS: server net"
                                        "biosname longer than 15 truncated.\n");
                        }
+                } else if (strnicmp(data, "actimeo", 7) == 0) {
+                        if (value && *value) {
+                                vol->actimeo = HZ * simple_strtoul(value,
+                                                                   &value, 0);
+                                if (vol->actimeo > CIFS_MAX_ACTIMEO) {
+                                        cERROR(1, "CIFS: attribute cache"
+                                                        "timeout too large");
+                                        return 1;
+                                }
+                        }
                } else if (strnicmp(data, "credentials", 4) == 0) {
                        /* ignore */
                } else if (strnicmp(data, "version", 3) == 0) {
@@ -1352,6 +1362,11 @@ cifs_parse_mount_options(char *options, const char *devname,
                                "supported. Instead set "
                                "/proc/fs/cifs/LookupCacheEnabled to 0\n");
                } else if (strnicmp(data, "fsc", 3) == 0) {
+#ifndef CONFIG_CIFS_FSCACHE
+                        cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE"
+                                  "kernel config option set");
+                        return 1;
+#endif
                        vol->fsc = true;
                } else if (strnicmp(data, "mfsymlinks", 10) == 0) {
                        vol->mfsymlinks = true;
@@ -2566,6 +2581,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
        cFYI(1, "file mode: 0x%x  dir mode: 0x%x",
                cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
+        cifs_sb->actimeo = pvolume_info->actimeo;
        if (pvolume_info->noperm)
                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
        if (pvolume_info->setuids)
@@ -2816,13 +2833,13 @@ remote_path_check:
        /* check if a whole path (including prepath) is not remote */
        if (!rc && cifs_sb->prepathlen && tcon) {
                /* build_path_to_root works only when we have a valid tcon */
-                full_path = cifs_build_path_to_root(cifs_sb);
+                full_path = cifs_build_path_to_root(cifs_sb, tcon);
                if (full_path == NULL) {
                        rc = -ENOMEM;
                        goto mount_fail_check;
                }
                rc = is_path_accessible(xid, tcon, cifs_sb, full_path);
-                if (rc != -EREMOTE) {
+                if (rc != 0 && rc != -EREMOTE) {
                        kfree(full_path);
                        goto mount_fail_check;
                }
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 0eb87026cad..548f06230a6 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -66,7 +66,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
        /* Search for server name delimiter */
        sep = memchr(hostname, '\\', len);
        if (sep)
-                len = sep - unc;
+                len = sep - hostname;
        else
                cFYI(1, "%s: probably server name is whole unc: %s",
                     __func__, unc);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 06c3e83fa38..5a28660ca2b 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1108,7 +1108,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
        return total_written;
 }
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
                                        bool fsuid_only)
 {
@@ -1142,7 +1141,6 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
        spin_unlock(&cifs_file_list_lock);
        return NULL;
 }
-#endif
 struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
                                        bool fsuid_only)
@@ -2271,8 +2269,10 @@ void cifs_oplock_break_get(struct cifsFileInfo *cfile)
 void cifs_oplock_break_put(struct cifsFileInfo *cfile)
 {
+        struct super_block *sb = cfile->dentry->d_sb;
        cifsFileInfo_put(cfile);
-        cifs_sb_deactive(cfile->dentry->d_sb);
+        cifs_sb_deactive(sb);
 }
 const struct address_space_operations cifs_addr_ops = {
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index a2ad94efcfe..297a43d0ff7 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -2,7 +2,7 @@
 *   fs/cifs/fscache.c - CIFS filesystem cache interface
 *
 *   Copyright (c) 2010 Novell, Inc.
- *   Author(s): Suresh Jayaraman (sjayaraman@suse.de>
+ *   Author(s): Suresh Jayaraman <sjayaraman@suse.de>
 *
 *   This library is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU Lesser General Public License as published
@@ -67,10 +67,12 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode)
        if (cifsi->fscache)
                return;
-        cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) {
+                cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
                                &cifs_fscache_inode_object_def, cifsi);
-        cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache,
+                cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache,
                                cifsi->fscache);
+        }
 }
 void cifs_fscache_release_inode_cookie(struct inode *inode)
@@ -101,10 +103,8 @@ void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
 {
        if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
                cifs_fscache_disable_inode_cookie(inode);
-        else {
+        else
                cifs_fscache_enable_inode_cookie(inode);
-                cFYI(1, "CIFS: fscache inode cookie set");
-        }
 }
 void cifs_fscache_reset_inode_cookie(struct inode *inode)
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index ef3a55bf86b..589f3e3f6e0 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -686,13 +686,18 @@ int cifs_get_inode_info(struct inode **pinode,
                        cFYI(1, "cifs_sfu_type failed: %d", tmprc);
        }
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CONFIG_CIFS_ACL
        /* fill in 0777 bits from ACL */
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
-                cFYI(1, "Getting mode bits from ACL");
+                rc = cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path,
-                cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid);
+                                                pfid);
+                if (rc) {
+                        cFYI(1, "%s: Getting ACL failed with error: %d",
+                                __func__, rc);
+                        goto cgii_exit;
+                }
        }
-#endif
+#endif /* CONFIG_CIFS_ACL */
        /* fill in remaining high mode bits e.g. SUID, VTX */
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
@@ -723,12 +728,12 @@ static const struct inode_operations cifs_ipc_inode_ops = {
        .lookup = cifs_lookup,
 };
-char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
+char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
+                                struct cifsTconInfo *tcon)
 {
        int pplen = cifs_sb->prepathlen;
        int dfsplen;
        char *full_path = NULL;
-        struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
        /* if no prefix path, simply set path to the root of share to "" */
        if (pplen == 0) {
@@ -870,7 +875,7 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
        char *full_path;
        struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
-        full_path = cifs_build_path_to_root(cifs_sb);
+        full_path = cifs_build_path_to_root(cifs_sb, tcon);
        if (full_path == NULL)
                return ERR_PTR(-ENOMEM);
@@ -881,8 +886,10 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
                rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
                                                xid, NULL);
-        if (!inode)
+        if (!inode) {
-                return ERR_PTR(rc);
+                inode = ERR_PTR(rc);
+                goto out;
+        }
 #ifdef CONFIG_CIFS_FSCACHE
        /* populate tcon->resource_id */
@@ -898,13 +905,11 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
                inode->i_uid = cifs_sb->mnt_uid;
                inode->i_gid = cifs_sb->mnt_gid;
        } else if (rc) {
-                kfree(full_path);
-                _FreeXid(xid);
                iget_failed(inode);
-                return ERR_PTR(rc);
+                inode = ERR_PTR(rc);
        }
+out:
        kfree(full_path);
        /* can not call macro FreeXid here since in a void func
         * TODO: This is no longer true
@@ -1648,6 +1653,7 @@ static bool
 cifs_inode_needs_reval(struct inode *inode)
 {
        struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
        if (cifs_i->clientCanCacheRead)
                return false;
@@ -1658,19 +1664,21 @@ cifs_inode_needs_reval(struct inode *inode)
        if (cifs_i->time == 0)
                return true;
-        /* FIXME: the actimeo should be tunable */
+        if (!time_in_range(jiffies, cifs_i->time,
-        if (time_after_eq(jiffies, cifs_i->time + HZ))
+                                cifs_i->time + cifs_sb->actimeo))
                return true;
        /* hardlinked files w/ noserverino get "special" treatment */
-        if (!(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
+        if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
            S_ISREG(inode->i_mode) && inode->i_nlink != 1)
                return true;
        return false;
 }
-/* check invalid_mapping flag and zap the cache if it's set */
+/*
+ * Zap the cache. Called when invalid_mapping flag is set.
+ */
 static void
 cifs_invalidate_mapping(struct inode *inode)
 {
@@ -2114,11 +2122,16 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
        if (attrs->ia_valid & ATTR_MODE) {
                rc = 0;
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CONFIG_CIFS_ACL
-                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
+                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
-                        rc = mode_to_acl(inode, full_path, mode);
+                        rc = mode_to_cifs_acl(inode, full_path, mode);
-                else
+                        if (rc) {
-#endif
+                                cFYI(1, "%s: Setting ACL failed with error: %d",
+                                        __func__, rc);
+                                goto cifs_setattr_exit;
+                        }
+                } else
+#endif /* CONFIG_CIFS_ACL */
                if (((mode & S_IWUGO) == 0) &&
                    (cifsInode->cifsAttrs & ATTR_READONLY) == 0) {
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index ef7bb7b50f5..a73eb9f4bda 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -226,26 +226,29 @@ static int initiate_cifs_search(const int xid, struct file *file)
        char *full_path = NULL;
        struct cifsFileInfo *cifsFile;
        struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-        struct tcon_link *tlink;
+        struct tcon_link *tlink = NULL;
        struct cifsTconInfo *pTcon;
-        tlink = cifs_sb_tlink(cifs_sb);
-        if (IS_ERR(tlink))
-                return PTR_ERR(tlink);
-        pTcon = tlink_tcon(tlink);
-        if (file->private_data == NULL)
-                file->private_data =
-                        kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
        if (file->private_data == NULL) {
-                rc = -ENOMEM;
+                tlink = cifs_sb_tlink(cifs_sb);
-                goto error_exit;
+                if (IS_ERR(tlink))
+                        return PTR_ERR(tlink);
+                cifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+                if (cifsFile == NULL) {
+                        rc = -ENOMEM;
+                        goto error_exit;
+                }
+                file->private_data = cifsFile;
+                cifsFile->tlink = cifs_get_tlink(tlink);
+                pTcon = tlink_tcon(tlink);
+        } else {
+                cifsFile = file->private_data;
+                pTcon = tlink_tcon(cifsFile->tlink);
        }
-        cifsFile = file->private_data;
        cifsFile->invalidHandle = true;
        cifsFile->srch_inf.endOfSearch = false;
-        cifsFile->tlink = cifs_get_tlink(tlink);
        full_path = build_path_from_dentry(file->f_path.dentry);
        if (full_path == NULL) {
@@ -756,18 +759,6 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
        rc = filldir(direntry, qstring.name, qstring.len, file->f_pos,
                     ino, fattr.cf_dtype);
-        /*
-         * we can not return filldir errors to the caller since they are
-         * "normal" when the stat blocksize is too small - we return remapped
-         * error instead
-         *
-         * FIXME: This looks bogus. filldir returns -EOVERFLOW in the above
-         * case already. Why should we be clobbering other errors from it?
-         */
-        if (rc) {
-                cFYI(1, "filldir rc = %d", rc);
-                rc = -EOVERFLOW;
-        }
        dput(tmp_dentry);
        return rc;
 }
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index a264b744bb4..eae2a149160 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -30,10 +30,11 @@
 #define MAX_EA_VALUE_SIZE 65535
 #define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib"
+#define CIFS_XATTR_CIFS_ACL "system.cifs_acl"
 #define CIFS_XATTR_USER_PREFIX "user."
 #define CIFS_XATTR_SYSTEM_PREFIX "system."
 #define CIFS_XATTR_OS2_PREFIX "os2."
-#define CIFS_XATTR_SECURITY_PREFIX ".security"
+#define CIFS_XATTR_SECURITY_PREFIX "security."
 #define CIFS_XATTR_TRUSTED_PREFIX "trusted."
 #define XATTR_TRUSTED_PREFIX_LEN  8
 #define XATTR_SECURITY_PREFIX_LEN 9
@@ -277,29 +278,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
                                cifs_sb->local_nls,
                                cifs_sb->mnt_cifs_flags &
                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-                else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
-                        __u16 fid;
-                        int oplock = 0;
-                        struct cifs_ntsd *pacl = NULL;
-                        __u32 buflen = 0;
-                        if (experimEnabled)
-                                rc = CIFSSMBOpen(xid, pTcon, full_path,
-                                        FILE_OPEN, GENERIC_READ, 0, &fid,
-                                        &oplock, NULL, cifs_sb->local_nls,
-                                        cifs_sb->mnt_cifs_flags &
-                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
-                        /* else rc is EOPNOTSUPP from above */
-                        if (rc == 0) {
-                                rc = CIFSSMBGetCIFSACL(xid, pTcon, fid, &pacl,
-                                                      &buflen);
-                                CIFSSMBClose(xid, pTcon, fid);
-                        }
-                }
-#endif /* EXPERIMENTAL */
 #else
-                cFYI(1, "query POSIX ACL not supported yet");
+                cFYI(1, "Query POSIX ACL not supported yet");
 #endif /* CONFIG_CIFS_POSIX */
        } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
                          strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -311,8 +291,33 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
                                cifs_sb->mnt_cifs_flags &
                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
 #else
-                cFYI(1, "query POSIX default ACL not supported yet");
+                cFYI(1, "Query POSIX default ACL not supported yet");
-#endif
+#endif /* CONFIG_CIFS_POSIX */
+        } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
+                                strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
+#ifdef CONFIG_CIFS_ACL
+                        u32 acllen;
+                        struct cifs_ntsd *pacl;
+                        pacl = get_cifs_acl(cifs_sb, direntry->d_inode,
+                                                full_path, &acllen);
+                        if (IS_ERR(pacl)) {
+                                rc = PTR_ERR(pacl);
+                                cERROR(1, "%s: error %zd getting sec desc",
+                                                __func__, rc);
+                        } else {
+                                if (ea_value) {
+                                        if (acllen > buf_size)
+                                                acllen = -ERANGE;
+                                        else
+                                                memcpy(ea_value, pacl, acllen);
+                                }
+                                rc = acllen;
+                                kfree(pacl);
+                        }
+#else
+                cFYI(1, "Query CIFS ACL not supported yet");
+#endif /* CONFIG_CIFS_ACL */
        } else if (strncmp(ea_name,
                  CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
                cFYI(1, "Trusted xattr namespace not supported yet");
diff --git a/fs/compat.c b/fs/compat.c
index c580c322fa6..eb1740ac8c0 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1350,6 +1350,10 @@ static int compat_count(compat_uptr_t __user *argv, int max)
                        argv++;
                        if (i++ >= max)
                                return -E2BIG;
+                        if (fatal_signal_pending(current))
+                                return -ERESTARTNOHAND;
+                        cond_resched();
                }
        }
        return i;
@@ -1391,6 +1395,12 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
                while (len > 0) {
                        int offset, bytes_to_copy;
+                        if (fatal_signal_pending(current)) {
+                                ret = -ERESTARTNOHAND;
+                                goto out;
+                        }
+                        cond_resched();
                        offset = pos % PAGE_SIZE;
                        if (offset == 0)
                                offset = PAGE_SIZE;
@@ -1407,18 +1417,8 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
                        if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
                                struct page *page;
-#ifdef CONFIG_STACK_GROWSUP
+                                page = get_arg_page(bprm, pos, 1);
-                                ret = expand_stack_downwards(bprm->vma, pos);
+                                if (!page) {
-                                if (ret < 0) {
-                                        /* We've exceed the stack rlimit. */
-                                        ret = -E2BIG;
-                                        goto out;
-                                }
-#endif
-                                ret = get_user_pages(current, bprm->mm, pos,
-                                                     1, 1, 1, &page, NULL);
-                                if (ret <= 0) {
-                                        /* We've exceed the stack rlimit. */
                                        ret = -E2BIG;
                                        goto out;
                                }
@@ -1539,8 +1539,10 @@ int compat_do_execve(char * filename,
        return retval;
 out:
-        if (bprm->mm)
+        if (bprm->mm) {
+                acct_arg_size(bprm, 0);
                mmput(bprm->mm);
+        }
 out_file:
        if (bprm->file) {
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 410ed188faa..a60579b007b 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -19,7 +19,6 @@
 #include <linux/compiler.h>
 #include <linux/sched.h>
 #include <linux/smp.h>
-#include <linux/smp_lock.h>
 #include <linux/ioctl.h>
 #include <linux/if.h>
 #include <linux/if_bridge.h>
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 253732382d3..2720178b771 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -28,7 +28,6 @@
 #include <linux/key.h>
 #include <linux/slab.h>
 #include <linux/seq_file.h>
-#include <linux/smp_lock.h>
 #include <linux/file.h>
 #include <linux/crypto.h>
 #include "ecryptfs_kernel.h"
diff --git a/fs/exec.c b/fs/exec.c
index 99d33a1371e..c62efcb959c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -164,7 +164,26 @@ out:
 #ifdef CONFIG_MMU
-static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+/*
+ * Bring the accounted size for @bprm to @pages pages: the difference from
+ * the value stored in bprm->vma_pages is added to the MM_ANONPAGES counter
+ * of current->mm, and @pages becomes the new stored value.  Does nothing
+ * when current has no mm or when the page count has not changed.
+ */
+void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+{
+        struct mm_struct *cur_mm = current->mm;
+        long delta;
+        if (cur_mm == NULL)
+                return;
+        delta = (long)(pages - bprm->vma_pages);
+        if (delta == 0)
+                return;
+        bprm->vma_pages = pages;
+#ifndef SPLIT_RSS_COUNTING
+        /* without SPLIT_RSS_COUNTING the counter is updated under the lock */
+        spin_lock(&cur_mm->page_table_lock);
+#endif
+        add_mm_counter(cur_mm, MM_ANONPAGES, delta);
+#ifndef SPLIT_RSS_COUNTING
+        spin_unlock(&cur_mm->page_table_lock);
+#endif
+}
+struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
                int write)
 {
        struct page *page;
@@ -186,6 +205,8 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
                unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
                struct rlimit *rlim;
+                acct_arg_size(bprm, size / PAGE_SIZE);
                /*
                 * We've historically supported up to 32 pages (ARG_MAX)
                 * of argument strings even with small stacks
@@ -254,6 +275,11 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
        vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
        INIT_LIST_HEAD(&vma->anon_vma_chain);
+        err = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
+        if (err)
+                goto err;
        err = insert_vm_struct(mm, vma);
        if (err)
                goto err;
@@ -276,7 +302,11 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len)
 #else
-static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+{
+} /* empty body: this variant, placed after the #else, performs no accounting */
+struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
                int write)
 {
        struct page *page;
@@ -1003,6 +1033,7 @@ int flush_old_exec(struct linux_binprm * bprm)
        /*
         * Release all of the old mmap stuff
         */
+        acct_arg_size(bprm, 0);
        retval = exec_mmap(bprm->mm);
        if (retval)
                goto out;
@@ -1426,8 +1457,10 @@ int do_execve(const char * filename,
        return retval;
 out:
-        if (bprm->mm)
+        if (bprm->mm) {
-                mmput (bprm->mm);
+                acct_arg_size(bprm, 0);
+                mmput(bprm->mm);
+        }
 out_file:
        if (bprm->file) {
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 2fedaf8b501..acf8695fa8f 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -27,7 +27,6 @@
 #include <linux/init.h>
 #include <linux/blkdev.h>
 #include <linux/parser.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/exportfs.h>
 #include <linux/vfs.h>
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6a5edea2d70..94ce3d7a1c4 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -910,6 +910,7 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_JOURNAL_CHECKSUM     0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
+#define EXT4_MOUNT_MBLK_IO_SUBMIT       0x4000000 /* multi-block io submits */
 #define EXT4_MOUNT_DELALLOC             0x8000000 /* Delalloc support */
 #define EXT4_MOUNT_DATA_ERR_ABORT       0x10000000 /* Abort on file data write */
 #define EXT4_MOUNT_BLOCK_VALIDITY       0x20000000 /* Block validity checking */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bdbe6990220..e659597b690 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2125,9 +2125,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
                         */
                        if (unlikely(journal_data && PageChecked(page)))
                                err = __ext4_journalled_writepage(page, len);
-                        else
+                        else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
                                err = ext4_bio_write_page(&io_submit, page,
                                                          len, mpd->wbc);
+                        else
+                                err = block_write_full_page(page,
+                                        noalloc_get_block_write, mpd->wbc);
                        if (!err)
                                mpd->pages_written++;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index bf5ae883b1b..eb3bc2fe647 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -331,6 +331,30 @@ mext_out:
                return err;
        }
+        case FITRIM:
+        {
+                struct super_block *sb = inode->i_sb;
+                struct fstrim_range range;
+                int ret = 0;
+                if (!capable(CAP_SYS_ADMIN))
+                        return -EPERM;
+                if (copy_from_user(&range, (struct fstrim_range *)arg,
+                    sizeof(range)))
+                        return -EFAULT;
+                ret = ext4_trim_fs(sb, &range);
+                if (ret < 0)
+                        return ret;
+                if (copy_to_user((struct fstrim_range *)arg, &range,
+                    sizeof(range)))
+                        return -EFAULT;
+                return 0;
+        }
        default:
                return -ENOTTY;
        }
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 92203b8a099..dc40e75cba8 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -872,7 +872,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
        if (namelen > EXT4_NAME_LEN)
                return NULL;
        if ((namelen <= 2) && (name[0] == '.') &&
-            (name[1] == '.' || name[1] == '0')) {
+            (name[1] == '.' || name[1] == '\0')) {
                /*
                 * "." or ".." will only be in the first block
                 * NFS may look up ".."; "." should be handled by the VFS
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 7f5451cd1d3..beacce11ac5 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -237,8 +237,6 @@ static void ext4_end_bio(struct bio *bio, int error)
                        } while (bh != head);
                }
-                put_io_page(io_end->pages[i]);
                /*
                 * If this is a partial write which happened to make
                 * all buffers uptodate then we can optimize away a
@@ -248,6 +246,8 @@ static void ext4_end_bio(struct bio *bio, int error)
                 */
                if (!partial_write)
                        SetPageUptodate(page);
+                put_io_page(io_end->pages[i]);
        }
        io_end->num_io_pages = 0;
        inode = io_end->inode;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index dc963929de6..981c8477ada 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -232,6 +232,8 @@ static int setup_new_group_blocks(struct super_block *sb,
                               GFP_NOFS);
        if (err)
                goto exit_bh;
+        for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++)
+                ext4_set_bit(bit, bh->b_data);
        ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
                   input->block_bitmap - start);
@@ -247,6 +249,9 @@ static int setup_new_group_blocks(struct super_block *sb,
        err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
        if (err)
                goto exit_bh;
+        for (i = 0, bit = input->inode_table - start;
+             i < sbi->s_itb_per_group; i++, bit++)
+                ext4_set_bit(bit, bh->b_data);
        if ((err = extend_or_restart_transaction(handle, 2, bh)))
                goto exit_bh;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 61182fe6254..fb15c9c0be7 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1026,6 +1026,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
            !(def_mount_opts & EXT4_DEFM_NODELALLOC))
                seq_puts(seq, ",nodelalloc");
+        if (test_opt(sb, MBLK_IO_SUBMIT))
+                seq_puts(seq, ",mblk_io_submit");
        if (sbi->s_stripe)
                seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
        /*
@@ -1197,7 +1199,6 @@ static const struct super_operations ext4_sops = {
        .quota_write    = ext4_quota_write,
 #endif
        .bdev_try_to_free_page = bdev_try_to_free_page,
-        .trim_fs        = ext4_trim_fs
 };
 static const struct super_operations ext4_nojournal_sops = {
@@ -1240,8 +1241,8 @@ enum {
        Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
        Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
        Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version,
-        Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+        Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
-        Opt_block_validity, Opt_noblock_validity,
+        Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
        Opt_inode_readahead_blks, Opt_journal_ioprio,
        Opt_dioread_nolock, Opt_dioread_lock,
        Opt_discard, Opt_nodiscard,
@@ -1305,6 +1306,8 @@ static const match_table_t tokens = {
        {Opt_resize, "resize"},
        {Opt_delalloc, "delalloc"},
        {Opt_nodelalloc, "nodelalloc"},
+        {Opt_mblk_io_submit, "mblk_io_submit"},
+        {Opt_nomblk_io_submit, "nomblk_io_submit"},
        {Opt_block_validity, "block_validity"},
        {Opt_noblock_validity, "noblock_validity"},
        {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
@@ -1726,6 +1729,12 @@ set_qf_format:
                case Opt_nodelalloc:
                        clear_opt(sbi->s_mount_opt, DELALLOC);
                        break;
+                case Opt_mblk_io_submit:
+                        set_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT);
+                        break;
+                case Opt_nomblk_io_submit:
+                        clear_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT);
+                        break;
                case Opt_stripe:
                        if (match_int(&args[0], &option))
                                return 0;
@@ -2799,9 +2808,6 @@ static void ext4_clear_request_list(void)
        struct ext4_li_request *elr;
        mutex_lock(&ext4_li_info->li_list_mtx);
-        if (list_empty(&ext4_li_info->li_request_list))
-                return;
        list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
                elr = list_entry(pos, struct ext4_li_request,
                                 lr_request);
@@ -3268,13 +3274,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
         * Test whether we have more sectors than will fit in sector_t,
         * and whether the max offset is addressable by the page cache.
         */
-        ret = generic_check_addressable(sb->s_blocksize_bits,
+        err = generic_check_addressable(sb->s_blocksize_bits,
                                        ext4_blocks_count(es));
-        if (ret) {
+        if (err) {
                ext4_msg(sb, KERN_ERR, "filesystem"
                         " too large to mount safely on this system");
                if (sizeof(sector_t) < 8)
                        ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
+                ret = err;
                goto failed_mount;
        }
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index c8224587123..8b984a2cebb 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -13,6 +13,7 @@
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/module.h>
+#include <linux/compat.h>
 static const struct file_operations fuse_direct_io_file_operations;
@@ -134,6 +135,7 @@ EXPORT_SYMBOL_GPL(fuse_do_open);
 void fuse_finish_open(struct inode *inode, struct file *file)
 {
        struct fuse_file *ff = file->private_data;
+        struct fuse_conn *fc = get_fuse_conn(inode);
        if (ff->open_flags & FOPEN_DIRECT_IO)
                file->f_op = &fuse_direct_io_file_operations;
@@ -141,6 +143,15 @@ void fuse_finish_open(struct inode *inode, struct file *file)
                invalidate_inode_pages2(inode->i_mapping);
        if (ff->open_flags & FOPEN_NONSEEKABLE)
                nonseekable_open(inode, file);
+        if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
+                struct fuse_inode *fi = get_fuse_inode(inode);
+                spin_lock(&fc->lock);
+                fi->attr_version = ++fc->attr_version;
+                i_size_write(inode, 0);
+                spin_unlock(&fc->lock);
+                fuse_invalidate_attr(inode);
+        }
 }
 int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
@@ -1618,6 +1629,58 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
 }
 /*
+ * CUSE servers compiled on 32bit broke on 64bit kernels because the
+ * ABI was defined to be 'struct iovec' which is different on 32bit
+ * and 64bit.  Fortunately we can determine which structure the server
+ * used from the size of the reply.
+ *
+ * @dst:         array receiving @count native 'struct iovec' entries
+ * @src:         raw reply from the server, @transferred bytes long
+ * @transferred: size of the reply in bytes
+ * @count:       number of iovec entries the reply is expected to hold
+ * @is_compat:   true if the ioctl request itself is a compat one
+ *
+ * With CONFIG_COMPAT the compat layout is tried first: a reply of exactly
+ * @count compat_iovec entries is widened element by element into @dst.
+ * Otherwise the reply must be exactly @count native entries and is copied
+ * verbatim.
+ *
+ * Returns 0 on success, -EINVAL if a compat sized reply is received for a
+ * request that is not compat, and -EIO if the reply size matches neither
+ * layout.
+ *
+ * Note: with CONFIG_COMPAT, @count == 0 and @transferred == 0 satisfy the
+ * compat size test, so that case returns -EINVAL unless @is_compat is set.
+ */
+static int fuse_copy_ioctl_iovec(struct iovec *dst, void *src,
+                                 size_t transferred, unsigned count,
+                                 bool is_compat)
+{
+#ifdef CONFIG_COMPAT
+        if (count * sizeof(struct compat_iovec) == transferred) {
+                struct compat_iovec *ciov = src;
+                unsigned i;
+                /*
+                 * With this interface a 32bit server cannot support
+                 * non-compat (i.e. ones coming from 64bit apps) ioctl
+                 * requests
+                 */
+                if (!is_compat)
+                        return -EINVAL;
+                for (i = 0; i < count; i++) {
+                        dst[i].iov_base = compat_ptr(ciov[i].iov_base);
+                        dst[i].iov_len = ciov[i].iov_len;
+                }
+                return 0;
+        }
+#endif /* CONFIG_COMPAT */
+        if (count * sizeof(struct iovec) != transferred)
+                return -EIO;
+        memcpy(dst, src, transferred);
+        return 0;
+}
+/*
+ * Make sure iov_length() won't overflow.
+ *
+ * Walks all @count entries of @iov and checks that their lengths, summed,
+ * stay within FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT bytes.  'max' holds the
+ * budget still available; each element must fit in what is left and is
+ * then subtracted from it, so the running total can never wrap.
+ *
+ * The pointer has to advance together with the index: checking only the
+ * first element @count times would let a small first entry hide oversized
+ * later ones.
+ *
+ * Returns 0 if every element fits, -ENOMEM as soon as one does not.
+ */
+static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count)
+{
+        size_t n;
+        u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT;
+        for (n = 0; n < count; n++, iov++) {
+                if (iov->iov_len > (size_t) max)
+                        return -ENOMEM;
+                max -= iov->iov_len;
+        }
+        return 0;
+}
+/*
 * For ioctls, there is no generic way to determine how much memory
 * needs to be read and/or written.  Furthermore, ioctls are allowed
 * to dereference the passed pointer, so the parameter requires deep
@@ -1798,18 +1861,25 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
                    in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
                        goto out;
-                err = -EIO;
-                if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred)
-                        goto out;
-                /* okay, copy in iovs and retry */
                vaddr = kmap_atomic(pages[0], KM_USER0);
-                memcpy(page_address(iov_page), vaddr, transferred);
+                err = fuse_copy_ioctl_iovec(page_address(iov_page), vaddr,
+                                            transferred, in_iovs + out_iovs,
+                                            (flags & FUSE_IOCTL_COMPAT) != 0);
                kunmap_atomic(vaddr, KM_USER0);
+                if (err)
+                        goto out;
                in_iov = page_address(iov_page);
                out_iov = in_iov + in_iovs;
+                err = fuse_verify_ioctl_iov(in_iov, in_iovs);
+                if (err)
+                        goto out;
+                err = fuse_verify_ioctl_iov(out_iov, out_iovs);
+                if (err)
+                        goto out;
                goto retry;
        }
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 06d582732d3..5ab3839dfcb 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -138,10 +138,8 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
                                      struct gfs2_inum_host *inum)
 {
        struct gfs2_sbd *sdp = sb->s_fs_info;
-        struct gfs2_holder i_gh;
        struct inode *inode;
        struct dentry *dentry;
-        int error;
        inode = gfs2_ilookup(sb, inum->no_addr);
        if (inode) {
@@ -152,52 +150,16 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
                goto out_inode;
        }
-        error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops,
+        inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino,
-                                  LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+                                    GFS2_BLKST_DINODE);
-        if (error)
+        if (IS_ERR(inode))
-                return ERR_PTR(error);
+                return ERR_CAST(inode);
-        error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE);
-        if (error)
-                goto fail;
-        inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0);
-        if (IS_ERR(inode)) {
-                error = PTR_ERR(inode);
-                goto fail;
-        }
-        error = gfs2_inode_refresh(GFS2_I(inode));
-        if (error) {
-                iput(inode);
-                goto fail;
-        }
-        /* Pick up the works we bypass in gfs2_inode_lookup */
-        if (inode->i_state & I_NEW) 
-                gfs2_set_iop(inode);
-        if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
-                iput(inode);
-                goto fail;
-        }
-        error = -EIO;
-        if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) {
-                iput(inode);
-                goto fail;
-        }
-        gfs2_glock_dq_uninit(&i_gh);
 out_inode:
        dentry = d_obtain_alias(inode);
        if (!IS_ERR(dentry))
                dentry->d_op = &gfs2_dops;
        return dentry;
-fail:
-        gfs2_glock_dq_uninit(&i_gh);
-        return ERR_PTR(error);
 }
 static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid,
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 87778857f09..f92c1770416 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -686,21 +686,20 @@ static void delete_work_func(struct work_struct *work)
 {
        struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
        struct gfs2_sbd *sdp = gl->gl_sbd;
-        struct gfs2_inode *ip = NULL;
+        struct gfs2_inode *ip;
        struct inode *inode;
-        u64 no_addr = 0;
+        u64 no_addr = gl->gl_name.ln_number;
+        ip = gl->gl_object;
+        /* Note: Unsafe to dereference ip as we don't hold right refs/locks */
-        spin_lock(&gl->gl_spin);
-        ip = (struct gfs2_inode *)gl->gl_object;
        if (ip)
-                no_addr = ip->i_no_addr;
-        spin_unlock(&gl->gl_spin);
-        if (ip) {
                inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
-                if (inode) {
+        else
-                        d_prune_aliases(inode);
+                inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED);
-                        iput(inode);
+        if (inode && !IS_ERR(inode)) {
-                }
+                d_prune_aliases(inode);
+                iput(inode);
        }
        gfs2_glock_put(gl);
 }
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 06370f8bd8c..e1213f7f921 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -73,49 +73,6 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr)
        return iget5_locked(sb, hash, iget_test, iget_set, &no_addr);
 }
-struct gfs2_skip_data {
-        u64     no_addr;
-        int     skipped;
-};
-static int iget_skip_test(struct inode *inode, void *opaque)
-{
-        struct gfs2_inode *ip = GFS2_I(inode);
-        struct gfs2_skip_data *data = opaque;
-        if (ip->i_no_addr == data->no_addr) {
-                if (inode->i_state & (I_FREEING|I_WILL_FREE)){
-                        data->skipped = 1;
-                        return 0;
-                }
-                return 1;
-        }
-        return 0;
-}
-static int iget_skip_set(struct inode *inode, void *opaque)
-{
-        struct gfs2_inode *ip = GFS2_I(inode);
-        struct gfs2_skip_data *data = opaque;
-        if (data->skipped)
-                return 1;
-        inode->i_ino = (unsigned long)(data->no_addr);
-        ip->i_no_addr = data->no_addr;
-        return 0;
-}
-static struct inode *gfs2_iget_skip(struct super_block *sb,
-                                    u64 no_addr)
-{
-        struct gfs2_skip_data data;
-        unsigned long hash = (unsigned long)no_addr;
-        data.no_addr = no_addr;
-        data.skipped = 0;
-        return iget5_locked(sb, hash, iget_skip_test, iget_skip_set, &data);
-}
 /**
 * GFS2 lookup code fills in vfs inode contents based on info obtained
 * from directory entry inside gfs2_inode_lookup(). This has caused issues
@@ -243,93 +200,54 @@ fail:
        return ERR_PTR(error);
 }
-/**
+struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
- * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation
+                                  u64 *no_formal_ino, unsigned int blktype)
- *                               and try to reclaim it by doing iput.
- *
- * This function assumes no rgrp locks are currently held.
- *
- * @sb: The super block
- * no_addr: The inode number
- *
- */
-void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
 {
-        struct gfs2_sbd *sdp;
+        struct super_block *sb = sdp->sd_vfs;
-        struct gfs2_inode *ip;
+        struct gfs2_holder i_gh;
-        struct gfs2_glock *io_gl = NULL;
-        int error;
-        struct gfs2_holder gh;
        struct inode *inode;
+        int error;
-        inode = gfs2_iget_skip(sb, no_addr);
+        error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops,
+                                  LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
-        if (!inode)
+        if (error)
-                return;
+                return ERR_PTR(error);
-        /* If it's not a new inode, someone's using it, so leave it alone. */
-        if (!(inode->i_state & I_NEW)) {
-                iput(inode);
-                return;
-        }
-        ip = GFS2_I(inode);
-        sdp = GFS2_SB(inode);
-        ip->i_no_formal_ino = -1;
-        error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
+        error = gfs2_check_blk_type(sdp, no_addr, blktype);
-        if (unlikely(error))
+        if (error)
                goto fail;
-        ip->i_gl->gl_object = ip;
-        error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
+        inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0);
-        if (unlikely(error))
+        if (IS_ERR(inode))
-                goto fail_put;
+                goto fail;
-        set_bit(GIF_INVALID, &ip->i_flags);
-        error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
-                                   &ip->i_iopen_gh);
-        if (unlikely(error))
-                goto fail_iopen;
-        ip->i_iopen_gh.gh_gl->gl_object = ip;
+        error = gfs2_inode_refresh(GFS2_I(inode));
-        gfs2_glock_put(io_gl);
+        if (error)
-        io_gl = NULL;
+                goto fail_iput;
-        inode->i_mode = DT2IF(DT_UNKNOWN);
+        /* Pick up the works we bypass in gfs2_inode_lookup */
+        if (inode->i_state & I_NEW) 
+                gfs2_set_iop(inode);
-        /*
+        /* Two extra checks for NFS only */
-         * We must read the inode in order to work out its type in
+        if (no_formal_ino) {
-         * this case. Note that this doesn't happen often as we normally
+                error = -ESTALE;
-         * know the type beforehand. This code path only occurs during
+                if (GFS2_I(inode)->i_no_formal_ino != *no_formal_ino)
-         * unlinked inode recovery (where it is safe to do this glock,
+                        goto fail_iput;
-         * which is not true in the general case).
-         */
-        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
-                                   &gh);
-        if (unlikely(error))
-                goto fail_glock;
-        /* Inode is now uptodate */
+                error = -EIO;
-        gfs2_glock_dq_uninit(&gh);
+                if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM)
-        gfs2_set_iop(inode);
+                        goto fail_iput;
-        /* The iput will cause it to be deleted. */
+                error = 0;
-        iput(inode);
+        }
-        return;
-fail_glock:
-        gfs2_glock_dq(&ip->i_iopen_gh);
-fail_iopen:
-        if (io_gl)
-                gfs2_glock_put(io_gl);
-fail_put:
-        ip->i_gl->gl_object = NULL;
-        gfs2_glock_put(ip->i_gl);
 fail:
-        iget_failed(inode);
+        gfs2_glock_dq_uninit(&i_gh);
-        return;
+        return error ? ERR_PTR(error) : inode;
+fail_iput:
+        iput(inode);
+        goto fail;
 }
 static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 6720d7d5fbc..d8499fadcc5 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -99,7 +99,9 @@ err:
 extern void gfs2_set_iop(struct inode *inode);
 extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 
                                       u64 no_addr, u64 no_formal_ino);
-extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr);
+extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
+                                         u64 *no_formal_ino,
+                                         unsigned int blktype);
 extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
 extern int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 58a9b9998b4..f606baf9ba7 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -631,6 +631,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
                             struct fs_disk_quota *fdq)
 {
        struct inode *inode = &ip->i_inode;
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
        struct address_space *mapping = inode->i_mapping;
        unsigned long index = loc >> PAGE_CACHE_SHIFT;
        unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
@@ -658,11 +659,11 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
        qd->qd_qb.qb_value = qp->qu_value;
        if (fdq) {
                if (fdq->d_fieldmask & FS_DQ_BSOFT) {
-                        qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
+                        qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift);
                        qd->qd_qb.qb_warn = qp->qu_warn;
                }
                if (fdq->d_fieldmask & FS_DQ_BHARD) {
-                        qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
+                        qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift);
                        qd->qd_qb.qb_limit = qp->qu_limit;
                }
        }
@@ -1497,9 +1498,9 @@ static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id,
        fdq->d_version = FS_DQUOT_VERSION;
        fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
        fdq->d_id = id;
-        fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit);
+        fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift;
-        fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn);
+        fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift;
-        fdq->d_bcount = be64_to_cpu(qlvb->qb_value);
+        fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift;
        gfs2_glock_dq_uninit(&q_gh);
 out:
@@ -1566,10 +1567,10 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
        /* If nothing has changed, this is a no-op */
        if ((fdq->d_fieldmask & FS_DQ_BSOFT) &&
-            (fdq->d_blk_softlimit == be64_to_cpu(qd->qd_qb.qb_warn)))
+            ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn)))
                fdq->d_fieldmask ^= FS_DQ_BSOFT;
        if ((fdq->d_fieldmask & FS_DQ_BHARD) &&
-            (fdq->d_blk_hardlimit == be64_to_cpu(qd->qd_qb.qb_limit)))
+            ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit)))
                fdq->d_fieldmask ^= FS_DQ_BHARD;
        if (fdq->d_fieldmask == 0)
                goto out_i;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index bef3ab6cf5c..33c8407b876 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -963,17 +963,18 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
 *          The inode, if one has been found, in inode.
 */
-static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
+static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
-                           u64 skip)
 {
        u32 goal = 0, block;
        u64 no_addr;
        struct gfs2_sbd *sdp = rgd->rd_sbd;
        unsigned int n;
+        struct gfs2_glock *gl;
+        struct gfs2_inode *ip;
+        int error;
+        int found = 0;
-        for(;;) {
+        while (goal < rgd->rd_data) {
-                if (goal >= rgd->rd_data)
-                        break;
                down_write(&sdp->sd_log_flush_lock);
                n = 1;
                block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED,
@@ -990,11 +991,32 @@ static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
                if (no_addr == skip)
                        continue;
                *last_unlinked = no_addr;
-                return no_addr;
+                error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &gl);
+                if (error)
+                        continue;
+                /* If the inode is already in cache, we can ignore it here
+                 * because the existing inode disposal code will deal with
+                 * it when all refs have gone away. Accessing gl_object like
+                 * this is not safe in general. Here it is ok because we do
+                 * not dereference the pointer, and we only need an approx
+                 * answer to whether it is NULL or not.
+                 */
+                ip = gl->gl_object;
+                if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
+                        gfs2_glock_put(gl);
+                else
+                        found++;
+                /* Limit reclaim to sensible number of tasks */
+                if (found > 2*NR_CPUS)
+                        return;
        }
        rgd->rd_flags &= ~GFS2_RDF_CHECK;
-        return 0;
+        return;
 }
 /**
@@ -1075,11 +1097,9 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
 * Try to acquire rgrp in way which avoids contending with others.
 *
 * Returns: errno
- *          unlinked: the block address of an unlinked block to be reclaimed
 */
-static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
+static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
-                          u64 *last_unlinked)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct gfs2_rgrpd *rgd, *begin = NULL;
@@ -1089,7 +1109,6 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
        int loops = 0;
        int error, rg_locked;
-        *unlinked = 0;
        rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
        while (rgd) {
@@ -1106,17 +1125,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
                case 0:
                        if (try_rgrp_fit(rgd, al))
                                goto out;
-                        /* If the rg came in already locked, there's no
+                        if (rgd->rd_flags & GFS2_RDF_CHECK)
-                           way we can recover from a failed try_rgrp_unlink
+                                try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
-                           because that would require an iput which can only
-                           happen after the rgrp is unlocked. */
-                        if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
-                                *unlinked = try_rgrp_unlink(rgd, last_unlinked,
-                                                           ip->i_no_addr);
                        if (!rg_locked)
                                gfs2_glock_dq_uninit(&al->al_rgd_gh);
-                        if (*unlinked)
-                                return -EAGAIN;
                        /* fall through */
                case GLR_TRYFAILED:
                        rgd = recent_rgrp_next(rgd);
@@ -1145,13 +1157,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
                case 0:
                        if (try_rgrp_fit(rgd, al))
                                goto out;
-                        if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
+                        if (rgd->rd_flags & GFS2_RDF_CHECK)
-                                *unlinked = try_rgrp_unlink(rgd, last_unlinked,
+                                try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
-                                                            ip->i_no_addr);
                        if (!rg_locked)
                                gfs2_glock_dq_uninit(&al->al_rgd_gh);
-                        if (*unlinked)
-                                return -EAGAIN;
                        break;
                case GLR_TRYFAILED:
@@ -1204,12 +1213,12 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct gfs2_alloc *al = ip->i_alloc;
        int error = 0;
-        u64 last_unlinked = NO_BLOCK, unlinked;
+        u64 last_unlinked = NO_BLOCK;
+        int tries = 0;
        if (gfs2_assert_warn(sdp, al->al_requested))
                return -EINVAL;
-try_again:
        if (hold_rindex) {
                /* We need to hold the rindex unless the inode we're using is
                   the rindex itself, in which case it's already held. */
@@ -1218,31 +1227,23 @@ try_again:
                else if (!sdp->sd_rgrps) /* We may not have the rindex read
                                            in, so: */
                        error = gfs2_ri_update_special(ip);
+                if (error)
+                        return error;
        }
-        if (error)
+        do {
-                return error;
+                error = get_local_rgrp(ip, &last_unlinked);
+                /* If there is no space, flushing the log may release some */
+                if (error)
+                        gfs2_log_flush(sdp, NULL);
+        } while (error && tries++ < 3);
-        /* Find an rgrp suitable for allocation.  If it encounters any unlinked
-           dinodes along the way, error will equal -EAGAIN and unlinked will
-           contains it block address. We then need to look up that inode and
-           try to free it, and try the allocation again. */
-        error = get_local_rgrp(ip, &unlinked, &last_unlinked);
        if (error) {
                if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
                        gfs2_glock_dq_uninit(&al->al_ri_gh);
-                if (error != -EAGAIN)
+                return error;
-                        return error;
-                gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
-                /* regardless of whether or not gfs2_process_unlinked_inode
-                   was successful, we don't want to repeat it again. */
-                last_unlinked = unlinked;
-                gfs2_log_flush(sdp, NULL);
-                error = 0;
-                goto try_again;
        }
        /* no error, so we have the rgrp set in the inode's allocation. */
        al->al_file = file;
        al->al_line = line;
diff --git a/fs/ioctl.c b/fs/ioctl.c
index e92fdbb3bc3..d6cc1647662 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -6,7 +6,6 @@
 #include <linux/syscalls.h>
 #include <linux/mm.h>
-#include <linux/smp_lock.h>
 #include <linux/capability.h>
 #include <linux/file.h>
 #include <linux/fs.h>
@@ -530,41 +529,6 @@ static int ioctl_fsthaw(struct file *filp)
        return thaw_super(sb);
 }
-static int ioctl_fstrim(struct file *filp, void __user *argp)
-{
-        struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
-        struct fstrim_range range;
-        int ret = 0;
-        if (!capable(CAP_SYS_ADMIN))
-                return -EPERM;
-        /* If filesystem doesn't support trim feature, return. */
-        if (sb->s_op->trim_fs == NULL)
-                return -EOPNOTSUPP;
-        /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
-        if (sb->s_bdev == NULL)
-                return -EINVAL;
-        if (argp == NULL) {
-                range.start = 0;
-                range.len = ULLONG_MAX;
-                range.minlen = 0;
-        } else if (copy_from_user(&range, argp, sizeof(range)))
-                return -EFAULT;
-        ret = sb->s_op->trim_fs(sb, &range);
-        if (ret < 0)
-                return ret;
-        if ((argp != NULL) &&
-            (copy_to_user(argp, &range, sizeof(range))))
-                return -EFAULT;
-        return 0;
-}
 /*
 * When you add any new common ioctls to the switches above and below
 * please update compat_sys_ioctl() too.
@@ -615,10 +579,6 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
                error = ioctl_fsthaw(filp);
                break;
-        case FITRIM:
-                error = ioctl_fstrim(filp, argp);
-                break;
        case FS_IOC_FIEMAP:
                return ioctl_fiemap(filp, arg);
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 2f7d05c8992..7da2a06508e 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -103,22 +103,15 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
        }
        ret = -ESRCH;
-        /*
+        rcu_read_lock();
-         * We want IOPRIO_WHO_PGRP/IOPRIO_WHO_USER to be "atomic",
-         * so we can't use rcu_read_lock(). See re-copy of ->ioprio
-         * in copy_process().
-         */
-        read_lock(&tasklist_lock);
        switch (which) {
                case IOPRIO_WHO_PROCESS:
-                        rcu_read_lock();
                        if (!who)
                                p = current;
                        else
                                p = find_task_by_vpid(who);
                        if (p)
                                ret = set_task_ioprio(p, ioprio);
-                        rcu_read_unlock();
                        break;
                case IOPRIO_WHO_PGRP:
                        if (!who)
@@ -141,12 +134,7 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
                                break;
                        do_each_thread(g, p) {
-                                int match;
+                                if (__task_cred(p)->uid != who)
-                                rcu_read_lock();
-                                match = __task_cred(p)->uid == who;
-                                rcu_read_unlock();
-                                if (!match)
                                        continue;
                                ret = set_task_ioprio(p, ioprio);
                                if (ret)
@@ -160,7 +148,7 @@ free_uid:
                        ret = -EINVAL;
        }
-        read_unlock(&tasklist_lock);
+        rcu_read_unlock();
        return ret;
 }
@@ -204,17 +192,15 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
        int ret = -ESRCH;
        int tmpio;
-        read_lock(&tasklist_lock);
+        rcu_read_lock();
        switch (which) {
                case IOPRIO_WHO_PROCESS:
-                        rcu_read_lock();
                        if (!who)
                                p = current;
                        else
                                p = find_task_by_vpid(who);
                        if (p)
                                ret = get_task_ioprio(p);
-                        rcu_read_unlock();
                        break;
                case IOPRIO_WHO_PGRP:
                        if (!who)
@@ -241,12 +227,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
                                break;
                        do_each_thread(g, p) {
-                                int match;
+                                if (__task_cred(p)->uid != user->uid)
-                                rcu_read_lock();
-                                match = __task_cred(p)->uid == user->uid;
-                                rcu_read_unlock();
-                                if (!match)
                                        continue;
                                tmpio = get_task_ioprio(p);
                                if (tmpio < 0)
@@ -264,6 +245,6 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
                        ret = -EINVAL;
        }
-        read_unlock(&tasklist_lock);
+        rcu_read_unlock();
        return ret;
 }
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c590d155c09..f837ba95352 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -899,6 +899,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
        /* journal descriptor can store up to n blocks -bzzz */
        journal->j_blocksize = blocksize;
+        journal->j_dev = bdev;
+        journal->j_fs_dev = fs_dev;
+        journal->j_blk_offset = start;
+        journal->j_maxlen = len;
+        bdevname(journal->j_dev, journal->j_devname);
+        p = journal->j_devname;
+        while ((p = strchr(p, '/')))
+                *p = '!';
        jbd2_stats_proc_init(journal);
        n = journal->j_blocksize / sizeof(journal_block_tag_t);
        journal->j_wbufsize = n;
@@ -908,14 +916,6 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
                        __func__);
                goto out_err;
        }
-        journal->j_dev = bdev;
-        journal->j_fs_dev = fs_dev;
-        journal->j_blk_offset = start;
-        journal->j_maxlen = len;
-        bdevname(journal->j_dev, journal->j_devname);
-        p = journal->j_devname;
-        while ((p = strchr(p, '/')))
-                *p = '!';
        bh = __getblk(journal->j_dev, start, journal->j_blocksize);
        if (!bh) {
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index d5bb86866e6..25509eb28fd 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -14,7 +14,6 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/lockd.h>
-#include <linux/smp_lock.h>
 #include <linux/kthread.h>
 #define NLMDBG_FACILITY         NLMDBG_CLIENT
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 47ea1e1925b..332c54cf75e 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -7,7 +7,6 @@
 */
 #include <linux/module.h>
-#include <linux/smp_lock.h>
 #include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/errno.h>
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 25e21e4023b..ed0c59fe23c 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -124,7 +124,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
                        continue;
                if (host->h_server != ni->server)
                        continue;
-                if (ni->server &&
+                if (ni->server && ni->src_len != 0 &&
                    !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap))
                        continue;
@@ -167,6 +167,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
        host->h_addrlen = ni->salen;
        rpc_set_port(nlm_addr(host), 0);
        memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len);
+        host->h_srcaddrlen = ni->src_len;
        host->h_version    = ni->version;
        host->h_proto      = ni->protocol;
        host->h_rpcclnt    = NULL;
@@ -238,9 +239,6 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
                                     const char *hostname,
                                     int noresvport)
 {
-        const struct sockaddr source = {
-                .sa_family      = AF_UNSPEC,
-        };
        struct nlm_lookup_host_info ni = {
                .server         = 0,
                .sap            = sap,
@@ -249,8 +247,6 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
                .version        = version,
                .hostname       = hostname,
                .hostname_len   = strlen(hostname),
-                .src_sap        = &source,
-                .src_len        = sizeof(source),
                .noresvport     = noresvport,
        };
@@ -357,7 +353,6 @@ nlm_bind_host(struct nlm_host *host)
                        .protocol       = host->h_proto,
                        .address        = nlm_addr(host),
                        .addrsize       = host->h_addrlen,
-                        .saddress       = nlm_srcaddr(host),
                        .timeout        = &timeparms,
                        .servername     = host->h_name,
                        .program        = &nlm_program,
@@ -376,6 +371,8 @@ nlm_bind_host(struct nlm_host *host)
                        args.flags |= RPC_CLNT_CREATE_HARDRTRY;
                if (host->h_noresvport)
                        args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
+                if (host->h_srcaddrlen)
+                        args.saddress = nlm_srcaddr(host);
                clnt = rpc_create(&args);
                if (!IS_ERR(clnt))
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index a336e832475..38d26119245 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -9,7 +9,6 @@
 #include <linux/types.h>
 #include <linux/time.h>
-#include <linux/smp_lock.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/share.h>
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index c462d346acb..ef5659b211e 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -25,7 +25,6 @@
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
-#include <linux/smp_lock.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/nlm.h>
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index c3069f38d60..0caea5310ac 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -9,7 +9,6 @@
 #include <linux/types.h>
 #include <linux/time.h>
-#include <linux/smp_lock.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/share.h>
diff --git a/fs/locks.c b/fs/locks.c
index 0e62dd35d08..8729347bcd1 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -122,7 +122,6 @@
 #include <linux/module.h>
 #include <linux/security.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/syscalls.h>
 #include <linux/time.h>
 #include <linux/rcupdate.h>
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index f46ee8b0e13..9da29706f91 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -828,7 +828,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
                super->s_journal_seg[i] = segno;
                super->s_journal_ec[i] = ec;
                logfs_set_segment_reserved(sb, segno);
-                err = btree_insert32(head, segno, (void *)1, GFP_KERNEL);
+                err = btree_insert32(head, segno, (void *)1, GFP_NOFS);
                BUG_ON(err); /* mempool should prevent this */
                err = logfs_erase_segment(sb, segno, 1);
                BUG_ON(err); /* FIXME: remount-ro would be nicer */
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 6127baf0e18..ee99a9f5dfd 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1994,6 +1994,9 @@ static int do_write_inode(struct inode *inode)
        /* FIXME: transaction is part of logfs_block now.  Is that enough? */
        err = logfs_write_buf(master_inode, page, 0);
+        if (err)
+                move_page_to_inode(inode, page);
        logfs_put_write_page(page);
        return err;
 }
diff --git a/fs/namei.c b/fs/namei.c
index 5362af9b737..4ff7ca53053 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1748,6 +1748,9 @@ struct file *do_filp_open(int dfd, const char *pathname,
        if (!(open_flag & O_CREAT))
                mode = 0;
+        /* Must never be set by userspace */
+        open_flag &= ~FMODE_NONOTIFY;
        /*
         * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
         * check for O_DSYNC if the need any syncing at all we enforce it's
diff --git a/fs/namespace.c b/fs/namespace.c
index 8a415c9c5e5..3dbfc072ec7 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -13,7 +13,6 @@
 #include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/percpu.h>
-#include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/acct.h>
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index aac8832e919..f22b12e7d33 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -19,7 +19,6 @@
 #include <linux/mm.h>
 #include <asm/uaccess.h>
 #include <asm/byteorder.h>
-#include <linux/smp_lock.h>
 #include <linux/ncp_fs.h>
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 6c754f70c52..cb50aaf981d 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -17,7 +17,6 @@
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
-#include <linux/smp_lock.h>
 #include <linux/ncp_fs.h>
 #include "ncplib_kernel.h"
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index d290545aa0c..8fb93b604e7 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -26,7 +26,6 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/init.h>
-#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index c2a1f9a155c..d40a547e337 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -17,7 +17,6 @@
 #include <linux/mount.h>
 #include <linux/slab.h>
 #include <linux/highuid.h>
-#include <linux/smp_lock.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index aeec017fe81..93a8b3bd69e 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -9,7 +9,6 @@
 #include <linux/completion.h>
 #include <linux/ip.h>
 #include <linux/module.h>
-#include <linux/smp_lock.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/svcsock.h>
 #include <linux/nfs_fs.h>
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 232a7eead33..1fd62fc49be 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -11,7 +11,6 @@
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/spinlock.h>
 #include <linux/nfs4.h>
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 07ac3847e56..996dd8989a9 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -34,6 +34,7 @@
 #include <linux/mount.h>
 #include <linux/sched.h>
 #include <linux/vmalloc.h>
+#include <linux/kmemleak.h>
 #include "delegation.h"
 #include "iostat.h"
@@ -56,7 +57,7 @@ static int nfs_rename(struct inode *, struct dentry *,
                      struct inode *, struct dentry *);
 static int nfs_fsync_dir(struct file *, int);
 static loff_t nfs_llseek_dir(struct file *, loff_t, int);
-static int nfs_readdir_clear_array(struct page*, gfp_t);
+static void nfs_readdir_clear_array(struct page*);
 const struct file_operations nfs_dir_operations = {
        .llseek         = nfs_llseek_dir,
@@ -82,8 +83,8 @@ const struct inode_operations nfs_dir_inode_operations = {
        .setattr        = nfs_setattr,
 };
-const struct address_space_operations nfs_dir_addr_space_ops = {
+const struct address_space_operations nfs_dir_aops = {
-        .releasepage = nfs_readdir_clear_array,
+        .freepage = nfs_readdir_clear_array,
 };
 #ifdef CONFIG_NFS_V3
@@ -161,6 +162,7 @@ struct nfs_cache_array_entry {
        u64 cookie;
        u64 ino;
        struct qstr string;
+        unsigned char d_type;
 };
 struct nfs_cache_array {
@@ -170,14 +172,13 @@ struct nfs_cache_array {
        struct nfs_cache_array_entry array[0];
 };
-#define MAX_READDIR_ARRAY ((PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry))
 typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
 typedef struct {
        struct file     *file;
        struct page     *page;
        unsigned long   page_index;
        u64             *dir_cookie;
+        u64             last_cookie;
        loff_t          current_index;
        decode_dirent_t decode;
@@ -194,9 +195,13 @@ typedef struct {
 static
 struct nfs_cache_array *nfs_readdir_get_array(struct page *page)
 {
+        void *ptr;
        if (page == NULL)
                return ERR_PTR(-EIO);
-        return (struct nfs_cache_array *)kmap(page);
+        ptr = kmap(page);
+        if (ptr == NULL)
+                return ERR_PTR(-ENOMEM);
+        return ptr;
 }
 static
@@ -209,14 +214,15 @@ void nfs_readdir_release_array(struct page *page)
 * we are freeing strings created by nfs_add_to_readdir_array()
 */
 static
-int nfs_readdir_clear_array(struct page *page, gfp_t mask)
+void nfs_readdir_clear_array(struct page *page)
 {
-        struct nfs_cache_array *array = nfs_readdir_get_array(page);
+        struct nfs_cache_array *array;
        int i;
+        array = kmap_atomic(page, KM_USER0);
        for (i = 0; i < array->size; i++)
                kfree(array->array[i].string.name);
-        nfs_readdir_release_array(page);
+        kunmap_atomic(array, KM_USER0);
-        return 0;
 }
 /*
@@ -231,6 +237,11 @@ int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int le
        string->name = kmemdup(name, len, GFP_KERNEL);
        if (string->name == NULL)
                return -ENOMEM;
+        /*
+         * Avoid a kmemleak false positive. The pointer to the name is stored
+         * in a page cache page which kmemleak does not scan.
+         */
+        kmemleak_not_leak(string->name);
        string->hash = full_name_hash(name, len);
        return 0;
 }
@@ -244,20 +255,24 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
        if (IS_ERR(array))
                return PTR_ERR(array);
-        ret = -EIO;
-        if (array->size >= MAX_READDIR_ARRAY)
-                goto out;
        cache_entry = &array->array[array->size];
+        /* Check that this entry lies within the page bounds */
+        ret = -ENOSPC;
+        if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE)
+                goto out;
        cache_entry->cookie = entry->prev_cookie;
        cache_entry->ino = entry->ino;
+        cache_entry->d_type = entry->d_type;
        ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len);
        if (ret)
                goto out;
        array->last_cookie = entry->cookie;
-        if (entry->eof == 1)
-                array->eof_index = array->size;
        array->size++;
+        if (entry->eof != 0)
+                array->eof_index = array->size;
 out:
        nfs_readdir_release_array(page);
        return ret;
@@ -272,7 +287,7 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
        if (diff < 0)
                goto out_eof;
        if (diff >= array->size) {
-                if (array->eof_index > 0)
+                if (array->eof_index >= 0)
                        goto out_eof;
                desc->current_index += array->size;
                return -EAGAIN;
@@ -281,8 +296,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
        index = (unsigned int)diff;
        *desc->dir_cookie = array->array[index].cookie;
        desc->cache_entry_index = index;
-        if (index == array->eof_index)
-                desc->eof = 1;
        return 0;
 out_eof:
        desc->eof = 1;
@@ -296,17 +309,16 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
        int status = -EAGAIN;
        for (i = 0; i < array->size; i++) {
-                if (i == array->eof_index) {
-                        desc->eof = 1;
-                        status = -EBADCOOKIE;
-                }
                if (array->array[i].cookie == *desc->dir_cookie) {
                        desc->cache_entry_index = i;
-                        status = 0;
+                        return 0;
-                        break;
                }
        }
+        if (array->eof_index >= 0) {
+                status = -EBADCOOKIE;
+                if (*desc->dir_cookie == array->last_cookie)
+                        desc->eof = 1;
+        }
        return status;
 }
@@ -314,10 +326,7 @@ static
 int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
 {
        struct nfs_cache_array *array;
-        int status = -EBADCOOKIE;
+        int status;
-        if (desc->dir_cookie == NULL)
-                goto out;
        array = nfs_readdir_get_array(desc->page);
        if (IS_ERR(array)) {
@@ -330,6 +339,10 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
        else
                status = nfs_readdir_search_for_cookie(array, desc);
+        if (status == -EAGAIN) {
+                desc->last_cookie = array->last_cookie;
+                desc->page_index++;
+        }
        nfs_readdir_release_array(desc->page);
 out:
        return status;
@@ -381,13 +394,9 @@ int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct x
 static
 int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
 {
-        struct nfs_inode *node;
        if (dentry->d_inode == NULL)
                goto different;
-        node = NFS_I(dentry->d_inode);
+        if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0)
-        if (node->fh.size != entry->fh->size)
-                goto different;
-        if (strncmp(node->fh.data, entry->fh->data, node->fh.size) != 0)
                goto different;
        return 1;
 different:
@@ -449,14 +458,15 @@ out:
 /* Perform conversion from xdr to cache array */
 static
-void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
+int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
                                void *xdr_page, struct page *page, unsigned int buflen)
 {
        struct xdr_stream stream;
        struct xdr_buf buf;
        __be32 *ptr = xdr_page;
-        int status;
        struct nfs_cache_array *array;
+        unsigned int count = 0;
+        int status;
        buf.head->iov_base = xdr_page;
        buf.head->iov_len = buflen;
@@ -471,21 +481,32 @@ void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *e
        do {
                status = xdr_decode(desc, entry, &stream);
-                if (status != 0)
+                if (status != 0) {
+                        if (status == -EAGAIN)
+                                status = 0;
                        break;
+                }
-                if (nfs_readdir_add_to_array(entry, page) == -1)
+                count++;
-                        break;
-                if (desc->plus == 1)
+                if (desc->plus != 0)
                        nfs_prime_dcache(desc->file->f_path.dentry, entry);
+                status = nfs_readdir_add_to_array(entry, page);
+                if (status != 0)
+                        break;
        } while (!entry->eof);
-        if (status == -EBADCOOKIE && entry->eof) {
+        if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
                array = nfs_readdir_get_array(page);
-                array->eof_index = array->size - 1;
+                if (!IS_ERR(array)) {
-                status = 0;
+                        array->eof_index = array->size;
-                nfs_readdir_release_array(page);
+                        status = 0;
+                        nfs_readdir_release_array(page);
+                } else
+                        status = PTR_ERR(array);
        }
+        return status;
 }
 static
@@ -537,11 +558,11 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
        struct nfs_entry entry;
        struct file     *file = desc->file;
        struct nfs_cache_array *array;
-        int status = 0;
+        int status = -ENOMEM;
        unsigned int array_size = ARRAY_SIZE(pages);
        entry.prev_cookie = 0;
-        entry.cookie = *desc->dir_cookie;
+        entry.cookie = desc->last_cookie;
        entry.eof = 0;
        entry.fh = nfs_alloc_fhandle();
        entry.fattr = nfs_alloc_fattr();
@@ -549,6 +570,10 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
                goto out;
        array = nfs_readdir_get_array(page);
+        if (IS_ERR(array)) {
+                status = PTR_ERR(array);
+                goto out;
+        }
        memset(array, 0, sizeof(struct nfs_cache_array));
        array->eof_index = -1;
@@ -556,12 +581,19 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
        if (!pages_ptr)
                goto out_release_array;
        do {
+                unsigned int pglen;
                status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode);
                if (status < 0)
                        break;
-                nfs_readdir_page_filler(desc, &entry, pages_ptr, page, array_size * PAGE_SIZE);
+                pglen = status;
-        } while (array->eof_index < 0 && array->size < MAX_READDIR_ARRAY);
+                status = nfs_readdir_page_filler(desc, &entry, pages_ptr, page, pglen);
+                if (status < 0) {
+                        if (status == -ENOSPC)
+                                status = 0;
+                        break;
+                }
+        } while (array->eof_index < 0);
        nfs_readdir_free_large_page(pages_ptr, pages, array_size);
 out_release_array:
@@ -582,8 +614,10 @@ static
 int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
 {
        struct inode    *inode = desc->file->f_path.dentry->d_inode;
+        int ret;
-        if (nfs_readdir_xdr_to_array(desc, page, inode) < 0)
+        ret = nfs_readdir_xdr_to_array(desc, page, inode);
+        if (ret < 0)
                goto error;
        SetPageUptodate(page);
@@ -595,12 +629,14 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
        return 0;
 error:
        unlock_page(page);
-        return -EIO;
+        return ret;
 }
 static
 void cache_page_release(nfs_readdir_descriptor_t *desc)
 {
+        if (!desc->page->mapping)
+                nfs_readdir_clear_array(desc->page);
        page_cache_release(desc->page);
        desc->page = NULL;
 }
@@ -608,12 +644,8 @@ void cache_page_release(nfs_readdir_descriptor_t *desc)
 static
 struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
 {
-        struct page *page;
+        return read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
-        page = read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
                        desc->page_index, (filler_t *)nfs_readdir_filler, desc);
-        if (IS_ERR(page))
-                desc->eof = 1;
-        return page;
 }
 /*
@@ -629,9 +661,8 @@ int find_cache_page(nfs_readdir_descriptor_t *desc)
                return PTR_ERR(desc->page);
        res = nfs_readdir_search_array(desc);
-        if (res == 0)
+        if (res != 0)
-                return 0;
+                cache_page_release(desc);
-        cache_page_release(desc);
        return res;
 }
@@ -639,22 +670,18 @@ int find_cache_page(nfs_readdir_descriptor_t *desc)
 static inline
 int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
 {
-        int res = -EAGAIN;
+        int res;
-        while (1) {
+        if (desc->page_index == 0) {
-                res = find_cache_page(desc);
+                desc->current_index = 0;
-                if (res != -EAGAIN)
+                desc->last_cookie = 0;
-                        break;
-                desc->page_index++;
        }
+        do {
+                res = find_cache_page(desc);
+        } while (res == -EAGAIN);
        return res;
 }
-static inline unsigned int dt_type(struct inode *inode)
-{
-        return (inode->i_mode >> 12) & 15;
-}
 /*
 * Once we've found the start of the dirent within a page: fill 'er up...
 */
@@ -666,35 +693,35 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
        int i = 0;
        int res = 0;
        struct nfs_cache_array *array = NULL;
-        unsigned int d_type = DT_UNKNOWN;
-        struct dentry *dentry = NULL;
        array = nfs_readdir_get_array(desc->page);
+        if (IS_ERR(array)) {
+                res = PTR_ERR(array);
+                goto out;
+        }
        for (i = desc->cache_entry_index; i < array->size; i++) {
-                d_type = DT_UNKNOWN;
+                struct nfs_cache_array_entry *ent;
-                res = filldir(dirent, array->array[i].string.name,
+                ent = &array->array[i];
-                        array->array[i].string.len, file->f_pos,
+                if (filldir(dirent, ent->string.name, ent->string.len,
-                        nfs_compat_user_ino64(array->array[i].ino), d_type);
+                    file->f_pos, nfs_compat_user_ino64(ent->ino),
-                if (res < 0)
+                    ent->d_type) < 0) {
+                        desc->eof = 1;
                        break;
+                }
                file->f_pos++;
-                desc->cache_entry_index = i;
                if (i < (array->size-1))
                        *desc->dir_cookie = array->array[i+1].cookie;
                else
                        *desc->dir_cookie = array->last_cookie;
-                if (i == array->eof_index) {
-                        desc->eof = 1;
-                        break;
-                }
        }
+        if (array->eof_index >= 0)
+                desc->eof = 1;
        nfs_readdir_release_array(desc->page);
+out:
        cache_page_release(desc);
-        if (dentry != NULL)
-                dput(dentry);
        dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
                        (unsigned long long)*desc->dir_cookie, res);
        return res;
@@ -729,13 +756,14 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
                goto out;
        }
-        if (nfs_readdir_xdr_to_array(desc, page, inode) == -1) {
-                status = -EIO;
-                goto out_release;
-        }
        desc->page_index = 0;
+        desc->last_cookie = *desc->dir_cookie;
        desc->page = page;
+        status = nfs_readdir_xdr_to_array(desc, page, inode);
+        if (status < 0)
+                goto out_release;
        status = nfs_do_filldir(desc, dirent, filldir);
 out:
@@ -757,7 +785,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
        struct inode    *inode = dentry->d_inode;
        nfs_readdir_descriptor_t my_desc,
                        *desc = &my_desc;
-        int res = -ENOMEM;
+        int res;
        dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
                        dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -782,18 +810,18 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
        if (res < 0)
                goto out;
-        while (desc->eof != 1) {
+        do {
                res = readdir_search_pagecache(desc);
                if (res == -EBADCOOKIE) {
+                        res = 0;
                        /* This means either end of directory */
                        if (*desc->dir_cookie && desc->eof == 0) {
                                /* Or that the server has 'lost' a cookie */
                                res = uncached_readdir(desc, dirent, filldir);
-                                if (res >= 0)
+                                if (res == 0)
                                        continue;
                        }
-                        res = 0;
                        break;
                }
                if (res == -ETOOSMALL && desc->plus) {
@@ -808,11 +836,9 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
                        break;
                res = nfs_do_filldir(desc, dirent, filldir);
-                if (res < 0) {
+                if (res < 0)
-                        res = 0;
                        break;
-                }
+        } while (!desc->eof);
-        }
 out:
        nfs_unblock_sillyrename(dentry);
        if (res > 0)
@@ -1345,12 +1371,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
                                res = NULL;
                                goto out;
                        /* This turned out not to be a regular file */
-                        case -EISDIR:
                        case -ENOTDIR:
                                goto no_open;
                        case -ELOOP:
                                if (!(nd->intent.open.flags & O_NOFOLLOW))
                                        goto no_open;
+                        /* case -EISDIR: */
                        /* case -EINVAL: */
                        default:
                                res = ERR_CAST(inode);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 84d3c8b9020..e6ace0d93c7 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -867,7 +867,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
                goto out;
        nfs_alloc_commit_data(dreq);
-        if (dreq->commit_data == NULL || count < wsize)
+        if (dreq->commit_data == NULL || count <= wsize)
                sync = NFS_FILE_SYNC;
        dreq->inode = inode;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 60677f9f131..7bf029ef408 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -693,6 +693,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 {
        struct inode *inode = filp->f_mapping->host;
        int status = 0;
+        unsigned int saved_type = fl->fl_type;
        /* Try local locking first */
        posix_test_lock(filp, fl);
@@ -700,6 +701,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
                /* found a conflict */
                goto out;
        }
+        fl->fl_type = saved_type;
        if (nfs_have_delegation(inode, FMODE_READ))
                goto out_noconflict;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 314f5716460..e67e31c7341 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -289,6 +289,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
                } else if (S_ISDIR(inode->i_mode)) {
                        inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
                        inode->i_fop = &nfs_dir_operations;
+                        inode->i_data.a_ops = &nfs_dir_aops;
                        if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS))
                                set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
                        /* Deal with crossing mountpoints */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index db08ff3ff45..e6356b750b7 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -362,6 +362,15 @@ unsigned int nfs_page_length(struct page *page)
 }
 /*
+ * Convert a umode to a dirent->d_type
+ */
+static inline
+unsigned char nfs_umode_to_dtype(umode_t mode)
+{
+        return (mode >> 12) & 15;
+}
+/*
 * Determine the number of pages in an array of length 'len' and
 * with a base offset of 'base'
 */
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index eceafe74f47..4f981f1f668 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -505,13 +505,13 @@ static struct rpc_procinfo mnt3_procedures[] = {
 static struct rpc_version mnt_version1 = {
        .number         = 1,
-        .nrprocs        = 2,
+        .nrprocs        = ARRAY_SIZE(mnt_procedures),
        .procs          = mnt_procedures,
 };
 static struct rpc_version mnt_version3 = {
        .number         = 3,
-        .nrprocs        = 2,
+        .nrprocs        = ARRAY_SIZE(mnt3_procedures),
        .procs          = mnt3_procedures,
 };
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index e6bf45710cc..5914a1911c9 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -423,7 +423,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
        struct page **page;
        size_t hdrlen;
        unsigned int pglen, recvd;
-        int status, nr = 0;
+        int status;
        if ((status = ntohl(*p++)))
                return nfs_stat_to_errno(status);
@@ -443,7 +443,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
        if (pglen > recvd)
                pglen = recvd;
        page = rcvbuf->pages;
-        return nr;
+        return pglen;
 }
 static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
@@ -485,6 +485,8 @@ nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_se
        entry->prev_cookie        = entry->cookie;
        entry->cookie     = ntohl(*p++);
+        entry->d_type = DT_UNKNOWN;
        p = xdr_inline_peek(xdr, 8);
        if (p != NULL)
                entry->eof = !p[0] && p[1];
@@ -495,7 +497,7 @@ nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_se
 out_overflow:
        print_overflow_msg(__func__, xdr);
-        return ERR_PTR(-EIO);
+        return ERR_PTR(-EAGAIN);
 }
 /*
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index d9a5e832c25..f6cc60f06da 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -555,7 +555,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
        struct page **page;
        size_t hdrlen;
        u32 recvd, pglen;
-        int status, nr = 0;
+        int status;
        status = ntohl(*p++);
        /* Decode post_op_attrs */
@@ -586,7 +586,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
                pglen = recvd;
        page = rcvbuf->pages;
-        return nr;
+        return pglen;
 }
 __be32 *
@@ -622,11 +622,13 @@ nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_s
        entry->prev_cookie = entry->cookie;
        p = xdr_decode_hyper(p, &entry->cookie);
+        entry->d_type = DT_UNKNOWN;
        if (plus) {
                entry->fattr->valid = 0;
                p = xdr_decode_post_op_attr_stream(xdr, entry->fattr);
                if (IS_ERR(p))
                        goto out_overflow_exit;
+                entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
                /* In fact, a post_op_fh3: */
                p = xdr_inline_decode(xdr, 4);
                if (unlikely(!p))
@@ -656,7 +658,7 @@ nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_s
 out_overflow:
        print_overflow_msg(__func__, xdr);
 out_overflow_exit:
-        return ERR_PTR(-EIO);
+        return ERR_PTR(-EAGAIN);
 }
 /*
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0f24cdf2cb1..4435e5e1f90 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2852,8 +2852,10 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
        nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args);
        res.pgbase = args.pgbase;
        status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0);
-        if (status == 0)
+        if (status >= 0) {
                memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE);
+                status += args.pgbase;
+        }
        nfs_invalidate_atime(dir);
@@ -3359,6 +3361,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
        ret = nfs_revalidate_inode(server, inode);
        if (ret < 0)
                return ret;
+        if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
+                nfs_zap_acl_cache(inode);
        ret = nfs4_read_cached_acl(inode, buf, buflen);
        if (ret != -ENOENT)
                return ret;
@@ -3387,6 +3391,13 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
        nfs_inode_return_delegation(inode);
        buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
        ret = nfs4_call_sync(server, &msg, &arg, &res, 1);
+        /*
+         * An ACL update can result in an inode attribute update,
+         * so mark the attribute cache invalid.
+         */
+        spin_lock(&inode->i_lock);
+        NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR;
+        spin_unlock(&inode->i_lock);
        nfs_access_zap_cache(inode);
        nfs_zap_acl_cache(inode);
        return ret;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index f313c4cce7e..9f1826b012e 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4518,7 +4518,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
        xdr_read_pages(xdr, pglen);
-        return 0;
+        return pglen;
 }
 static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
@@ -6208,6 +6208,10 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
        if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
                entry->ino = entry->fattr->fileid;
+        entry->d_type = DT_UNKNOWN;
+        if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE)
+                entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
        if (verify_attr_len(xdr, p, len) < 0)
                goto out_overflow;
@@ -6221,7 +6225,7 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
 out_overflow:
        print_overflow_msg(__func__, xdr);
-        return ERR_PTR(-EIO);
+        return ERR_PTR(-EAGAIN);
 }
 /*
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 137b549e63d..b68536cc904 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -115,7 +115,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
 {
        if (!nfs_lock_request_dontget(req))
                return 0;
-        if (req->wb_page != NULL)
+        if (test_bit(PG_MAPPED, &req->wb_flags))
                radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
        return 1;
 }
@@ -125,7 +125,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
 */
 void nfs_clear_page_tag_locked(struct nfs_page *req)
 {
-        if (req->wb_page != NULL) {
+        if (test_bit(PG_MAPPED, &req->wb_flags)) {
                struct inode *inode = req->wb_context->path.dentry->d_inode;
                struct nfs_inode *nfsi = NFS_I(inode);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index e4b62c6f5a6..aedcaa7f291 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -152,7 +152,6 @@ static void nfs_readpage_release(struct nfs_page *req)
                        (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
                        req->wb_bytes,
                        (long long)req_offset(req));
-        nfs_clear_request(req);
        nfs_release_request(req);
 }
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0a42e8f4adc..4100630c9a5 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -39,7 +39,6 @@
 #include <linux/nfs_mount.h>
 #include <linux/nfs4_mount.h>
 #include <linux/lockd/bind.h>
-#include <linux/smp_lock.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
 #include <linux/mnt_namespace.h>
@@ -67,6 +66,12 @@
 #define NFSDBG_FACILITY         NFSDBG_VFS
+#ifdef CONFIG_NFS_V3
+#define NFS_DEFAULT_VERSION 3
+#else
+#define NFS_DEFAULT_VERSION 2
+#endif
 enum {
        /* Mount options that take no arguments */
        Opt_soft, Opt_hard,
@@ -1064,12 +1069,10 @@ static int nfs_parse_mount_options(char *raw,
                        mnt->flags |= NFS_MOUNT_VER3;
                        mnt->version = 3;
                        break;
-#ifdef CONFIG_NFS_V4
                case Opt_v4:
                        mnt->flags &= ~NFS_MOUNT_VER3;
                        mnt->version = 4;
                        break;
-#endif
                case Opt_udp:
                        mnt->flags &= ~NFS_MOUNT_TCP;
                        mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
@@ -1281,12 +1284,10 @@ static int nfs_parse_mount_options(char *raw,
                                mnt->flags |= NFS_MOUNT_VER3;
                                mnt->version = 3;
                                break;
-#ifdef CONFIG_NFS_V4
                        case NFS4_VERSION:
                                mnt->flags &= ~NFS_MOUNT_VER3;
                                mnt->version = 4;
                                break;
-#endif
                        default:
                                goto out_invalid_value;
                        }
@@ -2277,7 +2278,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
        };
        int error = -ENOMEM;
-        data = nfs_alloc_parsed_mount_data(3);
+        data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION);
        mntfh = nfs_alloc_fhandle();
        if (data == NULL || mntfh == NULL)
                goto out_free_fh;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 4c14c17a527..10d648ea128 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -390,6 +390,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
                if (nfs_have_delegation(inode, FMODE_WRITE))
                        nfsi->change_attr++;
        }
+        set_bit(PG_MAPPED, &req->wb_flags);
        SetPagePrivate(req->wb_page);
        set_page_private(req->wb_page, (unsigned long)req);
        nfsi->npages++;
@@ -415,6 +416,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
        spin_lock(&inode->i_lock);
        set_page_private(req->wb_page, 0);
        ClearPagePrivate(req->wb_page);
+        clear_bit(PG_MAPPED, &req->wb_flags);
        radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
        nfsi->npages--;
        if (!nfsi->npages) {
@@ -422,7 +424,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
                iput(inode);
        } else
                spin_unlock(&inode->i_lock);
-        nfs_clear_request(req);
        nfs_release_request(req);
 }
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 2a533a0af2a..7e84a852cda 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -260,9 +260,11 @@ void fill_post_wcc(struct svc_fh *fhp)
        err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry,
                        &fhp->fh_post_attr);
        fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version;
-        if (err)
+        if (err) {
                fhp->fh_post_saved = 0;
-        else
+                /* Grab the ctime anyway - set_change_info might use it */
+                fhp->fh_post_attr.ctime = fhp->fh_dentry->d_inode->i_ctime;
+        } else
                fhp->fh_post_saved = 1;
 }
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index ad2bfa68d53..116cab970e0 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2262,7 +2262,7 @@ nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access)
 * Spawn a thread to perform a recall on the delegation represented
 * by the lease (file_lock)
 *
- * Called from break_lease() with lock_kernel() held.
+ * Called from break_lease() with lock_flocks() held.
 * Note: we assume break_lease will only call this *once* for any given
 * lease.
 */
@@ -2286,7 +2286,7 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
        list_add_tail(&dp->dl_recall_lru, &del_recall_lru);
        spin_unlock(&recall_lock);
-        /* only place dl_time is set. protected by lock_kernel*/
+        /* only place dl_time is set. protected by lock_flocks*/
        dp->dl_time = get_seconds();
        /*
@@ -2303,7 +2303,7 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
 /*
 * The file_lock is being reapd.
 *
- * Called by locks_free_lock() with lock_kernel() held.
+ * Called by locks_free_lock() with lock_flocks() held.
 */
 static
 void nfsd_release_deleg_cb(struct file_lock *fl)
@@ -2318,7 +2318,7 @@ void nfsd_release_deleg_cb(struct file_lock *fl)
 }
 /*
- * Called from setlease() with lock_kernel() held
+ * Called from setlease() with lock_flocks() held
 */
 static
 int nfsd_same_client_deleg_cb(struct file_lock *onlist, struct file_lock *try)
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 4d476ff08ae..60fce3dc5cb 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -484,18 +484,17 @@ static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp)
 static inline void
 set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
 {
-        BUG_ON(!fhp->fh_pre_saved || !fhp->fh_post_saved);
+        BUG_ON(!fhp->fh_pre_saved);
-        cinfo->atomic = 1;
+        cinfo->atomic = fhp->fh_post_saved;
        cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode);
-        if (cinfo->change_supported) {
-                cinfo->before_change = fhp->fh_pre_change;
+        cinfo->before_change = fhp->fh_pre_change;
-                cinfo->after_change = fhp->fh_post_change;
+        cinfo->after_change = fhp->fh_post_change;
-        } else {
+        cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec;
-                cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec;
+        cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec;
-                cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec;
+        cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec;
-                cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec;
+        cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec;
-                cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec;
-        }
 }
 int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *);
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 49c844dab33..59e5fe742f7 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -335,7 +335,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
         * the device at this point.
         *
         * To prevent nilfs_dat_translate() from returning the
-         * uncommited block number, this makes a copy of the entry
+         * uncommitted block number, this makes a copy of the entry
         * buffer and redirects nilfs_dat_translate() to the copy.
         */
        if (!buffer_nilfs_redirected(entry_bh)) {
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 33ad25ddd5c..caf9a6a3fb5 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -176,7 +176,6 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
 int nilfs_init_gcinode(struct inode *inode)
 {
        struct nilfs_inode_info *ii = NILFS_I(inode);
-        struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
        inode->i_mode = S_IFREG;
        mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
@@ -186,14 +185,6 @@ int nilfs_init_gcinode(struct inode *inode)
        ii->i_flags = 0;
        nilfs_bmap_init_gc(ii->i_bmap);
-        /*
-         * Add the inode to GC inode list. Garbage Collection
-         * is serialized and no two processes manipulate the
-         * list simultaneously.
-         */
-        igrab(inode);
-        list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
        return 0;
 }
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 3e90f86d5bf..b185e937a33 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -337,6 +337,7 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb,
                                   struct nilfs_argv *argv, void *buf)
 {
        size_t nmembs = argv->v_nmembs;
+        struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
        struct inode *inode;
        struct nilfs_vdesc *vdesc;
        struct buffer_head *bh, *n;
@@ -349,10 +350,21 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb,
                ino = vdesc->vd_ino;
                cno = vdesc->vd_cno;
                inode = nilfs_iget_for_gc(sb, ino, cno);
-                if (unlikely(inode == NULL)) {
+                if (IS_ERR(inode)) {
-                        ret = -ENOMEM;
+                        ret = PTR_ERR(inode);
                        goto failed;
                }
+                if (list_empty(&NILFS_I(inode)->i_dirty)) {
+                        /*
+                         * Add the inode to GC inode list. Garbage Collection
+                         * is serialized and no two processes manipulate the
+                         * list simultaneously.
+                         */
+                        igrab(inode);
+                        list_add(&NILFS_I(inode)->i_dirty,
+                                 &nilfs->ns_gc_inodes);
+                }
                do {
                        ret = nilfs_ioctl_move_inode_block(inode, vdesc,
                                                           &buffers);
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index b04f88eed09..f35794b97e8 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -92,7 +92,11 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
        pr_debug("%s: group=%p event=%p\n", __func__, group, event);
-        wait_event(group->fanotify_data.access_waitq, event->response);
+        wait_event(group->fanotify_data.access_waitq, event->response ||
+                                atomic_read(&group->fanotify_data.bypass_perm));
+        if (!event->response) /* bypass_perm set */
+                return 0;
        /* userspace responded, convert to something usable */
        spin_lock(&event->lock);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 063224812b7..8b61220cffc 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -106,20 +106,29 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
        return client_fd;
 }
-static ssize_t fill_event_metadata(struct fsnotify_group *group,
+static int fill_event_metadata(struct fsnotify_group *group,
                                   struct fanotify_event_metadata *metadata,
                                   struct fsnotify_event *event)
 {
+        int ret = 0;
        pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
                 group, metadata, event);
        metadata->event_len = FAN_EVENT_METADATA_LEN;
+        metadata->metadata_len = FAN_EVENT_METADATA_LEN;
        metadata->vers = FANOTIFY_METADATA_VERSION;
        metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
        metadata->pid = pid_vnr(event->tgid);
-        metadata->fd = create_fd(group, event);
+        if (unlikely(event->mask & FAN_Q_OVERFLOW))
+                metadata->fd = FAN_NOFD;
+        else {
+                metadata->fd = create_fd(group, event);
+                if (metadata->fd < 0)
+                        ret = metadata->fd;
+        }
-        return metadata->fd;
+        return ret;
 }
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
@@ -200,7 +209,7 @@ static int prepare_for_access_response(struct fsnotify_group *group,
        mutex_lock(&group->fanotify_data.access_mutex);
-        if (group->fanotify_data.bypass_perm) {
+        if (atomic_read(&group->fanotify_data.bypass_perm)) {
                mutex_unlock(&group->fanotify_data.access_mutex);
                kmem_cache_free(fanotify_response_event_cache, re);
                event->response = FAN_ALLOW;
@@ -257,24 +266,34 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
        pr_debug("%s: group=%p event=%p\n", __func__, group, event);
-        fd = fill_event_metadata(group, &fanotify_event_metadata, event);
+        ret = fill_event_metadata(group, &fanotify_event_metadata, event);
-        if (fd < 0)
+        if (ret < 0)
-                return fd;
+                goto out;
+        fd = fanotify_event_metadata.fd;
        ret = prepare_for_access_response(group, event, fd);
        if (ret)
                goto out_close_fd;
        ret = -EFAULT;
-        if (copy_to_user(buf, &fanotify_event_metadata, FAN_EVENT_METADATA_LEN))
+        if (copy_to_user(buf, &fanotify_event_metadata,
+                         fanotify_event_metadata.event_len))
                goto out_kill_access_response;
-        return FAN_EVENT_METADATA_LEN;
+        return fanotify_event_metadata.event_len;
 out_kill_access_response:
        remove_access_response(group, event, fd);
 out_close_fd:
-        sys_close(fd);
+        if (fd != FAN_NOFD)
+                sys_close(fd);
+out:
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+        if (event->mask & FAN_ALL_PERM_EVENTS) {
+                event->response = FAN_DENY;
+                wake_up(&group->fanotify_data.access_waitq);
+        }
+#endif
        return ret;
 }
@@ -382,7 +401,7 @@ static int fanotify_release(struct inode *ignored, struct file *file)
        mutex_lock(&group->fanotify_data.access_mutex);
-        group->fanotify_data.bypass_perm = true;
+        atomic_inc(&group->fanotify_data.bypass_perm);
        list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) {
                pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group,
@@ -586,11 +605,10 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
 {
        struct fsnotify_mark *fsn_mark;
        __u32 added;
+        int ret = 0;
        fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
        if (!fsn_mark) {
-                int ret;
                if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
                        return -ENOSPC;
@@ -600,17 +618,16 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
                fsnotify_init_mark(fsn_mark, fanotify_free_mark);
                ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0);
-                if (ret) {
+                if (ret)
-                        fanotify_free_mark(fsn_mark);
+                        goto err;
-                        return ret;
-                }
        }
        added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
-        fsnotify_put_mark(fsn_mark);
        if (added & ~mnt->mnt_fsnotify_mask)
                fsnotify_recalc_vfsmount_mask(mnt);
+err:
-        return 0;
+        fsnotify_put_mark(fsn_mark);
+        return ret;
 }
 static int fanotify_add_inode_mark(struct fsnotify_group *group,
@@ -619,6 +636,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
 {
        struct fsnotify_mark *fsn_mark;
        __u32 added;
+        int ret = 0;
        pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
@@ -634,8 +652,6 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
        fsn_mark = fsnotify_find_inode_mark(group, inode);
        if (!fsn_mark) {
-                int ret;
                if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
                        return -ENOSPC;
@@ -645,16 +661,16 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
                fsnotify_init_mark(fsn_mark, fanotify_free_mark);
                ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0);
-                if (ret) {
+                if (ret)
-                        fanotify_free_mark(fsn_mark);
+                        goto err;
-                        return ret;
-                }
        }
        added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
-        fsnotify_put_mark(fsn_mark);
        if (added & ~inode->i_fsnotify_mask)
                fsnotify_recalc_inode_mask(inode);
-        return 0;
+err:
+        fsnotify_put_mark(fsn_mark);
+        return ret;
 }
 /* fanotify syscalls */
@@ -687,8 +703,10 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
        /* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
        group = fsnotify_alloc_group(&fanotify_fsnotify_ops);
-        if (IS_ERR(group))
+        if (IS_ERR(group)) {
+                free_uid(user);
                return PTR_ERR(group);
+        }
        group->fanotify_data.user = user;
        atomic_inc(&user->fanotify_listeners);
@@ -698,6 +716,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
        mutex_init(&group->fanotify_data.access_mutex);
        init_waitqueue_head(&group->fanotify_data.access_waitq);
        INIT_LIST_HEAD(&group->fanotify_data.access_list);
+        atomic_set(&group->fanotify_data.bypass_perm, 0);
 #endif
        switch (flags & FAN_ALL_CLASS_BITS) {
        case FAN_CLASS_NOTIF:
@@ -764,8 +783,10 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
        if (flags & ~FAN_ALL_MARK_FLAGS)
                return -EINVAL;
        switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
-        case FAN_MARK_ADD:
+        case FAN_MARK_ADD:              /* fallthrough */
        case FAN_MARK_REMOVE:
+                if (!mask)
+                        return -EINVAL;
        case FAN_MARK_FLUSH:
                break;
        default:
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 444c305a468..4cd5d5d78f9 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -752,6 +752,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
        if (ret >= 0)
                return ret;
+        fsnotify_put_group(group);
        atomic_dec(&user->inotify_devs);
 out_free_uid:
        free_uid(user);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f1e962cb3b7..0d7c5540ad6 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -573,11 +573,14 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
        /* this io's submitter should not have unlocked this before we could */
        BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+        if (ocfs2_iocb_is_sem_locked(iocb)) {
+                up_read(&inode->i_alloc_sem);
+                ocfs2_iocb_clear_sem_locked(iocb);
+        }
        ocfs2_iocb_clear_rw_locked(iocb);
        level = ocfs2_iocb_rw_locked_level(iocb);
-        if (!level)
-                up_read(&inode->i_alloc_sem);
        ocfs2_rw_unlock(inode, level);
        if (is_async)
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 76bfdfda691..eceb456037c 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -68,8 +68,27 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
        else
                clear_bit(1, (unsigned long *)&iocb->private);
 }
+/*
+ * Using a named enum representing lock types in terms of #N bit stored in
+ * iocb->private, which is going to be used for communication between
+ * ocfs2_dio_end_io() and ocfs2_file_aio_write/read().
+ */
+enum ocfs2_iocb_lock_bits {
+        OCFS2_IOCB_RW_LOCK = 0,
+        OCFS2_IOCB_RW_LOCK_LEVEL,
+        OCFS2_IOCB_SEM,
+        OCFS2_IOCB_NUM_LOCKS
+};
 #define ocfs2_iocb_clear_rw_locked(iocb) \
-        clear_bit(0, (unsigned long *)&iocb->private)
+        clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private)
 #define ocfs2_iocb_rw_locked_level(iocb) \
-        test_bit(1, (unsigned long *)&iocb->private)
+        test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_set_sem_locked(iocb) \
+        set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_clear_sem_locked(iocb) \
+        clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_is_sem_locked(iocb) \
+        test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
 #endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 52c7557f3e2..9f26ac9be2a 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1964,8 +1964,10 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
        if (reg == NULL)
                return ERR_PTR(-ENOMEM);
-        if (strlen(name) > O2HB_MAX_REGION_NAME_LEN)
+        if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) {
-                return ERR_PTR(-ENAMETOOLONG);
+                ret = -ENAMETOOLONG;
+                goto free;
+        }
        spin_lock(&o2hb_live_lock);
        reg->hr_region_num = 0;
@@ -1974,7 +1976,8 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
                                                         O2NM_MAX_REGIONS);
                if (reg->hr_region_num >= O2NM_MAX_REGIONS) {
                        spin_unlock(&o2hb_live_lock);
-                        return ERR_PTR(-EFBIG);
+                        ret = -EFBIG;
+                        goto free;
                }
                set_bit(reg->hr_region_num, o2hb_region_bitmap);
        }
@@ -1986,10 +1989,13 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
        ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
        if (ret) {
                config_item_put(&reg->hr_item);
-                return ERR_PTR(ret);
+                goto free;
        }
        return &reg->hr_item;
+free:
+        kfree(reg);
+        return ERR_PTR(ret);
 }
 static void o2hb_heartbeat_group_drop_item(struct config_group *group,
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index c7fba396392..6c61771469a 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -113,10 +113,11 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
        define_mask(QUOTA),
        define_mask(REFCOUNT),
        define_mask(BASTS),
+        define_mask(RESERVATIONS),
+        define_mask(CLUSTER),
        define_mask(ERROR),
        define_mask(NOTICE),
        define_mask(KTHREAD),
-        define_mask(RESERVATIONS),
 };
 static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index ea2ed9f56c9..34d6544357d 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -81,7 +81,7 @@
 #include <linux/sched.h>
 /* bits that are frequently given and infrequently matched in the low word */
-/* NOTE: If you add a flag, you need to also update mlog.c! */
+/* NOTE: If you add a flag, you need to also update masklog.c! */
 #define ML_ENTRY        0x0000000000000001ULL /* func call entry */
 #define ML_EXIT         0x0000000000000002ULL /* func call exit */
 #define ML_TCP          0x0000000000000004ULL /* net cluster/tcp.c */
@@ -114,13 +114,14 @@
 #define ML_XATTR        0x0000000020000000ULL /* ocfs2 extended attributes */
 #define ML_QUOTA        0x0000000040000000ULL /* ocfs2 quota operations */
 #define ML_REFCOUNT     0x0000000080000000ULL /* refcount tree operations */
-#define ML_BASTS        0x0000001000000000ULL /* dlmglue asts and basts */
+#define ML_BASTS        0x0000000100000000ULL /* dlmglue asts and basts */
+#define ML_RESERVATIONS 0x0000000200000000ULL /* ocfs2 alloc reservations */
+#define ML_CLUSTER      0x0000000400000000ULL /* cluster stack */
 /* bits that are infrequently given and frequently matched in the high word */
-#define ML_ERROR        0x0000000100000000ULL /* sent to KERN_ERR */
+#define ML_ERROR        0x1000000000000000ULL /* sent to KERN_ERR */
-#define ML_NOTICE       0x0000000200000000ULL /* setn to KERN_NOTICE */
+#define ML_NOTICE       0x2000000000000000ULL /* sent to KERN_NOTICE */
-#define ML_KTHREAD      0x0000000400000000ULL /* kernel thread activity */
+#define ML_KTHREAD      0x4000000000000000ULL /* kernel thread activity */
-#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */
-#define ML_CLUSTER      0x0000001000000000ULL /* cluster stack */
 #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
 #define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index edaded48e7e..895532ac4d9 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -476,7 +476,6 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode)
 out:
        iput(inode);
-        ocfs2_dentry_attach_gen(dentry);
 }
 /*
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c49f6de0e7a..d417b3f9b0c 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2461,8 +2461,10 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
        di->i_dx_root = cpu_to_le64(dr_blkno);
+        spin_lock(&OCFS2_I(dir)->ip_lock);
        OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
        di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
+        spin_unlock(&OCFS2_I(dir)->ip_lock);
        ocfs2_journal_dirty(handle, di_bh);
@@ -4466,8 +4468,10 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
                goto out_commit;
        }
+        spin_lock(&OCFS2_I(dir)->ip_lock);
        OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
        di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
+        spin_unlock(&OCFS2_I(dir)->ip_lock);
        di->i_dx_root = cpu_to_le64(0ULL);
        ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 58a93b95373..cc2aaa96cfe 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -959,7 +959,7 @@ static int dlm_match_regions(struct dlm_ctxt *dlm,
                r += O2HB_MAX_REGION_NAME_LEN;
        }
-        local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL);
+        local = kmalloc(sizeof(qr->qr_regions), GFP_ATOMIC);
        if (!local) {
                status = -ENOMEM;
                goto bail;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index f564b0e5f80..59f0f6bdfc6 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2346,7 +2346,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
 */
 static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
                                      struct dlm_lock_resource *res,
-                                      int *numlocks)
+                                      int *numlocks,
+                                      int *hasrefs)
 {
        int ret;
        int i;
@@ -2356,6 +2357,9 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
        assert_spin_locked(&res->spinlock);
+        *numlocks = 0;
+        *hasrefs = 0;
        ret = -EINVAL;
        if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
                mlog(0, "cannot migrate lockres with unknown owner!\n");
@@ -2386,7 +2390,13 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
        }
        *numlocks = count;
-        mlog(0, "migrateable lockres having %d locks\n", *numlocks);
+        count = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+        if (count < O2NM_MAX_NODES)
+                *hasrefs = 1;
+        mlog(0, "%s: res %.*s, Migrateable, locks %d, refs %d\n", dlm->name,
+             res->lockname.len, res->lockname.name, *numlocks, *hasrefs);
 leave:
        return ret;
@@ -2408,7 +2418,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
        const char *name;
        unsigned int namelen;
        int mle_added = 0;
-        int numlocks;
+        int numlocks, hasrefs;
        int wake = 0;
        if (!dlm_grab(dlm))
@@ -2417,13 +2427,13 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
        name = res->lockname.name;
        namelen = res->lockname.len;
-        mlog(0, "migrating %.*s to %u\n", namelen, name, target);
+        mlog(0, "%s: Migrating %.*s to %u\n", dlm->name, namelen, name, target);
        /*
         * ensure this lockres is a proper candidate for migration
         */
        spin_lock(&res->spinlock);
-        ret = dlm_is_lockres_migrateable(dlm, res, &numlocks);
+        ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
        if (ret < 0) {
                spin_unlock(&res->spinlock);
                goto leave;
@@ -2431,10 +2441,8 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
        spin_unlock(&res->spinlock);
        /* no work to do */
-        if (numlocks == 0) {
+        if (numlocks == 0 && !hasrefs)
-                mlog(0, "no locks were found on this lockres! done!\n");
                goto leave;
-        }
        /*
         * preallocate up front
@@ -2459,14 +2467,14 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
         * find a node to migrate the lockres to
         */
-        mlog(0, "picking a migration node\n");
        spin_lock(&dlm->spinlock);
        /* pick a new node */
        if (!test_bit(target, dlm->domain_map) ||
            target >= O2NM_MAX_NODES) {
                target = dlm_pick_migration_target(dlm, res);
        }
-        mlog(0, "node %u chosen for migration\n", target);
+        mlog(0, "%s: res %.*s, Node %u chosen for migration\n", dlm->name,
+             namelen, name, target);
        if (target >= O2NM_MAX_NODES ||
            !test_bit(target, dlm->domain_map)) {
@@ -2667,7 +2675,7 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
 {
        int ret;
        int lock_dropped = 0;
-        int numlocks;
+        int numlocks, hasrefs;
        spin_lock(&res->spinlock);
        if (res->owner != dlm->node_num) {
@@ -2681,8 +2689,8 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
        }
        /* No need to migrate a lockres having no locks */
-        ret = dlm_is_lockres_migrateable(dlm, res, &numlocks);
+        ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
-        if (ret >= 0 && numlocks == 0) {
+        if (ret >= 0 && numlocks == 0 && !hasrefs) {
                spin_unlock(&res->spinlock);
                goto leave;
        }
@@ -2915,6 +2923,12 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
                }
                queue++;
        }
+        nodenum = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+        if (nodenum < O2NM_MAX_NODES) {
+                spin_unlock(&res->spinlock);
+                return nodenum;
+        }
        spin_unlock(&res->spinlock);
        mlog(0, "have not found a suitable target yet! checking domain map\n");
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 77b4c04a280..f6cba566429 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2241,11 +2241,15 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
        mutex_lock(&inode->i_mutex);
+        ocfs2_iocb_clear_sem_locked(iocb);
 relock:
        /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
        if (direct_io) {
                down_read(&inode->i_alloc_sem);
                have_alloc_sem = 1;
+                /* communicate with ocfs2_dio_end_io */
+                ocfs2_iocb_set_sem_locked(iocb);
        }
        /*
@@ -2382,8 +2386,10 @@ out:
                ocfs2_rw_unlock(inode, rw_level);
 out_sems:
-        if (have_alloc_sem)
+        if (have_alloc_sem) {
                up_read(&inode->i_alloc_sem);
+                ocfs2_iocb_clear_sem_locked(iocb);
+        }
        mutex_unlock(&inode->i_mutex);
@@ -2527,6 +2533,8 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
                goto bail;
        }
+        ocfs2_iocb_clear_sem_locked(iocb);
        /*
         * buffered reads protect themselves in ->readpage().  O_DIRECT reads
         * need locks to protect pending reads from racing with truncate.
@@ -2534,6 +2542,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
        if (filp->f_flags & O_DIRECT) {
                down_read(&inode->i_alloc_sem);
                have_alloc_sem = 1;
+                ocfs2_iocb_set_sem_locked(iocb);
                ret = ocfs2_rw_lock(inode, 0);
                if (ret < 0) {
@@ -2575,8 +2584,10 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
        }
 bail:
-        if (have_alloc_sem)
+        if (have_alloc_sem) {
                up_read(&inode->i_alloc_sem);
+                ocfs2_iocb_clear_sem_locked(iocb);
+        }
        if (rw_level != -1)
                ocfs2_rw_unlock(inode, rw_level);
        mlog_exit(ret);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index d8408217e3b..70dd3b1798f 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -159,7 +159,9 @@ struct ocfs2_lock_res {
        char                     l_name[OCFS2_LOCK_ID_MAX_LEN];
        unsigned int             l_ro_holders;
        unsigned int             l_ex_holders;
-        unsigned char            l_level;
+        signed char              l_level;
+        signed char              l_requested;
+        signed char              l_blocking;
        /* Data packed - type enum ocfs2_lock_type */
        unsigned char            l_type;
@@ -169,8 +171,6 @@ struct ocfs2_lock_res {
        unsigned char            l_action;
        /* Data packed - enum type ocfs2_unlock_action */
        unsigned char            l_unlock_action;
-        unsigned char            l_requested;
-        unsigned char            l_blocking;
        unsigned int             l_pending_gen;
        spinlock_t               l_lock;
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index c2e4f8222e2..bf2e7764920 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -350,7 +350,7 @@ enum {
 #define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE
        NUM_SYSTEM_INODES
 };
-#define NUM_GLOBAL_SYSTEM_INODES OCFS2_LAST_GLOBAL_SYSTEM_INODE
+#define NUM_GLOBAL_SYSTEM_INODES OCFS2_FIRST_LOCAL_SYSTEM_INODE
 #define NUM_LOCAL_SYSTEM_INODES \
                (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE)
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 252e7c82f92..a5ebe421195 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -190,7 +190,7 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
                        return c;
        }
-        return c;
+        return NULL;
 }
 /*
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index f02c0ef3157..cfeab7ce369 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -41,7 +41,6 @@
 #include <linux/mount.h>
 #include <linux/seq_file.h>
 #include <linux/quotaops.h>
-#include <linux/smp_lock.h>
 #define MLOG_MASK_PREFIX ML_SUPER
 #include <cluster/masklog.h>
diff --git a/fs/pipe.c b/fs/pipe.c
index a8012a95572..04629f36e39 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1199,12 +1199,24 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
        return ret;
 }
+/*
+ * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
+ * location, so checking ->i_pipe is not enough to verify that this is a
+ * pipe.
+ */
+struct pipe_inode_info *get_pipe_info(struct file *file)
+{
+        struct inode *i = file->f_path.dentry->d_inode;
+        return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL;
+}
 long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
 {
        struct pipe_inode_info *pipe;
        long ret;
-        pipe = file->f_path.dentry->d_inode->i_pipe;
+        pipe = get_pipe_info(file);
        if (!pipe)
                return -EBADF;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index f3d02ca461e..182845147fe 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1574,7 +1574,7 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
        if (!tmp)
                return -ENOMEM;
-        pathname = d_path_with_unreachable(path, tmp, PAGE_SIZE);
+        pathname = d_path(path, tmp, PAGE_SIZE);
        len = PTR_ERR(pathname);
        if (IS_ERR(pathname))
                goto out;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 9c2b5f48487..3ddb6068177 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -16,7 +16,6 @@
 #include <linux/limits.h>
 #include <linux/init.h>
 #include <linux/module.h>
-#include <linux/smp_lock.h>
 #include <linux/sysctl.h>
 #include <linux/slab.h>
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index da6b01d70f0..c126c83b9a4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -706,6 +706,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
 * skip over unmapped regions.
 */
 #define PAGEMAP_WALK_SIZE       (PMD_SIZE)
+#define PAGEMAP_WALK_MASK       (PMD_MASK)
 static ssize_t pagemap_read(struct file *file, char __user *buf,
                            size_t count, loff_t *ppos)
 {
@@ -776,7 +777,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
                unsigned long end;
                pm.pos = 0;
-                end = start_vaddr + PAGEMAP_WALK_SIZE;
+                end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
                /* overflow ? */
                if (end < start_vaddr || end > end_vaddr)
                        end = end_vaddr;
diff --git a/fs/read_write.c b/fs/read_write.c
index 431a0ed610c..5d431bacbea 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -9,7 +9,6 @@
 #include <linux/fcntl.h>
 #include <linux/file.h>
 #include <linux/uio.h>
-#include <linux/smp_lock.h>
 #include <linux/fsnotify.h>
 #include <linux/security.h>
 #include <linux/module.h>
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 41656d40dc5..0bae036831e 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -8,7 +8,6 @@
 #include <linux/reiserfs_acl.h>
 #include <linux/reiserfs_xattr.h>
 #include <linux/exportfs.h>
-#include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
 #include <linux/slab.h>
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index adf22b485ce..79265fdc317 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -9,7 +9,6 @@
 #include <linux/time.h>
 #include <asm/uaccess.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 #include <linux/compat.h>
 /*
@@ -184,12 +183,11 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
                return 0;
        }
-        /* we need to make sure nobody is changing the file size beneath
-         ** us
-         */
-        reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
        depth = reiserfs_write_lock_once(inode->i_sb);
+        /* we need to make sure nobody is changing the file size beneath us */
+        reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
        write_from = inode->i_size & (blocksize - 1);
        /* if we are on a block boundary, we are already unpacked.  */
        if (write_from == 0) {
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 076c8b19468..d31bce1a9f9 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -43,7 +43,6 @@
 #include <linux/fcntl.h>
 #include <linux/stat.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/workqueue.h>
 #include <linux/writeback.h>
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 3bf7a6457f4..b243117b875 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -28,7 +28,6 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/crc32.h>
-#include <linux/smp_lock.h>
 struct file_system_type reiserfs_fs_type;
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 536d697a8a2..90d2fcb67a3 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -472,7 +472,9 @@ int reiserfs_acl_chmod(struct inode *inode)
                struct reiserfs_transaction_handle th;
                size_t size = reiserfs_xattr_nblocks(inode,
                                             reiserfs_acl_size(clone->a_count));
-                reiserfs_write_lock(inode->i_sb);
+                int depth;
+                depth = reiserfs_write_lock_once(inode->i_sb);
                error = journal_begin(&th, inode->i_sb, size * 2);
                if (!error) {
                        int error2;
@@ -482,7 +484,7 @@ int reiserfs_acl_chmod(struct inode *inode)
                        if (error2)
                                error = error2;
                }
-                reiserfs_write_unlock(inode->i_sb);
+                reiserfs_write_unlock_once(inode->i_sb, depth);
        }
        posix_acl_release(clone);
        return error;
diff --git a/fs/splice.c b/fs/splice.c
index 8f1dfaecc8f..ce2f02579e3 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1311,18 +1311,6 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
                               struct pipe_inode_info *opipe,
                               size_t len, unsigned int flags);
-/*
- * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
- * location, so checking ->i_pipe is not enough to verify that this is a
- * pipe.
- */
-static inline struct pipe_inode_info *pipe_info(struct inode *inode)
-{
-        if (S_ISFIFO(inode->i_mode))
-                return inode->i_pipe;
-        return NULL;
-}
 /*
 * Determine where to splice to/from.
@@ -1336,8 +1324,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
        loff_t offset, *off;
        long ret;
-        ipipe = pipe_info(in->f_path.dentry->d_inode);
+        ipipe = get_pipe_info(in);
-        opipe = pipe_info(out->f_path.dentry->d_inode);
+        opipe = get_pipe_info(out);
        if (ipipe && opipe) {
                if (off_in || off_out)
@@ -1555,7 +1543,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
        int error;
        long ret;
-        pipe = pipe_info(file->f_path.dentry->d_inode);
+        pipe = get_pipe_info(file);
        if (!pipe)
                return -EBADF;
@@ -1642,7 +1630,7 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
        };
        long ret;
-        pipe = pipe_info(file->f_path.dentry->d_inode);
+        pipe = get_pipe_info(file);
        if (!pipe)
                return -EBADF;
@@ -2022,8 +2010,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 static long do_tee(struct file *in, struct file *out, size_t len,
                   unsigned int flags)
 {
-        struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode);
+        struct pipe_inode_info *ipipe = get_pipe_info(in);
-        struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode);
+        struct pipe_inode_info *opipe = get_pipe_info(out);
        int ret = -EINVAL;
        /*
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 7d287afccde..691f61223ed 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -934,7 +934,6 @@ xfs_aops_discard_page(
        struct xfs_inode        *ip = XFS_I(inode);
        struct buffer_head      *bh, *head;
        loff_t                  offset = page_offset(page);
-        ssize_t                 len = 1 << inode->i_blkbits;
        if (!xfs_is_delayed_page(page, IO_DELAY))
                goto out_invalidate;
@@ -949,58 +948,14 @@ xfs_aops_discard_page(
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        bh = head = page_buffers(page);
        do {
-                int             done;
-                xfs_fileoff_t   offset_fsb;
-                xfs_bmbt_irec_t imap;
-                int             nimaps = 1;
                int             error;
-                xfs_fsblock_t   firstblock;
+                xfs_fileoff_t   start_fsb;
-                xfs_bmap_free_t flist;
                if (!buffer_delay(bh))
                        goto next_buffer;
-                offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+                start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+                error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
-                /*
-                 * Map the range first and check that it is a delalloc extent
-                 * before trying to unmap the range. Otherwise we will be
-                 * trying to remove a real extent (which requires a
-                 * transaction) or a hole, which is probably a bad idea...
-                 */
-                error = xfs_bmapi(NULL, ip, offset_fsb, 1,
-                                XFS_BMAPI_ENTIRE,  NULL, 0, &imap,
-                                &nimaps, NULL);
-                if (error) {
-                        /* something screwed, just bail */
-                        if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-                                xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
-                                "page discard failed delalloc mapping lookup.");
-                        }
-                        break;
-                }
-                if (!nimaps) {
-                        /* nothing there */
-                        goto next_buffer;
-                }
-                if (imap.br_startblock != DELAYSTARTBLOCK) {
-                        /* been converted, ignore */
-                        goto next_buffer;
-                }
-                WARN_ON(imap.br_blockcount == 0);
-                /*
-                 * Note: while we initialise the firstblock/flist pair, they
-                 * should never be used because blocks should never be
-                 * allocated or freed for a delalloc extent and hence we need
-                 * don't cancel or finish them after the xfs_bunmapi() call.
-                 */
-                xfs_bmap_init(&flist, &firstblock);
-                error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
-                                        &flist, &done);
-                ASSERT(!flist.xbf_count && !flist.xbf_first);
                if (error) {
                        /* something screwed, just bail */
                        if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
@@ -1010,7 +965,7 @@ xfs_aops_discard_page(
                        break;
                }
 next_buffer:
-                offset += len;
+                offset += 1 << inode->i_blkbits;
        } while ((bh = bh->b_this_page) != head);
@@ -1505,11 +1460,42 @@ xfs_vm_write_failed(
        struct inode            *inode = mapping->host;
        if (to > inode->i_size) {
-                struct iattr    ia = {
+                /*
-                        .ia_valid       = ATTR_SIZE | ATTR_FORCE,
+                 * punch out the delalloc blocks we have already allocated. We
-                        .ia_size        = inode->i_size,
+                 * don't call xfs_setattr() to do this as we may be in the
-                };
+                 * middle of a multi-iovec write and so the vfs inode->i_size
-                xfs_setattr(XFS_I(inode), &ia, XFS_ATTR_NOLOCK);
+                 * will not match the xfs ip->i_size and so it will zero too
+                 * much. Hence we just truncate the page cache to zero what is
+                 * necessary and punch the delalloc blocks directly.
+                 */
+                struct xfs_inode        *ip = XFS_I(inode);
+                xfs_fileoff_t           start_fsb;
+                xfs_fileoff_t           end_fsb;
+                int                     error;
+                truncate_pagecache(inode, to, inode->i_size);
+                /*
+                 * Check if there are any blocks that are outside of i_size
+                 * that need to be trimmed back.
+                 */
+                start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1;
+                end_fsb = XFS_B_TO_FSB(ip->i_mount, to);
+                if (end_fsb <= start_fsb)
+                        return;
+                xfs_ilock(ip, XFS_ILOCK_EXCL);
+                error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+                                                        end_fsb - start_fsb);
+                if (error) {
+                        /* something screwed, just bail */
+                        if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+                                xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+                        "xfs_vm_write_failed: unable to clean up ino %lld",
+                                                ip->i_ino);
+                        }
+                }
+                xfs_iunlock(ip, XFS_ILOCK_EXCL);
        }
 }
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index aa1d353def2..4c5deb6e9e3 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -488,29 +488,16 @@ found:
        spin_unlock(&pag->pag_buf_lock);
        xfs_perag_put(pag);
-        /* Attempt to get the semaphore without sleeping,
+        if (xfs_buf_cond_lock(bp)) {
-         * if this does not work then we need to drop the
+                /* failed, so wait for the lock if requested. */
-         * spinlock and do a hard attempt on the semaphore.
-         */
-        if (down_trylock(&bp->b_sema)) {
                if (!(flags & XBF_TRYLOCK)) {
-                        /* wait for buffer ownership */
                        xfs_buf_lock(bp);
                        XFS_STATS_INC(xb_get_locked_waited);
                } else {
-                        /* We asked for a trylock and failed, no need
-                         * to look at file offset and length here, we
-                         * know that this buffer at least overlaps our
-                         * buffer and is locked, therefore our buffer
-                         * either does not exist, or is this buffer.
-                         */
                        xfs_buf_rele(bp);
                        XFS_STATS_INC(xb_busy_locked);
                        return NULL;
                }
-        } else {
-                /* trylock worked */
-                XB_SET_OWNER(bp);
        }
        if (bp->b_flags & XBF_STALE) {
@@ -876,10 +863,18 @@ xfs_buf_rele(
 */
 /*
- *      Locks a buffer object, if it is not already locked.
+ *      Locks a buffer object, if it is not already locked.  Note that this in
- *      Note that this in no way locks the underlying pages, so it is only
+ *      no way locks the underlying pages, so it is only useful for
- *      useful for synchronizing concurrent use of buffer objects, not for
+ *      synchronizing concurrent use of buffer objects, not for synchronizing
- *      synchronizing independent access to the underlying pages.
+ *      independent access to the underlying pages.
+ *
+ *      If we come across a stale, pinned, locked buffer, we know that we are
+ *      being asked to lock a buffer that has been reallocated. Because it is
+ *      pinned, we know that the log has not been pushed to disk and hence it
+ *      will still be locked.  Rather than continuing to have trylock attempts
+ *      fail until someone else pushes the log, push it ourselves before
+ *      returning.  This means that the xfsaild will not get stuck trying
+ *      to push on stale inode buffers.
 */
 int
 xfs_buf_cond_lock(
@@ -890,6 +885,8 @@ xfs_buf_cond_lock(
        locked = down_trylock(&bp->b_sema) == 0;
        if (locked)
                XB_SET_OWNER(bp);
+        else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
+                xfs_log_force(bp->b_target->bt_mount, 0);
        trace_xfs_buf_cond_lock(bp, _RET_IP_);
        return locked ? 0 : -EBUSY;
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 8abd12e32e1..4111cd3966c 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5471,8 +5471,13 @@ xfs_getbmap(
                        if (error)
                                goto out_unlock_iolock;
                }
+                /*
-                ASSERT(ip->i_delayed_blks == 0);
+                 * even after flushing the inode, there can still be delalloc
+                 * blocks on the inode beyond EOF due to speculative
+                 * preallocation. These are not removed until the release
+                 * function is called or the inode is inactivated. Hence we
+                 * cannot assert here that ip->i_delayed_blks == 0.
+                 */
        }
        lock = xfs_ilock_map_shared(ip);
@@ -6070,3 +6075,79 @@ xfs_bmap_disk_count_leaves(
                *count += xfs_bmbt_disk_get_blockcount(frp);
        }
 }
+/*
+ * dead simple method of punching delayed allocation blocks from a range in
+ * the inode. Walks a block at a time so will be slow, but is only executed in
+ * rare error cases so the overhead is not critical. This will always punch out
+ * both the start and end blocks, even if the ranges only partially overlap
+ * them, so it is up to the caller to ensure that partial blocks are not
+ * passed in.
+ */
+int
+xfs_bmap_punch_delalloc_range(
+        struct xfs_inode        *ip,
+        xfs_fileoff_t           start_fsb,
+        xfs_fileoff_t           length)
+{
+        xfs_fileoff_t           remaining = length;
+        int                     error = 0;
+        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+        do {
+                int             done;
+                xfs_bmbt_irec_t imap;
+                int             nimaps = 1;
+                xfs_fsblock_t   firstblock;
+                xfs_bmap_free_t flist;
+                /*
+                 * Map the range first and check that it is a delalloc extent
+                 * before trying to unmap the range. Otherwise we will be
+                 * trying to remove a real extent (which requires a
+                 * transaction) or a hole, which is probably a bad idea...
+                 */
+                error = xfs_bmapi(NULL, ip, start_fsb, 1,
+                                XFS_BMAPI_ENTIRE,  NULL, 0, &imap,
+                                &nimaps, NULL);
+                if (error) {
+                        /* something screwed, just bail */
+                        if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+                                xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+                        "Failed delalloc mapping lookup ino %lld fsb %lld.",
+                                                ip->i_ino, start_fsb);
+                        }
+                        break;
+                }
+                if (!nimaps) {
+                        /* nothing there */
+                        goto next_block;
+                }
+                if (imap.br_startblock != DELAYSTARTBLOCK) {
+                        /* been converted, ignore */
+                        goto next_block;
+                }
+                WARN_ON(imap.br_blockcount == 0);
+                /*
+                 * Note: while we initialise the firstblock/flist pair, they
+                 * should never be used because blocks should never be
+                 * allocated or freed for a delalloc extent and hence we don't
+                 * need to cancel or finish them after the xfs_bunmapi() call.
+                 */
+                xfs_bmap_init(&flist, &firstblock);
+                error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
+                                        &flist, &done);
+                if (error)
+                        break;
+                ASSERT(!flist.xbf_count && !flist.xbf_first);
+next_block:
+                start_fsb++;
+                remaining--;
+        } while(remaining > 0);
+        return error;
+}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 71ec9b6ecdf..3651191daea 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -394,6 +394,11 @@ xfs_bmap_count_blocks(
        int                     whichfork,
        int                     *count);
+int
+xfs_bmap_punch_delalloc_range(
+        struct xfs_inode        *ip,
+        xfs_fileoff_t           start_fsb,
+        xfs_fileoff_t           length);
 #endif  /* __KERNEL__ */
 #endif  /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 3b9582c60a2..e60490bc00a 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -377,6 +377,19 @@ xfs_swap_extents(
        ip->i_d.di_format = tip->i_d.di_format;
        tip->i_d.di_format = tmp;
+        /*
+         * The extents in the source inode could still contain speculative
+         * preallocation beyond EOF (e.g. the file is open but not modified
+         * while defrag is in progress). In that case, we need to copy over the
+         * number of delalloc blocks the data fork in the source inode is
+         * tracking beyond EOF so that when the fork is truncated away when the
+         * temporary inode is unlinked we don't underrun the i_delayed_blks
+         * counter on that inode.
+         */
+        ASSERT(tip->i_delayed_blks == 0);
+        tip->i_delayed_blks = ip->i_delayed_blks;
+        ip->i_delayed_blks = 0;
        ilf_fields = XFS_ILOG_CORE;
        switch(ip->i_d.di_format) {
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index ed999026766..c78cc6a3d87 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -58,6 +58,7 @@ xfs_error_trap(int e)
 int     xfs_etest[XFS_NUM_INJECT_ERROR];
 int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
 char *  xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
+int     xfs_error_test_active;
 int
 xfs_error_test(int error_tag, int *fsidp, char *expression,
@@ -108,6 +109,7 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
                        len = strlen(mp->m_fsname);
                        xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP);
                        strcpy(xfs_etest_fsname[i], mp->m_fsname);
+                        xfs_error_test_active++;
                        return 0;
                }
        }
@@ -137,6 +139,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
                        xfs_etest_fsid[i] = 0LL;
                        kmem_free(xfs_etest_fsname[i]);
                        xfs_etest_fsname[i] = NULL;
+                        xfs_error_test_active--;
                }
        }
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index c2c1a072bb8..f338847f80b 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -127,13 +127,14 @@ extern void xfs_corruption_error(const char *tag, int level,
 #define XFS_RANDOM_BMAPIFORMAT                          XFS_RANDOM_DEFAULT
 #ifdef DEBUG
+extern int xfs_error_test_active;
 extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
 #define XFS_NUM_INJECT_ERROR                            10
 #define XFS_TEST_ERROR(expr, mp, tag, rf)               \
-        ((expr) || \
+        ((expr) || (xfs_error_test_active && \
         xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
-                        (rf)))
+                        (rf))))
 extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp);
 extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index c7ac020705d..7c8d30c453c 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -657,18 +657,37 @@ xfs_inode_item_unlock(
 }
 /*
- * This is called to find out where the oldest active copy of the
+ * This is called to find out where the oldest active copy of the inode log
- * inode log item in the on disk log resides now that the last log
+ * item in the on disk log resides now that the last log write of it completed
- * write of it completed at the given lsn.  Since we always re-log
+ * at the given lsn.  Since we always re-log all dirty data in an inode, the
- * all dirty data in an inode, the latest copy in the on disk log
+ * latest copy in the on disk log is the only one that matters.  Therefore,
- * is the only one that matters.  Therefore, simply return the
+ * simply return the given lsn.
- * given lsn.
+ *
+ * If the inode has been marked stale because the cluster is being freed, we
+ * don't want to (re-)insert this inode into the AIL. There is a race condition
+ * where the cluster buffer may be unpinned before the inode is inserted into
+ * the AIL during transaction committed processing. If the buffer is unpinned
+ * before the inode item has been committed and inserted, then it is possible
+ * for the buffer to be written and IO to complete before the inode is inserted
+ * into the AIL. In that case, we'd be inserting a clean, stale inode into the
+ * AIL which will never get removed. It will, however, get reclaimed which
+ * triggers an assert in xfs_inode_free() complaining about freeing an inode
+ * still in the AIL.
+ *
+ * To avoid this, return a lower LSN than the one passed in so that the
+ * transaction committed code will not move the inode forward in the AIL but
+ * will still unpin it properly.
 */
 STATIC xfs_lsn_t
 xfs_inode_item_committed(
        struct xfs_log_item     *lip,
        xfs_lsn_t               lsn)
 {
+        struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+        struct xfs_inode        *ip = iip->ili_inode;
+        if (xfs_iflags_test(ip, XFS_ISTALE))
+                return lsn - 1;
        return lsn;
 }
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index d2af0a8381a..77a59891734 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -297,6 +297,7 @@ xfs_rename(
         * it and some incremental backup programs won't work without it.
         */
        xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
+        xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
        /*
         * Adjust the link count on src_dp.  This is necessary when
author	David S. Miller <davem@davemloft.net>	2010-12-27 01:37:05 -0500
committer	David S. Miller <davem@davemloft.net>	2010-12-27 01:37:05 -0500
commit	17f7f4d9fcce8f1b75b5f735569309dee7665968 (patch)
tree	14d7e49ca0053a0fcab3c33b5023bf3f90c5c08a /fs
parent	041110a439e21cd40709ead4ffbfa8034619ad77 (diff)
parent	d7c1255a3a21e98bdc64df8ccf005a174d7e6289 (diff)