33 files changed, 790 insertions, 302 deletions
diff --git a/fs/aio.c b/fs/aio.c
index 4f078c054b41..955947ef3e02 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1021,6 +1021,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
        /* everything turned out well, dispose of the aiocb. */
        kiocb_free(iocb);
+        put_reqs_available(ctx, 1);
        /*
         * We have to order our ring_info tail store above and test
@@ -1062,6 +1063,9 @@ static long aio_read_events_ring(struct kioctx *ctx,
        if (head == tail)
                goto out;
+        head %= ctx->nr_events;
+        tail %= ctx->nr_events;
        while (ret < nr) {
                long avail;
                struct io_event *ev;
@@ -1100,8 +1104,6 @@ static long aio_read_events_ring(struct kioctx *ctx,
        flush_dcache_page(ctx->ring_pages[0]);
        pr_debug("%li  h%u t%u\n", ret, head, tail);
-        put_reqs_available(ctx, ret);
 out:
        mutex_unlock(&ctx->ring_lock);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b7e2c1c1ef36..be91397f4e92 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1259,11 +1259,19 @@ struct btrfs_block_group_cache {
        spinlock_t lock;
        u64 pinned;
        u64 reserved;
+        u64 delalloc_bytes;
        u64 bytes_super;
        u64 flags;
        u64 sectorsize;
        u64 cache_generation;
+        /*
+         * It is used only for the delayed data space allocation, because
+         * only the data space allocation and the related metadata update
+         * can be done across transactions.
+         */
+        struct rw_semaphore data_rwsem;
        /* for raid56, this is a full stripe, without parity */
        unsigned long full_stripe_len;
@@ -3316,7 +3324,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
                                   struct btrfs_key *ins);
 int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
                         u64 min_alloc_size, u64 empty_size, u64 hint_byte,
-                         struct btrfs_key *ins, int is_data);
+                         struct btrfs_key *ins, int is_data, int delalloc);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                  struct extent_buffer *buf, int full_backref, int no_quota);
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -3330,7 +3338,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
                      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
                      u64 owner, u64 offset, int no_quota);
-int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len,
+                               int delalloc);
 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
                                       u64 start, u64 len);
 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fafb3e53ecde..99c253918208 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -105,7 +105,8 @@ static int find_next_key(struct btrfs_path *path, int level,
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
                            int dump_block_groups);
 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
-                                       u64 num_bytes, int reserve);
+                                       u64 num_bytes, int reserve,
+                                       int delalloc);
 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
                               u64 num_bytes);
 int btrfs_pin_extent(struct btrfs_root *root,
@@ -3260,7 +3261,8 @@ again:
        spin_lock(&block_group->lock);
        if (block_group->cached != BTRFS_CACHE_FINISHED ||
-            !btrfs_test_opt(root, SPACE_CACHE)) {
+            !btrfs_test_opt(root, SPACE_CACHE) ||
+            block_group->delalloc_bytes) {
                /*
                 * don't bother trying to write stuff out _if_
                 * a) we're not cached,
@@ -5613,6 +5615,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
 * @cache:      The cache we are manipulating
 * @num_bytes:  The number of bytes in question
 * @reserve:    One of the reservation enums
+ * @delalloc:   Non-zero if the blocks are allocated for a delalloc write
 *
 * This is called by the allocator when it reserves space, or by somebody who is
 * freeing space that was never actually used on disk.  For example if you
@@ -5631,7 +5634,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
 * succeeds.
 */
 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
-                                       u64 num_bytes, int reserve)
+                                       u64 num_bytes, int reserve, int delalloc)
 {
        struct btrfs_space_info *space_info = cache->space_info;
        int ret = 0;
@@ -5650,12 +5653,18 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
                                                num_bytes, 0);
                                space_info->bytes_may_use -= num_bytes;
                        }
+                        if (delalloc)
+                                cache->delalloc_bytes += num_bytes;
                }
        } else {
                if (cache->ro)
                        space_info->bytes_readonly += num_bytes;
                cache->reserved -= num_bytes;
                space_info->bytes_reserved -= num_bytes;
+                if (delalloc)
+                        cache->delalloc_bytes -= num_bytes;
        }
        spin_unlock(&cache->lock);
        spin_unlock(&space_info->lock);
@@ -6206,7 +6215,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
                btrfs_add_free_space(cache, buf->start, buf->len);
-                btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
+                btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
                trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
                pin = 0;
        }
@@ -6365,6 +6374,70 @@ enum btrfs_loop_type {
        LOOP_NO_EMPTY_SIZE = 3,
 };
+static inline void
+btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
+                       int delalloc)
+{
+        if (delalloc)
+                down_read(&cache->data_rwsem);
+}
+static inline void
+btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
+                       int delalloc)
+{
+        btrfs_get_block_group(cache);
+        if (delalloc)
+                down_read(&cache->data_rwsem);
+}
+static struct btrfs_block_group_cache *
+btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
+                   struct btrfs_free_cluster *cluster,
+                   int delalloc)
+{
+        struct btrfs_block_group_cache *used_bg;
+        bool locked = false;
+again:
+        spin_lock(&cluster->refill_lock);
+        if (locked) {
+                if (used_bg == cluster->block_group)
+                        return used_bg;
+                up_read(&used_bg->data_rwsem);
+                btrfs_put_block_group(used_bg);
+        }
+        used_bg = cluster->block_group;
+        if (!used_bg)
+                return NULL;
+        if (used_bg == block_group)
+                return used_bg;
+        btrfs_get_block_group(used_bg);
+        if (!delalloc)
+                return used_bg;
+        if (down_read_trylock(&used_bg->data_rwsem))
+                return used_bg;
+        spin_unlock(&cluster->refill_lock);
+        down_read(&used_bg->data_rwsem);
+        locked = true;
+        goto again;
+}
+static inline void
+btrfs_release_block_group(struct btrfs_block_group_cache *cache,
+                         int delalloc)
+{
+        if (delalloc)
+                up_read(&cache->data_rwsem);
+        btrfs_put_block_group(cache);
+}
 /*
 * walks the btree of allocated extents and find a hole of a given size.
 * The key ins is changed to record the hole:
@@ -6379,7 +6452,7 @@ enum btrfs_loop_type {
 static noinline int find_free_extent(struct btrfs_root *orig_root,
                                     u64 num_bytes, u64 empty_size,
                                     u64 hint_byte, struct btrfs_key *ins,
-                                     u64 flags)
+                                     u64 flags, int delalloc)
 {
        int ret = 0;
        struct btrfs_root *root = orig_root->fs_info->extent_root;
@@ -6467,6 +6540,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
                                up_read(&space_info->groups_sem);
                        } else {
                                index = get_block_group_index(block_group);
+                                btrfs_lock_block_group(block_group, delalloc);
                                goto have_block_group;
                        }
                } else if (block_group) {
@@ -6481,7 +6555,7 @@ search:
                u64 offset;
                int cached;
-                btrfs_get_block_group(block_group);
+                btrfs_grab_block_group(block_group, delalloc);
                search_start = block_group->key.objectid;
                /*
@@ -6529,16 +6603,16 @@ have_block_group:
                         * the refill lock keeps out other
                         * people trying to start a new cluster
                         */
-                        spin_lock(&last_ptr->refill_lock);
+                        used_block_group = btrfs_lock_cluster(block_group,
-                        used_block_group = last_ptr->block_group;
+                                                              last_ptr,
-                        if (used_block_group != block_group &&
+                                                              delalloc);
-                            (!used_block_group ||
+                        if (!used_block_group)
-                             used_block_group->ro ||
-                             !block_group_bits(used_block_group, flags)))
                                goto refill_cluster;
-                        if (used_block_group != block_group)
+                        if (used_block_group != block_group &&
-                                btrfs_get_block_group(used_block_group);
+                            (used_block_group->ro ||
+                             !block_group_bits(used_block_group, flags)))
+                                goto release_cluster;
                        offset = btrfs_alloc_from_cluster(used_block_group,
                                                last_ptr,
@@ -6552,16 +6626,15 @@ have_block_group:
                                                used_block_group,
                                                search_start, num_bytes);
                                if (used_block_group != block_group) {
-                                        btrfs_put_block_group(block_group);
+                                        btrfs_release_block_group(block_group,
+                                                                  delalloc);
                                        block_group = used_block_group;
                                }
                                goto checks;
                        }
                        WARN_ON(last_ptr->block_group != used_block_group);
-                        if (used_block_group != block_group)
+release_cluster:
-                                btrfs_put_block_group(used_block_group);
-refill_cluster:
                        /* If we are on LOOP_NO_EMPTY_SIZE, we can't
                         * set up a new clusters, so lets just skip it
                         * and let the allocator find whatever block
@@ -6578,8 +6651,10 @@ refill_cluster:
                         * succeeding in the unclustered
                         * allocation.  */
                        if (loop >= LOOP_NO_EMPTY_SIZE &&
-                            last_ptr->block_group != block_group) {
+                            used_block_group != block_group) {
                                spin_unlock(&last_ptr->refill_lock);
+                                btrfs_release_block_group(used_block_group,
+                                                          delalloc);
                                goto unclustered_alloc;
                        }
@@ -6589,6 +6664,10 @@ refill_cluster:
                         */
                        btrfs_return_cluster_to_free_space(NULL, last_ptr);
+                        if (used_block_group != block_group)
+                                btrfs_release_block_group(used_block_group,
+                                                          delalloc);
+refill_cluster:
                        if (loop >= LOOP_NO_EMPTY_SIZE) {
                                spin_unlock(&last_ptr->refill_lock);
                                goto unclustered_alloc;
@@ -6696,7 +6775,7 @@ checks:
                BUG_ON(offset > search_start);
                ret = btrfs_update_reserved_bytes(block_group, num_bytes,
-                                                  alloc_type);
+                                                  alloc_type, delalloc);
                if (ret == -EAGAIN) {
                        btrfs_add_free_space(block_group, offset, num_bytes);
                        goto loop;
@@ -6708,13 +6787,13 @@ checks:
                trace_btrfs_reserve_extent(orig_root, block_group,
                                           search_start, num_bytes);
-                btrfs_put_block_group(block_group);
+                btrfs_release_block_group(block_group, delalloc);
                break;
 loop:
                failed_cluster_refill = false;
                failed_alloc = false;
                BUG_ON(index != get_block_group_index(block_group));
-                btrfs_put_block_group(block_group);
+                btrfs_release_block_group(block_group, delalloc);
        }
        up_read(&space_info->groups_sem);
@@ -6827,7 +6906,7 @@ again:
 int btrfs_reserve_extent(struct btrfs_root *root,
                         u64 num_bytes, u64 min_alloc_size,
                         u64 empty_size, u64 hint_byte,
-                         struct btrfs_key *ins, int is_data)
+                         struct btrfs_key *ins, int is_data, int delalloc)
 {
        bool final_tried = false;
        u64 flags;
@@ -6837,7 +6916,7 @@ int btrfs_reserve_extent(struct btrfs_root *root,
 again:
        WARN_ON(num_bytes < root->sectorsize);
        ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
-                               flags);
+                               flags, delalloc);
        if (ret == -ENOSPC) {
                if (!final_tried && ins->offset) {
@@ -6862,7 +6941,8 @@ again:
 }
 static int __btrfs_free_reserved_extent(struct btrfs_root *root,
-                                        u64 start, u64 len, int pin)
+                                        u64 start, u64 len,
+                                        int pin, int delalloc)
 {
        struct btrfs_block_group_cache *cache;
        int ret = 0;
@@ -6881,7 +6961,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
                pin_down_extent(root, cache, start, len, 1);
        else {
                btrfs_add_free_space(cache, start, len);
-                btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
+                btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
        }
        btrfs_put_block_group(cache);
@@ -6891,15 +6971,15 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
 }
 int btrfs_free_reserved_extent(struct btrfs_root *root,
-                                        u64 start, u64 len)
+                               u64 start, u64 len, int delalloc)
 {
-        return __btrfs_free_reserved_extent(root, start, len, 0);
+        return __btrfs_free_reserved_extent(root, start, len, 0, delalloc);
 }
 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
                                       u64 start, u64 len)
 {
-        return __btrfs_free_reserved_extent(root, start, len, 1);
+        return __btrfs_free_reserved_extent(root, start, len, 1, 0);
 }
 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
@@ -7114,7 +7194,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
                return -EINVAL;
        ret = btrfs_update_reserved_bytes(block_group, ins->offset,
-                                          RESERVE_ALLOC_NO_ACCOUNT);
+                                          RESERVE_ALLOC_NO_ACCOUNT, 0);
        BUG_ON(ret); /* logic error */
        ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
                                         0, owner, offset, ins, 1);
@@ -7256,7 +7336,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                return ERR_CAST(block_rsv);
        ret = btrfs_reserve_extent(root, blocksize, blocksize,
-                                   empty_size, hint, &ins, 0);
+                                   empty_size, hint, &ins, 0, 0);
        if (ret) {
                unuse_block_rsv(root->fs_info, block_rsv, blocksize);
                return ERR_PTR(ret);
@@ -8659,6 +8739,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
                                               start);
        atomic_set(&cache->count, 1);
        spin_lock_init(&cache->lock);
+        init_rwsem(&cache->data_rwsem);
        INIT_LIST_HEAD(&cache->list);
        INIT_LIST_HEAD(&cache->cluster_list);
        INIT_LIST_HEAD(&cache->new_bg_list);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 15ce5f2a2b62..ccc264e7bde1 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -158,7 +158,6 @@ struct extent_buffer {
         * to unlock
         */
        wait_queue_head_t read_lock_wq;
-        wait_queue_head_t lock_wq;
        struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
 #ifdef CONFIG_BTRFS_DEBUG
        struct list_head leak_list;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 1874aee69c86..225302b39afb 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -75,6 +75,8 @@ void free_extent_map(struct extent_map *em)
        if (atomic_dec_and_test(&em->refs)) {
                WARN_ON(extent_map_in_tree(em));
                WARN_ON(!list_empty(&em->list));
+                if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
+                        kfree(em->bdev);
                kmem_cache_free(extent_map_cache, em);
        }
 }
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index e7fd8a56a140..b2991fd8583e 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -15,6 +15,7 @@
 #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
 #define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
 #define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
+#define EXTENT_FLAG_FS_MAPPING 6 /* filesystem extent mapping type */
 struct extent_map {
        struct rb_node rb_node;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 372b05ff1943..2b0a627cb5f9 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -274,18 +274,32 @@ struct io_ctl {
 };
 static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
-                       struct btrfs_root *root)
+                       struct btrfs_root *root, int write)
 {
+        int num_pages;
+        int check_crcs = 0;
+        num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+                    PAGE_CACHE_SHIFT;
+        if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
+                check_crcs = 1;
+        /* Make sure we can fit our crcs into the first page */
+        if (write && check_crcs &&
+            (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
+                return -ENOSPC;
        memset(io_ctl, 0, sizeof(struct io_ctl));
-        io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
-                PAGE_CACHE_SHIFT;
+        io_ctl->pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
-        io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages,
-                                GFP_NOFS);
        if (!io_ctl->pages)
                return -ENOMEM;
+        io_ctl->num_pages = num_pages;
        io_ctl->root = root;
-        if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
+        io_ctl->check_crcs = check_crcs;
-                io_ctl->check_crcs = 1;
        return 0;
 }
@@ -666,6 +680,13 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
        generation = btrfs_free_space_generation(leaf, header);
        btrfs_release_path(path);
+        if (!BTRFS_I(inode)->generation) {
+                btrfs_info(root->fs_info,
+                           "The free space cache file (%llu) is invalid. skip it\n",
+                           offset);
+                return 0;
+        }
        if (BTRFS_I(inode)->generation != generation) {
                btrfs_err(root->fs_info,
                        "free space inode generation (%llu) "
@@ -677,7 +698,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
        if (!num_entries)
                return 0;
-        ret = io_ctl_init(&io_ctl, inode, root);
+        ret = io_ctl_init(&io_ctl, inode, root, 0);
        if (ret)
                return ret;
@@ -957,19 +978,18 @@ fail:
 }
 static noinline_for_stack int
-add_ioctl_entries(struct btrfs_root *root,
+write_pinned_extent_entries(struct btrfs_root *root,
-                  struct inode *inode,
+                            struct btrfs_block_group_cache *block_group,
-                  struct btrfs_block_group_cache *block_group,
+                            struct io_ctl *io_ctl,
-                  struct io_ctl *io_ctl,
+                            int *entries)
-                  struct extent_state **cached_state,
-                  struct list_head *bitmap_list,
-                  int *entries)
 {
        u64 start, extent_start, extent_end, len;
-        struct list_head *pos, *n;
        struct extent_io_tree *unpin = NULL;
        int ret;
+        if (!block_group)
+                return 0;
        /*
         * We want to add any pinned extents to our free space cache
         * so we don't leak the space
@@ -979,23 +999,19 @@ add_ioctl_entries(struct btrfs_root *root,
         */
        unpin = root->fs_info->pinned_extents;
-        if (block_group)
+        start = block_group->key.objectid;
-                start = block_group->key.objectid;
-        while (block_group && (start < block_group->key.objectid +
+        while (start < block_group->key.objectid + block_group->key.offset) {
-                               block_group->key.offset)) {
                ret = find_first_extent_bit(unpin, start,
                                            &extent_start, &extent_end,
                                            EXTENT_DIRTY, NULL);
-                if (ret) {
+                if (ret)
-                        ret = 0;
+                        return 0;
-                        break;
-                }
                /* This pinned extent is out of our range */
                if (extent_start >= block_group->key.objectid +
                    block_group->key.offset)
-                        break;
+                        return 0;
                extent_start = max(extent_start, start);
                extent_end = min(block_group->key.objectid +
@@ -1005,11 +1021,20 @@ add_ioctl_entries(struct btrfs_root *root,
                *entries += 1;
                ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
                if (ret)
-                        goto out_nospc;
+                        return -ENOSPC;
                start = extent_end;
        }
+        return 0;
+}
+static noinline_for_stack int
+write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list)
+{
+        struct list_head *pos, *n;
+        int ret;
        /* Write out the bitmaps */
        list_for_each_safe(pos, n, bitmap_list) {
                struct btrfs_free_space *entry =
@@ -1017,36 +1042,24 @@ add_ioctl_entries(struct btrfs_root *root,
                ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
                if (ret)
-                        goto out_nospc;
+                        return -ENOSPC;
                list_del_init(&entry->list);
        }
-        /* Zero out the rest of the pages just to make sure */
+        return 0;
-        io_ctl_zero_remaining_pages(io_ctl);
+}
-        ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages,
-                                0, i_size_read(inode), cached_state);
-        io_ctl_drop_pages(io_ctl);
-        unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
-                             i_size_read(inode) - 1, cached_state, GFP_NOFS);
-        if (ret)
+static int flush_dirty_cache(struct inode *inode)
-                goto fail;
+{
+        int ret;
        ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
-        if (ret) {
+        if (ret)
                clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
                                 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
                                 GFP_NOFS);
-                goto fail;
-        }
-        return 0;
-fail:
+        return ret;
-        return -1;
-out_nospc:
-        return -ENOSPC;
 }
 static void noinline_for_stack
@@ -1056,6 +1069,7 @@ cleanup_write_cache_enospc(struct inode *inode,
                           struct list_head *bitmap_list)
 {
        struct list_head *pos, *n;
        list_for_each_safe(pos, n, bitmap_list) {
                struct btrfs_free_space *entry =
                        list_entry(pos, struct btrfs_free_space, list);
@@ -1088,64 +1102,104 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 {
        struct extent_state *cached_state = NULL;
        struct io_ctl io_ctl;
-        struct list_head bitmap_list;
+        LIST_HEAD(bitmap_list);
        int entries = 0;
        int bitmaps = 0;
        int ret;
-        int err = -1;
-        INIT_LIST_HEAD(&bitmap_list);
        if (!i_size_read(inode))
                return -1;
-        ret = io_ctl_init(&io_ctl, inode, root);
+        ret = io_ctl_init(&io_ctl, inode, root, 1);
        if (ret)
                return -1;
+        if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
+                down_write(&block_group->data_rwsem);
+                spin_lock(&block_group->lock);
+                if (block_group->delalloc_bytes) {
+                        block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+                        spin_unlock(&block_group->lock);
+                        up_write(&block_group->data_rwsem);
+                        BTRFS_I(inode)->generation = 0;
+                        ret = 0;
+                        goto out;
+                }
+                spin_unlock(&block_group->lock);
+        }
        /* Lock all pages first so we can lock the extent safely. */
        io_ctl_prepare_pages(&io_ctl, inode, 0);
        lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
                         0, &cached_state);
-        /* Make sure we can fit our crcs into the first page */
-        if (io_ctl.check_crcs &&
-            (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
-                goto out_nospc;
        io_ctl_set_generation(&io_ctl, trans->transid);
+        /* Write out the extent entries in the free space cache */
        ret = write_cache_extent_entries(&io_ctl, ctl,
                                         block_group, &entries, &bitmaps,
                                         &bitmap_list);
        if (ret)
                goto out_nospc;
-        ret = add_ioctl_entries(root, inode, block_group, &io_ctl,
+        /*
-                                &cached_state, &bitmap_list, &entries);
+         * Some space that is freed in the current transaction is pinned;
+         * it will be added to the free space cache after the transaction is
+         * committed, so we shouldn't lose it.
+         */
+        ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries);
+        if (ret)
+                goto out_nospc;
-        if (ret == -ENOSPC)
+        /* Finally, we write out all the bitmaps. */
+        ret = write_bitmap_entries(&io_ctl, &bitmap_list);
+        if (ret)
                goto out_nospc;
-        else if (ret)
+        /* Zero out the rest of the pages just to make sure */
+        io_ctl_zero_remaining_pages(&io_ctl);
+        /* Everything is written out, now we dirty the pages in the file. */
+        ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
+                                0, i_size_read(inode), &cached_state);
+        if (ret)
+                goto out_nospc;
+        if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+                up_write(&block_group->data_rwsem);
+        /*
+         * Release the pages and unlock the extent; we will flush
+         * them out later.
+         */
+        io_ctl_drop_pages(&io_ctl);
+        unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
+                             i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+        /* Flush the dirty pages in the cache file. */
+        ret = flush_dirty_cache(inode);
+        if (ret)
                goto out;
-        err = update_cache_item(trans, root, inode, path, offset,
+        /* Update the cache item to tell everyone this cache file is valid. */
+        ret = update_cache_item(trans, root, inode, path, offset,
                                entries, bitmaps);
 out:
        io_ctl_free(&io_ctl);
-        if (err) {
+        if (ret) {
                invalidate_inode_pages2(inode->i_mapping);
                BTRFS_I(inode)->generation = 0;
        }
        btrfs_update_inode(trans, root, inode);
-        return err;
+        return ret;
 out_nospc:
        cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list);
+        if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+                up_write(&block_group->data_rwsem);
        goto out;
 }
@@ -1165,6 +1219,12 @@ int btrfs_write_out_cache(struct btrfs_root *root,
                spin_unlock(&block_group->lock);
                return 0;
        }
+        if (block_group->delalloc_bytes) {
+                block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+                spin_unlock(&block_group->lock);
+                return 0;
+        }
        spin_unlock(&block_group->lock);
        inode = lookup_free_space_inode(root, block_group, path);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8925f66a1411..3668048e16f8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -693,7 +693,7 @@ retry:
                ret = btrfs_reserve_extent(root,
                                           async_extent->compressed_size,
                                           async_extent->compressed_size,
-                                           0, alloc_hint, &ins, 1);
+                                           0, alloc_hint, &ins, 1, 1);
                if (ret) {
                        int i;
@@ -794,7 +794,7 @@ retry:
 out:
        return ret;
 out_free_reserve:
-        btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+        btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
 out_free:
        extent_clear_unlock_delalloc(inode, async_extent->start,
                                     async_extent->start +
@@ -917,7 +917,7 @@ static noinline int cow_file_range(struct inode *inode,
                cur_alloc_size = disk_num_bytes;
                ret = btrfs_reserve_extent(root, cur_alloc_size,
                                           root->sectorsize, 0, alloc_hint,
-                                           &ins, 1);
+                                           &ins, 1, 1);
                if (ret < 0)
                        goto out_unlock;
@@ -995,7 +995,7 @@ out:
        return ret;
 out_reserve:
-        btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+        btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
 out_unlock:
        extent_clear_unlock_delalloc(inode, start, end, locked_page,
                                     EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
@@ -2599,6 +2599,21 @@ out_kfree:
        return NULL;
 }
+static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
+                                         u64 start, u64 len)
+{
+        struct btrfs_block_group_cache *cache;
+        cache = btrfs_lookup_block_group(root->fs_info, start);
+        ASSERT(cache);
+        spin_lock(&cache->lock);
+        cache->delalloc_bytes -= len;
+        spin_unlock(&cache->lock);
+        btrfs_put_block_group(cache);
+}
 /* as ordered data IO finishes, this gets called so we can finish
 * an ordered extent if the range of bytes in the file it covers are
 * fully written.
@@ -2698,6 +2713,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
                                                logical_len, logical_len,
                                                compress_type, 0, 0,
                                                BTRFS_FILE_EXTENT_REG);
+                if (!ret)
+                        btrfs_release_delalloc_bytes(root,
+                                                     ordered_extent->start,
+                                                     ordered_extent->disk_len);
        }
        unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
                           ordered_extent->file_offset, ordered_extent->len,
@@ -2750,7 +2769,7 @@ out:
                    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
                    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
                        btrfs_free_reserved_extent(root, ordered_extent->start,
-                                                   ordered_extent->disk_len);
+                                                   ordered_extent->disk_len, 1);
        }
@@ -6535,21 +6554,21 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
        alloc_hint = get_extent_allocation_hint(inode, start, len);
        ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
-                                   alloc_hint, &ins, 1);
+                                   alloc_hint, &ins, 1, 1);
        if (ret)
                return ERR_PTR(ret);
        em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
                              ins.offset, ins.offset, ins.offset, 0);
        if (IS_ERR(em)) {
-                btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+                btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
                return em;
        }
        ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
                                           ins.offset, ins.offset, 0);
        if (ret) {
-                btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+                btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
                free_extent_map(em);
                return ERR_PTR(ret);
        }
@@ -7437,7 +7456,7 @@ free_ordered:
                if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
                    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
                        btrfs_free_reserved_extent(root, ordered->start,
-                                                   ordered->disk_len);
+                                                   ordered->disk_len, 1);
                btrfs_put_ordered_extent(ordered);
                btrfs_put_ordered_extent(ordered);
        }
@@ -8808,7 +8827,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
                cur_bytes = max(cur_bytes, min_size);
                ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
-                                           *alloc_hint, &ins, 1);
+                                           *alloc_hint, &ins, 1, 0);
                if (ret) {
                        if (own_trans)
                                btrfs_end_transaction(trans, root);
@@ -8822,7 +8841,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                                                  BTRFS_FILE_EXTENT_PREALLOC);
                if (ret) {
                        btrfs_free_reserved_extent(root, ins.objectid,
-                                                   ins.offset);
+                                                   ins.offset, 0);
                        btrfs_abort_transaction(trans, root, ret);
                        if (own_trans)
                                btrfs_end_transaction(trans, root);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 01277b8f2373..5665d2149249 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -33,14 +33,14 @@ static void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
 */
 void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
 {
-        if (eb->lock_nested) {
+        /*
-                read_lock(&eb->lock);
+         * no lock is required.  The lock owner may change if
-                if (eb->lock_nested && current->pid == eb->lock_owner) {
+         * we have a read lock, but it won't change to or away
-                        read_unlock(&eb->lock);
+         * from us.  If we have the write lock, we are the owner
-                        return;
+         * and it'll never change.
-                }
+         */
-                read_unlock(&eb->lock);
+        if (eb->lock_nested && current->pid == eb->lock_owner)
-        }
+                return;
        if (rw == BTRFS_WRITE_LOCK) {
                if (atomic_read(&eb->blocking_writers) == 0) {
                        WARN_ON(atomic_read(&eb->spinning_writers) != 1);
@@ -65,14 +65,15 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
 */
 void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
 {
-        if (eb->lock_nested) {
+        /*
-                read_lock(&eb->lock);
+         * no lock is required.  The lock owner may change if
-                if (eb->lock_nested && current->pid == eb->lock_owner) {
+         * we have a read lock, but it won't change to or away
-                        read_unlock(&eb->lock);
+         * from us.  If we have the write lock, we are the owner
-                        return;
+         * and it'll never change.
-                }
+         */
-                read_unlock(&eb->lock);
+        if (eb->lock_nested && current->pid == eb->lock_owner)
-        }
+                return;
        if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
                BUG_ON(atomic_read(&eb->blocking_writers) != 1);
                write_lock(&eb->lock);
@@ -99,6 +100,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
 void btrfs_tree_read_lock(struct extent_buffer *eb)
 {
 again:
+        BUG_ON(!atomic_read(&eb->blocking_writers) &&
+               current->pid == eb->lock_owner);
        read_lock(&eb->lock);
        if (atomic_read(&eb->blocking_writers) &&
            current->pid == eb->lock_owner) {
@@ -132,7 +136,9 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb)
        if (atomic_read(&eb->blocking_writers))
                return 0;
-        read_lock(&eb->lock);
+        if (!read_trylock(&eb->lock))
+                return 0;
        if (atomic_read(&eb->blocking_writers)) {
                read_unlock(&eb->lock);
                return 0;
@@ -151,7 +157,10 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
        if (atomic_read(&eb->blocking_writers) ||
            atomic_read(&eb->blocking_readers))
                return 0;
-        write_lock(&eb->lock);
+        if (!write_trylock(&eb->lock))
+                return 0;
        if (atomic_read(&eb->blocking_writers) ||
            atomic_read(&eb->blocking_readers)) {
                write_unlock(&eb->lock);
@@ -168,14 +177,15 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
 */
 void btrfs_tree_read_unlock(struct extent_buffer *eb)
 {
-        if (eb->lock_nested) {
+        /*
-                read_lock(&eb->lock);
+         * if we're nested, we have the write lock.  No new locking
-                if (eb->lock_nested && current->pid == eb->lock_owner) {
+         * is needed as long as we are the lock owner.
-                        eb->lock_nested = 0;
+         * The write unlock will do a barrier for us, and the lock_nested
-                        read_unlock(&eb->lock);
+         * field only matters to the lock owner.
-                        return;
+         */
-                }
+        if (eb->lock_nested && current->pid == eb->lock_owner) {
-                read_unlock(&eb->lock);
+                eb->lock_nested = 0;
+                return;
        }
        btrfs_assert_tree_read_locked(eb);
        WARN_ON(atomic_read(&eb->spinning_readers) == 0);
@@ -189,14 +199,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
 */
 void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
 {
-        if (eb->lock_nested) {
+        /*
-                read_lock(&eb->lock);
+         * if we're nested, we have the write lock.  No new locking
-                if (eb->lock_nested && current->pid == eb->lock_owner) {
+         * is needed as long as we are the lock owner.
-                        eb->lock_nested = 0;
+         * The write unlock will do a barrier for us, and the lock_nested
-                        read_unlock(&eb->lock);
+         * field only matters to the lock owner.
-                        return;
+         */
-                }
+        if (eb->lock_nested && current->pid == eb->lock_owner) {
-                read_unlock(&eb->lock);
+                eb->lock_nested = 0;
+                return;
        }
        btrfs_assert_tree_read_locked(eb);
        WARN_ON(atomic_read(&eb->blocking_readers) == 0);
@@ -244,6 +255,7 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
        BUG_ON(blockers > 1);
        btrfs_assert_tree_locked(eb);
+        eb->lock_owner = 0;
        atomic_dec(&eb->write_locks);
        if (blockers) {
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ac80188eec88..b6d198f5181e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2725,11 +2725,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                length = btrfs_dev_extent_length(l, dev_extent);
-                if (found_key.offset + length <= start) {
+                if (found_key.offset + length <= start)
-                        key.offset = found_key.offset + length;
+                        goto skip;
-                        btrfs_release_path(path);
-                        continue;
-                }
                chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
                chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
@@ -2740,10 +2737,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
                 * the chunk from going away while we scrub it
                 */
                cache = btrfs_lookup_block_group(fs_info, chunk_offset);
-                if (!cache) {
-                        ret = -ENOENT;
+                /* some chunks are removed but not committed to disk yet,
-                        break;
+                 * continue scrubbing */
-                }
+                if (!cache)
+                        goto skip;
                dev_replace->cursor_right = found_key.offset + length;
                dev_replace->cursor_left = found_key.offset;
                dev_replace->item_needs_writeback = 1;
@@ -2802,7 +2801,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
                dev_replace->cursor_left = dev_replace->cursor_right;
                dev_replace->item_needs_writeback = 1;
+skip:
                key.offset = found_key.offset + length;
                btrfs_release_path(path);
        }
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ffeed6d6326f..c83b24251e53 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2543,9 +2543,6 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
        remove_extent_mapping(em_tree, em);
        write_unlock(&em_tree->lock);
-        kfree(map);
-        em->bdev = NULL;
        /* once for the tree */
        free_extent_map(em);
        /* once for us */
@@ -4301,9 +4298,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
        em = alloc_extent_map();
        if (!em) {
+                kfree(map);
                ret = -ENOMEM;
                goto error;
        }
+        set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
        em->bdev = (struct block_device *)map;
        em->start = start;
        em->len = num_bytes;
@@ -4346,7 +4345,6 @@ error_del_extent:
        /* One for the tree reference */
        free_extent_map(em);
 error:
-        kfree(map);
        kfree(devices_info);
        return ret;
 }
@@ -4558,7 +4556,6 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
                write_unlock(&tree->map_tree.lock);
                if (!em)
                        break;
-                kfree(em->bdev);
                /* once for us */
                free_extent_map(em);
                /* once for the tree */
@@ -5362,6 +5359,15 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
        return 0;
 }
+static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err)
+{
+        if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED))
+                bio_endio_nodec(bio, err);
+        else
+                bio_endio(bio, err);
+        kfree(bbio);
+}
 static void btrfs_end_bio(struct bio *bio, int err)
 {
        struct btrfs_bio *bbio = bio->bi_private;
@@ -5402,12 +5408,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
                        bio = bbio->orig_bio;
                }
-                /*
-                 * We have original bio now. So increment bi_remaining to
-                 * account for it in endio
-                 */
-                atomic_inc(&bio->bi_remaining);
                bio->bi_private = bbio->private;
                bio->bi_end_io = bbio->end_io;
                btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
@@ -5424,9 +5424,8 @@ static void btrfs_end_bio(struct bio *bio, int err)
                        set_bit(BIO_UPTODATE, &bio->bi_flags);
                        err = 0;
                }
-                kfree(bbio);
-                bio_endio(bio, err);
+                btrfs_end_bbio(bbio, bio, err);
        } else if (!is_orig_bio) {
                bio_put(bio);
        }
@@ -5589,12 +5588,15 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
 {
        atomic_inc(&bbio->error);
        if (atomic_dec_and_test(&bbio->stripes_pending)) {
+                /* Should be the original bio. */
+                WARN_ON(bio != bbio->orig_bio);
                bio->bi_private = bbio->private;
                bio->bi_end_io = bbio->end_io;
                btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
                bio->bi_iter.bi_sector = logical >> 9;
-                kfree(bbio);
-                bio_endio(bio, -EIO);
+                btrfs_end_bbio(bbio, bio, -EIO);
        }
 }
@@ -5681,6 +5683,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
                        BUG_ON(!bio); /* -ENOMEM */
                } else {
                        bio = first_bio;
+                        bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED;
                }
                submit_stripe_bio(root, bbio, bio,
@@ -5822,6 +5825,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
                return -ENOMEM;
        }
+        set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
        em->bdev = (struct block_device *)map;
        em->start = logical;
        em->len = length;
@@ -5846,7 +5850,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
                map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
                                                        uuid, NULL);
                if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
-                        kfree(map);
                        free_extent_map(em);
                        return -EIO;
                }
@@ -5854,7 +5857,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
                        map->stripes[i].dev =
                                add_missing_dev(root, devid, uuid);
                        if (!map->stripes[i].dev) {
-                                kfree(map);
                                free_extent_map(em);
                                return -EIO;
                        }
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 1a15bbeb65e2..2aaa00c47816 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -190,11 +190,14 @@ struct btrfs_bio_stripe {
 struct btrfs_bio;
 typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
+#define BTRFS_BIO_ORIG_BIO_SUBMITTED    0x1
 struct btrfs_bio {
        atomic_t stripes_pending;
        struct btrfs_fs_info *fs_info;
        bio_end_io_t *end_io;
        struct bio *orig_bio;
+        unsigned long flags;
        void *private;
        atomic_t error;
        int max_errors;
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 0227b45ef00a..15e9505aa35f 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -290,7 +290,8 @@ int
 cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
                 const struct nls_table *cp, int mapChars)
 {
-        int i, j, charlen;
+        int i, charlen;
+        int j = 0;
        char src_char;
        __le16 dst_char;
        wchar_t tmp;
@@ -298,12 +299,11 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
        if (!mapChars)
                return cifs_strtoUTF16(target, source, PATH_MAX, cp);
-        for (i = 0, j = 0; i < srclen; j++) {
+        for (i = 0; i < srclen; j++) {
                src_char = source[i];
                charlen = 1;
                switch (src_char) {
                case 0:
-                        put_unaligned(0, &target[j]);
                        goto ctoUTF16_out;
                case ':':
                        dst_char = cpu_to_le16(UNI_COLON);
@@ -350,6 +350,7 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
        }
 ctoUTF16_out:
+        put_unaligned(0, &target[j]); /* Null terminate target unicode string */
        return j;
 }
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 2c90d07c0b3a..888398067420 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -725,6 +725,19 @@ out_nls:
        goto out;
 }
+static ssize_t
+cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+        ssize_t rc;
+        struct inode *inode = file_inode(iocb->ki_filp);
+        rc = cifs_revalidate_mapping(inode);
+        if (rc)
+                return rc;
+        return generic_file_read_iter(iocb, iter);
+}
 static ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
        struct inode *inode = file_inode(iocb->ki_filp);
@@ -881,7 +894,7 @@ const struct inode_operations cifs_symlink_inode_ops = {
 const struct file_operations cifs_file_ops = {
        .read = new_sync_read,
        .write = new_sync_write,
-        .read_iter = generic_file_read_iter,
+        .read_iter = cifs_loose_read_iter,
        .write_iter = cifs_file_write_iter,
        .open = cifs_open,
        .release = cifs_close,
@@ -939,7 +952,7 @@ const struct file_operations cifs_file_direct_ops = {
 const struct file_operations cifs_file_nobrl_ops = {
        .read = new_sync_read,
        .write = new_sync_write,
-        .read_iter = generic_file_read_iter,
+        .read_iter = cifs_loose_read_iter,
        .write_iter = cifs_file_write_iter,
        .open = cifs_open,
        .release = cifs_close,
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 264ece71bdb2..68559fd557fb 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -374,7 +374,7 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
        oparms.cifs_sb = cifs_sb;
        oparms.desired_access = GENERIC_WRITE;
        oparms.create_options = create_options;
-        oparms.disposition = FILE_OPEN;
+        oparms.disposition = FILE_CREATE;
        oparms.path = path;
        oparms.fid = &fid;
        oparms.reconnect = false;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index b73e0621ce9e..b10b48c2a7af 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -910,7 +910,7 @@ static const struct file_operations eventpoll_fops = {
 void eventpoll_release_file(struct file *file)
 {
        struct eventpoll *ep;
-        struct epitem *epi;
+        struct epitem *epi, *next;
        /*
         * We don't want to get "file->f_lock" because it is not
@@ -926,7 +926,7 @@ void eventpoll_release_file(struct file *file)
         * Besides, ep_remove() acquires the lock, so we can't hold it here.
         */
        mutex_lock(&epmutex);
-        list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
+        list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) {
                ep = epi->ep;
                mutex_lock_nested(&ep->mtx, 0);
                ep_remove(ep, epi);
diff --git a/fs/locks.c b/fs/locks.c
index da57c9b7e844..717fbc404e6b 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -431,7 +431,7 @@ static int lease_init(struct file *filp, long type, struct file_lock *fl)
        if (assign_type(fl, type) != 0)
                return -EINVAL;
-        fl->fl_owner = (fl_owner_t)filp;
+        fl->fl_owner = (fl_owner_t)current->files;
        fl->fl_pid = current->tgid;
        fl->fl_file = filp;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c496f8a74639..9927913c97c2 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -147,6 +147,17 @@ int nfs_sync_mapping(struct address_space *mapping)
        return ret;
 }
+static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
+{
+        struct nfs_inode *nfsi = NFS_I(inode);
+        if (inode->i_mapping->nrpages == 0)
+                flags &= ~NFS_INO_INVALID_DATA;
+        nfsi->cache_validity |= flags;
+        if (flags & NFS_INO_INVALID_DATA)
+                nfs_fscache_invalidate(inode);
+}
 /*
 * Invalidate the local caches
 */
@@ -162,17 +173,16 @@ static void nfs_zap_caches_locked(struct inode *inode)
        memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf));
        if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
-                nfs_fscache_invalidate(inode);
+                nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
-                nfsi->cache_validity |= NFS_INO_INVALID_ATTR
                                        | NFS_INO_INVALID_DATA
                                        | NFS_INO_INVALID_ACCESS
                                        | NFS_INO_INVALID_ACL
-                                        | NFS_INO_REVAL_PAGECACHE;
+                                        | NFS_INO_REVAL_PAGECACHE);
        } else
-                nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+                nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
                                        | NFS_INO_INVALID_ACCESS
                                        | NFS_INO_INVALID_ACL
-                                        | NFS_INO_REVAL_PAGECACHE;
+                                        | NFS_INO_REVAL_PAGECACHE);
        nfs_zap_label_cache_locked(nfsi);
 }
@@ -187,8 +197,7 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
 {
        if (mapping->nrpages != 0) {
                spin_lock(&inode->i_lock);
-                NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
+                nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
-                nfs_fscache_invalidate(inode);
                spin_unlock(&inode->i_lock);
        }
 }
@@ -209,7 +218,7 @@ EXPORT_SYMBOL_GPL(nfs_zap_acl_cache);
 void nfs_invalidate_atime(struct inode *inode)
 {
        spin_lock(&inode->i_lock);
-        NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
+        nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);
        spin_unlock(&inode->i_lock);
 }
 EXPORT_SYMBOL_GPL(nfs_invalidate_atime);
@@ -369,7 +378,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
                inode->i_mode = fattr->mode;
                if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0
                                && nfs_server_capable(inode, NFS_CAP_MODE))
-                        nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+                        nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
                /* Why so? Because we want revalidate for devices/FIFOs, and
                 * that's precisely what we have in nfs_file_inode_operations.
                 */
@@ -415,36 +424,36 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
                if (fattr->valid & NFS_ATTR_FATTR_ATIME)
                        inode->i_atime = fattr->atime;
                else if (nfs_server_capable(inode, NFS_CAP_ATIME))
-                        nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+                        nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
                if (fattr->valid & NFS_ATTR_FATTR_MTIME)
                        inode->i_mtime = fattr->mtime;
                else if (nfs_server_capable(inode, NFS_CAP_MTIME))
-                        nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+                        nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
                if (fattr->valid & NFS_ATTR_FATTR_CTIME)
                        inode->i_ctime = fattr->ctime;
                else if (nfs_server_capable(inode, NFS_CAP_CTIME))
-                        nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+                        nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
                if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
                        inode->i_version = fattr->change_attr;
                else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR))
-                        nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+                        nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
                if (fattr->valid & NFS_ATTR_FATTR_SIZE)
                        inode->i_size = nfs_size_to_loff_t(fattr->size);
                else
-                        nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+                        nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
-                                | NFS_INO_REVAL_PAGECACHE;
+                                | NFS_INO_REVAL_PAGECACHE);
                if (fattr->valid & NFS_ATTR_FATTR_NLINK)
                        set_nlink(inode, fattr->nlink);
                else if (nfs_server_capable(inode, NFS_CAP_NLINK))
-                        nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+                        nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
                if (fattr->valid & NFS_ATTR_FATTR_OWNER)
                        inode->i_uid = fattr->uid;
                else if (nfs_server_capable(inode, NFS_CAP_OWNER))
-                        nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+                        nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
                if (fattr->valid & NFS_ATTR_FATTR_GROUP)
                        inode->i_gid = fattr->gid;
                else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP))
-                        nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+                        nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
                if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
                        inode->i_blocks = fattr->du.nfs2.blocks;
                if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
@@ -550,6 +559,9 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset)
        spin_lock(&inode->i_lock);
        i_size_write(inode, offset);
+        /* Optimisation: truncating to 0 leaves no cached data to invalidate */
+        if (offset == 0)
+                NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA;
        spin_unlock(&inode->i_lock);
        truncate_pagecache(inode, offset);
@@ -578,7 +590,8 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
                        inode->i_uid = attr->ia_uid;
                if ((attr->ia_valid & ATTR_GID) != 0)
                        inode->i_gid = attr->ia_gid;
-                NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+                nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS
+                                | NFS_INO_INVALID_ACL);
                spin_unlock(&inode->i_lock);
        }
        if ((attr->ia_valid & ATTR_SIZE) != 0) {
@@ -1101,7 +1114,7 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr
                        && inode->i_version == fattr->pre_change_attr) {
                inode->i_version = fattr->change_attr;
                if (S_ISDIR(inode->i_mode))
-                        nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+                        nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
                ret |= NFS_INO_INVALID_ATTR;
        }
        /* If we have atomic WCC data, we may update some attributes */
@@ -1117,7 +1130,7 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr
                        && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
                memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
                if (S_ISDIR(inode->i_mode))
-                        nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+                        nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
                ret |= NFS_INO_INVALID_ATTR;
        }
        if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
@@ -1128,9 +1141,6 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr
                ret |= NFS_INO_INVALID_ATTR;
        }
-        if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
-                nfs_fscache_invalidate(inode);
        return ret;
 }
@@ -1189,7 +1199,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
                invalid |= NFS_INO_INVALID_ATIME;
        if (invalid != 0)
-                nfsi->cache_validity |= invalid;
+                nfs_set_cache_invalid(inode, invalid);
        nfsi->read_cache_jiffies = fattr->time_start;
        return 0;
@@ -1402,13 +1412,11 @@ EXPORT_SYMBOL_GPL(nfs_refresh_inode);
 static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
 {
-        struct nfs_inode *nfsi = NFS_I(inode);
+        unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
-        nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+        if (S_ISDIR(inode->i_mode))
-        if (S_ISDIR(inode->i_mode)) {
+                invalid |= NFS_INO_INVALID_DATA;
-                nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+        nfs_set_cache_invalid(inode, invalid);
-                nfs_fscache_invalidate(inode);
-        }
        if ((fattr->valid & NFS_ATTR_FATTR) == 0)
                return 0;
        return nfs_refresh_inode_locked(inode, fattr);
@@ -1601,6 +1609,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                        if ((nfsi->npages == 0) || new_isize > cur_isize) {
                                i_size_write(inode, new_isize);
                                invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+                                invalid &= ~NFS_INO_REVAL_PAGECACHE;
                        }
                        dprintk("NFS: isize change on server for file %s/%ld "
                                        "(%Ld to %Ld)\n",
@@ -1702,10 +1711,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                invalid &= ~NFS_INO_INVALID_DATA;
        if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) ||
                        (save_cache_validity & NFS_INO_REVAL_FORCED))
-                nfsi->cache_validity |= invalid;
+                nfs_set_cache_invalid(inode, invalid);
-        if (invalid & NFS_INO_INVALID_DATA)
-                nfs_fscache_invalidate(inode);
        return 0;
 out_err:
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index f63cb87cd730..ba2affa51941 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -230,7 +230,7 @@ int nfs_atomic_open(struct inode *, struct dentry *, struct file *,
 extern struct file_system_type nfs4_fs_type;
 /* nfs4namespace.c */
-struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *, struct inode *, struct qstr *);
+struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *, struct qstr *);
 struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *,
                               struct nfs_fh *, struct nfs_fattr *);
 int nfs4_replace_transport(struct nfs_server *server,
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 3d5dbf80d46a..3d83cb1fdc70 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -139,16 +139,22 @@ static size_t nfs_parse_server_name(char *string, size_t len,
 * @server: NFS server struct
 * @flavors: List of security tuples returned by SECINFO procedure
 *
- * Return the pseudoflavor of the first security mechanism in
+ * Return an rpc client that uses the first security mechanism in
- * "flavors" that is locally supported.  Return RPC_AUTH_UNIX if
+ * "flavors" that is locally supported.  The "flavors" array
- * no matching flavor is found in the array.  The "flavors" array
 * is searched in the order returned from the server, per RFC 3530
- * recommendation.
+ * recommendation and each flavor is checked for membership in the
+ * sec= mount option list if it exists.
+ *
+ * Return -EPERM if no matching flavor is found in the array.
+ *
+ * Please call rpc_shutdown_client() when you are done with this rpc client.
+ *
 */
-static rpc_authflavor_t nfs_find_best_sec(struct nfs_server *server,
+static struct rpc_clnt *nfs_find_best_sec(struct rpc_clnt *clnt,
+                                          struct nfs_server *server,
                                          struct nfs4_secinfo_flavors *flavors)
 {
-        rpc_authflavor_t pseudoflavor;
+        rpc_authflavor_t pflavor;
        struct nfs4_secinfo4 *secinfo;
        unsigned int i;
@@ -159,62 +165,73 @@ static rpc_authflavor_t nfs_find_best_sec(struct nfs_server *server,
                case RPC_AUTH_NULL:
                case RPC_AUTH_UNIX:
                case RPC_AUTH_GSS:
-                        pseudoflavor = rpcauth_get_pseudoflavor(secinfo->flavor,
+                        pflavor = rpcauth_get_pseudoflavor(secinfo->flavor,
                                                        &secinfo->flavor_info);
-                        /* make sure pseudoflavor matches sec= mount opt */
+                        /* does the pseudoflavor match a sec= mount opt? */
-                        if (pseudoflavor != RPC_AUTH_MAXFLAVOR &&
+                        if (pflavor != RPC_AUTH_MAXFLAVOR &&
-                            nfs_auth_info_match(&server->auth_info,
+                            nfs_auth_info_match(&server->auth_info, pflavor)) {
-                                                pseudoflavor))
+                                struct rpc_clnt *new;
-                                return pseudoflavor;
+                                struct rpc_cred *cred;
-                        break;
+                                /* Cloning creates an rpc_auth for the flavor */
+                                new = rpc_clone_client_set_auth(clnt, pflavor);
+                                if (IS_ERR(new))
+                                        continue;
+                                /*
+                                 * Check that the user can actually use the
+                                 * flavor. This is mostly for RPC_AUTH_GSS,
+                                 * where cr_init obtains a GSS context.
+                                 */
+                                cred = rpcauth_lookupcred(new->cl_auth, 0);
+                                if (IS_ERR(cred)) {
+                                        rpc_shutdown_client(new);
+                                        continue;
+                                }
+                                put_rpccred(cred);
+                                return new;
+                        }
                }
        }
+        return ERR_PTR(-EPERM);
-        /* if there were any sec= options then nothing matched */
-        if (server->auth_info.flavor_len > 0)
-                return -EPERM;
-        return RPC_AUTH_UNIX;
 }
-static rpc_authflavor_t nfs4_negotiate_security(struct inode *inode, struct qstr *name)
+/**
+ * nfs4_negotiate_security - in response to an NFS4ERR_WRONGSEC on lookup,
+ * return an rpc_clnt that uses the best available security flavor with
+ * respect to the secinfo flavor list and the sec= mount options.
+ *
+ * @clnt: RPC client to clone
+ * @inode: directory inode
+ * @name: lookup name
+ *
+ * Please call rpc_shutdown_client() when you are done with this rpc client.
+ */
+struct rpc_clnt *
+nfs4_negotiate_security(struct rpc_clnt *clnt, struct inode *inode,
+                                        struct qstr *name)
 {
        struct page *page;
        struct nfs4_secinfo_flavors *flavors;
-        rpc_authflavor_t flavor;
+        struct rpc_clnt *new;
        int err;
        page = alloc_page(GFP_KERNEL);
        if (!page)
-                return -ENOMEM;
+                return ERR_PTR(-ENOMEM);
        flavors = page_address(page);
        err = nfs4_proc_secinfo(inode, name, flavors);
        if (err < 0) {
-                flavor = err;
+                new = ERR_PTR(err);
                goto out;
        }
-        flavor = nfs_find_best_sec(NFS_SERVER(inode), flavors);
+        new = nfs_find_best_sec(clnt, NFS_SERVER(inode), flavors);
 out:
        put_page(page);
-        return flavor;
+        return new;
-}
-/*
- * Please call rpc_shutdown_client() when you are done with this client.
- */
-struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *clnt, struct inode *inode,
-                                        struct qstr *name)
-{
-        rpc_authflavor_t flavor;
-        flavor = nfs4_negotiate_security(inode, name);
-        if ((int)flavor < 0)
-                return ERR_PTR((int)flavor);
-        return rpc_clone_client_set_auth(clnt, flavor);
 }
 static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
@@ -397,11 +414,6 @@ struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry,
        if (client->cl_auth->au_flavor != flavor)
                flavor = client->cl_auth->au_flavor;
-        else {
-                rpc_authflavor_t new = nfs4_negotiate_security(dir, name);
-                if ((int)new >= 0)
-                        flavor = new;
-        }
        mnt = nfs_do_submount(dentry, fh, fattr, flavor);
 out:
        rpc_shutdown_client(client);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 285ad5334018..4bf3d97cc5a0 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3247,7 +3247,7 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
                        err = -EPERM;
                        if (client != *clnt)
                                goto out;
-                        client = nfs4_create_sec_client(client, dir, name);
+                        client = nfs4_negotiate_security(client, dir, name);
                        if (IS_ERR(client))
                                return PTR_ERR(client);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3ee5af4e738e..98ff061ccaf3 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -934,12 +934,14 @@ static bool nfs_write_pageuptodate(struct page *page, struct inode *inode)
        if (nfs_have_delegated_attributes(inode))
                goto out;
-        if (nfsi->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE))
+        if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
                return false;
        smp_rmb();
        if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags))
                return false;
 out:
+        if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+                return false;
        return PageUptodate(page) != 0;
 }
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c0d45cec9958..2204e1fe5725 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -41,6 +41,7 @@
 #include <linux/ratelimit.h>
 #include <linux/sunrpc/svcauth_gss.h>
 #include <linux/sunrpc/addr.h>
+#include <linux/hash.h>
 #include "xdr4.h"
 #include "xdr4cb.h"
 #include "vfs.h"
@@ -364,6 +365,79 @@ static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
        return openlockstateid(nfs4_alloc_stid(clp, stateid_slab));
 }
+/*
+ * When we recall a delegation, we should be careful not to hand it
+ * out again straight away.
+ * To ensure this we keep a pair of bloom filters ('new' and 'old')
+ * in which the filehandles of recalled delegations are "stored".
+ * If a filehandle appears in either filter, a delegation is blocked.
+ * When a delegation is recalled, the filehandle is stored in the "new"
+ * filter.
+ * Every 30 seconds we swap the filters and clear the "new" one,
+ * unless both are empty of course.
+ *
+ * Each filter is 256 bits.  We hash the filehandle to 32bit and use the
+ * low 3 bytes as hash-table indices.
+ *
+ * 'state_lock', which is always held when block_delegations() is called,
+ * is used to manage concurrent access.  Testing does not need the lock
+ * except when swapping the two filters.
+ */
+static struct bloom_pair { /* the two filters consulted by delegation_blocked() */
+        int     entries, old_entries; /* entries: +1 per block_delegations(), reduced by old_entries at each swap; old_entries: 'entries' as left by the last swap */
+        time_t  swap_time; /* seconds_since_boot() at the last swap, or at the first insert while empty */
+        int     new; /* index into 'set' */
+        DECLARE_BITMAP(set[2], 256); /* two 256-bit filters; set[new] receives inserts */
+} blocked_delegations;
+/*
+ * Return 1 if the three hash-derived bits for @fh are all set in either
+ * bloom filter (a recent recall, or a false positive), 0 otherwise.
+ * As a side effect, rotate the filters once the last swap is more than
+ * 30 seconds old.
+ */
+static int delegation_blocked(struct knfsd_fh *fh)
+{
+        u32 hash;
+        struct bloom_pair *bd = &blocked_delegations;
+        if (bd->entries == 0)
+                return 0;
+        if (seconds_since_boot() - bd->swap_time > 30) {
+                spin_lock(&state_lock);
+                /* Re-check under the lock; another caller may have swapped. */
+                if (seconds_since_boot() - bd->swap_time > 30) {
+                        bd->entries -= bd->old_entries;
+                        bd->old_entries = bd->entries;
+                        /*
+                         * Flip first, then clear: the filter that was
+                         * being filled becomes "old" and keeps its bits,
+                         * while the expired one is wiped and reused as
+                         * "new".  Clearing before the flip would discard
+                         * the most recent recalls and keep the stale ones.
+                         */
+                        bd->new = 1-bd->new;
+                        memset(bd->set[bd->new], 0,
+                               sizeof(bd->set[0]));
+                        bd->swap_time = seconds_since_boot();
+                }
+                spin_unlock(&state_lock);
+        }
+        hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
+        /* Each of the low three bytes of the hash selects one bit. */
+        if (test_bit(hash&255, bd->set[0]) &&
+            test_bit((hash>>8)&255, bd->set[0]) &&
+            test_bit((hash>>16)&255, bd->set[0]))
+                return 1;
+        if (test_bit(hash&255, bd->set[1]) &&
+            test_bit((hash>>8)&255, bd->set[1]) &&
+            test_bit((hash>>16)&255, bd->set[1]))
+                return 1;
+        return 0;
+}
+static void block_delegations(struct knfsd_fh *fh)
+{ /* Record @fh in the current "new" filter and bump the entry count. */
+        u32 hash;
+        struct bloom_pair *bd = &blocked_delegations;
+        hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
+        __set_bit(hash&255, bd->set[bd->new]); /* one bit per low hash byte, three in total */
+        __set_bit((hash>>8)&255, bd->set[bd->new]);
+        __set_bit((hash>>16)&255, bd->set[bd->new]);
+        if (bd->entries == 0)
+                bd->swap_time = seconds_since_boot(); /* first insert while empty restarts the 30s window */
+        bd->entries += 1;
+}
 static struct nfs4_delegation *
 alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh)
 {
@@ -372,6 +446,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv
        dprintk("NFSD alloc_init_deleg\n");
        if (num_delegations > max_delegations)
                return NULL;
+        if (delegation_blocked(&current_fh->fh_handle))
+                return NULL;
        dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab));
        if (dp == NULL)
                return dp;
@@ -2770,6 +2846,8 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
        /* Only place dl_time is set; protected by i_lock: */
        dp->dl_time = get_seconds();
+        block_delegations(&dp->dl_fh);
        nfsd4_cb_recall(dp);
 }
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 2d305a121f37..83baf2bfe9e9 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2687,6 +2687,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
                nfserr = nfserr_toosmall;
                goto fail;
        case nfserr_noent:
+                xdr_truncate_encode(xdr, start_offset);
                goto skip_entry;
        default:
                /*
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index a106b3f2b22a..fae17c640df3 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -331,6 +331,7 @@ struct dlm_lock_resource
        u16 state;
        char lvb[DLM_LVB_LEN];
        unsigned int inflight_locks;
+        unsigned int inflight_assert_workers;
        unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
 };
@@ -910,6 +911,9 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
 void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
                                   struct dlm_lock_resource *res);
+void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+                struct dlm_lock_resource *res);
 void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
 void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
 void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 3087a21d32f9..82abf0cc9a12 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -581,6 +581,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
        atomic_set(&res->asts_reserved, 0);
        res->migration_pending = 0;
        res->inflight_locks = 0;
+        res->inflight_assert_workers = 0;
        res->dlm = dlm;
@@ -683,6 +684,43 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
        wake_up(&res->wq);
 }
+void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+                struct dlm_lock_resource *res)
+{ /* Count one more in-flight assert worker on @res; @dlm is only used for the log line. */
+        assert_spin_locked(&res->spinlock); /* caller must already hold res->spinlock */
+        res->inflight_assert_workers++;
+        mlog(0, "%s:%.*s: inflight assert worker++: now %u\n",
+                        dlm->name, res->lockname.len, res->lockname.name,
+                        res->inflight_assert_workers);
+}
+static void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+                struct dlm_lock_resource *res)
+{ /* Locking wrapper: takes res->spinlock around __dlm_lockres_grab_inflight_worker(). */
+        spin_lock(&res->spinlock);
+        __dlm_lockres_grab_inflight_worker(dlm, res);
+        spin_unlock(&res->spinlock);
+}
+static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
+                struct dlm_lock_resource *res)
+{ /* Count one fewer in-flight assert worker on @res; @dlm is only used for the log line. */
+        assert_spin_locked(&res->spinlock); /* caller must already hold res->spinlock */
+        BUG_ON(res->inflight_assert_workers == 0); /* a drop with the counter at zero would underflow it */
+        res->inflight_assert_workers--;
+        mlog(0, "%s:%.*s: inflight assert worker--: now %u\n",
+                        dlm->name, res->lockname.len, res->lockname.name,
+                        res->inflight_assert_workers);
+}
+static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
+                struct dlm_lock_resource *res)
+{ /* Locking wrapper: takes res->spinlock around __dlm_lockres_drop_inflight_worker(). */
+        spin_lock(&res->spinlock);
+        __dlm_lockres_drop_inflight_worker(dlm, res);
+        spin_unlock(&res->spinlock);
+}
 /*
 * lookup a lock resource by name.
 * may already exist in the hashtable.
@@ -1603,7 +1641,8 @@ send_response:
                        mlog(ML_ERROR, "failed to dispatch assert master work\n");
                        response = DLM_MASTER_RESP_ERROR;
                        dlm_lockres_put(res);
-                }
+                } else
+                        dlm_lockres_grab_inflight_worker(dlm, res);
        } else {
                if (res)
                        dlm_lockres_put(res);
@@ -2118,6 +2157,8 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
        dlm_lockres_release_ast(dlm, res);
 put:
+        dlm_lockres_drop_inflight_worker(dlm, res);
        dlm_lockres_put(res);
        mlog(0, "finished with dlm_assert_master_worker\n");
@@ -3088,11 +3129,15 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
                        /* remove it so that only one mle will be found */
                        __dlm_unlink_mle(dlm, tmp);
                        __dlm_mle_detach_hb_events(dlm, tmp);
-                        ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
+                        if (tmp->type == DLM_MLE_MASTER) {
-                        mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
+                                ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
-                            "telling master to get ref for cleared out mle "
+                                mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
-                            "during migration\n", dlm->name, namelen, name,
+                                                "telling master to get ref "
-                            master, new_master);
+                                                "for cleared out mle during "
+                                                "migration\n", dlm->name,
+                                                namelen, name, master,
+                                                new_master);
+                        }
                }
                spin_unlock(&tmp->spinlock);
        }
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 5de019437ea5..45067faf5695 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1708,7 +1708,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
                                mlog_errno(-ENOMEM);
                                /* retry!? */
                                BUG();
-                        }
+                        } else
+                                __dlm_lockres_grab_inflight_worker(dlm, res);
                } else /* put.. incase we are not the master */
                        dlm_lockres_put(res);
                spin_unlock(&res->spinlock);
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 9db869de829d..69aac6f088ad 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -259,12 +259,15 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
                 * refs on it. */
                unused = __dlm_lockres_unused(lockres);
                if (!unused ||
-                    (lockres->state & DLM_LOCK_RES_MIGRATING)) {
+                    (lockres->state & DLM_LOCK_RES_MIGRATING) ||
+                    (lockres->inflight_assert_workers != 0)) {
                        mlog(0, "%s: res %.*s is in use or being remastered, "
-                             "used %d, state %d\n", dlm->name,
+                             "used %d, state %d, assert master workers %u\n",
-                             lockres->lockname.len, lockres->lockname.name,
+                             dlm->name, lockres->lockname.len,
-                             !unused, lockres->state);
+                             lockres->lockname.name,
-                        list_move_tail(&dlm->purge_list, &lockres->purge);
+                             !unused, lockres->state,
+                             lockres->inflight_assert_workers);
+                        list_move_tail(&lockres->purge, &dlm->purge_list);
                        spin_unlock(&lockres->spinlock);
                        continue;
                }
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 5698b52cf5c9..2e3c9dbab68c 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -191,7 +191,9 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
                                     DLM_UNLOCK_CLEAR_CONVERT_TYPE);
                } else if (status == DLM_RECOVERING ||
                           status == DLM_MIGRATING ||
-                           status == DLM_FORWARD) {
+                           status == DLM_FORWARD ||
+                           status == DLM_NOLOCKMGR
+                           ) {
                        /* must clear the actions because this unlock
                         * is about to be retried.  cannot free or do
                         * any list manipulation. */
@@ -200,7 +202,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
                             res->lockname.name,
                             status==DLM_RECOVERING?"recovering":
                             (status==DLM_MIGRATING?"migrating":
-                              "forward"));
+                                (status == DLM_FORWARD ? "forward" :
+                                                "nolockmanager")));
                        actions = 0;
                }
                if (flags & LKM_CANCEL)
@@ -364,7 +367,10 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
                         * updated state to the recovery master.  this thread
                         * just needs to finish out the operation and call
                         * the unlockast. */
-                        ret = DLM_NORMAL;
+                        if (dlm_is_node_dead(dlm, owner))
+                                ret = DLM_NORMAL;
+                        else
+                                ret = DLM_NOLOCKMGR;
                } else {
                        /* something bad.  this will BUG in ocfs2 */
                        ret = dlm_err_to_dlm_status(tmpret);
@@ -638,7 +644,9 @@ retry:
        if (status == DLM_RECOVERING ||
            status == DLM_MIGRATING ||
-            status == DLM_FORWARD) {
+            status == DLM_FORWARD ||
+            status == DLM_NOLOCKMGR) {
                /* We want to go away for a tiny bit to allow recovery
                 * / migration to complete on this resource. I don't
                 * know of any wait queue we could sleep on as this
@@ -650,7 +658,7 @@ retry:
                msleep(50);
                mlog(0, "retrying unlock due to pending recovery/"
-                     "migration/in-progress\n");
+                     "migration/in-progress/reconnect\n");
                goto retry;
        }
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2060fc398445..8add6f1030d7 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -205,6 +205,21 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode)
        return inode;
 }
+static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb,
+                struct dentry *dentry, struct inode *inode)
+{ /* Tear down the dentry lock stored in dentry->d_fsdata and drop one reference on @inode. */
+        struct ocfs2_dentry_lock *dl = dentry->d_fsdata; /* dereferenced unchecked: callers must ensure it is non-NULL */
+        ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
+        ocfs2_lock_res_free(&dl->dl_lockres);
+        BUG_ON(dl->dl_count != 1); /* NOTE(review): presumably asserts no other user of dl remains - confirm */
+        spin_lock(&dentry_attach_lock);
+        dentry->d_fsdata = NULL; /* detach before freeing so the dentry no longer points at dl */
+        spin_unlock(&dentry_attach_lock);
+        kfree(dl);
+        iput(inode); /* NOTE(review): presumably balances a reference taken when dl was attached - confirm */
+}
 static int ocfs2_mknod(struct inode *dir,
                       struct dentry *dentry,
                       umode_t mode,
@@ -231,6 +246,7 @@ static int ocfs2_mknod(struct inode *dir,
        sigset_t oldset;
        int did_block_signals = 0;
        struct posix_acl *default_acl = NULL, *acl = NULL;
+        struct ocfs2_dentry_lock *dl = NULL;
        trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name,
                          (unsigned long long)OCFS2_I(dir)->ip_blkno,
@@ -423,6 +439,8 @@ static int ocfs2_mknod(struct inode *dir,
                goto leave;
        }
+        dl = dentry->d_fsdata;
        status = ocfs2_add_entry(handle, dentry, inode,
                                 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
                                 &lookup);
@@ -469,6 +487,9 @@ leave:
         * ocfs2_delete_inode will mutex_lock again.
         */
        if ((status < 0) && inode) {
+                if (dl)
+                        ocfs2_cleanup_add_entry_failure(osb, dentry, inode);
                OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
                clear_nlink(inode);
                iput(inode);
@@ -991,6 +1012,65 @@ leave:
        return status;
 }
+static int ocfs2_check_if_ancestor(struct ocfs2_super *osb,
+                u64 src_inode_no, u64 dest_inode_no)
+{ /* Walk ".." upward from src_inode_no: 1 if dest_inode_no is met, 0 if not, negative errno on failure. */
+        int ret = 0, i = 0;
+        u64 parent_inode_no = 0;
+        u64 child_inode_no = src_inode_no;
+        struct inode *child_inode;
+#define MAX_LOOKUP_TIMES 32 /* upper bound on the number of ".." hops */
+        while (1) {
+                child_inode = ocfs2_iget(osb, child_inode_no, 0, 0);
+                if (IS_ERR(child_inode)) {
+                        ret = PTR_ERR(child_inode);
+                        break;
+                }
+                ret = ocfs2_inode_lock(child_inode, NULL, 0);
+                if (ret < 0) {
+                        iput(child_inode);
+                        if (ret != -ENOENT) /* -ENOENT is returned without being logged */
+                                mlog_errno(ret);
+                        break;
+                }
+                ret = ocfs2_lookup_ino_from_name(child_inode, "..", 2,
+                                &parent_inode_no);
+                ocfs2_inode_unlock(child_inode, 0); /* lock and reference are released before the result is examined */
+                iput(child_inode);
+                if (ret < 0) {
+                        ret = -ENOENT; /* any ".." lookup failure is reported as -ENOENT */
+                        break;
+                }
+                if (parent_inode_no == dest_inode_no) { /* tested before the root check, so dest may be the root */
+                        ret = 1;
+                        break;
+                }
+                if (parent_inode_no == osb->root_inode->i_ino) { /* reached the root without meeting dest */
+                        ret = 0;
+                        break;
+                }
+                child_inode_no = parent_inode_no;
+                if (++i >= MAX_LOOKUP_TIMES) {
+                        mlog(ML_NOTICE, "max lookup times reached, filesystem "
+                                        "may have nested directories, "
+                                        "src inode: %llu, dest inode: %llu.\n",
+                                        (unsigned long long)src_inode_no,
+                                        (unsigned long long)dest_inode_no);
+                        ret = 0; /* give up and report "not an ancestor" */
+                        break;
+                }
+        }
+        return ret;
+}
 /*
 * The only place this should be used is rename!
 * if they have the same id, then the 1st one is the only one locked.
@@ -1002,6 +1082,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
                             struct inode *inode2)
 {
        int status;
+        int inode1_is_ancestor, inode2_is_ancestor;
        struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
        struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
        struct buffer_head **tmpbh;
@@ -1015,9 +1096,26 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
        if (*bh2)
                *bh2 = NULL;
-        /* we always want to lock the one with the lower lockid first. */
+        /* We always want to lock the one with the lower lockid first,
+         * and if they are nested, we lock the ancestor first. */
        if (oi1->ip_blkno != oi2->ip_blkno) {
-                if (oi1->ip_blkno < oi2->ip_blkno) {
+                inode1_is_ancestor = ocfs2_check_if_ancestor(osb, oi2->ip_blkno,
+                                oi1->ip_blkno);
+                if (inode1_is_ancestor < 0) {
+                        status = inode1_is_ancestor;
+                        goto bail;
+                }
+                inode2_is_ancestor = ocfs2_check_if_ancestor(osb, oi1->ip_blkno,
+                                oi2->ip_blkno);
+                if (inode2_is_ancestor < 0) {
+                        status = inode2_is_ancestor;
+                        goto bail;
+                }
+                if ((inode1_is_ancestor == 1) ||
+                                (oi1->ip_blkno < oi2->ip_blkno &&
+                                inode2_is_ancestor == 0)) {
                        /* switch id1 and id2 around */
                        tmpbh = bh2;
                        bh2 = bh1;
@@ -1098,6 +1196,7 @@ static int ocfs2_rename(struct inode *old_dir,
        struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, };
        struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
        struct ocfs2_dir_lookup_result target_insert = { NULL, };
+        bool should_add_orphan = false;
        /* At some point it might be nice to break this function up a
         * bit. */
@@ -1134,6 +1233,21 @@ static int ocfs2_rename(struct inode *old_dir,
                        goto bail;
                }
                rename_lock = 1;
+                /* here we cannot guarantee the inodes haven't just been
+                 * changed, so check if they are nested again */
+                status = ocfs2_check_if_ancestor(osb, new_dir->i_ino,
+                                old_inode->i_ino);
+                if (status < 0) {
+                        mlog_errno(status);
+                        goto bail;
+                } else if (status == 1) {
+                        status = -EPERM;
+                        trace_ocfs2_rename_not_permitted(
+                                        (unsigned long long)old_inode->i_ino,
+                                        (unsigned long long)new_dir->i_ino);
+                        goto bail;
+                }
        }
        /* if old and new are the same, this'll just do one lock. */
@@ -1304,6 +1418,7 @@ static int ocfs2_rename(struct inode *old_dir,
                                mlog_errno(status);
                                goto bail;
                        }
+                        should_add_orphan = true;
                }
        } else {
                BUG_ON(new_dentry->d_parent->d_inode != new_dir);
@@ -1348,17 +1463,6 @@ static int ocfs2_rename(struct inode *old_dir,
                        goto bail;
                }
-                if (S_ISDIR(new_inode->i_mode) ||
-                    (ocfs2_read_links_count(newfe) == 1)) {
-                        status = ocfs2_orphan_add(osb, handle, new_inode,
-                                                  newfe_bh, orphan_name,
-                                                  &orphan_insert, orphan_dir);
-                        if (status < 0) {
-                                mlog_errno(status);
-                                goto bail;
-                        }
-                }
                /* change the dirent to point to the correct inode */
                status = ocfs2_update_entry(new_dir, handle, &target_lookup_res,
                                            old_inode);
@@ -1373,6 +1477,15 @@ static int ocfs2_rename(struct inode *old_dir,
                else
                        ocfs2_add_links_count(newfe, -1);
                ocfs2_journal_dirty(handle, newfe_bh);
+                if (should_add_orphan) {
+                        status = ocfs2_orphan_add(osb, handle, new_inode,
+                                        newfe_bh, orphan_name,
+                                        &orphan_insert, orphan_dir);
+                        if (status < 0) {
+                                mlog_errno(status);
+                                goto bail;
+                        }
+                }
        } else {
                /* if the name was not found in new_dir, add it now */
                status = ocfs2_add_entry(handle, new_dentry, old_inode,
@@ -1642,6 +1755,7 @@ static int ocfs2_symlink(struct inode *dir,
        struct ocfs2_dir_lookup_result lookup = { NULL, };
        sigset_t oldset;
        int did_block_signals = 0;
+        struct ocfs2_dentry_lock *dl = NULL;
        trace_ocfs2_symlink_begin(dir, dentry, symname,
                                  dentry->d_name.len, dentry->d_name.name);
@@ -1830,6 +1944,8 @@ static int ocfs2_symlink(struct inode *dir,
                goto bail;
        }
+        dl = dentry->d_fsdata;
        status = ocfs2_add_entry(handle, dentry, inode,
                                 le64_to_cpu(fe->i_blkno), parent_fe_bh,
                                 &lookup);
@@ -1864,6 +1980,9 @@ bail:
        if (xattr_ac)
                ocfs2_free_alloc_context(xattr_ac);
        if ((status < 0) && inode) {
+                if (dl)
+                        ocfs2_cleanup_add_entry_failure(osb, dentry, inode);
                OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
                clear_nlink(inode);
                iput(inode);
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 1b60c62aa9d6..6cb019b7c6a8 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -2292,6 +2292,8 @@ TRACE_EVENT(ocfs2_rename,
                  __entry->new_len, __get_str(new_name))
 );
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_rename_not_permitted);
 TRACE_EVENT(ocfs2_rename_target_exists,
        TP_PROTO(int new_len, const char *new_name),
        TP_ARGS(new_len, new_name),
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 714e53b9cc66..636aab69ead5 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4288,9 +4288,16 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
                goto out;
        }
+        error = ocfs2_rw_lock(inode, 1);
+        if (error) {
+                mlog_errno(error);
+                goto out;
+        }
        error = ocfs2_inode_lock(inode, &old_bh, 1);
        if (error) {
                mlog_errno(error);
+                ocfs2_rw_unlock(inode, 1);
                goto out;
        }
@@ -4302,6 +4309,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
        up_write(&OCFS2_I(inode)->ip_xattr_sem);
        ocfs2_inode_unlock(inode, 1);
+        ocfs2_rw_unlock(inode, 1);
        brelse(old_bh);
        if (error) {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index c7a89cea5c5d..ddb662b32447 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1925,15 +1925,11 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
        ocfs2_shutdown_local_alloc(osb);
+        ocfs2_truncate_log_shutdown(osb);
        /* This will disable recovery and flush any recovery work. */
        ocfs2_recovery_exit(osb);
-        /*
-         * During dismount, when it recovers another node it will call
-         * ocfs2_recover_orphans and queue delayed work osb_truncate_log_wq.
-         */
-        ocfs2_truncate_log_shutdown(osb);
        ocfs2_journal_shutdown(osb);
        ocfs2_sync_blockdev(sb);