102 files changed, 1579 insertions, 884 deletions
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 15690bb1d3b5..789b3afb3423 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -140,6 +140,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
        candidate->first = candidate->last = index;
        candidate->offset_first = from;
        candidate->to_last = to;
+        INIT_LIST_HEAD(&candidate->link);
        candidate->usage = 1;
        candidate->state = AFS_WBACK_PENDING;
        init_waitqueue_head(&candidate->waitq);
diff --git a/fs/aio.c b/fs/aio.c
index fc557a3be0a9..26869cde3953 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -239,15 +239,23 @@ static void __put_ioctx(struct kioctx *ctx)
        call_rcu(&ctx->rcu_head, ctx_rcu_free);
 }
-#define get_ioctx(kioctx) do {                                          \
+static inline void get_ioctx(struct kioctx *kioctx)
-        BUG_ON(atomic_read(&(kioctx)->users) <= 0);                     \
+{
-        atomic_inc(&(kioctx)->users);                                   \
+        BUG_ON(atomic_read(&kioctx->users) <= 0);
-} while (0)
+        atomic_inc(&kioctx->users);
-#define put_ioctx(kioctx) do {                                          \
+}
-        BUG_ON(atomic_read(&(kioctx)->users) <= 0);                     \
-        if (unlikely(atomic_dec_and_test(&(kioctx)->users)))            \
+static inline int try_get_ioctx(struct kioctx *kioctx)
-                __put_ioctx(kioctx);                                    \
+{
-} while (0)
+        return atomic_inc_not_zero(&kioctx->users);
+}
+static inline void put_ioctx(struct kioctx *kioctx)
+{
+        BUG_ON(atomic_read(&kioctx->users) <= 0);
+        if (unlikely(atomic_dec_and_test(&kioctx->users)))
+                __put_ioctx(kioctx);
+}
 /* ioctx_alloc
 *      Allocates and initializes an ioctx.  Returns an ERR_PTR if it failed.
@@ -601,8 +609,13 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
        rcu_read_lock();
        hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
-                if (ctx->user_id == ctx_id && !ctx->dead) {
+                /*
-                        get_ioctx(ctx);
+                 * RCU protects us against accessing freed memory but
+                 * we have to be careful not to get a reference when the
+                 * reference count already dropped to 0 (ctx->dead test
+                 * is unreliable because of races).
+                 */
+                if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){
                        ret = ctx;
                        break;
                }
@@ -1629,6 +1642,23 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
                goto out_put_req;
        spin_lock_irq(&ctx->ctx_lock);
+        /*
+         * We could have raced with io_destroy() and are currently holding a
+         * reference to ctx which should be destroyed. We cannot submit IO
+         * since ctx gets freed as soon as io_submit() puts its reference.  The
+         * check here is reliable: io_destroy() sets ctx->dead before waiting
+         * for outstanding IO and the barrier between these two is realized by
+         * unlock of mm->ioctx_lock and lock of ctx->ctx_lock.  Analogously we
+         * increment ctx->reqs_active before checking for ctx->dead and the
+         * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we
+         * don't see ctx->dead set here, io_destroy() waits for our IO to
+         * finish.
+         */
+        if (ctx->dead) {
+                spin_unlock_irq(&ctx->ctx_lock);
+                ret = -EINVAL;
+                goto out_put_req;
+        }
        aio_run_iocb(req);
        if (!list_empty(&ctx->run_list)) {
                /* drain the run list */
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 333a7bb4cb9c..889287019599 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -873,6 +873,11 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
        ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
        if (ret)
                goto out_del;
+        /*
+         * bdev could be deleted beneath us which would implicitly destroy
+         * the holder directory.  Hold on to it.
+         */
+        kobject_get(bdev->bd_part->holder_dir);
        list_add(&holder->list, &bdev->bd_holder_disks);
        goto out_unlock;
@@ -909,6 +914,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
                del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
                del_symlink(bdev->bd_part->holder_dir,
                            &disk_to_dev(disk)->kobj);
+                kobject_put(bdev->bd_part->holder_dir);
                list_del_init(&holder->list);
                kfree(holder);
        }
@@ -922,14 +928,15 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
 * flush_disk - invalidates all buffer-cache entries on a disk
 *
 * @bdev:      struct block device to be flushed
+ * @kill_dirty: flag to guide handling of dirty inodes
 *
 * Invalidates all buffer-cache entries on a disk. It should be called
 * when a disk has been changed -- either by a media change or online
 * resize.
 */
-static void flush_disk(struct block_device *bdev)
+static void flush_disk(struct block_device *bdev, bool kill_dirty)
 {
-        if (__invalidate_device(bdev)) {
+        if (__invalidate_device(bdev, kill_dirty)) {
                char name[BDEVNAME_SIZE] = "";
                if (bdev->bd_disk)
@@ -966,7 +973,7 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
                       "%s: detected capacity change from %lld to %lld\n",
                       name, bdev_size, disk_size);
                i_size_write(bdev->bd_inode, disk_size);
-                flush_disk(bdev);
+                flush_disk(bdev, false);
        }
 }
 EXPORT_SYMBOL(check_disk_size_change);
@@ -1019,7 +1026,7 @@ int check_disk_change(struct block_device *bdev)
        if (!(events & DISK_EVENT_MEDIA_CHANGE))
                return 0;
-        flush_disk(bdev);
+        flush_disk(bdev, true);
        if (bdops->revalidate_disk)
                bdops->revalidate_disk(bdev->bd_disk);
        return 1;
@@ -1215,12 +1222,6 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
        res = __blkdev_get(bdev, mode, 0);
-        /* __blkdev_get() may alter read only status, check it afterwards */
-        if (!res && (mode & FMODE_WRITE) && bdev_read_only(bdev)) {
-                __blkdev_put(bdev, mode, 0);
-                res = -EACCES;
-        }
        if (whole) {
                /* finish claiming */
                mutex_lock(&bdev->bd_mutex);
@@ -1298,6 +1299,11 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
        if (err)
                return ERR_PTR(err);
+        if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) {
+                blkdev_put(bdev, mode);
+                return ERR_PTR(-EACCES);
+        }
        return bdev;
 }
 EXPORT_SYMBOL(blkdev_get_by_path);
@@ -1601,7 +1607,7 @@ fail:
 }
 EXPORT_SYMBOL(lookup_bdev);
-int __invalidate_device(struct block_device *bdev)
+int __invalidate_device(struct block_device *bdev, bool kill_dirty)
 {
        struct super_block *sb = get_super(bdev);
        int res = 0;
@@ -1614,7 +1620,7 @@ int __invalidate_device(struct block_device *bdev)
                 * hold).
                 */
                shrink_dcache_sb(sb);
-                res = invalidate_inodes(sb);
+                res = invalidate_inodes(sb, kill_dirty);
                drop_super(sb);
        }
        invalidate_bdev(bdev);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2c98b3af6052..7f78cc78fdd0 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -729,6 +729,15 @@ struct btrfs_space_info {
        u64 disk_total;         /* total bytes on disk, takes mirrors into
                                   account */
+        /*
+         * we bump reservation progress every time we decrement
+         * bytes_reserved.  This way people waiting for reservations
+         * know something good has happened and they can check
+         * for progress.  The number here isn't to be trusted, it
+         * just shows reclaim activity
+         */
+        unsigned long reservation_progress;
        int full;               /* indicates that we cannot allocate any more
                                   chunks for this space */
        int force_alloc;        /* set if we need to force a chunk alloc for
@@ -1254,6 +1263,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_SPACE_CACHE         (1 << 12)
 #define BTRFS_MOUNT_CLEAR_CACHE         (1 << 13)
 #define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
+#define BTRFS_MOUNT_ENOSPC_DEBUG         (1 << 15)
 #define btrfs_clear_opt(o, opt)         ((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)           ((o) |= BTRFS_MOUNT_##opt)
@@ -2218,6 +2228,8 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root,
                                   u64 start, u64 end);
 int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
                               u64 num_bytes);
+int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root, u64 type);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fdce8799b98d..e1aa8d607bc7 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -359,10 +359,14 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
        tree = &BTRFS_I(page->mapping->host)->io_tree;
-        if (page->private == EXTENT_PAGE_PRIVATE)
+        if (page->private == EXTENT_PAGE_PRIVATE) {
+                WARN_ON(1);
                goto out;
-        if (!page->private)
+        }
+        if (!page->private) {
+                WARN_ON(1);
                goto out;
+        }
        len = page->private >> 2;
        WARN_ON(len == 0);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4e7e012ad667..7b3089b5c2df 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3342,15 +3342,16 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
        u64 max_reclaim;
        u64 reclaimed = 0;
        long time_left;
-        int pause = 1;
        int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
        int loops = 0;
+        unsigned long progress;
        block_rsv = &root->fs_info->delalloc_block_rsv;
        space_info = block_rsv->space_info;
        smp_mb();
        reserved = space_info->bytes_reserved;
+        progress = space_info->reservation_progress;
        if (reserved == 0)
                return 0;
@@ -3365,31 +3366,36 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
                writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
                spin_lock(&space_info->lock);
-                if (reserved > space_info->bytes_reserved) {
+                if (reserved > space_info->bytes_reserved)
-                        loops = 0;
                        reclaimed += reserved - space_info->bytes_reserved;
-                } else {
-                        loops++;
-                }
                reserved = space_info->bytes_reserved;
                spin_unlock(&space_info->lock);
+                loops++;
                if (reserved == 0 || reclaimed >= max_reclaim)
                        break;
                if (trans && trans->transaction->blocked)
                        return -EAGAIN;
-                __set_current_state(TASK_INTERRUPTIBLE);
+                time_left = schedule_timeout_interruptible(1);
-                time_left = schedule_timeout(pause);
                /* We were interrupted, exit */
                if (time_left)
                        break;
-                pause <<= 1;
+                /* we've kicked the IO a few times, if anything has been freed,
-                if (pause > HZ / 10)
+                 * exit.  There is no sense in looping here for a long time
-                        pause = HZ / 10;
+                 * when we really need to commit the transaction, or there are
+                 * just too many writers without enough free space
+                 */
+                if (loops > 3) {
+                        smp_mb();
+                        if (progress != space_info->reservation_progress)
+                                break;
+                }
        }
        return reclaimed >= to_reclaim;
@@ -3612,6 +3618,7 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
                if (num_bytes) {
                        spin_lock(&space_info->lock);
                        space_info->bytes_reserved -= num_bytes;
+                        space_info->reservation_progress++;
                        spin_unlock(&space_info->lock);
                }
        }
@@ -3844,6 +3851,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
        if (block_rsv->reserved >= block_rsv->size) {
                num_bytes = block_rsv->reserved - block_rsv->size;
                sinfo->bytes_reserved -= num_bytes;
+                sinfo->reservation_progress++;
                block_rsv->reserved = block_rsv->size;
                block_rsv->full = 1;
        }
@@ -4005,7 +4013,6 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
                to_reserve = 0;
        }
        spin_unlock(&BTRFS_I(inode)->accounting_lock);
        to_reserve += calc_csum_metadata_size(inode, num_bytes);
        ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
        if (ret)
@@ -4133,6 +4140,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                        btrfs_set_block_group_used(&cache->item, old_val);
                        cache->reserved -= num_bytes;
                        cache->space_info->bytes_reserved -= num_bytes;
+                        cache->space_info->reservation_progress++;
                        cache->space_info->bytes_used += num_bytes;
                        cache->space_info->disk_used += num_bytes * factor;
                        spin_unlock(&cache->lock);
@@ -4184,6 +4192,7 @@ static int pin_down_extent(struct btrfs_root *root,
        if (reserved) {
                cache->reserved -= num_bytes;
                cache->space_info->bytes_reserved -= num_bytes;
+                cache->space_info->reservation_progress++;
        }
        spin_unlock(&cache->lock);
        spin_unlock(&cache->space_info->lock);
@@ -4234,6 +4243,7 @@ static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
                                space_info->bytes_readonly += num_bytes;
                        cache->reserved -= num_bytes;
                        space_info->bytes_reserved -= num_bytes;
+                        space_info->reservation_progress++;
                }
                spin_unlock(&cache->lock);
                spin_unlock(&space_info->lock);
@@ -4712,6 +4722,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                if (ret) {
                        spin_lock(&cache->space_info->lock);
                        cache->space_info->bytes_reserved -= buf->len;
+                        cache->space_info->reservation_progress++;
                        spin_unlock(&cache->space_info->lock);
                }
                goto out;
@@ -5376,7 +5387,7 @@ again:
                               num_bytes, data, 1);
                goto again;
        }
-        if (ret == -ENOSPC) {
+        if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
                struct btrfs_space_info *sinfo;
                sinfo = __find_space_info(root->fs_info, data);
@@ -6583,7 +6594,7 @@ static noinline int relocate_data_extent(struct inode *reloc_inode,
        u64 end = start + extent_key->offset - 1;
        em = alloc_extent_map(GFP_NOFS);
-        BUG_ON(!em || IS_ERR(em));
+        BUG_ON(!em);
        em->start = start;
        em->len = extent_key->offset;
@@ -8065,6 +8076,13 @@ out:
        return ret;
 }
+int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root, u64 type)
+{
+        u64 alloc_flags = get_alloc_profile(root, type);
+        return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
+}
 /*
 * helper to account the unused space of all the readonly block group in the
 * list. takes mirrors into account.
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 5e76a474cb7e..714adc4ac4c2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1433,12 +1433,13 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 */
 u64 count_range_bits(struct extent_io_tree *tree,
                     u64 *start, u64 search_end, u64 max_bytes,
-                     unsigned long bits)
+                     unsigned long bits, int contig)
 {
        struct rb_node *node;
        struct extent_state *state;
        u64 cur_start = *start;
        u64 total_bytes = 0;
+        u64 last = 0;
        int found = 0;
        if (search_end <= cur_start) {
@@ -1463,7 +1464,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
                state = rb_entry(node, struct extent_state, rb_node);
                if (state->start > search_end)
                        break;
-                if (state->end >= cur_start && (state->state & bits)) {
+                if (contig && found && state->start > last + 1)
+                        break;
+                if (state->end >= cur_start && (state->state & bits) == bits) {
                        total_bytes += min(search_end, state->end) + 1 -
                                       max(cur_start, state->start);
                        if (total_bytes >= max_bytes)
@@ -1472,6 +1475,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
                                *start = state->start;
                                found = 1;
                        }
+                        last = state->end;
+                } else if (contig && found) {
+                        break;
                }
                node = rb_next(node);
                if (!node)
@@ -1946,6 +1952,7 @@ void set_page_extent_mapped(struct page *page)
 static void set_page_extent_head(struct page *page, unsigned long len)
 {
+        WARN_ON(!PagePrivate(page));
        set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
 }
@@ -2821,9 +2828,17 @@ int try_release_extent_state(struct extent_map_tree *map,
                 * at this point we can safely clear everything except the
                 * locked bit and the nodatasum bit
                 */
-                clear_extent_bit(tree, start, end,
+                ret = clear_extent_bit(tree, start, end,
                                 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
                                 0, 0, NULL, mask);
+                /* if clear_extent_bit failed for ENOMEM reasons,
+                 * we can't allow the release to continue.
+                 */
+                if (ret < 0)
+                        ret = 0;
+                else
+                        ret = 1;
        }
        return ret;
 }
@@ -2903,6 +2918,46 @@ out:
        return sector;
 }
+/*
+ * helper function for fiemap, which doesn't want to see any holes.
+ * This skips over holes until it finds a non-hole extent or reaches 'last'
+ */
+static struct extent_map *get_extent_skip_holes(struct inode *inode,
+                                                u64 offset,
+                                                u64 last,
+                                                get_extent_t *get_extent)
+{
+        u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
+        struct extent_map *em;
+        u64 len;
+        if (offset >= last)
+                return NULL;
+        while(1) {
+                len = last - offset;
+                if (len == 0)
+                        break;
+                len = (len + sectorsize - 1) & ~(sectorsize - 1);
+                em = get_extent(inode, NULL, 0, offset, len, 0);
+                if (!em || IS_ERR(em))
+                        return em;
+                /* if this isn't a hole return it */
+                if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
+                    em->block_start != EXTENT_MAP_HOLE) {
+                        return em;
+                }
+                /* this is a hole, advance to the next extent */
+                offset = extent_map_end(em);
+                free_extent_map(em);
+                if (offset >= last)
+                        break;
+        }
+        return NULL;
+}
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                __u64 start, __u64 len, get_extent_t *get_extent)
 {
@@ -2912,16 +2967,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
        u32 flags = 0;
        u32 found_type;
        u64 last;
+        u64 last_for_get_extent = 0;
        u64 disko = 0;
+        u64 isize = i_size_read(inode);
        struct btrfs_key found_key;
        struct extent_map *em = NULL;
        struct extent_state *cached_state = NULL;
        struct btrfs_path *path;
        struct btrfs_file_extent_item *item;
        int end = 0;
-        u64 em_start = 0, em_len = 0;
+        u64 em_start = 0;
+        u64 em_len = 0;
+        u64 em_end = 0;
        unsigned long emflags;
-        int hole = 0;
        if (len == 0)
                return -EINVAL;
@@ -2931,6 +2989,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                return -ENOMEM;
        path->leave_spinning = 1;
+        /*
+         * look up the last file extent.  We're not using i_size here
+         * because there might be preallocation past i_size
+         */
        ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
                                       path, inode->i_ino, -1, 0);
        if (ret < 0) {
@@ -2944,18 +3006,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
        btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
        found_type = btrfs_key_type(&found_key);
-        /* No extents, just return */
+        /* No extents, but there might be delalloc bits */
        if (found_key.objectid != inode->i_ino ||
            found_type != BTRFS_EXTENT_DATA_KEY) {
-                btrfs_free_path(path);
+                /* have to trust i_size as the end */
-                return 0;
+                last = (u64)-1;
+                last_for_get_extent = isize;
+        } else {
+                /*
+                 * remember the start of the last extent.  There are a
+                 * bunch of different factors that go into the length of the
+                 * extent, so it's much less complex to remember where it started
+                 */
+                last = found_key.offset;
+                last_for_get_extent = last + 1;
        }
-        last = found_key.offset;
        btrfs_free_path(path);
+        /*
+         * we might have some extents allocated but more delalloc past those
+         * extents.  so, we trust isize unless the start of the last extent is
+         * beyond isize
+         */
+        if (last < isize) {
+                last = (u64)-1;
+                last_for_get_extent = isize;
+        }
        lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
                         &cached_state, GFP_NOFS);
-        em = get_extent(inode, NULL, 0, off, max - off, 0);
+        em = get_extent_skip_holes(inode, off, last_for_get_extent,
+                                   get_extent);
        if (!em)
                goto out;
        if (IS_ERR(em)) {
@@ -2964,22 +3046,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
        }
        while (!end) {
-                hole = 0;
+                u64 offset_in_extent;
-                off = em->start + em->len;
-                if (off >= max)
-                        end = 1;
-                if (em->block_start == EXTENT_MAP_HOLE) {
+                /* break if the extent we found is outside the range */
-                        hole = 1;
+                if (em->start >= max || extent_map_end(em) < off)
-                        goto next;
+                        break;
-                }
-                em_start = em->start;
+                /*
-                em_len = em->len;
+                 * get_extent may return an extent that starts before our
+                 * requested range.  We have to make sure the ranges
+                 * we return to fiemap always move forward and don't
+                 * overlap, so adjust the offsets here
+                 */
+                em_start = max(em->start, off);
+                /*
+                 * record the offset from the start of the extent
+                 * for adjusting the disk offset below
+                 */
+                offset_in_extent = em_start - em->start;
+                em_end = extent_map_end(em);
+                em_len = em_end - em_start;
+                emflags = em->flags;
                disko = 0;
                flags = 0;
+                /*
+                 * bump off for our next call to get_extent
+                 */
+                off = extent_map_end(em);
+                if (off >= max)
+                        end = 1;
                if (em->block_start == EXTENT_MAP_LAST_BYTE) {
                        end = 1;
                        flags |= FIEMAP_EXTENT_LAST;
@@ -2990,42 +3088,34 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        flags |= (FIEMAP_EXTENT_DELALLOC |
                                  FIEMAP_EXTENT_UNKNOWN);
                } else {
-                        disko = em->block_start;
+                        disko = em->block_start + offset_in_extent;
                }
                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
                        flags |= FIEMAP_EXTENT_ENCODED;
-next:
-                emflags = em->flags;
                free_extent_map(em);
                em = NULL;
-                if (!end) {
+                if ((em_start >= last) || em_len == (u64)-1 ||
-                        em = get_extent(inode, NULL, 0, off, max - off, 0);
+                   (last == (u64)-1 && isize <= em_end)) {
-                        if (!em)
-                                goto out;
-                        if (IS_ERR(em)) {
-                                ret = PTR_ERR(em);
-                                goto out;
-                        }
-                        emflags = em->flags;
-                }
-                if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
                        flags |= FIEMAP_EXTENT_LAST;
                        end = 1;
                }
-                if (em_start == last) {
+                /* now scan forward to see if this is really the last extent. */
+                em = get_extent_skip_holes(inode, off, last_for_get_extent,
+                                           get_extent);
+                if (IS_ERR(em)) {
+                        ret = PTR_ERR(em);
+                        goto out;
+                }
+                if (!em) {
                        flags |= FIEMAP_EXTENT_LAST;
                        end = 1;
                }
+                ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
-                if (!hole) {
+                                              em_len, flags);
-                        ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
+                if (ret)
-                                                em_len, flags);
+                        goto out_free;
-                        if (ret)
-                                goto out_free;
-                }
        }
 out_free:
        free_extent_map(em);
@@ -3194,7 +3284,13 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
                }
                if (!PageUptodate(p))
                        uptodate = 0;
-                unlock_page(p);
+                /*
+                 * see below about how we avoid a nasty race with release page
+                 * and why we unlock later
+                 */
+                if (i != 0)
+                        unlock_page(p);
        }
        if (uptodate)
                set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
@@ -3218,9 +3314,26 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
        atomic_inc(&eb->refs);
        spin_unlock(&tree->buffer_lock);
        radix_tree_preload_end();
+        /*
+         * there is a race where release page may have
+         * tried to find this extent buffer in the radix
+         * but failed.  It will tell the VM it is safe to
+         * reclaim the page, and it will clear the page private bit.
+         * We must make sure to set the page private bit properly
+         * after the extent buffer is in the radix tree so
+         * it doesn't get lost
+         */
+        set_page_extent_mapped(eb->first_page);
+        set_page_extent_head(eb->first_page, eb->len);
+        if (!page0)
+                unlock_page(eb->first_page);
        return eb;
 free_eb:
+        if (eb->first_page && !page0)
+                unlock_page(eb->first_page);
        if (!atomic_dec_and_test(&eb->refs))
                return exists;
        btrfs_release_extent_buffer(eb);
@@ -3271,10 +3384,11 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
                        continue;
                lock_page(page);
+                WARN_ON(!PagePrivate(page));
+                set_page_extent_mapped(page);
                if (i == 0)
                        set_page_extent_head(page, eb->len);
-                else
-                        set_page_private(page, EXTENT_PAGE_PRIVATE);
                clear_page_dirty_for_io(page);
                spin_lock_irq(&page->mapping->tree_lock);
@@ -3464,6 +3578,13 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
        for (i = start_i; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
+                WARN_ON(!PagePrivate(page));
+                set_page_extent_mapped(page);
+                if (i == 0)
+                        set_page_extent_head(page, eb->len);
                if (inc_all_pages)
                        page_cache_get(page);
                if (!PageUptodate(page)) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7083cfafd061..9318dfefd59c 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -191,7 +191,7 @@ void extent_io_exit(void);
 u64 count_range_bits(struct extent_io_tree *tree,
                     u64 *start, u64 search_end,
-                     u64 max_bytes, unsigned long bits);
+                     u64 max_bytes, unsigned long bits, int contig);
 void free_extent_state(struct extent_state *state);
 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index b0e1fce12530..2b6c12e983b3 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -51,8 +51,8 @@ struct extent_map *alloc_extent_map(gfp_t mask)
 {
        struct extent_map *em;
        em = kmem_cache_alloc(extent_map_cache, mask);
-        if (!em || IS_ERR(em))
+        if (!em)
-                return em;
+                return NULL;
        em->in_tree = 0;
        em->flags = 0;
        em->compress_type = BTRFS_COMPRESS_NONE;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c1d3a818731a..f447b783bb84 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -70,6 +70,19 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
                /* Flush processor's dcache for this page */
                flush_dcache_page(page);
+                /*
+                 * if we get a partial write, we can end up with
+                 * partially up to date pages.  These add
+                 * a lot of complexity, so make sure they don't
+                 * happen by forcing this copy to be retried.
+                 *
+                 * The rest of the btrfs_file_write code will fall
+                 * back to page at a time copies after we return 0.
+                 */
+                if (!PageUptodate(page) && copied < count)
+                        copied = 0;
                iov_iter_advance(i, copied);
                write_bytes -= copied;
                total_copied += copied;
@@ -186,6 +199,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                        split = alloc_extent_map(GFP_NOFS);
                if (!split2)
                        split2 = alloc_extent_map(GFP_NOFS);
+                BUG_ON(!split || !split2);
                write_lock(&em_tree->lock);
                em = lookup_extent_mapping(em_tree, start, len);
@@ -762,6 +776,27 @@ out:
 }
 /*
+ * Read @page in when @pos is not page aligned and the page is not
+ * already up to date.
+ *
+ * On success 0 is returned and the page is locked.  On error the
+ * error value is returned and the page is unlocked.
+ */
+static int prepare_uptodate_page(struct page *page, u64 pos)
+{
+        int err;
+        /* aligned position or page already up to date: nothing to read */
+        if (!(pos & (PAGE_CACHE_SIZE - 1)) || PageUptodate(page))
+                return 0;
+        err = btrfs_readpage(NULL, page);
+        if (err)
+                return err;
+        /* lock the page and confirm the read brought it up to date */
+        lock_page(page);
+        if (PageUptodate(page))
+                return 0;
+        unlock_page(page);
+        return -EIO;
+}
+/*
 * this gets pages into the page cache and locks them down, it also properly
 * waits for data=ordered extents to finish before allowing the pages to be
 * modified.
@@ -776,6 +811,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
        unsigned long index = pos >> PAGE_CACHE_SHIFT;
        struct inode *inode = fdentry(file)->d_inode;
        int err = 0;
+        int faili = 0;
        u64 start_pos;
        u64 last_pos;
@@ -793,15 +829,24 @@ again:
        for (i = 0; i < num_pages; i++) {
                pages[i] = grab_cache_page(inode->i_mapping, index + i);
                if (!pages[i]) {
-                        int c;
+                        faili = i - 1;
-                        for (c = i - 1; c >= 0; c--) {
+                        err = -ENOMEM;
-                                unlock_page(pages[c]);
+                        goto fail;
-                                page_cache_release(pages[c]);
+                }
-                        }
-                        return -ENOMEM;
+                if (i == 0)
+                        err = prepare_uptodate_page(pages[i], pos);
+                if (i == num_pages - 1)
+                        err = prepare_uptodate_page(pages[i],
+                                                    pos + write_bytes);
+                if (err) {
+                        page_cache_release(pages[i]);
+                        faili = i - 1;
+                        goto fail;
                }
                wait_on_page_writeback(pages[i]);
        }
+        err = 0;
        if (start_pos < inode->i_size) {
                struct btrfs_ordered_extent *ordered;
                lock_extent_bits(&BTRFS_I(inode)->io_tree,
@@ -841,6 +886,14 @@ again:
                WARN_ON(!PageLocked(pages[i]));
        }
        return 0;
+fail:
+        while (faili >= 0) {
+                unlock_page(pages[faili]);
+                page_cache_release(pages[faili]);
+                faili--;
+        }
+        return err;
 }
 static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
@@ -850,7 +903,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
        struct file *file = iocb->ki_filp;
        struct inode *inode = fdentry(file)->d_inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
-        struct page *pinned[2];
        struct page **pages = NULL;
        struct iov_iter i;
        loff_t *ppos = &iocb->ki_pos;
@@ -871,9 +923,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
        will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
                      (file->f_flags & O_DIRECT));
-        pinned[0] = NULL;
-        pinned[1] = NULL;
        start_pos = pos;
        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
@@ -961,32 +1010,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
        first_index = pos >> PAGE_CACHE_SHIFT;
        last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
-        /*
-         * there are lots of better ways to do this, but this code
-         * makes sure the first and last page in the file range are
-         * up to date and ready for cow
-         */
-        if ((pos & (PAGE_CACHE_SIZE - 1))) {
-                pinned[0] = grab_cache_page(inode->i_mapping, first_index);
-                if (!PageUptodate(pinned[0])) {
-                        ret = btrfs_readpage(NULL, pinned[0]);
-                        BUG_ON(ret);
-                        wait_on_page_locked(pinned[0]);
-                } else {
-                        unlock_page(pinned[0]);
-                }
-        }
-        if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
-                pinned[1] = grab_cache_page(inode->i_mapping, last_index);
-                if (!PageUptodate(pinned[1])) {
-                        ret = btrfs_readpage(NULL, pinned[1]);
-                        BUG_ON(ret);
-                        wait_on_page_locked(pinned[1]);
-                } else {
-                        unlock_page(pinned[1]);
-                }
-        }
        while (iov_iter_count(&i) > 0) {
                size_t offset = pos & (PAGE_CACHE_SIZE - 1);
                size_t write_bytes = min(iov_iter_count(&i),
@@ -1023,8 +1046,20 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
                copied = btrfs_copy_from_user(pos, num_pages,
                                           write_bytes, pages, &i);
-                dirty_pages = (copied + offset + PAGE_CACHE_SIZE - 1) >>
-                                PAGE_CACHE_SHIFT;
+                /*
+                 * if we have trouble faulting in the pages, fall
+                 * back to one page at a time
+                 */
+                if (copied < write_bytes)
+                        nrptrs = 1;
+                if (copied == 0)
+                        dirty_pages = 0;
+                else
+                        dirty_pages = (copied + offset +
+                                       PAGE_CACHE_SIZE - 1) >>
+                                       PAGE_CACHE_SHIFT;
                if (num_pages > dirty_pages) {
                        if (copied > 0)
@@ -1068,10 +1103,6 @@ out:
                err = ret;
        kfree(pages);
-        if (pinned[0])
-                page_cache_release(pinned[0]);
-        if (pinned[1])
-                page_cache_release(pinned[1]);
        *ppos = pos;
        /*
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bcc461a9695f..9007bbd01dbf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -644,6 +644,7 @@ retry:
                                        async_extent->ram_size - 1, 0);
                em = alloc_extent_map(GFP_NOFS);
+                BUG_ON(!em);
                em->start = async_extent->start;
                em->len = async_extent->ram_size;
                em->orig_start = em->start;
@@ -820,6 +821,7 @@ static noinline int cow_file_range(struct inode *inode,
                BUG_ON(ret);
                em = alloc_extent_map(GFP_NOFS);
+                BUG_ON(!em);
                em->start = start;
                em->orig_start = em->start;
                ram_size = ins.offset;
@@ -1169,6 +1171,7 @@ out_check:
                        struct extent_map_tree *em_tree;
                        em_tree = &BTRFS_I(inode)->extent_tree;
                        em = alloc_extent_map(GFP_NOFS);
+                        BUG_ON(!em);
                        em->start = cur_offset;
                        em->orig_start = em->start;
                        em->len = num_bytes;
@@ -1910,7 +1913,7 @@ static int btrfs_clean_io_failures(struct inode *inode, u64 start)
        private = 0;
        if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
-                             (u64)-1, 1, EXTENT_DIRTY)) {
+                             (u64)-1, 1, EXTENT_DIRTY, 0)) {
                ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
                                        start, &private_failure);
                if (ret == 0) {
@@ -4818,10 +4821,11 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
                goto fail;
        /*
-         * 1 item for inode ref
+         * 2 items for inode and inode ref
         * 2 items for dir items
+         * 1 item for parent inode
         */
-        trans = btrfs_start_transaction(root, 3);
+        trans = btrfs_start_transaction(root, 5);
        if (IS_ERR(trans)) {
                err = PTR_ERR(trans);
                goto fail;
@@ -5277,6 +5281,128 @@ out:
        return em;
 }
+/*
+ * fiemap flavour of btrfs_get_extent() that also looks for delalloc
+ * ranges.
+ *
+ * The result of btrfs_get_extent() is returned unchanged when it is an
+ * error pointer or maps to anything other than EXTENT_MAP_HOLE.
+ * Otherwise the inode's io_tree is searched for EXTENT_DELALLOC bits in
+ * [start, start + len - 1] (clamped to (u64)-1 on overflow) and, if any
+ * are found, a newly allocated extent_map is returned that describes
+ * either the part of the hole in front of the delalloc range or the
+ * delalloc range itself (block_start set to EXTENT_MAP_DELALLOC).  When
+ * no usable delalloc range is found the original lookup result is
+ * returned.
+ *
+ * Returns ERR_PTR(-ENOMEM) if the new extent_map cannot be allocated.
+ */
+struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
+                                           size_t pg_offset, u64 start, u64 len,
+                                           int create)
+{
+        struct extent_map *em;
+        struct extent_map *hole_em = NULL;
+        u64 range_start = start;
+        u64 end;
+        u64 found;
+        u64 found_end;
+        int err = 0;
+        em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
+        if (IS_ERR(em))
+                return em;
+        if (em) {
+                /*
+                 * if our em maps to a hole, there might
+                 * actually be delalloc bytes behind it
+                 */
+                if (em->block_start != EXTENT_MAP_HOLE)
+                        return em;
+                else
+                        hole_em = em;
+        }
+        /*
+         * check to see if we've wrapped (len == -1 or similar);
+         * end is the inclusive last byte of the range
+         */
+        end = start + len;
+        if (end < start)
+                end = (u64)-1;
+        else
+                end -= 1;
+        em = NULL;
+        /* ok, we didn't find anything, let's look for delalloc */
+        found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
+                                 end, len, EXTENT_DELALLOC, 1);
+        found_end = range_start + found;
+        /* clamp instead of wrapping around on u64 overflow */
+        if (found_end < range_start)
+                found_end = (u64)-1;
+        /*
+         * we didn't find anything useful, return
+         * the original results from get_extent()
+         *
+         * hole_em is handed over to em and cleared so that the
+         * free_extent_map(hole_em) at out does not touch it
+         */
+        if (range_start > end || found_end <= start) {
+                em = hole_em;
+                hole_em = NULL;
+                goto out;
+        }
+        /*
+         * adjust the range_start to make sure it doesn't
+         * go backwards from the start they passed in
+         */
+        range_start = max(start,range_start);
+        found = found_end - range_start;
+        if (found > 0) {
+                u64 hole_start = start;
+                u64 hole_len = len;
+                em = alloc_extent_map(GFP_NOFS);
+                if (!em) {
+                        err = -ENOMEM;
+                        goto out;
+                }
+                /*
+                 * when btrfs_get_extent can't find anything it
+                 * returns one huge hole
+                 *
+                 * make sure what it found really fits our range, and
+                 * adjust to make sure it is based on the start from
+                 * the caller
+                 */
+                if (hole_em) {
+                        u64 calc_end = extent_map_end(hole_em);
+                        if (calc_end <= start || (hole_em->start > end)) {
+                                free_extent_map(hole_em);
+                                hole_em = NULL;
+                        } else {
+                                hole_start = max(hole_em->start, start);
+                                hole_len = calc_end - hole_start;
+                        }
+                }
+                em->bdev = NULL;
+                if (hole_em && range_start > hole_start) {
+                        /*
+                         * our hole starts before our delalloc, so we
+                         * have to return just the parts of the hole
+                         * that go until the delalloc starts
+                         */
+                        em->len = min(hole_len,
+                                      range_start - hole_start);
+                        em->start = hole_start;
+                        em->orig_start = hole_start;
+                        /*
+                         * don't adjust block start at all,
+                         * it is fixed at EXTENT_MAP_HOLE
+                         */
+                        em->block_start = hole_em->block_start;
+                        em->block_len = hole_len;
+                } else {
+                        /* report the delalloc range itself */
+                        em->start = range_start;
+                        em->len = found;
+                        em->orig_start = range_start;
+                        em->block_start = EXTENT_MAP_DELALLOC;
+                        em->block_len = found;
+                }
+        } else if (hole_em) {
+                /*
+                 * no delalloc bytes left after the adjustment: return
+                 * the hole directly, bypassing the
+                 * free_extent_map(hole_em) at out
+                 */
+                return hole_em;
+        }
+out:
+        /*
+         * NOTE(review): hole_em may be NULL here, and em is NULL on the
+         * only path that sets err; this relies on free_extent_map()
+         * accepting NULL -- confirm.
+         */
+        free_extent_map(hole_em);
+        if (err) {
+                free_extent_map(em);
+                return ERR_PTR(err);
+        }
+        return em;
+}
 static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
                                                  u64 start, u64 len)
 {
@@ -5931,6 +6057,7 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
        if (!skip_sum) {
                dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
                if (!dip->csums) {
+                        kfree(dip);
                        ret = -ENOMEM;
                        goto free_ordered;
                }
@@ -6099,7 +6226,7 @@ out:
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                 __u64 start, __u64 len)
 {
-        return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent);
+        /* use the fiemap-specific lookup, which also checks for delalloc */
+        return extent_fiemap(inode, fieinfo, start, len,
+                             btrfs_get_extent_fiemap);
 }
 int btrfs_readpage(struct file *file, struct page *page)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 02d224e8c83f..5fdb2abc4fa7 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1071,12 +1071,15 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
        if (copy_from_user(&flags, arg, sizeof(flags)))
                return -EFAULT;
-        if (flags & ~BTRFS_SUBVOL_CREATE_ASYNC)
+        if (flags & BTRFS_SUBVOL_CREATE_ASYNC)
                return -EINVAL;
        if (flags & ~BTRFS_SUBVOL_RDONLY)
                return -EOPNOTSUPP;
+        if (!is_owner_or_cap(inode))
+                return -EACCES;
        down_write(&root->fs_info->subvol_sem);
        /* nothing to do */
@@ -1097,7 +1100,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
                goto out_reset;
        }
-        ret = btrfs_update_root(trans, root,
+        ret = btrfs_update_root(trans, root->fs_info->tree_root,
                                &root->root_key, &root->root_item);
        btrfs_commit_transaction(trans, root);
@@ -2208,7 +2211,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
        int num_types = 4;
        int alloc_size;
        int ret = 0;
-        int slot_count = 0;
+        u64 slot_count = 0;
        int i, c;
        if (copy_from_user(&space_args,
@@ -2247,7 +2250,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
                goto out;
        }
-        slot_count = min_t(int, space_args.space_slots, slot_count);
+        slot_count = min_t(u64, space_args.space_slots, slot_count);
        alloc_size = sizeof(*dest) * slot_count;
@@ -2267,6 +2270,9 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
        for (i = 0; i < num_types; i++) {
                struct btrfs_space_info *tmp;
+                if (!slot_count)
+                        break;
                info = NULL;
                rcu_read_lock();
                list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
@@ -2288,7 +2294,10 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
                                memcpy(dest, &space, sizeof(space));
                                dest++;
                                space_args.total_spaces++;
+                                slot_count--;
                        }
+                        if (!slot_count)
+                                break;
                }
                up_read(&info->groups_sem);
        }
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index cc9b450399df..a178f5ebea78 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -280,6 +280,7 @@ static int lzo_decompress_biovec(struct list_head *ws,
        unsigned long tot_out;
        unsigned long tot_len;
        char *buf;
+        bool may_late_unmap, need_unmap;
        data_in = kmap(pages_in[0]);
        tot_len = read_compress_length(data_in);
@@ -300,11 +301,13 @@ static int lzo_decompress_biovec(struct list_head *ws,
                tot_in += in_len;
                working_bytes = in_len;
+                may_late_unmap = need_unmap = false;
                /* fast path: avoid using the working buffer */
                if (in_page_bytes_left >= in_len) {
                        buf = data_in + in_offset;
                        bytes = in_len;
+                        may_late_unmap = true;
                        goto cont;
                }
@@ -329,14 +332,17 @@ cont:
                                if (working_bytes == 0 && tot_in >= tot_len)
                                        break;
-                                kunmap(pages_in[page_in_index]);
+                                if (page_in_index + 1 >= total_pages_in) {
-                                page_in_index++;
-                                if (page_in_index >= total_pages_in) {
                                        ret = -1;
-                                        data_in = NULL;
                                        goto done;
                                }
-                                data_in = kmap(pages_in[page_in_index]);
+                                if (may_late_unmap)
+                                        need_unmap = true;
+                                else
+                                        kunmap(pages_in[page_in_index]);
+                                data_in = kmap(pages_in[++page_in_index]);
                                in_page_bytes_left = PAGE_CACHE_SIZE;
                                in_offset = 0;
@@ -346,6 +352,8 @@ cont:
                out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
                ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
                                            &out_len);
+                if (need_unmap)
+                        kunmap(pages_in[page_in_index - 1]);
                if (ret != LZO_E_OK) {
                        printk(KERN_WARNING "btrfs decompress failed\n");
                        ret = -1;
@@ -363,8 +371,7 @@ cont:
                        break;
        }
 done:
-        if (data_in)
+        kunmap(pages_in[page_in_index]);
-                kunmap(pages_in[page_in_index]);
        return ret;
 }
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 1f5556acb530..31ade5802ae8 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1157,6 +1157,7 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
        new_node->bytenr = dest->node->start;
        new_node->level = node->level;
        new_node->lowest = node->lowest;
+        new_node->checked = 1;
        new_node->root = dest;
        if (!node->lowest) {
@@ -3653,6 +3654,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
        u32 item_size;
        int ret;
        int err = 0;
+        int progress = 0;
        path = btrfs_alloc_path();
        if (!path)
@@ -3665,9 +3667,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
        }
        while (1) {
+                progress++;
                trans = btrfs_start_transaction(rc->extent_root, 0);
                BUG_ON(IS_ERR(trans));
+restart:
                if (update_backref_cache(trans, &rc->backref_cache)) {
                        btrfs_end_transaction(trans, rc->extent_root);
                        continue;
@@ -3780,6 +3783,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
                        }
                }
        }
+        if (trans && progress && err == -ENOSPC) {
+                ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
+                                              rc->block_group->flags);
+                if (ret == 0) {
+                        err = 0;
+                        progress = 0;
+                        goto restart;
+                }
+        }
        btrfs_release_path(rc->extent_root, path);
        clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a004008f7d28..d39a9895d932 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -155,7 +155,8 @@ enum {
        Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
        Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
        Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
-        Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err,
+        Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
+        Opt_enospc_debug, Opt_err,
 };
 static match_table_t tokens = {
@@ -184,6 +185,7 @@ static match_table_t tokens = {
        {Opt_space_cache, "space_cache"},
        {Opt_clear_cache, "clear_cache"},
        {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
+        {Opt_enospc_debug, "enospc_debug"},
        {Opt_err, NULL},
 };
@@ -358,6 +360,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                case Opt_user_subvol_rm_allowed:
                        btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
                        break;
+                case Opt_enospc_debug:
+                        btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
+                        break;
                case Opt_err:
                        printk(KERN_INFO "btrfs: unrecognized mount option "
                               "'%s'\n", p);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2636a051e4b2..dd13eb81ee40 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1338,11 +1338,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
        ret = btrfs_shrink_device(device, 0);
        if (ret)
-                goto error_brelse;
+                goto error_undo;
        ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
        if (ret)
-                goto error_brelse;
+                goto error_undo;
        device->in_fs_metadata = 0;
@@ -1416,6 +1416,13 @@ out:
        mutex_unlock(&root->fs_info->volume_mutex);
        mutex_unlock(&uuid_mutex);
        return ret;
+error_undo:
+        if (device->writeable) {
+                list_add(&device->dev_alloc_list,
+                         &root->fs_info->fs_devices->alloc_list);
+                root->fs_info->fs_devices->rw_devices++;
+        }
+        goto error_brelse;
 }
 /*
@@ -1605,12 +1612,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
        ret = find_next_devid(root, &device->devid);
        if (ret) {
+                kfree(device->name);
                kfree(device);
                goto error;
        }
        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
+                kfree(device->name);
                kfree(device);
                ret = PTR_ERR(trans);
                goto error;
@@ -1631,7 +1640,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
        device->dev_root = root->fs_info->dev_root;
        device->bdev = bdev;
        device->in_fs_metadata = 1;
-        device->mode = 0;
+        device->mode = FMODE_EXCL;
        set_blocksize(device->bdev, 4096);
        if (seeding_dev) {
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 0bc68de8edd7..ebafa65a29b6 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -409,7 +409,7 @@ more:
        spin_lock(&inode->i_lock);
        if (ci->i_release_count == fi->dir_release_count) {
                dout(" marking %p complete\n", inode);
-                ci->i_ceph_flags |= CEPH_I_COMPLETE;
+                /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
                ci->i_max_offset = filp->f_pos;
        }
        spin_unlock(&inode->i_lock);
@@ -496,6 +496,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
        /* .snap dir? */
        if (err == -ENOENT &&
+            ceph_snap(parent) == CEPH_NOSNAP &&
            strcmp(dentry->d_name.name,
                   fsc->mount_options->snapdir_name) == 0) {
                struct inode *inode = ceph_get_snapdir(parent);
@@ -992,7 +993,7 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
        struct inode *dir;
-        if (nd->flags & LOOKUP_RCU)
+        if (nd && nd->flags & LOOKUP_RCU)
                return -ECHILD;
        dir = dentry->d_parent->d_inode;
@@ -1029,28 +1030,8 @@ out_touch:
 static void ceph_dentry_release(struct dentry *dentry)
 {
        struct ceph_dentry_info *di = ceph_dentry(dentry);
-        struct inode *parent_inode = NULL;
-        u64 snapid = CEPH_NOSNAP;
-        if (!IS_ROOT(dentry)) {
+        dout("dentry_release %p\n", dentry);
-                parent_inode = dentry->d_parent->d_inode;
-                if (parent_inode)
-                        snapid = ceph_snap(parent_inode);
-        }
-        dout("dentry_release %p parent %p\n", dentry, parent_inode);
-        if (parent_inode && snapid != CEPH_SNAPDIR) {
-                struct ceph_inode_info *ci = ceph_inode(parent_inode);
-                spin_lock(&parent_inode->i_lock);
-                if (ci->i_shared_gen == di->lease_shared_gen ||
-                    snapid <= CEPH_MAXSNAP) {
-                        dout(" clearing %p complete (d_release)\n",
-                             parent_inode);
-                        ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
-                        ci->i_release_count++;
-                }
-                spin_unlock(&parent_inode->i_lock);
-        }
        if (di) {
                ceph_dentry_lru_del(dentry);
                if (di->lease_session)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 5625463aa479..193bfa5e9cbd 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -707,7 +707,7 @@ static int fill_inode(struct inode *inode,
                    (issued & CEPH_CAP_FILE_EXCL) == 0 &&
                    (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
                        dout(" marking %p complete (empty)\n", inode);
-                        ci->i_ceph_flags |= CEPH_I_COMPLETE;
+                        /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
                        ci->i_max_offset = 2;
                }
                break;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 39c243acd062..f40b9139e437 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -584,10 +584,14 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
        if (lastinode)
                iput(lastinode);
-        dout("queue_realm_cap_snaps %p %llx children\n", realm, realm->ino);
+        list_for_each_entry(child, &realm->children, child_item) {
-        list_for_each_entry(child, &realm->children, child_item)
+                dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n",
-                queue_realm_cap_snaps(child);
+                     realm, realm->ino, child, child->ino);
+                list_del_init(&child->dirty_item);
+                list_add(&child->dirty_item, &realm->dirty_item);
+        }
+        list_del_init(&realm->dirty_item);
        dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
 }
@@ -683,7 +687,9 @@ more:
         * queue cap snaps _after_ we've built the new snap contexts,
         * so that i_head_snapc can be set appropriately.
         */
-        list_for_each_entry(realm, &dirty_realms, dirty_item) {
+        while (!list_empty(&dirty_realms)) {
+                realm = list_first_entry(&dirty_realms, struct ceph_snap_realm,
+                                         dirty_item);
                queue_realm_cap_snaps(realm);
        }
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 4a3330235d55..a9371b6578c0 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -127,5 +127,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
-#define CIFS_VERSION   "1.70"
+#define CIFS_VERSION   "1.71"
 #endif                          /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index edd5b29b53c9..17afb0fbcaed 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -188,6 +188,8 @@ struct TCP_Server_Info {
        /* multiplexed reads or writes */
        unsigned int maxBuf;    /* maxBuf specifies the maximum */
        /* message size the server can send or receive for non-raw SMBs */
+        /* maxBuf is returned by SMB NegotiateProtocol so maxBuf is only 0 */
+        /* when socket is setup (and during reconnect) before NegProt sent */
        unsigned int max_rw;    /* maxRw specifies the maximum */
        /* message size the server can send or receive for */
        /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */
@@ -652,7 +654,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
 #define   MID_REQUEST_SUBMITTED 2
 #define   MID_RESPONSE_RECEIVED 4
 #define   MID_RETRY_NEEDED      8 /* session closed while this request out */
-#define   MID_NO_RESP_NEEDED 0x10
+#define   MID_RESPONSE_MALFORMED 0x10
 /* Types of response buffer returned from SendReceive2 */
 #define   CIFS_NO_BUFFER        0    /* Response buffer not returned */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 257b6d895e20..8d6c17ab593d 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -338,10 +338,11 @@ cifs_echo_request(struct work_struct *work)
                                        struct TCP_Server_Info, echo.work);
        /*
-         * We cannot send an echo until the NEGOTIATE_PROTOCOL request is done.
+         * We cannot send an echo until the NEGOTIATE_PROTOCOL request is
-         * Also, no need to ping if we got a response recently
+         * done, which is indicated by maxBuf != 0. Also, no need to ping if
+         * we got a response recently
         */
-        if (server->tcpStatus != CifsGood ||
+        if (server->maxBuf == 0 ||
            time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
                goto requeue_echo;
@@ -585,11 +586,20 @@ incomplete_rcv:
                total_read += 4; /* account for rfc1002 hdr */
                dump_smb(smb_buffer, total_read);
-                if (checkSMB(smb_buffer, smb_buffer->Mid, total_read)) {
+                /*
+                 * We know that we received enough to get to the MID as we
+                 * checked the pdu_length earlier. Now check to see
+                 * if the rest of the header is OK. We borrow the length
+                 * var for the rest of the loop to avoid a new stack var.
+                 *
+                 * 48 bytes is enough to display the header and a little bit
+                 * into the payload for debugging purposes.
+                 */
+                length = checkSMB(smb_buffer, smb_buffer->Mid, total_read);
+                if (length != 0)
                        cifs_dump_mem("Bad SMB: ", smb_buffer,
-                                        total_read < 48 ? total_read : 48);
+                                        min_t(unsigned int, total_read, 48));
-                        continue;
-                }
                mid_entry = NULL;
                server->lstrp = jiffies;
@@ -601,7 +611,8 @@ incomplete_rcv:
                        if ((mid_entry->mid == smb_buffer->Mid) &&
                            (mid_entry->midState == MID_REQUEST_SUBMITTED) &&
                            (mid_entry->command == smb_buffer->Command)) {
-                                if (check2ndT2(smb_buffer,server->maxBuf) > 0) {
+                                if (length == 0 &&
+                                   check2ndT2(smb_buffer, server->maxBuf) > 0) {
                                        /* We have a multipart transact2 resp */
                                        isMultiRsp = true;
                                        if (mid_entry->resp_buf) {
@@ -636,7 +647,12 @@ incomplete_rcv:
                                mid_entry->resp_buf = smb_buffer;
                                mid_entry->largeBuf = isLargeBuf;
 multi_t2_fnd:
-                                mid_entry->midState = MID_RESPONSE_RECEIVED;
+                                if (length == 0)
+                                        mid_entry->midState =
+                                                        MID_RESPONSE_RECEIVED;
+                                else
+                                        mid_entry->midState =
+                                                        MID_RESPONSE_MALFORMED;
 #ifdef CONFIG_CIFS_STATS2
                                mid_entry->when_received = jiffies;
 #endif
@@ -657,6 +673,9 @@ multi_t2_fnd:
                                else
                                        smallbuf = NULL;
                        }
+                } else if (length != 0) {
+                        /* response sanity checks failed */
+                        continue;
                } else if (!is_valid_oplock_break(smb_buffer, server) &&
                           !isMultiRsp) {
                        cERROR(1, "No task to wake, unknown frame received! "
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 8d9189f64477..79f641eeda30 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -170,7 +170,7 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len)
 {
        int rc, alen, slen;
        const char *pct;
-        char *endp, scope_id[13];
+        char scope_id[13];
        struct sockaddr_in *s4 = (struct sockaddr_in *) dst;
        struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst;
@@ -197,9 +197,9 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len)
                memcpy(scope_id, pct + 1, slen);
                scope_id[slen] = '\0';
-                s6->sin6_scope_id = (u32) simple_strtoul(pct, &endp, 0);
+                rc = strict_strtoul(scope_id, 0,
-                if (endp != scope_id + slen)
+                                        (unsigned long *)&s6->sin6_scope_id);
-                        return 0;
+                rc = (rc == 0) ? 1 : 0;
        }
        return rc;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 1adc9625a344..16765703131b 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -656,13 +656,13 @@ ssetup_ntlmssp_authenticate:
        if (type == LANMAN) {
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
-                char lnm_session_key[CIFS_SESS_KEY_SIZE];
+                char lnm_session_key[CIFS_AUTH_RESP_SIZE];
                pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
                /* no capabilities flags in old lanman negotiation */
-                pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE);
+                pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
                /* Calculate hash with password and copy into bcc_ptr.
                 * Encryption Key (stored as in cryptkey) gets used if the
@@ -675,8 +675,8 @@ ssetup_ntlmssp_authenticate:
                                        true : false, lnm_session_key);
                ses->flags |= CIFS_SES_LANMAN;
-                memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_SESS_KEY_SIZE);
+                memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
-                bcc_ptr += CIFS_SESS_KEY_SIZE;
+                bcc_ptr += CIFS_AUTH_RESP_SIZE;
                /* can not sign if LANMAN negotiated so no need
                to calculate signing key? but what if server
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index fbc5aace54b1..46d8756f2b24 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -457,6 +457,9 @@ sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
        case MID_RETRY_NEEDED:
                rc = -EAGAIN;
                break;
+        case MID_RESPONSE_MALFORMED:
+                rc = -EIO;
+                break;
        default:
                cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__,
                        mid->mid, mid->midState);
diff --git a/fs/compat.c b/fs/compat.c
index f6fd0a00e6cc..691c3fd8ce1d 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1228,7 +1228,9 @@ compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
        file = fget_light(fd, &fput_needed);
        if (!file)
                return -EBADF;
-        ret = compat_readv(file, vec, vlen, &pos);
+        ret = -ESPIPE;
+        if (file->f_mode & FMODE_PREAD)
+                ret = compat_readv(file, vec, vlen, &pos);
        fput_light(file, fput_needed);
        return ret;
 }
@@ -1285,7 +1287,9 @@ compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
        file = fget_light(fd, &fput_needed);
        if (!file)
                return -EBADF;
-        ret = compat_writev(file, vec, vlen, &pos);
+        ret = -ESPIPE;
+        if (file->f_mode & FMODE_PWRITE)
+                ret = compat_writev(file, vec, vlen, &pos);
        fput_light(file, fput_needed);
        return ret;
 }
diff --git a/fs/dcache.c b/fs/dcache.c
index 2a6bd9a4ae97..611ffe928c03 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1523,6 +1523,28 @@ struct dentry * d_alloc_root(struct inode * root_inode)
 }
 EXPORT_SYMBOL(d_alloc_root);
+static struct dentry * __d_find_any_alias(struct inode *inode)
+{
+        struct dentry *alias;
+        if (list_empty(&inode->i_dentry))
+                return NULL;
+        alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias);
+        __dget(alias);
+        return alias;
+}
+static struct dentry * d_find_any_alias(struct inode *inode)
+{
+        struct dentry *de;
+        spin_lock(&inode->i_lock);
+        de = __d_find_any_alias(inode);
+        spin_unlock(&inode->i_lock);
+        return de;
+}
 /**
 * d_obtain_alias - find or allocate a dentry for a given inode
 * @inode: inode to allocate the dentry for
@@ -1552,7 +1574,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
        if (IS_ERR(inode))
                return ERR_CAST(inode);
-        res = d_find_alias(inode);
+        res = d_find_any_alias(inode);
        if (res)
                goto out_iput;
@@ -1565,7 +1587,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
        spin_lock(&inode->i_lock);
-        res = __d_find_alias(inode, 0);
+        res = __d_find_any_alias(inode);
        if (res) {
                spin_unlock(&inode->i_lock);
                dput(tmp);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 9c64ae9e4c1a..2d8c87b951c2 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1468,15 +1468,13 @@ static void work_stop(void)
 static int work_start(void)
 {
-        recv_workqueue = alloc_workqueue("dlm_recv", WQ_MEM_RECLAIM |
+        recv_workqueue = create_singlethread_workqueue("dlm_recv");
-                                         WQ_HIGHPRI | WQ_FREEZEABLE, 0);
        if (!recv_workqueue) {
                log_print("can't start dlm_recv");
                return -ENOMEM;
        }
-        send_workqueue = alloc_workqueue("dlm_send", WQ_MEM_RECLAIM |
+        send_workqueue = create_singlethread_workqueue("dlm_send");
-                                         WQ_HIGHPRI | WQ_FREEZEABLE, 0);
        if (!send_workqueue) {
                log_print("can't start dlm_send");
                destroy_workqueue(recv_workqueue);
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 6fc4f319b550..534c1d46e69e 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -46,24 +46,28 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
        struct dentry *lower_dentry;
        struct vfsmount *lower_mnt;
-        struct dentry *dentry_save;
+        struct dentry *dentry_save = NULL;
-        struct vfsmount *vfsmount_save;
+        struct vfsmount *vfsmount_save = NULL;
        int rc = 1;
-        if (nd->flags & LOOKUP_RCU)
+        if (nd && nd->flags & LOOKUP_RCU)
                return -ECHILD;
        lower_dentry = ecryptfs_dentry_to_lower(dentry);
        lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
        if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate)
                goto out;
-        dentry_save = nd->path.dentry;
+        if (nd) {
-        vfsmount_save = nd->path.mnt;
+                dentry_save = nd->path.dentry;
-        nd->path.dentry = lower_dentry;
+                vfsmount_save = nd->path.mnt;
-        nd->path.mnt = lower_mnt;
+                nd->path.dentry = lower_dentry;
+                nd->path.mnt = lower_mnt;
+        }
        rc = lower_dentry->d_op->d_revalidate(lower_dentry, nd);
-        nd->path.dentry = dentry_save;
+        if (nd) {
-        nd->path.mnt = vfsmount_save;
+                nd->path.dentry = dentry_save;
+                nd->path.mnt = vfsmount_save;
+        }
        if (dentry->d_inode) {
                struct inode *lower_inode =
                        ecryptfs_inode_to_lower(dentry->d_inode);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index dbc84ed96336..e00753496e3e 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -632,8 +632,7 @@ int ecryptfs_interpose(struct dentry *hidden_dentry,
                       u32 flags);
 int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
                                        struct dentry *lower_dentry,
-                                        struct inode *ecryptfs_dir_inode,
+                                        struct inode *ecryptfs_dir_inode);
-                                        struct nameidata *ecryptfs_nd);
 int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
                                         size_t *decrypted_name_size,
                                         struct dentry *ecryptfs_dentry,
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 81e10e6a9443..7d1050e254f9 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -317,6 +317,7 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 const struct file_operations ecryptfs_dir_fops = {
        .readdir = ecryptfs_readdir,
+        .read = generic_read_dir,
        .unlocked_ioctl = ecryptfs_unlocked_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl = ecryptfs_compat_ioctl,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index bd33f87a1907..b592938a84bc 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -74,16 +74,20 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode,
        unsigned int flags_save;
        int rc;
-        dentry_save = nd->path.dentry;
+        if (nd) {
-        vfsmount_save = nd->path.mnt;
+                dentry_save = nd->path.dentry;
-        flags_save = nd->flags;
+                vfsmount_save = nd->path.mnt;
-        nd->path.dentry = lower_dentry;
+                flags_save = nd->flags;
-        nd->path.mnt = lower_mnt;
+                nd->path.dentry = lower_dentry;
-        nd->flags &= ~LOOKUP_OPEN;
+                nd->path.mnt = lower_mnt;
+                nd->flags &= ~LOOKUP_OPEN;
+        }
        rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd);
-        nd->path.dentry = dentry_save;
+        if (nd) {
-        nd->path.mnt = vfsmount_save;
+                nd->path.dentry = dentry_save;
-        nd->flags = flags_save;
+                nd->path.mnt = vfsmount_save;
+                nd->flags = flags_save;
+        }
        return rc;
 }
@@ -241,8 +245,7 @@ out:
 */
 int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
                                        struct dentry *lower_dentry,
-                                        struct inode *ecryptfs_dir_inode,
+                                        struct inode *ecryptfs_dir_inode)
-                                        struct nameidata *ecryptfs_nd)
 {
        struct dentry *lower_dir_dentry;
        struct vfsmount *lower_mnt;
@@ -290,8 +293,6 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
                goto out;
        if (special_file(lower_inode->i_mode))
                goto out;
-        if (!ecryptfs_nd)
-                goto out;
        /* Released in this function */
        page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER);
        if (!page_virt) {
@@ -349,75 +350,6 @@ out:
 }
 /**
- * ecryptfs_new_lower_dentry
- * @name: The name of the new dentry.
- * @lower_dir_dentry: Parent directory of the new dentry.
- * @nd: nameidata from last lookup.
- *
- * Create a new dentry or get it from lower parent dir.
- */
-static struct dentry *
-ecryptfs_new_lower_dentry(struct qstr *name, struct dentry *lower_dir_dentry,
-                          struct nameidata *nd)
-{
-        struct dentry *new_dentry;
-        struct dentry *tmp;
-        struct inode *lower_dir_inode;
-        lower_dir_inode = lower_dir_dentry->d_inode;
-        tmp = d_alloc(lower_dir_dentry, name);
-        if (!tmp)
-                return ERR_PTR(-ENOMEM);
-        mutex_lock(&lower_dir_inode->i_mutex);
-        new_dentry = lower_dir_inode->i_op->lookup(lower_dir_inode, tmp, nd);
-        mutex_unlock(&lower_dir_inode->i_mutex);
-        if (!new_dentry)
-                new_dentry = tmp;
-        else
-                dput(tmp);
-        return new_dentry;
-}
-/**
- * ecryptfs_lookup_one_lower
- * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
- * @lower_dir_dentry: lower parent directory
- * @name: lower file name
- *
- * Get the lower dentry from vfs. If lower dentry does not exist yet,
- * create it.
- */
-static struct dentry *
-ecryptfs_lookup_one_lower(struct dentry *ecryptfs_dentry,
-                          struct dentry *lower_dir_dentry, struct qstr *name)
-{
-        struct nameidata nd;
-        struct vfsmount *lower_mnt;
-        int err;
-        lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
-                                    ecryptfs_dentry->d_parent));
-        err = vfs_path_lookup(lower_dir_dentry, lower_mnt, name->name , 0, &nd);
-        mntput(lower_mnt);
-        if (!err) {
-                /* we dont need the mount */
-                mntput(nd.path.mnt);
-                return nd.path.dentry;
-        }
-        if (err != -ENOENT)
-                return ERR_PTR(err);
-        /* create a new lower dentry */
-        return ecryptfs_new_lower_dentry(name, lower_dir_dentry, &nd);
-}
-/**
 * ecryptfs_lookup
 * @ecryptfs_dir_inode: The eCryptfs directory inode
 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
@@ -434,7 +366,6 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
        size_t encrypted_and_encoded_name_size;
        struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
        struct dentry *lower_dir_dentry, *lower_dentry;
-        struct qstr lower_name;
        int rc = 0;
        if ((ecryptfs_dentry->d_name.len == 1
@@ -444,20 +375,14 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
                goto out_d_drop;
        }
        lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
-        lower_name.name = ecryptfs_dentry->d_name.name;
+        mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
-        lower_name.len = ecryptfs_dentry->d_name.len;
+        lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
-        lower_name.hash = ecryptfs_dentry->d_name.hash;
+                                      lower_dir_dentry,
-        if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
+                                      ecryptfs_dentry->d_name.len);
-                rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
+        mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
-                                lower_dir_dentry->d_inode, &lower_name);
-                if (rc < 0)
-                        goto out_d_drop;
-        }
-        lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
-                                                 lower_dir_dentry, &lower_name);
        if (IS_ERR(lower_dentry)) {
                rc = PTR_ERR(lower_dentry);
-                ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned "
+                ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
                                "[%d] on lower_dentry = [%s]\n", __func__, rc,
                                encrypted_and_encoded_name);
                goto out_d_drop;
@@ -479,28 +404,21 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
                       "filename; rc = [%d]\n", __func__, rc);
                goto out_d_drop;
        }
-        lower_name.name = encrypted_and_encoded_name;
+        mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
-        lower_name.len = encrypted_and_encoded_name_size;
+        lower_dentry = lookup_one_len(encrypted_and_encoded_name,
-        lower_name.hash = full_name_hash(lower_name.name, lower_name.len);
+                                      lower_dir_dentry,
-        if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
+                                      encrypted_and_encoded_name_size);
-                rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
+        mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
-                                lower_dir_dentry->d_inode, &lower_name);
-                if (rc < 0)
-                        goto out_d_drop;
-        }
-        lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
-                                                 lower_dir_dentry, &lower_name);
        if (IS_ERR(lower_dentry)) {
                rc = PTR_ERR(lower_dentry);
-                ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned "
+                ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
                                "[%d] on lower_dentry = [%s]\n", __func__, rc,
                                encrypted_and_encoded_name);
                goto out_d_drop;
        }
 lookup_and_interpose:
        rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry,
-                                                 ecryptfs_dir_inode,
+                                                 ecryptfs_dir_inode);
-                                                 ecryptfs_nd);
        goto out;
 out_d_drop:
        d_drop(ecryptfs_dentry);
@@ -1092,6 +1010,8 @@ int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
        rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry),
                         ecryptfs_dentry_to_lower(dentry), &lower_stat);
        if (!rc) {
+                fsstack_copy_attr_all(dentry->d_inode,
+                                      ecryptfs_inode_to_lower(dentry->d_inode));
                generic_fillattr(dentry->d_inode, stat);
                stat->blocks = lower_stat.blocks;
        }
diff --git a/fs/eventfd.c b/fs/eventfd.c
index e0194b3e14d6..d9a591773919 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -99,7 +99,7 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_get);
 * @ctx: [in] Pointer to eventfd context.
 *
 * The eventfd context reference must have been previously acquired either
- * with eventfd_ctx_get() or eventfd_ctx_fdget()).
+ * with eventfd_ctx_get() or eventfd_ctx_fdget().
 */
 void eventfd_ctx_put(struct eventfd_ctx *ctx)
 {
@@ -146,9 +146,9 @@ static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
 * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue.
 * @ctx: [in] Pointer to eventfd context.
 * @wait: [in] Wait queue to be removed.
- * @cnt: [out] Pointer to the 64bit conter value.
+ * @cnt: [out] Pointer to the 64-bit counter value.
 *
- * Returns zero if successful, or the following error codes:
+ * Returns %0 if successful, or the following error codes:
 *
 * -EAGAIN      : The operation would have blocked.
 *
@@ -175,11 +175,11 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
 * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero.
 * @ctx: [in] Pointer to eventfd context.
 * @no_wait: [in] Different from zero if the operation should not block.
- * @cnt: [out] Pointer to the 64bit conter value.
+ * @cnt: [out] Pointer to the 64-bit counter value.
 *
- * Returns zero if successful, or the following error codes:
+ * Returns %0 if successful, or the following error codes:
 *
- * -EAGAIN      : The operation would have blocked but @no_wait was nonzero.
+ * -EAGAIN      : The operation would have blocked but @no_wait was non-zero.
 * -ERESTARTSYS : A signal interrupted the wait operation.
 *
 * If @no_wait is zero, the function might sleep until the eventfd internal
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 267d0ada4541..4a09af9e9a63 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -63,6 +63,13 @@
 * cleanup path and it is also acquired by eventpoll_release_file()
 * if a file has been pushed inside an epoll set and it is then
 * close()d without a previous call toepoll_ctl(EPOLL_CTL_DEL).
+ * It is also acquired when inserting an epoll fd onto another epoll
+ * fd. We do this so that we walk the epoll tree and ensure that this
+ * insertion does not create a cycle of epoll file descriptors, which
+ * could lead to deadlock. We need a global mutex to prevent two
+ * simultaneous inserts (A into B and B into A) from racing and
+ * constructing a cycle without either insert observing that it is
+ * going to.
 * It is possible to drop the "ep->mtx" and to use the global
 * mutex "epmutex" (together with "ep->lock") to have it working,
 * but having "ep->mtx" will make the interface more scalable.
@@ -224,6 +231,9 @@ static long max_user_watches __read_mostly;
 */
 static DEFINE_MUTEX(epmutex);
+/* Used to check for epoll file descriptor inclusion loops */
+static struct nested_calls poll_loop_ncalls;
 /* Used for safe wake up implementation */
 static struct nested_calls poll_safewake_ncalls;
@@ -1198,6 +1208,62 @@ retry:
        return res;
 }
+/**
+ * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
+ *                      API, to verify that adding an epoll file inside another
+ *                      epoll structure, does not violate the constraints, in
+ *                      terms of closed loops, or too deep chains (which can
+ *                      result in excessive stack usage).
+ *
+ * @priv: Pointer to the epoll file to be currently checked.
+ * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
+ *          data structure pointer.
+ * @call_nests: Current depth of the @ep_call_nested() call stack.
+ *
+ * Returns: Returns zero if adding the epoll @file inside current epoll
+ *          structure @ep does not violate the constraints, or -1 otherwise.
+ */
+static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
+{
+        int error = 0;
+        struct file *file = priv;
+        struct eventpoll *ep = file->private_data;
+        struct rb_node *rbp;
+        struct epitem *epi;
+        mutex_lock(&ep->mtx);
+        for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+                epi = rb_entry(rbp, struct epitem, rbn);
+                if (unlikely(is_file_epoll(epi->ffd.file))) {
+                        error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+                                               ep_loop_check_proc, epi->ffd.file,
+                                               epi->ffd.file->private_data, current);
+                        if (error != 0)
+                                break;
+                }
+        }
+        mutex_unlock(&ep->mtx);
+        return error;
+}
+/**
+ * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
+ *                 inside another epoll file (represented by @ep) does not
+ *                 create closed loops or too deep chains.
+ *
+ * @ep: Pointer to the epoll private data structure.
+ * @file: Pointer to the epoll file to be checked.
+ *
+ * Returns: Returns zero if adding the epoll @file inside current epoll
+ *          structure @ep does not violate the constraints, or -1 otherwise.
+ */
+static int ep_loop_check(struct eventpoll *ep, struct file *file)
+{
+        return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+                              ep_loop_check_proc, file, ep, current);
+}
 /*
 * Open an eventpoll file descriptor.
 */
@@ -1246,6 +1312,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
                struct epoll_event __user *, event)
 {
        int error;
+        int did_lock_epmutex = 0;
        struct file *file, *tfile;
        struct eventpoll *ep;
        struct epitem *epi;
@@ -1287,6 +1354,25 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
         */
        ep = file->private_data;
+        /*
+         * When we insert an epoll file descriptor inside another epoll file
+         * descriptor, there is the chance of creating closed loops, which are
+         * better handled here than in more critical paths.
+         *
+         * We hold epmutex across the loop check and the insert in this case, in
+         * order to prevent two separate inserts from racing and each doing the
+         * insert "at the same time" such that ep_loop_check passes on both
+         * before either one does the insert, thereby creating a cycle.
+         */
+        if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
+                mutex_lock(&epmutex);
+                did_lock_epmutex = 1;
+                error = -ELOOP;
+                if (ep_loop_check(ep, tfile) != 0)
+                        goto error_tgt_fput;
+        }
        mutex_lock(&ep->mtx);
        /*
@@ -1322,6 +1408,9 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
        mutex_unlock(&ep->mtx);
 error_tgt_fput:
+        if (unlikely(did_lock_epmutex))
+                mutex_unlock(&epmutex);
        fput(tfile);
 error_fput:
        fput(file);
@@ -1441,6 +1530,12 @@ static int __init eventpoll_init(void)
                EP_ITEM_COST;
        BUG_ON(max_user_watches < 0);
+        /*
+         * Initialize the structure used to perform epoll file descriptor
+         * inclusion loop checks.
+         */
+        ep_nested_calls_init(&poll_loop_ncalls);
        /* Initialize the structure used to perform safe poll wait head wake ups */
        ep_nested_calls_init(&poll_safewake_ncalls);
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 264e95d02830..4d70db110cfc 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -272,7 +272,6 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
                new_de = exofs_find_entry(new_dir, new_dentry, &new_page);
                if (!new_de)
                        goto out_dir;
-                inode_inc_link_count(old_inode);
                err = exofs_set_link(new_dir, new_de, new_page, old_inode);
                new_inode->i_ctime = CURRENT_TIME;
                if (dir_de)
@@ -286,12 +285,9 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
                        if (new_dir->i_nlink >= EXOFS_LINK_MAX)
                                goto out_dir;
                }
-                inode_inc_link_count(old_inode);
                err = exofs_add_link(new_dentry, old_inode);
-                if (err) {
+                if (err)
-                        inode_dec_link_count(old_inode);
                        goto out_dir;
-                }
                if (dir_de)
                        inode_inc_link_count(new_dir);
        }
@@ -299,7 +295,7 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
        old_inode->i_ctime = CURRENT_TIME;
        exofs_delete_entry(old_de, old_page);
-        inode_dec_link_count(old_inode);
+        mark_inode_dirty(old_inode);
        if (dir_de) {
                err = exofs_set_link(old_inode, dir_de, dir_page, new_dir);
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 2e1d8341d827..adb91855ccd0 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -344,7 +344,6 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
                new_de = ext2_find_entry (new_dir, &new_dentry->d_name, &new_page);
                if (!new_de)
                        goto out_dir;
-                inode_inc_link_count(old_inode);
                ext2_set_link(new_dir, new_de, new_page, old_inode, 1);
                new_inode->i_ctime = CURRENT_TIME_SEC;
                if (dir_de)
@@ -356,12 +355,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
                        if (new_dir->i_nlink >= EXT2_LINK_MAX)
                                goto out_dir;
                }
-                inode_inc_link_count(old_inode);
                err = ext2_add_link(new_dentry, old_inode);
-                if (err) {
+                if (err)
-                        inode_dec_link_count(old_inode);
                        goto out_dir;
-                }
                if (dir_de)
                        inode_inc_link_count(new_dir);
        }
@@ -369,12 +365,11 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
        /*
         * Like most other Unix systems, set the ctime for inodes on a
         * rename.
-         * inode_dec_link_count() will mark the inode dirty.
         */
        old_inode->i_ctime = CURRENT_TIME_SEC;
+        mark_inode_dirty(old_inode);
        ext2_delete_entry (old_de, old_page);
-        inode_dec_link_count(old_inode);
        if (dir_de) {
                if (old_dir != new_dir)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0c8d97b56f34..3aa0b72b3b94 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -848,6 +848,7 @@ struct ext4_inode_info {
        atomic_t i_ioend_count; /* Number of outstanding io_end structs */
        /* current io_end structure for async DIO write*/
        ext4_io_end_t *cur_aio_dio;
+        atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
        spinlock_t i_block_reservation_lock;
@@ -2119,6 +2120,15 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
 #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
+/* For ioend & aio unwritten conversion wait queues */
+#define EXT4_WQ_HASH_SZ         37
+#define ext4_ioend_wq(v)   (&ext4__ioend_wq[((unsigned long)(v)) %\
+                                            EXT4_WQ_HASH_SZ])
+#define ext4_aio_mutex(v)  (&ext4__aio_mutex[((unsigned long)(v)) %\
+                                             EXT4_WQ_HASH_SZ])
+extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
+extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
 #endif  /* __KERNEL__ */
 #endif  /* _EXT4_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 63a75810b7c3..ccce8a7e94ed 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3174,9 +3174,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                 * that this IO needs to convertion to written when IO is
                 * completed
                 */
-                if (io)
+                if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
                        io->flag = EXT4_IO_END_UNWRITTEN;
-                else
+                        atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+                } else
                        ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
                if (ext4_should_dioread_nolock(inode))
                        map->m_flags |= EXT4_MAP_UNINIT;
@@ -3463,9 +3464,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                 * that we need to perform convertion when IO is done.
                 */
                if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
-                        if (io)
+                        if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
                                io->flag = EXT4_IO_END_UNWRITTEN;
-                        else
+                                atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+                        } else
                                ext4_set_inode_state(inode,
                                                     EXT4_STATE_DIO_UNWRITTEN);
                }
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 2e8322c8aa88..7b80d543b89e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -55,11 +55,47 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
        return 0;
 }
+/*
+ * Sleep until the inode's i_aiodio_unwritten counter has dropped to zero.
+ * The wait uses the inode's hashed ioend wait queue, which is the queue
+ * ext4_end_io_nolock() wakes once the counter reaches zero.
+ */
+static void ext4_aiodio_wait(struct inode *inode)
+{
+        struct ext4_inode_info *ei = EXT4_I(inode);
+        wait_queue_head_t *waitq = ext4_ioend_wq(inode);
+        wait_event(*waitq, atomic_read(&ei->i_aiodio_unwritten) == 0);
+}
+/*
+ * Report whether an IO request is misaligned with respect to the
+ * filesystem block size.
+ *
+ * Ext4 fills holes under direct IO with unwritten extents, which are only
+ * converted to written once the IO has completed.  Until that conversion
+ * the blocks still look like holes, so dio_zero_block() believes it has to
+ * zero the unused head and/or tail of the block.  Two AIO threads working
+ * on the same unwritten block therefore have to be serialized, otherwise
+ * one of them zeroes data the other has just written.
+ *
+ * Returns 1 when the request starts inside i_size and either its start
+ * or its end is not block aligned, 0 otherwise.
+ */
+static int
+ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
+                   unsigned long nr_segs, loff_t pos)
+{
+        int blockmask = inode->i_sb->s_blocksize - 1;
+        loff_t end = pos + iov_length(iov, nr_segs);
+        /* Requests starting at or past i_size are never reported */
+        if (pos >= inode->i_size)
+                return 0;
+        /* Any low bit set in either boundary means a partial block */
+        return ((pos | end) & blockmask) != 0;
+}
 static ssize_t
 ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
                unsigned long nr_segs, loff_t pos)
 {
        struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+        int unaligned_aio = 0;
+        int ret;
        /*
         * If we have encountered a bitmap-format file, the size limit
@@ -78,9 +114,31 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
                        nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
                                              sbi->s_bitmap_maxbytes - pos);
                }
+        } else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) &&
+                   !is_sync_kiocb(iocb))) {
+                unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
        }
-        return generic_file_aio_write(iocb, iov, nr_segs, pos);
+        /* Unaligned direct AIO must be serialized; see comment above */
+        if (unaligned_aio) {
+                static unsigned long unaligned_warn_time;
+                /* Warn about this once per day */
+                if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
+                        ext4_msg(inode->i_sb, KERN_WARNING,
+                                 "Unaligned AIO/DIO on inode %ld by %s; "
+                                 "performance will be poor.",
+                                 inode->i_ino, current->comm);
+                mutex_lock(ext4_aio_mutex(inode));
+                ext4_aiodio_wait(inode);
+        }
+        ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+        if (unaligned_aio)
+                mutex_unlock(ext4_aio_mutex(inode));
+        return ret;
 }
 static const struct vm_operations_struct ext4_file_vm_ops = {
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 851f49b2f9d2..d1fe09aea73d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -342,10 +342,15 @@ static struct kmem_cache *ext4_free_ext_cachep;
 /* We create slab caches for groupinfo data structures based on the
 * superblock block size.  There will be one per mounted filesystem for
 * each unique s_blocksize_bits */
-#define NR_GRPINFO_CACHES       \
+#define NR_GRPINFO_CACHES 8
-        (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1)
 static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
+/*
+ * Names handed to kmem_cache_create() for the groupinfo slab caches.
+ * ext4_groupinfo_create_slab() indexes this table with the same
+ * cache_index it uses for ext4_groupinfo_caches[].
+ */
+static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
+        [0] = "ext4_groupinfo_1k",
+        [1] = "ext4_groupinfo_2k",
+        [2] = "ext4_groupinfo_4k",
+        [3] = "ext4_groupinfo_8k",
+        [4] = "ext4_groupinfo_16k",
+        [5] = "ext4_groupinfo_32k",
+        [6] = "ext4_groupinfo_64k",
+        [7] = "ext4_groupinfo_128k",
+};
 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                                        ext4_group_t group);
 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
@@ -2414,6 +2419,55 @@ err_freesgi:
        return -ENOMEM;
 }
+/*
+ * Destroy every groupinfo slab cache that has been created and reset all
+ * slots of ext4_groupinfo_caches[] to NULL.
+ */
+static void ext4_groupinfo_destroy_slabs(void)
+{
+        int idx;
+        for (idx = 0; idx < NR_GRPINFO_CACHES; idx++) {
+                struct kmem_cache *cachep = ext4_groupinfo_caches[idx];
+                /* Slots that were never populated are simply skipped */
+                if (cachep != NULL)
+                        kmem_cache_destroy(cachep);
+                ext4_groupinfo_caches[idx] = NULL;
+        }
+}
+/*
+ * Make sure the groupinfo slab cache matching a block size exists.
+ *
+ * @size: filesystem block size in bytes; order_base_2(size) selects the
+ *        slot in ext4_groupinfo_caches[] and the length of bb_counters[]
+ *        the slab objects must hold.
+ *
+ * Returns 0 if the cache already existed or was created, -EINVAL if the
+ * block size maps past the end of the cache table, and -ENOMEM if
+ * kmem_cache_create() failed.
+ *
+ * Creation is serialized by a function-local mutex.  The new cache is
+ * stored in ext4_groupinfo_caches[] *before* the mutex is dropped, so a
+ * concurrent caller for the same block size cannot still see an empty
+ * slot and create a second cache with the same name.
+ */
+static int ext4_groupinfo_create_slab(size_t size)
+{
+        static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
+        int slab_size;
+        int blocksize_bits = order_base_2(size);
+        int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+        struct kmem_cache *cachep;
+        if (cache_index >= NR_GRPINFO_CACHES)
+                return -EINVAL;
+        /* Block sizes below the minimum share the first slot */
+        if (unlikely(cache_index < 0))
+                cache_index = 0;
+        mutex_lock(&ext4_grpinfo_slab_create_mutex);
+        if (ext4_groupinfo_caches[cache_index]) {
+                mutex_unlock(&ext4_grpinfo_slab_create_mutex);
+                return 0;       /* Already created */
+        }
+        /* Object size: the struct up to and including the last counter */
+        slab_size = offsetof(struct ext4_group_info,
+                                bb_counters[blocksize_bits + 2]);
+        cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
+                                        slab_size, 0, SLAB_RECLAIM_ACCOUNT,
+                                        NULL);
+        /*
+         * Publish the result while still holding the mutex.  On failure
+         * cachep is NULL and the slot simply stays empty.
+         */
+        ext4_groupinfo_caches[cache_index] = cachep;
+        mutex_unlock(&ext4_grpinfo_slab_create_mutex);
+        if (!cachep) {
+                printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n");
+                return -ENOMEM;
+        }
+        return 0;
+}
 int ext4_mb_init(struct super_block *sb, int needs_recovery)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2421,9 +2475,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        unsigned offset;
        unsigned max;
        int ret;
-        int cache_index;
-        struct kmem_cache *cachep;
-        char *namep = NULL;
        i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
@@ -2440,30 +2491,9 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
                goto out;
        }
-        cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+        ret = ext4_groupinfo_create_slab(sb->s_blocksize);
-        cachep = ext4_groupinfo_caches[cache_index];
+        if (ret < 0)
-        if (!cachep) {
+                goto out;
-                char name[32];
-                int len = offsetof(struct ext4_group_info,
-                                        bb_counters[sb->s_blocksize_bits + 2]);
-                sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits);
-                namep = kstrdup(name, GFP_KERNEL);
-                if (!namep) {
-                        ret = -ENOMEM;
-                        goto out;
-                }
-                /* Need to free the kmem_cache_name() when we
-                 * destroy the slab */
-                cachep = kmem_cache_create(namep, len, 0,
-                                             SLAB_RECLAIM_ACCOUNT, NULL);
-                if (!cachep) {
-                        ret = -ENOMEM;
-                        goto out;
-                }
-                ext4_groupinfo_caches[cache_index] = cachep;
-        }
        /* order 0 is regular bitmap */
        sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
@@ -2520,7 +2550,6 @@ out:
        if (ret) {
                kfree(sbi->s_mb_offsets);
                kfree(sbi->s_mb_maxs);
-                kfree(namep);
        }
        return ret;
 }
@@ -2734,7 +2763,6 @@ int __init ext4_init_mballoc(void)
 void ext4_exit_mballoc(void)
 {
-        int i;
        /*
         * Wait for completion of call_rcu()'s on ext4_pspace_cachep
         * before destroying the slab cache.
@@ -2743,15 +2771,7 @@ void ext4_exit_mballoc(void)
        kmem_cache_destroy(ext4_pspace_cachep);
        kmem_cache_destroy(ext4_ac_cachep);
        kmem_cache_destroy(ext4_free_ext_cachep);
+        ext4_groupinfo_destroy_slabs();
-        for (i = 0; i < NR_GRPINFO_CACHES; i++) {
-                struct kmem_cache *cachep = ext4_groupinfo_caches[i];
-                if (cachep) {
-                        char *name = (char *)kmem_cache_name(cachep);
-                        kmem_cache_destroy(cachep);
-                        kfree(name);
-                }
-        }
        ext4_remove_debugfs_entry();
 }
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 7270dcfca92a..955cc309142f 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -32,14 +32,8 @@
 static struct kmem_cache *io_page_cachep, *io_end_cachep;
-#define WQ_HASH_SZ              37
-#define to_ioend_wq(v)  (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
-static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
 int __init ext4_init_pageio(void)
 {
-        int i;
        io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
        if (io_page_cachep == NULL)
                return -ENOMEM;
@@ -48,9 +42,6 @@ int __init ext4_init_pageio(void)
                kmem_cache_destroy(io_page_cachep);
                return -ENOMEM;
        }
-        for (i = 0; i < WQ_HASH_SZ; i++)
-                init_waitqueue_head(&ioend_wq[i]);
        return 0;
 }
@@ -62,7 +53,7 @@ void ext4_exit_pageio(void)
 void ext4_ioend_wait(struct inode *inode)
 {
-        wait_queue_head_t *wq = to_ioend_wq(inode);
+        wait_queue_head_t *wq = ext4_ioend_wq(inode);
        wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
 }
@@ -87,7 +78,7 @@ void ext4_free_io_end(ext4_io_end_t *io)
        for (i = 0; i < io->num_io_pages; i++)
                put_io_page(io->pages[i]);
        io->num_io_pages = 0;
-        wq = to_ioend_wq(io->inode);
+        wq = ext4_ioend_wq(io->inode);
        if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
            waitqueue_active(wq))
                wake_up_all(wq);
@@ -102,6 +93,7 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
        struct inode *inode = io->inode;
        loff_t offset = io->offset;
        ssize_t size = io->size;
+        wait_queue_head_t *wq;
        int ret = 0;
        ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
@@ -126,7 +118,16 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
        if (io->iocb)
                aio_complete(io->iocb, io->result, 0);
        /* clear the DIO AIO unwritten flag */
-        io->flag &= ~EXT4_IO_END_UNWRITTEN;
+        if (io->flag & EXT4_IO_END_UNWRITTEN) {
+                io->flag &= ~EXT4_IO_END_UNWRITTEN;
+                /* Wake up anyone waiting on unwritten extent conversion */
+                wq = ext4_ioend_wq(io->inode);
+                if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) &&
+                    waitqueue_active(wq)) {
+                        wake_up_all(wq);
+                }
+        }
        return ret;
 }
@@ -190,6 +191,7 @@ static void ext4_end_bio(struct bio *bio, int error)
        struct inode *inode;
        unsigned long flags;
        int i;
+        sector_t bi_sector = bio->bi_sector;
        BUG_ON(!io_end);
        bio->bi_private = NULL;
@@ -207,9 +209,7 @@ static void ext4_end_bio(struct bio *bio, int error)
                if (error)
                        SetPageError(page);
                BUG_ON(!head);
-                if (head->b_size == PAGE_CACHE_SIZE)
+                if (head->b_size != PAGE_CACHE_SIZE) {
-                        clear_buffer_dirty(head);
-                else {
                        loff_t offset;
                        loff_t io_end_offset = io_end->offset + io_end->size;
@@ -221,7 +221,6 @@ static void ext4_end_bio(struct bio *bio, int error)
                                        if (error)
                                                buffer_io_error(bh);
-                                        clear_buffer_dirty(bh);
                                }
                                if (buffer_delay(bh))
                                        partial_write = 1;
@@ -257,7 +256,7 @@ static void ext4_end_bio(struct bio *bio, int error)
                             (unsigned long long) io_end->offset,
                             (long) io_end->size,
                             (unsigned long long)
-                             bio->bi_sector >> (inode->i_blkbits - 9));
+                             bi_sector >> (inode->i_blkbits - 9));
        }
        /* Add the io_end to per-inode completed io list*/
@@ -380,6 +379,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
        blocksize = 1 << inode->i_blkbits;
+        BUG_ON(!PageLocked(page));
        BUG_ON(PageWriteback(page));
        set_page_writeback(page);
        ClearPageError(page);
@@ -397,12 +397,14 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
        for (bh = head = page_buffers(page), block_start = 0;
             bh != head || !block_start;
             block_start = block_end, bh = bh->b_this_page) {
                block_end = block_start + blocksize;
                if (block_start >= len) {
                        clear_buffer_dirty(bh);
                        set_buffer_uptodate(bh);
                        continue;
                }
+                clear_buffer_dirty(bh);
                ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
                if (ret) {
                        /*
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 48ce561fafac..f6a318f836b2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -77,6 +77,7 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
                       const char *dev_name, void *data);
 static void ext4_destroy_lazyinit_thread(void);
 static void ext4_unregister_li_request(struct super_block *sb);
+static void ext4_clear_request_list(void);
 #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static struct file_system_type ext3_fs_type = {
@@ -832,6 +833,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
        ei->i_sync_tid = 0;
        ei->i_datasync_tid = 0;
        atomic_set(&ei->i_ioend_count, 0);
+        atomic_set(&ei->i_aiodio_unwritten, 0);
        return &ei->vfs_inode;
 }
@@ -2716,6 +2718,8 @@ static void ext4_unregister_li_request(struct super_block *sb)
        mutex_unlock(&ext4_li_info->li_list_mtx);
 }
+static struct task_struct *ext4_lazyinit_task;
 /*
 * This is the function where ext4lazyinit thread lives. It walks
 * through the request list searching for next scheduled filesystem.
@@ -2784,6 +2788,10 @@ cont_thread:
                if (time_before(jiffies, next_wakeup))
                        schedule();
                finish_wait(&eli->li_wait_daemon, &wait);
+                if (kthread_should_stop()) {
+                        ext4_clear_request_list();
+                        goto exit_thread;
+                }
        }
 exit_thread:
@@ -2808,6 +2816,7 @@ exit_thread:
        wake_up(&eli->li_wait_task);
        kfree(ext4_li_info);
+        ext4_lazyinit_task = NULL;
        ext4_li_info = NULL;
        mutex_unlock(&ext4_li_mtx);
@@ -2830,11 +2839,10 @@ static void ext4_clear_request_list(void)
 static int ext4_run_lazyinit_thread(void)
 {
-        struct task_struct *t;
+        ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
+                                         ext4_li_info, "ext4lazyinit");
-        t = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit");
+        if (IS_ERR(ext4_lazyinit_task)) {
-        if (IS_ERR(t)) {
+                int err = PTR_ERR(ext4_lazyinit_task);
-                int err = PTR_ERR(t);
                ext4_clear_request_list();
                del_timer_sync(&ext4_li_info->li_timer);
                kfree(ext4_li_info);
@@ -2985,16 +2993,10 @@ static void ext4_destroy_lazyinit_thread(void)
         * If thread exited earlier
         * there's nothing to be done.
         */
-        if (!ext4_li_info)
+        if (!ext4_li_info || !ext4_lazyinit_task)
                return;
-        ext4_clear_request_list();
+        kthread_stop(ext4_lazyinit_task);
-        while (ext4_li_info->li_task) {
-                wake_up(&ext4_li_info->li_wait_daemon);
-                wait_event(ext4_li_info->li_wait_task,
-                           ext4_li_info->li_task == NULL);
-        }
 }
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
@@ -4768,7 +4770,7 @@ static struct file_system_type ext4_fs_type = {
        .fs_flags       = FS_REQUIRES_DEV,
 };
-int __init ext4_init_feat_adverts(void)
+static int __init ext4_init_feat_adverts(void)
 {
        struct ext4_features *ef;
        int ret = -ENOMEM;
@@ -4792,23 +4794,44 @@ out:
        return ret;
 }
+/*
+ * Tear down the feature-advertisement kobject: drop the kobject
+ * reference, wait for the f_kobj_unregister completion, and only then
+ * free the backing structure.
+ */
+static void ext4_exit_feat_adverts(void)
+{
+        struct ext4_features *ef = ext4_feat;
+        kobject_put(&ef->f_kobj);
+        /* The structure must stay allocated until the completion fires */
+        wait_for_completion(&ef->f_kobj_unregister);
+        kfree(ef);
+}
+/* Shared across all ext4 file systems */
+wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
+struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
 static int __init ext4_init_fs(void)
 {
-        int err;
+        int i, err;
        ext4_check_flag_values();
+        for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
+                mutex_init(&ext4__aio_mutex[i]);
+                init_waitqueue_head(&ext4__ioend_wq[i]);
+        }
        err = ext4_init_pageio();
        if (err)
                return err;
        err = ext4_init_system_zone();
        if (err)
-                goto out5;
+                goto out7;
        ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
        if (!ext4_kset)
-                goto out4;
+                goto out6;
        ext4_proc_root = proc_mkdir("fs/ext4", NULL);
+        if (!ext4_proc_root)
+                goto out5;
        err = ext4_init_feat_adverts();
+        if (err)
+                goto out4;
        err = ext4_init_mballoc();
        if (err)
@@ -4838,12 +4861,14 @@ out1:
 out2:
        ext4_exit_mballoc();
 out3:
-        kfree(ext4_feat);
+        ext4_exit_feat_adverts();
+out4:
        remove_proc_entry("fs/ext4", NULL);
+out5:
        kset_unregister(ext4_kset);
-out4:
+out6:
        ext4_exit_system_zone();
-out5:
+out7:
        ext4_exit_pageio();
        return err;
 }
@@ -4857,6 +4882,7 @@ static void __exit ext4_exit_fs(void)
        destroy_inodecache();
        ext4_exit_xattr();
        ext4_exit_mballoc();
+        ext4_exit_feat_adverts();
        remove_proc_entry("fs/ext4", NULL);
        kset_unregister(ext4_kset);
        ext4_exit_system_zone();
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index f88f752babd9..adae3fb7451a 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -43,7 +43,7 @@ static int vfat_revalidate_shortname(struct dentry *dentry)
 static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-        if (nd->flags & LOOKUP_RCU)
+        if (nd && nd->flags & LOOKUP_RCU)
                return -ECHILD;
        /* This is not negative dentry. Always valid. */
@@ -54,7 +54,7 @@ static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
 static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
 {
-        if (nd->flags & LOOKUP_RCU)
+        if (nd && nd->flags & LOOKUP_RCU)
                return -ECHILD;
        /*
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index bfed8447ed80..8bd0ef9286c3 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -158,7 +158,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 {
        struct inode *inode;
-        if (nd->flags & LOOKUP_RCU)
+        if (nd && nd->flags & LOOKUP_RCU)
                return -ECHILD;
        inode = entry->d_inode;
@@ -1283,8 +1283,11 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
        if (err)
                return err;
-        if ((attr->ia_valid & ATTR_OPEN) && fc->atomic_o_trunc)
+        if (attr->ia_valid & ATTR_OPEN) {
-                return 0;
+                if (fc->atomic_o_trunc)
+                        return 0;
+                file = NULL;
+        }
        if (attr->ia_valid & ATTR_SIZE)
                is_truncate = true;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 95da1bc1c826..9e0832dbb1e3 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -86,18 +86,52 @@ struct fuse_file *fuse_file_get(struct fuse_file *ff)
        return ff;
 }
+/*
+ * Work item queued by fuse_release_end(): drops the request reference
+ * and the path reference from workqueue context.
+ */
+static void fuse_release_async(struct work_struct *work)
+{
+        struct fuse_req *req =
+                container_of(work, struct fuse_req, misc.release.work);
+        /*
+         * Take a private copy of the path and look up the connection
+         * first; presumably req must not be touched once
+         * fuse_put_request() has dropped the reference taken in
+         * fuse_release_end().
+         */
+        struct path path = req->misc.release.path;
+        struct fuse_conn *fc = get_fuse_conn(path.dentry->d_inode);
+        fuse_put_request(fc, req);
+        path_put(&path);
+}
 static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
 {
-        path_put(&req->misc.release.path);
+        if (fc->destroy_req) {
+                /*
+                 * If this is a fuseblk mount, then it's possible that
+                 * releasing the path will result in releasing the
+                 * super block and sending the DESTROY request.  If
+                 * the server is single threaded, this would hang.
+                 * For this reason do the path_put() in a separate
+                 * thread.
+                 */
+                atomic_inc(&req->count);
+                INIT_WORK(&req->misc.release.work, fuse_release_async);
+                schedule_work(&req->misc.release.work);
+        } else {
+                path_put(&req->misc.release.path);
+        }
 }
-static void fuse_file_put(struct fuse_file *ff)
+static void fuse_file_put(struct fuse_file *ff, bool sync)
 {
        if (atomic_dec_and_test(&ff->count)) {
                struct fuse_req *req = ff->reserved_req;
-                req->end = fuse_release_end;
+                if (sync) {
-                fuse_request_send_background(ff->fc, req);
+                        fuse_request_send(ff->fc, req);
+                        path_put(&req->misc.release.path);
+                        fuse_put_request(ff->fc, req);
+                } else {
+                        req->end = fuse_release_end;
+                        fuse_request_send_background(ff->fc, req);
+                }
                kfree(ff);
        }
 }
@@ -219,8 +253,12 @@ void fuse_release_common(struct file *file, int opcode)
         * Normally this will send the RELEASE request, however if
         * some asynchronous READ or WRITE requests are outstanding,
         * the sending will be delayed.
+         *
+         * Make the release synchronous if this is a fuseblk mount,
+         * synchronous RELEASE is allowed (and desirable) in this case
+         * because the server can be trusted not to screw up.
         */
-        fuse_file_put(ff);
+        fuse_file_put(ff, ff->fc->destroy_req != NULL);
 }
 static int fuse_open(struct inode *inode, struct file *file)
@@ -558,7 +596,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
                page_cache_release(page);
        }
        if (req->ff)
-                fuse_file_put(req->ff);
+                fuse_file_put(req->ff, false);
 }
 static void fuse_send_readpages(struct fuse_req *req, struct file *file)
@@ -1137,7 +1175,7 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
 static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
 {
        __free_page(req->pages[0]);
-        fuse_file_put(req->ff);
+        fuse_file_put(req->ff, false);
 }
 static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index ae5744a2f9e9..d4286947bc2c 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -21,6 +21,7 @@
 #include <linux/rwsem.h>
 #include <linux/rbtree.h>
 #include <linux/poll.h>
+#include <linux/workqueue.h>
 /** Max number of pages that can be used in a single read request */
 #define FUSE_MAX_PAGES_PER_REQ 32
@@ -262,7 +263,10 @@ struct fuse_req {
        /** Data for asynchronous requests */
        union {
                struct {
-                        struct fuse_release_in in;
+                        union {
+                                struct fuse_release_in in;
+                                struct work_struct work;
+                        };
                        struct path path;
                } release;
                struct fuse_init_in init_in;
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 4a456338b873..0da8da2c991d 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -44,7 +44,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
        int error;
        int had_lock = 0;
-        if (nd->flags & LOOKUP_RCU)
+        if (nd && nd->flags & LOOKUP_RCU)
                return -ECHILD;
        parent = dget_parent(dentry);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 08a8beb152e6..7cd9a5a68d59 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1779,11 +1779,11 @@ int __init gfs2_glock_init(void)
 #endif
        glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
-                                          WQ_HIGHPRI | WQ_FREEZEABLE, 0);
+                                          WQ_HIGHPRI | WQ_FREEZABLE, 0);
        if (IS_ERR(glock_workqueue))
                return PTR_ERR(glock_workqueue);
        gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
-                                                WQ_MEM_RECLAIM | WQ_FREEZEABLE,
+                                                WQ_MEM_RECLAIM | WQ_FREEZABLE,
                                                0);
        if (IS_ERR(gfs2_delete_workqueue)) {
                destroy_workqueue(glock_workqueue);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index ebef7ab6e17e..72c31a315d96 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -59,14 +59,7 @@ static void gfs2_init_gl_aspace_once(void *foo)
        struct address_space *mapping = (struct address_space *)(gl + 1);
        gfs2_init_glock_once(gl);
-        memset(mapping, 0, sizeof(*mapping));
+        address_space_init_once(mapping);
-        INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
-        spin_lock_init(&mapping->tree_lock);
-        spin_lock_init(&mapping->i_mmap_lock);
-        INIT_LIST_HEAD(&mapping->private_list);
-        spin_lock_init(&mapping->private_lock);
-        INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
-        INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
 }
 /**
@@ -144,7 +137,7 @@ static int __init init_gfs2_fs(void)
        error = -ENOMEM;
        gfs_recovery_wq = alloc_workqueue("gfs_recovery",
-                                          WQ_MEM_RECLAIM | WQ_FREEZEABLE, 0);
+                                          WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
        if (!gfs_recovery_wq)
                goto fail_wq;
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index afa66aaa2237..b4d70b13be92 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -238,46 +238,22 @@ static int hfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 }
 /*
- * hfs_unlink()
+ * hfs_remove()
 *
- * This is the unlink() entry in the inode_operations structure for
+ * This serves as both unlink() and rmdir() in the inode_operations
- * regular HFS directories.  The purpose is to delete an existing
+ * structure for regular HFS directories.  The purpose is to delete
- * file, given the inode for the parent directory and the name
+ * an existing child, given the inode for the parent directory and
- * (and its length) of the existing file.
+ * the name (and its length) of the existing directory.
- */
-static int hfs_unlink(struct inode *dir, struct dentry *dentry)
-{
-        struct inode *inode;
-        int res;
-        inode = dentry->d_inode;
-        res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
-        if (res)
-                return res;
-        drop_nlink(inode);
-        hfs_delete_inode(inode);
-        inode->i_ctime = CURRENT_TIME_SEC;
-        mark_inode_dirty(inode);
-        return res;
-}
-/*
- * hfs_rmdir()
 *
- * This is the rmdir() entry in the inode_operations structure for
+ * HFS does not have hardlinks, so both rmdir and unlink set the
- * regular HFS directories.  The purpose is to delete an existing
+ * link count to 0.  The only difference is the emptiness check.
- * directory, given the inode for the parent directory and the name
- * (and its length) of the existing directory.
 */
-static int hfs_rmdir(struct inode *dir, struct dentry *dentry)
+static int hfs_remove(struct inode *dir, struct dentry *dentry)
 {
-        struct inode *inode;
+        struct inode *inode = dentry->d_inode;
        int res;
-        inode = dentry->d_inode;
+        if (S_ISDIR(inode->i_mode) && inode->i_size != 2)
-        if (inode->i_size != 2)
                return -ENOTEMPTY;
        res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
        if (res)
@@ -307,7 +283,7 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        /* Unlink destination if it already exists */
        if (new_dentry->d_inode) {
-                res = hfs_unlink(new_dir, new_dentry);
+                res = hfs_remove(new_dir, new_dentry);
                if (res)
                        return res;
        }
@@ -332,9 +308,9 @@ const struct file_operations hfs_dir_operations = {
 const struct inode_operations hfs_dir_inode_operations = {
        .create         = hfs_create,
        .lookup         = hfs_lookup,
-        .unlink         = hfs_unlink,
+        .unlink         = hfs_remove,
        .mkdir          = hfs_mkdir,
-        .rmdir          = hfs_rmdir,
+        .rmdir          = hfs_remove,
        .rename         = hfs_rename,
        .setattr        = hfs_inode_setattr,
 };
diff --git a/fs/inode.c b/fs/inode.c
index da85e56378f3..0647d80accf6 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -295,6 +295,20 @@ static void destroy_inode(struct inode *inode)
                call_rcu(&inode->i_rcu, i_callback);
 }
+/*
+ * address_space_init_once - one-time setup of an address_space
+ * @mapping: the address_space to initialise
+ *
+ * Zeroes the whole structure and then initialises the members that need
+ * more than zeroing: the page radix tree, the tree_lock, i_mmap_lock and
+ * private_lock spinlocks, the private_list and i_mmap_nonlinear lists,
+ * the i_mmap prio tree and unmap_mutex.
+ *
+ * inode_init_once() calls this for inode->i_data; the symbol is exported
+ * so that other code can initialise an address_space the same way.
+ */
+void address_space_init_once(struct address_space *mapping)
+{
+        memset(mapping, 0, sizeof(*mapping));
+        INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
+        spin_lock_init(&mapping->tree_lock);
+        spin_lock_init(&mapping->i_mmap_lock);
+        INIT_LIST_HEAD(&mapping->private_list);
+        spin_lock_init(&mapping->private_lock);
+        INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
+        INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+        mutex_init(&mapping->unmap_mutex);
+}
+EXPORT_SYMBOL(address_space_init_once);
 /*
 * These are initializations that only need to be done
 * once, because the fields are idempotent across use
@@ -308,13 +322,7 @@ void inode_init_once(struct inode *inode)
        INIT_LIST_HEAD(&inode->i_devices);
        INIT_LIST_HEAD(&inode->i_wb_list);
        INIT_LIST_HEAD(&inode->i_lru);
-        INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
+        address_space_init_once(&inode->i_data);
-        spin_lock_init(&inode->i_data.tree_lock);
-        spin_lock_init(&inode->i_data.i_mmap_lock);
-        INIT_LIST_HEAD(&inode->i_data.private_list);
-        spin_lock_init(&inode->i_data.private_lock);
-        INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
-        INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
        i_size_ordered_init(inode);
 #ifdef CONFIG_FSNOTIFY
        INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
@@ -540,11 +548,14 @@ void evict_inodes(struct super_block *sb)
 /**
 * invalidate_inodes    - attempt to free all inodes on a superblock
 * @sb:         superblock to operate on
+ * @kill_dirty: flag to guide handling of dirty inodes
 *
 * Attempts to free all inodes for a given superblock.  If there were any
 * busy inodes return a non-zero value, else zero.
+ * If @kill_dirty is set, discard dirty inodes too, otherwise treat
+ * them as busy.
 */
-int invalidate_inodes(struct super_block *sb)
+int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 {
        int busy = 0;
        struct inode *inode, *next;
@@ -556,6 +567,10 @@ int invalidate_inodes(struct super_block *sb)
        list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
                        continue;
+                if (inode->i_state & I_DIRTY && !kill_dirty) {
+                        busy = 1;
+                        continue;
+                }
                if (atomic_read(&inode->i_count)) {
                        busy = 1;
                        continue;
diff --git a/fs/internal.h b/fs/internal.h
index 0663568b1247..9b976b57d7fe 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -112,4 +112,4 @@ extern void release_open_intent(struct nameidata *);
 */
 extern int get_nr_dirty_inodes(void);
 extern void evict_inodes(struct super_block *);
-extern int invalidate_inodes(struct super_block *);
+extern int invalidate_inodes(struct super_block *, bool);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 9e4686900f18..97e73469b2c4 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -473,7 +473,8 @@ int __jbd2_log_space_left(journal_t *journal)
 }
 /*
- * Called under j_state_lock.  Returns true if a transaction commit was started.
+ * Called with j_state_lock locked for writing.
+ * Returns true if a transaction commit was started.
 */
 int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 {
@@ -520,11 +521,13 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
 {
        transaction_t *transaction = NULL;
        tid_t tid;
+        int need_to_start = 0;
        read_lock(&journal->j_state_lock);
        if (journal->j_running_transaction && !current->journal_info) {
                transaction = journal->j_running_transaction;
-                __jbd2_log_start_commit(journal, transaction->t_tid);
+                if (!tid_geq(journal->j_commit_request, transaction->t_tid))
+                        need_to_start = 1;
        } else if (journal->j_committing_transaction)
                transaction = journal->j_committing_transaction;
@@ -535,6 +538,8 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
        tid = transaction->t_tid;
        read_unlock(&journal->j_state_lock);
+        if (need_to_start)
+                jbd2_log_start_commit(journal, tid);
        jbd2_log_wait_commit(journal, tid);
        return 1;
 }
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index faad2bd787c7..1d1191050f99 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -117,10 +117,10 @@ static inline void update_t_max_wait(transaction_t *transaction)
 static int start_this_handle(journal_t *journal, handle_t *handle,
                             int gfp_mask)
 {
-        transaction_t *transaction;
+        transaction_t   *transaction, *new_transaction = NULL;
-        int needed;
+        tid_t           tid;
-        int nblocks = handle->h_buffer_credits;
+        int             needed, need_to_start;
-        transaction_t *new_transaction = NULL;
+        int             nblocks = handle->h_buffer_credits;
        if (nblocks > journal->j_max_transaction_buffers) {
                printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
@@ -222,8 +222,11 @@ repeat:
                atomic_sub(nblocks, &transaction->t_outstanding_credits);
                prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
                                TASK_UNINTERRUPTIBLE);
-                __jbd2_log_start_commit(journal, transaction->t_tid);
+                tid = transaction->t_tid;
+                need_to_start = !tid_geq(journal->j_commit_request, tid);
                read_unlock(&journal->j_state_lock);
+                if (need_to_start)
+                        jbd2_log_start_commit(journal, tid);
                schedule();
                finish_wait(&journal->j_wait_transaction_locked, &wait);
                goto repeat;
@@ -442,7 +445,8 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
 {
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal = transaction->t_journal;
-        int ret;
+        tid_t           tid;
+        int             need_to_start, ret;
        /* If we've had an abort of any type, don't even think about
         * actually doing the restart! */
@@ -465,8 +469,11 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
        spin_unlock(&transaction->t_handle_lock);
        jbd_debug(2, "restarting handle %p\n", handle);
-        __jbd2_log_start_commit(journal, transaction->t_tid);
+        tid = transaction->t_tid;
+        need_to_start = !tid_geq(journal->j_commit_request, tid);
        read_unlock(&journal->j_state_lock);
+        if (need_to_start)
+                jbd2_log_start_commit(journal, tid);
        lock_map_release(&handle->h_lockdep_map);
        handle->h_buffer_credits = nblocks;
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 81ead850ddb6..5a2b269428a6 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1600,7 +1600,7 @@ out:
 static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-        if (nd->flags & LOOKUP_RCU)
+        if (nd && nd->flags & LOOKUP_RCU)
                return -ECHILD;
        /*
         * This is not negative dentry. Always valid.
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index ce7337ddfdbf..6e6777f1b4b2 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -213,7 +213,6 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
                new_de = minix_find_entry(new_dentry, &new_page);
                if (!new_de)
                        goto out_dir;
-                inode_inc_link_count(old_inode);
                minix_set_link(new_de, new_page, old_inode);
                new_inode->i_ctime = CURRENT_TIME_SEC;
                if (dir_de)
@@ -225,18 +224,15 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
                        if (new_dir->i_nlink >= info->s_link_max)
                                goto out_dir;
                }
-                inode_inc_link_count(old_inode);
                err = minix_add_link(new_dentry, old_inode);
-                if (err) {
+                if (err)
-                        inode_dec_link_count(old_inode);
                        goto out_dir;
-                }
                if (dir_de)
                        inode_inc_link_count(new_dir);
        }
        minix_delete_entry(old_de, old_page);
-        inode_dec_link_count(old_inode);
+        mark_inode_dirty(old_inode);
        if (dir_de) {
                minix_set_link(dir_de, dir_page, new_dir);
diff --git a/fs/namei.c b/fs/namei.c
index 7d77f24d32a9..a4689eb2df28 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -455,14 +455,6 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
        struct fs_struct *fs = current->fs;
        struct dentry *parent = nd->path.dentry;
-        /*
-         * It can be possible to revalidate the dentry that we started
-         * the path walk with. force_reval_path may also revalidate the
-         * dentry already committed to the nameidata.
-         */
-        if (unlikely(parent == dentry))
-                return nameidata_drop_rcu(nd);
        BUG_ON(!(nd->flags & LOOKUP_RCU));
        if (nd->root.mnt) {
                spin_lock(&fs->lock);
@@ -561,39 +553,25 @@ static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd)
 */
 void release_open_intent(struct nameidata *nd)
 {
-        if (nd->intent.open.file->f_path.dentry == NULL)
+        struct file *file = nd->intent.open.file;
-                put_filp(nd->intent.open.file);
-        else
-                fput(nd->intent.open.file);
-}
-/*
- * Call d_revalidate and handle filesystems that request rcu-walk
- * to be dropped. This may be called and return in rcu-walk mode,
- * regardless of success or error. If -ECHILD is returned, the caller
- * must return -ECHILD back up the path walk stack so path walk may
- * be restarted in ref-walk mode.
- */
-static int d_revalidate(struct dentry *dentry, struct nameidata *nd)
-{
-        int status;
-        status = dentry->d_op->d_revalidate(dentry, nd);
+        if (file && !IS_ERR(file)) {
-        if (status == -ECHILD) {
+                if (file->f_path.dentry == NULL)
-                if (nameidata_dentry_drop_rcu(nd, dentry))
+                        put_filp(file);
-                        return status;
+                else
-                status = dentry->d_op->d_revalidate(dentry, nd);
+                        fput(file);
        }
+}
-        return status;
+/*
+ * Call the dentry's ->d_revalidate method and hand its status back
+ * unchanged.  Nothing is retried here: a -ECHILD result is left for the
+ * caller to deal with (see do_revalidate_rcu()).
+ */
+static inline int d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+        return dentry->d_op->d_revalidate(dentry, nd);
 }
-static inline struct dentry *
+static struct dentry *
 do_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-        int status;
+        int status = d_revalidate(dentry, nd);
-        status = d_revalidate(dentry, nd);
        if (unlikely(status <= 0)) {
                /*
                 * The dentry failed validation.
@@ -602,24 +580,39 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
                 * to return a fail status.
                 */
                if (status < 0) {
-                        /* If we're in rcu-walk, we don't have a ref */
+                        dput(dentry);
-                        if (!(nd->flags & LOOKUP_RCU))
-                                dput(dentry);
                        dentry = ERR_PTR(status);
+                } else if (!d_invalidate(dentry)) {
-                } else {
+                        dput(dentry);
-                        /* Don't d_invalidate in rcu-walk mode */
+                        dentry = NULL;
-                        if (nameidata_dentry_drop_rcu_maybe(nd, dentry))
-                                return ERR_PTR(-ECHILD);
-                        if (!d_invalidate(dentry)) {
-                                dput(dentry);
-                                dentry = NULL;
-                        }
                }
        }
        return dentry;
 }
+/*
+ * Revalidate @dentry for a lookup that do_lookup() is running in rcu-walk
+ * mode.
+ *
+ * Returns @dentry when ->d_revalidate reports it valid, NULL when it was
+ * invalid and d_invalidate() returned 0 (do_lookup() then goes to
+ * need_lookup), or an ERR_PTR().  ERR_PTR(-ECHILD) is returned whenever
+ * nameidata_dentry_drop_rcu() fails.  On return the walk may or may not
+ * still be in rcu-walk mode; do_lookup() checks nd->flags afterwards.
+ */
+static inline struct dentry *
+do_revalidate_rcu(struct dentry *dentry, struct nameidata *nd)
+{
+        int status = d_revalidate(dentry, nd);
+        if (likely(status > 0))
+                return dentry;
+        if (status == -ECHILD) {
+                /*
+                 * ->d_revalidate asked for rcu-walk to be dropped: drop it
+                 * and redo the check through do_revalidate().
+                 */
+                if (nameidata_dentry_drop_rcu(nd, dentry))
+                        return ERR_PTR(-ECHILD);
+                return do_revalidate(dentry, nd);
+        }
+        /*
+         * Any other error is passed up as-is.  Unlike do_revalidate() there
+         * is no dput() here -- presumably because rcu-walk holds no
+         * reference on the dentry at this point.
+         */
+        if (status < 0)
+                return ERR_PTR(status);
+        /* Don't d_invalidate in rcu-walk mode */
+        if (nameidata_dentry_drop_rcu(nd, dentry))
+                return ERR_PTR(-ECHILD);
+        if (!d_invalidate(dentry)) {
+                dput(dentry);
+                dentry = NULL;
+        }
+        return dentry;
+}
 static inline int need_reval_dot(struct dentry *dentry)
 {
        if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
@@ -664,9 +657,6 @@ force_reval_path(struct path *path, struct nameidata *nd)
                return 0;
        if (!status) {
-                /* Don't d_invalidate in rcu-walk mode */
-                if (nameidata_drop_rcu(nd))
-                        return -ECHILD;
                d_invalidate(dentry);
                status = -ESTALE;
        }
@@ -773,6 +763,8 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
        int error;
        struct dentry *dentry = link->dentry;
+        BUG_ON(nd->flags & LOOKUP_RCU);
        touch_atime(link->mnt, dentry);
        nd_set_link(nd, NULL);
@@ -803,10 +795,16 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
 * Without that kind of total limit, nasty chains of consecutive
 * symlinks can cause almost arbitrarily long lookups. 
 */
-static inline int do_follow_link(struct path *path, struct nameidata *nd)
+static inline int do_follow_link(struct inode *inode, struct path *path, struct nameidata *nd)
 {
        void *cookie;
        int err = -ELOOP;
+        /* We drop rcu-walk here */
+        if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
+                return -ECHILD;
+        BUG_ON(inode != path->dentry->d_inode);
        if (current->link_count >= MAX_NESTED_LINKS)
                goto loop;
        if (current->total_link_count >= 40)
@@ -1251,9 +1249,15 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
                        return -ECHILD;
                nd->seq = seq;
-                if (dentry->d_flags & DCACHE_OP_REVALIDATE)
+                if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
-                        goto need_revalidate;
+                        dentry = do_revalidate_rcu(dentry, nd);
-done2:
+                        if (!dentry)
+                                goto need_lookup;
+                        if (IS_ERR(dentry))
+                                goto fail;
+                        if (!(nd->flags & LOOKUP_RCU))
+                                goto done;
+                }
                path->mnt = mnt;
                path->dentry = dentry;
                if (likely(__follow_mount_rcu(nd, path, inode, false)))
@@ -1266,8 +1270,13 @@ done2:
        if (!dentry)
                goto need_lookup;
 found:
-        if (dentry->d_flags & DCACHE_OP_REVALIDATE)
+        if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
-                goto need_revalidate;
+                dentry = do_revalidate(dentry, nd);
+                if (!dentry)
+                        goto need_lookup;
+                if (IS_ERR(dentry))
+                        goto fail;
+        }
 done:
        path->mnt = mnt;
        path->dentry = dentry;
@@ -1309,16 +1318,6 @@ need_lookup:
        mutex_unlock(&dir->i_mutex);
        goto found;
-need_revalidate:
-        dentry = do_revalidate(dentry, nd);
-        if (!dentry)
-                goto need_lookup;
-        if (IS_ERR(dentry))
-                goto fail;
-        if (nd->flags & LOOKUP_RCU)
-                goto done2;
-        goto done;
 fail:
        return PTR_ERR(dentry);
 }
@@ -1415,11 +1414,7 @@ exec_again:
                        goto out_dput;
                if (inode->i_op->follow_link) {
-                        /* We commonly drop rcu-walk here */
+                        err = do_follow_link(inode, &next, nd);
-                        if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
-                                return -ECHILD;
-                        BUG_ON(inode != next.dentry->d_inode);
-                        err = do_follow_link(&next, nd);
                        if (err)
                                goto return_err;
                        nd->inode = nd->path.dentry->d_inode;
@@ -1463,10 +1458,7 @@ last_component:
                        break;
                if (inode && unlikely(inode->i_op->follow_link) &&
                    (lookup_flags & LOOKUP_FOLLOW)) {
-                        if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
+                        err = do_follow_link(inode, &next, nd);
-                                return -ECHILD;
-                        BUG_ON(inode != next.dentry->d_inode);
-                        err = do_follow_link(&next, nd);
                        if (err)
                                goto return_err;
                        nd->inode = nd->path.dentry->d_inode;
@@ -1500,12 +1492,15 @@ return_reval:
                 * We may need to check the cached dentry for staleness.
                 */
                if (need_reval_dot(nd->path.dentry)) {
+                        if (nameidata_drop_rcu_last_maybe(nd))
+                                return -ECHILD;
                        /* Note: we do not d_invalidate() */
                        err = d_revalidate(nd->path.dentry, nd);
                        if (!err)
                                err = -ESTALE;
                        if (err < 0)
                                break;
+                        return 0;
                }
 return_base:
                if (nameidata_drop_rcu_last_maybe(nd))
@@ -1551,6 +1546,7 @@ static int path_walk(const char *name, struct nameidata *nd)
                /* nd->path had been dropped */
                current->total_link_count = 0;
                nd->path = save;
+                nd->inode = save.dentry->d_inode;
                path_get(&nd->path);
                nd->flags |= LOOKUP_REVAL;
                result = link_path_walk(name, nd);
@@ -2265,8 +2261,6 @@ static struct file *finish_open(struct nameidata *nd,
        return filp;
 exit:
-        if (!IS_ERR(nd->intent.open.file))
-                release_open_intent(nd);
        path_put(&nd->path);
        return ERR_PTR(error);
 }
@@ -2389,8 +2383,6 @@ exit_mutex_unlock:
 exit_dput:
        path_put_conditional(path, nd);
 exit:
-        if (!IS_ERR(nd->intent.open.file))
-                release_open_intent(nd);
        path_put(&nd->path);
        return ERR_PTR(error);
 }
@@ -2464,21 +2456,29 @@ struct file *do_filp_open(int dfd, const char *pathname,
        /* !O_CREAT, simple open */
        error = do_path_lookup(dfd, pathname, flags, &nd);
        if (unlikely(error))
-                goto out_filp;
+                goto out_filp2;
        error = -ELOOP;
        if (!(nd.flags & LOOKUP_FOLLOW)) {
                if (nd.inode->i_op->follow_link)
-                        goto out_path;
+                        goto out_path2;
        }
        error = -ENOTDIR;
        if (nd.flags & LOOKUP_DIRECTORY) {
                if (!nd.inode->i_op->lookup)
-                        goto out_path;
+                        goto out_path2;
        }
        audit_inode(pathname, nd.path.dentry);
        filp = finish_open(&nd, open_flag, acc_mode);
+out2:
+        release_open_intent(&nd);
        return filp;
+out_path2:
+        path_put(&nd.path);
+out_filp2:
+        filp = ERR_PTR(error);
+        goto out2;
 creat:
        /* OK, have to create the file. Find the parent. */
        error = path_init_rcu(dfd, pathname,
@@ -2553,6 +2553,7 @@ out:
                path_put(&nd.root);
        if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL))
                goto reval;
+        release_open_intent(&nd);
        return filp;
 exit_dput:
@@ -2560,8 +2561,6 @@ exit_dput:
 out_path:
        path_put(&nd.path);
 out_filp:
-        if (!IS_ERR(nd.intent.open.file))
-                release_open_intent(&nd);
        filp = ERR_PTR(error);
        goto out;
 }
diff --git a/fs/namespace.c b/fs/namespace.c
index 7b0b95371696..d1edf26025dc 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1244,7 +1244,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
                 */
                br_write_lock(vfsmount_lock);
                if (mnt_get_count(mnt) != 2) {
-                        br_write_lock(vfsmount_lock);
+                        br_write_unlock(vfsmount_lock);
                        return -EBUSY;
                }
                br_write_unlock(vfsmount_lock);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 1cc600e77bb4..2f8e61816d75 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -37,6 +37,7 @@
 #include <linux/inet.h>
 #include <linux/nfs_xdr.h>
 #include <linux/slab.h>
+#include <linux/compat.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -89,7 +90,11 @@ int nfs_wait_bit_killable(void *word)
 */
 u64 nfs_compat_user_ino64(u64 fileid)
 {
-        int ino;
+#ifdef CONFIG_COMPAT
+        compat_ulong_t ino;
+#else   
+        unsigned long ino;
+#endif
        if (enable_ino64)
                return fileid;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 7a7474073148..1be36cf65bfc 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -298,6 +298,11 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
 #if defined(CONFIG_NFS_V4_1)
 struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
 struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
+extern void nfs4_schedule_session_recovery(struct nfs4_session *);
+#else
+/* No-op stand-in used when CONFIG_NFS_V4_1 is not defined. */
+static inline void nfs4_schedule_session_recovery(struct nfs4_session *session)
+{
+}
 #endif /* CONFIG_NFS_V4_1 */
 extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
@@ -307,10 +312,9 @@ extern void nfs4_put_open_state(struct nfs4_state *);
 extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t);
 extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t);
 extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
-extern void nfs4_schedule_state_recovery(struct nfs_client *);
+extern void nfs4_schedule_lease_recovery(struct nfs_client *);
 extern void nfs4_schedule_state_manager(struct nfs_client *);
-extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state);
+extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
-extern int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state);
 extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
 extern void nfs41_handle_recall_slot(struct nfs_client *clp);
 extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index f5c9b125e8cc..b73c34375f60 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -219,6 +219,10 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
                goto out_err;
        }
        buf = kmalloc(rlen + 1, GFP_KERNEL);
+        if (!buf) {
+                dprintk("%s: Not enough memory\n", __func__);
+                goto out_err;
+        }
        buf[rlen] = '\0';
        memcpy(buf, r_addr, rlen);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 78936a8f40ab..0a07e353a961 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -256,12 +256,13 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
                case -NFS4ERR_OPENMODE:
                        if (state == NULL)
                                break;
-                        nfs4_state_mark_reclaim_nograce(clp, state);
+                        nfs4_schedule_stateid_recovery(server, state);
-                        goto do_state_recovery;
+                        goto wait_on_recovery;
                case -NFS4ERR_STALE_STATEID:
                case -NFS4ERR_STALE_CLIENTID:
                case -NFS4ERR_EXPIRED:
-                        goto do_state_recovery;
+                        nfs4_schedule_lease_recovery(clp);
+                        goto wait_on_recovery;
 #if defined(CONFIG_NFS_V4_1)
                case -NFS4ERR_BADSESSION:
                case -NFS4ERR_BADSLOT:
@@ -272,7 +273,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
                case -NFS4ERR_SEQ_MISORDERED:
                        dprintk("%s ERROR: %d Reset session\n", __func__,
                                errorcode);
-                        nfs4_schedule_state_recovery(clp);
+                        nfs4_schedule_session_recovery(clp->cl_session);
                        exception->retry = 1;
                        break;
 #endif /* defined(CONFIG_NFS_V4_1) */
@@ -295,8 +296,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
        }
        /* We failed to handle the error */
        return nfs4_map_errors(ret);
-do_state_recovery:
+wait_on_recovery:
-        nfs4_schedule_state_recovery(clp);
        ret = nfs4_wait_clnt_recover(clp);
        if (ret == 0)
                exception->retry = 1;
@@ -435,8 +435,8 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *
                clp = res->sr_session->clp;
                do_renew_lease(clp, timestamp);
                /* Check sequence flags */
-                if (atomic_read(&clp->cl_count) > 1)
+                if (res->sr_status_flags != 0)
-                        nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
+                        nfs4_schedule_lease_recovery(clp);
                break;
        case -NFS4ERR_DELAY:
                /* The server detected a resend of the RPC call and
@@ -1255,14 +1255,13 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
                        case -NFS4ERR_BAD_HIGH_SLOT:
                        case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
                        case -NFS4ERR_DEADSESSION:
-                                nfs4_schedule_state_recovery(
+                                nfs4_schedule_session_recovery(server->nfs_client->cl_session);
-                                        server->nfs_client);
                                goto out;
                        case -NFS4ERR_STALE_CLIENTID:
                        case -NFS4ERR_STALE_STATEID:
                        case -NFS4ERR_EXPIRED:
                                /* Don't recall a delegation if it was lost */
-                                nfs4_schedule_state_recovery(server->nfs_client);
+                                nfs4_schedule_lease_recovery(server->nfs_client);
                                goto out;
                        case -ERESTARTSYS:
                                /*
@@ -1271,7 +1270,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
                                 */
                        case -NFS4ERR_ADMIN_REVOKED:
                        case -NFS4ERR_BAD_STATEID:
-                                nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
+                                nfs4_schedule_stateid_recovery(server, state);
                        case -EKEYEXPIRED:
                                /*
                                 * User RPCSEC_GSS context has expired.
@@ -1587,7 +1586,7 @@ static int nfs4_recover_expired_lease(struct nfs_server *server)
                if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
                    !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
                        break;
-                nfs4_schedule_state_recovery(clp);
+                nfs4_schedule_state_manager(clp);
                ret = -EIO;
        }
        return ret;
@@ -3178,7 +3177,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata)
        if (task->tk_status < 0) {
                /* Unless we're shutting down, schedule state recovery! */
                if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0)
-                        nfs4_schedule_state_recovery(clp);
+                        nfs4_schedule_lease_recovery(clp);
                return;
        }
        do_renew_lease(clp, timestamp);
@@ -3252,6 +3251,35 @@ static void buf_to_pages(const void *buf, size_t buflen,
        }
 }
+/*
+ * Copy @buflen bytes from @buf into newly allocated pages, filling at most
+ * PAGE_CACHE_SIZE bytes per page, and store the page pointers in @pages.
+ *
+ * Returns the number of pages allocated, or -ENOMEM if an allocation
+ * failed, in which case every page allocated so far is freed again.
+ * On success the caller owns the pages and must release them.
+ *
+ * Notes:
+ *  - @pgbase is not written by this function.
+ *  - The loop body runs at least once, so a page is allocated even when
+ *    @buflen is 0.
+ *  - The number of entries written to @pages is not bounded here; the
+ *    caller has to make sure the array is large enough for @buflen.
+ */
+static int buf_to_pages_noslab(const void *buf, size_t buflen,
+                struct page **pages, unsigned int *pgbase)
+{
+        struct page *newpage, **spages;
+        int rc = 0;
+        size_t len;
+        /* Remember the start of the array so the error path can unwind. */
+        spages = pages;
+        do {
+                len = min_t(size_t, PAGE_CACHE_SIZE, buflen);
+                newpage = alloc_page(GFP_KERNEL);
+                if (newpage == NULL)
+                        goto unwind;
+                memcpy(page_address(newpage), buf, len);
+                buf += len;
+                buflen -= len;
+                *pages++ = newpage;
+                rc++;
+        } while (buflen != 0);
+        return rc;
+unwind:
+        /* Free the rc pages that were successfully allocated. */
+        for(; rc > 0; rc--)
+                __free_page(spages[rc-1]);
+        return -ENOMEM;
+}
 struct nfs4_cached_acl {
        int cached;
        size_t len;
@@ -3420,13 +3448,23 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
                .rpc_argp       = &arg,
                .rpc_resp       = &res,
        };
-        int ret;
+        int ret, i;
        if (!nfs4_server_supports_acls(server))
                return -EOPNOTSUPP;
+        i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
+        if (i < 0)
+                return i;
        nfs_inode_return_delegation(inode);
-        buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
        ret = nfs4_call_sync(server, &msg, &arg, &res, 1);
+        /*
+         * Free each page after tx, so the only ref left is
+         * held by the network stack
+         */
+        for (; i > 0; i--)
+                put_page(pages[i-1]);
        /*
         * Acl update can result in inode attribute update.
         * so mark the attribute cache invalid.
@@ -3464,12 +3502,13 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
                case -NFS4ERR_OPENMODE:
                        if (state == NULL)
                                break;
-                        nfs4_state_mark_reclaim_nograce(clp, state);
+                        nfs4_schedule_stateid_recovery(server, state);
-                        goto do_state_recovery;
+                        goto wait_on_recovery;
                case -NFS4ERR_STALE_STATEID:
                case -NFS4ERR_STALE_CLIENTID:
                case -NFS4ERR_EXPIRED:
-                        goto do_state_recovery;
+                        nfs4_schedule_lease_recovery(clp);
+                        goto wait_on_recovery;
 #if defined(CONFIG_NFS_V4_1)
                case -NFS4ERR_BADSESSION:
                case -NFS4ERR_BADSLOT:
@@ -3480,7 +3519,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
                case -NFS4ERR_SEQ_MISORDERED:
                        dprintk("%s ERROR %d, Reset session\n", __func__,
                                task->tk_status);
-                        nfs4_schedule_state_recovery(clp);
+                        nfs4_schedule_session_recovery(clp->cl_session);
                        task->tk_status = 0;
                        return -EAGAIN;
 #endif /* CONFIG_NFS_V4_1 */
@@ -3497,9 +3536,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
        }
        task->tk_status = nfs4_map_errors(task->tk_status);
        return 0;
-do_state_recovery:
+wait_on_recovery:
        rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
-        nfs4_schedule_state_recovery(clp);
        if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
                rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
        task->tk_status = 0;
@@ -4110,7 +4148,7 @@ static void nfs4_lock_release(void *calldata)
                task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp,
                                data->arg.lock_seqid);
                if (!IS_ERR(task))
-                        rpc_put_task(task);
+                        rpc_put_task_async(task);
                dprintk("%s: cancelling lock!\n", __func__);
        } else
                nfs_free_seqid(data->arg.lock_seqid);
@@ -4134,23 +4172,18 @@ static const struct rpc_call_ops nfs4_recover_lock_ops = {
+/*
+ * React to an error from a lock request.
+ *
+ * ADMIN_REVOKED / BAD_STATEID: clear NFS_SEQID_CONFIRMED on the lock
+ * seqid and, if this was a new lock owner or the lock state was already
+ * initialised, schedule recovery of the owning stateid.
+ * STALE_STATEID: clear NFS_SEQID_CONFIRMED, then treat like EXPIRED.
+ * EXPIRED: schedule lease recovery for the client.
+ * Any other error is left untouched.
+ */
 static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error)
 {
-        struct nfs_client *clp = server->nfs_client;
-        struct nfs4_state *state = lsp->ls_state;
        switch (error) {
        case -NFS4ERR_ADMIN_REVOKED:
        case -NFS4ERR_BAD_STATEID:
-        case -NFS4ERR_EXPIRED:
+                lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
                if (new_lock_owner != 0 ||
                   (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
-                        nfs4_state_mark_reclaim_nograce(clp, state);
+                        nfs4_schedule_stateid_recovery(server, lsp->ls_state);
-                lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
                break;
        case -NFS4ERR_STALE_STATEID:
-                if (new_lock_owner != 0 ||
-                    (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
-                        nfs4_state_mark_reclaim_reboot(clp, state);
                lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
+                /* fall through: no break, so lease recovery is scheduled too */
+        case -NFS4ERR_EXPIRED:
+                nfs4_schedule_lease_recovery(server->nfs_client);
        };
 }
@@ -4366,12 +4399,14 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
                        case -NFS4ERR_EXPIRED:
                        case -NFS4ERR_STALE_CLIENTID:
                        case -NFS4ERR_STALE_STATEID:
+                                nfs4_schedule_lease_recovery(server->nfs_client);
+                                goto out;
                        case -NFS4ERR_BADSESSION:
                        case -NFS4ERR_BADSLOT:
                        case -NFS4ERR_BAD_HIGH_SLOT:
                        case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
                        case -NFS4ERR_DEADSESSION:
-                                nfs4_schedule_state_recovery(server->nfs_client);
+                                nfs4_schedule_session_recovery(server->nfs_client->cl_session);
                                goto out;
                        case -ERESTARTSYS:
                                /*
@@ -4381,7 +4416,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
                        case -NFS4ERR_ADMIN_REVOKED:
                        case -NFS4ERR_BAD_STATEID:
                        case -NFS4ERR_OPENMODE:
-                                nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
+                                nfs4_schedule_stateid_recovery(server, state);
                                err = 0;
                                goto out;
                        case -EKEYEXPIRED:
@@ -4988,10 +5023,20 @@ int nfs4_proc_create_session(struct nfs_client *clp)
        int status;
        unsigned *ptr;
        struct nfs4_session *session = clp->cl_session;
+        long timeout = 0;
+        int err;
        dprintk("--> %s clp=%p session=%p\n", __func__, clp, session);
-        status = _nfs4_proc_create_session(clp);
+        do {
+                status = _nfs4_proc_create_session(clp);
+                if (status == -NFS4ERR_DELAY) {
+                        err = nfs4_delay(clp->cl_rpcclient, &timeout);
+                        if (err)
+                                status = err;
+                }
+        } while (status == -NFS4ERR_DELAY);
        if (status)
                goto out;
@@ -5100,7 +5145,7 @@ static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client
                rpc_delay(task, NFS4_POLL_RETRY_MAX);
                return -EAGAIN;
        default:
-                nfs4_schedule_state_recovery(clp);
+                nfs4_schedule_lease_recovery(clp);
        }
        return 0;
 }
@@ -5187,7 +5232,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr
        if (IS_ERR(task))
                ret = PTR_ERR(task);
        else
-                rpc_put_task(task);
+                rpc_put_task_async(task);
        dprintk("<-- %s status=%d\n", __func__, ret);
        return ret;
 }
@@ -5203,8 +5248,13 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
                goto out;
        }
        ret = rpc_wait_for_completion_task(task);
-        if (!ret)
+        if (!ret) {
+                struct nfs4_sequence_res *res = task->tk_msg.rpc_resp;
+                if (task->tk_status == 0)
+                        nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
                ret = task->tk_status;
+        }
        rpc_put_task(task);
 out:
        dprintk("<-- %s status=%d\n", __func__, ret);
@@ -5241,7 +5291,7 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf
                rpc_delay(task, NFS4_POLL_RETRY_MAX);
                return -EAGAIN;
        default:
-                nfs4_schedule_state_recovery(clp);
+                nfs4_schedule_lease_recovery(clp);
        }
        return 0;
 }
@@ -5309,6 +5359,9 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
                status = PTR_ERR(task);
                goto out;
        }
+        status = nfs4_wait_for_completion_rpc_task(task);
+        if (status == 0)
+                status = task->tk_status;
        rpc_put_task(task);
        return 0;
 out:
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index e6742b57a04c..0592288f9f06 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1007,9 +1007,9 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
 }
 /*
- * Schedule a state recovery attempt
+ * Schedule a lease recovery attempt
 */
-void nfs4_schedule_state_recovery(struct nfs_client *clp)
+void nfs4_schedule_lease_recovery(struct nfs_client *clp)
 {
        if (!clp)
                return;
@@ -1018,7 +1018,7 @@ void nfs4_schedule_state_recovery(struct nfs_client *clp)
        nfs4_schedule_state_manager(clp);
 }
-int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
+static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
 {
        set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
@@ -1032,7 +1032,7 @@ int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *st
        return 1;
 }
-int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
+static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
 {
        set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
        clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
@@ -1041,6 +1041,14 @@ int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *s
        return 1;
 }
+/*
+ * Schedule recovery of one piece of state: mark @state for no-grace
+ * reclaim on the client that owns @server, then schedule that client's
+ * state manager. The mark helper's return value is not used.
+ */
+void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state)
+{
+        struct nfs_client *owner = server->nfs_client;
+        nfs4_state_mark_reclaim_nograce(owner, state);
+        nfs4_schedule_state_manager(owner);
+}
 static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops)
 {
        struct inode *inode = state->inode;
@@ -1436,10 +1444,15 @@ static int nfs4_reclaim_lease(struct nfs_client *clp)
 }
 #ifdef CONFIG_NFS_V4_1
+/*
+ * Schedule recovery for @session. This simply requests lease recovery
+ * on the client the session points at (session->clp).
+ */
+void nfs4_schedule_session_recovery(struct nfs4_session *session)
+{
+        struct nfs_client *clp = session->clp;
+        nfs4_schedule_lease_recovery(clp);
+}
+/*
+ * Set NFS4CLNT_RECALL_SLOT in clp->cl_state and schedule the state
+ * manager directly, rather than going through the lease-recovery helper.
+ */
 void nfs41_handle_recall_slot(struct nfs_client *clp)
 {
        set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
-        nfs4_schedule_state_recovery(clp);
+        nfs4_schedule_state_manager(clp);
 }
 static void nfs4_reset_all_state(struct nfs_client *clp)
@@ -1447,7 +1460,7 @@ static void nfs4_reset_all_state(struct nfs_client *clp)
        if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
                clp->cl_boot_time = CURRENT_TIME;
                nfs4_state_start_reclaim_nograce(clp);
-                nfs4_schedule_state_recovery(clp);
+                nfs4_schedule_state_manager(clp);
        }
 }
@@ -1455,7 +1468,7 @@ static void nfs41_handle_server_reboot(struct nfs_client *clp)
 {
        if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
                nfs4_state_start_reclaim_reboot(clp);
-                nfs4_schedule_state_recovery(clp);
+                nfs4_schedule_state_manager(clp);
        }
 }
@@ -1475,7 +1488,7 @@ static void nfs41_handle_cb_path_down(struct nfs_client *clp)
 {
        nfs_expire_all_delegations(clp);
        if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0)
-                nfs4_schedule_state_recovery(clp);
+                nfs4_schedule_state_manager(clp);
 }
 void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4e2c168b6ee9..94d50e86a124 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1660,7 +1660,7 @@ static void encode_create_session(struct xdr_stream *xdr,
        p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12);
        *p++ = cpu_to_be32(OP_CREATE_SESSION);
-        p = xdr_encode_hyper(p, clp->cl_ex_clid);
+        p = xdr_encode_hyper(p, clp->cl_clientid);
        *p++ = cpu_to_be32(clp->cl_seqid);                      /*Sequence id */
        *p++ = cpu_to_be32(args->flags);                        /*flags */
@@ -4694,7 +4694,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
        p = xdr_inline_decode(xdr, 8);
        if (unlikely(!p))
                goto out_overflow;
-        xdr_decode_hyper(p, &clp->cl_ex_clid);
+        xdr_decode_hyper(p, &clp->cl_clientid);
        p = xdr_inline_decode(xdr, 12);
        if (unlikely(!p))
                goto out_overflow;
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 903908a20023..c541093a5bf2 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -86,11 +86,14 @@
 /* Default path we try to mount. "%s" gets replaced by our IP address */
 #define NFS_ROOT                "/tftpboot/%s"
+/* Default NFSROOT mount options. */
+#define NFS_DEF_OPTIONS         "udp"
 /* Parameters passed from the kernel command line */
 static char nfs_root_parms[256] __initdata = "";
 /* Text-based mount options passed to super.c */
-static char nfs_root_options[256] __initdata = "";
+static char nfs_root_options[256] __initdata = NFS_DEF_OPTIONS;
 /* Address of NFS server */
 static __be32 servaddr __initdata = htonl(INADDR_NONE);
@@ -160,8 +163,14 @@ static int __init root_nfs_copy(char *dest, const char *src,
 }
 static int __init root_nfs_cat(char *dest, const char *src,
-                                  const size_t destlen)
+                               const size_t destlen)
 {
+        size_t len = strlen(dest);
+        if (len && dest[len - 1] != ',')
+                if (strlcat(dest, ",", destlen) > destlen)
+                        return -1;
        if (strlcat(dest, src, destlen) > destlen)
                return -1;
        return 0;
@@ -194,16 +203,6 @@ static int __init root_nfs_parse_options(char *incoming, char *exppath,
                if (root_nfs_cat(nfs_root_options, incoming,
                                                sizeof(nfs_root_options)))
                        return -1;
-        /*
-         * Possibly prepare for more options to be appended
-         */
-        if (nfs_root_options[0] != '\0' &&
-            nfs_root_options[strlen(nfs_root_options)] != ',')
-                if (root_nfs_cat(nfs_root_options, ",",
-                                                sizeof(nfs_root_options)))
-                        return -1;
        return 0;
 }
@@ -217,7 +216,7 @@ static int __init root_nfs_parse_options(char *incoming, char *exppath,
 */
 static int __init root_nfs_data(char *cmdline)
 {
-        char addr_option[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1];
+        char mand_options[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1];
        int len, retval = -1;
        char *tmp = NULL;
        const size_t tmplen = sizeof(nfs_export_path);
@@ -244,9 +243,9 @@ static int __init root_nfs_data(char *cmdline)
         * Append mandatory options for nfsroot so they override
         * what has come before
         */
-        snprintf(addr_option, sizeof(addr_option), "nolock,addr=%pI4",
+        snprintf(mand_options, sizeof(mand_options), "nolock,addr=%pI4",
                        &servaddr);
-        if (root_nfs_cat(nfs_root_options, addr_option,
+        if (root_nfs_cat(nfs_root_options, mand_options,
                                                sizeof(nfs_root_options)))
                goto out_optionstoolong;
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index e313a51acdd1..6481d537d69d 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -180,7 +180,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
        task_setup_data.rpc_client = NFS_CLIENT(dir);
        task = rpc_run_task(&task_setup_data);
        if (!IS_ERR(task))
-                rpc_put_task(task);
+                rpc_put_task_async(task);
        return 1;
 }
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c8278f4046cb..42b92d7a9cc4 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1292,6 +1292,8 @@ static int nfs_commit_rpcsetup(struct list_head *head,
        task = rpc_run_task(&task_setup_data);
        if (IS_ERR(task))
                return PTR_ERR(task);
+        if (how & FLUSH_SYNC)
+                rpc_wait_for_completion_task(task);
        rpc_put_task(task);
        return 0;
 }
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 3be975e18919..02eb4edf0ece 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -432,7 +432,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
         * If the server returns different values for sessionID, slotID or
         * sequence number, the server is looney tunes.
         */
-        p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4);
+        p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4);
        if (unlikely(p == NULL))
                goto out_overflow;
        memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
@@ -484,7 +484,7 @@ static int decode_cb_sequence4res(struct xdr_stream *xdr,
 out:
        return status;
 out_default:
-        return nfs_cb_stat_to_errno(status);
+        return nfs_cb_stat_to_errno(nfserr);
 }
 /*
@@ -564,11 +564,9 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
        if (unlikely(status))
                goto out;
        if (unlikely(nfserr != NFS4_OK))
-                goto out_default;
+                status = nfs_cb_stat_to_errno(nfserr);
 out:
        return status;
-out_default:
-        return nfs_cb_stat_to_errno(status);
 }
 /*
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d98d0213285d..7b566ec14e18 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -230,9 +230,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
        dp->dl_client = clp;
        get_nfs4_file(fp);
        dp->dl_file = fp;
-        dp->dl_vfs_file = find_readable_file(fp);
-        get_file(dp->dl_vfs_file);
-        dp->dl_flock = NULL;
        dp->dl_type = type;
        dp->dl_stateid.si_boot = boot_time;
        dp->dl_stateid.si_stateownerid = current_delegid++;
@@ -241,8 +238,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
        fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
        dp->dl_time = 0;
        atomic_set(&dp->dl_count, 1);
-        list_add(&dp->dl_perfile, &fp->fi_delegations);
-        list_add(&dp->dl_perclnt, &clp->cl_delegations);
        INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc);
        return dp;
 }
@@ -253,36 +248,30 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
        if (atomic_dec_and_test(&dp->dl_count)) {
                dprintk("NFSD: freeing dp %p\n",dp);
                put_nfs4_file(dp->dl_file);
-                fput(dp->dl_vfs_file);
                kmem_cache_free(deleg_slab, dp);
                num_delegations--;
        }
 }
-/* Remove the associated file_lock first, then remove the delegation.
- * lease_modify() is called to remove the FS_LEASE file_lock from
- * the i_flock list, eventually calling nfsd's lock_manager
- * fl_release_callback.
- */
-static void
-nfs4_close_delegation(struct nfs4_delegation *dp)
+/*
+ * Drop one delegation's share of the lease held on @fp.
+ *
+ * fi_delegees counts the delegations sharing the lease. When the last
+ * one goes away the lease is removed with vfs_setlease(F_UNLCK), the
+ * file reference taken by get_file() in nfs4_setlease() is released,
+ * and both fi_lease and fi_deleg_file are cleared.
+ */
+static void nfs4_put_deleg_lease(struct nfs4_file *fp)
 {
-        dprintk("NFSD: close_delegation dp %p\n",dp);
-        /* XXX: do we even need this check?: */
-        if (dp->dl_flock)
-                vfs_setlease(dp->dl_vfs_file, F_UNLCK, &dp->dl_flock);
+        if (atomic_dec_and_test(&fp->fi_delegees)) {
+                vfs_setlease(fp->fi_deleg_file, F_UNLCK, &fp->fi_lease);
+                fp->fi_lease = NULL;
+                /* Pairs with get_file() in nfs4_setlease(); without it the file leaks. */
+                fput(fp->fi_deleg_file);
+                fp->fi_deleg_file = NULL;
+        }
 }
 /* Called under the state lock. */
 static void
 unhash_delegation(struct nfs4_delegation *dp)
 {
-        list_del_init(&dp->dl_perfile);
        list_del_init(&dp->dl_perclnt);
        spin_lock(&recall_lock);
+        /*
+         * fi_delegations is walked under recall_lock by
+         * nfsd_break_deleg_cb() and find_delegation_file(), so the
+         * per-file link must be removed with that lock held.
+         */
+        list_del_init(&dp->dl_perfile);
        list_del_init(&dp->dl_recall_lru);
        spin_unlock(&recall_lock);
-        nfs4_close_delegation(dp);
+        /* Release this delegation's share of the file's lease, then its own reference. */
+        nfs4_put_deleg_lease(dp->dl_file);
        nfs4_put_delegation(dp);
 }
@@ -958,8 +947,6 @@ expire_client(struct nfs4_client *clp)
        spin_lock(&recall_lock);
        while (!list_empty(&clp->cl_delegations)) {
                dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt);
-                dprintk("NFSD: expire client. dp %p, fp %p\n", dp,
-                                dp->dl_flock);
                list_del_init(&dp->dl_perclnt);
                list_move(&dp->dl_recall_lru, &reaplist);
        }
@@ -2078,6 +2065,7 @@ alloc_init_file(struct inode *ino)
                fp->fi_inode = igrab(ino);
                fp->fi_id = current_fileid++;
                fp->fi_had_conflict = false;
+                fp->fi_lease = NULL;
                memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
                memset(fp->fi_access, 0, sizeof(fp->fi_access));
                spin_lock(&recall_lock);
@@ -2329,23 +2317,8 @@ nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access)
                nfs4_file_put_access(fp, O_RDONLY);
 }
-/*
+static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
- * Spawn a thread to perform a recall on the delegation represented
- * by the lease (file_lock)
- *
- * Called from break_lease() with lock_flocks() held.
- * Note: we assume break_lease will only call this *once* for any given
- * lease.
- */
-static
-void nfsd_break_deleg_cb(struct file_lock *fl)
 {
-        struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
-        dprintk("NFSD nfsd_break_deleg_cb: dp %p fl %p\n",dp,fl);
-        if (!dp)
-                return;
        /* We're assuming the state code never drops its reference
         * without first removing the lease.  Since we're in this lease
         * callback (and since the lease code is serialized by the kernel
@@ -2353,22 +2326,35 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
         * it's safe to take a reference: */
        atomic_inc(&dp->dl_count);
-        spin_lock(&recall_lock);
        list_add_tail(&dp->dl_recall_lru, &del_recall_lru);
-        spin_unlock(&recall_lock);
        /* only place dl_time is set. protected by lock_flocks*/
        dp->dl_time = get_seconds();
+        nfsd4_cb_recall(dp);
+}
+/*
+ * Lease-break callback. Called from break_lease() with lock_flocks() held.
+ *
+ * fl->fl_owner is the nfs4_file that owns the lease. The file is marked
+ * as having had a conflict and nfsd_break_one_deleg() is run for every
+ * delegation on fp->fi_delegations, all under recall_lock.
+ */
+static void nfsd_break_deleg_cb(struct file_lock *fl)
+{
+        struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner;
+        struct nfs4_delegation *dp;
+        BUG_ON(!fp);
+        /* We assume break_lease is only called once per lease: */
+        BUG_ON(fp->fi_had_conflict);
        /*
         * We don't want the locks code to timeout the lease for us;
-         * we'll remove it ourself if the delegation isn't returned
+         * we'll remove it ourself if a delegation isn't returned
-         * in time.
+         * in time:
         */
        fl->fl_break_time = 0;
-        dp->dl_file->fi_had_conflict = true;
+        spin_lock(&recall_lock);
-        nfsd4_cb_recall(dp);
+        fp->fi_had_conflict = true;
+        list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
+                nfsd_break_one_deleg(dp);
+        spin_unlock(&recall_lock);
 }
 static
@@ -2461,10 +2447,13 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
 {
        struct nfs4_delegation *dp;
-        list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) {
+        spin_lock(&recall_lock);
-                if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid)
+        list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
+                if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) {
+                        spin_unlock(&recall_lock);
                        return dp;
-        }
+                }
+        spin_unlock(&recall_lock);
        return NULL;
 }
@@ -2641,6 +2630,66 @@ static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
        return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN;
 }
+/*
+ * Allocate a file_lock describing the lease that backs a delegation.
+ *
+ * The lease is owned by the delegation's nfs4_file (dp->dl_file), not by
+ * the delegation itself. @flag selects a read lease for
+ * NFS4_OPEN_DELEGATE_READ and a write lease for anything else.
+ * Returns NULL if the allocation fails. fl_file is left for the caller
+ * to set.
+ */
+static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int flag)
+{
+        struct file_lock *lease = locks_alloc_lock();
+        if (lease == NULL)
+                return NULL;
+        locks_init_lock(lease);
+        lease->fl_lmops = &nfsd_lease_mng_ops;
+        lease->fl_flags = FL_LEASE;
+        if (flag == NFS4_OPEN_DELEGATE_READ)
+                lease->fl_type = F_RDLCK;
+        else
+                lease->fl_type = F_WRLCK;
+        lease->fl_end = OFFSET_MAX;
+        lease->fl_owner = (fl_owner_t)(dp->dl_file);
+        lease->fl_pid = current->tgid;
+        return lease;
+}
+/*
+ * Take out the first lease on dp's file and attach the delegation to it.
+ *
+ * Allocates a lease owned by dp->dl_file and installs it with
+ * vfs_setlease() on the file returned by find_readable_file(). On
+ * success the lease and a counted reference to that file are recorded
+ * in the nfs4_file, fi_delegees is set to 1 and dp is linked onto
+ * fp->fi_delegations.
+ *
+ * Returns 0 on success, -ENOMEM if the lease cannot be allocated, or
+ * the error reported by vfs_setlease().
+ */
+static int nfs4_setlease(struct nfs4_delegation *dp, int flag)
+{
+        struct nfs4_file *fp = dp->dl_file;
+        struct file_lock *fl;
+        int status;
+        fl = nfs4_alloc_init_lease(dp, flag);
+        if (!fl)
+                return -ENOMEM;
+        /*
+         * NOTE(review): the code this replaces had BUG_ON(!fl->fl_file)
+         * here; confirm find_readable_file() cannot return NULL on this path.
+         */
+        fl->fl_file = find_readable_file(fp);
+        /* Linked onto the client before the lease is requested; undone on failure. */
+        list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations);
+        status = vfs_setlease(fl->fl_file, fl->fl_type, &fl);
+        if (status) {
+                list_del_init(&dp->dl_perclnt);
+                locks_free_lock(fl);
+                /* Pass the real failure up instead of reporting every error as -ENOMEM. */
+                return status;
+        }
+        fp->fi_lease = fl;
+        fp->fi_deleg_file = fl->fl_file;
+        /* Reference held for as long as fi_deleg_file is set. */
+        get_file(fp->fi_deleg_file);
+        atomic_set(&fp->fi_delegees, 1);
+        list_add(&dp->dl_perfile, &fp->fi_delegations);
+        return 0;
+}
+/*
+ * Attach @dp to its file's lease.
+ *
+ * If the file has no lease yet, one is created via nfs4_setlease().
+ * Otherwise the delegation joins the existing lease under recall_lock,
+ * unless the file has already seen a conflict, in which case -EAGAIN is
+ * returned and nothing is linked. Returns 0 on success.
+ */
+static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag)
+{
+        struct nfs4_file *file = dp->dl_file;
+        int err = -EAGAIN;
+        if (file->fi_lease == NULL)
+                return nfs4_setlease(dp, flag);
+        spin_lock(&recall_lock);
+        if (!file->fi_had_conflict) {
+                atomic_inc(&file->fi_delegees);
+                list_add(&dp->dl_perfile, &file->fi_delegations);
+                err = 0;
+        }
+        spin_unlock(&recall_lock);
+        /* The per-client link is added outside recall_lock, and only on success. */
+        if (err == 0)
+                list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations);
+        return err;
+}
 /*
 * Attempt to hand out a delegation.
 */
@@ -2650,7 +2699,6 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
        struct nfs4_delegation *dp;
        struct nfs4_stateowner *sop = stp->st_stateowner;
        int cb_up;
-        struct file_lock *fl;
        int status, flag = 0;
        cb_up = nfsd4_cb_channel_good(sop->so_client);
@@ -2681,36 +2729,11 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
        }
        dp = alloc_init_deleg(sop->so_client, stp, fh, flag);
-        if (dp == NULL) {
+        if (dp == NULL)
-                flag = NFS4_OPEN_DELEGATE_NONE;
+                goto out_no_deleg;
-                goto out;
+        status = nfs4_set_delegation(dp, flag);
-        }
+        if (status)
-        status = -ENOMEM;
+                goto out_free;
-        fl = locks_alloc_lock();
-        if (!fl)
-                goto out;
-        locks_init_lock(fl);
-        fl->fl_lmops = &nfsd_lease_mng_ops;
-        fl->fl_flags = FL_LEASE;
-        fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
-        fl->fl_end = OFFSET_MAX;
-        fl->fl_owner =  (fl_owner_t)dp;
-        fl->fl_file = find_readable_file(stp->st_file);
-        BUG_ON(!fl->fl_file);
-        fl->fl_pid = current->tgid;
-        dp->dl_flock = fl;
-        /* vfs_setlease checks to see if delegation should be handed out.
-         * the lock_manager callback fl_change is used
-         */
-        if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) {
-                dprintk("NFSD: setlease failed [%d], no delegation\n", status);
-                dp->dl_flock = NULL;
-                locks_free_lock(fl);
-                unhash_delegation(dp);
-                flag = NFS4_OPEN_DELEGATE_NONE;
-                goto out;
-        }
        memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid));
@@ -2722,6 +2745,12 @@ out:
                        && open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE)
                dprintk("NFSD: WARNING: refusing delegation reclaim\n");
        open->op_delegate_type = flag;
+        return;
+out_free:
+        nfs4_put_delegation(dp);
+out_no_deleg:
+        flag = NFS4_OPEN_DELEGATE_NONE;
+        goto out;
 }
 /*
@@ -2916,8 +2945,6 @@ nfs4_laundromat(void)
                                test_val = u;
                        break;
                }
-                dprintk("NFSD: purging unused delegation dp %p, fp %p\n",
-                                    dp, dp->dl_flock);
                list_move(&dp->dl_recall_lru, &reaplist);
        }
        spin_unlock(&recall_lock);
@@ -3128,7 +3155,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
                        goto out;
                renew_client(dp->dl_client);
                if (filpp) {
-                        *filpp = find_readable_file(dp->dl_file);
+                        *filpp = dp->dl_file->fi_deleg_file;
                        BUG_ON(!*filpp);
                }
        } else { /* open or lock stateid */
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 956629b9cdc9..615f0a9f0600 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -317,8 +317,8 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
                READ_BUF(dummy32);
                len += (XDR_QUADLEN(dummy32) << 2);
                READMEM(buf, dummy32);
-                if ((host_err = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid)))
+                if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid)))
-                        goto out_nfserr;
+                        return status;
                iattr->ia_valid |= ATTR_UID;
        }
        if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) {
@@ -328,8 +328,8 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
                READ_BUF(dummy32);
                len += (XDR_QUADLEN(dummy32) << 2);
                READMEM(buf, dummy32);
-                if ((host_err = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid)))
+                if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid)))
-                        goto out_nfserr;
+                        return status;
                iattr->ia_valid |= ATTR_GID;
        }
        if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) {
@@ -1142,7 +1142,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
        u32 dummy;
        char *machine_name;
-        int i;
+        int i, j;
        int nr_secflavs;
        READ_BUF(16);
@@ -1215,7 +1215,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
                        READ_BUF(4);
                        READ32(dummy);
                        READ_BUF(dummy * 4);
-                        for (i = 0; i < dummy; ++i)
+                        for (j = 0; j < dummy; ++j)
                                READ32(dummy);
                        break;
                case RPC_AUTH_GSS:
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 3074656ba7bf..2d31224b07bf 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -83,8 +83,6 @@ struct nfs4_delegation {
        atomic_t                dl_count;       /* ref count */
        struct nfs4_client      *dl_client;
        struct nfs4_file        *dl_file;
-        struct file             *dl_vfs_file;
-        struct file_lock        *dl_flock;
        u32                     dl_type;
        time_t                  dl_time;
 /* For recall: */
@@ -379,6 +377,9 @@ struct nfs4_file {
         */
        atomic_t                fi_readers;
        atomic_t                fi_writers;
+        struct file             *fi_deleg_file;
+        struct file_lock        *fi_lease;
+        atomic_t                fi_delegees;
        struct inode            *fi_inode;
        u32                     fi_id;      /* used with stateowner->so_id 
                                             * for stateid_hashtbl hash */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 641117f2188d..da1d9701f8e4 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -808,7 +808,7 @@ nfsd_get_raparms(dev_t dev, ino_t ino)
                if (ra->p_count == 0)
                        frap = rap;
        }
-        depth = nfsdstats.ra_size*11/10;
+        depth = nfsdstats.ra_size;
        if (!frap) {    
                spin_unlock(&rab->pb_lock);
                return NULL;
@@ -1744,6 +1744,13 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
        host_err = nfsd_break_lease(odentry->d_inode);
        if (host_err)
                goto out_drop_write;
+        if (ndentry->d_inode) {
+                host_err = nfsd_break_lease(ndentry->d_inode);
+                if (host_err)
+                        goto out_drop_write;
+        }
+        if (host_err)
+                goto out_drop_write;
        host_err = vfs_rename(fdir, odentry, tdir, ndentry);
        if (!host_err) {
                host_err = commit_metadata(tfhp);
@@ -1812,22 +1819,22 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
        host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
        if (host_err)
-                goto out_nfserr;
+                goto out_put;
        host_err = nfsd_break_lease(rdentry->d_inode);
        if (host_err)
-                goto out_put;
+                goto out_drop_write;
        if (type != S_IFDIR)
                host_err = vfs_unlink(dirp, rdentry);
        else
                host_err = vfs_rmdir(dirp, rdentry);
-out_put:
-        dput(rdentry);
        if (!host_err)
                host_err = commit_metadata(fhp);
+out_drop_write:
        mnt_drop_write(fhp->fh_export->ex_path.mnt);
+out_put:
+        dput(rdentry);
 out_nfserr:
        err = nfserrno(host_err);
 out:
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 388e9e8f5286..85f7baa15f5d 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -35,11 +35,6 @@
 #include "btnode.h"
-void nilfs_btnode_cache_init_once(struct address_space *btnc)
-{
-        nilfs_mapping_init_once(btnc);
-}
 static const struct address_space_operations def_btnode_aops = {
        .sync_page              = block_sync_page,
 };
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 79037494f1e0..1b8ebd888c28 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -37,7 +37,6 @@ struct nilfs_btnode_chkey_ctxt {
        struct buffer_head *newbh;
 };
-void nilfs_btnode_cache_init_once(struct address_space *);
 void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
 void nilfs_btnode_cache_clear(struct address_space *);
 struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 6a0e2a189f60..a0babd2bff6a 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -454,9 +454,9 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
        struct backing_dev_info *bdi = inode->i_sb->s_bdi;
        INIT_LIST_HEAD(&shadow->frozen_buffers);
-        nilfs_mapping_init_once(&shadow->frozen_data);
+        address_space_init_once(&shadow->frozen_data);
        nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops);
-        nilfs_mapping_init_once(&shadow->frozen_btnodes);
+        address_space_init_once(&shadow->frozen_btnodes);
        nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops);
        mi->mi_shadow = shadow;
        return 0;
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 98034271cd02..161791d26458 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -397,7 +397,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page);
                if (!new_de)
                        goto out_dir;
-                inc_nlink(old_inode);
                nilfs_set_link(new_dir, new_de, new_page, old_inode);
                nilfs_mark_inode_dirty(new_dir);
                new_inode->i_ctime = CURRENT_TIME;
@@ -411,13 +410,9 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                        if (new_dir->i_nlink >= NILFS_LINK_MAX)
                                goto out_dir;
                }
-                inc_nlink(old_inode);
                err = nilfs_add_link(new_dentry, old_inode);
-                if (err) {
+                if (err)
-                        drop_nlink(old_inode);
-                        nilfs_mark_inode_dirty(old_inode);
                        goto out_dir;
-                }
                if (dir_de) {
                        inc_nlink(new_dir);
                        nilfs_mark_inode_dirty(new_dir);
@@ -431,7 +426,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        old_inode->i_ctime = CURRENT_TIME;
        nilfs_delete_entry(old_de, old_page);
-        drop_nlink(old_inode);
        if (dir_de) {
                nilfs_set_link(old_inode, dir_de, dir_page, new_dir);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 0c432416cfef..a585b35fd6bc 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -492,19 +492,6 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
        return nc;
 }
-void nilfs_mapping_init_once(struct address_space *mapping)
-{
-        memset(mapping, 0, sizeof(*mapping));
-        INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
-        spin_lock_init(&mapping->tree_lock);
-        INIT_LIST_HEAD(&mapping->private_list);
-        spin_lock_init(&mapping->private_lock);
-        spin_lock_init(&mapping->i_mmap_lock);
-        INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
-        INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
-}
 void nilfs_mapping_init(struct address_space *mapping,
                        struct backing_dev_info *bdi,
                        const struct address_space_operations *aops)
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 622df27cd891..2a00953ebd5f 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -61,7 +61,6 @@ void nilfs_free_private_page(struct page *);
 int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
 void nilfs_copy_back_pages(struct address_space *, struct address_space *);
 void nilfs_clear_dirty_pages(struct address_space *);
-void nilfs_mapping_init_once(struct address_space *mapping);
 void nilfs_mapping_init(struct address_space *mapping,
                        struct backing_dev_info *bdi,
                        const struct address_space_operations *aops);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 55ebae5c7f39..2de9f636792a 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -430,7 +430,8 @@ static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci,
        nilfs_segctor_map_segsum_entry(
                sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo));
-        if (inode->i_sb && !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
+        if (NILFS_I(inode)->i_root &&
+            !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
                set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
        /* skip finfo */
 }
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 58fd707174e1..1673b3d99842 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1279,7 +1279,7 @@ static void nilfs_inode_init_once(void *obj)
 #ifdef CONFIG_NILFS_XATTR
        init_rwsem(&ii->xattr_sem);
 #endif
-        nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
+        address_space_init_once(&ii->i_btnode_cache);
        ii->i_bmap = &ii->i_bmap_data;
        inode_init_once(&ii->vfs_inode);
 }
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 6d80ecc7834f..7eb90403fc8a 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -56,7 +56,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
        int ret = 0;    /* if all else fails, just return false */
        struct ocfs2_super *osb;
-        if (nd->flags & LOOKUP_RCU)
+        if (nd && nd->flags & LOOKUP_RCU)
                return -ECHILD;
        inode = dentry->d_inode;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 43e56b97f9c0..6180da1e37e6 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -405,9 +405,9 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb)
               ocfs2_quota_trans_credits(sb);
 }
-/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
+/* data block for new dir/symlink, allocation of directory block, dx_root
- * bitmap block for the new bit) dx_root update for free list */
+ * update for free list */
-#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1)
+#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + OCFS2_SUBALLOC_ALLOC + 1)
 static inline int ocfs2_add_dir_index_credits(struct super_block *sb)
 {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index b5f9160e93e9..19ebc5aad391 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3228,7 +3228,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
                                        u32 num_clusters, unsigned int e_flags)
 {
        int ret, delete, index, credits =  0;
-        u32 new_bit, new_len;
+        u32 new_bit, new_len, orig_num_clusters;
        unsigned int set_len;
        struct ocfs2_super *osb = OCFS2_SB(sb);
        handle_t *handle;
@@ -3261,6 +3261,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
                goto out;
        }
+        orig_num_clusters = num_clusters;
        while (num_clusters) {
                ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh,
                                             p_cluster, num_clusters,
@@ -3348,7 +3350,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
         * in write-back mode.
         */
        if (context->get_clusters == ocfs2_di_get_clusters) {
-                ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters);
+                ret = ocfs2_cow_sync_writeback(sb, context, cpos,
+                                               orig_num_clusters);
                if (ret)
                        mlog_errno(ret);
        }
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 38f986d2447e..36c423fb0635 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1316,7 +1316,7 @@ static int ocfs2_parse_options(struct super_block *sb,
                               struct mount_options *mopt,
                               int is_remount)
 {
-        int status;
+        int status, user_stack = 0;
        char *p;
        u32 tmp;
@@ -1459,6 +1459,15 @@ static int ocfs2_parse_options(struct super_block *sb,
                        memcpy(mopt->cluster_stack, args[0].from,
                               OCFS2_STACK_LABEL_LEN);
                        mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
+                        /*
+                         * Open code the memcmp here as we don't have
+                         * an osb to pass to
+                         * ocfs2_userspace_stack().
+                         */
+                        if (memcmp(mopt->cluster_stack,
+                                   OCFS2_CLASSIC_CLUSTER_STACK,
+                                   OCFS2_STACK_LABEL_LEN))
+                                user_stack = 1;
                        break;
                case Opt_inode64:
                        mopt->mount_opt |= OCFS2_MOUNT_INODE64;
@@ -1514,13 +1523,16 @@ static int ocfs2_parse_options(struct super_block *sb,
                }
        }
-        /* Ensure only one heartbeat mode */
+        if (user_stack == 0) {
-        tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
+                /* Ensure only one heartbeat mode */
-                                 OCFS2_MOUNT_HB_NONE);
+                tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL |
-        if (hweight32(tmp) != 1) {
+                                         OCFS2_MOUNT_HB_GLOBAL |
-                mlog(ML_ERROR, "Invalid heartbeat mount options\n");
+                                         OCFS2_MOUNT_HB_NONE);
-                status = 0;
+                if (hweight32(tmp) != 1) {
-                goto bail;
+                        mlog(ML_ERROR, "Invalid heartbeat mount options\n");
+                        status = 0;
+                        goto bail;
+                }
        }
        status = 1;
diff --git a/fs/open.c b/fs/open.c
index e52389e1f05b..b47aab39c057 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -233,6 +233,14 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
        if (!(file->f_mode & FMODE_WRITE))
                return -EBADF;
+        /* It's not possible to punch a hole in an append-only file */
+        if (mode & FALLOC_FL_PUNCH_HOLE && IS_APPEND(inode))
+                return -EPERM;
+        if (IS_IMMUTABLE(inode))
+                return -EPERM;
        /*
         * Revalidate the write permissions, in case security policy has
         * changed since the files were opened.
@@ -790,6 +798,8 @@ struct file *nameidata_to_filp(struct nameidata *nd)
        /* Pick up the filp from the open intent */
        filp = nd->intent.open.file;
+        nd->intent.open.file = NULL;
        /* Has the filesystem initialised the file for us? */
        if (filp->f_path.dentry == NULL) {
                path_get(&nd->path);
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 789c625c7aa5..b10e3540d5b7 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -251,6 +251,11 @@ static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm)
        }
        vm->vblk_size     = get_unaligned_be32(data + 0x08);
+        if (vm->vblk_size == 0) {
+                ldm_error ("Illegal VBLK size");
+                return false;
+        }
        vm->vblk_offset   = get_unaligned_be32(data + 0x0C);
        vm->last_vblk_seq = get_unaligned_be32(data + 0x04);
diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c
index 68d6a216ee79..11f688bd76c5 100644
--- a/fs/partitions/mac.c
+++ b/fs/partitions/mac.c
@@ -29,10 +29,9 @@ static inline void mac_fix_string(char *stg, int len)
 int mac_partition(struct parsed_partitions *state)
 {
-        int slot = 1;
        Sector sect;
        unsigned char *data;
-        int blk, blocks_in_map;
+        int slot, blocks_in_map;
        unsigned secsize;
 #ifdef CONFIG_PPC_PMAC
        int found_root = 0;
@@ -59,10 +58,14 @@ int mac_partition(struct parsed_partitions *state)
                put_dev_sector(sect);
                return 0;               /* not a MacOS disk */
        }
-        strlcat(state->pp_buf, " [mac]", PAGE_SIZE);
        blocks_in_map = be32_to_cpu(part->map_count);
-        for (blk = 1; blk <= blocks_in_map; ++blk) {
+        if (blocks_in_map < 0 || blocks_in_map >= DISK_MAX_PARTS) {
-                int pos = blk * secsize;
+                put_dev_sector(sect);
+                return 0;
+        }
+        strlcat(state->pp_buf, " [mac]", PAGE_SIZE);
+        for (slot = 1; slot <= blocks_in_map; ++slot) {
+                int pos = slot * secsize;
                put_dev_sector(sect);
                data = read_part_sector(state, pos/512, &sect);
                if (!data)
@@ -113,13 +116,11 @@ int mac_partition(struct parsed_partitions *state)
                        }
                        if (goodness > found_root_goodness) {
-                                found_root = blk;
+                                found_root = slot;
                                found_root_goodness = goodness;
                        }
                }
 #endif /* CONFIG_PPC_PMAC */
-                ++slot;
        }
 #ifdef CONFIG_PPC_PMAC
        if (found_root_goodness)
diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index 48cec7cbca17..be03a0b08b47 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -10,10 +10,13 @@
 #include "check.h"
 #include "osf.h"
+#define MAX_OSF_PARTITIONS 8
 int osf_partition(struct parsed_partitions *state)
 {
        int i;
        int slot = 1;
+        unsigned int npartitions;
        Sector sect;
        unsigned char *data;
        struct disklabel {
@@ -45,7 +48,7 @@ int osf_partition(struct parsed_partitions *state)
                        u8  p_fstype;
                        u8  p_frag;
                        __le16 p_cpg;
-                } d_partitions[8];
+                } d_partitions[MAX_OSF_PARTITIONS];
        } * label;
        struct d_partition * partition;
@@ -63,7 +66,12 @@ int osf_partition(struct parsed_partitions *state)
                put_dev_sector(sect);
                return 0;
        }
-        for (i = 0 ; i < le16_to_cpu(label->d_npartitions); i++, partition++) {
+        npartitions = le16_to_cpu(label->d_npartitions);
+        if (npartitions > MAX_OSF_PARTITIONS) {
+                put_dev_sector(sect);
+                return 0;
+        }
+        for (i = 0 ; i < npartitions; i++, partition++) {
                if (slot == state->limit)
                        break;
                if (le32_to_cpu(partition->p_size))
diff --git a/fs/proc/array.c b/fs/proc/array.c
index df2b703b9d0f..7c99c1cf7e5c 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -353,9 +353,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
        task_cap(m, task);
        task_cpus_allowed(m, task);
        cpuset_task_status_allowed(m, task);
-#if defined(CONFIG_S390)
-        task_show_regs(m, task);
-#endif
        task_context_switch_counts(m, task);
        return 0;
 }
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9d096e82b201..d49c4b5d2c3e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2620,35 +2620,6 @@ static const struct pid_entry proc_base_stuff[] = {
                &proc_self_inode_operations, NULL, {}),
 };
-/*
- *      Exceptional case: normally we are not allowed to unhash a busy
- * directory. In this case, however, we can do it - no aliasing problems
- * due to the way we treat inodes.
- */
-static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd)
-{
-        struct inode *inode;
-        struct task_struct *task;
-        if (nd->flags & LOOKUP_RCU)
-                return -ECHILD;
-        inode = dentry->d_inode;
-        task = get_proc_task(inode);
-        if (task) {
-                put_task_struct(task);
-                return 1;
-        }
-        d_drop(dentry);
-        return 0;
-}
-static const struct dentry_operations proc_base_dentry_operations =
-{
-        .d_revalidate   = proc_base_revalidate,
-        .d_delete       = pid_delete_dentry,
-};
 static struct dentry *proc_base_instantiate(struct inode *dir,
        struct dentry *dentry, struct task_struct *task, const void *ptr)
 {
@@ -2685,7 +2656,6 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
        if (p->fop)
                inode->i_fop = p->fop;
        ei->op = p->op;
-        d_set_d_op(dentry, &proc_base_dentry_operations);
        d_add(dentry, inode);
        error = NULL;
 out:
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 176ce4cda68a..d6a7ca1fdac5 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -27,6 +27,7 @@
 static void proc_evict_inode(struct inode *inode)
 {
        struct proc_dir_entry *de;
+        struct ctl_table_header *head;
        truncate_inode_pages(&inode->i_data, 0);
        end_writeback(inode);
@@ -38,8 +39,11 @@ static void proc_evict_inode(struct inode *inode)
        de = PROC_I(inode)->pde;
        if (de)
                pde_put(de);
-        if (PROC_I(inode)->sysctl)
+        head = PROC_I(inode)->sysctl;
-                sysctl_head_put(PROC_I(inode)->sysctl);
+        if (head) {
+                rcu_assign_pointer(PROC_I(inode)->sysctl, NULL);
+                sysctl_head_put(head);
+        }
 }
 struct vfsmount *proc_mnt;
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index d9396a4fc7ff..927cbd115e53 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -233,7 +233,7 @@ void __init proc_device_tree_init(void)
                return;
        root = of_find_node_by_path("/");
        if (root == NULL) {
-                printk(KERN_ERR "/proc/device-tree: can't find root\n");
+                pr_debug("/proc/device-tree: can't find root\n");
                return;
        }
        proc_device_tree_add_node(root, proc_device_tree);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 09a1f92a34ef..8eb2522111c5 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -408,15 +408,18 @@ static int proc_sys_compare(const struct dentry *parent,
                const struct dentry *dentry, const struct inode *inode,
                unsigned int len, const char *str, const struct qstr *name)
 {
+        struct ctl_table_header *head;
        /* Although proc doesn't have negative dentries, rcu-walk means
         * that inode here can be NULL */
+        /* AV: can it, indeed? */
        if (!inode)
-                return 0;
+                return 1;
        if (name->len != len)
                return 1;
        if (memcmp(name->name, str, len))
                return 1;
-        return !sysctl_is_seen(PROC_I(inode)->sysctl);
+        head = rcu_dereference(PROC_I(inode)->sysctl);
+        return !head || !sysctl_is_seen(head);
 }
 static const struct dentry_operations proc_sys_dentry_operations = {
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index ba5f51ec3458..68fdf45cc6c9 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -771,7 +771,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
                                        EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE,
                                        dentry, inode, &security);
        if (retval) {
-                dir->i_nlink--;
+                DEC_DIR_INODE_NLINK(dir)
                goto out_failed;
        }
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 3cfb2e933644..5c11ca82b782 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -978,8 +978,6 @@ int reiserfs_permission(struct inode *inode, int mask, unsigned int flags)
 static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-        if (nd->flags & LOOKUP_RCU)
-                return -ECHILD;
        return -EPERM;
 }
diff --git a/fs/super.c b/fs/super.c
index 74e149efed81..7e9dd4cc2c01 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -177,6 +177,11 @@ void deactivate_locked_super(struct super_block *s)
        struct file_system_type *fs = s->s_type;
        if (atomic_dec_and_test(&s->s_active)) {
                fs->kill_sb(s);
+                /*
+                 * We need to call rcu_barrier so all the delayed rcu free
+                 * inodes are flushed before we release the fs module.
+                 */
+                rcu_barrier();
                put_filesystem(fs);
                put_super(s);
        } else {
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index b427b1208c26..e474fbcf8bde 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -245,7 +245,6 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
                new_de = sysv_find_entry(new_dentry, &new_page);
                if (!new_de)
                        goto out_dir;
-                inode_inc_link_count(old_inode);
                sysv_set_link(new_de, new_page, old_inode);
                new_inode->i_ctime = CURRENT_TIME_SEC;
                if (dir_de)
@@ -257,18 +256,15 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
                        if (new_dir->i_nlink >= SYSV_SB(new_dir->i_sb)->s_link_max)
                                goto out_dir;
                }
-                inode_inc_link_count(old_inode);
                err = sysv_add_link(new_dentry, old_inode);
-                if (err) {
+                if (err)
-                        inode_dec_link_count(old_inode);
                        goto out_dir;
-                }
                if (dir_de)
                        inode_inc_link_count(new_dir);
        }
        sysv_delete_entry(old_de, old_page);
-        inode_dec_link_count(old_inode);
+        mark_inode_dirty(old_inode);
        if (dir_de) {
                sysv_set_link(dir_de, dir_page, new_dir);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 2be0f9eb86d2..b7c338d5e9df 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -32,6 +32,8 @@
 #include <linux/crc-itu-t.h>
 #include <linux/exportfs.h>
+enum { UDF_MAX_LINKS = 0xffff };
 static inline int udf_match(int len1, const unsigned char *name1, int len2,
                            const unsigned char *name2)
 {
@@ -650,7 +652,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        struct udf_inode_info *iinfo;
        err = -EMLINK;
-        if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
+        if (dir->i_nlink >= UDF_MAX_LINKS)
                goto out;
        err = -EIO;
@@ -1034,9 +1036,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
        struct fileIdentDesc cfi, *fi;
        int err;
-        if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
+        if (inode->i_nlink >= UDF_MAX_LINKS)
                return -EMLINK;
-        }
        fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
        if (!fi) {
@@ -1131,9 +1132,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
                        goto end_rename;
                retval = -EMLINK;
-                if (!new_inode &&
+                if (!new_inode && new_dir->i_nlink >= UDF_MAX_LINKS)
-                        new_dir->i_nlink >=
-                                (256 << sizeof(new_dir->i_nlink)) - 1)
                        goto end_rename;
        }
        if (!nfi) {
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 12f39b9e4437..d6f681535eb8 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -306,7 +306,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
                new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page);
                if (!new_de)
                        goto out_dir;
-                inode_inc_link_count(old_inode);
                ufs_set_link(new_dir, new_de, new_page, old_inode);
                new_inode->i_ctime = CURRENT_TIME_SEC;
                if (dir_de)
@@ -318,12 +317,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
                        if (new_dir->i_nlink >= UFS_LINK_MAX)
                                goto out_dir;
                }
-                inode_inc_link_count(old_inode);
                err = ufs_add_link(new_dentry, old_inode);
-                if (err) {
+                if (err)
-                        inode_dec_link_count(old_inode);
                        goto out_dir;
-                }
                if (dir_de)
                        inode_inc_link_count(new_dir);
        }
@@ -331,12 +327,11 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
        /*
         * Like most other Unix systems, set the ctime for inodes on a
         * rename.
-         * inode_dec_link_count() will mark the inode dirty.
         */
        old_inode->i_ctime = CURRENT_TIME_SEC;
        ufs_delete_entry(old_dir, old_de, old_page);
-        inode_dec_link_count(old_inode);
+        mark_inode_dirty(old_inode);
        if (dir_de) {
                ufs_set_link(old_inode, dir_de, dir_page, new_dir);
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
index 05201ae719e5..d61611c88012 100644
--- a/fs/xfs/linux-2.6/xfs_discard.c
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -152,6 +152,8 @@ xfs_ioc_trim(
        if (!capable(CAP_SYS_ADMIN))
                return -XFS_ERROR(EPERM);
+        if (!blk_queue_discard(q))
+                return -XFS_ERROR(EOPNOTSUPP);
        if (copy_from_user(&range, urange, sizeof(range)))
                return -XFS_ERROR(EFAULT);
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index f5e2a19e0f8e..0ca0e3c024d7 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -695,14 +695,19 @@ xfs_ioc_fsgeometry_v1(
        xfs_mount_t             *mp,
        void                    __user *arg)
 {
-        xfs_fsop_geom_v1_t      fsgeo;
+        xfs_fsop_geom_t         fsgeo;
        int                     error;
-        error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3);
+        error = xfs_fs_geometry(mp, &fsgeo, 3);
        if (error)
                return -error;
-        if (copy_to_user(arg, &fsgeo, sizeof(fsgeo)))
+        /*
+         * Caller should have passed an argument of type
+         * xfs_fsop_geom_v1_t.  This is a proper subset of the
+         * xfs_fsop_geom_t that xfs_fs_geometry() fills in.
+         */
+        if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)))
                return -XFS_ERROR(EFAULT);
        return 0;
 }
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index cec89dd5d7d2..85668efb3e3e 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -53,6 +53,9 @@ xfs_fs_geometry(
        xfs_fsop_geom_t         *geo,
        int                     new_version)
 {
+        memset(geo, 0, sizeof(*geo));
        geo->blocksize = mp->m_sb.sb_blocksize;
        geo->rtextsize = mp->m_sb.sb_rextsize;
        geo->agblocks = mp->m_sb.sb_agblocks;