Diffstat (limited to 'fs')
 fs/afs/flock.c                   |   1
 fs/aio.c                         |  24
 fs/binfmt_elf.c                  |   9
 fs/bio-integrity.c               | 170
 fs/bio.c                         |  11
 fs/btrfs/async-thread.c          |   2
 fs/btrfs/ctree.h                 |   3
 fs/btrfs/extent-tree.c           | 566
 fs/btrfs/file.c                  |   5
 fs/btrfs/inode.c                 |  25
 fs/btrfs/ioctl.c                 |   6
 fs/btrfs/relocation.c            |   5
 fs/btrfs/transaction.c           |   4
 fs/compat.c                      |   4
 fs/eventfd.c                     | 122
 fs/exec.c                        |   4
 fs/ext2/namei.c                  |  12
 fs/fuse/dev.c                    |  83
 fs/fuse/dir.c                    |  57
 fs/fuse/file.c                   |   2
 fs/fuse/fuse_i.h                 |  27
 fs/fuse/inode.c                  |  68
 fs/hostfs/hostfs_kern.c          |   1
 fs/jffs2/scan.c                  |   4
 fs/nfsd/vfs.c                    |   3
 fs/notify/inotify/inotify_user.c |   3
 fs/sync.c                        |   5
 27 files changed, 933 insertions(+), 293 deletions(-)
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 210acafe4a9b..3ff8bdd18fb3 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -432,7 +432,6 @@ vfs_rejected_lock:
 	list_del_init(&fl->fl_u.afs.link);
 	if (list_empty(&vnode->granted_locks))
 		afs_defer_unlock(vnode, key);
-	spin_unlock(&vnode->lock);
 	goto abort_attempt;
 }
 
diff --git a/fs/aio.c b/fs/aio.c
index 76da12537956..d065b2c3273e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -485,6 +485,8 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
 {
 	assert_spin_locked(&ctx->ctx_lock);
 
+	if (req->ki_eventfd != NULL)
+		eventfd_ctx_put(req->ki_eventfd);
 	if (req->ki_dtor)
 		req->ki_dtor(req);
 	if (req->ki_iovec != &req->ki_inline_vec)
@@ -509,8 +511,6 @@ static void aio_fput_routine(struct work_struct *data)
 	/* Complete the fput(s) */
 	if (req->ki_filp != NULL)
 		__fput(req->ki_filp);
-	if (req->ki_eventfd != NULL)
-		__fput(req->ki_eventfd);
 
 	/* Link the iocb into the context's free list */
 	spin_lock_irq(&ctx->ctx_lock);
@@ -528,8 +528,6 @@ static void aio_fput_routine(struct work_struct *data)
  */
 static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 {
-	int schedule_putreq = 0;
-
 	dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
 		req, atomic_long_read(&req->ki_filp->f_count));
 
@@ -549,24 +547,16 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 	 * we would not be holding the last reference to the file*, so
 	 * this function will be executed w/out any aio kthread wakeup.
 	 */
-	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count)))
+	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) {
-		schedule_putreq++;
-	else
-		req->ki_filp = NULL;
-	if (req->ki_eventfd != NULL) {
-		if (unlikely(atomic_long_dec_and_test(&req->ki_eventfd->f_count)))
-			schedule_putreq++;
-		else
-			req->ki_eventfd = NULL;
-	}
-	if (unlikely(schedule_putreq)) {
 		get_ioctx(ctx);
 		spin_lock(&fput_lock);
 		list_add(&req->ki_list, &fput_head);
 		spin_unlock(&fput_lock);
 		queue_work(aio_wq, &fput_work);
-	} else
+	} else {
+		req->ki_filp = NULL;
 		really_put_req(ctx, req);
+	}
 	return 1;
 }
 
@@ -1622,7 +1612,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		 * an eventfd() fd, and will be signaled for each completed
 		 * event using the eventfd_signal() function.
 		 */
-		req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd);
+		req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
 		if (IS_ERR(req->ki_eventfd)) {
 			ret = PTR_ERR(req->ki_eventfd);
 			req->ki_eventfd = NULL;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 9fa212b014a5..b7c1603cd4bd 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1522,11 +1522,11 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
 	info->thread = NULL;
 
 	psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
-	fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
-
 	if (psinfo == NULL)
 		return 0;
 
+	fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
+
 	/*
 	 * Figure out how many notes we're going to need for each thread.
 	 */
@@ -1929,7 +1929,10 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
 	elf = kmalloc(sizeof(*elf), GFP_KERNEL);
 	if (!elf)
 		goto out;
-
+	/*
+	 * The number of segs are recored into ELF header as 16bit value.
+	 * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here.
+	 */
 	segs = current->mm->map_count;
 #ifdef ELF_CORE_EXTRA_PHDRS
 	segs += ELF_CORE_EXTRA_PHDRS;
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 31c46a241bac..49a34e7f7306 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -1,7 +1,7 @@
 /*
  * bio-integrity.c - bio data integrity extensions
  *
- * Copyright (C) 2007, 2008 Oracle Corporation
+ * Copyright (C) 2007, 2008, 2009 Oracle Corporation
  * Written by: Martin K. Petersen <martin.petersen@oracle.com>
  *
  * This program is free software; you can redistribute it and/or
@@ -25,63 +25,121 @@
 #include <linux/bio.h>
 #include <linux/workqueue.h>
 
-static struct kmem_cache *bio_integrity_slab __read_mostly;
+struct integrity_slab {
-static mempool_t *bio_integrity_pool;
+	struct kmem_cache *slab;
-static struct bio_set *integrity_bio_set;
+	unsigned short nr_vecs;
+	char name[8];
+};
+
+#define IS(x) { .nr_vecs = x, .name = "bip-"__stringify(x) }
+struct integrity_slab bip_slab[BIOVEC_NR_POOLS] __read_mostly = {
+	IS(1), IS(4), IS(16), IS(64), IS(128), IS(BIO_MAX_PAGES),
+};
+#undef IS
+
 static struct workqueue_struct *kintegrityd_wq;
 
+static inline unsigned int vecs_to_idx(unsigned int nr)
+{
+	switch (nr) {
+	case 1:
+		return 0;
+	case 2 ... 4:
+		return 1;
+	case 5 ... 16:
+		return 2;
+	case 17 ... 64:
+		return 3;
+	case 65 ... 128:
+		return 4;
+	case 129 ... BIO_MAX_PAGES:
+		return 5;
+	default:
+		BUG();
+	}
+}
+
+static inline int use_bip_pool(unsigned int idx)
+{
+	if (idx == BIOVEC_NR_POOLS)
+		return 1;
+
+	return 0;
+}
+
 /**
- * bio_integrity_alloc - Allocate integrity payload and attach it to bio
+ * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio
  * @bio: bio to attach integrity metadata to
  * @gfp_mask: Memory allocation mask
  * @nr_vecs: Number of integrity metadata scatter-gather elements
+ * @bs: bio_set to allocate from
  *
  * Description: This function prepares a bio for attaching integrity
  * metadata. nr_vecs specifies the maximum number of pages containing
  * integrity metadata that can be attached.
  */
-struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
+struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
 							 gfp_t gfp_mask,
-							 unsigned int nr_vecs)
+							 unsigned int nr_vecs,
+							 struct bio_set *bs)
 {
 	struct bio_integrity_payload *bip;
-	struct bio_vec *iv;
+	unsigned int idx = vecs_to_idx(nr_vecs);
-	unsigned long idx;
 
 	BUG_ON(bio == NULL);
+	bip = NULL;
 
-	bip = mempool_alloc(bio_integrity_pool, gfp_mask);
+	/* Lower order allocations come straight from slab */
-	if (unlikely(bip == NULL)) {
+	if (!use_bip_pool(idx))
-		printk(KERN_ERR "%s: could not alloc bip\n", __func__);
+		bip = kmem_cache_alloc(bip_slab[idx].slab, gfp_mask);
-		return NULL;
-	}
 
-	memset(bip, 0, sizeof(*bip));
+	/* Use mempool if lower order alloc failed or max vecs were requested */
+	if (bip == NULL) {
+		bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
 
-	iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, integrity_bio_set);
+		if (unlikely(bip == NULL)) {
-	if (unlikely(iv == NULL)) {
+			printk(KERN_ERR "%s: could not alloc bip\n", __func__);
-		printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__);
+			return NULL;
-		mempool_free(bip, bio_integrity_pool);
+		}
-		return NULL;
 	}
 
-	bip->bip_pool = idx;
+	memset(bip, 0, sizeof(*bip));
-	bip->bip_vec = iv;
+
+	bip->bip_slab = idx;
 	bip->bip_bio = bio;
 	bio->bi_integrity = bip;
 
 	return bip;
 }
+EXPORT_SYMBOL(bio_integrity_alloc_bioset);
+
+/**
+ * bio_integrity_alloc - Allocate integrity payload and attach it to bio
+ * @bio: bio to attach integrity metadata to
+ * @gfp_mask: Memory allocation mask
+ * @nr_vecs: Number of integrity metadata scatter-gather elements
+ *
+ * Description: This function prepares a bio for attaching integrity
+ * metadata. nr_vecs specifies the maximum number of pages containing
+ * integrity metadata that can be attached.
+ */
+struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
+						  gfp_t gfp_mask,
+						  unsigned int nr_vecs)
+{
+	return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set);
+}
 EXPORT_SYMBOL(bio_integrity_alloc);
 
 /**
  * bio_integrity_free - Free bio integrity payload
  * @bio: bio containing bip to be freed
+ * @bs: bio_set this bio was allocated from
  *
  * Description: Used to free the integrity portion of a bio. Usually
  * called from bio_free().
  */
-void bio_integrity_free(struct bio *bio)
+void bio_integrity_free(struct bio *bio, struct bio_set *bs)
 {
 	struct bio_integrity_payload *bip = bio->bi_integrity;
 
@@ -92,8 +150,10 @@ void bio_integrity_free(struct bio *bio)
 	    && bip->bip_buf != NULL)
 		kfree(bip->bip_buf);
 
-	bvec_free_bs(integrity_bio_set, bip->bip_vec, bip->bip_pool);
+	if (use_bip_pool(bip->bip_slab))
-	mempool_free(bip, bio_integrity_pool);
+		mempool_free(bip, bs->bio_integrity_pool);
+	else
+		kmem_cache_free(bip_slab[bip->bip_slab].slab, bip);
 
 	bio->bi_integrity = NULL;
 }
@@ -114,7 +174,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 	struct bio_integrity_payload *bip = bio->bi_integrity;
 	struct bio_vec *iv;
 
-	if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_pool)) {
+	if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_slab)) {
 		printk(KERN_ERR "%s: bip_vec full\n", __func__);
 		return 0;
 	}
@@ -647,8 +707,8 @@ void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors)
 	bp->iv1 = bip->bip_vec[0];
 	bp->iv2 = bip->bip_vec[0];
 
-	bp->bip1.bip_vec = &bp->iv1;
+	bp->bip1.bip_vec[0] = bp->iv1;
-	bp->bip2.bip_vec = &bp->iv2;
+	bp->bip2.bip_vec[0] = bp->iv2;
 
 	bp->iv1.bv_len = sectors * bi->tuple_size;
 	bp->iv2.bv_offset += sectors * bi->tuple_size;
@@ -667,17 +727,19 @@ EXPORT_SYMBOL(bio_integrity_split);
  * @bio: New bio
  * @bio_src: Original bio
  * @gfp_mask: Memory allocation mask
+ * @bs: bio_set to allocate bip from
  *
  * Description: Called to allocate a bip when cloning a bio
  */
-int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask)
+int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
+			gfp_t gfp_mask, struct bio_set *bs)
 {
 	struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
 	struct bio_integrity_payload *bip;
 
 	BUG_ON(bip_src == NULL);
 
-	bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt);
+	bip = bio_integrity_alloc_bioset(bio, gfp_mask, bip_src->bip_vcnt, bs);
 
 	if (bip == NULL)
 		return -EIO;
@@ -693,25 +755,43 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(bio_integrity_clone);
 
-static int __init bio_integrity_init(void)
+int bioset_integrity_create(struct bio_set *bs, int pool_size)
 {
-	kintegrityd_wq = create_workqueue("kintegrityd");
+	unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES);
+
+	bs->bio_integrity_pool =
+		mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab);
 
+	if (!bs->bio_integrity_pool)
+		return -1;
+
+	return 0;
+}
+EXPORT_SYMBOL(bioset_integrity_create);
+
+void bioset_integrity_free(struct bio_set *bs)
+{
+	if (bs->bio_integrity_pool)
+		mempool_destroy(bs->bio_integrity_pool);
+}
+EXPORT_SYMBOL(bioset_integrity_free);
+
+void __init bio_integrity_init(void)
+{
+	unsigned int i;
+
+	kintegrityd_wq = create_workqueue("kintegrityd");
 	if (!kintegrityd_wq)
 		panic("Failed to create kintegrityd\n");
 
-	bio_integrity_slab = KMEM_CACHE(bio_integrity_payload,
+	for (i = 0 ; i < BIOVEC_NR_POOLS ; i++) {
-					SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+		unsigned int size;
 
-	bio_integrity_pool = mempool_create_slab_pool(BIO_POOL_SIZE,
+		size = sizeof(struct bio_integrity_payload)
-						      bio_integrity_slab);
+			+ bip_slab[i].nr_vecs * sizeof(struct bio_vec);
-	if (!bio_integrity_pool)
-		panic("bio_integrity: can't allocate bip pool\n");
 
-	integrity_bio_set = bioset_create(BIO_POOL_SIZE, 0);
+		bip_slab[i].slab =
-	if (!integrity_bio_set)
+			kmem_cache_create(bip_slab[i].name, size, 0,
-		panic("bio_integrity: can't allocate bio_set\n");
+					  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
-
+	}
-	return 0;
 }
-subsys_initcall(bio_integrity_init);
diff --git a/fs/bio.c b/fs/bio.c
index 24c914043532..1486b19fc431 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -238,7 +238,7 @@ void bio_free(struct bio *bio, struct bio_set *bs)
 		bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
 
 	if (bio_integrity(bio))
-		bio_integrity_free(bio);
+		bio_integrity_free(bio, bs);
 
 	/*
 	 * If we have front padding, adjust the bio pointer before freeing
@@ -341,7 +341,7 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
 static void bio_kmalloc_destructor(struct bio *bio)
 {
 	if (bio_integrity(bio))
-		bio_integrity_free(bio);
+		bio_integrity_free(bio, fs_bio_set);
 	kfree(bio);
 }
 
@@ -472,7 +472,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
 	if (bio_integrity(bio)) {
 		int ret;
 
-		ret = bio_integrity_clone(b, bio, gfp_mask);
+		ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set);
 
 		if (ret < 0) {
 			bio_put(b);
@@ -1539,6 +1539,7 @@ void bioset_free(struct bio_set *bs)
 	if (bs->bio_pool)
 		mempool_destroy(bs->bio_pool);
 
+	bioset_integrity_free(bs);
 	biovec_free_pools(bs);
 	bio_put_slab(bs);
 
@@ -1579,6 +1580,9 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
 	if (!bs->bio_pool)
 		goto bad;
 
+	if (bioset_integrity_create(bs, pool_size))
+		goto bad;
+
 	if (!biovec_create_pools(bs, pool_size))
 		return bs;
 
@@ -1616,6 +1620,7 @@ static int __init init_bio(void)
 	if (!bio_slabs)
 		panic("bio: can't allocate bios\n");
 
+	bio_integrity_init();
 	biovec_init_slabs();
 
 	fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 7f88628a1a72..6e4f6c50a120 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -299,8 +299,8 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
299 "btrfs-%s-%d", workers->name, 299 "btrfs-%s-%d", workers->name,
300 workers->num_workers + i); 300 workers->num_workers + i);
301 if (IS_ERR(worker->task)) { 301 if (IS_ERR(worker->task)) {
302 kfree(worker);
303 ret = PTR_ERR(worker->task); 302 ret = PTR_ERR(worker->task);
303 kfree(worker);
304 goto fail; 304 goto fail;
305 } 305 }
306 306
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2779c2f5360a..98a873838717 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2074,8 +2074,7 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
-int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
+int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref);
-			*root);
 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct extent_buffer *node,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index edc7d208c5ce..a5aca3997d42 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -990,15 +990,13 @@ static inline int extent_ref_type(u64 parent, u64 owner)
 	return type;
 }
 
-static int find_next_key(struct btrfs_path *path, struct btrfs_key *key)
+static int find_next_key(struct btrfs_path *path, int level,
+			 struct btrfs_key *key)
 
 {
-	int level;
+	for (; level < BTRFS_MAX_LEVEL; level++) {
-	BUG_ON(!path->keep_locks);
-	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
 		if (!path->nodes[level])
 			break;
-		btrfs_assert_tree_locked(path->nodes[level]);
 		if (path->slots[level] + 1 >=
 		    btrfs_header_nritems(path->nodes[level]))
 			continue;
@@ -1158,7 +1156,8 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
 	 * For simplicity, we just do not add new inline back
 	 * ref if there is any kind of item for this block
 	 */
-	if (find_next_key(path, &key) == 0 && key.objectid == bytenr &&
+	if (find_next_key(path, 0, &key) == 0 &&
+	    key.objectid == bytenr &&
 	    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
 		err = -EAGAIN;
 		goto out;
@@ -2697,7 +2696,7 @@ again:
 
 	printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
 	       ", %llu bytes_used, %llu bytes_reserved, "
-	       "%llu bytes_pinned, %llu bytes_readonly, %llu may use"
+	       "%llu bytes_pinned, %llu bytes_readonly, %llu may use "
 	       "%llu total\n", (unsigned long long)bytes,
 	       (unsigned long long)data_sinfo->bytes_delalloc,
 	       (unsigned long long)data_sinfo->bytes_used,
@@ -4128,6 +4127,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	return buf;
 }
 
+#if 0
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct extent_buffer *leaf)
 {
@@ -4171,8 +4171,6 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-#if 0
-
 static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 					struct btrfs_root *root,
 					struct btrfs_leaf_ref *ref)
@@ -4553,262 +4551,471 @@ out:
 }
 #endif
 
+struct walk_control {
+	u64 refs[BTRFS_MAX_LEVEL];
+	u64 flags[BTRFS_MAX_LEVEL];
+	struct btrfs_key update_progress;
+	int stage;
+	int level;
+	int shared_level;
+	int update_ref;
+	int keep_locks;
+};
+
+#define DROP_REFERENCE	1
+#define UPDATE_BACKREF	2
+
 /*
- * helper function for drop_subtree, this function is similar to
+ * hepler to process tree block while walking down the tree.
- * walk_down_tree. The main difference is that it checks reference
+ *
- * counts while tree blocks are locked.
+ * when wc->stage == DROP_REFERENCE, this function checks
+ * reference count of the block. if the block is shared and
+ * we need update back refs for the subtree rooted at the
+ * block, this function changes wc->stage to UPDATE_BACKREF
+ *
+ * when wc->stage == UPDATE_BACKREF, this function updates
+ * back refs for pointers in the block.
+ *
+ * NOTE: return value 1 means we should stop walking down.
  */
-static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
+static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
-				   struct btrfs_path *path, int *level)
+				   struct btrfs_path *path,
+				   struct walk_control *wc)
 {
-	struct extent_buffer *next;
+	int level = wc->level;
-	struct extent_buffer *cur;
+	struct extent_buffer *eb = path->nodes[level];
-	struct extent_buffer *parent;
+	struct btrfs_key key;
-	u64 bytenr;
+	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
-	u64 ptr_gen;
-	u64 refs;
-	u64 flags;
-	u32 blocksize;
 	int ret;
 
-	cur = path->nodes[*level];
+	if (wc->stage == UPDATE_BACKREF &&
-	ret = btrfs_lookup_extent_info(trans, root, cur->start, cur->len,
+	    btrfs_header_owner(eb) != root->root_key.objectid)
-				       &refs, &flags);
+		return 1;
-	BUG_ON(ret);
-	if (refs > 1)
-		goto out;
 
-	BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
+	/*
+	 * when reference count of tree block is 1, it won't increase
+	 * again. once full backref flag is set, we never clear it.
+	 */
+	if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
+	    (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) {
+		BUG_ON(!path->locks[level]);
+		ret = btrfs_lookup_extent_info(trans, root,
+					       eb->start, eb->len,
+					       &wc->refs[level],
+					       &wc->flags[level]);
+		BUG_ON(ret);
+		BUG_ON(wc->refs[level] == 0);
+	}
 
-	while (*level >= 0) {
+	if (wc->stage == DROP_REFERENCE &&
-		cur = path->nodes[*level];
+	    wc->update_ref && wc->refs[level] > 1) {
-		if (*level == 0) {
+		BUG_ON(eb == root->node);
-			ret = btrfs_drop_leaf_ref(trans, root, cur);
+		BUG_ON(path->slots[level] > 0);
-			BUG_ON(ret);
+		if (level == 0)
-			clean_tree_block(trans, root, cur);
+			btrfs_item_key_to_cpu(eb, &key, path->slots[level]);
-			break;
+		else
-		}
+			btrfs_node_key_to_cpu(eb, &key, path->slots[level]);
-		if (path->slots[*level] >= btrfs_header_nritems(cur)) {
+		if (btrfs_header_owner(eb) == root->root_key.objectid &&
-			clean_tree_block(trans, root, cur);
+		    btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) {
-			break;
+			wc->stage = UPDATE_BACKREF;
+			wc->shared_level = level;
 		}
+	}
 
-		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
+	if (wc->stage == DROP_REFERENCE) {
-		blocksize = btrfs_level_size(root, *level - 1);
+		if (wc->refs[level] > 1)
-		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
+			return 1;
 
-		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
+		if (path->locks[level] && !wc->keep_locks) {
-		btrfs_tree_lock(next);
+			btrfs_tree_unlock(eb);
-		btrfs_set_lock_blocking(next);
+			path->locks[level] = 0;
+		}
+		return 0;
+	}
 
-		ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
+	/* wc->stage == UPDATE_BACKREF */
-					       &refs, &flags);
+	if (!(wc->flags[level] & flag)) {
+		BUG_ON(!path->locks[level]);
+		ret = btrfs_inc_ref(trans, root, eb, 1);
 		BUG_ON(ret);
-		if (refs > 1) {
+		ret = btrfs_dec_ref(trans, root, eb, 0);
-			parent = path->nodes[*level];
+		BUG_ON(ret);
-			ret = btrfs_free_extent(trans, root, bytenr,
+		ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
-						blocksize, parent->start,
+						  eb->len, flag, 0);
-						btrfs_header_owner(parent),
+		BUG_ON(ret);
-						*level - 1, 0);
+		wc->flags[level] |= flag;
+	}
+
+	/*
+	 * the block is shared by multiple trees, so it's not good to
+	 * keep the tree lock
+	 */
+	if (path->locks[level] && level > 0) {
+		btrfs_tree_unlock(eb);
+		path->locks[level] = 0;
+	}
+	return 0;
+}
+
+/*
+ * hepler to process tree block while walking up the tree.
+ *
+ * when wc->stage == DROP_REFERENCE, this function drops
+ * reference count on the block.
+ *
+ * when wc->stage == UPDATE_BACKREF, this function changes
+ * wc->stage back to DROP_REFERENCE if we changed wc->stage
+ * to UPDATE_BACKREF previously while processing the block.
+ *
+ * NOTE: return value 1 means we should stop walking up.
+ */
+static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 struct walk_control *wc)
+{
+	int ret = 0;
+	int level = wc->level;
+	struct extent_buffer *eb = path->nodes[level];
+	u64 parent = 0;
+
+	if (wc->stage == UPDATE_BACKREF) {
+		BUG_ON(wc->shared_level < level);
+		if (level < wc->shared_level)
+			goto out;
+
+		BUG_ON(wc->refs[level] <= 1);
+		ret = find_next_key(path, level + 1, &wc->update_progress);
+		if (ret > 0)
+			wc->update_ref = 0;
+
+		wc->stage = DROP_REFERENCE;
+		wc->shared_level = -1;
+		path->slots[level] = 0;
+
+		/*
+		 * check reference count again if the block isn't locked.
+		 * we should start walking down the tree again if reference
+		 * count is one.
+		 */
+		if (!path->locks[level]) {
+			BUG_ON(level == 0);
+			btrfs_tree_lock(eb);
+			btrfs_set_lock_blocking(eb);
+			path->locks[level] = 1;
+
+			ret = btrfs_lookup_extent_info(trans, root,
+						       eb->start, eb->len,
+						       &wc->refs[level],
+						       &wc->flags[level]);
 			BUG_ON(ret);
-		path->slots[*level]++;
+			BUG_ON(wc->refs[level] == 0);
-		btrfs_tree_unlock(next);
+			if (wc->refs[level] == 1) {
-		free_extent_buffer(next);
+				btrfs_tree_unlock(eb);
-		continue;
+				path->locks[level] = 0;
+				return 1;
+			}
+		} else {
+			BUG_ON(level != 0);
 		}
+	}
 
-	BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
+	/* wc->stage == DROP_REFERENCE */
+	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
 
-	*level = btrfs_header_level(next);
+	if (wc->refs[level] == 1) {
-	path->nodes[*level] = next;
+		if (level == 0) {
-	path->slots[*level] = 0;
+			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
-	path->locks[*level] = 1;
+				ret = btrfs_dec_ref(trans, root, eb, 1);
-	cond_resched();
+			else
+				ret = btrfs_dec_ref(trans, root, eb, 0);
+			BUG_ON(ret);
+		}
+		/* make block locked assertion in clean_tree_block happy */
+		if (!path->locks[level] &&
+		    btrfs_header_generation(eb) == trans->transid) {
+			btrfs_tree_lock(eb);
+			btrfs_set_lock_blocking(eb);
+			path->locks[level] = 1;
+		}
+		clean_tree_block(trans, root, eb);
+	}
+
+	if (eb == root->node) {
+		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+			parent = eb->start;
+		else
+			BUG_ON(root->root_key.objectid !=
+			       btrfs_header_owner(eb));
+	} else {
+		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+			parent = path->nodes[level + 1]->start;
+		else
+			BUG_ON(root->root_key.objectid !=
+			       btrfs_header_owner(path->nodes[level + 1]));
 	}
-out:
-	if (path->nodes[*level] == root->node)
-		parent = path->nodes[*level];
-	else
-		parent = path->nodes[*level + 1];
-	bytenr = path->nodes[*level]->start;
-	blocksize = path->nodes[*level]->len;
 
-	ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent->start,
+	ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent,
-				btrfs_header_owner(parent), *level, 0);
+				root->root_key.objectid, level, 0);
 	BUG_ON(ret);
+out:
+	wc->refs[level] = 0;
+	wc->flags[level] = 0;
+	return ret;
+}
+
+static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct btrfs_path *path,
+				   struct walk_control *wc)
+{
+	struct extent_buffer *next;
+	struct extent_buffer *cur;
+	u64 bytenr;
+	u64 ptr_gen;
+	u32 blocksize;
+	int level = wc->level;
+	int ret;
+
+	while (level >= 0) {
+		cur = path->nodes[level];
+		BUG_ON(path->slots[level] >= btrfs_header_nritems(cur));
 
-	if (path->locks[*level]) {
+		ret = walk_down_proc(trans, root, path, wc);
-		btrfs_tree_unlock(path->nodes[*level]);
+		if (ret > 0)
-		path->locks[*level] = 0;
+			break;
+
+		if (level == 0)
+			break;
+
+		bytenr = btrfs_node_blockptr(cur, path->slots[level]);
+		blocksize = btrfs_level_size(root, level - 1);
+		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]);
+
+		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
+		btrfs_tree_lock(next);
+		btrfs_set_lock_blocking(next);
+
+		level--;
+		BUG_ON(level != btrfs_header_level(next));
+		path->nodes[level] = next;
+		path->slots[level] = 0;
+		path->locks[level] = 1;
+		wc->level = level;
 	}
-	free_extent_buffer(path->nodes[*level]);
-	path->nodes[*level] = NULL;
-	*level += 1;
-	cond_resched();
 	return 0;
 }
 
-/*
- * helper for dropping snapshots.  This walks back up the tree in the path
- * to find the first node higher up where we haven't yet gone through
- * all the slots
- */
 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct btrfs_path *path,
-				 int *level, int max_level)
+				 struct walk_control *wc, int max_level)
 {
-	struct btrfs_root_item *root_item = &root->root_item;
+	int level = wc->level;
-	int i;
-	int slot;
 	int ret;
 
-	for (i = *level; i < max_level && path->nodes[i]; i++) {
+	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
-		slot = path->slots[i];
+	while (level < max_level && path->nodes[level]) {
-		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
+		wc->level = level;
-			/*
+		if (path->slots[level] + 1 <
-			 * there is more work to do in this level.
+		    btrfs_header_nritems(path->nodes[level])) {
-			 * Update the drop_progress marker to reflect
+			path->slots[level]++;
-			 * the work we've done so far, and then bump
-			 * the slot number
-			 */
-			path->slots[i]++;
-			WARN_ON(*level == 0);
-			if (max_level == BTRFS_MAX_LEVEL) {
-				btrfs_node_key(path->nodes[i],
-					       &root_item->drop_progress,
-					       path->slots[i]);
-				root_item->drop_level = i;
-			}
-			*level = i;
 			return 0;
 		} else {
-			struct extent_buffer *parent;
+			ret = walk_up_proc(trans, root, path, wc);
-
+			if (ret > 0)
-			/*
+				return 0;
-			 * this whole node is done, free our reference
-			 * on it and go up one level
-			 */
-			if (path->nodes[*level] == root->node)
-				parent = path->nodes[*level];
-			else
-				parent = path->nodes[*level + 1];
 
-			clean_tree_block(trans, root, path->nodes[i]);
+			if (path->locks[level]) {
-			ret = btrfs_free_extent(trans, root,
+				btrfs_tree_unlock(path->nodes[level]);
-						path->nodes[i]->start,
+				path->locks[level] = 0;
-						path->nodes[i]->len,
-						parent->start,
-						btrfs_header_owner(parent),
-						*level, 0);
-			BUG_ON(ret);
-			if (path->locks[*level]) {
-				btrfs_tree_unlock(path->nodes[i]);
-				path->locks[i] = 0;
 			}
-			free_extent_buffer(path->nodes[i]);
+			free_extent_buffer(path->nodes[level]);
-			path->nodes[i] = NULL;
+			path->nodes[level] = NULL;
-			*level = i + 1;
+			level++;
 		}
 	}
 	return 1;
 }
 
 /*
- * drop the reference count on the tree rooted at 'snap'.  This traverses
+ * drop a subvolume tree.
- * the tree freeing any blocks that have a ref count of zero after being
+ *
- * decremented.
+ * this function traverses the tree freeing any blocks that only
+ * referenced by the tree.
+ *
+ * when a shared tree block is found. this function decreases its
+ * reference count by one. if update_ref is true, this function
+ * also make sure backrefs for the shared block and all lower level
+ * blocks are properly updated.
  */
-int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
+int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
-			*root)
 {
-	int ret = 0;
-	int wret;
-	int level;
 	struct btrfs_path *path;
-	int update_count;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *tree_root = root->fs_info->tree_root;
 	struct btrfs_root_item *root_item = &root->root_item;
+	struct walk_control *wc;
+	struct btrfs_key key;
+	int err = 0;
+	int ret;
+	int level;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
-	level = btrfs_header_level(root->node);
+	wc = kzalloc(sizeof(*wc), GFP_NOFS);
+	BUG_ON(!wc);
+
+	trans = btrfs_start_transaction(tree_root, 1);
+
 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
+		level = btrfs_header_level(root->node);
 		path->nodes[level] = btrfs_lock_root_node(root);
 		btrfs_set_lock_blocking(path->nodes[level]);
 		path->slots[level] = 0;
 		path->locks[level] = 1;
+		memset(&wc->update_progress, 0,
+		       sizeof(wc->update_progress));
 	} else {
-		struct btrfs_key key;
-		struct btrfs_disk_key found_key;
-		struct extent_buffer *node;
-
 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
+		memcpy(&wc->update_progress, &key,
+		       sizeof(wc->update_progress));
+
 		level = root_item->drop_level;
+		BUG_ON(level == 0);
 		path->lowest_level = level;
-		wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-		if (wret < 0) {
+		path->lowest_level = 0;
-			ret = wret;
+		if (ret < 0) {
+			err = ret;
 			goto out;
 		}
-		node = path->nodes[level];
+		btrfs_node_key_to_cpu(path->nodes[level], &key,
-		btrfs_node_key(node, &found_key, path->slots[level]);
+				      path->slots[level]);
-		WARN_ON(memcmp(&found_key, &root_item->drop_progress,
+		WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key)));
-			       sizeof(found_key)));
+
 		/*
 		 * unlock our path, this is safe because only this
 		 * function is allowed to delete this snapshot
 		 */
 		btrfs_unlock_up_safe(path, 0);
+
+		level = btrfs_header_level(root->node);
+		while (1) {
+			btrfs_tree_lock(path->nodes[level]);
+			btrfs_set_lock_blocking(path->nodes[level]);
+
+			ret = btrfs_lookup_extent_info(trans, root,
+						path->nodes[level]->start,
+						path->nodes[level]->len,
+						&wc->refs[level],
+						&wc->flags[level]);
+			BUG_ON(ret);
+			BUG_ON(wc->refs[level] == 0);
+
+			if (level == root_item->drop_level)
+				break;
+
+			btrfs_tree_unlock(path->nodes[level]);
+			WARN_ON(wc->refs[level] != 1);
+			level--;
+		}
 	}
+
+	wc->level = level;
+	wc->shared_level = -1;
+	wc->stage = DROP_REFERENCE;
+	wc->update_ref = update_ref;
+	wc->keep_locks = 0;
+
 	while (1) {
-		unsigned long update;
+		ret = walk_down_tree(trans, root, path, wc);
-		wret = walk_down_tree(trans, root, path, &level);
+		if (ret < 0) {
-		if (wret > 0)
+			err = ret;
 			break;
-		if (wret < 0)
+		}
-			ret = wret;
 
-		wret = walk_up_tree(trans, root, path, &level,
+		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
-				    BTRFS_MAX_LEVEL);
+		if (ret < 0) {
-		if (wret > 0)
+			err = ret;
 			break;
-		if (wret < 0)
+		}
-			ret = wret;
+
-		if (trans->transaction->in_commit ||
+		if (ret > 0) {
-		    trans->transaction->delayed_refs.flushing) {
+			BUG_ON(wc->stage != DROP_REFERENCE);
-			ret = -EAGAIN;
 			break;
 		}
-		for (update_count = 0; update_count < 16; update_count++) {
+
+		if (wc->stage == DROP_REFERENCE) {
+			level = wc->level;
+			btrfs_node_key(path->nodes[level],
+				       &root_item->drop_progress,
+				       path->slots[level]);
+			root_item->drop_level = level;
+		}
+
+		BUG_ON(wc->level == 0);
+		if (trans->transaction->in_commit ||
+		    trans->transaction->delayed_refs.flushing) {
+			ret = btrfs_update_root(trans, tree_root,
+						&root->root_key,
+						root_item);
+			BUG_ON(ret);
+
+			btrfs_end_transaction(trans, tree_root);
+			trans = btrfs_start_transaction(tree_root, 1);
+		} else {
+			unsigned long update;
 			update = trans->delayed_ref_updates;
 			trans->delayed_ref_updates = 0;
 			if (update)
-				btrfs_run_delayed_refs(trans, root, update);
+				btrfs_run_delayed_refs(trans, tree_root,
+						       update);
-			else
-				break;
 		}
 	}
+	btrfs_release_path(root, path);
+	BUG_ON(err);
+
+	ret = btrfs_del_root(trans, tree_root, &root->root_key);
+	BUG_ON(ret);
+
+	free_extent_buffer(root->node);
+	free_extent_buffer(root->commit_root);
+	kfree(root);
 out:
+	btrfs_end_transaction(trans, tree_root);
+	kfree(wc);
 	btrfs_free_path(path);
-	return ret;
+	return err;
 }
 
+/*
+ * drop subtree rooted at tree block 'node'.
+ *
+ * NOTE: this function will unlock and release tree block 'node'
+ */
 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct extent_buffer *node,
 			struct extent_buffer *parent)
 {
 	struct btrfs_path *path;
+	struct walk_control *wc;
 	int level;
 	int parent_level;
 	int ret = 0;
 	int wret;
 
+	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
+	wc = kzalloc(sizeof(*wc), GFP_NOFS);
+	BUG_ON(!wc);
+
 	btrfs_assert_tree_locked(parent);
 	parent_level = btrfs_header_level(parent);
 	extent_buffer_get(parent);
@@ -4817,24 +5024,33 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 
 	btrfs_assert_tree_locked(node);
 	level = btrfs_header_level(node);
-	extent_buffer_get(node);
 	path->nodes[level] = node;
 	path->slots[level] = 0;
+	path->locks[level] = 1;
+
+	wc->refs[parent_level] = 1;
+	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+	wc->level = level;
+	wc->shared_level = -1;
+	wc->stage = DROP_REFERENCE;
+	wc->update_ref = 0;
+	wc->keep_locks = 1;
 
 	while (1) {
-		wret = walk_down_tree(trans, root, path, &level);
+		wret = walk_down_tree(trans, root, path, wc);
-		if (wret < 0)
+		if (wret < 0) {
 			ret = wret;
-		if (wret != 0)
 			break;
+		}
 
-		wret = walk_up_tree(trans, root, path, &level, parent_level);
+		wret = walk_up_tree(trans, root, path, wc, parent_level);
 		if (wret < 0)
 			ret = wret;
 		if (wret != 0)
 			break;
 	}
 
+	kfree(wc);
 	btrfs_free_path(path);
 	return ret;
 }
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 126477eaecf5..7c3cd248d8d6 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -151,7 +151,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	}
 	if (end_pos > isize) {
 		i_size_write(inode, end_pos);
-		btrfs_update_inode(trans, root, inode);
+		/* we've only changed i_size in ram, and we haven't updated
+		 * the disk i_size.  There is no need to log the inode
+		 * at this time.
+		 */
 	}
 	err = btrfs_end_transaction(trans, root);
 out_unlock:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index dbe1aabf96cd..7ffa3d34ea19 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3580,12 +3580,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		owner = 1;
 	BTRFS_I(inode)->block_group =
 			btrfs_find_block_group(root, 0, alloc_hint, owner);
-	if ((mode & S_IFREG)) {
-		if (btrfs_test_opt(root, NODATASUM))
-			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
-		if (btrfs_test_opt(root, NODATACOW))
-			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
-	}
 
 	key[0].objectid = objectid;
 	btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
@@ -3640,6 +3634,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 
 	btrfs_inherit_iflags(inode, dir);
 
+	if ((mode & S_IFREG)) {
+		if (btrfs_test_opt(root, NODATASUM))
+			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+		if (btrfs_test_opt(root, NODATACOW))
+			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
+	}
+
 	insert_inode_hash(inode);
 	inode_tree_add(inode);
 	return inode;
@@ -5082,6 +5083,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
 	struct extent_map *em;
 	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root;
 	int ret;
 
 	alloc_start = offset & ~mask;
@@ -5100,6 +5102,13 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		goto out;
 	}
 
+	root = BTRFS_I(inode)->root;
+
+	ret = btrfs_check_data_free_space(root, inode,
+					  alloc_end - alloc_start);
+	if (ret)
+		goto out;
+
 	locked_end = alloc_end - 1;
 	while (1) {
 		struct btrfs_ordered_extent *ordered;
@@ -5107,7 +5116,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
 		if (!trans) {
 			ret = -EIO;
-			goto out;
+			goto out_free;
 		}
 
 		/* the extent lock is ordered inside the running
@@ -5168,6 +5177,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		      GFP_NOFS);
 
 	btrfs_end_transaction(trans, BTRFS_I(inode)->root);
+out_free:
+	btrfs_free_reserved_data_space(root, inode, alloc_end - alloc_start);
 out:
 	mutex_unlock(&inode->i_mutex);
 	return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index eff18f5b5362..9f4db848db10 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1028,7 +1028,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 					    struct btrfs_file_extent_item);
 			comp = btrfs_file_extent_compression(leaf, extent);
 			type = btrfs_file_extent_type(leaf, extent);
-			if (type == BTRFS_FILE_EXTENT_REG) {
+			if (type == BTRFS_FILE_EXTENT_REG ||
+			    type == BTRFS_FILE_EXTENT_PREALLOC) {
 				disko = btrfs_file_extent_disk_bytenr(leaf,
 								      extent);
 				diskl = btrfs_file_extent_disk_num_bytes(leaf,
@@ -1051,7 +1052,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			new_key.objectid = inode->i_ino;
 			new_key.offset = key.offset + destoff - off;
 
-			if (type == BTRFS_FILE_EXTENT_REG) {
+			if (type == BTRFS_FILE_EXTENT_REG ||
+			    type == BTRFS_FILE_EXTENT_PREALLOC) {
 				ret = btrfs_insert_empty_item(trans, root, path,
 							      &new_key, size);
 				if (ret)
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b23dc209ae10..008397934778 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1788,7 +1788,7 @@ static void merge_func(struct btrfs_work *work)
 		btrfs_end_transaction(trans, root);
 	}
 
-	btrfs_drop_dead_root(reloc_root);
+	btrfs_drop_snapshot(reloc_root, 0);
 
 	if (atomic_dec_and_test(async->num_pending))
 		complete(async->done);
@@ -2075,9 +2075,6 @@ static int do_relocation(struct btrfs_trans_handle *trans,
 
 		ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
 		BUG_ON(ret);
-
-		btrfs_tree_unlock(eb);
-		free_extent_buffer(eb);
 	}
 	if (!lowest) {
 		btrfs_tree_unlock(upper->eb);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4e83457ea253..2dbf1c1f56ee 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -593,6 +593,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 	return 0;
 }
 
+#if 0
 /*
  * when dropping snapshots, we generate a ton of delayed refs, and it makes
  * sense not to join the transaction while it is trying to flush the current
@@ -681,6 +682,7 @@ int btrfs_drop_dead_root(struct btrfs_root *root)
 	btrfs_btree_balance_dirty(tree_root, nr);
 	return ret;
 }
+#endif
 
 /*
  * new snapshots need to be created at a very specific time in the
@@ -1081,7 +1083,7 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
 	while (!list_empty(&list)) {
 		root = list_entry(list.next, struct btrfs_root, root_list);
 		list_del_init(&root->root_list);
-		btrfs_drop_dead_root(root);
+		btrfs_drop_snapshot(root, 0);
 	}
 	return 0;
 }
diff --git a/fs/compat.c b/fs/compat.c
index cdd51a3a7c53..fbadb947727b 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1486,8 +1486,8 @@ int compat_do_execve(char * filename,
 	if (!bprm)
 		goto out_files;
 
-	retval = mutex_lock_interruptible(&current->cred_guard_mutex);
+	retval = -ERESTARTNOINTR;
-	if (retval < 0)
+	if (mutex_lock_interruptible(&current->cred_guard_mutex))
 		goto out_free;
 	current->in_execve = 1;
 
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 3f0e1974abdc..31d12de83a2a 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -14,35 +14,44 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/anon_inodes.h>
-#include <linux/eventfd.h>
 #include <linux/syscalls.h>
 #include <linux/module.h>
+#include <linux/kref.h>
+#include <linux/eventfd.h>
 
 struct eventfd_ctx {
+	struct kref kref;
 	wait_queue_head_t wqh;
 	/*
 	 * Every time that a write(2) is performed on an eventfd, the
 	 * value of the __u64 being written is added to "count" and a
 	 * wakeup is performed on "wqh". A read(2) will return the "count"
 	 * value to userspace, and will reset "count" to zero. The kernel
-	 * size eventfd_signal() also, adds to the "count" counter and
+	 * side eventfd_signal() also, adds to the "count" counter and
 	 * issue a wakeup.
 	 */
 	__u64 count;
 	unsigned int flags;
 };
 
-/*
+/**
- * Adds "n" to the eventfd counter "count". Returns "n" in case of
+ * eventfd_signal - Adds @n to the eventfd counter.
- * success, or a value lower then "n" in case of coutner overflow.
+ * @ctx: [in] Pointer to the eventfd context.
- * This function is supposed to be called by the kernel in paths
+ * @n: [in] Value of the counter to be added to the eventfd internal counter.
- * that do not allow sleeping. In this function we allow the counter
+ *          The value cannot be negative.
- * to reach the ULLONG_MAX value, and we signal this as overflow
+ *
- * condition by returining a POLLERR to poll(2).
+ * This function is supposed to be called by the kernel in paths that do not
+ * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
+ * value, and we signal this as overflow condition by returining a POLLERR
+ * to poll(2).
+ *
+ * Returns @n in case of success, a non-negative number lower than @n in case
+ * of overflow, or the following error codes:
+ *
+ * -EINVAL    : The value of @n is negative.
  */
-int eventfd_signal(struct file *file, int n)
+int eventfd_signal(struct eventfd_ctx *ctx, int n)
 {
-	struct eventfd_ctx *ctx = file->private_data;
 	unsigned long flags;
 
 	if (n < 0)
@@ -59,9 +68,45 @@ int eventfd_signal(struct file *file, int n)
59} 68}
60EXPORT_SYMBOL_GPL(eventfd_signal); 69EXPORT_SYMBOL_GPL(eventfd_signal);
61 70
71static void eventfd_free(struct kref *kref)
72{
73 struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
74
75 kfree(ctx);
76}
77
78/**
79 * eventfd_ctx_get - Acquires a reference to the internal eventfd context.
80 * @ctx: [in] Pointer to the eventfd context.
81 *
82 * Returns: In case of success, returns a pointer to the eventfd context.
83 */
84struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx)
85{
86 kref_get(&ctx->kref);
87 return ctx;
88}
89EXPORT_SYMBOL_GPL(eventfd_ctx_get);
90
91/**
92 * eventfd_ctx_put - Releases a reference to the internal eventfd context.
93 * @ctx: [in] Pointer to eventfd context.
94 *
95 * The eventfd context reference must have been previously acquired either
96 * with eventfd_ctx_get() or eventfd_ctx_fdget()).
97 */
98void eventfd_ctx_put(struct eventfd_ctx *ctx)
99{
100 kref_put(&ctx->kref, eventfd_free);
101}
102EXPORT_SYMBOL_GPL(eventfd_ctx_put);
103
62static int eventfd_release(struct inode *inode, struct file *file) 104static int eventfd_release(struct inode *inode, struct file *file)
63{ 105{
64 kfree(file->private_data); 106 struct eventfd_ctx *ctx = file->private_data;
107
108 wake_up_poll(&ctx->wqh, POLLHUP);
109 eventfd_ctx_put(ctx);
65 return 0; 110 return 0;
66} 111}
67 112
@@ -185,6 +230,16 @@ static const struct file_operations eventfd_fops = {
185 .write = eventfd_write, 230 .write = eventfd_write,
186}; 231};
187 232
233/**
234 * eventfd_fget - Acquire a reference of an eventfd file descriptor.
235 * @fd: [in] Eventfd file descriptor.
236 *
237 * Returns a pointer to the eventfd file structure in case of success, or the
238 * following error pointer:
239 *
240 * -EBADF : Invalid @fd file descriptor.
241 * -EINVAL : The @fd file descriptor is not an eventfd file.
242 */
188struct file *eventfd_fget(int fd) 243struct file *eventfd_fget(int fd)
189{ 244{
190 struct file *file; 245 struct file *file;
@@ -201,6 +256,48 @@ struct file *eventfd_fget(int fd)
201} 256}
202EXPORT_SYMBOL_GPL(eventfd_fget); 257EXPORT_SYMBOL_GPL(eventfd_fget);
203 258
259/**
260 * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
261 * @fd: [in] Eventfd file descriptor.
262 *
263 * Returns a pointer to the internal eventfd context, otherwise the error
264 * pointers returned by the following functions:
265 *
266 * eventfd_fget
267 */
268struct eventfd_ctx *eventfd_ctx_fdget(int fd)
269{
270 struct file *file;
271 struct eventfd_ctx *ctx;
272
273 file = eventfd_fget(fd);
274 if (IS_ERR(file))
275 return (struct eventfd_ctx *) file;
276 ctx = eventfd_ctx_get(file->private_data);
277 fput(file);
278
279 return ctx;
280}
281EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
282
283/**
284 * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
285 * @file: [in] Eventfd file pointer.
286 *
287 * Returns a pointer to the internal eventfd context, otherwise the error
288 * pointer:
289 *
290 * -EINVAL : The @fd file descriptor is not an eventfd file.
291 */
292struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
293{
294 if (file->f_op != &eventfd_fops)
295 return ERR_PTR(-EINVAL);
296
297 return eventfd_ctx_get(file->private_data);
298}
299EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
300
204SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) 301SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
205{ 302{
206 int fd; 303 int fd;
@@ -217,6 +314,7 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
217 if (!ctx) 314 if (!ctx)
218 return -ENOMEM; 315 return -ENOMEM;
219 316
317 kref_init(&ctx->kref);
220 init_waitqueue_head(&ctx->wqh); 318 init_waitqueue_head(&ctx->wqh);
221 ctx->count = count; 319 ctx->count = count;
222 ctx->flags = flags; 320 ctx->flags = flags;
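
The eventfd conversion above replaces the old file-based kernel interface with a kref-counted eventfd_ctx that in-kernel consumers pin directly. A minimal kernel-side sketch of the intended usage, built only from the functions introduced in this hunk (the surrounding my_device driver is hypothetical):

/* Hypothetical consumer of the new eventfd context API (sketch only). */
#include <linux/eventfd.h>
#include <linux/err.h>

struct my_device {
        struct eventfd_ctx *trigger;    /* held for the life of the device */
};

static int my_device_set_eventfd(struct my_device *dev, int fd)
{
        struct eventfd_ctx *ctx;

        ctx = eventfd_ctx_fdget(fd);    /* takes a ref on the ctx, not the file */
        if (IS_ERR(ctx))
                return PTR_ERR(ctx);
        dev->trigger = ctx;
        return 0;
}

static void my_device_complete(struct my_device *dev)
{
        /* usable in non-sleeping paths; the counter saturates at ULLONG_MAX */
        eventfd_signal(dev->trigger, 1);
}

static void my_device_teardown(struct my_device *dev)
{
        eventfd_ctx_put(dev->trigger);  /* drop the kref taken by eventfd_ctx_fdget() */
        dev->trigger = NULL;
}

Because the consumer holds a context reference instead of a struct file reference, the eventfd's f_count can drop to zero and eventfd_release() can deliver POLLHUP while the context itself stays alive until the last eventfd_ctx_put().
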
diff --git a/fs/exec.c b/fs/exec.c
index e639957d7a57..4a8849e45b21 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1277,8 +1277,8 @@ int do_execve(char * filename,
1277 if (!bprm) 1277 if (!bprm)
1278 goto out_files; 1278 goto out_files;
1279 1279
1280 retval = mutex_lock_interruptible(&current->cred_guard_mutex); 1280 retval = -ERESTARTNOINTR;
1281 if (retval < 0) 1281 if (mutex_lock_interruptible(&current->cred_guard_mutex))
1282 goto out_free; 1282 goto out_free;
1283 current->in_execve = 1; 1283 current->in_execve = 1;
1284 1284
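
Both execve paths now pre-load -ERESTARTNOINTR before taking cred_guard_mutex, so a signal that interrupts the lock acquisition restarts the syscall instead of surfacing the raw return value of mutex_lock_interruptible(). A minimal sketch of the same pattern in an invented helper (my_ctx and the lock name are hypothetical):

/* Hypothetical syscall body using the restart-on-signal pattern (sketch). */
#include <linux/errno.h>
#include <linux/mutex.h>

struct my_ctx {                         /* hypothetical */
        struct mutex guard_mutex;
};

static long my_restartable_op(struct my_ctx *ctx)
{
        long retval;

        retval = -ERESTARTNOINTR;       /* a pending signal restarts the call transparently */
        if (mutex_lock_interruptible(&ctx->guard_mutex))
                goto out;               /* interrupted: report restart, not -EINTR */

        retval = 0;                     /* ... the actual work would go here ... */
        mutex_unlock(&ctx->guard_mutex);
out:
        return retval;
}
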
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 6524ecaebb7a..e1dedb0f7873 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -66,8 +66,16 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str
66 inode = NULL; 66 inode = NULL;
67 if (ino) { 67 if (ino) {
68 inode = ext2_iget(dir->i_sb, ino); 68 inode = ext2_iget(dir->i_sb, ino);
69 if (IS_ERR(inode)) 69 if (unlikely(IS_ERR(inode))) {
70 return ERR_CAST(inode); 70 if (PTR_ERR(inode) == -ESTALE) {
71 ext2_error(dir->i_sb, __func__,
72 "deleted inode referenced: %lu",
73 ino);
74 return ERR_PTR(-EIO);
75 } else {
76 return ERR_CAST(inode);
77 }
78 }
71 } 79 }
72 return d_splice_alias(inode, dentry); 80 return d_splice_alias(inode, dentry);
73} 81}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8fed2ed12f38..f58ecbc416c8 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -849,6 +849,81 @@ err:
849 return err; 849 return err;
850} 850}
851 851
852static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
853 struct fuse_copy_state *cs)
854{
855 struct fuse_notify_inval_inode_out outarg;
856 int err = -EINVAL;
857
858 if (size != sizeof(outarg))
859 goto err;
860
861 err = fuse_copy_one(cs, &outarg, sizeof(outarg));
862 if (err)
863 goto err;
864 fuse_copy_finish(cs);
865
866 down_read(&fc->killsb);
867 err = -ENOENT;
868 if (!fc->sb)
869 goto err_unlock;
870
871 err = fuse_reverse_inval_inode(fc->sb, outarg.ino,
872 outarg.off, outarg.len);
873
874err_unlock:
875 up_read(&fc->killsb);
876 return err;
877
878err:
879 fuse_copy_finish(cs);
880 return err;
881}
882
883static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
884 struct fuse_copy_state *cs)
885{
886 struct fuse_notify_inval_entry_out outarg;
887 int err = -EINVAL;
888 char buf[FUSE_NAME_MAX+1];
889 struct qstr name;
890
891 if (size < sizeof(outarg))
892 goto err;
893
894 err = fuse_copy_one(cs, &outarg, sizeof(outarg));
895 if (err)
896 goto err;
897
898 err = -ENAMETOOLONG;
899 if (outarg.namelen > FUSE_NAME_MAX)
900 goto err;
901
902 name.name = buf;
903 name.len = outarg.namelen;
904 err = fuse_copy_one(cs, buf, outarg.namelen + 1);
905 if (err)
906 goto err;
907 fuse_copy_finish(cs);
908 buf[outarg.namelen] = 0;
909 name.hash = full_name_hash(name.name, name.len);
910
911 down_read(&fc->killsb);
912 err = -ENOENT;
913 if (!fc->sb)
914 goto err_unlock;
915
916 err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name);
917
918err_unlock:
919 up_read(&fc->killsb);
920 return err;
921
922err:
923 fuse_copy_finish(cs);
924 return err;
925}
926
852static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, 927static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
853 unsigned int size, struct fuse_copy_state *cs) 928 unsigned int size, struct fuse_copy_state *cs)
854{ 929{
@@ -856,6 +931,12 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
856 case FUSE_NOTIFY_POLL: 931 case FUSE_NOTIFY_POLL:
857 return fuse_notify_poll(fc, size, cs); 932 return fuse_notify_poll(fc, size, cs);
858 933
934 case FUSE_NOTIFY_INVAL_INODE:
935 return fuse_notify_inval_inode(fc, size, cs);
936
937 case FUSE_NOTIFY_INVAL_ENTRY:
938 return fuse_notify_inval_entry(fc, size, cs);
939
859 default: 940 default:
860 fuse_copy_finish(cs); 941 fuse_copy_finish(cs);
861 return -EINVAL; 942 return -EINVAL;
@@ -910,7 +991,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
910 unsigned long nr_segs, loff_t pos) 991 unsigned long nr_segs, loff_t pos)
911{ 992{
912 int err; 993 int err;
913 unsigned nbytes = iov_length(iov, nr_segs); 994 size_t nbytes = iov_length(iov, nr_segs);
914 struct fuse_req *req; 995 struct fuse_req *req;
915 struct fuse_out_header oh; 996 struct fuse_out_header oh;
916 struct fuse_copy_state cs; 997 struct fuse_copy_state cs;
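
fuse_notify() is reached from fuse_dev_write() when the daemon writes a header with a zero unique field, the notification code travelling in the error field, so the two new handlers above are driven entirely from userspace. A rough, untested userspace sketch of a daemon sending FUSE_NOTIFY_INVAL_INODE over its /dev/fuse descriptor; the struct layouts come from linux/fuse.h, while the helper itself is hypothetical:

/* Hypothetical daemon-side helper: ask the kernel to drop cached pages and
 * attributes for one inode (sketch; error handling elided). */
#include <stdint.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>
#include <linux/fuse.h>

static int notify_inval_inode(int fuse_dev_fd, uint64_t nodeid,
                              int64_t off, int64_t len)
{
        struct fuse_out_header out;
        struct fuse_notify_inval_inode_out arg;
        struct iovec iov[2];

        memset(&out, 0, sizeof(out));
        out.unique = 0;                         /* zero unique marks a notification */
        out.error  = FUSE_NOTIFY_INVAL_INODE;   /* notification code lives in 'error' */
        out.len    = sizeof(out) + sizeof(arg);

        memset(&arg, 0, sizeof(arg));
        arg.ino = nodeid;
        arg.off = off;
        arg.len = len;                          /* len <= 0: invalidate to end of file */

        iov[0].iov_base = &out;
        iov[0].iov_len  = sizeof(out);
        iov[1].iov_base = &arg;
        iov[1].iov_len  = sizeof(arg);

        return writev(fuse_dev_fd, iov, 2) < 0 ? -1 : 0;
}
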
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index b3089a083d30..e703654e7f40 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -375,7 +375,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
375 struct fuse_conn *fc = get_fuse_conn(dir); 375 struct fuse_conn *fc = get_fuse_conn(dir);
376 struct fuse_req *req; 376 struct fuse_req *req;
377 struct fuse_req *forget_req; 377 struct fuse_req *forget_req;
378 struct fuse_open_in inarg; 378 struct fuse_create_in inarg;
379 struct fuse_open_out outopen; 379 struct fuse_open_out outopen;
380 struct fuse_entry_out outentry; 380 struct fuse_entry_out outentry;
381 struct fuse_file *ff; 381 struct fuse_file *ff;
@@ -399,15 +399,20 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
399 if (!ff) 399 if (!ff)
400 goto out_put_request; 400 goto out_put_request;
401 401
402 if (!fc->dont_mask)
403 mode &= ~current_umask();
404
402 flags &= ~O_NOCTTY; 405 flags &= ~O_NOCTTY;
403 memset(&inarg, 0, sizeof(inarg)); 406 memset(&inarg, 0, sizeof(inarg));
404 memset(&outentry, 0, sizeof(outentry)); 407 memset(&outentry, 0, sizeof(outentry));
405 inarg.flags = flags; 408 inarg.flags = flags;
406 inarg.mode = mode; 409 inarg.mode = mode;
410 inarg.umask = current_umask();
407 req->in.h.opcode = FUSE_CREATE; 411 req->in.h.opcode = FUSE_CREATE;
408 req->in.h.nodeid = get_node_id(dir); 412 req->in.h.nodeid = get_node_id(dir);
409 req->in.numargs = 2; 413 req->in.numargs = 2;
410 req->in.args[0].size = sizeof(inarg); 414 req->in.args[0].size = fc->minor < 12 ? sizeof(struct fuse_open_in) :
415 sizeof(inarg);
411 req->in.args[0].value = &inarg; 416 req->in.args[0].value = &inarg;
412 req->in.args[1].size = entry->d_name.len + 1; 417 req->in.args[1].size = entry->d_name.len + 1;
413 req->in.args[1].value = entry->d_name.name; 418 req->in.args[1].value = entry->d_name.name;
@@ -546,12 +551,17 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode,
546 if (IS_ERR(req)) 551 if (IS_ERR(req))
547 return PTR_ERR(req); 552 return PTR_ERR(req);
548 553
554 if (!fc->dont_mask)
555 mode &= ~current_umask();
556
549 memset(&inarg, 0, sizeof(inarg)); 557 memset(&inarg, 0, sizeof(inarg));
550 inarg.mode = mode; 558 inarg.mode = mode;
551 inarg.rdev = new_encode_dev(rdev); 559 inarg.rdev = new_encode_dev(rdev);
560 inarg.umask = current_umask();
552 req->in.h.opcode = FUSE_MKNOD; 561 req->in.h.opcode = FUSE_MKNOD;
553 req->in.numargs = 2; 562 req->in.numargs = 2;
554 req->in.args[0].size = sizeof(inarg); 563 req->in.args[0].size = fc->minor < 12 ? FUSE_COMPAT_MKNOD_IN_SIZE :
564 sizeof(inarg);
555 req->in.args[0].value = &inarg; 565 req->in.args[0].value = &inarg;
556 req->in.args[1].size = entry->d_name.len + 1; 566 req->in.args[1].size = entry->d_name.len + 1;
557 req->in.args[1].value = entry->d_name.name; 567 req->in.args[1].value = entry->d_name.name;
@@ -578,8 +588,12 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode)
578 if (IS_ERR(req)) 588 if (IS_ERR(req))
579 return PTR_ERR(req); 589 return PTR_ERR(req);
580 590
591 if (!fc->dont_mask)
592 mode &= ~current_umask();
593
581 memset(&inarg, 0, sizeof(inarg)); 594 memset(&inarg, 0, sizeof(inarg));
582 inarg.mode = mode; 595 inarg.mode = mode;
596 inarg.umask = current_umask();
583 req->in.h.opcode = FUSE_MKDIR; 597 req->in.h.opcode = FUSE_MKDIR;
584 req->in.numargs = 2; 598 req->in.numargs = 2;
585 req->in.args[0].size = sizeof(inarg); 599 req->in.args[0].size = sizeof(inarg);
@@ -845,6 +859,43 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat,
845 return err; 859 return err;
846} 860}
847 861
862int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
863 struct qstr *name)
864{
865 int err = -ENOTDIR;
866 struct inode *parent;
867 struct dentry *dir;
868 struct dentry *entry;
869
870 parent = ilookup5(sb, parent_nodeid, fuse_inode_eq, &parent_nodeid);
871 if (!parent)
872 return -ENOENT;
873
874 mutex_lock(&parent->i_mutex);
875 if (!S_ISDIR(parent->i_mode))
876 goto unlock;
877
878 err = -ENOENT;
879 dir = d_find_alias(parent);
880 if (!dir)
881 goto unlock;
882
883 entry = d_lookup(dir, name);
884 dput(dir);
885 if (!entry)
886 goto unlock;
887
888 fuse_invalidate_attr(parent);
889 fuse_invalidate_entry(entry);
890 dput(entry);
891 err = 0;
892
893 unlock:
894 mutex_unlock(&parent->i_mutex);
895 iput(parent);
896 return err;
897}
898
848/* 899/*
849 * Calling into a user-controlled filesystem gives the filesystem 900 * Calling into a user-controlled filesystem gives the filesystem
850 * daemon ptrace-like capabilities over the requester process. This 901 * daemon ptrace-like capabilities over the requester process. This
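
When FUSE_DONT_MASK has been negotiated, the kernel stops stripping umask bits from the requested mode and instead forwards the caller's umask in the new inarg.umask field, letting the filesystem decide for itself, e.g. when default POSIX ACLs should take precedence. A hedged daemon-side sketch of that decision (the handler shape is hypothetical; only the mode/umask relationship comes from these hunks):

/* Hypothetical daemon-side mode computation (sketch). */
#include <sys/stat.h>

static mode_t effective_mode(mode_t mode, mode_t umask_bits, int have_default_acl)
{
        /*
         * With FUSE_DONT_MASK negotiated, the kernel passes the raw mode
         * plus the caller's umask.  Without a default ACL the classic rule
         * applies; with one, the ACL logic decides instead.
         */
        if (!have_default_acl)
                mode &= ~umask_bits;
        return mode;
}
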
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index fce6ce694fde..cbc464043b6f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1922,7 +1922,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait)
1922 1922
1923 req = fuse_get_req(fc); 1923 req = fuse_get_req(fc);
1924 if (IS_ERR(req)) 1924 if (IS_ERR(req))
1925 return PTR_ERR(req); 1925 return POLLERR;
1926 1926
1927 req->in.h.opcode = FUSE_POLL; 1927 req->in.h.opcode = FUSE_POLL;
1928 req->in.h.nodeid = ff->nodeid; 1928 req->in.h.nodeid = ff->nodeid;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index aaf2f9ff970e..52b641fc0faf 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -446,6 +446,9 @@ struct fuse_conn {
446 /** Do multi-page cached writes */ 446 /** Do multi-page cached writes */
447 unsigned big_writes:1; 447 unsigned big_writes:1;
448 448
449 /** Don't apply umask to creation modes */
450 unsigned dont_mask:1;
451
449 /** The number of requests waiting for completion */ 452 /** The number of requests waiting for completion */
450 atomic_t num_waiting; 453 atomic_t num_waiting;
451 454
@@ -481,6 +484,12 @@ struct fuse_conn {
481 484
482 /** Called on final put */ 485 /** Called on final put */
483 void (*release)(struct fuse_conn *); 486 void (*release)(struct fuse_conn *);
487
488 /** Super block for this connection. */
489 struct super_block *sb;
490
491 /** Read/write semaphore to hold when accessing sb. */
492 struct rw_semaphore killsb;
484}; 493};
485 494
486static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) 495static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -509,6 +518,11 @@ extern const struct file_operations fuse_dev_operations;
509extern const struct dentry_operations fuse_dentry_operations; 518extern const struct dentry_operations fuse_dentry_operations;
510 519
511/** 520/**
521 * Inode to nodeid comparison.
522 */
523int fuse_inode_eq(struct inode *inode, void *_nodeidp);
524
525/**
512 * Get a filled in inode 526 * Get a filled in inode
513 */ 527 */
514struct inode *fuse_iget(struct super_block *sb, u64 nodeid, 528struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
@@ -708,6 +722,19 @@ void fuse_release_nowrite(struct inode *inode);
708 722
709u64 fuse_get_attr_version(struct fuse_conn *fc); 723u64 fuse_get_attr_version(struct fuse_conn *fc);
710 724
725/**
726 * File-system tells the kernel to invalidate cache for the given node id.
727 */
728int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
729 loff_t offset, loff_t len);
730
731/**
732 * File-system tells the kernel to invalidate parent attributes and
733 * the dentry matching parent/name.
734 */
735int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
736 struct qstr *name);
737
711int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, 738int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
712 bool isdir); 739 bool isdir);
713ssize_t fuse_direct_io(struct file *file, const char __user *buf, 740ssize_t fuse_direct_io(struct file *file, const char __user *buf,
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index d8673ccf90b7..f91ccc4a189d 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -206,7 +206,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
206 BUG(); 206 BUG();
207} 207}
208 208
209static int fuse_inode_eq(struct inode *inode, void *_nodeidp) 209int fuse_inode_eq(struct inode *inode, void *_nodeidp)
210{ 210{
211 u64 nodeid = *(u64 *) _nodeidp; 211 u64 nodeid = *(u64 *) _nodeidp;
212 if (get_node_id(inode) == nodeid) 212 if (get_node_id(inode) == nodeid)
@@ -257,6 +257,31 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
257 return inode; 257 return inode;
258} 258}
259 259
260int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
261 loff_t offset, loff_t len)
262{
263 struct inode *inode;
264 pgoff_t pg_start;
265 pgoff_t pg_end;
266
267 inode = ilookup5(sb, nodeid, fuse_inode_eq, &nodeid);
268 if (!inode)
269 return -ENOENT;
270
271 fuse_invalidate_attr(inode);
272 if (offset >= 0) {
273 pg_start = offset >> PAGE_CACHE_SHIFT;
274 if (len <= 0)
275 pg_end = -1;
276 else
277 pg_end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
278 invalidate_inode_pages2_range(inode->i_mapping,
279 pg_start, pg_end);
280 }
281 iput(inode);
282 return 0;
283}
284
260static void fuse_umount_begin(struct super_block *sb) 285static void fuse_umount_begin(struct super_block *sb)
261{ 286{
262 fuse_abort_conn(get_fuse_conn_super(sb)); 287 fuse_abort_conn(get_fuse_conn_super(sb));
@@ -480,6 +505,7 @@ void fuse_conn_init(struct fuse_conn *fc)
480 memset(fc, 0, sizeof(*fc)); 505 memset(fc, 0, sizeof(*fc));
481 spin_lock_init(&fc->lock); 506 spin_lock_init(&fc->lock);
482 mutex_init(&fc->inst_mutex); 507 mutex_init(&fc->inst_mutex);
508 init_rwsem(&fc->killsb);
483 atomic_set(&fc->count, 1); 509 atomic_set(&fc->count, 1);
484 init_waitqueue_head(&fc->waitq); 510 init_waitqueue_head(&fc->waitq);
485 init_waitqueue_head(&fc->blocked_waitq); 511 init_waitqueue_head(&fc->blocked_waitq);
@@ -725,6 +751,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
725 } 751 }
726 if (arg->flags & FUSE_BIG_WRITES) 752 if (arg->flags & FUSE_BIG_WRITES)
727 fc->big_writes = 1; 753 fc->big_writes = 1;
754 if (arg->flags & FUSE_DONT_MASK)
755 fc->dont_mask = 1;
728 } else { 756 } else {
729 ra_pages = fc->max_read / PAGE_CACHE_SIZE; 757 ra_pages = fc->max_read / PAGE_CACHE_SIZE;
730 fc->no_lock = 1; 758 fc->no_lock = 1;
@@ -748,7 +776,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
748 arg->minor = FUSE_KERNEL_MINOR_VERSION; 776 arg->minor = FUSE_KERNEL_MINOR_VERSION;
749 arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; 777 arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
750 arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | 778 arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
751 FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES; 779 FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK;
752 req->in.h.opcode = FUSE_INIT; 780 req->in.h.opcode = FUSE_INIT;
753 req->in.numargs = 1; 781 req->in.numargs = 1;
754 req->in.args[0].size = sizeof(*arg); 782 req->in.args[0].size = sizeof(*arg);
@@ -860,10 +888,16 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
860 fuse_conn_init(fc); 888 fuse_conn_init(fc);
861 889
862 fc->dev = sb->s_dev; 890 fc->dev = sb->s_dev;
891 fc->sb = sb;
863 err = fuse_bdi_init(fc, sb); 892 err = fuse_bdi_init(fc, sb);
864 if (err) 893 if (err)
865 goto err_put_conn; 894 goto err_put_conn;
866 895
896 /* Handle umasking inside the fuse code */
897 if (sb->s_flags & MS_POSIXACL)
898 fc->dont_mask = 1;
899 sb->s_flags |= MS_POSIXACL;
900
867 fc->release = fuse_free_conn; 901 fc->release = fuse_free_conn;
868 fc->flags = d.flags; 902 fc->flags = d.flags;
869 fc->user_id = d.user_id; 903 fc->user_id = d.user_id;
@@ -941,12 +975,25 @@ static int fuse_get_sb(struct file_system_type *fs_type,
941 return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt); 975 return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt);
942} 976}
943 977
978static void fuse_kill_sb_anon(struct super_block *sb)
979{
980 struct fuse_conn *fc = get_fuse_conn_super(sb);
981
982 if (fc) {
983 down_write(&fc->killsb);
984 fc->sb = NULL;
985 up_write(&fc->killsb);
986 }
987
988 kill_anon_super(sb);
989}
990
944static struct file_system_type fuse_fs_type = { 991static struct file_system_type fuse_fs_type = {
945 .owner = THIS_MODULE, 992 .owner = THIS_MODULE,
946 .name = "fuse", 993 .name = "fuse",
947 .fs_flags = FS_HAS_SUBTYPE, 994 .fs_flags = FS_HAS_SUBTYPE,
948 .get_sb = fuse_get_sb, 995 .get_sb = fuse_get_sb,
949 .kill_sb = kill_anon_super, 996 .kill_sb = fuse_kill_sb_anon,
950}; 997};
951 998
952#ifdef CONFIG_BLOCK 999#ifdef CONFIG_BLOCK
@@ -958,11 +1005,24 @@ static int fuse_get_sb_blk(struct file_system_type *fs_type,
958 mnt); 1005 mnt);
959} 1006}
960 1007
1008static void fuse_kill_sb_blk(struct super_block *sb)
1009{
1010 struct fuse_conn *fc = get_fuse_conn_super(sb);
1011
1012 if (fc) {
1013 down_write(&fc->killsb);
1014 fc->sb = NULL;
1015 up_write(&fc->killsb);
1016 }
1017
1018 kill_block_super(sb);
1019}
1020
961static struct file_system_type fuseblk_fs_type = { 1021static struct file_system_type fuseblk_fs_type = {
962 .owner = THIS_MODULE, 1022 .owner = THIS_MODULE,
963 .name = "fuseblk", 1023 .name = "fuseblk",
964 .get_sb = fuse_get_sb_blk, 1024 .get_sb = fuse_get_sb_blk,
965 .kill_sb = kill_block_super, 1025 .kill_sb = fuse_kill_sb_blk,
966 .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, 1026 .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
967}; 1027};
968 1028
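
FUSE_DONT_MASK is advertised by the kernel in the INIT request and only takes effect if the daemon echoes it back, which process_init_reply() above checks for. A speculative sketch of the daemon side of that negotiation (field usage assumed from the protocol headers, not shown in this patch; *out is assumed zero-initialised by the caller):

/* Hypothetical FUSE_INIT reply construction in a daemon (sketch). */
#include <linux/fuse.h>

static void fill_init_reply(const struct fuse_init_in *in,
                            struct fuse_init_out *out)
{
        out->major = FUSE_KERNEL_VERSION;
        out->minor = FUSE_KERNEL_MINOR_VERSION;

        /* only opt in to umask handling if the kernel offered it */
        if (in->flags & FUSE_DONT_MASK)
                out->flags |= FUSE_DONT_MASK;
}
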
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index fe02ad4740e7..032604e5ef2c 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -972,6 +972,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
972 sb->s_blocksize_bits = 10; 972 sb->s_blocksize_bits = 10;
973 sb->s_magic = HOSTFS_SUPER_MAGIC; 973 sb->s_magic = HOSTFS_SUPER_MAGIC;
974 sb->s_op = &hostfs_sbops; 974 sb->s_op = &hostfs_sbops;
975 sb->s_maxbytes = MAX_LFS_FILESIZE;
975 976
976 /* NULL is printed as <NULL> by sprintf: avoid that. */ 977 /* NULL is printed as <NULL> by sprintf: avoid that. */
977 if (req_root == NULL) 978 if (req_root == NULL)
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 7515e73e2bfb..696686cc206e 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -130,9 +130,9 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
130 if (jffs2_sum_active()) { 130 if (jffs2_sum_active()) {
131 s = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL); 131 s = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL);
132 if (!s) { 132 if (!s) {
133 kfree(flashbuf);
134 JFFS2_WARNING("Can't allocate memory for summary\n"); 133 JFFS2_WARNING("Can't allocate memory for summary\n");
135 return -ENOMEM; 134 ret = -ENOMEM;
135 goto out;
136 } 136 }
137 } 137 }
138 138
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 4145083dcf88..23341c1063bc 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -678,7 +678,6 @@ __be32
678nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, 678nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
679 int access, struct file **filp) 679 int access, struct file **filp)
680{ 680{
681 const struct cred *cred = current_cred();
682 struct dentry *dentry; 681 struct dentry *dentry;
683 struct inode *inode; 682 struct inode *inode;
684 int flags = O_RDONLY|O_LARGEFILE; 683 int flags = O_RDONLY|O_LARGEFILE;
@@ -733,7 +732,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
733 vfs_dq_init(inode); 732 vfs_dq_init(inode);
734 } 733 }
735 *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt), 734 *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt),
736 flags, cred); 735 flags, current_cred());
737 if (IS_ERR(*filp)) 736 if (IS_ERR(*filp))
738 host_err = PTR_ERR(*filp); 737 host_err = PTR_ERR(*filp);
739 else 738 else
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index ff231ad23895..ff27a2965844 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -296,12 +296,15 @@ static int inotify_fasync(int fd, struct file *file, int on)
296static int inotify_release(struct inode *ignored, struct file *file) 296static int inotify_release(struct inode *ignored, struct file *file)
297{ 297{
298 struct fsnotify_group *group = file->private_data; 298 struct fsnotify_group *group = file->private_data;
299 struct user_struct *user = group->inotify_data.user;
299 300
300 fsnotify_clear_marks_by_group(group); 301 fsnotify_clear_marks_by_group(group);
301 302
302 /* free this group, matching get was inotify_init->fsnotify_obtain_group */ 303 /* free this group, matching get was inotify_init->fsnotify_obtain_group */
303 fsnotify_put_group(group); 304 fsnotify_put_group(group);
304 305
306 atomic_dec(&user->inotify_devs);
307
305 return 0; 308 return 0;
306} 309}
307 310
diff --git a/fs/sync.c b/fs/sync.c
index dd200025af85..3422ba61d86d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -112,8 +112,13 @@ restart:
112 mutex_unlock(&mutex); 112 mutex_unlock(&mutex);
113} 113}
114 114
115/*
116 * sync everything. Start out by waking pdflush, because that writes back
117 * all queues in parallel.
118 */
115SYSCALL_DEFINE0(sync) 119SYSCALL_DEFINE0(sync)
116{ 120{
121 wakeup_pdflush(0);
117 sync_filesystems(0); 122 sync_filesystems(0);
118 sync_filesystems(1); 123 sync_filesystems(1);
119 if (unlikely(laptop_mode)) 124 if (unlikely(laptop_mode))