Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/v9fs.c | 30
-rw-r--r--  fs/9p/vfs_inode.c | 8
-rw-r--r--  fs/autofs4/root.c | 6
-rw-r--r--  fs/binfmt_elf.c | 1
-rw-r--r--  fs/binfmt_elf_fdpic.c | 1
-rw-r--r--  fs/bio-integrity.c | 44
-rw-r--r--  fs/bio.c | 231
-rw-r--r--  fs/block_dev.c | 68
-rw-r--r--  fs/btrfs/backref.c | 299
-rw-r--r--  fs/btrfs/backref.h | 10
-rw-r--r--  fs/btrfs/btrfs_inode.h | 15
-rw-r--r--  fs/btrfs/check-integrity.c | 16
-rw-r--r--  fs/btrfs/compression.c | 13
-rw-r--r--  fs/btrfs/ctree.c | 148
-rw-r--r--  fs/btrfs/ctree.h | 109
-rw-r--r--  fs/btrfs/delayed-inode.c | 6
-rw-r--r--  fs/btrfs/disk-io.c | 230
-rw-r--r--  fs/btrfs/disk-io.h | 2
-rw-r--r--  fs/btrfs/extent-tree.c | 376
-rw-r--r--  fs/btrfs/extent_io.c | 128
-rw-r--r--  fs/btrfs/extent_io.h | 23
-rw-r--r--  fs/btrfs/extent_map.c | 55
-rw-r--r--  fs/btrfs/extent_map.h | 8
-rw-r--r--  fs/btrfs/file-item.c | 5
-rw-r--r--  fs/btrfs/file.c | 447
-rw-r--r--  fs/btrfs/free-space-cache.c | 10
-rw-r--r--  fs/btrfs/hash.h | 10
-rw-r--r--  fs/btrfs/inode-item.c | 285
-rw-r--r--  fs/btrfs/inode.c | 386
-rw-r--r--  fs/btrfs/ioctl.c | 102
-rw-r--r--  fs/btrfs/ordered-data.c | 97
-rw-r--r--  fs/btrfs/ordered-data.h | 12
-rw-r--r--  fs/btrfs/qgroup.c | 40
-rw-r--r--  fs/btrfs/relocation.c | 11
-rw-r--r--  fs/btrfs/root-tree.c | 29
-rw-r--r--  fs/btrfs/scrub.c | 30
-rw-r--r--  fs/btrfs/send.c | 915
-rw-r--r--  fs/btrfs/send.h | 1
-rw-r--r--  fs/btrfs/super.c | 74
-rw-r--r--  fs/btrfs/transaction.c | 283
-rw-r--r--  fs/btrfs/transaction.h | 20
-rw-r--r--  fs/btrfs/tree-log.c | 889
-rw-r--r--  fs/btrfs/ulist.c | 7
-rw-r--r--  fs/btrfs/ulist.h | 9
-rw-r--r--  fs/btrfs/volumes.c | 73
-rw-r--r--  fs/btrfs/zlib.c | 8
-rw-r--r--  fs/ceph/export.c | 18
-rw-r--r--  fs/cifs/cifs_spnego.c | 6
-rw-r--r--  fs/cifs/cifs_unicode.c | 22
-rw-r--r--  fs/cifs/cifsacl.c | 8
-rw-r--r--  fs/cifs/connect.c | 9
-rw-r--r--  fs/cifs/transport.c | 6
-rw-r--r--  fs/compat.c | 12
-rw-r--r--  fs/coredump.c | 5
-rw-r--r--  fs/exec.c | 67
-rw-r--r--  fs/exofs/ore.c | 5
-rw-r--r--  fs/exofs/super.c | 4
-rw-r--r--  fs/ext3/super.c | 6
-rw-r--r--  fs/fat/dir.c | 4
-rw-r--r--  fs/fat/fat.h | 5
-rw-r--r--  fs/fat/inode.c | 5
-rw-r--r--  fs/fat/namei_msdos.c | 26
-rw-r--r--  fs/fat/namei_vfat.c | 30
-rw-r--r--  fs/file.c | 3
-rw-r--r--  fs/file_table.c | 2
-rw-r--r--  fs/filesystems.c | 4
-rw-r--r--  fs/fs-writeback.c | 2
-rw-r--r--  fs/gfs2/export.c | 4
-rw-r--r--  fs/hostfs/hostfs.h | 2
-rw-r--r--  fs/hostfs/hostfs_kern.c | 12
-rw-r--r--  fs/hostfs/hostfs_user.c | 1
-rw-r--r--  fs/hpfs/super.c | 3
-rw-r--r--  fs/hppfs/hppfs.c | 4
-rw-r--r--  fs/internal.h | 4
-rw-r--r--  fs/isofs/export.c | 2
-rw-r--r--  fs/jffs2/super.c | 4
-rw-r--r--  fs/jffs2/wbuf.c | 8
-rw-r--r--  fs/lockd/mon.c | 86
-rw-r--r--  fs/lockd/netns.h | 4
-rw-r--r--  fs/lockd/svc.c | 18
-rw-r--r--  fs/locks.c | 6
-rw-r--r--  fs/namei.c | 216
-rw-r--r--  fs/namespace.c | 16
-rw-r--r--  fs/nfs/Kconfig | 4
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 306
-rw-r--r--  fs/nfs/blocklayout/blocklayout.h | 2
-rw-r--r--  fs/nfs/blocklayout/blocklayoutdev.c | 25
-rw-r--r--  fs/nfs/blocklayout/extents.c | 3
-rw-r--r--  fs/nfs/callback.c | 337
-rw-r--r--  fs/nfs/callback.h | 3
-rw-r--r--  fs/nfs/callback_proc.c | 31
-rw-r--r--  fs/nfs/client.c | 23
-rw-r--r--  fs/nfs/dir.c | 16
-rw-r--r--  fs/nfs/direct.c | 32
-rw-r--r--  fs/nfs/file.c | 41
-rw-r--r--  fs/nfs/getroot.c | 2
-rw-r--r--  fs/nfs/idmap.c | 114
-rw-r--r--  fs/nfs/inode.c | 10
-rw-r--r--  fs/nfs/internal.h | 15
-rw-r--r--  fs/nfs/netns.h | 4
-rw-r--r--  fs/nfs/nfs4_fs.h | 19
-rw-r--r--  fs/nfs/nfs4client.c | 256
-rw-r--r--  fs/nfs/nfs4file.c | 29
-rw-r--r--  fs/nfs/nfs4filelayout.c | 41
-rw-r--r--  fs/nfs/nfs4filelayout.h | 16
-rw-r--r--  fs/nfs/nfs4filelayoutdev.c | 17
-rw-r--r--  fs/nfs/nfs4namespace.c | 16
-rw-r--r--  fs/nfs/nfs4proc.c | 342
-rw-r--r--  fs/nfs/nfs4state.c | 228
-rw-r--r--  fs/nfs/nfs4sysctl.c | 1
-rw-r--r--  fs/nfs/nfs4xdr.c | 31
-rw-r--r--  fs/nfs/objlayout/objio_osd.c | 9
-rw-r--r--  fs/nfs/pagelist.c | 12
-rw-r--r--  fs/nfs/pnfs.c | 417
-rw-r--r--  fs/nfs/pnfs.h | 57
-rw-r--r--  fs/nfs/pnfs_dev.c | 27
-rw-r--r--  fs/nfs/super.c | 31
-rw-r--r--  fs/nfs/write.c | 11
-rw-r--r--  fs/nfsd/nfs2acl.c | 3
-rw-r--r--  fs/nfsd/nfs3proc.c | 2
-rw-r--r--  fs/nfsd/nfs4callback.c | 1
-rw-r--r--  fs/nfsd/nfs4idmap.c | 4
-rw-r--r--  fs/nfsd/nfs4proc.c | 6
-rw-r--r--  fs/nfsd/nfs4state.c | 351
-rw-r--r--  fs/nfsd/nfs4xdr.c | 2
-rw-r--r--  fs/nfsd/nfsctl.c | 84
-rw-r--r--  fs/nfsd/nfsd.h | 4
-rw-r--r--  fs/nfsd/nfssvc.c | 26
-rw-r--r--  fs/nfsd/state.h | 8
-rw-r--r--  fs/nfsd/vfs.c | 2
-rw-r--r--  fs/open.c | 29
-rw-r--r--  fs/proc/base.c | 5
-rw-r--r--  fs/proc/task_mmu.c | 7
-rw-r--r--  fs/quota/quota.c | 4
-rw-r--r--  fs/reiserfs/inode.c | 6
-rw-r--r--  fs/super.c | 23
-rw-r--r--  fs/sysv/balloc.c | 18
-rw-r--r--  fs/sysv/ialloc.c | 14
-rw-r--r--  fs/sysv/inode.c | 4
-rw-r--r--  fs/sysv/super.c | 1
-rw-r--r--  fs/sysv/sysv.h | 1
-rw-r--r--  fs/ufs/balloc.c | 30
-rw-r--r--  fs/ufs/ialloc.c | 16
-rw-r--r--  fs/ufs/super.c | 21
-rw-r--r--  fs/ufs/ufs.h | 1
-rw-r--r--  fs/xattr.c | 8
-rw-r--r--  fs/xattr_acl.c | 2
-rw-r--r--  fs/xfs/xfs_export.c | 3
148 files changed, 6376 insertions(+), 3047 deletions(-)
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 392c5dac1981..d934f04e7736 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -184,10 +184,20 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 			v9ses->afid = option;
 			break;
 		case Opt_uname:
-			match_strlcpy(v9ses->uname, &args[0], PATH_MAX);
+			kfree(v9ses->uname);
+			v9ses->uname = match_strdup(&args[0]);
+			if (!v9ses->uname) {
+				ret = -ENOMEM;
+				goto free_and_return;
+			}
 			break;
 		case Opt_remotename:
-			match_strlcpy(v9ses->aname, &args[0], PATH_MAX);
+			kfree(v9ses->aname);
+			v9ses->aname = match_strdup(&args[0]);
+			if (!v9ses->aname) {
+				ret = -ENOMEM;
+				goto free_and_return;
+			}
 			break;
 		case Opt_nodevmap:
 			v9ses->nodev = 1;
@@ -287,21 +297,21 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	struct p9_fid *fid;
 	int rc;
 
-	v9ses->uname = __getname();
+	v9ses->uname = kstrdup(V9FS_DEFUSER, GFP_KERNEL);
 	if (!v9ses->uname)
 		return ERR_PTR(-ENOMEM);
 
-	v9ses->aname = __getname();
+	v9ses->aname = kstrdup(V9FS_DEFANAME, GFP_KERNEL);
 	if (!v9ses->aname) {
-		__putname(v9ses->uname);
+		kfree(v9ses->uname);
 		return ERR_PTR(-ENOMEM);
 	}
 	init_rwsem(&v9ses->rename_sem);
 
 	rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY);
 	if (rc) {
-		__putname(v9ses->aname);
-		__putname(v9ses->uname);
+		kfree(v9ses->aname);
+		kfree(v9ses->uname);
 		return ERR_PTR(rc);
 	}
 
@@ -309,8 +319,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	list_add(&v9ses->slist, &v9fs_sessionlist);
 	spin_unlock(&v9fs_sessionlist_lock);
 
-	strcpy(v9ses->uname, V9FS_DEFUSER);
-	strcpy(v9ses->aname, V9FS_DEFANAME);
 	v9ses->uid = ~0;
 	v9ses->dfltuid = V9FS_DEFUID;
 	v9ses->dfltgid = V9FS_DEFGID;
@@ -412,8 +420,8 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
 		kfree(v9ses->cachetag);
 	}
 #endif
-	__putname(v9ses->uname);
-	__putname(v9ses->aname);
+	kfree(v9ses->uname);
+	kfree(v9ses->aname);
 
 	bdi_destroy(&v9ses->bdi);
 
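The v9fs change above swaps fixed PATH_MAX buffers (__getname/__putname) for heap strings, so every option assignment must free the previous value and cope with allocation failure. A minimal userspace sketch of the same pattern (names here are illustrative, not the 9p API):

    #include <errno.h>
    #include <stdlib.h>
    #include <string.h>

    /* Free the old value, duplicate the new one, report -ENOMEM on
     * failure -- the same ordering the patched option parser uses.
     * A NULL field after failure is safe: free(NULL) is a no-op,
     * just as kfree(NULL) is on the session-close path. */
    static int set_session_string(char **field, const char *value)
    {
        free(*field);               /* like kfree() on the default */
        *field = strdup(value);     /* like match_strdup()/kstrdup() */
        if (!*field)
            return -ENOMEM;         /* caller unwinds, as free_and_return does */
        return 0;
    }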
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index cbf9dbb1b2a2..890bed538f9b 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1276,12 +1276,12 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
 	}
 
 	/* copy extension buffer into buffer */
-	strncpy(buffer, st->extension, buflen);
+	retval = min(strlen(st->extension)+1, (size_t)buflen);
+	memcpy(buffer, st->extension, retval);
 
-	p9_debug(P9_DEBUG_VFS, "%s -> %s (%s)\n",
-		 dentry->d_name.name, st->extension, buffer);
+	p9_debug(P9_DEBUG_VFS, "%s -> %s (%.*s)\n",
+		 dentry->d_name.name, st->extension, buflen, buffer);
 
-	retval = strnlen(buffer, buflen);
 done:
 	p9stat_free(st);
 	kfree(st);
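The readlink fix replaces strncpy() — which neither NUL-terminates on truncation nor reports a length — with an explicit bounded copy. A self-contained sketch of the same computation (hypothetical helper, not the v9fs function):

    #include <stddef.h>
    #include <string.h>

    /* Copy up to buflen bytes of target, including its NUL when it
     * fits, and return the byte count: min(strlen(target)+1, buflen). */
    static size_t copy_link_target(char *buf, size_t buflen, const char *target)
    {
        size_t n = strlen(target) + 1;

        if (n > buflen)
            n = buflen;             /* truncated, like readlink(2) semantics */
        memcpy(buf, target, n);
        return n;
    }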
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index e7396cfdb109..91b11650722e 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -392,10 +392,12 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
 		ino->flags |= AUTOFS_INF_PENDING;
 		spin_unlock(&sbi->fs_lock);
 		status = autofs4_mount_wait(dentry);
-		if (status)
-			return ERR_PTR(status);
 		spin_lock(&sbi->fs_lock);
 		ino->flags &= ~AUTOFS_INF_PENDING;
+		if (status) {
+			spin_unlock(&sbi->fs_lock);
+			return ERR_PTR(status);
+		}
 	}
 done:
 	if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
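The autofs4 change is purely about ordering: AUTOFS_INF_PENDING must be cleared under sbi->fs_lock even when the mount wait failed, so the early error return moves inside the locked region. A pthread sketch of the corrected shape (stand-in names, not the autofs code):

    #include <pthread.h>

    static pthread_mutex_t fs_lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned int ino_flags;
    #define INF_PENDING 0x1

    static int finish_mount_wait(int status)
    {
        pthread_mutex_lock(&fs_lock);
        ino_flags &= ~INF_PENDING;      /* now cleared on both paths */
        if (status) {
            pthread_mutex_unlock(&fs_lock);
            return status;              /* kernel returns ERR_PTR(status) */
        }
        pthread_mutex_unlock(&fs_lock);
        return 0;
    }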
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index e800dec958c3..fbd9f60bd763 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -36,7 +36,6 @@
 #include <asm/uaccess.h>
 #include <asm/param.h>
 #include <asm/page.h>
-#include <asm/exec.h>
 
 #ifndef user_long_t
 #define user_long_t long
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 262db114ff01..a46049154107 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -39,7 +39,6 @@
 #include <asm/uaccess.h>
 #include <asm/param.h>
 #include <asm/pgalloc.h>
-#include <asm/exec.h>
 
 typedef char *elf_caddr_t;
 
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index e85c04b9f61c..a3f28f331b2b 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -70,23 +70,25 @@ static inline int use_bip_pool(unsigned int idx)
 }
 
 /**
- * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio
+ * bio_integrity_alloc - Allocate integrity payload and attach it to bio
 * @bio: bio to attach integrity metadata to
 * @gfp_mask: Memory allocation mask
 * @nr_vecs: Number of integrity metadata scatter-gather elements
- * @bs: bio_set to allocate from
 *
 * Description: This function prepares a bio for attaching integrity
 * metadata. nr_vecs specifies the maximum number of pages containing
 * integrity metadata that can be attached.
 */
-struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
-							 gfp_t gfp_mask,
-							 unsigned int nr_vecs,
-							 struct bio_set *bs)
+struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
+						  gfp_t gfp_mask,
+						  unsigned int nr_vecs)
 {
 	struct bio_integrity_payload *bip;
 	unsigned int idx = vecs_to_idx(nr_vecs);
+	struct bio_set *bs = bio->bi_pool;
+
+	if (!bs)
+		bs = fs_bio_set;
 
 	BUG_ON(bio == NULL);
 	bip = NULL;
@@ -114,37 +116,22 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
 
 	return bip;
 }
-EXPORT_SYMBOL(bio_integrity_alloc_bioset);
-
-/**
- * bio_integrity_alloc - Allocate integrity payload and attach it to bio
- * @bio: bio to attach integrity metadata to
- * @gfp_mask: Memory allocation mask
- * @nr_vecs: Number of integrity metadata scatter-gather elements
- *
- * Description: This function prepares a bio for attaching integrity
- * metadata. nr_vecs specifies the maximum number of pages containing
- * integrity metadata that can be attached.
- */
-struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
-						  gfp_t gfp_mask,
-						  unsigned int nr_vecs)
-{
-	return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set);
-}
 EXPORT_SYMBOL(bio_integrity_alloc);
 
 /**
 * bio_integrity_free - Free bio integrity payload
 * @bio: bio containing bip to be freed
- * @bs: bio_set this bio was allocated from
 *
 * Description: Used to free the integrity portion of a bio. Usually
 * called from bio_free().
 */
-void bio_integrity_free(struct bio *bio, struct bio_set *bs)
+void bio_integrity_free(struct bio *bio)
 {
 	struct bio_integrity_payload *bip = bio->bi_integrity;
+	struct bio_set *bs = bio->bi_pool;
+
+	if (!bs)
+		bs = fs_bio_set;
 
 	BUG_ON(bip == NULL);
 
@@ -730,19 +717,18 @@ EXPORT_SYMBOL(bio_integrity_split);
 * @bio: New bio
 * @bio_src: Original bio
 * @gfp_mask: Memory allocation mask
- * @bs: bio_set to allocate bip from
 *
 * Description: Called to allocate a bip when cloning a bio
 */
 int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
-			gfp_t gfp_mask, struct bio_set *bs)
+			gfp_t gfp_mask)
 {
 	struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
 	struct bio_integrity_payload *bip;
 
 	BUG_ON(bip_src == NULL);
 
-	bip = bio_integrity_alloc_bioset(bio, gfp_mask, bip_src->bip_vcnt, bs);
+	bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt);
 
 	if (bip == NULL)
 		return -EIO;
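All three integrity helpers now derive the bio_set from the bio itself instead of taking it as a parameter; a NULL bi_pool means "fall back to the global fs_bio_set". Sketch of that lookup rule with stand-in declarations (not the real block-layer headers):

    struct bio_set;
    static struct bio_set *fs_bio_set;  /* global fallback pool (stand-in) */

    struct bio {
        struct bio_set *bi_pool;        /* recorded at allocation time */
        /* ... */
    };

    /* One place decides which pool owns the bio's integrity payload. */
    static struct bio_set *bio_owning_pool(struct bio *bio)
    {
        return bio->bi_pool ? bio->bi_pool : fs_bio_set;
    }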
diff --git a/fs/bio.c b/fs/bio.c
index 71072ab99128..9298c65ad9c7 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -55,6 +55,7 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
 * IO code that does not need private memory pools.
 */
 struct bio_set *fs_bio_set;
+EXPORT_SYMBOL(fs_bio_set);
 
 /*
 * Our slab pool management
@@ -233,26 +234,37 @@ fallback:
 	return bvl;
 }
 
-void bio_free(struct bio *bio, struct bio_set *bs)
+static void __bio_free(struct bio *bio)
 {
+	bio_disassociate_task(bio);
+
+	if (bio_integrity(bio))
+		bio_integrity_free(bio);
+}
+
+static void bio_free(struct bio *bio)
+{
+	struct bio_set *bs = bio->bi_pool;
 	void *p;
 
-	if (bio_has_allocated_vec(bio))
-		bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
+	__bio_free(bio);
 
-	if (bio_integrity(bio))
-		bio_integrity_free(bio, bs);
+	if (bs) {
+		if (bio_has_allocated_vec(bio))
+			bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
 
-	/*
-	 * If we have front padding, adjust the bio pointer before freeing
-	 */
-	p = bio;
-	if (bs->front_pad)
+		/*
+		 * If we have front padding, adjust the bio pointer before freeing
+		 */
+		p = bio;
 		p -= bs->front_pad;
 
-	mempool_free(p, bs->bio_pool);
+		mempool_free(p, bs->bio_pool);
+	} else {
+		/* Bio was allocated by bio_kmalloc() */
+		kfree(bio);
+	}
 }
-EXPORT_SYMBOL(bio_free);
 
 void bio_init(struct bio *bio)
 {
@@ -263,48 +275,85 @@ void bio_init(struct bio *bio)
 EXPORT_SYMBOL(bio_init);
 
 /**
+ * bio_reset - reinitialize a bio
+ * @bio: bio to reset
+ *
+ * Description:
+ *   After calling bio_reset(), @bio will be in the same state as a freshly
+ *   allocated bio returned bio bio_alloc_bioset() - the only fields that are
+ *   preserved are the ones that are initialized by bio_alloc_bioset(). See
+ *   comment in struct bio.
+ */
+void bio_reset(struct bio *bio)
+{
+	unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
+
+	__bio_free(bio);
+
+	memset(bio, 0, BIO_RESET_BYTES);
+	bio->bi_flags = flags|(1 << BIO_UPTODATE);
+}
+EXPORT_SYMBOL(bio_reset);
+
+/**
 * bio_alloc_bioset - allocate a bio for I/O
 * @gfp_mask: the GFP_ mask given to the slab allocator
 * @nr_iovecs: number of iovecs to pre-allocate
 * @bs: the bio_set to allocate from.
 *
 * Description:
- *   bio_alloc_bioset will try its own mempool to satisfy the allocation.
- *   If %__GFP_WAIT is set then we will block on the internal pool waiting
- *   for a &struct bio to become free.
+ *   If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is
+ *   backed by the @bs's mempool.
 *
- * Note that the caller must set ->bi_destructor on successful return
- * of a bio, to do the appropriate freeing of the bio once the reference
- * count drops to zero.
- **/
+ *   When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be
+ *   able to allocate a bio. This is due to the mempool guarantees. To make this
+ *   work, callers must never allocate more than 1 bio at a time from this pool.
+ *   Callers that need to allocate more than 1 bio must always submit the
+ *   previously allocated bio for IO before attempting to allocate a new one.
+ *   Failure to do so can cause deadlocks under memory pressure.
+ *
+ * RETURNS:
+ *   Pointer to new bio on success, NULL on failure.
+ */
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
+	unsigned front_pad;
+	unsigned inline_vecs;
 	unsigned long idx = BIO_POOL_NONE;
 	struct bio_vec *bvl = NULL;
 	struct bio *bio;
 	void *p;
 
-	p = mempool_alloc(bs->bio_pool, gfp_mask);
+	if (!bs) {
+		if (nr_iovecs > UIO_MAXIOV)
+			return NULL;
+
+		p = kmalloc(sizeof(struct bio) +
+			    nr_iovecs * sizeof(struct bio_vec),
+			    gfp_mask);
+		front_pad = 0;
+		inline_vecs = nr_iovecs;
+	} else {
+		p = mempool_alloc(bs->bio_pool, gfp_mask);
+		front_pad = bs->front_pad;
+		inline_vecs = BIO_INLINE_VECS;
+	}
+
 	if (unlikely(!p))
 		return NULL;
-	bio = p + bs->front_pad;
 
+	bio = p + front_pad;
 	bio_init(bio);
 
-	if (unlikely(!nr_iovecs))
-		goto out_set;
-
-	if (nr_iovecs <= BIO_INLINE_VECS) {
-		bvl = bio->bi_inline_vecs;
-		nr_iovecs = BIO_INLINE_VECS;
-	} else {
+	if (nr_iovecs > inline_vecs) {
 		bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
 		if (unlikely(!bvl))
 			goto err_free;
-
-		nr_iovecs = bvec_nr_vecs(idx);
+	} else if (nr_iovecs) {
+		bvl = bio->bi_inline_vecs;
 	}
-out_set:
+
+	bio->bi_pool = bs;
 	bio->bi_flags |= idx << BIO_POOL_OFFSET;
 	bio->bi_max_vecs = nr_iovecs;
 	bio->bi_io_vec = bvl;
@@ -316,80 +365,6 @@ err_free:
 }
 EXPORT_SYMBOL(bio_alloc_bioset);
 
-static void bio_fs_destructor(struct bio *bio)
-{
-	bio_free(bio, fs_bio_set);
-}
-
-/**
- * bio_alloc - allocate a new bio, memory pool backed
- * @gfp_mask: allocation mask to use
- * @nr_iovecs: number of iovecs
- *
- * bio_alloc will allocate a bio and associated bio_vec array that can hold
- * at least @nr_iovecs entries. Allocations will be done from the
- * fs_bio_set. Also see @bio_alloc_bioset and @bio_kmalloc.
- *
- * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate
- * a bio. This is due to the mempool guarantees. To make this work, callers
- * must never allocate more than 1 bio at a time from this pool. Callers
- * that need to allocate more than 1 bio must always submit the previously
- * allocated bio for IO before attempting to allocate a new one. Failure to
- * do so can cause livelocks under memory pressure.
- *
- * RETURNS:
- * Pointer to new bio on success, NULL on failure.
- */
-struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
-{
-	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
-
-	if (bio)
-		bio->bi_destructor = bio_fs_destructor;
-
-	return bio;
-}
-EXPORT_SYMBOL(bio_alloc);
-
-static void bio_kmalloc_destructor(struct bio *bio)
-{
-	if (bio_integrity(bio))
-		bio_integrity_free(bio, fs_bio_set);
-	kfree(bio);
-}
-
-/**
- * bio_kmalloc - allocate a bio for I/O using kmalloc()
- * @gfp_mask: the GFP_ mask given to the slab allocator
- * @nr_iovecs: number of iovecs to pre-allocate
- *
- * Description:
- *   Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask contains
- *   %__GFP_WAIT, the allocation is guaranteed to succeed.
- *
- **/
-struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
-{
-	struct bio *bio;
-
-	if (nr_iovecs > UIO_MAXIOV)
-		return NULL;
-
-	bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec),
-		      gfp_mask);
-	if (unlikely(!bio))
-		return NULL;
-
-	bio_init(bio);
-	bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET;
-	bio->bi_max_vecs = nr_iovecs;
-	bio->bi_io_vec = bio->bi_inline_vecs;
-	bio->bi_destructor = bio_kmalloc_destructor;
-
-	return bio;
-}
-EXPORT_SYMBOL(bio_kmalloc);
-
 void zero_fill_bio(struct bio *bio)
 {
 	unsigned long flags;
@@ -420,11 +395,8 @@ void bio_put(struct bio *bio)
 	/*
 	 * last put frees it
 	 */
-	if (atomic_dec_and_test(&bio->bi_cnt)) {
-		bio_disassociate_task(bio);
-		bio->bi_next = NULL;
-		bio->bi_destructor(bio);
-	}
+	if (atomic_dec_and_test(&bio->bi_cnt))
+		bio_free(bio);
 }
 EXPORT_SYMBOL(bio_put);
 
@@ -466,26 +438,28 @@ void __bio_clone(struct bio *bio, struct bio *bio_src)
 EXPORT_SYMBOL(__bio_clone);
 
 /**
- * bio_clone - clone a bio
+ * bio_clone_bioset - clone a bio
 * @bio: bio to clone
 * @gfp_mask: allocation priority
+ * @bs: bio_set to allocate from
 *
 * Like __bio_clone, only also allocates the returned bio
 */
-struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
+struct bio *bio_clone_bioset(struct bio *bio, gfp_t gfp_mask,
+			     struct bio_set *bs)
 {
-	struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);
+	struct bio *b;
 
+	b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, bs);
 	if (!b)
 		return NULL;
 
-	b->bi_destructor = bio_fs_destructor;
 	__bio_clone(b, bio);
 
 	if (bio_integrity(bio)) {
 		int ret;
 
-		ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set);
+		ret = bio_integrity_clone(b, bio, gfp_mask);
 
 		if (ret < 0) {
 			bio_put(b);
@@ -495,7 +469,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
 
 	return b;
 }
-EXPORT_SYMBOL(bio_clone);
+EXPORT_SYMBOL(bio_clone_bioset);
 
 /**
 * bio_get_nr_vecs - return approx number of vecs
@@ -1501,7 +1475,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 	trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
			  bi->bi_sector + first_sectors);
 
-	BUG_ON(bi->bi_vcnt != 1);
+	BUG_ON(bi->bi_vcnt != 1 && bi->bi_vcnt != 0);
 	BUG_ON(bi->bi_idx != 0);
 	atomic_set(&bp->cnt, 3);
 	bp->error = 0;
@@ -1511,17 +1485,22 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 	bp->bio2.bi_size -= first_sectors << 9;
 	bp->bio1.bi_size = first_sectors << 9;
 
-	bp->bv1 = bi->bi_io_vec[0];
-	bp->bv2 = bi->bi_io_vec[0];
-	bp->bv2.bv_offset += first_sectors << 9;
-	bp->bv2.bv_len -= first_sectors << 9;
-	bp->bv1.bv_len = first_sectors << 9;
+	if (bi->bi_vcnt != 0) {
+		bp->bv1 = bi->bi_io_vec[0];
+		bp->bv2 = bi->bi_io_vec[0];
+
+		if (bio_is_rw(bi)) {
+			bp->bv2.bv_offset += first_sectors << 9;
+			bp->bv2.bv_len -= first_sectors << 9;
+			bp->bv1.bv_len = first_sectors << 9;
+		}
 
-	bp->bio1.bi_io_vec = &bp->bv1;
-	bp->bio2.bi_io_vec = &bp->bv2;
+		bp->bio1.bi_io_vec = &bp->bv1;
+		bp->bio2.bi_io_vec = &bp->bv2;
 
-	bp->bio1.bi_max_vecs = 1;
-	bp->bio2.bi_max_vecs = 1;
+		bp->bio1.bi_max_vecs = 1;
+		bp->bio2.bi_max_vecs = 1;
+	}
 
 	bp->bio1.bi_end_io = bio_pair_end_1;
 	bp->bio2.bi_end_io = bio_pair_end_2;
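fs/bio.c folds bio_alloc(), bio_kmalloc() and the per-bio bi_destructor callbacks into one allocator: the object records where it came from (bi_pool) and bio_free() dispatches on that field instead of on a destructor pointer. A userspace analog of the idiom, with a trivial malloc-backed pool standing in for the mempool (all names here are illustrative):

    #include <stdlib.h>
    #include <string.h>

    struct pool { size_t obj_size; };   /* stand-in for struct bio_set */

    static void *pool_alloc(struct pool *p) { return malloc(p->obj_size); }
    static void pool_free(struct pool *p, void *o) { (void)p; free(o); }

    struct obj {
        struct pool *home;  /* like bio->bi_pool; NULL == plain malloc */
        char payload[64];
    };

    static struct obj *obj_alloc(struct pool *p)
    {
        struct obj *o = p ? pool_alloc(p) : malloc(sizeof(*o));

        if (!o)
            return NULL;
        memset(o, 0, sizeof(*o));
        o->home = p;                    /* the free side dispatches on this */
        return o;
    }

    static void obj_free(struct obj *o)
    {
        if (o->home)
            pool_free(o->home, o);      /* back to the owning pool */
        else
            free(o);                    /* was a plain allocation */
    }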
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 38e721b35d45..b3c1d3dae77d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -116,6 +116,8 @@ EXPORT_SYMBOL(invalidate_bdev);
 
 int set_blocksize(struct block_device *bdev, int size)
 {
+	struct address_space *mapping;
+
 	/* Size must be a power of two, and between 512 and PAGE_SIZE */
 	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
 		return -EINVAL;
@@ -124,6 +126,19 @@ int set_blocksize(struct block_device *bdev, int size)
 	if (size < bdev_logical_block_size(bdev))
 		return -EINVAL;
 
+	/* Prevent starting I/O or mapping the device */
+	percpu_down_write(&bdev->bd_block_size_semaphore);
+
+	/* Check that the block device is not memory mapped */
+	mapping = bdev->bd_inode->i_mapping;
+	mutex_lock(&mapping->i_mmap_mutex);
+	if (mapping_mapped(mapping)) {
+		mutex_unlock(&mapping->i_mmap_mutex);
+		percpu_up_write(&bdev->bd_block_size_semaphore);
+		return -EBUSY;
+	}
+	mutex_unlock(&mapping->i_mmap_mutex);
+
 	/* Don't change the size if it is same as current */
 	if (bdev->bd_block_size != size) {
 		sync_blockdev(bdev);
@@ -131,6 +146,9 @@ int set_blocksize(struct block_device *bdev, int size)
 		bdev->bd_inode->i_blkbits = blksize_bits(size);
 		kill_bdev(bdev);
 	}
+
+	percpu_up_write(&bdev->bd_block_size_semaphore);
+
 	return 0;
 }
 
@@ -441,6 +459,12 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
 	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
+
+	if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) {
+		kmem_cache_free(bdev_cachep, ei);
+		return NULL;
+	}
+
 	return &ei->vfs_inode;
 }
 
@@ -449,6 +473,8 @@ static void bdev_i_callback(struct rcu_head *head)
 	struct inode *inode = container_of(head, struct inode, i_rcu);
 	struct bdev_inode *bdi = BDEV_I(inode);
 
+	percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore);
+
 	kmem_cache_free(bdev_cachep, bdi);
 }
 
@@ -1567,6 +1593,22 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	return blkdev_ioctl(bdev, mode, cmd, arg);
 }
 
+ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
+			unsigned long nr_segs, loff_t pos)
+{
+	ssize_t ret;
+	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
+
+	percpu_down_read(&bdev->bd_block_size_semaphore);
+
+	ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
+
+	percpu_up_read(&bdev->bd_block_size_semaphore);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(blkdev_aio_read);
+
 /*
 * Write data to the block device. Only intended for the block device itself
 * and the raw driver which basically is a fake block device.
@@ -1578,12 +1620,16 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
			 unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
+	struct block_device *bdev = I_BDEV(file->f_mapping->host);
 	struct blk_plug plug;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
 
 	blk_start_plug(&plug);
+
+	percpu_down_read(&bdev->bd_block_size_semaphore);
+
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	if (ret > 0 || ret == -EIOCBQUEUED) {
 		ssize_t err;
@@ -1592,11 +1638,29 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
+
+	percpu_up_read(&bdev->bd_block_size_semaphore);
+
 	blk_finish_plug(&plug);
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blkdev_aio_write);
 
+static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	int ret;
+	struct block_device *bdev = I_BDEV(file->f_mapping->host);
+
+	percpu_down_read(&bdev->bd_block_size_semaphore);
+
+	ret = generic_file_mmap(file, vma);
+
+	percpu_up_read(&bdev->bd_block_size_semaphore);
+
+	return ret;
+}
+
 /*
 * Try to release a page associated with block device when the system
 * is under memory pressure.
@@ -1627,9 +1691,9 @@ const struct file_operations def_blk_fops = {
 	.llseek		= block_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
-	.aio_read	= generic_file_aio_read,
+	.aio_read	= blkdev_aio_read,
 	.aio_write	= blkdev_aio_write,
-	.mmap		= generic_file_mmap,
+	.mmap		= blkdev_mmap,
 	.fsync		= blkdev_fsync,
 	.unlocked_ioctl	= block_ioctl,
 #ifdef CONFIG_COMPAT
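Every I/O entry point in block_dev.c now brackets its work with a shared hold of bd_block_size_semaphore, while set_blocksize() takes it exclusively, so the block size cannot change under an in-flight read, write, or mmap. The kernel uses a per-CPU rwsem to keep the hot read side cheap; a plain pthread rwlock sketches the same protocol (stand-in code, not the kernel API):

    #include <pthread.h>

    static pthread_rwlock_t block_size_lock = PTHREAD_RWLOCK_INITIALIZER;
    static int block_size = 512;

    static int read_path(void)              /* cf. blkdev_aio_read() */
    {
        int ret;

        pthread_rwlock_rdlock(&block_size_lock);    /* percpu_down_read() */
        ret = block_size;   /* the real code performs the file I/O here */
        pthread_rwlock_unlock(&block_size_lock);
        return ret;
    }

    static void change_block_size(int size) /* cf. set_blocksize() */
    {
        pthread_rwlock_wrlock(&block_size_lock);    /* percpu_down_write() */
        block_size = size;  /* exclusive: no reader is in flight */
        pthread_rwlock_unlock(&block_size_lock);
    }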
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ff6475f409d6..f3187938e081 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -16,6 +16,7 @@
 * Boston, MA 021110-1307, USA.
 */
 
+#include <linux/vmalloc.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "backref.h"
@@ -231,7 +232,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 		}
 		if (!ret) {
 			ret = ulist_add(parents, eb->start,
-					(unsigned long)eie, GFP_NOFS);
+					(uintptr_t)eie, GFP_NOFS);
 			if (ret < 0)
 				break;
 			if (!extent_item_pos) {
@@ -363,8 +364,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		ULIST_ITER_INIT(&uiter);
 		node = ulist_next(parents, &uiter);
 		ref->parent = node ? node->val : 0;
-		ref->inode_list =
-			node ? (struct extent_inode_elem *)node->aux : 0;
+		ref->inode_list = node ?
+			(struct extent_inode_elem *)(uintptr_t)node->aux : 0;
 
 		/* additional parents require new refs being added here */
 		while ((node = ulist_next(parents, &uiter))) {
@@ -375,8 +376,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 			}
 			memcpy(new_ref, ref, sizeof(*ref));
 			new_ref->parent = node->val;
-			new_ref->inode_list =
-				(struct extent_inode_elem *)node->aux;
+			new_ref->inode_list = (struct extent_inode_elem *)
+							(uintptr_t)node->aux;
 			list_add(&new_ref->list, &ref->list);
 		}
 		ulist_reinit(parents);
@@ -914,8 +915,8 @@ again:
 			free_extent_buffer(eb);
 		}
 		ret = ulist_add_merge(refs, ref->parent,
-				      (unsigned long)ref->inode_list,
-				      (unsigned long *)&eie, GFP_NOFS);
+				      (uintptr_t)ref->inode_list,
+				      (u64 *)&eie, GFP_NOFS);
 		if (!ret && extent_item_pos) {
 			/*
 			 * we've recorded that parent, so we must extend
@@ -959,7 +960,7 @@ static void free_leaf_list(struct ulist *blocks)
 	while ((node = ulist_next(blocks, &uiter))) {
 		if (!node->aux)
 			continue;
-		eie = (struct extent_inode_elem *)node->aux;
+		eie = (struct extent_inode_elem *)(uintptr_t)node->aux;
 		for (; eie; eie = eie_next) {
 			eie_next = eie->next;
 			kfree(eie);
@@ -1108,26 +1109,80 @@ static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
				found_key);
 }
 
-/*
- * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
- * of the path are separated by '/' and the path is guaranteed to be
- * 0-terminated. the path is only given within the current file system.
- * Therefore, it never starts with a '/'. the caller is responsible to provide
- * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
- * the start point of the resulting string is returned. this pointer is within
- * dest, normally.
- * in case the path buffer would overflow, the pointer is decremented further
- * as if output was written to the buffer, though no more output is actually
- * generated. that way, the caller can determine how much space would be
- * required for the path to fit into the buffer. in that case, the returned
- * value will be smaller than dest. callers must check this!
- */
-char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
-			 struct btrfs_inode_ref *iref,
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+			  u64 start_off, struct btrfs_path *path,
+			  struct btrfs_inode_extref **ret_extref,
+			  u64 *found_off)
+{
+	int ret, slot;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_inode_extref *extref;
+	struct extent_buffer *leaf;
+	unsigned long ptr;
+
+	key.objectid = inode_objectid;
+	btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+	key.offset = start_off;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		return ret;
+
+	while (1) {
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(leaf)) {
+			/*
+			 * If the item at offset is not found,
+			 * btrfs_search_slot will point us to the slot
+			 * where it should be inserted. In our case
+			 * that will be the slot directly before the
+			 * next INODE_REF_KEY_V2 item. In the case
+			 * that we're pointing to the last slot in a
+			 * leaf, we must move one leaf over.
+			 */
+			ret = btrfs_next_leaf(root, path);
+			if (ret) {
+				if (ret >= 1)
+					ret = -ENOENT;
+				break;
+			}
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+		/*
+		 * Check that we're still looking at an extended ref key for
+		 * this particular objectid. If we have different
+		 * objectid or type then there are no more to be found
+		 * in the tree and we can exit.
+		 */
+		ret = -ENOENT;
+		if (found_key.objectid != inode_objectid)
+			break;
+		if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY)
+			break;
+
+		ret = 0;
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+		extref = (struct btrfs_inode_extref *)ptr;
+		*ret_extref = extref;
+		if (found_off)
+			*found_off = found_key.offset;
+		break;
+	}
+
+	return ret;
+}
+
+static char *ref_to_path(struct btrfs_root *fs_root,
+			 struct btrfs_path *path,
+			 u32 name_len, unsigned long name_off,
			 struct extent_buffer *eb_in, u64 parent,
			 char *dest, u32 size)
 {
-	u32 len;
 	int slot;
 	u64 next_inum;
 	int ret;
@@ -1135,17 +1190,17 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 	struct extent_buffer *eb = eb_in;
 	struct btrfs_key found_key;
 	int leave_spinning = path->leave_spinning;
+	struct btrfs_inode_ref *iref;
 
 	if (bytes_left >= 0)
 		dest[bytes_left] = '\0';
 
 	path->leave_spinning = 1;
 	while (1) {
-		len = btrfs_inode_ref_name_len(eb, iref);
-		bytes_left -= len;
+		bytes_left -= name_len;
 		if (bytes_left >= 0)
 			read_extent_buffer(eb, dest + bytes_left,
-					   (unsigned long)(iref + 1), len);
+					   name_off, name_len);
 		if (eb != eb_in) {
 			btrfs_tree_read_unlock_blocking(eb);
 			free_extent_buffer(eb);
@@ -1155,6 +1210,7 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 			ret = -ENOENT;
 		if (ret)
 			break;
+
 		next_inum = found_key.offset;
 
 		/* regular exit ahead */
@@ -1170,8 +1226,11 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 		}
 		btrfs_release_path(path);
-
 		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
+
+		name_len = btrfs_inode_ref_name_len(eb, iref);
+		name_off = (unsigned long)(iref + 1);
+
 		parent = next_inum;
 		--bytes_left;
 		if (bytes_left >= 0)
@@ -1188,12 +1247,39 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 }
 
 /*
+ * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
+ * of the path are separated by '/' and the path is guaranteed to be
+ * 0-terminated. the path is only given within the current file system.
+ * Therefore, it never starts with a '/'. the caller is responsible to provide
+ * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
+ * the start point of the resulting string is returned. this pointer is within
+ * dest, normally.
+ * in case the path buffer would overflow, the pointer is decremented further
+ * as if output was written to the buffer, though no more output is actually
+ * generated. that way, the caller can determine how much space would be
+ * required for the path to fit into the buffer. in that case, the returned
+ * value will be smaller than dest. callers must check this!
+ */
+char *btrfs_iref_to_path(struct btrfs_root *fs_root,
+			 struct btrfs_path *path,
+			 struct btrfs_inode_ref *iref,
+			 struct extent_buffer *eb_in, u64 parent,
+			 char *dest, u32 size)
+{
+	return ref_to_path(fs_root, path,
+			   btrfs_inode_ref_name_len(eb_in, iref),
+			   (unsigned long)(iref + 1),
+			   eb_in, parent, dest, size);
+}
+
+/*
 * this makes the path point to (logical EXTENT_ITEM *)
 * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
 * tree blocks and <0 on error.
 */
 int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
-			struct btrfs_path *path, struct btrfs_key *found_key)
+			struct btrfs_path *path, struct btrfs_key *found_key,
+			u64 *flags_ret)
 {
 	int ret;
 	u64 flags;
@@ -1237,10 +1323,17 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
		 (unsigned long long)found_key->objectid,
		 (unsigned long long)found_key->offset,
		 (unsigned long long)flags, item_size);
-	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
-		return BTRFS_EXTENT_FLAG_TREE_BLOCK;
-	if (flags & BTRFS_EXTENT_FLAG_DATA)
-		return BTRFS_EXTENT_FLAG_DATA;
+
+	WARN_ON(!flags_ret);
+	if (flags_ret) {
+		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+			*flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK;
+		else if (flags & BTRFS_EXTENT_FLAG_DATA)
+			*flags_ret = BTRFS_EXTENT_FLAG_DATA;
+		else
+			BUG_ON(1);
+		return 0;
+	}
 
 	return -EIO;
 }
@@ -1404,12 +1497,13 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 		ULIST_ITER_INIT(&root_uiter);
 		while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
 			pr_debug("root %llu references leaf %llu, data list "
-				 "%#lx\n", root_node->val, ref_node->val,
-				 ref_node->aux);
-			ret = iterate_leaf_refs(
-				(struct extent_inode_elem *)ref_node->aux,
-				root_node->val, extent_item_objectid,
-				iterate, ctx);
+				 "%#llx\n", root_node->val, ref_node->val,
+				 (long long)ref_node->aux);
+			ret = iterate_leaf_refs((struct extent_inode_elem *)
+						(uintptr_t)ref_node->aux,
+						root_node->val,
+						extent_item_objectid,
+						iterate, ctx);
 		}
 		ulist_free(roots);
 		roots = NULL;
@@ -1432,15 +1526,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 {
 	int ret;
 	u64 extent_item_pos;
+	u64 flags = 0;
 	struct btrfs_key found_key;
 	int search_commit_root = path->search_commit_root;
 
-	ret = extent_from_logical(fs_info, logical, path,
-				  &found_key);
+	ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
 	btrfs_release_path(path);
 	if (ret < 0)
 		return ret;
-	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
 		return -EINVAL;
 
 	extent_item_pos = logical - found_key.objectid;
@@ -1451,9 +1545,12 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 	return ret;
 }
 
-static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
-			 struct btrfs_path *path,
-			 iterate_irefs_t *iterate, void *ctx)
+typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off,
			      struct extent_buffer *eb, void *ctx);
+
+static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
+			      struct btrfs_path *path,
+			      iterate_irefs_t *iterate, void *ctx)
 {
 	int ret = 0;
 	int slot;
@@ -1470,7 +1567,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 	while (!ret) {
 		path->leave_spinning = 1;
 		ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
-					&found_key);
+				     &found_key);
 		if (ret < 0)
 			break;
 		if (ret) {
@@ -1498,7 +1595,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
				 "tree %llu\n", cur,
				 (unsigned long long)found_key.objectid,
				 (unsigned long long)fs_root->objectid);
-			ret = iterate(parent, iref, eb, ctx);
+			ret = iterate(parent, name_len,
+				      (unsigned long)(iref + 1), eb, ctx);
 			if (ret)
 				break;
 			len = sizeof(*iref) + name_len;
@@ -1513,12 +1611,98 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 	return ret;
 }
 
+static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
+				 struct btrfs_path *path,
+				 iterate_irefs_t *iterate, void *ctx)
+{
+	int ret;
+	int slot;
+	u64 offset = 0;
+	u64 parent;
+	int found = 0;
+	struct extent_buffer *eb;
+	struct btrfs_inode_extref *extref;
+	struct extent_buffer *leaf;
+	u32 item_size;
+	u32 cur_offset;
+	unsigned long ptr;
+
+	while (1) {
+		ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref,
+					    &offset);
+		if (ret < 0)
+			break;
+		if (ret) {
+			ret = found ? 0 : -ENOENT;
+			break;
+		}
+		++found;
+
+		slot = path->slots[0];
+		eb = path->nodes[0];
+		/* make sure we can use eb after releasing the path */
+		atomic_inc(&eb->refs);
+
+		btrfs_tree_read_lock(eb);
+		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+		btrfs_release_path(path);
+
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+		cur_offset = 0;
+
+		while (cur_offset < item_size) {
+			u32 name_len;
+
+			extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
+			parent = btrfs_inode_extref_parent(eb, extref);
+			name_len = btrfs_inode_extref_name_len(eb, extref);
+			ret = iterate(parent, name_len,
+				      (unsigned long)&extref->name, eb, ctx);
+			if (ret)
+				break;
+
+			cur_offset += btrfs_inode_extref_name_len(leaf, extref);
+			cur_offset += sizeof(*extref);
+		}
+		btrfs_tree_read_unlock_blocking(eb);
+		free_extent_buffer(eb);
+
+		offset++;
+	}
+
+	btrfs_release_path(path);
+
+	return ret;
+}
+
+static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
+			 struct btrfs_path *path, iterate_irefs_t *iterate,
+			 void *ctx)
+{
+	int ret;
+	int found_refs = 0;
+
+	ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx);
+	if (!ret)
+		++found_refs;
+	else if (ret != -ENOENT)
+		return ret;
+
+	ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx);
+	if (ret == -ENOENT && found_refs)
+		return 0;
+
+	return ret;
+}
+
 /*
 * returns 0 if the path could be dumped (probably truncated)
 * returns <0 in case of an error
 */
-static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
+static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
			 struct extent_buffer *eb, void *ctx)
 {
 	struct inode_fs_paths *ipath = ctx;
 	char *fspath;
@@ -1531,20 +1715,17 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
			ipath->fspath->bytes_left - s_ptr : 0;
 
 	fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
-	fspath = btrfs_iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
-				    inum, fspath_min, bytes_left);
+	fspath = ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len,
+			     name_off, eb, inum, fspath_min,
+			     bytes_left);
 	if (IS_ERR(fspath))
 		return PTR_ERR(fspath);
 
 	if (fspath > fspath_min) {
-		pr_debug("path resolved: %s\n", fspath);
 		ipath->fspath->val[i] = (u64)(unsigned long)fspath;
 		++ipath->fspath->elem_cnt;
 		ipath->fspath->bytes_left = fspath - fspath_min;
 	} else {
-		pr_debug("missed path, not enough space. missing bytes: %lu, "
-			 "constructed so far: %s\n",
-			 (unsigned long)(fspath_min - fspath), fspath_min);
 		++ipath->fspath->elem_missed;
 		ipath->fspath->bytes_missing += fspath_min - fspath;
 		ipath->fspath->bytes_left = 0;
@@ -1566,7 +1747,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
 int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
 {
 	return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
-				inode_to_path, ipath);
+			     inode_to_path, ipath);
 }
 
 struct btrfs_data_container *init_data_container(u32 total_bytes)
@@ -1575,7 +1756,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes)
 	size_t alloc_bytes;
 
 	alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
-	data = kmalloc(alloc_bytes, GFP_NOFS);
+	data = vmalloc(alloc_bytes);
 	if (!data)
 		return ERR_PTR(-ENOMEM);
 
@@ -1626,6 +1807,6 @@ void free_ipath(struct inode_fs_paths *ipath)
 {
 	if (!ipath)
 		return;
-	kfree(ipath->fspath);
+	vfree(ipath->fspath);
 	kfree(ipath);
 }
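The recurring (uintptr_t) casts in backref.c exist because the ulist aux field is a u64: storing a pointer in a u64 and pulling it back out must go through uintptr_t, or 32-bit builds warn about integer/pointer size mismatches. Minimal round-trip illustration (hypothetical helpers, not btrfs functions):

    #include <stdint.h>

    struct extent_inode_elem;   /* payload type carried in the ulist aux */

    static uint64_t pack_aux(struct extent_inode_elem *eie)
    {
        return (uintptr_t)eie;  /* pointer -> uintptr_t -> u64, no warning */
    }

    static struct extent_inode_elem *unpack_aux(uint64_t aux)
    {
        return (struct extent_inode_elem *)(uintptr_t)aux;
    }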
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 032f4dc7eab8..e75533043a5f 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -33,14 +33,13 @@ struct inode_fs_paths {
 
 typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
				      void *ctx);
-typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref,
-			      struct extent_buffer *eb, void *ctx);
 
 int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
		    struct btrfs_path *path);
 
 int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
-			struct btrfs_path *path, struct btrfs_key *found_key);
+			struct btrfs_path *path, struct btrfs_key *found_key,
+			u64 *flags);
 
 int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
			    struct btrfs_extent_item *ei, u32 item_size,
@@ -69,4 +68,9 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
			 struct btrfs_path *path);
 void free_ipath(struct inode_fs_paths *ipath);
 
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+			  u64 start_off, struct btrfs_path *path,
+			  struct btrfs_inode_extref **ret_extref,
+			  u64 *found_off);
+
 #endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 5b2ad6bc4fe7..ed8ca7ca5eff 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -38,6 +38,7 @@
38#define BTRFS_INODE_DELALLOC_META_RESERVED 4 38#define BTRFS_INODE_DELALLOC_META_RESERVED 4
39#define BTRFS_INODE_HAS_ORPHAN_ITEM 5 39#define BTRFS_INODE_HAS_ORPHAN_ITEM 5
40#define BTRFS_INODE_HAS_ASYNC_EXTENT 6 40#define BTRFS_INODE_HAS_ASYNC_EXTENT 6
41#define BTRFS_INODE_NEEDS_FULL_SYNC 7
41 42
42/* in memory btrfs inode */ 43/* in memory btrfs inode */
43struct btrfs_inode { 44struct btrfs_inode {
@@ -143,6 +144,9 @@ struct btrfs_inode {
143 /* flags field from the on disk inode */ 144 /* flags field from the on disk inode */
144 u32 flags; 145 u32 flags;
145 146
147 /* a local copy of root's last_log_commit */
148 unsigned long last_log_commit;
149
146 /* 150 /*
147 * Counters to keep track of the number of extent item's we may use due 151 * Counters to keep track of the number of extent item's we may use due
148 * to delalloc and such. outstanding_extents is the number of extent 152 * to delalloc and such. outstanding_extents is the number of extent
@@ -202,15 +206,10 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode)
202 206
203static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) 207static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
204{ 208{
205 struct btrfs_root *root = BTRFS_I(inode)->root;
206 int ret = 0;
207
208 mutex_lock(&root->log_mutex);
209 if (BTRFS_I(inode)->logged_trans == generation && 209 if (BTRFS_I(inode)->logged_trans == generation &&
210 BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) 210 BTRFS_I(inode)->last_sub_trans <= BTRFS_I(inode)->last_log_commit)
211 ret = 1; 211 return 1;
212 mutex_unlock(&root->log_mutex); 212 return 0;
213 return ret;
214} 213}
215 214
216#endif 215#endif
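
Caching the root's last_log_commit inside each inode is what lets btrfs_inode_in_log() shed root->log_mutex: the test now compares three fields that all belong to one inode. A compact model of the check, with the struct reduced to the fields involved; the intended property is that a stale read can only err toward logging once more than necessary, never toward skipping a needed log.

#include <linux/types.h>

struct inode_log_state {
	u64 logged_trans;	/* transaction the inode was last logged in */
	u64 last_sub_trans;	/* sub-transaction of its last modification */
	u64 last_log_commit;	/* root's last_log_commit, copied at log time */
};

static inline int inode_in_log(const struct inode_log_state *s, u64 generation)
{
	/* Lock-free: both comparisons read inode-local state only. */
	return s->logged_trans == generation &&
	       s->last_sub_trans <= s->last_log_commit;
}
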
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 9197e2e33407..5a3e45db642a 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -37,8 +37,9 @@
37 * the file system was mounted, (i.e., they have been 37 * the file system was mounted, (i.e., they have been
38 * referenced by the super block) or they have been 38 * referenced by the super block) or they have been
39 * written since then and the write completion callback 39 * written since then and the write completion callback
40 * was called and a FLUSH request to the device where 40 * was called and no write error was indicated and a
41 * these blocks are located was received and completed. 41 * FLUSH request to the device where these blocks are
42 * located was received and completed.
42 * 2b. All referenced blocks need to have a generation 43 * 2b. All referenced blocks need to have a generation
43 * number which is equal to the parent's number. 44 * number which is equal to the parent's number.
44 * 45 *
@@ -2601,6 +2602,17 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
2601 (unsigned long long)l->block_ref_to->dev_bytenr, 2602 (unsigned long long)l->block_ref_to->dev_bytenr,
2602 l->block_ref_to->mirror_num); 2603 l->block_ref_to->mirror_num);
2603 ret = -1; 2604 ret = -1;
2605 } else if (l->block_ref_to->iodone_w_error) {
2606 printk(KERN_INFO "btrfs: attempt to write superblock"
2607 " which references block %c @%llu (%s/%llu/%d)"
2608 " which has write error!\n",
2609 btrfsic_get_block_type(state, l->block_ref_to),
2610 (unsigned long long)
2611 l->block_ref_to->logical_bytenr,
2612 l->block_ref_to->dev_state->name,
2613 (unsigned long long)l->block_ref_to->dev_bytenr,
2614 l->block_ref_to->mirror_num);
2615 ret = -1;
2604 } else if (l->parent_generation != 2616 } else if (l->parent_generation !=
2605 l->block_ref_to->generation && 2617 l->block_ref_to->generation &&
2606 BTRFSIC_GENERATION_UNKNOWN != 2618 BTRFSIC_GENERATION_UNKNOWN !=
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 43d1c5a3a030..c6467aa88bee 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -577,6 +577,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
577 u64 em_start; 577 u64 em_start;
578 struct extent_map *em; 578 struct extent_map *em;
579 int ret = -ENOMEM; 579 int ret = -ENOMEM;
580 int faili = 0;
580 u32 *sums; 581 u32 *sums;
581 582
582 tree = &BTRFS_I(inode)->io_tree; 583 tree = &BTRFS_I(inode)->io_tree;
@@ -626,9 +627,13 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
626 for (pg_index = 0; pg_index < nr_pages; pg_index++) { 627 for (pg_index = 0; pg_index < nr_pages; pg_index++) {
627 cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS | 628 cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
628 __GFP_HIGHMEM); 629 __GFP_HIGHMEM);
629 if (!cb->compressed_pages[pg_index]) 630 if (!cb->compressed_pages[pg_index]) {
631 faili = pg_index - 1;
632 ret = -ENOMEM;
630 goto fail2; 633 goto fail2;
634 }
631 } 635 }
636 faili = nr_pages - 1;
632 cb->nr_pages = nr_pages; 637 cb->nr_pages = nr_pages;
633 638
634 add_ra_bio_pages(inode, em_start + em_len, cb); 639 add_ra_bio_pages(inode, em_start + em_len, cb);
@@ -713,8 +718,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
713 return 0; 718 return 0;
714 719
715fail2: 720fail2:
716 for (pg_index = 0; pg_index < nr_pages; pg_index++) 721 while (faili >= 0) {
717 free_page((unsigned long)cb->compressed_pages[pg_index]); 722 __free_page(cb->compressed_pages[faili]);
723 faili--;
724 }
718 725
719 kfree(cb->compressed_pages); 726 kfree(cb->compressed_pages);
720fail1: 727fail1:
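
Two problems go away in this hunk: the old unwind iterated over all nr_pages slots even when allocation failed partway (handing NULL to the free routine), and it called free_page(), which takes a kernel virtual address, on what are struct page pointers from alloc_page(), where __free_page() is the correct counterpart. The pattern it adopts, as a self-contained sketch:

#include <linux/gfp.h>		/* alloc_page, __free_page */

static int alloc_page_array(struct page **pages, int nr_pages)
{
	int i, faili;

	for (i = 0; i < nr_pages; i++) {
		pages[i] = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
		if (!pages[i]) {
			faili = i - 1;	/* last slot that really got a page */
			goto fail;
		}
	}
	return 0;
fail:
	while (faili >= 0)
		__free_page(pages[faili--]);	/* struct page *, not an address */
	return -ENOMEM;
}
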
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6d183f60d63a..b33436211000 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -4402,149 +4402,6 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
4402} 4402}
4403 4403
4404/* 4404/*
4405 * Given a key and some data, insert items into the tree.
4406 * This does all the path init required, making room in the tree if needed.
4407 * Returns the number of keys that were inserted.
4408 */
4409int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
4410 struct btrfs_root *root,
4411 struct btrfs_path *path,
4412 struct btrfs_key *cpu_key, u32 *data_size,
4413 int nr)
4414{
4415 struct extent_buffer *leaf;
4416 struct btrfs_item *item;
4417 int ret = 0;
4418 int slot;
4419 int i;
4420 u32 nritems;
4421 u32 total_data = 0;
4422 u32 total_size = 0;
4423 unsigned int data_end;
4424 struct btrfs_disk_key disk_key;
4425 struct btrfs_key found_key;
4426 struct btrfs_map_token token;
4427
4428 btrfs_init_map_token(&token);
4429
4430 for (i = 0; i < nr; i++) {
4431 if (total_size + data_size[i] + sizeof(struct btrfs_item) >
4432 BTRFS_LEAF_DATA_SIZE(root)) {
4433 break;
4434 nr = i;
4435 }
4436 total_data += data_size[i];
4437 total_size += data_size[i] + sizeof(struct btrfs_item);
4438 }
4439 BUG_ON(nr == 0);
4440
4441 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4442 if (ret == 0)
4443 return -EEXIST;
4444 if (ret < 0)
4445 goto out;
4446
4447 leaf = path->nodes[0];
4448
4449 nritems = btrfs_header_nritems(leaf);
4450 data_end = leaf_data_end(root, leaf);
4451
4452 if (btrfs_leaf_free_space(root, leaf) < total_size) {
4453 for (i = nr; i >= 0; i--) {
4454 total_data -= data_size[i];
4455 total_size -= data_size[i] + sizeof(struct btrfs_item);
4456 if (total_size < btrfs_leaf_free_space(root, leaf))
4457 break;
4458 }
4459 nr = i;
4460 }
4461
4462 slot = path->slots[0];
4463 BUG_ON(slot < 0);
4464
4465 if (slot != nritems) {
4466 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4467
4468 item = btrfs_item_nr(leaf, slot);
4469 btrfs_item_key_to_cpu(leaf, &found_key, slot);
4470
4471 /* figure out how many keys we can insert in here */
4472 total_data = data_size[0];
4473 for (i = 1; i < nr; i++) {
4474 if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
4475 break;
4476 total_data += data_size[i];
4477 }
4478 nr = i;
4479
4480 if (old_data < data_end) {
4481 btrfs_print_leaf(root, leaf);
4482 printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
4483 slot, old_data, data_end);
4484 BUG_ON(1);
4485 }
4486 /*
4487 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4488 */
4489 /* first correct the data pointers */
4490 for (i = slot; i < nritems; i++) {
4491 u32 ioff;
4492
4493 item = btrfs_item_nr(leaf, i);
4494 ioff = btrfs_token_item_offset(leaf, item, &token);
4495 btrfs_set_token_item_offset(leaf, item,
4496 ioff - total_data, &token);
4497 }
4498 /* shift the items */
4499 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4500 btrfs_item_nr_offset(slot),
4501 (nritems - slot) * sizeof(struct btrfs_item));
4502
4503 /* shift the data */
4504 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4505 data_end - total_data, btrfs_leaf_data(leaf) +
4506 data_end, old_data - data_end);
4507 data_end = old_data;
4508 } else {
4509 /*
4510 * this sucks but it has to be done, if we are inserting at
4511 * the end of the leaf only insert 1 of the items, since we
4512 * have no way of knowing whats on the next leaf and we'd have
4513 * to drop our current locks to figure it out
4514 */
4515 nr = 1;
4516 }
4517
4518 /* setup the item for the new data */
4519 for (i = 0; i < nr; i++) {
4520 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4521 btrfs_set_item_key(leaf, &disk_key, slot + i);
4522 item = btrfs_item_nr(leaf, slot + i);
4523 btrfs_set_token_item_offset(leaf, item,
4524 data_end - data_size[i], &token);
4525 data_end -= data_size[i];
4526 btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4527 }
4528 btrfs_set_header_nritems(leaf, nritems + nr);
4529 btrfs_mark_buffer_dirty(leaf);
4530
4531 ret = 0;
4532 if (slot == 0) {
4533 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4534 fixup_low_keys(trans, root, path, &disk_key, 1);
4535 }
4536
4537 if (btrfs_leaf_free_space(root, leaf) < 0) {
4538 btrfs_print_leaf(root, leaf);
4539 BUG();
4540 }
4541out:
4542 if (!ret)
4543 ret = nr;
4544 return ret;
4545}
4546
4547/*
4548 * this is a helper for btrfs_insert_empty_items, the main goal here is 4405 * this is a helper for btrfs_insert_empty_items, the main goal here is
4549 * to save stack depth by doing the bulk of the work in a function 4406 * to save stack depth by doing the bulk of the work in a function
4550 * that doesn't call btrfs_search_slot 4407 * that doesn't call btrfs_search_slot
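
btrfs_insert_some_items() is deleted wholesale; it apparently had no remaining callers, and its leaf-space sizing loop carried a latent bug worth noting: the nr = i clamp sits after the break, so it could never run. The order the author presumably intended:

for (i = 0; i < nr; i++) {
	if (total_size + data_size[i] + sizeof(struct btrfs_item) >
	    BTRFS_LEAF_DATA_SIZE(root)) {
		nr = i;		/* clamp the item count first ... */
		break;		/* ... then leave the loop */
	}
	total_data += data_size[i];
	total_size += data_size[i] + sizeof(struct btrfs_item);
}
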
@@ -5073,6 +4930,7 @@ static void tree_move_down(struct btrfs_root *root,
5073 struct btrfs_path *path, 4930 struct btrfs_path *path,
5074 int *level, int root_level) 4931 int *level, int root_level)
5075{ 4932{
4933 BUG_ON(*level == 0);
5076 path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level], 4934 path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
5077 path->slots[*level]); 4935 path->slots[*level]);
5078 path->slots[*level - 1] = 0; 4936 path->slots[*level - 1] = 0;
@@ -5089,7 +4947,7 @@ static int tree_move_next_or_upnext(struct btrfs_root *root,
5089 4947
5090 path->slots[*level]++; 4948 path->slots[*level]++;
5091 4949
5092 while (path->slots[*level] == nritems) { 4950 while (path->slots[*level] >= nritems) {
5093 if (*level == root_level) 4951 if (*level == root_level)
5094 return -1; 4952 return -1;
5095 4953
@@ -5433,9 +5291,11 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5433 goto out; 5291 goto out;
5434 advance_right = ADVANCE; 5292 advance_right = ADVANCE;
5435 } else { 5293 } else {
5294 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5436 ret = tree_compare_item(left_root, left_path, 5295 ret = tree_compare_item(left_root, left_path,
5437 right_path, tmp_buf); 5296 right_path, tmp_buf);
5438 if (ret) { 5297 if (ret) {
5298 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5439 ret = changed_cb(left_root, right_root, 5299 ret = changed_cb(left_root, right_root,
5440 left_path, right_path, 5300 left_path, right_path,
5441 &left_key, 5301 &left_key,
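
Two small hardening changes close out ctree.c: tree_move_down() now asserts it is never asked to descend below level 0, and tree_move_next_or_upnext() tests the slot with >= rather than ==, so the climb terminates even if a slot ever overshoots nritems instead of landing exactly on it. The loop shape, modelled with hypothetical accessors standing in for direct btrfs_path field access:

struct walk_path;				/* stand-in for struct btrfs_path */
int path_slot(struct walk_path *p, int level);	/* hypothetical accessors */
int node_nritems(struct walk_path *p, int level);
void path_set_slot(struct walk_path *p, int level, int slot);

static int move_next_or_upnext(struct walk_path *path, int *level,
			       int root_level)
{
	int slot = path_slot(path, *level) + 1;
	int nritems = node_nritems(path, *level);

	while (slot >= nritems) {	/* '>=', not '==': survives overshoot */
		if (*level == root_level)
			return -1;	/* walked off the root: iteration done */
		(*level)++;		/* go up one level and step right */
		slot = path_slot(path, *level) + 1;
		nritems = node_nritems(path, *level);
	}
	path_set_slot(path, *level, slot);
	return 0;
}
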
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9821b672f5a2..926c9ffc66d9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -154,6 +154,13 @@ struct btrfs_ordered_sum;
154 */ 154 */
155#define BTRFS_NAME_LEN 255 155#define BTRFS_NAME_LEN 255
156 156
157/*
158 * Theoretical limit is larger, but we keep this down to a sane
159 * value. That should limit greatly the possibility of collisions on
160 * inode ref items.
161 */
162#define BTRFS_LINK_MAX 65535U
163
157/* 32 bytes in various csum fields */ 164/* 32 bytes in various csum fields */
158#define BTRFS_CSUM_SIZE 32 165#define BTRFS_CSUM_SIZE 32
159 166
@@ -489,6 +496,8 @@ struct btrfs_super_block {
489 */ 496 */
490#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) 497#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)
491 498
499#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6)
500
492#define BTRFS_FEATURE_COMPAT_SUPP 0ULL 501#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
493#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL 502#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
494#define BTRFS_FEATURE_INCOMPAT_SUPP \ 503#define BTRFS_FEATURE_INCOMPAT_SUPP \
@@ -496,7 +505,8 @@ struct btrfs_super_block {
496 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ 505 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
497 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ 506 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
498 BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ 507 BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
499 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO) 508 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
509 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
500 510
501/* 511/*
502 * A leaf is full of items. offset and size tell us where to find 512 * A leaf is full of items. offset and size tell us where to find
@@ -643,6 +653,14 @@ struct btrfs_inode_ref {
643 /* name goes here */ 653 /* name goes here */
644} __attribute__ ((__packed__)); 654} __attribute__ ((__packed__));
645 655
656struct btrfs_inode_extref {
657 __le64 parent_objectid;
658 __le64 index;
659 __le16 name_len;
660 __u8 name[0];
661 /* name goes here */
662} __attribute__ ((__packed__));
663
646struct btrfs_timespec { 664struct btrfs_timespec {
647 __le64 sec; 665 __le64 sec;
648 __le32 nsec; 666 __le32 nsec;
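
btrfs_inode_extref is a packed, variable-length on-disk item: an 18-byte fixed header (two __le64 plus one __le16) followed immediately by name_len bytes of name. In the kernel proper these fields are read through the extent_buffer accessors generated further down in this file; the sketch below assumes the item has already been copied into plain memory.

#include <linux/string.h>

static size_t extref_item_size(const struct btrfs_inode_extref *e)
{
	return sizeof(*e) + le16_to_cpu(e->name_len);
}

/* dst must hold at least BTRFS_NAME_LEN + 1 bytes. */
static void extref_copy_name(const struct btrfs_inode_extref *e, char *dst)
{
	u16 len = le16_to_cpu(e->name_len);

	memcpy(dst, e->name, len);	/* name is inline, not a pointer */
	dst[len] = '\0';
}
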
@@ -1028,12 +1046,22 @@ struct btrfs_space_info {
1028 wait_queue_head_t wait; 1046 wait_queue_head_t wait;
1029}; 1047};
1030 1048
1049#define BTRFS_BLOCK_RSV_GLOBAL 1
1050#define BTRFS_BLOCK_RSV_DELALLOC 2
1051#define BTRFS_BLOCK_RSV_TRANS 3
1052#define BTRFS_BLOCK_RSV_CHUNK 4
1053#define BTRFS_BLOCK_RSV_DELOPS 5
1054#define BTRFS_BLOCK_RSV_EMPTY 6
1055#define BTRFS_BLOCK_RSV_TEMP 7
1056
1031struct btrfs_block_rsv { 1057struct btrfs_block_rsv {
1032 u64 size; 1058 u64 size;
1033 u64 reserved; 1059 u64 reserved;
1034 struct btrfs_space_info *space_info; 1060 struct btrfs_space_info *space_info;
1035 spinlock_t lock; 1061 spinlock_t lock;
1036 unsigned int full; 1062 unsigned short full;
1063 unsigned short type;
1064 unsigned short failfast;
1037}; 1065};
1038 1066
1039/* 1067/*
@@ -1127,6 +1155,9 @@ struct btrfs_block_group_cache {
1127 * Today it will only have one thing on it, but that may change 1155 * Today it will only have one thing on it, but that may change
1128 */ 1156 */
1129 struct list_head cluster_list; 1157 struct list_head cluster_list;
1158
1159 /* For delayed block group creation */
1160 struct list_head new_bg_list;
1130}; 1161};
1131 1162
1132/* delayed seq elem */ 1163/* delayed seq elem */
@@ -1240,7 +1271,6 @@ struct btrfs_fs_info {
1240 struct mutex reloc_mutex; 1271 struct mutex reloc_mutex;
1241 1272
1242 struct list_head trans_list; 1273 struct list_head trans_list;
1243 struct list_head hashers;
1244 struct list_head dead_roots; 1274 struct list_head dead_roots;
1245 struct list_head caching_block_groups; 1275 struct list_head caching_block_groups;
1246 1276
@@ -1366,9 +1396,6 @@ struct btrfs_fs_info {
1366 struct rb_root defrag_inodes; 1396 struct rb_root defrag_inodes;
1367 atomic_t defrag_running; 1397 atomic_t defrag_running;
1368 1398
1369 spinlock_t ref_cache_lock;
1370 u64 total_ref_cache_size;
1371
1372 /* 1399 /*
1373 * these three are in extended format (availability of single 1400 * these three are in extended format (availability of single
1374 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other 1401 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
@@ -1441,6 +1468,8 @@ struct btrfs_fs_info {
1441 1468
1442 /* next backup root to be overwritten */ 1469 /* next backup root to be overwritten */
1443 int backup_root_index; 1470 int backup_root_index;
1471
1472 int num_tolerated_disk_barrier_failures;
1444}; 1473};
1445 1474
1446/* 1475/*
@@ -1481,9 +1510,9 @@ struct btrfs_root {
1481 wait_queue_head_t log_commit_wait[2]; 1510 wait_queue_head_t log_commit_wait[2];
1482 atomic_t log_writers; 1511 atomic_t log_writers;
1483 atomic_t log_commit[2]; 1512 atomic_t log_commit[2];
1513 atomic_t log_batch;
1484 unsigned long log_transid; 1514 unsigned long log_transid;
1485 unsigned long last_log_commit; 1515 unsigned long last_log_commit;
1486 unsigned long log_batch;
1487 pid_t log_start_pid; 1516 pid_t log_start_pid;
1488 bool log_multiple_pids; 1517 bool log_multiple_pids;
1489 1518
@@ -1592,6 +1621,7 @@ struct btrfs_ioctl_defrag_range_args {
1592 */ 1621 */
1593#define BTRFS_INODE_ITEM_KEY 1 1622#define BTRFS_INODE_ITEM_KEY 1
1594#define BTRFS_INODE_REF_KEY 12 1623#define BTRFS_INODE_REF_KEY 12
1624#define BTRFS_INODE_EXTREF_KEY 13
1595#define BTRFS_XATTR_ITEM_KEY 24 1625#define BTRFS_XATTR_ITEM_KEY 24
1596#define BTRFS_ORPHAN_ITEM_KEY 48 1626#define BTRFS_ORPHAN_ITEM_KEY 48
1597/* reserve 2-15 close to the inode for later flexibility */ 1627/* reserve 2-15 close to the inode for later flexibility */
@@ -1978,6 +2008,13 @@ BTRFS_SETGET_STACK_FUNCS(block_group_flags,
1978BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); 2008BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
1979BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); 2009BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
1980 2010
2011/* struct btrfs_inode_extref */
2012BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref,
2013 parent_objectid, 64);
2014BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref,
2015 name_len, 16);
2016BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64);
2017
1981/* struct btrfs_inode_item */ 2018/* struct btrfs_inode_item */
1982BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); 2019BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
1983BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); 2020BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
@@ -2858,6 +2895,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
2858 u64 size); 2895 u64 size);
2859int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 2896int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
2860 struct btrfs_root *root, u64 group_start); 2897 struct btrfs_root *root, u64 group_start);
2898void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
2899 struct btrfs_root *root);
2861u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 2900u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
2862u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); 2901u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
2863void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2902void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
@@ -2874,8 +2913,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
2874void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); 2913void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
2875int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); 2914int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
2876void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes); 2915void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
2877void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); 2916void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
2878struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); 2917struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
2918 unsigned short type);
2879void btrfs_free_block_rsv(struct btrfs_root *root, 2919void btrfs_free_block_rsv(struct btrfs_root *root,
2880 struct btrfs_block_rsv *rsv); 2920 struct btrfs_block_rsv *rsv);
2881int btrfs_block_rsv_add(struct btrfs_root *root, 2921int btrfs_block_rsv_add(struct btrfs_root *root,
@@ -3172,12 +3212,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
3172 struct btrfs_root *root, 3212 struct btrfs_root *root,
3173 const char *name, int name_len, 3213 const char *name, int name_len,
3174 u64 inode_objectid, u64 ref_objectid, u64 *index); 3214 u64 inode_objectid, u64 ref_objectid, u64 *index);
3175struct btrfs_inode_ref * 3215int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
3176btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, 3216 struct btrfs_root *root,
3177 struct btrfs_root *root, 3217 struct btrfs_path *path,
3178 struct btrfs_path *path, 3218 const char *name, int name_len,
3179 const char *name, int name_len, 3219 u64 inode_objectid, u64 ref_objectid, int mod,
3180 u64 inode_objectid, u64 ref_objectid, int mod); 3220 u64 *ret_index);
3181int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, 3221int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
3182 struct btrfs_root *root, 3222 struct btrfs_root *root,
3183 struct btrfs_path *path, u64 objectid); 3223 struct btrfs_path *path, u64 objectid);
@@ -3185,6 +3225,19 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
3185 *root, struct btrfs_path *path, 3225 *root, struct btrfs_path *path,
3186 struct btrfs_key *location, int mod); 3226 struct btrfs_key *location, int mod);
3187 3227
3228struct btrfs_inode_extref *
3229btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
3230 struct btrfs_root *root,
3231 struct btrfs_path *path,
3232 const char *name, int name_len,
3233 u64 inode_objectid, u64 ref_objectid, int ins_len,
3234 int cow);
3235
3236int btrfs_find_name_in_ext_backref(struct btrfs_path *path,
3237 u64 ref_objectid, const char *name,
3238 int name_len,
3239 struct btrfs_inode_extref **extref_ret);
3240
3188/* file-item.c */ 3241/* file-item.c */
3189int btrfs_del_csums(struct btrfs_trans_handle *trans, 3242int btrfs_del_csums(struct btrfs_trans_handle *trans,
3190 struct btrfs_root *root, u64 bytenr, u64 len); 3243 struct btrfs_root *root, u64 bytenr, u64 len);
@@ -3249,6 +3302,8 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
3249 struct btrfs_root *root, 3302 struct btrfs_root *root,
3250 struct inode *dir, u64 objectid, 3303 struct inode *dir, u64 objectid,
3251 const char *name, int name_len); 3304 const char *name, int name_len);
3305int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
3306 int front);
3252int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 3307int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3253 struct btrfs_root *root, 3308 struct btrfs_root *root,
3254 struct inode *inode, u64 new_size, 3309 struct inode *inode, u64 new_size,
@@ -3308,16 +3363,27 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
3308int btrfs_defrag_file(struct inode *inode, struct file *file, 3363int btrfs_defrag_file(struct inode *inode, struct file *file,
3309 struct btrfs_ioctl_defrag_range_args *range, 3364 struct btrfs_ioctl_defrag_range_args *range,
3310 u64 newer_than, unsigned long max_pages); 3365 u64 newer_than, unsigned long max_pages);
3366void btrfs_get_block_group_info(struct list_head *groups_list,
3367 struct btrfs_ioctl_space_info *space);
3368
3311/* file.c */ 3369/* file.c */
3312int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, 3370int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
3313 struct inode *inode); 3371 struct inode *inode);
3314int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); 3372int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
3315int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); 3373int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
3316int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 3374void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
3317 int skip_pinned); 3375 int skip_pinned);
3376int btrfs_replace_extent_cache(struct inode *inode, struct extent_map *replace,
3377 u64 start, u64 end, int skip_pinned,
3378 int modified);
3318extern const struct file_operations btrfs_file_operations; 3379extern const struct file_operations btrfs_file_operations;
3319int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, 3380int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
3320 u64 start, u64 end, u64 *hint_byte, int drop_cache); 3381 struct btrfs_root *root, struct inode *inode,
3382 struct btrfs_path *path, u64 start, u64 end,
3383 u64 *drop_end, int drop_cache);
3384int btrfs_drop_extents(struct btrfs_trans_handle *trans,
3385 struct btrfs_root *root, struct inode *inode, u64 start,
3386 u64 end, int drop_cache);
3321int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, 3387int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
3322 struct inode *inode, u64 start, u64 end); 3388 struct inode *inode, u64 start, u64 end);
3323int btrfs_release_file(struct inode *inode, struct file *file); 3389int btrfs_release_file(struct inode *inode, struct file *file);
@@ -3378,6 +3444,11 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
3378 } 3444 }
3379} 3445}
3380 3446
3447/*
3448 * Call btrfs_abort_transaction as early as possible when an error condition is
3449 * detected, so that the exact line number is reported.
3450 */
3451
3381#define btrfs_abort_transaction(trans, root, errno) \ 3452#define btrfs_abort_transaction(trans, root, errno) \
3382do { \ 3453do { \
3383 __btrfs_abort_transaction(trans, root, __func__, \ 3454 __btrfs_abort_transaction(trans, root, __func__, \
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 52c85e2b95d0..478f66bdc57b 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -29,7 +29,7 @@ static struct kmem_cache *delayed_node_cache;
29 29
30int __init btrfs_delayed_inode_init(void) 30int __init btrfs_delayed_inode_init(void)
31{ 31{
32 delayed_node_cache = kmem_cache_create("delayed_node", 32 delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
33 sizeof(struct btrfs_delayed_node), 33 sizeof(struct btrfs_delayed_node),
34 0, 34 0,
35 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, 35 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
@@ -650,7 +650,7 @@ static int btrfs_delayed_inode_reserve_metadata(
650 * we're accounted for. 650 * we're accounted for.
651 */ 651 */
652 if (!src_rsv || (!trans->bytes_reserved && 652 if (!src_rsv || (!trans->bytes_reserved &&
653 src_rsv != &root->fs_info->delalloc_block_rsv)) { 653 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
654 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); 654 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
655 /* 655 /*
656 * Since we're under a transaction reserve_metadata_bytes could 656 * Since we're under a transaction reserve_metadata_bytes could
@@ -668,7 +668,7 @@ static int btrfs_delayed_inode_reserve_metadata(
668 num_bytes, 1); 668 num_bytes, 1);
669 } 669 }
670 return ret; 670 return ret;
671 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { 671 } else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
672 spin_lock(&BTRFS_I(inode)->lock); 672 spin_lock(&BTRFS_I(inode)->lock);
673 if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 673 if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
674 &BTRFS_I(inode)->runtime_flags)) { 674 &BTRFS_I(inode)->runtime_flags)) {
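
Here is the first payoff of the new type field: delayed-inode used to recognize the delalloc reservation by pointer identity against &fs_info->delalloc_block_rsv, which misclassifies any reservation that is a copy of it or is reached through another pointer. Matching on what the reservation is for, rather than where it lives, reduces to a sketch like:

/* Classify a reservation by purpose rather than by address. */
static bool rsv_is_delalloc(const struct btrfs_block_rsv *rsv)
{
	return rsv && rsv->type == BTRFS_BLOCK_RSV_DELALLOC;
}

The same test works for stack-allocated or cloned reservations, which the address comparison never could.
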
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 22e98e04c2ea..7cda51995c1e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -46,6 +46,10 @@
46#include "check-integrity.h" 46#include "check-integrity.h"
47#include "rcu-string.h" 47#include "rcu-string.h"
48 48
49#ifdef CONFIG_X86
50#include <asm/cpufeature.h>
51#endif
52
49static struct extent_io_ops btree_extent_io_ops; 53static struct extent_io_ops btree_extent_io_ops;
50static void end_workqueue_fn(struct btrfs_work *work); 54static void end_workqueue_fn(struct btrfs_work *work);
51static void free_fs_root(struct btrfs_root *root); 55static void free_fs_root(struct btrfs_root *root);
@@ -217,26 +221,16 @@ static struct extent_map *btree_get_extent(struct inode *inode,
217 write_lock(&em_tree->lock); 221 write_lock(&em_tree->lock);
218 ret = add_extent_mapping(em_tree, em); 222 ret = add_extent_mapping(em_tree, em);
219 if (ret == -EEXIST) { 223 if (ret == -EEXIST) {
220 u64 failed_start = em->start;
221 u64 failed_len = em->len;
222
223 free_extent_map(em); 224 free_extent_map(em);
224 em = lookup_extent_mapping(em_tree, start, len); 225 em = lookup_extent_mapping(em_tree, start, len);
225 if (em) { 226 if (!em)
226 ret = 0; 227 em = ERR_PTR(-EIO);
227 } else {
228 em = lookup_extent_mapping(em_tree, failed_start,
229 failed_len);
230 ret = -EIO;
231 }
232 } else if (ret) { 228 } else if (ret) {
233 free_extent_map(em); 229 free_extent_map(em);
234 em = NULL; 230 em = ERR_PTR(ret);
235 } 231 }
236 write_unlock(&em_tree->lock); 232 write_unlock(&em_tree->lock);
237 233
238 if (ret)
239 em = ERR_PTR(ret);
240out: 234out:
241 return em; 235 return em;
242} 236}
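
After this hunk btree_get_extent() has exactly two outcomes: a usable extent_map or an ERR_PTR() encoding the failure, never NULL and never an em/ret pair that can fall out of sync. Callers then collapse to the standard pattern; a sketch with the caller invented for illustration:

#include <linux/err.h>		/* IS_ERR, PTR_ERR */

static int read_one_mapping(struct inode *inode, u64 start, u64 len)
{
	struct extent_map *em;

	em = btree_get_extent(inode, NULL, 0, start, len, 0);
	if (IS_ERR(em))
		return PTR_ERR(em);	/* one check covers every failure */
	/* ... use em ... */
	free_extent_map(em);
	return 0;
}
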
@@ -439,10 +433,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
439 WARN_ON(1); 433 WARN_ON(1);
440 return 0; 434 return 0;
441 } 435 }
442 if (eb->pages[0] != page) {
443 WARN_ON(1);
444 return 0;
445 }
446 if (!PageUptodate(page)) { 436 if (!PageUptodate(page)) {
447 WARN_ON(1); 437 WARN_ON(1);
448 return 0; 438 return 0;
@@ -869,10 +859,22 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
869 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); 859 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
870} 860}
871 861
862static int check_async_write(struct inode *inode, unsigned long bio_flags)
863{
864 if (bio_flags & EXTENT_BIO_TREE_LOG)
865 return 0;
866#ifdef CONFIG_X86
867 if (cpu_has_xmm4_2)
868 return 0;
869#endif
870 return 1;
871}
872
872static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 873static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
873 int mirror_num, unsigned long bio_flags, 874 int mirror_num, unsigned long bio_flags,
874 u64 bio_offset) 875 u64 bio_offset)
875{ 876{
877 int async = check_async_write(inode, bio_flags);
876 int ret; 878 int ret;
877 879
878 if (!(rw & REQ_WRITE)) { 880 if (!(rw & REQ_WRITE)) {
@@ -887,6 +889,12 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
887 return ret; 889 return ret;
888 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 890 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
889 mirror_num, 0); 891 mirror_num, 0);
892 } else if (!async) {
893 ret = btree_csum_one_bio(bio);
894 if (ret)
895 return ret;
896 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
897 mirror_num, 0);
890 } 898 }
891 899
892 /* 900 /*
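
check_async_write() decides whether checksumming a metadata write is worth bouncing to the worker pool: tree-log bios stay inline because fsync latency matters, and on x86 parts with SSE4.2 (the cpu_has_xmm4_2 probe) hardware CRC32c makes inline checksumming cheap enough that the offload buys nothing. The new else-if branch in btree_submit_bio_hook() is that inline path. The decision in isolation, with a stand-in flag value:

#define TREE_LOG_FLAG_MODEL (1UL << 1)	/* stand-in for EXTENT_BIO_TREE_LOG */

static int want_async_csum(unsigned long bio_flags, int hw_crc32c)
{
	if (bio_flags & TREE_LOG_FLAG_MODEL)
		return 0;	/* fsync path: checksum inline, submit now */
	if (hw_crc32c)
		return 0;	/* SSE4.2 CRC32c: offloading gains nothing */
	return 1;		/* software csum: let the workers absorb it */
}
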
@@ -1168,8 +1176,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1168 atomic_set(&root->log_commit[0], 0); 1176 atomic_set(&root->log_commit[0], 0);
1169 atomic_set(&root->log_commit[1], 0); 1177 atomic_set(&root->log_commit[1], 0);
1170 atomic_set(&root->log_writers, 0); 1178 atomic_set(&root->log_writers, 0);
1179 atomic_set(&root->log_batch, 0);
1171 atomic_set(&root->orphan_inodes, 0); 1180 atomic_set(&root->orphan_inodes, 0);
1172 root->log_batch = 0;
1173 root->log_transid = 0; 1181 root->log_transid = 0;
1174 root->last_log_commit = 0; 1182 root->last_log_commit = 0;
1175 extent_io_tree_init(&root->dirty_log_pages, 1183 extent_io_tree_init(&root->dirty_log_pages,
@@ -1667,9 +1675,10 @@ static int transaction_kthread(void *arg)
1667 spin_unlock(&root->fs_info->trans_lock); 1675 spin_unlock(&root->fs_info->trans_lock);
1668 1676
1669 /* If the file system is aborted, this will always fail. */ 1677 /* If the file system is aborted, this will always fail. */
1670 trans = btrfs_join_transaction(root); 1678 trans = btrfs_attach_transaction(root);
1671 if (IS_ERR(trans)) { 1679 if (IS_ERR(trans)) {
1672 cannot_commit = true; 1680 if (PTR_ERR(trans) != -ENOENT)
1681 cannot_commit = true;
1673 goto sleep; 1682 goto sleep;
1674 } 1683 }
1675 if (transid == trans->transid) { 1684 if (transid == trans->transid) {
@@ -1994,13 +2003,11 @@ int open_ctree(struct super_block *sb,
1994 INIT_LIST_HEAD(&fs_info->trans_list); 2003 INIT_LIST_HEAD(&fs_info->trans_list);
1995 INIT_LIST_HEAD(&fs_info->dead_roots); 2004 INIT_LIST_HEAD(&fs_info->dead_roots);
1996 INIT_LIST_HEAD(&fs_info->delayed_iputs); 2005 INIT_LIST_HEAD(&fs_info->delayed_iputs);
1997 INIT_LIST_HEAD(&fs_info->hashers);
1998 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 2006 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1999 INIT_LIST_HEAD(&fs_info->ordered_operations); 2007 INIT_LIST_HEAD(&fs_info->ordered_operations);
2000 INIT_LIST_HEAD(&fs_info->caching_block_groups); 2008 INIT_LIST_HEAD(&fs_info->caching_block_groups);
2001 spin_lock_init(&fs_info->delalloc_lock); 2009 spin_lock_init(&fs_info->delalloc_lock);
2002 spin_lock_init(&fs_info->trans_lock); 2010 spin_lock_init(&fs_info->trans_lock);
2003 spin_lock_init(&fs_info->ref_cache_lock);
2004 spin_lock_init(&fs_info->fs_roots_radix_lock); 2011 spin_lock_init(&fs_info->fs_roots_radix_lock);
2005 spin_lock_init(&fs_info->delayed_iput_lock); 2012 spin_lock_init(&fs_info->delayed_iput_lock);
2006 spin_lock_init(&fs_info->defrag_inodes_lock); 2013 spin_lock_init(&fs_info->defrag_inodes_lock);
@@ -2014,12 +2021,15 @@ int open_ctree(struct super_block *sb,
2014 INIT_LIST_HEAD(&fs_info->space_info); 2021 INIT_LIST_HEAD(&fs_info->space_info);
2015 INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); 2022 INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
2016 btrfs_mapping_init(&fs_info->mapping_tree); 2023 btrfs_mapping_init(&fs_info->mapping_tree);
2017 btrfs_init_block_rsv(&fs_info->global_block_rsv); 2024 btrfs_init_block_rsv(&fs_info->global_block_rsv,
2018 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv); 2025 BTRFS_BLOCK_RSV_GLOBAL);
2019 btrfs_init_block_rsv(&fs_info->trans_block_rsv); 2026 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
2020 btrfs_init_block_rsv(&fs_info->chunk_block_rsv); 2027 BTRFS_BLOCK_RSV_DELALLOC);
2021 btrfs_init_block_rsv(&fs_info->empty_block_rsv); 2028 btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
2022 btrfs_init_block_rsv(&fs_info->delayed_block_rsv); 2029 btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
2030 btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
2031 btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
2032 BTRFS_BLOCK_RSV_DELOPS);
2023 atomic_set(&fs_info->nr_async_submits, 0); 2033 atomic_set(&fs_info->nr_async_submits, 0);
2024 atomic_set(&fs_info->async_delalloc_pages, 0); 2034 atomic_set(&fs_info->async_delalloc_pages, 0);
2025 atomic_set(&fs_info->async_submit_draining, 0); 2035 atomic_set(&fs_info->async_submit_draining, 0);
@@ -2491,6 +2501,8 @@ retry_root_backup:
2491 printk(KERN_ERR "Failed to read block groups: %d\n", ret); 2501 printk(KERN_ERR "Failed to read block groups: %d\n", ret);
2492 goto fail_block_groups; 2502 goto fail_block_groups;
2493 } 2503 }
2504 fs_info->num_tolerated_disk_barrier_failures =
2505 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
2494 2506
2495 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 2507 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
2496 "btrfs-cleaner"); 2508 "btrfs-cleaner");
@@ -2874,12 +2886,10 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
2874 printk_in_rcu("btrfs: disabling barriers on dev %s\n", 2886 printk_in_rcu("btrfs: disabling barriers on dev %s\n",
2875 rcu_str_deref(device->name)); 2887 rcu_str_deref(device->name));
2876 device->nobarriers = 1; 2888 device->nobarriers = 1;
2877 } 2889 } else if (!bio_flagged(bio, BIO_UPTODATE)) {
2878 if (!bio_flagged(bio, BIO_UPTODATE)) {
2879 ret = -EIO; 2890 ret = -EIO;
2880 if (!bio_flagged(bio, BIO_EOPNOTSUPP)) 2891 btrfs_dev_stat_inc_and_print(device,
2881 btrfs_dev_stat_inc_and_print(device, 2892 BTRFS_DEV_STAT_FLUSH_ERRS);
2882 BTRFS_DEV_STAT_FLUSH_ERRS);
2883 } 2893 }
2884 2894
2885 /* drop the reference from the wait == 0 run */ 2895 /* drop the reference from the wait == 0 run */
@@ -2918,14 +2928,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
2918{ 2928{
2919 struct list_head *head; 2929 struct list_head *head;
2920 struct btrfs_device *dev; 2930 struct btrfs_device *dev;
2921 int errors = 0; 2931 int errors_send = 0;
2932 int errors_wait = 0;
2922 int ret; 2933 int ret;
2923 2934
2924 /* send down all the barriers */ 2935 /* send down all the barriers */
2925 head = &info->fs_devices->devices; 2936 head = &info->fs_devices->devices;
2926 list_for_each_entry_rcu(dev, head, dev_list) { 2937 list_for_each_entry_rcu(dev, head, dev_list) {
2927 if (!dev->bdev) { 2938 if (!dev->bdev) {
2928 errors++; 2939 errors_send++;
2929 continue; 2940 continue;
2930 } 2941 }
2931 if (!dev->in_fs_metadata || !dev->writeable) 2942 if (!dev->in_fs_metadata || !dev->writeable)
@@ -2933,13 +2944,13 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
2933 2944
2934 ret = write_dev_flush(dev, 0); 2945 ret = write_dev_flush(dev, 0);
2935 if (ret) 2946 if (ret)
2936 errors++; 2947 errors_send++;
2937 } 2948 }
2938 2949
2939 /* wait for all the barriers */ 2950 /* wait for all the barriers */
2940 list_for_each_entry_rcu(dev, head, dev_list) { 2951 list_for_each_entry_rcu(dev, head, dev_list) {
2941 if (!dev->bdev) { 2952 if (!dev->bdev) {
2942 errors++; 2953 errors_wait++;
2943 continue; 2954 continue;
2944 } 2955 }
2945 if (!dev->in_fs_metadata || !dev->writeable) 2956 if (!dev->in_fs_metadata || !dev->writeable)
@@ -2947,13 +2958,87 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
2947 2958
2948 ret = write_dev_flush(dev, 1); 2959 ret = write_dev_flush(dev, 1);
2949 if (ret) 2960 if (ret)
2950 errors++; 2961 errors_wait++;
2951 } 2962 }
2952 if (errors) 2963 if (errors_send > info->num_tolerated_disk_barrier_failures ||
2964 errors_wait > info->num_tolerated_disk_barrier_failures)
2953 return -EIO; 2965 return -EIO;
2954 return 0; 2966 return 0;
2955} 2967}
2956 2968
2969int btrfs_calc_num_tolerated_disk_barrier_failures(
2970 struct btrfs_fs_info *fs_info)
2971{
2972 struct btrfs_ioctl_space_info space;
2973 struct btrfs_space_info *sinfo;
2974 u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
2975 BTRFS_BLOCK_GROUP_SYSTEM,
2976 BTRFS_BLOCK_GROUP_METADATA,
2977 BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
2978 int num_types = 4;
2979 int i;
2980 int c;
2981 int num_tolerated_disk_barrier_failures =
2982 (int)fs_info->fs_devices->num_devices;
2983
2984 for (i = 0; i < num_types; i++) {
2985 struct btrfs_space_info *tmp;
2986
2987 sinfo = NULL;
2988 rcu_read_lock();
2989 list_for_each_entry_rcu(tmp, &fs_info->space_info, list) {
2990 if (tmp->flags == types[i]) {
2991 sinfo = tmp;
2992 break;
2993 }
2994 }
2995 rcu_read_unlock();
2996
2997 if (!sinfo)
2998 continue;
2999
3000 down_read(&sinfo->groups_sem);
3001 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
3002 if (!list_empty(&sinfo->block_groups[c])) {
3003 u64 flags;
3004
3005 btrfs_get_block_group_info(
3006 &sinfo->block_groups[c], &space);
3007 if (space.total_bytes == 0 ||
3008 space.used_bytes == 0)
3009 continue;
3010 flags = space.flags;
3011 /*
3012 * return
3013 * 0: if dup, single or RAID0 is configured for
3014 * any of metadata, system or data, else
3015 * 1: if RAID5 is configured, or if RAID1 or
3016 * RAID10 is configured and only two mirrors
3017 * are used, else
3018 * 2: if RAID6 is configured, else
3019 * num_mirrors - 1: if RAID1 or RAID10 is
3020 * configured and more than
3021 * 2 mirrors are used.
3022 */
3023 if (num_tolerated_disk_barrier_failures > 0 &&
3024 ((flags & (BTRFS_BLOCK_GROUP_DUP |
3025 BTRFS_BLOCK_GROUP_RAID0)) ||
3026 ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
3027 == 0)))
3028 num_tolerated_disk_barrier_failures = 0;
3029 else if (num_tolerated_disk_barrier_failures > 1
3030 &&
3031 (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3032 BTRFS_BLOCK_GROUP_RAID10)))
3033 num_tolerated_disk_barrier_failures = 1;
3034 }
3035 }
3036 up_read(&sinfo->groups_sem);
3037 }
3038
3039 return num_tolerated_disk_barrier_failures;
3040}
3041
2957int write_all_supers(struct btrfs_root *root, int max_mirrors) 3042int write_all_supers(struct btrfs_root *root, int max_mirrors)
2958{ 3043{
2959 struct list_head *head; 3044 struct list_head *head;
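
btrfs_calc_num_tolerated_disk_barrier_failures() is a minimum over the redundancy of every profile actually holding data: start from the device count and only ever clamp downward. A user-space model following the comment embedded above (RAID5/6 appear in that comment but are not yet handled by this code, and DUP counts as zero because both copies share one device):

#include <stdint.h>

#define M_DUP	 (1u << 0)	/* stand-ins for BTRFS_BLOCK_GROUP_* bits */
#define M_RAID0	 (1u << 1)
#define M_RAID1	 (1u << 2)
#define M_RAID10 (1u << 3)
#define M_PROFILE_MASK (M_DUP | M_RAID0 | M_RAID1 | M_RAID10)

static int tolerated_failures(int ndevs, const uint32_t *profiles, int n)
{
	int tol = ndevs;	/* upper bound, only ever lowered */
	int i;

	for (i = 0; i < n; i++) {
		if ((profiles[i] & (M_DUP | M_RAID0)) ||
		    !(profiles[i] & M_PROFILE_MASK))
			tol = 0;	/* single/dup/raid0: no flush may fail */
		else if (tol > 1 && (profiles[i] & (M_RAID1 | M_RAID10)))
			tol = 1;	/* two mirrors: one flush may fail */
	}
	return tol;
}
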
@@ -2976,8 +3061,16 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2976 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 3061 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2977 head = &root->fs_info->fs_devices->devices; 3062 head = &root->fs_info->fs_devices->devices;
2978 3063
2979 if (do_barriers) 3064 if (do_barriers) {
2980 barrier_all_devices(root->fs_info); 3065 ret = barrier_all_devices(root->fs_info);
3066 if (ret) {
3067 mutex_unlock(
3068 &root->fs_info->fs_devices->device_list_mutex);
3069 btrfs_error(root->fs_info, ret,
3070 "errors while submitting device barriers.");
3071 return ret;
3072 }
3073 }
2981 3074
2982 list_for_each_entry_rcu(dev, head, dev_list) { 3075 list_for_each_entry_rcu(dev, head, dev_list) {
2983 if (!dev->bdev) { 3076 if (!dev->bdev) {
@@ -3211,10 +3304,6 @@ int close_ctree(struct btrfs_root *root)
3211 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", 3304 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
3212 (unsigned long long)fs_info->delalloc_bytes); 3305 (unsigned long long)fs_info->delalloc_bytes);
3213 } 3306 }
3214 if (fs_info->total_ref_cache_size) {
3215 printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
3216 (unsigned long long)fs_info->total_ref_cache_size);
3217 }
3218 3307
3219 free_extent_buffer(fs_info->extent_root->node); 3308 free_extent_buffer(fs_info->extent_root->node);
3220 free_extent_buffer(fs_info->extent_root->commit_root); 3309 free_extent_buffer(fs_info->extent_root->commit_root);
@@ -3360,52 +3449,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
3360 return btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 3449 return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
3361} 3450}
3362 3451
3363int btree_lock_page_hook(struct page *page, void *data,
3364 void (*flush_fn)(void *))
3365{
3366 struct inode *inode = page->mapping->host;
3367 struct btrfs_root *root = BTRFS_I(inode)->root;
3368 struct extent_buffer *eb;
3369
3370 /*
3371 * We culled this eb but the page is still hanging out on the mapping,
3372 * carry on.
3373 */
3374 if (!PagePrivate(page))
3375 goto out;
3376
3377 eb = (struct extent_buffer *)page->private;
3378 if (!eb) {
3379 WARN_ON(1);
3380 goto out;
3381 }
3382 if (page != eb->pages[0])
3383 goto out;
3384
3385 if (!btrfs_try_tree_write_lock(eb)) {
3386 flush_fn(data);
3387 btrfs_tree_lock(eb);
3388 }
3389 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3390
3391 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3392 spin_lock(&root->fs_info->delalloc_lock);
3393 if (root->fs_info->dirty_metadata_bytes >= eb->len)
3394 root->fs_info->dirty_metadata_bytes -= eb->len;
3395 else
3396 WARN_ON(1);
3397 spin_unlock(&root->fs_info->delalloc_lock);
3398 }
3399
3400 btrfs_tree_unlock(eb);
3401out:
3402 if (!trylock_page(page)) {
3403 flush_fn(data);
3404 lock_page(page);
3405 }
3406 return 0;
3407}
3408
3409static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, 3452static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
3410 int read_only) 3453 int read_only)
3411{ 3454{
@@ -3608,7 +3651,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
3608 3651
3609 while (1) { 3652 while (1) {
3610 ret = find_first_extent_bit(dirty_pages, start, &start, &end, 3653 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
3611 mark); 3654 mark, NULL);
3612 if (ret) 3655 if (ret)
3613 break; 3656 break;
3614 3657
@@ -3663,7 +3706,7 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
3663again: 3706again:
3664 while (1) { 3707 while (1) {
3665 ret = find_first_extent_bit(unpin, 0, &start, &end, 3708 ret = find_first_extent_bit(unpin, 0, &start, &end,
3666 EXTENT_DIRTY); 3709 EXTENT_DIRTY, NULL);
3667 if (ret) 3710 if (ret)
3668 break; 3711 break;
3669 3712
@@ -3800,7 +3843,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
3800} 3843}
3801 3844
3802static struct extent_io_ops btree_extent_io_ops = { 3845static struct extent_io_ops btree_extent_io_ops = {
3803 .write_cache_pages_lock_hook = btree_lock_page_hook,
3804 .readpage_end_io_hook = btree_readpage_end_io_hook, 3846 .readpage_end_io_hook = btree_readpage_end_io_hook,
3805 .readpage_io_failed_hook = btree_io_failed_hook, 3847 .readpage_io_failed_hook = btree_io_failed_hook,
3806 .submit_bio_hook = btree_submit_bio_hook, 3848 .submit_bio_hook = btree_submit_bio_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c5b00a735fef..2025a9132c16 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -95,6 +95,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
95 u64 objectid); 95 u64 objectid);
96int btree_lock_page_hook(struct page *page, void *data, 96int btree_lock_page_hook(struct page *page, void *data,
97 void (*flush_fn)(void *)); 97 void (*flush_fn)(void *));
98int btrfs_calc_num_tolerated_disk_barrier_failures(
99 struct btrfs_fs_info *fs_info);
98 100
99#ifdef CONFIG_DEBUG_LOCK_ALLOC 101#ifdef CONFIG_DEBUG_LOCK_ALLOC
100void btrfs_init_lockdep(void); 102void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ba58024d40d3..3d3e2c17d8d1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -94,8 +94,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
94 u64 flags, struct btrfs_disk_key *key, 94 u64 flags, struct btrfs_disk_key *key,
95 int level, struct btrfs_key *ins); 95 int level, struct btrfs_key *ins);
96static int do_chunk_alloc(struct btrfs_trans_handle *trans, 96static int do_chunk_alloc(struct btrfs_trans_handle *trans,
97 struct btrfs_root *extent_root, u64 alloc_bytes, 97 struct btrfs_root *extent_root, u64 flags,
98 u64 flags, int force); 98 int force);
99static int find_next_key(struct btrfs_path *path, int level, 99static int find_next_key(struct btrfs_path *path, int level,
100 struct btrfs_key *key); 100 struct btrfs_key *key);
101static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 101static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -312,7 +312,8 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
312 while (start < end) { 312 while (start < end) {
313 ret = find_first_extent_bit(info->pinned_extents, start, 313 ret = find_first_extent_bit(info->pinned_extents, start,
314 &extent_start, &extent_end, 314 &extent_start, &extent_end,
315 EXTENT_DIRTY | EXTENT_UPTODATE); 315 EXTENT_DIRTY | EXTENT_UPTODATE,
316 NULL);
316 if (ret) 317 if (ret)
317 break; 318 break;
318 319
@@ -2361,10 +2362,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2361 } 2362 }
2362 2363
2363next: 2364next:
2364 do_chunk_alloc(trans, fs_info->extent_root,
2365 2 * 1024 * 1024,
2366 btrfs_get_alloc_profile(root, 0),
2367 CHUNK_ALLOC_NO_FORCE);
2368 cond_resched(); 2365 cond_resched();
2369 spin_lock(&delayed_refs->lock); 2366 spin_lock(&delayed_refs->lock);
2370 } 2367 }
@@ -2478,10 +2475,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2478 if (root == root->fs_info->extent_root) 2475 if (root == root->fs_info->extent_root)
2479 root = root->fs_info->tree_root; 2476 root = root->fs_info->tree_root;
2480 2477
2481 do_chunk_alloc(trans, root->fs_info->extent_root,
2482 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
2483 CHUNK_ALLOC_NO_FORCE);
2484
2485 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); 2478 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
2486 2479
2487 delayed_refs = &trans->transaction->delayed_refs; 2480 delayed_refs = &trans->transaction->delayed_refs;
@@ -2551,6 +2544,12 @@ again:
2551 } 2544 }
2552 2545
2553 if (run_all) { 2546 if (run_all) {
2547 if (!list_empty(&trans->new_bgs)) {
2548 spin_unlock(&delayed_refs->lock);
2549 btrfs_create_pending_block_groups(trans, root);
2550 spin_lock(&delayed_refs->lock);
2551 }
2552
2554 node = rb_first(&delayed_refs->root); 2553 node = rb_first(&delayed_refs->root);
2555 if (!node) 2554 if (!node)
2556 goto out; 2555 goto out;
@@ -3406,7 +3405,6 @@ alloc:
3406 return PTR_ERR(trans); 3405 return PTR_ERR(trans);
3407 3406
3408 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3407 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3409 bytes + 2 * 1024 * 1024,
3410 alloc_target, 3408 alloc_target,
3411 CHUNK_ALLOC_NO_FORCE); 3409 CHUNK_ALLOC_NO_FORCE);
3412 btrfs_end_transaction(trans, root); 3410 btrfs_end_transaction(trans, root);
@@ -3488,8 +3486,7 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
3488} 3486}
3489 3487
3490static int should_alloc_chunk(struct btrfs_root *root, 3488static int should_alloc_chunk(struct btrfs_root *root,
3491 struct btrfs_space_info *sinfo, u64 alloc_bytes, 3489 struct btrfs_space_info *sinfo, int force)
3492 int force)
3493{ 3490{
3494 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 3491 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3495 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3492 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
@@ -3504,7 +3501,8 @@ static int should_alloc_chunk(struct btrfs_root *root,
3504 * and purposes it's used space. Don't worry about locking the 3501 * and purposes it's used space. Don't worry about locking the
3505 * global_rsv, it doesn't change except when the transaction commits. 3502 * global_rsv, it doesn't change except when the transaction commits.
3506 */ 3503 */
3507 num_allocated += global_rsv->size; 3504 if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
3505 num_allocated += global_rsv->size;
3508 3506
3509 /* 3507 /*
3510 * in limited mode, we want to have some free space up to 3508 * in limited mode, we want to have some free space up to
@@ -3518,15 +3516,8 @@ static int should_alloc_chunk(struct btrfs_root *root,
3518 if (num_bytes - num_allocated < thresh) 3516 if (num_bytes - num_allocated < thresh)
3519 return 1; 3517 return 1;
3520 } 3518 }
3521 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3522 3519
3523 /* 256MB or 2% of the FS */ 3520 if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
3524 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
3525 /* system chunks need a much small threshold */
3526 if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
3527 thresh = 32 * 1024 * 1024;
3528
3529 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
3530 return 0; 3521 return 0;
3531 return 1; 3522 return 1;
3532} 3523}
@@ -3576,8 +3567,7 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
3576} 3567}
3577 3568
3578static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3569static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3579 struct btrfs_root *extent_root, u64 alloc_bytes, 3570 struct btrfs_root *extent_root, u64 flags, int force)
3580 u64 flags, int force)
3581{ 3571{
3582 struct btrfs_space_info *space_info; 3572 struct btrfs_space_info *space_info;
3583 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3573 struct btrfs_fs_info *fs_info = extent_root->fs_info;
@@ -3601,7 +3591,7 @@ again:
3601 return 0; 3591 return 0;
3602 } 3592 }
3603 3593
3604 if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) { 3594 if (!should_alloc_chunk(extent_root, space_info, force)) {
3605 spin_unlock(&space_info->lock); 3595 spin_unlock(&space_info->lock);
3606 return 0; 3596 return 0;
3607 } else if (space_info->chunk_alloc) { 3597 } else if (space_info->chunk_alloc) {
@@ -3669,6 +3659,46 @@ out:
3669 return ret; 3659 return ret;
3670} 3660}
3671 3661
3662static int can_overcommit(struct btrfs_root *root,
3663 struct btrfs_space_info *space_info, u64 bytes,
3664 int flush)
3665{
3666 u64 profile = btrfs_get_alloc_profile(root, 0);
3667 u64 avail;
3668 u64 used;
3669
3670 used = space_info->bytes_used + space_info->bytes_reserved +
3671 space_info->bytes_pinned + space_info->bytes_readonly +
3672 space_info->bytes_may_use;
3673
3674 spin_lock(&root->fs_info->free_chunk_lock);
3675 avail = root->fs_info->free_chunk_space;
3676 spin_unlock(&root->fs_info->free_chunk_lock);
3677
3678 /*
3679 * If we have dup, raid1 or raid10 then only half of the free
3680 * space is actually useable.
3681 */
3682 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3683 BTRFS_BLOCK_GROUP_RAID1 |
3684 BTRFS_BLOCK_GROUP_RAID10))
3685 avail >>= 1;
3686
3687 /*
3688 * If we aren't flushing don't let us overcommit too much, say
3689 * 1/8th of the space. If we can flush, let it overcommit up to
3690 * 1/2 of the space.
3691 */
3692 if (flush)
3693 avail >>= 3;
3694 else
3695 avail >>= 1;
3696
3697 if (used + bytes < space_info->total_bytes + avail)
3698 return 1;
3699 return 0;
3700}
3701
3672/* 3702/*
3673 * shrink metadata reservation for delalloc 3703 * shrink metadata reservation for delalloc
3674 */ 3704 */
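
can_overcommit() centralizes the heuristic that reserve_metadata_bytes() used to open-code below: how far bytes_may_use may run past total_bytes on the strength of still-unallocated chunk space. Its used input is the sum of bytes_used, bytes_reserved, bytes_pinned, bytes_readonly and bytes_may_use. One oddity is worth flagging: the in-line comment and the shifts disagree about which case gets 1/8th; the model below follows the code as written.

#include <stdint.h>

static int can_overcommit_model(uint64_t used, uint64_t total_bytes,
				uint64_t free_chunk_space, uint64_t bytes,
				int mirrored, int flush)
{
	uint64_t avail = free_chunk_space;

	if (mirrored)		/* dup/raid1/raid10 store everything twice */
		avail >>= 1;
	if (flush)		/* per the code: 1/8th when flushing ... */
		avail >>= 3;
	else			/* ... 1/2 when not (the comment says the reverse) */
		avail >>= 1;
	return used + bytes < total_bytes + avail;
}
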
@@ -3693,7 +3723,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3693 if (delalloc_bytes == 0) { 3723 if (delalloc_bytes == 0) {
3694 if (trans) 3724 if (trans)
3695 return; 3725 return;
3696 btrfs_wait_ordered_extents(root, 0, 0); 3726 btrfs_wait_ordered_extents(root, 0);
3697 return; 3727 return;
3698 } 3728 }
3699 3729
@@ -3703,11 +3733,15 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3703 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, 3733 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
3704 WB_REASON_FS_FREE_SPACE); 3734 WB_REASON_FS_FREE_SPACE);
3705 3735
3736 /*
3737 * We need to wait for the async pages to actually start before
3738 * we do anything.
3739 */
3740 wait_event(root->fs_info->async_submit_wait,
3741 !atomic_read(&root->fs_info->async_delalloc_pages));
3742
3706 spin_lock(&space_info->lock); 3743 spin_lock(&space_info->lock);
3707 if (space_info->bytes_used + space_info->bytes_reserved + 3744 if (can_overcommit(root, space_info, orig, !trans)) {
3708 space_info->bytes_pinned + space_info->bytes_readonly +
3709 space_info->bytes_may_use + orig <=
3710 space_info->total_bytes) {
3711 spin_unlock(&space_info->lock); 3745 spin_unlock(&space_info->lock);
3712 break; 3746 break;
3713 } 3747 }
@@ -3715,7 +3749,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3715 3749
3716 loops++; 3750 loops++;
3717 if (wait_ordered && !trans) { 3751 if (wait_ordered && !trans) {
3718 btrfs_wait_ordered_extents(root, 0, 0); 3752 btrfs_wait_ordered_extents(root, 0);
3719 } else { 3753 } else {
3720 time_left = schedule_timeout_killable(1); 3754 time_left = schedule_timeout_killable(1);
3721 if (time_left) 3755 if (time_left)
@@ -3784,11 +3818,12 @@ commit:
3784} 3818}
3785 3819
3786enum flush_state { 3820enum flush_state {
3787 FLUSH_DELALLOC = 1, 3821 FLUSH_DELAYED_ITEMS_NR = 1,
3788 FLUSH_DELALLOC_WAIT = 2, 3822 FLUSH_DELAYED_ITEMS = 2,
3789 FLUSH_DELAYED_ITEMS_NR = 3, 3823 FLUSH_DELALLOC = 3,
3790 FLUSH_DELAYED_ITEMS = 4, 3824 FLUSH_DELALLOC_WAIT = 4,
3791 COMMIT_TRANS = 5, 3825 ALLOC_CHUNK = 5,
3826 COMMIT_TRANS = 6,
3792}; 3827};
3793 3828
3794static int flush_space(struct btrfs_root *root, 3829static int flush_space(struct btrfs_root *root,
@@ -3800,11 +3835,6 @@ static int flush_space(struct btrfs_root *root,
3800 int ret = 0; 3835 int ret = 0;
3801 3836
3802 switch (state) { 3837 switch (state) {
3803 case FLUSH_DELALLOC:
3804 case FLUSH_DELALLOC_WAIT:
3805 shrink_delalloc(root, num_bytes, orig_bytes,
3806 state == FLUSH_DELALLOC_WAIT);
3807 break;
3808 case FLUSH_DELAYED_ITEMS_NR: 3838 case FLUSH_DELAYED_ITEMS_NR:
3809 case FLUSH_DELAYED_ITEMS: 3839 case FLUSH_DELAYED_ITEMS:
3810 if (state == FLUSH_DELAYED_ITEMS_NR) { 3840 if (state == FLUSH_DELAYED_ITEMS_NR) {
@@ -3825,6 +3855,24 @@ static int flush_space(struct btrfs_root *root,
3825 ret = btrfs_run_delayed_items_nr(trans, root, nr); 3855 ret = btrfs_run_delayed_items_nr(trans, root, nr);
3826 btrfs_end_transaction(trans, root); 3856 btrfs_end_transaction(trans, root);
3827 break; 3857 break;
3858 case FLUSH_DELALLOC:
3859 case FLUSH_DELALLOC_WAIT:
3860 shrink_delalloc(root, num_bytes, orig_bytes,
3861 state == FLUSH_DELALLOC_WAIT);
3862 break;
3863 case ALLOC_CHUNK:
3864 trans = btrfs_join_transaction(root);
3865 if (IS_ERR(trans)) {
3866 ret = PTR_ERR(trans);
3867 break;
3868 }
3869 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3870 btrfs_get_alloc_profile(root, 0),
3871 CHUNK_ALLOC_NO_FORCE);
3872 btrfs_end_transaction(trans, root);
3873 if (ret == -ENOSPC)
3874 ret = 0;
3875 break;
3828 case COMMIT_TRANS: 3876 case COMMIT_TRANS:
3829 ret = may_commit_transaction(root, space_info, orig_bytes, 0); 3877 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3830 break; 3878 break;
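
The enum reshuffle is the point of this pair of hunks: flushing delayed items is far cheaper than writing back delalloc, so it now runs first, and two genuinely new rungs slot in, ALLOC_CHUNK ahead of the last-resort COMMIT_TRANS. reserve_metadata_bytes() simply climbs the ladder until the reservation fits. Roughly, with try_reserve() standing in for the retry the real function performs under space_info->lock:

static bool try_reserve(struct btrfs_space_info *s, u64 bytes); /* stand-in */

static int reserve_with_escalation(struct btrfs_root *root,
				   struct btrfs_space_info *space_info,
				   u64 num_bytes, u64 orig_bytes)
{
	int state = FLUSH_DELAYED_ITEMS_NR;	/* cheapest rung first */
	int ret;

	while (state <= COMMIT_TRANS) {
		ret = flush_space(root, space_info, num_bytes, orig_bytes,
				  state);
		if (ret)
			return ret;
		if (try_reserve(space_info, orig_bytes))
			return 0;
		state++;	/* escalate to the next, costlier rung */
	}
	return -ENOSPC;
}
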
@@ -3856,10 +3904,9 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
3856 struct btrfs_space_info *space_info = block_rsv->space_info; 3904 struct btrfs_space_info *space_info = block_rsv->space_info;
3857 u64 used; 3905 u64 used;
3858 u64 num_bytes = orig_bytes; 3906 u64 num_bytes = orig_bytes;
3859 int flush_state = FLUSH_DELALLOC; 3907 int flush_state = FLUSH_DELAYED_ITEMS_NR;
3860 int ret = 0; 3908 int ret = 0;
3861 bool flushing = false; 3909 bool flushing = false;
3862 bool committed = false;
3863 3910
3864again: 3911again:
3865 ret = 0; 3912 ret = 0;
@@ -3922,57 +3969,12 @@ again:
3922 (orig_bytes * 2); 3969 (orig_bytes * 2);
3923 } 3970 }
3924 3971
3925 if (ret) { 3972 if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
3926 u64 profile = btrfs_get_alloc_profile(root, 0); 3973 space_info->bytes_may_use += orig_bytes;
3927 u64 avail; 3974 trace_btrfs_space_reservation(root->fs_info, "space_info",
3928 3975 space_info->flags, orig_bytes,
3929 /* 3976 1);
3930 * If we have a lot of space that's pinned, don't bother doing 3977 ret = 0;
3931 * the overcommit dance yet and just commit the transaction.
3932 */
3933 avail = (space_info->total_bytes - space_info->bytes_used) * 8;
3934 do_div(avail, 10);
3935 if (space_info->bytes_pinned >= avail && flush && !committed) {
3936 space_info->flush = 1;
3937 flushing = true;
3938 spin_unlock(&space_info->lock);
3939 ret = may_commit_transaction(root, space_info,
3940 orig_bytes, 1);
3941 if (ret)
3942 goto out;
3943 committed = true;
3944 goto again;
3945 }
3946
3947 spin_lock(&root->fs_info->free_chunk_lock);
3948 avail = root->fs_info->free_chunk_space;
3949
3950 /*
3951 * If we have dup, raid1 or raid10 then only half of the free
3952 * space is actually useable.
3953 */
3954 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3955 BTRFS_BLOCK_GROUP_RAID1 |
3956 BTRFS_BLOCK_GROUP_RAID10))
3957 avail >>= 1;
3958
3959 /*
3960 * If we aren't flushing don't let us overcommit too much, say
3961 * 1/8th of the space. If we can flush, let it overcommit up to
3962 * 1/2 of the space.
3963 */
3964 if (flush)
3965 avail >>= 3;
3966 else
3967 avail >>= 1;
3968 spin_unlock(&root->fs_info->free_chunk_lock);
3969
3970 if (used + num_bytes < space_info->total_bytes + avail) {
3971 space_info->bytes_may_use += orig_bytes;
3972 trace_btrfs_space_reservation(root->fs_info,
3973 "space_info", space_info->flags, orig_bytes, 1);
3974 ret = 0;
3975 }
3976 } 3978 }
3977 3979
3978 /* 3980 /*
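
The open-coded overcommit test removed above is folded into a can_overcommit() helper whose body sits outside this hunk; the pinned-bytes "commit early" heuristic is dropped entirely. A sketch of what the helper presumably computes, reconstructed from the deleted logic (the committed version may differ in detail):

	static bool can_overcommit(struct btrfs_root *root,
				   struct btrfs_space_info *space_info,
				   u64 bytes, bool flush)
	{
		u64 profile = btrfs_get_alloc_profile(root, 0);
		u64 used = space_info->bytes_used +
			   space_info->bytes_reserved +
			   space_info->bytes_pinned +
			   space_info->bytes_readonly +
			   space_info->bytes_may_use;
		u64 avail;

		spin_lock(&root->fs_info->free_chunk_lock);
		avail = root->fs_info->free_chunk_space;
		spin_unlock(&root->fs_info->free_chunk_lock);

		/* dup/raid1/raid10 keep two copies; only half is usable */
		if (profile & (BTRFS_BLOCK_GROUP_DUP |
			       BTRFS_BLOCK_GROUP_RAID1 |
			       BTRFS_BLOCK_GROUP_RAID10))
			avail >>= 1;

		/* overcommit 1/8 when flushing can reclaim space, else 1/2 */
		if (flush)
			avail >>= 3;
		else
			avail >>= 1;

		return used + bytes < space_info->total_bytes + avail;
	}
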
@@ -4114,13 +4116,15 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
4114 return 0; 4116 return 0;
4115} 4117}
4116 4118
4117void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) 4119void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
4118{ 4120{
4119 memset(rsv, 0, sizeof(*rsv)); 4121 memset(rsv, 0, sizeof(*rsv));
4120 spin_lock_init(&rsv->lock); 4122 spin_lock_init(&rsv->lock);
4123 rsv->type = type;
4121} 4124}
4122 4125
4123struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) 4126struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
4127 unsigned short type)
4124{ 4128{
4125 struct btrfs_block_rsv *block_rsv; 4129 struct btrfs_block_rsv *block_rsv;
4126 struct btrfs_fs_info *fs_info = root->fs_info; 4130 struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4129,7 +4133,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
4129 if (!block_rsv) 4133 if (!block_rsv)
4130 return NULL; 4134 return NULL;
4131 4135
4132 btrfs_init_block_rsv(block_rsv); 4136 btrfs_init_block_rsv(block_rsv, type);
4133 block_rsv->space_info = __find_space_info(fs_info, 4137 block_rsv->space_info = __find_space_info(fs_info,
4134 BTRFS_BLOCK_GROUP_METADATA); 4138 BTRFS_BLOCK_GROUP_METADATA);
4135 return block_rsv; 4139 return block_rsv;
@@ -4138,6 +4142,8 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
4138void btrfs_free_block_rsv(struct btrfs_root *root, 4142void btrfs_free_block_rsv(struct btrfs_root *root,
4139 struct btrfs_block_rsv *rsv) 4143 struct btrfs_block_rsv *rsv)
4140{ 4144{
4145 if (!rsv)
4146 return;
4141 btrfs_block_rsv_release(root, rsv, (u64)-1); 4147 btrfs_block_rsv_release(root, rsv, (u64)-1);
4142 kfree(rsv); 4148 kfree(rsv);
4143} 4149}
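
Reservations are now typed at init time, and freeing a NULL rsv is a no-op so error paths can free unconditionally. Caller sketch; BTRFS_BLOCK_RSV_TEMP stands in for whichever type constant the series defines in ctree.h:

	struct btrfs_block_rsv *rsv;

	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
	if (!rsv)
		return -ENOMEM;
	/* ... use the reservation ... */
	btrfs_free_block_rsv(root, rsv);	/* safe even when rsv == NULL */
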
@@ -4416,10 +4422,10 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
4416 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); 4422 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4417 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; 4423 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
4418 /* 4424 /*
4419 * two for root back/forward refs, two for directory entries 4425 * two for root back/forward refs, two for directory entries,
4420 * and one for root of the snapshot. 4426 * one for root of the snapshot and one for parent inode.
4421 */ 4427 */
4422 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5); 4428 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6);
4423 dst_rsv->space_info = src_rsv->space_info; 4429 dst_rsv->space_info = src_rsv->space_info;
4424 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4430 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4425} 4431}
@@ -5018,7 +5024,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5018 5024
5019 while (1) { 5025 while (1) {
5020 ret = find_first_extent_bit(unpin, 0, &start, &end, 5026 ret = find_first_extent_bit(unpin, 0, &start, &end,
5021 EXTENT_DIRTY); 5027 EXTENT_DIRTY, NULL);
5022 if (ret) 5028 if (ret)
5023 break; 5029 break;
5024 5030
@@ -5096,8 +5102,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5096 ret = remove_extent_backref(trans, extent_root, path, 5102 ret = remove_extent_backref(trans, extent_root, path,
5097 NULL, refs_to_drop, 5103 NULL, refs_to_drop,
5098 is_data); 5104 is_data);
5099 if (ret) 5105 if (ret) {
5100 goto abort; 5106 btrfs_abort_transaction(trans, extent_root, ret);
5107 goto out;
5108 }
5101 btrfs_release_path(path); 5109 btrfs_release_path(path);
5102 path->leave_spinning = 1; 5110 path->leave_spinning = 1;
5103 5111
@@ -5115,8 +5123,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5115 btrfs_print_leaf(extent_root, 5123 btrfs_print_leaf(extent_root,
5116 path->nodes[0]); 5124 path->nodes[0]);
5117 } 5125 }
5118 if (ret < 0) 5126 if (ret < 0) {
5119 goto abort; 5127 btrfs_abort_transaction(trans, extent_root, ret);
5128 goto out;
5129 }
5120 extent_slot = path->slots[0]; 5130 extent_slot = path->slots[0];
5121 } 5131 }
5122 } else if (ret == -ENOENT) { 5132 } else if (ret == -ENOENT) {
@@ -5130,7 +5140,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5130 (unsigned long long)owner_objectid, 5140 (unsigned long long)owner_objectid,
5131 (unsigned long long)owner_offset); 5141 (unsigned long long)owner_offset);
5132 } else { 5142 } else {
5133 goto abort; 5143 btrfs_abort_transaction(trans, extent_root, ret);
5144 goto out;
5134 } 5145 }
5135 5146
5136 leaf = path->nodes[0]; 5147 leaf = path->nodes[0];
@@ -5140,8 +5151,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5140 BUG_ON(found_extent || extent_slot != path->slots[0]); 5151 BUG_ON(found_extent || extent_slot != path->slots[0]);
5141 ret = convert_extent_item_v0(trans, extent_root, path, 5152 ret = convert_extent_item_v0(trans, extent_root, path,
5142 owner_objectid, 0); 5153 owner_objectid, 0);
5143 if (ret < 0) 5154 if (ret < 0) {
5144 goto abort; 5155 btrfs_abort_transaction(trans, extent_root, ret);
5156 goto out;
5157 }
5145 5158
5146 btrfs_release_path(path); 5159 btrfs_release_path(path);
5147 path->leave_spinning = 1; 5160 path->leave_spinning = 1;
@@ -5158,8 +5171,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5158 (unsigned long long)bytenr); 5171 (unsigned long long)bytenr);
5159 btrfs_print_leaf(extent_root, path->nodes[0]); 5172 btrfs_print_leaf(extent_root, path->nodes[0]);
5160 } 5173 }
5161 if (ret < 0) 5174 if (ret < 0) {
5162 goto abort; 5175 btrfs_abort_transaction(trans, extent_root, ret);
5176 goto out;
5177 }
5178
5163 extent_slot = path->slots[0]; 5179 extent_slot = path->slots[0];
5164 leaf = path->nodes[0]; 5180 leaf = path->nodes[0];
5165 item_size = btrfs_item_size_nr(leaf, extent_slot); 5181 item_size = btrfs_item_size_nr(leaf, extent_slot);
@@ -5196,8 +5212,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5196 ret = remove_extent_backref(trans, extent_root, path, 5212 ret = remove_extent_backref(trans, extent_root, path,
5197 iref, refs_to_drop, 5213 iref, refs_to_drop,
5198 is_data); 5214 is_data);
5199 if (ret) 5215 if (ret) {
5200 goto abort; 5216 btrfs_abort_transaction(trans, extent_root, ret);
5217 goto out;
5218 }
5201 } 5219 }
5202 } else { 5220 } else {
5203 if (found_extent) { 5221 if (found_extent) {
@@ -5214,27 +5232,29 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5214 5232
5215 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 5233 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
5216 num_to_del); 5234 num_to_del);
5217 if (ret) 5235 if (ret) {
5218 goto abort; 5236 btrfs_abort_transaction(trans, extent_root, ret);
5237 goto out;
5238 }
5219 btrfs_release_path(path); 5239 btrfs_release_path(path);
5220 5240
5221 if (is_data) { 5241 if (is_data) {
5222 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 5242 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
5223 if (ret) 5243 if (ret) {
5224 goto abort; 5244 btrfs_abort_transaction(trans, extent_root, ret);
5245 goto out;
5246 }
5225 } 5247 }
5226 5248
5227 ret = update_block_group(trans, root, bytenr, num_bytes, 0); 5249 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
5228 if (ret) 5250 if (ret) {
5229 goto abort; 5251 btrfs_abort_transaction(trans, extent_root, ret);
5252 goto out;
5253 }
5230 } 5254 }
5231out: 5255out:
5232 btrfs_free_path(path); 5256 btrfs_free_path(path);
5233 return ret; 5257 return ret;
5234
5235abort:
5236 btrfs_abort_transaction(trans, extent_root, ret);
5237 goto out;
5238} 5258}
5239 5259
5240/* 5260/*
@@ -5497,8 +5517,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5497 struct btrfs_block_group_cache *used_block_group; 5517 struct btrfs_block_group_cache *used_block_group;
5498 u64 search_start = 0; 5518 u64 search_start = 0;
5499 int empty_cluster = 2 * 1024 * 1024; 5519 int empty_cluster = 2 * 1024 * 1024;
5500 int allowed_chunk_alloc = 0;
5501 int done_chunk_alloc = 0;
5502 struct btrfs_space_info *space_info; 5520 struct btrfs_space_info *space_info;
5503 int loop = 0; 5521 int loop = 0;
5504 int index = 0; 5522 int index = 0;
@@ -5530,9 +5548,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5530 if (btrfs_mixed_space_info(space_info)) 5548 if (btrfs_mixed_space_info(space_info))
5531 use_cluster = false; 5549 use_cluster = false;
5532 5550
5533 if (orig_root->ref_cows || empty_size)
5534 allowed_chunk_alloc = 1;
5535
5536 if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { 5551 if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
5537 last_ptr = &root->fs_info->meta_alloc_cluster; 5552 last_ptr = &root->fs_info->meta_alloc_cluster;
5538 if (!btrfs_test_opt(root, SSD)) 5553 if (!btrfs_test_opt(root, SSD))
@@ -5806,10 +5821,6 @@ checks:
5806 5821
5807 trace_btrfs_reserve_extent(orig_root, block_group, 5822 trace_btrfs_reserve_extent(orig_root, block_group,
5808 search_start, num_bytes); 5823 search_start, num_bytes);
5809 if (offset < search_start)
5810 btrfs_add_free_space(used_block_group, offset,
5811 search_start - offset);
5812 BUG_ON(offset > search_start);
5813 if (used_block_group != block_group) 5824 if (used_block_group != block_group)
5814 btrfs_put_block_group(used_block_group); 5825 btrfs_put_block_group(used_block_group);
5815 btrfs_put_block_group(block_group); 5826 btrfs_put_block_group(block_group);
@@ -5842,34 +5853,17 @@ loop:
5842 index = 0; 5853 index = 0;
5843 loop++; 5854 loop++;
5844 if (loop == LOOP_ALLOC_CHUNK) { 5855 if (loop == LOOP_ALLOC_CHUNK) {
5845 if (allowed_chunk_alloc) { 5856 ret = do_chunk_alloc(trans, root, data,
5846 ret = do_chunk_alloc(trans, root, num_bytes + 5857 CHUNK_ALLOC_FORCE);
5847 2 * 1024 * 1024, data, 5858 /*
5848 CHUNK_ALLOC_LIMITED); 5859 * Do not bail out on ENOSPC since we
5849 /* 5860 * can do more things.
5850 * Do not bail out on ENOSPC since we 5861 */
5851 * can do more things. 5862 if (ret < 0 && ret != -ENOSPC) {
5852 */ 5863 btrfs_abort_transaction(trans,
5853 if (ret < 0 && ret != -ENOSPC) { 5864 root, ret);
5854 btrfs_abort_transaction(trans, 5865 goto out;
5855 root, ret);
5856 goto out;
5857 }
5858 allowed_chunk_alloc = 0;
5859 if (ret == 1)
5860 done_chunk_alloc = 1;
5861 } else if (!done_chunk_alloc &&
5862 space_info->force_alloc ==
5863 CHUNK_ALLOC_NO_FORCE) {
5864 space_info->force_alloc = CHUNK_ALLOC_LIMITED;
5865 } 5866 }
5866
5867 /*
5868 * We didn't allocate a chunk, go ahead and drop the
5869 * empty size and loop again.
5870 */
5871 if (!done_chunk_alloc)
5872 loop = LOOP_NO_EMPTY_SIZE;
5873 } 5867 }
5874 5868
5875 if (loop == LOOP_NO_EMPTY_SIZE) { 5869 if (loop == LOOP_NO_EMPTY_SIZE) {
@@ -5944,20 +5938,6 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
5944 5938
5945 data = btrfs_get_alloc_profile(root, data); 5939 data = btrfs_get_alloc_profile(root, data);
5946again: 5940again:
5947 /*
5948 * the only place that sets empty_size is btrfs_realloc_node, which
5949 * is not called recursively on allocations
5950 */
5951 if (empty_size || root->ref_cows) {
5952 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
5953 num_bytes + 2 * 1024 * 1024, data,
5954 CHUNK_ALLOC_NO_FORCE);
5955 if (ret < 0 && ret != -ENOSPC) {
5956 btrfs_abort_transaction(trans, root, ret);
5957 return ret;
5958 }
5959 }
5960
5961 WARN_ON(num_bytes < root->sectorsize); 5941 WARN_ON(num_bytes < root->sectorsize);
5962 ret = find_free_extent(trans, root, num_bytes, empty_size, 5942 ret = find_free_extent(trans, root, num_bytes, empty_size,
5963 hint_byte, ins, data); 5943 hint_byte, ins, data);
@@ -5967,12 +5947,6 @@ again:
5967 num_bytes = num_bytes >> 1; 5947 num_bytes = num_bytes >> 1;
5968 num_bytes = num_bytes & ~(root->sectorsize - 1); 5948 num_bytes = num_bytes & ~(root->sectorsize - 1);
5969 num_bytes = max(num_bytes, min_alloc_size); 5949 num_bytes = max(num_bytes, min_alloc_size);
5970 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
5971 num_bytes, data, CHUNK_ALLOC_FORCE);
5972 if (ret < 0 && ret != -ENOSPC) {
5973 btrfs_abort_transaction(trans, root, ret);
5974 return ret;
5975 }
5976 if (num_bytes == min_alloc_size) 5950 if (num_bytes == min_alloc_size)
5977 final_tried = true; 5951 final_tried = true;
5978 goto again; 5952 goto again;
@@ -6314,7 +6288,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
6314 ret = block_rsv_use_bytes(block_rsv, blocksize); 6288 ret = block_rsv_use_bytes(block_rsv, blocksize);
6315 if (!ret) 6289 if (!ret)
6316 return block_rsv; 6290 return block_rsv;
6317 if (ret) { 6291 if (ret && !block_rsv->failfast) {
6318 static DEFINE_RATELIMIT_STATE(_rs, 6292 static DEFINE_RATELIMIT_STATE(_rs,
6319 DEFAULT_RATELIMIT_INTERVAL, 6293 DEFAULT_RATELIMIT_INTERVAL,
6320 /*DEFAULT_RATELIMIT_BURST*/ 2); 6294 /*DEFAULT_RATELIMIT_BURST*/ 2);
@@ -7279,7 +7253,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
7279 7253
7280 alloc_flags = update_block_group_flags(root, cache->flags); 7254 alloc_flags = update_block_group_flags(root, cache->flags);
7281 if (alloc_flags != cache->flags) { 7255 if (alloc_flags != cache->flags) {
7282 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 7256 ret = do_chunk_alloc(trans, root, alloc_flags,
7283 CHUNK_ALLOC_FORCE); 7257 CHUNK_ALLOC_FORCE);
7284 if (ret < 0) 7258 if (ret < 0)
7285 goto out; 7259 goto out;
@@ -7289,7 +7263,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
7289 if (!ret) 7263 if (!ret)
7290 goto out; 7264 goto out;
7291 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 7265 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7292 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 7266 ret = do_chunk_alloc(trans, root, alloc_flags,
7293 CHUNK_ALLOC_FORCE); 7267 CHUNK_ALLOC_FORCE);
7294 if (ret < 0) 7268 if (ret < 0)
7295 goto out; 7269 goto out;
@@ -7303,7 +7277,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
7303 struct btrfs_root *root, u64 type) 7277 struct btrfs_root *root, u64 type)
7304{ 7278{
7305 u64 alloc_flags = get_alloc_profile(root, type); 7279 u64 alloc_flags = get_alloc_profile(root, type);
7306 return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 7280 return do_chunk_alloc(trans, root, alloc_flags,
7307 CHUNK_ALLOC_FORCE); 7281 CHUNK_ALLOC_FORCE);
7308} 7282}
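
do_chunk_alloc() loses its byte-count hint everywhere: chunk sizing policy now lives inside the allocator, so callers pass only the target flags and a force mode. The new calling shape, as used by the ALLOC_CHUNK flush step above (extent_root and alloc_flags assumed in scope):

	ret = do_chunk_alloc(trans, extent_root, alloc_flags,
			     CHUNK_ALLOC_NO_FORCE);
	/* ENOSPC is not fatal here; later flush steps can still help */
	if (ret < 0 && ret != -ENOSPC)
		btrfs_abort_transaction(trans, extent_root, ret);
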
7309 7283
@@ -7810,6 +7784,34 @@ error:
7810 return ret; 7784 return ret;
7811} 7785}
7812 7786
7787void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
7788 struct btrfs_root *root)
7789{
7790 struct btrfs_block_group_cache *block_group, *tmp;
7791 struct btrfs_root *extent_root = root->fs_info->extent_root;
7792 struct btrfs_block_group_item item;
7793 struct btrfs_key key;
7794 int ret = 0;
7795
7796 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
7797 new_bg_list) {
7798 list_del_init(&block_group->new_bg_list);
7799
7800 if (ret)
7801 continue;
7802
7803 spin_lock(&block_group->lock);
7804 memcpy(&item, &block_group->item, sizeof(item));
7805 memcpy(&key, &block_group->key, sizeof(key));
7806 spin_unlock(&block_group->lock);
7807
7808 ret = btrfs_insert_item(trans, extent_root, &key, &item,
7809 sizeof(item));
7810 if (ret)
7811 btrfs_abort_transaction(trans, extent_root, ret);
7812 }
7813}
7814
7813int btrfs_make_block_group(struct btrfs_trans_handle *trans, 7815int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7814 struct btrfs_root *root, u64 bytes_used, 7816 struct btrfs_root *root, u64 bytes_used,
7815 u64 type, u64 chunk_objectid, u64 chunk_offset, 7817 u64 type, u64 chunk_objectid, u64 chunk_offset,
@@ -7843,6 +7845,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7843 spin_lock_init(&cache->lock); 7845 spin_lock_init(&cache->lock);
7844 INIT_LIST_HEAD(&cache->list); 7846 INIT_LIST_HEAD(&cache->list);
7845 INIT_LIST_HEAD(&cache->cluster_list); 7847 INIT_LIST_HEAD(&cache->cluster_list);
7848 INIT_LIST_HEAD(&cache->new_bg_list);
7846 7849
7847 btrfs_init_free_space_ctl(cache); 7850 btrfs_init_free_space_ctl(cache);
7848 7851
@@ -7874,12 +7877,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7874 ret = btrfs_add_block_group_cache(root->fs_info, cache); 7877 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7875 BUG_ON(ret); /* Logic error */ 7878 BUG_ON(ret); /* Logic error */
7876 7879
7877 ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, 7880 list_add_tail(&cache->new_bg_list, &trans->new_bgs);
7878 sizeof(cache->item));
7879 if (ret) {
7880 btrfs_abort_transaction(trans, extent_root, ret);
7881 return ret;
7882 }
7883 7881
7884 set_avail_alloc_bits(extent_root->fs_info, type); 7882 set_avail_alloc_bits(extent_root->fs_info, type);
7885 7883
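
Block group items queued on trans->new_bgs by btrfs_make_block_group() (below) are now inserted in a single batch. The draining call site is outside this hunk; presumably the transaction commit/end path runs it before the extent tree is written out:

	/* assumed call site in the commit path */
	btrfs_create_pending_block_groups(trans, root);
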
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b08ea4717e9d..8036d3a84853 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -45,6 +45,7 @@ struct extent_page_data {
45 struct bio *bio; 45 struct bio *bio;
46 struct extent_io_tree *tree; 46 struct extent_io_tree *tree;
47 get_extent_t *get_extent; 47 get_extent_t *get_extent;
48 unsigned long bio_flags;
48 49
49 /* tells writepage not to lock the state bits for this range 50 /* tells writepage not to lock the state bits for this range
50 * it still does the unlocking 51 * it still does the unlocking
@@ -64,13 +65,13 @@ tree_fs_info(struct extent_io_tree *tree)
64 65
65int __init extent_io_init(void) 66int __init extent_io_init(void)
66{ 67{
67 extent_state_cache = kmem_cache_create("extent_state", 68 extent_state_cache = kmem_cache_create("btrfs_extent_state",
68 sizeof(struct extent_state), 0, 69 sizeof(struct extent_state), 0,
69 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 70 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
70 if (!extent_state_cache) 71 if (!extent_state_cache)
71 return -ENOMEM; 72 return -ENOMEM;
72 73
73 extent_buffer_cache = kmem_cache_create("extent_buffers", 74 extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
74 sizeof(struct extent_buffer), 0, 75 sizeof(struct extent_buffer), 0,
75 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 76 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
76 if (!extent_buffer_cache) 77 if (!extent_buffer_cache)
@@ -942,6 +943,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
942 * @end: the end offset in bytes (inclusive) 943 * @end: the end offset in bytes (inclusive)
943 * @bits: the bits to set in this range 944 * @bits: the bits to set in this range
944 * @clear_bits: the bits to clear in this range 945 * @clear_bits: the bits to clear in this range
946 * @cached_state: state that we're going to cache
945 * @mask: the allocation mask 947 * @mask: the allocation mask
946 * 948 *
947 * This will go through and set bits for the given range. If any states exist 949 * This will go through and set bits for the given range. If any states exist
@@ -951,7 +953,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
951 * boundary bits like LOCK. 953 * boundary bits like LOCK.
952 */ 954 */
953int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 955int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
954 int bits, int clear_bits, gfp_t mask) 956 int bits, int clear_bits,
957 struct extent_state **cached_state, gfp_t mask)
955{ 958{
956 struct extent_state *state; 959 struct extent_state *state;
957 struct extent_state *prealloc = NULL; 960 struct extent_state *prealloc = NULL;
@@ -968,6 +971,15 @@ again:
968 } 971 }
969 972
970 spin_lock(&tree->lock); 973 spin_lock(&tree->lock);
974 if (cached_state && *cached_state) {
975 state = *cached_state;
976 if (state->start <= start && state->end > start &&
977 state->tree) {
978 node = &state->rb_node;
979 goto hit_next;
980 }
981 }
982
971 /* 983 /*
972 * this search will find all the extents that end after 984 * this search will find all the extents that end after
973 * our range starts. 985 * our range starts.
@@ -998,6 +1010,7 @@ hit_next:
998 */ 1010 */
999 if (state->start == start && state->end <= end) { 1011 if (state->start == start && state->end <= end) {
1000 set_state_bits(tree, state, &bits); 1012 set_state_bits(tree, state, &bits);
1013 cache_state(state, cached_state);
1001 state = clear_state_bit(tree, state, &clear_bits, 0); 1014 state = clear_state_bit(tree, state, &clear_bits, 0);
1002 if (last_end == (u64)-1) 1015 if (last_end == (u64)-1)
1003 goto out; 1016 goto out;
@@ -1038,6 +1051,7 @@ hit_next:
1038 goto out; 1051 goto out;
1039 if (state->end <= end) { 1052 if (state->end <= end) {
1040 set_state_bits(tree, state, &bits); 1053 set_state_bits(tree, state, &bits);
1054 cache_state(state, cached_state);
1041 state = clear_state_bit(tree, state, &clear_bits, 0); 1055 state = clear_state_bit(tree, state, &clear_bits, 0);
1042 if (last_end == (u64)-1) 1056 if (last_end == (u64)-1)
1043 goto out; 1057 goto out;
@@ -1076,6 +1090,7 @@ hit_next:
1076 &bits); 1090 &bits);
1077 if (err) 1091 if (err)
1078 extent_io_tree_panic(tree, err); 1092 extent_io_tree_panic(tree, err);
1093 cache_state(prealloc, cached_state);
1079 prealloc = NULL; 1094 prealloc = NULL;
1080 start = this_end + 1; 1095 start = this_end + 1;
1081 goto search_again; 1096 goto search_again;
@@ -1098,6 +1113,7 @@ hit_next:
1098 extent_io_tree_panic(tree, err); 1113 extent_io_tree_panic(tree, err);
1099 1114
1100 set_state_bits(tree, prealloc, &bits); 1115 set_state_bits(tree, prealloc, &bits);
1116 cache_state(prealloc, cached_state);
1101 clear_state_bit(tree, prealloc, &clear_bits, 0); 1117 clear_state_bit(tree, prealloc, &clear_bits, 0);
1102 prealloc = NULL; 1118 prealloc = NULL;
1103 goto out; 1119 goto out;
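
convert_extent_bit() grows the same cached-state shortcut the set/clear paths already have: a caller converting many adjacent ranges hands back the state it got last time and skips the rb-tree search. Caller sketch; the bit names are purely illustrative:

	struct extent_state *cached = NULL;
	int err;

	err = convert_extent_bit(tree, start, end, EXTENT_NEED_WAIT,
				 EXTENT_DIRTY, &cached, GFP_NOFS);
	/* ... further converts over nearby ranges reuse "cached" ... */
	free_extent_state(cached);	/* drop the cache's reference */
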
@@ -1150,6 +1166,14 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
1150 NULL, cached_state, mask); 1166 NULL, cached_state, mask);
1151} 1167}
1152 1168
1169int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
1170 struct extent_state **cached_state, gfp_t mask)
1171{
1172 return set_extent_bit(tree, start, end,
1173 EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
1174 NULL, cached_state, mask);
1175}
1176
1153int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1177int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1154 gfp_t mask) 1178 gfp_t mask)
1155{ 1179{
@@ -1294,18 +1318,42 @@ out:
1294 * If nothing was found, 1 is returned. If found something, return 0. 1318 * If nothing was found, 1 is returned. If found something, return 0.
1295 */ 1319 */
1296int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1320int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1297 u64 *start_ret, u64 *end_ret, int bits) 1321 u64 *start_ret, u64 *end_ret, int bits,
1322 struct extent_state **cached_state)
1298{ 1323{
1299 struct extent_state *state; 1324 struct extent_state *state;
1325 struct rb_node *n;
1300 int ret = 1; 1326 int ret = 1;
1301 1327
1302 spin_lock(&tree->lock); 1328 spin_lock(&tree->lock);
1329 if (cached_state && *cached_state) {
1330 state = *cached_state;
1331 if (state->end == start - 1 && state->tree) {
1332 n = rb_next(&state->rb_node);
1333 while (n) {
1334 state = rb_entry(n, struct extent_state,
1335 rb_node);
1336 if (state->state & bits)
1337 goto got_it;
1338 n = rb_next(n);
1339 }
1340 free_extent_state(*cached_state);
1341 *cached_state = NULL;
1342 goto out;
1343 }
1344 free_extent_state(*cached_state);
1345 *cached_state = NULL;
1346 }
1347
1303 state = find_first_extent_bit_state(tree, start, bits); 1348 state = find_first_extent_bit_state(tree, start, bits);
1349got_it:
1304 if (state) { 1350 if (state) {
1351 cache_state(state, cached_state);
1305 *start_ret = state->start; 1352 *start_ret = state->start;
1306 *end_ret = state->end; 1353 *end_ret = state->end;
1307 ret = 0; 1354 ret = 0;
1308 } 1355 }
1356out:
1309 spin_unlock(&tree->lock); 1357 spin_unlock(&tree->lock);
1310 return ret; 1358 return ret;
1311} 1359}
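
find_first_extent_bit() likewise takes an optional cached state, so a forward scan can resume from rb_next() of the last hit instead of searching from the tree root each time; callers that don't care, like the one in btrfs_finish_extent_commit() above, just pass NULL. Scan-loop sketch:

	struct extent_state *cached = NULL;
	u64 start = 0, end;

	while (!find_first_extent_bit(tree, start, &start, &end,
				      EXTENT_DIRTY, &cached)) {
		/* process [start, end] */
		start = end + 1;  /* lines up with the state->end == start - 1
				   * fast-path test above */
	}
	free_extent_state(cached);
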
@@ -2068,7 +2116,7 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
2068 } 2116 }
2069 read_unlock(&em_tree->lock); 2117 read_unlock(&em_tree->lock);
2070 2118
2071 if (!em || IS_ERR(em)) { 2119 if (!em) {
2072 kfree(failrec); 2120 kfree(failrec);
2073 return -EIO; 2121 return -EIO;
2074 } 2122 }
@@ -2304,8 +2352,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2304 struct extent_state *cached = NULL; 2352 struct extent_state *cached = NULL;
2305 struct extent_state *state; 2353 struct extent_state *state;
2306 2354
2307 pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, " 2355 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
2308 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err, 2356 "mirror=%ld\n", (u64)bio->bi_sector, err,
2309 (long int)bio->bi_bdev); 2357 (long int)bio->bi_bdev);
2310 tree = &BTRFS_I(page->mapping->host)->io_tree; 2358 tree = &BTRFS_I(page->mapping->host)->io_tree;
2311 2359
@@ -2709,12 +2757,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2709 end_bio_extent_readpage, mirror_num, 2757 end_bio_extent_readpage, mirror_num,
2710 *bio_flags, 2758 *bio_flags,
2711 this_bio_flag); 2759 this_bio_flag);
2712 BUG_ON(ret == -ENOMEM); 2760 if (!ret) {
2713 nr++; 2761 nr++;
2714 *bio_flags = this_bio_flag; 2762 *bio_flags = this_bio_flag;
2763 }
2715 } 2764 }
2716 if (ret) 2765 if (ret) {
2717 SetPageError(page); 2766 SetPageError(page);
2767 unlock_extent(tree, cur, cur + iosize - 1);
2768 }
2718 cur = cur + iosize; 2769 cur = cur + iosize;
2719 pg_offset += iosize; 2770 pg_offset += iosize;
2720 } 2771 }
@@ -3161,12 +3212,16 @@ static int write_one_eb(struct extent_buffer *eb,
3161 struct block_device *bdev = fs_info->fs_devices->latest_bdev; 3212 struct block_device *bdev = fs_info->fs_devices->latest_bdev;
3162 u64 offset = eb->start; 3213 u64 offset = eb->start;
3163 unsigned long i, num_pages; 3214 unsigned long i, num_pages;
3215 unsigned long bio_flags = 0;
3164 int rw = (epd->sync_io ? WRITE_SYNC : WRITE); 3216 int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
3165 int ret = 0; 3217 int ret = 0;
3166 3218
3167 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3219 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3168 num_pages = num_extent_pages(eb->start, eb->len); 3220 num_pages = num_extent_pages(eb->start, eb->len);
3169 atomic_set(&eb->io_pages, num_pages); 3221 atomic_set(&eb->io_pages, num_pages);
3222 if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
3223 bio_flags = EXTENT_BIO_TREE_LOG;
3224
3170 for (i = 0; i < num_pages; i++) { 3225 for (i = 0; i < num_pages; i++) {
3171 struct page *p = extent_buffer_page(eb, i); 3226 struct page *p = extent_buffer_page(eb, i);
3172 3227
@@ -3175,7 +3230,8 @@ static int write_one_eb(struct extent_buffer *eb,
3175 ret = submit_extent_page(rw, eb->tree, p, offset >> 9, 3230 ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
3176 PAGE_CACHE_SIZE, 0, bdev, &epd->bio, 3231 PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
3177 -1, end_bio_extent_buffer_writepage, 3232 -1, end_bio_extent_buffer_writepage,
3178 0, 0, 0); 3233 0, epd->bio_flags, bio_flags);
3234 epd->bio_flags = bio_flags;
3179 if (ret) { 3235 if (ret) {
3180 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3236 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3181 SetPageError(p); 3237 SetPageError(p);
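
The bio_flags plumbing added here tags writes of tree-log buffers with EXTENT_BIO_TREE_LOG and remembers the tag in the extent_page_data, so the final flush (flush_epd_write_bio() below) resubmits with the same flag; since submit_extent_page() starts a new bio when the flags differ, log-tree pages are presumably kept from sharing a bio with ordinary metadata writeback.
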
@@ -3210,6 +3266,7 @@ int btree_write_cache_pages(struct address_space *mapping,
3210 .tree = tree, 3266 .tree = tree,
3211 .extent_locked = 0, 3267 .extent_locked = 0,
3212 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3268 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3269 .bio_flags = 0,
3213 }; 3270 };
3214 int ret = 0; 3271 int ret = 0;
3215 int done = 0; 3272 int done = 0;
@@ -3254,19 +3311,34 @@ retry:
3254 break; 3311 break;
3255 } 3312 }
3256 3313
3314 spin_lock(&mapping->private_lock);
3315 if (!PagePrivate(page)) {
3316 spin_unlock(&mapping->private_lock);
3317 continue;
3318 }
3319
3257 eb = (struct extent_buffer *)page->private; 3320 eb = (struct extent_buffer *)page->private;
3321
3322 /*
3323 * Shouldn't happen and normally this would be a BUG_ON
3324 * but no sense in crashing the user's box for something
3325 * we can survive anyway.
3326 */
3258 if (!eb) { 3327 if (!eb) {
3328 spin_unlock(&mapping->private_lock);
3259 WARN_ON(1); 3329 WARN_ON(1);
3260 continue; 3330 continue;
3261 } 3331 }
3262 3332
3263 if (eb == prev_eb) 3333 if (eb == prev_eb) {
3334 spin_unlock(&mapping->private_lock);
3264 continue; 3335 continue;
3336 }
3265 3337
3266 if (!atomic_inc_not_zero(&eb->refs)) { 3338 ret = atomic_inc_not_zero(&eb->refs);
3267 WARN_ON(1); 3339 spin_unlock(&mapping->private_lock);
3340 if (!ret)
3268 continue; 3341 continue;
3269 }
3270 3342
3271 prev_eb = eb; 3343 prev_eb = eb;
3272 ret = lock_extent_buffer_for_io(eb, fs_info, &epd); 3344 ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
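
Consolidated, the fix reads: page->private can be torn down concurrently, so it is only dereferenced under mapping->private_lock, and the extent buffer is pinned with a refcount before the lock is dropped (the NULL and prev_eb checks are elided here):

	spin_lock(&mapping->private_lock);
	if (!PagePrivate(page)) {
		spin_unlock(&mapping->private_lock);
		continue;
	}
	eb = (struct extent_buffer *)page->private;
	ret = atomic_inc_not_zero(&eb->refs);
	spin_unlock(&mapping->private_lock);
	if (!ret)
		continue;	/* eb was already on its way out */
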
@@ -3457,7 +3529,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd)
3457 if (epd->sync_io) 3529 if (epd->sync_io)
3458 rw = WRITE_SYNC; 3530 rw = WRITE_SYNC;
3459 3531
3460 ret = submit_one_bio(rw, epd->bio, 0, 0); 3532 ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags);
3461 BUG_ON(ret < 0); /* -ENOMEM */ 3533 BUG_ON(ret < 0); /* -ENOMEM */
3462 epd->bio = NULL; 3534 epd->bio = NULL;
3463 } 3535 }
@@ -3480,6 +3552,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
3480 .get_extent = get_extent, 3552 .get_extent = get_extent,
3481 .extent_locked = 0, 3553 .extent_locked = 0,
3482 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3554 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3555 .bio_flags = 0,
3483 }; 3556 };
3484 3557
3485 ret = __extent_writepage(page, wbc, &epd); 3558 ret = __extent_writepage(page, wbc, &epd);
@@ -3504,6 +3577,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
3504 .get_extent = get_extent, 3577 .get_extent = get_extent,
3505 .extent_locked = 1, 3578 .extent_locked = 1,
3506 .sync_io = mode == WB_SYNC_ALL, 3579 .sync_io = mode == WB_SYNC_ALL,
3580 .bio_flags = 0,
3507 }; 3581 };
3508 struct writeback_control wbc_writepages = { 3582 struct writeback_control wbc_writepages = {
3509 .sync_mode = mode, 3583 .sync_mode = mode,
@@ -3543,6 +3617,7 @@ int extent_writepages(struct extent_io_tree *tree,
3543 .get_extent = get_extent, 3617 .get_extent = get_extent,
3544 .extent_locked = 0, 3618 .extent_locked = 0,
3545 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3619 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3620 .bio_flags = 0,
3546 }; 3621 };
3547 3622
3548 ret = extent_write_cache_pages(tree, mapping, wbc, 3623 ret = extent_write_cache_pages(tree, mapping, wbc,
@@ -3920,18 +3995,6 @@ out:
3920 return ret; 3995 return ret;
3921} 3996}
3922 3997
3923inline struct page *extent_buffer_page(struct extent_buffer *eb,
3924 unsigned long i)
3925{
3926 return eb->pages[i];
3927}
3928
3929inline unsigned long num_extent_pages(u64 start, u64 len)
3930{
3931 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
3932 (start >> PAGE_CACHE_SHIFT);
3933}
3934
3935static void __free_extent_buffer(struct extent_buffer *eb) 3998static void __free_extent_buffer(struct extent_buffer *eb)
3936{ 3999{
3937#if LEAK_DEBUG 4000#if LEAK_DEBUG
@@ -4047,7 +4110,7 @@ struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
4047 4110
4048 return eb; 4111 return eb;
4049err: 4112err:
4050 for (i--; i > 0; i--) 4113 for (i--; i >= 0; i--)
4051 __free_page(eb->pages[i]); 4114 __free_page(eb->pages[i]);
4052 __free_extent_buffer(eb); 4115 __free_extent_buffer(eb);
4053 return NULL; 4116 return NULL;
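
The unwind fix above is the classic cleanup off-by-one: when allocating page i fails, indices 0 through i - 1 hold live pages, so after the initial i-- the loop must run down to and including index 0. The old "i > 0" bound always leaked eb->pages[0].
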
@@ -4192,10 +4255,8 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
4192 4255
4193 for (i = 0; i < num_pages; i++, index++) { 4256 for (i = 0; i < num_pages; i++, index++) {
4194 p = find_or_create_page(mapping, index, GFP_NOFS); 4257 p = find_or_create_page(mapping, index, GFP_NOFS);
4195 if (!p) { 4258 if (!p)
4196 WARN_ON(1);
4197 goto free_eb; 4259 goto free_eb;
4198 }
4199 4260
4200 spin_lock(&mapping->private_lock); 4261 spin_lock(&mapping->private_lock);
4201 if (PagePrivate(p)) { 4262 if (PagePrivate(p)) {
@@ -4338,7 +4399,6 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
4338 4399
4339 /* Should be safe to release our pages at this point */ 4400 /* Should be safe to release our pages at this point */
4340 btrfs_release_extent_buffer_page(eb, 0); 4401 btrfs_release_extent_buffer_page(eb, 0);
4341
4342 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 4402 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
4343 return 1; 4403 return 1;
4344 } 4404 }
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 25900af5b15d..711d12b80028 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -27,6 +27,7 @@
27 * type for this bio 27 * type for this bio
28 */ 28 */
29#define EXTENT_BIO_COMPRESSED 1 29#define EXTENT_BIO_COMPRESSED 1
30#define EXTENT_BIO_TREE_LOG 2
30#define EXTENT_BIO_FLAG_SHIFT 16 31#define EXTENT_BIO_FLAG_SHIFT 16
31 32
32/* these are bit numbers for test/set bit */ 33/* these are bit numbers for test/set bit */
@@ -232,11 +233,15 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
232int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 233int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
233 gfp_t mask); 234 gfp_t mask);
234int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 235int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
235 int bits, int clear_bits, gfp_t mask); 236 int bits, int clear_bits,
237 struct extent_state **cached_state, gfp_t mask);
236int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 238int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
237 struct extent_state **cached_state, gfp_t mask); 239 struct extent_state **cached_state, gfp_t mask);
240int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
241 struct extent_state **cached_state, gfp_t mask);
238int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 242int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
239 u64 *start_ret, u64 *end_ret, int bits); 243 u64 *start_ret, u64 *end_ret, int bits,
244 struct extent_state **cached_state);
240struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, 245struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
241 u64 start, int bits); 246 u64 start, int bits);
242int extent_invalidatepage(struct extent_io_tree *tree, 247int extent_invalidatepage(struct extent_io_tree *tree,
@@ -277,8 +282,18 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
277int read_extent_buffer_pages(struct extent_io_tree *tree, 282int read_extent_buffer_pages(struct extent_io_tree *tree,
278 struct extent_buffer *eb, u64 start, int wait, 283 struct extent_buffer *eb, u64 start, int wait,
279 get_extent_t *get_extent, int mirror_num); 284 get_extent_t *get_extent, int mirror_num);
280unsigned long num_extent_pages(u64 start, u64 len); 285
281struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i); 286static inline unsigned long num_extent_pages(u64 start, u64 len)
287{
288 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
289 (start >> PAGE_CACHE_SHIFT);
290}
291
292static inline struct page *extent_buffer_page(struct extent_buffer *eb,
293 unsigned long i)
294{
295 return eb->pages[i];
296}
282 297
283static inline void extent_buffer_get(struct extent_buffer *eb) 298static inline void extent_buffer_get(struct extent_buffer *eb)
284{ 299{
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 7c97b3301459..b8cbc8d5c7f7 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -11,7 +11,7 @@ static struct kmem_cache *extent_map_cache;
11 11
12int __init extent_map_init(void) 12int __init extent_map_init(void)
13{ 13{
14 extent_map_cache = kmem_cache_create("extent_map", 14 extent_map_cache = kmem_cache_create("btrfs_extent_map",
15 sizeof(struct extent_map), 0, 15 sizeof(struct extent_map), 0,
16 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 16 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
17 if (!extent_map_cache) 17 if (!extent_map_cache)
@@ -35,6 +35,7 @@ void extent_map_exit(void)
35void extent_map_tree_init(struct extent_map_tree *tree) 35void extent_map_tree_init(struct extent_map_tree *tree)
36{ 36{
37 tree->map = RB_ROOT; 37 tree->map = RB_ROOT;
38 INIT_LIST_HEAD(&tree->modified_extents);
38 rwlock_init(&tree->lock); 39 rwlock_init(&tree->lock);
39} 40}
40 41
@@ -54,7 +55,9 @@ struct extent_map *alloc_extent_map(void)
54 em->in_tree = 0; 55 em->in_tree = 0;
55 em->flags = 0; 56 em->flags = 0;
56 em->compress_type = BTRFS_COMPRESS_NONE; 57 em->compress_type = BTRFS_COMPRESS_NONE;
58 em->generation = 0;
57 atomic_set(&em->refs, 1); 59 atomic_set(&em->refs, 1);
60 INIT_LIST_HEAD(&em->list);
58 return em; 61 return em;
59} 62}
60 63
@@ -72,6 +75,7 @@ void free_extent_map(struct extent_map *em)
72 WARN_ON(atomic_read(&em->refs) == 0); 75 WARN_ON(atomic_read(&em->refs) == 0);
73 if (atomic_dec_and_test(&em->refs)) { 76 if (atomic_dec_and_test(&em->refs)) {
74 WARN_ON(em->in_tree); 77 WARN_ON(em->in_tree);
78 WARN_ON(!list_empty(&em->list));
75 kmem_cache_free(extent_map_cache, em); 79 kmem_cache_free(extent_map_cache, em);
76 } 80 }
77} 81}
@@ -198,6 +202,14 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
198 em->block_len += merge->block_len; 202 em->block_len += merge->block_len;
199 em->block_start = merge->block_start; 203 em->block_start = merge->block_start;
200 merge->in_tree = 0; 204 merge->in_tree = 0;
205 if (merge->generation > em->generation) {
206 em->mod_start = em->start;
207 em->mod_len = em->len;
208 em->generation = merge->generation;
209 list_move(&em->list, &tree->modified_extents);
210 }
211
212 list_del_init(&merge->list);
201 rb_erase(&merge->rb_node, &tree->map); 213 rb_erase(&merge->rb_node, &tree->map);
202 free_extent_map(merge); 214 free_extent_map(merge);
203 } 215 }
@@ -211,14 +223,34 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
211 em->block_len += merge->len; 223 em->block_len += merge->len;
212 rb_erase(&merge->rb_node, &tree->map); 224 rb_erase(&merge->rb_node, &tree->map);
213 merge->in_tree = 0; 225 merge->in_tree = 0;
226 if (merge->generation > em->generation) {
227 em->mod_len = em->len;
228 em->generation = merge->generation;
229 list_move(&em->list, &tree->modified_extents);
230 }
231 list_del_init(&merge->list);
214 free_extent_map(merge); 232 free_extent_map(merge);
215 } 233 }
216} 234}
217 235
218int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) 236/**
237 * unpin_extent_cache - unpin an extent from the cache
238 * @tree: tree to unpin the extent in
239 * @start: logical offset in the file
240 * @len: length of the extent
241 * @gen: generation that this extent has been modified in
242 * Note: the extent's EXTENT_FLAG_PREALLOC bit, if set, is cleared here as well
243 *
244 * Called after an extent has been properly written to disk. Sets the
245 * generation to the generation that actually added the file item to the
246 * inode, so fsync() knows it must sync this extent.
247 */
248int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
249 u64 gen)
219{ 250{
220 int ret = 0; 251 int ret = 0;
221 struct extent_map *em; 252 struct extent_map *em;
253 bool prealloc = false;
222 254
223 write_lock(&tree->lock); 255 write_lock(&tree->lock);
224 em = lookup_extent_mapping(tree, start, len); 256 em = lookup_extent_mapping(tree, start, len);
@@ -228,10 +260,24 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
228 if (!em) 260 if (!em)
229 goto out; 261 goto out;
230 262
263 list_move(&em->list, &tree->modified_extents);
264 em->generation = gen;
231 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 265 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
266 em->mod_start = em->start;
267 em->mod_len = em->len;
268
269 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
270 prealloc = true;
271 clear_bit(EXTENT_FLAG_PREALLOC, &em->flags);
272 }
232 273
233 try_merge_map(tree, em); 274 try_merge_map(tree, em);
234 275
276 if (prealloc) {
277 em->mod_start = em->start;
278 em->mod_len = em->len;
279 }
280
235 free_extent_map(em); 281 free_extent_map(em);
236out: 282out:
237 write_unlock(&tree->lock); 283 write_unlock(&tree->lock);
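
Every modified extent_map now records the transaction generation that produced it and rides tree->modified_extents until fsync consumes it; unpin_extent_cache() is where an ordered extent's final generation gets stamped once the data is on disk. An illustrative consumer (the real walker lives in the fsync/tree-log code outside this diff; last_logged_gen is an assumed cutoff):

	struct extent_map *em, *n;

	write_lock(&tree->lock);
	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
		if (em->generation <= last_logged_gen)
			continue;
		list_del_init(&em->list);
		/* log em->mod_start / em->mod_len / em->generation */
	}
	write_unlock(&tree->lock);
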
@@ -269,6 +315,9 @@ int add_extent_mapping(struct extent_map_tree *tree,
269 } 315 }
270 atomic_inc(&em->refs); 316 atomic_inc(&em->refs);
271 317
318 em->mod_start = em->start;
319 em->mod_len = em->len;
320
272 try_merge_map(tree, em); 321 try_merge_map(tree, em);
273out: 322out:
274 return ret; 323 return ret;
@@ -358,6 +407,8 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
358 407
359 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); 408 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
360 rb_erase(&em->rb_node, &tree->map); 409 rb_erase(&em->rb_node, &tree->map);
410 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
411 list_del_init(&em->list);
361 em->in_tree = 0; 412 em->in_tree = 0;
362 return ret; 413 return ret;
363} 414}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 1195f09761fe..679225555f7b 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -13,6 +13,7 @@
13#define EXTENT_FLAG_COMPRESSED 1 13#define EXTENT_FLAG_COMPRESSED 1
14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ 14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ 15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
16#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
16 17
17struct extent_map { 18struct extent_map {
18 struct rb_node rb_node; 19 struct rb_node rb_node;
@@ -20,18 +21,23 @@ struct extent_map {
20 /* all of these are in bytes */ 21 /* all of these are in bytes */
21 u64 start; 22 u64 start;
22 u64 len; 23 u64 len;
24 u64 mod_start;
25 u64 mod_len;
23 u64 orig_start; 26 u64 orig_start;
24 u64 block_start; 27 u64 block_start;
25 u64 block_len; 28 u64 block_len;
29 u64 generation;
26 unsigned long flags; 30 unsigned long flags;
27 struct block_device *bdev; 31 struct block_device *bdev;
28 atomic_t refs; 32 atomic_t refs;
29 unsigned int in_tree; 33 unsigned int in_tree;
30 unsigned int compress_type; 34 unsigned int compress_type;
35 struct list_head list;
31}; 36};
32 37
33struct extent_map_tree { 38struct extent_map_tree {
34 struct rb_root map; 39 struct rb_root map;
40 struct list_head modified_extents;
35 rwlock_t lock; 41 rwlock_t lock;
36}; 42};
37 43
@@ -60,7 +66,7 @@ struct extent_map *alloc_extent_map(void);
60void free_extent_map(struct extent_map *em); 66void free_extent_map(struct extent_map *em);
61int __init extent_map_init(void); 67int __init extent_map_init(void);
62void extent_map_exit(void); 68void extent_map_exit(void);
63int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len); 69int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
64struct extent_map *search_extent_mapping(struct extent_map_tree *tree, 70struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
65 u64 start, u64 len); 71 u64 start, u64 len);
66#endif 72#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 857d93cd01dc..1ad08e4e4a15 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -25,11 +25,12 @@
25#include "transaction.h" 25#include "transaction.h"
26#include "print-tree.h" 26#include "print-tree.h"
27 27
28#define __MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \ 28#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
29 sizeof(struct btrfs_item) * 2) / \ 29 sizeof(struct btrfs_item) * 2) / \
30 size) - 1)) 30 size) - 1))
31 31
32#define MAX_CSUM_ITEMS(r, size) (min(__MAX_CSUM_ITEMS(r, size), PAGE_CACHE_SIZE)) 32#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
33 PAGE_CACHE_SIZE))
33 34
34#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ 35#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
35 sizeof(struct btrfs_ordered_sum)) / \ 36 sizeof(struct btrfs_ordered_sum)) / \
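
The kernel's min() type-checks its arguments and warns on a mismatch, and __MAX_CSUM_ITEMS() and PAGE_CACHE_SIZE do not agree on every config; the (unsigned long) cast plus min_t() pin the comparison to one named type. Illustrative use (csum_size assumed in scope):

	/* plain min() here would warn: comparison of distinct types */
	u32 nr = min_t(u32, __MAX_CSUM_ITEMS(root, csum_size),
		       PAGE_CACHE_SIZE);
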
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f6b40e86121b..9ab1bed88116 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -39,6 +39,7 @@
39#include "tree-log.h" 39#include "tree-log.h"
40#include "locking.h" 40#include "locking.h"
41#include "compat.h" 41#include "compat.h"
42#include "volumes.h"
42 43
43/* 44/*
44 * when auto defrag is enabled we 45 * when auto defrag is enabled we
@@ -458,14 +459,15 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
458 * this drops all the extents in the cache that intersect the range 459 * this drops all the extents in the cache that intersect the range
459 * [start, end]. Existing extents are split as required. 460 * [start, end]. Existing extents are split as required.
460 */ 461 */
461int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 462void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
462 int skip_pinned) 463 int skip_pinned)
463{ 464{
464 struct extent_map *em; 465 struct extent_map *em;
465 struct extent_map *split = NULL; 466 struct extent_map *split = NULL;
466 struct extent_map *split2 = NULL; 467 struct extent_map *split2 = NULL;
467 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 468 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
468 u64 len = end - start + 1; 469 u64 len = end - start + 1;
470 u64 gen;
469 int ret; 471 int ret;
470 int testend = 1; 472 int testend = 1;
471 unsigned long flags; 473 unsigned long flags;
@@ -477,11 +479,14 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
477 testend = 0; 479 testend = 0;
478 } 480 }
479 while (1) { 481 while (1) {
482 int no_splits = 0;
483
480 if (!split) 484 if (!split)
481 split = alloc_extent_map(); 485 split = alloc_extent_map();
482 if (!split2) 486 if (!split2)
483 split2 = alloc_extent_map(); 487 split2 = alloc_extent_map();
484 BUG_ON(!split || !split2); /* -ENOMEM */ 488 if (!split || !split2)
489 no_splits = 1;
485 490
486 write_lock(&em_tree->lock); 491 write_lock(&em_tree->lock);
487 em = lookup_extent_mapping(em_tree, start, len); 492 em = lookup_extent_mapping(em_tree, start, len);
@@ -490,6 +495,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
490 break; 495 break;
491 } 496 }
492 flags = em->flags; 497 flags = em->flags;
498 gen = em->generation;
493 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { 499 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
494 if (testend && em->start + em->len >= start + len) { 500 if (testend && em->start + em->len >= start + len) {
495 free_extent_map(em); 501 free_extent_map(em);
@@ -506,6 +512,8 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
506 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 512 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
507 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 513 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
508 remove_extent_mapping(em_tree, em); 514 remove_extent_mapping(em_tree, em);
515 if (no_splits)
516 goto next;
509 517
510 if (em->block_start < EXTENT_MAP_LAST_BYTE && 518 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
511 em->start < start) { 519 em->start < start) {
@@ -518,12 +526,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
518 split->block_len = em->block_len; 526 split->block_len = em->block_len;
519 else 527 else
520 split->block_len = split->len; 528 split->block_len = split->len;
521 529 split->generation = gen;
522 split->bdev = em->bdev; 530 split->bdev = em->bdev;
523 split->flags = flags; 531 split->flags = flags;
524 split->compress_type = em->compress_type; 532 split->compress_type = em->compress_type;
525 ret = add_extent_mapping(em_tree, split); 533 ret = add_extent_mapping(em_tree, split);
526 BUG_ON(ret); /* Logic error */ 534 BUG_ON(ret); /* Logic error */
535 list_move(&split->list, &em_tree->modified_extents);
527 free_extent_map(split); 536 free_extent_map(split);
528 split = split2; 537 split = split2;
529 split2 = NULL; 538 split2 = NULL;
@@ -537,6 +546,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
537 split->bdev = em->bdev; 546 split->bdev = em->bdev;
538 split->flags = flags; 547 split->flags = flags;
539 split->compress_type = em->compress_type; 548 split->compress_type = em->compress_type;
549 split->generation = gen;
540 550
541 if (compressed) { 551 if (compressed) {
542 split->block_len = em->block_len; 552 split->block_len = em->block_len;
@@ -550,9 +560,11 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
550 560
551 ret = add_extent_mapping(em_tree, split); 561 ret = add_extent_mapping(em_tree, split);
552 BUG_ON(ret); /* Logic error */ 562 BUG_ON(ret); /* Logic error */
563 list_move(&split->list, &em_tree->modified_extents);
553 free_extent_map(split); 564 free_extent_map(split);
554 split = NULL; 565 split = NULL;
555 } 566 }
567next:
556 write_unlock(&em_tree->lock); 568 write_unlock(&em_tree->lock);
557 569
558 /* once for us */ 570 /* once for us */
@@ -564,7 +576,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
564 free_extent_map(split); 576 free_extent_map(split);
565 if (split2) 577 if (split2)
566 free_extent_map(split2); 578 free_extent_map(split2);
567 return 0;
568} 579}
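
btrfs_drop_extent_cache() always returned 0, so the int return was noise; it is now void, and failure to allocate the split maps degrades gracefully (the no_splits path drops the whole overlapping extents from the cache) instead of hitting a BUG_ON. Call sites shrink to a bare call:

	btrfs_drop_extent_cache(inode, start, end - 1, 0);
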
569 580
570/* 581/*
@@ -576,13 +587,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
576 * it is either truncated or split. Anything entirely inside the range 587 * it is either truncated or split. Anything entirely inside the range
577 * is deleted from the tree. 588 * is deleted from the tree.
578 */ 589 */
579int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, 590int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
580 u64 start, u64 end, u64 *hint_byte, int drop_cache) 591 struct btrfs_root *root, struct inode *inode,
592 struct btrfs_path *path, u64 start, u64 end,
593 u64 *drop_end, int drop_cache)
581{ 594{
582 struct btrfs_root *root = BTRFS_I(inode)->root;
583 struct extent_buffer *leaf; 595 struct extent_buffer *leaf;
584 struct btrfs_file_extent_item *fi; 596 struct btrfs_file_extent_item *fi;
585 struct btrfs_path *path;
586 struct btrfs_key key; 597 struct btrfs_key key;
587 struct btrfs_key new_key; 598 struct btrfs_key new_key;
588 u64 ino = btrfs_ino(inode); 599 u64 ino = btrfs_ino(inode);
@@ -597,14 +608,12 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
597 int recow; 608 int recow;
598 int ret; 609 int ret;
599 int modify_tree = -1; 610 int modify_tree = -1;
611 int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
612 int found = 0;
600 613
601 if (drop_cache) 614 if (drop_cache)
602 btrfs_drop_extent_cache(inode, start, end - 1, 0); 615 btrfs_drop_extent_cache(inode, start, end - 1, 0);
603 616
604 path = btrfs_alloc_path();
605 if (!path)
606 return -ENOMEM;
607
608 if (start >= BTRFS_I(inode)->disk_i_size) 617 if (start >= BTRFS_I(inode)->disk_i_size)
609 modify_tree = 0; 618 modify_tree = 0;
610 619
@@ -666,6 +675,7 @@ next_slot:
666 goto next_slot; 675 goto next_slot;
667 } 676 }
668 677
678 found = 1;
669 search_start = max(key.offset, start); 679 search_start = max(key.offset, start);
670 if (recow || !modify_tree) { 680 if (recow || !modify_tree) {
671 modify_tree = -1; 681 modify_tree = -1;
@@ -707,14 +717,13 @@ next_slot:
707 extent_end - start); 717 extent_end - start);
708 btrfs_mark_buffer_dirty(leaf); 718 btrfs_mark_buffer_dirty(leaf);
709 719
710 if (disk_bytenr > 0) { 720 if (update_refs && disk_bytenr > 0) {
711 ret = btrfs_inc_extent_ref(trans, root, 721 ret = btrfs_inc_extent_ref(trans, root,
712 disk_bytenr, num_bytes, 0, 722 disk_bytenr, num_bytes, 0,
713 root->root_key.objectid, 723 root->root_key.objectid,
714 new_key.objectid, 724 new_key.objectid,
715 start - extent_offset, 0); 725 start - extent_offset, 0);
716 BUG_ON(ret); /* -ENOMEM */ 726 BUG_ON(ret); /* -ENOMEM */
717 *hint_byte = disk_bytenr;
718 } 727 }
719 key.offset = start; 728 key.offset = start;
720 } 729 }
@@ -734,10 +743,8 @@ next_slot:
734 btrfs_set_file_extent_num_bytes(leaf, fi, 743 btrfs_set_file_extent_num_bytes(leaf, fi,
735 extent_end - end); 744 extent_end - end);
736 btrfs_mark_buffer_dirty(leaf); 745 btrfs_mark_buffer_dirty(leaf);
737 if (disk_bytenr > 0) { 746 if (update_refs && disk_bytenr > 0)
738 inode_sub_bytes(inode, end - key.offset); 747 inode_sub_bytes(inode, end - key.offset);
739 *hint_byte = disk_bytenr;
740 }
741 break; 748 break;
742 } 749 }
743 750
@@ -753,10 +760,8 @@ next_slot:
753 btrfs_set_file_extent_num_bytes(leaf, fi, 760 btrfs_set_file_extent_num_bytes(leaf, fi,
754 start - key.offset); 761 start - key.offset);
755 btrfs_mark_buffer_dirty(leaf); 762 btrfs_mark_buffer_dirty(leaf);
756 if (disk_bytenr > 0) { 763 if (update_refs && disk_bytenr > 0)
757 inode_sub_bytes(inode, extent_end - start); 764 inode_sub_bytes(inode, extent_end - start);
758 *hint_byte = disk_bytenr;
759 }
760 if (end == extent_end) 765 if (end == extent_end)
761 break; 766 break;
762 767
@@ -777,12 +782,13 @@ next_slot:
777 del_nr++; 782 del_nr++;
778 } 783 }
779 784
780 if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 785 if (update_refs &&
786 extent_type == BTRFS_FILE_EXTENT_INLINE) {
781 inode_sub_bytes(inode, 787 inode_sub_bytes(inode,
782 extent_end - key.offset); 788 extent_end - key.offset);
783 extent_end = ALIGN(extent_end, 789 extent_end = ALIGN(extent_end,
784 root->sectorsize); 790 root->sectorsize);
785 } else if (disk_bytenr > 0) { 791 } else if (update_refs && disk_bytenr > 0) {
786 ret = btrfs_free_extent(trans, root, 792 ret = btrfs_free_extent(trans, root,
787 disk_bytenr, num_bytes, 0, 793 disk_bytenr, num_bytes, 0,
788 root->root_key.objectid, 794 root->root_key.objectid,
@@ -791,7 +797,6 @@ next_slot:
791 BUG_ON(ret); /* -ENOMEM */ 797 BUG_ON(ret); /* -ENOMEM */
792 inode_sub_bytes(inode, 798 inode_sub_bytes(inode,
793 extent_end - key.offset); 799 extent_end - key.offset);
794 *hint_byte = disk_bytenr;
795 } 800 }
796 801
797 if (end == extent_end) 802 if (end == extent_end)
@@ -806,7 +811,7 @@ next_slot:
806 del_nr); 811 del_nr);
807 if (ret) { 812 if (ret) {
808 btrfs_abort_transaction(trans, root, ret); 813 btrfs_abort_transaction(trans, root, ret);
809 goto out; 814 break;
810 } 815 }
811 816
812 del_nr = 0; 817 del_nr = 0;
@@ -825,7 +830,24 @@ next_slot:
825 btrfs_abort_transaction(trans, root, ret); 830 btrfs_abort_transaction(trans, root, ret);
826 } 831 }
827 832
828out: 833 if (drop_end)
834 *drop_end = found ? min(end, extent_end) : end;
835 btrfs_release_path(path);
836 return ret;
837}
838
839int btrfs_drop_extents(struct btrfs_trans_handle *trans,
840 struct btrfs_root *root, struct inode *inode, u64 start,
841 u64 end, int drop_cache)
842{
843 struct btrfs_path *path;
844 int ret;
845
846 path = btrfs_alloc_path();
847 if (!path)
848 return -ENOMEM;
849 ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
850 drop_cache);
829 btrfs_free_path(path); 851 btrfs_free_path(path);
830 return ret; 852 return ret;
831} 853}
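
The hunk above splits btrfs_drop_extents() into a worker that takes a caller-supplied path plus an optional drop_end out-parameter, and a thin public wrapper that allocates the path itself. A minimal userspace sketch of that wrapper pattern (all names here are illustrative, not btrfs API):

#include <errno.h>
#include <stdlib.h>

struct scratch { int unused; };

/* worker: reuses the caller's scratch buffer, optionally reports progress */
static int drop_range_worker(struct scratch *s, long start, long end,
                             long *drop_end)
{
        (void)s; (void)start;
        if (drop_end)
                *drop_end = end;   /* pretend the whole range was processed */
        return 0;
}

/* public wrapper: allocates the scratch itself and passes NULL for drop_end */
int drop_range(long start, long end)
{
        struct scratch *s = malloc(sizeof(*s));
        int ret;

        if (!s)
                return -ENOMEM;
        ret = drop_range_worker(s, start, end, NULL);
        free(s);
        return ret;
}

Callers such as the hole-punch path below use the worker directly so they can resume from *drop_end after an -ENOSPC.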
@@ -892,8 +914,6 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
892 int ret; 914 int ret;
893 u64 ino = btrfs_ino(inode); 915 u64 ino = btrfs_ino(inode);
894 916
895 btrfs_drop_extent_cache(inode, start, end - 1, 0);
896
897 path = btrfs_alloc_path(); 917 path = btrfs_alloc_path();
898 if (!path) 918 if (!path)
899 return -ENOMEM; 919 return -ENOMEM;
@@ -935,12 +955,16 @@ again:
935 btrfs_set_item_key_safe(trans, root, path, &new_key); 955 btrfs_set_item_key_safe(trans, root, path, &new_key);
936 fi = btrfs_item_ptr(leaf, path->slots[0], 956 fi = btrfs_item_ptr(leaf, path->slots[0],
937 struct btrfs_file_extent_item); 957 struct btrfs_file_extent_item);
958 btrfs_set_file_extent_generation(leaf, fi,
959 trans->transid);
938 btrfs_set_file_extent_num_bytes(leaf, fi, 960 btrfs_set_file_extent_num_bytes(leaf, fi,
939 extent_end - end); 961 extent_end - end);
940 btrfs_set_file_extent_offset(leaf, fi, 962 btrfs_set_file_extent_offset(leaf, fi,
941 end - orig_offset); 963 end - orig_offset);
942 fi = btrfs_item_ptr(leaf, path->slots[0] - 1, 964 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
943 struct btrfs_file_extent_item); 965 struct btrfs_file_extent_item);
966 btrfs_set_file_extent_generation(leaf, fi,
967 trans->transid);
944 btrfs_set_file_extent_num_bytes(leaf, fi, 968 btrfs_set_file_extent_num_bytes(leaf, fi,
945 end - other_start); 969 end - other_start);
946 btrfs_mark_buffer_dirty(leaf); 970 btrfs_mark_buffer_dirty(leaf);
@@ -958,12 +982,16 @@ again:
958 struct btrfs_file_extent_item); 982 struct btrfs_file_extent_item);
959 btrfs_set_file_extent_num_bytes(leaf, fi, 983 btrfs_set_file_extent_num_bytes(leaf, fi,
960 start - key.offset); 984 start - key.offset);
985 btrfs_set_file_extent_generation(leaf, fi,
986 trans->transid);
961 path->slots[0]++; 987 path->slots[0]++;
962 new_key.offset = start; 988 new_key.offset = start;
963 btrfs_set_item_key_safe(trans, root, path, &new_key); 989 btrfs_set_item_key_safe(trans, root, path, &new_key);
964 990
965 fi = btrfs_item_ptr(leaf, path->slots[0], 991 fi = btrfs_item_ptr(leaf, path->slots[0],
966 struct btrfs_file_extent_item); 992 struct btrfs_file_extent_item);
993 btrfs_set_file_extent_generation(leaf, fi,
994 trans->transid);
967 btrfs_set_file_extent_num_bytes(leaf, fi, 995 btrfs_set_file_extent_num_bytes(leaf, fi,
968 other_end - start); 996 other_end - start);
969 btrfs_set_file_extent_offset(leaf, fi, 997 btrfs_set_file_extent_offset(leaf, fi,
@@ -991,12 +1019,14 @@ again:
991 leaf = path->nodes[0]; 1019 leaf = path->nodes[0];
992 fi = btrfs_item_ptr(leaf, path->slots[0] - 1, 1020 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
993 struct btrfs_file_extent_item); 1021 struct btrfs_file_extent_item);
1022 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
994 btrfs_set_file_extent_num_bytes(leaf, fi, 1023 btrfs_set_file_extent_num_bytes(leaf, fi,
995 split - key.offset); 1024 split - key.offset);
996 1025
997 fi = btrfs_item_ptr(leaf, path->slots[0], 1026 fi = btrfs_item_ptr(leaf, path->slots[0],
998 struct btrfs_file_extent_item); 1027 struct btrfs_file_extent_item);
999 1028
1029 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1000 btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); 1030 btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
1001 btrfs_set_file_extent_num_bytes(leaf, fi, 1031 btrfs_set_file_extent_num_bytes(leaf, fi,
1002 extent_end - split); 1032 extent_end - split);
@@ -1056,12 +1086,14 @@ again:
1056 struct btrfs_file_extent_item); 1086 struct btrfs_file_extent_item);
1057 btrfs_set_file_extent_type(leaf, fi, 1087 btrfs_set_file_extent_type(leaf, fi,
1058 BTRFS_FILE_EXTENT_REG); 1088 BTRFS_FILE_EXTENT_REG);
1089 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1059 btrfs_mark_buffer_dirty(leaf); 1090 btrfs_mark_buffer_dirty(leaf);
1060 } else { 1091 } else {
1061 fi = btrfs_item_ptr(leaf, del_slot - 1, 1092 fi = btrfs_item_ptr(leaf, del_slot - 1,
1062 struct btrfs_file_extent_item); 1093 struct btrfs_file_extent_item);
1063 btrfs_set_file_extent_type(leaf, fi, 1094 btrfs_set_file_extent_type(leaf, fi,
1064 BTRFS_FILE_EXTENT_REG); 1095 BTRFS_FILE_EXTENT_REG);
1096 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1065 btrfs_set_file_extent_num_bytes(leaf, fi, 1097 btrfs_set_file_extent_num_bytes(leaf, fi,
1066 extent_end - key.offset); 1098 extent_end - key.offset);
1067 btrfs_mark_buffer_dirty(leaf); 1099 btrfs_mark_buffer_dirty(leaf);
@@ -1173,8 +1205,8 @@ again:
1173 1205
1174 clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, 1206 clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
1175 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | 1207 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
1176 EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, 1208 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
1177 GFP_NOFS); 1209 0, 0, &cached_state, GFP_NOFS);
1178 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 1210 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1179 start_pos, last_pos - 1, &cached_state, 1211 start_pos, last_pos - 1, &cached_state,
1180 GFP_NOFS); 1212 GFP_NOFS);
@@ -1514,16 +1546,24 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1514 1546
1515 trace_btrfs_sync_file(file, datasync); 1547 trace_btrfs_sync_file(file, datasync);
1516 1548
1549 /*
1550 * We write the dirty pages in the range and wait until they complete
1551 * outside of the ->i_mutex, so that multiple tasks can flush dirty
1552 * pages concurrently and improve performance.
1553 */
1554 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
1555 if (ret)
1556 return ret;
1557
1517 mutex_lock(&inode->i_mutex); 1558 mutex_lock(&inode->i_mutex);
1518 1559
1519 /* 1560 /*
1520 * we wait first, since the writeback may change the inode, also wait 1561 * We flush the dirty pages again to avoid some dirty pages in the
1521 * ordered range does a filemape_write_and_wait_range which is why we 1562 * range being left.
1522 * don't do it above like other file systems.
1523 */ 1563 */
1524 root->log_batch++; 1564 atomic_inc(&root->log_batch);
1525 btrfs_wait_ordered_range(inode, start, end); 1565 btrfs_wait_ordered_range(inode, start, end);
1526 root->log_batch++; 1566 atomic_inc(&root->log_batch);
1527 1567
1528 /* 1568 /*
1529 * check the transaction that last modified this inode 1569 * check the transaction that last modified this inode
@@ -1544,6 +1584,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1544 BTRFS_I(inode)->last_trans <= 1584 BTRFS_I(inode)->last_trans <=
1545 root->fs_info->last_trans_committed) { 1585 root->fs_info->last_trans_committed) {
1546 BTRFS_I(inode)->last_trans = 0; 1586 BTRFS_I(inode)->last_trans = 0;
1587
1588 /*
1589 * We've had everything committed since the last time we were
1590 * modified so clear this flag in case it was set for whatever
1591 * reason, it's no longer relevant.
1592 */
1593 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1594 &BTRFS_I(inode)->runtime_flags);
1547 mutex_unlock(&inode->i_mutex); 1595 mutex_unlock(&inode->i_mutex);
1548 goto out; 1596 goto out;
1549 } 1597 }
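
The fsync rework above starts writeback before taking ->i_mutex so that multiple callers can flush in parallel, then re-waits under the lock for anything dirtied in between. A rough userspace model of that ordering, with stand-in helpers:

#include <pthread.h>

static pthread_mutex_t i_mutex = PTHREAD_MUTEX_INITIALIZER;

/* stand-ins for filemap_write_and_wait_range()/btrfs_wait_ordered_range() */
static int write_and_wait(long start, long end) { (void)start; (void)end; return 0; }
static void wait_ordered(long start, long end) { (void)start; (void)end; }

int sync_file(long start, long end)
{
        int ret = write_and_wait(start, end);   /* unlocked: flushes can overlap */

        if (ret)
                return ret;
        pthread_mutex_lock(&i_mutex);
        wait_ordered(start, end);               /* catch pages dirtied meanwhile */
        /* ...decide whether to log or commit, still under the lock... */
        pthread_mutex_unlock(&i_mutex);
        return 0;
}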
@@ -1615,6 +1663,324 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1615 return 0; 1663 return 0;
1616} 1664}
1617 1665
1666static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
1667 int slot, u64 start, u64 end)
1668{
1669 struct btrfs_file_extent_item *fi;
1670 struct btrfs_key key;
1671
1672 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
1673 return 0;
1674
1675 btrfs_item_key_to_cpu(leaf, &key, slot);
1676 if (key.objectid != btrfs_ino(inode) ||
1677 key.type != BTRFS_EXTENT_DATA_KEY)
1678 return 0;
1679
1680 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
1681
1682 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
1683 return 0;
1684
1685 if (btrfs_file_extent_disk_bytenr(leaf, fi))
1686 return 0;
1687
1688 if (key.offset == end)
1689 return 1;
1690 if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
1691 return 1;
1692 return 0;
1693}
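
hole_mergeable() only accepts a neighbouring item that is a regular, unallocated (disk_bytenr == 0) extent touching the new hole at exactly one edge. The adjacency test reduces to the check sketched below (plain C, illustrative types):

#include <assert.h>

struct ext { unsigned long offset, num_bytes; };

static int touches(const struct ext *e, unsigned long start, unsigned long end)
{
        return e->offset == end ||                 /* neighbour starts at hole end */
               e->offset + e->num_bytes == start;  /* neighbour ends at hole start */
}

int main(void)
{
        struct ext left = { 0, 4096 }, right = { 8192, 4096 };

        assert(touches(&left, 4096, 8192));   /* mergeable on the left    */
        assert(touches(&right, 4096, 8192));  /* mergeable on the right   */
        assert(!touches(&right, 0, 4096));    /* gap in between: no merge */
        return 0;
}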
1694
1695static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
1696 struct btrfs_path *path, u64 offset, u64 end)
1697{
1698 struct btrfs_root *root = BTRFS_I(inode)->root;
1699 struct extent_buffer *leaf;
1700 struct btrfs_file_extent_item *fi;
1701 struct extent_map *hole_em;
1702 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1703 struct btrfs_key key;
1704 int ret;
1705
1706 key.objectid = btrfs_ino(inode);
1707 key.type = BTRFS_EXTENT_DATA_KEY;
1708 key.offset = offset;
1709
1710
1711 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1712 if (ret < 0)
1713 return ret;
1714 BUG_ON(!ret);
1715
1716 leaf = path->nodes[0];
1717 if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) {
1718 u64 num_bytes;
1719
1720 path->slots[0]--;
1721 fi = btrfs_item_ptr(leaf, path->slots[0],
1722 struct btrfs_file_extent_item);
1723 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
1724 end - offset;
1725 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1726 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
1727 btrfs_set_file_extent_offset(leaf, fi, 0);
1728 btrfs_mark_buffer_dirty(leaf);
1729 goto out;
1730 }
1731
1732 if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) {
1733 u64 num_bytes;
1734
1735 path->slots[0]++;
1736 key.offset = offset;
1737 btrfs_set_item_key_safe(trans, root, path, &key);
1738 fi = btrfs_item_ptr(leaf, path->slots[0],
1739 struct btrfs_file_extent_item);
1740 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
1741 offset;
1742 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1743 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
1744 btrfs_set_file_extent_offset(leaf, fi, 0);
1745 btrfs_mark_buffer_dirty(leaf);
1746 goto out;
1747 }
1748 btrfs_release_path(path);
1749
1750 ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
1751 0, 0, end - offset, 0, end - offset,
1752 0, 0, 0);
1753 if (ret)
1754 return ret;
1755
1756out:
1757 btrfs_release_path(path);
1758
1759 hole_em = alloc_extent_map();
1760 if (!hole_em) {
1761 btrfs_drop_extent_cache(inode, offset, end - 1, 0);
1762 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1763 &BTRFS_I(inode)->runtime_flags);
1764 } else {
1765 hole_em->start = offset;
1766 hole_em->len = end - offset;
1767 hole_em->orig_start = offset;
1768
1769 hole_em->block_start = EXTENT_MAP_HOLE;
1770 hole_em->block_len = 0;
1771 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
1772 hole_em->compress_type = BTRFS_COMPRESS_NONE;
1773 hole_em->generation = trans->transid;
1774
1775 do {
1776 btrfs_drop_extent_cache(inode, offset, end - 1, 0);
1777 write_lock(&em_tree->lock);
1778 ret = add_extent_mapping(em_tree, hole_em);
1779 if (!ret)
1780 list_move(&hole_em->list,
1781 &em_tree->modified_extents);
1782 write_unlock(&em_tree->lock);
1783 } while (ret == -EEXIST);
1784 free_extent_map(hole_em);
1785 if (ret)
1786 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1787 &BTRFS_I(inode)->runtime_flags);
1788 }
1789
1790 return 0;
1791}
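
fill_holes() installs its replacement mapping with a drop-and-retry loop: clear any cached overlap, attempt the insert, and repeat while a racing overlap reappears as -EEXIST (create_pinned_em() later in this diff uses the same shape). A userspace model with assumed helper names:

#include <errno.h>

/* stand-ins for btrfs_drop_extent_cache()/add_extent_mapping() */
static void drop_cached_range(long start, long end) { (void)start; (void)end; }
static int try_insert_mapping(void *em) { (void)em; return 0; }

int install_mapping(void *em, long start, long end)
{
        int ret;

        do {
                drop_cached_range(start, end);  /* clear whatever overlaps  */
                ret = try_insert_mapping(em);   /* racers may re-cache...   */
        } while (ret == -EEXIST);               /* ...so drop and try again */
        return ret;
}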
1792
1793static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
1794{
1795 struct btrfs_root *root = BTRFS_I(inode)->root;
1796 struct extent_state *cached_state = NULL;
1797 struct btrfs_path *path;
1798 struct btrfs_block_rsv *rsv;
1799 struct btrfs_trans_handle *trans;
1800 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1801 u64 lockstart = (offset + mask) & ~mask;
1802 u64 lockend = ((offset + len) & ~mask) - 1;
1803 u64 cur_offset = lockstart;
1804 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
1805 u64 drop_end;
1806 unsigned long nr;
1807 int ret = 0;
1808 int err = 0;
1809 bool same_page = (offset >> PAGE_CACHE_SHIFT) ==
1810 ((offset + len) >> PAGE_CACHE_SHIFT);
1811
1812 btrfs_wait_ordered_range(inode, offset, len);
1813
1814 mutex_lock(&inode->i_mutex);
1815 if (offset >= inode->i_size) {
1816 mutex_unlock(&inode->i_mutex);
1817 return 0;
1818 }
1819
1820 /*
1821 * Only do this if we are in the same page and we aren't doing the
1822 * entire page.
1823 */
1824 if (same_page && len < PAGE_CACHE_SIZE) {
1825 ret = btrfs_truncate_page(inode, offset, len, 0);
1826 mutex_unlock(&inode->i_mutex);
1827 return ret;
1828 }
1829
1830 /* zero back part of the first page */
1831 ret = btrfs_truncate_page(inode, offset, 0, 0);
1832 if (ret) {
1833 mutex_unlock(&inode->i_mutex);
1834 return ret;
1835 }
1836
1837 /* zero the front end of the last page */
1838 ret = btrfs_truncate_page(inode, offset + len, 0, 1);
1839 if (ret) {
1840 mutex_unlock(&inode->i_mutex);
1841 return ret;
1842 }
1843
1844 if (lockend < lockstart) {
1845 mutex_unlock(&inode->i_mutex);
1846 return 0;
1847 }
1848
1849 while (1) {
1850 struct btrfs_ordered_extent *ordered;
1851
1852 truncate_pagecache_range(inode, lockstart, lockend);
1853
1854 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
1855 0, &cached_state);
1856 ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
1857
1858 /*
1859 * We need to make sure we have no ordered extents in this range
1860 * and nobody raced in and read a page in this range, if we did
1861 * we need to try again.
1862 */
1863 if ((!ordered ||
1864 (ordered->file_offset + ordered->len < lockstart ||
1865 ordered->file_offset > lockend)) &&
1866 !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
1867 lockend, EXTENT_UPTODATE, 0,
1868 cached_state)) {
1869 if (ordered)
1870 btrfs_put_ordered_extent(ordered);
1871 break;
1872 }
1873 if (ordered)
1874 btrfs_put_ordered_extent(ordered);
1875 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
1876 lockend, &cached_state, GFP_NOFS);
1877 btrfs_wait_ordered_range(inode, lockstart,
1878 lockend - lockstart + 1);
1879 }
1880
1881 path = btrfs_alloc_path();
1882 if (!path) {
1883 ret = -ENOMEM;
1884 goto out;
1885 }
1886
1887 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
1888 if (!rsv) {
1889 ret = -ENOMEM;
1890 goto out_free;
1891 }
1892 rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
1893 rsv->failfast = 1;
1894
1895 /*
1896 * 1 - update the inode
1897 * 1 - remove the extents in the range
1898 * 1 - add the hole extent
1899 */
1900 trans = btrfs_start_transaction(root, 3);
1901 if (IS_ERR(trans)) {
1902 err = PTR_ERR(trans);
1903 goto out_free;
1904 }
1905
1906 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
1907 min_size);
1908 BUG_ON(ret);
1909 trans->block_rsv = rsv;
1910
1911 while (cur_offset < lockend) {
1912 ret = __btrfs_drop_extents(trans, root, inode, path,
1913 cur_offset, lockend + 1,
1914 &drop_end, 1);
1915 if (ret != -ENOSPC)
1916 break;
1917
1918 trans->block_rsv = &root->fs_info->trans_block_rsv;
1919
1920 ret = fill_holes(trans, inode, path, cur_offset, drop_end);
1921 if (ret) {
1922 err = ret;
1923 break;
1924 }
1925
1926 cur_offset = drop_end;
1927
1928 ret = btrfs_update_inode(trans, root, inode);
1929 if (ret) {
1930 err = ret;
1931 break;
1932 }
1933
1934 nr = trans->blocks_used;
1935 btrfs_end_transaction(trans, root);
1936 btrfs_btree_balance_dirty(root, nr);
1937
1938 trans = btrfs_start_transaction(root, 3);
1939 if (IS_ERR(trans)) {
1940 ret = PTR_ERR(trans);
1941 trans = NULL;
1942 break;
1943 }
1944
1945 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
1946 rsv, min_size);
1947 BUG_ON(ret); /* shouldn't happen */
1948 trans->block_rsv = rsv;
1949 }
1950
1951 if (ret) {
1952 err = ret;
1953 goto out_trans;
1954 }
1955
1956 trans->block_rsv = &root->fs_info->trans_block_rsv;
1957 ret = fill_holes(trans, inode, path, cur_offset, drop_end);
1958 if (ret) {
1959 err = ret;
1960 goto out_trans;
1961 }
1962
1963out_trans:
1964 if (!trans)
1965 goto out_free;
1966
1967 trans->block_rsv = &root->fs_info->trans_block_rsv;
1968 ret = btrfs_update_inode(trans, root, inode);
1969 nr = trans->blocks_used;
1970 btrfs_end_transaction(trans, root);
1971 btrfs_btree_balance_dirty(root, nr);
1972out_free:
1973 btrfs_free_path(path);
1974 btrfs_free_block_rsv(root, rsv);
1975out:
1976 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
1977 &cached_state, GFP_NOFS);
1978 mutex_unlock(&inode->i_mutex);
1979 if (ret && !err)
1980 err = ret;
1981 return err;
1982}
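
btrfs_punch_hole() can only drop whole sectors, so it rounds the start up and the end down to the sector boundary and leaves the unaligned edges to btrfs_truncate_page(). A worked example of the mask arithmetic with 4096-byte sectors:

#include <stdio.h>

int main(void)
{
        unsigned long mask = 4096 - 1;
        unsigned long offset = 1000, len = 10000;
        unsigned long lockstart = (offset + mask) & ~mask;    /* 4096: round up   */
        unsigned long lockend = ((offset + len) & ~mask) - 1; /* 8191: round down */

        printf("lockstart=%lu lockend=%lu\n", lockstart, lockend);
        /* bytes 1000-4095 and 8192-10999 are zeroed in-page instead; if the
         * whole hole fits in one sector, lockend < lockstart and no extents
         * are dropped at all */
        return 0;
}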
1983
1618static long btrfs_fallocate(struct file *file, int mode, 1984static long btrfs_fallocate(struct file *file, int mode,
1619 loff_t offset, loff_t len) 1985 loff_t offset, loff_t len)
1620{ 1986{
@@ -1633,15 +1999,18 @@ static long btrfs_fallocate(struct file *file, int mode,
1633 alloc_start = offset & ~mask; 1999 alloc_start = offset & ~mask;
1634 alloc_end = (offset + len + mask) & ~mask; 2000 alloc_end = (offset + len + mask) & ~mask;
1635 2001
1636 /* We only support the FALLOC_FL_KEEP_SIZE mode */ 2002 /* Make sure we aren't being given some crap mode */
1637 if (mode & ~FALLOC_FL_KEEP_SIZE) 2003 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
1638 return -EOPNOTSUPP; 2004 return -EOPNOTSUPP;
1639 2005
2006 if (mode & FALLOC_FL_PUNCH_HOLE)
2007 return btrfs_punch_hole(inode, offset, len);
2008
1640 /* 2009 /*
1641 * Make sure we have enough space before we do the 2010 * Make sure we have enough space before we do the
1642 * allocation. 2011 * allocation.
1643 */ 2012 */
1644 ret = btrfs_check_data_free_space(inode, len); 2013 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1);
1645 if (ret) 2014 if (ret)
1646 return ret; 2015 return ret;
1647 2016
@@ -1748,7 +2117,7 @@ static long btrfs_fallocate(struct file *file, int mode,
1748out: 2117out:
1749 mutex_unlock(&inode->i_mutex); 2118 mutex_unlock(&inode->i_mutex);
1750 /* Let go of our reservation. */ 2119 /* Let go of our reservation. */
1751 btrfs_free_reserved_data_space(inode, len); 2120 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1);
1752 return ret; 2121 return ret;
1753} 2122}
1754 2123
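
With this change, hole punching becomes reachable from userspace through fallocate(2); FALLOC_FL_PUNCH_HOLE is expected to be paired with FALLOC_FL_KEEP_SIZE. A minimal caller (glibc with _GNU_SOURCE assumed):

#define _GNU_SOURCE
#include <fcntl.h>

/* zero and deallocate [offset, offset + len) without changing i_size */
int punch(int fd, off_t offset, off_t len)
{
        return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                         offset, len);
}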
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6b10acfc2f5c..1027b854b90c 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -966,7 +966,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
966 block_group->key.offset)) { 966 block_group->key.offset)) {
967 ret = find_first_extent_bit(unpin, start, 967 ret = find_first_extent_bit(unpin, start,
968 &extent_start, &extent_end, 968 &extent_start, &extent_end,
969 EXTENT_DIRTY); 969 EXTENT_DIRTY, NULL);
970 if (ret) { 970 if (ret) {
971 ret = 0; 971 ret = 0;
972 break; 972 break;
@@ -1454,9 +1454,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1454 max_t(u64, *offset, bitmap_info->offset)); 1454 max_t(u64, *offset, bitmap_info->offset));
1455 bits = bytes_to_bits(*bytes, ctl->unit); 1455 bits = bytes_to_bits(*bytes, ctl->unit);
1456 1456
1457 for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); 1457 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
1458 i < BITS_PER_BITMAP;
1459 i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
1460 next_zero = find_next_zero_bit(bitmap_info->bitmap, 1458 next_zero = find_next_zero_bit(bitmap_info->bitmap,
1461 BITS_PER_BITMAP, i); 1459 BITS_PER_BITMAP, i);
1462 if ((next_zero - i) >= bits) { 1460 if ((next_zero - i) >= bits) {
@@ -2307,9 +2305,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2307 2305
2308again: 2306again:
2309 found_bits = 0; 2307 found_bits = 0;
2310 for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i); 2308 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
2311 i < BITS_PER_BITMAP;
2312 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
2313 next_zero = find_next_zero_bit(entry->bitmap, 2309 next_zero = find_next_zero_bit(entry->bitmap,
2314 BITS_PER_BITMAP, i); 2310 BITS_PER_BITMAP, i);
2315 if (next_zero - i >= min_bits) { 2311 if (next_zero - i >= min_bits) {
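
for_each_set_bit_from(i, map, size) is shorthand for exactly the open-coded find_next_bit() loops deleted above. A userspace equivalent over a single word, with a naive stand-in for find_next_bit():

#include <stdio.h>

/* naive stand-in for find_next_bit() over one word */
static unsigned next_set(unsigned long map, unsigned from, unsigned size)
{
        while (from < size && !(map & (1UL << from)))
                from++;
        return from;
}

int main(void)
{
        unsigned long map = 0x2c;   /* bits 2, 3 and 5 set */
        unsigned size = 6, i;

        for (i = next_set(map, 0, size); i < size; i = next_set(map, i + 1, size))
                printf("bit %u is set\n", i);
        return 0;
}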
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
index db2ff9773b99..1d982812ab67 100644
--- a/fs/btrfs/hash.h
+++ b/fs/btrfs/hash.h
@@ -24,4 +24,14 @@ static inline u64 btrfs_name_hash(const char *name, int len)
24{ 24{
25 return crc32c((u32)~1, name, len); 25 return crc32c((u32)~1, name, len);
26} 26}
27
28/*
29 * Figure the key offset of an extended inode ref
30 */
31static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
32 int len)
33{
34 return (u64) crc32c(parent_objectid, name, len);
35}
36
27#endif 37#endif
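
btrfs_extref_hash() keys an extended ref by a hash of the name seeded with the parent objectid, so a (parent, name) lookup computes one key offset up front. A sketch of that keying with a stand-in hash (the real code uses crc32c; the FNV-style mix below is purely illustrative):

#include <stdint.h>
#include <string.h>

/* illustrative stand-in; the kernel uses crc32c(parent_objectid, name, len) */
static uint32_t stand_in_hash(uint32_t seed, const char *data, int len)
{
        uint32_t h = seed ^ 0x811c9dc5u;

        while (len--)
                h = (h ^ (uint8_t)*data++) * 0x01000193u;
        return h;
}

/* key offset used to look up an extref by (parent directory, name) */
uint64_t extref_key_offset(uint64_t parent, const char *name)
{
        return (uint64_t)stand_in_hash((uint32_t)parent, name,
                                       (int)strlen(name));
}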
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index a13cf1a96c73..48b8fda93132 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -18,6 +18,7 @@
18 18
19#include "ctree.h" 19#include "ctree.h"
20#include "disk-io.h" 20#include "disk-io.h"
21#include "hash.h"
21#include "transaction.h" 22#include "transaction.h"
22#include "print-tree.h" 23#include "print-tree.h"
23 24
@@ -50,18 +51,57 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
50 return 0; 51 return 0;
51} 52}
52 53
53struct btrfs_inode_ref * 54int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid,
55 const char *name, int name_len,
56 struct btrfs_inode_extref **extref_ret)
57{
58 struct extent_buffer *leaf;
59 struct btrfs_inode_extref *extref;
60 unsigned long ptr;
61 unsigned long name_ptr;
62 u32 item_size;
63 u32 cur_offset = 0;
64 int ref_name_len;
65
66 leaf = path->nodes[0];
67 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
68 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
69
70 /*
71 * Search all extended backrefs in this item. We only have to scan
72 * past hash collisions, so most of the time this compares against
73 * a single entry. If all is well, we'll return success and the
74 * inode ref object.
75 */
76 while (cur_offset < item_size) {
77 extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
78 name_ptr = (unsigned long)(&extref->name);
79 ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
80
81 if (ref_name_len == name_len &&
82 btrfs_inode_extref_parent(leaf, extref) == ref_objectid &&
83 (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) {
84 if (extref_ret)
85 *extref_ret = extref;
86 return 1;
87 }
88
89 cur_offset += ref_name_len + sizeof(*extref);
90 }
91 return 0;
92}
93
94static struct btrfs_inode_ref *
54btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, 95btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
55 struct btrfs_root *root, 96 struct btrfs_root *root,
56 struct btrfs_path *path, 97 struct btrfs_path *path,
57 const char *name, int name_len, 98 const char *name, int name_len,
58 u64 inode_objectid, u64 ref_objectid, int mod) 99 u64 inode_objectid, u64 ref_objectid, int ins_len,
100 int cow)
59{ 101{
102 int ret;
60 struct btrfs_key key; 103 struct btrfs_key key;
61 struct btrfs_inode_ref *ref; 104 struct btrfs_inode_ref *ref;
62 int ins_len = mod < 0 ? -1 : 0;
63 int cow = mod != 0;
64 int ret;
65 105
66 key.objectid = inode_objectid; 106 key.objectid = inode_objectid;
67 key.type = BTRFS_INODE_REF_KEY; 107 key.type = BTRFS_INODE_REF_KEY;
@@ -77,13 +117,150 @@ btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
77 return ref; 117 return ref;
78} 118}
79 119
80int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, 120/* Returns NULL if no extref found */
121struct btrfs_inode_extref *
122btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
123 struct btrfs_root *root,
124 struct btrfs_path *path,
125 const char *name, int name_len,
126 u64 inode_objectid, u64 ref_objectid, int ins_len,
127 int cow)
128{
129 int ret;
130 struct btrfs_key key;
131 struct btrfs_inode_extref *extref;
132
133 key.objectid = inode_objectid;
134 key.type = BTRFS_INODE_EXTREF_KEY;
135 key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
136
137 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
138 if (ret < 0)
139 return ERR_PTR(ret);
140 if (ret > 0)
141 return NULL;
142 if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref))
143 return NULL;
144 return extref;
145}
146
147int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
148 struct btrfs_root *root,
149 struct btrfs_path *path,
150 const char *name, int name_len,
151 u64 inode_objectid, u64 ref_objectid, int mod,
152 u64 *ret_index)
153{
154 struct btrfs_inode_ref *ref;
155 struct btrfs_inode_extref *extref;
156 int ins_len = mod < 0 ? -1 : 0;
157 int cow = mod != 0;
158
159 ref = btrfs_lookup_inode_ref(trans, root, path, name, name_len,
160 inode_objectid, ref_objectid, ins_len,
161 cow);
162 if (IS_ERR(ref))
163 return PTR_ERR(ref);
164
165 if (ref != NULL) {
166 *ret_index = btrfs_inode_ref_index(path->nodes[0], ref);
167 return 0;
168 }
169
170 btrfs_release_path(path);
171
172 extref = btrfs_lookup_inode_extref(trans, root, path, name,
173 name_len, inode_objectid,
174 ref_objectid, ins_len, cow);
175 if (IS_ERR(extref))
176 return PTR_ERR(extref);
177
178 if (extref) {
179 *ret_index = btrfs_inode_extref_index(path->nodes[0], extref);
180 return 0;
181 }
182
183 return -ENOENT;
184}
185
186int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
81 struct btrfs_root *root, 187 struct btrfs_root *root,
82 const char *name, int name_len, 188 const char *name, int name_len,
83 u64 inode_objectid, u64 ref_objectid, u64 *index) 189 u64 inode_objectid, u64 ref_objectid, u64 *index)
84{ 190{
85 struct btrfs_path *path; 191 struct btrfs_path *path;
86 struct btrfs_key key; 192 struct btrfs_key key;
193 struct btrfs_inode_extref *extref;
194 struct extent_buffer *leaf;
195 int ret;
196 int del_len = name_len + sizeof(*extref);
197 unsigned long ptr;
198 unsigned long item_start;
199 u32 item_size;
200
201 key.objectid = inode_objectid;
202 btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
203 key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
204
205 path = btrfs_alloc_path();
206 if (!path)
207 return -ENOMEM;
208
209 path->leave_spinning = 1;
210
211 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
212 if (ret > 0)
213 ret = -ENOENT;
214 if (ret < 0)
215 goto out;
216
217 /*
218 * Sanity check - did we find the right item for this name?
219 * This should always succeed, so an error here will make the FS
220 * readonly.
221 */
222 if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
223 name, name_len, &extref)) {
224 btrfs_std_error(root->fs_info, -ENOENT);
225 ret = -EROFS;
226 goto out;
227 }
228
229 leaf = path->nodes[0];
230 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
231 if (index)
232 *index = btrfs_inode_extref_index(leaf, extref);
233
234 if (del_len == item_size) {
235 /*
236 * Common case only one ref in the item, remove the
237 * whole item.
238 */
239 ret = btrfs_del_item(trans, root, path);
240 goto out;
241 }
242
243 ptr = (unsigned long)extref;
244 item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
245
246 memmove_extent_buffer(leaf, ptr, ptr + del_len,
247 item_size - (ptr + del_len - item_start));
248
249 btrfs_truncate_item(trans, root, path, item_size - del_len, 1);
250
251out:
252 btrfs_free_path(path);
253
254 return ret;
255}
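
When an item holds several refs, deleting one is a memmove of the tail over the victim followed by truncating the item, exactly the shape used in btrfs_del_inode_extref() above. The move in plain C:

#include <string.h>

/* slide the tail of the item down over the victim, return the new size */
unsigned remove_span(char *item, unsigned item_size,
                     unsigned victim_off, unsigned victim_len)
{
        memmove(item + victim_off,
                item + victim_off + victim_len,
                item_size - (victim_off + victim_len));
        return item_size - victim_len;   /* item is then truncated to this */
}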
256
257int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
258 struct btrfs_root *root,
259 const char *name, int name_len,
260 u64 inode_objectid, u64 ref_objectid, u64 *index)
261{
262 struct btrfs_path *path;
263 struct btrfs_key key;
87 struct btrfs_inode_ref *ref; 264 struct btrfs_inode_ref *ref;
88 struct extent_buffer *leaf; 265 struct extent_buffer *leaf;
89 unsigned long ptr; 266 unsigned long ptr;
@@ -91,6 +268,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
91 u32 item_size; 268 u32 item_size;
92 u32 sub_item_len; 269 u32 sub_item_len;
93 int ret; 270 int ret;
271 int search_ext_refs = 0;
94 int del_len = name_len + sizeof(*ref); 272 int del_len = name_len + sizeof(*ref);
95 273
96 key.objectid = inode_objectid; 274 key.objectid = inode_objectid;
@@ -106,12 +284,14 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
106 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 284 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
107 if (ret > 0) { 285 if (ret > 0) {
108 ret = -ENOENT; 286 ret = -ENOENT;
287 search_ext_refs = 1;
109 goto out; 288 goto out;
110 } else if (ret < 0) { 289 } else if (ret < 0) {
111 goto out; 290 goto out;
112 } 291 }
113 if (!find_name_in_backref(path, name, name_len, &ref)) { 292 if (!find_name_in_backref(path, name, name_len, &ref)) {
114 ret = -ENOENT; 293 ret = -ENOENT;
294 search_ext_refs = 1;
115 goto out; 295 goto out;
116 } 296 }
117 leaf = path->nodes[0]; 297 leaf = path->nodes[0];
@@ -129,8 +309,78 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
129 item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); 309 item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
130 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, 310 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
131 item_size - (ptr + sub_item_len - item_start)); 311 item_size - (ptr + sub_item_len - item_start));
132 btrfs_truncate_item(trans, root, path, 312 btrfs_truncate_item(trans, root, path, item_size - sub_item_len, 1);
133 item_size - sub_item_len, 1); 313out:
314 btrfs_free_path(path);
315
316 if (search_ext_refs) {
317 /*
318 * No refs were found, or we could not find the
319 * name in our ref array. Find and remove the extended
320 * inode ref then.
321 */
322 return btrfs_del_inode_extref(trans, root, name, name_len,
323 inode_objectid, ref_objectid, index);
324 }
325
326 return ret;
327}
328
329/*
330 * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree.
331 *
332 * The caller must have checked against BTRFS_LINK_MAX already.
333 */
334static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
335 struct btrfs_root *root,
336 const char *name, int name_len,
337 u64 inode_objectid, u64 ref_objectid, u64 index)
338{
339 struct btrfs_inode_extref *extref;
340 int ret;
341 int ins_len = name_len + sizeof(*extref);
342 unsigned long ptr;
343 struct btrfs_path *path;
344 struct btrfs_key key;
345 struct extent_buffer *leaf;
346 struct btrfs_item *item;
347
348 key.objectid = inode_objectid;
349 key.type = BTRFS_INODE_EXTREF_KEY;
350 key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
351
352 path = btrfs_alloc_path();
353 if (!path)
354 return -ENOMEM;
355
356 path->leave_spinning = 1;
357 ret = btrfs_insert_empty_item(trans, root, path, &key,
358 ins_len);
359 if (ret == -EEXIST) {
360 if (btrfs_find_name_in_ext_backref(path, ref_objectid,
361 name, name_len, NULL))
362 goto out;
363
364 btrfs_extend_item(trans, root, path, ins_len);
365 ret = 0;
366 }
367 if (ret < 0)
368 goto out;
369
370 leaf = path->nodes[0];
371 item = btrfs_item_nr(leaf, path->slots[0]);
372 ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
373 ptr += btrfs_item_size(leaf, item) - ins_len;
374 extref = (struct btrfs_inode_extref *)ptr;
375
376 btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len);
377 btrfs_set_inode_extref_index(path->nodes[0], extref, index);
378 btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid);
379
380 ptr = (unsigned long)&extref->name;
381 write_extent_buffer(path->nodes[0], name, ptr, name_len);
382 btrfs_mark_buffer_dirty(path->nodes[0]);
383
134out: 384out:
135 btrfs_free_path(path); 385 btrfs_free_path(path);
136 return ret; 386 return ret;
@@ -191,6 +441,19 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
191 441
192out: 442out:
193 btrfs_free_path(path); 443 btrfs_free_path(path);
444
445 if (ret == -EMLINK) {
446 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
447 /* We ran out of space in the ref array. Need to
448 * add an extended ref. */
449 if (btrfs_super_incompat_flags(disk_super)
450 & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
451 ret = btrfs_insert_inode_extref(trans, root, name,
452 name_len,
453 inode_objectid,
454 ref_objectid, index);
455 }
456
194 return ret; 457 return ret;
195} 458}
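
The new tail of btrfs_insert_inode_ref() only falls back to an extended ref when the packed insert fails with -EMLINK and the superblock advertises the EXTENDED_IREF incompat bit, since older kernels cannot read extrefs. The fallback shape, with illustrative names:

#include <errno.h>

#define FEAT_EXTENDED_IREF 0x1   /* stands in for the incompat bit */

static int insert_packed_ref(void) { return -EMLINK; /* ref array is full */ }
static int insert_extended_ref(void) { return 0; }

int add_link(unsigned long incompat_flags)
{
        int ret = insert_packed_ref();

        if (ret == -EMLINK && (incompat_flags & FEAT_EXTENDED_IREF))
                ret = insert_extended_ref();  /* old kernels can't read these */
        return ret;
}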
196 459
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a6ed6944e50c..85a1e5053fe6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -230,7 +230,6 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
230 u64 inline_len = actual_end - start; 230 u64 inline_len = actual_end - start;
231 u64 aligned_end = (end + root->sectorsize - 1) & 231 u64 aligned_end = (end + root->sectorsize - 1) &
232 ~((u64)root->sectorsize - 1); 232 ~((u64)root->sectorsize - 1);
233 u64 hint_byte;
234 u64 data_len = inline_len; 233 u64 data_len = inline_len;
235 int ret; 234 int ret;
236 235
@@ -247,8 +246,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
247 return 1; 246 return 1;
248 } 247 }
249 248
250 ret = btrfs_drop_extents(trans, inode, start, aligned_end, 249 ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1);
251 &hint_byte, 1);
252 if (ret) 250 if (ret)
253 return ret; 251 return ret;
254 252
@@ -664,7 +662,7 @@ retry:
664 async_extent->compressed_size, 662 async_extent->compressed_size,
665 async_extent->compressed_size, 663 async_extent->compressed_size,
666 0, alloc_hint, &ins, 1); 664 0, alloc_hint, &ins, 1);
667 if (ret) 665 if (ret && ret != -ENOSPC)
668 btrfs_abort_transaction(trans, root, ret); 666 btrfs_abort_transaction(trans, root, ret);
669 btrfs_end_transaction(trans, root); 667 btrfs_end_transaction(trans, root);
670 } 668 }
@@ -1308,6 +1306,7 @@ out_check:
1308 em->block_start = disk_bytenr; 1306 em->block_start = disk_bytenr;
1309 em->bdev = root->fs_info->fs_devices->latest_bdev; 1307 em->bdev = root->fs_info->fs_devices->latest_bdev;
1310 set_bit(EXTENT_FLAG_PINNED, &em->flags); 1308 set_bit(EXTENT_FLAG_PINNED, &em->flags);
1309 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
1311 while (1) { 1310 while (1) {
1312 write_lock(&em_tree->lock); 1311 write_lock(&em_tree->lock);
1313 ret = add_extent_mapping(em_tree, em); 1312 ret = add_extent_mapping(em_tree, em);
@@ -1364,11 +1363,7 @@ out_check:
1364 } 1363 }
1365 1364
1366error: 1365error:
1367 if (nolock) { 1366 err = btrfs_end_transaction(trans, root);
1368 err = btrfs_end_transaction_nolock(trans, root);
1369 } else {
1370 err = btrfs_end_transaction(trans, root);
1371 }
1372 if (!ret) 1367 if (!ret)
1373 ret = err; 1368 ret = err;
1374 1369
@@ -1785,7 +1780,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1785 struct btrfs_path *path; 1780 struct btrfs_path *path;
1786 struct extent_buffer *leaf; 1781 struct extent_buffer *leaf;
1787 struct btrfs_key ins; 1782 struct btrfs_key ins;
1788 u64 hint;
1789 int ret; 1783 int ret;
1790 1784
1791 path = btrfs_alloc_path(); 1785 path = btrfs_alloc_path();
@@ -1803,8 +1797,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1803 * the caller is expected to unpin it and allow it to be merged 1797 * the caller is expected to unpin it and allow it to be merged
1804 * with the others. 1798 * with the others.
1805 */ 1799 */
1806 ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes, 1800 ret = btrfs_drop_extents(trans, root, inode, file_pos,
1807 &hint, 0); 1801 file_pos + num_bytes, 0);
1808 if (ret) 1802 if (ret)
1809 goto out; 1803 goto out;
1810 1804
@@ -1828,10 +1822,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1828 btrfs_set_file_extent_encryption(leaf, fi, encryption); 1822 btrfs_set_file_extent_encryption(leaf, fi, encryption);
1829 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); 1823 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1830 1824
1831 btrfs_unlock_up_safe(path, 1);
1832 btrfs_set_lock_blocking(leaf);
1833
1834 btrfs_mark_buffer_dirty(leaf); 1825 btrfs_mark_buffer_dirty(leaf);
1826 btrfs_release_path(path);
1835 1827
1836 inode_add_bytes(inode, num_bytes); 1828 inode_add_bytes(inode, num_bytes);
1837 1829
@@ -1929,11 +1921,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1929 ordered_extent->len, 1921 ordered_extent->len,
1930 compress_type, 0, 0, 1922 compress_type, 0, 0,
1931 BTRFS_FILE_EXTENT_REG); 1923 BTRFS_FILE_EXTENT_REG);
1932 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
1933 ordered_extent->file_offset,
1934 ordered_extent->len);
1935 } 1924 }
1936 1925 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
1926 ordered_extent->file_offset, ordered_extent->len,
1927 trans->transid);
1937 if (ret < 0) { 1928 if (ret < 0) {
1938 btrfs_abort_transaction(trans, root, ret); 1929 btrfs_abort_transaction(trans, root, ret);
1939 goto out_unlock; 1930 goto out_unlock;
@@ -1949,6 +1940,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1949 btrfs_abort_transaction(trans, root, ret); 1940 btrfs_abort_transaction(trans, root, ret);
1950 goto out_unlock; 1941 goto out_unlock;
1951 } 1942 }
1943 } else {
1944 btrfs_set_inode_last_trans(trans, inode);
1952 } 1945 }
1953 ret = 0; 1946 ret = 0;
1954out_unlock: 1947out_unlock:
@@ -1958,12 +1951,8 @@ out_unlock:
1958out: 1951out:
1959 if (root != root->fs_info->tree_root) 1952 if (root != root->fs_info->tree_root)
1960 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1953 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1961 if (trans) { 1954 if (trans)
1962 if (nolock) 1955 btrfs_end_transaction(trans, root);
1963 btrfs_end_transaction_nolock(trans, root);
1964 else
1965 btrfs_end_transaction(trans, root);
1966 }
1967 1956
1968 if (ret) 1957 if (ret)
1969 clear_extent_uptodate(io_tree, ordered_extent->file_offset, 1958 clear_extent_uptodate(io_tree, ordered_extent->file_offset,
@@ -2119,7 +2108,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
2119 if (empty) 2108 if (empty)
2120 return; 2109 return;
2121 2110
2122 down_read(&root->fs_info->cleanup_work_sem);
2123 spin_lock(&fs_info->delayed_iput_lock); 2111 spin_lock(&fs_info->delayed_iput_lock);
2124 list_splice_init(&fs_info->delayed_iputs, &list); 2112 list_splice_init(&fs_info->delayed_iputs, &list);
2125 spin_unlock(&fs_info->delayed_iput_lock); 2113 spin_unlock(&fs_info->delayed_iput_lock);
@@ -2130,7 +2118,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
2130 iput(delayed->inode); 2118 iput(delayed->inode);
2131 kfree(delayed); 2119 kfree(delayed);
2132 } 2120 }
2133 up_read(&root->fs_info->cleanup_work_sem);
2134} 2121}
2135 2122
2136enum btrfs_orphan_cleanup_state { 2123enum btrfs_orphan_cleanup_state {
@@ -2198,7 +2185,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2198 int ret; 2185 int ret;
2199 2186
2200 if (!root->orphan_block_rsv) { 2187 if (!root->orphan_block_rsv) {
2201 block_rsv = btrfs_alloc_block_rsv(root); 2188 block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
2202 if (!block_rsv) 2189 if (!block_rsv)
2203 return -ENOMEM; 2190 return -ENOMEM;
2204 } 2191 }
@@ -2225,7 +2212,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2225 insert = 1; 2212 insert = 1;
2226#endif 2213#endif
2227 insert = 1; 2214 insert = 1;
2228 atomic_dec(&root->orphan_inodes); 2215 atomic_inc(&root->orphan_inodes);
2229 } 2216 }
2230 2217
2231 if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 2218 if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
@@ -2590,6 +2577,18 @@ static void btrfs_read_locked_inode(struct inode *inode)
2590 2577
2591 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 2578 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
2592 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 2579 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
2580 BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
2581
2582 /*
2583 * If we were modified in the current generation and evicted from memory
2584 * and then re-read we need to do a full sync since we don't have any
2585 * idea about which extents were modified before we were evicted from
2586 * cache.
2587 */
2588 if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
2589 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2590 &BTRFS_I(inode)->runtime_flags);
2591
2593 inode->i_version = btrfs_inode_sequence(leaf, inode_item); 2592 inode->i_version = btrfs_inode_sequence(leaf, inode_item);
2594 inode->i_generation = BTRFS_I(inode)->generation; 2593 inode->i_generation = BTRFS_I(inode)->generation;
2595 inode->i_rdev = 0; 2594 inode->i_rdev = 0;
@@ -2894,7 +2893,6 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2894 struct btrfs_trans_handle *trans; 2893 struct btrfs_trans_handle *trans;
2895 struct btrfs_root *root = BTRFS_I(dir)->root; 2894 struct btrfs_root *root = BTRFS_I(dir)->root;
2896 struct btrfs_path *path; 2895 struct btrfs_path *path;
2897 struct btrfs_inode_ref *ref;
2898 struct btrfs_dir_item *di; 2896 struct btrfs_dir_item *di;
2899 struct inode *inode = dentry->d_inode; 2897 struct inode *inode = dentry->d_inode;
2900 u64 index; 2898 u64 index;
@@ -3008,17 +3006,17 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
3008 } 3006 }
3009 btrfs_release_path(path); 3007 btrfs_release_path(path);
3010 3008
3011 ref = btrfs_lookup_inode_ref(trans, root, path, 3009 ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name,
3012 dentry->d_name.name, dentry->d_name.len, 3010 dentry->d_name.len, ino, dir_ino, 0,
3013 ino, dir_ino, 0); 3011 &index);
3014 if (IS_ERR(ref)) { 3012 if (ret) {
3015 err = PTR_ERR(ref); 3013 err = ret;
3016 goto out; 3014 goto out;
3017 } 3015 }
3018 BUG_ON(!ref); /* Logic error */ 3016
3019 if (check_path_shared(root, path)) 3017 if (check_path_shared(root, path))
3020 goto out; 3018 goto out;
3021 index = btrfs_inode_ref_index(path->nodes[0], ref); 3019
3022 btrfs_release_path(path); 3020 btrfs_release_path(path);
3023 3021
3024 /* 3022 /*
@@ -3061,7 +3059,7 @@ out:
3061static void __unlink_end_trans(struct btrfs_trans_handle *trans, 3059static void __unlink_end_trans(struct btrfs_trans_handle *trans,
3062 struct btrfs_root *root) 3060 struct btrfs_root *root)
3063{ 3061{
3064 if (trans->block_rsv == &root->fs_info->global_block_rsv) { 3062 if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) {
3065 btrfs_block_rsv_release(root, trans->block_rsv, 3063 btrfs_block_rsv_release(root, trans->block_rsv,
3066 trans->bytes_reserved); 3064 trans->bytes_reserved);
3067 trans->block_rsv = &root->fs_info->trans_block_rsv; 3065 trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -3191,9 +3189,10 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3191 struct btrfs_trans_handle *trans; 3189 struct btrfs_trans_handle *trans;
3192 unsigned long nr = 0; 3190 unsigned long nr = 0;
3193 3191
3194 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || 3192 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
3195 btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
3196 return -ENOTEMPTY; 3193 return -ENOTEMPTY;
3194 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
3195 return -EPERM;
3197 3196
3198 trans = __unlink_start_trans(dir, dentry); 3197 trans = __unlink_start_trans(dir, dentry);
3199 if (IS_ERR(trans)) 3198 if (IS_ERR(trans))
@@ -3267,8 +3266,13 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3267 return -ENOMEM; 3266 return -ENOMEM;
3268 path->reada = -1; 3267 path->reada = -1;
3269 3268
3269 /*
3270 * We want to drop from the next block forward in case this new size is
3271 * not block aligned since we will be keeping the last block of the
3272 * extent just the way it is.
3273 */
3270 if (root->ref_cows || root == root->fs_info->tree_root) 3274 if (root->ref_cows || root == root->fs_info->tree_root)
3271 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 3275 btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0);
3272 3276
3273 /* 3277 /*
3274 * This function is also used to drop the items in the log tree before 3278 * This function is also used to drop the items in the log tree before
@@ -3429,12 +3433,6 @@ delete:
3429 3433
3430 if (path->slots[0] == 0 || 3434 if (path->slots[0] == 0 ||
3431 path->slots[0] != pending_del_slot) { 3435 path->slots[0] != pending_del_slot) {
3432 if (root->ref_cows &&
3433 BTRFS_I(inode)->location.objectid !=
3434 BTRFS_FREE_INO_OBJECTID) {
3435 err = -EAGAIN;
3436 goto out;
3437 }
3438 if (pending_del_nr) { 3436 if (pending_del_nr) {
3439 ret = btrfs_del_items(trans, root, path, 3437 ret = btrfs_del_items(trans, root, path,
3440 pending_del_slot, 3438 pending_del_slot,
@@ -3465,12 +3463,20 @@ error:
3465} 3463}
3466 3464
3467/* 3465/*
3468 * taken from block_truncate_page, but does cow as it zeros out 3466 * btrfs_truncate_page - read, zero a chunk and write a page
3469 * any bytes left in the last page in the file. 3467 * @inode - inode that we're zeroing
3468 * @from - the offset to start zeroing
3469 * @len - the length to zero; 0 zeroes from the offset to the end of
3470 * the page
3471 * @front - zero up to the offset instead of from the offset on
3472 *
3473 * This will find the page for the "from" offset, COW it, and zero the
3474 * part we want zeroed. This is used with truncate and hole punching.
3470 */ 3475 */
3471static int btrfs_truncate_page(struct address_space *mapping, loff_t from) 3476int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
3477 int front)
3472{ 3478{
3473 struct inode *inode = mapping->host; 3479 struct address_space *mapping = inode->i_mapping;
3474 struct btrfs_root *root = BTRFS_I(inode)->root; 3480 struct btrfs_root *root = BTRFS_I(inode)->root;
3475 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3481 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3476 struct btrfs_ordered_extent *ordered; 3482 struct btrfs_ordered_extent *ordered;
@@ -3485,7 +3491,8 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3485 u64 page_start; 3491 u64 page_start;
3486 u64 page_end; 3492 u64 page_end;
3487 3493
3488 if ((offset & (blocksize - 1)) == 0) 3494 if ((offset & (blocksize - 1)) == 0 &&
3495 (!len || ((len & (blocksize - 1)) == 0)))
3489 goto out; 3496 goto out;
3490 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 3497 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
3491 if (ret) 3498 if (ret)
@@ -3532,7 +3539,8 @@ again:
3532 } 3539 }
3533 3540
3534 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 3541 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
3535 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 3542 EXTENT_DIRTY | EXTENT_DELALLOC |
3543 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
3536 0, 0, &cached_state, GFP_NOFS); 3544 0, 0, &cached_state, GFP_NOFS);
3537 3545
3538 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 3546 ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
@@ -3545,8 +3553,13 @@ again:
3545 3553
3546 ret = 0; 3554 ret = 0;
3547 if (offset != PAGE_CACHE_SIZE) { 3555 if (offset != PAGE_CACHE_SIZE) {
3556 if (!len)
3557 len = PAGE_CACHE_SIZE - offset;
3548 kaddr = kmap(page); 3558 kaddr = kmap(page);
3549 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); 3559 if (front)
3560 memset(kaddr, 0, offset);
3561 else
3562 memset(kaddr + offset, 0, len);
3550 flush_dcache_page(page); 3563 flush_dcache_page(page);
3551 kunmap(page); 3564 kunmap(page);
3552 } 3565 }
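
The front parameter added to btrfs_truncate_page() picks which side of the in-page offset is zeroed: the bytes before it (the last page of a punched hole) or len bytes from it onward (truncate and the first page of a hole). The zeroing logic in isolation:

#include <string.h>

void zero_partial(char *page, unsigned page_size,
                  unsigned offset, unsigned len, int front)
{
        if (!len)
                len = page_size - offset;       /* default: to end of page   */
        if (front)
                memset(page, 0, offset);        /* zero [0, offset)          */
        else
                memset(page + offset, 0, len);  /* zero [offset, offset+len) */
}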
@@ -3577,6 +3590,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3577 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3590 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3578 struct extent_map *em = NULL; 3591 struct extent_map *em = NULL;
3579 struct extent_state *cached_state = NULL; 3592 struct extent_state *cached_state = NULL;
3593 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3580 u64 mask = root->sectorsize - 1; 3594 u64 mask = root->sectorsize - 1;
3581 u64 hole_start = (oldsize + mask) & ~mask; 3595 u64 hole_start = (oldsize + mask) & ~mask;
3582 u64 block_end = (size + mask) & ~mask; 3596 u64 block_end = (size + mask) & ~mask;
@@ -3613,7 +3627,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3613 last_byte = min(extent_map_end(em), block_end); 3627 last_byte = min(extent_map_end(em), block_end);
3614 last_byte = (last_byte + mask) & ~mask; 3628 last_byte = (last_byte + mask) & ~mask;
3615 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 3629 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3616 u64 hint_byte = 0; 3630 struct extent_map *hole_em;
3617 hole_size = last_byte - cur_offset; 3631 hole_size = last_byte - cur_offset;
3618 3632
3619 trans = btrfs_start_transaction(root, 3); 3633 trans = btrfs_start_transaction(root, 3);
@@ -3622,9 +3636,9 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3622 break; 3636 break;
3623 } 3637 }
3624 3638
3625 err = btrfs_drop_extents(trans, inode, cur_offset, 3639 err = btrfs_drop_extents(trans, root, inode,
3626 cur_offset + hole_size, 3640 cur_offset,
3627 &hint_byte, 1); 3641 cur_offset + hole_size, 1);
3628 if (err) { 3642 if (err) {
3629 btrfs_abort_transaction(trans, root, err); 3643 btrfs_abort_transaction(trans, root, err);
3630 btrfs_end_transaction(trans, root); 3644 btrfs_end_transaction(trans, root);
@@ -3641,9 +3655,39 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3641 break; 3655 break;
3642 } 3656 }
3643 3657
3644 btrfs_drop_extent_cache(inode, hole_start, 3658 btrfs_drop_extent_cache(inode, cur_offset,
3645 last_byte - 1, 0); 3659 cur_offset + hole_size - 1, 0);
3660 hole_em = alloc_extent_map();
3661 if (!hole_em) {
3662 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3663 &BTRFS_I(inode)->runtime_flags);
3664 goto next;
3665 }
3666 hole_em->start = cur_offset;
3667 hole_em->len = hole_size;
3668 hole_em->orig_start = cur_offset;
3646 3669
3670 hole_em->block_start = EXTENT_MAP_HOLE;
3671 hole_em->block_len = 0;
3672 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
3673 hole_em->compress_type = BTRFS_COMPRESS_NONE;
3674 hole_em->generation = trans->transid;
3675
3676 while (1) {
3677 write_lock(&em_tree->lock);
3678 err = add_extent_mapping(em_tree, hole_em);
3679 if (!err)
3680 list_move(&hole_em->list,
3681 &em_tree->modified_extents);
3682 write_unlock(&em_tree->lock);
3683 if (err != -EEXIST)
3684 break;
3685 btrfs_drop_extent_cache(inode, cur_offset,
3686 cur_offset +
3687 hole_size - 1, 0);
3688 }
3689 free_extent_map(hole_em);
3690next:
3647 btrfs_update_inode(trans, root, inode); 3691 btrfs_update_inode(trans, root, inode);
3648 btrfs_end_transaction(trans, root); 3692 btrfs_end_transaction(trans, root);
3649 } 3693 }
@@ -3768,26 +3812,22 @@ void btrfs_evict_inode(struct inode *inode)
3768 goto no_delete; 3812 goto no_delete;
3769 } 3813 }
3770 3814
3771 rsv = btrfs_alloc_block_rsv(root); 3815 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
3772 if (!rsv) { 3816 if (!rsv) {
3773 btrfs_orphan_del(NULL, inode); 3817 btrfs_orphan_del(NULL, inode);
3774 goto no_delete; 3818 goto no_delete;
3775 } 3819 }
3776 rsv->size = min_size; 3820 rsv->size = min_size;
3821 rsv->failfast = 1;
3777 global_rsv = &root->fs_info->global_block_rsv; 3822 global_rsv = &root->fs_info->global_block_rsv;
3778 3823
3779 btrfs_i_size_write(inode, 0); 3824 btrfs_i_size_write(inode, 0);
3780 3825
3781 /* 3826 /*
3782 * This is a bit simpler than btrfs_truncate since 3827 * This is a bit simpler than btrfs_truncate since we've already
3783 * 3828 * reserved our space for our orphan item in the unlink, so we just
3784 * 1) We've already reserved our space for our orphan item in the 3829 * need to reserve some slack space in case we add bytes and update
3785 * unlink. 3830 * inode item when doing the truncate.
3786 * 2) We're going to delete the inode item, so we don't need to update
3787 * it at all.
3788 *
3789 * So we just need to reserve some slack space in case we add bytes when
3790 * doing the truncate.
3791 */ 3831 */
3792 while (1) { 3832 while (1) {
3793 ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); 3833 ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
@@ -3808,7 +3848,7 @@ void btrfs_evict_inode(struct inode *inode)
3808 goto no_delete; 3848 goto no_delete;
3809 } 3849 }
3810 3850
3811 trans = btrfs_start_transaction(root, 0); 3851 trans = btrfs_start_transaction_noflush(root, 1);
3812 if (IS_ERR(trans)) { 3852 if (IS_ERR(trans)) {
3813 btrfs_orphan_del(NULL, inode); 3853 btrfs_orphan_del(NULL, inode);
3814 btrfs_free_block_rsv(root, rsv); 3854 btrfs_free_block_rsv(root, rsv);
@@ -3818,9 +3858,13 @@ void btrfs_evict_inode(struct inode *inode)
3818 trans->block_rsv = rsv; 3858 trans->block_rsv = rsv;
3819 3859
3820 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3860 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3821 if (ret != -EAGAIN) 3861 if (ret != -ENOSPC)
3822 break; 3862 break;
3823 3863
3864 trans->block_rsv = &root->fs_info->trans_block_rsv;
3865 ret = btrfs_update_inode(trans, root, inode);
3866 BUG_ON(ret);
3867
3824 nr = trans->blocks_used; 3868 nr = trans->blocks_used;
3825 btrfs_end_transaction(trans, root); 3869 btrfs_end_transaction(trans, root);
3826 trans = NULL; 3870 trans = NULL;
@@ -4470,10 +4514,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4470 trans = btrfs_join_transaction(root); 4514 trans = btrfs_join_transaction(root);
4471 if (IS_ERR(trans)) 4515 if (IS_ERR(trans))
4472 return PTR_ERR(trans); 4516 return PTR_ERR(trans);
4473 if (nolock) 4517 ret = btrfs_commit_transaction(trans, root);
4474 ret = btrfs_end_transaction_nolock(trans, root);
4475 else
4476 ret = btrfs_commit_transaction(trans, root);
4477 } 4518 }
4478 return ret; 4519 return ret;
4479} 4520}
@@ -4671,6 +4712,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4671 BTRFS_I(inode)->generation = trans->transid; 4712 BTRFS_I(inode)->generation = trans->transid;
4672 inode->i_generation = BTRFS_I(inode)->generation; 4713 inode->i_generation = BTRFS_I(inode)->generation;
4673 4714
4715 /*
4716 * We could have gotten an inode number from somebody who was fsynced
4717 * and then removed in this same transaction, so let's just set full
4718 * sync since it will be a full sync anyway and this will blow away the
4719 * old info in the log.
4720 */
4721 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
4722
4674 if (S_ISDIR(mode)) 4723 if (S_ISDIR(mode))
4675 owner = 0; 4724 owner = 0;
4676 else 4725 else
@@ -4680,6 +4729,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4680 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); 4729 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
4681 key[0].offset = 0; 4730 key[0].offset = 0;
4682 4731
4732 /*
4733 * Start new inodes with an inode_ref. This is slightly more
4734 * efficient for small numbers of hard links since they will
4735 * be packed into one item. Extended refs will kick in if we
4736 * add more hard links than can fit in the ref item.
4737 */
4683 key[1].objectid = objectid; 4738 key[1].objectid = objectid;
4684 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); 4739 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
4685 key[1].offset = ref_objectid; 4740 key[1].offset = ref_objectid;
@@ -4986,7 +5041,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4986 if (root->objectid != BTRFS_I(inode)->root->objectid) 5041 if (root->objectid != BTRFS_I(inode)->root->objectid)
4987 return -EXDEV; 5042 return -EXDEV;
4988 5043
4989 if (inode->i_nlink == ~0U) 5044 if (inode->i_nlink >= BTRFS_LINK_MAX)
4990 return -EMLINK; 5045 return -EMLINK;
4991 5046
4992 err = btrfs_set_inode_index(dir, &index); 5047 err = btrfs_set_inode_index(dir, &index);
@@ -5450,7 +5505,8 @@ insert:
5450 write_unlock(&em_tree->lock); 5505 write_unlock(&em_tree->lock);
5451out: 5506out:
5452 5507
5453 trace_btrfs_get_extent(root, em); 5508 if (em)
5509 trace_btrfs_get_extent(root, em);
5454 5510
5455 if (path) 5511 if (path)
5456 btrfs_free_path(path); 5512 btrfs_free_path(path);
@@ -5836,6 +5892,48 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
5836 return ret; 5892 return ret;
5837} 5893}
5838 5894
5895static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
5896 u64 len, u64 orig_start,
5897 u64 block_start, u64 block_len,
5898 int type)
5899{
5900 struct extent_map_tree *em_tree;
5901 struct extent_map *em;
5902 struct btrfs_root *root = BTRFS_I(inode)->root;
5903 int ret;
5904
5905 em_tree = &BTRFS_I(inode)->extent_tree;
5906 em = alloc_extent_map();
5907 if (!em)
5908 return ERR_PTR(-ENOMEM);
5909
5910 em->start = start;
5911 em->orig_start = orig_start;
5912 em->len = len;
5913 em->block_len = block_len;
5914 em->block_start = block_start;
5915 em->bdev = root->fs_info->fs_devices->latest_bdev;
5916 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5917 if (type == BTRFS_ORDERED_PREALLOC)
5918 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
5919
5920 do {
5921 btrfs_drop_extent_cache(inode, em->start,
5922 em->start + em->len - 1, 0);
5923 write_lock(&em_tree->lock);
5924 ret = add_extent_mapping(em_tree, em);
5925 write_unlock(&em_tree->lock);
5926 } while (ret == -EEXIST);
5927
5928 if (ret) {
5929 free_extent_map(em);
5930 return ERR_PTR(ret);
5931 }
5932
5933 return em;
5934}
5935
5936
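
The do/while in create_pinned_em is a small reusable pattern: evict whatever cached mapping overlaps the range until the insert stops failing with -EEXIST. A standalone sketch under assumed helper names:

#include <errno.h>

struct map { unsigned long long start, len; };
struct map_tree;

void drop_cached_range(struct map_tree *t, unsigned long long start,
                       unsigned long long end);
int add_mapping(struct map_tree *t, struct map *m); /* -EEXIST on overlap */

static int pin_mapping(struct map_tree *t, struct map *m)
{
        int ret;

        do {
                /* end is inclusive, matching the extent-map helpers */
                drop_cached_range(t, m->start, m->start + m->len - 1);
                ret = add_mapping(t, m);
        } while (ret == -EEXIST);

        return ret; /* 0 on success, or an error other than -EEXIST */
}
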
5839static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, 5937static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5840 struct buffer_head *bh_result, int create) 5938 struct buffer_head *bh_result, int create)
5841{ 5939{
@@ -5950,6 +6048,19 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5950 goto must_cow; 6048 goto must_cow;
5951 6049
5952 if (can_nocow_odirect(trans, inode, start, len) == 1) { 6050 if (can_nocow_odirect(trans, inode, start, len) == 1) {
6051 u64 orig_start = em->start;
6052
6053 if (type == BTRFS_ORDERED_PREALLOC) {
6054 free_extent_map(em);
6055 em = create_pinned_em(inode, start, len,
6056 orig_start,
6057 block_start, len, type);
6058 if (IS_ERR(em)) {
6059 btrfs_end_transaction(trans, root);
6060 goto unlock_err;
6061 }
6062 }
6063
5953 ret = btrfs_add_ordered_extent_dio(inode, start, 6064 ret = btrfs_add_ordered_extent_dio(inode, start,
5954 block_start, len, len, type); 6065 block_start, len, len, type);
5955 btrfs_end_transaction(trans, root); 6066 btrfs_end_transaction(trans, root);
@@ -5999,7 +6110,8 @@ unlock:
5999 if (lockstart < lockend) { 6110 if (lockstart < lockend) {
6000 if (create && len < lockend - lockstart) { 6111 if (create && len < lockend - lockstart) {
6001 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 6112 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6002 lockstart + len - 1, unlock_bits, 1, 0, 6113 lockstart + len - 1,
6114 unlock_bits | EXTENT_DEFRAG, 1, 0,
6003 &cached_state, GFP_NOFS); 6115 &cached_state, GFP_NOFS);
6004 /* 6116 /*
6005 * Beside unlock, we also need to cleanup reserved space 6117 * Beside unlock, we also need to cleanup reserved space
@@ -6007,8 +6119,8 @@ unlock:
6007 */ 6119 */
6008 clear_extent_bit(&BTRFS_I(inode)->io_tree, 6120 clear_extent_bit(&BTRFS_I(inode)->io_tree,
6009 lockstart + len, lockend, 6121 lockstart + len, lockend,
6010 unlock_bits | EXTENT_DO_ACCOUNTING, 6122 unlock_bits | EXTENT_DO_ACCOUNTING |
6011 1, 0, NULL, GFP_NOFS); 6123 EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS);
6012 } else { 6124 } else {
6013 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 6125 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6014 lockend, unlock_bits, 1, 0, 6126 lockend, unlock_bits, 1, 0,
@@ -6573,8 +6685,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
6573 */ 6685 */
6574 clear_extent_bit(tree, page_start, page_end, 6686 clear_extent_bit(tree, page_start, page_end,
6575 EXTENT_DIRTY | EXTENT_DELALLOC | 6687 EXTENT_DIRTY | EXTENT_DELALLOC |
6576 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, 6688 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
6577 &cached_state, GFP_NOFS); 6689 EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS);
6578 /* 6690 /*
6579 * whoever cleared the private bit is responsible 6691 * whoever cleared the private bit is responsible
6580 * for the finish_ordered_io 6692 * for the finish_ordered_io
@@ -6590,7 +6702,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
6590 } 6702 }
6591 clear_extent_bit(tree, page_start, page_end, 6703 clear_extent_bit(tree, page_start, page_end,
6592 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 6704 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
6593 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS); 6705 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
6706 &cached_state, GFP_NOFS);
6594 __btrfs_releasepage(page, GFP_NOFS); 6707 __btrfs_releasepage(page, GFP_NOFS);
6595 6708
6596 ClearPageChecked(page); 6709 ClearPageChecked(page);
@@ -6687,7 +6800,8 @@ again:
6687 * prepare_pages in the normal write path. 6800 * prepare_pages in the normal write path.
6688 */ 6801 */
6689 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 6802 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
6690 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 6803 EXTENT_DIRTY | EXTENT_DELALLOC |
6804 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
6691 0, 0, &cached_state, GFP_NOFS); 6805 0, 0, &cached_state, GFP_NOFS);
6692 6806
6693 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 6807 ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
@@ -6718,6 +6832,7 @@ again:
6718 6832
6719 BTRFS_I(inode)->last_trans = root->fs_info->generation; 6833 BTRFS_I(inode)->last_trans = root->fs_info->generation;
6720 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 6834 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
6835 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
6721 6836
6722 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); 6837 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
6723 6838
@@ -6745,7 +6860,7 @@ static int btrfs_truncate(struct inode *inode)
6745 u64 mask = root->sectorsize - 1; 6860 u64 mask = root->sectorsize - 1;
6746 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 6861 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
6747 6862
6748 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 6863 ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
6749 if (ret) 6864 if (ret)
6750 return ret; 6865 return ret;
6751 6866
@@ -6788,10 +6903,11 @@ static int btrfs_truncate(struct inode *inode)
6788 * 3) fs_info->trans_block_rsv - this will have 1 items worth left for 6903 * 3) fs_info->trans_block_rsv - this will have 1 items worth left for
6789 * updating the inode. 6904 * updating the inode.
6790 */ 6905 */
6791 rsv = btrfs_alloc_block_rsv(root); 6906 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
6792 if (!rsv) 6907 if (!rsv)
6793 return -ENOMEM; 6908 return -ENOMEM;
6794 rsv->size = min_size; 6909 rsv->size = min_size;
6910 rsv->failfast = 1;
6795 6911
6796 /* 6912 /*
6797 * 1 for the truncate slack space 6913 * 1 for the truncate slack space
@@ -6837,36 +6953,21 @@ static int btrfs_truncate(struct inode *inode)
6837 &BTRFS_I(inode)->runtime_flags)) 6953 &BTRFS_I(inode)->runtime_flags))
6838 btrfs_add_ordered_operation(trans, root, inode); 6954 btrfs_add_ordered_operation(trans, root, inode);
6839 6955
6840 while (1) { 6956 /*
6841 ret = btrfs_block_rsv_refill(root, rsv, min_size); 6957 * So if we truncate and then write and fsync we normally would just
6842 if (ret) { 6958 * write the extents that changed, which is a problem if we need to
6843 /* 6959 * first truncate that entire inode. So set this flag so we write out
6844 * This can only happen with the original transaction we 6960 * all of the extents in the inode to the sync log so we're completely
6845 * started above, every other time we shouldn't have a 6961 * safe.
6846 * transaction started yet. 6962 */
6847 */ 6963 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
6848 if (ret == -EAGAIN) 6964 trans->block_rsv = rsv;
6849 goto end_trans;
6850 err = ret;
6851 break;
6852 }
6853
6854 if (!trans) {
6855 /* Just need the 1 for updating the inode */
6856 trans = btrfs_start_transaction(root, 1);
6857 if (IS_ERR(trans)) {
6858 ret = err = PTR_ERR(trans);
6859 trans = NULL;
6860 break;
6861 }
6862 }
6863
6864 trans->block_rsv = rsv;
6865 6965
6966 while (1) {
6866 ret = btrfs_truncate_inode_items(trans, root, inode, 6967 ret = btrfs_truncate_inode_items(trans, root, inode,
6867 inode->i_size, 6968 inode->i_size,
6868 BTRFS_EXTENT_DATA_KEY); 6969 BTRFS_EXTENT_DATA_KEY);
6869 if (ret != -EAGAIN) { 6970 if (ret != -ENOSPC) {
6870 err = ret; 6971 err = ret;
6871 break; 6972 break;
6872 } 6973 }
@@ -6877,11 +6978,22 @@ static int btrfs_truncate(struct inode *inode)
6877 err = ret; 6978 err = ret;
6878 break; 6979 break;
6879 } 6980 }
6880end_trans: 6981
6881 nr = trans->blocks_used; 6982 nr = trans->blocks_used;
6882 btrfs_end_transaction(trans, root); 6983 btrfs_end_transaction(trans, root);
6883 trans = NULL;
6884 btrfs_btree_balance_dirty(root, nr); 6984 btrfs_btree_balance_dirty(root, nr);
6985
6986 trans = btrfs_start_transaction(root, 2);
6987 if (IS_ERR(trans)) {
6988 ret = err = PTR_ERR(trans);
6989 trans = NULL;
6990 break;
6991 }
6992
6993 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
6994 rsv, min_size);
6995 BUG_ON(ret); /* shouldn't happen */
6996 trans->block_rsv = rsv;
6885 } 6997 }
6886 6998
6887 if (ret == 0 && inode->i_nlink > 0) { 6999 if (ret == 0 && inode->i_nlink > 0) {
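
Stripped of the btrfs specifics, the reworked loop drains work in bounded passes and restarts its "transaction" whenever the per-pass reservation runs out. A runnable toy model of that control flow:

#include <errno.h>
#include <stdio.h>

/* Consume items until the per-pass budget is exhausted; -ENOSPC tells
 * the caller to end the pass and start a fresh one. */
static int truncate_pass(int *items, int budget)
{
        while (*items > 0) {
                if (budget-- == 0)
                        return -ENOSPC;
                (*items)--;
        }
        return 0;
}

int main(void)
{
        int items = 10, ret;

        do {
                /* each pass gets a fresh reservation, like the migrated
                 * block_rsv in the loop above */
                ret = truncate_pass(&items, 3);
        } while (ret == -ENOSPC);

        printf("done: ret=%d items=%d\n", ret, items);
        return 0;
}
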
@@ -6965,6 +7077,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6965 ei->csum_bytes = 0; 7077 ei->csum_bytes = 0;
6966 ei->index_cnt = (u64)-1; 7078 ei->index_cnt = (u64)-1;
6967 ei->last_unlink_trans = 0; 7079 ei->last_unlink_trans = 0;
7080 ei->last_log_commit = 0;
6968 7081
6969 spin_lock_init(&ei->lock); 7082 spin_lock_init(&ei->lock);
6970 ei->outstanding_extents = 0; 7083 ei->outstanding_extents = 0;
@@ -7095,31 +7208,31 @@ void btrfs_destroy_cachep(void)
7095 7208
7096int btrfs_init_cachep(void) 7209int btrfs_init_cachep(void)
7097{ 7210{
7098 btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", 7211 btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
7099 sizeof(struct btrfs_inode), 0, 7212 sizeof(struct btrfs_inode), 0,
7100 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); 7213 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
7101 if (!btrfs_inode_cachep) 7214 if (!btrfs_inode_cachep)
7102 goto fail; 7215 goto fail;
7103 7216
7104 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", 7217 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
7105 sizeof(struct btrfs_trans_handle), 0, 7218 sizeof(struct btrfs_trans_handle), 0,
7106 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7219 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7107 if (!btrfs_trans_handle_cachep) 7220 if (!btrfs_trans_handle_cachep)
7108 goto fail; 7221 goto fail;
7109 7222
7110 btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", 7223 btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
7111 sizeof(struct btrfs_transaction), 0, 7224 sizeof(struct btrfs_transaction), 0,
7112 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7225 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7113 if (!btrfs_transaction_cachep) 7226 if (!btrfs_transaction_cachep)
7114 goto fail; 7227 goto fail;
7115 7228
7116 btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", 7229 btrfs_path_cachep = kmem_cache_create("btrfs_path",
7117 sizeof(struct btrfs_path), 0, 7230 sizeof(struct btrfs_path), 0,
7118 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7231 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7119 if (!btrfs_path_cachep) 7232 if (!btrfs_path_cachep)
7120 goto fail; 7233 goto fail;
7121 7234
7122 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache", 7235 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
7123 sizeof(struct btrfs_free_space), 0, 7236 sizeof(struct btrfs_free_space), 0,
7124 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7237 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7125 if (!btrfs_free_space_cachep) 7238 if (!btrfs_free_space_cachep)
@@ -7513,6 +7626,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
7513 loff_t actual_len, u64 *alloc_hint, 7626 loff_t actual_len, u64 *alloc_hint,
7514 struct btrfs_trans_handle *trans) 7627 struct btrfs_trans_handle *trans)
7515{ 7628{
7629 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
7630 struct extent_map *em;
7516 struct btrfs_root *root = BTRFS_I(inode)->root; 7631 struct btrfs_root *root = BTRFS_I(inode)->root;
7517 struct btrfs_key ins; 7632 struct btrfs_key ins;
7518 u64 cur_offset = start; 7633 u64 cur_offset = start;
@@ -7553,6 +7668,37 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
7553 btrfs_drop_extent_cache(inode, cur_offset, 7668 btrfs_drop_extent_cache(inode, cur_offset,
7554 cur_offset + ins.offset -1, 0); 7669 cur_offset + ins.offset -1, 0);
7555 7670
7671 em = alloc_extent_map();
7672 if (!em) {
7673 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
7674 &BTRFS_I(inode)->runtime_flags);
7675 goto next;
7676 }
7677
7678 em->start = cur_offset;
7679 em->orig_start = cur_offset;
7680 em->len = ins.offset;
7681 em->block_start = ins.objectid;
7682 em->block_len = ins.offset;
7683 em->bdev = root->fs_info->fs_devices->latest_bdev;
7684 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
7685 em->generation = trans->transid;
7686
7687 while (1) {
7688 write_lock(&em_tree->lock);
7689 ret = add_extent_mapping(em_tree, em);
7690 if (!ret)
7691 list_move(&em->list,
7692 &em_tree->modified_extents);
7693 write_unlock(&em_tree->lock);
7694 if (ret != -EEXIST)
7695 break;
7696 btrfs_drop_extent_cache(inode, cur_offset,
7697 cur_offset + ins.offset - 1,
7698 0);
7699 }
7700 free_extent_map(em);
7701next:
7556 num_bytes -= ins.offset; 7702 num_bytes -= ins.offset;
7557 cur_offset += ins.offset; 7703 cur_offset += ins.offset;
7558 *alloc_hint = ins.objectid + ins.offset; 7704 *alloc_hint = ins.objectid + ins.offset;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 47127c1bd290..61168805f175 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -181,6 +181,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
181 int ret; 181 int ret;
182 u64 ip_oldflags; 182 u64 ip_oldflags;
183 unsigned int i_oldflags; 183 unsigned int i_oldflags;
184 umode_t mode;
184 185
185 if (btrfs_root_readonly(root)) 186 if (btrfs_root_readonly(root))
186 return -EROFS; 187 return -EROFS;
@@ -203,6 +204,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
203 204
204 ip_oldflags = ip->flags; 205 ip_oldflags = ip->flags;
205 i_oldflags = inode->i_flags; 206 i_oldflags = inode->i_flags;
207 mode = inode->i_mode;
206 208
207 flags = btrfs_mask_flags(inode->i_mode, flags); 209 flags = btrfs_mask_flags(inode->i_mode, flags);
208 oldflags = btrfs_flags_to_ioctl(ip->flags); 210 oldflags = btrfs_flags_to_ioctl(ip->flags);
@@ -237,10 +239,31 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
237 ip->flags |= BTRFS_INODE_DIRSYNC; 239 ip->flags |= BTRFS_INODE_DIRSYNC;
238 else 240 else
239 ip->flags &= ~BTRFS_INODE_DIRSYNC; 241 ip->flags &= ~BTRFS_INODE_DIRSYNC;
240 if (flags & FS_NOCOW_FL) 242 if (flags & FS_NOCOW_FL) {
241 ip->flags |= BTRFS_INODE_NODATACOW; 243 if (S_ISREG(mode)) {
242 else 244 /*
243 ip->flags &= ~BTRFS_INODE_NODATACOW; 245 * It's safe to turn csums off here: no extents exist yet.
246 * Otherwise we want the flag to reflect the real COW
247 * status of the file, so we do not set it.
248 */
249 if (inode->i_size == 0)
250 ip->flags |= BTRFS_INODE_NODATACOW
251 | BTRFS_INODE_NODATASUM;
252 } else {
253 ip->flags |= BTRFS_INODE_NODATACOW;
254 }
255 } else {
256 /*
257 * Revert back under the same assumptions as above
258 */
259 if (S_ISREG(mode)) {
260 if (inode->i_size == 0)
261 ip->flags &= ~(BTRFS_INODE_NODATACOW
262 | BTRFS_INODE_NODATASUM);
263 } else {
264 ip->flags &= ~BTRFS_INODE_NODATACOW;
265 }
266 }
244 267
245 /* 268 /*
246 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS 269 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
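
The rule the two branches implement can be stated compactly: NODATASUM may only be toggled together with NODATACOW on empty regular files, since checksums for existing extents cannot be added or dropped after the fact. A standalone sketch (flag values are illustrative):

#include <stdbool.h>
#include <stdint.h>

#define FL_NODATACOW (1u << 0)
#define FL_NODATASUM (1u << 1)

static uint32_t apply_nocow(uint32_t flags, bool want_nocow,
                            bool is_reg, uint64_t i_size)
{
        uint32_t bits = FL_NODATACOW;

        /* csums may only change while no extents exist */
        if (is_reg && i_size == 0)
                bits |= FL_NODATASUM;
        else if (is_reg)
                return flags; /* leave the real COW status visible */

        return want_nocow ? (flags | bits) : (flags & ~bits);
}
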
@@ -516,7 +539,8 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
516 if (!pending_snapshot) 539 if (!pending_snapshot)
517 return -ENOMEM; 540 return -ENOMEM;
518 541
519 btrfs_init_block_rsv(&pending_snapshot->block_rsv); 542 btrfs_init_block_rsv(&pending_snapshot->block_rsv,
543 BTRFS_BLOCK_RSV_TEMP);
520 pending_snapshot->dentry = dentry; 544 pending_snapshot->dentry = dentry;
521 pending_snapshot->root = root; 545 pending_snapshot->root = root;
522 pending_snapshot->readonly = readonly; 546 pending_snapshot->readonly = readonly;
@@ -525,7 +549,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
525 *inherit = NULL; /* take responsibility to free it */ 549 *inherit = NULL; /* take responsibility to free it */
526 } 550 }
527 551
528 trans = btrfs_start_transaction(root->fs_info->extent_root, 5); 552 trans = btrfs_start_transaction(root->fs_info->extent_root, 6);
529 if (IS_ERR(trans)) { 553 if (IS_ERR(trans)) {
530 ret = PTR_ERR(trans); 554 ret = PTR_ERR(trans);
531 goto fail; 555 goto fail;
@@ -614,7 +638,7 @@ static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir)
614 return -ENOENT; 638 return -ENOENT;
615 639
616 BUG_ON(victim->d_parent->d_inode != dir); 640 BUG_ON(victim->d_parent->d_inode != dir);
617 audit_inode_child(victim, dir); 641 audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
618 642
619 error = inode_permission(dir, MAY_WRITE | MAY_EXEC); 643 error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
620 if (error) 644 if (error)
@@ -1022,8 +1046,8 @@ again:
1022 page_start, page_end - 1, 0, &cached_state); 1046 page_start, page_end - 1, 0, &cached_state);
1023 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, 1047 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
1024 page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | 1048 page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
1025 EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, 1049 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
1026 GFP_NOFS); 1050 &cached_state, GFP_NOFS);
1027 1051
1028 if (i_done != page_cnt) { 1052 if (i_done != page_cnt) {
1029 spin_lock(&BTRFS_I(inode)->lock); 1053 spin_lock(&BTRFS_I(inode)->lock);
@@ -1034,8 +1058,8 @@ again:
1034 } 1058 }
1035 1059
1036 1060
1037 btrfs_set_extent_delalloc(inode, page_start, page_end - 1, 1061 set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
1038 &cached_state); 1062 &cached_state, GFP_NOFS);
1039 1063
1040 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 1064 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1041 page_start, page_end - 1, &cached_state, 1065 page_start, page_end - 1, &cached_state,
@@ -2351,7 +2375,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2351 int ret; 2375 int ret;
2352 u64 len = olen; 2376 u64 len = olen;
2353 u64 bs = root->fs_info->sb->s_blocksize; 2377 u64 bs = root->fs_info->sb->s_blocksize;
2354 u64 hint_byte;
2355 2378
2356 /* 2379 /*
2357 * TODO: 2380 * TODO:
@@ -2456,13 +2479,13 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2456 another, and lock file content */ 2479 another, and lock file content */
2457 while (1) { 2480 while (1) {
2458 struct btrfs_ordered_extent *ordered; 2481 struct btrfs_ordered_extent *ordered;
2459 lock_extent(&BTRFS_I(src)->io_tree, off, off+len); 2482 lock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
2460 ordered = btrfs_lookup_first_ordered_extent(src, off+len); 2483 ordered = btrfs_lookup_first_ordered_extent(src, off + len - 1);
2461 if (!ordered && 2484 if (!ordered &&
2462 !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len, 2485 !test_range_bit(&BTRFS_I(src)->io_tree, off, off + len - 1,
2463 EXTENT_DELALLOC, 0, NULL)) 2486 EXTENT_DELALLOC, 0, NULL))
2464 break; 2487 break;
2465 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len); 2488 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
2466 if (ordered) 2489 if (ordered)
2467 btrfs_put_ordered_extent(ordered); 2490 btrfs_put_ordered_extent(ordered);
2468 btrfs_wait_ordered_range(src, off, len); 2491 btrfs_wait_ordered_range(src, off, len);
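
The repeated "+ len - 1" fixes all stem from one convention: the io_tree range API treats the end offset as inclusive, so locking len bytes from off must use off + len - 1 or it spills one byte into the neighbouring range. In one line:

#include <stdint.h>

/* inclusive end of a len-byte range starting at off */
static inline uint64_t range_end(uint64_t off, uint64_t len)
{
        return off + len - 1;
}
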
@@ -2536,7 +2559,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2536 btrfs_release_path(path); 2559 btrfs_release_path(path);
2537 2560
2538 if (key.offset + datal <= off || 2561 if (key.offset + datal <= off ||
2539 key.offset >= off+len) 2562 key.offset >= off + len - 1)
2540 goto next; 2563 goto next;
2541 2564
2542 memcpy(&new_key, &key, sizeof(new_key)); 2565 memcpy(&new_key, &key, sizeof(new_key));
@@ -2574,10 +2597,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2574 datal -= off - key.offset; 2597 datal -= off - key.offset;
2575 } 2598 }
2576 2599
2577 ret = btrfs_drop_extents(trans, inode, 2600 ret = btrfs_drop_extents(trans, root, inode,
2578 new_key.offset, 2601 new_key.offset,
2579 new_key.offset + datal, 2602 new_key.offset + datal,
2580 &hint_byte, 1); 2603 1);
2581 if (ret) { 2604 if (ret) {
2582 btrfs_abort_transaction(trans, root, 2605 btrfs_abort_transaction(trans, root,
2583 ret); 2606 ret);
@@ -2637,8 +2660,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2637 new_key.offset += skip; 2660 new_key.offset += skip;
2638 } 2661 }
2639 2662
2640 if (key.offset + datal > off+len) 2663 if (key.offset + datal > off + len)
2641 trim = key.offset + datal - (off+len); 2664 trim = key.offset + datal - (off + len);
2642 2665
2643 if (comp && (skip || trim)) { 2666 if (comp && (skip || trim)) {
2644 ret = -EINVAL; 2667 ret = -EINVAL;
@@ -2648,10 +2671,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2648 size -= skip + trim; 2671 size -= skip + trim;
2649 datal -= skip + trim; 2672 datal -= skip + trim;
2650 2673
2651 ret = btrfs_drop_extents(trans, inode, 2674 ret = btrfs_drop_extents(trans, root, inode,
2652 new_key.offset, 2675 new_key.offset,
2653 new_key.offset + datal, 2676 new_key.offset + datal,
2654 &hint_byte, 1); 2677 1);
2655 if (ret) { 2678 if (ret) {
2656 btrfs_abort_transaction(trans, root, 2679 btrfs_abort_transaction(trans, root,
2657 ret); 2680 ret);
@@ -2715,7 +2738,7 @@ next:
2715 ret = 0; 2738 ret = 0;
2716out: 2739out:
2717 btrfs_release_path(path); 2740 btrfs_release_path(path);
2718 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len); 2741 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
2719out_unlock: 2742out_unlock:
2720 mutex_unlock(&src->i_mutex); 2743 mutex_unlock(&src->i_mutex);
2721 mutex_unlock(&inode->i_mutex); 2744 mutex_unlock(&inode->i_mutex);
@@ -2850,8 +2873,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2850 return 0; 2873 return 0;
2851} 2874}
2852 2875
2853static void get_block_group_info(struct list_head *groups_list, 2876void btrfs_get_block_group_info(struct list_head *groups_list,
2854 struct btrfs_ioctl_space_info *space) 2877 struct btrfs_ioctl_space_info *space)
2855{ 2878{
2856 struct btrfs_block_group_cache *block_group; 2879 struct btrfs_block_group_cache *block_group;
2857 2880
@@ -2959,8 +2982,8 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2959 down_read(&info->groups_sem); 2982 down_read(&info->groups_sem);
2960 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { 2983 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
2961 if (!list_empty(&info->block_groups[c])) { 2984 if (!list_empty(&info->block_groups[c])) {
2962 get_block_group_info(&info->block_groups[c], 2985 btrfs_get_block_group_info(
2963 &space); 2986 &info->block_groups[c], &space);
2964 memcpy(dest, &space, sizeof(space)); 2987 memcpy(dest, &space, sizeof(space));
2965 dest++; 2988 dest++;
2966 space_args.total_spaces++; 2989 space_args.total_spaces++;
@@ -3208,11 +3231,9 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3208{ 3231{
3209 int ret = 0; 3232 int ret = 0;
3210 int size; 3233 int size;
3211 u64 extent_item_pos;
3212 struct btrfs_ioctl_logical_ino_args *loi; 3234 struct btrfs_ioctl_logical_ino_args *loi;
3213 struct btrfs_data_container *inodes = NULL; 3235 struct btrfs_data_container *inodes = NULL;
3214 struct btrfs_path *path = NULL; 3236 struct btrfs_path *path = NULL;
3215 struct btrfs_key key;
3216 3237
3217 if (!capable(CAP_SYS_ADMIN)) 3238 if (!capable(CAP_SYS_ADMIN))
3218 return -EPERM; 3239 return -EPERM;
@@ -3230,7 +3251,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3230 goto out; 3251 goto out;
3231 } 3252 }
3232 3253
3233 size = min_t(u32, loi->size, 4096); 3254 size = min_t(u32, loi->size, 64 * 1024);
3234 inodes = init_data_container(size); 3255 inodes = init_data_container(size);
3235 if (IS_ERR(inodes)) { 3256 if (IS_ERR(inodes)) {
3236 ret = PTR_ERR(inodes); 3257 ret = PTR_ERR(inodes);
@@ -3238,22 +3259,13 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3238 goto out; 3259 goto out;
3239 } 3260 }
3240 3261
3241 ret = extent_from_logical(root->fs_info, loi->logical, path, &key); 3262 ret = iterate_inodes_from_logical(loi->logical, root->fs_info, path,
3242 btrfs_release_path(path); 3263 build_ino_list, inodes);
3243 3264 if (ret == -EINVAL)
3244 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
3245 ret = -ENOENT; 3265 ret = -ENOENT;
3246 if (ret < 0) 3266 if (ret < 0)
3247 goto out; 3267 goto out;
3248 3268
3249 extent_item_pos = loi->logical - key.objectid;
3250 ret = iterate_extent_inodes(root->fs_info, key.objectid,
3251 extent_item_pos, 0, build_ino_list,
3252 inodes);
3253
3254 if (ret < 0)
3255 goto out;
3256
3257 ret = copy_to_user((void *)(unsigned long)loi->inodes, 3269 ret = copy_to_user((void *)(unsigned long)loi->inodes,
3258 (void *)(unsigned long)inodes, size); 3270 (void *)(unsigned long)inodes, size);
3259 if (ret) 3271 if (ret)
@@ -3261,7 +3273,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3261 3273
3262out: 3274out:
3263 btrfs_free_path(path); 3275 btrfs_free_path(path);
3264 kfree(inodes); 3276 vfree(inodes);
3265 kfree(loi); 3277 kfree(loi);
3266 3278
3267 return ret; 3279 return ret;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 051c7fe551dd..7772f02ba28e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -25,6 +25,8 @@
25#include "btrfs_inode.h" 25#include "btrfs_inode.h"
26#include "extent_io.h" 26#include "extent_io.h"
27 27
28static struct kmem_cache *btrfs_ordered_extent_cache;
29
28static u64 entry_end(struct btrfs_ordered_extent *entry) 30static u64 entry_end(struct btrfs_ordered_extent *entry)
29{ 31{
30 if (entry->file_offset + entry->len < entry->file_offset) 32 if (entry->file_offset + entry->len < entry->file_offset)
@@ -187,7 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
187 struct btrfs_ordered_extent *entry; 189 struct btrfs_ordered_extent *entry;
188 190
189 tree = &BTRFS_I(inode)->ordered_tree; 191 tree = &BTRFS_I(inode)->ordered_tree;
190 entry = kzalloc(sizeof(*entry), GFP_NOFS); 192 entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
191 if (!entry) 193 if (!entry)
192 return -ENOMEM; 194 return -ENOMEM;
193 195
@@ -421,7 +423,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
421 list_del(&sum->list); 423 list_del(&sum->list);
422 kfree(sum); 424 kfree(sum);
423 } 425 }
424 kfree(entry); 426 kmem_cache_free(btrfs_ordered_extent_cache, entry);
425 } 427 }
426} 428}
427 429
@@ -466,8 +468,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
466 * wait for all the ordered extents in a root. This is done when balancing 468 * wait for all the ordered extents in a root. This is done when balancing
467 * space between drives. 469 * space between drives.
468 */ 470 */
469void btrfs_wait_ordered_extents(struct btrfs_root *root, 471void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
470 int nocow_only, int delay_iput)
471{ 472{
472 struct list_head splice; 473 struct list_head splice;
473 struct list_head *cur; 474 struct list_head *cur;
@@ -482,15 +483,6 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root,
482 cur = splice.next; 483 cur = splice.next;
483 ordered = list_entry(cur, struct btrfs_ordered_extent, 484 ordered = list_entry(cur, struct btrfs_ordered_extent,
484 root_extent_list); 485 root_extent_list);
485 if (nocow_only &&
486 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
487 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
488 list_move(&ordered->root_extent_list,
489 &root->fs_info->ordered_extents);
490 cond_resched_lock(&root->fs_info->ordered_extent_lock);
491 continue;
492 }
493
494 list_del_init(&ordered->root_extent_list); 486 list_del_init(&ordered->root_extent_list);
495 atomic_inc(&ordered->refs); 487 atomic_inc(&ordered->refs);
496 488
@@ -775,7 +767,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
775 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; 767 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
776 u64 disk_i_size; 768 u64 disk_i_size;
777 u64 new_i_size; 769 u64 new_i_size;
778 u64 i_size_test;
779 u64 i_size = i_size_read(inode); 770 u64 i_size = i_size_read(inode);
780 struct rb_node *node; 771 struct rb_node *node;
781 struct rb_node *prev = NULL; 772 struct rb_node *prev = NULL;
@@ -835,55 +826,30 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
835 break; 826 break;
836 if (test->file_offset >= i_size) 827 if (test->file_offset >= i_size)
837 break; 828 break;
838 if (test->file_offset >= disk_i_size) 829 if (test->file_offset >= disk_i_size) {
830 /*
831 * we don't update disk_i_size now, so record this
832 * pending i_size; otherwise we will not know the real
833 * i_size.
834 */
835 if (test->outstanding_isize < offset)
836 test->outstanding_isize = offset;
837 if (ordered &&
838 ordered->outstanding_isize >
839 test->outstanding_isize)
840 test->outstanding_isize =
841 ordered->outstanding_isize;
839 goto out; 842 goto out;
840 }
841 new_i_size = min_t(u64, offset, i_size);
842
843 /*
844 * at this point, we know we can safely update i_size to at least
845 * the offset from this ordered extent. But, we need to
846 * walk forward and see if ios from higher up in the file have
847 * finished.
848 */
849 if (ordered) {
850 node = rb_next(&ordered->rb_node);
851 } else {
852 if (prev)
853 node = rb_next(prev);
854 else
855 node = rb_first(&tree->tree);
856 }
857
858 /*
859 * We are looking for an area between our current extent and the next
860 * ordered extent to update the i_size to. There are 3 cases here
861 *
862 * 1) We don't actually have anything and we can update to i_size.
863 * 2) We have stuff but they already did their i_size update so again we
864 * can just update to i_size.
865 * 3) We have an outstanding ordered extent so the most we can update
866 * our disk_i_size to is the start of the next offset.
867 */
868 i_size_test = i_size;
869 for (; node; node = rb_next(node)) {
870 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
871
872 if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
873 continue;
874 if (test->file_offset > offset) {
875 i_size_test = test->file_offset;
876 break;
877 } 843 }
878 } 844 }
845 new_i_size = min_t(u64, offset, i_size);
879 846
880 /* 847 /*
881 * i_size_test is the end of a region after this ordered 848 * Some ordered extents may have completed before the current one, and
882 * extent where there are no ordered extents, we can safely set 849 * we hold the real i_size in ->outstanding_isize.
883 * disk_i_size to this.
884 */ 850 */
885 if (i_size_test > offset) 851 if (ordered && ordered->outstanding_isize > new_i_size)
886 new_i_size = min_t(u64, i_size_test, i_size); 852 new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
887 BTRFS_I(inode)->disk_i_size = new_i_size; 853 BTRFS_I(inode)->disk_i_size = new_i_size;
888 ret = 0; 854 ret = 0;
889out: 855out:
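
The forward walk that was removed is replaced by a running maximum: each completing extent that cannot move disk_i_size folds its end offset into the next pending extent's outstanding_isize, and whichever extent finally closes the gap applies the accumulated value. A standalone model of that bookkeeping:

#include <stdint.h>

/* fold one completed extent's end offset into the pending maximum */
static void record_outstanding(uint64_t *outstanding, uint64_t end)
{
        if (*outstanding < end)
                *outstanding = end;
}

/* when the gap closes, disk_i_size may advance to the larger of the
 * current end and everything recorded so far (capped by i_size) */
static uint64_t new_disk_i_size(uint64_t end, uint64_t outstanding,
                                uint64_t i_size)
{
        uint64_t v = end > outstanding ? end : outstanding;
        return v < i_size ? v : i_size;
}
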
@@ -984,3 +950,20 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
984 } 950 }
985 spin_unlock(&root->fs_info->ordered_extent_lock); 951 spin_unlock(&root->fs_info->ordered_extent_lock);
986} 952}
953
954int __init ordered_data_init(void)
955{
956 btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
957 sizeof(struct btrfs_ordered_extent), 0,
958 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
959 NULL);
960 if (!btrfs_ordered_extent_cache)
961 return -ENOMEM;
962 return 0;
963}
964
965void ordered_data_exit(void)
966{
967 if (btrfs_ordered_extent_cache)
968 kmem_cache_destroy(btrfs_ordered_extent_cache);
969}
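
The new cache needs a matching call site at module load and unload; this hunk only adds the helpers, so the wiring below is an assumed sketch of how super.c presumably invokes them, not code from this patch:

/* assumed wiring: create the cache before any ordered extent can be
 * allocated, destroy it after the filesystem is unregistered */
static int init_btrfs_fs_sketch(void)
{
        int err = ordered_data_init();
        if (err)
                return err;
        /* ... register filesystem, remaining caches ... */
        return 0;
}

static void exit_btrfs_fs_sketch(void)
{
        ordered_data_exit();
}
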
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index e03c560d2997..dd27a0b46a37 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -96,6 +96,13 @@ struct btrfs_ordered_extent {
96 /* number of bytes that still need writing */ 96 /* number of bytes that still need writing */
97 u64 bytes_left; 97 u64 bytes_left;
98 98
99 /*
100 * the end of any ordered extent that is behind this one but
101 * didn't update disk_i_size. See the comment of
102 * btrfs_ordered_update_i_size();
103 */
104 u64 outstanding_isize;
105
99 /* flags (described above) */ 106 /* flags (described above) */
100 unsigned long flags; 107 unsigned long flags;
101 108
@@ -183,6 +190,7 @@ void btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
183void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 190void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
184 struct btrfs_root *root, 191 struct btrfs_root *root,
185 struct inode *inode); 192 struct inode *inode);
186void btrfs_wait_ordered_extents(struct btrfs_root *root, 193void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
187 int nocow_only, int delay_iput); 194int __init ordered_data_init(void);
195void ordered_data_exit(void);
188#endif 196#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index b65015581744..5039686df6ae 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1145,12 +1145,12 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1145 1145
1146 ulist_reinit(tmp); 1146 ulist_reinit(tmp);
1147 /* XXX id not needed */ 1147 /* XXX id not needed */
1148 ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC); 1148 ulist_add(tmp, qg->qgroupid, (u64)(uintptr_t)qg, GFP_ATOMIC);
1149 ULIST_ITER_INIT(&tmp_uiter); 1149 ULIST_ITER_INIT(&tmp_uiter);
1150 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { 1150 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
1151 struct btrfs_qgroup_list *glist; 1151 struct btrfs_qgroup_list *glist;
1152 1152
1153 qg = (struct btrfs_qgroup *)tmp_unode->aux; 1153 qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
1154 if (qg->refcnt < seq) 1154 if (qg->refcnt < seq)
1155 qg->refcnt = seq + 1; 1155 qg->refcnt = seq + 1;
1156 else 1156 else
@@ -1158,7 +1158,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1158 1158
1159 list_for_each_entry(glist, &qg->groups, next_group) { 1159 list_for_each_entry(glist, &qg->groups, next_group) {
1160 ulist_add(tmp, glist->group->qgroupid, 1160 ulist_add(tmp, glist->group->qgroupid,
1161 (unsigned long)glist->group, 1161 (u64)(uintptr_t)glist->group,
1162 GFP_ATOMIC); 1162 GFP_ATOMIC);
1163 } 1163 }
1164 } 1164 }
@@ -1168,13 +1168,13 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1168 * step 2: walk from the new root 1168 * step 2: walk from the new root
1169 */ 1169 */
1170 ulist_reinit(tmp); 1170 ulist_reinit(tmp);
1171 ulist_add(tmp, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); 1171 ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
1172 ULIST_ITER_INIT(&uiter); 1172 ULIST_ITER_INIT(&uiter);
1173 while ((unode = ulist_next(tmp, &uiter))) { 1173 while ((unode = ulist_next(tmp, &uiter))) {
1174 struct btrfs_qgroup *qg; 1174 struct btrfs_qgroup *qg;
1175 struct btrfs_qgroup_list *glist; 1175 struct btrfs_qgroup_list *glist;
1176 1176
1177 qg = (struct btrfs_qgroup *)unode->aux; 1177 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
1178 if (qg->refcnt < seq) { 1178 if (qg->refcnt < seq) {
1179 /* not visited by step 1 */ 1179 /* not visited by step 1 */
1180 qg->rfer += sgn * node->num_bytes; 1180 qg->rfer += sgn * node->num_bytes;
@@ -1190,7 +1190,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1190 1190
1191 list_for_each_entry(glist, &qg->groups, next_group) { 1191 list_for_each_entry(glist, &qg->groups, next_group) {
1192 ulist_add(tmp, glist->group->qgroupid, 1192 ulist_add(tmp, glist->group->qgroupid,
1193 (unsigned long)glist->group, GFP_ATOMIC); 1193 (uintptr_t)glist->group, GFP_ATOMIC);
1194 } 1194 }
1195 } 1195 }
1196 1196
@@ -1208,12 +1208,12 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1208 continue; 1208 continue;
1209 1209
1210 ulist_reinit(tmp); 1210 ulist_reinit(tmp);
1211 ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC); 1211 ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC);
1212 ULIST_ITER_INIT(&tmp_uiter); 1212 ULIST_ITER_INIT(&tmp_uiter);
1213 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { 1213 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
1214 struct btrfs_qgroup_list *glist; 1214 struct btrfs_qgroup_list *glist;
1215 1215
1216 qg = (struct btrfs_qgroup *)tmp_unode->aux; 1216 qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
1217 if (qg->tag == seq) 1217 if (qg->tag == seq)
1218 continue; 1218 continue;
1219 1219
@@ -1225,7 +1225,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1225 1225
1226 list_for_each_entry(glist, &qg->groups, next_group) { 1226 list_for_each_entry(glist, &qg->groups, next_group) {
1227 ulist_add(tmp, glist->group->qgroupid, 1227 ulist_add(tmp, glist->group->qgroupid,
1228 (unsigned long)glist->group, 1228 (uintptr_t)glist->group,
1229 GFP_ATOMIC); 1229 GFP_ATOMIC);
1230 } 1230 }
1231 } 1231 }
@@ -1469,13 +1469,17 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1469 * be exceeded 1469 * be exceeded
1470 */ 1470 */
1471 ulist = ulist_alloc(GFP_ATOMIC); 1471 ulist = ulist_alloc(GFP_ATOMIC);
1472 ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); 1472 if (!ulist) {
1473 ret = -ENOMEM;
1474 goto out;
1475 }
1476 ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
1473 ULIST_ITER_INIT(&uiter); 1477 ULIST_ITER_INIT(&uiter);
1474 while ((unode = ulist_next(ulist, &uiter))) { 1478 while ((unode = ulist_next(ulist, &uiter))) {
1475 struct btrfs_qgroup *qg; 1479 struct btrfs_qgroup *qg;
1476 struct btrfs_qgroup_list *glist; 1480 struct btrfs_qgroup_list *glist;
1477 1481
1478 qg = (struct btrfs_qgroup *)unode->aux; 1482 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
1479 1483
1480 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && 1484 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
1481 qg->reserved + qg->rfer + num_bytes > 1485 qg->reserved + qg->rfer + num_bytes >
@@ -1489,7 +1493,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1489 1493
1490 list_for_each_entry(glist, &qg->groups, next_group) { 1494 list_for_each_entry(glist, &qg->groups, next_group) {
1491 ulist_add(ulist, glist->group->qgroupid, 1495 ulist_add(ulist, glist->group->qgroupid,
1492 (unsigned long)glist->group, GFP_ATOMIC); 1496 (uintptr_t)glist->group, GFP_ATOMIC);
1493 } 1497 }
1494 } 1498 }
1495 if (ret) 1499 if (ret)
@@ -1502,7 +1506,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1502 while ((unode = ulist_next(ulist, &uiter))) { 1506 while ((unode = ulist_next(ulist, &uiter))) {
1503 struct btrfs_qgroup *qg; 1507 struct btrfs_qgroup *qg;
1504 1508
1505 qg = (struct btrfs_qgroup *)unode->aux; 1509 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
1506 1510
1507 qg->reserved += num_bytes; 1511 qg->reserved += num_bytes;
1508 } 1512 }
@@ -1541,19 +1545,23 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1541 goto out; 1545 goto out;
1542 1546
1543 ulist = ulist_alloc(GFP_ATOMIC); 1547 ulist = ulist_alloc(GFP_ATOMIC);
1544 ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); 1548 if (!ulist) {
1549 btrfs_std_error(fs_info, -ENOMEM);
1550 goto out;
1551 }
1552 ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
1545 ULIST_ITER_INIT(&uiter); 1553 ULIST_ITER_INIT(&uiter);
1546 while ((unode = ulist_next(ulist, &uiter))) { 1554 while ((unode = ulist_next(ulist, &uiter))) {
1547 struct btrfs_qgroup *qg; 1555 struct btrfs_qgroup *qg;
1548 struct btrfs_qgroup_list *glist; 1556 struct btrfs_qgroup_list *glist;
1549 1557
1550 qg = (struct btrfs_qgroup *)unode->aux; 1558 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
1551 1559
1552 qg->reserved -= num_bytes; 1560 qg->reserved -= num_bytes;
1553 1561
1554 list_for_each_entry(glist, &qg->groups, next_group) { 1562 list_for_each_entry(glist, &qg->groups, next_group) {
1555 ulist_add(ulist, glist->group->qgroupid, 1563 ulist_add(ulist, glist->group->qgroupid,
1556 (unsigned long)glist->group, GFP_ATOMIC); 1564 (uintptr_t)glist->group, GFP_ATOMIC);
1557 } 1565 }
1558 } 1566 }
1559 1567
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4da08652004d..776f0aa128fc 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3270,8 +3270,8 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3270 key.offset = 0; 3270 key.offset = 0;
3271 3271
3272 inode = btrfs_iget(fs_info->sb, &key, root, NULL); 3272 inode = btrfs_iget(fs_info->sb, &key, root, NULL);
3273 if (IS_ERR_OR_NULL(inode) || is_bad_inode(inode)) { 3273 if (IS_ERR(inode) || is_bad_inode(inode)) {
3274 if (inode && !IS_ERR(inode)) 3274 if (!IS_ERR(inode))
3275 iput(inode); 3275 iput(inode);
3276 return -ENOENT; 3276 return -ENOENT;
3277 } 3277 }
@@ -3621,7 +3621,7 @@ next:
3621 3621
3622 ret = find_first_extent_bit(&rc->processed_blocks, 3622 ret = find_first_extent_bit(&rc->processed_blocks,
3623 key.objectid, &start, &end, 3623 key.objectid, &start, &end,
3624 EXTENT_DIRTY); 3624 EXTENT_DIRTY, NULL);
3625 3625
3626 if (ret == 0 && start <= key.objectid) { 3626 if (ret == 0 && start <= key.objectid) {
3627 btrfs_release_path(path); 3627 btrfs_release_path(path);
@@ -3674,7 +3674,8 @@ int prepare_to_relocate(struct reloc_control *rc)
3674 struct btrfs_trans_handle *trans; 3674 struct btrfs_trans_handle *trans;
3675 int ret; 3675 int ret;
3676 3676
3677 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root); 3677 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
3678 BTRFS_BLOCK_RSV_TEMP);
3678 if (!rc->block_rsv) 3679 if (!rc->block_rsv)
3679 return -ENOMEM; 3680 return -ENOMEM;
3680 3681
@@ -4057,7 +4058,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4057 (unsigned long long)rc->block_group->flags); 4058 (unsigned long long)rc->block_group->flags);
4058 4059
4059 btrfs_start_delalloc_inodes(fs_info->tree_root, 0); 4060 btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
4060 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); 4061 btrfs_wait_ordered_extents(fs_info->tree_root, 0);
4061 4062
4062 while (1) { 4063 while (1) {
4063 mutex_lock(&fs_info->cleaner_mutex); 4064 mutex_lock(&fs_info->cleaner_mutex);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 10d8e4d88071..eb923d087da7 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -141,8 +141,10 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
141 return -ENOMEM; 141 return -ENOMEM;
142 142
143 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 143 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
144 if (ret < 0) 144 if (ret < 0) {
145 goto out_abort; 145 btrfs_abort_transaction(trans, root, ret);
146 goto out;
147 }
146 148
147 if (ret != 0) { 149 if (ret != 0) {
148 btrfs_print_leaf(root, path->nodes[0]); 150 btrfs_print_leaf(root, path->nodes[0]);
@@ -166,16 +168,23 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
166 btrfs_release_path(path); 168 btrfs_release_path(path);
167 ret = btrfs_search_slot(trans, root, key, path, 169 ret = btrfs_search_slot(trans, root, key, path,
168 -1, 1); 170 -1, 1);
169 if (ret < 0) 171 if (ret < 0) {
170 goto out_abort; 172 btrfs_abort_transaction(trans, root, ret);
173 goto out;
174 }
175
171 ret = btrfs_del_item(trans, root, path); 176 ret = btrfs_del_item(trans, root, path);
172 if (ret < 0) 177 if (ret < 0) {
173 goto out_abort; 178 btrfs_abort_transaction(trans, root, ret);
179 goto out;
180 }
174 btrfs_release_path(path); 181 btrfs_release_path(path);
175 ret = btrfs_insert_empty_item(trans, root, path, 182 ret = btrfs_insert_empty_item(trans, root, path,
176 key, sizeof(*item)); 183 key, sizeof(*item));
177 if (ret < 0) 184 if (ret < 0) {
178 goto out_abort; 185 btrfs_abort_transaction(trans, root, ret);
186 goto out;
187 }
179 l = path->nodes[0]; 188 l = path->nodes[0];
180 slot = path->slots[0]; 189 slot = path->slots[0];
181 ptr = btrfs_item_ptr_offset(l, slot); 190 ptr = btrfs_item_ptr_offset(l, slot);
@@ -192,10 +201,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
192out: 201out:
193 btrfs_free_path(path); 202 btrfs_free_path(path);
194 return ret; 203 return ret;
195
196out_abort:
197 btrfs_abort_transaction(trans, root, ret);
198 goto out;
199} 204}
200 205
201int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, 206int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b223620cd5a6..27892f67e69b 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -352,13 +352,14 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
352 struct extent_buffer *eb; 352 struct extent_buffer *eb;
353 struct btrfs_extent_item *ei; 353 struct btrfs_extent_item *ei;
354 struct scrub_warning swarn; 354 struct scrub_warning swarn;
355 u32 item_size; 355 unsigned long ptr = 0;
356 int ret; 356 u64 extent_item_pos;
357 u64 flags = 0;
357 u64 ref_root; 358 u64 ref_root;
359 u32 item_size;
358 u8 ref_level; 360 u8 ref_level;
359 unsigned long ptr = 0;
360 const int bufsize = 4096; 361 const int bufsize = 4096;
361 u64 extent_item_pos; 362 int ret;
362 363
363 path = btrfs_alloc_path(); 364 path = btrfs_alloc_path();
364 365
@@ -375,7 +376,8 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
375 if (!path || !swarn.scratch_buf || !swarn.msg_buf) 376 if (!path || !swarn.scratch_buf || !swarn.msg_buf)
376 goto out; 377 goto out;
377 378
378 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key); 379 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
380 &flags);
379 if (ret < 0) 381 if (ret < 0)
380 goto out; 382 goto out;
381 383
@@ -387,7 +389,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
387 item_size = btrfs_item_size_nr(eb, path->slots[0]); 389 item_size = btrfs_item_size_nr(eb, path->slots[0]);
388 btrfs_release_path(path); 390 btrfs_release_path(path);
389 391
390 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 392 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
391 do { 393 do {
392 ret = tree_backref_for_extent(&ptr, eb, ei, item_size, 394 ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
393 &ref_root, &ref_level); 395 &ref_root, &ref_level);
@@ -1029,6 +1031,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
1029 spin_lock(&sdev->stat_lock); 1031 spin_lock(&sdev->stat_lock);
1030 sdev->stat.malloc_errors++; 1032 sdev->stat.malloc_errors++;
1031 spin_unlock(&sdev->stat_lock); 1033 spin_unlock(&sdev->stat_lock);
1034 kfree(bbio);
1032 return -ENOMEM; 1035 return -ENOMEM;
1033 } 1036 }
1034 sblock->page_count++; 1037 sblock->page_count++;
@@ -1666,21 +1669,6 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
1666 scrub_block_put(sblock); 1669 scrub_block_put(sblock);
1667 } 1670 }
1668 1671
1669 if (sbio->err) {
1670 /* what is this good for??? */
1671 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
1672 sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
1673 sbio->bio->bi_phys_segments = 0;
1674 sbio->bio->bi_idx = 0;
1675
1676 for (i = 0; i < sbio->page_count; i++) {
1677 struct bio_vec *bi;
1678 bi = &sbio->bio->bi_io_vec[i];
1679 bi->bv_offset = 0;
1680 bi->bv_len = PAGE_SIZE;
1681 }
1682 }
1683
1684 bio_put(sbio->bio); 1672 bio_put(sbio->bio);
1685 sbio->bio = NULL; 1673 sbio->bio = NULL;
1686 spin_lock(&sdev->list_lock); 1674 spin_lock(&sdev->list_lock);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index fb5ffe95f869..c7beb543a4a8 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -107,7 +107,6 @@ struct send_ctx {
107 int cur_inode_new; 107 int cur_inode_new;
108 int cur_inode_new_gen; 108 int cur_inode_new_gen;
109 int cur_inode_deleted; 109 int cur_inode_deleted;
110 int cur_inode_first_ref_orphan;
111 u64 cur_inode_size; 110 u64 cur_inode_size;
112 u64 cur_inode_mode; 111 u64 cur_inode_mode;
113 112
@@ -126,7 +125,15 @@ struct send_ctx {
126 125
127struct name_cache_entry { 126struct name_cache_entry {
128 struct list_head list; 127 struct list_head list;
129 struct list_head use_list; 128 /*
129 * radix_tree has only 32bit entries but we need to handle 64bit inums.
130 * We use the lower 32bit of the 64bit inum to store it in the tree. If
131 * more than one inum would fall into the same entry, we use radix_list
132 * to store the additional entries. radix_list is also used to store
133 * entries that have the same inum but different
134 * generations.
135 */
136 struct list_head radix_list;
130 u64 ino; 137 u64 ino;
131 u64 gen; 138 u64 gen;
132 u64 parent_ino; 139 u64 parent_ino;
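
The comment above describes a classic trick for keying a 32-bit-indexed structure with 64-bit values; a standalone sketch of the lookup side, with the collision chain standing in for radix_list:

#include <stddef.h>
#include <stdint.h>

struct nce {                /* stand-in for name_cache_entry */
        struct nce *next;   /* collision chain, like radix_list */
        uint64_t ino;
        uint64_t gen;
};

/* the tree slot is found via the truncated key first ... */
static uint32_t slot_key(uint64_t ino)
{
        return (uint32_t)ino;
}

/* ... then the chain disambiguates full inum and generation */
static struct nce *chain_lookup(struct nce *head, uint64_t ino, uint64_t gen)
{
        for (struct nce *e = head; e; e = e->next)
                if (e->ino == ino && e->gen == gen)
                        return e;
        return NULL;
}
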
@@ -328,6 +335,7 @@ out:
328 return ret; 335 return ret;
329} 336}
330 337
338#if 0
331static void fs_path_remove(struct fs_path *p) 339static void fs_path_remove(struct fs_path *p)
332{ 340{
333 BUG_ON(p->reversed); 341 BUG_ON(p->reversed);
@@ -335,6 +343,7 @@ static void fs_path_remove(struct fs_path *p)
335 p->end--; 343 p->end--;
336 *p->end = 0; 344 *p->end = 0;
337} 345}
346#endif
338 347
339static int fs_path_copy(struct fs_path *p, struct fs_path *from) 348static int fs_path_copy(struct fs_path *p, struct fs_path *from)
340{ 349{
@@ -377,7 +386,7 @@ static struct btrfs_path *alloc_path_for_send(void)
377 return path; 386 return path;
378} 387}
379 388
380static int write_buf(struct send_ctx *sctx, const void *buf, u32 len) 389int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
381{ 390{
382 int ret; 391 int ret;
383 mm_segment_t old_fs; 392 mm_segment_t old_fs;
@@ -387,8 +396,7 @@ static int write_buf(struct send_ctx *sctx, const void *buf, u32 len)
387 set_fs(KERNEL_DS); 396 set_fs(KERNEL_DS);
388 397
389 while (pos < len) { 398 while (pos < len) {
390 ret = vfs_write(sctx->send_filp, (char *)buf + pos, len - pos, 399 ret = vfs_write(filp, (char *)buf + pos, len - pos, off);
391 &sctx->send_off);
392 /* TODO handle that correctly */ 400 /* TODO handle that correctly */
393 /*if (ret == -ERESTARTSYS) { 401 /*if (ret == -ERESTARTSYS) {
394 continue; 402 continue;
@@ -544,7 +552,8 @@ static int send_header(struct send_ctx *sctx)
544 strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); 552 strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
545 hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION); 553 hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION);
546 554
547 return write_buf(sctx, &hdr, sizeof(hdr)); 555 return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
556 &sctx->send_off);
548} 557}
549 558
550/* 559/*
@@ -581,7 +590,8 @@ static int send_cmd(struct send_ctx *sctx)
581 crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); 590 crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
582 hdr->crc = cpu_to_le32(crc); 591 hdr->crc = cpu_to_le32(crc);
583 592
584 ret = write_buf(sctx, sctx->send_buf, sctx->send_size); 593 ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
594 &sctx->send_off);
585 595
586 sctx->total_send_size += sctx->send_size; 596 sctx->total_send_size += sctx->send_size;
587 sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size; 597 sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size;
@@ -687,7 +697,8 @@ out:
687 */ 697 */
688static int get_inode_info(struct btrfs_root *root, 698static int get_inode_info(struct btrfs_root *root,
689 u64 ino, u64 *size, u64 *gen, 699 u64 ino, u64 *size, u64 *gen,
690 u64 *mode, u64 *uid, u64 *gid) 700 u64 *mode, u64 *uid, u64 *gid,
701 u64 *rdev)
691{ 702{
692 int ret; 703 int ret;
693 struct btrfs_inode_item *ii; 704 struct btrfs_inode_item *ii;
@@ -721,6 +732,8 @@ static int get_inode_info(struct btrfs_root *root,
721 *uid = btrfs_inode_uid(path->nodes[0], ii); 732 *uid = btrfs_inode_uid(path->nodes[0], ii);
722 if (gid) 733 if (gid)
723 *gid = btrfs_inode_gid(path->nodes[0], ii); 734 *gid = btrfs_inode_gid(path->nodes[0], ii);
735 if (rdev)
736 *rdev = btrfs_inode_rdev(path->nodes[0], ii);
724 737
725out: 738out:
726 btrfs_free_path(path); 739 btrfs_free_path(path);
@@ -852,7 +865,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
852 struct extent_buffer *eb; 865 struct extent_buffer *eb;
853 struct btrfs_item *item; 866 struct btrfs_item *item;
854 struct btrfs_dir_item *di; 867 struct btrfs_dir_item *di;
855 struct btrfs_path *tmp_path = NULL;
856 struct btrfs_key di_key; 868 struct btrfs_key di_key;
857 char *buf = NULL; 869 char *buf = NULL;
858 char *buf2 = NULL; 870 char *buf2 = NULL;
@@ -874,12 +886,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
874 goto out; 886 goto out;
875 } 887 }
876 888
877 tmp_path = alloc_path_for_send();
878 if (!tmp_path) {
879 ret = -ENOMEM;
880 goto out;
881 }
882
883 eb = path->nodes[0]; 889 eb = path->nodes[0];
884 slot = path->slots[0]; 890 slot = path->slots[0];
885 item = btrfs_item_nr(eb, slot); 891 item = btrfs_item_nr(eb, slot);
@@ -941,7 +947,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
941 } 947 }
942 948
943out: 949out:
944 btrfs_free_path(tmp_path);
945 if (buf_virtual) 950 if (buf_virtual)
946 vfree(buf); 951 vfree(buf);
947 else 952 else
@@ -1026,12 +1031,12 @@ struct backref_ctx {
1026 u64 extent_len; 1031 u64 extent_len;
1027 1032
1028 /* Just to check for bugs in backref resolving */ 1033 /* Just to check for bugs in backref resolving */
1029 int found_in_send_root; 1034 int found_itself;
1030}; 1035};
1031 1036
1032static int __clone_root_cmp_bsearch(const void *key, const void *elt) 1037static int __clone_root_cmp_bsearch(const void *key, const void *elt)
1033{ 1038{
1034 u64 root = (u64)key; 1039 u64 root = (u64)(uintptr_t)key;
1035 struct clone_root *cr = (struct clone_root *)elt; 1040 struct clone_root *cr = (struct clone_root *)elt;
1036 1041
1037 if (root < cr->root->objectid) 1042 if (root < cr->root->objectid)
@@ -1055,6 +1060,7 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2)
1055 1060
1056/* 1061/*
1057 * Called for every backref that is found for the current extent. 1062 * Called for every backref that is found for the current extent.
1063 * Results are collected in sctx->clone_roots->ino/offset/found_refs
1058 */ 1064 */
1059static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) 1065static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1060{ 1066{
@@ -1064,7 +1070,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1064 u64 i_size; 1070 u64 i_size;
1065 1071
1066 /* First check if the root is in the list of accepted clone sources */ 1072 /* First check if the root is in the list of accepted clone sources */
1067 found = bsearch((void *)root, bctx->sctx->clone_roots, 1073 found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
1068 bctx->sctx->clone_roots_cnt, 1074 bctx->sctx->clone_roots_cnt,
1069 sizeof(struct clone_root), 1075 sizeof(struct clone_root),
1070 __clone_root_cmp_bsearch); 1076 __clone_root_cmp_bsearch);
@@ -1074,14 +1080,15 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
 	if (found->root == bctx->sctx->send_root &&
 	    ino == bctx->cur_objectid &&
 	    offset == bctx->cur_offset) {
-		bctx->found_in_send_root = 1;
+		bctx->found_itself = 1;
 	}
 
 	/*
-	 * There are inodes that have extents that lie behind it's i_size. Don't
+	 * There are inodes that have extents that lie behind its i_size. Don't
 	 * accept clones from these extents.
 	 */
-	ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL);
+	ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL,
+			NULL);
 	if (ret < 0)
 		return ret;
 
@@ -1101,16 +1108,12 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
 	 */
 	if (ino >= bctx->cur_objectid)
 		return 0;
-	/*if (ino > ctx->cur_objectid)
+#if 0
+	if (ino > bctx->cur_objectid)
 		return 0;
-	if (offset + ctx->extent_len > ctx->cur_offset)
-		return 0;*/
-
-	bctx->found++;
-	found->found_refs++;
-	found->ino = ino;
-	found->offset = offset;
-	return 0;
+	if (offset + bctx->extent_len > bctx->cur_offset)
+		return 0;
+#endif
 	}
 
 	bctx->found++;
@@ -1130,6 +1133,12 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
 }
 
 /*
+ * Given an inode, offset and extent item, it finds a good clone for a clone
+ * instruction. Returns -ENOENT when none could be found. The function makes
+ * sure that the returned clone is usable at the point where sending currently
+ * is. This means that no clones are accepted which lie behind the current
+ * inode+offset.
+ *
  * path must point to the extent item when called.
  */
 static int find_extent_clone(struct send_ctx *sctx,
@@ -1141,20 +1150,29 @@ static int find_extent_clone(struct send_ctx *sctx,
 	int ret;
 	int extent_type;
 	u64 logical;
+	u64 disk_byte;
 	u64 num_bytes;
 	u64 extent_item_pos;
+	u64 flags = 0;
 	struct btrfs_file_extent_item *fi;
 	struct extent_buffer *eb = path->nodes[0];
-	struct backref_ctx backref_ctx;
+	struct backref_ctx *backref_ctx = NULL;
 	struct clone_root *cur_clone_root;
 	struct btrfs_key found_key;
 	struct btrfs_path *tmp_path;
+	int compressed;
 	u32 i;
 
 	tmp_path = alloc_path_for_send();
 	if (!tmp_path)
 		return -ENOMEM;
 
+	backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS);
+	if (!backref_ctx) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
 	if (data_offset >= ino_size) {
 		/*
 		 * There may be extents that lie behind the file's size.
@@ -1172,22 +1190,23 @@ static int find_extent_clone(struct send_ctx *sctx,
 		ret = -ENOENT;
 		goto out;
 	}
+	compressed = btrfs_file_extent_compression(eb, fi);
 
 	num_bytes = btrfs_file_extent_num_bytes(eb, fi);
-	logical = btrfs_file_extent_disk_bytenr(eb, fi);
-	if (logical == 0) {
+	disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+	if (disk_byte == 0) {
 		ret = -ENOENT;
 		goto out;
 	}
-	logical += btrfs_file_extent_offset(eb, fi);
+	logical = disk_byte + btrfs_file_extent_offset(eb, fi);
 
-	ret = extent_from_logical(sctx->send_root->fs_info,
-			logical, tmp_path, &found_key);
+	ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path,
+			&found_key, &flags);
 	btrfs_release_path(tmp_path);
 
 	if (ret < 0)
 		goto out;
-	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 		ret = -EIO;
 		goto out;
 	}
@@ -1202,12 +1221,12 @@ static int find_extent_clone(struct send_ctx *sctx,
 		cur_clone_root->found_refs = 0;
 	}
 
-	backref_ctx.sctx = sctx;
-	backref_ctx.found = 0;
-	backref_ctx.cur_objectid = ino;
-	backref_ctx.cur_offset = data_offset;
-	backref_ctx.found_in_send_root = 0;
-	backref_ctx.extent_len = num_bytes;
+	backref_ctx->sctx = sctx;
+	backref_ctx->found = 0;
+	backref_ctx->cur_objectid = ino;
+	backref_ctx->cur_offset = data_offset;
+	backref_ctx->found_itself = 0;
+	backref_ctx->extent_len = num_bytes;
 
 	/*
 	 * The last extent of a file may be too large due to page alignment.
@@ -1215,25 +1234,31 @@ static int find_extent_clone(struct send_ctx *sctx,
 	 * __iterate_backrefs work.
 	 */
 	if (data_offset + num_bytes >= ino_size)
-		backref_ctx.extent_len = ino_size - data_offset;
+		backref_ctx->extent_len = ino_size - data_offset;
 
 	/*
 	 * Now collect all backrefs.
 	 */
+	if (compressed == BTRFS_COMPRESS_NONE)
+		extent_item_pos = logical - found_key.objectid;
+	else
+		extent_item_pos = 0;
+
 	extent_item_pos = logical - found_key.objectid;
 	ret = iterate_extent_inodes(sctx->send_root->fs_info,
 					found_key.objectid, extent_item_pos, 1,
-					__iterate_backrefs, &backref_ctx);
+					__iterate_backrefs, backref_ctx);
+
 	if (ret < 0)
 		goto out;
 
-	if (!backref_ctx.found_in_send_root) {
+	if (!backref_ctx->found_itself) {
 		/* found a bug in backref code? */
 		ret = -EIO;
 		printk(KERN_ERR "btrfs: ERROR did not find backref in "
 				"send_root. inode=%llu, offset=%llu, "
-				"logical=%llu\n",
-				ino, data_offset, logical);
+				"disk_byte=%llu found extent=%llu\n",
+				ino, data_offset, disk_byte, found_key.objectid);
 		goto out;
 	}
 
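The extent_item_pos choice is the subtle part of this hunk: for uncompressed extents the backref walker can be told the byte position of the data inside the extent (logical minus the extent's start) so unrelated refs are filtered out, while for compressed extents disk bytes and file bytes do not map 1:1, so the walk starts at position 0. (As the surrounding context shows, an unconditional assignment still follows the new if/else in this tree and overrides the compressed case.) A small sketch of the arithmetic with made-up byte numbers:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t extent_start = 1048576;  /* found_key.objectid */
            uint64_t disk_byte = 1048576;     /* file extent's disk_bytenr */
            uint64_t file_extent_offset = 8192;
            uint64_t logical = disk_byte + file_extent_offset;
            int compressed = 0;               /* BTRFS_COMPRESS_NONE */
            uint64_t extent_item_pos;

            extent_item_pos = compressed ? 0 : logical - extent_start;
            printf("extent_item_pos = %llu\n",
                   (unsigned long long)extent_item_pos);  /* prints 8192 */
            return 0;
    }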
@@ -1242,7 +1267,7 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
 		"num_bytes=%llu, logical=%llu\n",
 		data_offset, ino, num_bytes, logical);
 
-	if (!backref_ctx.found)
+	if (!backref_ctx->found)
 		verbose_printk("btrfs: no clones found\n");
 
 	cur_clone_root = NULL;
@@ -1253,7 +1278,6 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
 			else if (sctx->clone_roots[i].root == sctx->send_root)
 				/* prefer clones from send_root over others */
 				cur_clone_root = sctx->clone_roots + i;
-			break;
 		}
 
 	}
@@ -1267,6 +1291,7 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
 
 out:
 	btrfs_free_path(tmp_path);
+	kfree(backref_ctx);
 	return ret;
 }
 
@@ -1307,8 +1332,6 @@ static int read_symlink(struct send_ctx *sctx,
 	len = btrfs_file_extent_inline_len(path->nodes[0], ei);
 
 	ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
-	if (ret < 0)
-		goto out;
 
 out:
 	btrfs_free_path(path);
@@ -1404,7 +1427,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
 	u64 right_gen;
 
 	ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
-			NULL);
+			NULL, NULL);
 	if (ret < 0 && ret != -ENOENT)
 		goto out;
 	left_ret = ret;
@@ -1413,16 +1436,16 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
 		right_ret = -ENOENT;
 	} else {
 		ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
-				NULL, NULL, NULL);
+				NULL, NULL, NULL, NULL);
 		if (ret < 0 && ret != -ENOENT)
 			goto out;
 		right_ret = ret;
 	}
 
 	if (!left_ret && !right_ret) {
-		if (left_gen == gen && right_gen == gen)
+		if (left_gen == gen && right_gen == gen) {
 			ret = inode_state_no_change;
-		else if (left_gen == gen) {
+		} else if (left_gen == gen) {
 			if (ino < sctx->send_progress)
 				ret = inode_state_did_create;
 			else
@@ -1516,6 +1539,10 @@ out:
 	return ret;
 }
 
+/*
+ * Looks up the first btrfs_inode_ref of a given ino. It returns the parent
+ * dir, the generation of the parent dir and the name of the dir entry.
+ */
 static int get_first_ref(struct send_ctx *sctx,
 			 struct btrfs_root *root, u64 ino,
 			 u64 *dir, u64 *dir_gen, struct fs_path *name)
@@ -1557,7 +1584,7 @@ static int get_first_ref(struct send_ctx *sctx,
 	btrfs_release_path(path);
 
 	ret = get_inode_info(root, found_key.offset, NULL, dir_gen, NULL, NULL,
-			NULL);
+			NULL, NULL);
 	if (ret < 0)
 		goto out;
 
@@ -1586,22 +1613,28 @@ static int is_first_ref(struct send_ctx *sctx,
 	if (ret < 0)
 		goto out;
 
-	if (name_len != fs_path_len(tmp_name)) {
+	if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) {
 		ret = 0;
 		goto out;
 	}
 
-	ret = memcmp(tmp_name->start, name, name_len);
-	if (ret)
-		ret = 0;
-	else
-		ret = 1;
+	ret = !memcmp(tmp_name->start, name, name_len);
 
 out:
 	fs_path_free(sctx, tmp_name);
 	return ret;
 }
 
+/*
+ * Used by process_recorded_refs to determine if a new ref would overwrite an
+ * already existing ref. In case it detects an overwrite, it returns the
+ * inode/gen in who_ino/who_gen.
+ * When an overwrite is detected, process_recorded_refs does proper orphanizing
+ * to make sure later references to the overwritten inode are possible.
+ * Orphanizing is however only required for the first ref of an inode.
+ * process_recorded_refs does an additional is_first_ref check to see if
+ * orphanizing is really required.
+ */
 static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
 			      const char *name, int name_len,
 			      u64 *who_ino, u64 *who_gen)
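Orphanizing, as used throughout these helpers, means renaming an inode to a reserved unique name until the stream reaches the point where its real path becomes valid. The names produced by gen_unique_name() follow an o<ino>-<gen>-<idx> pattern, with idx bumped until the name is free in both trees; a sketch of the format with hypothetical values:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t ino = 257, gen = 5, idx = 0;
            char name[64];

            /* same shape as the names gen_unique_name() emits */
            snprintf(name, sizeof(name), "o%llu-%llu-%llu",
                     (unsigned long long)ino, (unsigned long long)gen,
                     (unsigned long long)idx);
            printf("orphan name: %s\n", name);  /* o257-5-0 */
            return 0;
    }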
@@ -1626,9 +1659,14 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
 		goto out;
 	}
 
+	/*
+	 * Check if the overwritten ref was already processed. If yes, the ref
+	 * was already unlinked/moved, so we can safely assume that we will not
+	 * overwrite anything at this point in time.
+	 */
 	if (other_inode > sctx->send_progress) {
 		ret = get_inode_info(sctx->parent_root, other_inode, NULL,
-				who_gen, NULL, NULL, NULL);
+				who_gen, NULL, NULL, NULL, NULL);
 		if (ret < 0)
 			goto out;
 
@@ -1642,6 +1680,13 @@ out:
 	return ret;
 }
 
+/*
+ * Checks if the ref was overwritten by an already processed inode. This is
+ * used by __get_cur_name_and_parent to find out if the ref was orphanized and
+ * thus the orphan name needs to be used.
+ * process_recorded_refs also uses it to avoid unlinking of refs that were
+ * overwritten.
+ */
 static int did_overwrite_ref(struct send_ctx *sctx,
 			     u64 dir, u64 dir_gen,
 			     u64 ino, u64 ino_gen,
@@ -1671,7 +1716,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
 	}
 
 	ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
-			NULL);
+			NULL, NULL);
 	if (ret < 0)
 		goto out;
 
@@ -1690,6 +1735,11 @@ out:
 	return ret;
 }
 
+/*
+ * Same as did_overwrite_ref, but also checks if it is the first ref of an inode
+ * that got overwritten. This is used by process_recorded_refs to determine
+ * if it has to use the path as returned by get_cur_path or the orphan name.
+ */
 static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
 {
 	int ret = 0;
@@ -1710,39 +1760,40 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
 
 	ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
 			name->start, fs_path_len(name));
-	if (ret < 0)
-		goto out;
 
 out:
 	fs_path_free(sctx, name);
 	return ret;
 }
 
+/*
+ * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit,
+ * so we need to do some special handling in case we have clashes. This function
+ * takes care of this with the help of name_cache_entry::radix_list.
+ * In case of error, nce is kfreed.
+ */
 static int name_cache_insert(struct send_ctx *sctx,
 			     struct name_cache_entry *nce)
 {
 	int ret = 0;
-	struct name_cache_entry **ncea;
+	struct list_head *nce_head;
 
-	ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);
-	if (ncea) {
-		if (!ncea[0])
-			ncea[0] = nce;
-		else if (!ncea[1])
-			ncea[1] = nce;
-		else
-			BUG();
-	} else {
-		ncea = kmalloc(sizeof(void *) * 2, GFP_NOFS);
-		if (!ncea)
+	nce_head = radix_tree_lookup(&sctx->name_cache,
+			(unsigned long)nce->ino);
+	if (!nce_head) {
+		nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS);
+		if (!nce_head)
 			return -ENOMEM;
+		INIT_LIST_HEAD(nce_head);
 
-		ncea[0] = nce;
-		ncea[1] = NULL;
-		ret = radix_tree_insert(&sctx->name_cache, nce->ino, ncea);
-		if (ret < 0)
+		ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
+		if (ret < 0) {
+			kfree(nce_head);
+			kfree(nce);
 			return ret;
+		}
 	}
+	list_add_tail(&nce->radix_list, nce_head);
 	list_add_tail(&nce->list, &sctx->name_cache_list);
 	sctx->name_cache_size++;
 
@@ -1752,50 +1803,52 @@ static int name_cache_insert(struct send_ctx *sctx,
 static void name_cache_delete(struct send_ctx *sctx,
 			      struct name_cache_entry *nce)
 {
-	struct name_cache_entry **ncea;
-
-	ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);
-	BUG_ON(!ncea);
-
-	if (ncea[0] == nce)
-		ncea[0] = NULL;
-	else if (ncea[1] == nce)
-		ncea[1] = NULL;
-	else
-		BUG();
+	struct list_head *nce_head;
 
-	if (!ncea[0] && !ncea[1]) {
-		radix_tree_delete(&sctx->name_cache, nce->ino);
-		kfree(ncea);
-	}
-
+	nce_head = radix_tree_lookup(&sctx->name_cache,
+			(unsigned long)nce->ino);
+	BUG_ON(!nce_head);
+
+	list_del(&nce->radix_list);
 	list_del(&nce->list);
-
 	sctx->name_cache_size--;
+
+	if (list_empty(nce_head)) {
+		radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
+		kfree(nce_head);
+	}
 }
 
 static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
 						  u64 ino, u64 gen)
 {
-	struct name_cache_entry **ncea;
+	struct list_head *nce_head;
+	struct name_cache_entry *cur;
 
-	ncea = radix_tree_lookup(&sctx->name_cache, ino);
-	if (!ncea)
+	nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino);
+	if (!nce_head)
 		return NULL;
 
-	if (ncea[0] && ncea[0]->gen == gen)
-		return ncea[0];
-	else if (ncea[1] && ncea[1]->gen == gen)
-		return ncea[1];
+	list_for_each_entry(cur, nce_head, radix_list) {
+		if (cur->ino == ino && cur->gen == gen)
+			return cur;
+	}
 	return NULL;
 }
 
+/*
+ * Removes the entry from the list and adds it back to the end. This marks the
+ * entry as recently used so that name_cache_clean_unused does not remove it.
+ */
 static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce)
 {
 	list_del(&nce->list);
 	list_add_tail(&nce->list, &sctx->name_cache_list);
 }
 
+/*
+ * Remove some entries from the beginning of name_cache_list.
+ */
 static void name_cache_clean_unused(struct send_ctx *sctx)
 {
 	struct name_cache_entry *nce;
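The list-per-slot scheme introduced here exists because the radix tree is indexed by unsigned long, which is 32 bits on 32-bit kernels while inums are u64. Two inums that differ only above bit 31 truncate to the same index and must share a slot. A user-space sketch of the clash (the inums are hypothetical):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t ino_a = 0x100000101ULL;
            uint64_t ino_b = 0x200000101ULL;

            /* what (unsigned long) does to the inum on a 32-bit kernel */
            uint32_t slot_a = (uint32_t)ino_a;
            uint32_t slot_b = (uint32_t)ino_b;

            printf("slot_a=%u slot_b=%u clash=%s\n", slot_a, slot_b,
                   slot_a == slot_b ? "yes" : "no");
            return 0;
    }

This is also why name_cache_search still compares cur->ino against the full 64-bit inum after the radix tree lookup has succeeded.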
@@ -1814,13 +1867,23 @@ static void name_cache_clean_unused(struct send_ctx *sctx)
 static void name_cache_free(struct send_ctx *sctx)
 {
 	struct name_cache_entry *nce;
-	struct name_cache_entry *tmp;
 
-	list_for_each_entry_safe(nce, tmp, &sctx->name_cache_list, list) {
+	while (!list_empty(&sctx->name_cache_list)) {
+		nce = list_entry(sctx->name_cache_list.next,
+				struct name_cache_entry, list);
 		name_cache_delete(sctx, nce);
+		kfree(nce);
 	}
 }
 
+/*
+ * Used by get_cur_path for each ref up to the root.
+ * Returns 0 if it succeeded.
+ * Returns 1 if the inode does not exist or got overwritten. In that case, the
+ * name is an orphan name. This instructs get_cur_path to stop iterating. If 1
+ * is returned, parent_ino/parent_gen are not guaranteed to be valid.
+ * Returns <0 in case of error.
+ */
 static int __get_cur_name_and_parent(struct send_ctx *sctx,
 				     u64 ino, u64 gen,
 				     u64 *parent_ino,
@@ -1832,6 +1895,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
 	struct btrfs_path *path = NULL;
 	struct name_cache_entry *nce = NULL;
 
+	/*
+	 * First check if we already did a call to this function with the same
+	 * ino/gen. If yes, check if the cache entry is still up-to-date. If
+	 * yes, return the cached result.
+	 */
 	nce = name_cache_search(sctx, ino, gen);
 	if (nce) {
 		if (ino < sctx->send_progress && nce->need_later_update) {
@@ -1854,6 +1922,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
 	if (!path)
 		return -ENOMEM;
 
+	/*
+	 * If the inode does not exist yet, add the orphan name and return 1.
+	 * This should only happen for the parent dir that we determine in
+	 * __record_new_ref.
+	 */
 	ret = is_inode_existent(sctx, ino, gen);
 	if (ret < 0)
 		goto out;
@@ -1866,6 +1939,10 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
 		goto out_cache;
 	}
 
+	/*
+	 * Depending on whether the inode was already processed or not, use
+	 * send_root or parent_root for ref lookup.
+	 */
 	if (ino < sctx->send_progress)
 		ret = get_first_ref(sctx, sctx->send_root, ino,
 				parent_ino, parent_gen, dest);
@@ -1875,6 +1952,10 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
 	if (ret < 0)
 		goto out;
 
+	/*
+	 * Check if the ref was overwritten by an inode's ref that was processed
+	 * earlier. If yes, treat as orphan and return 1.
+	 */
 	ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
 			dest->start, dest->end - dest->start);
 	if (ret < 0)
@@ -1888,6 +1969,9 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
 	}
 
 out_cache:
+	/*
+	 * Store the result of the lookup in the name cache.
+	 */
 	nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS);
 	if (!nce) {
 		ret = -ENOMEM;
@@ -1901,7 +1985,6 @@ out_cache:
 	nce->name_len = fs_path_len(dest);
 	nce->ret = ret;
 	strcpy(nce->name, dest->start);
-	memset(&nce->use_list, 0, sizeof(nce->use_list));
 
 	if (ino < sctx->send_progress)
 		nce->need_later_update = 0;
@@ -2107,9 +2190,6 @@ static int send_subvol_begin(struct send_ctx *sctx)
 	read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
 	btrfs_release_path(path);
 
-	if (ret < 0)
-		goto out;
-
 	if (parent_root) {
 		ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
 		if (ret < 0)
@@ -2276,7 +2356,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
 			btrfs_inode_mtime(ii));
 	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb,
 			btrfs_inode_ctime(ii));
-	/* TODO otime? */
+	/* TODO Add otime support when the otime patches get into upstream */
 
 	ret = send_cmd(sctx);
 
@@ -2292,39 +2372,39 @@ out:
  * a valid path yet because we did not process the refs yet. So, the inode
  * is created as orphan.
  */
-static int send_create_inode(struct send_ctx *sctx, struct btrfs_path *path,
-			     struct btrfs_key *key)
+static int send_create_inode(struct send_ctx *sctx, u64 ino)
 {
 	int ret = 0;
-	struct extent_buffer *eb = path->nodes[0];
-	struct btrfs_inode_item *ii;
 	struct fs_path *p;
-	int slot = path->slots[0];
 	int cmd;
+	u64 gen;
 	u64 mode;
+	u64 rdev;
 
-verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino);
+verbose_printk("btrfs: send_create_inode %llu\n", ino);
 
 	p = fs_path_alloc(sctx);
 	if (!p)
 		return -ENOMEM;
 
-	ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
-	mode = btrfs_inode_mode(eb, ii);
+	ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL,
+			NULL, &rdev);
+	if (ret < 0)
+		goto out;
 
-	if (S_ISREG(mode))
+	if (S_ISREG(mode)) {
 		cmd = BTRFS_SEND_C_MKFILE;
-	else if (S_ISDIR(mode))
+	} else if (S_ISDIR(mode)) {
 		cmd = BTRFS_SEND_C_MKDIR;
-	else if (S_ISLNK(mode))
+	} else if (S_ISLNK(mode)) {
 		cmd = BTRFS_SEND_C_SYMLINK;
-	else if (S_ISCHR(mode) || S_ISBLK(mode))
+	} else if (S_ISCHR(mode) || S_ISBLK(mode)) {
 		cmd = BTRFS_SEND_C_MKNOD;
-	else if (S_ISFIFO(mode))
+	} else if (S_ISFIFO(mode)) {
 		cmd = BTRFS_SEND_C_MKFIFO;
-	else if (S_ISSOCK(mode))
+	} else if (S_ISSOCK(mode)) {
 		cmd = BTRFS_SEND_C_MKSOCK;
-	else {
+	} else {
 		printk(KERN_WARNING "btrfs: unexpected inode type %o",
 				(int)(mode & S_IFMT));
 		ret = -ENOTSUPP;
@@ -2335,22 +2415,22 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
 	if (ret < 0)
 		goto out;
 
-	ret = gen_unique_name(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+	ret = gen_unique_name(sctx, ino, gen, p);
 	if (ret < 0)
 		goto out;
 
 	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
-	TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, sctx->cur_ino);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);
 
 	if (S_ISLNK(mode)) {
 		fs_path_reset(p);
-		ret = read_symlink(sctx, sctx->send_root, sctx->cur_ino, p);
+		ret = read_symlink(sctx, sctx->send_root, ino, p);
 		if (ret < 0)
 			goto out;
 		TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
 	} else if (S_ISCHR(mode) || S_ISBLK(mode) ||
 		   S_ISFIFO(mode) || S_ISSOCK(mode)) {
-		TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, btrfs_inode_rdev(eb, ii));
+		TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, rdev);
 	}
 
 	ret = send_cmd(sctx);
@@ -2364,6 +2444,92 @@ out:
 	return ret;
 }
 
+/*
+ * We need some special handling for inodes that get processed before the parent
+ * directory got created. See process_recorded_refs for details.
+ * This function checks if we already created the dir out of order.
+ */
+static int did_create_dir(struct send_ctx *sctx, u64 dir)
+{
+	int ret = 0;
+	struct btrfs_path *path = NULL;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_key di_key;
+	struct extent_buffer *eb;
+	struct btrfs_dir_item *di;
+	int slot;
+
+	path = alloc_path_for_send();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	key.objectid = dir;
+	key.type = BTRFS_DIR_INDEX_KEY;
+	key.offset = 0;
+	while (1) {
+		ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
+				1, 0);
+		if (ret < 0)
+			goto out;
+		if (!ret) {
+			eb = path->nodes[0];
+			slot = path->slots[0];
+			btrfs_item_key_to_cpu(eb, &found_key, slot);
+		}
+		if (ret || found_key.objectid != key.objectid ||
+		    found_key.type != key.type) {
+			ret = 0;
+			goto out;
+		}
+
+		di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+		btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+
+		if (di_key.objectid < sctx->send_progress) {
+			ret = 1;
+			goto out;
+		}
+
+		key.offset = found_key.offset + 1;
+		btrfs_release_path(path);
+	}
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Only creates the inode if it is:
+ * 1. Not a directory
+ * 2. Or a directory which was not created already due to out of order
+ *    directories. See did_create_dir and process_recorded_refs for details.
+ */
+static int send_create_inode_if_needed(struct send_ctx *sctx)
+{
+	int ret;
+
+	if (S_ISDIR(sctx->cur_inode_mode)) {
+		ret = did_create_dir(sctx, sctx->cur_ino);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			ret = 0;
+			goto out;
+		}
+	}
+
+	ret = send_create_inode(sctx, sctx->cur_ino);
+	if (ret < 0)
+		goto out;
+
+out:
+	return ret;
+}
+
 struct recorded_ref {
 	struct list_head list;
 	char *dir_path;
@@ -2416,13 +2582,13 @@ static int record_ref(struct list_head *head, u64 dir,
 static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
 {
 	struct recorded_ref *cur;
-	struct recorded_ref *tmp;
 
-	list_for_each_entry_safe(cur, tmp, head, list) {
+	while (!list_empty(head)) {
+		cur = list_entry(head->next, struct recorded_ref, list);
 		fs_path_free(sctx, cur->full_path);
+		list_del(&cur->list);
 		kfree(cur);
 	}
-	INIT_LIST_HEAD(head);
 }
 
 static void free_recorded_refs(struct send_ctx *sctx)
@@ -2432,7 +2598,7 @@ static void free_recorded_refs(struct send_ctx *sctx)
 }
 
 /*
- * Renames/moves a file/dir to it's orphan name. Used when the first
+ * Renames/moves a file/dir to its orphan name. Used when the first
  * ref of an unprocessed inode gets overwritten and for all non empty
  * directories.
  */
@@ -2472,6 +2638,12 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
 	struct btrfs_key loc;
 	struct btrfs_dir_item *di;
 
+	/*
+	 * Don't try to rmdir the top/root subvolume dir.
+	 */
+	if (dir == BTRFS_FIRST_FREE_OBJECTID)
+		return 0;
+
 	path = alloc_path_for_send();
 	if (!path)
 		return -ENOMEM;
@@ -2513,160 +2685,6 @@ out:
 	return ret;
 }
 
-struct finish_unordered_dir_ctx {
-	struct send_ctx *sctx;
-	struct fs_path *cur_path;
-	struct fs_path *dir_path;
-	u64 dir_ino;
-	int need_delete;
-	int delete_pass;
-};
-
-int __finish_unordered_dir(int num, struct btrfs_key *di_key,
-			   const char *name, int name_len,
-			   const char *data, int data_len,
-			   u8 type, void *ctx)
-{
-	int ret = 0;
-	struct finish_unordered_dir_ctx *fctx = ctx;
-	struct send_ctx *sctx = fctx->sctx;
-	u64 di_gen;
-	u64 di_mode;
-	int is_orphan = 0;
-
-	if (di_key->objectid >= fctx->dir_ino)
-		goto out;
-
-	fs_path_reset(fctx->cur_path);
-
-	ret = get_inode_info(sctx->send_root, di_key->objectid,
-			NULL, &di_gen, &di_mode, NULL, NULL);
-	if (ret < 0)
-		goto out;
-
-	ret = is_first_ref(sctx, sctx->send_root, di_key->objectid,
-			fctx->dir_ino, name, name_len);
-	if (ret < 0)
-		goto out;
-	if (ret) {
-		is_orphan = 1;
-		ret = gen_unique_name(sctx, di_key->objectid, di_gen,
-				fctx->cur_path);
-	} else {
-		ret = get_cur_path(sctx, di_key->objectid, di_gen,
-				fctx->cur_path);
-	}
-	if (ret < 0)
-		goto out;
-
-	ret = fs_path_add(fctx->dir_path, name, name_len);
-	if (ret < 0)
-		goto out;
-
-	if (!fctx->delete_pass) {
-		if (S_ISDIR(di_mode)) {
-			ret = send_rename(sctx, fctx->cur_path,
-					fctx->dir_path);
-		} else {
-			ret = send_link(sctx, fctx->dir_path,
-					fctx->cur_path);
-			if (is_orphan)
-				fctx->need_delete = 1;
-		}
-	} else if (!S_ISDIR(di_mode)) {
-		ret = send_unlink(sctx, fctx->cur_path);
-	} else {
-		ret = 0;
-	}
-
-	fs_path_remove(fctx->dir_path);
-
-out:
-	return ret;
-}
-
-/*
- * Go through all dir items and see if we find refs which could not be created
- * in the past because the dir did not exist at that time.
- */
-static int finish_outoforder_dir(struct send_ctx *sctx, u64 dir, u64 dir_gen)
-{
-	int ret = 0;
-	struct btrfs_path *path = NULL;
-	struct btrfs_key key;
-	struct btrfs_key found_key;
-	struct extent_buffer *eb;
-	struct finish_unordered_dir_ctx fctx;
-	int slot;
-
-	path = alloc_path_for_send();
-	if (!path) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	memset(&fctx, 0, sizeof(fctx));
-	fctx.sctx = sctx;
-	fctx.cur_path = fs_path_alloc(sctx);
-	fctx.dir_path = fs_path_alloc(sctx);
-	if (!fctx.cur_path || !fctx.dir_path) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	fctx.dir_ino = dir;
-
-	ret = get_cur_path(sctx, dir, dir_gen, fctx.dir_path);
-	if (ret < 0)
-		goto out;
-
-	/*
-	 * We do two passes. The first links in the new refs and the second
-	 * deletes orphans if required. Deletion of orphans is not required for
-	 * directory inodes, as we always have only one ref and use rename
-	 * instead of link for those.
-	 */
-
-again:
-	key.objectid = dir;
-	key.type = BTRFS_DIR_ITEM_KEY;
-	key.offset = 0;
-	while (1) {
-		ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
-				1, 0);
-		if (ret < 0)
-			goto out;
-		eb = path->nodes[0];
-		slot = path->slots[0];
-		btrfs_item_key_to_cpu(eb, &found_key, slot);
-
-		if (found_key.objectid != key.objectid ||
-		    found_key.type != key.type) {
-			btrfs_release_path(path);
-			break;
-		}
-
-		ret = iterate_dir_item(sctx, sctx->send_root, path,
-				&found_key, __finish_unordered_dir,
-				&fctx);
-		if (ret < 0)
-			goto out;
-
-		key.offset = found_key.offset + 1;
-		btrfs_release_path(path);
-	}
-
-	if (!fctx.delete_pass && fctx.need_delete) {
-		fctx.delete_pass = 1;
-		goto again;
-	}
-
-out:
-	btrfs_free_path(path);
-	fs_path_free(sctx, fctx.cur_path);
-	fs_path_free(sctx, fctx.dir_path);
-	return ret;
-}
-
 /*
  * This does all the move/link/unlink/rmdir magic.
  */
@@ -2674,6 +2692,7 @@ static int process_recorded_refs(struct send_ctx *sctx)
 {
 	int ret = 0;
 	struct recorded_ref *cur;
+	struct recorded_ref *cur2;
 	struct ulist *check_dirs = NULL;
 	struct ulist_iterator uit;
 	struct ulist_node *un;
@@ -2685,6 +2704,12 @@ static int process_recorded_refs(struct send_ctx *sctx)
 
 verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 
+	/*
+	 * This should never happen as the root dir always has the same ref
+	 * which is always '..'
+	 */
+	BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
+
 	valid_path = fs_path_alloc(sctx);
 	if (!valid_path) {
 		ret = -ENOMEM;
@@ -2731,6 +2756,46 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 
 	list_for_each_entry(cur, &sctx->new_refs, list) {
 		/*
+		 * We may have refs where the parent directory does not exist
+		 * yet. This happens if the parent directory's inum is higher
+		 * than the current inum. To handle this case, we create the
+		 * parent directory out of order. But we need to check if this
+		 * did already happen before due to other refs in the same dir.
+		 */
+		ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+		if (ret < 0)
+			goto out;
+		if (ret == inode_state_will_create) {
+			ret = 0;
+			/*
+			 * First check if any of the current inode's refs did
+			 * already create the dir.
+			 */
+			list_for_each_entry(cur2, &sctx->new_refs, list) {
+				if (cur == cur2)
+					break;
+				if (cur2->dir == cur->dir) {
+					ret = 1;
+					break;
+				}
+			}
+
+			/*
+			 * If that did not happen, check if a previous inode
+			 * did already create the dir.
+			 */
+			if (!ret)
+				ret = did_create_dir(sctx, cur->dir);
+			if (ret < 0)
+				goto out;
+			if (!ret) {
+				ret = send_create_inode(sctx, cur->dir);
+				if (ret < 0)
+					goto out;
+			}
+		}
+
+		/*
 		 * Check if this new ref would overwrite the first ref of
 		 * another unprocessed inode. If yes, orphanize the
 		 * overwritten inode. If we find an overwritten ref that is
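A concrete example of the situation this block handles, with hypothetical inums: inodes are streamed in ascending inum order, so a ref to a parent directory with a higher inum points at a directory that does not exist yet on the receiving side:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t cur_ino = 259;          /* file being processed */
            uint64_t parent_dir_ino = 260;   /* dir created later in the stream */

            if (parent_dir_ino > cur_ino)
                    printf("dir %llu must be created out of order before "
                           "inode %llu can be linked into it\n",
                           (unsigned long long)parent_dir_ino,
                           (unsigned long long)cur_ino);
            return 0;
    }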
@@ -2764,7 +2829,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 		 * inode, move it and update valid_path. If not, link or move
 		 * it depending on the inode mode.
 		 */
-		if (is_orphan && !sctx->cur_inode_first_ref_orphan) {
+		if (is_orphan) {
 			ret = send_rename(sctx, valid_path, cur->full_path);
 			if (ret < 0)
 				goto out;
@@ -2827,6 +2892,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 			if (ret < 0)
 				goto out;
 		}
+	} else if (S_ISDIR(sctx->cur_inode_mode) &&
+		   !list_empty(&sctx->deleted_refs)) {
+		/*
+		 * We have a moved dir. Add the old parent to check_dirs.
+		 */
+		cur = list_entry(sctx->deleted_refs.next, struct recorded_ref,
+				list);
+		ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
+				GFP_NOFS);
+		if (ret < 0)
+			goto out;
 	} else if (!S_ISDIR(sctx->cur_inode_mode)) {
 		/*
 		 * We have a non dir inode. Go through all deleted refs and
@@ -2840,35 +2916,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 			if (ret < 0)
 				goto out;
 			if (!ret) {
-				/*
-				 * In case the inode was moved to a directory
-				 * that was not created yet (see
-				 * __record_new_ref), we can not unlink the ref
-				 * as it will be needed later when the parent
-				 * directory is created, so that we can move in
-				 * the inode to the new dir.
-				 */
-				if (!is_orphan &&
-				    sctx->cur_inode_first_ref_orphan) {
-					ret = orphanize_inode(sctx,
-							sctx->cur_ino,
-							sctx->cur_inode_gen,
-							cur->full_path);
-					if (ret < 0)
-						goto out;
-					ret = gen_unique_name(sctx,
-							sctx->cur_ino,
-							sctx->cur_inode_gen,
-							valid_path);
-					if (ret < 0)
-						goto out;
-					is_orphan = 1;
-
-				} else {
-					ret = send_unlink(sctx, cur->full_path);
-					if (ret < 0)
-						goto out;
-				}
+				ret = send_unlink(sctx, cur->full_path);
+				if (ret < 0)
+					goto out;
 			}
 			ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
 					GFP_NOFS);
@@ -2880,12 +2930,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 	 * If the inode is still orphan, unlink the orphan. This may
 	 * happen when a previous inode did overwrite the first ref
 	 * of this inode and no new refs were added for the current
-	 * inode.
-	 * We can however not delete the orphan in case the inode relies
-	 * in a directory that was not created yet (see
-	 * __record_new_ref)
+	 * inode. Unlinking does not mean that the inode is deleted in
+	 * all cases. There may still be links to this inode in other
+	 * places.
 	 */
-	if (is_orphan && !sctx->cur_inode_first_ref_orphan) {
+	if (is_orphan) {
 		ret = send_unlink(sctx, valid_path);
 		if (ret < 0)
 			goto out;
@@ -2900,6 +2949,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 	 */
 	ULIST_ITER_INIT(&uit);
 	while ((un = ulist_next(check_dirs, &uit))) {
+		/*
+		 * In case we had refs into dirs that were not processed yet,
+		 * we don't need to do the utime and rmdir logic for these dirs.
+		 * The dir will be processed later.
+		 */
 		if (un->val > sctx->cur_ino)
 			continue;
 
@@ -2929,25 +2983,6 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 		}
 	}
 
-	/*
-	 * Current inode is now at it's new position, so we must increase
-	 * send_progress
-	 */
-	sctx->send_progress = sctx->cur_ino + 1;
-
-	/*
-	 * We may have a directory here that has pending refs which could not
-	 * be created before (because the dir did not exist before, see
-	 * __record_new_ref). finish_outoforder_dir will link/move the pending
-	 * refs.
-	 */
-	if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_new) {
-		ret = finish_outoforder_dir(sctx, sctx->cur_ino,
-				sctx->cur_inode_gen);
-		if (ret < 0)
-			goto out;
-	}
-
 	ret = 0;
 
 out:
@@ -2971,34 +3006,9 @@ static int __record_new_ref(int num, u64 dir, int index,
 		return -ENOMEM;
 
 	ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL,
-			NULL);
-	if (ret < 0)
-		goto out;
-
-	/*
-	 * The parent may be non-existent at this point in time. This happens
-	 * if the ino of the parent dir is higher then the current ino. In this
-	 * case, we can not process this ref until the parent dir is finally
-	 * created. If we reach the parent dir later, process_recorded_refs
-	 * will go through all dir items and process the refs that could not be
-	 * processed before. In case this is the first ref, we set
-	 * cur_inode_first_ref_orphan to 1 to inform process_recorded_refs to
-	 * keep an orphan of the inode so that it later can be used for
-	 * link/move
-	 */
-	ret = is_inode_existent(sctx, dir, gen);
+			NULL, NULL);
 	if (ret < 0)
 		goto out;
-	if (!ret) {
-		ret = is_first_ref(sctx, sctx->send_root, sctx->cur_ino, dir,
-				name->start, fs_path_len(name));
-		if (ret < 0)
-			goto out;
-		if (ret)
-			sctx->cur_inode_first_ref_orphan = 1;
-		ret = 0;
-		goto out;
-	}
 
 	ret = get_cur_path(sctx, dir, gen, p);
 	if (ret < 0)
@@ -3029,7 +3039,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
 		return -ENOMEM;
 
 	ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL,
-			NULL);
+			NULL, NULL);
 	if (ret < 0)
 		goto out;
 
@@ -3206,33 +3216,28 @@ static int process_all_refs(struct send_ctx *sctx,
 	key.offset = 0;
 	while (1) {
 		ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
-		if (ret < 0) {
-			btrfs_release_path(path);
+		if (ret < 0)
 			goto out;
-		}
-		if (ret) {
-			btrfs_release_path(path);
+		if (ret)
 			break;
-		}
 
 		eb = path->nodes[0];
 		slot = path->slots[0];
 		btrfs_item_key_to_cpu(eb, &found_key, slot);
 
 		if (found_key.objectid != key.objectid ||
-		    found_key.type != key.type) {
-			btrfs_release_path(path);
+		    found_key.type != key.type)
 			break;
-		}
 
-		ret = iterate_inode_ref(sctx, sctx->parent_root, path,
-				&found_key, 0, cb, sctx);
+		ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb,
+				sctx);
 		btrfs_release_path(path);
 		if (ret < 0)
 			goto out;
 
 		key.offset = found_key.offset + 1;
 	}
+	btrfs_release_path(path);
 
 	ret = process_recorded_refs(sctx);
 
@@ -3555,7 +3560,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
 	int ret = 0;
 	struct fs_path *p;
 	loff_t pos = offset;
-	int readed = 0;
+	int num_read = 0;
 	mm_segment_t old_fs;
 
 	p = fs_path_alloc(sctx);
@@ -3580,8 +3585,8 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
 	ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos);
 	if (ret < 0)
 		goto out;
-	readed = ret;
-	if (!readed)
+	num_read = ret;
+	if (!num_read)
 		goto out;
 
 	ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
@@ -3594,7 +3599,7 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
 
 	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
 	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
-	TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, readed);
+	TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read);
 
 	ret = send_cmd(sctx);
 
@@ -3604,7 +3609,7 @@ out:
 	set_fs(old_fs);
 	if (ret < 0)
 		return ret;
-	return readed;
+	return num_read;
 }
 
 /*
@@ -3615,7 +3620,6 @@ static int send_clone(struct send_ctx *sctx,
 		      struct clone_root *clone_root)
 {
 	int ret = 0;
-	struct btrfs_root *clone_root2 = clone_root->root;
 	struct fs_path *p;
 	u64 gen;
 
@@ -3640,22 +3644,23 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
 	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
 	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
 
-	if (clone_root2 == sctx->send_root) {
+	if (clone_root->root == sctx->send_root) {
 		ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
-				&gen, NULL, NULL, NULL);
+				&gen, NULL, NULL, NULL, NULL);
 		if (ret < 0)
 			goto out;
 		ret = get_cur_path(sctx, clone_root->ino, gen, p);
 	} else {
-		ret = get_inode_path(sctx, clone_root2, clone_root->ino, p);
+		ret = get_inode_path(sctx, clone_root->root,
+				clone_root->ino, p);
 	}
 	if (ret < 0)
 		goto out;
 
 	TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
-			clone_root2->root_item.uuid);
+			clone_root->root->root_item.uuid);
 	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
-			clone_root2->root_item.ctransid);
+			clone_root->root->root_item.ctransid);
 	TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
 	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
 			clone_root->offset);
@@ -3684,10 +3689,17 @@ static int send_write_or_clone(struct send_ctx *sctx,
 	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
 			struct btrfs_file_extent_item);
 	type = btrfs_file_extent_type(path->nodes[0], ei);
-	if (type == BTRFS_FILE_EXTENT_INLINE)
+	if (type == BTRFS_FILE_EXTENT_INLINE) {
 		len = btrfs_file_extent_inline_len(path->nodes[0], ei);
-	else
+		/*
+		 * It is possible the inline item won't cover the whole page,
+		 * but there may be items after this page. Make sure to send
+		 * the whole thing.
+		 */
+		len = PAGE_CACHE_ALIGN(len);
+	} else {
 		len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
+	}
 
 	if (offset + len > sctx->cur_inode_size)
 		len = sctx->cur_inode_size - offset;
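PAGE_CACHE_ALIGN rounds the inline length up to the next page boundary. A stand-in using the usual mask idiom, assuming 4 KiB pages (the kernel macro uses the real page size):

    #include <stdio.h>

    #define DEMO_PAGE_SIZE 4096UL
    #define DEMO_PAGE_ALIGN(x) \
            (((x) + DEMO_PAGE_SIZE - 1) & ~(DEMO_PAGE_SIZE - 1))

    int main(void)
    {
            unsigned long inline_len = 300;  /* inline extents are sub-page */

            printf("%lu -> %lu\n", inline_len, DEMO_PAGE_ALIGN(inline_len));
            return 0;  /* prints 300 -> 4096 */
    }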
@@ -3735,6 +3747,8 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 	u64 left_offset_fixed;
 	u64 left_len;
 	u64 right_len;
+	u64 left_gen;
+	u64 right_gen;
 	u8 left_type;
 	u8 right_type;
 
@@ -3744,17 +3758,17 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 
 	eb = left_path->nodes[0];
 	slot = left_path->slots[0];
-
 	ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
 	left_type = btrfs_file_extent_type(eb, ei);
-	left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
-	left_len = btrfs_file_extent_num_bytes(eb, ei);
-	left_offset = btrfs_file_extent_offset(eb, ei);
 
 	if (left_type != BTRFS_FILE_EXTENT_REG) {
 		ret = 0;
 		goto out;
 	}
+	left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
+	left_len = btrfs_file_extent_num_bytes(eb, ei);
+	left_offset = btrfs_file_extent_offset(eb, ei);
+	left_gen = btrfs_file_extent_generation(eb, ei);
 
 	/*
 	 * Following comments will refer to these graphics. L is the left
@@ -3810,6 +3824,7 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 	right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
 	right_len = btrfs_file_extent_num_bytes(eb, ei);
 	right_offset = btrfs_file_extent_offset(eb, ei);
+	right_gen = btrfs_file_extent_generation(eb, ei);
 
 	if (right_type != BTRFS_FILE_EXTENT_REG) {
 		ret = 0;
@@ -3820,7 +3835,7 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 	 * Are we at extent 8? If yes, we know the extent is changed.
 	 * This may only happen on the first iteration.
 	 */
-	if (found_key.offset + right_len < ekey->offset) {
+	if (found_key.offset + right_len <= ekey->offset) {
 		ret = 0;
 		goto out;
 	}
@@ -3837,8 +3852,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 	/*
 	 * Check if we have the same extent.
 	 */
-	if (left_disknr + left_offset_fixed !=
-	    right_disknr + right_offset) {
+	if (left_disknr != right_disknr ||
+	    left_offset_fixed != right_offset ||
+	    left_gen != right_gen) {
 		ret = 0;
 		goto out;
 	}
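The rewritten check is stricter on purpose: the old sum comparison could call two different extents equal whenever disknr and offset shifted in opposite directions, and it never looked at the generation. A tiny demonstration with made-up byte numbers:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t left_disknr = 4096, left_offset_fixed = 8192;
            uint64_t right_disknr = 8192, right_offset = 4096;

            int old_same = (left_disknr + left_offset_fixed ==
                            right_disknr + right_offset);       /* 1: false match */
            int new_same = (left_disknr == right_disknr &&
                            left_offset_fixed == right_offset); /* 0: correct */

            printf("old=%d new=%d\n", old_same, new_same);
            return 0;
    }

The added generation comparison additionally catches the case where the same disk byte range was freed and reallocated to unrelated data between the two snapshots.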
@@ -3977,6 +3993,15 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)
 		goto out;
 
 	ret = process_recorded_refs(sctx);
+	if (ret < 0)
+		goto out;
+
+	/*
+	 * We have processed the refs and thus need to advance send_progress.
+	 * Now, calls to get_cur_xxx will take the updated refs of the current
+	 * inode into account.
+	 */
+	sctx->send_progress = sctx->cur_ino + 1;
 
 out:
 	return ret;
@@ -4004,7 +4029,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
 		goto out;
 
 	ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
-			&left_mode, &left_uid, &left_gid);
+			&left_mode, &left_uid, &left_gid, NULL);
 	if (ret < 0)
 		goto out;
 
@@ -4015,7 +4040,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
 	} else {
 		ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
 				NULL, NULL, &right_mode, &right_uid,
-				&right_gid);
+				&right_gid, NULL);
 		if (ret < 0)
 			goto out;
 
@@ -4074,7 +4099,12 @@ static int changed_inode(struct send_ctx *sctx,
 
 	sctx->cur_ino = key->objectid;
 	sctx->cur_inode_new_gen = 0;
-	sctx->cur_inode_first_ref_orphan = 0;
+
+	/*
+	 * Set send_progress to current inode. This will tell all get_cur_xxx
+	 * functions that the current inode's refs are not updated yet. Later,
+	 * when process_recorded_refs is finished, it is set to cur_ino + 1.
+	 */
 	sctx->send_progress = sctx->cur_ino;
 
 	if (result == BTRFS_COMPARE_TREE_NEW ||
@@ -4098,7 +4128,14 @@ static int changed_inode(struct send_ctx *sctx,
4098 4128
4099 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0], 4129 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
4100 right_ii); 4130 right_ii);
4101 if (left_gen != right_gen) 4131
4132 /*
4133 * The cur_ino = root dir case is special here. We can't treat
4134 * the inode as deleted+reused because it would generate a
4135 * stream that tries to delete/mkdir the root dir.
4136 */
4137 if (left_gen != right_gen &&
4138 sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
4102 sctx->cur_inode_new_gen = 1; 4139 sctx->cur_inode_new_gen = 1;
4103 } 4140 }
4104 4141
@@ -4111,8 +4148,7 @@ static int changed_inode(struct send_ctx *sctx,
4111 sctx->cur_inode_mode = btrfs_inode_mode( 4148 sctx->cur_inode_mode = btrfs_inode_mode(
4112 sctx->left_path->nodes[0], left_ii); 4149 sctx->left_path->nodes[0], left_ii);
4113 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) 4150 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
4114 ret = send_create_inode(sctx, sctx->left_path, 4151 ret = send_create_inode_if_needed(sctx);
4115 sctx->cmp_key);
4116 } else if (result == BTRFS_COMPARE_TREE_DELETED) { 4152 } else if (result == BTRFS_COMPARE_TREE_DELETED) {
4117 sctx->cur_inode_gen = right_gen; 4153 sctx->cur_inode_gen = right_gen;
4118 sctx->cur_inode_new = 0; 4154 sctx->cur_inode_new = 0;
@@ -4122,7 +4158,17 @@ static int changed_inode(struct send_ctx *sctx,
4122 sctx->cur_inode_mode = btrfs_inode_mode( 4158 sctx->cur_inode_mode = btrfs_inode_mode(
4123 sctx->right_path->nodes[0], right_ii); 4159 sctx->right_path->nodes[0], right_ii);
4124 } else if (result == BTRFS_COMPARE_TREE_CHANGED) { 4160 } else if (result == BTRFS_COMPARE_TREE_CHANGED) {
4161 /*
4162 * We need to do some special handling in case the inode was
4163 * reported as changed with a changed generation number. This
4164 * means that the original inode was deleted and a new inode
4165 * reused the same inum. So we have to treat the old inode as
4166 * deleted and the new one as new.
4167 */
4125 if (sctx->cur_inode_new_gen) { 4168 if (sctx->cur_inode_new_gen) {
4169 /*
4170 * First, process the inode as if it was deleted.
4171 */
4126 sctx->cur_inode_gen = right_gen; 4172 sctx->cur_inode_gen = right_gen;
4127 sctx->cur_inode_new = 0; 4173 sctx->cur_inode_new = 0;
4128 sctx->cur_inode_deleted = 1; 4174 sctx->cur_inode_deleted = 1;
@@ -4135,6 +4181,9 @@ static int changed_inode(struct send_ctx *sctx,
4135 if (ret < 0) 4181 if (ret < 0)
4136 goto out; 4182 goto out;
4137 4183
4184 /*
4185 * Now process the inode as if it was new.
4186 */
4138 sctx->cur_inode_gen = left_gen; 4187 sctx->cur_inode_gen = left_gen;
4139 sctx->cur_inode_new = 1; 4188 sctx->cur_inode_new = 1;
4140 sctx->cur_inode_deleted = 0; 4189 sctx->cur_inode_deleted = 0;
@@ -4142,14 +4191,23 @@ static int changed_inode(struct send_ctx *sctx,
4142 sctx->left_path->nodes[0], left_ii); 4191 sctx->left_path->nodes[0], left_ii);
4143 sctx->cur_inode_mode = btrfs_inode_mode( 4192 sctx->cur_inode_mode = btrfs_inode_mode(
4144 sctx->left_path->nodes[0], left_ii); 4193 sctx->left_path->nodes[0], left_ii);
4145 ret = send_create_inode(sctx, sctx->left_path, 4194 ret = send_create_inode_if_needed(sctx);
4146 sctx->cmp_key);
4147 if (ret < 0) 4195 if (ret < 0)
4148 goto out; 4196 goto out;
4149 4197
4150 ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW); 4198 ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
4151 if (ret < 0) 4199 if (ret < 0)
4152 goto out; 4200 goto out;
4201 /*
4202 * Advance send_progress now as we did not get into
4203 * process_recorded_refs_if_needed in the new_gen case.
4204 */
4205 sctx->send_progress = sctx->cur_ino + 1;
4206
4207 /*
4208 * Now process all extents and xattrs of the inode as if
4209 * they were all new.
4210 */
4153 ret = process_all_extents(sctx); 4211 ret = process_all_extents(sctx);
4154 if (ret < 0) 4212 if (ret < 0)
4155 goto out; 4213 goto out;
@@ -4172,6 +4230,16 @@ out:
4172 return ret; 4230 return ret;
4173} 4231}
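
The new_gen branch above is easiest to read as a fixed pipeline: treat the old inode's refs as deletions, create the new inode, treat its refs as additions, advance send_progress past the inode (process_recorded_refs_if_needed is skipped on this path), then replay every extent and xattr as new. A compilable sketch of that ordering; the helper names are stand-ins, not the kernel functions:

    #include <stdint.h>

    struct sctx_sketch { uint64_t cur_ino, send_progress; };

    static int refs_as_deleted(struct sctx_sketch *s)    { (void)s; return 0; }
    static int create_if_needed(struct sctx_sketch *s)   { (void)s; return 0; }
    static int refs_as_new(struct sctx_sketch *s)        { (void)s; return 0; }
    static int extents_and_xattrs(struct sctx_sketch *s) { (void)s; return 0; }

    static int handle_reused_inum(struct sctx_sketch *s)
    {
        int ret;

        if ((ret = refs_as_deleted(s)) < 0)   /* old names disappear */
            return ret;
        if ((ret = create_if_needed(s)) < 0)  /* new inode appears */
            return ret;
        if ((ret = refs_as_new(s)) < 0)       /* new names appear */
            return ret;
        s->send_progress = s->cur_ino + 1;    /* refs now up to date */
        return extents_and_xattrs(s);         /* replay data as all-new */
    }
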
4174 4232
4233/*
4234 * We have to process new refs before deleted refs, but compare_trees gives us
4235 * the new and deleted refs mixed. To fix this, we record the new/deleted refs
4236 * first and later process them in process_recorded_refs.
4237 * For the cur_inode_new_gen case, we skip recording completely because
4238 * changed_inode already initiated processing of refs. The reason is
4239 * that in this case, compare_tree actually compares the refs of 2 different
4240 * inodes. To fix this, process_all_refs is used in changed_inode to handle all
4241 * refs of the right tree as deleted and all refs of the left tree as new.
4242 */
4175static int changed_ref(struct send_ctx *sctx, 4243static int changed_ref(struct send_ctx *sctx,
4176 enum btrfs_compare_tree_result result) 4244 enum btrfs_compare_tree_result result)
4177{ 4245{
@@ -4192,6 +4260,11 @@ static int changed_ref(struct send_ctx *sctx,
4192 return ret; 4260 return ret;
4193} 4261}
4194 4262
4263/*
4264 * Process new/deleted/changed xattrs. We skip processing in the
4265 * cur_inode_new_gen case because changed_inode already initiated processing
4266 * of xattrs. The reason is the same as in changed_ref.
4267 */
4195static int changed_xattr(struct send_ctx *sctx, 4268static int changed_xattr(struct send_ctx *sctx,
4196 enum btrfs_compare_tree_result result) 4269 enum btrfs_compare_tree_result result)
4197{ 4270{
@@ -4211,6 +4284,11 @@ static int changed_xattr(struct send_ctx *sctx,
4211 return ret; 4284 return ret;
4212} 4285}
4213 4286
4287/*
4288 * Process new/deleted/changed extents. We skip processing in the
4289 * cur_inode_new_gen case because changed_inode already initiated processing
4290 * of extents. The reason is the same as in changed_ref.
4291 */
4214static int changed_extent(struct send_ctx *sctx, 4292static int changed_extent(struct send_ctx *sctx,
4215 enum btrfs_compare_tree_result result) 4293 enum btrfs_compare_tree_result result)
4216{ 4294{
@@ -4227,7 +4305,10 @@ static int changed_extent(struct send_ctx *sctx,
4227 return ret; 4305 return ret;
4228} 4306}
4229 4307
4230 4308/*
4309 * Updates compare-related fields in sctx and simply forwards to the actual
4310 * changed_xxx functions.
4311 */
4231static int changed_cb(struct btrfs_root *left_root, 4312static int changed_cb(struct btrfs_root *left_root,
4232 struct btrfs_root *right_root, 4313 struct btrfs_root *right_root,
4233 struct btrfs_path *left_path, 4314 struct btrfs_path *left_path,
@@ -4247,6 +4328,11 @@ static int changed_cb(struct btrfs_root *left_root,
4247 if (ret < 0) 4328 if (ret < 0)
4248 goto out; 4329 goto out;
4249 4330
4331 /* Ignore non-FS objects */
4332 if (key->objectid == BTRFS_FREE_INO_OBJECTID ||
4333 key->objectid == BTRFS_FREE_SPACE_OBJECTID)
4334 goto out;
4335
4250 if (key->type == BTRFS_INODE_ITEM_KEY) 4336 if (key->type == BTRFS_INODE_ITEM_KEY)
4251 ret = changed_inode(sctx, result); 4337 ret = changed_inode(sctx, result);
4252 else if (key->type == BTRFS_INODE_REF_KEY) 4338 else if (key->type == BTRFS_INODE_REF_KEY)
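
changed_cb() is now a thin router: drop the free-inode and free-space cache objects first (they live in the FS tree but are rebuildable caches, so the send stream must not carry them), then dispatch on the key type. A stand-alone sketch with placeholder key values and stub handlers:

    #include <stdint.h>

    #define KEY_INODE_ITEM 1
    #define KEY_INODE_REF  2   /* placeholders, not the on-disk values */

    static int on_inode(void) { return 0; }
    static int on_ref(void)   { return 0; }
    static int on_other(void) { return 0; }

    static int changed_cb_sketch(uint64_t objectid, int key_type,
                                 uint64_t free_ino_id, uint64_t free_space_id)
    {
        /* Cache objects are skipped entirely, whatever their key type. */
        if (objectid == free_ino_id || objectid == free_space_id)
            return 0;

        if (key_type == KEY_INODE_ITEM)
            return on_inode();
        if (key_type == KEY_INODE_REF)
            return on_ref();
        return on_other();
    }
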
@@ -4299,7 +4385,8 @@ join_trans:
4299 } 4385 }
4300 4386
4301 /* 4387 /*
4302 * Make sure the tree has not changed 4388 * Make sure the tree has not changed after re-joining. We detect this
4389 * by comparing start_ctransid and ctransid. They should always match.
4303 */ 4390 */
4304 spin_lock(&send_root->root_times_lock); 4391 spin_lock(&send_root->root_times_lock);
4305 ctransid = btrfs_root_ctransid(&send_root->root_item); 4392 ctransid = btrfs_root_ctransid(&send_root->root_item);
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 9934e948e57f..1bf4f32fd4ef 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -130,4 +130,5 @@ enum {
130 130
131#ifdef __KERNEL__ 131#ifdef __KERNEL__
132long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); 132long btrfs_ioctl_send(struct file *mnt_file, void __user *arg);
133int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off);
133#endif 134#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 83d6f9f9c220..915ac14c2064 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -243,12 +243,18 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
243 struct btrfs_root *root, const char *function, 243 struct btrfs_root *root, const char *function,
244 unsigned int line, int errno) 244 unsigned int line, int errno)
245{ 245{
246 WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted"); 246 WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted\n");
247 trans->aborted = errno; 247 trans->aborted = errno;
248 /* Nothing used. The other threads that have joined this 248 /* Nothing used. The other threads that have joined this
249 * transaction may be able to continue. */ 249 * transaction may be able to continue. */
250 if (!trans->blocks_used) { 250 if (!trans->blocks_used) {
251 btrfs_printk(root->fs_info, "Aborting unused transaction.\n"); 251 char nbuf[16];
252 const char *errstr;
253
254 errstr = btrfs_decode_error(root->fs_info, errno, nbuf);
255 btrfs_printk(root->fs_info,
256 "%s:%d: Aborting unused transaction(%s).\n",
257 function, line, errstr);
252 return; 258 return;
253 } 259 }
254 trans->transaction->aborted = errno; 260 trans->transaction->aborted = errno;
@@ -407,7 +413,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
407 btrfs_set_opt(info->mount_opt, NODATASUM); 413 btrfs_set_opt(info->mount_opt, NODATASUM);
408 break; 414 break;
409 case Opt_nodatacow: 415 case Opt_nodatacow:
410 printk(KERN_INFO "btrfs: setting nodatacow\n"); 416 if (!btrfs_test_opt(root, COMPRESS) ||
417 !btrfs_test_opt(root, FORCE_COMPRESS)) {
418 printk(KERN_INFO "btrfs: setting nodatacow, compression disabled\n");
419 } else {
420 printk(KERN_INFO "btrfs: setting nodatacow\n");
421 }
422 info->compress_type = BTRFS_COMPRESS_NONE;
423 btrfs_clear_opt(info->mount_opt, COMPRESS);
424 btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
411 btrfs_set_opt(info->mount_opt, NODATACOW); 425 btrfs_set_opt(info->mount_opt, NODATACOW);
412 btrfs_set_opt(info->mount_opt, NODATASUM); 426 btrfs_set_opt(info->mount_opt, NODATASUM);
413 break; 427 break;
@@ -422,10 +436,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
422 compress_type = "zlib"; 436 compress_type = "zlib";
423 info->compress_type = BTRFS_COMPRESS_ZLIB; 437 info->compress_type = BTRFS_COMPRESS_ZLIB;
424 btrfs_set_opt(info->mount_opt, COMPRESS); 438 btrfs_set_opt(info->mount_opt, COMPRESS);
439 btrfs_clear_opt(info->mount_opt, NODATACOW);
440 btrfs_clear_opt(info->mount_opt, NODATASUM);
425 } else if (strcmp(args[0].from, "lzo") == 0) { 441 } else if (strcmp(args[0].from, "lzo") == 0) {
426 compress_type = "lzo"; 442 compress_type = "lzo";
427 info->compress_type = BTRFS_COMPRESS_LZO; 443 info->compress_type = BTRFS_COMPRESS_LZO;
428 btrfs_set_opt(info->mount_opt, COMPRESS); 444 btrfs_set_opt(info->mount_opt, COMPRESS);
445 btrfs_clear_opt(info->mount_opt, NODATACOW);
446 btrfs_clear_opt(info->mount_opt, NODATASUM);
429 btrfs_set_fs_incompat(info, COMPRESS_LZO); 447 btrfs_set_fs_incompat(info, COMPRESS_LZO);
430 } else if (strncmp(args[0].from, "no", 2) == 0) { 448 } else if (strncmp(args[0].from, "no", 2) == 0) {
431 compress_type = "no"; 449 compress_type = "no";
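
These two hunks make nodatacow and compression mutually exclusive in both directions: choosing nodatacow clears COMPRESS/FORCE_COMPRESS and resets the compress type, while choosing zlib or lzo clears NODATACOW and NODATASUM. The invariant in miniature, with made-up flag bits:

    #define OPT_COMPRESS        (1u << 0)
    #define OPT_FORCE_COMPRESS  (1u << 1)
    #define OPT_NODATACOW       (1u << 2)
    #define OPT_NODATASUM       (1u << 3)

    /* Each setter clears the other side's bits, as the hunks above do. */
    static unsigned set_nodatacow(unsigned opts)
    {
        opts &= ~(OPT_COMPRESS | OPT_FORCE_COMPRESS);
        return opts | OPT_NODATACOW | OPT_NODATASUM;
    }

    static unsigned set_compress(unsigned opts)
    {
        opts &= ~(OPT_NODATACOW | OPT_NODATASUM);
        return opts | OPT_COMPRESS;
    }
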
@@ -543,11 +561,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
543 btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); 561 btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
544 break; 562 break;
545 case Opt_defrag: 563 case Opt_defrag:
546 printk(KERN_INFO "btrfs: enabling auto defrag"); 564 printk(KERN_INFO "btrfs: enabling auto defrag\n");
547 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); 565 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
548 break; 566 break;
549 case Opt_recovery: 567 case Opt_recovery:
550 printk(KERN_INFO "btrfs: enabling auto recovery"); 568 printk(KERN_INFO "btrfs: enabling auto recovery\n");
551 btrfs_set_opt(info->mount_opt, RECOVERY); 569 btrfs_set_opt(info->mount_opt, RECOVERY);
552 break; 570 break;
553 case Opt_skip_balance: 571 case Opt_skip_balance:
@@ -846,18 +864,15 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
846 return 0; 864 return 0;
847 } 865 }
848 866
849 btrfs_wait_ordered_extents(root, 0, 0); 867 btrfs_wait_ordered_extents(root, 0);
850
851 spin_lock(&fs_info->trans_lock);
852 if (!fs_info->running_transaction) {
853 spin_unlock(&fs_info->trans_lock);
854 return 0;
855 }
856 spin_unlock(&fs_info->trans_lock);
857 868
858 trans = btrfs_join_transaction(root); 869 trans = btrfs_attach_transaction(root);
859 if (IS_ERR(trans)) 870 if (IS_ERR(trans)) {
871 /* no transaction, don't bother */
872 if (PTR_ERR(trans) == -ENOENT)
873 return 0;
860 return PTR_ERR(trans); 874 return PTR_ERR(trans);
875 }
861 return btrfs_commit_transaction(trans, root); 876 return btrfs_commit_transaction(trans, root);
862} 877}
863 878
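
btrfs_sync_fs() used to peek at fs_info->running_transaction under trans_lock and then join; with btrfs_attach_transaction() the "is there a transaction at all?" check and the join collapse into one call, with -ENOENT meaning there is nothing to commit. The btrfs_freeze() hunk below adopts the same pattern. A self-contained sketch of attach-then-commit, with stand-in types and a stub that always reports no transaction:

    #include <errno.h>
    #include <stddef.h>

    struct trans { int unused; };

    /* Stub: pretend no transaction is currently running. */
    static struct trans *attach_transaction(int *err)
    {
        *err = -ENOENT;
        return NULL;
    }

    static int commit_transaction(struct trans *t) { (void)t; return 0; }

    /* sync/freeze must not *create* a transaction just to commit it;
     * they only attach to one that already exists. */
    static int sync_fs_sketch(void)
    {
        int err = 0;
        struct trans *t = attach_transaction(&err);

        if (!t)
            return err == -ENOENT ? 0 : err;  /* nothing to commit */
        return commit_transaction(t);
    }
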
@@ -1508,17 +1523,21 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
1508 1523
1509static int btrfs_freeze(struct super_block *sb) 1524static int btrfs_freeze(struct super_block *sb)
1510{ 1525{
1511 struct btrfs_fs_info *fs_info = btrfs_sb(sb); 1526 struct btrfs_trans_handle *trans;
1512 mutex_lock(&fs_info->transaction_kthread_mutex); 1527 struct btrfs_root *root = btrfs_sb(sb)->tree_root;
1513 mutex_lock(&fs_info->cleaner_mutex); 1528
1514 return 0; 1529 trans = btrfs_attach_transaction(root);
1530 if (IS_ERR(trans)) {
1531 /* no transaction, don't bother */
1532 if (PTR_ERR(trans) == -ENOENT)
1533 return 0;
1534 return PTR_ERR(trans);
1535 }
1536 return btrfs_commit_transaction(trans, root);
1515} 1537}
1516 1538
1517static int btrfs_unfreeze(struct super_block *sb) 1539static int btrfs_unfreeze(struct super_block *sb)
1518{ 1540{
1519 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1520 mutex_unlock(&fs_info->cleaner_mutex);
1521 mutex_unlock(&fs_info->transaction_kthread_mutex);
1522 return 0; 1541 return 0;
1523} 1542}
1524 1543
@@ -1595,7 +1614,7 @@ static int btrfs_interface_init(void)
1595static void btrfs_interface_exit(void) 1614static void btrfs_interface_exit(void)
1596{ 1615{
1597 if (misc_deregister(&btrfs_misc) < 0) 1616 if (misc_deregister(&btrfs_misc) < 0)
1598 printk(KERN_INFO "misc_deregister failed for control device"); 1617 printk(KERN_INFO "btrfs: misc_deregister failed for control device\n");
1599} 1618}
1600 1619
1601static int __init init_btrfs_fs(void) 1620static int __init init_btrfs_fs(void)
@@ -1620,10 +1639,14 @@ static int __init init_btrfs_fs(void)
1620 if (err) 1639 if (err)
1621 goto free_extent_io; 1640 goto free_extent_io;
1622 1641
1623 err = btrfs_delayed_inode_init(); 1642 err = ordered_data_init();
1624 if (err) 1643 if (err)
1625 goto free_extent_map; 1644 goto free_extent_map;
1626 1645
1646 err = btrfs_delayed_inode_init();
1647 if (err)
1648 goto free_ordered_data;
1649
1627 err = btrfs_interface_init(); 1650 err = btrfs_interface_init();
1628 if (err) 1651 if (err)
1629 goto free_delayed_inode; 1652 goto free_delayed_inode;
@@ -1641,6 +1664,8 @@ unregister_ioctl:
1641 btrfs_interface_exit(); 1664 btrfs_interface_exit();
1642free_delayed_inode: 1665free_delayed_inode:
1643 btrfs_delayed_inode_exit(); 1666 btrfs_delayed_inode_exit();
1667free_ordered_data:
1668 ordered_data_exit();
1644free_extent_map: 1669free_extent_map:
1645 extent_map_exit(); 1670 extent_map_exit();
1646free_extent_io: 1671free_extent_io:
@@ -1657,6 +1682,7 @@ static void __exit exit_btrfs_fs(void)
1657{ 1682{
1658 btrfs_destroy_cachep(); 1683 btrfs_destroy_cachep();
1659 btrfs_delayed_inode_exit(); 1684 btrfs_delayed_inode_exit();
1685 ordered_data_exit();
1660 extent_map_exit(); 1686 extent_map_exit();
1661 extent_io_exit(); 1687 extent_io_exit();
1662 btrfs_interface_exit(); 1688 btrfs_interface_exit();
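
Slotting ordered_data_init() into init_btrfs_fs() follows the usual goto ladder: every fallible init step gets a label that unwinds, in reverse order, exactly the steps that already succeeded, and module exit tears down in the same reverse order. The pattern in userspace form, with the middle step playing the role of the newly inserted one:

    static int  init_a(void) { return 0; }
    static int  init_b(void) { return 0; }   /* the inserted step */
    static int  init_c(void) { return 0; }
    static void exit_a(void) { }
    static void exit_b(void) { }

    static int init_all(void)
    {
        int err;

        if ((err = init_a()))
            return err;
        if ((err = init_b()))
            goto undo_a;
        if ((err = init_c()))
            goto undo_b;
        return 0;

    undo_b:
        exit_b();
    undo_a:
        exit_a();
        return err;
    }
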
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 27c26004e050..77db875b5116 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -53,7 +53,7 @@ static noinline void switch_commit_root(struct btrfs_root *root)
53/* 53/*
54 * either allocate a new transaction or hop into the existing one 54 * either allocate a new transaction or hop into the existing one
55 */ 55 */
56static noinline int join_transaction(struct btrfs_root *root, int nofail) 56static noinline int join_transaction(struct btrfs_root *root, int type)
57{ 57{
58 struct btrfs_transaction *cur_trans; 58 struct btrfs_transaction *cur_trans;
59 struct btrfs_fs_info *fs_info = root->fs_info; 59 struct btrfs_fs_info *fs_info = root->fs_info;
@@ -67,7 +67,13 @@ loop:
67 } 67 }
68 68
69 if (fs_info->trans_no_join) { 69 if (fs_info->trans_no_join) {
70 if (!nofail) { 70 /*
71 * If we are JOIN_NOLOCK we're already committing a current
72 * transaction; we just need a handle to deal with something
73 * when committing the transaction, such as inode cache and
74 * space cache. It is a special case.
75 */
76 if (type != TRANS_JOIN_NOLOCK) {
71 spin_unlock(&fs_info->trans_lock); 77 spin_unlock(&fs_info->trans_lock);
72 return -EBUSY; 78 return -EBUSY;
73 } 79 }
@@ -87,6 +93,13 @@ loop:
87 } 93 }
88 spin_unlock(&fs_info->trans_lock); 94 spin_unlock(&fs_info->trans_lock);
89 95
96 /*
97 * If we are ATTACH, we just want to catch the current transaction
98 * and commit it. If there is no transaction, just return ENOENT.
99 */
100 if (type == TRANS_ATTACH)
101 return -ENOENT;
102
90 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); 103 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
91 if (!cur_trans) 104 if (!cur_trans)
92 return -ENOMEM; 105 return -ENOMEM;
@@ -267,13 +280,6 @@ static void wait_current_trans(struct btrfs_root *root)
267 } 280 }
268} 281}
269 282
270enum btrfs_trans_type {
271 TRANS_START,
272 TRANS_JOIN,
273 TRANS_USERSPACE,
274 TRANS_JOIN_NOLOCK,
275};
276
277static int may_wait_transaction(struct btrfs_root *root, int type) 283static int may_wait_transaction(struct btrfs_root *root, int type)
278{ 284{
279 if (root->fs_info->log_root_recovering) 285 if (root->fs_info->log_root_recovering)
@@ -290,7 +296,8 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
290} 296}
291 297
292static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 298static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
293 u64 num_items, int type) 299 u64 num_items, int type,
300 int noflush)
294{ 301{
295 struct btrfs_trans_handle *h; 302 struct btrfs_trans_handle *h;
296 struct btrfs_transaction *cur_trans; 303 struct btrfs_transaction *cur_trans;
@@ -324,9 +331,14 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
324 } 331 }
325 332
326 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 333 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
327 ret = btrfs_block_rsv_add(root, 334 if (noflush)
328 &root->fs_info->trans_block_rsv, 335 ret = btrfs_block_rsv_add_noflush(root,
329 num_bytes); 336 &root->fs_info->trans_block_rsv,
337 num_bytes);
338 else
339 ret = btrfs_block_rsv_add(root,
340 &root->fs_info->trans_block_rsv,
341 num_bytes);
330 if (ret) 342 if (ret)
331 return ERR_PTR(ret); 343 return ERR_PTR(ret);
332 } 344 }
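
start_transaction() now threads a noflush flag down to the metadata reservation: some callers must not let the reservation recurse into delalloc flushing (for instance because they already hold locks the flusher could need) and accept a higher chance of -ENOSPC instead. A tiny sketch of that caller-selected flavor; the reserve helpers are hypothetical:

    static int reserve_bytes(unsigned long n)         { (void)n; return 0; }
    static int reserve_bytes_noflush(unsigned long n) { (void)n; return 0; }

    static int reserve_trans_metadata(unsigned long num_bytes, int noflush)
    {
        /* noflush: never trigger writeback while reserving */
        return noflush ? reserve_bytes_noflush(num_bytes)
                       : reserve_bytes(num_bytes);
    }
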
@@ -335,19 +347,34 @@ again:
335 if (!h) 347 if (!h)
336 return ERR_PTR(-ENOMEM); 348 return ERR_PTR(-ENOMEM);
337 349
338 sb_start_intwrite(root->fs_info->sb); 350 /*
351 * If we are JOIN_NOLOCK we're already committing a transaction and
352 * waiting on this guy, so we don't need to do the sb_start_intwrite
353 * because we're already holding a ref. We need this because we could
354 * have raced in and done an fsync() on a file, which can kick off a commit
355 * and then we deadlock with somebody doing a freeze.
356 *
357 * If we are ATTACH, it means we just want to catch the current
358 * transaction and commit it, so we needn't do sb_start_intwrite().
359 */
360 if (type < TRANS_JOIN_NOLOCK)
361 sb_start_intwrite(root->fs_info->sb);
339 362
340 if (may_wait_transaction(root, type)) 363 if (may_wait_transaction(root, type))
341 wait_current_trans(root); 364 wait_current_trans(root);
342 365
343 do { 366 do {
344 ret = join_transaction(root, type == TRANS_JOIN_NOLOCK); 367 ret = join_transaction(root, type);
345 if (ret == -EBUSY) 368 if (ret == -EBUSY)
346 wait_current_trans(root); 369 wait_current_trans(root);
347 } while (ret == -EBUSY); 370 } while (ret == -EBUSY);
348 371
349 if (ret < 0) { 372 if (ret < 0) {
350 sb_end_intwrite(root->fs_info->sb); 373 /* We must get the transaction if we are JOIN_NOLOCK. */
374 BUG_ON(type == TRANS_JOIN_NOLOCK);
375
376 if (type < TRANS_JOIN_NOLOCK)
377 sb_end_intwrite(root->fs_info->sb);
351 kmem_cache_free(btrfs_trans_handle_cachep, h); 378 kmem_cache_free(btrfs_trans_handle_cachep, h);
352 return ERR_PTR(ret); 379 return ERR_PTR(ret);
353 } 380 }
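
The join loop above treats -EBUSY as "a commit is blocking new joiners right now": wait for the running transaction and retry, while ATTACH never loops because join_transaction() returns -ENOENT for it instead. The retry discipline reduced to a stub:

    #include <errno.h>

    static int tries;

    /* Stub: pretend the first two attempts race with a commit. */
    static int try_join(void)            { return tries++ < 2 ? -EBUSY : 0; }
    static void wait_current_trans(void) { /* would block in the kernel */ }

    static int join_with_retry(void)
    {
        int ret;

        do {
            ret = try_join();
            if (ret == -EBUSY)
                wait_current_trans();
        } while (ret == -EBUSY);
        return ret;    /* 0, or a hard error such as -ENOENT */
    }
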
@@ -367,7 +394,9 @@ again:
367 h->aborted = 0; 394 h->aborted = 0;
368 h->qgroup_reserved = qgroup_reserved; 395 h->qgroup_reserved = qgroup_reserved;
369 h->delayed_ref_elem.seq = 0; 396 h->delayed_ref_elem.seq = 0;
397 h->type = type;
370 INIT_LIST_HEAD(&h->qgroup_ref_list); 398 INIT_LIST_HEAD(&h->qgroup_ref_list);
399 INIT_LIST_HEAD(&h->new_bgs);
371 400
372 smp_mb(); 401 smp_mb();
373 if (cur_trans->blocked && may_wait_transaction(root, type)) { 402 if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -393,21 +422,33 @@ got_it:
393struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 422struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
394 int num_items) 423 int num_items)
395{ 424{
396 return start_transaction(root, num_items, TRANS_START); 425 return start_transaction(root, num_items, TRANS_START, 0);
426}
427
428struct btrfs_trans_handle *btrfs_start_transaction_noflush(
429 struct btrfs_root *root, int num_items)
430{
431 return start_transaction(root, num_items, TRANS_START, 1);
397} 432}
433
398struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) 434struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
399{ 435{
400 return start_transaction(root, 0, TRANS_JOIN); 436 return start_transaction(root, 0, TRANS_JOIN, 0);
401} 437}
402 438
403struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root) 439struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
404{ 440{
405 return start_transaction(root, 0, TRANS_JOIN_NOLOCK); 441 return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0);
406} 442}
407 443
408struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root) 444struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
409{ 445{
410 return start_transaction(root, 0, TRANS_USERSPACE); 446 return start_transaction(root, 0, TRANS_USERSPACE, 0);
447}
448
449struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
450{
451 return start_transaction(root, 0, TRANS_ATTACH, 0);
411} 452}
412 453
413/* wait for a transaction commit to be fully complete */ 454/* wait for a transaction commit to be fully complete */
@@ -506,11 +547,12 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
506} 547}
507 548
508static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, 549static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
509 struct btrfs_root *root, int throttle, int lock) 550 struct btrfs_root *root, int throttle)
510{ 551{
511 struct btrfs_transaction *cur_trans = trans->transaction; 552 struct btrfs_transaction *cur_trans = trans->transaction;
512 struct btrfs_fs_info *info = root->fs_info; 553 struct btrfs_fs_info *info = root->fs_info;
513 int count = 0; 554 int count = 0;
555 int lock = (trans->type != TRANS_JOIN_NOLOCK);
514 int err = 0; 556 int err = 0;
515 557
516 if (--trans->use_count) { 558 if (--trans->use_count) {
@@ -536,6 +578,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
536 trans->qgroup_reserved = 0; 578 trans->qgroup_reserved = 0;
537 } 579 }
538 580
581 if (!list_empty(&trans->new_bgs))
582 btrfs_create_pending_block_groups(trans, root);
583
539 while (count < 2) { 584 while (count < 2) {
540 unsigned long cur = trans->delayed_ref_updates; 585 unsigned long cur = trans->delayed_ref_updates;
541 trans->delayed_ref_updates = 0; 586 trans->delayed_ref_updates = 0;
@@ -551,7 +596,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
551 btrfs_trans_release_metadata(trans, root); 596 btrfs_trans_release_metadata(trans, root);
552 trans->block_rsv = NULL; 597 trans->block_rsv = NULL;
553 598
554 sb_end_intwrite(root->fs_info->sb); 599 if (!list_empty(&trans->new_bgs))
600 btrfs_create_pending_block_groups(trans, root);
555 601
556 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 602 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
557 should_end_transaction(trans, root)) { 603 should_end_transaction(trans, root)) {
@@ -573,6 +619,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
573 } 619 }
574 } 620 }
575 621
622 if (trans->type < TRANS_JOIN_NOLOCK)
623 sb_end_intwrite(root->fs_info->sb);
624
576 WARN_ON(cur_trans != info->running_transaction); 625 WARN_ON(cur_trans != info->running_transaction);
577 WARN_ON(atomic_read(&cur_trans->num_writers) < 1); 626 WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
578 atomic_dec(&cur_trans->num_writers); 627 atomic_dec(&cur_trans->num_writers);
@@ -604,7 +653,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
604{ 653{
605 int ret; 654 int ret;
606 655
607 ret = __btrfs_end_transaction(trans, root, 0, 1); 656 ret = __btrfs_end_transaction(trans, root, 0);
608 if (ret) 657 if (ret)
609 return ret; 658 return ret;
610 return 0; 659 return 0;
@@ -615,18 +664,7 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
615{ 664{
616 int ret; 665 int ret;
617 666
618 ret = __btrfs_end_transaction(trans, root, 1, 1); 667 ret = __btrfs_end_transaction(trans, root, 1);
619 if (ret)
620 return ret;
621 return 0;
622}
623
624int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
625 struct btrfs_root *root)
626{
627 int ret;
628
629 ret = __btrfs_end_transaction(trans, root, 0, 0);
630 if (ret) 668 if (ret)
631 return ret; 669 return ret;
632 return 0; 670 return 0;
@@ -635,7 +673,7 @@ int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
635int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, 673int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
636 struct btrfs_root *root) 674 struct btrfs_root *root)
637{ 675{
638 return __btrfs_end_transaction(trans, root, 1, 1); 676 return __btrfs_end_transaction(trans, root, 1);
639} 677}
640 678
641/* 679/*
@@ -649,13 +687,15 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
649 int err = 0; 687 int err = 0;
650 int werr = 0; 688 int werr = 0;
651 struct address_space *mapping = root->fs_info->btree_inode->i_mapping; 689 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
690 struct extent_state *cached_state = NULL;
652 u64 start = 0; 691 u64 start = 0;
653 u64 end; 692 u64 end;
654 693
655 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 694 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
656 mark)) { 695 mark, &cached_state)) {
657 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark, 696 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
658 GFP_NOFS); 697 mark, &cached_state, GFP_NOFS);
698 cached_state = NULL;
659 err = filemap_fdatawrite_range(mapping, start, end); 699 err = filemap_fdatawrite_range(mapping, start, end);
660 if (err) 700 if (err)
661 werr = err; 701 werr = err;
@@ -679,12 +719,14 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
679 int err = 0; 719 int err = 0;
680 int werr = 0; 720 int werr = 0;
681 struct address_space *mapping = root->fs_info->btree_inode->i_mapping; 721 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
722 struct extent_state *cached_state = NULL;
682 u64 start = 0; 723 u64 start = 0;
683 u64 end; 724 u64 end;
684 725
685 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 726 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
686 EXTENT_NEED_WAIT)) { 727 EXTENT_NEED_WAIT, &cached_state)) {
687 clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS); 728 clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
729 0, 0, &cached_state, GFP_NOFS);
688 err = filemap_fdatawait_range(mapping, start, end); 730 err = filemap_fdatawait_range(mapping, start, end);
689 if (err) 731 if (err)
690 werr = err; 732 werr = err;
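
Both marked-extent walkers now pass a cached_state through find_first_extent_bit(), letting each iteration resume from the extent_state where the previous lookup ended rather than re-searching the tree from the top (the cache is dropped once the range's bits are converted or cleared). The shape of that cursor-caching pattern, reduced to plain C:

    struct cursor {
        unsigned long next;   /* where the previous lookup stopped */
        int valid;
    };

    /* Stand-in for a range lookup: with a valid cursor it resumes there,
     * otherwise it pays the full search cost from 'start'. */
    static unsigned long find_from(unsigned long start, struct cursor *c)
    {
        unsigned long pos = (c && c->valid && c->next > start) ? c->next
                                                               : start;
        /* ... locate the next marked range at or after pos ... */
        if (c) {
            c->next = pos + 1;
            c->valid = 1;
        }
        return pos;
    }
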
@@ -955,6 +997,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
955 struct btrfs_root *parent_root; 997 struct btrfs_root *parent_root;
956 struct btrfs_block_rsv *rsv; 998 struct btrfs_block_rsv *rsv;
957 struct inode *parent_inode; 999 struct inode *parent_inode;
1000 struct btrfs_path *path;
1001 struct btrfs_dir_item *dir_item;
958 struct dentry *parent; 1002 struct dentry *parent;
959 struct dentry *dentry; 1003 struct dentry *dentry;
960 struct extent_buffer *tmp; 1004 struct extent_buffer *tmp;
@@ -967,18 +1011,22 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
967 u64 root_flags; 1011 u64 root_flags;
968 uuid_le new_uuid; 1012 uuid_le new_uuid;
969 1013
970 rsv = trans->block_rsv; 1014 path = btrfs_alloc_path();
1015 if (!path) {
1016 ret = pending->error = -ENOMEM;
1017 goto path_alloc_fail;
1018 }
971 1019
972 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 1020 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
973 if (!new_root_item) { 1021 if (!new_root_item) {
974 ret = pending->error = -ENOMEM; 1022 ret = pending->error = -ENOMEM;
975 goto fail; 1023 goto root_item_alloc_fail;
976 } 1024 }
977 1025
978 ret = btrfs_find_free_objectid(tree_root, &objectid); 1026 ret = btrfs_find_free_objectid(tree_root, &objectid);
979 if (ret) { 1027 if (ret) {
980 pending->error = ret; 1028 pending->error = ret;
981 goto fail; 1029 goto no_free_objectid;
982 } 1030 }
983 1031
984 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 1032 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
@@ -988,22 +1036,22 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
988 to_reserve); 1036 to_reserve);
989 if (ret) { 1037 if (ret) {
990 pending->error = ret; 1038 pending->error = ret;
991 goto fail; 1039 goto no_free_objectid;
992 } 1040 }
993 } 1041 }
994 1042
995 ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid, 1043 ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid,
996 objectid, pending->inherit); 1044 objectid, pending->inherit);
997 kfree(pending->inherit);
998 if (ret) { 1045 if (ret) {
999 pending->error = ret; 1046 pending->error = ret;
1000 goto fail; 1047 goto no_free_objectid;
1001 } 1048 }
1002 1049
1003 key.objectid = objectid; 1050 key.objectid = objectid;
1004 key.offset = (u64)-1; 1051 key.offset = (u64)-1;
1005 key.type = BTRFS_ROOT_ITEM_KEY; 1052 key.type = BTRFS_ROOT_ITEM_KEY;
1006 1053
1054 rsv = trans->block_rsv;
1007 trans->block_rsv = &pending->block_rsv; 1055 trans->block_rsv = &pending->block_rsv;
1008 1056
1009 dentry = pending->dentry; 1057 dentry = pending->dentry;
@@ -1017,24 +1065,21 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1017 */ 1065 */
1018 ret = btrfs_set_inode_index(parent_inode, &index); 1066 ret = btrfs_set_inode_index(parent_inode, &index);
1019 BUG_ON(ret); /* -ENOMEM */ 1067 BUG_ON(ret); /* -ENOMEM */
1020 ret = btrfs_insert_dir_item(trans, parent_root, 1068
1021 dentry->d_name.name, dentry->d_name.len, 1069 /* check if there is a file/dir which has the same name. */
1022 parent_inode, &key, 1070 dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
1023 BTRFS_FT_DIR, index); 1071 btrfs_ino(parent_inode),
1024 if (ret == -EEXIST) { 1072 dentry->d_name.name,
1073 dentry->d_name.len, 0);
1074 if (dir_item != NULL && !IS_ERR(dir_item)) {
1025 pending->error = -EEXIST; 1075 pending->error = -EEXIST;
1026 dput(parent);
1027 goto fail; 1076 goto fail;
1028 } else if (ret) { 1077 } else if (IS_ERR(dir_item)) {
1029 goto abort_trans_dput; 1078 ret = PTR_ERR(dir_item);
1079 btrfs_abort_transaction(trans, root, ret);
1080 goto fail;
1030 } 1081 }
1031 1082 btrfs_release_path(path);
1032 btrfs_i_size_write(parent_inode, parent_inode->i_size +
1033 dentry->d_name.len * 2);
1034 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
1035 ret = btrfs_update_inode(trans, parent_root, parent_inode);
1036 if (ret)
1037 goto abort_trans_dput;
1038 1083
1039 /* 1084 /*
1040 * pull in the delayed directory update 1085 * pull in the delayed directory update
@@ -1043,8 +1088,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1043 * snapshot 1088 * snapshot
1044 */ 1089 */
1045 ret = btrfs_run_delayed_items(trans, root); 1090 ret = btrfs_run_delayed_items(trans, root);
1046 if (ret) { /* Transaction aborted */ 1091 if (ret) { /* Transaction aborted */
1047 dput(parent); 1092 btrfs_abort_transaction(trans, root, ret);
1048 goto fail; 1093 goto fail;
1049 } 1094 }
1050 1095
@@ -1079,7 +1124,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1079 if (ret) { 1124 if (ret) {
1080 btrfs_tree_unlock(old); 1125 btrfs_tree_unlock(old);
1081 free_extent_buffer(old); 1126 free_extent_buffer(old);
1082 goto abort_trans_dput; 1127 btrfs_abort_transaction(trans, root, ret);
1128 goto fail;
1083 } 1129 }
1084 1130
1085 btrfs_set_lock_blocking(old); 1131 btrfs_set_lock_blocking(old);
@@ -1088,8 +1134,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1088 /* clean up in any case */ 1134 /* clean up in any case */
1089 btrfs_tree_unlock(old); 1135 btrfs_tree_unlock(old);
1090 free_extent_buffer(old); 1136 free_extent_buffer(old);
1091 if (ret) 1137 if (ret) {
1092 goto abort_trans_dput; 1138 btrfs_abort_transaction(trans, root, ret);
1139 goto fail;
1140 }
1093 1141
1094 /* see comments in should_cow_block() */ 1142 /* see comments in should_cow_block() */
1095 root->force_cow = 1; 1143 root->force_cow = 1;
@@ -1101,8 +1149,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1101 ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); 1149 ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
1102 btrfs_tree_unlock(tmp); 1150 btrfs_tree_unlock(tmp);
1103 free_extent_buffer(tmp); 1151 free_extent_buffer(tmp);
1104 if (ret) 1152 if (ret) {
1105 goto abort_trans_dput; 1153 btrfs_abort_transaction(trans, root, ret);
1154 goto fail;
1155 }
1106 1156
1107 /* 1157 /*
1108 * insert root back/forward references 1158 * insert root back/forward references
@@ -1111,32 +1161,58 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1111 parent_root->root_key.objectid, 1161 parent_root->root_key.objectid,
1112 btrfs_ino(parent_inode), index, 1162 btrfs_ino(parent_inode), index,
1113 dentry->d_name.name, dentry->d_name.len); 1163 dentry->d_name.name, dentry->d_name.len);
1114 dput(parent); 1164 if (ret) {
1115 if (ret) 1165 btrfs_abort_transaction(trans, root, ret);
1116 goto fail; 1166 goto fail;
1167 }
1117 1168
1118 key.offset = (u64)-1; 1169 key.offset = (u64)-1;
1119 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); 1170 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
1120 if (IS_ERR(pending->snap)) { 1171 if (IS_ERR(pending->snap)) {
1121 ret = PTR_ERR(pending->snap); 1172 ret = PTR_ERR(pending->snap);
1122 goto abort_trans; 1173 btrfs_abort_transaction(trans, root, ret);
1174 goto fail;
1123 } 1175 }
1124 1176
1125 ret = btrfs_reloc_post_snapshot(trans, pending); 1177 ret = btrfs_reloc_post_snapshot(trans, pending);
1178 if (ret) {
1179 btrfs_abort_transaction(trans, root, ret);
1180 goto fail;
1181 }
1182
1183 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1184 if (ret) {
1185 btrfs_abort_transaction(trans, root, ret);
1186 goto fail;
1187 }
1188
1189 ret = btrfs_insert_dir_item(trans, parent_root,
1190 dentry->d_name.name, dentry->d_name.len,
1191 parent_inode, &key,
1192 BTRFS_FT_DIR, index);
1193 /* We checked the name at the beginning, so -EEXIST is impossible here. */
1194 BUG_ON(ret == -EEXIST);
1195 if (ret) {
1196 btrfs_abort_transaction(trans, root, ret);
1197 goto fail;
1198 }
1199
1200 btrfs_i_size_write(parent_inode, parent_inode->i_size +
1201 dentry->d_name.len * 2);
1202 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
1203 ret = btrfs_update_inode(trans, parent_root, parent_inode);
1126 if (ret) 1204 if (ret)
1127 goto abort_trans; 1205 btrfs_abort_transaction(trans, root, ret);
1128 ret = 0;
1129fail: 1206fail:
1130 kfree(new_root_item); 1207 dput(parent);
1131 trans->block_rsv = rsv; 1208 trans->block_rsv = rsv;
1209no_free_objectid:
1210 kfree(new_root_item);
1211root_item_alloc_fail:
1212 btrfs_free_path(path);
1213path_alloc_fail:
1132 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); 1214 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
1133 return ret; 1215 return ret;
1134
1135abort_trans_dput:
1136 dput(parent);
1137abort_trans:
1138 btrfs_abort_transaction(trans, root, ret);
1139 goto fail;
1140} 1216}
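
The restructure above retires the abort_trans/abort_trans_dput tail in favor of calling btrfs_abort_transaction() at each failure site plus a ladder of cleanup labels (fail, no_free_objectid, root_item_alloc_fail, path_alloc_fail), so each label releases exactly what had been acquired by the time of the jump and dput(parent) runs exactly once. The label discipline in miniature:

    #include <stdlib.h>

    static int do_work(void) { return 0; }

    static int snapshot_sketch(void)
    {
        int ret = -1;
        void *path, *item;

        path = malloc(16);
        if (!path)
            goto path_alloc_fail;

        item = malloc(16);
        if (!item)
            goto item_alloc_fail;

        ret = do_work();

        free(item);        /* cleanup runs in reverse acquisition order; */
    item_alloc_fail:       /* the success path shares the same teardown  */
        free(path);
    path_alloc_fail:
        return ret;
    }
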
1141 1217
1142/* 1218/*
@@ -1229,6 +1305,16 @@ static void do_async_commit(struct work_struct *work)
1229 struct btrfs_async_commit *ac = 1305 struct btrfs_async_commit *ac =
1230 container_of(work, struct btrfs_async_commit, work.work); 1306 container_of(work, struct btrfs_async_commit, work.work);
1231 1307
1308 /*
1309 * We've got freeze protection passed with the transaction.
1310 * Tell lockdep about it.
1311 */
1312 rwsem_acquire_read(
1313 &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1314 0, 1, _THIS_IP_);
1315
1316 current->journal_info = ac->newtrans;
1317
1232 btrfs_commit_transaction(ac->newtrans, ac->root); 1318 btrfs_commit_transaction(ac->newtrans, ac->root);
1233 kfree(ac); 1319 kfree(ac);
1234} 1320}
@@ -1258,6 +1344,14 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1258 atomic_inc(&cur_trans->use_count); 1344 atomic_inc(&cur_trans->use_count);
1259 1345
1260 btrfs_end_transaction(trans, root); 1346 btrfs_end_transaction(trans, root);
1347
1348 /*
1349 * Tell lockdep we've released the freeze rwsem, since the
1350 * async commit thread will be the one to unlock it.
1351 */
1352 rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1353 1, _THIS_IP_);
1354
1261 schedule_delayed_work(&ac->work, 0); 1355 schedule_delayed_work(&ac->work, 0);
1262 1356
1263 /* wait for transaction to start and unblock */ 1357 /* wait for transaction to start and unblock */
@@ -1348,6 +1442,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1348 */ 1442 */
1349 cur_trans->delayed_refs.flushing = 1; 1443 cur_trans->delayed_refs.flushing = 1;
1350 1444
1445 if (!list_empty(&trans->new_bgs))
1446 btrfs_create_pending_block_groups(trans, root);
1447
1351 ret = btrfs_run_delayed_refs(trans, root, 0); 1448 ret = btrfs_run_delayed_refs(trans, root, 0);
1352 if (ret) 1449 if (ret)
1353 goto cleanup_transaction; 1450 goto cleanup_transaction;
@@ -1403,7 +1500,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1403 1500
1404 if (flush_on_commit || snap_pending) { 1501 if (flush_on_commit || snap_pending) {
1405 btrfs_start_delalloc_inodes(root, 1); 1502 btrfs_start_delalloc_inodes(root, 1);
1406 btrfs_wait_ordered_extents(root, 0, 1); 1503 btrfs_wait_ordered_extents(root, 1);
1407 } 1504 }
1408 1505
1409 ret = btrfs_run_delayed_items(trans, root); 1506 ret = btrfs_run_delayed_items(trans, root);
@@ -1456,13 +1553,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1456 */ 1553 */
1457 mutex_lock(&root->fs_info->reloc_mutex); 1554 mutex_lock(&root->fs_info->reloc_mutex);
1458 1555
1459 ret = btrfs_run_delayed_items(trans, root); 1556 /*
1557 * We needn't worry about the delayed items because we will
1558 * deal with them in create_pending_snapshot(), which is the
1559 * core function of the snapshot creation.
1560 */
1561 ret = create_pending_snapshots(trans, root->fs_info);
1460 if (ret) { 1562 if (ret) {
1461 mutex_unlock(&root->fs_info->reloc_mutex); 1563 mutex_unlock(&root->fs_info->reloc_mutex);
1462 goto cleanup_transaction; 1564 goto cleanup_transaction;
1463 } 1565 }
1464 1566
1465 ret = create_pending_snapshots(trans, root->fs_info); 1567 /*
1568 * We insert the dir indexes of the snapshots and update the inode
1569 * of the snapshots' parents after the snapshot creation, so there
1570 * are some delayed items which are not dealt with. Now deal with
1571 * them.
1572 *
1573 * We needn't worry that this operation will corrupt the snapshots,
1574 * because all the trees which are snapshotted will be forced to COW
1575 * the nodes and leaves.
1576 */
1577 ret = btrfs_run_delayed_items(trans, root);
1466 if (ret) { 1578 if (ret) {
1467 mutex_unlock(&root->fs_info->reloc_mutex); 1579 mutex_unlock(&root->fs_info->reloc_mutex);
1468 goto cleanup_transaction; 1580 goto cleanup_transaction;
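
Swapping the two calls is the point of this hunk: create_pending_snapshots() queues the snapshot's dir-index insert and parent-inode update as delayed items, and the btrfs_run_delayed_items() pass that now follows flushes them before the commit proceeds; in the old order those items were generated after the only flush. The dependency as a two-step sketch with stub helpers:

    static int create_pending_snapshots_stub(void) { return 0; }
    static int run_delayed_items_stub(void)        { return 0; }

    static int commit_order_sketch(void)
    {
        int ret;

        /* Snapshots first: they queue the dir-index insert and the
         * parent-inode update as delayed items ... */
        ret = create_pending_snapshots_stub();
        if (ret)
            return ret;
        /* ... which this pass then flushes before the commit moves on.
         * COW of the just-snapshotted trees keeps this safe. */
        return run_delayed_items_stub();
    }
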
@@ -1584,7 +1696,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1584 put_transaction(cur_trans); 1696 put_transaction(cur_trans);
1585 put_transaction(cur_trans); 1697 put_transaction(cur_trans);
1586 1698
1587 sb_end_intwrite(root->fs_info->sb); 1699 if (trans->type < TRANS_JOIN_NOLOCK)
1700 sb_end_intwrite(root->fs_info->sb);
1588 1701
1589 trace_btrfs_transaction_commit(root); 1702 trace_btrfs_transaction_commit(root);
1590 1703
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index e8b8416c688b..80961947a6b2 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -47,6 +47,14 @@ struct btrfs_transaction {
47 int aborted; 47 int aborted;
48}; 48};
49 49
50enum btrfs_trans_type {
51 TRANS_START,
52 TRANS_JOIN,
53 TRANS_USERSPACE,
54 TRANS_JOIN_NOLOCK,
55 TRANS_ATTACH,
56};
57
50struct btrfs_trans_handle { 58struct btrfs_trans_handle {
51 u64 transid; 59 u64 transid;
52 u64 bytes_reserved; 60 u64 bytes_reserved;
@@ -58,8 +66,9 @@ struct btrfs_trans_handle {
58 struct btrfs_transaction *transaction; 66 struct btrfs_transaction *transaction;
59 struct btrfs_block_rsv *block_rsv; 67 struct btrfs_block_rsv *block_rsv;
60 struct btrfs_block_rsv *orig_rsv; 68 struct btrfs_block_rsv *orig_rsv;
61 int aborted; 69 short aborted;
62 int adding_csums; 70 short adding_csums;
71 enum btrfs_trans_type type;
63 /* 72 /*
64 * this root is only needed to validate that the root passed to 73 * this root is only needed to validate that the root passed to
65 * start_transaction is the same as the one passed to end_transaction. 74 * start_transaction is the same as the one passed to end_transaction.
@@ -68,6 +77,7 @@ struct btrfs_trans_handle {
68 struct btrfs_root *root; 77 struct btrfs_root *root;
69 struct seq_list delayed_ref_elem; 78 struct seq_list delayed_ref_elem;
70 struct list_head qgroup_ref_list; 79 struct list_head qgroup_ref_list;
80 struct list_head new_bgs;
71}; 81};
72 82
73struct btrfs_pending_snapshot { 83struct btrfs_pending_snapshot {
@@ -88,16 +98,18 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
88{ 98{
89 BTRFS_I(inode)->last_trans = trans->transaction->transid; 99 BTRFS_I(inode)->last_trans = trans->transaction->transid;
90 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 100 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
101 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
91} 102}
92 103
93int btrfs_end_transaction(struct btrfs_trans_handle *trans, 104int btrfs_end_transaction(struct btrfs_trans_handle *trans,
94 struct btrfs_root *root); 105 struct btrfs_root *root);
95int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
96 struct btrfs_root *root);
97struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 106struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
98 int num_items); 107 int num_items);
108struct btrfs_trans_handle *btrfs_start_transaction_noflush(
109 struct btrfs_root *root, int num_items);
99struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); 110struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
100struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); 111struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
112struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
101struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); 113struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
102int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); 114int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
103int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 115int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c86670f4f285..81e407d9677a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -18,13 +18,16 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/list_sort.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "transaction.h" 23#include "transaction.h"
23#include "disk-io.h" 24#include "disk-io.h"
24#include "locking.h" 25#include "locking.h"
25#include "print-tree.h" 26#include "print-tree.h"
27#include "backref.h"
26#include "compat.h" 28#include "compat.h"
27#include "tree-log.h" 29#include "tree-log.h"
30#include "hash.h"
28 31
29/* magic values for the inode_only field in btrfs_log_inode: 32/* magic values for the inode_only field in btrfs_log_inode:
30 * 33 *
@@ -146,7 +149,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
146 root->log_multiple_pids = true; 149 root->log_multiple_pids = true;
147 } 150 }
148 151
149 root->log_batch++; 152 atomic_inc(&root->log_batch);
150 atomic_inc(&root->log_writers); 153 atomic_inc(&root->log_writers);
151 mutex_unlock(&root->log_mutex); 154 mutex_unlock(&root->log_mutex);
152 return 0; 155 return 0;
@@ -165,7 +168,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
165 err = ret; 168 err = ret;
166 } 169 }
167 mutex_unlock(&root->fs_info->tree_log_mutex); 170 mutex_unlock(&root->fs_info->tree_log_mutex);
168 root->log_batch++; 171 atomic_inc(&root->log_batch);
169 atomic_inc(&root->log_writers); 172 atomic_inc(&root->log_writers);
170 mutex_unlock(&root->log_mutex); 173 mutex_unlock(&root->log_mutex);
171 return err; 174 return err;
@@ -484,7 +487,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
484 int found_type; 487 int found_type;
485 u64 mask = root->sectorsize - 1; 488 u64 mask = root->sectorsize - 1;
486 u64 extent_end; 489 u64 extent_end;
487 u64 alloc_hint;
488 u64 start = key->offset; 490 u64 start = key->offset;
489 u64 saved_nbytes; 491 u64 saved_nbytes;
490 struct btrfs_file_extent_item *item; 492 struct btrfs_file_extent_item *item;
@@ -550,8 +552,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
550 552
551 saved_nbytes = inode_get_bytes(inode); 553 saved_nbytes = inode_get_bytes(inode);
552 /* drop any overlapping extents */ 554 /* drop any overlapping extents */
553 ret = btrfs_drop_extents(trans, inode, start, extent_end, 555 ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
554 &alloc_hint, 1);
555 BUG_ON(ret); 556 BUG_ON(ret);
556 557
557 if (found_type == BTRFS_FILE_EXTENT_REG || 558 if (found_type == BTRFS_FILE_EXTENT_REG ||
@@ -744,6 +745,7 @@ out:
744 */ 745 */
745static noinline int backref_in_log(struct btrfs_root *log, 746static noinline int backref_in_log(struct btrfs_root *log,
746 struct btrfs_key *key, 747 struct btrfs_key *key,
748 u64 ref_objectid,
747 char *name, int namelen) 749 char *name, int namelen)
748{ 750{
749 struct btrfs_path *path; 751 struct btrfs_path *path;
@@ -764,8 +766,17 @@ static noinline int backref_in_log(struct btrfs_root *log,
764 if (ret != 0) 766 if (ret != 0)
765 goto out; 767 goto out;
766 768
767 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
768 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 769 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
770
771 if (key->type == BTRFS_INODE_EXTREF_KEY) {
772 if (btrfs_find_name_in_ext_backref(path, ref_objectid,
773 name, namelen, NULL))
774 match = 1;
775
776 goto out;
777 }
778
779 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
769 ptr_end = ptr + item_size; 780 ptr_end = ptr + item_size;
770 while (ptr < ptr_end) { 781 while (ptr < ptr_end) {
771 ref = (struct btrfs_inode_ref *)ptr; 782 ref = (struct btrfs_inode_ref *)ptr;
@@ -786,91 +797,42 @@ out:
786 return match; 797 return match;
787} 798}
788 799
789 800static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
790/*
791 * replay one inode back reference item found in the log tree.
792 * eb, slot and key refer to the buffer and key found in the log tree.
793 * root is the destination we are replaying into, and path is for temp
794 * use by this function. (it should be released on return).
795 */
796static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
797 struct btrfs_root *root, 801 struct btrfs_root *root,
798 struct btrfs_root *log,
799 struct btrfs_path *path, 802 struct btrfs_path *path,
800 struct extent_buffer *eb, int slot, 803 struct btrfs_root *log_root,
801 struct btrfs_key *key) 804 struct inode *dir, struct inode *inode,
805 struct extent_buffer *eb,
806 u64 inode_objectid, u64 parent_objectid,
807 u64 ref_index, char *name, int namelen,
808 int *search_done)
802{ 809{
803 struct btrfs_inode_ref *ref;
804 struct btrfs_dir_item *di;
805 struct inode *dir;
806 struct inode *inode;
807 unsigned long ref_ptr;
808 unsigned long ref_end;
809 char *name;
810 int namelen;
811 int ret; 810 int ret;
812 int search_done = 0; 811 char *victim_name;
813 812 int victim_name_len;
814 /* 813 struct extent_buffer *leaf;
815 * it is possible that we didn't log all the parent directories 814 struct btrfs_dir_item *di;
816 * for a given inode. If we don't find the dir, just don't 815 struct btrfs_key search_key;
817 * copy the back ref in. The link count fixup code will take 816 struct btrfs_inode_extref *extref;
818 * care of the rest
819 */
820 dir = read_one_inode(root, key->offset);
821 if (!dir)
822 return -ENOENT;
823
824 inode = read_one_inode(root, key->objectid);
825 if (!inode) {
826 iput(dir);
827 return -EIO;
828 }
829
830 ref_ptr = btrfs_item_ptr_offset(eb, slot);
831 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
832 817
833again: 818again:
834 ref = (struct btrfs_inode_ref *)ref_ptr; 819 /* Search old style refs */
835 820 search_key.objectid = inode_objectid;
836 namelen = btrfs_inode_ref_name_len(eb, ref); 821 search_key.type = BTRFS_INODE_REF_KEY;
837 name = kmalloc(namelen, GFP_NOFS); 822 search_key.offset = parent_objectid;
838 BUG_ON(!name); 823 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
839
840 read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
841
842 /* if we already have a perfect match, we're done */
843 if (inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
844 btrfs_inode_ref_index(eb, ref),
845 name, namelen)) {
846 goto out;
847 }
848
849 /*
850 * look for a conflicting back reference in the metadata.
851 * if we find one we have to unlink that name of the file
852 * before we add our new link. Later on, we overwrite any
853 * existing back reference, and we don't want to create
854 * dangling pointers in the directory.
855 */
856
857 if (search_done)
858 goto insert;
859
860 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
861 if (ret == 0) { 824 if (ret == 0) {
862 char *victim_name;
863 int victim_name_len;
864 struct btrfs_inode_ref *victim_ref; 825 struct btrfs_inode_ref *victim_ref;
865 unsigned long ptr; 826 unsigned long ptr;
866 unsigned long ptr_end; 827 unsigned long ptr_end;
867 struct extent_buffer *leaf = path->nodes[0]; 828
829 leaf = path->nodes[0];
868 830
869 /* are we trying to overwrite a back ref for the root directory 831 /* are we trying to overwrite a back ref for the root directory
870 * if so, just jump out, we're done 832 * if so, just jump out, we're done
871 */ 833 */
872 if (key->objectid == key->offset) 834 if (search_key.objectid == search_key.offset)
873 goto out_nowrite; 835 return 1;
874 836
875 /* check all the names in this back reference to see 837 /* check all the names in this back reference to see
876 * if they are in the log. if so, we allow them to stay 838 * if they are in the log. if so, we allow them to stay
@@ -889,7 +851,9 @@ again:
889 (unsigned long)(victim_ref + 1), 851 (unsigned long)(victim_ref + 1),
890 victim_name_len); 852 victim_name_len);
891 853
892 if (!backref_in_log(log, key, victim_name, 854 if (!backref_in_log(log_root, &search_key,
855 parent_objectid,
856 victim_name,
893 victim_name_len)) { 857 victim_name_len)) {
894 btrfs_inc_nlink(inode); 858 btrfs_inc_nlink(inode);
895 btrfs_release_path(path); 859 btrfs_release_path(path);
@@ -897,9 +861,14 @@ again:
897 ret = btrfs_unlink_inode(trans, root, dir, 861 ret = btrfs_unlink_inode(trans, root, dir,
898 inode, victim_name, 862 inode, victim_name,
899 victim_name_len); 863 victim_name_len);
864 BUG_ON(ret);
900 btrfs_run_delayed_items(trans, root); 865 btrfs_run_delayed_items(trans, root);
866 kfree(victim_name);
867 *search_done = 1;
868 goto again;
901 } 869 }
902 kfree(victim_name); 870 kfree(victim_name);
871
903 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 872 ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
904 } 873 }
905 BUG_ON(ret); 874 BUG_ON(ret);
@@ -908,14 +877,78 @@ again:
908 * NOTE: we have searched the root tree and checked the 877
909 * corresponding ref; it does not need to be checked again. 878
910 */ 879 */
911 search_done = 1; 880 *search_done = 1;
881 }
882 btrfs_release_path(path);
883
884 /* Same search but for extended refs */
885 extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
886 inode_objectid, parent_objectid, 0,
887 0);
888 if (!IS_ERR_OR_NULL(extref)) {
889 u32 item_size;
890 u32 cur_offset = 0;
891 unsigned long base;
892 struct inode *victim_parent;
893
894 leaf = path->nodes[0];
895
896 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
897 base = btrfs_item_ptr_offset(leaf, path->slots[0]);
898
899 while (cur_offset < item_size) {
900 extref = (struct btrfs_inode_extref *)base + cur_offset;
901
902 victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
903
904 if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
905 goto next;
906
907 victim_name = kmalloc(victim_name_len, GFP_NOFS);
908 read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
909 victim_name_len);
910
911 search_key.objectid = inode_objectid;
912 search_key.type = BTRFS_INODE_EXTREF_KEY;
913 search_key.offset = btrfs_extref_hash(parent_objectid,
914 victim_name,
915 victim_name_len);
916 ret = 0;
917 if (!backref_in_log(log_root, &search_key,
918 parent_objectid, victim_name,
919 victim_name_len)) {
920 ret = -ENOENT;
921 victim_parent = read_one_inode(root,
922 parent_objectid);
923 if (victim_parent) {
924 btrfs_inc_nlink(inode);
925 btrfs_release_path(path);
926
927 ret = btrfs_unlink_inode(trans, root,
928 victim_parent,
929 inode,
930 victim_name,
931 victim_name_len);
932 btrfs_run_delayed_items(trans, root);
933 }
934 BUG_ON(ret);
935 iput(victim_parent);
936 kfree(victim_name);
937 *search_done = 1;
938 goto again;
939 }
940 kfree(victim_name);
941 BUG_ON(ret);
942next:
943 cur_offset += victim_name_len + sizeof(*extref);
944 }
945 *search_done = 1;
912 } 946 }
913 btrfs_release_path(path); 947 btrfs_release_path(path);
914 948
915 /* look for a conflicting sequence number */ 949 /* look for a conflicting sequence number */
916 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), 950 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
917 btrfs_inode_ref_index(eb, ref), 951 ref_index, name, namelen, 0);
918 name, namelen, 0);
919 if (di && !IS_ERR(di)) { 952 if (di && !IS_ERR(di)) {
920 ret = drop_one_dir_item(trans, root, path, dir, di); 953 ret = drop_one_dir_item(trans, root, path, dir, di);
921 BUG_ON(ret); 954 BUG_ON(ret);
@@ -931,25 +964,173 @@ again:
931 } 964 }
932 btrfs_release_path(path); 965 btrfs_release_path(path);
933 966
934insert: 967 return 0;
935 /* insert our name */ 968}
936 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
937 btrfs_inode_ref_index(eb, ref));
938 BUG_ON(ret);
939 969
940 btrfs_update_inode(trans, root, inode); 970static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
971 u32 *namelen, char **name, u64 *index,
972 u64 *parent_objectid)
973{
974 struct btrfs_inode_extref *extref;
941 975
942out: 976 extref = (struct btrfs_inode_extref *)ref_ptr;
943 ref_ptr = (unsigned long)(ref + 1) + namelen; 977
944 kfree(name); 978 *namelen = btrfs_inode_extref_name_len(eb, extref);
945 if (ref_ptr < ref_end) 979 *name = kmalloc(*namelen, GFP_NOFS);
946 goto again; 980 if (*name == NULL)
981 return -ENOMEM;
982
983 read_extent_buffer(eb, *name, (unsigned long)&extref->name,
984 *namelen);
985
986 *index = btrfs_inode_extref_index(eb, extref);
987 if (parent_objectid)
988 *parent_objectid = btrfs_inode_extref_parent(eb, extref);
989
990 return 0;
991}
992
993static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
994 u32 *namelen, char **name, u64 *index)
995{
996 struct btrfs_inode_ref *ref;
997
998 ref = (struct btrfs_inode_ref *)ref_ptr;
999
1000 *namelen = btrfs_inode_ref_name_len(eb, ref);
1001 *name = kmalloc(*namelen, GFP_NOFS);
1002 if (*name == NULL)
1003 return -ENOMEM;
1004
1005 read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1006
1007 *index = btrfs_inode_ref_index(eb, ref);
1008
1009 return 0;
1010}
1011
1012/*
1013 * replay one inode back reference item found in the log tree.
1014 * eb, slot and key refer to the buffer and key found in the log tree.
1015 * root is the destination we are replaying into, and path is for temp
1016 * use by this function. (it should be released on return).
1017 */
1018static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1019 struct btrfs_root *root,
1020 struct btrfs_root *log,
1021 struct btrfs_path *path,
1022 struct extent_buffer *eb, int slot,
1023 struct btrfs_key *key)
1024{
1025 struct inode *dir;
1026 struct inode *inode;
1027 unsigned long ref_ptr;
1028 unsigned long ref_end;
1029 char *name;
1030 int namelen;
1031 int ret;
1032 int search_done = 0;
1033 int log_ref_ver = 0;
1034 u64 parent_objectid;
1035 u64 inode_objectid;
1036 u64 ref_index = 0;
1037 int ref_struct_size;
1038
1039 ref_ptr = btrfs_item_ptr_offset(eb, slot);
1040 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1041
1042 if (key->type == BTRFS_INODE_EXTREF_KEY) {
1043 struct btrfs_inode_extref *r;
1044
1045 ref_struct_size = sizeof(struct btrfs_inode_extref);
1046 log_ref_ver = 1;
1047 r = (struct btrfs_inode_extref *)ref_ptr;
1048 parent_objectid = btrfs_inode_extref_parent(eb, r);
1049 } else {
1050 ref_struct_size = sizeof(struct btrfs_inode_ref);
1051 parent_objectid = key->offset;
1052 }
1053 inode_objectid = key->objectid;
1054
1055 /*
1056 * it is possible that we didn't log all the parent directories
1057 * for a given inode. If we don't find the dir, just don't
1058 * copy the back ref in. The link count fixup code will take
1059 * care of the rest
1060 */
1061 dir = read_one_inode(root, parent_objectid);
1062 if (!dir)
1063 return -ENOENT;
1064
1065 inode = read_one_inode(root, inode_objectid);
1066 if (!inode) {
1067 iput(dir);
1068 return -EIO;
1069 }
1070
1071 while (ref_ptr < ref_end) {
1072 if (log_ref_ver) {
1073 ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1074 &ref_index, &parent_objectid);
1075 /*
1076 * parent object can change from one array
1077 * item to another.
1078 */
1079 if (!dir)
1080 dir = read_one_inode(root, parent_objectid);
1081 if (!dir)
1082 return -ENOENT;
1083 } else {
1084 ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1085 &ref_index);
1086 }
1087 if (ret)
1088 return ret;
1089
1090 /* if we already have a perfect match, we're done */
1091 if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
1092 ref_index, name, namelen)) {
1093 /*
1094 * look for a conflicting back reference in the
1095 * metadata. if we find one we have to unlink that name
1096 * of the file before we add our new link. Later on, we
1097 * overwrite any existing back reference, and we don't
1098 * want to create dangling pointers in the directory.
1099 */
1100
1101 if (!search_done) {
1102 ret = __add_inode_ref(trans, root, path, log,
1103 dir, inode, eb,
1104 inode_objectid,
1105 parent_objectid,
1106 ref_index, name, namelen,
1107 &search_done);
1108 if (ret == 1)
1109 goto out;
1110 BUG_ON(ret);
1111 }
1112
1113 /* insert our name */
1114 ret = btrfs_add_link(trans, dir, inode, name, namelen,
1115 0, ref_index);
1116 BUG_ON(ret);
1117
1118 btrfs_update_inode(trans, root, inode);
1119 }
1120
1121 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1122 kfree(name);
1123 if (log_ref_ver) {
1124 iput(dir);
1125 dir = NULL;
1126 }
1127 }
947 1128
948 /* finally write the back reference in the inode */ 1129 /* finally write the back reference in the inode */
949 ret = overwrite_item(trans, root, path, eb, slot, key); 1130 ret = overwrite_item(trans, root, path, eb, slot, key);
950 BUG_ON(ret); 1131 BUG_ON(ret);
951 1132
952out_nowrite: 1133out:
953 btrfs_release_path(path); 1134 btrfs_release_path(path);
954 iput(dir); 1135 iput(dir);
955 iput(inode); 1136 iput(inode);
@@ -966,25 +1147,55 @@ static int insert_orphan_item(struct btrfs_trans_handle *trans,
966 return ret; 1147 return ret;
967} 1148}
968 1149
1150static int count_inode_extrefs(struct btrfs_root *root,
1151 struct inode *inode, struct btrfs_path *path)
1152{
1153 int ret = 0;
1154 int name_len;
1155 unsigned int nlink = 0;
1156 u32 item_size;
1157 u32 cur_offset = 0;
1158 u64 inode_objectid = btrfs_ino(inode);
1159 u64 offset = 0;
1160 unsigned long ptr;
1161 struct btrfs_inode_extref *extref;
1162 struct extent_buffer *leaf;
969 1163
970/* 1164 while (1) {
971 * There are a few corners where the link count of the file can't 1165 ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
972 * be properly maintained during replay. So, instead of adding 1166 &extref, &offset);
973 * lots of complexity to the log code, we just scan the backrefs 1167 if (ret)
974 * for any file that has been through replay. 1168 break;
975 * 1169
976 * The scan will update the link count on the inode to reflect the 1170 leaf = path->nodes[0];
977 * number of back refs found. If it goes down to zero, the iput 1171 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
978 * will free the inode. 1172 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
979 */ 1173
980static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, 1174 while (cur_offset < item_size) {
981 struct btrfs_root *root, 1175 extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
982 struct inode *inode) 1176 name_len = btrfs_inode_extref_name_len(leaf, extref);
1177
1178 nlink++;
1179
1180 cur_offset += name_len + sizeof(*extref);
1181 }
1182
1183 offset++;
1184 btrfs_release_path(path);
1185 }
1186 btrfs_release_path(path);
1187
1188 if (ret < 0)
1189 return ret;
1190 return nlink;
1191}
1192
1193static int count_inode_refs(struct btrfs_root *root,
1194 struct inode *inode, struct btrfs_path *path)
983{ 1195{
984 struct btrfs_path *path;
985 int ret; 1196 int ret;
986 struct btrfs_key key; 1197 struct btrfs_key key;
987 u64 nlink = 0; 1198 unsigned int nlink = 0;
988 unsigned long ptr; 1199 unsigned long ptr;
989 unsigned long ptr_end; 1200 unsigned long ptr_end;
990 int name_len; 1201 int name_len;
@@ -994,10 +1205,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
994 key.type = BTRFS_INODE_REF_KEY; 1205 key.type = BTRFS_INODE_REF_KEY;
995 key.offset = (u64)-1; 1206 key.offset = (u64)-1;
996 1207
997 path = btrfs_alloc_path();
998 if (!path)
999 return -ENOMEM;
1000
1001 while (1) { 1208 while (1) {
1002 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1209 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1003 if (ret < 0) 1210 if (ret < 0)
@@ -1031,6 +1238,50 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1031 btrfs_release_path(path); 1238 btrfs_release_path(path);
1032 } 1239 }
1033 btrfs_release_path(path); 1240 btrfs_release_path(path);
1241
1242 return nlink;
1243}
1244
1245/*
1246 * There are a few corners where the link count of the file can't
1247 * be properly maintained during replay. So, instead of adding
1248 * lots of complexity to the log code, we just scan the backrefs
1249 * for any file that has been through replay.
1250 *
1251 * The scan will update the link count on the inode to reflect the
1252 * number of back refs found. If it goes down to zero, the iput
1253 * will free the inode.
1254 */
1255static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1256 struct btrfs_root *root,
1257 struct inode *inode)
1258{
1259 struct btrfs_path *path;
1260 int ret;
1261 u64 nlink = 0;
1262 u64 ino = btrfs_ino(inode);
1263
1264 path = btrfs_alloc_path();
1265 if (!path)
1266 return -ENOMEM;
1267
1268 ret = count_inode_refs(root, inode, path);
1269 if (ret < 0)
1270 goto out;
1271
1272 nlink = ret;
1273
1274 ret = count_inode_extrefs(root, inode, path);
1275 if (ret == -ENOENT)
1276 ret = 0;
1277
1278 if (ret < 0)
1279 goto out;
1280
1281 nlink += ret;
1282
1283 ret = 0;
1284
1034 if (nlink != inode->i_nlink) { 1285 if (nlink != inode->i_nlink) {
1035 set_nlink(inode, nlink); 1286 set_nlink(inode, nlink);
1036 btrfs_update_inode(trans, root, inode); 1287 btrfs_update_inode(trans, root, inode);
@@ -1046,9 +1297,10 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1046 ret = insert_orphan_item(trans, root, ino); 1297 ret = insert_orphan_item(trans, root, ino);
1047 BUG_ON(ret); 1298 BUG_ON(ret);
1048 } 1299 }
1049 btrfs_free_path(path);
1050 1300
1051 return 0; 1301out:
1302 btrfs_free_path(path);
1303 return ret;
1052} 1304}
1053 1305
1054static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, 1306static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
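
The link-count fixup path above is now split in two: count_inode_refs() scans the classic INODE_REF items, while count_inode_extrefs() walks the extended refs, where several names are packed back to back inside a single item and the cursor advances by the fixed header size plus the variable name length. A minimal userspace sketch of that packed walk, assuming a simplified stand-in layout rather than the real on-disk struct btrfs_inode_extref:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* simplified stand-in for btrfs_inode_extref, not the on-disk layout */
struct fake_extref {
        uint64_t parent;
        uint64_t index;
        uint16_t name_len;
        char     name[];
} __attribute__((packed));

static unsigned count_packed_refs(const unsigned char *item, uint32_t item_size)
{
        uint32_t cur = 0;
        unsigned nlink = 0;

        while (cur < item_size) {
                const struct fake_extref *ref =
                        (const struct fake_extref *)(item + cur);

                nlink++;                        /* one link per stored name */
                cur += sizeof(*ref) + ref->name_len;
        }
        return nlink;
}

int main(void)
{
        unsigned char item[64];
        struct fake_extref a = { 256, 2, 3 }, b = { 256, 3, 4 };
        size_t off = 0;

        memcpy(item + off, &a, sizeof(a)); off += sizeof(a);
        memcpy(item + off, "foo", 3);      off += 3;
        memcpy(item + off, &b, sizeof(b)); off += sizeof(b);
        memcpy(item + off, "foo2", 4);     off += 4;

        printf("links: %u\n", count_packed_refs(item, off)); /* links: 2 */
        return 0;
}
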
@@ -1695,6 +1947,10 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1695 ret = add_inode_ref(wc->trans, root, log, path, 1947 ret = add_inode_ref(wc->trans, root, log, path,
1696 eb, i, &key); 1948 eb, i, &key);
1697 BUG_ON(ret && ret != -ENOENT); 1949 BUG_ON(ret && ret != -ENOENT);
1950 } else if (key.type == BTRFS_INODE_EXTREF_KEY) {
1951 ret = add_inode_ref(wc->trans, root, log, path,
1952 eb, i, &key);
1953 BUG_ON(ret && ret != -ENOENT);
1698 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 1954 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
1699 ret = replay_one_extent(wc->trans, root, path, 1955 ret = replay_one_extent(wc->trans, root, path,
1700 eb, i, &key); 1956 eb, i, &key);
@@ -2037,7 +2293,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2037 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2293 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2038 wait_log_commit(trans, root, root->log_transid - 1); 2294 wait_log_commit(trans, root, root->log_transid - 1);
2039 while (1) { 2295 while (1) {
2040 unsigned long batch = root->log_batch; 2296 int batch = atomic_read(&root->log_batch);
2041 /* when we're on an ssd, just kick the log commit out */ 2297 /* when we're on an ssd, just kick the log commit out */
2042 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) { 2298 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
2043 mutex_unlock(&root->log_mutex); 2299 mutex_unlock(&root->log_mutex);
@@ -2045,7 +2301,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2045 mutex_lock(&root->log_mutex); 2301 mutex_lock(&root->log_mutex);
2046 } 2302 }
2047 wait_for_writer(trans, root); 2303 wait_for_writer(trans, root);
2048 if (batch == root->log_batch) 2304 if (batch == atomic_read(&root->log_batch))
2049 break; 2305 break;
2050 } 2306 }
2051 2307
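
The commit converts root->log_batch from a plain unsigned long to an atomic_t, so the writer-wait loop above becomes: snapshot the counter, wait for writers, and only stop once the counter is unchanged across the wait. A compressed userspace sketch of that retry shape using C11 atomics (single-threaded here, so the loop settles on the first pass; a concurrent writer would bump the counter and force another round):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int log_batch;

static void wait_for_writer(void)
{
        /* stand-in for the real wait; a concurrent writer would
         * increment log_batch while we slept */
}

static void wait_until_quiescent(void)
{
        for (;;) {
                int batch = atomic_load(&log_batch);

                wait_for_writer();
                if (batch == atomic_load(&log_batch))
                        break;          /* nobody joined while we waited */
        }
}

int main(void)
{
        atomic_store(&log_batch, 3);
        wait_until_quiescent();
        puts("log quiescent");
        return 0;
}
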
@@ -2074,7 +2330,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2074 2330
2075 btrfs_set_root_node(&log->root_item, log->node); 2331 btrfs_set_root_node(&log->root_item, log->node);
2076 2332
2077 root->log_batch = 0;
2078 root->log_transid++; 2333 root->log_transid++;
2079 log->log_transid = root->log_transid; 2334 log->log_transid = root->log_transid;
2080 root->log_start_pid = 0; 2335 root->log_start_pid = 0;
@@ -2087,7 +2342,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2087 mutex_unlock(&root->log_mutex); 2342 mutex_unlock(&root->log_mutex);
2088 2343
2089 mutex_lock(&log_root_tree->log_mutex); 2344 mutex_lock(&log_root_tree->log_mutex);
2090 log_root_tree->log_batch++; 2345 atomic_inc(&log_root_tree->log_batch);
2091 atomic_inc(&log_root_tree->log_writers); 2346 atomic_inc(&log_root_tree->log_writers);
2092 mutex_unlock(&log_root_tree->log_mutex); 2347 mutex_unlock(&log_root_tree->log_mutex);
2093 2348
@@ -2157,7 +2412,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2157 btrfs_set_super_log_root_level(root->fs_info->super_for_commit, 2412 btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
2158 btrfs_header_level(log_root_tree->node)); 2413 btrfs_header_level(log_root_tree->node));
2159 2414
2160 log_root_tree->log_batch = 0;
2161 log_root_tree->log_transid++; 2415 log_root_tree->log_transid++;
2162 smp_mb(); 2416 smp_mb();
2163 2417
@@ -2171,9 +2425,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2171 * in and cause problems either. 2425 * in and cause problems either.
2172 */ 2426 */
2173 btrfs_scrub_pause_super(root); 2427 btrfs_scrub_pause_super(root);
2174 write_ctree_super(trans, root->fs_info->tree_root, 1); 2428 ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
2175 btrfs_scrub_continue_super(root); 2429 btrfs_scrub_continue_super(root);
2176 ret = 0; 2430 if (ret) {
2431 btrfs_abort_transaction(trans, root, ret);
2432 goto out_wake_log_root;
2433 }
2177 2434
2178 mutex_lock(&root->log_mutex); 2435 mutex_lock(&root->log_mutex);
2179 if (root->last_log_commit < log_transid) 2436 if (root->last_log_commit < log_transid)
@@ -2209,7 +2466,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
2209 2466
2210 while (1) { 2467 while (1) {
2211 ret = find_first_extent_bit(&log->dirty_log_pages, 2468 ret = find_first_extent_bit(&log->dirty_log_pages,
2212 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); 2469 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
2470 NULL);
2213 if (ret) 2471 if (ret)
2214 break; 2472 break;
2215 2473
@@ -2646,6 +2904,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2646 int ret; 2904 int ret;
2647 struct btrfs_key key; 2905 struct btrfs_key key;
2648 struct btrfs_key found_key; 2906 struct btrfs_key found_key;
2907 int start_slot;
2649 2908
2650 key.objectid = objectid; 2909 key.objectid = objectid;
2651 key.type = max_key_type; 2910 key.type = max_key_type;
@@ -2667,8 +2926,18 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2667 if (found_key.objectid != objectid) 2926 if (found_key.objectid != objectid)
2668 break; 2927 break;
2669 2928
2670 ret = btrfs_del_item(trans, log, path); 2929 found_key.offset = 0;
2671 if (ret) 2930 found_key.type = 0;
2931 ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
2932 &start_slot);
2933
2934 ret = btrfs_del_items(trans, log, path, start_slot,
2935 path->slots[0] - start_slot + 1);
2936 /*
2937 * If start slot isn't 0 then we don't need to re-search, we've
2938 * found the last guy with the objectid in this tree.
2939 */
2940 if (ret || start_slot != 0)
2672 break; 2941 break;
2673 btrfs_release_path(path); 2942 btrfs_release_path(path);
2674 } 2943 }
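
drop_objectid_items() now deletes a whole run of keys per leaf instead of one item per iteration: it re-seeds the key to (objectid, 0, 0), binary-searches for the first slot holding that objectid, and removes everything from start_slot through the current slot in one btrfs_del_items() call. The same idea on a plain sorted array, sketched in userspace C (lower_bound and the single memmove stand in for btrfs_bin_search and btrfs_del_items):

#include <stdio.h>
#include <string.h>

/* first index whose key is >= objectid (classic lower bound) */
static int lower_bound(const int *keys, int n, int objectid)
{
        int lo = 0, hi = n;

        while (lo < hi) {
                int mid = lo + (hi - lo) / 2;

                if (keys[mid] < objectid)
                        lo = mid + 1;
                else
                        hi = mid;
        }
        return lo;
}

/* delete every entry with the given objectid in one splice */
static int drop_objectid_run(int *keys, int n, int objectid, int last_slot)
{
        int start_slot = lower_bound(keys, n, objectid);
        int nr = last_slot - start_slot + 1;

        memmove(&keys[start_slot], &keys[last_slot + 1],
                (n - last_slot - 1) * sizeof(*keys));
        return n - nr;
}

int main(void)
{
        int keys[] = { 1, 5, 5, 5, 9 };
        int n = 5;

        /* caller found the last slot for objectid 5 at index 3 */
        n = drop_objectid_run(keys, n, 5, 3);
        for (int i = 0; i < n; i++)
                printf("%d ", keys[i]);         /* prints: 1 9 */
        putchar('\n');
        return 0;
}
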
@@ -2678,14 +2947,64 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2678 return ret; 2947 return ret;
2679} 2948}
2680 2949
2950static void fill_inode_item(struct btrfs_trans_handle *trans,
2951 struct extent_buffer *leaf,
2952 struct btrfs_inode_item *item,
2953 struct inode *inode, int log_inode_only)
2954{
2955 btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
2956 btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
2957 btrfs_set_inode_mode(leaf, item, inode->i_mode);
2958 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
2959
2960 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
2961 inode->i_atime.tv_sec);
2962 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
2963 inode->i_atime.tv_nsec);
2964
2965 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
2966 inode->i_mtime.tv_sec);
2967 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
2968 inode->i_mtime.tv_nsec);
2969
2970 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
2971 inode->i_ctime.tv_sec);
2972 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
2973 inode->i_ctime.tv_nsec);
2974
2975 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2976
2977 btrfs_set_inode_sequence(leaf, item, inode->i_version);
2978 btrfs_set_inode_transid(leaf, item, trans->transid);
2979 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2980 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2981 btrfs_set_inode_block_group(leaf, item, 0);
2982
2983 if (log_inode_only) {
2984 /* set the generation to zero so the recover code
2985 * can tell the difference between a logging
2986 * just to say 'this inode exists' and a logging
2987 * to say 'update this inode with these values'
2988 */
2989 btrfs_set_inode_generation(leaf, item, 0);
2990 btrfs_set_inode_size(leaf, item, 0);
2991 } else {
2992 btrfs_set_inode_generation(leaf, item,
2993 BTRFS_I(inode)->generation);
2994 btrfs_set_inode_size(leaf, item, inode->i_size);
2995 }
2996
2997}
2998
2681static noinline int copy_items(struct btrfs_trans_handle *trans, 2999static noinline int copy_items(struct btrfs_trans_handle *trans,
2682 struct btrfs_root *log, 3000 struct inode *inode,
2683 struct btrfs_path *dst_path, 3001 struct btrfs_path *dst_path,
2684 struct extent_buffer *src, 3002 struct extent_buffer *src,
2685 int start_slot, int nr, int inode_only) 3003 int start_slot, int nr, int inode_only)
2686{ 3004{
2687 unsigned long src_offset; 3005 unsigned long src_offset;
2688 unsigned long dst_offset; 3006 unsigned long dst_offset;
3007 struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
2689 struct btrfs_file_extent_item *extent; 3008 struct btrfs_file_extent_item *extent;
2690 struct btrfs_inode_item *inode_item; 3009 struct btrfs_inode_item *inode_item;
2691 int ret; 3010 int ret;
@@ -2694,6 +3013,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2694 char *ins_data; 3013 char *ins_data;
2695 int i; 3014 int i;
2696 struct list_head ordered_sums; 3015 struct list_head ordered_sums;
3016 int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
2697 3017
2698 INIT_LIST_HEAD(&ordered_sums); 3018 INIT_LIST_HEAD(&ordered_sums);
2699 3019
@@ -2722,29 +3042,23 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2722 3042
2723 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3043 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
2724 3044
2725 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3045 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
2726 src_offset, ins_sizes[i]);
2727
2728 if (inode_only == LOG_INODE_EXISTS &&
2729 ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
2730 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3046 inode_item = btrfs_item_ptr(dst_path->nodes[0],
2731 dst_path->slots[0], 3047 dst_path->slots[0],
2732 struct btrfs_inode_item); 3048 struct btrfs_inode_item);
2733 btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); 3049 fill_inode_item(trans, dst_path->nodes[0], inode_item,
2734 3050 inode, inode_only == LOG_INODE_EXISTS);
2735 /* set the generation to zero so the recover code 3051 } else {
2736 * can tell the difference between a logging 3052 copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
2737 * just to say 'this inode exists' and a logging 3053 src_offset, ins_sizes[i]);
2738 * to say 'update this inode with these values'
2739 */
2740 btrfs_set_inode_generation(dst_path->nodes[0],
2741 inode_item, 0);
2742 } 3054 }
3055
2743 /* take a reference on file data extents so that truncates 3056 /* take a reference on file data extents so that truncates
2744 * or deletes of this inode don't have to relog the inode 3057 * or deletes of this inode don't have to relog the inode
2745 * again 3058 * again
2746 */ 3059 */
2747 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { 3060 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY &&
3061 !skip_csum) {
2748 int found_type; 3062 int found_type;
2749 extent = btrfs_item_ptr(src, start_slot + i, 3063 extent = btrfs_item_ptr(src, start_slot + i,
2750 struct btrfs_file_extent_item); 3064 struct btrfs_file_extent_item);
@@ -2753,8 +3067,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2753 continue; 3067 continue;
2754 3068
2755 found_type = btrfs_file_extent_type(src, extent); 3069 found_type = btrfs_file_extent_type(src, extent);
2756 if (found_type == BTRFS_FILE_EXTENT_REG || 3070 if (found_type == BTRFS_FILE_EXTENT_REG) {
2757 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
2758 u64 ds, dl, cs, cl; 3071 u64 ds, dl, cs, cl;
2759 ds = btrfs_file_extent_disk_bytenr(src, 3072 ds = btrfs_file_extent_disk_bytenr(src,
2760 extent); 3073 extent);
@@ -2803,6 +3116,239 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2803 return ret; 3116 return ret;
2804} 3117}
2805 3118
3119static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3120{
3121 struct extent_map *em1, *em2;
3122
3123 em1 = list_entry(a, struct extent_map, list);
3124 em2 = list_entry(b, struct extent_map, list);
3125
3126 if (em1->start < em2->start)
3127 return -1;
3128 else if (em1->start > em2->start)
3129 return 1;
3130 return 0;
3131}
3132
3133struct log_args {
3134 struct extent_buffer *src;
3135 u64 next_offset;
3136 int start_slot;
3137 int nr;
3138};
3139
3140static int log_one_extent(struct btrfs_trans_handle *trans,
3141 struct inode *inode, struct btrfs_root *root,
3142 struct extent_map *em, struct btrfs_path *path,
3143 struct btrfs_path *dst_path, struct log_args *args)
3144{
3145 struct btrfs_root *log = root->log_root;
3146 struct btrfs_file_extent_item *fi;
3147 struct btrfs_key key;
3148 u64 start = em->mod_start;
3149 u64 search_start = start;
3150 u64 len = em->mod_len;
3151 u64 num_bytes;
3152 int nritems;
3153 int ret;
3154
3155 if (BTRFS_I(inode)->logged_trans == trans->transid) {
3156 ret = __btrfs_drop_extents(trans, log, inode, dst_path, start,
3157 start + len, NULL, 0);
3158 if (ret)
3159 return ret;
3160 }
3161
3162 while (len) {
3163 if (args->nr)
3164 goto next_slot;
3165again:
3166 key.objectid = btrfs_ino(inode);
3167 key.type = BTRFS_EXTENT_DATA_KEY;
3168 key.offset = search_start;
3169
3170 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3171 if (ret < 0)
3172 return ret;
3173
3174 if (ret) {
3175 /*
3176 * A rare case where we can have an em for a section of a
3177 * larger extent so we need to make sure that this em
3178 * falls within the extent we've found. If not we just
3179 * bail and go back to ye-olde way of doing things but
3180 * it happens often enough in testing that we need to do
3181 * this dance to make sure.
3182 */
3183 do {
3184 if (path->slots[0] == 0) {
3185 btrfs_release_path(path);
3186 if (search_start == 0)
3187 return -ENOENT;
3188 search_start--;
3189 goto again;
3190 }
3191
3192 path->slots[0]--;
3193 btrfs_item_key_to_cpu(path->nodes[0], &key,
3194 path->slots[0]);
3195 if (key.objectid != btrfs_ino(inode) ||
3196 key.type != BTRFS_EXTENT_DATA_KEY) {
3197 btrfs_release_path(path);
3198 return -ENOENT;
3199 }
3200 } while (key.offset > start);
3201
3202 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
3203 struct btrfs_file_extent_item);
3204 num_bytes = btrfs_file_extent_num_bytes(path->nodes[0],
3205 fi);
3206 if (key.offset + num_bytes <= start) {
3207 btrfs_release_path(path);
3208 return -ENOENT;
3209 }
3210 }
3211 args->src = path->nodes[0];
3212next_slot:
3213 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3214 fi = btrfs_item_ptr(args->src, path->slots[0],
3215 struct btrfs_file_extent_item);
3216 if (args->nr &&
3217 args->start_slot + args->nr == path->slots[0]) {
3218 args->nr++;
3219 } else if (args->nr) {
3220 ret = copy_items(trans, inode, dst_path, args->src,
3221 args->start_slot, args->nr,
3222 LOG_INODE_ALL);
3223 if (ret)
3224 return ret;
3225 args->nr = 1;
3226 args->start_slot = path->slots[0];
3227 } else if (!args->nr) {
3228 args->nr = 1;
3229 args->start_slot = path->slots[0];
3230 }
3231 nritems = btrfs_header_nritems(path->nodes[0]);
3232 path->slots[0]++;
3233 num_bytes = btrfs_file_extent_num_bytes(args->src, fi);
3234 if (len < num_bytes) {
3235 /* I _think_ this is ok, envision we write to a
3236 * preallocated space that is adjacent to a previously
3237 * written preallocated space that gets merged when we
3238 * mark this preallocated space written. If we do not
3239 * have the adjacent extent in cache then when we copy
3240 * this extent it could end up being larger than our EM
3241 * thinks it is, which is a-ok, so just set len to 0.
3242 */
3243 len = 0;
3244 } else {
3245 len -= num_bytes;
3246 }
3247 start = key.offset + num_bytes;
3248 args->next_offset = start;
3249 search_start = start;
3250
3251 if (path->slots[0] < nritems) {
3252 if (len)
3253 goto next_slot;
3254 break;
3255 }
3256
3257 if (args->nr) {
3258 ret = copy_items(trans, inode, dst_path, args->src,
3259 args->start_slot, args->nr,
3260 LOG_INODE_ALL);
3261 if (ret)
3262 return ret;
3263 args->nr = 0;
3264 btrfs_release_path(path);
3265 }
3266 }
3267
3268 return 0;
3269}
3270
3271static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3272 struct btrfs_root *root,
3273 struct inode *inode,
3274 struct btrfs_path *path,
3275 struct btrfs_path *dst_path)
3276{
3277 struct log_args args;
3278 struct extent_map *em, *n;
3279 struct list_head extents;
3280 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3281 u64 test_gen;
3282 int ret = 0;
3283
3284 INIT_LIST_HEAD(&extents);
3285
3286 memset(&args, 0, sizeof(args));
3287
3288 write_lock(&tree->lock);
3289 test_gen = root->fs_info->last_trans_committed;
3290
3291 list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
3292 list_del_init(&em->list);
3293 if (em->generation <= test_gen)
3294 continue;
3295 /* Need a ref to keep it from getting evicted from cache */
3296 atomic_inc(&em->refs);
3297 set_bit(EXTENT_FLAG_LOGGING, &em->flags);
3298 list_add_tail(&em->list, &extents);
3299 }
3300
3301 list_sort(NULL, &extents, extent_cmp);
3302
3303 while (!list_empty(&extents)) {
3304 em = list_entry(extents.next, struct extent_map, list);
3305
3306 list_del_init(&em->list);
3307 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
3308
3309 /*
3310 * If we had an error we just need to delete everybody from our
3311 * private list.
3312 */
3313 if (ret) {
3314 free_extent_map(em);
3315 continue;
3316 }
3317
3318 write_unlock(&tree->lock);
3319
3320 /*
3321 * If the previous EM and the last extent we left off on aren't
3322 * sequential then we need to copy the items we have and redo
3323 * our search
3324 */
3325 if (args.nr && em->mod_start != args.next_offset) {
3326 ret = copy_items(trans, inode, dst_path, args.src,
3327 args.start_slot, args.nr,
3328 LOG_INODE_ALL);
3329 if (ret) {
3330 free_extent_map(em);
3331 write_lock(&tree->lock);
3332 continue;
3333 }
3334 btrfs_release_path(path);
3335 args.nr = 0;
3336 }
3337
3338 ret = log_one_extent(trans, inode, root, em, path, dst_path, &args);
3339 free_extent_map(em);
3340 write_lock(&tree->lock);
3341 }
3342 WARN_ON(!list_empty(&extents));
3343 write_unlock(&tree->lock);
3344
3345 if (!ret && args.nr)
3346 ret = copy_items(trans, inode, dst_path, args.src,
3347 args.start_slot, args.nr, LOG_INODE_ALL);
3348 btrfs_release_path(path);
3349 return ret;
3350}
3351
2806/* log a single inode in the tree log. 3352/* log a single inode in the tree log.
2807 * At least one parent directory for this inode must exist in the tree 3353 * At least one parent directory for this inode must exist in the tree
2808 * or be logged already. 3354 * or be logged already.
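
btrfs_log_changed_extents() above collects the modified extent maps onto a private list and orders them by file offset with list_sort() and the three-way extent_cmp(), so log_one_extent() can walk the file forward. The comparator translated to a qsort() callback over plain structs, just to show the ordering contract (return negative, zero, or positive on em1->start versus em2->start); note qsort() is not stable where list_sort() is, which is moot here since distinct extents have distinct starts:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct em { uint64_t start; uint64_t len; };

static int extent_cmp(const void *a, const void *b)
{
        const struct em *em1 = a, *em2 = b;

        if (em1->start < em2->start)
                return -1;
        if (em1->start > em2->start)
                return 1;
        return 0;
}

int main(void)
{
        struct em ems[] = { { 8192, 4096 }, { 0, 4096 }, { 4096, 4096 } };
        size_t n = sizeof(ems) / sizeof(ems[0]);

        qsort(ems, n, sizeof(ems[0]), extent_cmp);
        for (size_t i = 0; i < n; i++)
                printf("extent @%llu len %llu\n",
                       (unsigned long long)ems[i].start,
                       (unsigned long long)ems[i].len);
        return 0;
}
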
@@ -2832,6 +3378,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2832 int nritems; 3378 int nritems;
2833 int ins_start_slot = 0; 3379 int ins_start_slot = 0;
2834 int ins_nr; 3380 int ins_nr;
3381 bool fast_search = false;
2835 u64 ino = btrfs_ino(inode); 3382 u64 ino = btrfs_ino(inode);
2836 3383
2837 log = root->log_root; 3384 log = root->log_root;
@@ -2851,21 +3398,23 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2851 3398
2852 max_key.objectid = ino; 3399 max_key.objectid = ino;
2853 3400
2854 /* today the code can only do partial logging of directories */
2855 if (!S_ISDIR(inode->i_mode))
2856 inode_only = LOG_INODE_ALL;
2857 3401
3402 /* today the code can only do partial logging of directories */
2858 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 3403 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2859 max_key.type = BTRFS_XATTR_ITEM_KEY; 3404 max_key.type = BTRFS_XATTR_ITEM_KEY;
2860 else 3405 else
2861 max_key.type = (u8)-1; 3406 max_key.type = (u8)-1;
2862 max_key.offset = (u64)-1; 3407 max_key.offset = (u64)-1;
2863 3408
2864 ret = btrfs_commit_inode_delayed_items(trans, inode); 3409 /* Only run delayed items if we are a dir or a new file */
2865 if (ret) { 3410 if (S_ISDIR(inode->i_mode) ||
2866 btrfs_free_path(path); 3411 BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
2867 btrfs_free_path(dst_path); 3412 ret = btrfs_commit_inode_delayed_items(trans, inode);
2868 return ret; 3413 if (ret) {
3414 btrfs_free_path(path);
3415 btrfs_free_path(dst_path);
3416 return ret;
3417 }
2869 } 3418 }
2870 3419
2871 mutex_lock(&BTRFS_I(inode)->log_mutex); 3420 mutex_lock(&BTRFS_I(inode)->log_mutex);
@@ -2881,7 +3430,16 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2881 max_key_type = BTRFS_XATTR_ITEM_KEY; 3430 max_key_type = BTRFS_XATTR_ITEM_KEY;
2882 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 3431 ret = drop_objectid_items(trans, log, path, ino, max_key_type);
2883 } else { 3432 } else {
2884 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 3433 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3434 &BTRFS_I(inode)->runtime_flags)) {
3435 ret = btrfs_truncate_inode_items(trans, log,
3436 inode, 0, 0);
3437 } else {
3438 fast_search = true;
3439 max_key.type = BTRFS_XATTR_ITEM_KEY;
3440 ret = drop_objectid_items(trans, log, path, ino,
3441 BTRFS_XATTR_ITEM_KEY);
3442 }
2885 } 3443 }
2886 if (ret) { 3444 if (ret) {
2887 err = ret; 3445 err = ret;
@@ -2912,7 +3470,7 @@ again:
2912 goto next_slot; 3470 goto next_slot;
2913 } 3471 }
2914 3472
2915 ret = copy_items(trans, log, dst_path, src, ins_start_slot, 3473 ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
2916 ins_nr, inode_only); 3474 ins_nr, inode_only);
2917 if (ret) { 3475 if (ret) {
2918 err = ret; 3476 err = ret;
@@ -2930,7 +3488,7 @@ next_slot:
2930 goto again; 3488 goto again;
2931 } 3489 }
2932 if (ins_nr) { 3490 if (ins_nr) {
2933 ret = copy_items(trans, log, dst_path, src, 3491 ret = copy_items(trans, inode, dst_path, src,
2934 ins_start_slot, 3492 ins_start_slot,
2935 ins_nr, inode_only); 3493 ins_nr, inode_only);
2936 if (ret) { 3494 if (ret) {
@@ -2951,8 +3509,7 @@ next_slot:
2951 break; 3509 break;
2952 } 3510 }
2953 if (ins_nr) { 3511 if (ins_nr) {
2954 ret = copy_items(trans, log, dst_path, src, 3512 ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
2955 ins_start_slot,
2956 ins_nr, inode_only); 3513 ins_nr, inode_only);
2957 if (ret) { 3514 if (ret) {
2958 err = ret; 3515 err = ret;
@@ -2960,7 +3517,24 @@ next_slot:
2960 } 3517 }
2961 ins_nr = 0; 3518 ins_nr = 0;
2962 } 3519 }
2963 WARN_ON(ins_nr); 3520
3521 if (fast_search) {
3522 btrfs_release_path(path);
3523 btrfs_release_path(dst_path);
3524 ret = btrfs_log_changed_extents(trans, root, inode, path,
3525 dst_path);
3526 if (ret) {
3527 err = ret;
3528 goto out_unlock;
3529 }
3530 } else {
3531 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3532 struct extent_map *em, *n;
3533
3534 list_for_each_entry_safe(em, n, &tree->modified_extents, list)
3535 list_del_init(&em->list);
3536 }
3537
2964 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 3538 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2965 btrfs_release_path(path); 3539 btrfs_release_path(path);
2966 btrfs_release_path(dst_path); 3540 btrfs_release_path(dst_path);
@@ -2971,6 +3545,7 @@ next_slot:
2971 } 3545 }
2972 } 3546 }
2973 BTRFS_I(inode)->logged_trans = trans->transid; 3547 BTRFS_I(inode)->logged_trans = trans->transid;
3548 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
2974out_unlock: 3549out_unlock:
2975 mutex_unlock(&BTRFS_I(inode)->log_mutex); 3550 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2976 3551
@@ -3138,7 +3713,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
3138end_trans: 3713end_trans:
3139 dput(old_parent); 3714 dput(old_parent);
3140 if (ret < 0) { 3715 if (ret < 0) {
3141 BUG_ON(ret != -ENOSPC); 3716 WARN_ON(ret != -ENOSPC);
3142 root->fs_info->last_trans_log_full_commit = trans->transid; 3717 root->fs_info->last_trans_log_full_commit = trans->transid;
3143 ret = 1; 3718 ret = 1;
3144 } 3719 }
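
The tail of the tree-log.c changes wires up the fast-path choice in btrfs_log_inode(): when BTRFS_INODE_NEEDS_FULL_SYNC is set the log is truncated and everything is recopied, otherwise only the xattr key range is dropped and just the changed extents are logged. The gate is a test-and-clear on a runtime flag bit; roughly this shape, with the GCC/Clang __atomic builtins standing in for the kernel's test_and_clear_bit():

#include <stdio.h>

#define NEEDS_FULL_SYNC_BIT 0

static unsigned long runtime_flags = 1UL << NEEDS_FULL_SYNC_BIT;

/* userspace stand-in for test_and_clear_bit(): returns the old bit
 * value and clears it atomically */
static int test_and_clear_bit_ul(int nr, unsigned long *addr)
{
        unsigned long mask = 1UL << nr;

        return (__atomic_fetch_and(addr, ~mask, __ATOMIC_SEQ_CST) & mask) != 0;
}

int main(void)
{
        if (test_and_clear_bit_ul(NEEDS_FULL_SYNC_BIT, &runtime_flags))
                puts("full sync: truncate the log and copy everything");

        /* a second fsync of the same inode takes the fast path */
        if (!test_and_clear_bit_ul(NEEDS_FULL_SYNC_BIT, &runtime_flags))
                puts("fast path: drop xattr range, log changed extents");
        return 0;
}
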
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index ab942f46b3dd..99be4c138db6 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -143,14 +143,13 @@ EXPORT_SYMBOL(ulist_free);
143 * In case of allocation failure -ENOMEM is returned and the ulist stays 143 * In case of allocation failure -ENOMEM is returned and the ulist stays
144 * unaltered. 144 * unaltered.
145 */ 145 */
146int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, 146int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask)
147 gfp_t gfp_mask)
148{ 147{
149 return ulist_add_merge(ulist, val, aux, NULL, gfp_mask); 148 return ulist_add_merge(ulist, val, aux, NULL, gfp_mask);
150} 149}
151 150
152int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux, 151int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
153 unsigned long *old_aux, gfp_t gfp_mask) 152 u64 *old_aux, gfp_t gfp_mask)
154{ 153{
155 int i; 154 int i;
156 155
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 21bdc8ec8130..21a1963439c3 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -33,7 +33,7 @@ struct ulist_iterator {
33 */ 33 */
34struct ulist_node { 34struct ulist_node {
35 u64 val; /* value to store */ 35 u64 val; /* value to store */
36 unsigned long aux; /* auxiliary value saved along with the val */ 36 u64 aux; /* auxiliary value saved along with the val */
37}; 37};
38 38
39struct ulist { 39struct ulist {
@@ -65,10 +65,9 @@ void ulist_fini(struct ulist *ulist);
65void ulist_reinit(struct ulist *ulist); 65void ulist_reinit(struct ulist *ulist);
66struct ulist *ulist_alloc(gfp_t gfp_mask); 66struct ulist *ulist_alloc(gfp_t gfp_mask);
67void ulist_free(struct ulist *ulist); 67void ulist_free(struct ulist *ulist);
68int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, 68int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
69 gfp_t gfp_mask); 69int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
70int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux, 70 u64 *old_aux, gfp_t gfp_mask);
71 unsigned long *old_aux, gfp_t gfp_mask);
72struct ulist_node *ulist_next(struct ulist *ulist, 71struct ulist_node *ulist_next(struct ulist *ulist,
73 struct ulist_iterator *uiter); 72 struct ulist_iterator *uiter);
74 73
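
The ulist change widens aux from unsigned long to u64 so callers can stash a full 64-bit value regardless of word size; on 32-bit kernels unsigned long is only 32 bits, so a u64 payload would have been silently truncated. A two-line illustration of the difference, with uint64_t and unsigned long standing in for u64 and the old type:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t aux = 0x123456789abcdef0ULL;       /* always fits in u64 */
        unsigned long narrow = (unsigned long)aux;  /* truncates on ILP32 */

        printf("sizeof(unsigned long)=%zu sizeof(uint64_t)=%zu\n",
               sizeof(unsigned long), sizeof(uint64_t));
        printf("aux=%llx narrow=%lx\n", (unsigned long long)aux, narrow);
        return 0;
}
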
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 88b969aeeb71..029b903a4ae3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -639,7 +639,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
639 639
640 bdev = blkdev_get_by_path(device->name->str, flags, holder); 640 bdev = blkdev_get_by_path(device->name->str, flags, holder);
641 if (IS_ERR(bdev)) { 641 if (IS_ERR(bdev)) {
642 printk(KERN_INFO "open %s failed\n", device->name->str); 642 printk(KERN_INFO "btrfs: open %s failed\n", device->name->str);
643 goto error; 643 goto error;
644 } 644 }
645 filemap_write_and_wait(bdev->bd_inode->i_mapping); 645 filemap_write_and_wait(bdev->bd_inode->i_mapping);
@@ -1475,6 +1475,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1475 free_fs_devices(cur_devices); 1475 free_fs_devices(cur_devices);
1476 } 1476 }
1477 1477
1478 root->fs_info->num_tolerated_disk_barrier_failures =
1479 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
1480
1478 /* 1481 /*
1479 * at this point, the device is zero sized. We want to 1482 * at this point, the device is zero sized. We want to
1480 * remove it from the devices list and zero out the old super 1483 * remove it from the devices list and zero out the old super
@@ -1775,15 +1778,21 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1775 1778
1776 if (seeding_dev) { 1779 if (seeding_dev) {
1777 ret = init_first_rw_device(trans, root, device); 1780 ret = init_first_rw_device(trans, root, device);
1778 if (ret) 1781 if (ret) {
1782 btrfs_abort_transaction(trans, root, ret);
1779 goto error_trans; 1783 goto error_trans;
1784 }
1780 ret = btrfs_finish_sprout(trans, root); 1785 ret = btrfs_finish_sprout(trans, root);
1781 if (ret) 1786 if (ret) {
1787 btrfs_abort_transaction(trans, root, ret);
1782 goto error_trans; 1788 goto error_trans;
1789 }
1783 } else { 1790 } else {
1784 ret = btrfs_add_device(trans, root, device); 1791 ret = btrfs_add_device(trans, root, device);
1785 if (ret) 1792 if (ret) {
1793 btrfs_abort_transaction(trans, root, ret);
1786 goto error_trans; 1794 goto error_trans;
1795 }
1787 } 1796 }
1788 1797
1789 /* 1798 /*
@@ -1793,6 +1802,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1793 btrfs_clear_space_info_full(root->fs_info); 1802 btrfs_clear_space_info_full(root->fs_info);
1794 1803
1795 unlock_chunks(root); 1804 unlock_chunks(root);
1805 root->fs_info->num_tolerated_disk_barrier_failures =
1806 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
1796 ret = btrfs_commit_transaction(trans, root); 1807 ret = btrfs_commit_transaction(trans, root);
1797 1808
1798 if (seeding_dev) { 1809 if (seeding_dev) {
@@ -1814,7 +1825,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1814 1825
1815error_trans: 1826error_trans:
1816 unlock_chunks(root); 1827 unlock_chunks(root);
1817 btrfs_abort_transaction(trans, root, ret);
1818 btrfs_end_transaction(trans, root); 1828 btrfs_end_transaction(trans, root);
1819 rcu_string_free(device->name); 1829 rcu_string_free(device->name);
1820 kfree(device); 1830 kfree(device);
@@ -2804,6 +2814,26 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2804 } 2814 }
2805 } 2815 }
2806 2816
2817 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
2818 int num_tolerated_disk_barrier_failures;
2819 u64 target = bctl->sys.target;
2820
2821 num_tolerated_disk_barrier_failures =
2822 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
2823 if (num_tolerated_disk_barrier_failures > 0 &&
2824 (target &
2825 (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
2826 BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
2827 num_tolerated_disk_barrier_failures = 0;
2828 else if (num_tolerated_disk_barrier_failures > 1 &&
2829 (target &
2830 (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
2831 num_tolerated_disk_barrier_failures = 1;
2832
2833 fs_info->num_tolerated_disk_barrier_failures =
2834 num_tolerated_disk_barrier_failures;
2835 }
2836
2807 ret = insert_balance_item(fs_info->tree_root, bctl); 2837 ret = insert_balance_item(fs_info->tree_root, bctl);
2808 if (ret && ret != -EEXIST) 2838 if (ret && ret != -EEXIST)
2809 goto out; 2839 goto out;
@@ -2836,6 +2866,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2836 __cancel_balance(fs_info); 2866 __cancel_balance(fs_info);
2837 } 2867 }
2838 2868
2869 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
2870 fs_info->num_tolerated_disk_barrier_failures =
2871 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
2872 }
2873
2839 wake_up(&fs_info->balance_wait_q); 2874 wake_up(&fs_info->balance_wait_q);
2840 2875
2841 return ret; 2876 return ret;
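
The balance hunk above recomputes how many device write failures the filesystem may tolerate when a convert target changes the system-chunk profile: DUP, RAID0, or single can survive none, RAID1/RAID10 at most one. A sketch of that clamp with stand-in profile bits (the BG_* values here are illustrative, not the real BTRFS_BLOCK_GROUP_* flags):

#include <stdio.h>
#include <stdint.h>

/* illustrative bit values, not the real BTRFS_BLOCK_GROUP_* constants */
#define BG_RAID0   (1ULL << 0)
#define BG_RAID1   (1ULL << 1)
#define BG_DUP     (1ULL << 2)
#define BG_RAID10  (1ULL << 3)
#define BG_SINGLE  (1ULL << 4)

static int clamp_tolerated_failures(int current, uint64_t target)
{
        if (current > 0 && (target & (BG_DUP | BG_RAID0 | BG_SINGLE)))
                return 0;       /* these profiles survive no lost device */
        if (current > 1 && (target & (BG_RAID1 | BG_RAID10)))
                return 1;       /* mirrored profiles survive one */
        return current;
}

int main(void)
{
        printf("convert to RAID0: tolerate %d\n",
               clamp_tolerated_failures(2, BG_RAID0));   /* 0 */
        printf("convert to RAID10: tolerate %d\n",
               clamp_tolerated_failures(2, BG_RAID10));  /* 1 */
        return 0;
}
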
@@ -3608,12 +3643,16 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
3608 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 3643 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
3609 &sys_chunk_size, &sys_stripe_size, 3644 &sys_chunk_size, &sys_stripe_size,
3610 sys_chunk_offset, alloc_profile); 3645 sys_chunk_offset, alloc_profile);
3611 if (ret) 3646 if (ret) {
3612 goto abort; 3647 btrfs_abort_transaction(trans, root, ret);
3648 goto out;
3649 }
3613 3650
3614 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 3651 ret = btrfs_add_device(trans, fs_info->chunk_root, device);
3615 if (ret) 3652 if (ret) {
3616 goto abort; 3653 btrfs_abort_transaction(trans, root, ret);
3654 goto out;
3655 }
3617 3656
3618 /* 3657 /*
3619 * Modifying chunk tree needs allocating new blocks from both 3658 * Modifying chunk tree needs allocating new blocks from both
@@ -3623,19 +3662,19 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
3623 */ 3662 */
3624 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 3663 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
3625 chunk_size, stripe_size); 3664 chunk_size, stripe_size);
3626 if (ret) 3665 if (ret) {
3627 goto abort; 3666 btrfs_abort_transaction(trans, root, ret);
3667 goto out;
3668 }
3628 3669
3629 ret = __finish_chunk_alloc(trans, extent_root, sys_map, 3670 ret = __finish_chunk_alloc(trans, extent_root, sys_map,
3630 sys_chunk_offset, sys_chunk_size, 3671 sys_chunk_offset, sys_chunk_size,
3631 sys_stripe_size); 3672 sys_stripe_size);
3632 if (ret) 3673 if (ret)
3633 goto abort; 3674 btrfs_abort_transaction(trans, root, ret);
3634 3675
3635 return 0; 3676out:
3636 3677
3637abort:
3638 btrfs_abort_transaction(trans, root, ret);
3639 return ret; 3678 return ret;
3640} 3679}
3641 3680
@@ -3760,7 +3799,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3760 read_unlock(&em_tree->lock); 3799 read_unlock(&em_tree->lock);
3761 3800
3762 if (!em) { 3801 if (!em) {
3763 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 3802 printk(KERN_CRIT "btrfs: unable to find logical %llu len %llu\n",
3764 (unsigned long long)logical, 3803 (unsigned long long)logical,
3765 (unsigned long long)*length); 3804 (unsigned long long)*length);
3766 BUG(); 3805 BUG();
@@ -4217,7 +4256,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4217 4256
4218 total_devs = bbio->num_stripes; 4257 total_devs = bbio->num_stripes;
4219 if (map_length < length) { 4258 if (map_length < length) {
4220 printk(KERN_CRIT "mapping failed logical %llu bio len %llu " 4259 printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
4221 "len %llu\n", (unsigned long long)logical, 4260 "len %llu\n", (unsigned long long)logical,
4222 (unsigned long long)length, 4261 (unsigned long long)length,
4223 (unsigned long long)map_length); 4262 (unsigned long long)map_length);
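
Several volumes.c hunks replace a shared "abort" label with btrfs_abort_transaction() at each failing call site, so the resulting log line points at the actual culprit. A sketch of that error-handling shape in plain C, with a hypothetical abort_ctx() macro standing in for the kernel helper:

#include <stdio.h>

/* hypothetical stand-in for btrfs_abort_transaction(): record where
 * the failure happened, not just that it happened */
#define abort_ctx(ret) \
        fprintf(stderr, "abort at %s:%d (ret=%d)\n", __func__, __LINE__, ret)

static int step(int fail) { return fail ? -5 : 0; }

static int do_init(void)
{
        int ret;

        ret = step(0);
        if (ret) { abort_ctx(ret); goto out; }
        ret = step(1);                          /* this one fails */
        if (ret) { abort_ctx(ret); goto out; }
out:
        return ret;
}

int main(void) { return do_init() ? 1 : 0; }
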
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 92c20654cc55..9acb846c3e7f 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -97,7 +97,7 @@ static int zlib_compress_pages(struct list_head *ws,
97 *total_in = 0; 97 *total_in = 0;
98 98
99 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { 99 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
100 printk(KERN_WARNING "deflateInit failed\n"); 100 printk(KERN_WARNING "btrfs: deflateInit failed\n");
101 ret = -1; 101 ret = -1;
102 goto out; 102 goto out;
103 } 103 }
@@ -125,7 +125,7 @@ static int zlib_compress_pages(struct list_head *ws,
125 while (workspace->def_strm.total_in < len) { 125 while (workspace->def_strm.total_in < len) {
126 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); 126 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
127 if (ret != Z_OK) { 127 if (ret != Z_OK) {
128 printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", 128 printk(KERN_DEBUG "btrfs: deflate in loop returned %d\n",
129 ret); 129 ret);
130 zlib_deflateEnd(&workspace->def_strm); 130 zlib_deflateEnd(&workspace->def_strm);
131 ret = -1; 131 ret = -1;
@@ -252,7 +252,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
252 } 252 }
253 253
254 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 254 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
255 printk(KERN_WARNING "inflateInit failed\n"); 255 printk(KERN_WARNING "btrfs: inflateInit failed\n");
256 return -1; 256 return -1;
257 } 257 }
258 while (workspace->inf_strm.total_in < srclen) { 258 while (workspace->inf_strm.total_in < srclen) {
@@ -336,7 +336,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
336 } 336 }
337 337
338 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 338 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
339 printk(KERN_WARNING "inflateInit failed\n"); 339 printk(KERN_WARNING "btrfs: inflateInit failed\n");
340 return -1; 340 return -1;
341 } 341 }
342 342
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 8e1b60e557b6..02ce90972d81 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -99,7 +99,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
99 * FIXME: we should try harder by querying the mds for the ino. 99 * FIXME: we should try harder by querying the mds for the ino.
100 */ 100 */
101static struct dentry *__fh_to_dentry(struct super_block *sb, 101static struct dentry *__fh_to_dentry(struct super_block *sb,
102 struct ceph_nfs_fh *fh) 102 struct ceph_nfs_fh *fh, int fh_len)
103{ 103{
104 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; 104 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
105 struct inode *inode; 105 struct inode *inode;
@@ -107,6 +107,9 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
107 struct ceph_vino vino; 107 struct ceph_vino vino;
108 int err; 108 int err;
109 109
110 if (fh_len < sizeof(*fh) / 4)
111 return ERR_PTR(-ESTALE);
112
110 dout("__fh_to_dentry %llx\n", fh->ino); 113 dout("__fh_to_dentry %llx\n", fh->ino);
111 vino.ino = fh->ino; 114 vino.ino = fh->ino;
112 vino.snap = CEPH_NOSNAP; 115 vino.snap = CEPH_NOSNAP;
@@ -150,7 +153,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
150 * convert connectable fh to dentry 153 * convert connectable fh to dentry
151 */ 154 */
152static struct dentry *__cfh_to_dentry(struct super_block *sb, 155static struct dentry *__cfh_to_dentry(struct super_block *sb,
153 struct ceph_nfs_confh *cfh) 156 struct ceph_nfs_confh *cfh, int fh_len)
154{ 157{
155 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; 158 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
156 struct inode *inode; 159 struct inode *inode;
@@ -158,6 +161,9 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
158 struct ceph_vino vino; 161 struct ceph_vino vino;
159 int err; 162 int err;
160 163
164 if (fh_len < sizeof(*cfh) / 4)
165 return ERR_PTR(-ESTALE);
166
161 dout("__cfh_to_dentry %llx (%llx/%x)\n", 167 dout("__cfh_to_dentry %llx (%llx/%x)\n",
162 cfh->ino, cfh->parent_ino, cfh->parent_name_hash); 168 cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
163 169
@@ -207,9 +213,11 @@ static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
207 int fh_len, int fh_type) 213 int fh_len, int fh_type)
208{ 214{
209 if (fh_type == 1) 215 if (fh_type == 1)
210 return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw); 216 return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw,
217 fh_len);
211 else 218 else
212 return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw); 219 return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw,
220 fh_len);
213} 221}
214 222
215/* 223/*
@@ -230,6 +238,8 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
230 238
231 if (fh_type == 1) 239 if (fh_type == 1)
232 return ERR_PTR(-ESTALE); 240 return ERR_PTR(-ESTALE);
241 if (fh_len < sizeof(*cfh) / 4)
242 return ERR_PTR(-ESTALE);
233 243
234 pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino, 244 pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
235 cfh->parent_name_hash); 245 cfh->parent_name_hash);
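
The ceph export fix adds a length guard before each handle decode: the NFS layer hands fh_len in 4-byte words, so a handle is only usable if fh_len >= sizeof(expected struct) / 4, and short handles now get -ESTALE instead of being read past their end. The unit check in isolation, with a stand-in ceph_nfs_fh of the same shape:

#include <stdio.h>
#include <stdint.h>

struct ceph_nfs_fh { uint64_t ino; };           /* stand-in layout */

/* fh_len arrives in 4-byte words, not bytes */
static int fh_big_enough(int fh_len, size_t need_bytes)
{
        return fh_len >= (int)(need_bytes / 4);
}

int main(void)
{
        /* an 8-byte handle needs fh_len >= 2 words */
        printf("fh_len=1 ok? %d\n",
               fh_big_enough(1, sizeof(struct ceph_nfs_fh)));  /* 0 */
        printf("fh_len=2 ok? %d\n",
               fh_big_enough(2, sizeof(struct ceph_nfs_fh)));  /* 1 */
        return 0;
}
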
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index e622863b292f..086f381d6489 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -31,18 +31,18 @@
31 31
32/* create a new cifs key */ 32/* create a new cifs key */
33static int 33static int
34cifs_spnego_key_instantiate(struct key *key, const void *data, size_t datalen) 34cifs_spnego_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
35{ 35{
36 char *payload; 36 char *payload;
37 int ret; 37 int ret;
38 38
39 ret = -ENOMEM; 39 ret = -ENOMEM;
40 payload = kmalloc(datalen, GFP_KERNEL); 40 payload = kmalloc(prep->datalen, GFP_KERNEL);
41 if (!payload) 41 if (!payload)
42 goto error; 42 goto error;
43 43
44 /* attach the data */ 44 /* attach the data */
45 memcpy(payload, data, datalen); 45 memcpy(payload, prep->data, prep->datalen);
46 key->payload.data = payload; 46 key->payload.data = payload;
47 ret = 0; 47 ret = 0;
48 48
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 53cf2aabce87..71d5d0a5f6b2 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -203,6 +203,27 @@ cifs_strtoUTF16(__le16 *to, const char *from, int len,
203 int i; 203 int i;
204 wchar_t wchar_to; /* needed to quiet sparse */ 204 wchar_t wchar_to; /* needed to quiet sparse */
205 205
206 /* special case for utf8 to handle no plane0 chars */
207 if (!strcmp(codepage->charset, "utf8")) {
208 /*
 209 * convert utf8 -> utf16; we assume we have enough space,
 210 * as the caller should have ensured the conversion cannot
 211 * overflow the destination. len is in wchar_t units (16 bits).
212 */
213 i = utf8s_to_utf16s(from, len, UTF16_LITTLE_ENDIAN,
214 (wchar_t *) to, len);
215
 216 /* on success, terminate and exit */
217 if (i >= 0)
218 goto success;
219 /*
 220 * if it fails, fall back to UCS encoding, as this
 221 * function should not return negative values;
 222 * currently it can fail only if the source contains
 223 * invalidly encoded characters
224 */
225 }
226
206 for (i = 0; len && *from; i++, from += charlen, len -= charlen) { 227 for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
207 charlen = codepage->char2uni(from, len, &wchar_to); 228 charlen = codepage->char2uni(from, len, &wchar_to);
208 if (charlen < 1) { 229 if (charlen < 1) {
@@ -215,6 +236,7 @@ cifs_strtoUTF16(__le16 *to, const char *from, int len,
215 put_unaligned_le16(wchar_to, &to[i]); 236 put_unaligned_le16(wchar_to, &to[i]);
216 } 237 }
217 238
239success:
218 put_unaligned_le16(0, &to[i]); 240 put_unaligned_le16(0, &to[i]);
219 return i; 241 return i;
220} 242}
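
So cifs_strtoUTF16() gains a fast path: when the local codepage is utf8 it converts the whole buffer in one utf8s_to_utf16s() call and drops back to the per-character codepage loop only if that returns an error. The control flow reduced to a toy, with stub converters standing in for utf8s_to_utf16s() and char2uni():

#include <stdio.h>

/* stub bulk converter: fails (like utf8s_to_utf16s() can) on a sentinel */
static int bulk_convert(const char *src, unsigned short *dst, int len)
{
        for (int i = 0; i < len; i++) {
                if ((unsigned char)src[i] == 0xff)
                        return -1;      /* invalid sequence */
                dst[i] = (unsigned char)src[i];
        }
        return len;
}

/* stub per-character fallback that never fails */
static int slow_convert(const char *src, unsigned short *dst, int len)
{
        for (int i = 0; i < len; i++)
                dst[i] = (src[i] == (char)0xff) ? '?' : (unsigned char)src[i];
        return len;
}

static int to_utf16(const char *src, unsigned short *dst, int len,
                    int codepage_is_utf8)
{
        int n;

        if (codepage_is_utf8) {
                n = bulk_convert(src, dst, len);
                if (n >= 0)
                        goto success;   /* fast path worked */
                /* else fall through to the per-character loop */
        }
        n = slow_convert(src, dst, len);
success:
        dst[n] = 0;                     /* always NUL-terminate */
        return n;
}

int main(void)
{
        unsigned short out[16];
        char bad[] = { 'h', 'i', (char)0xff, 0 };

        printf("converted %d units\n", to_utf16("hi", out, 2, 1));
        printf("converted %d units (fell back)\n", to_utf16(bad, out, 3, 1));
        return 0;
}
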
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 2ee5c54797fa..fc783e264420 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -167,17 +167,17 @@ static struct shrinker cifs_shrinker = {
167}; 167};
168 168
169static int 169static int
170cifs_idmap_key_instantiate(struct key *key, const void *data, size_t datalen) 170cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
171{ 171{
172 char *payload; 172 char *payload;
173 173
174 payload = kmalloc(datalen, GFP_KERNEL); 174 payload = kmalloc(prep->datalen, GFP_KERNEL);
175 if (!payload) 175 if (!payload)
176 return -ENOMEM; 176 return -ENOMEM;
177 177
178 memcpy(payload, data, datalen); 178 memcpy(payload, prep->data, prep->datalen);
179 key->payload.data = payload; 179 key->payload.data = payload;
180 key->datalen = datalen; 180 key->datalen = prep->datalen;
181 return 0; 181 return 0;
182} 182}
183 183
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2fdbe08a7a23..5c670b998ffb 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -67,6 +67,7 @@ enum {
67 /* Mount options that take no arguments */ 67 /* Mount options that take no arguments */
68 Opt_user_xattr, Opt_nouser_xattr, 68 Opt_user_xattr, Opt_nouser_xattr,
69 Opt_forceuid, Opt_noforceuid, 69 Opt_forceuid, Opt_noforceuid,
70 Opt_forcegid, Opt_noforcegid,
70 Opt_noblocksend, Opt_noautotune, 71 Opt_noblocksend, Opt_noautotune,
71 Opt_hard, Opt_soft, Opt_perm, Opt_noperm, 72 Opt_hard, Opt_soft, Opt_perm, Opt_noperm,
72 Opt_mapchars, Opt_nomapchars, Opt_sfu, 73 Opt_mapchars, Opt_nomapchars, Opt_sfu,
@@ -117,6 +118,8 @@ static const match_table_t cifs_mount_option_tokens = {
117 { Opt_nouser_xattr, "nouser_xattr" }, 118 { Opt_nouser_xattr, "nouser_xattr" },
118 { Opt_forceuid, "forceuid" }, 119 { Opt_forceuid, "forceuid" },
119 { Opt_noforceuid, "noforceuid" }, 120 { Opt_noforceuid, "noforceuid" },
121 { Opt_forcegid, "forcegid" },
122 { Opt_noforcegid, "noforcegid" },
120 { Opt_noblocksend, "noblocksend" }, 123 { Opt_noblocksend, "noblocksend" },
121 { Opt_noautotune, "noautotune" }, 124 { Opt_noautotune, "noautotune" },
122 { Opt_hard, "hard" }, 125 { Opt_hard, "hard" },
@@ -1195,6 +1198,12 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1195 case Opt_noforceuid: 1198 case Opt_noforceuid:
1196 override_uid = 0; 1199 override_uid = 0;
1197 break; 1200 break;
1201 case Opt_forcegid:
1202 override_gid = 1;
1203 break;
1204 case Opt_noforcegid:
1205 override_gid = 0;
1206 break;
1198 case Opt_noblocksend: 1207 case Opt_noblocksend:
1199 vol->noblocksnd = 1; 1208 vol->noblocksnd = 1;
1200 break; 1209 break;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 2126ab185045..76d974c952fe 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -183,6 +183,12 @@ smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec,
183 rc = kernel_sendmsg(ssocket, &smb_msg, &iov[first_vec], 183 rc = kernel_sendmsg(ssocket, &smb_msg, &iov[first_vec],
184 n_vec - first_vec, remaining); 184 n_vec - first_vec, remaining);
185 if (rc == -ENOSPC || rc == -EAGAIN) { 185 if (rc == -ENOSPC || rc == -EAGAIN) {
186 /*
187 * Catch if a low level driver returns -ENOSPC. This
188 * WARN_ON will be removed by 3.10 if no one reports
189 * seeing this.
190 */
191 WARN_ON_ONCE(rc == -ENOSPC);
186 i++; 192 i++;
187 if (i >= 14 || (!server->noblocksnd && (i > 2))) { 193 if (i >= 14 || (!server->noblocksnd && (i > 2))) {
188 cERROR(1, "sends on sock %p stuck for 15 " 194 cERROR(1, "sends on sock %p stuck for 15 "
diff --git a/fs/compat.c b/fs/compat.c
index b7a24d0ca30d..015e1e1f87c6 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -776,16 +776,16 @@ asmlinkage long compat_sys_mount(const char __user * dev_name,
776 char *kernel_type; 776 char *kernel_type;
777 unsigned long data_page; 777 unsigned long data_page;
778 char *kernel_dev; 778 char *kernel_dev;
779 char *dir_page; 779 struct filename *dir;
780 int retval; 780 int retval;
781 781
782 retval = copy_mount_string(type, &kernel_type); 782 retval = copy_mount_string(type, &kernel_type);
783 if (retval < 0) 783 if (retval < 0)
784 goto out; 784 goto out;
785 785
786 dir_page = getname(dir_name); 786 dir = getname(dir_name);
787 retval = PTR_ERR(dir_page); 787 retval = PTR_ERR(dir);
788 if (IS_ERR(dir_page)) 788 if (IS_ERR(dir))
789 goto out1; 789 goto out1;
790 790
791 retval = copy_mount_string(dev_name, &kernel_dev); 791 retval = copy_mount_string(dev_name, &kernel_dev);
@@ -807,7 +807,7 @@ asmlinkage long compat_sys_mount(const char __user * dev_name,
807 } 807 }
808 } 808 }
809 809
810 retval = do_mount(kernel_dev, dir_page, kernel_type, 810 retval = do_mount(kernel_dev, dir->name, kernel_type,
811 flags, (void*)data_page); 811 flags, (void*)data_page);
812 812
813 out4: 813 out4:
@@ -815,7 +815,7 @@ asmlinkage long compat_sys_mount(const char __user * dev_name,
815 out3: 815 out3:
816 kfree(kernel_dev); 816 kfree(kernel_dev);
817 out2: 817 out2:
818 putname(dir_page); 818 putname(dir);
819 out1: 819 out1:
820 kfree(kernel_type); 820 kfree(kernel_type);
821 out: 821 out:
diff --git a/fs/coredump.c b/fs/coredump.c
index fd37facac8dc..ce47379bfa61 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -450,11 +450,12 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
450 450
451 cp->file = files[1]; 451 cp->file = files[1];
452 452
453 replace_fd(0, files[0], 0); 453 err = replace_fd(0, files[0], 0);
454 fput(files[0]);
454 /* and disallow core files too */ 455 /* and disallow core files too */
455 current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1}; 456 current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
456 457
457 return 0; 458 return err;
458} 459}
459 460
460void do_coredump(siginfo_t *siginfo, struct pt_regs *regs) 461void do_coredump(siginfo_t *siginfo, struct pt_regs *regs)
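
Two fixes in one hunk: the return value of replace_fd() is no longer ignored, and the local reference on files[0] is dropped with fput(), because replace_fd() takes its own reference to the file it installs in the descriptor table. A sketch of the calling convention, assuming the 3.7-era replace_fd():

err = replace_fd(0, files[0], 0);	/* stdin now points at the pipe */
fput(files[0]);				/* drop our reference regardless:
					 * replace_fd() holds its own */
if (err < 0)
	return err;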
diff --git a/fs/exec.c b/fs/exec.c
index 4f2bebc276c5..8b9011b67041 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -59,7 +59,6 @@
59#include <asm/uaccess.h> 59#include <asm/uaccess.h>
60#include <asm/mmu_context.h> 60#include <asm/mmu_context.h>
61#include <asm/tlb.h> 61#include <asm/tlb.h>
62#include <asm/exec.h>
63 62
64#include <trace/events/task.h> 63#include <trace/events/task.h>
65#include "internal.h" 64#include "internal.h"
@@ -106,7 +105,7 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
106SYSCALL_DEFINE1(uselib, const char __user *, library) 105SYSCALL_DEFINE1(uselib, const char __user *, library)
107{ 106{
108 struct file *file; 107 struct file *file;
109 char *tmp = getname(library); 108 struct filename *tmp = getname(library);
110 int error = PTR_ERR(tmp); 109 int error = PTR_ERR(tmp);
111 static const struct open_flags uselib_flags = { 110 static const struct open_flags uselib_flags = {
112 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 111 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
@@ -392,7 +391,7 @@ struct user_arg_ptr {
392 union { 391 union {
393 const char __user *const __user *native; 392 const char __user *const __user *native;
394#ifdef CONFIG_COMPAT 393#ifdef CONFIG_COMPAT
395 compat_uptr_t __user *compat; 394 const compat_uptr_t __user *compat;
396#endif 395#endif
397 } ptr; 396 } ptr;
398}; 397};
@@ -752,13 +751,14 @@ struct file *open_exec(const char *name)
752{ 751{
753 struct file *file; 752 struct file *file;
754 int err; 753 int err;
754 struct filename tmp = { .name = name };
755 static const struct open_flags open_exec_flags = { 755 static const struct open_flags open_exec_flags = {
756 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 756 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
757 .acc_mode = MAY_EXEC | MAY_OPEN, 757 .acc_mode = MAY_EXEC | MAY_OPEN,
758 .intent = LOOKUP_OPEN 758 .intent = LOOKUP_OPEN
759 }; 759 };
760 760
761 file = do_filp_open(AT_FDCWD, name, &open_exec_flags, LOOKUP_FOLLOW); 761 file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags, LOOKUP_FOLLOW);
762 if (IS_ERR(file)) 762 if (IS_ERR(file))
763 goto out; 763 goto out;
764 764
@@ -1574,9 +1574,9 @@ int do_execve(const char *filename,
1574} 1574}
1575 1575
1576#ifdef CONFIG_COMPAT 1576#ifdef CONFIG_COMPAT
1577int compat_do_execve(char *filename, 1577int compat_do_execve(const char *filename,
1578 compat_uptr_t __user *__argv, 1578 const compat_uptr_t __user *__argv,
1579 compat_uptr_t __user *__envp, 1579 const compat_uptr_t __user *__envp,
1580 struct pt_regs *regs) 1580 struct pt_regs *regs)
1581{ 1581{
1582 struct user_arg_ptr argv = { 1582 struct user_arg_ptr argv = {
@@ -1658,3 +1658,56 @@ int get_dumpable(struct mm_struct *mm)
1658{ 1658{
1659 return __get_dumpable(mm->flags); 1659 return __get_dumpable(mm->flags);
1660} 1660}
1661
1662#ifdef __ARCH_WANT_SYS_EXECVE
1663SYSCALL_DEFINE3(execve,
1664 const char __user *, filename,
1665 const char __user *const __user *, argv,
1666 const char __user *const __user *, envp)
1667{
1668 struct filename *path = getname(filename);
1669 int error = PTR_ERR(path);
1670 if (!IS_ERR(path)) {
1671 error = do_execve(path->name, argv, envp, current_pt_regs());
1672 putname(path);
1673 }
1674 return error;
1675}
1676#ifdef CONFIG_COMPAT
1677asmlinkage long compat_sys_execve(const char __user * filename,
1678 const compat_uptr_t __user * argv,
1679 const compat_uptr_t __user * envp)
1680{
1681 struct filename *path = getname(filename);
1682 int error = PTR_ERR(path);
1683 if (!IS_ERR(path)) {
1684 error = compat_do_execve(path->name, argv, envp,
1685 current_pt_regs());
1686 putname(path);
1687 }
1688 return error;
1689}
1690#endif
1691#endif
1692
1693#ifdef __ARCH_WANT_KERNEL_EXECVE
1694int kernel_execve(const char *filename,
1695 const char *const argv[],
1696 const char *const envp[])
1697{
1698 struct pt_regs *p = current_pt_regs();
1699 int ret;
1700
1701 ret = do_execve(filename,
1702 (const char __user *const __user *)argv,
1703 (const char __user *const __user *)envp, p);
1704 if (ret < 0)
1705 return ret;
1706
1707 /*
1708 * We were successful. We won't be returning to our caller, but
1709 * instead to user space by manipulating the kernel stack.
1710 */
1711 ret_from_kernel_execve(p);
1712}
1713#endif
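
The two #ifdef blocks give architectures a generic sys_execve()/kernel_execve() to opt into instead of hand-rolled assembly wrappers. An arch that converts defines the want-macros and supplies two hooks; a hedged sketch for a hypothetical architecture (current_pt_regs and ret_from_kernel_execve are the hook names used in the code above, the header path is illustrative):

/* arch/foo/include/asm/unistd.h (hypothetical arch) */
#define __ARCH_WANT_SYS_EXECVE		/* take generic sys_execve()    */
#define __ARCH_WANT_KERNEL_EXECVE	/* take generic kernel_execve() */

/* Hooks the arch must provide: */
struct pt_regs *current_pt_regs(void);	/* current task's user regs     */
void ret_from_kernel_execve(struct pt_regs *regs) __noreturn;
					/* switch stacks, enter userland */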
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 1585db1aa365..f936cb50dc0d 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -814,8 +814,8 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
814 struct bio *bio; 814 struct bio *bio;
815 815
816 if (per_dev != master_dev) { 816 if (per_dev != master_dev) {
817 bio = bio_kmalloc(GFP_KERNEL, 817 bio = bio_clone_kmalloc(master_dev->bio,
818 master_dev->bio->bi_max_vecs); 818 GFP_KERNEL);
819 if (unlikely(!bio)) { 819 if (unlikely(!bio)) {
820 ORE_DBGMSG( 820 ORE_DBGMSG(
821 "Failed to allocate BIO size=%u\n", 821 "Failed to allocate BIO size=%u\n",
@@ -824,7 +824,6 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
824 goto out; 824 goto out;
825 } 825 }
826 826
827 __bio_clone(bio, master_dev->bio);
828 bio->bi_bdev = NULL; 827 bio->bi_bdev = NULL;
829 bio->bi_next = NULL; 828 bio->bi_next = NULL;
830 per_dev->offset = master_dev->offset; 829 per_dev->offset = master_dev->offset;
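
bio_clone_kmalloc() (added by this series' fs/bio.c changes) folds the old two-step allocate-then-clone into one call, which is why the explicit __bio_clone() disappears here. Before/after, in sketch form:

/* Before: size the new bio by hand, then copy the source's fields. */
bio = bio_kmalloc(GFP_KERNEL, src->bi_max_vecs);
if (bio)
	__bio_clone(bio, src);

/* After: allocate and clone in one call; returns NULL on failure. */
bio = bio_clone_kmalloc(src, GFP_KERNEL);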
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 59e3bbfac0b1..5e59280d42d7 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -389,8 +389,6 @@ static int exofs_sync_fs(struct super_block *sb, int wait)
389 if (unlikely(ret)) 389 if (unlikely(ret))
390 goto out; 390 goto out;
391 391
392 lock_super(sb);
393
394 ios->length = offsetof(struct exofs_fscb, s_dev_table_oid); 392 ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
395 memset(fscb, 0, ios->length); 393 memset(fscb, 0, ios->length);
396 fscb->s_nextid = cpu_to_le64(sbi->s_nextid); 394 fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
@@ -406,8 +404,6 @@ static int exofs_sync_fs(struct super_block *sb, int wait)
406 if (unlikely(ret)) 404 if (unlikely(ret))
407 EXOFS_ERR("%s: ore_write failed.\n", __func__); 405 EXOFS_ERR("%s: ore_write failed.\n", __func__);
408 406
409
410 unlock_super(sb);
411out: 407out:
412 EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); 408 EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
413 ore_put_io_state(ios); 409 ore_put_io_state(ios);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index ebf8312c3a4e..5366393528df 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2578,11 +2578,9 @@ out:
2578static int ext3_unfreeze(struct super_block *sb) 2578static int ext3_unfreeze(struct super_block *sb)
2579{ 2579{
2580 if (!(sb->s_flags & MS_RDONLY)) { 2580 if (!(sb->s_flags & MS_RDONLY)) {
2581 lock_super(sb);
2582	/* Reset the needs_recovery flag before the fs is unlocked. */ 2581	/* Reset the needs_recovery flag before the fs is unlocked. */
2583 EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); 2582 EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
2584 ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); 2583 ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
2585 unlock_super(sb);
2586 journal_unlock_updates(EXT3_SB(sb)->s_journal); 2584 journal_unlock_updates(EXT3_SB(sb)->s_journal);
2587 } 2585 }
2588 return 0; 2586 return 0;
@@ -2602,7 +2600,6 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2602#endif 2600#endif
2603 2601
2604 /* Store the original options */ 2602 /* Store the original options */
2605 lock_super(sb);
2606 old_sb_flags = sb->s_flags; 2603 old_sb_flags = sb->s_flags;
2607 old_opts.s_mount_opt = sbi->s_mount_opt; 2604 old_opts.s_mount_opt = sbi->s_mount_opt;
2608 old_opts.s_resuid = sbi->s_resuid; 2605 old_opts.s_resuid = sbi->s_resuid;
@@ -2708,8 +2705,6 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2708 old_opts.s_qf_names[i] != sbi->s_qf_names[i]) 2705 old_opts.s_qf_names[i] != sbi->s_qf_names[i])
2709 kfree(old_opts.s_qf_names[i]); 2706 kfree(old_opts.s_qf_names[i]);
2710#endif 2707#endif
2711 unlock_super(sb);
2712
2713 if (enable_quota) 2708 if (enable_quota)
2714 dquot_resume(sb, -1); 2709 dquot_resume(sb, -1);
2715 return 0; 2710 return 0;
@@ -2728,7 +2723,6 @@ restore_opts:
2728 sbi->s_qf_names[i] = old_opts.s_qf_names[i]; 2723 sbi->s_qf_names[i] = old_opts.s_qf_names[i];
2729 } 2724 }
2730#endif 2725#endif
2731 unlock_super(sb);
2732 return err; 2726 return err;
2733} 2727}
2734 2728
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index bca6d0a1255e..2a182342442e 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -571,7 +571,7 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
571 int short_len = 0, fill_len = 0; 571 int short_len = 0, fill_len = 0;
572 int ret = 0; 572 int ret = 0;
573 573
574 lock_super(sb); 574 mutex_lock(&sbi->s_lock);
575 575
576 cpos = filp->f_pos; 576 cpos = filp->f_pos;
577 /* Fake . and .. for the root directory. */ 577 /* Fake . and .. for the root directory. */
@@ -693,7 +693,7 @@ fill_failed:
693 if (unicode) 693 if (unicode)
694 __putname(unicode); 694 __putname(unicode);
695out: 695out:
696 unlock_super(sb); 696 mutex_unlock(&sbi->s_lock);
697 return ret; 697 return ret;
698} 698}
699 699
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index ca7e8f8bad7c..623f36f0423b 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -71,8 +71,9 @@ struct msdos_sb_info {
71 unsigned long root_cluster; /* first cluster of the root directory */ 71 unsigned long root_cluster; /* first cluster of the root directory */
72 unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */ 72 unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */
73 struct mutex fat_lock; 73 struct mutex fat_lock;
74 unsigned int prev_free; /* previously allocated cluster number */ 74 struct mutex s_lock;
75 unsigned int free_clusters; /* -1 if undefined */ 75 unsigned int prev_free; /* previously allocated cluster number */
76 unsigned int free_clusters; /* -1 if undefined */
76 unsigned int free_clus_valid; /* is free_clusters valid? */ 77 unsigned int free_clus_valid; /* is free_clusters valid? */
77 struct fat_mount_options options; 78 struct fat_mount_options options;
78 struct nls_table *nls_disk; /* Codepage used on disk */ 79 struct nls_table *nls_disk; /* Codepage used on disk */
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 76f60c642c06..5bafaad00530 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -673,9 +673,9 @@ static int fat_write_inode(struct inode *inode, struct writeback_control *wbc)
673 if (inode->i_ino == MSDOS_FSINFO_INO) { 673 if (inode->i_ino == MSDOS_FSINFO_INO) {
674 struct super_block *sb = inode->i_sb; 674 struct super_block *sb = inode->i_sb;
675 675
676 lock_super(sb); 676 mutex_lock(&MSDOS_SB(sb)->s_lock);
677 err = fat_clusters_flush(sb); 677 err = fat_clusters_flush(sb);
678 unlock_super(sb); 678 mutex_unlock(&MSDOS_SB(sb)->s_lock);
679 } else 679 } else
680 err = __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); 680 err = __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
681 681
@@ -1268,6 +1268,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
1268 b = (struct fat_boot_sector *) bh->b_data; 1268 b = (struct fat_boot_sector *) bh->b_data;
1269 } 1269 }
1270 1270
1271 mutex_init(&sbi->s_lock);
1271 sbi->cluster_size = sb->s_blocksize * sbi->sec_per_clus; 1272 sbi->cluster_size = sb->s_blocksize * sbi->sec_per_clus;
1272 sbi->cluster_bits = ffs(sbi->cluster_size) - 1; 1273 sbi->cluster_bits = ffs(sbi->cluster_size) - 1;
1273 sbi->fats = b->fats; 1274 sbi->fats = b->fats;
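
These fat changes belong to the removal of the global lock_super()/unlock_super() helpers: serialization moves into a mutex owned by the filesystem itself. The recipe, as applied above and in the namei files below:

/* 1. Give the fs its own lock in its sb-private info. */
struct msdos_sb_info {
	/* ... */
	struct mutex s_lock;	/* replaces lock_super(sb) for this fs */
};

/* 2. Initialize it once at mount time (fat_fill_super()). */
mutex_init(&sbi->s_lock);

/* 3. Take it wherever lock_super(sb) used to be taken. */
mutex_lock(&MSDOS_SB(sb)->s_lock);
/* ... directory modification ... */
mutex_unlock(&MSDOS_SB(sb)->s_lock);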
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index c1055e778fff..e2cfda94a28d 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -208,7 +208,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
208 struct inode *inode; 208 struct inode *inode;
209 int err; 209 int err;
210 210
211 lock_super(sb); 211 mutex_lock(&MSDOS_SB(sb)->s_lock);
212 err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); 212 err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
213 switch (err) { 213 switch (err) {
214 case -ENOENT: 214 case -ENOENT:
@@ -221,7 +221,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
221 default: 221 default:
222 inode = ERR_PTR(err); 222 inode = ERR_PTR(err);
223 } 223 }
224 unlock_super(sb); 224 mutex_unlock(&MSDOS_SB(sb)->s_lock);
225 return d_splice_alias(inode, dentry); 225 return d_splice_alias(inode, dentry);
226} 226}
227 227
@@ -273,7 +273,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode,
273 unsigned char msdos_name[MSDOS_NAME]; 273 unsigned char msdos_name[MSDOS_NAME];
274 int err, is_hid; 274 int err, is_hid;
275 275
276 lock_super(sb); 276 mutex_lock(&MSDOS_SB(sb)->s_lock);
277 277
278 err = msdos_format_name(dentry->d_name.name, dentry->d_name.len, 278 err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
279 msdos_name, &MSDOS_SB(sb)->options); 279 msdos_name, &MSDOS_SB(sb)->options);
@@ -302,7 +302,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode,
302 302
303 d_instantiate(dentry, inode); 303 d_instantiate(dentry, inode);
304out: 304out:
305 unlock_super(sb); 305 mutex_unlock(&MSDOS_SB(sb)->s_lock);
306 if (!err) 306 if (!err)
307 err = fat_flush_inodes(sb, dir, inode); 307 err = fat_flush_inodes(sb, dir, inode);
308 return err; 308 return err;
@@ -316,7 +316,7 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
316 struct fat_slot_info sinfo; 316 struct fat_slot_info sinfo;
317 int err; 317 int err;
318 318
319 lock_super(sb); 319 mutex_lock(&MSDOS_SB(sb)->s_lock);
320 /* 320 /*
321 * Check whether the directory is not in use, then check 321 * Check whether the directory is not in use, then check
322 * whether it is empty. 322 * whether it is empty.
@@ -337,7 +337,7 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
337 inode->i_ctime = CURRENT_TIME_SEC; 337 inode->i_ctime = CURRENT_TIME_SEC;
338 fat_detach(inode); 338 fat_detach(inode);
339out: 339out:
340 unlock_super(sb); 340 mutex_unlock(&MSDOS_SB(sb)->s_lock);
341 if (!err) 341 if (!err)
342 err = fat_flush_inodes(sb, dir, inode); 342 err = fat_flush_inodes(sb, dir, inode);
343 343
@@ -354,7 +354,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
354 struct timespec ts; 354 struct timespec ts;
355 int err, is_hid, cluster; 355 int err, is_hid, cluster;
356 356
357 lock_super(sb); 357 mutex_lock(&MSDOS_SB(sb)->s_lock);
358 358
359 err = msdos_format_name(dentry->d_name.name, dentry->d_name.len, 359 err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
360 msdos_name, &MSDOS_SB(sb)->options); 360 msdos_name, &MSDOS_SB(sb)->options);
@@ -392,14 +392,14 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
392 392
393 d_instantiate(dentry, inode); 393 d_instantiate(dentry, inode);
394 394
395 unlock_super(sb); 395 mutex_unlock(&MSDOS_SB(sb)->s_lock);
396 fat_flush_inodes(sb, dir, inode); 396 fat_flush_inodes(sb, dir, inode);
397 return 0; 397 return 0;
398 398
399out_free: 399out_free:
400 fat_free_clusters(dir, cluster); 400 fat_free_clusters(dir, cluster);
401out: 401out:
402 unlock_super(sb); 402 mutex_unlock(&MSDOS_SB(sb)->s_lock);
403 return err; 403 return err;
404} 404}
405 405
@@ -411,7 +411,7 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry)
411 struct fat_slot_info sinfo; 411 struct fat_slot_info sinfo;
412 int err; 412 int err;
413 413
414 lock_super(sb); 414 mutex_lock(&MSDOS_SB(sb)->s_lock);
415 err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); 415 err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
416 if (err) 416 if (err)
417 goto out; 417 goto out;
@@ -423,7 +423,7 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry)
423 inode->i_ctime = CURRENT_TIME_SEC; 423 inode->i_ctime = CURRENT_TIME_SEC;
424 fat_detach(inode); 424 fat_detach(inode);
425out: 425out:
426 unlock_super(sb); 426 mutex_unlock(&MSDOS_SB(sb)->s_lock);
427 if (!err) 427 if (!err)
428 err = fat_flush_inodes(sb, dir, inode); 428 err = fat_flush_inodes(sb, dir, inode);
429 429
@@ -606,7 +606,7 @@ static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
606 unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME]; 606 unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME];
607 int err, is_hid; 607 int err, is_hid;
608 608
609 lock_super(sb); 609 mutex_lock(&MSDOS_SB(sb)->s_lock);
610 610
611 err = msdos_format_name(old_dentry->d_name.name, 611 err = msdos_format_name(old_dentry->d_name.name,
612 old_dentry->d_name.len, old_msdos_name, 612 old_dentry->d_name.len, old_msdos_name,
@@ -625,7 +625,7 @@ static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
625 err = do_msdos_rename(old_dir, old_msdos_name, old_dentry, 625 err = do_msdos_rename(old_dir, old_msdos_name, old_dentry,
626 new_dir, new_msdos_name, new_dentry, is_hid); 626 new_dir, new_msdos_name, new_dentry, is_hid);
627out: 627out:
628 unlock_super(sb); 628 mutex_unlock(&MSDOS_SB(sb)->s_lock);
629 if (!err) 629 if (!err)
630 err = fat_flush_inodes(sb, old_dir, new_dir); 630 err = fat_flush_inodes(sb, old_dir, new_dir);
631 return err; 631 return err;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index e535dd75b986..ac959d655e7d 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -721,7 +721,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
721 struct dentry *alias; 721 struct dentry *alias;
722 int err; 722 int err;
723 723
724 lock_super(sb); 724 mutex_lock(&MSDOS_SB(sb)->s_lock);
725 725
726 err = vfat_find(dir, &dentry->d_name, &sinfo); 726 err = vfat_find(dir, &dentry->d_name, &sinfo);
727 if (err) { 727 if (err) {
@@ -752,13 +752,13 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
752 if (!S_ISDIR(inode->i_mode)) 752 if (!S_ISDIR(inode->i_mode))
753 d_move(alias, dentry); 753 d_move(alias, dentry);
754 iput(inode); 754 iput(inode);
755 unlock_super(sb); 755 mutex_unlock(&MSDOS_SB(sb)->s_lock);
756 return alias; 756 return alias;
757 } else 757 } else
758 dput(alias); 758 dput(alias);
759 759
760out: 760out:
761 unlock_super(sb); 761 mutex_unlock(&MSDOS_SB(sb)->s_lock);
762 dentry->d_time = dentry->d_parent->d_inode->i_version; 762 dentry->d_time = dentry->d_parent->d_inode->i_version;
763 dentry = d_splice_alias(inode, dentry); 763 dentry = d_splice_alias(inode, dentry);
764 if (dentry) 764 if (dentry)
@@ -766,7 +766,7 @@ out:
766 return dentry; 766 return dentry;
767 767
768error: 768error:
769 unlock_super(sb); 769 mutex_unlock(&MSDOS_SB(sb)->s_lock);
770 return ERR_PTR(err); 770 return ERR_PTR(err);
771} 771}
772 772
@@ -779,7 +779,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
779 struct timespec ts; 779 struct timespec ts;
780 int err; 780 int err;
781 781
782 lock_super(sb); 782 mutex_lock(&MSDOS_SB(sb)->s_lock);
783 783
784 ts = CURRENT_TIME_SEC; 784 ts = CURRENT_TIME_SEC;
785 err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo); 785 err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo);
@@ -800,7 +800,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
800 dentry->d_time = dentry->d_parent->d_inode->i_version; 800 dentry->d_time = dentry->d_parent->d_inode->i_version;
801 d_instantiate(dentry, inode); 801 d_instantiate(dentry, inode);
802out: 802out:
803 unlock_super(sb); 803 mutex_unlock(&MSDOS_SB(sb)->s_lock);
804 return err; 804 return err;
805} 805}
806 806
@@ -811,7 +811,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
811 struct fat_slot_info sinfo; 811 struct fat_slot_info sinfo;
812 int err; 812 int err;
813 813
814 lock_super(sb); 814 mutex_lock(&MSDOS_SB(sb)->s_lock);
815 815
816 err = fat_dir_empty(inode); 816 err = fat_dir_empty(inode);
817 if (err) 817 if (err)
@@ -829,7 +829,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
829 inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; 829 inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
830 fat_detach(inode); 830 fat_detach(inode);
831out: 831out:
832 unlock_super(sb); 832 mutex_unlock(&MSDOS_SB(sb)->s_lock);
833 833
834 return err; 834 return err;
835} 835}
@@ -841,7 +841,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry)
841 struct fat_slot_info sinfo; 841 struct fat_slot_info sinfo;
842 int err; 842 int err;
843 843
844 lock_super(sb); 844 mutex_lock(&MSDOS_SB(sb)->s_lock);
845 845
846 err = vfat_find(dir, &dentry->d_name, &sinfo); 846 err = vfat_find(dir, &dentry->d_name, &sinfo);
847 if (err) 847 if (err)
@@ -854,7 +854,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry)
854 inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; 854 inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
855 fat_detach(inode); 855 fat_detach(inode);
856out: 856out:
857 unlock_super(sb); 857 mutex_unlock(&MSDOS_SB(sb)->s_lock);
858 858
859 return err; 859 return err;
860} 860}
@@ -867,7 +867,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
867 struct timespec ts; 867 struct timespec ts;
868 int err, cluster; 868 int err, cluster;
869 869
870 lock_super(sb); 870 mutex_lock(&MSDOS_SB(sb)->s_lock);
871 871
872 ts = CURRENT_TIME_SEC; 872 ts = CURRENT_TIME_SEC;
873 cluster = fat_alloc_new_dir(dir, &ts); 873 cluster = fat_alloc_new_dir(dir, &ts);
@@ -896,13 +896,13 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
896 dentry->d_time = dentry->d_parent->d_inode->i_version; 896 dentry->d_time = dentry->d_parent->d_inode->i_version;
897 d_instantiate(dentry, inode); 897 d_instantiate(dentry, inode);
898 898
899 unlock_super(sb); 899 mutex_unlock(&MSDOS_SB(sb)->s_lock);
900 return 0; 900 return 0;
901 901
902out_free: 902out_free:
903 fat_free_clusters(dir, cluster); 903 fat_free_clusters(dir, cluster);
904out: 904out:
905 unlock_super(sb); 905 mutex_unlock(&MSDOS_SB(sb)->s_lock);
906 return err; 906 return err;
907} 907}
908 908
@@ -921,7 +921,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
921 old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; 921 old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
922 old_inode = old_dentry->d_inode; 922 old_inode = old_dentry->d_inode;
923 new_inode = new_dentry->d_inode; 923 new_inode = new_dentry->d_inode;
924 lock_super(sb); 924 mutex_lock(&MSDOS_SB(sb)->s_lock);
925 err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo); 925 err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo);
926 if (err) 926 if (err)
927 goto out; 927 goto out;
@@ -996,7 +996,7 @@ out:
996 brelse(sinfo.bh); 996 brelse(sinfo.bh);
997 brelse(dotdot_bh); 997 brelse(dotdot_bh);
998 brelse(old_sinfo.bh); 998 brelse(old_sinfo.bh);
999 unlock_super(sb); 999 mutex_unlock(&MSDOS_SB(sb)->s_lock);
1000 1000
1001 return err; 1001 return err;
1002 1002
diff --git a/fs/file.c b/fs/file.c
index 0f1bda4bebfa..d3b5fa80b71b 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -922,6 +922,9 @@ SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
922 if ((flags & ~O_CLOEXEC) != 0) 922 if ((flags & ~O_CLOEXEC) != 0)
923 return -EINVAL; 923 return -EINVAL;
924 924
925 if (unlikely(oldfd == newfd))
926 return -EINVAL;
927
925 if (newfd >= rlimit(RLIMIT_NOFILE)) 928 if (newfd >= rlimit(RLIMIT_NOFILE))
926 return -EMFILE; 929 return -EMFILE;
927 930
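
dup2(oldfd, newfd) with oldfd == newfd is a successful no-op, but dup3() is documented to fail with EINVAL in that case; the added check restores the documented behavior. A small userspace demonstration (assumes glibc's dup3() wrapper and _GNU_SOURCE):

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int r2 = dup2(1, 1);		/* succeeds: returns 1          */
	errno = 0;
	int r3 = dup3(1, 1, 0);		/* fails: returns -1, EINVAL    */
	printf("dup2=%d dup3=%d errno=%d\n", r2, r3, errno);
	return 0;
}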
diff --git a/fs/file_table.c b/fs/file_table.c
index dac67923330f..a72bf9ddd0d2 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -36,7 +36,7 @@ struct files_stat_struct files_stat = {
36 .max_files = NR_FILE 36 .max_files = NR_FILE
37}; 37};
38 38
39DEFINE_LGLOCK(files_lglock); 39DEFINE_STATIC_LGLOCK(files_lglock);
40 40
41/* SLAB cache for file structures */ 41/* SLAB cache for file structures */
42static struct kmem_cache *filp_cachep __read_mostly; 42static struct kmem_cache *filp_cachep __read_mostly;
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 96f24286667a..da165f6adcbf 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -124,7 +124,7 @@ EXPORT_SYMBOL(unregister_filesystem);
124static int fs_index(const char __user * __name) 124static int fs_index(const char __user * __name)
125{ 125{
126 struct file_system_type * tmp; 126 struct file_system_type * tmp;
127 char * name; 127 struct filename *name;
128 int err, index; 128 int err, index;
129 129
130 name = getname(__name); 130 name = getname(__name);
@@ -135,7 +135,7 @@ static int fs_index(const char __user * __name)
135 err = -EINVAL; 135 err = -EINVAL;
136 read_lock(&file_systems_lock); 136 read_lock(&file_systems_lock);
137 for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) { 137 for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) {
138 if (strcmp(tmp->name,name) == 0) { 138 if (strcmp(tmp->name, name->name) == 0) {
139 err = index; 139 err = index;
140 break; 140 break;
141 } 141 }
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 401b6c6248ae..51ea267d444c 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -249,7 +249,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
249} 249}
250 250
251/* 251/*
252 * Move expired (dirtied after work->older_than_this) dirty inodes from 252 * Move expired (dirtied before work->older_than_this) dirty inodes from
253 * @delaying_queue to @dispatch_queue. 253 * @delaying_queue to @dispatch_queue.
254 */ 254 */
255static int move_expired_inodes(struct list_head *delaying_queue, 255static int move_expired_inodes(struct list_head *delaying_queue,
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index e8ed6d4a6181..4767774a5f3e 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -161,6 +161,8 @@ static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid,
161 case GFS2_SMALL_FH_SIZE: 161 case GFS2_SMALL_FH_SIZE:
162 case GFS2_LARGE_FH_SIZE: 162 case GFS2_LARGE_FH_SIZE:
163 case GFS2_OLD_FH_SIZE: 163 case GFS2_OLD_FH_SIZE:
164 if (fh_len < GFS2_SMALL_FH_SIZE)
165 return NULL;
164 this.no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32; 166 this.no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32;
165 this.no_formal_ino |= be32_to_cpu(fh[1]); 167 this.no_formal_ino |= be32_to_cpu(fh[1]);
166 this.no_addr = ((u64)be32_to_cpu(fh[2])) << 32; 168 this.no_addr = ((u64)be32_to_cpu(fh[2])) << 32;
@@ -180,6 +182,8 @@ static struct dentry *gfs2_fh_to_parent(struct super_block *sb, struct fid *fid,
180 switch (fh_type) { 182 switch (fh_type) {
181 case GFS2_LARGE_FH_SIZE: 183 case GFS2_LARGE_FH_SIZE:
182 case GFS2_OLD_FH_SIZE: 184 case GFS2_OLD_FH_SIZE:
185 if (fh_len < GFS2_LARGE_FH_SIZE)
186 return NULL;
183 parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32; 187 parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32;
184 parent.no_formal_ino |= be32_to_cpu(fh[5]); 188 parent.no_formal_ino |= be32_to_cpu(fh[5]);
185 parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32; 189 parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32;
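
A file handle arrives from outside the kernel (e.g. from an NFS client), so fh_type alone cannot be trusted to imply a length; both decoders now check fh_len before dereferencing fh[] words. The isofs hunk below applies the same hardening. The shape of the check:

/* fh[] is an array of __be32; decoding fh[0..3] needs at least
 * GFS2_SMALL_FH_SIZE bytes, fh[4..7] at least GFS2_LARGE_FH_SIZE. */
if (fh_len < GFS2_SMALL_FH_SIZE)
	return NULL;	/* short/garbage handle: refuse to decode */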
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 1fe731337f07..9c88da0e855a 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -1,7 +1,7 @@
1#ifndef __UM_FS_HOSTFS 1#ifndef __UM_FS_HOSTFS
2#define __UM_FS_HOSTFS 2#define __UM_FS_HOSTFS
3 3
4#include "os.h" 4#include <os.h>
5 5
6/* 6/*
7 * These are exactly the same definitions as in fs.h, but the names are 7 * These are exactly the same definitions as in fs.h, but the names are
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 6c9f3a9d5e21..457addc5c91f 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -16,8 +16,8 @@
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include "hostfs.h" 18#include "hostfs.h"
19#include "init.h" 19#include <init.h>
20#include "kern.h" 20#include <kern.h>
21 21
22struct hostfs_inode_info { 22struct hostfs_inode_info {
23 int fd; 23 int fd;
@@ -848,9 +848,11 @@ int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
848 attr->ia_size != i_size_read(inode)) { 848 attr->ia_size != i_size_read(inode)) {
849 int error; 849 int error;
850 850
851 error = vmtruncate(inode, attr->ia_size); 851 error = inode_newsize_ok(inode, attr->ia_size);
852 if (err) 852 if (error)
853 return err; 853 return error;
854
855 truncate_setsize(inode, attr->ia_size);
854 } 856 }
855 857
856 setattr_copy(inode, attr); 858 setattr_copy(inode, attr);
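
Besides dropping the deprecated vmtruncate() (and fixing the old hunk's err/error variable mix-up), the modern idiom is to validate the new size first and only then commit it. The two-step replacement, as used above:

if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) {
	int error = inode_newsize_ok(inode, attr->ia_size);
	if (error)			/* e.g. -EFBIG against rlimits */
		return error;
	truncate_setsize(inode, attr->ia_size);	/* update i_size, drop pages */
}
setattr_copy(inode, attr);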
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index a74ad0d371c2..67838f3aa20a 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -15,7 +15,6 @@
15#include <sys/types.h> 15#include <sys/types.h>
16#include <sys/vfs.h> 16#include <sys/vfs.h>
17#include "hostfs.h" 17#include "hostfs.h"
18#include "os.h"
19#include <utime.h> 18#include <utime.h>
20 19
21static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p) 20static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p)
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index bc28bf077a6a..a3076228523d 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -398,7 +398,6 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
398 *flags |= MS_NOATIME; 398 *flags |= MS_NOATIME;
399 399
400 hpfs_lock(s); 400 hpfs_lock(s);
401 lock_super(s);
402 uid = sbi->sb_uid; gid = sbi->sb_gid; 401 uid = sbi->sb_uid; gid = sbi->sb_gid;
403 umask = 0777 & ~sbi->sb_mode; 402 umask = 0777 & ~sbi->sb_mode;
404 lowercase = sbi->sb_lowercase; 403 lowercase = sbi->sb_lowercase;
@@ -431,12 +430,10 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
431 430
432 replace_mount_options(s, new_opts); 431 replace_mount_options(s, new_opts);
433 432
434 unlock_super(s);
435 hpfs_unlock(s); 433 hpfs_unlock(s);
436 return 0; 434 return 0;
437 435
438out_err: 436out_err:
439 unlock_super(s);
440 hpfs_unlock(s); 437 hpfs_unlock(s);
441 kfree(new_opts); 438 kfree(new_opts);
442 return -EINVAL; 439 return -EINVAL;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index c1dffe47fde2..78f21f8dc2ec 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -18,7 +18,7 @@
18#include <linux/pid_namespace.h> 18#include <linux/pid_namespace.h>
19#include <linux/namei.h> 19#include <linux/namei.h>
20#include <asm/uaccess.h> 20#include <asm/uaccess.h>
21#include "os.h" 21#include <os.h>
22 22
23static struct inode *get_inode(struct super_block *, struct dentry *); 23static struct inode *get_inode(struct super_block *, struct dentry *);
24 24
@@ -674,7 +674,7 @@ static struct inode *get_inode(struct super_block *sb, struct dentry *dentry)
674 674
675 if (!inode) { 675 if (!inode) {
676 dput(dentry); 676 dput(dentry);
677 return ERR_PTR(-ENOMEM); 677 return NULL;
678 } 678 }
679 679
680 if (S_ISDIR(dentry->d_inode->i_mode)) { 680 if (S_ISDIR(dentry->d_inode->i_mode)) {
diff --git a/fs/internal.h b/fs/internal.h
index 371bcc4b1697..916b7cbf3e3e 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -97,8 +97,8 @@ struct open_flags {
97 int acc_mode; 97 int acc_mode;
98 int intent; 98 int intent;
99}; 99};
100extern struct file *do_filp_open(int dfd, const char *pathname, 100extern struct file *do_filp_open(int dfd, struct filename *pathname,
101 const struct open_flags *op, int lookup_flags); 101 const struct open_flags *op, int flags);
102extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, 102extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
103 const char *, const struct open_flags *, int lookup_flags); 103 const char *, const struct open_flags *, int lookup_flags);
104 104
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index 1d3804492aa7..2b4f2358eadb 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -175,7 +175,7 @@ static struct dentry *isofs_fh_to_parent(struct super_block *sb,
175{ 175{
176 struct isofs_fid *ifid = (struct isofs_fid *)fid; 176 struct isofs_fid *ifid = (struct isofs_fid *)fid;
177 177
178 if (fh_type != 2) 178 if (fh_len < 2 || fh_type != 2)
179 return NULL; 179 return NULL;
180 180
181 return isofs_export_iget(sb, 181 return isofs_export_iget(sb,
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index ff487954cd96..d3d8799e2187 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -100,6 +100,10 @@ static int jffs2_sync_fs(struct super_block *sb, int wait)
100{ 100{
101 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); 101 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
102 102
103#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
104 cancel_delayed_work_sync(&c->wbuf_dwork);
105#endif
106
103 mutex_lock(&c->alloc_sem); 107 mutex_lock(&c->alloc_sem);
104 jffs2_flush_wbuf_pad(c); 108 jffs2_flush_wbuf_pad(c);
105 mutex_unlock(&c->alloc_sem); 109 mutex_unlock(&c->alloc_sem);
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 6f4529d3697f..a6597d60d76d 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1044,10 +1044,10 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c,
1044 ops.datbuf = NULL; 1044 ops.datbuf = NULL;
1045 1045
1046 ret = mtd_read_oob(c->mtd, jeb->offset, &ops); 1046 ret = mtd_read_oob(c->mtd, jeb->offset, &ops);
1047 if (ret || ops.oobretlen != ops.ooblen) { 1047 if ((ret && !mtd_is_bitflip(ret)) || ops.oobretlen != ops.ooblen) {
1048 pr_err("cannot read OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n", 1048 pr_err("cannot read OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n",
1049 jeb->offset, ops.ooblen, ops.oobretlen, ret); 1049 jeb->offset, ops.ooblen, ops.oobretlen, ret);
1050 if (!ret) 1050 if (!ret || mtd_is_bitflip(ret))
1051 ret = -EIO; 1051 ret = -EIO;
1052 return ret; 1052 return ret;
1053 } 1053 }
@@ -1086,10 +1086,10 @@ int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c,
1086 ops.datbuf = NULL; 1086 ops.datbuf = NULL;
1087 1087
1088 ret = mtd_read_oob(c->mtd, jeb->offset, &ops); 1088 ret = mtd_read_oob(c->mtd, jeb->offset, &ops);
1089 if (ret || ops.oobretlen != ops.ooblen) { 1089 if ((ret && !mtd_is_bitflip(ret)) || ops.oobretlen != ops.ooblen) {
1090 pr_err("cannot read OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n", 1090 pr_err("cannot read OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n",
1091 jeb->offset, ops.ooblen, ops.oobretlen, ret); 1091 jeb->offset, ops.ooblen, ops.oobretlen, ret);
1092 if (!ret) 1092 if (!ret || mtd_is_bitflip(ret))
1093 ret = -EIO; 1093 ret = -EIO;
1094 return ret; 1094 return ret;
1095 } 1095 }
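
mtd_read_oob() reports corrected bitflips as -EUCLEAN: the data is good but the eraseblock is degrading. mtd_is_bitflip() is the helper for exactly that test, so both checks now let corrected reads pass while still turning a short read into -EIO. The decision table in sketch form:

ret = mtd_read_oob(c->mtd, jeb->offset, &ops);
/* ret == 0           : clean read             -> proceed
 * mtd_is_bitflip(ret): ECC-corrected read     -> proceed
 * other ret < 0      : hard failure           -> return ret
 * short oobretlen    : inconsistent "success" -> return -EIO */
if ((ret && !mtd_is_bitflip(ret)) || ops.oobretlen != ops.ooblen) {
	if (!ret || mtd_is_bitflip(ret))
		ret = -EIO;
	return ret;
}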
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 7ef14b3c5bee..e4fb3ba5a58a 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -7,7 +7,6 @@
7 */ 7 */
8 8
9#include <linux/types.h> 9#include <linux/types.h>
10#include <linux/utsname.h>
11#include <linux/kernel.h> 10#include <linux/kernel.h>
12#include <linux/ktime.h> 11#include <linux/ktime.h>
13#include <linux/slab.h> 12#include <linux/slab.h>
@@ -19,6 +18,8 @@
19 18
20#include <asm/unaligned.h> 19#include <asm/unaligned.h>
21 20
21#include "netns.h"
22
22#define NLMDBG_FACILITY NLMDBG_MONITOR 23#define NLMDBG_FACILITY NLMDBG_MONITOR
23#define NSM_PROGRAM 100024 24#define NSM_PROGRAM 100024
24#define NSM_VERSION 1 25#define NSM_VERSION 1
@@ -40,6 +41,7 @@ struct nsm_args {
40 u32 proc; 41 u32 proc;
41 42
42 char *mon_name; 43 char *mon_name;
44 char *nodename;
43}; 45};
44 46
45struct nsm_res { 47struct nsm_res {
@@ -70,7 +72,7 @@ static struct rpc_clnt *nsm_create(struct net *net)
70 }; 72 };
71 struct rpc_create_args args = { 73 struct rpc_create_args args = {
72 .net = net, 74 .net = net,
73 .protocol = XPRT_TRANSPORT_UDP, 75 .protocol = XPRT_TRANSPORT_TCP,
74 .address = (struct sockaddr *)&sin, 76 .address = (struct sockaddr *)&sin,
75 .addrsize = sizeof(sin), 77 .addrsize = sizeof(sin),
76 .servername = "rpc.statd", 78 .servername = "rpc.statd",
@@ -83,10 +85,54 @@ static struct rpc_clnt *nsm_create(struct net *net)
83 return rpc_create(&args); 85 return rpc_create(&args);
84} 86}
85 87
86static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, 88static struct rpc_clnt *nsm_client_get(struct net *net)
87 struct net *net)
88{ 89{
90 static DEFINE_MUTEX(nsm_create_mutex);
89 struct rpc_clnt *clnt; 91 struct rpc_clnt *clnt;
92 struct lockd_net *ln = net_generic(net, lockd_net_id);
93
94 spin_lock(&ln->nsm_clnt_lock);
95 if (ln->nsm_users) {
96 ln->nsm_users++;
97 clnt = ln->nsm_clnt;
98 spin_unlock(&ln->nsm_clnt_lock);
99 goto out;
100 }
101 spin_unlock(&ln->nsm_clnt_lock);
102
103 mutex_lock(&nsm_create_mutex);
104 clnt = nsm_create(net);
105 if (!IS_ERR(clnt)) {
106 ln->nsm_clnt = clnt;
107 smp_wmb();
108 ln->nsm_users = 1;
109 }
110 mutex_unlock(&nsm_create_mutex);
111out:
112 return clnt;
113}
114
115static void nsm_client_put(struct net *net)
116{
117 struct lockd_net *ln = net_generic(net, lockd_net_id);
118 struct rpc_clnt *clnt = ln->nsm_clnt;
119 int shutdown = 0;
120
121 spin_lock(&ln->nsm_clnt_lock);
122 if (ln->nsm_users) {
123 if (--ln->nsm_users)
124 ln->nsm_clnt = NULL;
125 shutdown = !ln->nsm_users;
126 }
127 spin_unlock(&ln->nsm_clnt_lock);
128
129 if (shutdown)
130 rpc_shutdown_client(clnt);
131}
132
133static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
134 struct rpc_clnt *clnt)
135{
90 int status; 136 int status;
91 struct nsm_args args = { 137 struct nsm_args args = {
92 .priv = &nsm->sm_priv, 138 .priv = &nsm->sm_priv,
@@ -94,31 +140,24 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
94 .vers = 3, 140 .vers = 3,
95 .proc = NLMPROC_NSM_NOTIFY, 141 .proc = NLMPROC_NSM_NOTIFY,
96 .mon_name = nsm->sm_mon_name, 142 .mon_name = nsm->sm_mon_name,
143 .nodename = clnt->cl_nodename,
97 }; 144 };
98 struct rpc_message msg = { 145 struct rpc_message msg = {
99 .rpc_argp = &args, 146 .rpc_argp = &args,
100 .rpc_resp = res, 147 .rpc_resp = res,
101 }; 148 };
102 149
103 clnt = nsm_create(net); 150 BUG_ON(clnt == NULL);
104 if (IS_ERR(clnt)) {
105 status = PTR_ERR(clnt);
106 dprintk("lockd: failed to create NSM upcall transport, "
107 "status=%d\n", status);
108 goto out;
109 }
110 151
111 memset(res, 0, sizeof(*res)); 152 memset(res, 0, sizeof(*res));
112 153
113 msg.rpc_proc = &clnt->cl_procinfo[proc]; 154 msg.rpc_proc = &clnt->cl_procinfo[proc];
114 status = rpc_call_sync(clnt, &msg, 0); 155 status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
115 if (status < 0) 156 if (status < 0)
116 dprintk("lockd: NSM upcall RPC failed, status=%d\n", 157 dprintk("lockd: NSM upcall RPC failed, status=%d\n",
117 status); 158 status);
118 else 159 else
119 status = 0; 160 status = 0;
120 rpc_shutdown_client(clnt);
121 out:
122 return status; 161 return status;
123} 162}
124 163
@@ -138,6 +177,7 @@ int nsm_monitor(const struct nlm_host *host)
138 struct nsm_handle *nsm = host->h_nsmhandle; 177 struct nsm_handle *nsm = host->h_nsmhandle;
139 struct nsm_res res; 178 struct nsm_res res;
140 int status; 179 int status;
180 struct rpc_clnt *clnt;
141 181
142 dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name); 182 dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name);
143 183
@@ -150,7 +190,15 @@ int nsm_monitor(const struct nlm_host *host)
150 */ 190 */
151 nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf; 191 nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
152 192
153 status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, host->net); 193 clnt = nsm_client_get(host->net);
194 if (IS_ERR(clnt)) {
195 status = PTR_ERR(clnt);
196 dprintk("lockd: failed to create NSM upcall transport, "
197 "status=%d, net=%p\n", status, host->net);
198 return status;
199 }
200
201 status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, clnt);
154 if (unlikely(res.status != 0)) 202 if (unlikely(res.status != 0))
155 status = -EIO; 203 status = -EIO;
156 if (unlikely(status < 0)) { 204 if (unlikely(status < 0)) {
@@ -182,9 +230,11 @@ void nsm_unmonitor(const struct nlm_host *host)
182 230
183 if (atomic_read(&nsm->sm_count) == 1 231 if (atomic_read(&nsm->sm_count) == 1
184 && nsm->sm_monitored && !nsm->sm_sticky) { 232 && nsm->sm_monitored && !nsm->sm_sticky) {
233 struct lockd_net *ln = net_generic(host->net, lockd_net_id);
234
185 dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name); 235 dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
186 236
187 status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, host->net); 237 status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, ln->nsm_clnt);
188 if (res.status != 0) 238 if (res.status != 0)
189 status = -EIO; 239 status = -EIO;
190 if (status < 0) 240 if (status < 0)
@@ -192,6 +242,8 @@ void nsm_unmonitor(const struct nlm_host *host)
192 nsm->sm_name); 242 nsm->sm_name);
193 else 243 else
194 nsm->sm_monitored = 0; 244 nsm->sm_monitored = 0;
245
246 nsm_client_put(host->net);
195 } 247 }
196} 248}
197 249
@@ -430,7 +482,7 @@ static void encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp)
430{ 482{
431 __be32 *p; 483 __be32 *p;
432 484
433 encode_nsm_string(xdr, utsname()->nodename); 485 encode_nsm_string(xdr, argp->nodename);
434 p = xdr_reserve_space(xdr, 4 + 4 + 4); 486 p = xdr_reserve_space(xdr, 4 + 4 + 4);
435 *p++ = cpu_to_be32(argp->prog); 487 *p++ = cpu_to_be32(argp->prog);
436 *p++ = cpu_to_be32(argp->vers); 488 *p++ = cpu_to_be32(argp->vers);
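
The mon.c rework turns a create-and-destroy-per-RPC pattern into one shared, counted client per network namespace: nsm_client_get() bumps nsm_users under nsm_clnt_lock, with a mutex-serialized slow path that creates and publishes the client (the smp_wmb() orders the pointer store before the count becomes nonzero). One caveat worth noting: as written, nsm_client_put() clears ln->nsm_clnt while users remain and keeps it on the final put, which looks inverted. A more conventional put for this pattern would look like the following sketch (not the diff's code):

/* Hedged sketch: retire the shared client only when the last user
 * drops its reference. */
static void nsm_client_put(struct net *net)
{
	struct lockd_net *ln = net_generic(net, lockd_net_id);
	struct rpc_clnt *clnt = NULL;

	spin_lock(&ln->nsm_clnt_lock);
	if (ln->nsm_users && --ln->nsm_users == 0) {
		clnt = ln->nsm_clnt;
		ln->nsm_clnt = NULL;	/* last user: unpublish */
	}
	spin_unlock(&ln->nsm_clnt_lock);

	if (clnt)
		rpc_shutdown_client(clnt);
}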
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
index 4eee248ba96e..5010b55628b4 100644
--- a/fs/lockd/netns.h
+++ b/fs/lockd/netns.h
@@ -12,6 +12,10 @@ struct lockd_net {
12 struct delayed_work grace_period_end; 12 struct delayed_work grace_period_end;
13 struct lock_manager lockd_manager; 13 struct lock_manager lockd_manager;
14 struct list_head grace_list; 14 struct list_head grace_list;
15
16 spinlock_t nsm_clnt_lock;
17 unsigned int nsm_users;
18 struct rpc_clnt *nsm_clnt;
15}; 19};
16 20
17extern int lockd_net_id; 21extern int lockd_net_id;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 31a63f87b806..a2aa97d45670 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -126,7 +126,7 @@ static void restart_grace(void)
126static int 126static int
127lockd(void *vrqstp) 127lockd(void *vrqstp)
128{ 128{
129 int err = 0, preverr = 0; 129 int err = 0;
130 struct svc_rqst *rqstp = vrqstp; 130 struct svc_rqst *rqstp = vrqstp;
131 131
132 /* try_to_freeze() is called from svc_recv() */ 132 /* try_to_freeze() is called from svc_recv() */
@@ -165,21 +165,8 @@ lockd(void *vrqstp)
165 * recvfrom routine. 165 * recvfrom routine.
166 */ 166 */
167 err = svc_recv(rqstp, timeout); 167 err = svc_recv(rqstp, timeout);
168 if (err == -EAGAIN || err == -EINTR) { 168 if (err == -EAGAIN || err == -EINTR)
169 preverr = err;
170 continue; 169 continue;
171 }
172 if (err < 0) {
173 if (err != preverr) {
174 printk(KERN_WARNING "%s: unexpected error "
175 "from svc_recv (%d)\n", __func__, err);
176 preverr = err;
177 }
178 schedule_timeout_interruptible(HZ);
179 continue;
180 }
181 preverr = err;
182
183 dprintk("lockd: request from %s\n", 170 dprintk("lockd: request from %s\n",
184 svc_print_addr(rqstp, buf, sizeof(buf))); 171 svc_print_addr(rqstp, buf, sizeof(buf)));
185 172
@@ -596,6 +583,7 @@ static int lockd_init_net(struct net *net)
596 583
597 INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender); 584 INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender);
598 INIT_LIST_HEAD(&ln->grace_list); 585 INIT_LIST_HEAD(&ln->grace_list);
586 spin_lock_init(&ln->nsm_clnt_lock);
599 return 0; 587 return 0;
600} 588}
601 589
diff --git a/fs/locks.c b/fs/locks.c
index abc7dc6c490b..a94e331a52a2 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1289,7 +1289,7 @@ EXPORT_SYMBOL(__break_lease);
1289void lease_get_mtime(struct inode *inode, struct timespec *time) 1289void lease_get_mtime(struct inode *inode, struct timespec *time)
1290{ 1290{
1291 struct file_lock *flock = inode->i_flock; 1291 struct file_lock *flock = inode->i_flock;
1292 if (flock && IS_LEASE(flock) && (flock->fl_type & F_WRLCK)) 1292 if (flock && IS_LEASE(flock) && (flock->fl_type == F_WRLCK))
1293 *time = current_fs_time(inode->i_sb); 1293 *time = current_fs_time(inode->i_sb);
1294 else 1294 else
1295 *time = inode->i_mtime; 1295 *time = inode->i_mtime;
@@ -2185,8 +2185,8 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
2185 } else { 2185 } else {
2186 seq_printf(f, "%s ", 2186 seq_printf(f, "%s ",
2187 (lease_breaking(fl)) 2187 (lease_breaking(fl))
2188 ? (fl->fl_type & F_UNLCK) ? "UNLCK" : "READ " 2188 ? (fl->fl_type == F_UNLCK) ? "UNLCK" : "READ "
2189 : (fl->fl_type & F_WRLCK) ? "WRITE" : "READ "); 2189 : (fl->fl_type == F_WRLCK) ? "WRITE" : "READ ");
2190 } 2190 }
2191 if (inode) { 2191 if (inode) {
2192#ifdef WE_CAN_BREAK_LSLK_NOW 2192#ifdef WE_CAN_BREAK_LSLK_NOW
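
fl_type is a plain enumerated value, not a bit mask, so & only works by coincidence of the constants' bit patterns; == states the actual intent. With the generic definitions (include/uapi/asm-generic/fcntl.h) the values are sequential:

/* #define F_RDLCK 0
 * #define F_WRLCK 1
 * #define F_UNLCK 2
 *
 * (fl_type & F_WRLCK) happens to be 0 for F_RDLCK and F_UNLCK today,
 * but it would also match any odd value, and (fl_type & F_RDLCK) is
 * always 0 -- the bitwise form cannot express "is exactly this lock
 * type". Hence: */
if (fl->fl_type == F_WRLCK)
	/* a writer holds the lease */;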
diff --git a/fs/namei.c b/fs/namei.c
index aa30d19e9edd..d1895f308156 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -117,18 +117,70 @@
117 * POSIX.1 2.4: an empty pathname is invalid (ENOENT). 117 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
118 * PATH_MAX includes the nul terminator --RR. 118 * PATH_MAX includes the nul terminator --RR.
119 */ 119 */
120static char *getname_flags(const char __user *filename, int flags, int *empty) 120void final_putname(struct filename *name)
121{ 121{
122 char *result = __getname(), *err; 122 if (name->separate) {
123 __putname(name->name);
124 kfree(name);
125 } else {
126 __putname(name);
127 }
128}
129
130#define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename))
131
132static struct filename *
133getname_flags(const char __user *filename, int flags, int *empty)
134{
135 struct filename *result, *err;
123 int len; 136 int len;
137 long max;
138 char *kname;
124 139
140 result = audit_reusename(filename);
141 if (result)
142 return result;
143
144 result = __getname();
125 if (unlikely(!result)) 145 if (unlikely(!result))
126 return ERR_PTR(-ENOMEM); 146 return ERR_PTR(-ENOMEM);
127 147
128 len = strncpy_from_user(result, filename, PATH_MAX); 148 /*
129 err = ERR_PTR(len); 149 * First, try to embed the struct filename inside the names_cache
130 if (unlikely(len < 0)) 150 * allocation
151 */
152 kname = (char *)result + sizeof(*result);
153 result->name = kname;
154 result->separate = false;
155 max = EMBEDDED_NAME_MAX;
156
157recopy:
158 len = strncpy_from_user(kname, filename, max);
159 if (unlikely(len < 0)) {
160 err = ERR_PTR(len);
131 goto error; 161 goto error;
162 }
163
164 /*
165 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
166 * separate struct filename so we can dedicate the entire
167 * names_cache allocation for the pathname, and re-do the copy from
168 * userland.
169 */
170 if (len == EMBEDDED_NAME_MAX && max == EMBEDDED_NAME_MAX) {
171 kname = (char *)result;
172
173 result = kzalloc(sizeof(*result), GFP_KERNEL);
174 if (!result) {
175 err = ERR_PTR(-ENOMEM);
176 result = (struct filename *)kname;
177 goto error;
178 }
179 result->name = kname;
180 result->separate = true;
181 max = PATH_MAX;
182 goto recopy;
183 }
132 184
133 /* The empty path is special. */ 185 /* The empty path is special. */
134 if (unlikely(!len)) { 186 if (unlikely(!len)) {
@@ -140,30 +192,32 @@ static char *getname_flags(const char __user *filename, int flags, int *empty)
140 } 192 }
141 193
142 err = ERR_PTR(-ENAMETOOLONG); 194 err = ERR_PTR(-ENAMETOOLONG);
143 if (likely(len < PATH_MAX)) { 195 if (unlikely(len >= PATH_MAX))
144 audit_getname(result); 196 goto error;
145 return result; 197
146 } 198 result->uptr = filename;
199 audit_getname(result);
200 return result;
147 201
148error: 202error:
149 __putname(result); 203 final_putname(result);
150 return err; 204 return err;
151} 205}
152 206
153char *getname(const char __user * filename) 207struct filename *
208getname(const char __user * filename)
154{ 209{
155 return getname_flags(filename, 0, NULL); 210 return getname_flags(filename, 0, NULL);
156} 211}
212EXPORT_SYMBOL(getname);
157 213
158#ifdef CONFIG_AUDITSYSCALL 214#ifdef CONFIG_AUDITSYSCALL
159void putname(const char *name) 215void putname(struct filename *name)
160{ 216{
161 if (unlikely(!audit_dummy_context())) 217 if (unlikely(!audit_dummy_context()))
162 audit_putname(name); 218 return audit_putname(name);
163 else 219 final_putname(name);
164 __putname(name);
165} 220}
166EXPORT_SYMBOL(putname);
167#endif 221#endif
168 222
169static int check_acl(struct inode *inode, int mask) 223static int check_acl(struct inode *inode, int mask)
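
The rewritten getname_flags() usually makes a single allocation: the struct filename header and the name share one names_cache object (PATH_MAX bytes), leaving the fast path EMBEDDED_NAME_MAX = PATH_MAX - sizeof(struct filename) bytes for the string. Only a copy that exactly fills that space falls back to the slow path, which re-uses the whole allocation for the string and moves the header to its own kzalloc() with ->separate set. Layout, assuming PATH_MAX = 4096:

/* Fast path: one names_cache allocation, header + name together.
 *
 *   +-----------------+--------------------------------------------+
 *   | struct filename | name bytes (EMBEDDED_NAME_MAX)             |
 *   +-----------------+--------------------------------------------+
 *   ^ result            ^ result->name       result->separate = false
 *
 * Slow path (len hit EMBEDDED_NAME_MAX): the whole allocation becomes
 * the string; the header lives in a separate kzalloc().
 *
 *   +--------------------------------------------------------------+
 *   | name bytes (up to PATH_MAX, including the NUL)               |
 *   +--------------------------------------------------------------+
 *   ^ result->name                           result->separate = true
 */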
@@ -692,9 +746,9 @@ static inline int may_follow_link(struct path *link, struct nameidata *nd)
692 if (uid_eq(parent->i_uid, inode->i_uid)) 746 if (uid_eq(parent->i_uid, inode->i_uid))
693 return 0; 747 return 0;
694 748
749 audit_log_link_denied("follow_link", link);
695 path_put_conditional(link, nd); 750 path_put_conditional(link, nd);
696 path_put(&nd->path); 751 path_put(&nd->path);
697 audit_log_link_denied("follow_link", link);
698 return -EACCES; 752 return -EACCES;
699} 753}
700 754
@@ -810,6 +864,7 @@ follow_link(struct path *link, struct nameidata *nd, void **p)
810 return error; 864 return error;
811 865
812out_put_nd_path: 866out_put_nd_path:
867 *p = NULL;
813 path_put(&nd->path); 868 path_put(&nd->path);
814 path_put(link); 869 path_put(link);
815 return error; 870 return error;
@@ -1962,24 +2017,29 @@ static int path_lookupat(int dfd, const char *name,
1962 return err; 2017 return err;
1963} 2018}
1964 2019
1965static int do_path_lookup(int dfd, const char *name, 2020static int filename_lookup(int dfd, struct filename *name,
1966 unsigned int flags, struct nameidata *nd) 2021 unsigned int flags, struct nameidata *nd)
1967{ 2022{
1968 int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd); 2023 int retval = path_lookupat(dfd, name->name, flags | LOOKUP_RCU, nd);
1969 if (unlikely(retval == -ECHILD)) 2024 if (unlikely(retval == -ECHILD))
1970 retval = path_lookupat(dfd, name, flags, nd); 2025 retval = path_lookupat(dfd, name->name, flags, nd);
1971 if (unlikely(retval == -ESTALE)) 2026 if (unlikely(retval == -ESTALE))
1972 retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd); 2027 retval = path_lookupat(dfd, name->name,
2028 flags | LOOKUP_REVAL, nd);
1973 2029
1974 if (likely(!retval)) { 2030 if (likely(!retval))
1975 if (unlikely(!audit_dummy_context())) { 2031 audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT);
1976 if (nd->path.dentry && nd->inode)
1977 audit_inode(name, nd->path.dentry);
1978 }
1979 }
1980 return retval; 2032 return retval;
1981} 2033}
1982 2034
2035static int do_path_lookup(int dfd, const char *name,
2036 unsigned int flags, struct nameidata *nd)
2037{
2038 struct filename filename = { .name = name };
2039
2040 return filename_lookup(dfd, &filename, flags, nd);
2041}
2042
1983/* does lookup, returns the object with parent locked */ 2043/* does lookup, returns the object with parent locked */
1984struct dentry *kern_path_locked(const char *name, struct path *path) 2044struct dentry *kern_path_locked(const char *name, struct path *path)
1985{ 2045{
@@ -2097,13 +2157,13 @@ int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
2097 struct path *path, int *empty) 2157 struct path *path, int *empty)
2098{ 2158{
2099 struct nameidata nd; 2159 struct nameidata nd;
2100 char *tmp = getname_flags(name, flags, empty); 2160 struct filename *tmp = getname_flags(name, flags, empty);
2101 int err = PTR_ERR(tmp); 2161 int err = PTR_ERR(tmp);
2102 if (!IS_ERR(tmp)) { 2162 if (!IS_ERR(tmp)) {
2103 2163
2104 BUG_ON(flags & LOOKUP_PARENT); 2164 BUG_ON(flags & LOOKUP_PARENT);
2105 2165
2106 err = do_path_lookup(dfd, tmp, flags, &nd); 2166 err = filename_lookup(dfd, tmp, flags, &nd);
2107 putname(tmp); 2167 putname(tmp);
2108 if (!err) 2168 if (!err)
2109 *path = nd.path; 2169 *path = nd.path;
@@ -2117,22 +2177,28 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
2117 return user_path_at_empty(dfd, name, flags, path, NULL); 2177 return user_path_at_empty(dfd, name, flags, path, NULL);
2118} 2178}
2119 2179
2120static int user_path_parent(int dfd, const char __user *path, 2180/*
2121 struct nameidata *nd, char **name) 2181 * NB: most callers don't do anything directly with the reference to the
2182 * to struct filename, but the nd->last pointer points into the name string
2183 * allocated by getname. So we must hold the reference to it until all
2184 * path-walking is complete.
2185 */
2186static struct filename *
2187user_path_parent(int dfd, const char __user *path, struct nameidata *nd)
2122{ 2188{
2123 char *s = getname(path); 2189 struct filename *s = getname(path);
2124 int error; 2190 int error;
2125 2191
2126 if (IS_ERR(s)) 2192 if (IS_ERR(s))
2127 return PTR_ERR(s); 2193 return s;
2128 2194
2129 error = do_path_lookup(dfd, s, LOOKUP_PARENT, nd); 2195 error = filename_lookup(dfd, s, LOOKUP_PARENT, nd);
2130 if (error) 2196 if (error) {
2131 putname(s); 2197 putname(s);
-	else
-		*name = s;
+		return ERR_PTR(error);
+	}
 
-	return error;
+	return s;
 }
 
 /*
@@ -2179,7 +2245,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
 		return -ENOENT;
 
 	BUG_ON(victim->d_parent->d_inode != dir);
-	audit_inode_child(victim, dir);
+	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
 
 	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
 	if (error)
@@ -2624,7 +2690,7 @@ out_dput:
  */
 static int do_last(struct nameidata *nd, struct path *path,
 		   struct file *file, const struct open_flags *op,
-		   int *opened, const char *pathname)
+		   int *opened, struct filename *name)
 {
 	struct dentry *dir = nd->path.dentry;
 	int open_flag = op->open_flag;
@@ -2651,7 +2717,7 @@ static int do_last(struct nameidata *nd, struct path *path,
 		error = complete_walk(nd);
 		if (error)
 			return error;
-		audit_inode(pathname, nd->path.dentry);
+		audit_inode(name, nd->path.dentry, 0);
 		if (open_flag & O_CREAT) {
 			error = -EISDIR;
 			goto out;
@@ -2661,7 +2727,7 @@ static int do_last(struct nameidata *nd, struct path *path,
 		error = complete_walk(nd);
 		if (error)
 			return error;
-		audit_inode(pathname, dir);
+		audit_inode(name, dir, 0);
 		goto finish_open;
 	}
 
@@ -2690,7 +2756,7 @@ static int do_last(struct nameidata *nd, struct path *path,
 	if (error)
 		return error;
 
-	audit_inode(pathname, dir);
+	audit_inode(name, dir, 0);
 	error = -EISDIR;
 	/* trailing slashes? */
 	if (nd->last.name[nd->last.len])
@@ -2720,7 +2786,7 @@ retry_lookup:
 		    !S_ISREG(file->f_path.dentry->d_inode->i_mode))
 			will_truncate = false;
 
-		audit_inode(pathname, file->f_path.dentry);
+		audit_inode(name, file->f_path.dentry, 0);
 		goto opened;
 	}
 
@@ -2737,7 +2803,7 @@ retry_lookup:
 	 * create/update audit record if it already exists.
 	 */
 	if (path->dentry->d_inode)
-		audit_inode(pathname, path->dentry);
+		audit_inode(name, path->dentry, 0);
 
 	/*
 	 * If atomic_open() acquired write access it is dropped now due to
@@ -2802,7 +2868,7 @@ finish_lookup:
 	error = -ENOTDIR;
 	if ((nd->flags & LOOKUP_DIRECTORY) && !nd->inode->i_op->lookup)
 		goto out;
-	audit_inode(pathname, nd->path.dentry);
+	audit_inode(name, nd->path.dentry, 0);
finish_open:
 	if (!S_ISREG(nd->inode->i_mode))
 		will_truncate = false;
@@ -2870,7 +2936,7 @@ stale_open:
 	goto retry_lookup;
 }
 
-static struct file *path_openat(int dfd, const char *pathname,
+static struct file *path_openat(int dfd, struct filename *pathname,
 		struct nameidata *nd, const struct open_flags *op, int flags)
 {
 	struct file *base = NULL;
@@ -2885,12 +2951,12 @@ static struct file *path_openat(int dfd, const char *pathname,
 
 	file->f_flags = op->open_flag;
 
-	error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base);
+	error = path_init(dfd, pathname->name, flags | LOOKUP_PARENT, nd, &base);
 	if (unlikely(error))
 		goto out;
 
 	current->total_link_count = 0;
-	error = link_path_walk(pathname, nd);
+	error = link_path_walk(pathname->name, nd);
 	if (unlikely(error))
 		goto out;
 
@@ -2936,7 +3002,7 @@ out:
 	return file;
 }
 
-struct file *do_filp_open(int dfd, const char *pathname,
+struct file *do_filp_open(int dfd, struct filename *pathname,
 		const struct open_flags *op, int flags)
 {
 	struct nameidata nd;
@@ -2955,6 +3021,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
 {
 	struct nameidata nd;
 	struct file *file;
+	struct filename filename = { .name = name };
 
 	nd.root.mnt = mnt;
 	nd.root.dentry = dentry;
@@ -2964,11 +3031,11 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
 	if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
 		return ERR_PTR(-ELOOP);
 
-	file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU);
+	file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_RCU);
 	if (unlikely(file == ERR_PTR(-ECHILD)))
-		file = path_openat(-1, name, &nd, op, flags);
+		file = path_openat(-1, &filename, &nd, op, flags);
 	if (unlikely(file == ERR_PTR(-ESTALE)))
-		file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL);
+		file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_REVAL);
 	return file;
 }
 
@@ -3043,11 +3110,11 @@ EXPORT_SYMBOL(done_path_create);
 
 struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir)
 {
-	char *tmp = getname(pathname);
+	struct filename *tmp = getname(pathname);
 	struct dentry *res;
 	if (IS_ERR(tmp))
 		return ERR_CAST(tmp);
-	res = kern_path_create(dfd, tmp, path, is_dir);
+	res = kern_path_create(dfd, tmp->name, path, is_dir);
 	putname(tmp);
 	return res;
 }
@@ -3252,13 +3319,13 @@ out:
 static long do_rmdir(int dfd, const char __user *pathname)
 {
 	int error = 0;
-	char * name;
+	struct filename *name;
 	struct dentry *dentry;
 	struct nameidata nd;
 
-	error = user_path_parent(dfd, pathname, &nd, &name);
-	if (error)
-		return error;
+	name = user_path_parent(dfd, pathname, &nd);
+	if (IS_ERR(name))
+		return PTR_ERR(name);
 
 	switch(nd.last_type) {
 	case LAST_DOTDOT:
@@ -3347,14 +3414,14 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 static long do_unlinkat(int dfd, const char __user *pathname)
 {
 	int error;
-	char *name;
+	struct filename *name;
 	struct dentry *dentry;
 	struct nameidata nd;
 	struct inode *inode = NULL;
 
-	error = user_path_parent(dfd, pathname, &nd, &name);
-	if (error)
-		return error;
+	name = user_path_parent(dfd, pathname, &nd);
+	if (IS_ERR(name))
+		return PTR_ERR(name);
 
 	error = -EISDIR;
 	if (nd.last_type != LAST_NORM)
@@ -3438,7 +3505,7 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
 		int, newdfd, const char __user *, newname)
 {
 	int error;
-	char *from;
+	struct filename *from;
 	struct dentry *dentry;
 	struct path path;
 
@@ -3451,9 +3518,9 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
 	if (IS_ERR(dentry))
 		goto out_putname;
 
-	error = security_path_symlink(&path, dentry, from);
+	error = security_path_symlink(&path, dentry, from->name);
 	if (!error)
-		error = vfs_symlink(path.dentry->d_inode, dentry, from);
+		error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
 	done_path_create(&path, dentry);
out_putname:
 	putname(from);
@@ -3733,17 +3800,21 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
 	struct dentry *old_dentry, *new_dentry;
 	struct dentry *trap;
 	struct nameidata oldnd, newnd;
-	char *from;
-	char *to;
+	struct filename *from;
+	struct filename *to;
 	int error;
 
-	error = user_path_parent(olddfd, oldname, &oldnd, &from);
-	if (error)
+	from = user_path_parent(olddfd, oldname, &oldnd);
+	if (IS_ERR(from)) {
+		error = PTR_ERR(from);
 		goto exit;
+	}
 
-	error = user_path_parent(newdfd, newname, &newnd, &to);
-	if (error)
+	to = user_path_parent(newdfd, newname, &newnd);
+	if (IS_ERR(to)) {
+		error = PTR_ERR(to);
 		goto exit1;
+	}
 
 	error = -EXDEV;
 	if (oldnd.path.mnt != newnd.path.mnt)
@@ -3967,7 +4038,6 @@ EXPORT_SYMBOL(follow_down_one);
 EXPORT_SYMBOL(follow_down);
 EXPORT_SYMBOL(follow_up);
 EXPORT_SYMBOL(get_write_access); /* nfsd */
-EXPORT_SYMBOL(getname);
 EXPORT_SYMBOL(lock_rename);
 EXPORT_SYMBOL(lookup_one_len);
 EXPORT_SYMBOL(page_follow_link_light);
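The fs/namei.c hunks above convert the lookup entry points from bare C strings to struct filename, so the audit subsystem can record the name the user actually passed. A minimal sketch of the resulting caller pattern follows; example_unlink_like() is a hypothetical function written for illustration, while user_path_parent(), audit_inode(), putname() and path_put() are the real interfaces as changed by this diff:

	/* Illustrative sketch only, not part of the patch: the post-patch
	 * calling convention.  user_path_parent() now hands back a
	 * struct filename * (or an ERR_PTR) instead of filling a char **. */
	static long example_unlink_like(int dfd, const char __user *pathname)
	{
		struct nameidata nd;
		struct filename *name;

		name = user_path_parent(dfd, pathname, &nd);
		if (IS_ERR(name))
			return PTR_ERR(name);

		/* audit helpers take the struct itself; ->name is the
		 * kernel copy of the string for legacy interfaces */
		audit_inode(name, nd.path.dentry, 0);

		path_put(&nd.path);
		putname(name);
		return 0;
	}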
diff --git a/fs/namespace.c b/fs/namespace.c
index 7bdf7907413f..24960626bb6b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1640,7 +1640,7 @@ static int do_change_type(struct path *path, int flag)
1640/* 1640/*
1641 * do loopback mount. 1641 * do loopback mount.
1642 */ 1642 */
1643static int do_loopback(struct path *path, char *old_name, 1643static int do_loopback(struct path *path, const char *old_name,
1644 int recurse) 1644 int recurse)
1645{ 1645{
1646 LIST_HEAD(umount_list); 1646 LIST_HEAD(umount_list);
@@ -1764,7 +1764,7 @@ static inline int tree_contains_unbindable(struct mount *mnt)
1764 return 0; 1764 return 0;
1765} 1765}
1766 1766
1767static int do_move_mount(struct path *path, char *old_name) 1767static int do_move_mount(struct path *path, const char *old_name)
1768{ 1768{
1769 struct path old_path, parent_path; 1769 struct path old_path, parent_path;
1770 struct mount *p; 1770 struct mount *p;
@@ -1917,8 +1917,8 @@ unlock:
1917 * create a new mount for userspace and request it to be added into the 1917 * create a new mount for userspace and request it to be added into the
1918 * namespace's tree 1918 * namespace's tree
1919 */ 1919 */
1920static int do_new_mount(struct path *path, char *type, int flags, 1920static int do_new_mount(struct path *path, const char *type, int flags,
1921 int mnt_flags, char *name, void *data) 1921 int mnt_flags, const char *name, void *data)
1922{ 1922{
1923 struct vfsmount *mnt; 1923 struct vfsmount *mnt;
1924 int err; 1924 int err;
@@ -2191,8 +2191,8 @@ int copy_mount_string(const void __user *data, char **where)
2191 * Therefore, if this magic number is present, it carries no information 2191 * Therefore, if this magic number is present, it carries no information
2192 * and must be discarded. 2192 * and must be discarded.
2193 */ 2193 */
2194long do_mount(char *dev_name, char *dir_name, char *type_page, 2194long do_mount(const char *dev_name, const char *dir_name,
2195 unsigned long flags, void *data_page) 2195 const char *type_page, unsigned long flags, void *data_page)
2196{ 2196{
2197 struct path path; 2197 struct path path;
2198 int retval = 0; 2198 int retval = 0;
@@ -2408,7 +2408,7 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
2408{ 2408{
2409 int ret; 2409 int ret;
2410 char *kernel_type; 2410 char *kernel_type;
2411 char *kernel_dir; 2411 struct filename *kernel_dir;
2412 char *kernel_dev; 2412 char *kernel_dev;
2413 unsigned long data_page; 2413 unsigned long data_page;
2414 2414
@@ -2430,7 +2430,7 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
2430 if (ret < 0) 2430 if (ret < 0)
2431 goto out_data; 2431 goto out_data;
2432 2432
2433 ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags, 2433 ret = do_mount(kernel_dev, kernel_dir->name, kernel_type, flags,
2434 (void *) data_page); 2434 (void *) data_page);
2435 2435
2436 free_page(data_page); 2436 free_page(data_page);
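do_mount() and its helpers are const-qualified above, which matters for in-kernel callers that pass string literals. A hypothetical caller, for illustration only (the MS_* flags are the ordinary mount constants):

	/* Hypothetical example: with the old char * signature this literal
	 * would have required a cast or a writable copy. */
	static int example_remount_ro(void)
	{
		return do_mount(NULL, "/mnt", NULL,
				MS_REMOUNT | MS_RDONLY, NULL);
	}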
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index db7ad719628a..13ca196385f5 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -95,8 +95,8 @@ config NFS_SWAP
 	  This option enables swapon to work on files located on NFS mounts.
 
 config NFS_V4_1
-	bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
-	depends on NFS_V4 && EXPERIMENTAL
+	bool "NFS client support for NFSv4.1"
+	depends on NFS_V4
 	select SUNRPC_BACKCHANNEL
 	help
 	  This option enables support for minor version 1 of the NFSv4 protocol
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index dd392ed5f2e2..f1027b06a1a9 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -37,6 +37,7 @@
 #include <linux/bio.h>		/* struct bio */
 #include <linux/buffer_head.h>	/* various write calls */
 #include <linux/prefetch.h>
+#include <linux/pagevec.h>
 
 #include "../pnfs.h"
 #include "../internal.h"
@@ -162,25 +163,39 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
 	return bio;
 }
 
-static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
+static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw,
 				     sector_t isect, struct page *page,
 				     struct pnfs_block_extent *be,
 				     void (*end_io)(struct bio *, int err),
-				     struct parallel_io *par)
+				     struct parallel_io *par,
+				     unsigned int offset, int len)
 {
+	isect = isect + (offset >> SECTOR_SHIFT);
+	dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
+		npg, rw, (unsigned long long)isect, offset, len);
retry:
 	if (!bio) {
 		bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
 		if (!bio)
 			return ERR_PTR(-ENOMEM);
 	}
-	if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
+	if (bio_add_page(bio, page, len, offset) < len) {
 		bio = bl_submit_bio(rw, bio);
 		goto retry;
 	}
 	return bio;
 }
 
+static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
+				      sector_t isect, struct page *page,
+				      struct pnfs_block_extent *be,
+				      void (*end_io)(struct bio *, int err),
+				      struct parallel_io *par)
+{
+	return do_add_page_to_bio(bio, npg, rw, isect, page, be,
+				  end_io, par, 0, PAGE_CACHE_SIZE);
+}
+
 /* This is basically copied from mpage_end_io_read */
 static void bl_end_io_read(struct bio *bio, int err)
 {
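do_add_page_to_bio() folds the intra-page byte offset into the starting sector before queuing a page fragment. A standalone check of that arithmetic, as plain userspace C, assuming 512-byte sectors (SECTOR_SHIFT == 9) and a 4 KiB page:

	#include <assert.h>

	int main(void)
	{
		unsigned long long isect = 8;	/* first sector of this page */
		unsigned int offset = 1536;	/* byte offset within the page */
		unsigned int len = 1024;	/* fragment length */

		/* do_add_page_to_bio(): isect = isect + (offset >> SECTOR_SHIFT) */
		isect += offset >> 9;
		assert(isect == 11);	/* 1536 bytes == 3 sectors past the page start */

		/* bio_add_page() is then asked for exactly (len, offset),
		 * where the old bl_add_page_to_bio() always sent whole pages */
		assert(len <= 4096 - offset);
		return 0;
	}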
@@ -228,14 +243,6 @@ bl_end_par_io_read(void *data, int unused)
 	schedule_work(&rdata->task.u.tk_work);
 }
 
-static bool
-bl_check_alignment(u64 offset, u32 len, unsigned long blkmask)
-{
-	if ((offset & blkmask) || (len & blkmask))
-		return false;
-	return true;
-}
-
 static enum pnfs_try_status
 bl_read_pagelist(struct nfs_read_data *rdata)
 {
@@ -246,15 +253,15 @@ bl_read_pagelist(struct nfs_read_data *rdata)
 	sector_t isect, extent_length = 0;
 	struct parallel_io *par;
 	loff_t f_offset = rdata->args.offset;
+	size_t bytes_left = rdata->args.count;
+	unsigned int pg_offset, pg_len;
 	struct page **pages = rdata->args.pages;
 	int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
+	const bool is_dio = (header->dreq != NULL);
 
 	dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
 	       rdata->pages.npages, f_offset, (unsigned int)rdata->args.count);
 
-	if (!bl_check_alignment(f_offset, rdata->args.count, PAGE_CACHE_MASK))
-		goto use_mds;
-
 	par = alloc_parallel(rdata);
 	if (!par)
 		goto use_mds;
@@ -284,36 +291,53 @@ bl_read_pagelist(struct nfs_read_data *rdata)
 				extent_length = min(extent_length, cow_length);
 			}
 		}
+
+		if (is_dio) {
+			pg_offset = f_offset & ~PAGE_CACHE_MASK;
+			if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
+				pg_len = PAGE_CACHE_SIZE - pg_offset;
+			else
+				pg_len = bytes_left;
+
+			f_offset += pg_len;
+			bytes_left -= pg_len;
+			isect += (pg_offset >> SECTOR_SHIFT);
+		} else {
+			pg_offset = 0;
+			pg_len = PAGE_CACHE_SIZE;
+		}
+
 		hole = is_hole(be, isect);
 		if (hole && !cow_read) {
 			bio = bl_submit_bio(READ, bio);
 			/* Fill hole w/ zeroes w/o accessing device */
 			dprintk("%s Zeroing page for hole\n", __func__);
-			zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
+			zero_user_segment(pages[i], pg_offset, pg_len);
 			print_page(pages[i]);
 			SetPageUptodate(pages[i]);
 		} else {
 			struct pnfs_block_extent *be_read;
 
 			be_read = (hole && cow_read) ? cow_read : be;
-			bio = bl_add_page_to_bio(bio, rdata->pages.npages - i,
+			bio = do_add_page_to_bio(bio, rdata->pages.npages - i,
 						 READ,
 						 isect, pages[i], be_read,
-						 bl_end_io_read, par);
+						 bl_end_io_read, par,
+						 pg_offset, pg_len);
 			if (IS_ERR(bio)) {
 				header->pnfs_error = PTR_ERR(bio);
 				bio = NULL;
 				goto out;
 			}
 		}
-		isect += PAGE_CACHE_SECTORS;
+		isect += (pg_len >> SECTOR_SHIFT);
 		extent_length -= PAGE_CACHE_SECTORS;
 	}
 	if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
 		rdata->res.eof = 1;
-		rdata->res.count = header->inode->i_size - f_offset;
+		rdata->res.count = header->inode->i_size - rdata->args.offset;
 	} else {
-		rdata->res.count = (isect << SECTOR_SHIFT) - f_offset;
+		rdata->res.count = (isect << SECTOR_SHIFT) - rdata->args.offset;
 	}
out:
 	bl_put_extent(be);
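For direct I/O, bl_read_pagelist() above carves the request out of (f_offset, bytes_left) one page fragment per loop iteration. The same segmenting logic in isolation, as userspace C with PAGE_SIZE standing in for PAGE_CACHE_SIZE:

	#include <stdio.h>

	#define PAGE_SIZE 4096UL

	int main(void)
	{
		unsigned long long f_offset = 1536;	/* unaligned DIO read */
		unsigned long bytes_left = 6000;

		while (bytes_left) {
			unsigned int pg_offset = f_offset & (PAGE_SIZE - 1);
			unsigned int pg_len;

			if (pg_offset + bytes_left > PAGE_SIZE)
				pg_len = (unsigned int)(PAGE_SIZE - pg_offset);
			else
				pg_len = (unsigned int)bytes_left;

			/* prints (1536, 2560) then (0, 3440): a partial first
			 * page followed by the unaligned tail */
			printf("fragment: pg_offset=%u pg_len=%u\n",
			       pg_offset, pg_len);
			f_offset += pg_len;
			bytes_left -= pg_len;
		}
		return 0;
	}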
@@ -461,6 +485,106 @@ map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
 	return;
 }
 
+static void
+bl_read_single_end_io(struct bio *bio, int error)
+{
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct page *page = bvec->bv_page;
+
+	/* Only one page in bvec */
+	unlock_page(page);
+}
+
+static int
+bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be,
+		    unsigned int offset, unsigned int len)
+{
+	struct bio *bio;
+	struct page *shadow_page;
+	sector_t isect;
+	char *kaddr, *kshadow_addr;
+	int ret = 0;
+
+	dprintk("%s: offset %u len %u\n", __func__, offset, len);
+
+	shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	if (shadow_page == NULL)
+		return -ENOMEM;
+
+	bio = bio_alloc(GFP_NOIO, 1);
+	if (bio == NULL)
+		return -ENOMEM;
+
+	isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) +
+		(offset / SECTOR_SIZE);
+
+	bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
+	bio->bi_bdev = be->be_mdev;
+	bio->bi_end_io = bl_read_single_end_io;
+
+	lock_page(shadow_page);
+	if (bio_add_page(bio, shadow_page,
+			 SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) {
+		unlock_page(shadow_page);
+		bio_put(bio);
+		return -EIO;
+	}
+
+	submit_bio(READ, bio);
+	wait_on_page_locked(shadow_page);
+	if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) {
+		ret = -EIO;
+	} else {
+		kaddr = kmap_atomic(page);
+		kshadow_addr = kmap_atomic(shadow_page);
+		memcpy(kaddr + offset, kshadow_addr + offset, len);
+		kunmap_atomic(kshadow_addr);
+		kunmap_atomic(kaddr);
+	}
+	__free_page(shadow_page);
+	bio_put(bio);
+
+	return ret;
+}
+
+static int
+bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be,
+			  unsigned int dirty_offset, unsigned int dirty_len,
+			  bool full_page)
+{
+	int ret = 0;
+	unsigned int start, end;
+
+	if (full_page) {
+		start = 0;
+		end = PAGE_CACHE_SIZE;
+	} else {
+		start = round_down(dirty_offset, SECTOR_SIZE);
+		end = round_up(dirty_offset + dirty_len, SECTOR_SIZE);
+	}
+
+	dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len);
+	if (!be) {
+		zero_user_segments(page, start, dirty_offset,
+				   dirty_offset + dirty_len, end);
+		if (start == 0 && end == PAGE_CACHE_SIZE &&
+		    trylock_page(page)) {
+			SetPageUptodate(page);
+			unlock_page(page);
+		}
+		return ret;
+	}
+
+	if (start != dirty_offset)
+		ret = bl_do_readpage_sync(page, be, start, dirty_offset - start);
+
+	if (!ret && (dirty_offset + dirty_len < end))
+		ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len,
+					  end - dirty_offset - dirty_len);
+
+	return ret;
+}
+
 /* Given an unmapped page, zero it or read in page for COW, page is locked
  * by caller.
  */
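bl_read_partial_page_sync() widens the dirty byte range to sector boundaries and reads back only the fragments the write will not cover. The rounding, worked through in userspace C under the same SECTOR_SIZE == 512 assumption:

	#include <stdio.h>

	#define SECTOR_SIZE 512u
	#define ROUND_DOWN(x, a) ((x) / (a) * (a))
	#define ROUND_UP(x, a) ((((x) + (a) - 1) / (a)) * (a))

	int main(void)
	{
		unsigned int dirty_offset = 100, dirty_len = 600;
		unsigned int start = ROUND_DOWN(dirty_offset, SECTOR_SIZE);
		unsigned int end = ROUND_UP(dirty_offset + dirty_len, SECTOR_SIZE);

		/* only the uncovered head and tail hit the disk */
		printf("head read: [%u, %u)\n", start, dirty_offset);		/* [0, 100) */
		printf("tail read: [%u, %u)\n", dirty_offset + dirty_len, end);	/* [700, 1024) */
		return 0;
	}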
@@ -494,7 +618,6 @@ init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
 	SetPageUptodate(page);
 
cleanup:
-	bl_put_extent(cow_read);
 	if (bh)
 		free_buffer_head(bh);
 	if (ret) {
@@ -566,6 +689,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
 	struct parallel_io *par = NULL;
 	loff_t offset = wdata->args.offset;
 	size_t count = wdata->args.count;
+	unsigned int pg_offset, pg_len, saved_len;
 	struct page **pages = wdata->args.pages;
 	struct page *page;
 	pgoff_t index;
@@ -574,10 +698,13 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
 		NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
 
 	dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
-	/* Check for alignment first */
-	if (!bl_check_alignment(offset, count, PAGE_CACHE_MASK))
-		goto out_mds;
 
+	if (header->dreq != NULL &&
+	    (!IS_ALIGNED(offset, NFS_SERVER(header->inode)->pnfs_blksize) ||
+	     !IS_ALIGNED(count, NFS_SERVER(header->inode)->pnfs_blksize))) {
+		dprintk("pnfsblock nonblock aligned DIO writes. Resend MDS\n");
+		goto out_mds;
+	}
 	/* At this point, wdata->pages is a (sequential) list of nfs_pages.
 	 * We want to write each, and if there is an error set pnfs_error
 	 * to have it redone using nfs.
@@ -674,10 +801,11 @@ next_page:
 		if (!extent_length) {
 			/* We've used up the previous extent */
 			bl_put_extent(be);
+			bl_put_extent(cow_read);
 			bio = bl_submit_bio(WRITE, bio);
 			/* Get the next one */
 			be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
-					isect, NULL);
+					isect, &cow_read);
 			if (!be || !is_writable(be, isect)) {
 				header->pnfs_error = -EINVAL;
 				goto out;
@@ -694,7 +822,26 @@ next_page:
 			extent_length = be->be_length -
 				(isect - be->be_f_offset);
 		}
-		if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+
+		dprintk("%s offset %lld count %Zu\n", __func__, offset, count);
+		pg_offset = offset & ~PAGE_CACHE_MASK;
+		if (pg_offset + count > PAGE_CACHE_SIZE)
+			pg_len = PAGE_CACHE_SIZE - pg_offset;
+		else
+			pg_len = count;
+
+		saved_len = pg_len;
+		if (be->be_state == PNFS_BLOCK_INVALID_DATA &&
+		    !bl_is_sector_init(be->be_inval, isect)) {
+			ret = bl_read_partial_page_sync(pages[i], cow_read,
+							pg_offset, pg_len, true);
+			if (ret) {
+				dprintk("%s bl_read_partial_page_sync fail %d\n",
+					__func__, ret);
+				header->pnfs_error = ret;
+				goto out;
+			}
+
 			ret = bl_mark_sectors_init(be->be_inval, isect,
 						       PAGE_CACHE_SECTORS);
 			if (unlikely(ret)) {
@@ -703,15 +850,35 @@ next_page:
 				header->pnfs_error = ret;
 				goto out;
 			}
+
+			/* Expand to full page write */
+			pg_offset = 0;
+			pg_len = PAGE_CACHE_SIZE;
+		} else if ((pg_offset & (SECTOR_SIZE - 1)) ||
+			    (pg_len & (SECTOR_SIZE - 1))) {
+			/* ahh, nasty case. We have to do sync full sector
+			 * read-modify-write cycles.
+			 */
+			unsigned int saved_offset = pg_offset;
+			ret = bl_read_partial_page_sync(pages[i], be, pg_offset,
+							pg_len, false);
+			pg_offset = round_down(pg_offset, SECTOR_SIZE);
+			pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE)
+				 - pg_offset;
 		}
-		bio = bl_add_page_to_bio(bio, wdata->pages.npages - i, WRITE,
+
+
+		bio = do_add_page_to_bio(bio, wdata->pages.npages - i, WRITE,
 					 isect, pages[i], be,
-					 bl_end_io_write, par);
+					 bl_end_io_write, par,
+					 pg_offset, pg_len);
 		if (IS_ERR(bio)) {
 			header->pnfs_error = PTR_ERR(bio);
 			bio = NULL;
 			goto out;
 		}
+		offset += saved_len;
+		count -= saved_len;
 		isect += PAGE_CACHE_SECTORS;
 		last_isect = isect;
 		extent_length -= PAGE_CACHE_SECTORS;
@@ -729,17 +896,16 @@ next_page:
 	}
 
write_done:
-	wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset);
-	if (count < wdata->res.count) {
-		wdata->res.count = count;
-	}
+	wdata->res.count = wdata->args.count;
out:
 	bl_put_extent(be);
+	bl_put_extent(cow_read);
 	bl_submit_bio(WRITE, bio);
 	put_parallel(par);
 	return PNFS_ATTEMPTED;
out_mds:
 	bl_put_extent(be);
+	bl_put_extent(cow_read);
 	kfree(par);
 	return PNFS_NOT_ATTEMPTED;
 }
@@ -874,7 +1040,7 @@ static void free_blk_mountid(struct block_mount_id *mid)
 	}
 }
 
-/* This is mostly copied from the filelayout's get_device_info function.
+/* This is mostly copied from the filelayout_get_device_info function.
  * It seems much of this should be at the generic pnfs level.
  */
 static struct pnfs_block_dev *
@@ -1011,33 +1177,95 @@ bl_clear_layoutdriver(struct nfs_server *server)
 	return 0;
 }
 
+static bool
+is_aligned_req(struct nfs_page *req, unsigned int alignment)
+{
+	return IS_ALIGNED(req->wb_offset, alignment) &&
+	       IS_ALIGNED(req->wb_bytes, alignment);
+}
+
 static void
 bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
 {
-	if (!bl_check_alignment(req->wb_offset, req->wb_bytes, PAGE_CACHE_MASK))
+	if (pgio->pg_dreq != NULL &&
+	    !is_aligned_req(req, SECTOR_SIZE))
 		nfs_pageio_reset_read_mds(pgio);
 	else
 		pnfs_generic_pg_init_read(pgio, req);
 }
 
+static bool
+bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+		struct nfs_page *req)
+{
+	if (pgio->pg_dreq != NULL &&
+	    !is_aligned_req(req, SECTOR_SIZE))
+		return false;
+
+	return pnfs_generic_pg_test(pgio, prev, req);
+}
+
+/*
+ * Return the number of contiguous bytes for a given inode
+ * starting at page frame idx.
+ */
+static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
+{
+	struct address_space *mapping = inode->i_mapping;
+	pgoff_t end;
+
+	/* Optimize common case that writes from 0 to end of file */
+	end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
+	if (end != NFS_I(inode)->npages) {
+		rcu_read_lock();
+		end = radix_tree_next_hole(&mapping->page_tree, idx + 1, ULONG_MAX);
+		rcu_read_unlock();
+	}
+
+	if (!end)
+		return i_size_read(inode) - (idx << PAGE_CACHE_SHIFT);
+	else
+		return (end - idx) << PAGE_CACHE_SHIFT;
+}
+
 static void
 bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
 {
-	if (!bl_check_alignment(req->wb_offset, req->wb_bytes, PAGE_CACHE_MASK))
+	if (pgio->pg_dreq != NULL &&
+	    !is_aligned_req(req, PAGE_CACHE_SIZE)) {
 		nfs_pageio_reset_write_mds(pgio);
-	else
-		pnfs_generic_pg_init_write(pgio, req);
+	} else {
+		u64 wb_size;
+		if (pgio->pg_dreq == NULL)
+			wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
+						      req->wb_index);
+		else
+			wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+
+		pnfs_generic_pg_init_write(pgio, req, wb_size);
+	}
+}
+
+static bool
+bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+		 struct nfs_page *req)
+{
+	if (pgio->pg_dreq != NULL &&
+	    !is_aligned_req(req, PAGE_CACHE_SIZE))
+		return false;
+
+	return pnfs_generic_pg_test(pgio, prev, req);
 }
 
 static const struct nfs_pageio_ops bl_pg_read_ops = {
 	.pg_init = bl_pg_init_read,
-	.pg_test = pnfs_generic_pg_test,
+	.pg_test = bl_pg_test_read,
 	.pg_doio = pnfs_generic_pg_readpages,
 };
 
 static const struct nfs_pageio_ops bl_pg_write_ops = {
 	.pg_init = bl_pg_init_write,
-	.pg_test = pnfs_generic_pg_test,
+	.pg_test = bl_pg_test_write,
 	.pg_doio = pnfs_generic_pg_writepages,
 };
 
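The new pg_init/pg_test hooks only veto pNFS for direct I/O (pg_dreq != NULL): DIO reads need sector alignment, DIO writes whole pages, and an unaligned request falls back to the MDS. The predicate in isolation, with the kernel's IS_ALIGNED semantics reproduced for illustration:

	#include <stdio.h>

	#define IS_ALIGNED(x, a) (((x) & ((a) - 1)) == 0)
	#define SECTOR_SIZE 512u
	#define PAGE_SIZE 4096u

	static int is_aligned_req(unsigned int offset, unsigned int bytes,
				  unsigned int alignment)
	{
		return IS_ALIGNED(offset, alignment) && IS_ALIGNED(bytes, alignment);
	}

	int main(void)
	{
		/* sector-aligned DIO request: fine for the read path ... */
		printf("read ok:  %d\n", is_aligned_req(512, 1024, SECTOR_SIZE));	/* 1 */
		/* ... but rejected by the write path, which wants full pages */
		printf("write ok: %d\n", is_aligned_req(512, 1024, PAGE_SIZE));		/* 0 */
		return 0;
	}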
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 03350690118e..f4891bde8851 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -41,6 +41,7 @@
 
 #define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT)
 #define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
+#define SECTOR_SIZE (1 << SECTOR_SHIFT)
 
 struct block_mount_id {
 	spinlock_t			bm_lock;  /* protects list */
@@ -172,7 +173,6 @@ struct bl_msg_hdr {
 /* blocklayoutdev.c */
 ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
 void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
-struct block_device *nfs4_blkdev_get(dev_t dev);
 int nfs4_blkdev_put(struct block_device *bdev);
 struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
 						struct pnfs_device *dev);
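With the usual SECTOR_SHIFT == 9 and PAGE_CACHE_SHIFT == 12 (values assumed here for illustration), the three macros are mutually consistent; a compile-time check in C11:

	#define SECTOR_SHIFT 9
	#define PAGE_CACHE_SHIFT 12
	#define PAGE_CACHE_SIZE (1UL << PAGE_CACHE_SHIFT)

	#define SECTOR_SIZE (1 << SECTOR_SHIFT)				/* 512 */
	#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT)	/* 8 */
	#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) /* 3 */

	_Static_assert(SECTOR_SIZE * PAGE_CACHE_SECTORS == PAGE_CACHE_SIZE,
		       "sectors must tile a page exactly");
	_Static_assert((1 << PAGE_CACHE_SECTOR_SHIFT) == PAGE_CACHE_SECTORS,
		       "shift and count must agree");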
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index c96554245ccf..a86c5bdad9e3 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -53,22 +53,6 @@ static int decode_sector_number(__be32 **rp, sector_t *sp)
 	return 0;
 }
 
-/* Open a block_device by device number. */
-struct block_device *nfs4_blkdev_get(dev_t dev)
-{
-	struct block_device *bd;
-
-	dprintk("%s enter\n", __func__);
-	bd = blkdev_get_by_dev(dev, FMODE_READ, NULL);
-	if (IS_ERR(bd))
-		goto fail;
-	return bd;
-fail:
-	dprintk("%s failed to open device : %ld\n",
-			__func__, PTR_ERR(bd));
-	return NULL;
-}
-
 /*
  * Release the block device
  */
@@ -172,11 +156,12 @@ nfs4_blk_decode_device(struct nfs_server *server,
 		goto out;
 	}
 
-	bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor));
+	bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor),
+			       FMODE_READ, NULL);
 	if (IS_ERR(bd)) {
-		rc = PTR_ERR(bd);
-		dprintk("%s failed to open device : %d\n", __func__, rc);
-		rv = ERR_PTR(rc);
+		dprintk("%s failed to open device : %ld\n", __func__,
+			PTR_ERR(bd));
+		rv = ERR_CAST(bd);
 		goto out;
 	}
 
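The deleted wrapper could only return NULL, throwing away the errno that blkdev_get_by_dev() reports; calling the block-layer API directly lets the error pointer flow through. The post-patch idiom, sketched (the surrounding context is illustrative; blkdev_get_by_dev() and ERR_CAST() are the real interfaces):

	struct block_device *bd;

	bd = blkdev_get_by_dev(dev, FMODE_READ, NULL);
	if (IS_ERR(bd))
		return ERR_CAST(bd);	/* forward the errno-carrying pointer */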
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index 1f9a6032796b..9c3e117c3ed1 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -683,8 +683,7 @@ encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
 		p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT);
 		p = xdr_encode_hyper(p, 0LL);
 		*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
-		list_del(&lce->bse_node);
-		list_add_tail(&lce->bse_node, &bl->bl_committing);
+		list_move_tail(&lce->bse_node, &bl->bl_committing);
 		bl->bl_count--;
 		count++;
 	}
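list_move_tail() from include/linux/list.h is the single-primitive form of the deleted pair; the end state is identical:

	/* before: two list operations */
	list_del(&lce->bse_node);
	list_add_tail(&lce->bse_node, &bl->bl_committing);

	/* after: one, with no intermediate off-both-lists state */
	list_move_tail(&lce->bse_node, &bl->bl_committing);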
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 4c8459e5bdee..9a521fb39869 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -12,6 +12,7 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/svcsock.h>
 #include <linux/nfs_fs.h>
+#include <linux/errno.h>
 #include <linux/mutex.h>
 #include <linux/freezer.h>
 #include <linux/kthread.h>
@@ -23,6 +24,7 @@
 #include "nfs4_fs.h"
 #include "callback.h"
 #include "internal.h"
+#include "netns.h"
 
 #define NFSDBG_FACILITY NFSDBG_CALLBACK
 
@@ -37,7 +39,32 @@ static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1];
 static DEFINE_MUTEX(nfs_callback_mutex);
 static struct svc_program nfs4_callback_program;
 
-unsigned short nfs_callback_tcpport6;
+static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net)
+{
+	int ret;
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+	ret = svc_create_xprt(serv, "tcp", net, PF_INET,
+				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
+	if (ret <= 0)
+		goto out_err;
+	nn->nfs_callback_tcpport = ret;
+	dprintk("NFS: Callback listener port = %u (af %u, net %p)\n",
+			nn->nfs_callback_tcpport, PF_INET, net);
+
+	ret = svc_create_xprt(serv, "tcp", net, PF_INET6,
+				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
+	if (ret > 0) {
+		nn->nfs_callback_tcpport6 = ret;
+		dprintk("NFS: Callback listener port = %u (af %u, net %p)\n",
+				nn->nfs_callback_tcpport6, PF_INET6, net);
+	} else if (ret != -EAFNOSUPPORT)
+		goto out_err;
+	return 0;
+
+out_err:
+	return (ret) ? ret : -ENOMEM;
+}
 
 /*
  * This is the NFSv4 callback kernel thread.
@@ -45,7 +72,7 @@ unsigned short nfs_callback_tcpport6;
 static int
 nfs4_callback_svc(void *vrqstp)
 {
-	int err, preverr = 0;
+	int err;
 	struct svc_rqst *rqstp = vrqstp;
 
 	set_freezable();
@@ -55,20 +82,8 @@ nfs4_callback_svc(void *vrqstp)
 		 * Listen for a request on the socket
 		 */
 		err = svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT);
-		if (err == -EAGAIN || err == -EINTR) {
-			preverr = err;
-			continue;
-		}
-		if (err < 0) {
-			if (err != preverr) {
-				printk(KERN_WARNING "NFS: %s: unexpected error "
-					"from svc_recv (%d)\n", __func__, err);
-				preverr = err;
-			}
-			schedule_timeout_uninterruptible(HZ);
+		if (err == -EAGAIN || err == -EINTR)
 			continue;
-		}
-		preverr = err;
 		svc_process(rqstp);
 	}
 	return 0;
@@ -78,38 +93,23 @@ nfs4_callback_svc(void *vrqstp)
  * Prepare to bring up the NFSv4 callback service
  */
 static struct svc_rqst *
-nfs4_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
+nfs4_callback_up(struct svc_serv *serv)
 {
-	int ret;
-
-	ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET,
-				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
-	if (ret <= 0)
-		goto out_err;
-	nfs_callback_tcpport = ret;
-	dprintk("NFS: Callback listener port = %u (af %u)\n",
-			nfs_callback_tcpport, PF_INET);
-
-	ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET6,
-				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
-	if (ret > 0) {
-		nfs_callback_tcpport6 = ret;
-		dprintk("NFS: Callback listener port = %u (af %u)\n",
-				nfs_callback_tcpport6, PF_INET6);
-	} else if (ret == -EAFNOSUPPORT)
-		ret = 0;
-	else
-		goto out_err;
-
 	return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
-
-out_err:
-	if (ret == 0)
-		ret = -ENOMEM;
-	return ERR_PTR(ret);
 }
 
 #if defined(CONFIG_NFS_V4_1)
+static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net)
+{
+	/*
+	 * Create an svc_sock for the back channel service that shares the
+	 * fore channel connection.
+	 * Returns the input port (0) and sets the svc_serv bc_xprt on success
+	 */
+	return svc_create_xprt(serv, "tcp-bc", net, PF_INET, 0,
+			      SVC_SOCK_ANONYMOUS);
+}
+
 /*
  * The callback service for NFSv4.1 callbacks
  */
@@ -149,28 +149,9 @@ nfs41_callback_svc(void *vrqstp)
  * Bring up the NFSv4.1 callback service
  */
 static struct svc_rqst *
-nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
+nfs41_callback_up(struct svc_serv *serv)
 {
 	struct svc_rqst *rqstp;
-	int ret;
-
-	/*
-	 * Create an svc_sock for the back channel service that shares the
-	 * fore channel connection.
-	 * Returns the input port (0) and sets the svc_serv bc_xprt on success
-	 */
-	ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0,
-			      SVC_SOCK_ANONYMOUS);
-	if (ret < 0) {
-		rqstp = ERR_PTR(ret);
-		goto out;
-	}
-
-	/*
-	 * Save the svc_serv in the transport so that it can
-	 * be referenced when the session backchannel is initialized
-	 */
-	xprt->bc_serv = serv;
 
 	INIT_LIST_HEAD(&serv->sv_cb_list);
 	spin_lock_init(&serv->sv_cb_lock);
@@ -180,90 +161,74 @@ nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
 		svc_xprt_put(serv->sv_bc_xprt);
 		serv->sv_bc_xprt = NULL;
 	}
-out:
 	dprintk("--> %s return %ld\n", __func__,
 		IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0);
 	return rqstp;
 }
 
-static inline int nfs_minorversion_callback_svc_setup(u32 minorversion,
-		struct svc_serv *serv, struct rpc_xprt *xprt,
+static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv,
 		struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
 {
-	if (minorversion) {
-		*rqstpp = nfs41_callback_up(serv, xprt);
-		*callback_svc = nfs41_callback_svc;
-	}
-	return minorversion;
+	*rqstpp = nfs41_callback_up(serv);
+	*callback_svc = nfs41_callback_svc;
 }
 
 static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
-		struct nfs_callback_data *cb_info)
+		struct svc_serv *serv)
 {
 	if (minorversion)
-		xprt->bc_serv = cb_info->serv;
+		/*
+		 * Save the svc_serv in the transport so that it can
+		 * be referenced when the session backchannel is initialized
+		 */
+		xprt->bc_serv = serv;
 }
 #else
-static inline int nfs_minorversion_callback_svc_setup(u32 minorversion,
-		struct svc_serv *serv, struct rpc_xprt *xprt,
-		struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
+static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net)
 {
 	return 0;
 }
 
+static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv,
+		struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
+{
+	*rqstpp = ERR_PTR(-ENOTSUPP);
+	*callback_svc = ERR_PTR(-ENOTSUPP);
+}
+
 static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
-		struct nfs_callback_data *cb_info)
+		struct svc_serv *serv)
 {
 }
 #endif /* CONFIG_NFS_V4_1 */
 
-/*
- * Bring up the callback thread if it is not already up.
- */
-int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
+static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
+			  struct svc_serv *serv)
 {
-	struct svc_serv *serv = NULL;
 	struct svc_rqst *rqstp;
 	int (*callback_svc)(void *vrqstp);
 	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
 	char svc_name[12];
-	int ret = 0;
-	int minorversion_setup;
-	struct net *net = &init_net;
+	int ret;
 
-	mutex_lock(&nfs_callback_mutex);
-	if (cb_info->users++ || cb_info->task != NULL) {
-		nfs_callback_bc_serv(minorversion, xprt, cb_info);
-		goto out;
-	}
-	serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
-	if (!serv) {
-		ret = -ENOMEM;
-		goto out_err;
-	}
-	/* As there is only one thread we need to over-ride the
-	 * default maximum of 80 connections
-	 */
-	serv->sv_maxconn = 1024;
+	nfs_callback_bc_serv(minorversion, xprt, serv);
 
-	ret = svc_bind(serv, net);
-	if (ret < 0) {
-		printk(KERN_WARNING "NFS: bind callback service failed\n");
-		goto out_err;
-	}
+	if (cb_info->task)
+		return 0;
 
-	minorversion_setup =  nfs_minorversion_callback_svc_setup(minorversion,
-					serv, xprt, &rqstp, &callback_svc);
-	if (!minorversion_setup) {
+	switch (minorversion) {
+	case 0:
 		/* v4.0 callback setup */
-		rqstp = nfs4_callback_up(serv, xprt);
+		rqstp = nfs4_callback_up(serv);
 		callback_svc = nfs4_callback_svc;
+		break;
+	default:
+		nfs_minorversion_callback_svc_setup(serv,
+				&rqstp, &callback_svc);
 	}
 
-	if (IS_ERR(rqstp)) {
-		ret = PTR_ERR(rqstp);
-		goto out_err;
-	}
+	if (IS_ERR(rqstp))
+		return PTR_ERR(rqstp);
 
 	svc_sock_update_bufs(serv);
 
@@ -276,41 +241,165 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
 		svc_exit_thread(cb_info->rqst);
 		cb_info->rqst = NULL;
 		cb_info->task = NULL;
-		goto out_err;
+		return PTR_ERR(cb_info->task);
+	}
+	dprintk("nfs_callback_up: service started\n");
+	return 0;
+}
+
+static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struct net *net)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+	if (--nn->cb_users[minorversion])
+		return;
+
+	dprintk("NFS: destroy per-net callback data; net=%p\n", net);
+	svc_shutdown_net(serv, net);
+}
+
+static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct net *net)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	int ret;
+
+	if (nn->cb_users[minorversion]++)
+		return 0;
+
+	dprintk("NFS: create per-net callback data; net=%p\n", net);
+
+	ret = svc_bind(serv, net);
+	if (ret < 0) {
+		printk(KERN_WARNING "NFS: bind callback service failed\n");
+		goto err_bind;
+	}
+
+	switch (minorversion) {
+	case 0:
+		ret = nfs4_callback_up_net(serv, net);
+		break;
+	case 1:
+		ret = nfs41_callback_up_net(serv, net);
+		break;
+	default:
+		printk(KERN_ERR "NFS: unknown callback version: %d\n",
+				minorversion);
+		ret = -EINVAL;
+		break;
+	}
+
+	if (ret < 0) {
+		printk(KERN_ERR "NFS: callback service start failed\n");
+		goto err_socks;
+	}
+	return 0;
+
+err_socks:
+	svc_rpcb_cleanup(serv, net);
+err_bind:
+	dprintk("NFS: Couldn't create callback socket: err = %d; "
+			"net = %p\n", ret, net);
+	return ret;
+}
+
+static struct svc_serv *nfs_callback_create_svc(int minorversion)
+{
+	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+	struct svc_serv *serv;
+
+	/*
+	 * Check whether we're already up and running.
+	 */
+	if (cb_info->task) {
+		/*
+		 * Note: increase service usage, because later in case of error
+		 * svc_destroy() will be called.
+		 */
+		svc_get(cb_info->serv);
+		return cb_info->serv;
+	}
+
+	/*
+	 * Sanity check: if there's no task,
+	 * we should be the first user ...
+	 */
+	if (cb_info->users)
+		printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n",
+			cb_info->users);
+
+	serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
+	if (!serv) {
+		printk(KERN_ERR "nfs_callback_create_svc: create service failed\n");
+		return ERR_PTR(-ENOMEM);
+	}
+	/* As there is only one thread we need to over-ride the
+	 * default maximum of 80 connections
+	 */
+	serv->sv_maxconn = 1024;
+	dprintk("nfs_callback_create_svc: service created\n");
+	return serv;
+}
+
+/*
+ * Bring up the callback thread if it is not already up.
+ */
+int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
+{
+	struct svc_serv *serv;
+	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+	int ret;
+	struct net *net = xprt->xprt_net;
+
+	mutex_lock(&nfs_callback_mutex);
+
+	serv = nfs_callback_create_svc(minorversion);
+	if (IS_ERR(serv)) {
+		ret = PTR_ERR(serv);
+		goto err_create;
 	}
-out:
+
+	ret = nfs_callback_up_net(minorversion, serv, net);
+	if (ret < 0)
+		goto err_net;
+
+	ret = nfs_callback_start_svc(minorversion, xprt, serv);
+	if (ret < 0)
+		goto err_start;
+
+	cb_info->users++;
 	/*
 	 * svc_create creates the svc_serv with sv_nrthreads == 1, and then
 	 * svc_prepare_thread increments that. So we need to call svc_destroy
 	 * on both success and failure so that the refcount is 1 when the
 	 * thread exits.
 	 */
-	if (serv)
-		svc_destroy(serv);
+err_net:
+	svc_destroy(serv);
+err_create:
 	mutex_unlock(&nfs_callback_mutex);
 	return ret;
-out_err:
-	dprintk("NFS: Couldn't create callback socket or server thread; "
-		"err = %d\n", ret);
-	cb_info->users--;
-	if (serv)
-		svc_shutdown_net(serv, net);
-	goto out;
+
+err_start:
+	nfs_callback_down_net(minorversion, serv, net);
+	dprintk("NFS: Couldn't create server thread; err = %d\n", ret);
+	goto err_net;
 }
 
 /*
  * Kill the callback thread if it's no longer being used.
  */
-void nfs_callback_down(int minorversion)
+void nfs_callback_down(int minorversion, struct net *net)
 {
 	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
 
 	mutex_lock(&nfs_callback_mutex);
+	nfs_callback_down_net(minorversion, cb_info->serv, net);
 	cb_info->users--;
 	if (cb_info->users == 0 && cb_info->task != NULL) {
 		kthread_stop(cb_info->task);
-		svc_shutdown_net(cb_info->serv, &init_net);
+		dprintk("nfs_callback_down: service stopped\n");
 		svc_exit_thread(cb_info->rqst);
+		dprintk("nfs_callback_down: service destroyed\n");
 		cb_info->serv = NULL;
 		cb_info->rqst = NULL;
 		cb_info->task = NULL;
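After this rework the callback service is reference-counted twice: cb_info->users guards the kthread, and the per-net nn->cb_users[] guards the sockets, so bring-up and tear-down must name the same net namespace. A sketch of the pairing from a caller's perspective (illustrative use of the real entry points):

	/* first caller on a given net binds the service and creates the
	 * listeners; later callers just bump nn->cb_users[minorversion] */
	error = nfs_callback_up(minorversion, xprt);
	if (error < 0)
		return error;

	/* ... */

	/* tear-down names the transport's own net, not init_net */
	nfs_callback_down(minorversion, xprt->xprt_net);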
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index b44d7b128b71..4251c2ae06ad 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -194,7 +194,7 @@ extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
 					struct cb_process_state *cps);
 #if IS_ENABLED(CONFIG_NFS_V4)
 extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
-extern void nfs_callback_down(int minorversion);
+extern void nfs_callback_down(int minorversion, struct net *net);
 extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation,
 					    const nfs4_stateid *stateid);
 extern int nfs4_set_callback_sessionid(struct nfs_client *clp);
@@ -209,6 +209,5 @@ extern int nfs4_set_callback_sessionid(struct nfs_client *clp);
 
 extern unsigned int nfs_callback_set_tcpport;
 extern unsigned short nfs_callback_tcpport;
-extern unsigned short nfs_callback_tcpport6;
 
 #endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 1b5d809a105e..76b4a7a3e559 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -122,7 +122,15 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
 			ino = igrab(lo->plh_inode);
 			if (!ino)
 				continue;
-			get_layout_hdr(lo);
+			spin_lock(&ino->i_lock);
+			/* Is this layout in the process of being freed? */
+			if (NFS_I(ino)->layout != lo) {
+				spin_unlock(&ino->i_lock);
+				iput(ino);
+				continue;
+			}
+			pnfs_get_layout_hdr(lo);
+			spin_unlock(&ino->i_lock);
 			return lo;
 		}
 	}
@@ -158,7 +166,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
 	ino = lo->plh_inode;
 	spin_lock(&ino->i_lock);
 	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
-	    mark_matching_lsegs_invalid(lo, &free_me_list,
+	    pnfs_mark_matching_lsegs_invalid(lo, &free_me_list,
 					&args->cbl_range))
 		rv = NFS4ERR_DELAY;
 	else
@@ -166,7 +174,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
 		pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
 	spin_unlock(&ino->i_lock);
 	pnfs_free_lseg_list(&free_me_list);
-	put_layout_hdr(lo);
+	pnfs_put_layout_hdr(lo);
 	iput(ino);
 	return rv;
 }
@@ -196,9 +204,18 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
 			continue;
 
 		list_for_each_entry(lo, &server->layouts, plh_layouts) {
-			if (!igrab(lo->plh_inode))
+			ino = igrab(lo->plh_inode);
+			if (!ino)
 				continue;
-			get_layout_hdr(lo);
+			spin_lock(&ino->i_lock);
+			/* Is this layout in the process of being freed? */
+			if (NFS_I(ino)->layout != lo) {
+				spin_unlock(&ino->i_lock);
+				iput(ino);
+				continue;
+			}
+			pnfs_get_layout_hdr(lo);
+			spin_unlock(&ino->i_lock);
 			BUG_ON(!list_empty(&lo->plh_bulk_recall));
 			list_add(&lo->plh_bulk_recall, &recall_list);
 		}
@@ -211,12 +228,12 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
 		ino = lo->plh_inode;
 		spin_lock(&ino->i_lock);
 		set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
-		if (mark_matching_lsegs_invalid(lo, &free_me_list, &range))
+		if (pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, &range))
 			rv = NFS4ERR_DELAY;
 		list_del_init(&lo->plh_bulk_recall);
 		spin_unlock(&ino->i_lock);
 		pnfs_free_lseg_list(&free_me_list);
-		put_layout_hdr(lo);
+		pnfs_put_layout_hdr(lo);
 		iput(ino);
 	}
 	return rv;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 99694442b93f..8b39a42ac35e 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -93,10 +93,10 @@ static struct nfs_subversion *find_nfs_version(unsigned int version)
 			spin_unlock(&nfs_version_lock);
 			return nfs;
 		}
-	};
+	}
 
 	spin_unlock(&nfs_version_lock);
-	return ERR_PTR(-EPROTONOSUPPORT);;
+	return ERR_PTR(-EPROTONOSUPPORT);
 }
 
 struct nfs_subversion *get_nfs_version(unsigned int version)
@@ -498,7 +498,8 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
 		return nfs_found_client(cl_init, clp);
 	}
 	if (new) {
-		list_add(&new->cl_share_link, &nn->nfs_client_list);
+		list_add_tail(&new->cl_share_link,
+				&nn->nfs_client_list);
 		spin_unlock(&nn->nfs_client_lock);
 		new->cl_flags = cl_init->init_flags;
 		return rpc_ops->init_client(new, timeparms, ip_addr,
@@ -668,7 +669,8 @@ int nfs_init_server_rpcclient(struct nfs_server *server,
 {
 	struct nfs_client *clp = server->nfs_client;
 
-	server->client = rpc_clone_client(clp->cl_rpcclient);
+	server->client = rpc_clone_client_set_auth(clp->cl_rpcclient,
+							pseudoflavour);
 	if (IS_ERR(server->client)) {
 		dprintk("%s: couldn't create rpc_client!\n", __func__);
 		return PTR_ERR(server->client);
@@ -678,16 +680,6 @@ int nfs_init_server_rpcclient(struct nfs_server *server,
 			timeo,
 			sizeof(server->client->cl_timeout_default));
 	server->client->cl_timeout = &server->client->cl_timeout_default;
-
-	if (pseudoflavour != clp->cl_rpcclient->cl_auth->au_flavor) {
-		struct rpc_auth *auth;
-
-		auth = rpcauth_create(pseudoflavour, server->client);
-		if (IS_ERR(auth)) {
-			dprintk("%s: couldn't create credcache!\n", __func__);
-			return PTR_ERR(auth);
-		}
-	}
 	server->client->cl_softrtry = 0;
 	if (server->flags & NFS_MOUNT_SOFT)
 		server->client->cl_softrtry = 1;
@@ -761,6 +753,8 @@ static int nfs_init_server(struct nfs_server *server,
 			data->timeo, data->retrans);
 	if (data->flags & NFS_MOUNT_NORESVPORT)
 		set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+	if (server->options & NFS_OPTION_MIGRATION)
+		set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
 
 	/* Allocate or find a client reference we can use */
 	clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX);
@@ -855,7 +849,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
 	if (server->wsize > NFS_MAX_FILE_IO_SIZE)
 		server->wsize = NFS_MAX_FILE_IO_SIZE;
 	server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	server->pnfs_blksize = fsinfo->blksize;
 
 	server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
 
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 627f108ede23..ce8cb926526b 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2072,7 +2072,7 @@ found:
2072 nfs_access_free_entry(entry); 2072 nfs_access_free_entry(entry);
2073} 2073}
2074 2074
2075static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) 2075void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
2076{ 2076{
2077 struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL); 2077 struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL);
2078 if (cache == NULL) 2078 if (cache == NULL)
@@ -2098,6 +2098,20 @@ static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *s
2098 spin_unlock(&nfs_access_lru_lock); 2098 spin_unlock(&nfs_access_lru_lock);
2099 } 2099 }
2100} 2100}
2101EXPORT_SYMBOL_GPL(nfs_access_add_cache);
2102
2103void nfs_access_set_mask(struct nfs_access_entry *entry, u32 access_result)
2104{
2105 entry->mask = 0;
2106 if (access_result & NFS4_ACCESS_READ)
2107 entry->mask |= MAY_READ;
2108 if (access_result &
2109 (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE))
2110 entry->mask |= MAY_WRITE;
2111 if (access_result & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE))
2112 entry->mask |= MAY_EXEC;
2113}
2114EXPORT_SYMBOL_GPL(nfs_access_set_mask);
2101 2115
2102static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) 2116static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
2103{ 2117{
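
With nfs_access_add_cache() and the new nfs_access_set_mask() exported, an NFSv4 caller can turn a raw ACCESS reply bitmask into a cached entry. A minimal sketch of such a caller, assuming the nfs_access_entry cred/jiffies/mask fields as in this tree; the helper name and variables are illustrative, not part of the patch:

	static void cache_access_result(struct inode *inode,
					struct rpc_cred *cred,
					u32 access_result)
	{
		struct nfs_access_entry cache;

		cache.cred = cred;
		cache.jiffies = jiffies;
		/* translate NFS4_ACCESS_* reply bits into generic MAY_* bits */
		nfs_access_set_mask(&cache, access_result);
		nfs_access_add_cache(inode, &cache);
	}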
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 1ba385b7c90d..cae26cbd59ee 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -46,6 +46,7 @@
46#include <linux/kref.h> 46#include <linux/kref.h>
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include <linux/task_io_accounting_ops.h> 48#include <linux/task_io_accounting_ops.h>
49#include <linux/module.h>
49 50
50#include <linux/nfs_fs.h> 51#include <linux/nfs_fs.h>
51#include <linux/nfs_page.h> 52#include <linux/nfs_page.h>
@@ -78,6 +79,7 @@ struct nfs_direct_req {
78 atomic_t io_count; /* i/os we're waiting for */ 79 atomic_t io_count; /* i/os we're waiting for */
79 spinlock_t lock; /* protect completion state */ 80 spinlock_t lock; /* protect completion state */
80 ssize_t count, /* bytes actually processed */ 81 ssize_t count, /* bytes actually processed */
82 bytes_left, /* bytes left to be sent */
81 error; /* any reported error */ 83 error; /* any reported error */
82 struct completion completion; /* wait for i/o completion */ 84 struct completion completion; /* wait for i/o completion */
83 85
@@ -190,6 +192,12 @@ static void nfs_direct_req_release(struct nfs_direct_req *dreq)
190 kref_put(&dreq->kref, nfs_direct_req_free); 192 kref_put(&dreq->kref, nfs_direct_req_free);
191} 193}
192 194
195ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq)
196{
197 return dreq->bytes_left;
198}
199EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left);
200
193/* 201/*
194 * Collects and returns the final error value/byte-count. 202 * Collects and returns the final error value/byte-count.
195 */ 203 */
@@ -390,6 +398,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
390 user_addr += req_len; 398 user_addr += req_len;
391 pos += req_len; 399 pos += req_len;
392 count -= req_len; 400 count -= req_len;
401 dreq->bytes_left -= req_len;
393 } 402 }
 394 /* The nfs_page now holds references to these pages */ 403 /* The nfs_page now holds references to these pages */
395 nfs_direct_release_pages(pagevec, npages); 404 nfs_direct_release_pages(pagevec, npages);
@@ -450,23 +459,28 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
450 ssize_t result = -ENOMEM; 459 ssize_t result = -ENOMEM;
451 struct inode *inode = iocb->ki_filp->f_mapping->host; 460 struct inode *inode = iocb->ki_filp->f_mapping->host;
452 struct nfs_direct_req *dreq; 461 struct nfs_direct_req *dreq;
462 struct nfs_lock_context *l_ctx;
453 463
454 dreq = nfs_direct_req_alloc(); 464 dreq = nfs_direct_req_alloc();
455 if (dreq == NULL) 465 if (dreq == NULL)
456 goto out; 466 goto out;
457 467
458 dreq->inode = inode; 468 dreq->inode = inode;
469 dreq->bytes_left = iov_length(iov, nr_segs);
459 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 470 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
460 dreq->l_ctx = nfs_get_lock_context(dreq->ctx); 471 l_ctx = nfs_get_lock_context(dreq->ctx);
461 if (dreq->l_ctx == NULL) 472 if (IS_ERR(l_ctx)) {
473 result = PTR_ERR(l_ctx);
462 goto out_release; 474 goto out_release;
475 }
476 dreq->l_ctx = l_ctx;
463 if (!is_sync_kiocb(iocb)) 477 if (!is_sync_kiocb(iocb))
464 dreq->iocb = iocb; 478 dreq->iocb = iocb;
465 479
480 NFS_I(inode)->read_io += iov_length(iov, nr_segs);
466 result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio); 481 result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio);
467 if (!result) 482 if (!result)
468 result = nfs_direct_wait(dreq); 483 result = nfs_direct_wait(dreq);
469 NFS_I(inode)->read_io += result;
470out_release: 484out_release:
471 nfs_direct_req_release(dreq); 485 nfs_direct_req_release(dreq);
472out: 486out:
@@ -706,6 +720,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
706 user_addr += req_len; 720 user_addr += req_len;
707 pos += req_len; 721 pos += req_len;
708 count -= req_len; 722 count -= req_len;
723 dreq->bytes_left -= req_len;
709 } 724 }
 710 /* The nfs_page now holds references to these pages */ 725 /* The nfs_page now holds references to these pages */
711 nfs_direct_release_pages(pagevec, npages); 726 nfs_direct_release_pages(pagevec, npages);
@@ -814,6 +829,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
814 get_dreq(dreq); 829 get_dreq(dreq);
815 atomic_inc(&inode->i_dio_count); 830 atomic_inc(&inode->i_dio_count);
816 831
832 NFS_I(dreq->inode)->write_io += iov_length(iov, nr_segs);
817 for (seg = 0; seg < nr_segs; seg++) { 833 for (seg = 0; seg < nr_segs; seg++) {
818 const struct iovec *vec = &iov[seg]; 834 const struct iovec *vec = &iov[seg];
819 result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio); 835 result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio);
@@ -825,7 +841,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
825 pos += vec->iov_len; 841 pos += vec->iov_len;
826 } 842 }
827 nfs_pageio_complete(&desc); 843 nfs_pageio_complete(&desc);
828 NFS_I(dreq->inode)->write_io += desc.pg_bytes_written;
829 844
830 /* 845 /*
831 * If no bytes were started, return the error, and let the 846 * If no bytes were started, return the error, and let the
@@ -849,16 +864,21 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
849 ssize_t result = -ENOMEM; 864 ssize_t result = -ENOMEM;
850 struct inode *inode = iocb->ki_filp->f_mapping->host; 865 struct inode *inode = iocb->ki_filp->f_mapping->host;
851 struct nfs_direct_req *dreq; 866 struct nfs_direct_req *dreq;
867 struct nfs_lock_context *l_ctx;
852 868
853 dreq = nfs_direct_req_alloc(); 869 dreq = nfs_direct_req_alloc();
854 if (!dreq) 870 if (!dreq)
855 goto out; 871 goto out;
856 872
857 dreq->inode = inode; 873 dreq->inode = inode;
874 dreq->bytes_left = count;
858 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 875 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
859 dreq->l_ctx = nfs_get_lock_context(dreq->ctx); 876 l_ctx = nfs_get_lock_context(dreq->ctx);
860 if (dreq->l_ctx == NULL) 877 if (IS_ERR(l_ctx)) {
878 result = PTR_ERR(l_ctx);
861 goto out_release; 879 goto out_release;
880 }
881 dreq->l_ctx = l_ctx;
862 if (!is_sync_kiocb(iocb)) 882 if (!is_sync_kiocb(iocb))
863 dreq->iocb = iocb; 883 dreq->iocb = iocb;
864 884
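
Two conventions change in direct.c at once: nfs_get_lock_context() now returns an ERR_PTR (see the inode.c hunk further down), so read and write setup share the same IS_ERR pattern, and the new bytes_left counter is seeded from iov_length() at setup and drained as requests are scheduled. A sketch of how a consumer might use the exported accessor, assuming only what the hunks above show; the helper name is illustrative:

	/* bytes_left shrinks by req_len as each request goes out, so
	 * pos + bytes_left is the end of the I/O still to be issued */
	static loff_t nfs_direct_remaining_end(struct nfs_direct_req *dreq,
					       loff_t pos)
	{
		return pos + nfs_dreq_bytes_left(dreq);
	}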
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index f692be97676d..582bb8866131 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -259,7 +259,7 @@ nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
259 struct dentry *dentry = file->f_path.dentry; 259 struct dentry *dentry = file->f_path.dentry;
260 struct nfs_open_context *ctx = nfs_file_open_context(file); 260 struct nfs_open_context *ctx = nfs_file_open_context(file);
261 struct inode *inode = dentry->d_inode; 261 struct inode *inode = dentry->d_inode;
262 int have_error, status; 262 int have_error, do_resend, status;
263 int ret = 0; 263 int ret = 0;
264 264
265 dprintk("NFS: fsync file(%s/%s) datasync %d\n", 265 dprintk("NFS: fsync file(%s/%s) datasync %d\n",
@@ -267,15 +267,23 @@ nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
267 datasync); 267 datasync);
268 268
269 nfs_inc_stats(inode, NFSIOS_VFSFSYNC); 269 nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
270 do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
270 have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); 271 have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
271 status = nfs_commit_inode(inode, FLUSH_SYNC); 272 status = nfs_commit_inode(inode, FLUSH_SYNC);
272 if (status >= 0 && ret < 0)
273 status = ret;
274 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); 273 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
275 if (have_error) 274 if (have_error) {
276 ret = xchg(&ctx->error, 0); 275 ret = xchg(&ctx->error, 0);
277 if (!ret && status < 0) 276 if (ret)
277 goto out;
278 }
279 if (status < 0) {
278 ret = status; 280 ret = status;
281 goto out;
282 }
283 do_resend |= test_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
284 if (do_resend)
285 ret = -EAGAIN;
286out:
279 return ret; 287 return ret;
280} 288}
281EXPORT_SYMBOL_GPL(nfs_file_fsync_commit); 289EXPORT_SYMBOL_GPL(nfs_file_fsync_commit);
@@ -286,13 +294,22 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
286 int ret; 294 int ret;
287 struct inode *inode = file->f_path.dentry->d_inode; 295 struct inode *inode = file->f_path.dentry->d_inode;
288 296
289 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 297 do {
290 if (ret != 0) 298 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
291 goto out; 299 if (ret != 0)
292 mutex_lock(&inode->i_mutex); 300 break;
293 ret = nfs_file_fsync_commit(file, start, end, datasync); 301 mutex_lock(&inode->i_mutex);
294 mutex_unlock(&inode->i_mutex); 302 ret = nfs_file_fsync_commit(file, start, end, datasync);
295out: 303 mutex_unlock(&inode->i_mutex);
304 /*
305 * If nfs_file_fsync_commit detected a server reboot, then
306 * resend all dirty pages that might have been covered by
307 * the NFS_CONTEXT_RESEND_WRITES flag
308 */
309 start = 0;
310 end = LLONG_MAX;
311 } while (ret == -EAGAIN);
312
296 return ret; 313 return ret;
297} 314}
298 315
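
The loop contract: nfs_file_fsync_commit() returns -EAGAIN when it finds NFS_CONTEXT_RESEND_WRITES set, and the caller retries over the whole file. A condensed sketch with the i_mutex handling elided; the hunk itself does not show who sets the flag, the assumption here being that the write recovery path sets it when it re-dirties pages after a server reboot:

	do {
		ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
		if (ret != 0)
			break;
		ret = nfs_file_fsync_commit(file, start, end, datasync);
		/* resent pages may lie outside the original range */
		start = 0;
		end = LLONG_MAX;
	} while (ret == -EAGAIN);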
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 4654ced096a6..033803c36644 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -32,6 +32,8 @@
32 32
33#include <asm/uaccess.h> 33#include <asm/uaccess.h>
34 34
35#include "internal.h"
36
35#define NFSDBG_FACILITY NFSDBG_CLIENT 37#define NFSDBG_FACILITY NFSDBG_CLIENT
36 38
37/* 39/*
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index a850079467d8..9cc4a3fbf4b0 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -55,18 +55,19 @@
55static const struct cred *id_resolver_cache; 55static const struct cred *id_resolver_cache;
56static struct key_type key_type_id_resolver_legacy; 56static struct key_type key_type_id_resolver_legacy;
57 57
58struct idmap {
59 struct rpc_pipe *idmap_pipe;
60 struct key_construction *idmap_key_cons;
61 struct mutex idmap_mutex;
62};
63
64struct idmap_legacy_upcalldata { 58struct idmap_legacy_upcalldata {
65 struct rpc_pipe_msg pipe_msg; 59 struct rpc_pipe_msg pipe_msg;
66 struct idmap_msg idmap_msg; 60 struct idmap_msg idmap_msg;
61 struct key_construction *key_cons;
67 struct idmap *idmap; 62 struct idmap *idmap;
68}; 63};
69 64
65struct idmap {
66 struct rpc_pipe *idmap_pipe;
67 struct idmap_legacy_upcalldata *idmap_upcall_data;
68 struct mutex idmap_mutex;
69};
70
70/** 71/**
71 * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields 72 * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields
72 * @fattr: fully initialised struct nfs_fattr 73 * @fattr: fully initialised struct nfs_fattr
@@ -158,7 +159,7 @@ static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *re
158 return 0; 159 return 0;
159 memcpy(buf, name, namelen); 160 memcpy(buf, name, namelen);
160 buf[namelen] = '\0'; 161 buf[namelen] = '\0';
161 if (strict_strtoul(buf, 0, &val) != 0) 162 if (kstrtoul(buf, 0, &val) != 0)
162 return 0; 163 return 0;
163 *res = val; 164 *res = val;
164 return 1; 165 return 1;
@@ -330,7 +331,6 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
330 ret = nfs_idmap_request_key(&key_type_id_resolver_legacy, 331 ret = nfs_idmap_request_key(&key_type_id_resolver_legacy,
331 name, namelen, type, data, 332 name, namelen, type, data,
332 data_size, idmap); 333 data_size, idmap);
333 idmap->idmap_key_cons = NULL;
334 mutex_unlock(&idmap->idmap_mutex); 334 mutex_unlock(&idmap->idmap_mutex);
335 } 335 }
336 return ret; 336 return ret;
@@ -364,7 +364,7 @@ static int nfs_idmap_lookup_id(const char *name, size_t namelen, const char *typ
364 if (data_size <= 0) { 364 if (data_size <= 0) {
365 ret = -EINVAL; 365 ret = -EINVAL;
366 } else { 366 } else {
367 ret = strict_strtol(id_str, 10, &id_long); 367 ret = kstrtol(id_str, 10, &id_long);
368 *id = (__u32)id_long; 368 *id = (__u32)id_long;
369 } 369 }
370 return ret; 370 return ret;
@@ -465,8 +465,6 @@ nfs_idmap_new(struct nfs_client *clp)
465 struct rpc_pipe *pipe; 465 struct rpc_pipe *pipe;
466 int error; 466 int error;
467 467
468 BUG_ON(clp->cl_idmap != NULL);
469
470 idmap = kzalloc(sizeof(*idmap), GFP_KERNEL); 468 idmap = kzalloc(sizeof(*idmap), GFP_KERNEL);
471 if (idmap == NULL) 469 if (idmap == NULL)
472 return -ENOMEM; 470 return -ENOMEM;
@@ -510,7 +508,6 @@ static int __rpc_pipefs_event(struct nfs_client *clp, unsigned long event,
510 508
511 switch (event) { 509 switch (event) {
512 case RPC_PIPEFS_MOUNT: 510 case RPC_PIPEFS_MOUNT:
513 BUG_ON(clp->cl_rpcclient->cl_dentry == NULL);
514 err = __nfs_idmap_register(clp->cl_rpcclient->cl_dentry, 511 err = __nfs_idmap_register(clp->cl_rpcclient->cl_dentry,
515 clp->cl_idmap, 512 clp->cl_idmap,
516 clp->cl_idmap->idmap_pipe); 513 clp->cl_idmap->idmap_pipe);
@@ -632,9 +629,6 @@ static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap,
632 substring_t substr; 629 substring_t substr;
633 int token, ret; 630 int token, ret;
634 631
635 memset(im, 0, sizeof(*im));
636 memset(msg, 0, sizeof(*msg));
637
638 im->im_type = IDMAP_TYPE_GROUP; 632 im->im_type = IDMAP_TYPE_GROUP;
639 token = match_token(desc, nfs_idmap_tokens, &substr); 633 token = match_token(desc, nfs_idmap_tokens, &substr);
640 634
@@ -665,6 +659,35 @@ out:
665 return ret; 659 return ret;
666} 660}
667 661
662static bool
663nfs_idmap_prepare_pipe_upcall(struct idmap *idmap,
664 struct idmap_legacy_upcalldata *data)
665{
666 if (idmap->idmap_upcall_data != NULL) {
667 WARN_ON_ONCE(1);
668 return false;
669 }
670 idmap->idmap_upcall_data = data;
671 return true;
672}
673
674static void
675nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret)
676{
677 struct key_construction *cons = idmap->idmap_upcall_data->key_cons;
678
679 kfree(idmap->idmap_upcall_data);
680 idmap->idmap_upcall_data = NULL;
681 complete_request_key(cons, ret);
682}
683
684static void
685nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret)
686{
687 if (idmap->idmap_upcall_data != NULL)
688 nfs_idmap_complete_pipe_upcall_locked(idmap, ret);
689}
690
668static int nfs_idmap_legacy_upcall(struct key_construction *cons, 691static int nfs_idmap_legacy_upcall(struct key_construction *cons,
669 const char *op, 692 const char *op,
670 void *aux) 693 void *aux)
@@ -677,29 +700,28 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons,
677 int ret = -ENOMEM; 700 int ret = -ENOMEM;
678 701
679 /* msg and im are freed in idmap_pipe_destroy_msg */ 702 /* msg and im are freed in idmap_pipe_destroy_msg */
680 data = kmalloc(sizeof(*data), GFP_KERNEL); 703 data = kzalloc(sizeof(*data), GFP_KERNEL);
681 if (!data) 704 if (!data)
682 goto out1; 705 goto out1;
683 706
684 msg = &data->pipe_msg; 707 msg = &data->pipe_msg;
685 im = &data->idmap_msg; 708 im = &data->idmap_msg;
686 data->idmap = idmap; 709 data->idmap = idmap;
710 data->key_cons = cons;
687 711
688 ret = nfs_idmap_prepare_message(key->description, idmap, im, msg); 712 ret = nfs_idmap_prepare_message(key->description, idmap, im, msg);
689 if (ret < 0) 713 if (ret < 0)
690 goto out2; 714 goto out2;
691 715
692 BUG_ON(idmap->idmap_key_cons != NULL); 716 ret = -EAGAIN;
693 idmap->idmap_key_cons = cons; 717 if (!nfs_idmap_prepare_pipe_upcall(idmap, data))
718 goto out2;
694 719
695 ret = rpc_queue_upcall(idmap->idmap_pipe, msg); 720 ret = rpc_queue_upcall(idmap->idmap_pipe, msg);
696 if (ret < 0) 721 if (ret < 0)
697 goto out3; 722 nfs_idmap_abort_pipe_upcall(idmap, ret);
698 723
699 return ret; 724 return ret;
700
701out3:
702 idmap->idmap_key_cons = NULL;
703out2: 725out2:
704 kfree(data); 726 kfree(data);
705out1: 727out1:
@@ -714,21 +736,32 @@ static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *dat
714 authkey); 736 authkey);
715} 737}
716 738
717static int nfs_idmap_read_message(struct idmap_msg *im, struct key *key, struct key *authkey) 739static int nfs_idmap_read_and_verify_message(struct idmap_msg *im,
740 struct idmap_msg *upcall,
741 struct key *key, struct key *authkey)
718{ 742{
719 char id_str[NFS_UINT_MAXLEN]; 743 char id_str[NFS_UINT_MAXLEN];
720 int ret = -EINVAL; 744 int ret = -ENOKEY;
721 745
746 /* ret = -ENOKEY */
747 if (upcall->im_type != im->im_type || upcall->im_conv != im->im_conv)
748 goto out;
722 switch (im->im_conv) { 749 switch (im->im_conv) {
723 case IDMAP_CONV_NAMETOID: 750 case IDMAP_CONV_NAMETOID:
751 if (strcmp(upcall->im_name, im->im_name) != 0)
752 break;
724 sprintf(id_str, "%d", im->im_id); 753 sprintf(id_str, "%d", im->im_id);
725 ret = nfs_idmap_instantiate(key, authkey, id_str); 754 ret = nfs_idmap_instantiate(key, authkey, id_str);
726 break; 755 break;
727 case IDMAP_CONV_IDTONAME: 756 case IDMAP_CONV_IDTONAME:
757 if (upcall->im_id != im->im_id)
758 break;
728 ret = nfs_idmap_instantiate(key, authkey, im->im_name); 759 ret = nfs_idmap_instantiate(key, authkey, im->im_name);
729 break; 760 break;
761 default:
762 ret = -EINVAL;
730 } 763 }
731 764out:
732 return ret; 765 return ret;
733} 766}
734 767
@@ -740,14 +773,16 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
740 struct key_construction *cons; 773 struct key_construction *cons;
741 struct idmap_msg im; 774 struct idmap_msg im;
742 size_t namelen_in; 775 size_t namelen_in;
743 int ret; 776 int ret = -ENOKEY;
744 777
745 /* If instantiation is successful, anyone waiting for key construction 778 /* If instantiation is successful, anyone waiting for key construction
746 * will have been woken up and someone else may now have used 779 * will have been woken up and someone else may now have used
747 * idmap_key_cons - so after this point we may no longer touch it. 780 * idmap_key_cons - so after this point we may no longer touch it.
748 */ 781 */
749 cons = ACCESS_ONCE(idmap->idmap_key_cons); 782 if (idmap->idmap_upcall_data == NULL)
750 idmap->idmap_key_cons = NULL; 783 goto out_noupcall;
784
785 cons = idmap->idmap_upcall_data->key_cons;
751 786
752 if (mlen != sizeof(im)) { 787 if (mlen != sizeof(im)) {
753 ret = -ENOSPC; 788 ret = -ENOSPC;
@@ -768,16 +803,19 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
768 if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) { 803 if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) {
769 ret = -EINVAL; 804 ret = -EINVAL;
770 goto out; 805 goto out;
 771 } 806 }
772 807
773 ret = nfs_idmap_read_message(&im, cons->key, cons->authkey); 808 ret = nfs_idmap_read_and_verify_message(&im,
809 &idmap->idmap_upcall_data->idmap_msg,
810 cons->key, cons->authkey);
774 if (ret >= 0) { 811 if (ret >= 0) {
775 key_set_timeout(cons->key, nfs_idmap_cache_timeout); 812 key_set_timeout(cons->key, nfs_idmap_cache_timeout);
776 ret = mlen; 813 ret = mlen;
777 } 814 }
778 815
779out: 816out:
780 complete_request_key(cons, ret); 817 nfs_idmap_complete_pipe_upcall_locked(idmap, ret);
818out_noupcall:
781 return ret; 819 return ret;
782} 820}
783 821
@@ -788,14 +826,9 @@ idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg)
788 struct idmap_legacy_upcalldata, 826 struct idmap_legacy_upcalldata,
789 pipe_msg); 827 pipe_msg);
790 struct idmap *idmap = data->idmap; 828 struct idmap *idmap = data->idmap;
791 struct key_construction *cons; 829
792 if (msg->errno) { 830 if (msg->errno)
793 cons = ACCESS_ONCE(idmap->idmap_key_cons); 831 nfs_idmap_abort_pipe_upcall(idmap, msg->errno);
794 idmap->idmap_key_cons = NULL;
795 complete_request_key(cons, msg->errno);
796 }
797 /* Free memory allocated in nfs_idmap_legacy_upcall() */
798 kfree(data);
799} 832}
800 833
801static void 834static void
@@ -803,7 +836,8 @@ idmap_release_pipe(struct inode *inode)
803{ 836{
804 struct rpc_inode *rpci = RPC_I(inode); 837 struct rpc_inode *rpci = RPC_I(inode);
805 struct idmap *idmap = (struct idmap *)rpci->private; 838 struct idmap *idmap = (struct idmap *)rpci->private;
806 idmap->idmap_key_cons = NULL; 839
840 nfs_idmap_abort_pipe_upcall(idmap, -EPIPE);
807} 841}
808 842
809int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) 843int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
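
The idmap rework replaces the bare idmap_key_cons pointer with idmap_upcall_data, giving every upcall a single owner. A condensed sketch of the resulting flow in nfs_idmap_legacy_upcall(), assembled from the hunks above:

	ret = -EAGAIN;
	if (!nfs_idmap_prepare_pipe_upcall(idmap, data))
		goto out2;		/* another upcall is in flight */

	ret = rpc_queue_upcall(idmap->idmap_pipe, msg);
	if (ret < 0)
		nfs_idmap_abort_pipe_upcall(idmap, ret);	/* frees data */

Completion then happens exactly once: via idmap_pipe_downcall() on a reply, via idmap_pipe_destroy_msg() when msg->errno is set, or via idmap_release_pipe() with -EPIPE when rpc.idmapd closes the pipe.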
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e4c716d374a8..5c7325c5c5e6 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -547,8 +547,8 @@ EXPORT_SYMBOL_GPL(nfs_getattr);
547static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) 547static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
548{ 548{
549 atomic_set(&l_ctx->count, 1); 549 atomic_set(&l_ctx->count, 1);
550 l_ctx->lockowner = current->files; 550 l_ctx->lockowner.l_owner = current->files;
551 l_ctx->pid = current->tgid; 551 l_ctx->lockowner.l_pid = current->tgid;
552 INIT_LIST_HEAD(&l_ctx->list); 552 INIT_LIST_HEAD(&l_ctx->list);
553} 553}
554 554
@@ -557,9 +557,9 @@ static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context
557 struct nfs_lock_context *pos; 557 struct nfs_lock_context *pos;
558 558
559 list_for_each_entry(pos, &ctx->lock_context.list, list) { 559 list_for_each_entry(pos, &ctx->lock_context.list, list) {
560 if (pos->lockowner != current->files) 560 if (pos->lockowner.l_owner != current->files)
561 continue; 561 continue;
562 if (pos->pid != current->tgid) 562 if (pos->lockowner.l_pid != current->tgid)
563 continue; 563 continue;
564 atomic_inc(&pos->count); 564 atomic_inc(&pos->count);
565 return pos; 565 return pos;
@@ -578,7 +578,7 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
578 spin_unlock(&inode->i_lock); 578 spin_unlock(&inode->i_lock);
579 new = kmalloc(sizeof(*new), GFP_KERNEL); 579 new = kmalloc(sizeof(*new), GFP_KERNEL);
580 if (new == NULL) 580 if (new == NULL)
581 return NULL; 581 return ERR_PTR(-ENOMEM);
582 nfs_init_lock_context(new); 582 nfs_init_lock_context(new);
583 spin_lock(&inode->i_lock); 583 spin_lock(&inode->i_lock);
584 res = __nfs_find_lock_context(ctx); 584 res = __nfs_find_lock_context(ctx);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 31fdb03225cd..59b133c5d652 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -101,11 +101,11 @@ struct nfs_client_initdata {
101 */ 101 */
102struct nfs_parsed_mount_data { 102struct nfs_parsed_mount_data {
103 int flags; 103 int flags;
104 int rsize, wsize; 104 unsigned int rsize, wsize;
105 int timeo, retrans; 105 unsigned int timeo, retrans;
106 int acregmin, acregmax, 106 unsigned int acregmin, acregmax,
107 acdirmin, acdirmax; 107 acdirmin, acdirmax;
108 int namlen; 108 unsigned int namlen;
109 unsigned int options; 109 unsigned int options;
110 unsigned int bsize; 110 unsigned int bsize;
111 unsigned int auth_flavor_len; 111 unsigned int auth_flavor_len;
@@ -464,6 +464,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode)
464{ 464{
465 inode_dio_wait(inode); 465 inode_dio_wait(inode);
466} 466}
467extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
467 468
468/* nfs4proc.c */ 469/* nfs4proc.c */
469extern void __nfs4_read_done_cb(struct nfs_read_data *); 470extern void __nfs4_read_done_cb(struct nfs_read_data *);
@@ -483,6 +484,12 @@ extern int _nfs4_call_sync_session(struct rpc_clnt *clnt,
483 struct nfs4_sequence_args *args, 484 struct nfs4_sequence_args *args,
484 struct nfs4_sequence_res *res, 485 struct nfs4_sequence_res *res,
485 int cache_reply); 486 int cache_reply);
487extern int nfs40_walk_client_list(struct nfs_client *clp,
488 struct nfs_client **result,
489 struct rpc_cred *cred);
490extern int nfs41_walk_client_list(struct nfs_client *clp,
491 struct nfs_client **result,
492 struct rpc_cred *cred);
486 493
487/* 494/*
488 * Determine the device name as a string 495 * Determine the device name as a string
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index 0539de1b8d1f..8ee1fab83268 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -5,6 +5,7 @@
5#ifndef __NFS_NETNS_H__ 5#ifndef __NFS_NETNS_H__
6#define __NFS_NETNS_H__ 6#define __NFS_NETNS_H__
7 7
8#include <linux/nfs4.h>
8#include <net/net_namespace.h> 9#include <net/net_namespace.h>
9#include <net/netns/generic.h> 10#include <net/netns/generic.h>
10 11
@@ -22,6 +23,9 @@ struct nfs_net {
22 struct list_head nfs_volume_list; 23 struct list_head nfs_volume_list;
23#if IS_ENABLED(CONFIG_NFS_V4) 24#if IS_ENABLED(CONFIG_NFS_V4)
24 struct idr cb_ident_idr; /* Protected by nfs_client_lock */ 25 struct idr cb_ident_idr; /* Protected by nfs_client_lock */
26 unsigned short nfs_callback_tcpport;
27 unsigned short nfs_callback_tcpport6;
28 int cb_users[NFS4_MAX_MINOR_VERSION + 1];
25#endif 29#endif
26 spinlock_t nfs_client_lock; 30 spinlock_t nfs_client_lock;
27 struct timespec boot_time; 31 struct timespec boot_time;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index da0618aeeadb..a525fdefccde 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -132,8 +132,8 @@ struct nfs4_lock_owner {
132struct nfs4_lock_state { 132struct nfs4_lock_state {
133 struct list_head ls_locks; /* Other lock stateids */ 133 struct list_head ls_locks; /* Other lock stateids */
134 struct nfs4_state * ls_state; /* Pointer to open state */ 134 struct nfs4_state * ls_state; /* Pointer to open state */
135#define NFS_LOCK_INITIALIZED 1 135#define NFS_LOCK_INITIALIZED 0
136 int ls_flags; 136 unsigned long ls_flags;
137 struct nfs_seqid_counter ls_seqid; 137 struct nfs_seqid_counter ls_seqid;
138 nfs4_stateid ls_stateid; 138 nfs4_stateid ls_stateid;
139 atomic_t ls_count; 139 atomic_t ls_count;
@@ -191,6 +191,8 @@ struct nfs4_state_recovery_ops {
191 int (*establish_clid)(struct nfs_client *, struct rpc_cred *); 191 int (*establish_clid)(struct nfs_client *, struct rpc_cred *);
192 struct rpc_cred * (*get_clid_cred)(struct nfs_client *); 192 struct rpc_cred * (*get_clid_cred)(struct nfs_client *);
193 int (*reclaim_complete)(struct nfs_client *); 193 int (*reclaim_complete)(struct nfs_client *);
194 int (*detect_trunking)(struct nfs_client *, struct nfs_client **,
195 struct rpc_cred *);
194}; 196};
195 197
196struct nfs4_state_maintenance_ops { 198struct nfs4_state_maintenance_ops {
@@ -223,7 +225,7 @@ extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
223extern int nfs4_destroy_clientid(struct nfs_client *clp); 225extern int nfs4_destroy_clientid(struct nfs_client *clp);
224extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); 226extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
225extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); 227extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
226extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc); 228extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait);
227extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); 229extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
228extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *, 230extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *,
229 struct nfs4_fs_locations *, struct page *); 231 struct nfs4_fs_locations *, struct page *);
@@ -320,9 +322,15 @@ extern void nfs4_renew_state(struct work_struct *);
320/* nfs4state.c */ 322/* nfs4state.c */
321struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp); 323struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp);
322struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); 324struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
325int nfs4_discover_server_trunking(struct nfs_client *clp,
326 struct nfs_client **);
327int nfs40_discover_server_trunking(struct nfs_client *clp,
328 struct nfs_client **, struct rpc_cred *);
323#if defined(CONFIG_NFS_V4_1) 329#if defined(CONFIG_NFS_V4_1)
324struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); 330struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
325struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); 331struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
332int nfs41_discover_server_trunking(struct nfs_client *clp,
333 struct nfs_client **, struct rpc_cred *);
326extern void nfs4_schedule_session_recovery(struct nfs4_session *, int); 334extern void nfs4_schedule_session_recovery(struct nfs4_session *, int);
327#else 335#else
328static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) 336static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
@@ -351,7 +359,7 @@ extern void nfs41_handle_server_scope(struct nfs_client *,
351extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); 359extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
352extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); 360extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
353extern void nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *, 361extern void nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *,
354 fmode_t, fl_owner_t, pid_t); 362 fmode_t, const struct nfs_lockowner *);
355 363
356extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); 364extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
357extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); 365extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
@@ -372,6 +380,9 @@ extern bool nfs4_disable_idmapping;
372extern unsigned short max_session_slots; 380extern unsigned short max_session_slots;
373extern unsigned short send_implementation_id; 381extern unsigned short send_implementation_id;
374 382
383#define NFS4_CLIENT_ID_UNIQ_LEN (64)
384extern char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN];
385
375/* nfs4sysctl.c */ 386/* nfs4sysctl.c */
376#ifdef CONFIG_SYSCTL 387#ifdef CONFIG_SYSCTL
377int nfs4_register_sysctl(void); 388int nfs4_register_sysctl(void);
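
Note the subtle constant change above: NFS_LOCK_INITIALIZED goes from the value 1 to the bit number 0 because ls_flags becomes an unsigned long manipulated with the atomic bitops, which take a bit index rather than a mask. A sketch of the assumed usage (the matching nfs4state.c hunks are outside this excerpt):

	/* when the server confirms the lock stateid: */
	set_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);

	/* and later, before trusting ls_stateid: */
	if (!test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags))
		return;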
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 24eb663f8ed5..6bacfde1319a 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -84,7 +84,7 @@ error:
84static void nfs4_destroy_callback(struct nfs_client *clp) 84static void nfs4_destroy_callback(struct nfs_client *clp)
85{ 85{
86 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) 86 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
87 nfs_callback_down(clp->cl_mvops->minor_version); 87 nfs_callback_down(clp->cl_mvops->minor_version, clp->cl_net);
88} 88}
89 89
90static void nfs4_shutdown_client(struct nfs_client *clp) 90static void nfs4_shutdown_client(struct nfs_client *clp)
@@ -185,6 +185,7 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
185 rpc_authflavor_t authflavour) 185 rpc_authflavor_t authflavour)
186{ 186{
187 char buf[INET6_ADDRSTRLEN + 1]; 187 char buf[INET6_ADDRSTRLEN + 1];
188 struct nfs_client *old;
188 int error; 189 int error;
189 190
190 if (clp->cl_cons_state == NFS_CS_READY) { 191 if (clp->cl_cons_state == NFS_CS_READY) {
@@ -230,6 +231,17 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
230 231
231 if (!nfs4_has_session(clp)) 232 if (!nfs4_has_session(clp))
232 nfs_mark_client_ready(clp, NFS_CS_READY); 233 nfs_mark_client_ready(clp, NFS_CS_READY);
234
235 error = nfs4_discover_server_trunking(clp, &old);
236 if (error < 0)
237 goto error;
238 if (clp != old) {
239 clp->cl_preserve_clid = true;
240 nfs_put_client(clp);
241 clp = old;
242 atomic_inc(&clp->cl_count);
243 }
244
233 return clp; 245 return clp;
234 246
235error: 247error:
@@ -239,6 +251,248 @@ error:
239 return ERR_PTR(error); 251 return ERR_PTR(error);
240} 252}
241 253
254/*
255 * SETCLIENTID just did a callback update with the callback ident in
256 * "drop," but server trunking discovery claims "drop" and "keep" are
257 * actually the same server. Swap the callback IDs so that "keep"
258 * will continue to use the callback ident the server now knows about,
259 * and so that "keep"'s original callback ident is destroyed when
260 * "drop" is freed.
261 */
262static void nfs4_swap_callback_idents(struct nfs_client *keep,
263 struct nfs_client *drop)
264{
265 struct nfs_net *nn = net_generic(keep->cl_net, nfs_net_id);
266 unsigned int save = keep->cl_cb_ident;
267
268 if (keep->cl_cb_ident == drop->cl_cb_ident)
269 return;
270
271 dprintk("%s: keeping callback ident %u and dropping ident %u\n",
272 __func__, keep->cl_cb_ident, drop->cl_cb_ident);
273
274 spin_lock(&nn->nfs_client_lock);
275
276 idr_replace(&nn->cb_ident_idr, keep, drop->cl_cb_ident);
277 keep->cl_cb_ident = drop->cl_cb_ident;
278
279 idr_replace(&nn->cb_ident_idr, drop, save);
280 drop->cl_cb_ident = save;
281
282 spin_unlock(&nn->nfs_client_lock);
283}
284
285/**
286 * nfs40_walk_client_list - Find server that recognizes a client ID
287 *
288 * @new: nfs_client with client ID to test
289 * @result: OUT: found nfs_client, or new
290 * @cred: credential to use for trunking test
291 *
292 * Returns zero, a negative errno, or a negative NFS4ERR status.
293 * If zero is returned, an nfs_client pointer is planted in "result."
294 *
295 * NB: nfs40_walk_client_list() relies on the new nfs_client being
296 * the last nfs_client on the list.
297 */
298int nfs40_walk_client_list(struct nfs_client *new,
299 struct nfs_client **result,
300 struct rpc_cred *cred)
301{
302 struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id);
303 struct nfs_client *pos, *n, *prev = NULL;
304 struct nfs4_setclientid_res clid = {
305 .clientid = new->cl_clientid,
306 .confirm = new->cl_confirm,
307 };
308 int status;
309
310 spin_lock(&nn->nfs_client_lock);
311 list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) {
312 /* If "pos" isn't marked ready, we can't trust the
313 * remaining fields in "pos" */
314 if (pos->cl_cons_state < NFS_CS_READY)
315 continue;
316
317 if (pos->rpc_ops != new->rpc_ops)
318 continue;
319
320 if (pos->cl_proto != new->cl_proto)
321 continue;
322
323 if (pos->cl_minorversion != new->cl_minorversion)
324 continue;
325
326 if (pos->cl_clientid != new->cl_clientid)
327 continue;
328
329 atomic_inc(&pos->cl_count);
330 spin_unlock(&nn->nfs_client_lock);
331
332 if (prev)
333 nfs_put_client(prev);
334
335 status = nfs4_proc_setclientid_confirm(pos, &clid, cred);
336 if (status == 0) {
337 nfs4_swap_callback_idents(pos, new);
338
339 nfs_put_client(pos);
340 *result = pos;
341 dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",
342 __func__, pos, atomic_read(&pos->cl_count));
343 return 0;
344 }
345 if (status != -NFS4ERR_STALE_CLIENTID) {
346 nfs_put_client(pos);
347 dprintk("NFS: <-- %s status = %d, no result\n",
348 __func__, status);
349 return status;
350 }
351
352 spin_lock(&nn->nfs_client_lock);
353 prev = pos;
354 }
355
356 /*
357 * No matching nfs_client found. This should be impossible,
358 * because the new nfs_client has already been added to
359 * nfs_client_list by nfs_get_client().
360 *
361 * Don't BUG(), since the caller is holding a mutex.
362 */
363 if (prev)
364 nfs_put_client(prev);
365 spin_unlock(&nn->nfs_client_lock);
366 pr_err("NFS: %s Error: no matching nfs_client found\n", __func__);
367 return -NFS4ERR_STALE_CLIENTID;
368}
369
370#ifdef CONFIG_NFS_V4_1
371/*
372 * Returns true if the client IDs match
373 */
374static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b)
375{
376 if (a->cl_clientid != b->cl_clientid) {
377 dprintk("NFS: --> %s client ID %llx does not match %llx\n",
378 __func__, a->cl_clientid, b->cl_clientid);
379 return false;
380 }
381 dprintk("NFS: --> %s client ID %llx matches %llx\n",
382 __func__, a->cl_clientid, b->cl_clientid);
383 return true;
384}
385
386/*
387 * Returns true if the server owners match
388 */
389static bool
390nfs4_match_serverowners(struct nfs_client *a, struct nfs_client *b)
391{
392 struct nfs41_server_owner *o1 = a->cl_serverowner;
393 struct nfs41_server_owner *o2 = b->cl_serverowner;
394
395 if (o1->minor_id != o2->minor_id) {
396 dprintk("NFS: --> %s server owner minor IDs do not match\n",
397 __func__);
398 return false;
399 }
400
401 if (o1->major_id_sz != o2->major_id_sz)
402 goto out_major_mismatch;
403 if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0)
404 goto out_major_mismatch;
405
406 dprintk("NFS: --> %s server owners match\n", __func__);
407 return true;
408
409out_major_mismatch:
410 dprintk("NFS: --> %s server owner major IDs do not match\n",
411 __func__);
412 return false;
413}
414
415/**
416 * nfs41_walk_client_list - Find nfs_client that matches a client/server owner
417 *
418 * @new: nfs_client with client ID to test
419 * @result: OUT: found nfs_client, or new
420 * @cred: credential to use for trunking test
421 *
422 * Returns zero, a negative errno, or a negative NFS4ERR status.
423 * If zero is returned, an nfs_client pointer is planted in "result."
424 *
425 * NB: nfs41_walk_client_list() relies on the new nfs_client being
426 * the last nfs_client on the list.
427 */
428int nfs41_walk_client_list(struct nfs_client *new,
429 struct nfs_client **result,
430 struct rpc_cred *cred)
431{
432 struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id);
433 struct nfs_client *pos, *n, *prev = NULL;
434 int error;
435
436 spin_lock(&nn->nfs_client_lock);
437 list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) {
438 /* If "pos" isn't marked ready, we can't trust the
439 * remaining fields in "pos", especially the client
440 * ID and serverowner fields. Wait for CREATE_SESSION
441 * to finish. */
442 if (pos->cl_cons_state < NFS_CS_READY) {
443 atomic_inc(&pos->cl_count);
444 spin_unlock(&nn->nfs_client_lock);
445
446 if (prev)
447 nfs_put_client(prev);
448 prev = pos;
449
450 error = nfs_wait_client_init_complete(pos);
451 if (error < 0) {
452 nfs_put_client(pos);
453 spin_lock(&nn->nfs_client_lock);
454 continue;
455 }
456
457 spin_lock(&nn->nfs_client_lock);
458 }
459
460 if (pos->rpc_ops != new->rpc_ops)
461 continue;
462
463 if (pos->cl_proto != new->cl_proto)
464 continue;
465
466 if (pos->cl_minorversion != new->cl_minorversion)
467 continue;
468
469 if (!nfs4_match_clientids(pos, new))
470 continue;
471
472 if (!nfs4_match_serverowners(pos, new))
473 continue;
474
475 spin_unlock(&nn->nfs_client_lock);
476 dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",
477 __func__, pos, atomic_read(&pos->cl_count));
478
479 *result = pos;
480 return 0;
481 }
482
483 /*
484 * No matching nfs_client found. This should be impossible,
485 * because the new nfs_client has already been added to
486 * nfs_client_list by nfs_get_client().
487 *
488 * Don't BUG(), since the caller is holding a mutex.
489 */
490 spin_unlock(&nn->nfs_client_lock);
491 pr_err("NFS: %s Error: no matching nfs_client found\n", __func__);
492 return -NFS4ERR_STALE_CLIENTID;
493}
494#endif /* CONFIG_NFS_V4_1 */
495
242static void nfs4_destroy_server(struct nfs_server *server) 496static void nfs4_destroy_server(struct nfs_server *server)
243{ 497{
244 nfs_server_return_all_delegations(server); 498 nfs_server_return_all_delegations(server);
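
For context, the nfs4_init_client() hunk near the top of this file's diff shows how the walkers get used: trunking discovery either confirms the new nfs_client or hands back an established one to merge with. Condensed sketch of that call site:

	error = nfs4_discover_server_trunking(clp, &old);
	if (error < 0)
		goto error;
	if (clp != old) {
		/* same server: keep the established client, and make sure
		 * tearing down the duplicate doesn't kill its clientid */
		clp->cl_preserve_clid = true;
		nfs_put_client(clp);
		clp = old;
		atomic_inc(&clp->cl_count);
	}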
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index eb5eb8eef4d3..afddd6639afb 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -95,16 +95,25 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
95 int ret; 95 int ret;
96 struct inode *inode = file->f_path.dentry->d_inode; 96 struct inode *inode = file->f_path.dentry->d_inode;
97 97
98 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 98 do {
99 if (ret != 0) 99 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
100 goto out; 100 if (ret != 0)
101 mutex_lock(&inode->i_mutex); 101 break;
102 ret = nfs_file_fsync_commit(file, start, end, datasync); 102 mutex_lock(&inode->i_mutex);
103 if (!ret && !datasync) 103 ret = nfs_file_fsync_commit(file, start, end, datasync);
104 /* application has asked for meta-data sync */ 104 if (!ret && !datasync)
105 ret = pnfs_layoutcommit_inode(inode, true); 105 /* application has asked for meta-data sync */
106 mutex_unlock(&inode->i_mutex); 106 ret = pnfs_layoutcommit_inode(inode, true);
107out: 107 mutex_unlock(&inode->i_mutex);
108 /*
109 * If nfs_file_fsync_commit detected a server reboot, then
110 * resend all dirty pages that might have been covered by
111 * the NFS_CONTEXT_RESEND_WRITES flag
112 */
113 start = 0;
114 end = LLONG_MAX;
115 } while (ret == -EAGAIN);
116
108 return ret; 117 return ret;
109} 118}
110 119
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 53f94d915bd1..52d847212066 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -190,8 +190,6 @@ static int filelayout_async_handle_error(struct rpc_task *task,
190 * i/o and all i/o waiting on the slot table to the MDS until 190 * i/o and all i/o waiting on the slot table to the MDS until
191 * layout is destroyed and a new valid layout is obtained. 191 * layout is destroyed and a new valid layout is obtained.
192 */ 192 */
193 set_bit(NFS_LAYOUT_INVALID,
194 &NFS_I(inode)->layout->plh_flags);
195 pnfs_destroy_layout(NFS_I(inode)); 193 pnfs_destroy_layout(NFS_I(inode));
196 rpc_wake_up(&tbl->slot_tbl_waitq); 194 rpc_wake_up(&tbl->slot_tbl_waitq);
197 goto reset; 195 goto reset;
@@ -205,7 +203,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
205 case -EPIPE: 203 case -EPIPE:
206 dprintk("%s DS connection error %d\n", __func__, 204 dprintk("%s DS connection error %d\n", __func__,
207 task->tk_status); 205 task->tk_status);
208 filelayout_mark_devid_invalid(devid); 206 nfs4_mark_deviceid_unavailable(devid);
209 clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags); 207 clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags);
210 _pnfs_return_layout(inode); 208 _pnfs_return_layout(inode);
211 rpc_wake_up(&tbl->slot_tbl_waitq); 209 rpc_wake_up(&tbl->slot_tbl_waitq);
@@ -269,6 +267,21 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata)
269 (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); 267 (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
270} 268}
271 269
270bool
271filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node)
272{
273 return filelayout_test_devid_invalid(node) ||
274 nfs4_test_deviceid_unavailable(node);
275}
276
277static bool
278filelayout_reset_to_mds(struct pnfs_layout_segment *lseg)
279{
280 struct nfs4_deviceid_node *node = FILELAYOUT_DEVID_NODE(lseg);
281
282 return filelayout_test_devid_unavailable(node);
283}
284
272/* 285/*
273 * Call ops for the async read/write cases 286 * Call ops for the async read/write cases
274 * In the case of dense layouts, the offset needs to be reset to its 287 * In the case of dense layouts, the offset needs to be reset to its
@@ -453,7 +466,7 @@ static void filelayout_commit_release(void *calldata)
453 struct nfs_commit_data *data = calldata; 466 struct nfs_commit_data *data = calldata;
454 467
455 data->completion_ops->completion(data); 468 data->completion_ops->completion(data);
456 put_lseg(data->lseg); 469 pnfs_put_lseg(data->lseg);
457 nfs_put_client(data->ds_clp); 470 nfs_put_client(data->ds_clp);
458 nfs_commitdata_release(data); 471 nfs_commitdata_release(data);
459} 472}
@@ -608,13 +621,13 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
608 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld, 621 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
609 NFS_SERVER(lo->plh_inode)->nfs_client, id); 622 NFS_SERVER(lo->plh_inode)->nfs_client, id);
610 if (d == NULL) { 623 if (d == NULL) {
611 dsaddr = get_device_info(lo->plh_inode, id, gfp_flags); 624 dsaddr = filelayout_get_device_info(lo->plh_inode, id, gfp_flags);
612 if (dsaddr == NULL) 625 if (dsaddr == NULL)
613 goto out; 626 goto out;
614 } else 627 } else
615 dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node); 628 dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
616 /* Found deviceid is being reaped */ 629 /* Found deviceid is unavailable */
617 if (test_bit(NFS_DEVICEID_INVALID, &dsaddr->id_node.flags)) 630 if (filelayout_test_devid_unavailable(&dsaddr->id_node))
618 goto out_put; 631 goto out_put;
619 632
620 fl->dsaddr = dsaddr; 633 fl->dsaddr = dsaddr;
@@ -931,7 +944,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
931 nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq); 944 nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq);
932 status = filelayout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS); 945 status = filelayout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS);
933 if (status < 0) { 946 if (status < 0) {
934 put_lseg(pgio->pg_lseg); 947 pnfs_put_lseg(pgio->pg_lseg);
935 pgio->pg_lseg = NULL; 948 pgio->pg_lseg = NULL;
936 goto out_mds; 949 goto out_mds;
937 } 950 }
@@ -985,7 +998,7 @@ filelayout_clear_request_commit(struct nfs_page *req,
985out: 998out:
986 nfs_request_remove_commit_list(req, cinfo); 999 nfs_request_remove_commit_list(req, cinfo);
987 spin_unlock(cinfo->lock); 1000 spin_unlock(cinfo->lock);
988 put_lseg(freeme); 1001 pnfs_put_lseg(freeme);
989} 1002}
990 1003
991static struct list_head * 1004static struct list_head *
@@ -1018,7 +1031,7 @@ filelayout_choose_commit_list(struct nfs_page *req,
1018 * off due to a rewrite, in which case it will be done in 1031 * off due to a rewrite, in which case it will be done in
1019 * filelayout_clear_request_commit 1032 * filelayout_clear_request_commit
1020 */ 1033 */
1021 buckets[i].wlseg = get_lseg(lseg); 1034 buckets[i].wlseg = pnfs_get_lseg(lseg);
1022 } 1035 }
1023 set_bit(PG_COMMIT_TO_DS, &req->wb_flags); 1036 set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
1024 cinfo->ds->nwritten++; 1037 cinfo->ds->nwritten++;
@@ -1128,7 +1141,7 @@ filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
1128 if (list_empty(src)) 1141 if (list_empty(src))
1129 bucket->wlseg = NULL; 1142 bucket->wlseg = NULL;
1130 else 1143 else
1131 get_lseg(bucket->clseg); 1144 pnfs_get_lseg(bucket->clseg);
1132 } 1145 }
1133 return ret; 1146 return ret;
1134} 1147}
@@ -1159,12 +1172,12 @@ static void filelayout_recover_commit_reqs(struct list_head *dst,
1159 1172
1160 /* NOTE cinfo->lock is NOT held, relying on fact that this is 1173 /* NOTE cinfo->lock is NOT held, relying on fact that this is
1161 * only called on single thread per dreq. 1174 * only called on single thread per dreq.
1162 * Can't take the lock because need to do put_lseg 1175 * Can't take the lock because need to do pnfs_put_lseg
1163 */ 1176 */
1164 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { 1177 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
1165 if (transfer_commit_list(&b->written, dst, cinfo, 0)) { 1178 if (transfer_commit_list(&b->written, dst, cinfo, 0)) {
1166 BUG_ON(!list_empty(&b->written)); 1179 BUG_ON(!list_empty(&b->written));
1167 put_lseg(b->wlseg); 1180 pnfs_put_lseg(b->wlseg);
1168 b->wlseg = NULL; 1181 b->wlseg = NULL;
1169 } 1182 }
1170 } 1183 }
@@ -1200,7 +1213,7 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
1200 if (list_empty(&bucket->committing)) 1213 if (list_empty(&bucket->committing))
1201 continue; 1214 continue;
1202 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo); 1215 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
1203 put_lseg(bucket->clseg); 1216 pnfs_put_lseg(bucket->clseg);
1204 bucket->clseg = NULL; 1217 bucket->clseg = NULL;
1205 } 1218 }
1206 /* Caller will clean up entries put on list */ 1219 /* Caller will clean up entries put on list */
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 43fe802dd678..dca47d786710 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -129,23 +129,13 @@ filelayout_mark_devid_invalid(struct nfs4_deviceid_node *node)
129} 129}
130 130
131static inline bool 131static inline bool
132filelayout_test_layout_invalid(struct pnfs_layout_hdr *lo)
133{
134 return test_bit(NFS_LAYOUT_INVALID, &lo->plh_flags);
135}
136
137static inline bool
138filelayout_test_devid_invalid(struct nfs4_deviceid_node *node) 132filelayout_test_devid_invalid(struct nfs4_deviceid_node *node)
139{ 133{
140 return test_bit(NFS_DEVICEID_INVALID, &node->flags); 134 return test_bit(NFS_DEVICEID_INVALID, &node->flags);
141} 135}
142 136
143static inline bool 137extern bool
144filelayout_reset_to_mds(struct pnfs_layout_segment *lseg) 138filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node);
145{
146 return filelayout_test_devid_invalid(FILELAYOUT_DEVID_NODE(lseg)) ||
147 filelayout_test_layout_invalid(lseg->pls_layout);
148}
149 139
150extern struct nfs_fh * 140extern struct nfs_fh *
151nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); 141nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
@@ -158,7 +148,7 @@ struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
158extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 148extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
159extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 149extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
160struct nfs4_file_layout_dsaddr * 150struct nfs4_file_layout_dsaddr *
161get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags); 151filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags);
162void nfs4_ds_disconnect(struct nfs_client *clp); 152void nfs4_ds_disconnect(struct nfs_client *clp);
163 153
164#endif /* FS_NFS_NFS4FILELAYOUT_H */ 154#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index f81231f30d94..3336d5eaf879 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -690,7 +690,7 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl
690 * of available devices, and return it. 690 * of available devices, and return it.
691 */ 691 */
692struct nfs4_file_layout_dsaddr * 692struct nfs4_file_layout_dsaddr *
693get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags) 693filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags)
694{ 694{
695 struct pnfs_device *pdev = NULL; 695 struct pnfs_device *pdev = NULL;
696 u32 max_resp_sz; 696 u32 max_resp_sz;
@@ -804,13 +804,14 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
804 struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx]; 804 struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
805 struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg); 805 struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
806 806
807 if (filelayout_test_devid_invalid(devid)) 807 if (filelayout_test_devid_unavailable(devid))
808 return NULL; 808 return NULL;
809 809
810 if (ds == NULL) { 810 if (ds == NULL) {
811 printk(KERN_ERR "NFS: %s: No data server for offset index %d\n", 811 printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
812 __func__, ds_idx); 812 __func__, ds_idx);
813 goto mark_dev_invalid; 813 filelayout_mark_devid_invalid(devid);
814 return NULL;
814 } 815 }
815 816
816 if (!ds->ds_clp) { 817 if (!ds->ds_clp) {
@@ -818,14 +819,12 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
818 int err; 819 int err;
819 820
820 err = nfs4_ds_connect(s, ds); 821 err = nfs4_ds_connect(s, ds);
821 if (err) 822 if (err) {
822 goto mark_dev_invalid; 823 nfs4_mark_deviceid_unavailable(devid);
824 return NULL;
825 }
823 } 826 }
824 return ds; 827 return ds;
825
826mark_dev_invalid:
827 filelayout_mark_devid_invalid(devid);
828 return NULL;
829} 828}
830 829
831module_param(dataserver_retrans, uint, 0644); 830module_param(dataserver_retrans, uint, 0644);
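
nfs4_fl_prepare_ds() now distinguishes two failure classes instead of funneling both through the old mark_dev_invalid label: a missing data-server entry still marks the deviceid invalid (sticky), while a connect failure only marks it unavailable, a separate state tracked by the generic deviceid cache via nfs4_test_deviceid_unavailable(), leaving room to retry the DS later. Condensed sketch of the patched tail of the function:

	if (ds == NULL) {
		filelayout_mark_devid_invalid(devid);	/* malformed device */
		return NULL;
	}
	if (!ds->ds_clp && nfs4_ds_connect(s, ds)) {
		nfs4_mark_deviceid_unavailable(devid);	/* transient failure */
		return NULL;
	}
	return ds;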
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 017b4b01a69c..79fbb61ce202 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -192,25 +192,13 @@ out:
 struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *clnt, struct inode *inode,
 					struct qstr *name)
 {
-	struct rpc_clnt *clone;
-	struct rpc_auth *auth;
 	rpc_authflavor_t flavor;
 
 	flavor = nfs4_negotiate_security(inode, name);
 	if ((int)flavor < 0)
-		return ERR_PTR(flavor);
+		return ERR_PTR((int)flavor);
 
-	clone = rpc_clone_client(clnt);
-	if (IS_ERR(clone))
-		return clone;
-
-	auth = rpcauth_create(flavor, clone);
-	if (!auth) {
-		rpc_shutdown_client(clone);
-		clone = ERR_PTR(-EIO);
-	}
-
-	return clone;
+	return rpc_clone_client_set_auth(clnt, flavor);
 }
 
 static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
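nfs4_create_sec_client() now leans entirely on the kernel's error-pointer idiom: a failed security negotiation is returned as ERR_PTR((int)flavor), and the old clone-then-rpcauth_create sequence collapses into a single rpc_clone_client_set_auth() call whose own ERR_PTR result propagates to the caller unchanged. A freestanding sketch of that idiom, with userspace re-implementations of the err.h helpers (the struct client and clone function are invented for illustration):

    #include <stdio.h>

    /* Minimal userspace re-implementation of the kernel's err.h helpers. */
    #define MAX_ERRNO 4095
    static inline void *ERR_PTR(long error) { return (void *)error; }
    static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
    static inline int IS_ERR(const void *ptr)
    {
            return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    struct client { int flavor; };

    static struct client *clone_client_set_auth(struct client *base, int flavor)
    {
            static struct client clone;

            if (flavor < 0)                 /* negotiation failed */
                    return ERR_PTR(flavor); /* error travels in the pointer */
            clone = *base;
            clone.flavor = flavor;
            return &clone;
    }

    int main(void)
    {
            struct client base = { 0 };
            struct client *c = clone_client_set_auth(&base, -13 /* -EACCES */);

            if (IS_ERR(c))
                    printf("clone failed: %ld\n", PTR_ERR(c));
            return 0;
    }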
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1e50326d00dd..68b21d81b7ac 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -104,6 +104,8 @@ static int nfs4_map_errors(int err)
 		return -EACCES;
 	case -NFS4ERR_MINOR_VERS_MISMATCH:
 		return -EPROTONOSUPPORT;
+	case -NFS4ERR_ACCESS:
+		return -EACCES;
 	default:
 		dprintk("%s could not handle NFSv4 error %d\n",
 				__func__, -err);
@@ -150,6 +152,12 @@ static const u32 nfs4_pnfs_open_bitmap[3] = {
 	FATTR4_WORD2_MDSTHRESHOLD
 };
 
+static const u32 nfs4_open_noattr_bitmap[3] = {
+	FATTR4_WORD0_TYPE
+	| FATTR4_WORD0_CHANGE
+	| FATTR4_WORD0_FILEID,
+};
+
 const u32 nfs4_statfs_bitmap[2] = {
 	FATTR4_WORD0_FILES_AVAIL
 	| FATTR4_WORD0_FILES_FREE
@@ -832,6 +840,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
 	p->o_res.seqid = p->o_arg.seqid;
 	p->c_res.seqid = p->c_arg.seqid;
 	p->o_res.server = p->o_arg.server;
+	p->o_res.access_request = p->o_arg.access;
 	nfs_fattr_init(&p->f_attr);
 	nfs_fattr_init_names(&p->f_attr, &p->owner_name, &p->group_name);
 }
@@ -860,6 +869,14 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	p->o_arg.fh = NFS_FH(dir);
 	p->o_arg.open_flags = flags;
 	p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
+	/* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS
+	 * will return permission denied for all bits until close */
+	if (!(flags & O_EXCL)) {
+		/* ask server to check for all possible rights as results
+		 * are cached */
+		p->o_arg.access = NFS4_ACCESS_READ | NFS4_ACCESS_MODIFY |
+				  NFS4_ACCESS_EXTEND | NFS4_ACCESS_EXECUTE;
+	}
 	p->o_arg.clientid = server->nfs_client->cl_clientid;
 	p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time);
 	p->o_arg.id.uniquifier = sp->so_seqid.owner_id;
@@ -1115,11 +1132,80 @@ out_return_state:
 	return state;
 }
 
-static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
+static void
+nfs4_opendata_check_deleg(struct nfs4_opendata *data, struct nfs4_state *state)
+{
+	struct nfs_client *clp = NFS_SERVER(state->inode)->nfs_client;
+	struct nfs_delegation *delegation;
+	int delegation_flags = 0;
+
+	rcu_read_lock();
+	delegation = rcu_dereference(NFS_I(state->inode)->delegation);
+	if (delegation)
+		delegation_flags = delegation->flags;
+	rcu_read_unlock();
+	if (data->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR) {
+		pr_err_ratelimited("NFS: Broken NFSv4 server %s is "
+				   "returning a delegation for "
+				   "OPEN(CLAIM_DELEGATE_CUR)\n",
+				   clp->cl_hostname);
+	} else if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
+		nfs_inode_set_delegation(state->inode,
+					 data->owner->so_cred,
+					 &data->o_res);
+	else
+		nfs_inode_reclaim_delegation(state->inode,
+					     data->owner->so_cred,
+					     &data->o_res);
+}
+
+/*
+ * Check the inode attributes against the CLAIM_PREVIOUS returned attributes
+ * and update the nfs4_state.
+ */
+static struct nfs4_state *
+_nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
+{
+	struct inode *inode = data->state->inode;
+	struct nfs4_state *state = data->state;
+	int ret;
+
+	if (!data->rpc_done) {
+		ret = data->rpc_status;
+		goto err;
+	}
+
+	ret = -ESTALE;
+	if (!(data->f_attr.valid & NFS_ATTR_FATTR_TYPE) ||
+	    !(data->f_attr.valid & NFS_ATTR_FATTR_FILEID) ||
+	    !(data->f_attr.valid & NFS_ATTR_FATTR_CHANGE))
+		goto err;
+
+	ret = -ENOMEM;
+	state = nfs4_get_open_state(inode, data->owner);
+	if (state == NULL)
+		goto err;
+
+	ret = nfs_refresh_inode(inode, &data->f_attr);
+	if (ret)
+		goto err;
+
+	if (data->o_res.delegation_type != 0)
+		nfs4_opendata_check_deleg(data, state);
+	update_open_stateid(state, &data->o_res.stateid, NULL,
+			    data->o_arg.fmode);
+
+	return state;
+err:
+	return ERR_PTR(ret);
+
+}
+
+static struct nfs4_state *
+_nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
 {
 	struct inode *inode;
 	struct nfs4_state *state = NULL;
-	struct nfs_delegation *delegation;
 	int ret;
 
 	if (!data->rpc_done) {
@@ -1138,30 +1224,8 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data
 	state = nfs4_get_open_state(inode, data->owner);
 	if (state == NULL)
 		goto err_put_inode;
-	if (data->o_res.delegation_type != 0) {
-		struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
-		int delegation_flags = 0;
-
-		rcu_read_lock();
-		delegation = rcu_dereference(NFS_I(inode)->delegation);
-		if (delegation)
-			delegation_flags = delegation->flags;
-		rcu_read_unlock();
-		if (data->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR) {
-			pr_err_ratelimited("NFS: Broken NFSv4 server %s is "
-					"returning a delegation for "
-					"OPEN(CLAIM_DELEGATE_CUR)\n",
-					clp->cl_hostname);
-		} else if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
-			nfs_inode_set_delegation(state->inode,
-					data->owner->so_cred,
-					&data->o_res);
-		else
-			nfs_inode_reclaim_delegation(state->inode,
-					data->owner->so_cred,
-					&data->o_res);
-	}
-
+	if (data->o_res.delegation_type != 0)
+		nfs4_opendata_check_deleg(data, state);
 	update_open_stateid(state, &data->o_res.stateid, NULL,
 			data->o_arg.fmode);
 	iput(inode);
@@ -1173,6 +1237,14 @@ err:
 	return ERR_PTR(ret);
 }
 
+static struct nfs4_state *
+nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
+{
+	if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS)
+		return _nfs4_opendata_reclaim_to_nfs4_state(data);
+	return _nfs4_opendata_to_nfs4_state(data);
+}
+
 static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state)
 {
 	struct nfs_inode *nfsi = NFS_I(state->inode);
@@ -1494,6 +1566,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 	data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid;
 	if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) {
 		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
+		data->o_arg.open_bitmap = &nfs4_open_noattr_bitmap[0];
 		nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
 	}
 	data->timestamp = jiffies;
@@ -1526,7 +1599,8 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata)
 		return;
 
 	if (task->tk_status == 0) {
-		switch (data->o_res.f_attr->mode & S_IFMT) {
+		if (data->o_res.f_attr->valid & NFS_ATTR_FATTR_TYPE) {
+			switch (data->o_res.f_attr->mode & S_IFMT) {
 			case S_IFREG:
 				break;
 			case S_IFLNK:
@@ -1537,6 +1611,7 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata)
 				break;
 			default:
 				data->rpc_status = -ENOTDIR;
+			}
 		}
 		renew_lease(data->o_res.server, data->timestamp);
 		if (!(data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM))
@@ -1643,6 +1718,39 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
 	return status;
 }
 
+static int nfs4_opendata_access(struct rpc_cred *cred,
+				struct nfs4_opendata *opendata,
+				struct nfs4_state *state, fmode_t fmode)
+{
+	struct nfs_access_entry cache;
+	u32 mask;
+
+	/* access call failed or for some reason the server doesn't
+	 * support any access modes -- defer access call until later */
+	if (opendata->o_res.access_supported == 0)
+		return 0;
+
+	mask = 0;
+	/* don't check MAY_WRITE - a newly created file may not have
+	 * write mode bits, but POSIX allows the creating process to write */
+	if (fmode & FMODE_READ)
+		mask |= MAY_READ;
+	if (fmode & FMODE_EXEC)
+		mask |= MAY_EXEC;
+
+	cache.cred = cred;
+	cache.jiffies = jiffies;
+	nfs_access_set_mask(&cache, opendata->o_res.access_result);
+	nfs_access_add_cache(state->inode, &cache);
+
+	if ((mask & ~cache.mask & (MAY_READ | MAY_EXEC)) == 0)
+		return 0;
+
+	/* even though OPEN succeeded, access is denied. Close the file */
+	nfs4_close_state(state, fmode);
+	return -NFS4ERR_ACCESS;
+}
+
 /*
  * Note: On error, nfs4_proc_open will free the struct nfs4_opendata
  */
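nfs4_opendata_access() above consumes the ACCESS results piggybacked on the OPEN reply: the full server-granted mask is cached for later permission checks, and only the bits this opener actually needs are verified now — read and exec, never write, since a freshly created file may legitimately lack write mode bits. A self-contained sketch of just that mask test (the constant values are copied here only for illustration; the authoritative ones live in the kernel headers):

    #include <stdio.h>

    /* Illustrative constants; the kernel's real values are in <linux/fs.h>. */
    #define MAY_EXEC  0x01
    #define MAY_WRITE 0x02
    #define MAY_READ  0x04

    #define FMODE_READ  0x1
    #define FMODE_WRITE 0x2
    #define FMODE_EXEC  0x20

    /* Returns 0 if the granted access bits satisfy the open mode, -1 otherwise. */
    static int opendata_access_ok(unsigned granted_mask, unsigned fmode)
    {
            unsigned mask = 0;

            if (fmode & FMODE_READ)
                    mask |= MAY_READ;
            if (fmode & FMODE_EXEC)
                    mask |= MAY_EXEC;
            /* deliberately no MAY_WRITE check -- see the comment in the hunk */

            return (mask & ~granted_mask & (MAY_READ | MAY_EXEC)) == 0 ? 0 : -1;
    }

    int main(void)
    {
            /* server granted read but not exec; opener wanted read+exec */
            printf("%d\n", opendata_access_ok(MAY_READ, FMODE_READ | FMODE_EXEC));
            return 0;
    }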
@@ -1774,7 +1882,11 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
 		 * informs us the stateid is unrecognized. */
 		if (status != -NFS4ERR_BAD_STATEID)
 			nfs41_free_stateid(server, stateid);
+		nfs_remove_bad_delegation(state->inode);
 
+		write_seqlock(&state->seqlock);
+		nfs4_stateid_copy(&state->stateid, &state->open_stateid);
+		write_sequnlock(&state->seqlock);
 		clear_bit(NFS_DELEGATED_STATE, &state->flags);
 	}
 }
@@ -1790,7 +1902,7 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
 static int nfs41_check_open_stateid(struct nfs4_state *state)
 {
 	struct nfs_server *server = NFS_SERVER(state->inode);
-	nfs4_stateid *stateid = &state->stateid;
+	nfs4_stateid *stateid = &state->open_stateid;
 	int status;
 
 	/* If a state reset has been done, test_stateid is unneeded */
@@ -1896,6 +2008,10 @@ static int _nfs4_do_open(struct inode *dir,
 	if (server->caps & NFS_CAP_POSIX_LOCK)
 		set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
 
+	status = nfs4_opendata_access(cred, opendata, state, fmode);
+	if (status != 0)
+		goto err_opendata_put;
+
 	if (opendata->o_arg.open_flags & O_EXCL) {
 		nfs4_exclusive_attrset(opendata, sattr);
 
@@ -1941,7 +2057,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
 	struct nfs4_state *res;
 	int status;
 
-	fmode &= FMODE_READ|FMODE_WRITE;
+	fmode &= FMODE_READ|FMODE_WRITE|FMODE_EXEC;
 	do {
 		status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred,
 				       &res, ctx_th);
@@ -2013,8 +2129,12 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 	nfs_fattr_init(fattr);
 
 	if (state != NULL) {
+		struct nfs_lockowner lockowner = {
+			.l_owner = current->files,
+			.l_pid = current->tgid,
+		};
 		nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE,
-				current->files, current->tgid);
+				&lockowner);
 	} else if (nfs4_copy_delegation_stateid(&arg.stateid, inode,
 				FMODE_WRITE)) {
 		/* Use that stateid */
@@ -2133,6 +2253,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 {
 	struct nfs4_closedata *calldata = data;
 	struct nfs4_state *state = calldata->state;
+	struct inode *inode = calldata->inode;
 	int call_close = 0;
 
 	dprintk("%s: begin!\n", __func__);
@@ -2166,16 +2287,13 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 	if (calldata->arg.fmode == 0) {
 		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
 		if (calldata->roc &&
-		    pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) {
-			rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq,
-					task, NULL);
+		    pnfs_roc_drain(inode, &calldata->roc_barrier, task))
 			goto out;
-		}
 	}
 
 	nfs_fattr_init(calldata->res.fattr);
 	calldata->timestamp = jiffies;
-	if (nfs4_setup_sequence(NFS_SERVER(calldata->inode),
+	if (nfs4_setup_sequence(NFS_SERVER(inode),
 				&calldata->arg.seq_args,
 				&calldata->res.seq_res,
 				task))
@@ -2202,7 +2320,7 @@ static const struct rpc_call_ops nfs4_close_ops = {
  *
  * NOTE: Caller must be holding the sp->so_owner semaphore!
  */
-int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc)
+int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
 {
 	struct nfs_server *server = NFS_SERVER(state->inode);
 	struct nfs4_closedata *calldata;
@@ -2238,7 +2356,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
 	calldata->res.fattr = &calldata->fattr;
 	calldata->res.seqid = calldata->arg.seqid;
 	calldata->res.server = server;
-	calldata->roc = roc;
+	calldata->roc = pnfs_roc(state->inode);
 	nfs_sb_active(calldata->inode->i_sb);
 
 	msg.rpc_argp = &calldata->arg;
@@ -2255,8 +2373,6 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
 out_free_calldata:
 	kfree(calldata);
 out:
-	if (roc)
-		pnfs_roc_release(state->inode);
 	nfs4_put_open_state(state);
 	nfs4_put_state_owner(sp);
 	return status;
@@ -2399,7 +2515,7 @@ static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandl
 	int ret;
 
 	auth = rpcauth_create(flavor, server->client);
-	if (!auth) {
+	if (IS_ERR(auth)) {
 		ret = -EIO;
 		goto out;
 	}
@@ -2767,13 +2883,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
 
 	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
 	if (!status) {
-		entry->mask = 0;
-		if (res.access & NFS4_ACCESS_READ)
-			entry->mask |= MAY_READ;
-		if (res.access & (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE))
-			entry->mask |= MAY_WRITE;
-		if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE))
-			entry->mask |= MAY_EXEC;
+		nfs_access_set_mask(entry, res.access);
 		nfs_refresh_inode(inode, res.fattr);
 	}
 	nfs_free_fattr(res.fattr);
@@ -3362,8 +3472,11 @@ static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, s
 
 	nfs_fattr_init(fsinfo->fattr);
 	error = nfs4_do_fsinfo(server, fhandle, fsinfo);
-	if (error == 0)
+	if (error == 0) {
+		/* block layout checks this! */
+		server->pnfs_blksize = fsinfo->blksize;
 		set_pnfs_layoutdriver(server, fhandle, fsinfo->layouttype);
+	}
 
 	return error;
 }
@@ -4007,6 +4120,36 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp,
 	memcpy(bootverf->data, verf, sizeof(bootverf->data));
 }
 
+static unsigned int
+nfs4_init_nonuniform_client_string(const struct nfs_client *clp,
+				   char *buf, size_t len)
+{
+	unsigned int result;
+
+	rcu_read_lock();
+	result = scnprintf(buf, len, "Linux NFSv4.0 %s/%s %s",
+				clp->cl_ipaddr,
+				rpc_peeraddr2str(clp->cl_rpcclient,
+							RPC_DISPLAY_ADDR),
+				rpc_peeraddr2str(clp->cl_rpcclient,
+							RPC_DISPLAY_PROTO));
+	rcu_read_unlock();
+	return result;
+}
+
+static unsigned int
+nfs4_init_uniform_client_string(const struct nfs_client *clp,
+				char *buf, size_t len)
+{
+	char *nodename = clp->cl_rpcclient->cl_nodename;
+
+	if (nfs4_client_id_uniquifier[0] != '\0')
+		nodename = nfs4_client_id_uniquifier;
+	return scnprintf(buf, len, "Linux NFSv%u.%u %s",
+			clp->rpc_ops->version, clp->cl_minorversion,
+			nodename);
+}
+
 /**
  * nfs4_proc_setclientid - Negotiate client ID
  * @clp: state data structure
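Both helpers above build the nfs_client_id4 string with scnprintf(), which (unlike a raw snprintf) returns the number of characters actually written to the buffer, never more than it holds — exactly the value sc_name_len needs. A rough userspace approximation of the uniform variant, clamping snprintf's return the way scnprintf does (the function name and parameters here are invented for the demo):

    #include <stdio.h>

    /* scnprintf-like clamp: never report more than what fits in buf. */
    static unsigned int init_uniform_client_string(char *buf, size_t len,
                                                   unsigned v, unsigned minor,
                                                   const char *nodename)
    {
            int n = snprintf(buf, len, "Linux NFSv%u.%u %s", v, minor, nodename);

            if (n < 0)
                    return 0;
            return (size_t)n < len ? (unsigned int)n : (unsigned int)(len - 1);
    }

    int main(void)
    {
            char name[64];
            unsigned int n = init_uniform_client_string(name, sizeof(name),
                                                        4, 1, "client.example");

            printf("%u: %s\n", n, name);
            return 0;
    }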
@@ -4037,15 +4180,18 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 
 	/* nfs_client_id4 */
 	nfs4_init_boot_verifier(clp, &sc_verifier);
-	rcu_read_lock();
-	setclientid.sc_name_len = scnprintf(setclientid.sc_name,
-			sizeof(setclientid.sc_name), "%s/%s %s",
-			clp->cl_ipaddr,
-			rpc_peeraddr2str(clp->cl_rpcclient,
-					RPC_DISPLAY_ADDR),
-			rpc_peeraddr2str(clp->cl_rpcclient,
-					RPC_DISPLAY_PROTO));
+	if (test_bit(NFS_CS_MIGRATION, &clp->cl_flags))
+		setclientid.sc_name_len =
+			nfs4_init_uniform_client_string(clp,
+					setclientid.sc_name,
+					sizeof(setclientid.sc_name));
+	else
+		setclientid.sc_name_len =
+			nfs4_init_nonuniform_client_string(clp,
+					setclientid.sc_name,
+					sizeof(setclientid.sc_name));
 	/* cb_client4 */
+	rcu_read_lock();
 	setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
 				sizeof(setclientid.sc_netid),
 				rpc_peeraddr2str(clp->cl_rpcclient,
@@ -4391,7 +4537,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
 
 	if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
 		return;
-	if ((calldata->lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) {
+	if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) {
 		/* Note: exit _without_ running nfs4_locku_done */
 		task->tk_action = NULL;
 		return;
@@ -4585,7 +4731,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
 	}
 	if (data->rpc_status == 0) {
 		nfs4_stateid_copy(&data->lsp->ls_stateid, &data->res.stateid);
-		data->lsp->ls_flags |= NFS_LOCK_INITIALIZED;
+		set_bit(NFS_LOCK_INITIALIZED, &data->lsp->ls_flags);
 		renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp);
 	}
 out:
@@ -4632,7 +4778,7 @@ static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_
 	case -NFS4ERR_BAD_STATEID:
 		lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
 		if (new_lock_owner != 0 ||
-		    (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
+		    test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0)
 			nfs4_schedule_stateid_recovery(server, lsp->ls_state);
 		break;
 	case -NFS4ERR_STALE_STATEID:
@@ -4756,7 +4902,7 @@ static int nfs41_check_expired_locks(struct nfs4_state *state)
 	struct nfs_server *server = NFS_SERVER(state->inode);
 
 	list_for_each_entry(lsp, &state->lock_states, ls_locks) {
-		if (lsp->ls_flags & NFS_LOCK_INITIALIZED) {
+		if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
 			status = nfs41_test_stateid(server, &lsp->ls_stateid);
 			if (status != NFS_OK) {
 				/* Free the stateid unless the server
@@ -4764,7 +4910,7 @@ static int nfs41_check_expired_locks(struct nfs4_state *state)
 				if (status != -NFS4ERR_BAD_STATEID)
 					nfs41_free_stateid(server,
 							&lsp->ls_stateid);
-				lsp->ls_flags &= ~NFS_LOCK_INITIALIZED;
+				clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
 				ret = status;
 			}
 		}
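These hunks, and several more in nfs4state.c below, convert ls_flags from open-coded bitmask arithmetic (|=, &= ~, &) to the kernel's set_bit/clear_bit/test_bit API; those helpers update the flag word atomically, so concurrent writers no longer need a lock just to flip a flag. A compile-alone sketch of the same conversion, substituting compiler atomic builtins for the kernel's arch-specific implementations:

    #include <stdio.h>

    #define NFS_LOCK_INITIALIZED 0  /* bit number, as in the kernel enum */

    /* Stand-ins for the kernel's set_bit/clear_bit/test_bit, built on
     * GCC/Clang atomic builtins rather than arch-specific instructions. */
    static void set_bit(int nr, unsigned long *addr)
    {
            __atomic_fetch_or(addr, 1UL << nr, __ATOMIC_RELAXED);
    }
    static void clear_bit(int nr, unsigned long *addr)
    {
            __atomic_fetch_and(addr, ~(1UL << nr), __ATOMIC_RELAXED);
    }
    static int test_bit(int nr, const unsigned long *addr)
    {
            return (__atomic_load_n(addr, __ATOMIC_RELAXED) >> nr) & 1;
    }

    int main(void)
    {
            unsigned long ls_flags = 0;

            set_bit(NFS_LOCK_INITIALIZED, &ls_flags);   /* was: flags |= ...  */
            printf("%d\n", test_bit(NFS_LOCK_INITIALIZED, &ls_flags));
            clear_bit(NFS_LOCK_INITIALIZED, &ls_flags); /* was: flags &= ~... */
            printf("%d\n", test_bit(NFS_LOCK_INITIALIZED, &ls_flags));
            return 0;
    }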
@@ -5267,10 +5413,8 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 	};
 
 	nfs4_init_boot_verifier(clp, &verifier);
-	args.id_len = scnprintf(args.id, sizeof(args.id),
-				"%s/%s",
-				clp->cl_ipaddr,
-				clp->cl_rpcclient->cl_nodename);
+	args.id_len = nfs4_init_uniform_client_string(clp, args.id,
+						      sizeof(args.id));
 	dprintk("NFS call exchange_id auth=%s, '%.*s'\n",
 		clp->cl_rpcclient->cl_auth->au_ops->au_name,
 		args.id_len, args.id);
@@ -5391,6 +5535,8 @@ int nfs4_destroy_clientid(struct nfs_client *clp)
 		goto out;
 	if (clp->cl_exchange_flags == 0)
 		goto out;
+	if (clp->cl_preserve_clid)
+		goto out;
 	cred = nfs4_get_exchange_id_cred(clp);
 	ret = nfs4_proc_destroy_clientid(clp, cred);
 	if (cred)
@@ -6196,26 +6342,44 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
 static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_layoutget *lgp = calldata;
-	struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+	struct inode *inode = lgp->args.inode;
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct pnfs_layout_hdr *lo;
+	struct nfs4_state *state = NULL;
 
 	dprintk("--> %s\n", __func__);
 
 	if (!nfs4_sequence_done(task, &lgp->res.seq_res))
-		return;
+		goto out;
 
 	switch (task->tk_status) {
 	case 0:
-		break;
+		goto out;
 	case -NFS4ERR_LAYOUTTRYLATER:
 	case -NFS4ERR_RECALLCONFLICT:
 		task->tk_status = -NFS4ERR_DELAY;
-		/* Fall through */
-	default:
-		if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
-			rpc_restart_call_prepare(task);
-			return;
+		break;
+	case -NFS4ERR_EXPIRED:
+	case -NFS4ERR_BAD_STATEID:
+		spin_lock(&inode->i_lock);
+		lo = NFS_I(inode)->layout;
+		if (!lo || list_empty(&lo->plh_segs)) {
+			spin_unlock(&inode->i_lock);
+			/* If the open stateid was bad, then recover it. */
+			state = lgp->args.ctx->state;
+		} else {
+			LIST_HEAD(head);
+
+			pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
+			spin_unlock(&inode->i_lock);
+			/* Mark the bad layout state as invalid, then
+			 * retry using the open stateid. */
+			pnfs_free_lseg_list(&head);
 		}
 	}
+	if (nfs4_async_handle_error(task, server, state) == -EAGAIN)
+		rpc_restart_call_prepare(task);
+out:
 	dprintk("<-- %s\n", __func__);
 }
 
@@ -6282,7 +6446,8 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = {
 	.rpc_release = nfs4_layoutget_release,
 };
 
-void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
+struct pnfs_layout_segment *
+nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
 {
 	struct nfs_server *server = NFS_SERVER(lgp->args.inode);
 	size_t max_pages = max_response_pages(server);
@@ -6299,6 +6464,7 @@ void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
 		.callback_data = lgp,
 		.flags = RPC_TASK_ASYNC,
 	};
+	struct pnfs_layout_segment *lseg = NULL;
 	int status = 0;
 
 	dprintk("--> %s\n", __func__);
@@ -6306,7 +6472,7 @@ void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
 	lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags);
 	if (!lgp->args.layout.pages) {
 		nfs4_layoutget_release(lgp);
-		return;
+		return ERR_PTR(-ENOMEM);
 	}
 	lgp->args.layout.pglen = max_pages * PAGE_SIZE;
 
@@ -6315,15 +6481,17 @@ void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
 	nfs41_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0);
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
-		return;
+		return ERR_CAST(task);
 	status = nfs4_wait_for_completion_rpc_task(task);
 	if (status == 0)
 		status = task->tk_status;
 	if (status == 0)
-		status = pnfs_layout_process(lgp);
+		lseg = pnfs_layout_process(lgp);
 	rpc_put_task(task);
 	dprintk("<-- %s status=%d\n", __func__, status);
-	return;
+	if (status)
+		return ERR_PTR(status);
+	return lseg;
 }
 
 static void
@@ -6342,7 +6510,6 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_layoutreturn *lrp = calldata;
 	struct nfs_server *server;
-	struct pnfs_layout_hdr *lo = lrp->args.layout;
 
 	dprintk("--> %s\n", __func__);
 
@@ -6354,20 +6521,21 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
 		rpc_restart_call_prepare(task);
 		return;
 	}
-	spin_lock(&lo->plh_inode->i_lock);
-	if (task->tk_status == 0 && lrp->res.lrs_present)
-		pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
-	lo->plh_block_lgets--;
-	spin_unlock(&lo->plh_inode->i_lock);
 	dprintk("<-- %s\n", __func__);
 }
 
 static void nfs4_layoutreturn_release(void *calldata)
 {
 	struct nfs4_layoutreturn *lrp = calldata;
+	struct pnfs_layout_hdr *lo = lrp->args.layout;
 
 	dprintk("--> %s\n", __func__);
-	put_layout_hdr(lrp->args.layout);
+	spin_lock(&lo->plh_inode->i_lock);
+	if (lrp->res.lrs_present)
+		pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
+	lo->plh_block_lgets--;
+	spin_unlock(&lo->plh_inode->i_lock);
+	pnfs_put_layout_hdr(lrp->args.layout);
 	kfree(calldata);
 	dprintk("<-- %s\n", __func__);
 }
@@ -6541,7 +6709,7 @@ static void nfs4_layoutcommit_release(void *calldata)
 		list_del_init(&lseg->pls_lc_list);
 		if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT,
 				       &lseg->pls_flags))
-			put_lseg(lseg);
+			pnfs_put_lseg(lseg);
 	}
 
 	clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
@@ -6800,6 +6968,7 @@ static const struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
 	.recover_lock	= nfs4_lock_reclaim,
 	.establish_clid = nfs4_init_clientid,
 	.get_clid_cred	= nfs4_get_setclientid_cred,
+	.detect_trunking = nfs40_discover_server_trunking,
 };
 
 #if defined(CONFIG_NFS_V4_1)
@@ -6811,6 +6980,7 @@ static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
 	.establish_clid = nfs41_init_clientid,
 	.get_clid_cred	= nfs4_get_exchange_id_cred,
 	.reclaim_complete = nfs41_proc_reclaim_complete,
+	.detect_trunking = nfs41_discover_server_trunking,
 };
 #endif /* CONFIG_NFS_V4_1 */
 
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 55148def5540..c351e6b39838 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -51,18 +51,21 @@
 #include <linux/bitops.h>
 #include <linux/jiffies.h>
 
+#include <linux/sunrpc/clnt.h>
+
 #include "nfs4_fs.h"
 #include "callback.h"
 #include "delegation.h"
 #include "internal.h"
 #include "pnfs.h"
+#include "netns.h"
 
 #define NFSDBG_FACILITY		NFSDBG_STATE
 
 #define OPENOWNER_POOL_SIZE 8
 
 const nfs4_stateid zero_stateid;
-
+static DEFINE_MUTEX(nfs_clid_init_mutex);
 static LIST_HEAD(nfs4_clientid_list);
 
 int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
@@ -73,12 +76,13 @@ int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
 	};
 	unsigned short port;
 	int status;
+	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
 
 	if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state))
 		goto do_confirm;
-	port = nfs_callback_tcpport;
+	port = nn->nfs_callback_tcpport;
 	if (clp->cl_addr.ss_family == AF_INET6)
-		port = nfs_callback_tcpport6;
+		port = nn->nfs_callback_tcpport6;
 
 	status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid);
 	if (status != 0)
@@ -96,6 +100,56 @@ out:
 	return status;
 }
 
+/**
+ * nfs40_discover_server_trunking - Detect server IP address trunking (mv0)
+ *
+ * @clp: nfs_client under test
+ * @result: OUT: found nfs_client, or clp
+ * @cred: credential to use for trunking test
+ *
+ * Returns zero, a negative errno, or a negative NFS4ERR status.
+ * If zero is returned, an nfs_client pointer is planted in
+ * "result".
+ *
+ * Note: The returned client may not yet be marked ready.
+ */
+int nfs40_discover_server_trunking(struct nfs_client *clp,
+				   struct nfs_client **result,
+				   struct rpc_cred *cred)
+{
+	struct nfs4_setclientid_res clid = {
+		.clientid = clp->cl_clientid,
+		.confirm = clp->cl_confirm,
+	};
+	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+	unsigned short port;
+	int status;
+
+	port = nn->nfs_callback_tcpport;
+	if (clp->cl_addr.ss_family == AF_INET6)
+		port = nn->nfs_callback_tcpport6;
+
+	status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid);
+	if (status != 0)
+		goto out;
+	clp->cl_clientid = clid.clientid;
+	clp->cl_confirm = clid.confirm;
+
+	status = nfs40_walk_client_list(clp, result, cred);
+	switch (status) {
+	case -NFS4ERR_STALE_CLIENTID:
+		set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+	case 0:
+		/* Sustain the lease, even if it's empty.  If the clientid4
+		 * goes stale it's of no use for trunking discovery. */
+		nfs4_schedule_state_renewal(*result);
+		break;
+	}
+
+out:
+	return status;
+}
+
 struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp)
 {
 	struct rpc_cred *cred = NULL;
@@ -275,6 +329,33 @@ out:
 	return status;
 }
 
+/**
+ * nfs41_discover_server_trunking - Detect server IP address trunking (mv1)
+ *
+ * @clp: nfs_client under test
+ * @result: OUT: found nfs_client, or clp
+ * @cred: credential to use for trunking test
+ *
+ * Returns NFS4_OK, a negative errno, or a negative NFS4ERR status.
+ * If NFS4_OK is returned, an nfs_client pointer is planted in
+ * "result".
+ *
+ * Note: The returned client may not yet be marked ready.
+ */
+int nfs41_discover_server_trunking(struct nfs_client *clp,
+				   struct nfs_client **result,
+				   struct rpc_cred *cred)
+{
+	int status;
+
+	status = nfs4_proc_exchange_id(clp, cred);
+	if (status != NFS4_OK)
+		return status;
+	set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+
+	return nfs41_walk_client_list(clp, result, cred);
+}
+
 struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp)
 {
 	struct rpc_cred *cred;
@@ -729,11 +810,8 @@ static void __nfs4_close(struct nfs4_state *state,
 	if (!call_close) {
 		nfs4_put_open_state(state);
 		nfs4_put_state_owner(owner);
-	} else {
-		bool roc = pnfs_roc(state->inode);
-
-		nfs4_do_close(state, gfp_mask, wait, roc);
-	}
+	} else
+		nfs4_do_close(state, gfp_mask, wait);
 }
 
 void nfs4_close_state(struct nfs4_state *state, fmode_t fmode)
@@ -865,7 +943,7 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
 	if (list_empty(&state->lock_states))
 		clear_bit(LK_STATE_IN_USE, &state->flags);
 	spin_unlock(&state->state_lock);
-	if (lsp->ls_flags & NFS_LOCK_INITIALIZED) {
+	if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
 		if (nfs4_release_lockowner(lsp) == 0)
 			return;
 	}
@@ -911,17 +989,25 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
 }
 
 static bool nfs4_copy_lock_stateid(nfs4_stateid *dst, struct nfs4_state *state,
-		fl_owner_t fl_owner, pid_t fl_pid)
+		const struct nfs_lockowner *lockowner)
 {
 	struct nfs4_lock_state *lsp;
+	fl_owner_t fl_owner;
+	pid_t fl_pid;
 	bool ret = false;
 
+
+	if (lockowner == NULL)
+		goto out;
+
 	if (test_bit(LK_STATE_IN_USE, &state->flags) == 0)
 		goto out;
 
+	fl_owner = lockowner->l_owner;
+	fl_pid = lockowner->l_pid;
 	spin_lock(&state->state_lock);
 	lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
-	if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) {
+	if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
 		nfs4_stateid_copy(dst, &lsp->ls_stateid);
 		ret = true;
 	}
@@ -946,11 +1032,11 @@ static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
  * requests.
  */
 void nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state,
-	fmode_t fmode, fl_owner_t fl_owner, pid_t fl_pid)
+	fmode_t fmode, const struct nfs_lockowner *lockowner)
 {
 	if (nfs4_copy_delegation_stateid(dst, state->inode, fmode))
 		return;
-	if (nfs4_copy_lock_stateid(dst, state, fl_owner, fl_pid))
+	if (nfs4_copy_lock_stateid(dst, state, lockowner))
 		return;
 	nfs4_copy_open_stateid(dst, state);
 }
@@ -1289,7 +1375,7 @@ restart:
 		if (status >= 0) {
 			spin_lock(&state->state_lock);
 			list_for_each_entry(lock, &state->lock_states, ls_locks) {
-				if (!(lock->ls_flags & NFS_LOCK_INITIALIZED))
+				if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags))
 					pr_warn_ratelimited("NFS: "
 						"%s: Lock reclaim "
 						"failed!\n", __func__);
@@ -1361,7 +1447,7 @@ static void nfs4_clear_open_state(struct nfs4_state *state)
 	spin_lock(&state->state_lock);
 	list_for_each_entry(lock, &state->lock_states, ls_locks) {
 		lock->ls_seqid.flags = 0;
-		lock->ls_flags &= ~NFS_LOCK_INITIALIZED;
+		clear_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags);
 	}
 	spin_unlock(&state->state_lock);
 }
@@ -1595,8 +1681,8 @@ out:
 	return nfs4_recovery_handle_error(clp, status);
 }
 
-/* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors
- * on EXCHANGE_ID for v4.1
+/* Set NFS4CLNT_LEASE_EXPIRED and reclaim reboot state for all v4.0 errors
+ * and for recoverable errors on EXCHANGE_ID for v4.1
  */
 static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
 {
@@ -1606,8 +1692,12 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
 		return -ESERVERFAULT;
 		/* Lease confirmation error: retry after purging the lease */
 		ssleep(1);
+		clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+		break;
 	case -NFS4ERR_STALE_CLIENTID:
 		clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+		nfs4_state_clear_reclaim_reboot(clp);
+		nfs4_state_start_reclaim_reboot(clp);
 		break;
 	case -NFS4ERR_CLID_INUSE:
 		pr_err("NFS: Server %s reports our clientid is in use\n",
@@ -1698,6 +1788,109 @@ static int nfs4_purge_lease(struct nfs_client *clp)
 	return 0;
 }
 
+/**
+ * nfs4_discover_server_trunking - Detect server IP address trunking
+ *
+ * @clp: nfs_client under test
+ * @result: OUT: found nfs_client, or clp
+ *
+ * Returns zero or a negative errno. If zero is returned,
+ * an nfs_client pointer is planted in "result".
+ *
+ * Note: since we are invoked in process context, and
+ * not from inside the state manager, we cannot use
+ * nfs4_handle_reclaim_lease_error().
+ */
+int nfs4_discover_server_trunking(struct nfs_client *clp,
+				  struct nfs_client **result)
+{
+	const struct nfs4_state_recovery_ops *ops =
+				clp->cl_mvops->reboot_recovery_ops;
+	rpc_authflavor_t *flavors, flav, save;
+	struct rpc_clnt *clnt;
+	struct rpc_cred *cred;
+	int i, len, status;
+
+	dprintk("NFS: %s: testing '%s'\n", __func__, clp->cl_hostname);
+
+	len = NFS_MAX_SECFLAVORS;
+	flavors = kcalloc(len, sizeof(*flavors), GFP_KERNEL);
+	if (flavors == NULL) {
+		status = -ENOMEM;
+		goto out;
+	}
+	len = rpcauth_list_flavors(flavors, len);
+	if (len < 0) {
+		status = len;
+		goto out_free;
+	}
+	clnt = clp->cl_rpcclient;
+	save = clnt->cl_auth->au_flavor;
+	i = 0;
+
+	mutex_lock(&nfs_clid_init_mutex);
+	status = -ENOENT;
+again:
+	cred = ops->get_clid_cred(clp);
+	if (cred == NULL)
+		goto out_unlock;
+
+	status = ops->detect_trunking(clp, result, cred);
+	put_rpccred(cred);
+	switch (status) {
+	case 0:
+		break;
+
+	case -EACCES:
+		if (clp->cl_machine_cred == NULL)
+			break;
+		/* Handle case where the user hasn't set up machine creds */
+		nfs4_clear_machine_cred(clp);
+	case -NFS4ERR_DELAY:
+	case -ETIMEDOUT:
+	case -EAGAIN:
+		ssleep(1);
+		dprintk("NFS: %s after status %d, retrying\n",
+			__func__, status);
+		goto again;
+
+	case -NFS4ERR_CLID_INUSE:
+	case -NFS4ERR_WRONGSEC:
+		status = -EPERM;
+		if (i >= len)
+			break;
+
+		flav = flavors[i++];
+		if (flav == save)
+			flav = flavors[i++];
+		clnt = rpc_clone_client_set_auth(clnt, flav);
+		if (IS_ERR(clnt)) {
+			status = PTR_ERR(clnt);
+			break;
+		}
+		clp->cl_rpcclient = clnt;
+		goto again;
+
+	case -NFS4ERR_MINOR_VERS_MISMATCH:
+		status = -EPROTONOSUPPORT;
+		break;
+
+	case -EKEYEXPIRED:
+		nfs4_warn_keyexpired(clp->cl_hostname);
+	case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
+				 * in nfs4_exchange_id */
+		status = -EKEYEXPIRED;
+	}
+
+out_unlock:
+	mutex_unlock(&nfs_clid_init_mutex);
+out_free:
+	kfree(flavors);
+out:
+	dprintk("NFS: %s: status = %d\n", __func__, status);
+	return status;
+}
+
 #ifdef CONFIG_NFS_V4_1
 void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
 {
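nfs4_discover_server_trunking() above is meant to be called from ordinary process context (not the state manager), serializes concurrent callers with nfs_clid_init_mutex, and walks the available auth flavors when the server rejects the current one. The mount path that calls it is outside this diff, so the following caller is only a hedged sketch of the intended usage — example_init_client is an invented name, while ERR_PTR and nfs_put_client are real kernel helpers:

    /* Hypothetical caller: detect trunking, then adopt the nfs_client that
     * already owns this server's clientid if one was found. */
    static struct nfs_client *example_init_client(struct nfs_client *clp)
    {
            struct nfs_client *old;
            int error;

            error = nfs4_discover_server_trunking(clp, &old);
            if (error < 0)
                    return ERR_PTR(error);
            if (old != clp) {
                    /* "old" is already established; drop the new client */
                    nfs_put_client(clp);
                    return old;
            }
            return clp;
    }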
@@ -2008,6 +2201,7 @@ out_error:
 	pr_warn_ratelimited("NFS: state manager%s%s failed on NFSv4 server %s"
 			" with error %d\n", section_sep, section,
 			clp->cl_hostname, -status);
+	ssleep(1);
 	nfs4_end_drain_session(clp);
 	nfs4_clear_state_manager_bit(clp);
 }
diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c
index 5729bc8aa75d..2628d921b7e3 100644
--- a/fs/nfs/nfs4sysctl.c
+++ b/fs/nfs/nfs4sysctl.c
@@ -9,6 +9,7 @@
 #include <linux/nfs_idmap.h>
 #include <linux/nfs_fs.h>
 
+#include "nfs4_fs.h"
 #include "callback.h"
 
 static const int nfs_set_port_min = 0;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 8dba6bd48557..40836ee5dc3a 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -447,12 +447,14 @@ static int nfs4_stat_to_errno(int);
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
 				encode_open_maxsz + \
+				encode_access_maxsz + \
 				encode_getfh_maxsz + \
 				encode_getattr_maxsz)
 #define NFS4_dec_open_sz	(compound_decode_hdr_maxsz + \
 				decode_sequence_maxsz + \
 				decode_putfh_maxsz + \
 				decode_open_maxsz + \
+				decode_access_maxsz + \
 				decode_getfh_maxsz + \
 				decode_getattr_maxsz)
 #define NFS4_enc_open_confirm_sz \
@@ -467,11 +469,13 @@ static int nfs4_stat_to_errno(int);
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
 				encode_open_maxsz + \
+				encode_access_maxsz + \
 				encode_getattr_maxsz)
 #define NFS4_dec_open_noattr_sz (compound_decode_hdr_maxsz + \
 				decode_sequence_maxsz + \
 				decode_putfh_maxsz + \
 				decode_open_maxsz + \
+				decode_access_maxsz + \
 				decode_getattr_maxsz)
 #define NFS4_enc_open_downgrade_sz \
 				(compound_encode_hdr_maxsz + \
@@ -1509,8 +1513,12 @@ static void encode_open_stateid(struct xdr_stream *xdr,
 	nfs4_stateid stateid;
 
 	if (ctx->state != NULL) {
+		const struct nfs_lockowner *lockowner = NULL;
+
+		if (l_ctx != NULL)
+			lockowner = &l_ctx->lockowner;
 		nfs4_select_rw_stateid(&stateid, ctx->state,
-				fmode, l_ctx->lockowner, l_ctx->pid);
+				fmode, lockowner);
 		if (zero_seqid)
 			stateid.seqid = 0;
 		encode_nfs4_stateid(xdr, &stateid);
@@ -2216,6 +2224,8 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_putfh(xdr, args->fh, &hdr);
 	encode_open(xdr, args, &hdr);
 	encode_getfh(xdr, &hdr);
+	if (args->access)
+		encode_access(xdr, args->access, &hdr);
 	encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr);
 	encode_nops(&hdr);
 }
@@ -2252,7 +2262,9 @@ static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req,
 	encode_sequence(xdr, &args->seq_args, &hdr);
 	encode_putfh(xdr, args->fh, &hdr);
 	encode_open(xdr, args, &hdr);
-	encode_getfattr(xdr, args->bitmask, &hdr);
+	if (args->access)
+		encode_access(xdr, args->access, &hdr);
+	encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr);
 	encode_nops(&hdr);
 }
 
@@ -4095,7 +4107,7 @@ out_overflow:
 	return -EIO;
 }
 
-static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
+static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access)
 {
 	__be32 *p;
 	uint32_t supp, acc;
@@ -4109,8 +4121,8 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
 		goto out_overflow;
 	supp = be32_to_cpup(p++);
 	acc = be32_to_cpup(p);
-	access->supported = supp;
-	access->access = acc;
+	*supported = supp;
+	*access = acc;
 	return 0;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
@@ -5642,7 +5654,8 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
 	 * and places the remaining xdr data in xdr_buf->tail
 	 */
 	pdev->mincount = be32_to_cpup(p);
-	xdr_read_pages(xdr, pdev->mincount); /* include space for the length */
+	if (xdr_read_pages(xdr, pdev->mincount) != pdev->mincount)
+		goto out_overflow;
 
 	/* Parse notification bitmap, verifying that it is zero. */
 	p = xdr_inline_decode(xdr, 4);
@@ -5887,7 +5900,7 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	status = decode_putfh(xdr);
 	if (status != 0)
 		goto out;
-	status = decode_access(xdr, res);
+	status = decode_access(xdr, &res->supported, &res->access);
 	if (status != 0)
 		goto out;
 	decode_getfattr(xdr, res->fattr, res->server);
@@ -6228,6 +6241,8 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	status = decode_getfh(xdr, &res->fh);
 	if (status)
 		goto out;
+	if (res->access_request)
+		decode_access(xdr, &res->access_supported, &res->access_result);
 	decode_getfattr(xdr, res->f_attr, res->server);
 out:
 	return status;
@@ -6276,6 +6291,8 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp,
 	status = decode_open(xdr, res);
 	if (status)
 		goto out;
+	if (res->access_request)
+		decode_access(xdr, &res->access_supported, &res->access_result);
 	decode_getfattr(xdr, res->f_attr, res->server);
 out:
 	return status;
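decode_access() now writes through two plain u32 out-parameters instead of filling a struct nfs4_accessres, which is what lets the OPEN and OPEN_NOATTR decoders above drop the optional ACCESS results straight into res->access_supported and res->access_result. The shape of that refactor in a freestanding form (the wire reply is faked as a two-word array purely for the demo):

    #include <stdio.h>

    typedef unsigned int u32;

    /* Before: the decoder was tied to one result struct. */
    struct accessres { u32 supported; u32 access; };

    /* After: any two u32 slots will do, so OPEN replies can reuse it. */
    static int decode_access(const u32 *wire, u32 *supported, u32 *access)
    {
            *supported = wire[0];
            *access = wire[1];
            return 0;
    }

    int main(void)
    {
            u32 wire[2] = { 0x3f, 0x0d };  /* made-up reply words */
            struct accessres res;
            u32 open_supported, open_result;

            decode_access(wire, &res.supported, &res.access);    /* ACCESS  */
            decode_access(wire, &open_supported, &open_result);  /* OPEN    */
            printf("%x %x\n", open_supported, open_result);
            return 0;
    }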
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index ea6d111b03e9..be731e6b7b9c 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -41,6 +41,7 @@
 #include <scsi/osd_ore.h>
 
 #include "objlayout.h"
+#include "../internal.h"
 
 #define NFSDBG_FACILITY         NFSDBG_PNFS_LD
 
@@ -606,8 +607,14 @@ static bool aligned_on_raid_stripe(u64 offset, struct ore_layout *layout,
 void objio_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
 {
 	unsigned long stripe_end = 0;
+	u64 wb_size;
 
-	pnfs_generic_pg_init_write(pgio, req);
+	if (pgio->pg_dreq == NULL)
+		wb_size = i_size_read(pgio->pg_inode) - req_offset(req);
+	else
+		wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+
+	pnfs_generic_pg_init_write(pgio, req, wb_size);
 	if (unlikely(pgio->pg_lseg == NULL))
 		return; /* Not pNFS */
 
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 311a79681e2b..e56e846e9d2d 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -102,6 +102,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
102 unsigned int offset, unsigned int count) 102 unsigned int offset, unsigned int count)
103{ 103{
104 struct nfs_page *req; 104 struct nfs_page *req;
105 struct nfs_lock_context *l_ctx;
105 106
106 /* try to allocate the request struct */ 107 /* try to allocate the request struct */
107 req = nfs_page_alloc(); 108 req = nfs_page_alloc();
@@ -109,11 +110,12 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
 		return ERR_PTR(-ENOMEM);
 
 	/* get lock context early so we can deal with alloc failures */
-	req->wb_lock_context = nfs_get_lock_context(ctx);
-	if (req->wb_lock_context == NULL) {
+	l_ctx = nfs_get_lock_context(ctx);
+	if (IS_ERR(l_ctx)) {
 		nfs_page_free(req);
-		return ERR_PTR(-ENOMEM);
+		return ERR_CAST(l_ctx);
 	}
+	req->wb_lock_context = l_ctx;
 
 	/* Initialize the request struct. Initially, we assume a
 	 * long write-back delay. This will be adjusted in
@@ -290,7 +292,9 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
 {
 	if (req->wb_context->cred != prev->wb_context->cred)
 		return false;
-	if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner)
+	if (req->wb_lock_context->lockowner.l_owner != prev->wb_lock_context->lockowner.l_owner)
+		return false;
+	if (req->wb_lock_context->lockowner.l_pid != prev->wb_lock_context->lockowner.l_pid)
 		return false;
 	if (req->wb_context->state != prev->wb_context->state)
 		return false;
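nfs_create_request() now preserves the real errno: nfs_get_lock_context() reports failure as an ERR_PTR-encoded pointer, and ERR_CAST() forwards it under a different pointer type instead of flattening every failure to -ENOMEM. A self-contained sketch of the idiom, with all names invented for illustration:

#include <linux/err.h>
#include <linux/slab.h>

struct bar { int x; };
struct foo { struct bar *b; };

static struct bar *bar_get(void)
{
	return ERR_PTR(-EAGAIN);	/* pretend the resource is busy */
}

static struct foo *foo_create(void)
{
	struct bar *b = bar_get();
	struct foo *f;

	if (IS_ERR(b))
		return ERR_CAST(b);	/* forward -EAGAIN, retype the pointer */
	f = kzalloc(sizeof(*f), GFP_KERNEL);
	if (f == NULL)
		return ERR_PTR(-ENOMEM);
	f->b = b;
	return f;
}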
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 2e00feacd4be..fe624c91bd00 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -35,6 +35,7 @@
 #include "iostat.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PNFS
+#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)
 
 /* Locking:
  *
@@ -190,7 +191,7 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
 
 /* Need to hold i_lock if caller does not already hold reference */
 void
-get_layout_hdr(struct pnfs_layout_hdr *lo)
+pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo)
 {
 	atomic_inc(&lo->plh_refcount);
 }
@@ -199,43 +200,107 @@ static struct pnfs_layout_hdr *
 pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
 {
 	struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
-	return ld->alloc_layout_hdr ? ld->alloc_layout_hdr(ino, gfp_flags) :
-		kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags);
+	return ld->alloc_layout_hdr(ino, gfp_flags);
 }
 
 static void
 pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
 {
-	struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;
+	struct nfs_server *server = NFS_SERVER(lo->plh_inode);
+	struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
+
+	if (!list_empty(&lo->plh_layouts)) {
+		struct nfs_client *clp = server->nfs_client;
+
+		spin_lock(&clp->cl_lock);
+		list_del_init(&lo->plh_layouts);
+		spin_unlock(&clp->cl_lock);
+	}
 	put_rpccred(lo->plh_lc_cred);
-	return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo);
+	return ld->free_layout_hdr(lo);
 }
 
 static void
-destroy_layout_hdr(struct pnfs_layout_hdr *lo)
+pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo)
 {
+	struct nfs_inode *nfsi = NFS_I(lo->plh_inode);
 	dprintk("%s: freeing layout cache %p\n", __func__, lo);
-	BUG_ON(!list_empty(&lo->plh_layouts));
-	NFS_I(lo->plh_inode)->layout = NULL;
-	pnfs_free_layout_hdr(lo);
+	nfsi->layout = NULL;
+	/* Reset MDS Threshold I/O counters */
+	nfsi->write_io = 0;
+	nfsi->read_io = 0;
+}
+
+void
+pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+	struct inode *inode = lo->plh_inode;
+
+	if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
+		pnfs_detach_layout_hdr(lo);
+		spin_unlock(&inode->i_lock);
+		pnfs_free_layout_hdr(lo);
+	}
+}
+
+static int
+pnfs_iomode_to_fail_bit(u32 iomode)
+{
+	return iomode == IOMODE_RW ?
+		NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
 }
 
 static void
-put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
+pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
 {
-	if (atomic_dec_and_test(&lo->plh_refcount))
-		destroy_layout_hdr(lo);
+	lo->plh_retry_timestamp = jiffies;
+	if (test_and_set_bit(fail_bit, &lo->plh_flags))
+		atomic_inc(&lo->plh_refcount);
 }
 
-void
-put_layout_hdr(struct pnfs_layout_hdr *lo)
+static void
+pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
+{
+	if (test_and_clear_bit(fail_bit, &lo->plh_flags))
+		atomic_dec(&lo->plh_refcount);
+}
+
+static void
+pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)
 {
 	struct inode *inode = lo->plh_inode;
+	struct pnfs_layout_range range = {
+		.iomode = iomode,
+		.offset = 0,
+		.length = NFS4_MAX_UINT64,
+	};
+	LIST_HEAD(head);
 
-	if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
-		destroy_layout_hdr(lo);
-		spin_unlock(&inode->i_lock);
+	spin_lock(&inode->i_lock);
+	pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
+	pnfs_mark_matching_lsegs_invalid(lo, &head, &range);
+	spin_unlock(&inode->i_lock);
+	pnfs_free_lseg_list(&head);
+	dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__,
+			iomode == IOMODE_RW ? "RW" : "READ");
+}
+
+static bool
+pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode)
+{
+	unsigned long start, end;
+	int fail_bit = pnfs_iomode_to_fail_bit(iomode);
+
+	if (test_bit(fail_bit, &lo->plh_flags) == 0)
+		return false;
+	end = jiffies;
+	start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT;
+	if (!time_in_range(lo->plh_retry_timestamp, start, end)) {
+		/* It is time to retry the failed layoutgets */
+		pnfs_layout_clear_fail_bit(lo, fail_bit);
+		return false;
 	}
+	return true;
 }
 
 static void
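The helpers above replace a sticky fail bit with a timed one: pnfs_layout_set_fail_bit() stamps plh_retry_timestamp, and pnfs_layout_io_test_failed() clears the bit again once PNFS_LAYOUTGET_RETRY_TIMEOUT has elapsed, so a failed LAYOUTGET is retried after two minutes rather than never. The core of that test, condensed into a sketch with invented names:

#include <linux/jiffies.h>

#define RETRY_TIMEOUT (120 * HZ)

/* Sketch: true while a failure stamped at 'timestamp' is still fresh.
 * time_in_range() is wraparound-safe on jiffies values. */
static bool failure_is_fresh(unsigned long timestamp)
{
	unsigned long end = jiffies;
	unsigned long start = end - RETRY_TIMEOUT;

	return time_in_range(timestamp, start, end);
}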
@@ -249,33 +314,32 @@ init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
 	lseg->pls_layout = lo;
 }
 
-static void free_lseg(struct pnfs_layout_segment *lseg)
+static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
 {
 	struct inode *ino = lseg->pls_layout->plh_inode;
 
 	NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
-	/* Matched by get_layout_hdr in pnfs_insert_layout */
-	put_layout_hdr(NFS_I(ino)->layout);
 }
 
 static void
-put_lseg_common(struct pnfs_layout_segment *lseg)
+pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
+		struct pnfs_layout_segment *lseg)
 {
-	struct inode *inode = lseg->pls_layout->plh_inode;
+	struct inode *inode = lo->plh_inode;
 
 	WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
 	list_del_init(&lseg->pls_list);
-	if (list_empty(&lseg->pls_layout->plh_segs)) {
-		set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
-		/* Matched by initial refcount set in alloc_init_layout_hdr */
-		put_layout_hdr_locked(lseg->pls_layout);
-	}
+	/* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */
+	atomic_dec(&lo->plh_refcount);
+	if (list_empty(&lo->plh_segs))
+		clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
 	rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
 }
 
 void
-put_lseg(struct pnfs_layout_segment *lseg)
+pnfs_put_lseg(struct pnfs_layout_segment *lseg)
 {
+	struct pnfs_layout_hdr *lo;
 	struct inode *inode;
 
 	if (!lseg)
@@ -284,17 +348,17 @@ put_lseg(struct pnfs_layout_segment *lseg)
 	dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
 		atomic_read(&lseg->pls_refcount),
 		test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
-	inode = lseg->pls_layout->plh_inode;
+	lo = lseg->pls_layout;
+	inode = lo->plh_inode;
 	if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
-		LIST_HEAD(free_me);
-
-		put_lseg_common(lseg);
-		list_add(&lseg->pls_list, &free_me);
+		pnfs_get_layout_hdr(lo);
+		pnfs_layout_remove_lseg(lo, lseg);
 		spin_unlock(&inode->i_lock);
-		pnfs_free_lseg_list(&free_me);
+		pnfs_free_lseg(lseg);
+		pnfs_put_layout_hdr(lo);
 	}
 }
-EXPORT_SYMBOL_GPL(put_lseg);
+EXPORT_SYMBOL_GPL(pnfs_put_lseg);
 
 static inline u64
 end_offset(u64 start, u64 len)
@@ -378,7 +442,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
 	dprintk("%s: lseg %p ref %d\n", __func__, lseg,
 		atomic_read(&lseg->pls_refcount));
 	if (atomic_dec_and_test(&lseg->pls_refcount)) {
-		put_lseg_common(lseg);
+		pnfs_layout_remove_lseg(lseg->pls_layout, lseg);
 		list_add(&lseg->pls_list, tmp_list);
 		rv = 1;
 	}
@@ -390,7 +454,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
  * after call.
  */
 int
-mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
+pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
 			    struct list_head *tmp_list,
 			    struct pnfs_layout_range *recall_range)
 {
@@ -399,14 +463,8 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
 
 	dprintk("%s:Begin lo %p\n", __func__, lo);
 
-	if (list_empty(&lo->plh_segs)) {
-		/* Reset MDS Threshold I/O counters */
-		NFS_I(lo->plh_inode)->write_io = 0;
-		NFS_I(lo->plh_inode)->read_io = 0;
-		if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
-			put_layout_hdr_locked(lo);
+	if (list_empty(&lo->plh_segs))
 		return 0;
-	}
 	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
 		if (!recall_range ||
 		    should_free_lseg(&lseg->pls_range, recall_range)) {
@@ -426,25 +484,13 @@ void
 pnfs_free_lseg_list(struct list_head *free_me)
 {
 	struct pnfs_layout_segment *lseg, *tmp;
-	struct pnfs_layout_hdr *lo;
 
 	if (list_empty(free_me))
 		return;
 
-	lo = list_first_entry(free_me, struct pnfs_layout_segment,
-			      pls_list)->pls_layout;
-
-	if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) {
-		struct nfs_client *clp;
-
-		clp = NFS_SERVER(lo->plh_inode)->nfs_client;
-		spin_lock(&clp->cl_lock);
-		list_del_init(&lo->plh_layouts);
-		spin_unlock(&clp->cl_lock);
-	}
 	list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
 		list_del(&lseg->pls_list);
-		free_lseg(lseg);
+		pnfs_free_lseg(lseg);
 	}
 }
 
@@ -458,10 +504,15 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
 	lo = nfsi->layout;
 	if (lo) {
 		lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
-		mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
-	}
-	spin_unlock(&nfsi->vfs_inode.i_lock);
-	pnfs_free_lseg_list(&tmp_list);
+		pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
+		pnfs_get_layout_hdr(lo);
+		pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
+		pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
+		spin_unlock(&nfsi->vfs_inode.i_lock);
+		pnfs_free_lseg_list(&tmp_list);
+		pnfs_put_layout_hdr(lo);
+	} else
+		spin_unlock(&nfsi->vfs_inode.i_lock);
 }
 EXPORT_SYMBOL_GPL(pnfs_destroy_layout);
 
@@ -498,46 +549,54 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
 	}
 }
 
+/*
+ * Compare 2 layout stateid sequence ids, to see which is newer,
+ * taking into account wraparound issues.
+ */
+static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
+{
+	return (s32)s1 - (s32)s2 > 0;
+}
+
 /* update lo->plh_stateid with new if is more recent */
 void
 pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
 			bool update_barrier)
 {
-	u32 oldseq, newseq;
+	u32 oldseq, newseq, new_barrier;
+	int empty = list_empty(&lo->plh_segs);
 
 	oldseq = be32_to_cpu(lo->plh_stateid.seqid);
 	newseq = be32_to_cpu(new->seqid);
-	if ((int)(newseq - oldseq) > 0) {
+	if (empty || pnfs_seqid_is_newer(newseq, oldseq)) {
 		nfs4_stateid_copy(&lo->plh_stateid, new);
 		if (update_barrier) {
-			u32 new_barrier = be32_to_cpu(new->seqid);
-
-			if ((int)(new_barrier - lo->plh_barrier))
-				lo->plh_barrier = new_barrier;
+			new_barrier = be32_to_cpu(new->seqid);
 		} else {
 			/* Because of wraparound, we want to keep the barrier
-			 * "close" to the current seqids. It needs to be
-			 * within 2**31 to count as "behind", so if it
-			 * gets too near that limit, give us a litle leeway
-			 * and bring it to within 2**30.
-			 * NOTE - and yes, this is all unsigned arithmetic.
+			 * "close" to the current seqids.
 			 */
-			if (unlikely((newseq - lo->plh_barrier) > (3 << 29)))
-				lo->plh_barrier = newseq - (1 << 30);
+			new_barrier = newseq - atomic_read(&lo->plh_outstanding);
 		}
+		if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
+			lo->plh_barrier = new_barrier;
 	}
 }
 
+static bool
+pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
+		const nfs4_stateid *stateid)
+{
+	u32 seqid = be32_to_cpu(stateid->seqid);
+
+	return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
+}
+
 /* lget is set to 1 if called from inside send_layoutget call chain */
 static bool
-pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
-			int lget)
+pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, int lget)
 {
-	if ((stateid) &&
-	    (int)(lo->plh_barrier - be32_to_cpu(stateid->seqid)) >= 0)
-		return true;
 	return lo->plh_block_lgets ||
-		test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) ||
 		test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
 		(list_empty(&lo->plh_segs) &&
 		 (atomic_read(&lo->plh_outstanding) > lget));
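pnfs_seqid_is_newer() centralizes the serial-number arithmetic that used to be open-coded as (int) casts in several places: a difference of less than 2^31 counts as newer, so comparisons stay correct across seqid wraparound. An equivalent formulation with a worked case:

/* Sketch: (s32)(s1 - s2) > 0 is equivalent to the helper above.
 * Worked case: s1 = 1, s2 = 0xffffffff gives s1 - s2 == 2 (mod 2^32),
 * so a seqid that just wrapped still compares as newer. */
static bool seqid_is_newer(u32 s1, u32 s2)
{
	return (s32)(s1 - s2) > 0;
}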
@@ -551,7 +610,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
 
 	dprintk("--> %s\n", __func__);
 	spin_lock(&lo->plh_inode->i_lock);
-	if (pnfs_layoutgets_blocked(lo, NULL, 1)) {
+	if (pnfs_layoutgets_blocked(lo, 1)) {
 		status = -EAGAIN;
 	} else if (list_empty(&lo->plh_segs)) {
 		int seq;
@@ -582,7 +641,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 	struct inode *ino = lo->plh_inode;
 	struct nfs_server *server = NFS_SERVER(ino);
 	struct nfs4_layoutget *lgp;
-	struct pnfs_layout_segment *lseg = NULL;
+	struct pnfs_layout_segment *lseg;
 
 	dprintk("--> %s\n", __func__);
 
@@ -599,16 +658,22 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 	lgp->args.type = server->pnfs_curr_ld->id;
 	lgp->args.inode = ino;
 	lgp->args.ctx = get_nfs_open_context(ctx);
-	lgp->lsegpp = &lseg;
 	lgp->gfp_flags = gfp_flags;
 
 	/* Synchronously retrieve layout information from server and
 	 * store in lseg.
 	 */
-	nfs4_proc_layoutget(lgp, gfp_flags);
-	if (!lseg) {
-		/* remember that LAYOUTGET failed and suspend trying */
-		set_bit(lo_fail_bit(range->iomode), &lo->plh_flags);
+	lseg = nfs4_proc_layoutget(lgp, gfp_flags);
+	if (IS_ERR(lseg)) {
+		switch (PTR_ERR(lseg)) {
+		case -ENOMEM:
+		case -ERESTARTSYS:
+			break;
+		default:
+			/* remember that LAYOUTGET failed and suspend trying */
+			pnfs_layout_io_set_failed(lo, range->iomode);
+		}
+		return NULL;
 	}
 
 	return lseg;
@@ -636,25 +701,24 @@ _pnfs_return_layout(struct inode *ino)
 
 	spin_lock(&ino->i_lock);
 	lo = nfsi->layout;
-	if (!lo || pnfs_test_layout_returned(lo)) {
+	if (!lo) {
 		spin_unlock(&ino->i_lock);
 		dprintk("NFS: %s no layout to return\n", __func__);
 		goto out;
 	}
 	stateid = nfsi->layout->plh_stateid;
 	/* Reference matched in nfs4_layoutreturn_release */
-	get_layout_hdr(lo);
+	pnfs_get_layout_hdr(lo);
 	empty = list_empty(&lo->plh_segs);
-	mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
+	pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
 	/* Don't send a LAYOUTRETURN if list was initially empty */
 	if (empty) {
 		spin_unlock(&ino->i_lock);
-		put_layout_hdr(lo);
+		pnfs_put_layout_hdr(lo);
 		dprintk("NFS: %s no layout segments to return\n", __func__);
 		goto out;
 	}
 	lo->plh_block_lgets++;
-	pnfs_mark_layout_returned(lo);
 	spin_unlock(&ino->i_lock);
 	pnfs_free_lseg_list(&tmp_list);
 
@@ -663,10 +727,10 @@ _pnfs_return_layout(struct inode *ino)
 	lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
 	if (unlikely(lrp == NULL)) {
 		status = -ENOMEM;
-		set_bit(NFS_LAYOUT_RW_FAILED, &lo->plh_flags);
-		set_bit(NFS_LAYOUT_RO_FAILED, &lo->plh_flags);
-		pnfs_clear_layout_returned(lo);
-		put_layout_hdr(lo);
+		spin_lock(&ino->i_lock);
+		lo->plh_block_lgets--;
+		spin_unlock(&ino->i_lock);
+		pnfs_put_layout_hdr(lo);
 		goto out;
 	}
 
@@ -703,7 +767,7 @@ bool pnfs_roc(struct inode *ino)
 	if (!found)
 		goto out_nolayout;
 	lo->plh_block_lgets++;
-	get_layout_hdr(lo); /* matched in pnfs_roc_release */
+	pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */
 	spin_unlock(&ino->i_lock);
 	pnfs_free_lseg_list(&tmp_list);
 	return true;
@@ -720,8 +784,12 @@ void pnfs_roc_release(struct inode *ino)
 	spin_lock(&ino->i_lock);
 	lo = NFS_I(ino)->layout;
 	lo->plh_block_lgets--;
-	put_layout_hdr_locked(lo);
-	spin_unlock(&ino->i_lock);
+	if (atomic_dec_and_test(&lo->plh_refcount)) {
+		pnfs_detach_layout_hdr(lo);
+		spin_unlock(&ino->i_lock);
+		pnfs_free_layout_hdr(lo);
+	} else
+		spin_unlock(&ino->i_lock);
 }
 
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
@@ -730,32 +798,34 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
 
 	spin_lock(&ino->i_lock);
 	lo = NFS_I(ino)->layout;
-	if ((int)(barrier - lo->plh_barrier) > 0)
+	if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
 		lo->plh_barrier = barrier;
 	spin_unlock(&ino->i_lock);
 }
 
-bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
+bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
 {
 	struct nfs_inode *nfsi = NFS_I(ino);
+	struct pnfs_layout_hdr *lo;
 	struct pnfs_layout_segment *lseg;
+	u32 current_seqid;
 	bool found = false;
 
 	spin_lock(&ino->i_lock);
 	list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
 		if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
+			rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
 			found = true;
-			break;
+			goto out;
 		}
-	if (!found) {
-		struct pnfs_layout_hdr *lo = nfsi->layout;
-		u32 current_seqid = be32_to_cpu(lo->plh_stateid.seqid);
+	lo = nfsi->layout;
+	current_seqid = be32_to_cpu(lo->plh_stateid.seqid);
 
-		/* Since close does not return a layout stateid for use as
-		 * a barrier, we choose the worst-case barrier.
-		 */
-		*barrier = current_seqid + atomic_read(&lo->plh_outstanding);
-	}
+	/* Since close does not return a layout stateid for use as
+	 * a barrier, we choose the worst-case barrier.
+	 */
+	*barrier = current_seqid + atomic_read(&lo->plh_outstanding);
+out:
 	spin_unlock(&ino->i_lock);
 	return found;
 }
@@ -786,14 +856,13 @@ cmp_layout(struct pnfs_layout_range *l1,
 }
 
 static void
-pnfs_insert_layout(struct pnfs_layout_hdr *lo,
+pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
 		   struct pnfs_layout_segment *lseg)
 {
 	struct pnfs_layout_segment *lp;
 
 	dprintk("%s:Begin\n", __func__);
 
-	assert_spin_locked(&lo->plh_inode->i_lock);
 	list_for_each_entry(lp, &lo->plh_segs, pls_list) {
 		if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0)
 			continue;
@@ -813,7 +882,7 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
 		__func__, lseg, lseg->pls_range.iomode,
 		lseg->pls_range.offset, lseg->pls_range.length);
 out:
-	get_layout_hdr(lo);
+	pnfs_get_layout_hdr(lo);
 
 	dprintk("%s:Return\n", __func__);
 }
@@ -847,21 +916,19 @@ pnfs_find_alloc_layout(struct inode *ino,
 
 	dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
 
-	assert_spin_locked(&ino->i_lock);
-	if (nfsi->layout) {
-		if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags))
-			return NULL;
-		else
-			return nfsi->layout;
-	}
+	if (nfsi->layout != NULL)
+		goto out_existing;
 	spin_unlock(&ino->i_lock);
 	new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
 	spin_lock(&ino->i_lock);
 
-	if (likely(nfsi->layout == NULL))	/* Won the race? */
+	if (likely(nfsi->layout == NULL)) {	/* Won the race? */
 		nfsi->layout = new;
-	else
-		pnfs_free_layout_hdr(new);
+		return new;
+	}
+	pnfs_free_layout_hdr(new);
+out_existing:
+	pnfs_get_layout_hdr(nfsi->layout);
 	return nfsi->layout;
 }
@@ -904,11 +971,10 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
 
 	dprintk("%s:Begin\n", __func__);
 
-	assert_spin_locked(&lo->plh_inode->i_lock);
 	list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
 		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
 		    is_matching_lseg(&lseg->pls_range, range)) {
-			ret = get_lseg(lseg);
+			ret = pnfs_get_lseg(lseg);
 			break;
 		}
 		if (lseg->pls_range.offset > range->offset)
@@ -1013,7 +1079,6 @@ pnfs_update_layout(struct inode *ino,
 		.length = count,
 	};
 	unsigned pg_offset;
-	struct nfs_inode *nfsi = NFS_I(ino);
 	struct nfs_server *server = NFS_SERVER(ino);
 	struct nfs_client *clp = server->nfs_client;
 	struct pnfs_layout_hdr *lo;
@@ -1021,16 +1086,16 @@ pnfs_update_layout(struct inode *ino,
 	bool first = false;
 
 	if (!pnfs_enabled_sb(NFS_SERVER(ino)))
-		return NULL;
+		goto out;
 
 	if (pnfs_within_mdsthreshold(ctx, ino, iomode))
-		return NULL;
+		goto out;
 
 	spin_lock(&ino->i_lock);
 	lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
 	if (lo == NULL) {
-		dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);
-		goto out_unlock;
+		spin_unlock(&ino->i_lock);
+		goto out;
 	}
 
 	/* Do we even need to bother with this? */
@@ -1040,7 +1105,7 @@ pnfs_update_layout(struct inode *ino,
 	}
 
 	/* if LAYOUTGET already failed once we don't try again */
-	if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
+	if (pnfs_layout_io_test_failed(lo, iomode))
 		goto out_unlock;
 
 	/* Check to see if the layout for the given range already exists */
@@ -1048,17 +1113,13 @@ pnfs_update_layout(struct inode *ino,
 	if (lseg)
 		goto out_unlock;
 
-	if (pnfs_layoutgets_blocked(lo, NULL, 0))
+	if (pnfs_layoutgets_blocked(lo, 0))
 		goto out_unlock;
 	atomic_inc(&lo->plh_outstanding);
 
-	get_layout_hdr(lo);
 	if (list_empty(&lo->plh_segs))
 		first = true;
 
-	/* Enable LAYOUTRETURNs */
-	pnfs_clear_layout_returned(lo);
-
 	spin_unlock(&ino->i_lock);
 	if (first) {
 		/* The lo must be on the clp list if there is any
@@ -1079,24 +1140,26 @@ pnfs_update_layout(struct inode *ino,
 		arg.length = PAGE_CACHE_ALIGN(arg.length);
 
 	lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
-	if (!lseg && first) {
-		spin_lock(&clp->cl_lock);
-		list_del_init(&lo->plh_layouts);
-		spin_unlock(&clp->cl_lock);
-	}
 	atomic_dec(&lo->plh_outstanding);
-	put_layout_hdr(lo);
+out_put_layout_hdr:
+	pnfs_put_layout_hdr(lo);
 out:
-	dprintk("%s end, state 0x%lx lseg %p\n", __func__,
-		nfsi->layout ? nfsi->layout->plh_flags : -1, lseg);
+	dprintk("%s: inode %s/%llu pNFS layout segment %s for "
+			"(%s, offset: %llu, length: %llu)\n",
+			__func__, ino->i_sb->s_id,
+			(unsigned long long)NFS_FILEID(ino),
+			lseg == NULL ? "not found" : "found",
+			iomode==IOMODE_RW ? "read/write" : "read-only",
+			(unsigned long long)pos,
+			(unsigned long long)count);
 	return lseg;
 out_unlock:
 	spin_unlock(&ino->i_lock);
-	goto out;
+	goto out_put_layout_hdr;
 }
 EXPORT_SYMBOL_GPL(pnfs_update_layout);
 
-int
+struct pnfs_layout_segment *
 pnfs_layout_process(struct nfs4_layoutget *lgp)
 {
 	struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
@@ -1123,25 +1186,29 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 		goto out_forget_reply;
 	}
 
-	if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) {
+	if (pnfs_layoutgets_blocked(lo, 1) ||
+	    pnfs_layout_stateid_blocked(lo, &res->stateid)) {
 		dprintk("%s forget reply due to state\n", __func__);
 		goto out_forget_reply;
 	}
+
+	/* Done processing layoutget. Set the layout stateid */
+	pnfs_set_layout_stateid(lo, &res->stateid, false);
+
 	init_lseg(lo, lseg);
 	lseg->pls_range = res->range;
-	*lgp->lsegpp = get_lseg(lseg);
-	pnfs_insert_layout(lo, lseg);
+	pnfs_get_lseg(lseg);
+	pnfs_layout_insert_lseg(lo, lseg);
 
 	if (res->return_on_close) {
 		set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
 		set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
 	}
 
-	/* Done processing layoutget. Set the layout stateid */
-	pnfs_set_layout_stateid(lo, &res->stateid, false);
 	spin_unlock(&ino->i_lock);
+	return lseg;
 out:
-	return status;
+	return ERR_PTR(status);
 
 out_forget_reply:
 	spin_unlock(&ino->i_lock);
@@ -1153,16 +1220,24 @@ out_forget_reply:
 void
 pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
 {
+	u64 rd_size = req->wb_bytes;
+
 	BUG_ON(pgio->pg_lseg != NULL);
 
 	if (req->wb_offset != req->wb_pgbase) {
 		nfs_pageio_reset_read_mds(pgio);
 		return;
 	}
+
+	if (pgio->pg_dreq == NULL)
+		rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
+	else
+		rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+
 	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 					   req->wb_context,
 					   req_offset(req),
-					   req->wb_bytes,
+					   rd_size,
 					   IOMODE_READ,
 					   GFP_KERNEL);
 	/* If no lseg, fall back to read through mds */
@@ -1173,7 +1248,8 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read);
 
 void
-pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
+pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
+			   struct nfs_page *req, u64 wb_size)
 {
 	BUG_ON(pgio->pg_lseg != NULL);
 
@@ -1181,10 +1257,11 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *
 		nfs_pageio_reset_write_mds(pgio);
 		return;
 	}
+
 	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 					   req->wb_context,
 					   req_offset(req),
-					   req->wb_bytes,
+					   wb_size,
 					   IOMODE_RW,
 					   GFP_NOFS);
 	/* If no lseg, fall back to write through mds */
@@ -1362,12 +1439,12 @@ pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *he
 		if (trypnfs == PNFS_NOT_ATTEMPTED)
 			pnfs_write_through_mds(desc, data);
 	}
-	put_lseg(lseg);
+	pnfs_put_lseg(lseg);
 }
 
 static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
 {
-	put_lseg(hdr->lseg);
+	pnfs_put_lseg(hdr->lseg);
 	nfs_writehdr_free(hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
@@ -1382,17 +1459,17 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
 	whdr = nfs_writehdr_alloc();
 	if (!whdr) {
 		desc->pg_completion_ops->error_cleanup(&desc->pg_list);
-		put_lseg(desc->pg_lseg);
+		pnfs_put_lseg(desc->pg_lseg);
 		desc->pg_lseg = NULL;
 		return -ENOMEM;
 	}
 	hdr = &whdr->header;
 	nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
-	hdr->lseg = get_lseg(desc->pg_lseg);
+	hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
 	atomic_inc(&hdr->refcnt);
 	ret = nfs_generic_flush(desc, hdr);
 	if (ret != 0) {
-		put_lseg(desc->pg_lseg);
+		pnfs_put_lseg(desc->pg_lseg);
 		desc->pg_lseg = NULL;
 	} else
 		pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags);
@@ -1517,12 +1594,12 @@ pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *hea
 		if (trypnfs == PNFS_NOT_ATTEMPTED)
 			pnfs_read_through_mds(desc, data);
 	}
-	put_lseg(lseg);
+	pnfs_put_lseg(lseg);
 }
 
 static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
 {
-	put_lseg(hdr->lseg);
+	pnfs_put_lseg(hdr->lseg);
 	nfs_readhdr_free(hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
@@ -1538,17 +1615,17 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
 	if (!rhdr) {
 		desc->pg_completion_ops->error_cleanup(&desc->pg_list);
 		ret = -ENOMEM;
-		put_lseg(desc->pg_lseg);
+		pnfs_put_lseg(desc->pg_lseg);
 		desc->pg_lseg = NULL;
 		return ret;
 	}
 	hdr = &rhdr->header;
 	nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
-	hdr->lseg = get_lseg(desc->pg_lseg);
+	hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
 	atomic_inc(&hdr->refcnt);
 	ret = nfs_generic_pagein(desc, hdr);
 	if (ret != 0) {
-		put_lseg(desc->pg_lseg);
+		pnfs_put_lseg(desc->pg_lseg);
 		desc->pg_lseg = NULL;
 	} else
 		pnfs_do_multiple_reads(desc, &hdr->rpc_list);
@@ -1574,13 +1651,7 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
 
 void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
 {
-	if (lseg->pls_range.iomode == IOMODE_RW) {
-		dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
-		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
-	} else {
-		dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
-		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
-	}
+	pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode);
 }
 EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
 
@@ -1601,7 +1672,7 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata)
 	}
 	if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) {
 		/* references matched in nfs4_layoutcommit_release */
-		get_lseg(hdr->lseg);
+		pnfs_get_lseg(hdr->lseg);
 	}
 	if (end_pos > nfsi->layout->plh_lwb)
 		nfsi->layout->plh_lwb = end_pos;
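The recurring shape in the rewritten reference counting is visible in pnfs_put_lseg() and pnfs_put_layout_hdr() above: atomic_dec_and_lock() takes the spinlock only when the count hits zero, the object is detached from shared structures under the lock, and the actual free runs after the unlock because a layout driver's free routine may sleep. A generic sketch of that pattern, with every name invented for illustration:

struct obj {
	atomic_t	refcount;
	spinlock_t	*owner_lock;	/* e.g. the owning inode's i_lock */
};

static void obj_put(struct obj *o)
{
	if (atomic_dec_and_lock(&o->refcount, o->owner_lock)) {
		/* unlink from shared structures while locked */
		spin_unlock(o->owner_lock);
		/* free after unlocking; the free path may sleep */
	}
}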
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 745aa1b39e7c..2d722dba1111 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -62,9 +62,6 @@ enum {
 	NFS_LAYOUT_RW_FAILED,		/* get rw layout failed stop trying */
 	NFS_LAYOUT_BULK_RECALL,		/* bulk recall affecting layout */
 	NFS_LAYOUT_ROC,			/* some lseg had roc bit set */
-	NFS_LAYOUT_DESTROYED,		/* no new use of layout allowed */
-	NFS_LAYOUT_INVALID,		/* layout is being destroyed */
-	NFS_LAYOUT_RETURNED,		/* layout has already been returned */
 };
 
 enum layoutdriver_policy_flags {
@@ -140,6 +137,7 @@ struct pnfs_layout_hdr {
 	atomic_t		plh_outstanding; /* number of RPCs out */
 	unsigned long		plh_block_lgets; /* block LAYOUTGET if >0 */
 	u32			plh_barrier; /* ignore lower seqids */
+	unsigned long		plh_retry_timestamp;
 	unsigned long		plh_flags;
 	loff_t			plh_lwb; /* last write byte for layoutcommit */
 	struct rpc_cred		*plh_lc_cred; /* layoutcommit cred */
@@ -172,12 +170,12 @@ extern int nfs4_proc_getdevicelist(struct nfs_server *server,
 				   struct pnfs_devicelist *devlist);
 extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
 				   struct pnfs_device *dev);
-extern void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
+extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
 extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
 
 /* pnfs.c */
-void get_layout_hdr(struct pnfs_layout_hdr *lo);
-void put_lseg(struct pnfs_layout_segment *lseg);
+void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
+void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
 
 void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *,
 			   const struct nfs_pgio_completion_ops *);
@@ -188,28 +186,29 @@ void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
 void unset_pnfs_layoutdriver(struct nfs_server *);
 void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
 int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
-void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *);
+void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
+			       struct nfs_page *req, u64 wb_size);
 int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc);
 bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
 void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg);
-int pnfs_layout_process(struct nfs4_layoutget *lgp);
+struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp);
 void pnfs_free_lseg_list(struct list_head *tmp_list);
 void pnfs_destroy_layout(struct nfs_inode *);
 void pnfs_destroy_all_layouts(struct nfs_client *);
-void put_layout_hdr(struct pnfs_layout_hdr *lo);
+void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo);
 void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
 			     const nfs4_stateid *new,
 			     bool update_barrier);
 int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
 				  struct pnfs_layout_hdr *lo,
 				  struct nfs4_state *open_state);
-int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
+int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
 				struct list_head *tmp_list,
 				struct pnfs_layout_range *recall_range);
 bool pnfs_roc(struct inode *ino);
 void pnfs_roc_release(struct inode *ino);
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
-bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
+bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
 void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
 int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
@@ -233,6 +232,7 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
 /* nfs4_deviceid_flags */
 enum {
 	NFS_DEVICEID_INVALID = 0,       /* set when MDS clientid recalled */
+	NFS_DEVICEID_UNAVAILABLE,	/* device temporarily unavailable */
 };
 
 /* pnfs_dev.c */
@@ -242,6 +242,7 @@ struct nfs4_deviceid_node {
 	const struct pnfs_layoutdriver_type *ld;
 	const struct nfs_client		*nfs_client;
 	unsigned long 			flags;
+	unsigned long			timestamp_unavailable;
 	struct nfs4_deviceid		deviceid;
 	atomic_t			ref;
 };
@@ -254,34 +255,12 @@ void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
 			     const struct nfs4_deviceid *);
 struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *);
 bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
+void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
+bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
 void nfs4_deviceid_purge_client(const struct nfs_client *);
 
-static inline void
-pnfs_mark_layout_returned(struct pnfs_layout_hdr *lo)
-{
-	set_bit(NFS_LAYOUT_RETURNED, &lo->plh_flags);
-}
-
-static inline void
-pnfs_clear_layout_returned(struct pnfs_layout_hdr *lo)
-{
-	clear_bit(NFS_LAYOUT_RETURNED, &lo->plh_flags);
-}
-
-static inline bool
-pnfs_test_layout_returned(struct pnfs_layout_hdr *lo)
-{
-	return test_bit(NFS_LAYOUT_RETURNED, &lo->plh_flags);
-}
-
-static inline int lo_fail_bit(u32 iomode)
-{
-	return iomode == IOMODE_RW ?
-		NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
-}
-
 static inline struct pnfs_layout_segment *
-get_lseg(struct pnfs_layout_segment *lseg)
+pnfs_get_lseg(struct pnfs_layout_segment *lseg)
 {
 	if (lseg) {
 		atomic_inc(&lseg->pls_refcount);
@@ -406,12 +385,12 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
 }
 
 static inline struct pnfs_layout_segment *
-get_lseg(struct pnfs_layout_segment *lseg)
+pnfs_get_lseg(struct pnfs_layout_segment *lseg)
 {
 	return NULL;
 }
 
-static inline void put_lseg(struct pnfs_layout_segment *lseg)
+static inline void pnfs_put_lseg(struct pnfs_layout_segment *lseg)
 {
 }
 
@@ -443,7 +422,7 @@ pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
 }
 
 static inline bool
-pnfs_roc_drain(struct inode *ino, u32 *barrier)
+pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
 {
 	return false;
 }
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 73f701f1f4d3..d35b62e83ea6 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -40,6 +40,8 @@
 #define NFS4_DEVICE_ID_HASH_SIZE	(1 << NFS4_DEVICE_ID_HASH_BITS)
 #define NFS4_DEVICE_ID_HASH_MASK	(NFS4_DEVICE_ID_HASH_SIZE - 1)
 
+#define PNFS_DEVICE_RETRY_TIMEOUT (120*HZ)
+
 static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE];
 static DEFINE_SPINLOCK(nfs4_deviceid_lock);
 
@@ -218,6 +220,30 @@ nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)
 }
 EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node);
 
+void
+nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node)
+{
+	node->timestamp_unavailable = jiffies;
+	set_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+}
+EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_unavailable);
+
+bool
+nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node)
+{
+	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) {
+		unsigned long start, end;
+
+		end = jiffies;
+		start = end - PNFS_DEVICE_RETRY_TIMEOUT;
+		if (time_in_range(node->timestamp_unavailable, start, end))
+			return true;
+		clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+	}
+	return false;
+}
+EXPORT_SYMBOL_GPL(nfs4_test_deviceid_unavailable);
+
 static void
 _deviceid_purge_client(const struct nfs_client *clp, long hash)
 {
@@ -276,3 +302,4 @@ nfs4_deviceid_mark_client_invalid(struct nfs_client *clp)
 	}
 	rcu_read_unlock();
 }
+
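These two helpers give layout drivers a uniform backoff: mark the deviceid on an I/O error, and every later call to nfs4_test_deviceid_unavailable() then reports the device as unusable until PNFS_DEVICE_RETRY_TIMEOUT (120 seconds) expires, at which point the flag self-clears. A sketch of how a driver might consult them; the surrounding driver hooks are invented for illustration:

static bool my_ld_device_usable(struct nfs4_deviceid_node *node)
{
	/* still inside the 120 second backoff window? */
	return !nfs4_test_deviceid_unavailable(node);
}

static void my_ld_io_error(struct nfs4_deviceid_node *node)
{
	/* start (or refresh) the backoff window */
	nfs4_mark_deviceid_unavailable(node);
}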
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d2c7f5db0847..e831bce49766 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -88,6 +88,7 @@ enum {
 	Opt_sharecache, Opt_nosharecache,
 	Opt_resvport, Opt_noresvport,
 	Opt_fscache, Opt_nofscache,
+	Opt_migration, Opt_nomigration,
 
 	/* Mount options that take integer arguments */
 	Opt_port,
@@ -147,6 +148,8 @@ static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_noresvport, "noresvport" },
 	{ Opt_fscache, "fsc" },
 	{ Opt_nofscache, "nofsc" },
+	{ Opt_migration, "migration" },
+	{ Opt_nomigration, "nomigration" },
 
 	{ Opt_port, "port=%s" },
 	{ Opt_rsize, "rsize=%s" },
@@ -676,6 +679,9 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 	if (nfss->options & NFS_OPTION_FSCACHE)
 		seq_printf(m, ",fsc");
 
+	if (nfss->options & NFS_OPTION_MIGRATION)
+		seq_printf(m, ",migration");
+
 	if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) {
 		if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
 			seq_printf(m, ",lookupcache=none");
@@ -1106,7 +1112,7 @@ static int nfs_get_option_ul(substring_t args[], unsigned long *option)
 	string = match_strdup(args);
 	if (string == NULL)
 		return -ENOMEM;
-	rc = strict_strtoul(string, 10, option);
+	rc = kstrtoul(string, 10, option);
 	kfree(string);
 
 	return rc;
@@ -1243,6 +1249,12 @@ static int nfs_parse_mount_options(char *raw,
 			kfree(mnt->fscache_uniq);
 			mnt->fscache_uniq = NULL;
 			break;
+		case Opt_migration:
+			mnt->options |= NFS_OPTION_MIGRATION;
+			break;
+		case Opt_nomigration:
+			mnt->options &= NFS_OPTION_MIGRATION;
+			break;
 
 		/*
 		 * options that take numeric values
@@ -1535,6 +1547,10 @@ static int nfs_parse_mount_options(char *raw,
 	if (mnt->minorversion && mnt->version != 4)
 		goto out_minorversion_mismatch;
 
+	if (mnt->options & NFS_OPTION_MIGRATION &&
+	    mnt->version != 4 && mnt->minorversion != 0)
+		goto out_migration_misuse;
+
 	/*
 	 * verify that any proto=/mountproto= options match the address
 	 * families in the addr=/mountaddr= options.
@@ -1572,6 +1588,10 @@ out_minorversion_mismatch:
 	printk(KERN_INFO "NFS: mount option vers=%u does not support "
 		"minorversion=%u\n", mnt->version, mnt->minorversion);
 	return 0;
+out_migration_misuse:
+	printk(KERN_INFO
+		"NFS: 'migration' not supported for this NFS version\n");
+	return 0;
 out_nomem:
 	printk(KERN_INFO "NFS: not enough memory to parse option\n");
 	return 0;
@@ -2494,7 +2514,7 @@ EXPORT_SYMBOL_GPL(nfs_kill_super);
 /*
  * Clone an NFS2/3/4 server record on xdev traversal (FSID-change)
  */
-struct dentry *
+static struct dentry *
 nfs_xdev_mount(struct file_system_type *fs_type, int flags,
 	       const char *dev_name, void *raw_data)
 {
@@ -2642,6 +2662,7 @@ unsigned int nfs_idmap_cache_timeout = 600;
 bool nfs4_disable_idmapping = true;
 unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE;
 unsigned short send_implementation_id = 1;
+char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = "";
 
 EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport);
 EXPORT_SYMBOL_GPL(nfs_callback_tcpport);
@@ -2649,6 +2670,7 @@ EXPORT_SYMBOL_GPL(nfs_idmap_cache_timeout);
 EXPORT_SYMBOL_GPL(nfs4_disable_idmapping);
 EXPORT_SYMBOL_GPL(max_session_slots);
 EXPORT_SYMBOL_GPL(send_implementation_id);
+EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier);
 
 #define NFS_CALLBACK_MAXPORTNR (65535U)
 
@@ -2659,7 +2681,7 @@ static int param_set_portnr(const char *val, const struct kernel_param *kp)
 
 	if (!val)
 		return -EINVAL;
-	ret = strict_strtoul(val, 0, &num);
+	ret = kstrtoul(val, 0, &num);
 	if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR)
 		return -EINVAL;
 	*((unsigned int *)kp->arg) = num;
@@ -2674,6 +2696,8 @@ static struct kernel_param_ops param_ops_portnr = {
 module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
 module_param(nfs_idmap_cache_timeout, int, 0644);
 module_param(nfs4_disable_idmapping, bool, 0644);
+module_param_string(nfs4_unique_id, nfs4_client_id_uniquifier,
+			NFS4_CLIENT_ID_UNIQ_LEN, 0600);
 MODULE_PARM_DESC(nfs4_disable_idmapping,
 		"Turn off NFSv4 idmapping when using 'sec=sys'");
 module_param(max_session_slots, ushort, 0644);
@@ -2682,6 +2706,7 @@ MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 "
2682module_param(send_implementation_id, ushort, 0644); 2706module_param(send_implementation_id, ushort, 0644);
2683MODULE_PARM_DESC(send_implementation_id, 2707MODULE_PARM_DESC(send_implementation_id,
2684 "Send implementation ID with NFSv4.1 exchange_id"); 2708 "Send implementation ID with NFSv4.1 exchange_id");
2709MODULE_PARM_DESC(nfs4_unique_id, "nfs_client_id4 uniquifier string");
2685MODULE_ALIAS("nfs4"); 2710MODULE_ALIAS("nfs4");
2686 2711
2687#endif /* CONFIG_NFS_V4 */ 2712#endif /* CONFIG_NFS_V4 */
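The new nfs4_unique_id parameter lets an administrator pin a stable nfs_client_id4 uniquifier, useful where the usual identity sources are unreliable (diskless or cloned clients, for example). module_param_string() copies the value into a fixed-size buffer at load time; the general shape, with illustrative names rather than the in-tree ones:

    static char example_uniquifier[64] = "";
    module_param_string(example_uniquifier, example_uniquifier,
                        sizeof(example_uniquifier), 0600);
    MODULE_PARM_DESC(example_uniquifier, "illustrative uniquifier string");

The real parameter can then be set via modprobe or, for a built-in, on the kernel command line (nfs.nfs4_unique_id=<string>); the 0600 mode keeps it readable by root only.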
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index e3b55372726c..9347ab7c9574 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -846,6 +846,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
846int nfs_flush_incompatible(struct file *file, struct page *page) 846int nfs_flush_incompatible(struct file *file, struct page *page)
847{ 847{
848 struct nfs_open_context *ctx = nfs_file_open_context(file); 848 struct nfs_open_context *ctx = nfs_file_open_context(file);
849 struct nfs_lock_context *l_ctx;
849 struct nfs_page *req; 850 struct nfs_page *req;
850 int do_flush, status; 851 int do_flush, status;
851 /* 852 /*
@@ -860,9 +861,12 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
860 req = nfs_page_find_request(page); 861 req = nfs_page_find_request(page);
861 if (req == NULL) 862 if (req == NULL)
862 return 0; 863 return 0;
863 do_flush = req->wb_page != page || req->wb_context != ctx || 864 l_ctx = req->wb_lock_context;
864 req->wb_lock_context->lockowner != current->files || 865 do_flush = req->wb_page != page || req->wb_context != ctx;
865 req->wb_lock_context->pid != current->tgid; 866 if (l_ctx) {
867 do_flush |= l_ctx->lockowner.l_owner != current->files
868 || l_ctx->lockowner.l_pid != current->tgid;
869 }
866 nfs_release_request(req); 870 nfs_release_request(req);
867 if (!do_flush) 871 if (!do_flush)
868 return 0; 872 return 0;
@@ -1576,6 +1580,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
1576 /* We have a mismatch. Write the page again */ 1580 /* We have a mismatch. Write the page again */
1577 dprintk(" mismatch\n"); 1581 dprintk(" mismatch\n");
1578 nfs_mark_request_dirty(req); 1582 nfs_mark_request_dirty(req);
1583 set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags);
1579 next: 1584 next:
1580 nfs_unlock_and_release_request(req); 1585 nfs_unlock_and_release_request(req);
1581 } 1586 }
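Two fixes in this file. nfs_flush_incompatible() previously dereferenced req->wb_lock_context unconditionally; it now compares lock ownership only when a lock context is actually present, through the new lockowner.l_owner/l_pid pair. And nfs_commit_release_pages() now flags the open context when a commit verifier mismatch forces pages back onto the dirty list, so later flush paths know the writes must be resent. A sketch of a hypothetical consumer of that flag:

    /* Assumption: illustrative reader side, not part of this patch.
     * The writer sets the bit on verifier mismatch; a flush path can
     * consume it atomically and reschedule the writes. */
    if (test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags))
            resend_pending_writes(ctx);     /* hypothetical helper */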
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 6aa5590c3679..b314888825d5 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -218,8 +218,7 @@ static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p,
218 * There must be an encoding function for void results so svc_process 218 * There must be an encoding function for void results so svc_process
219 * will work properly. 219 * will work properly.
220 */ 220 */
221int 221static int nfsaclsvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy)
222nfsaclsvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy)
223{ 222{
224 return xdr_ressize_check(rqstp, p); 223 return xdr_ressize_check(rqstp, p);
225} 224}
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 9095f3c21df9..97d90d1c8608 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -247,7 +247,7 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
247 /* Now create the file and set attributes */ 247 /* Now create the file and set attributes */
248 nfserr = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len, 248 nfserr = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len,
249 attr, newfhp, 249 attr, newfhp,
250 argp->createmode, argp->verf, NULL, NULL); 250 argp->createmode, (u32 *)argp->verf, NULL, NULL);
251 251
252 RETURN_STATUS(nfserr); 252 RETURN_STATUS(nfserr);
253} 253}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 4c7bd35b1876..bdf29c96e4cd 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1028,7 +1028,6 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp)
1028 cb->cb_msg.rpc_cred = callback_cred; 1028 cb->cb_msg.rpc_cred = callback_cred;
1029 1029
1030 cb->cb_ops = &nfsd4_cb_recall_ops; 1030 cb->cb_ops = &nfsd4_cb_recall_ops;
1031 dp->dl_retries = 1;
1032 1031
1033 INIT_LIST_HEAD(&cb->cb_per_client); 1032 INIT_LIST_HEAD(&cb->cb_per_client);
1034 cb->cb_done = true; 1033 cb->cb_done = true;
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index fdc91a6fc9c4..a1f10c0a6255 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -478,7 +478,7 @@ nfsd_idmap_init(struct net *net)
478 goto destroy_idtoname_cache; 478 goto destroy_idtoname_cache;
479 nn->nametoid_cache = cache_create_net(&nametoid_cache_template, net); 479 nn->nametoid_cache = cache_create_net(&nametoid_cache_template, net);
480 if (IS_ERR(nn->nametoid_cache)) { 480 if (IS_ERR(nn->nametoid_cache)) {
481 rv = PTR_ERR(nn->idtoname_cache); 481 rv = PTR_ERR(nn->nametoid_cache);
482 goto unregister_idtoname_cache; 482 goto unregister_idtoname_cache;
483 } 483 }
484 rv = cache_register_net(nn->nametoid_cache, net); 484 rv = cache_register_net(nn->nametoid_cache, net);
@@ -598,7 +598,7 @@ numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namel
598 /* Just to make sure it's null-terminated: */ 598 /* Just to make sure it's null-terminated: */
599 memcpy(buf, name, namelen); 599 memcpy(buf, name, namelen);
600 buf[namelen] = '\0'; 600 buf[namelen] = '\0';
601 ret = kstrtouint(name, 10, id); 601 ret = kstrtouint(buf, 10, id);
602 return ret == 0; 602 return ret == 0;
603} 603}
604 604
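Both idmap fixes are copy-paste bugs: the error path took PTR_ERR() of the idtoname cache when the nametoid cache was the one that failed, and numeric_name_to_id() carefully NUL-terminated a copy of the name into buf, then parsed the original unterminated buffer anyway. The corrected parse pattern as a standalone sketch:

    /* Sketch: safely parse a counted (not NUL-terminated) string. */
    static bool counted_str_to_uint(const char *name, size_t namelen, u32 *id)
    {
            char buf[16];

            if (namelen >= sizeof(buf))
                    return false;
            memcpy(buf, name, namelen);
            buf[namelen] = '\0';            /* kstrtouint needs a C string */
            return kstrtouint(buf, 10, id) == 0;
    }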
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index c9c1c0a25417..6c9a4b291dba 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -370,7 +370,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
370 break; 370 break;
371 case NFS4_OPEN_CLAIM_PREVIOUS: 371 case NFS4_OPEN_CLAIM_PREVIOUS:
372 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; 372 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
373 status = nfs4_check_open_reclaim(&open->op_clientid); 373 status = nfs4_check_open_reclaim(&open->op_clientid, cstate->minorversion);
374 if (status) 374 if (status)
375 goto out; 375 goto out;
376 case NFS4_OPEN_CLAIM_FH: 376 case NFS4_OPEN_CLAIM_FH:
@@ -1054,8 +1054,8 @@ struct nfsd4_operation {
1054 char *op_name; 1054 char *op_name;
1055 /* Try to get response size before operation */ 1055 /* Try to get response size before operation */
1056 nfsd4op_rsize op_rsize_bop; 1056 nfsd4op_rsize op_rsize_bop;
1057 stateid_setter op_get_currentstateid; 1057 stateid_getter op_get_currentstateid;
1058 stateid_getter op_set_currentstateid; 1058 stateid_setter op_set_currentstateid;
1059}; 1059};
1060 1060
1061static struct nfsd4_operation nfsd4_ops[]; 1061static struct nfsd4_operation nfsd4_ops[];
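The nfsd4_operation fix swaps two fields that had been declared with each other's types: op_get_currentstateid is the getter and op_set_currentstateid the setter. Since the two typedefs share a signature this is documentation rather than added type safety, but the declaration now says what each slot does; roughly, with hypothetical signatures:

    typedef void (*stateid_getter)(struct nfsd4_compound_state *, void *op);
    typedef void (*stateid_setter)(struct nfsd4_compound_state *, void *op);

    struct nfsd4_operation {
            /* ... */
            stateid_getter op_get_currentstateid;   /* read current stateid */
            stateid_setter op_set_currentstateid;   /* save current stateid */
    };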
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 48a1bad37334..d0237f872cc4 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -758,7 +758,7 @@ static void nfsd4_put_drc_mem(int slotsize, int num)
758 spin_unlock(&nfsd_drc_lock); 758 spin_unlock(&nfsd_drc_lock);
759} 759}
760 760
761static struct nfsd4_session *alloc_session(int slotsize, int numslots) 761static struct nfsd4_session *__alloc_session(int slotsize, int numslots)
762{ 762{
763 struct nfsd4_session *new; 763 struct nfsd4_session *new;
764 int mem, i; 764 int mem, i;
@@ -852,35 +852,28 @@ static int nfsd4_register_conn(struct nfsd4_conn *conn)
852 return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); 852 return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
853} 853}
854 854
855static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses, u32 dir) 855static void nfsd4_init_conn(struct svc_rqst *rqstp, struct nfsd4_conn *conn, struct nfsd4_session *ses)
856{ 856{
857 struct nfsd4_conn *conn;
858 int ret; 857 int ret;
859 858
860 conn = alloc_conn(rqstp, dir);
861 if (!conn)
862 return nfserr_jukebox;
863 nfsd4_hash_conn(conn, ses); 859 nfsd4_hash_conn(conn, ses);
864 ret = nfsd4_register_conn(conn); 860 ret = nfsd4_register_conn(conn);
865 if (ret) 861 if (ret)
866 /* oops; xprt is already down: */ 862 /* oops; xprt is already down: */
867 nfsd4_conn_lost(&conn->cn_xpt_user); 863 nfsd4_conn_lost(&conn->cn_xpt_user);
868 if (ses->se_client->cl_cb_state == NFSD4_CB_DOWN && 864 if (conn->cn_flags & NFS4_CDFC4_BACK) {
869 dir & NFS4_CDFC4_BACK) {
870 /* callback channel may be back up */ 865 /* callback channel may be back up */
871 nfsd4_probe_callback(ses->se_client); 866 nfsd4_probe_callback(ses->se_client);
872 } 867 }
873 return nfs_ok;
874} 868}
875 869
876static __be32 nfsd4_new_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_session *ses) 870static struct nfsd4_conn *alloc_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_create_session *cses)
877{ 871{
878 u32 dir = NFS4_CDFC4_FORE; 872 u32 dir = NFS4_CDFC4_FORE;
879 873
880 if (ses->se_flags & SESSION4_BACK_CHAN) 874 if (cses->flags & SESSION4_BACK_CHAN)
881 dir |= NFS4_CDFC4_BACK; 875 dir |= NFS4_CDFC4_BACK;
882 876 return alloc_conn(rqstp, dir);
883 return nfsd4_new_conn(rqstp, ses, dir);
884} 877}
885 878
886/* must be called under client_lock */ 879/* must be called under client_lock */
@@ -903,20 +896,21 @@ static void nfsd4_del_conns(struct nfsd4_session *s)
903 spin_unlock(&clp->cl_lock); 896 spin_unlock(&clp->cl_lock);
904} 897}
905 898
899static void __free_session(struct nfsd4_session *ses)
900{
901 nfsd4_put_drc_mem(slot_bytes(&ses->se_fchannel), ses->se_fchannel.maxreqs);
902 free_session_slots(ses);
903 kfree(ses);
904}
905
906static void free_session(struct kref *kref) 906static void free_session(struct kref *kref)
907{ 907{
908 struct nfsd4_session *ses; 908 struct nfsd4_session *ses;
909 int mem;
910 909
911 lockdep_assert_held(&client_lock); 910 lockdep_assert_held(&client_lock);
912 ses = container_of(kref, struct nfsd4_session, se_ref); 911 ses = container_of(kref, struct nfsd4_session, se_ref);
913 nfsd4_del_conns(ses); 912 nfsd4_del_conns(ses);
914 spin_lock(&nfsd_drc_lock); 913 __free_session(ses);
915 mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel);
916 nfsd_drc_mem_used -= mem;
917 spin_unlock(&nfsd_drc_lock);
918 free_session_slots(ses);
919 kfree(ses);
920} 914}
921 915
922void nfsd4_put_session(struct nfsd4_session *ses) 916void nfsd4_put_session(struct nfsd4_session *ses)
@@ -926,14 +920,10 @@ void nfsd4_put_session(struct nfsd4_session *ses)
926 spin_unlock(&client_lock); 920 spin_unlock(&client_lock);
927} 921}
928 922
929static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_create_session *cses) 923static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan)
930{ 924{
931 struct nfsd4_session *new; 925 struct nfsd4_session *new;
932 struct nfsd4_channel_attrs *fchan = &cses->fore_channel;
933 int numslots, slotsize; 926 int numslots, slotsize;
934 __be32 status;
935 int idx;
936
937 /* 927 /*
938 * Note decreasing slot size below client's request may 928 * Note decreasing slot size below client's request may
939 * make it difficult for client to function correctly, whereas 929 * make it difficult for client to function correctly, whereas
@@ -946,12 +936,18 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
946 if (numslots < 1) 936 if (numslots < 1)
947 return NULL; 937 return NULL;
948 938
949 new = alloc_session(slotsize, numslots); 939 new = __alloc_session(slotsize, numslots);
950 if (!new) { 940 if (!new) {
951 nfsd4_put_drc_mem(slotsize, fchan->maxreqs); 941 nfsd4_put_drc_mem(slotsize, fchan->maxreqs);
952 return NULL; 942 return NULL;
953 } 943 }
954 init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize); 944 init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize);
945 return new;
946}
947
948static struct nfsd4_session *init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses)
949{
950 int idx;
955 951
956 new->se_client = clp; 952 new->se_client = clp;
957 gen_sessionid(new); 953 gen_sessionid(new);
@@ -970,14 +966,6 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
970 spin_unlock(&clp->cl_lock); 966 spin_unlock(&clp->cl_lock);
971 spin_unlock(&client_lock); 967 spin_unlock(&client_lock);
972 968
973 status = nfsd4_new_conn_from_crses(rqstp, new);
974 /* whoops: benny points out, status is ignored! (err, or bogus) */
975 if (status) {
976 spin_lock(&client_lock);
977 free_session(&new->se_ref);
978 spin_unlock(&client_lock);
979 return NULL;
980 }
981 if (cses->flags & SESSION4_BACK_CHAN) { 969 if (cses->flags & SESSION4_BACK_CHAN) {
982 struct sockaddr *sa = svc_addr(rqstp); 970 struct sockaddr *sa = svc_addr(rqstp);
983 /* 971 /*
@@ -990,7 +978,6 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
990 rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa); 978 rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
991 clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa); 979 clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
992 } 980 }
993 nfsd4_probe_callback(clp);
994 return new; 981 return new;
995} 982}
996 983
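This refactor splits the old all-in-one alloc_init_session(): __alloc_session()/alloc_session() do only the fallible memory and DRC accounting work, init_session() then publishes the session under the locks and cannot fail, and connection setup moves out entirely (alloc_conn_from_crses() allocates, nfsd4_init_conn() hashes and registers). The payoff is that callers can perform every allocation before touching any globally visible state; in sketch form, with hypothetical names:

    static __be32 create(struct svc_rqst *rqstp)
    {
            struct session *s;
            struct conn *c;

            s = session_alloc();            /* may fail; no visible side effects */
            if (!s)
                    return nfserr_jukebox;
            c = conn_alloc(rqstp);          /* may fail; no visible side effects */
            if (!c) {
                    session_free(s);
                    return nfserr_jukebox;
            }
            session_init(s);                /* publishes state; cannot fail */
            conn_init(c, s);                /* registers with the xprt; cannot fail */
            return nfs_ok;
    }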
@@ -1131,7 +1118,7 @@ unhash_client_locked(struct nfs4_client *clp)
1131} 1118}
1132 1119
1133static void 1120static void
1134expire_client(struct nfs4_client *clp) 1121destroy_client(struct nfs4_client *clp)
1135{ 1122{
1136 struct nfs4_openowner *oo; 1123 struct nfs4_openowner *oo;
1137 struct nfs4_delegation *dp; 1124 struct nfs4_delegation *dp;
@@ -1165,6 +1152,12 @@ expire_client(struct nfs4_client *clp)
1165 spin_unlock(&client_lock); 1152 spin_unlock(&client_lock);
1166} 1153}
1167 1154
1155static void expire_client(struct nfs4_client *clp)
1156{
1157 nfsd4_client_record_remove(clp);
1158 destroy_client(clp);
1159}
1160
1168static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) 1161static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
1169{ 1162{
1170 memcpy(target->cl_verifier.data, source->data, 1163 memcpy(target->cl_verifier.data, source->data,
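expire_client() is likewise split so that lease expiry and plain teardown stay distinct: only expiry scrubs the client's stable-storage record, while shutdown paths (see __nfs4_state_shutdown() further down) call destroy_client() directly and leave the record intact so clients can reclaim after a server restart. The division, as added by the patch:

    static void expire_client(struct nfs4_client *clp)
    {
            nfsd4_client_record_remove(clp);   /* forget reboot-recovery record */
            destroy_client(clp);               /* tear down in-memory state */
    }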
@@ -1223,10 +1216,26 @@ static bool groups_equal(struct group_info *g1, struct group_info *g2)
1223 return true; 1216 return true;
1224} 1217}
1225 1218
1219/*
1220 * RFC 3530 language requires clid_inuse be returned when the
1221 * "principal" associated with a requests differs from that previously
1222 * used. We use uid, gid's, and gss principal string as our best
1223 * approximation. We also don't want to allow non-gss use of a client
1224 * established using gss: in theory cr_principal should catch that
1225 * change, but in practice cr_principal can be null even in the gss case
1226 * since gssd doesn't always pass down a principal string.
1227 */
1228static bool is_gss_cred(struct svc_cred *cr)
1229{
1230 /* Is cr_flavor one of the gss "pseudoflavors"?: */
1231 return (cr->cr_flavor > RPC_AUTH_MAXFLAVOR);
1232}
1233
1234
1226static bool 1235static bool
1227same_creds(struct svc_cred *cr1, struct svc_cred *cr2) 1236same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
1228{ 1237{
1229 if ((cr1->cr_flavor != cr2->cr_flavor) 1238 if ((is_gss_cred(cr1) != is_gss_cred(cr2))
1230 || (cr1->cr_uid != cr2->cr_uid) 1239 || (cr1->cr_uid != cr2->cr_uid)
1231 || (cr1->cr_gid != cr2->cr_gid) 1240 || (cr1->cr_gid != cr2->cr_gid)
1232 || !groups_equal(cr1->cr_group_info, cr2->cr_group_info)) 1241 || !groups_equal(cr1->cr_group_info, cr2->cr_group_info))
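same_creds() previously demanded an exact flavor match; comparing is_gss_cred() instead only insists that a client stay on the same side of the GSS/non-GSS line, which is the property the comment above actually cares about (cr_principal cannot be relied on to catch a gss-to-non-gss switch). Illustrative outcomes, assuming matching uid/gids:

    /* AUTH_UNIX vs AUTH_NULL -> same class, creds may compare equal
     * krb5      vs krb5i     -> same class, creds may compare equal
     * AUTH_UNIX vs krb5      -> different classes, never equal
     */
    if (is_gss_cred(cr1) != is_gss_cred(cr2))
            return false;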
@@ -1340,13 +1349,15 @@ move_to_confirmed(struct nfs4_client *clp)
1340} 1349}
1341 1350
1342static struct nfs4_client * 1351static struct nfs4_client *
1343find_confirmed_client(clientid_t *clid) 1352find_confirmed_client(clientid_t *clid, bool sessions)
1344{ 1353{
1345 struct nfs4_client *clp; 1354 struct nfs4_client *clp;
1346 unsigned int idhashval = clientid_hashval(clid->cl_id); 1355 unsigned int idhashval = clientid_hashval(clid->cl_id);
1347 1356
1348 list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) { 1357 list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) {
1349 if (same_clid(&clp->cl_clientid, clid)) { 1358 if (same_clid(&clp->cl_clientid, clid)) {
1359 if ((bool)clp->cl_minorversion != sessions)
1360 return NULL;
1350 renew_client(clp); 1361 renew_client(clp);
1351 return clp; 1362 return clp;
1352 } 1363 }
@@ -1355,14 +1366,17 @@ find_confirmed_client(clientid_t *clid)
1355} 1366}
1356 1367
1357static struct nfs4_client * 1368static struct nfs4_client *
1358find_unconfirmed_client(clientid_t *clid) 1369find_unconfirmed_client(clientid_t *clid, bool sessions)
1359{ 1370{
1360 struct nfs4_client *clp; 1371 struct nfs4_client *clp;
1361 unsigned int idhashval = clientid_hashval(clid->cl_id); 1372 unsigned int idhashval = clientid_hashval(clid->cl_id);
1362 1373
1363 list_for_each_entry(clp, &unconf_id_hashtbl[idhashval], cl_idhash) { 1374 list_for_each_entry(clp, &unconf_id_hashtbl[idhashval], cl_idhash) {
1364 if (same_clid(&clp->cl_clientid, clid)) 1375 if (same_clid(&clp->cl_clientid, clid)) {
1376 if ((bool)clp->cl_minorversion != sessions)
1377 return NULL;
1365 return clp; 1378 return clp;
1379 }
1366 } 1380 }
1367 return NULL; 1381 return NULL;
1368} 1382}
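Client lookup grows a "sessions" flag because NFSv4.0 and NFSv4.1 clients share the same hash tables: the (bool)clp->cl_minorversion != sessions test makes a SETCLIENTID-established (v4.0) client invisible to session-based operations and vice versa. Callers simply pass the compound's minorversion, which C converts to bool:

    /* a v4.1+ caller (minorversion != 0) only ever sees v4.1 clients;
     * a v4.0 caller only ever sees v4.0 clients */
    clp = find_confirmed_client(clid, cstate->minorversion);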
@@ -1651,6 +1665,7 @@ out_new:
1651 status = nfserr_jukebox; 1665 status = nfserr_jukebox;
1652 goto out; 1666 goto out;
1653 } 1667 }
1668 new->cl_minorversion = 1;
1654 1669
1655 gen_clid(new); 1670 gen_clid(new);
1656 add_to_unconfirmed(new, strhashval); 1671 add_to_unconfirmed(new, strhashval);
@@ -1743,67 +1758,71 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1743 struct sockaddr *sa = svc_addr(rqstp); 1758 struct sockaddr *sa = svc_addr(rqstp);
1744 struct nfs4_client *conf, *unconf; 1759 struct nfs4_client *conf, *unconf;
1745 struct nfsd4_session *new; 1760 struct nfsd4_session *new;
1761 struct nfsd4_conn *conn;
1746 struct nfsd4_clid_slot *cs_slot = NULL; 1762 struct nfsd4_clid_slot *cs_slot = NULL;
1747 bool confirm_me = false;
1748 __be32 status = 0; 1763 __be32 status = 0;
1749 1764
1750 if (cr_ses->flags & ~SESSION4_FLAG_MASK_A) 1765 if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
1751 return nfserr_inval; 1766 return nfserr_inval;
1767 if (check_forechannel_attrs(cr_ses->fore_channel))
1768 return nfserr_toosmall;
1769 new = alloc_session(&cr_ses->fore_channel);
1770 if (!new)
1771 return nfserr_jukebox;
1772 status = nfserr_jukebox;
1773 conn = alloc_conn_from_crses(rqstp, cr_ses);
1774 if (!conn)
1775 goto out_free_session;
1752 1776
1753 nfs4_lock_state(); 1777 nfs4_lock_state();
1754 unconf = find_unconfirmed_client(&cr_ses->clientid); 1778 unconf = find_unconfirmed_client(&cr_ses->clientid, true);
1755 conf = find_confirmed_client(&cr_ses->clientid); 1779 conf = find_confirmed_client(&cr_ses->clientid, true);
1756 1780
1757 if (conf) { 1781 if (conf) {
1758 cs_slot = &conf->cl_cs_slot; 1782 cs_slot = &conf->cl_cs_slot;
1759 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); 1783 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
1760 if (status == nfserr_replay_cache) { 1784 if (status == nfserr_replay_cache) {
1761 status = nfsd4_replay_create_session(cr_ses, cs_slot); 1785 status = nfsd4_replay_create_session(cr_ses, cs_slot);
1762 goto out; 1786 goto out_free_conn;
1763 } else if (cr_ses->seqid != cs_slot->sl_seqid + 1) { 1787 } else if (cr_ses->seqid != cs_slot->sl_seqid + 1) {
1764 status = nfserr_seq_misordered; 1788 status = nfserr_seq_misordered;
1765 goto out; 1789 goto out_free_conn;
1766 } 1790 }
1767 } else if (unconf) { 1791 } else if (unconf) {
1792 unsigned int hash;
1793 struct nfs4_client *old;
1768 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || 1794 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
1769 !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { 1795 !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
1770 status = nfserr_clid_inuse; 1796 status = nfserr_clid_inuse;
1771 goto out; 1797 goto out_free_conn;
1772 } 1798 }
1773 cs_slot = &unconf->cl_cs_slot; 1799 cs_slot = &unconf->cl_cs_slot;
1774 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); 1800 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
1775 if (status) { 1801 if (status) {
1776 /* an unconfirmed replay returns misordered */ 1802 /* an unconfirmed replay returns misordered */
1777 status = nfserr_seq_misordered; 1803 status = nfserr_seq_misordered;
1778 goto out; 1804 goto out_free_conn;
1779 } 1805 }
1780 confirm_me = true; 1806 hash = clientstr_hashval(unconf->cl_recdir);
1807 old = find_confirmed_client_by_str(unconf->cl_recdir, hash);
1808 if (old)
1809 expire_client(old);
1810 move_to_confirmed(unconf);
1781 conf = unconf; 1811 conf = unconf;
1782 } else { 1812 } else {
1783 status = nfserr_stale_clientid; 1813 status = nfserr_stale_clientid;
1784 goto out; 1814 goto out_free_conn;
1785 } 1815 }
1786 1816 status = nfs_ok;
1787 /*
1788 * XXX: we should probably set this at creation time, and check
1789 * for consistent minorversion use throughout:
1790 */
1791 conf->cl_minorversion = 1;
1792 /* 1817 /*
1793 * We do not support RDMA or persistent sessions 1818 * We do not support RDMA or persistent sessions
1794 */ 1819 */
1795 cr_ses->flags &= ~SESSION4_PERSIST; 1820 cr_ses->flags &= ~SESSION4_PERSIST;
1796 cr_ses->flags &= ~SESSION4_RDMA; 1821 cr_ses->flags &= ~SESSION4_RDMA;
1797 1822
1798 status = nfserr_toosmall; 1823 init_session(rqstp, new, conf, cr_ses);
1799 if (check_forechannel_attrs(cr_ses->fore_channel)) 1824 nfsd4_init_conn(rqstp, conn, new);
1800 goto out;
1801 1825
1802 status = nfserr_jukebox;
1803 new = alloc_init_session(rqstp, conf, cr_ses);
1804 if (!new)
1805 goto out;
1806 status = nfs_ok;
1807 memcpy(cr_ses->sessionid.data, new->se_sessionid.data, 1826 memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
1808 NFS4_MAX_SESSIONID_LEN); 1827 NFS4_MAX_SESSIONID_LEN);
1809 memcpy(&cr_ses->fore_channel, &new->se_fchannel, 1828 memcpy(&cr_ses->fore_channel, &new->se_fchannel,
@@ -1813,18 +1832,15 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1813 1832
1814 /* cache solo and embedded create sessions under the state lock */ 1833 /* cache solo and embedded create sessions under the state lock */
1815 nfsd4_cache_create_session(cr_ses, cs_slot, status); 1834 nfsd4_cache_create_session(cr_ses, cs_slot, status);
1816 if (confirm_me) {
1817 unsigned int hash = clientstr_hashval(unconf->cl_recdir);
1818 struct nfs4_client *old =
1819 find_confirmed_client_by_str(conf->cl_recdir, hash);
1820 if (old)
1821 expire_client(old);
1822 move_to_confirmed(conf);
1823 }
1824out: 1835out:
1825 nfs4_unlock_state(); 1836 nfs4_unlock_state();
1826 dprintk("%s returns %d\n", __func__, ntohl(status)); 1837 dprintk("%s returns %d\n", __func__, ntohl(status));
1827 return status; 1838 return status;
1839out_free_conn:
1840 free_conn(conn);
1841out_free_session:
1842 __free_session(new);
1843 goto out;
1828} 1844}
1829 1845
1830static bool nfsd4_last_compound_op(struct svc_rqst *rqstp) 1846static bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
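The create_session rewrite front-loads every fallible step (check_forechannel_attrs(), alloc_session(), alloc_conn_from_crses()) before nfs4_lock_state(), and unwinds through out_free_conn/out_free_session on any later failure. The old flow allocated midway through, and alloc_init_session() would free a just-created session when connection setup failed inside it, a path the removed "whoops" comment already distrusted. Condensed, the new flow is the usual goto ladder:

    new = alloc_session(&cr_ses->fore_channel);
    if (!new)
            return nfserr_jukebox;
    conn = alloc_conn_from_crses(rqstp, cr_ses);
    if (!conn)
            goto out_free_session;
    /* ... locked section; failures jump to out_free_conn ... */
    out_free_conn:
            free_conn(conn);
    out_free_session:
            __free_session(new);
            goto out;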
@@ -1854,6 +1870,7 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
1854 struct nfsd4_bind_conn_to_session *bcts) 1870 struct nfsd4_bind_conn_to_session *bcts)
1855{ 1871{
1856 __be32 status; 1872 __be32 status;
1873 struct nfsd4_conn *conn;
1857 1874
1858 if (!nfsd4_last_compound_op(rqstp)) 1875 if (!nfsd4_last_compound_op(rqstp))
1859 return nfserr_not_only_op; 1876 return nfserr_not_only_op;
@@ -1870,9 +1887,13 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
1870 return nfserr_badsession; 1887 return nfserr_badsession;
1871 1888
1872 status = nfsd4_map_bcts_dir(&bcts->dir); 1889 status = nfsd4_map_bcts_dir(&bcts->dir);
1873 if (!status) 1890 if (status)
1874 nfsd4_new_conn(rqstp, cstate->session, bcts->dir); 1891 return status;
1875 return status; 1892 conn = alloc_conn(rqstp, bcts->dir);
1893 if (!conn)
1894 return nfserr_jukebox;
1895 nfsd4_init_conn(rqstp, conn, cstate->session);
1896 return nfs_ok;
1876} 1897}
1877 1898
1878static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) 1899static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
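nfsd4_bind_conn_to_session gets the same correction: the old code called nfsd4_new_conn() only when the dir mapping succeeded, then returned the mapping status regardless, so an allocation failure inside nfsd4_new_conn() was silently reported as success. Now each step's failure is surfaced:

    status = nfsd4_map_bcts_dir(&bcts->dir);
    if (status)
            return status;
    conn = alloc_conn(rqstp, bcts->dir);
    if (!conn)
            return nfserr_jukebox;          /* previously lost */
    nfsd4_init_conn(rqstp, conn, cstate->session);
    return nfs_ok;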
@@ -2085,8 +2106,8 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
2085 __be32 status = 0; 2106 __be32 status = 0;
2086 2107
2087 nfs4_lock_state(); 2108 nfs4_lock_state();
2088 unconf = find_unconfirmed_client(&dc->clientid); 2109 unconf = find_unconfirmed_client(&dc->clientid, true);
2089 conf = find_confirmed_client(&dc->clientid); 2110 conf = find_confirmed_client(&dc->clientid, true);
2090 2111
2091 if (conf) { 2112 if (conf) {
2092 clp = conf; 2113 clp = conf;
@@ -2200,10 +2221,6 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2200 copy_clid(new, conf); 2221 copy_clid(new, conf);
2201 else /* case 4 (new client) or cases 2, 3 (client reboot): */ 2222 else /* case 4 (new client) or cases 2, 3 (client reboot): */
2202 gen_clid(new); 2223 gen_clid(new);
2203 /*
2204 * XXX: we should probably set this at creation time, and check
2205 * for consistent minorversion use throughout:
2206 */
2207 new->cl_minorversion = 0; 2224 new->cl_minorversion = 0;
2208 gen_callback(new, setclid, rqstp); 2225 gen_callback(new, setclid, rqstp);
2209 add_to_unconfirmed(new, strhashval); 2226 add_to_unconfirmed(new, strhashval);
@@ -2232,8 +2249,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
2232 return nfserr_stale_clientid; 2249 return nfserr_stale_clientid;
2233 nfs4_lock_state(); 2250 nfs4_lock_state();
2234 2251
2235 conf = find_confirmed_client(clid); 2252 conf = find_confirmed_client(clid, false);
2236 unconf = find_unconfirmed_client(clid); 2253 unconf = find_unconfirmed_client(clid, false);
2237 /* 2254 /*
2238 * We try hard to give out unique clientid's, so if we get an 2255 * We try hard to give out unique clientid's, so if we get an
2239 * attempt to confirm the same clientid with a different cred, 2256 * attempt to confirm the same clientid with a different cred,
@@ -2262,10 +2279,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
2262 unsigned int hash = clientstr_hashval(unconf->cl_recdir); 2279 unsigned int hash = clientstr_hashval(unconf->cl_recdir);
2263 2280
2264 conf = find_confirmed_client_by_str(unconf->cl_recdir, hash); 2281 conf = find_confirmed_client_by_str(unconf->cl_recdir, hash);
2265 if (conf) { 2282 if (conf)
2266 nfsd4_client_record_remove(conf);
2267 expire_client(conf); 2283 expire_client(conf);
2268 }
2269 move_to_confirmed(unconf); 2284 move_to_confirmed(unconf);
2270 nfsd4_probe_callback(unconf); 2285 nfsd4_probe_callback(unconf);
2271 } 2286 }
@@ -2447,16 +2462,20 @@ same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner,
2447} 2462}
2448 2463
2449static struct nfs4_openowner * 2464static struct nfs4_openowner *
2450find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open) 2465find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open, bool sessions)
2451{ 2466{
2452 struct nfs4_stateowner *so; 2467 struct nfs4_stateowner *so;
2453 struct nfs4_openowner *oo; 2468 struct nfs4_openowner *oo;
2469 struct nfs4_client *clp;
2454 2470
2455 list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) { 2471 list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) {
2456 if (!so->so_is_open_owner) 2472 if (!so->so_is_open_owner)
2457 continue; 2473 continue;
2458 if (same_owner_str(so, &open->op_owner, &open->op_clientid)) { 2474 if (same_owner_str(so, &open->op_owner, &open->op_clientid)) {
2459 oo = openowner(so); 2475 oo = openowner(so);
2476 clp = oo->oo_owner.so_client;
2477 if ((bool)clp->cl_minorversion != sessions)
2478 return NULL;
2460 renew_client(oo->oo_owner.so_client); 2479 renew_client(oo->oo_owner.so_client);
2461 return oo; 2480 return oo;
2462 } 2481 }
@@ -2600,10 +2619,10 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
2600 return nfserr_jukebox; 2619 return nfserr_jukebox;
2601 2620
2602 strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner); 2621 strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner);
2603 oo = find_openstateowner_str(strhashval, open); 2622 oo = find_openstateowner_str(strhashval, open, cstate->minorversion);
2604 open->op_openowner = oo; 2623 open->op_openowner = oo;
2605 if (!oo) { 2624 if (!oo) {
2606 clp = find_confirmed_client(clientid); 2625 clp = find_confirmed_client(clientid, cstate->minorversion);
2607 if (clp == NULL) 2626 if (clp == NULL)
2608 return nfserr_expired; 2627 return nfserr_expired;
2609 goto new_owner; 2628 goto new_owner;
@@ -2705,11 +2724,6 @@ nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_ol_st
2705 return nfs_ok; 2724 return nfs_ok;
2706} 2725}
2707 2726
2708static void nfs4_free_stateid(struct nfs4_ol_stateid *s)
2709{
2710 kmem_cache_free(stateid_slab, s);
2711}
2712
2713static inline int nfs4_access_to_access(u32 nfs4_access) 2727static inline int nfs4_access_to_access(u32 nfs4_access)
2714{ 2728{
2715 int flags = 0; 2729 int flags = 0;
@@ -3087,7 +3101,7 @@ void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status)
3087 if (open->op_file) 3101 if (open->op_file)
3088 nfsd4_free_file(open->op_file); 3102 nfsd4_free_file(open->op_file);
3089 if (open->op_stp) 3103 if (open->op_stp)
3090 nfs4_free_stateid(open->op_stp); 3104 free_generic_stateid(open->op_stp);
3091} 3105}
3092 3106
3093__be32 3107__be32
@@ -3104,7 +3118,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3104 status = nfserr_stale_clientid; 3118 status = nfserr_stale_clientid;
3105 if (STALE_CLIENTID(clid, nn)) 3119 if (STALE_CLIENTID(clid, nn))
3106 goto out; 3120 goto out;
3107 clp = find_confirmed_client(clid); 3121 clp = find_confirmed_client(clid, cstate->minorversion);
3108 status = nfserr_expired; 3122 status = nfserr_expired;
3109 if (clp == NULL) { 3123 if (clp == NULL) {
3110 /* We assume the client took too long to RENEW. */ 3124 /* We assume the client took too long to RENEW. */
@@ -3180,7 +3194,6 @@ nfs4_laundromat(void)
3180 clp = list_entry(pos, struct nfs4_client, cl_lru); 3194 clp = list_entry(pos, struct nfs4_client, cl_lru);
3181 dprintk("NFSD: purging unused client (clientid %08x)\n", 3195 dprintk("NFSD: purging unused client (clientid %08x)\n",
3182 clp->cl_clientid.cl_id); 3196 clp->cl_clientid.cl_id);
3183 nfsd4_client_record_remove(clp);
3184 expire_client(clp); 3197 expire_client(clp);
3185 } 3198 }
3186 spin_lock(&recall_lock); 3199 spin_lock(&recall_lock);
@@ -3372,7 +3385,7 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
3372 return nfs_ok; 3385 return nfs_ok;
3373} 3386}
3374 3387
3375static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s) 3388static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s, bool sessions)
3376{ 3389{
3377 struct nfs4_client *cl; 3390 struct nfs4_client *cl;
3378 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); 3391 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
@@ -3381,7 +3394,7 @@ static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, s
3381 return nfserr_bad_stateid; 3394 return nfserr_bad_stateid;
3382 if (STALE_STATEID(stateid, nn)) 3395 if (STALE_STATEID(stateid, nn))
3383 return nfserr_stale_stateid; 3396 return nfserr_stale_stateid;
3384 cl = find_confirmed_client(&stateid->si_opaque.so_clid); 3397 cl = find_confirmed_client(&stateid->si_opaque.so_clid, sessions);
3385 if (!cl) 3398 if (!cl)
3386 return nfserr_expired; 3399 return nfserr_expired;
3387 *s = find_stateid_by_type(cl, stateid, typemask); 3400 *s = find_stateid_by_type(cl, stateid, typemask);
@@ -3414,7 +3427,7 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
3414 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) 3427 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
3415 return check_special_stateids(net, current_fh, stateid, flags); 3428 return check_special_stateids(net, current_fh, stateid, flags);
3416 3429
3417 status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s); 3430 status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s, cstate->minorversion);
3418 if (status) 3431 if (status)
3419 return status; 3432 return status;
3420 status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate)); 3433 status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate));
@@ -3564,7 +3577,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
3564 seqid, STATEID_VAL(stateid)); 3577 seqid, STATEID_VAL(stateid));
3565 3578
3566 *stpp = NULL; 3579 *stpp = NULL;
3567 status = nfsd4_lookup_stateid(stateid, typemask, &s); 3580 status = nfsd4_lookup_stateid(stateid, typemask, &s, cstate->minorversion);
3568 if (status) 3581 if (status)
3569 return status; 3582 return status;
3570 *stpp = openlockstateid(s); 3583 *stpp = openlockstateid(s);
@@ -3765,6 +3778,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3765 memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); 3778 memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
3766 3779
3767 nfsd4_close_open_stateid(stp); 3780 nfsd4_close_open_stateid(stp);
3781 release_last_closed_stateid(oo);
3768 oo->oo_last_closed_stid = stp; 3782 oo->oo_last_closed_stid = stp;
3769 3783
3770 if (list_empty(&oo->oo_owner.so_stateids)) { 3784 if (list_empty(&oo->oo_owner.so_stateids)) {
@@ -3801,7 +3815,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3801 inode = cstate->current_fh.fh_dentry->d_inode; 3815 inode = cstate->current_fh.fh_dentry->d_inode;
3802 3816
3803 nfs4_lock_state(); 3817 nfs4_lock_state();
3804 status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s); 3818 status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s, cstate->minorversion);
3805 if (status) 3819 if (status)
3806 goto out; 3820 goto out;
3807 dp = delegstateid(s); 3821 dp = delegstateid(s);
@@ -4045,8 +4059,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4045 struct nfs4_lockowner *lock_sop = NULL; 4059 struct nfs4_lockowner *lock_sop = NULL;
4046 struct nfs4_ol_stateid *lock_stp; 4060 struct nfs4_ol_stateid *lock_stp;
4047 struct file *filp = NULL; 4061 struct file *filp = NULL;
4048 struct file_lock file_lock; 4062 struct file_lock *file_lock = NULL;
4049 struct file_lock conflock; 4063 struct file_lock *conflock = NULL;
4050 __be32 status = 0; 4064 __be32 status = 0;
4051 bool new_state = false; 4065 bool new_state = false;
4052 int lkflg; 4066 int lkflg;
@@ -4116,21 +4130,28 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4116 if (!locks_in_grace(SVC_NET(rqstp)) && lock->lk_reclaim) 4130 if (!locks_in_grace(SVC_NET(rqstp)) && lock->lk_reclaim)
4117 goto out; 4131 goto out;
4118 4132
4119 locks_init_lock(&file_lock); 4133 file_lock = locks_alloc_lock();
4134 if (!file_lock) {
4135 dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
4136 status = nfserr_jukebox;
4137 goto out;
4138 }
4139
4140 locks_init_lock(file_lock);
4120 switch (lock->lk_type) { 4141 switch (lock->lk_type) {
4121 case NFS4_READ_LT: 4142 case NFS4_READ_LT:
4122 case NFS4_READW_LT: 4143 case NFS4_READW_LT:
4123 filp = find_readable_file(lock_stp->st_file); 4144 filp = find_readable_file(lock_stp->st_file);
4124 if (filp) 4145 if (filp)
4125 get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ); 4146 get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ);
4126 file_lock.fl_type = F_RDLCK; 4147 file_lock->fl_type = F_RDLCK;
4127 break; 4148 break;
4128 case NFS4_WRITE_LT: 4149 case NFS4_WRITE_LT:
4129 case NFS4_WRITEW_LT: 4150 case NFS4_WRITEW_LT:
4130 filp = find_writeable_file(lock_stp->st_file); 4151 filp = find_writeable_file(lock_stp->st_file);
4131 if (filp) 4152 if (filp)
4132 get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE); 4153 get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE);
4133 file_lock.fl_type = F_WRLCK; 4154 file_lock->fl_type = F_WRLCK;
4134 break; 4155 break;
4135 default: 4156 default:
4136 status = nfserr_inval; 4157 status = nfserr_inval;
@@ -4140,22 +4161,23 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4140 status = nfserr_openmode; 4161 status = nfserr_openmode;
4141 goto out; 4162 goto out;
4142 } 4163 }
4143 file_lock.fl_owner = (fl_owner_t)lock_sop; 4164 file_lock->fl_owner = (fl_owner_t)lock_sop;
4144 file_lock.fl_pid = current->tgid; 4165 file_lock->fl_pid = current->tgid;
4145 file_lock.fl_file = filp; 4166 file_lock->fl_file = filp;
4146 file_lock.fl_flags = FL_POSIX; 4167 file_lock->fl_flags = FL_POSIX;
4147 file_lock.fl_lmops = &nfsd_posix_mng_ops; 4168 file_lock->fl_lmops = &nfsd_posix_mng_ops;
4148 4169 file_lock->fl_start = lock->lk_offset;
4149 file_lock.fl_start = lock->lk_offset; 4170 file_lock->fl_end = last_byte_offset(lock->lk_offset, lock->lk_length);
4150 file_lock.fl_end = last_byte_offset(lock->lk_offset, lock->lk_length); 4171 nfs4_transform_lock_offset(file_lock);
4151 nfs4_transform_lock_offset(&file_lock); 4172
4152 4173 conflock = locks_alloc_lock();
4153 /* 4174 if (!conflock) {
4154 * Try to lock the file in the VFS. 4175 dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
4155 * Note: locks.c uses the BKL to protect the inode's lock list. 4176 status = nfserr_jukebox;
4156 */ 4177 goto out;
4178 }
4157 4179
4158 err = vfs_lock_file(filp, F_SETLK, &file_lock, &conflock); 4180 err = vfs_lock_file(filp, F_SETLK, file_lock, conflock);
4159 switch (-err) { 4181 switch (-err) {
4160 case 0: /* success! */ 4182 case 0: /* success! */
4161 update_stateid(&lock_stp->st_stid.sc_stateid); 4183 update_stateid(&lock_stp->st_stid.sc_stateid);
@@ -4166,7 +4188,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4166 case (EAGAIN): /* conflock holds conflicting lock */ 4188 case (EAGAIN): /* conflock holds conflicting lock */
4167 status = nfserr_denied; 4189 status = nfserr_denied;
4168 dprintk("NFSD: nfsd4_lock: conflicting lock found!\n"); 4190 dprintk("NFSD: nfsd4_lock: conflicting lock found!\n");
4169 nfs4_set_lock_denied(&conflock, &lock->lk_denied); 4191 nfs4_set_lock_denied(conflock, &lock->lk_denied);
4170 break; 4192 break;
4171 case (EDEADLK): 4193 case (EDEADLK):
4172 status = nfserr_deadlock; 4194 status = nfserr_deadlock;
@@ -4181,6 +4203,10 @@ out:
4181 release_lockowner(lock_sop); 4203 release_lockowner(lock_sop);
4182 if (!cstate->replay_owner) 4204 if (!cstate->replay_owner)
4183 nfs4_unlock_state(); 4205 nfs4_unlock_state();
4206 if (file_lock)
4207 locks_free_lock(file_lock);
4208 if (conflock)
4209 locks_free_lock(conflock);
4184 return status; 4210 return status;
4185} 4211}
4186 4212
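All three lock paths (LOCK above, LOCKT and LOCKU below) switch from a stack struct file_lock to locks_alloc_lock(). struct file_lock is large, and once handed to the lock manager it can be reached again via the fl_lmops callbacks, so stack storage was fragile; an allocation failure maps to nfserr_jukebox (NFS4ERR_DELAY), telling the client to retry. The conversion's shape:

    struct file_lock *fl = locks_alloc_lock();
    if (!fl)
            return nfserr_jukebox;          /* client retries later */
    locks_init_lock(fl);
    fl->fl_type = F_RDLCK;                  /* plus owner, range, flags... */
    err = vfs_lock_file(filp, F_SETLK, fl, NULL);
    locks_free_lock(fl);                    /* the VFS keeps its own copy */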
@@ -4209,7 +4235,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4209 struct nfsd4_lockt *lockt) 4235 struct nfsd4_lockt *lockt)
4210{ 4236{
4211 struct inode *inode; 4237 struct inode *inode;
4212 struct file_lock file_lock; 4238 struct file_lock *file_lock = NULL;
4213 struct nfs4_lockowner *lo; 4239 struct nfs4_lockowner *lo;
4214 __be32 status; 4240 __be32 status;
4215 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); 4241 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
@@ -4230,15 +4256,21 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4230 goto out; 4256 goto out;
4231 4257
4232 inode = cstate->current_fh.fh_dentry->d_inode; 4258 inode = cstate->current_fh.fh_dentry->d_inode;
4233 locks_init_lock(&file_lock); 4259 file_lock = locks_alloc_lock();
4260 if (!file_lock) {
4261 dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
4262 status = nfserr_jukebox;
4263 goto out;
4264 }
4265 locks_init_lock(file_lock);
4234 switch (lockt->lt_type) { 4266 switch (lockt->lt_type) {
4235 case NFS4_READ_LT: 4267 case NFS4_READ_LT:
4236 case NFS4_READW_LT: 4268 case NFS4_READW_LT:
4237 file_lock.fl_type = F_RDLCK; 4269 file_lock->fl_type = F_RDLCK;
4238 break; 4270 break;
4239 case NFS4_WRITE_LT: 4271 case NFS4_WRITE_LT:
4240 case NFS4_WRITEW_LT: 4272 case NFS4_WRITEW_LT:
4241 file_lock.fl_type = F_WRLCK; 4273 file_lock->fl_type = F_WRLCK;
4242 break; 4274 break;
4243 default: 4275 default:
4244 dprintk("NFSD: nfs4_lockt: bad lock type!\n"); 4276 dprintk("NFSD: nfs4_lockt: bad lock type!\n");
@@ -4248,25 +4280,27 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4248 4280
4249 lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner); 4281 lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner);
4250 if (lo) 4282 if (lo)
4251 file_lock.fl_owner = (fl_owner_t)lo; 4283 file_lock->fl_owner = (fl_owner_t)lo;
4252 file_lock.fl_pid = current->tgid; 4284 file_lock->fl_pid = current->tgid;
4253 file_lock.fl_flags = FL_POSIX; 4285 file_lock->fl_flags = FL_POSIX;
4254 4286
4255 file_lock.fl_start = lockt->lt_offset; 4287 file_lock->fl_start = lockt->lt_offset;
4256 file_lock.fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length); 4288 file_lock->fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length);
4257 4289
4258 nfs4_transform_lock_offset(&file_lock); 4290 nfs4_transform_lock_offset(file_lock);
4259 4291
4260 status = nfsd_test_lock(rqstp, &cstate->current_fh, &file_lock); 4292 status = nfsd_test_lock(rqstp, &cstate->current_fh, file_lock);
4261 if (status) 4293 if (status)
4262 goto out; 4294 goto out;
4263 4295
4264 if (file_lock.fl_type != F_UNLCK) { 4296 if (file_lock->fl_type != F_UNLCK) {
4265 status = nfserr_denied; 4297 status = nfserr_denied;
4266 nfs4_set_lock_denied(&file_lock, &lockt->lt_denied); 4298 nfs4_set_lock_denied(file_lock, &lockt->lt_denied);
4267 } 4299 }
4268out: 4300out:
4269 nfs4_unlock_state(); 4301 nfs4_unlock_state();
4302 if (file_lock)
4303 locks_free_lock(file_lock);
4270 return status; 4304 return status;
4271} 4305}
4272 4306
@@ -4276,7 +4310,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4276{ 4310{
4277 struct nfs4_ol_stateid *stp; 4311 struct nfs4_ol_stateid *stp;
4278 struct file *filp = NULL; 4312 struct file *filp = NULL;
4279 struct file_lock file_lock; 4313 struct file_lock *file_lock = NULL;
4280 __be32 status; 4314 __be32 status;
4281 int err; 4315 int err;
4282 4316
@@ -4298,23 +4332,29 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4298 status = nfserr_lock_range; 4332 status = nfserr_lock_range;
4299 goto out; 4333 goto out;
4300 } 4334 }
4301 BUG_ON(!filp); 4335 file_lock = locks_alloc_lock();
4302 locks_init_lock(&file_lock); 4336 if (!file_lock) {
4303 file_lock.fl_type = F_UNLCK; 4337 dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
4304 file_lock.fl_owner = (fl_owner_t)lockowner(stp->st_stateowner); 4338 status = nfserr_jukebox;
4305 file_lock.fl_pid = current->tgid; 4339 goto out;
4306 file_lock.fl_file = filp; 4340 }
4307 file_lock.fl_flags = FL_POSIX; 4341 locks_init_lock(file_lock);
4308 file_lock.fl_lmops = &nfsd_posix_mng_ops; 4342 file_lock->fl_type = F_UNLCK;
4309 file_lock.fl_start = locku->lu_offset; 4343 file_lock->fl_owner = (fl_owner_t)lockowner(stp->st_stateowner);
4310 4344 file_lock->fl_pid = current->tgid;
4311 file_lock.fl_end = last_byte_offset(locku->lu_offset, locku->lu_length); 4345 file_lock->fl_file = filp;
4312 nfs4_transform_lock_offset(&file_lock); 4346 file_lock->fl_flags = FL_POSIX;
4347 file_lock->fl_lmops = &nfsd_posix_mng_ops;
4348 file_lock->fl_start = locku->lu_offset;
4349
4350 file_lock->fl_end = last_byte_offset(locku->lu_offset,
4351 locku->lu_length);
4352 nfs4_transform_lock_offset(file_lock);
4313 4353
4314 /* 4354 /*
4315 * Try to unlock the file in the VFS. 4355 * Try to unlock the file in the VFS.
4316 */ 4356 */
4317 err = vfs_lock_file(filp, F_SETLK, &file_lock, NULL); 4357 err = vfs_lock_file(filp, F_SETLK, file_lock, NULL);
4318 if (err) { 4358 if (err) {
4319 dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n"); 4359 dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n");
4320 goto out_nfserr; 4360 goto out_nfserr;
@@ -4328,6 +4368,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4328out: 4368out:
4329 if (!cstate->replay_owner) 4369 if (!cstate->replay_owner)
4330 nfs4_unlock_state(); 4370 nfs4_unlock_state();
4371 if (file_lock)
4372 locks_free_lock(file_lock);
4331 return status; 4373 return status;
4332 4374
4333out_nfserr: 4375out_nfserr:
@@ -4501,12 +4543,12 @@ nfsd4_find_reclaim_client(struct nfs4_client *clp)
4501* Called from OPEN. Look for clientid in reclaim list. 4543* Called from OPEN. Look for clientid in reclaim list.
4502*/ 4544*/
4503__be32 4545__be32
4504nfs4_check_open_reclaim(clientid_t *clid) 4546nfs4_check_open_reclaim(clientid_t *clid, bool sessions)
4505{ 4547{
4506 struct nfs4_client *clp; 4548 struct nfs4_client *clp;
4507 4549
4508 /* find clientid in conf_id_hashtbl */ 4550 /* find clientid in conf_id_hashtbl */
4509 clp = find_confirmed_client(clid); 4551 clp = find_confirmed_client(clid, sessions);
4510 if (clp == NULL) 4552 if (clp == NULL)
4511 return nfserr_reclaim_bad; 4553 return nfserr_reclaim_bad;
4512 4554
@@ -4522,7 +4564,6 @@ void nfsd_forget_clients(u64 num)
4522 4564
4523 nfs4_lock_state(); 4565 nfs4_lock_state();
4524 list_for_each_entry_safe(clp, next, &client_lru, cl_lru) { 4566 list_for_each_entry_safe(clp, next, &client_lru, cl_lru) {
4525 nfsd4_client_record_remove(clp);
4526 expire_client(clp); 4567 expire_client(clp);
4527 if (++count == num) 4568 if (++count == num)
4528 break; 4569 break;
@@ -4582,7 +4623,7 @@ void nfsd_forget_openowners(u64 num)
4582 printk(KERN_INFO "NFSD: Forgot %d open owners", count); 4623 printk(KERN_INFO "NFSD: Forgot %d open owners", count);
4583} 4624}
4584 4625
4585int nfsd_process_n_delegations(u64 num, struct list_head *list) 4626static int nfsd_process_n_delegations(u64 num, struct list_head *list)
4586{ 4627{
4587 int i, count = 0; 4628 int i, count = 0;
4588 struct nfs4_file *fp, *fnext; 4629 struct nfs4_file *fp, *fnext;
@@ -4747,11 +4788,11 @@ __nfs4_state_shutdown(void)
4747 for (i = 0; i < CLIENT_HASH_SIZE; i++) { 4788 for (i = 0; i < CLIENT_HASH_SIZE; i++) {
4748 while (!list_empty(&conf_id_hashtbl[i])) { 4789 while (!list_empty(&conf_id_hashtbl[i])) {
4749 clp = list_entry(conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); 4790 clp = list_entry(conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash);
4750 expire_client(clp); 4791 destroy_client(clp);
4751 } 4792 }
4752 while (!list_empty(&unconf_str_hashtbl[i])) { 4793 while (!list_empty(&unconf_str_hashtbl[i])) {
4753 clp = list_entry(unconf_str_hashtbl[i].next, struct nfs4_client, cl_strhash); 4794 clp = list_entry(unconf_str_hashtbl[i].next, struct nfs4_client, cl_strhash);
4754 expire_client(clp); 4795 destroy_client(clp);
4755 } 4796 }
4756 } 4797 }
4757 INIT_LIST_HEAD(&reaplist); 4798 INIT_LIST_HEAD(&reaplist);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 6322df36031f..fd548d155088 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2659,7 +2659,7 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp,
2659 RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 8); 2659 RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 8);
2660 WRITEMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); 2660 WRITEMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
2661 WRITE32(bcts->dir); 2661 WRITE32(bcts->dir);
2662 /* XXX: ? */ 2662 /* Sorry, we do not yet support RDMA over 4.1: */
2663 WRITE32(0); 2663 WRITE32(0);
2664 ADJUST_ARGS(); 2664 ADJUST_ARGS();
2665 } 2665 }
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index fa49cff5ee65..dab350dfc376 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -406,7 +406,7 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
406 return rv; 406 return rv;
407 if (newthreads < 0) 407 if (newthreads < 0)
408 return -EINVAL; 408 return -EINVAL;
409 rv = nfsd_svc(NFS_PORT, newthreads); 409 rv = nfsd_svc(newthreads);
410 if (rv < 0) 410 if (rv < 0)
411 return rv; 411 return rv;
412 } else 412 } else
@@ -683,25 +683,6 @@ static ssize_t __write_ports_addfd(char *buf)
683} 683}
684 684
685/* 685/*
686 * A '-' followed by the 'name' of a socket means we close the socket.
687 */
688static ssize_t __write_ports_delfd(char *buf)
689{
690 char *toclose;
691 int len = 0;
692
693 toclose = kstrdup(buf + 1, GFP_KERNEL);
694 if (toclose == NULL)
695 return -ENOMEM;
696
697 if (nfsd_serv != NULL)
698 len = svc_sock_names(nfsd_serv, buf,
699 SIMPLE_TRANSACTION_LIMIT, toclose);
700 kfree(toclose);
701 return len;
702}
703
704/*
705 * A transport listener is added by writing its transport name and 686 * A transport listener is added by writing its transport name and
706 * a port number. 687 * a port number.
707 */ 688 */
@@ -712,7 +693,7 @@ static ssize_t __write_ports_addxprt(char *buf)
712 int port, err; 693 int port, err;
713 struct net *net = &init_net; 694 struct net *net = &init_net;
714 695
715 if (sscanf(buf, "%15s %4u", transport, &port) != 2) 696 if (sscanf(buf, "%15s %5u", transport, &port) != 2)
716 return -EINVAL; 697 return -EINVAL;
717 698
718 if (port < 1 || port > USHRT_MAX) 699 if (port < 1 || port > USHRT_MAX)
@@ -746,31 +727,6 @@ out_err:
746 return err; 727 return err;
747} 728}
748 729
749/*
750 * A transport listener is removed by writing a "-", its transport
751 * name, and its port number.
752 */
753static ssize_t __write_ports_delxprt(char *buf)
754{
755 struct svc_xprt *xprt;
756 char transport[16];
757 int port;
758
759 if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2)
760 return -EINVAL;
761
762 if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL)
763 return -EINVAL;
764
765 xprt = svc_find_xprt(nfsd_serv, transport, &init_net, AF_UNSPEC, port);
766 if (xprt == NULL)
767 return -ENOTCONN;
768
769 svc_close_xprt(xprt);
770 svc_xprt_put(xprt);
771 return 0;
772}
773
774static ssize_t __write_ports(struct file *file, char *buf, size_t size) 730static ssize_t __write_ports(struct file *file, char *buf, size_t size)
775{ 731{
776 if (size == 0) 732 if (size == 0)
@@ -779,15 +735,9 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
779 if (isdigit(buf[0])) 735 if (isdigit(buf[0]))
780 return __write_ports_addfd(buf); 736 return __write_ports_addfd(buf);
781 737
782 if (buf[0] == '-' && isdigit(buf[1]))
783 return __write_ports_delfd(buf);
784
785 if (isalpha(buf[0])) 738 if (isalpha(buf[0]))
786 return __write_ports_addxprt(buf); 739 return __write_ports_addxprt(buf);
787 740
788 if (buf[0] == '-' && isalpha(buf[1]))
789 return __write_ports_delxprt(buf);
790
791 return -EINVAL; 741 return -EINVAL;
792} 742}
793 743
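With the "-" forms gone, the portlist interface is add-only: a leading digit adds an already-open socket by file descriptor, a leading letter adds a named transport. Note the quiet sscanf fix in __write_ports_addxprt() as well: a %4u field width truncated five-digit port numbers (65535 parsed as 6553, with sscanf still returning 2), so %5u is needed to cover the full range ahead of the explicit port < 1 || port > USHRT_MAX check:

    char transport[16];
    unsigned int port;

    /* "%15s %5u": up to 15 name chars plus NUL, up to 5 port digits */
    if (sscanf(buf, "%15s %5u", transport, &port) != 2)
            return -EINVAL;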
@@ -825,21 +775,6 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
825 * OR 775 * OR
826 * 776 *
827 * Input: 777 * Input:
828 * buf: C string containing a "-" followed
829 * by an integer value representing a
830 * previously passed in socket file
831 * descriptor
832 * size: non-zero length of C string in @buf
833 * Output:
834 * On success: NFS service no longer listens on that socket;
835 * passed-in buffer filled with a '\n'-terminated C
836 * string containing a unique name of the listener;
837 * return code is the size in bytes of the string
838 * On error: return code is a negative errno value
839 *
840 * OR
841 *
842 * Input:
843 * buf: C string containing a transport 778 * buf: C string containing a transport
844 * name and an unsigned integer value 779 * name and an unsigned integer value
845 * representing the port to listen on, 780 * representing the port to listen on,
@@ -848,19 +783,6 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
848 * Output: 783 * Output:
849 * On success: returns zero; NFS service is started 784 * On success: returns zero; NFS service is started
850 * On error: return code is a negative errno value 785 * On error: return code is a negative errno value
851 *
852 * OR
853 *
854 * Input:
855 * buf: C string containing a "-" followed
856 * by a transport name and an unsigned
857 * integer value representing the port
858 * to listen on, separated by whitespace
859 * size: non-zero length of C string in @buf
860 * Output:
861 * On success: returns zero; NFS service no longer listens
862 * on that transport
863 * On error: return code is a negative errno value
864 */ 786 */
865static ssize_t write_ports(struct file *file, char *buf, size_t size) 787static ssize_t write_ports(struct file *file, char *buf, size_t size)
866{ 788{
@@ -1008,8 +930,6 @@ static ssize_t write_gracetime(struct file *file, char *buf, size_t size)
1008 return nfsd4_write_time(file, buf, size, &nfsd4_grace); 930 return nfsd4_write_time(file, buf, size, &nfsd4_grace);
1009} 931}
1010 932
1011extern char *nfs4_recoverydir(void);
1012
1013static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size) 933static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
1014{ 934{
1015 char *mesg = buf; 935 char *mesg = buf;
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 2244222368ab..80d5ce40aadb 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -65,7 +65,7 @@ extern const struct seq_operations nfs_exports_op;
65/* 65/*
66 * Function prototypes. 66 * Function prototypes.
67 */ 67 */
68int nfsd_svc(unsigned short port, int nrservs); 68int nfsd_svc(int nrservs);
69int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp); 69int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp);
70 70
71int nfsd_nrthreads(void); 71int nfsd_nrthreads(void);
@@ -124,6 +124,7 @@ int nfs4_state_start(void);
124void nfs4_state_shutdown(void); 124void nfs4_state_shutdown(void);
125void nfs4_reset_lease(time_t leasetime); 125void nfs4_reset_lease(time_t leasetime);
126int nfs4_reset_recoverydir(char *recdir); 126int nfs4_reset_recoverydir(char *recdir);
127char * nfs4_recoverydir(void);
127#else 128#else
128static inline void nfs4_state_init(void) { } 129static inline void nfs4_state_init(void) { }
129static inline int nfsd4_init_slabs(void) { return 0; } 130static inline int nfsd4_init_slabs(void) { return 0; }
@@ -132,6 +133,7 @@ static inline int nfs4_state_start(void) { return 0; }
132static inline void nfs4_state_shutdown(void) { } 133static inline void nfs4_state_shutdown(void) { }
133static inline void nfs4_reset_lease(time_t leasetime) { } 134static inline void nfs4_reset_lease(time_t leasetime) { }
134static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } 135static inline int nfs4_reset_recoverydir(char *recdir) { return 0; }
136static inline char * nfs4_recoverydir(void) {return NULL; }
135#endif 137#endif
136 138
137/* 139/*
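Rather than an ad-hoc extern buried in nfsctl.c, nfs4_recoverydir() is now declared once here beside the rest of the NFSv4 state interface, with a static inline stub for !CONFIG_NFSD_V4 builds so callers need no #ifdefs of their own. The standard pattern:

    #ifdef CONFIG_NFSD_V4
    char *nfs4_recoverydir(void);
    #else
    static inline char *nfs4_recoverydir(void) { return NULL; }
    #endif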
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 240473cb708f..2013aa001dab 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -183,18 +183,18 @@ int nfsd_nrthreads(void)
183 return rv; 183 return rv;
184} 184}
185 185
186static int nfsd_init_socks(int port) 186static int nfsd_init_socks(void)
187{ 187{
188 int error; 188 int error;
189 if (!list_empty(&nfsd_serv->sv_permsocks)) 189 if (!list_empty(&nfsd_serv->sv_permsocks))
190 return 0; 190 return 0;
191 191
192 error = svc_create_xprt(nfsd_serv, "udp", &init_net, PF_INET, port, 192 error = svc_create_xprt(nfsd_serv, "udp", &init_net, PF_INET, NFS_PORT,
193 SVC_SOCK_DEFAULTS); 193 SVC_SOCK_DEFAULTS);
194 if (error < 0) 194 if (error < 0)
195 return error; 195 return error;
196 196
197 error = svc_create_xprt(nfsd_serv, "tcp", &init_net, PF_INET, port, 197 error = svc_create_xprt(nfsd_serv, "tcp", &init_net, PF_INET, NFS_PORT,
198 SVC_SOCK_DEFAULTS); 198 SVC_SOCK_DEFAULTS);
199 if (error < 0) 199 if (error < 0)
200 return error; 200 return error;
@@ -204,7 +204,7 @@ static int nfsd_init_socks(int port)
204 204
205static bool nfsd_up = false; 205static bool nfsd_up = false;
206 206
207static int nfsd_startup(unsigned short port, int nrservs) 207static int nfsd_startup(int nrservs)
208{ 208{
209 int ret; 209 int ret;
210 210
@@ -218,7 +218,7 @@ static int nfsd_startup(unsigned short port, int nrservs)
218 ret = nfsd_racache_init(2*nrservs); 218 ret = nfsd_racache_init(2*nrservs);
219 if (ret) 219 if (ret)
220 return ret; 220 return ret;
221 ret = nfsd_init_socks(port); 221 ret = nfsd_init_socks();
222 if (ret) 222 if (ret)
223 goto out_racache; 223 goto out_racache;
224 ret = lockd_up(&init_net); 224 ret = lockd_up(&init_net);
@@ -436,7 +436,7 @@ int nfsd_set_nrthreads(int n, int *nthreads)
436 * this is the first time nrservs is nonzero. 436 * this is the first time nrservs is nonzero.
437 */ 437 */
438int 438int
439nfsd_svc(unsigned short port, int nrservs) 439nfsd_svc(int nrservs)
440{ 440{
441 int error; 441 int error;
442 bool nfsd_up_before; 442 bool nfsd_up_before;
@@ -458,7 +458,7 @@ nfsd_svc(unsigned short port, int nrservs)
458 458
459 nfsd_up_before = nfsd_up; 459 nfsd_up_before = nfsd_up;
460 460
461 error = nfsd_startup(port, nrservs); 461 error = nfsd_startup(nrservs);
462 if (error) 462 if (error)
463 goto out_destroy; 463 goto out_destroy;
464 error = svc_set_num_threads(nfsd_serv, NULL, nrservs); 464 error = svc_set_num_threads(nfsd_serv, NULL, nrservs);
@@ -487,7 +487,7 @@ static int
487nfsd(void *vrqstp) 487nfsd(void *vrqstp)
488{ 488{
489 struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; 489 struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
490 int err, preverr = 0; 490 int err;
491 491
492 /* Lock module and set up kernel thread */ 492 /* Lock module and set up kernel thread */
493 mutex_lock(&nfsd_mutex); 493 mutex_lock(&nfsd_mutex);
@@ -534,16 +534,6 @@ nfsd(void *vrqstp)
534 ; 534 ;
535 if (err == -EINTR) 535 if (err == -EINTR)
536 break; 536 break;
537 else if (err < 0) {
538 if (err != preverr) {
539 printk(KERN_WARNING "%s: unexpected error "
540 "from svc_recv (%d)\n", __func__, -err);
541 preverr = err;
542 }
543 schedule_timeout_uninterruptible(HZ);
544 continue;
545 }
546
547 validate_process_creds(); 537 validate_process_creds();
548 svc_process(rqstp); 538 svc_process(rqstp);
549 validate_process_creds(); 539 validate_process_creds();
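
The nfssvc.c changes drop two things: the port plumbing (nfsd_init_socks() now always creates its UDP and TCP transports on NFS_PORT) and the preverr branch in the nfsd() thread loop, which used to warn and back off for a second on unexpected svc_recv() errors. After the patch the loop exits only on -EINTR, which implies (an inference of this note, not stated in the hunk) that other errors are retried or absorbed inside svc_recv() itself. Paraphrased skeleton of the resulting loop, assuming the -EAGAIN retry just above the visible context is unchanged:

for (;;) {
	while ((err = svc_recv(rqstp, 60*60*HZ)) == -EAGAIN)
		;			/* benign: wait for work again */
	if (err == -EINTR)
		break;			/* shutting down */
	validate_process_creds();
	svc_process(rqstp);		/* dispatch one NFS request */
	validate_process_creds();
}
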
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 22bd0a66c356..e036894bce57 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -373,11 +373,7 @@ static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so)
373 return container_of(so, struct nfs4_lockowner, lo_owner); 373 return container_of(so, struct nfs4_lockowner, lo_owner);
374} 374}
375 375
376/* 376/* nfs4_file: a file opened by some number of (open) nfs4_stateowners. */
377* nfs4_file: a file opened by some number of (open) nfs4_stateowners.
378* o fi_perfile list is used to search for conflicting
379* share_acces, share_deny on the file.
380*/
381struct nfs4_file { 377struct nfs4_file {
382 atomic_t fi_ref; 378 atomic_t fi_ref;
383 struct list_head fi_hash; /* hash by "struct inode *" */ 379 struct list_head fi_hash; /* hash by "struct inode *" */
@@ -459,7 +455,7 @@ extern void nfs4_unlock_state(void);
459extern int nfs4_in_grace(void); 455extern int nfs4_in_grace(void);
460extern void nfs4_release_reclaim(void); 456extern void nfs4_release_reclaim(void);
461extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(struct nfs4_client *crp); 457extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(struct nfs4_client *crp);
462extern __be32 nfs4_check_open_reclaim(clientid_t *clid); 458extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions);
463extern void nfs4_free_openowner(struct nfs4_openowner *); 459extern void nfs4_free_openowner(struct nfs4_openowner *);
464extern void nfs4_free_lockowner(struct nfs4_lockowner *); 460extern void nfs4_free_lockowner(struct nfs4_lockowner *);
465extern int set_callback_cred(void); 461extern int set_callback_cred(void);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 3f67b8e12251..c120b48ec305 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1581,7 +1581,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
1581 */ 1581 */
1582 1582
1583 oldfs = get_fs(); set_fs(KERNEL_DS); 1583 oldfs = get_fs(); set_fs(KERNEL_DS);
1584 host_err = inode->i_op->readlink(path.dentry, buf, *lenp); 1584 host_err = inode->i_op->readlink(path.dentry, (char __user *)buf, *lenp);
1585 set_fs(oldfs); 1585 set_fs(oldfs);
1586 1586
1587 if (host_err < 0) 1587 if (host_err < 0)
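
The vfs.c change is an annotation fix rather than a behavioural one: buf is a kernel buffer, but ->readlink() is declared to take a user pointer, and the surrounding set_fs(KERNEL_DS) window is what makes passing a kernel address legitimate. The pattern, with the moving parts commented:

mm_segment_t oldfs = get_fs();	/* remember the caller's limit */
set_fs(KERNEL_DS);		/* uaccess checks now admit kernel addresses */
host_err = inode->i_op->readlink(path.dentry, (char __user *)buf, *lenp);
set_fs(oldfs);			/* always restore before returning */
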
diff --git a/fs/open.c b/fs/open.c
index 44da0feeca2c..59071f55bf7f 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -478,7 +478,7 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
478 478
479 file = fget(fd); 479 file = fget(fd);
480 if (file) { 480 if (file) {
481 audit_inode(NULL, file->f_path.dentry); 481 audit_inode(NULL, file->f_path.dentry, 0);
482 err = chmod_common(&file->f_path, mode); 482 err = chmod_common(&file->f_path, mode);
483 fput(file); 483 fput(file);
484 } 484 }
@@ -588,7 +588,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
588 error = mnt_want_write_file(f.file); 588 error = mnt_want_write_file(f.file);
589 if (error) 589 if (error)
590 goto out_fput; 590 goto out_fput;
591 audit_inode(NULL, f.file->f_path.dentry); 591 audit_inode(NULL, f.file->f_path.dentry, 0);
592 error = chown_common(&f.file->f_path, user, group); 592 error = chown_common(&f.file->f_path, user, group);
593 mnt_drop_write_file(f.file); 593 mnt_drop_write_file(f.file);
594out_fput: 594out_fput:
@@ -859,6 +859,24 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
859} 859}
860 860
861/** 861/**
862 * file_open_name - open file and return file pointer
863 *
864 * @name: struct filename containing path to open
865 * @flags: open flags as per the open(2) second argument
866 * @mode: mode for the new file if O_CREAT is set, else ignored
867 *
868 * This is the helper to open a file from kernelspace if you really
 869 * have to. But in general you should not do this, so please move
870 * along, nothing to see here..
871 */
872struct file *file_open_name(struct filename *name, int flags, umode_t mode)
873{
874 struct open_flags op;
875 int lookup = build_open_flags(flags, mode, &op);
876 return do_filp_open(AT_FDCWD, name, &op, lookup);
877}
878
879/**
862 * filp_open - open file and return file pointer 880 * filp_open - open file and return file pointer
863 * 881 *
864 * @filename: path to open 882 * @filename: path to open
@@ -871,9 +889,8 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
871 */ 889 */
872struct file *filp_open(const char *filename, int flags, umode_t mode) 890struct file *filp_open(const char *filename, int flags, umode_t mode)
873{ 891{
874 struct open_flags op; 892 struct filename name = {.name = filename};
875 int lookup = build_open_flags(flags, mode, &op); 893 return file_open_name(&name, flags, mode);
876 return do_filp_open(AT_FDCWD, filename, &op, lookup);
877} 894}
878EXPORT_SYMBOL(filp_open); 895EXPORT_SYMBOL(filp_open);
879 896
@@ -895,7 +912,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
895{ 912{
896 struct open_flags op; 913 struct open_flags op;
897 int lookup = build_open_flags(flags, mode, &op); 914 int lookup = build_open_flags(flags, mode, &op);
898 char *tmp = getname(filename); 915 struct filename *tmp = getname(filename);
899 int fd = PTR_ERR(tmp); 916 int fd = PTR_ERR(tmp);
900 917
901 if (!IS_ERR(tmp)) { 918 if (!IS_ERR(tmp)) {
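
file_open_name() is split out of filp_open() so that callers already holding a struct filename (as getname() now returns) need not flatten it back to a string; filp_open() becomes a thin wrapper that builds a stack filename around its const char *. A hedged usage sketch, with a hypothetical path and abbreviated error handling:

struct filename name = { .name = "/tmp/example" };	/* hypothetical path */
struct file *filp = file_open_name(&name, O_RDONLY, 0);
if (IS_ERR(filp))
	return PTR_ERR(filp);
/* ... read or write through filp ... */
fput(filp);	/* drop the reference when done */
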
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ef5c84be66f9..144a96732dd7 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2258,7 +2258,8 @@ static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
2258 pid_t tgid = task_tgid_nr_ns(current, ns); 2258 pid_t tgid = task_tgid_nr_ns(current, ns);
2259 char *name = ERR_PTR(-ENOENT); 2259 char *name = ERR_PTR(-ENOENT);
2260 if (tgid) { 2260 if (tgid) {
2261 name = __getname(); 2261 /* 11 for max length of signed int in decimal + NULL term */
2262 name = kmalloc(12, GFP_KERNEL);
2262 if (!name) 2263 if (!name)
2263 name = ERR_PTR(-ENOMEM); 2264 name = ERR_PTR(-ENOMEM);
2264 else 2265 else
@@ -2273,7 +2274,7 @@ static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
2273{ 2274{
2274 char *s = nd_get_link(nd); 2275 char *s = nd_get_link(nd);
2275 if (!IS_ERR(s)) 2276 if (!IS_ERR(s))
2276 __putname(s); 2277 kfree(s);
2277} 2278}
2278 2279
2279static const struct inode_operations proc_self_inode_operations = { 2280static const struct inode_operations proc_self_inode_operations = {
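
/proc/self stops borrowing a PATH_MAX-sized buffer from the names cache (__getname()/__putname()) and allocates only what a tgid string can need. The 12 in kmalloc(12) is the 11 characters of the widest signed 32-bit decimal plus a terminating NUL, as the new comment says. Worked check of the arithmetic:

char buf[12];	/* "-2147483648" is 11 chars, plus the terminating NUL */
int n = snprintf(buf, sizeof(buf), "%d", INT_MIN);
/* n == 11; a tgid is non-negative, so it fits with room to spare */

Note the matching teardown: proc_self_put_link() must switch from __putname() to kfree(), since the buffer no longer comes from the names cache.
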
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 79827ce03e3b..14df8806ff29 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1158,6 +1158,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1158 struct vm_area_struct *vma = v; 1158 struct vm_area_struct *vma = v;
1159 struct numa_maps *md = &numa_priv->md; 1159 struct numa_maps *md = &numa_priv->md;
1160 struct file *file = vma->vm_file; 1160 struct file *file = vma->vm_file;
1161 struct task_struct *task = proc_priv->task;
1161 struct mm_struct *mm = vma->vm_mm; 1162 struct mm_struct *mm = vma->vm_mm;
1162 struct mm_walk walk = {}; 1163 struct mm_walk walk = {};
1163 struct mempolicy *pol; 1164 struct mempolicy *pol;
@@ -1177,9 +1178,11 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1177 walk.private = md; 1178 walk.private = md;
1178 walk.mm = mm; 1179 walk.mm = mm;
1179 1180
1180 pol = get_vma_policy(proc_priv->task, vma, vma->vm_start); 1181 task_lock(task);
1182 pol = get_vma_policy(task, vma, vma->vm_start);
1181 mpol_to_str(buffer, sizeof(buffer), pol, 0); 1183 mpol_to_str(buffer, sizeof(buffer), pol, 0);
1182 mpol_cond_put(pol); 1184 mpol_cond_put(pol);
1185 task_unlock(task);
1183 1186
1184 seq_printf(m, "%08lx %s", vma->vm_start, buffer); 1187 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1185 1188
@@ -1189,7 +1192,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1189 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { 1192 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1190 seq_printf(m, " heap"); 1193 seq_printf(m, " heap");
1191 } else { 1194 } else {
1192 pid_t tid = vm_is_stack(proc_priv->task, vma, is_pid); 1195 pid_t tid = vm_is_stack(task, vma, is_pid);
1193 if (tid != 0) { 1196 if (tid != 0) {
1194 /* 1197 /*
1195 * Thread stack in /proc/PID/task/TID/maps or 1198 * Thread stack in /proc/PID/task/TID/maps or
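
The task_mmu.c hunk takes task_lock() around the policy lookup. The plausible rationale (an inference, not spelled out in the hunk): get_vma_policy() can fall back to task->mempolicy when the VMA carries none, and that pointer may be replaced concurrently, so the task lock pins it while mpol_to_str() formats it; mpol_cond_put() then drops the reference only if one was actually taken. The protected region, annotated:

task_lock(task);		/* stabilize task->mempolicy */
pol = get_vma_policy(task, vma, vma->vm_start);
mpol_to_str(buffer, sizeof(buffer), pol, 0);	/* format while pinned */
mpol_cond_put(pol);		/* conditional reference drop */
task_unlock(task);
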
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index ff0135d6bc51..af1661f7a54f 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -331,11 +331,11 @@ static struct super_block *quotactl_block(const char __user *special, int cmd)
331#ifdef CONFIG_BLOCK 331#ifdef CONFIG_BLOCK
332 struct block_device *bdev; 332 struct block_device *bdev;
333 struct super_block *sb; 333 struct super_block *sb;
334 char *tmp = getname(special); 334 struct filename *tmp = getname(special);
335 335
336 if (IS_ERR(tmp)) 336 if (IS_ERR(tmp))
337 return ERR_CAST(tmp); 337 return ERR_CAST(tmp);
338 bdev = lookup_bdev(tmp); 338 bdev = lookup_bdev(tmp->name);
339 putname(tmp); 339 putname(tmp);
340 if (IS_ERR(bdev)) 340 if (IS_ERR(bdev))
341 return ERR_CAST(bdev); 341 return ERR_CAST(bdev);
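
This is the same getname() conversion as in fs/open.c: the result is now a struct filename *, so the raw string is reached through ->name while putname() still releases the object. The updated calling convention, with do_something_with() standing in for any consumer:

struct filename *tmp = getname(special);	/* special: __user string */
if (IS_ERR(tmp))
	return ERR_CAST(tmp);
do_something_with(tmp->name);	/* ->name is the copied-in kernel string */
putname(tmp);			/* releases the struct and the string */
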
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 46485557cdc6..f27f01a98aa2 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1573,8 +1573,10 @@ struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
1573 reiserfs_warning(sb, "reiserfs-13077", 1573 reiserfs_warning(sb, "reiserfs-13077",
1574 "nfsd/reiserfs, fhtype=%d, len=%d - odd", 1574 "nfsd/reiserfs, fhtype=%d, len=%d - odd",
1575 fh_type, fh_len); 1575 fh_type, fh_len);
1576 fh_type = 5; 1576 fh_type = fh_len;
1577 } 1577 }
1578 if (fh_len < 2)
1579 return NULL;
1578 1580
1579 return reiserfs_get_dentry(sb, fid->raw[0], fid->raw[1], 1581 return reiserfs_get_dentry(sb, fid->raw[0], fid->raw[1],
1580 (fh_type == 3 || fh_type >= 5) ? fid->raw[2] : 0); 1582 (fh_type == 3 || fh_type >= 5) ? fid->raw[2] : 0);
@@ -1583,6 +1585,8 @@ struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
1583struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid, 1585struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
1584 int fh_len, int fh_type) 1586 int fh_len, int fh_type)
1585{ 1587{
1588 if (fh_type > fh_len)
1589 fh_type = fh_len;
1586 if (fh_type < 4) 1590 if (fh_type < 4)
1587 return NULL; 1591 return NULL;
1588 1592
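
Both reiserfs hunks harden NFS file-handle decoding against short or corrupt handles: the advertised fh_type (which implies how many 32-bit words the handle should contain) is clamped to the length actually received, and handles too short to hold even the objectid/generation pair are rejected before fid->raw[] is indexed. The guard, generalized with a hypothetical required_words() helper:

if (fh_type > fh_len)
	fh_type = fh_len;	/* never trust the advertised type */
if (fh_len < required_words(fh_type))	/* required_words(): hypothetical */
	return NULL;		/* too short to decode safely */
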
diff --git a/fs/super.c b/fs/super.c
index a3bc935069d9..12f123712161 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -186,15 +186,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
186 spin_lock_init(&s->s_inode_lru_lock); 186 spin_lock_init(&s->s_inode_lru_lock);
187 INIT_LIST_HEAD(&s->s_mounts); 187 INIT_LIST_HEAD(&s->s_mounts);
188 init_rwsem(&s->s_umount); 188 init_rwsem(&s->s_umount);
189 mutex_init(&s->s_lock);
190 lockdep_set_class(&s->s_umount, &type->s_umount_key); 189 lockdep_set_class(&s->s_umount, &type->s_umount_key);
191 /* 190 /*
192 * The locking rules for s_lock are up to the
193 * filesystem. For example ext3fs has different
194 * lock ordering than usbfs:
195 */
196 lockdep_set_class(&s->s_lock, &type->s_lock_key);
197 /*
198 * sget() can have s_umount recursion. 191 * sget() can have s_umount recursion.
199 * 192 *
200 * When it cannot find a suitable sb, it allocates a new 193 * When it cannot find a suitable sb, it allocates a new
@@ -394,22 +387,6 @@ bool grab_super_passive(struct super_block *sb)
394 return false; 387 return false;
395} 388}
396 389
397/*
398 * Superblock locking. We really ought to get rid of these two.
399 */
400void lock_super(struct super_block * sb)
401{
402 mutex_lock(&sb->s_lock);
403}
404
405void unlock_super(struct super_block * sb)
406{
407 mutex_unlock(&sb->s_lock);
408}
409
410EXPORT_SYMBOL(lock_super);
411EXPORT_SYMBOL(unlock_super);
412
413/** 390/**
414 * generic_shutdown_super - common helper for ->kill_sb() 391 * generic_shutdown_super - common helper for ->kill_sb()
415 * @sb: superblock to kill 392 * @sb: superblock to kill
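
lock_super()/unlock_super() and the VFS-owned sb->s_lock are removed outright; their remaining users, sysv and ufs, are converted in the hunks that follow to a mutex embedded in the filesystem's own sb_info and initialized at fill_super time. The whole replacement pattern in one place, sketched for a hypothetical foo filesystem (FOO_SB() being its sb->s_fs_info accessor):

struct foo_sb_info {
	/* ... existing fields ... */
	struct mutex s_lock;	/* private successor to sb->s_lock */
};

static int foo_fill_super(struct super_block *sb, void *data, int silent)
{
	struct foo_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
	if (!sbi)
		return -ENOMEM;
	mutex_init(&sbi->s_lock);	/* once, before any lock/unlock */
	sb->s_fs_info = sbi;
	/* ... */
	return 0;
}

/* each former lock_super(sb) ... unlock_super(sb) pair becomes: */
mutex_lock(&FOO_SB(sb)->s_lock);
/* ... critical section over superblock state ... */
mutex_unlock(&FOO_SB(sb)->s_lock);
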
diff --git a/fs/sysv/balloc.c b/fs/sysv/balloc.c
index 9a6ad96acf27..921c053fc052 100644
--- a/fs/sysv/balloc.c
+++ b/fs/sysv/balloc.c
@@ -60,12 +60,12 @@ void sysv_free_block(struct super_block * sb, sysv_zone_t nr)
60 return; 60 return;
61 } 61 }
62 62
63 lock_super(sb); 63 mutex_lock(&sbi->s_lock);
64 count = fs16_to_cpu(sbi, *sbi->s_bcache_count); 64 count = fs16_to_cpu(sbi, *sbi->s_bcache_count);
65 65
66 if (count > sbi->s_flc_size) { 66 if (count > sbi->s_flc_size) {
67 printk("sysv_free_block: flc_count > flc_size\n"); 67 printk("sysv_free_block: flc_count > flc_size\n");
68 unlock_super(sb); 68 mutex_unlock(&sbi->s_lock);
69 return; 69 return;
70 } 70 }
71 /* If the free list head in super-block is full, it is copied 71 /* If the free list head in super-block is full, it is copied
@@ -77,7 +77,7 @@ void sysv_free_block(struct super_block * sb, sysv_zone_t nr)
77 bh = sb_getblk(sb, block); 77 bh = sb_getblk(sb, block);
78 if (!bh) { 78 if (!bh) {
79 printk("sysv_free_block: getblk() failed\n"); 79 printk("sysv_free_block: getblk() failed\n");
80 unlock_super(sb); 80 mutex_unlock(&sbi->s_lock);
81 return; 81 return;
82 } 82 }
83 memset(bh->b_data, 0, sb->s_blocksize); 83 memset(bh->b_data, 0, sb->s_blocksize);
@@ -93,7 +93,7 @@ void sysv_free_block(struct super_block * sb, sysv_zone_t nr)
93 *sbi->s_bcache_count = cpu_to_fs16(sbi, count); 93 *sbi->s_bcache_count = cpu_to_fs16(sbi, count);
94 fs32_add(sbi, sbi->s_free_blocks, 1); 94 fs32_add(sbi, sbi->s_free_blocks, 1);
95 dirty_sb(sb); 95 dirty_sb(sb);
96 unlock_super(sb); 96 mutex_unlock(&sbi->s_lock);
97} 97}
98 98
99sysv_zone_t sysv_new_block(struct super_block * sb) 99sysv_zone_t sysv_new_block(struct super_block * sb)
@@ -104,7 +104,7 @@ sysv_zone_t sysv_new_block(struct super_block * sb)
104 struct buffer_head * bh; 104 struct buffer_head * bh;
105 unsigned count; 105 unsigned count;
106 106
107 lock_super(sb); 107 mutex_lock(&sbi->s_lock);
108 count = fs16_to_cpu(sbi, *sbi->s_bcache_count); 108 count = fs16_to_cpu(sbi, *sbi->s_bcache_count);
109 109
110 if (count == 0) /* Applies only to Coherent FS */ 110 if (count == 0) /* Applies only to Coherent FS */
@@ -147,11 +147,11 @@ sysv_zone_t sysv_new_block(struct super_block * sb)
147 /* Now the free list head in the superblock is valid again. */ 147 /* Now the free list head in the superblock is valid again. */
148 fs32_add(sbi, sbi->s_free_blocks, -1); 148 fs32_add(sbi, sbi->s_free_blocks, -1);
149 dirty_sb(sb); 149 dirty_sb(sb);
150 unlock_super(sb); 150 mutex_unlock(&sbi->s_lock);
151 return nr; 151 return nr;
152 152
153Enospc: 153Enospc:
154 unlock_super(sb); 154 mutex_unlock(&sbi->s_lock);
155 return 0; 155 return 0;
156} 156}
157 157
@@ -173,7 +173,7 @@ unsigned long sysv_count_free_blocks(struct super_block * sb)
173 if (sbi->s_type == FSTYPE_AFS) 173 if (sbi->s_type == FSTYPE_AFS)
174 return 0; 174 return 0;
175 175
176 lock_super(sb); 176 mutex_lock(&sbi->s_lock);
177 sb_count = fs32_to_cpu(sbi, *sbi->s_free_blocks); 177 sb_count = fs32_to_cpu(sbi, *sbi->s_free_blocks);
178 178
179 if (0) 179 if (0)
@@ -211,7 +211,7 @@ unsigned long sysv_count_free_blocks(struct super_block * sb)
211 if (count != sb_count) 211 if (count != sb_count)
212 goto Ecount; 212 goto Ecount;
213done: 213done:
214 unlock_super(sb); 214 mutex_unlock(&sbi->s_lock);
215 return count; 215 return count;
216 216
217Einval: 217Einval:
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index 8233b02eccae..f9db4eb31db4 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -118,7 +118,7 @@ void sysv_free_inode(struct inode * inode)
118 "%s\n", inode->i_sb->s_id); 118 "%s\n", inode->i_sb->s_id);
119 return; 119 return;
120 } 120 }
121 lock_super(sb); 121 mutex_lock(&sbi->s_lock);
122 count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count); 122 count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count);
123 if (count < sbi->s_fic_size) { 123 if (count < sbi->s_fic_size) {
124 *sv_sb_fic_inode(sb,count++) = cpu_to_fs16(sbi, ino); 124 *sv_sb_fic_inode(sb,count++) = cpu_to_fs16(sbi, ino);
@@ -128,7 +128,7 @@ void sysv_free_inode(struct inode * inode)
128 dirty_sb(sb); 128 dirty_sb(sb);
129 memset(raw_inode, 0, sizeof(struct sysv_inode)); 129 memset(raw_inode, 0, sizeof(struct sysv_inode));
130 mark_buffer_dirty(bh); 130 mark_buffer_dirty(bh);
131 unlock_super(sb); 131 mutex_unlock(&sbi->s_lock);
132 brelse(bh); 132 brelse(bh);
133} 133}
134 134
@@ -147,13 +147,13 @@ struct inode * sysv_new_inode(const struct inode * dir, umode_t mode)
147 if (!inode) 147 if (!inode)
148 return ERR_PTR(-ENOMEM); 148 return ERR_PTR(-ENOMEM);
149 149
150 lock_super(sb); 150 mutex_lock(&sbi->s_lock);
151 count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count); 151 count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count);
152 if (count == 0 || (*sv_sb_fic_inode(sb,count-1) == 0)) { 152 if (count == 0 || (*sv_sb_fic_inode(sb,count-1) == 0)) {
153 count = refill_free_cache(sb); 153 count = refill_free_cache(sb);
154 if (count == 0) { 154 if (count == 0) {
155 iput(inode); 155 iput(inode);
156 unlock_super(sb); 156 mutex_unlock(&sbi->s_lock);
157 return ERR_PTR(-ENOSPC); 157 return ERR_PTR(-ENOSPC);
158 } 158 }
159 } 159 }
@@ -174,7 +174,7 @@ struct inode * sysv_new_inode(const struct inode * dir, umode_t mode)
174 sysv_write_inode(inode, &wbc); /* ensure inode not allocated again */ 174 sysv_write_inode(inode, &wbc); /* ensure inode not allocated again */
175 mark_inode_dirty(inode); /* cleared by sysv_write_inode() */ 175 mark_inode_dirty(inode); /* cleared by sysv_write_inode() */
176 /* That's it. */ 176 /* That's it. */
177 unlock_super(sb); 177 mutex_unlock(&sbi->s_lock);
178 return inode; 178 return inode;
179} 179}
180 180
@@ -185,7 +185,7 @@ unsigned long sysv_count_free_inodes(struct super_block * sb)
185 struct sysv_inode * raw_inode; 185 struct sysv_inode * raw_inode;
186 int ino, count, sb_count; 186 int ino, count, sb_count;
187 187
188 lock_super(sb); 188 mutex_lock(&sbi->s_lock);
189 189
190 sb_count = fs16_to_cpu(sbi, *sbi->s_sb_total_free_inodes); 190 sb_count = fs16_to_cpu(sbi, *sbi->s_sb_total_free_inodes);
191 191
@@ -213,7 +213,7 @@ unsigned long sysv_count_free_inodes(struct super_block * sb)
213 if (count != sb_count) 213 if (count != sb_count)
214 goto Einval; 214 goto Einval;
215out: 215out:
216 unlock_super(sb); 216 mutex_unlock(&sbi->s_lock);
217 return count; 217 return count;
218 218
219Einval: 219Einval:
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index d33e506c1eac..c327d4ee1235 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -36,7 +36,7 @@ static int sysv_sync_fs(struct super_block *sb, int wait)
36 struct sysv_sb_info *sbi = SYSV_SB(sb); 36 struct sysv_sb_info *sbi = SYSV_SB(sb);
37 unsigned long time = get_seconds(), old_time; 37 unsigned long time = get_seconds(), old_time;
38 38
39 lock_super(sb); 39 mutex_lock(&sbi->s_lock);
40 40
41 /* 41 /*
42 * If we are going to write out the super block, 42 * If we are going to write out the super block,
@@ -51,7 +51,7 @@ static int sysv_sync_fs(struct super_block *sb, int wait)
51 mark_buffer_dirty(sbi->s_bh2); 51 mark_buffer_dirty(sbi->s_bh2);
52 } 52 }
53 53
54 unlock_super(sb); 54 mutex_unlock(&sbi->s_lock);
55 55
56 return 0; 56 return 0;
57} 57}
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 7491c33b6468..a38e87bdd78d 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -368,6 +368,7 @@ static int sysv_fill_super(struct super_block *sb, void *data, int silent)
368 368
369 sbi->s_sb = sb; 369 sbi->s_sb = sb;
370 sbi->s_block_base = 0; 370 sbi->s_block_base = 0;
371 mutex_init(&sbi->s_lock);
371 sb->s_fs_info = sbi; 372 sb->s_fs_info = sbi;
372 373
373 sb_set_blocksize(sb, BLOCK_SIZE); 374 sb_set_blocksize(sb, BLOCK_SIZE);
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 0bc35fdc58e2..69d488986cce 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -58,6 +58,7 @@ struct sysv_sb_info {
58 u32 s_nzones; /* same as s_sbd->s_fsize */ 58 u32 s_nzones; /* same as s_sbd->s_fsize */
59 u16 s_namelen; /* max length of dir entry */ 59 u16 s_namelen; /* max length of dir entry */
60 int s_forced_ro; 60 int s_forced_ro;
61 struct mutex s_lock;
61}; 62};
62 63
63/* 64/*
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 1b3e410bf334..a7ea492ae660 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -54,7 +54,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
54 if (ufs_fragnum(fragment) + count > uspi->s_fpg) 54 if (ufs_fragnum(fragment) + count > uspi->s_fpg)
55 ufs_error (sb, "ufs_free_fragments", "internal error"); 55 ufs_error (sb, "ufs_free_fragments", "internal error");
56 56
57 lock_super(sb); 57 mutex_lock(&UFS_SB(sb)->s_lock);
58 58
59 cgno = ufs_dtog(uspi, fragment); 59 cgno = ufs_dtog(uspi, fragment);
60 bit = ufs_dtogd(uspi, fragment); 60 bit = ufs_dtogd(uspi, fragment);
@@ -118,12 +118,12 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
118 ubh_sync_block(UCPI_UBH(ucpi)); 118 ubh_sync_block(UCPI_UBH(ucpi));
119 ufs_mark_sb_dirty(sb); 119 ufs_mark_sb_dirty(sb);
120 120
121 unlock_super (sb); 121 mutex_unlock(&UFS_SB(sb)->s_lock);
122 UFSD("EXIT\n"); 122 UFSD("EXIT\n");
123 return; 123 return;
124 124
125failed: 125failed:
126 unlock_super (sb); 126 mutex_unlock(&UFS_SB(sb)->s_lock);
127 UFSD("EXIT (FAILED)\n"); 127 UFSD("EXIT (FAILED)\n");
128 return; 128 return;
129} 129}
@@ -155,7 +155,7 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count)
155 goto failed; 155 goto failed;
156 } 156 }
157 157
158 lock_super(sb); 158 mutex_lock(&UFS_SB(sb)->s_lock);
159 159
160do_more: 160do_more:
161 overflow = 0; 161 overflow = 0;
@@ -215,12 +215,12 @@ do_more:
215 } 215 }
216 216
217 ufs_mark_sb_dirty(sb); 217 ufs_mark_sb_dirty(sb);
218 unlock_super (sb); 218 mutex_unlock(&UFS_SB(sb)->s_lock);
219 UFSD("EXIT\n"); 219 UFSD("EXIT\n");
220 return; 220 return;
221 221
222failed_unlock: 222failed_unlock:
223 unlock_super (sb); 223 mutex_unlock(&UFS_SB(sb)->s_lock);
224failed: 224failed:
225 UFSD("EXIT (FAILED)\n"); 225 UFSD("EXIT (FAILED)\n");
226 return; 226 return;
@@ -361,7 +361,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
361 usb1 = ubh_get_usb_first(uspi); 361 usb1 = ubh_get_usb_first(uspi);
362 *err = -ENOSPC; 362 *err = -ENOSPC;
363 363
364 lock_super (sb); 364 mutex_lock(&UFS_SB(sb)->s_lock);
365 tmp = ufs_data_ptr_to_cpu(sb, p); 365 tmp = ufs_data_ptr_to_cpu(sb, p);
366 366
367 if (count + ufs_fragnum(fragment) > uspi->s_fpb) { 367 if (count + ufs_fragnum(fragment) > uspi->s_fpb) {
@@ -382,19 +382,19 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
382 "fragment %llu, tmp %llu\n", 382 "fragment %llu, tmp %llu\n",
383 (unsigned long long)fragment, 383 (unsigned long long)fragment,
384 (unsigned long long)tmp); 384 (unsigned long long)tmp);
385 unlock_super(sb); 385 mutex_unlock(&UFS_SB(sb)->s_lock);
386 return INVBLOCK; 386 return INVBLOCK;
387 } 387 }
388 if (fragment < UFS_I(inode)->i_lastfrag) { 388 if (fragment < UFS_I(inode)->i_lastfrag) {
389 UFSD("EXIT (ALREADY ALLOCATED)\n"); 389 UFSD("EXIT (ALREADY ALLOCATED)\n");
390 unlock_super (sb); 390 mutex_unlock(&UFS_SB(sb)->s_lock);
391 return 0; 391 return 0;
392 } 392 }
393 } 393 }
394 else { 394 else {
395 if (tmp) { 395 if (tmp) {
396 UFSD("EXIT (ALREADY ALLOCATED)\n"); 396 UFSD("EXIT (ALREADY ALLOCATED)\n");
397 unlock_super(sb); 397 mutex_unlock(&UFS_SB(sb)->s_lock);
398 return 0; 398 return 0;
399 } 399 }
400 } 400 }
@@ -403,7 +403,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
403 * There is not enough space for user on the device 403 * There is not enough space for user on the device
404 */ 404 */
405 if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) { 405 if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) {
406 unlock_super (sb); 406 mutex_unlock(&UFS_SB(sb)->s_lock);
407 UFSD("EXIT (FAILED)\n"); 407 UFSD("EXIT (FAILED)\n");
408 return 0; 408 return 0;
409 } 409 }
@@ -428,7 +428,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
428 ufs_clear_frags(inode, result + oldcount, 428 ufs_clear_frags(inode, result + oldcount,
429 newcount - oldcount, locked_page != NULL); 429 newcount - oldcount, locked_page != NULL);
430 } 430 }
431 unlock_super(sb); 431 mutex_unlock(&UFS_SB(sb)->s_lock);
432 UFSD("EXIT, result %llu\n", (unsigned long long)result); 432 UFSD("EXIT, result %llu\n", (unsigned long long)result);
433 return result; 433 return result;
434 } 434 }
@@ -443,7 +443,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
443 fragment + count); 443 fragment + count);
444 ufs_clear_frags(inode, result + oldcount, newcount - oldcount, 444 ufs_clear_frags(inode, result + oldcount, newcount - oldcount,
445 locked_page != NULL); 445 locked_page != NULL);
446 unlock_super(sb); 446 mutex_unlock(&UFS_SB(sb)->s_lock);
447 UFSD("EXIT, result %llu\n", (unsigned long long)result); 447 UFSD("EXIT, result %llu\n", (unsigned long long)result);
448 return result; 448 return result;
449 } 449 }
@@ -481,7 +481,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
481 *err = 0; 481 *err = 0;
482 UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, 482 UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag,
483 fragment + count); 483 fragment + count);
484 unlock_super(sb); 484 mutex_unlock(&UFS_SB(sb)->s_lock);
485 if (newcount < request) 485 if (newcount < request)
486 ufs_free_fragments (inode, result + newcount, request - newcount); 486 ufs_free_fragments (inode, result + newcount, request - newcount);
487 ufs_free_fragments (inode, tmp, oldcount); 487 ufs_free_fragments (inode, tmp, oldcount);
@@ -489,7 +489,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
489 return result; 489 return result;
490 } 490 }
491 491
492 unlock_super(sb); 492 mutex_unlock(&UFS_SB(sb)->s_lock);
493 UFSD("EXIT (FAILED)\n"); 493 UFSD("EXIT (FAILED)\n");
494 return 0; 494 return 0;
495} 495}
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index e84cbe21b986..d0426d74817b 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -71,11 +71,11 @@ void ufs_free_inode (struct inode * inode)
71 71
72 ino = inode->i_ino; 72 ino = inode->i_ino;
73 73
74 lock_super (sb); 74 mutex_lock(&UFS_SB(sb)->s_lock);
75 75
76 if (!((ino > 1) && (ino < (uspi->s_ncg * uspi->s_ipg )))) { 76 if (!((ino > 1) && (ino < (uspi->s_ncg * uspi->s_ipg )))) {
77 ufs_warning(sb, "ufs_free_inode", "reserved inode or nonexistent inode %u\n", ino); 77 ufs_warning(sb, "ufs_free_inode", "reserved inode or nonexistent inode %u\n", ino);
78 unlock_super (sb); 78 mutex_unlock(&UFS_SB(sb)->s_lock);
79 return; 79 return;
80 } 80 }
81 81
@@ -83,7 +83,7 @@ void ufs_free_inode (struct inode * inode)
83 bit = ufs_inotocgoff (ino); 83 bit = ufs_inotocgoff (ino);
84 ucpi = ufs_load_cylinder (sb, cg); 84 ucpi = ufs_load_cylinder (sb, cg);
85 if (!ucpi) { 85 if (!ucpi) {
86 unlock_super (sb); 86 mutex_unlock(&UFS_SB(sb)->s_lock);
87 return; 87 return;
88 } 88 }
89 ucg = ubh_get_ucg(UCPI_UBH(ucpi)); 89 ucg = ubh_get_ucg(UCPI_UBH(ucpi));
@@ -117,7 +117,7 @@ void ufs_free_inode (struct inode * inode)
117 ubh_sync_block(UCPI_UBH(ucpi)); 117 ubh_sync_block(UCPI_UBH(ucpi));
118 118
119 ufs_mark_sb_dirty(sb); 119 ufs_mark_sb_dirty(sb);
120 unlock_super (sb); 120 mutex_unlock(&UFS_SB(sb)->s_lock);
121 UFSD("EXIT\n"); 121 UFSD("EXIT\n");
122} 122}
123 123
@@ -197,7 +197,7 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode)
197 uspi = sbi->s_uspi; 197 uspi = sbi->s_uspi;
198 usb1 = ubh_get_usb_first(uspi); 198 usb1 = ubh_get_usb_first(uspi);
199 199
200 lock_super (sb); 200 mutex_lock(&sbi->s_lock);
201 201
202 /* 202 /*
203 * Try to place the inode in its parent directory 203 * Try to place the inode in its parent directory
@@ -333,20 +333,20 @@ cg_found:
333 brelse(bh); 333 brelse(bh);
334 } 334 }
335 335
336 unlock_super (sb); 336 mutex_unlock(&sbi->s_lock);
337 337
338 UFSD("allocating inode %lu\n", inode->i_ino); 338 UFSD("allocating inode %lu\n", inode->i_ino);
339 UFSD("EXIT\n"); 339 UFSD("EXIT\n");
340 return inode; 340 return inode;
341 341
342fail_remove_inode: 342fail_remove_inode:
343 unlock_super(sb); 343 mutex_unlock(&sbi->s_lock);
344 clear_nlink(inode); 344 clear_nlink(inode);
345 iput(inode); 345 iput(inode);
346 UFSD("EXIT (FAILED): err %d\n", err); 346 UFSD("EXIT (FAILED): err %d\n", err);
347 return ERR_PTR(err); 347 return ERR_PTR(err);
348failed: 348failed:
349 unlock_super (sb); 349 mutex_unlock(&sbi->s_lock);
350 make_bad_inode(inode); 350 make_bad_inode(inode);
351 iput (inode); 351 iput (inode);
352 UFSD("EXIT (FAILED): err %d\n", err); 352 UFSD("EXIT (FAILED): err %d\n", err);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index f7cfecfe1cab..dc8e3a861d0f 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -699,7 +699,7 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
699 unsigned flags; 699 unsigned flags;
700 700
701 lock_ufs(sb); 701 lock_ufs(sb);
702 lock_super(sb); 702 mutex_lock(&UFS_SB(sb)->s_lock);
703 703
704 UFSD("ENTER\n"); 704 UFSD("ENTER\n");
705 705
@@ -717,7 +717,7 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
717 ufs_put_cstotal(sb); 717 ufs_put_cstotal(sb);
718 718
719 UFSD("EXIT\n"); 719 UFSD("EXIT\n");
720 unlock_super(sb); 720 mutex_unlock(&UFS_SB(sb)->s_lock);
721 unlock_ufs(sb); 721 unlock_ufs(sb);
722 722
723 return 0; 723 return 0;
@@ -805,6 +805,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
805 } 805 }
806#endif 806#endif
807 mutex_init(&sbi->mutex); 807 mutex_init(&sbi->mutex);
808 mutex_init(&sbi->s_lock);
808 spin_lock_init(&sbi->work_lock); 809 spin_lock_init(&sbi->work_lock);
809 INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs); 810 INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs);
810 /* 811 /*
@@ -1280,7 +1281,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1280 unsigned flags; 1281 unsigned flags;
1281 1282
1282 lock_ufs(sb); 1283 lock_ufs(sb);
1283 lock_super(sb); 1284 mutex_lock(&UFS_SB(sb)->s_lock);
1284 uspi = UFS_SB(sb)->s_uspi; 1285 uspi = UFS_SB(sb)->s_uspi;
1285 flags = UFS_SB(sb)->s_flags; 1286 flags = UFS_SB(sb)->s_flags;
1286 usb1 = ubh_get_usb_first(uspi); 1287 usb1 = ubh_get_usb_first(uspi);
@@ -1294,7 +1295,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1294 new_mount_opt = 0; 1295 new_mount_opt = 0;
1295 ufs_set_opt (new_mount_opt, ONERROR_LOCK); 1296 ufs_set_opt (new_mount_opt, ONERROR_LOCK);
1296 if (!ufs_parse_options (data, &new_mount_opt)) { 1297 if (!ufs_parse_options (data, &new_mount_opt)) {
1297 unlock_super(sb); 1298 mutex_unlock(&UFS_SB(sb)->s_lock);
1298 unlock_ufs(sb); 1299 unlock_ufs(sb);
1299 return -EINVAL; 1300 return -EINVAL;
1300 } 1301 }
@@ -1302,14 +1303,14 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1302 new_mount_opt |= ufstype; 1303 new_mount_opt |= ufstype;
1303 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { 1304 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
1304 printk("ufstype can't be changed during remount\n"); 1305 printk("ufstype can't be changed during remount\n");
1305 unlock_super(sb); 1306 mutex_unlock(&UFS_SB(sb)->s_lock);
1306 unlock_ufs(sb); 1307 unlock_ufs(sb);
1307 return -EINVAL; 1308 return -EINVAL;
1308 } 1309 }
1309 1310
1310 if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 1311 if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
1311 UFS_SB(sb)->s_mount_opt = new_mount_opt; 1312 UFS_SB(sb)->s_mount_opt = new_mount_opt;
1312 unlock_super(sb); 1313 mutex_unlock(&UFS_SB(sb)->s_lock);
1313 unlock_ufs(sb); 1314 unlock_ufs(sb);
1314 return 0; 1315 return 0;
1315 } 1316 }
@@ -1334,7 +1335,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1334#ifndef CONFIG_UFS_FS_WRITE 1335#ifndef CONFIG_UFS_FS_WRITE
1335 printk("ufs was compiled with read-only support, " 1336 printk("ufs was compiled with read-only support, "
1336 "can't be mounted as read-write\n"); 1337 "can't be mounted as read-write\n");
1337 unlock_super(sb); 1338 mutex_unlock(&UFS_SB(sb)->s_lock);
1338 unlock_ufs(sb); 1339 unlock_ufs(sb);
1339 return -EINVAL; 1340 return -EINVAL;
1340#else 1341#else
@@ -1344,13 +1345,13 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1344 ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && 1345 ufstype != UFS_MOUNT_UFSTYPE_SUNx86 &&
1345 ufstype != UFS_MOUNT_UFSTYPE_UFS2) { 1346 ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
1346 printk("this ufstype is read-only supported\n"); 1347 printk("this ufstype is read-only supported\n");
1347 unlock_super(sb); 1348 mutex_unlock(&UFS_SB(sb)->s_lock);
1348 unlock_ufs(sb); 1349 unlock_ufs(sb);
1349 return -EINVAL; 1350 return -EINVAL;
1350 } 1351 }
1351 if (!ufs_read_cylinder_structures(sb)) { 1352 if (!ufs_read_cylinder_structures(sb)) {
1352 printk("failed during remounting\n"); 1353 printk("failed during remounting\n");
1353 unlock_super(sb); 1354 mutex_unlock(&UFS_SB(sb)->s_lock);
1354 unlock_ufs(sb); 1355 unlock_ufs(sb);
1355 return -EPERM; 1356 return -EPERM;
1356 } 1357 }
@@ -1358,7 +1359,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1358#endif 1359#endif
1359 } 1360 }
1360 UFS_SB(sb)->s_mount_opt = new_mount_opt; 1361 UFS_SB(sb)->s_mount_opt = new_mount_opt;
1361 unlock_super(sb); 1362 mutex_unlock(&UFS_SB(sb)->s_lock);
1362 unlock_ufs(sb); 1363 unlock_ufs(sb);
1363 return 0; 1364 return 0;
1364} 1365}
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 343e6fc571e5..ff2c15ab81aa 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -24,6 +24,7 @@ struct ufs_sb_info {
24 int work_queued; /* non-zero if the delayed work is queued */ 24 int work_queued; /* non-zero if the delayed work is queued */
25 struct delayed_work sync_work; /* FS sync delayed work */ 25 struct delayed_work sync_work; /* FS sync delayed work */
26 spinlock_t work_lock; /* protects sync_work and work_queued */ 26 spinlock_t work_lock; /* protects sync_work and work_queued */
27 struct mutex s_lock;
27}; 28};
28 29
29struct ufs_inode_info { 30struct ufs_inode_info {
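
The ufs conversion repeats the sysv pattern, with one detail worth noting from the ufs_sync_fs() and ufs_remount() hunks above: the new UFS_SB(sb)->s_lock is consistently acquired inside lock_ufs(sb) and released before unlock_ufs(sb), so the patch fixes a lock ordering rather than leaving it implicit:

lock_ufs(sb);				/* outer: existing ufs serialization */
mutex_lock(&UFS_SB(sb)->s_lock);	/* inner: ex-lock_super() */
/* ... superblock fields updated ... */
mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);				/* reverse order on the way out */
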
diff --git a/fs/xattr.c b/fs/xattr.c
index 1780f062dbaf..e164dddb8e96 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -412,7 +412,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
412 if (!f.file) 412 if (!f.file)
413 return error; 413 return error;
414 dentry = f.file->f_path.dentry; 414 dentry = f.file->f_path.dentry;
415 audit_inode(NULL, dentry); 415 audit_inode(NULL, dentry, 0);
416 error = mnt_want_write_file(f.file); 416 error = mnt_want_write_file(f.file);
417 if (!error) { 417 if (!error) {
418 error = setxattr(dentry, name, value, size, flags); 418 error = setxattr(dentry, name, value, size, flags);
@@ -507,7 +507,7 @@ SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
507 507
508 if (!f.file) 508 if (!f.file)
509 return error; 509 return error;
510 audit_inode(NULL, f.file->f_path.dentry); 510 audit_inode(NULL, f.file->f_path.dentry, 0);
511 error = getxattr(f.file->f_path.dentry, name, value, size); 511 error = getxattr(f.file->f_path.dentry, name, value, size);
512 fdput(f); 512 fdput(f);
513 return error; 513 return error;
@@ -586,7 +586,7 @@ SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
586 586
587 if (!f.file) 587 if (!f.file)
588 return error; 588 return error;
589 audit_inode(NULL, f.file->f_path.dentry); 589 audit_inode(NULL, f.file->f_path.dentry, 0);
590 error = listxattr(f.file->f_path.dentry, list, size); 590 error = listxattr(f.file->f_path.dentry, list, size);
591 fdput(f); 591 fdput(f);
592 return error; 592 return error;
@@ -655,7 +655,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
655 if (!f.file) 655 if (!f.file)
656 return error; 656 return error;
657 dentry = f.file->f_path.dentry; 657 dentry = f.file->f_path.dentry;
658 audit_inode(NULL, dentry); 658 audit_inode(NULL, dentry, 0);
659 error = mnt_want_write_file(f.file); 659 error = mnt_want_write_file(f.file);
660 if (!error) { 660 if (!error) {
661 error = removexattr(dentry, name); 661 error = removexattr(dentry, name);
diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c
index 11efd830b5f5..9fbea87fdb6e 100644
--- a/fs/xattr_acl.c
+++ b/fs/xattr_acl.c
@@ -45,7 +45,7 @@ static void posix_acl_fix_xattr_userns(
45 break; 45 break;
46 case ACL_GROUP: 46 case ACL_GROUP:
47 gid = make_kgid(from, le32_to_cpu(entry->e_id)); 47 gid = make_kgid(from, le32_to_cpu(entry->e_id));
48 entry->e_id = cpu_to_le32(from_kuid(to, uid)); 48 entry->e_id = cpu_to_le32(from_kgid(to, gid));
49 break; 49 break;
50 default: 50 default:
51 break; 51 break;
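
The xattr_acl.c change is a one-line copy-paste bug fix: the ACL_GROUP branch computed a kgid but then converted the stale uid left over from the ACL_USER branch, so group IDs were translated through the wrong mapping. For contrast, the corrected switch, abridged from the function above:

switch (le16_to_cpu(entry->e_tag)) {
case ACL_USER:
	uid = make_kuid(from, le32_to_cpu(entry->e_id));
	entry->e_id = cpu_to_le32(from_kuid(to, uid));	/* uid in, uid out */
	break;
case ACL_GROUP:
	gid = make_kgid(from, le32_to_cpu(entry->e_id));
	entry->e_id = cpu_to_le32(from_kgid(to, gid));	/* was from_kuid(to, uid) */
	break;
default:
	break;
}
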
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 42679223a0fd..8c6d1d70278c 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -189,6 +189,9 @@ xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid,
189 struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid; 189 struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid;
190 struct inode *inode = NULL; 190 struct inode *inode = NULL;
191 191
192 if (fh_len < xfs_fileid_length(fileid_type))
193 return NULL;
194
192 switch (fileid_type) { 195 switch (fileid_type) {
193 case FILEID_INO32_GEN_PARENT: 196 case FILEID_INO32_GEN_PARENT:
194 inode = xfs_nfs_get_inode(sb, fid->i32.parent_ino, 197 inode = xfs_nfs_get_inode(sb, fid->i32.parent_ino,
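
xfs gets the same file-handle hardening as reiserfs: before the opaque fid is interpreted per its claimed type (including the cast to struct xfs_fid64), the type must be consistent with the number of words the client actually sent; xfs_fileid_length() is evidently the existing type-to-length mapping, judging by this call site. Annotated shape of the guard:

/* reject truncated handles before any fid->... field is read */
if (fh_len < xfs_fileid_length(fileid_type))
	return NULL;
/* only now is it safe to interpret the handle per its type */
fid64 = (struct xfs_fid64 *)fid;
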