author    Ingo Molnar <mingo@elte.hu>    2009-05-07 05:17:13 -0400
committer Ingo Molnar <mingo@elte.hu>    2009-05-07 05:17:34 -0400
commit    44347d947f628060b92449702071bfe1d31dfb75
tree      c6ed74610d5b3295df4296659f80f5feb94b28cc /fs
parent    d94fc523f3c35bd8013f04827e94756cbc0212f4
parent    413f81eba35d6ede9289b0c8a920c013a84fac71
Merge branch 'linus' into tracing/core
Merge reason: tracing/core was on a .30-rc1 base and was missing out on a
handful of tracing fixes present in .30-rc5-almost.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'fs')
-rw-r--r--  fs/afs/file.c  2
-rw-r--r--  fs/autofs/dirhash.c  34
-rw-r--r--  fs/autofs4/dev-ioctl.c  12
-rw-r--r--  fs/autofs4/expire.c  4
-rw-r--r--  fs/binfmt_elf_fdpic.c  4
-rw-r--r--  fs/bio.c  125
-rw-r--r--  fs/btrfs/Makefile  19
-rw-r--r--  fs/btrfs/acl.c  18
-rw-r--r--  fs/btrfs/async-thread.c  60
-rw-r--r--  fs/btrfs/async-thread.h  2
-rw-r--r--  fs/btrfs/ctree.c  17
-rw-r--r--  fs/btrfs/ctree.h  6
-rw-r--r--  fs/btrfs/disk-io.c  102
-rw-r--r--  fs/btrfs/extent-tree.c  49
-rw-r--r--  fs/btrfs/extent_io.c  167
-rw-r--r--  fs/btrfs/extent_map.c  17
-rw-r--r--  fs/btrfs/file.c  95
-rw-r--r--  fs/btrfs/free-space-cache.c  15
-rw-r--r--  fs/btrfs/inode-map.c  2
-rw-r--r--  fs/btrfs/inode.c  183
-rw-r--r--  fs/btrfs/ioctl.c  58
-rw-r--r--  fs/btrfs/ordered-data.c  2
-rw-r--r--  fs/btrfs/super.c  40
-rw-r--r--  fs/btrfs/transaction.c  6
-rw-r--r--  fs/btrfs/tree-log.c  2
-rw-r--r--  fs/btrfs/volumes.c  159
-rw-r--r--  fs/btrfs/volumes.h  16
-rw-r--r--  fs/buffer.c  66
-rw-r--r--  fs/cifs/CHANGES  3
-rw-r--r--  fs/cifs/cifs_spnego.c  2
-rw-r--r--  fs/cifs/cifsfs.c  48
-rw-r--r--  fs/cifs/cifsglob.h  3
-rw-r--r--  fs/cifs/cifspdu.h  8
-rw-r--r--  fs/cifs/cifssmb.c  6
-rw-r--r--  fs/cifs/connect.c  205
-rw-r--r--  fs/cifs/dir.c  145
-rw-r--r--  fs/cifs/dns_resolve.c  2
-rw-r--r--  fs/cifs/file.c  127
-rw-r--r--  fs/cifs/inode.c  77
-rw-r--r--  fs/cifs/readdir.c  6
-rw-r--r--  fs/cifs/sess.c  47
-rw-r--r--  fs/compat.c  48
-rw-r--r--  fs/compat_ioctl.c  7
-rw-r--r--  fs/configfs/symlink.c  2
-rw-r--r--  fs/dcache.c  1
-rw-r--r--  fs/direct-io.c  2
-rw-r--r--  fs/ecryptfs/crypto.c  21
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h  1
-rw-r--r--  fs/ecryptfs/inode.c  37
-rw-r--r--  fs/ecryptfs/main.c  14
-rw-r--r--  fs/ecryptfs/messaging.c  82
-rw-r--r--  fs/ecryptfs/miscdev.c  43
-rw-r--r--  fs/ecryptfs/mmap.c  11
-rw-r--r--  fs/ecryptfs/read_write.c  32
-rw-r--r--  fs/ecryptfs/super.c  7
-rw-r--r--  fs/exec.c  32
-rw-r--r--  fs/ext2/inode.c  44
-rw-r--r--  fs/ext2/super.c  4
-rw-r--r--  fs/ext4/extents.c  20
-rw-r--r--  fs/ext4/ialloc.c  6
-rw-r--r--  fs/ext4/inode.c  24
-rw-r--r--  fs/fat/Kconfig  3
-rw-r--r--  fs/filesystems.c  2
-rw-r--r--  fs/fuse/file.c  8
-rw-r--r--  fs/gfs2/glock.c  10
-rw-r--r--  fs/gfs2/glops.c  6
-rw-r--r--  fs/gfs2/inode.c  8
-rw-r--r--  fs/gfs2/inode.h  14
-rw-r--r--  fs/gfs2/ops_file.c  12
-rw-r--r--  fs/gfs2/ops_fstype.c  5
-rw-r--r--  fs/gfs2/ops_inode.c  1
-rw-r--r--  fs/gfs2/quota.c  4
-rw-r--r--  fs/gfs2/rgrp.c  13
-rw-r--r--  fs/hfs/inode.c  4
-rw-r--r--  fs/hfs/mdb.c  1
-rw-r--r--  fs/hugetlbfs/inode.c  3
-rw-r--r--  fs/inode.c  36
-rw-r--r--  fs/jbd/commit.c  2
-rw-r--r--  fs/jbd/revoke.c  44
-rw-r--r--  fs/jbd2/commit.c  3
-rw-r--r--  fs/jbd2/revoke.c  21
-rw-r--r--  fs/namei.c  2
-rw-r--r--  fs/namespace.c  7
-rw-r--r--  fs/ncpfs/ioctl.c  21
-rw-r--r--  fs/nfs/file.c  6
-rw-r--r--  fs/nfs/nfs3xdr.c  3
-rw-r--r--  fs/nfsd/nfs4recover.c  46
-rw-r--r--  fs/nfsd/vfs.c  34
-rw-r--r--  fs/nilfs2/bmap.c  5
-rw-r--r--  fs/nilfs2/nilfs.h  5
-rw-r--r--  fs/nilfs2/recovery.c  20
-rw-r--r--  fs/nilfs2/sufile.c  290
-rw-r--r--  fs/nilfs2/sufile.h  79
-rw-r--r--  fs/nilfs2/super.c  7
-rw-r--r--  fs/nilfs2/the_nilfs.c  4
-rw-r--r--  fs/ocfs2/dcache.c  15
-rw-r--r--  fs/ocfs2/dir.c  4
-rw-r--r--  fs/ocfs2/export.c  9
-rw-r--r--  fs/ocfs2/file.c  94
-rw-r--r--  fs/ocfs2/journal.h  5
-rw-r--r--  fs/ocfs2/namei.c  4
-rw-r--r--  fs/ocfs2/suballoc.c  21
-rw-r--r--  fs/pipe.c  42
-rw-r--r--  fs/proc/array.c  13
-rw-r--r--  fs/proc/base.c  9
-rw-r--r--  fs/proc/meminfo.c  2
-rw-r--r--  fs/proc/stat.c  5
-rw-r--r--  fs/proc/task_mmu.c  4
-rw-r--r--  fs/quota/Makefile  9
-rw-r--r--  fs/romfs/internal.h  4
-rw-r--r--  fs/romfs/storage.c  68
-rw-r--r--  fs/romfs/super.c  4
-rw-r--r--  fs/splice.c  370
-rw-r--r--  fs/stat.c  137
-rw-r--r--  fs/sysfs/bin.c  13
-rw-r--r--  fs/sysfs/file.c  16
-rw-r--r--  fs/xattr.c  10
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c  38
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.h  1
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c  9
-rw-r--r--  fs/xfs/linux-2.6/xfs_fs_subr.c  14
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c  23
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.c  12
-rw-r--r--  fs/xfs/linux-2.6/xfs_lrw.c  18
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c  78
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.h  9
-rw-r--r--  fs/xfs/xfs_bmap.c  192
-rw-r--r--  fs/xfs/xfs_iget.c  23
-rw-r--r--  fs/xfs/xfs_inode.c  2
-rw-r--r--  fs/xfs/xfs_iomap.c  61
-rw-r--r--  fs/xfs/xfs_iomap.h  3
-rw-r--r--  fs/xfs/xfs_log.c  78
-rw-r--r--  fs/xfs/xfs_mount.c  3
-rw-r--r--  fs/xfs/xfs_mount.h  2
-rw-r--r--  fs/xfs/xfs_vnodeops.c  7
135 files changed, 2471 insertions, 2088 deletions
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 7a1d942ef68d..0149dab365e7 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -102,6 +102,7 @@ int afs_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+#ifdef CONFIG_AFS_FSCACHE
 /*
  * deal with notification that a page was read from the cache
  */
@@ -117,6 +118,7 @@ static void afs_file_readpage_read_complete(struct page *page,
 		SetPageUptodate(page);
 	unlock_page(page);
 }
+#endif
 
 /*
  * AFS read page from file, directory or symlink
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
index bf8c8af98004..4eb4d8dfb2f1 100644
--- a/fs/autofs/dirhash.c
+++ b/fs/autofs/dirhash.c
@@ -39,10 +39,12 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
 {
 	struct autofs_dirhash *dh = &sbi->dirhash;
 	struct autofs_dir_ent *ent;
-	struct dentry *dentry;
 	unsigned long timeout = sbi->exp_timeout;
 
 	while (1) {
+		struct path path;
+		int umount_ok;
+
 		if ( list_empty(&dh->expiry_head) || sbi->catatonic )
 			return NULL; /* No entries */
 		/* We keep the list sorted by last_usage and want old stuff */
@@ -57,17 +59,17 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
 			return ent; /* Symlinks are always expirable */
 
 		/* Get the dentry for the autofs subdirectory */
-		dentry = ent->dentry;
+		path.dentry = ent->dentry;
 
-		if ( !dentry ) {
+		if (!path.dentry) {
 			/* Should only happen in catatonic mode */
 			printk("autofs: dentry == NULL but inode range is directory, entry %s\n", ent->name);
 			autofs_delete_usage(ent);
 			continue;
 		}
 
-		if ( !dentry->d_inode ) {
-			dput(dentry);
+		if (!path.dentry->d_inode) {
+			dput(path.dentry);
 			printk("autofs: negative dentry on expiry queue: %s\n",
 			       ent->name);
 			autofs_delete_usage(ent);
@@ -76,29 +78,29 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
 
 		/* Make sure entry is mounted and unused; note that dentry will
 		   point to the mounted-on-top root. */
-		if (!S_ISDIR(dentry->d_inode->i_mode)||!d_mountpoint(dentry)) {
+		if (!S_ISDIR(path.dentry->d_inode->i_mode) ||
+		    !d_mountpoint(path.dentry)) {
			DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
 			continue;
 		}
-		mntget(mnt);
-		dget(dentry);
-		if (!follow_down(&mnt, &dentry)) {
-			dput(dentry);
-			mntput(mnt);
+		path.mnt = mnt;
+		path_get(&path);
+		if (!follow_down(&path.mnt, &path.dentry)) {
+			path_put(&path);
 			DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
 			continue;
 		}
-		while (d_mountpoint(dentry) && follow_down(&mnt, &dentry))
+		while (d_mountpoint(path.dentry) &&
+		       follow_down(&path.mnt, &path.dentry))
 			;
-		dput(dentry);
+		umount_ok = may_umount(path.mnt);
+		path_put(&path);
 
-		if ( may_umount(mnt) ) {
-			mntput(mnt);
+		if (umount_ok) {
 			DPRINTK(("autofs: signaling expire on %s\n", ent->name));
 			return ent; /* Expirable! */
 		}
 		DPRINTK(("autofs: didn't expire due to may_umount: %s\n", ent->name));
-		mntput(mnt);
 	}
 	return NULL; /* No expirable entries */
 }
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 9e5ae8a4f5c8..84168c0dcc2d 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -54,11 +54,10 @@ static int check_name(const char *name)
  * Check a string doesn't overrun the chunk of
  * memory we copied from user land.
  */
-static int invalid_str(char *str, void *end)
+static int invalid_str(char *str, size_t size)
 {
-	while ((void *) str <= end)
-		if (!*str++)
-			return 0;
+	if (memchr(str, 0, size))
+		return 0;
 	return -EINVAL;
 }
 
@@ -138,8 +137,7 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
 	}
 
 	if (param->size > sizeof(*param)) {
-		err = invalid_str(param->path,
-				  (void *) ((size_t) param + param->size));
+		err = invalid_str(param->path, param->size - sizeof(*param));
 		if (err) {
 			AUTOFS_WARN(
 			  "path string terminator missing for cmd(0x%08x)",
@@ -488,7 +486,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
 	}
 
 	path = param->path;
-	devid = sbi->sb->s_dev;
+	devid = new_encode_dev(sbi->sb->s_dev);
 
 	param->requester.uid = param->requester.gid = -1;
 
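The invalid_str() change above swaps an open-ended pointer walk for a bounded memchr() scan: a path string that arrives without a NUL terminator is now rejected instead of being read past the end of the copied buffer. A minimal userspace sketch of the same check, with an illustrative main() harness that is not part of the patch:

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* same logic as the patched invalid_str(): 0 if terminated, -EINVAL if not */
static int invalid_str(const char *str, size_t size)
{
	return memchr(str, 0, size) ? 0 : -EINVAL;
}

int main(void)
{
	char ok[8] = "path";                   /* NUL within the first 8 bytes */
	char bad[4] = { 'p', 'a', 't', 'h' };  /* no terminator anywhere */

	printf("%d %d\n", invalid_str(ok, sizeof(ok)),
	       invalid_str(bad, sizeof(bad))); /* prints: 0 -22 */
	return 0;
}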
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 75f7ddacf7d6..3077d8f16523 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -70,8 +70,10 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 		 * Otherwise it's an offset mount and we need to check
 		 * if we can umount its mount, if there is one.
 		 */
-		if (!d_mountpoint(dentry))
+		if (!d_mountpoint(dentry)) {
+			status = 0;
 			goto done;
+		}
 	}
 
 	/* Update the expiry counter if fs is busy */
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 70cfc4b84ae0..fdb66faa24f1 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1388,7 +1388,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 	prstatus->pr_sigpend = p->pending.signal.sig[0];
 	prstatus->pr_sighold = p->blocked.sig[0];
 	prstatus->pr_pid = task_pid_vnr(p);
-	prstatus->pr_ppid = task_pid_vnr(p->parent);
+	prstatus->pr_ppid = task_pid_vnr(p->real_parent);
 	prstatus->pr_pgrp = task_pgrp_vnr(p);
 	prstatus->pr_sid = task_session_vnr(p);
 	if (thread_group_leader(p)) {
@@ -1433,7 +1433,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
 		psinfo->pr_psargs[len] = 0;
 
 	psinfo->pr_pid = task_pid_vnr(p);
-	psinfo->pr_ppid = task_pid_vnr(p->parent);
+	psinfo->pr_ppid = task_pid_vnr(p->real_parent);
 	psinfo->pr_pgrp = task_pgrp_vnr(p);
 	psinfo->pr_sid = task_session_vnr(p);
 
diff --git a/fs/bio.c b/fs/bio.c
index e0c9e545bbfa..98711647ece4 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -175,14 +175,6 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
 	struct bio_vec *bvl;
 
 	/*
-	 * If 'bs' is given, lookup the pool and do the mempool alloc.
-	 * If not, this is a bio_kmalloc() allocation and just do a
-	 * kzalloc() for the exact number of vecs right away.
-	 */
-	if (!bs)
-		bvl = kmalloc(nr * sizeof(struct bio_vec), gfp_mask);
-
-	/*
 	 * see comment near bvec_array define!
 	 */
 	switch (nr) {
@@ -260,21 +252,6 @@ void bio_free(struct bio *bio, struct bio_set *bs)
 	mempool_free(p, bs->bio_pool);
 }
 
-/*
- * default destructor for a bio allocated with bio_alloc_bioset()
- */
-static void bio_fs_destructor(struct bio *bio)
-{
-	bio_free(bio, fs_bio_set);
-}
-
-static void bio_kmalloc_destructor(struct bio *bio)
-{
-	if (bio_has_allocated_vec(bio))
-		kfree(bio->bi_io_vec);
-	kfree(bio);
-}
-
 void bio_init(struct bio *bio)
 {
 	memset(bio, 0, sizeof(*bio));
@@ -301,21 +278,15 @@ void bio_init(struct bio *bio)
 **/
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
+	unsigned long idx = BIO_POOL_NONE;
 	struct bio_vec *bvl = NULL;
-	struct bio *bio = NULL;
-	unsigned long idx = 0;
-	void *p = NULL;
-
-	if (bs) {
-		p = mempool_alloc(bs->bio_pool, gfp_mask);
-		if (!p)
-			goto err;
-		bio = p + bs->front_pad;
-	} else {
-		bio = kmalloc(sizeof(*bio), gfp_mask);
-		if (!bio)
-			goto err;
-	}
+	struct bio *bio;
+	void *p;
+
+	p = mempool_alloc(bs->bio_pool, gfp_mask);
+	if (unlikely(!p))
+		return NULL;
+	bio = p + bs->front_pad;
 
 	bio_init(bio);
 
@@ -332,22 +303,33 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 
 		nr_iovecs = bvec_nr_vecs(idx);
 	}
+out_set:
 	bio->bi_flags |= idx << BIO_POOL_OFFSET;
 	bio->bi_max_vecs = nr_iovecs;
-out_set:
 	bio->bi_io_vec = bvl;
-
 	return bio;
 
 err_free:
-	if (bs)
-		mempool_free(p, bs->bio_pool);
-	else
-		kfree(bio);
-err:
+	mempool_free(p, bs->bio_pool);
 	return NULL;
 }
 
+static void bio_fs_destructor(struct bio *bio)
+{
+	bio_free(bio, fs_bio_set);
+}
+
+/**
+ * bio_alloc - allocate a new bio, memory pool backed
+ * @gfp_mask: allocation mask to use
+ * @nr_iovecs: number of iovecs
+ *
+ * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask
+ * contains __GFP_WAIT, the allocation is guaranteed to succeed.
+ *
+ * RETURNS:
+ * Pointer to new bio on success, NULL on failure.
+ */
 struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
 {
 	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
@@ -358,19 +340,45 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
 	return bio;
 }
 
-/*
- * Like bio_alloc(), but doesn't use a mempool backing. This means that
- * it CAN fail, but while bio_alloc() can only be used for allocations
- * that have a short (finite) life span, bio_kmalloc() should be used
- * for more permanent bio allocations (like allocating some bio's for
- * initalization or setup purposes).
- */
+static void bio_kmalloc_destructor(struct bio *bio)
+{
+	if (bio_integrity(bio))
+		bio_integrity_free(bio);
+	kfree(bio);
+}
+
+/**
+ * bio_alloc - allocate a bio for I/O
+ * @gfp_mask: the GFP_ mask given to the slab allocator
+ * @nr_iovecs: number of iovecs to pre-allocate
+ *
+ * Description:
+ *   bio_alloc will allocate a bio and associated bio_vec array that can hold
+ *   at least @nr_iovecs entries. Allocations will be done from the
+ *   fs_bio_set. Also see @bio_alloc_bioset.
+ *
+ *   If %__GFP_WAIT is set, then bio_alloc will always be able to allocate
+ *   a bio. This is due to the mempool guarantees. To make this work, callers
+ *   must never allocate more than 1 bio at the time from this pool. Callers
+ *   that need to allocate more than 1 bio must always submit the previously
+ *   allocate bio for IO before attempting to allocate a new one. Failure to
+ *   do so can cause livelocks under memory pressure.
+ *
+ **/
 struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
 {
-	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
+	struct bio *bio;
 
-	if (bio)
-		bio->bi_destructor = bio_kmalloc_destructor;
+	bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec),
+		      gfp_mask);
+	if (unlikely(!bio))
+		return NULL;
+
+	bio_init(bio);
+	bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET;
+	bio->bi_max_vecs = nr_iovecs;
+	bio->bi_io_vec = bio->bi_inline_vecs;
+	bio->bi_destructor = bio_kmalloc_destructor;
 
 	return bio;
 }
@@ -809,12 +817,15 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 		len += iov[i].iov_len;
 	}
 
+	if (offset)
+		nr_pages++;
+
 	bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask);
 	if (!bmd)
 		return ERR_PTR(-ENOMEM);
 
 	ret = -ENOMEM;
-	bio = bio_alloc(gfp_mask, nr_pages);
+	bio = bio_kmalloc(gfp_mask, nr_pages);
 	if (!bio)
 		goto out_bmd;
 
@@ -938,7 +949,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
 	if (!nr_pages)
 		return ERR_PTR(-EINVAL);
 
-	bio = bio_alloc(gfp_mask, nr_pages);
+	bio = bio_kmalloc(gfp_mask, nr_pages);
 	if (!bio)
 		return ERR_PTR(-ENOMEM);
 
@@ -1122,7 +1133,7 @@ static struct bio *__bio_map_kern(struct request_queue *q, void *data,
 	int offset, i;
 	struct bio *bio;
 
-	bio = bio_alloc(gfp_mask, nr_pages);
+	bio = bio_kmalloc(gfp_mask, nr_pages);
 	if (!bio)
 		return ERR_PTR(-ENOMEM);
 
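The bio_kmalloc() rewrite above switches to a single kmalloc() sized for the bio plus its vector array and points bi_io_vec at the inline tail (bi_inline_vecs), so teardown is a single kfree() in the destructor. A hedged userspace sketch of that allocation layout, using simplified stand-in types rather than the kernel's:

#include <stdio.h>
#include <stdlib.h>

struct vec { void *page; unsigned int len, offset; };

struct mini_bio {
	unsigned short max_vecs;
	struct vec *io_vec;       /* will point at inline_vecs below */
	struct vec inline_vecs[]; /* flexible array, like bi_inline_vecs */
};

/* one allocation covers the struct and nr vectors, as in bio_kmalloc() */
static struct mini_bio *mini_bio_kmalloc(int nr)
{
	struct mini_bio *b = malloc(sizeof(*b) + nr * sizeof(struct vec));

	if (!b)
		return NULL;
	b->max_vecs = nr;
	b->io_vec = b->inline_vecs; /* no second allocation for the vecs */
	return b;
}

int main(void)
{
	struct mini_bio *b = mini_bio_kmalloc(4);

	if (b)
		printf("one allocation, %u inline vecs\n", b->max_vecs);
	free(b); /* single free, mirroring the one kfree() in the destructor */
	return 0;
}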
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 9adf5e4f7e96..94212844a9bc 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -1,25 +1,10 @@
-ifneq ($(KERNELRELEASE),)
-# kbuild part of makefile
 
 obj-$(CONFIG_BTRFS_FS) := btrfs.o
-btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
+
+btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
 	   compression.o delayed-ref.o
-else
-
-# Normal Makefile
-
-KERNELDIR := /lib/modules/`uname -r`/build
-all:
-	$(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules
-
-modules_install:
-	$(MAKE) -C $(KERNELDIR) M=`pwd` modules_install
-clean:
-	$(MAKE) -C $(KERNELDIR) M=`pwd` clean
-
-endif
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 7fdd184a528d..cbba000dccbe 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -60,15 +60,20 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 		return ERR_PTR(-EINVAL);
 	}
 
+	/* Handle the cached NULL acl case without locking */
+	acl = ACCESS_ONCE(*p_acl);
+	if (!acl)
+		return acl;
+
 	spin_lock(&inode->i_lock);
-	if (*p_acl != BTRFS_ACL_NOT_CACHED)
-		acl = posix_acl_dup(*p_acl);
+	acl = *p_acl;
+	if (acl != BTRFS_ACL_NOT_CACHED)
+		acl = posix_acl_dup(acl);
 	spin_unlock(&inode->i_lock);
 
-	if (acl)
+	if (acl != BTRFS_ACL_NOT_CACHED)
 		return acl;
 
-
 	size = __btrfs_getxattr(inode, name, "", 0);
 	if (size > 0) {
 		value = kzalloc(size, GFP_NOFS);
@@ -80,9 +85,12 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 			btrfs_update_cached_acl(inode, p_acl, acl);
 		}
 		kfree(value);
-	} else if (size == -ENOENT) {
+	} else if (size == -ENOENT || size == -ENODATA || size == 0) {
+		/* FIXME, who returns -ENOENT? I think nobody */
 		acl = NULL;
 		btrfs_update_cached_acl(inode, p_acl, acl);
+	} else {
+		acl = ERR_PTR(-EIO);
 	}
 
 	return acl;
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 51bfdfc8fcda..502c3d61de62 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -25,6 +25,7 @@
 #define WORK_QUEUED_BIT 0
 #define WORK_DONE_BIT 1
 #define WORK_ORDER_DONE_BIT 2
+#define WORK_HIGH_PRIO_BIT 3
 
 /*
  * container for the kthread task pointer and the list of pending work
@@ -36,6 +37,7 @@ struct btrfs_worker_thread {
 
 	/* list of struct btrfs_work that are waiting for service */
 	struct list_head pending;
+	struct list_head prio_pending;
 
 	/* list of worker threads from struct btrfs_workers */
 	struct list_head worker_list;
@@ -103,10 +105,16 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
 
 	spin_lock_irqsave(&workers->lock, flags);
 
-	while (!list_empty(&workers->order_list)) {
-		work = list_entry(workers->order_list.next,
-				  struct btrfs_work, order_list);
-
+	while (1) {
+		if (!list_empty(&workers->prio_order_list)) {
+			work = list_entry(workers->prio_order_list.next,
+					  struct btrfs_work, order_list);
+		} else if (!list_empty(&workers->order_list)) {
+			work = list_entry(workers->order_list.next,
+					  struct btrfs_work, order_list);
+		} else {
+			break;
+		}
 		if (!test_bit(WORK_DONE_BIT, &work->flags))
 			break;
 
@@ -143,8 +151,14 @@ static int worker_loop(void *arg)
 	do {
 		spin_lock_irq(&worker->lock);
 again_locked:
-		while (!list_empty(&worker->pending)) {
-			cur = worker->pending.next;
+		while (1) {
+			if (!list_empty(&worker->prio_pending))
+				cur = worker->prio_pending.next;
+			else if (!list_empty(&worker->pending))
+				cur = worker->pending.next;
+			else
+				break;
+
 			work = list_entry(cur, struct btrfs_work, list);
 			list_del(&work->list);
 			clear_bit(WORK_QUEUED_BIT, &work->flags);
@@ -163,7 +177,6 @@ again_locked:
 
 			spin_lock_irq(&worker->lock);
 			check_idle_worker(worker);
-
 		}
 		if (freezing(current)) {
 			worker->working = 0;
@@ -178,7 +191,8 @@ again_locked:
 			 * jump_in?
 			 */
 			smp_mb();
-			if (!list_empty(&worker->pending))
+			if (!list_empty(&worker->pending) ||
+			    !list_empty(&worker->prio_pending))
 				continue;
 
 			/*
@@ -191,7 +205,8 @@ again_locked:
 			 */
 			schedule_timeout(1);
 			smp_mb();
-			if (!list_empty(&worker->pending))
+			if (!list_empty(&worker->pending) ||
+			    !list_empty(&worker->prio_pending))
 				continue;
 
 			if (kthread_should_stop())
@@ -200,7 +215,8 @@ again_locked:
 		/* still no more work?, sleep for real */
 		spin_lock_irq(&worker->lock);
 		set_current_state(TASK_INTERRUPTIBLE);
-		if (!list_empty(&worker->pending))
+		if (!list_empty(&worker->pending) ||
+		    !list_empty(&worker->prio_pending))
 			goto again_locked;
 
 		/*
@@ -248,6 +264,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
 	INIT_LIST_HEAD(&workers->worker_list);
 	INIT_LIST_HEAD(&workers->idle_list);
 	INIT_LIST_HEAD(&workers->order_list);
+	INIT_LIST_HEAD(&workers->prio_order_list);
 	spin_lock_init(&workers->lock);
 	workers->max_workers = max;
 	workers->idle_thresh = 32;
@@ -273,6 +290,7 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
 	}
 
 	INIT_LIST_HEAD(&worker->pending);
+	INIT_LIST_HEAD(&worker->prio_pending);
 	INIT_LIST_HEAD(&worker->worker_list);
 	spin_lock_init(&worker->lock);
 	atomic_set(&worker->num_pending, 0);
@@ -396,7 +414,10 @@ int btrfs_requeue_work(struct btrfs_work *work)
 		goto out;
 
 	spin_lock_irqsave(&worker->lock, flags);
-	list_add_tail(&work->list, &worker->pending);
+	if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
+		list_add_tail(&work->list, &worker->prio_pending);
+	else
+		list_add_tail(&work->list, &worker->pending);
 	atomic_inc(&worker->num_pending);
 
 	/* by definition we're busy, take ourselves off the idle
@@ -422,6 +443,11 @@ out:
 	return 0;
 }
 
+void btrfs_set_work_high_prio(struct btrfs_work *work)
+{
+	set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+}
+
 /*
  * places a struct btrfs_work into the pending queue of one of the kthreads
  */
@@ -438,7 +464,12 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 	worker = find_worker(workers);
 	if (workers->ordered) {
 		spin_lock_irqsave(&workers->lock, flags);
-		list_add_tail(&work->order_list, &workers->order_list);
+		if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
+			list_add_tail(&work->order_list,
+				      &workers->prio_order_list);
+		} else {
+			list_add_tail(&work->order_list, &workers->order_list);
+		}
 		spin_unlock_irqrestore(&workers->lock, flags);
 	} else {
 		INIT_LIST_HEAD(&work->order_list);
@@ -446,7 +477,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 
 	spin_lock_irqsave(&worker->lock, flags);
 
-	list_add_tail(&work->list, &worker->pending);
+	if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
+		list_add_tail(&work->list, &worker->prio_pending);
+	else
+		list_add_tail(&work->list, &worker->pending);
 	atomic_inc(&worker->num_pending);
 	check_busy_worker(worker);
 
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 31be4ed8b63e..1b511c109db6 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -85,6 +85,7 @@ struct btrfs_workers {
 	 * of work items waiting for completion
 	 */
 	struct list_head order_list;
+	struct list_head prio_order_list;
 
 	/* lock for finding the next worker thread to queue on */
 	spinlock_t lock;
@@ -98,4 +99,5 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
 int btrfs_stop_workers(struct btrfs_workers *workers);
 void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
 int btrfs_requeue_work(struct btrfs_work *work);
+void btrfs_set_work_high_prio(struct btrfs_work *work);
 #endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index e5b2533b691a..a99f1c2a710d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1325,12 +1325,12 @@ static noinline int reada_for_balance(struct btrfs_root *root,
 	int ret = 0;
 	int blocksize;
 
-	parent = path->nodes[level - 1];
+	parent = path->nodes[level + 1];
 	if (!parent)
 		return 0;
 
 	nritems = btrfs_header_nritems(parent);
-	slot = path->slots[level];
+	slot = path->slots[level + 1];
 	blocksize = btrfs_level_size(root, level);
 
 	if (slot > 0) {
@@ -1341,7 +1341,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
 		block1 = 0;
 		free_extent_buffer(eb);
 	}
-	if (slot < nritems) {
+	if (slot + 1 < nritems) {
 		block2 = btrfs_node_blockptr(parent, slot + 1);
 		gen = btrfs_node_ptr_generation(parent, slot + 1);
 		eb = btrfs_find_tree_block(root, block2, blocksize);
@@ -1351,7 +1351,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
 	}
 	if (block1 || block2) {
 		ret = -EAGAIN;
+
+		/* release the whole path */
 		btrfs_release_path(root, path);
+
+		/* read the blocks */
 		if (block1)
 			readahead_tree_block(root, block1, blocksize, 0);
 		if (block2)
@@ -1361,7 +1365,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
 		eb = read_tree_block(root, block1, blocksize, 0);
 		free_extent_buffer(eb);
 	}
-	if (block1) {
+	if (block2) {
 		eb = read_tree_block(root, block2, blocksize, 0);
 		free_extent_buffer(eb);
 	}
@@ -1481,12 +1485,15 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 		 * of the btree by dropping locks before
 		 * we read.
 		 */
-		btrfs_release_path(NULL, p);
+		btrfs_unlock_up_safe(p, level + 1);
+		btrfs_set_path_blocking(p);
+
 		if (tmp)
 			free_extent_buffer(tmp);
 		if (p->reada)
 			reada_for_search(root, p, level, slot, key->objectid);
 
+		btrfs_release_path(NULL, p);
 		tmp = read_tree_block(root, blocknr, blocksize, gen);
 		if (tmp)
 			free_extent_buffer(tmp);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ad96495dedc5..4414a5d9983a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -881,6 +881,9 @@ struct btrfs_fs_info {
 	u64 metadata_alloc_profile;
 	u64 system_alloc_profile;
 
+	unsigned data_chunk_allocations;
+	unsigned metadata_ratio;
+
 	void *bdev_holder;
 };
 
@@ -2174,7 +2177,8 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
 extern struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
-		       u64 start, u64 end, u64 inline_limit, u64 *hint_block);
+		       u64 start, u64 end, u64 locked_end,
+		       u64 inline_limit, u64 *hint_block);
 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct inode *inode, u64 start, u64 end);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 92caa8035f36..0ff16d3331da 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -232,10 +232,14 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 		memcpy(&found, result, csum_size);
 
 		read_extent_buffer(buf, &val, 0, csum_size);
-		printk(KERN_INFO "btrfs: %s checksum verify failed "
-		       "on %llu wanted %X found %X level %d\n",
-		       root->fs_info->sb->s_id,
-		       buf->start, val, found, btrfs_header_level(buf));
+		if (printk_ratelimit()) {
+			printk(KERN_INFO "btrfs: %s checksum verify "
+			       "failed on %llu wanted %X found %X "
+			       "level %d\n",
+			       root->fs_info->sb->s_id,
+			       (unsigned long long)buf->start, val, found,
+			       btrfs_header_level(buf));
+		}
 		if (result != (char *)&inline_result)
 			kfree(result);
 		return 1;
@@ -268,10 +272,13 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
 		ret = 0;
 		goto out;
 	}
-	printk("parent transid verify failed on %llu wanted %llu found %llu\n",
-	       (unsigned long long)eb->start,
-	       (unsigned long long)parent_transid,
-	       (unsigned long long)btrfs_header_generation(eb));
+	if (printk_ratelimit()) {
+		printk("parent transid verify failed on %llu wanted %llu "
+		       "found %llu\n",
+		       (unsigned long long)eb->start,
+		       (unsigned long long)parent_transid,
+		       (unsigned long long)btrfs_header_generation(eb));
+	}
 	ret = 1;
 	clear_extent_buffer_uptodate(io_tree, eb);
 out:
@@ -415,9 +422,12 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
-		printk(KERN_INFO "btrfs bad tree block start %llu %llu\n",
-		       (unsigned long long)found_start,
-		       (unsigned long long)eb->start);
+		if (printk_ratelimit()) {
+			printk(KERN_INFO "btrfs bad tree block start "
+			       "%llu %llu\n",
+			       (unsigned long long)found_start,
+			       (unsigned long long)eb->start);
+		}
 		ret = -EIO;
 		goto err;
 	}
@@ -429,8 +439,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 		goto err;
 	}
 	if (check_tree_block_fsid(root, eb)) {
-		printk(KERN_INFO "btrfs bad fsid on block %llu\n",
-		       (unsigned long long)eb->start);
+		if (printk_ratelimit()) {
+			printk(KERN_INFO "btrfs bad fsid on block %llu\n",
+			       (unsigned long long)eb->start);
+		}
 		ret = -EIO;
 		goto err;
 	}
@@ -579,19 +591,12 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	async->bio_flags = bio_flags;
 
 	atomic_inc(&fs_info->nr_async_submits);
+
+	if (rw & (1 << BIO_RW_SYNCIO))
+		btrfs_set_work_high_prio(&async->work);
+
 	btrfs_queue_worker(&fs_info->workers, &async->work);
-#if 0
-	int limit = btrfs_async_submit_limit(fs_info);
-	if (atomic_read(&fs_info->nr_async_submits) > limit) {
-		wait_event_timeout(fs_info->async_submit_wait,
-			   (atomic_read(&fs_info->nr_async_submits) < limit),
-			   HZ/10);
 
-		wait_event_timeout(fs_info->async_submit_wait,
-			   (atomic_read(&fs_info->nr_async_bios) < limit),
-			   HZ/10);
-	}
-#endif
 	while (atomic_read(&fs_info->async_submit_draining) &&
 	      atomic_read(&fs_info->nr_async_submits)) {
 		wait_event(fs_info->async_submit_wait,
@@ -656,6 +661,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
 				     mirror_num, 0);
 	}
+
 	/*
 	 * kthread helpers are used to submit writes so that checksumming
 	 * can happen in parallel across all CPUs
@@ -765,27 +771,6 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
 	}
 }
 
-#if 0
-static int btree_writepage(struct page *page, struct writeback_control *wbc)
-{
-	struct buffer_head *bh;
-	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-	struct buffer_head *head;
-	if (!page_has_buffers(page)) {
-		create_empty_buffers(page, root->fs_info->sb->s_blocksize,
-					(1 << BH_Dirty)|(1 << BH_Uptodate));
-	}
-	head = page_buffers(page);
-	bh = head;
-	do {
-		if (buffer_dirty(bh))
-			csum_tree_block(root, bh, 0);
-		bh = bh->b_this_page;
-	} while (bh != head);
-	return block_write_full_page(page, btree_get_block, wbc);
-}
-#endif
-
 static struct address_space_operations btree_aops = {
 	.readpage = btree_readpage,
 	.writepage = btree_writepage,
@@ -1273,11 +1258,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 	int ret = 0;
 	struct btrfs_device *device;
 	struct backing_dev_info *bdi;
-#if 0
-	if ((bdi_bits & (1 << BDI_write_congested)) &&
-	    btrfs_congested_async(info, 0))
-		return 1;
-#endif
+
 	list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
 		if (!device->bdev)
 			continue;
@@ -1599,6 +1580,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
 	fs_info->btree_inode->i_nlink = 1;
+	fs_info->metadata_ratio = 8;
 
 	fs_info->thread_pool_size = min_t(unsigned long,
 					  num_online_cpus() + 2, 8);
@@ -1689,7 +1671,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (features) {
 		printk(KERN_ERR "BTRFS: couldn't mount because of "
 		       "unsupported optional features (%Lx).\n",
-		       features);
+		       (unsigned long long)features);
 		err = -EINVAL;
 		goto fail_iput;
 	}
@@ -1699,7 +1681,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (!(sb->s_flags & MS_RDONLY) && features) {
 		printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
 		       "unsupported option features (%Lx).\n",
-		       features);
+		       (unsigned long long)features);
 		err = -EINVAL;
 		goto fail_iput;
 	}
@@ -2095,10 +2077,10 @@ static int write_dev_supers(struct btrfs_device *device,
 				device->barriers = 0;
 				get_bh(bh);
 				lock_buffer(bh);
-				ret = submit_bh(WRITE, bh);
+				ret = submit_bh(WRITE_SYNC, bh);
 			}
 		} else {
-			ret = submit_bh(WRITE, bh);
+			ret = submit_bh(WRITE_SYNC, bh);
 		}
 
 		if (!ret && wait) {
@@ -2291,7 +2273,7 @@ int close_ctree(struct btrfs_root *root)
 
 	if (fs_info->delalloc_bytes) {
 		printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
-		       fs_info->delalloc_bytes);
+		       (unsigned long long)fs_info->delalloc_bytes);
 	}
 	if (fs_info->total_ref_cache_size) {
 		printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
@@ -2328,16 +2310,6 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
 
-#if 0
-	while (!list_empty(&fs_info->hashers)) {
-		struct btrfs_hasher *hasher;
-		hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher,
-				    hashers);
-		list_del(&hasher->hashers);
-		crypto_free_hash(&fs_info->hash_tfm);
-		kfree(hasher);
-	}
-#endif
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 178df4c67de4..e4966444811b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1844,10 +1844,14 @@ again:
 		printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
 		       ", %llu bytes_used, %llu bytes_reserved, "
 		       "%llu bytes_pinned, %llu bytes_readonly, %llu may use"
-		       "%llu total\n", bytes, data_sinfo->bytes_delalloc,
-		       data_sinfo->bytes_used, data_sinfo->bytes_reserved,
-		       data_sinfo->bytes_pinned, data_sinfo->bytes_readonly,
-		       data_sinfo->bytes_may_use, data_sinfo->total_bytes);
+		       "%llu total\n", (unsigned long long)bytes,
+		       (unsigned long long)data_sinfo->bytes_delalloc,
+		       (unsigned long long)data_sinfo->bytes_used,
+		       (unsigned long long)data_sinfo->bytes_reserved,
+		       (unsigned long long)data_sinfo->bytes_pinned,
+		       (unsigned long long)data_sinfo->bytes_readonly,
+		       (unsigned long long)data_sinfo->bytes_may_use,
+		       (unsigned long long)data_sinfo->total_bytes);
 		return -ENOSPC;
 	}
 	data_sinfo->bytes_may_use += bytes;
@@ -1918,15 +1922,29 @@ void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
 	spin_unlock(&info->lock);
 }
 
+static void force_metadata_allocation(struct btrfs_fs_info *info)
+{
+	struct list_head *head = &info->space_info;
+	struct btrfs_space_info *found;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(found, head, list) {
+		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
+			found->force_alloc = 1;
+	}
+	rcu_read_unlock();
+}
+
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
 			  u64 flags, int force)
 {
 	struct btrfs_space_info *space_info;
+	struct btrfs_fs_info *fs_info = extent_root->fs_info;
 	u64 thresh;
 	int ret = 0;
 
-	mutex_lock(&extent_root->fs_info->chunk_mutex);
+	mutex_lock(&fs_info->chunk_mutex);
 
 	flags = btrfs_reduce_alloc_profile(extent_root, flags);
 
@@ -1958,6 +1976,18 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	}
 	spin_unlock(&space_info->lock);
 
+	/*
+	 * if we're doing a data chunk, go ahead and make sure that
+	 * we keep a reasonable number of metadata chunks allocated in the
+	 * FS as well.
+	 */
+	if (flags & BTRFS_BLOCK_GROUP_DATA) {
+		fs_info->data_chunk_allocations++;
+		if (!(fs_info->data_chunk_allocations %
+		      fs_info->metadata_ratio))
+			force_metadata_allocation(fs_info);
+	}
+
 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
 	if (ret)
 		space_info->full = 1;
@@ -2798,9 +2828,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
 	       info->bytes_pinned - info->bytes_reserved),
 	       (info->full) ? "" : "not ");
 	printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
-	       " may_use=%llu, used=%llu\n", info->total_bytes,
-	       info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use,
-	       info->bytes_used);
+	       " may_use=%llu, used=%llu\n",
+	       (unsigned long long)info->total_bytes,
+	       (unsigned long long)info->bytes_pinned,
+	       (unsigned long long)info->bytes_delalloc,
+	       (unsigned long long)info->bytes_may_use,
+	       (unsigned long long)info->bytes_used);
 
 	down_read(&info->groups_sem);
 	list_for_each_entry(cache, &info->block_groups, list) {
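The do_chunk_alloc() hunk above pairs with the metadata_ratio = 8 default set in open_ctree() (see the disk-io.c hunk above): every eighth data chunk allocation forces a metadata chunk allocation so metadata space keeps pace with data. A small standalone sketch of just that counter logic, assuming only the stated default:

#include <stdio.h>

int main(void)
{
	unsigned int data_chunk_allocations = 0;
	unsigned int metadata_ratio = 8; /* default from open_ctree() */

	for (int i = 0; i < 20; i++) {
		data_chunk_allocations++;
		if (!(data_chunk_allocations % metadata_ratio))
			printf("data chunk %u: force metadata allocation\n",
			       data_chunk_allocations);
	}
	return 0; /* fires at allocations 8 and 16 */
}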
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index eb2bee8b7fbf..fe9eb990e443 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,12 +17,6 @@
 #include "ctree.h"
 #include "btrfs_inode.h"
 
-/* temporary define until extent_map moves out of btrfs */
-struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
-				       unsigned long extra_flags,
-				       void (*ctor)(void *, struct kmem_cache *,
-						    unsigned long));
-
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
 
@@ -50,20 +44,23 @@ struct extent_page_data {
 	/* tells writepage not to lock the state bits for this range
 	 * it still does the unlocking
 	 */
-	int extent_locked;
+	unsigned int extent_locked:1;
+
+	/* tells the submit_bio code to use a WRITE_SYNC */
+	unsigned int sync_io:1;
 };
 
 int __init extent_io_init(void)
 {
-	extent_state_cache = btrfs_cache_create("extent_state",
-					    sizeof(struct extent_state), 0,
-					    NULL);
+	extent_state_cache = kmem_cache_create("extent_state",
+			sizeof(struct extent_state), 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!extent_state_cache)
 		return -ENOMEM;
 
-	extent_buffer_cache = btrfs_cache_create("extent_buffers",
-					    sizeof(struct extent_buffer), 0,
-					    NULL);
+	extent_buffer_cache = kmem_cache_create("extent_buffers",
			sizeof(struct extent_buffer), 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!extent_buffer_cache)
 		goto free_state_cache;
 	return 0;
@@ -1404,69 +1401,6 @@ out:
 	return total_bytes;
 }
 
-#if 0
-/*
- * helper function to lock both pages and extents in the tree.
- * pages must be locked first.
- */
-static int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
-{
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
-	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
-	struct page *page;
-	int err;
-
-	while (index <= end_index) {
-		page = grab_cache_page(tree->mapping, index);
-		if (!page) {
-			err = -ENOMEM;
-			goto failed;
-		}
-		if (IS_ERR(page)) {
-			err = PTR_ERR(page);
-			goto failed;
-		}
-		index++;
-	}
-	lock_extent(tree, start, end, GFP_NOFS);
-	return 0;
-
-failed:
-	/*
-	 * we failed above in getting the page at 'index', so we undo here
-	 * up to but not including the page at 'index'
-	 */
-	end_index = index;
-	index = start >> PAGE_CACHE_SHIFT;
-	while (index < end_index) {
-		page = find_get_page(tree->mapping, index);
-		unlock_page(page);
-		page_cache_release(page);
-		index++;
-	}
-	return err;
-}
-
-/*
- * helper function to unlock both pages and extents in the tree.
- */
-static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
-{
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
-	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
-	struct page *page;
-
-	while (index <= end_index) {
-		page = find_get_page(tree->mapping, index);
-		unlock_page(page);
-		page_cache_release(page);
-		index++;
-	}
-	unlock_extent(tree, start, end, GFP_NOFS);
-	return 0;
-}
-#endif
-
 /*
  * set the private field for a given byte offset in the tree. If there isn't
  * an extent_state there already, this does nothing.
@@ -2101,6 +2035,16 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
 	return ret;
 }
 
+static noinline void update_nr_written(struct page *page,
+				       struct writeback_control *wbc,
+				       unsigned long nr_written)
+{
+	wbc->nr_to_write -= nr_written;
+	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
+	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
+		page->mapping->writeback_index = page->index + nr_written;
+}
+
 /*
  * the writepage semantics are similar to regular writepage. extent
  * records are inserted to lock ranges in the tree, and as dirty areas
@@ -2136,8 +2080,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	u64 delalloc_end;
 	int page_started;
 	int compressed;
+	int write_flags;
 	unsigned long nr_written = 0;
 
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		write_flags = WRITE_SYNC_PLUG;
+	else
+		write_flags = WRITE;
+
 	WARN_ON(!PageLocked(page));
 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
 	if (page->index > end_index ||
@@ -2164,6 +2114,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	delalloc_end = 0;
 	page_started = 0;
 	if (!epd->extent_locked) {
+		/*
+		 * make sure the wbc mapping index is at least updated
+		 * to this page.
+		 */
+		update_nr_written(page, wbc, 0);
+
 		while (delalloc_end < page_end) {
 			nr_delalloc = find_lock_delalloc_range(inode, tree,
 							       page,
@@ -2185,7 +2141,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		 */
 		if (page_started) {
 			ret = 0;
-			goto update_nr_written;
+			/*
+			 * we've unlocked the page, so we can't update
+			 * the mapping's writeback index, just update
+			 * nr_to_write.
+			 */
+			wbc->nr_to_write -= nr_written;
+			goto done_unlocked;
 		}
 	}
 	lock_extent(tree, start, page_end, GFP_NOFS);
@@ -2198,13 +2160,18 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		if (ret == -EAGAIN) {
 			unlock_extent(tree, start, page_end, GFP_NOFS);
 			redirty_page_for_writepage(wbc, page);
+			update_nr_written(page, wbc, nr_written);
 			unlock_page(page);
 			ret = 0;
-			goto update_nr_written;
+			goto done_unlocked;
 		}
 	}
 
-	nr_written++;
+	/*
+	 * we don't want to touch the inode after unlocking the page,
+	 * so we update the mapping writeback index now
+	 */
+	update_nr_written(page, wbc, nr_written + 1);
 
 	end = page_end;
 	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
@@ -2314,9 +2281,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			       (unsigned long long)end);
 		}
 
-		ret = submit_extent_page(WRITE, tree, page, sector,
-					 iosize, pg_offset, bdev,
-					 &epd->bio, max_nr,
+		ret = submit_extent_page(write_flags, tree, page,
+					 sector, iosize, pg_offset,
+					 bdev, &epd->bio, max_nr,
 					 end_bio_extent_writepage,
 					 0, 0, 0);
 		if (ret)
@@ -2336,11 +2303,8 @@ done:
 		unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
 	unlock_page(page);
 
-update_nr_written:
-	wbc->nr_to_write -= nr_written;
-	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
-	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
-		page->mapping->writeback_index = page->index + nr_written;
+done_unlocked:
+
 	return 0;
 }
 
@@ -2460,15 +2424,23 @@ retry:
 	return ret;
 }
 
-static noinline void flush_write_bio(void *data)
+static void flush_epd_write_bio(struct extent_page_data *epd)
 {
-	struct extent_page_data *epd = data;
 	if (epd->bio) {
-		submit_one_bio(WRITE, epd->bio, 0, 0);
+		if (epd->sync_io)
+			submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
+		else
+			submit_one_bio(WRITE, epd->bio, 0, 0);
 		epd->bio = NULL;
 	}
 }
 
+static noinline void flush_write_bio(void *data)
+{
+	struct extent_page_data *epd = data;
+	flush_epd_write_bio(epd);
+}
+
 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 			  get_extent_t *get_extent,
 			  struct writeback_control *wbc)
@@ -2480,23 +2452,22 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 		.tree = tree,
 		.get_extent = get_extent,
 		.extent_locked = 0,
+		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
 	};
 	struct writeback_control wbc_writepages = {
 		.bdi = wbc->bdi,
-		.sync_mode = WB_SYNC_NONE,
+		.sync_mode = wbc->sync_mode,
 		.older_than_this = NULL,
 		.nr_to_write = 64,
 		.range_start = page_offset(page) + PAGE_CACHE_SIZE,
 		.range_end = (loff_t)-1,
 	};
 
-
 	ret = __extent_writepage(page, wbc, &epd);
 
 	extent_write_cache_pages(tree, mapping, &wbc_writepages,
 				 __extent_writepage, &epd, flush_write_bio);
2498 if (epd.bio) 2470 flush_epd_write_bio(&epd);
2499 submit_one_bio(WRITE, epd.bio, 0, 0);
2500 return ret; 2471 return ret;
2501} 2472}
2502 2473
@@ -2515,6 +2486,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2515 .tree = tree, 2486 .tree = tree,
2516 .get_extent = get_extent, 2487 .get_extent = get_extent,
2517 .extent_locked = 1, 2488 .extent_locked = 1,
2489 .sync_io = mode == WB_SYNC_ALL,
2518 }; 2490 };
2519 struct writeback_control wbc_writepages = { 2491 struct writeback_control wbc_writepages = {
2520 .bdi = inode->i_mapping->backing_dev_info, 2492 .bdi = inode->i_mapping->backing_dev_info,
@@ -2540,8 +2512,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2540 start += PAGE_CACHE_SIZE; 2512 start += PAGE_CACHE_SIZE;
2541 } 2513 }
2542 2514
2543 if (epd.bio) 2515 flush_epd_write_bio(&epd);
2544 submit_one_bio(WRITE, epd.bio, 0, 0);
2545 return ret; 2516 return ret;
2546} 2517}
2547 2518
@@ -2556,13 +2527,13 @@ int extent_writepages(struct extent_io_tree *tree,
2556 .tree = tree, 2527 .tree = tree,
2557 .get_extent = get_extent, 2528 .get_extent = get_extent,
2558 .extent_locked = 0, 2529 .extent_locked = 0,
2530 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2559 }; 2531 };
2560 2532
2561 ret = extent_write_cache_pages(tree, mapping, wbc, 2533 ret = extent_write_cache_pages(tree, mapping, wbc,
2562 __extent_writepage, &epd, 2534 __extent_writepage, &epd,
2563 flush_write_bio); 2535 flush_write_bio);
2564 if (epd.bio) 2536 flush_epd_write_bio(&epd);
2565 submit_one_bio(WRITE, epd.bio, 0, 0);
2566 return ret; 2537 return ret;
2567} 2538}
2568 2539
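/*
 * A condensed sketch of the policy the extent_io.c hunks above introduce:
 * WB_SYNC_ALL writeback (fsync and friends) is mapped onto the block
 * layer's synchronous write hints once, up front, and the result is then
 * carried in write_flags / epd->sync_io.  Assumes the 2.6.30-era
 * WRITE_SYNC / WRITE_SYNC_PLUG definitions; the helper name is
 * illustrative, not part of the patch.
 */
static inline int extent_write_flags(struct writeback_control *wbc)
{
	/*
	 * WRITE_SYNC_PLUG marks the bio synchronous but leaves the queue
	 * plugged, so consecutive pages from one writepage pass can still
	 * merge before the final unplug.
	 */
	return (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC_PLUG : WRITE;
}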
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index b187917b36fa..30c9365861e6 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -6,19 +6,14 @@
 #include <linux/hardirq.h>
 #include "extent_map.h"
 
-/* temporary define until extent_map moves out of btrfs */
-struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
-				       unsigned long extra_flags,
-				       void (*ctor)(void *, struct kmem_cache *,
-						    unsigned long));
 
 static struct kmem_cache *extent_map_cache;
 
 int __init extent_map_init(void)
 {
-	extent_map_cache = btrfs_cache_create("extent_map",
+	extent_map_cache = kmem_cache_create("extent_map",
 			sizeof(struct extent_map), 0,
-			NULL);
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!extent_map_cache)
 		return -ENOMEM;
 	return 0;
@@ -43,7 +38,6 @@ void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
 	tree->map.rb_node = NULL;
 	spin_lock_init(&tree->lock);
 }
-EXPORT_SYMBOL(extent_map_tree_init);
 
 /**
  * alloc_extent_map - allocate new extent map structure
@@ -64,7 +58,6 @@ struct extent_map *alloc_extent_map(gfp_t mask)
 	atomic_set(&em->refs, 1);
 	return em;
 }
-EXPORT_SYMBOL(alloc_extent_map);
 
 /**
  * free_extent_map - drop reference count of an extent_map
@@ -83,7 +76,6 @@ void free_extent_map(struct extent_map *em)
 		kmem_cache_free(extent_map_cache, em);
 	}
 }
-EXPORT_SYMBOL(free_extent_map);
 
 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 				   struct rb_node *node)
@@ -264,7 +256,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
 out:
 	return ret;
 }
-EXPORT_SYMBOL(add_extent_mapping);
 
 /* simple helper to do math around the end of an extent, handling wrap */
 static u64 range_end(u64 start, u64 len)
@@ -326,7 +317,6 @@ found:
 out:
 	return em;
 }
-EXPORT_SYMBOL(lookup_extent_mapping);
 
 /**
  * remove_extent_mapping - removes an extent_map from the extent tree
@@ -346,4 +336,3 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
 	em->in_tree = 0;
 	return ret;
 }
-EXPORT_SYMBOL(remove_extent_mapping);
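/*
 * With btrfs_cache_create() gone, the caches above call
 * kmem_cache_create() directly and spell out the flags the wrapper used
 * to add implicitly.  A minimal sketch of the pattern (cache name
 * illustrative):
 */
static struct kmem_cache *example_cache;

static int __init example_cache_init(void)
{
	example_cache = kmem_cache_create("example_cache",
			sizeof(struct extent_map), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	return example_cache ? 0 : -ENOMEM;
}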
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9c9fb46ccd08..1d51dc38bb49 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -272,83 +272,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 	return 0;
 }
 
-int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
-{
-	return 0;
-#if 0
-	struct btrfs_path *path;
-	struct btrfs_key found_key;
-	struct extent_buffer *leaf;
-	struct btrfs_file_extent_item *extent;
-	u64 last_offset = 0;
-	int nritems;
-	int slot;
-	int found_type;
-	int ret;
-	int err = 0;
-	u64 extent_end = 0;
-
-	path = btrfs_alloc_path();
-	ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino,
-				       last_offset, 0);
-	while (1) {
-		nritems = btrfs_header_nritems(path->nodes[0]);
-		if (path->slots[0] >= nritems) {
-			ret = btrfs_next_leaf(root, path);
-			if (ret)
-				goto out;
-			nritems = btrfs_header_nritems(path->nodes[0]);
-		}
-		slot = path->slots[0];
-		leaf = path->nodes[0];
-		btrfs_item_key_to_cpu(leaf, &found_key, slot);
-		if (found_key.objectid != inode->i_ino)
-			break;
-		if (found_key.type != BTRFS_EXTENT_DATA_KEY)
-			goto out;
-
-		if (found_key.offset < last_offset) {
-			WARN_ON(1);
-			btrfs_print_leaf(root, leaf);
-			printk(KERN_ERR "inode %lu found offset %llu "
-			       "expected %llu\n", inode->i_ino,
-			       (unsigned long long)found_key.offset,
-			       (unsigned long long)last_offset);
-			err = 1;
-			goto out;
-		}
-		extent = btrfs_item_ptr(leaf, slot,
-					struct btrfs_file_extent_item);
-		found_type = btrfs_file_extent_type(leaf, extent);
-		if (found_type == BTRFS_FILE_EXTENT_REG) {
-			extent_end = found_key.offset +
-			     btrfs_file_extent_num_bytes(leaf, extent);
-		} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-			struct btrfs_item *item;
-			item = btrfs_item_nr(leaf, slot);
-			extent_end = found_key.offset +
-			     btrfs_file_extent_inline_len(leaf, extent);
-			extent_end = (extent_end + root->sectorsize - 1) &
-			     ~((u64)root->sectorsize - 1);
-		}
-		last_offset = extent_end;
-		path->slots[0]++;
-	}
-	if (0 && last_offset < inode->i_size) {
-		WARN_ON(1);
-		btrfs_print_leaf(root, leaf);
-		printk(KERN_ERR "inode %lu found offset %llu size %llu\n",
-		       inode->i_ino, (unsigned long long)last_offset,
-		       (unsigned long long)inode->i_size);
-		err = 1;
-
-	}
-out:
-	btrfs_free_path(path);
-	return err;
-#endif
-}
-
 /*
  * this is very complex, but the basic idea is to drop all extents
  * in the range start - end.  hint_block is filled in with a block number
@@ -363,15 +286,16 @@ out:
  */
 noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
-		       u64 start, u64 end, u64 inline_limit, u64 *hint_byte)
+		       u64 start, u64 end, u64 locked_end,
+		       u64 inline_limit, u64 *hint_byte)
 {
 	u64 extent_end = 0;
-	u64 locked_end = end;
 	u64 search_start = start;
 	u64 leaf_start;
 	u64 ram_bytes = 0;
 	u64 orig_parent = 0;
 	u64 disk_bytenr = 0;
+	u64 orig_locked_end = locked_end;
 	u8 compression;
 	u8 encryption;
 	u16 other_encoding = 0;
@@ -684,11 +608,10 @@ next_slot:
 	}
 out:
 	btrfs_free_path(path);
-	if (locked_end > end) {
-		unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
-			      GFP_NOFS);
+	if (locked_end > orig_locked_end) {
+		unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end,
+			      locked_end - 1, GFP_NOFS);
 	}
-	btrfs_check_file(root, inode);
 	return ret;
 }
 
@@ -830,7 +753,7 @@ again:
 
 		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
 		BUG_ON(ret);
-		goto done;
+		goto release;
 	} else if (split == start) {
 		if (locked_end < extent_end) {
 			ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
@@ -926,6 +849,8 @@ again:
 	}
 done:
 	btrfs_mark_buffer_dirty(leaf);
+
+release:
 	btrfs_release_path(root, path);
 	if (split_end && split == start) {
 		split = end;
@@ -1131,7 +1056,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		if (will_write) {
 			btrfs_fdatawrite_range(inode->i_mapping, pos,
 					       pos + write_bytes - 1,
-					       WB_SYNC_NONE);
+					       WB_SYNC_ALL);
 		} else {
 			balance_dirty_pages_ratelimited_nr(inode->i_mapping,
 							   num_pages);
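/*
 * Sketch of the new btrfs_drop_extents() calling convention established
 * above: the caller passes locked_end to say how far it already holds
 * the extent io_tree lock, and the function unlocks only whatever extra
 * range it had to take beyond that.  Local variable names here are
 * illustrative.
 */
static int drop_range_sketch(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root, struct inode *inode,
			     u64 start, u64 end, u64 locked_end)
{
	u64 hint_byte;

	/* caller already holds the extent lock on [start, locked_end) */
	return btrfs_drop_extents(trans, root, inode, start, end,
				  locked_end, 0 /* inline_limit */,
				  &hint_byte);
}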
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 768b9523662d..0bc93657b460 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -332,13 +332,17 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 			printk(KERN_ERR "couldn't find space %llu to free\n",
 			       (unsigned long long)offset);
 			printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
-			       block_group->cached, block_group->key.objectid,
-			       block_group->key.offset);
+			       block_group->cached,
+			       (unsigned long long)block_group->key.objectid,
+			       (unsigned long long)block_group->key.offset);
 			btrfs_dump_free_space(block_group, bytes);
 		} else if (info) {
 			printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
 			       "but wanted offset=%llu bytes=%llu\n",
-			       info->offset, info->bytes, offset, bytes);
+			       (unsigned long long)info->offset,
+			       (unsigned long long)info->bytes,
+			       (unsigned long long)offset,
+			       (unsigned long long)bytes);
 		}
 		WARN_ON(1);
 	}
@@ -357,8 +361,9 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
 		info = rb_entry(n, struct btrfs_free_space, offset_index);
 		if (info->bytes >= bytes)
 			count++;
-		printk(KERN_ERR "entry offset %llu, bytes %llu\n", info->offset,
-		       info->bytes);
+		printk(KERN_ERR "entry offset %llu, bytes %llu\n",
+		       (unsigned long long)info->offset,
+		       (unsigned long long)info->bytes);
 	}
 	printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
 	       "\n", count);
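/*
 * The printk() churn above is a portability fix, not a behavior change:
 * u64 is unsigned long on 64-bit builds and unsigned long long on
 * 32-bit, so %llu only matches on every architecture once the argument
 * is cast explicitly.  A minimal sketch:
 */
static void print_u64_sketch(u64 offset)
{
	/* the (unsigned long long) cast keeps %llu correct everywhere */
	printk(KERN_ERR "offset %llu\n", (unsigned long long)offset);
}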
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index cc7334d833c9..9abbced1123d 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -79,7 +79,7 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 	}
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID);
+	search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID);
 	search_key.objectid = search_start;
 	search_key.type = 0;
 	search_key.offset = 0;
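/*
 * Same family of type fix: the kernel's max() macro rejects mismatched
 * argument types, and BTRFS_FIRST_FREE_OBJECTID is a plain integer
 * constant, so it must be cast (as above) or max_t() used instead:
 */
static u64 clamp_objectid_sketch(u64 search_start)
{
	return max_t(u64, search_start, BTRFS_FIRST_FREE_OBJECTID);
}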
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a0d1dd492a58..90c23eb28829 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -70,7 +70,6 @@ static struct extent_io_ops btrfs_extent_io_ops;
 static struct kmem_cache *btrfs_inode_cachep;
 struct kmem_cache *btrfs_trans_handle_cachep;
 struct kmem_cache *btrfs_transaction_cachep;
-struct kmem_cache *btrfs_bit_radix_cachep;
 struct kmem_cache *btrfs_path_cachep;
 
 #define S_SHIFT 12
@@ -234,7 +233,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 	}
 
 	ret = btrfs_drop_extents(trans, root, inode, start,
-				 aligned_end, start, &hint_byte);
+				 aligned_end, aligned_end, start, &hint_byte);
 	BUG_ON(ret);
 
 	if (isize > actual_end)
@@ -1439,6 +1438,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 				       struct inode *inode, u64 file_pos,
 				       u64 disk_bytenr, u64 disk_num_bytes,
 				       u64 num_bytes, u64 ram_bytes,
+				       u64 locked_end,
 				       u8 compression, u8 encryption,
 				       u16 other_encoding, int extent_type)
 {
@@ -1455,7 +1455,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 
 	path->leave_spinning = 1;
 	ret = btrfs_drop_extents(trans, root, inode, file_pos,
-				 file_pos + num_bytes, file_pos, &hint);
+				 file_pos + num_bytes, locked_end,
+				 file_pos, &hint);
 	BUG_ON(ret);
 
 	ins.objectid = inode->i_ino;
@@ -1590,6 +1591,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 						ordered_extent->disk_len,
 						ordered_extent->len,
 						ordered_extent->len,
+						ordered_extent->file_offset +
+						ordered_extent->len,
 						compressed, 0, 0,
 						BTRFS_FILE_EXTENT_REG);
 		BUG_ON(ret);
@@ -1819,10 +1822,12 @@ good:
 	return 0;
 
 zeroit:
-	printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
-	       "private %llu\n", page->mapping->host->i_ino,
-	       (unsigned long long)start, csum,
-	       (unsigned long long)private);
+	if (printk_ratelimit()) {
+		printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
+		       "private %llu\n", page->mapping->host->i_ino,
+		       (unsigned long long)start, csum,
+		       (unsigned long long)private);
+	}
 	memset(kaddr + offset, 1, end - start + 1);
 	flush_dcache_page(page);
 	kunmap_atomic(kaddr, KM_USER0);
@@ -2011,6 +2016,57 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 }
 
 /*
+ * very simple check to peek ahead in the leaf looking for xattrs.  If we
+ * don't find any xattrs, we know there can't be any acls.
+ *
+ * slot is the slot the inode is in, objectid is the objectid of the inode
+ */
+static noinline int acls_after_inode_item(struct extent_buffer *leaf,
+					  int slot, u64 objectid)
+{
+	u32 nritems = btrfs_header_nritems(leaf);
+	struct btrfs_key found_key;
+	int scanned = 0;
+
+	slot++;
+	while (slot < nritems) {
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+		/* we found a different objectid, there must not be acls */
+		if (found_key.objectid != objectid)
+			return 0;
+
+		/* we found an xattr, assume we've got an acl */
+		if (found_key.type == BTRFS_XATTR_ITEM_KEY)
+			return 1;
+
+		/*
+		 * we found a key greater than an xattr key, there can't
+		 * be any acls later on
+		 */
+		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
+			return 0;
+
+		slot++;
+		scanned++;
+
+		/*
+		 * it goes inode, inode backrefs, xattrs, extents,
+		 * so if there are a ton of hard links to an inode there can
+		 * be a lot of backrefs.  Don't waste time searching too hard,
+		 * this is just an optimization
+		 */
+		if (scanned >= 8)
+			break;
+	}
+	/* we hit the end of the leaf before we found an xattr or
+	 * something larger than an xattr.  We have to assume the inode
+	 * has acls
+	 */
+	return 1;
+}
+
+/*
  * read an inode from the btree into the in-memory inode
  */
 void btrfs_read_locked_inode(struct inode *inode)
@@ -2021,6 +2077,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 	struct btrfs_timespec *tspec;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key location;
+	int maybe_acls;
 	u64 alloc_group_block;
 	u32 rdev;
 	int ret;
@@ -2067,6 +2124,16 @@ void btrfs_read_locked_inode(struct inode *inode)
 
 	alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
 
+	/*
+	 * try to precache a NULL acl entry for files that don't have
+	 * any xattrs or acls
+	 */
+	maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino);
+	if (!maybe_acls) {
+		BTRFS_I(inode)->i_acl = NULL;
+		BTRFS_I(inode)->i_default_acl = NULL;
+	}
+
 	BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
 						alloc_group_block, 0);
 	btrfs_free_path(path);
@@ -2877,6 +2944,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 			err = btrfs_drop_extents(trans, root, inode,
 						 cur_offset,
 						 cur_offset + hole_size,
+						 block_end,
 						 cur_offset, &hint_byte);
 			if (err)
 				break;
@@ -3041,8 +3109,8 @@ static noinline void init_btrfs_i(struct inode *inode)
 {
 	struct btrfs_inode *bi = BTRFS_I(inode);
 
-	bi->i_acl = NULL;
-	bi->i_default_acl = NULL;
+	bi->i_acl = BTRFS_ACL_NOT_CACHED;
+	bi->i_default_acl = BTRFS_ACL_NOT_CACHED;
 
 	bi->generation = 0;
 	bi->sequence = 0;
@@ -4634,47 +4702,36 @@ void btrfs_destroy_cachep(void)
 		kmem_cache_destroy(btrfs_trans_handle_cachep);
 	if (btrfs_transaction_cachep)
 		kmem_cache_destroy(btrfs_transaction_cachep);
-	if (btrfs_bit_radix_cachep)
-		kmem_cache_destroy(btrfs_bit_radix_cachep);
 	if (btrfs_path_cachep)
 		kmem_cache_destroy(btrfs_path_cachep);
 }
 
-struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
-				       unsigned long extra_flags,
-				       void (*ctor)(void *))
-{
-	return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT |
-				 SLAB_MEM_SPREAD | extra_flags), ctor);
-}
-
 int btrfs_init_cachep(void)
 {
-	btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache",
-					  sizeof(struct btrfs_inode),
-					  0, init_once);
+	btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
			sizeof(struct btrfs_inode), 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
 	if (!btrfs_inode_cachep)
 		goto fail;
-	btrfs_trans_handle_cachep =
-			btrfs_cache_create("btrfs_trans_handle_cache",
-					   sizeof(struct btrfs_trans_handle),
-					   0, NULL);
+
+	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
+			sizeof(struct btrfs_trans_handle), 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!btrfs_trans_handle_cachep)
 		goto fail;
-	btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache",
-					     sizeof(struct btrfs_transaction),
-					     0, NULL);
+
+	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
+			sizeof(struct btrfs_transaction), 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!btrfs_transaction_cachep)
 		goto fail;
-	btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache",
-					 sizeof(struct btrfs_path),
-					 0, NULL);
+
+	btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
+			sizeof(struct btrfs_path), 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!btrfs_path_cachep)
 		goto fail;
-	btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256,
-					    SLAB_DESTROY_BY_RCU, NULL);
-	if (!btrfs_bit_radix_cachep)
-		goto fail;
+
 	return 0;
 fail:
 	btrfs_destroy_cachep();
@@ -4970,10 +5027,10 @@ out_fail:
 	return err;
 }
 
-static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
-			       u64 alloc_hint, int mode)
+static int prealloc_file_range(struct btrfs_trans_handle *trans,
+			       struct inode *inode, u64 start, u64 end,
+			       u64 locked_end, u64 alloc_hint, int mode)
 {
-	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key ins;
 	u64 alloc_size;
@@ -4981,10 +5038,6 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
 	u64 num_bytes = end - start;
 	int ret = 0;
 
-	trans = btrfs_join_transaction(root, 1);
-	BUG_ON(!trans);
-	btrfs_set_trans_block_group(trans, inode);
-
 	while (num_bytes > 0) {
 		alloc_size = min(num_bytes, root->fs_info->max_extent);
 		ret = btrfs_reserve_extent(trans, root, alloc_size,
@@ -4997,7 +5050,8 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
 		ret = insert_reserved_file_extent(trans, inode,
 						  cur_offset, ins.objectid,
 						  ins.offset, ins.offset,
-						  ins.offset, 0, 0, 0,
+						  ins.offset, locked_end,
+						  0, 0, 0,
 						  BTRFS_FILE_EXTENT_PREALLOC);
 		BUG_ON(ret);
 		num_bytes -= ins.offset;
@@ -5015,7 +5069,6 @@ out:
 		BUG_ON(ret);
 	}
 
-	btrfs_end_transaction(trans, root);
 	return ret;
 }
 
@@ -5027,13 +5080,21 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 	u64 alloc_start;
 	u64 alloc_end;
 	u64 alloc_hint = 0;
+	u64 locked_end;
 	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
 	struct extent_map *em;
+	struct btrfs_trans_handle *trans;
 	int ret;
 
 	alloc_start = offset & ~mask;
 	alloc_end = (offset + len + mask) & ~mask;
 
+	/*
+	 * wait for ordered IO before we have any locks.  We'll loop again
+	 * below with the locks held.
+	 */
+	btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+
 	mutex_lock(&inode->i_mutex);
 	if (alloc_start > inode->i_size) {
 		ret = btrfs_cont_expand(inode, alloc_start);
@@ -5041,10 +5102,21 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 			goto out;
 	}
 
+	locked_end = alloc_end - 1;
 	while (1) {
 		struct btrfs_ordered_extent *ordered;
-		lock_extent(&BTRFS_I(inode)->io_tree, alloc_start,
-			    alloc_end - 1, GFP_NOFS);
+
+		trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
+		if (!trans) {
+			ret = -EIO;
+			goto out;
+		}
+
+		/* the extent lock is ordered inside the running
+		 * transaction
+		 */
+		lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+			    GFP_NOFS);
 		ordered = btrfs_lookup_first_ordered_extent(inode,
 							    alloc_end - 1);
 		if (ordered &&
@@ -5052,7 +5124,13 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		    ordered->file_offset < alloc_end) {
 			btrfs_put_ordered_extent(ordered);
 			unlock_extent(&BTRFS_I(inode)->io_tree,
-				      alloc_start, alloc_end - 1, GFP_NOFS);
+				      alloc_start, locked_end, GFP_NOFS);
+			btrfs_end_transaction(trans, BTRFS_I(inode)->root);
+
+			/*
+			 * we can't wait on the range with the transaction
+			 * running or with the extent lock held
+			 */
 			btrfs_wait_ordered_range(inode, alloc_start,
 						 alloc_end - alloc_start);
 		} else {
@@ -5070,8 +5148,9 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		last_byte = min(extent_map_end(em), alloc_end);
 		last_byte = (last_byte + mask) & ~mask;
 		if (em->block_start == EXTENT_MAP_HOLE) {
-			ret = prealloc_file_range(inode, cur_offset,
-					last_byte, alloc_hint, mode);
+			ret = prealloc_file_range(trans, inode, cur_offset,
+						  last_byte, locked_end + 1,
+						  alloc_hint, mode);
 			if (ret < 0) {
 				free_extent_map(em);
 				break;
@@ -5087,8 +5166,10 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 			break;
 		}
 	}
-	unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1,
-		      GFP_NOFS);
+	unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+		      GFP_NOFS);
+
+	btrfs_end_transaction(trans, BTRFS_I(inode)->root);
 out:
 	mutex_unlock(&inode->i_mutex);
 	return ret;
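/*
 * The fallocate() rework above pins down a lock order; condensed, with
 * error handling trimmed: wait for ordered IO with nothing held, then
 * take the transaction, then the extent lock, and release in reverse
 * before any further waiting.  A sketch, not the full function:
 */
static long fallocate_order_sketch(struct inode *inode, u64 start, u64 end)
{
	struct btrfs_trans_handle *trans;
	u64 locked_end = end - 1;

	/* no locks held yet, safe to wait on ordered extents */
	btrfs_wait_ordered_range(inode, start, end - start);

	trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
	if (!trans)
		return -EIO;

	/* the extent lock is ordered inside the running transaction */
	lock_extent(&BTRFS_I(inode)->io_tree, start, locked_end, GFP_NOFS);

	/* ... preallocate extents here ... */

	unlock_extent(&BTRFS_I(inode)->io_tree, start, locked_end, GFP_NOFS);
	btrfs_end_transaction(trans, BTRFS_I(inode)->root);
	return 0;
}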
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7594bec1be10..5e94ea6e1cbe 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -461,15 +461,9 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
-
-	if (!vol_args)
-		return -ENOMEM;
-
-	if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
-		ret = -EFAULT;
-		goto out;
-	}
+	vol_args = memdup_user(arg, sizeof(*vol_args));
+	if (IS_ERR(vol_args))
+		return PTR_ERR(vol_args);
 
 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 	namelen = strlen(vol_args->name);
@@ -483,11 +477,13 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 		*devstr = '\0';
 		devstr = vol_args->name;
 		devid = simple_strtoull(devstr, &end, 10);
-		printk(KERN_INFO "resizing devid %llu\n", devid);
+		printk(KERN_INFO "resizing devid %llu\n",
+		       (unsigned long long)devid);
 	}
 	device = btrfs_find_device(root, devid, NULL, NULL);
 	if (!device) {
-		printk(KERN_INFO "resizer unable to find device %llu\n", devid);
+		printk(KERN_INFO "resizer unable to find device %llu\n",
+		       (unsigned long long)devid);
 		ret = -EINVAL;
 		goto out_unlock;
 	}
@@ -545,7 +541,6 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 
 out_unlock:
 	mutex_unlock(&root->fs_info->volume_mutex);
-out:
 	kfree(vol_args);
 	return ret;
 }
@@ -565,15 +560,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
 	if (root->fs_info->sb->s_flags & MS_RDONLY)
 		return -EROFS;
 
-	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
-
-	if (!vol_args)
-		return -ENOMEM;
-
-	if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
-		ret = -EFAULT;
-		goto out;
-	}
+	vol_args = memdup_user(arg, sizeof(*vol_args));
+	if (IS_ERR(vol_args))
+		return PTR_ERR(vol_args);
 
 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 	namelen = strlen(vol_args->name);
@@ -675,19 +664,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
+	vol_args = memdup_user(arg, sizeof(*vol_args));
+	if (IS_ERR(vol_args))
+		return PTR_ERR(vol_args);
 
-	if (!vol_args)
-		return -ENOMEM;
-
-	if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
-		ret = -EFAULT;
-		goto out;
-	}
 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 	ret = btrfs_init_new_device(root, vol_args->name);
 
-out:
 	kfree(vol_args);
 	return ret;
 }
@@ -703,19 +686,13 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
 	if (root->fs_info->sb->s_flags & MS_RDONLY)
 		return -EROFS;
 
-	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
+	vol_args = memdup_user(arg, sizeof(*vol_args));
+	if (IS_ERR(vol_args))
+		return PTR_ERR(vol_args);
 
-	if (!vol_args)
-		return -ENOMEM;
-
-	if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
-		ret = -EFAULT;
-		goto out;
-	}
 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 	ret = btrfs_rm_device(root, vol_args->name);
 
-out:
 	kfree(vol_args);
 	return ret;
 }
@@ -830,7 +807,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	BUG_ON(!trans);
 
 	/* punch hole in destination first */
-	btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte);
+	btrfs_drop_extents(trans, root, inode, off, off + len,
+			   off + len, 0, &hint_byte);
 
 	/* clone data */
 	key.objectid = src->i_ino;
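/*
 * memdup_user() absorbs the allocate-then-copy boilerplate deleted in
 * the ioctl hunks above; open-coded, it is roughly the following
 * (simplified relative to the real mm/util.c implementation):
 */
static void *memdup_user_sketch(const void __user *src, size_t len)
{
	void *p = kmalloc(len, GFP_KERNEL);

	if (!p)
		return ERR_PTR(-ENOMEM);
	if (copy_from_user(p, src, len)) {
		kfree(p);
		return ERR_PTR(-EFAULT);
	}
	return p;
}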
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 53c87b197d70..d6f0806c682f 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -489,7 +489,7 @@ again:
 	/* start IO across the range first to instantiate any delalloc
 	 * extents
 	 */
-	btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
+	btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
 
 	/* The compression code will leave pages locked but return from
 	 * writepage without setting the page writeback.  Starting again
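/*
 * The WB_SYNC_NONE -> WB_SYNC_ALL switch matters because data-integrity
 * writeback must wait on pages it cannot lock instead of skipping them.
 * A hedged sketch of the writeback_control an fdatawrite-style helper
 * builds under each mode (helper name illustrative, not btrfs's own):
 */
static int fdatawrite_range_sketch(struct address_space *mapping,
				   loff_t start, loff_t end, int sync_mode)
{
	struct writeback_control wbc = {
		.sync_mode	= sync_mode,	/* WB_SYNC_ALL waits */
		.nr_to_write	= LONG_MAX,
		.range_start	= start,
		.range_end	= end,
	};

	return do_writepages(mapping, &wbc);
}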
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9744af9d71e9..3536bdb2d7cb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -68,7 +68,7 @@ enum {
 	Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
 	Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
 	Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog,
-	Opt_flushoncommit, Opt_err,
+	Opt_ratio, Opt_flushoncommit, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -87,6 +87,7 @@ static match_table_t tokens = {
 	{Opt_noacl, "noacl"},
 	{Opt_notreelog, "notreelog"},
 	{Opt_flushoncommit, "flushoncommit"},
+	{Opt_ratio, "metadata_ratio=%d"},
 	{Opt_err, NULL},
 };
 
@@ -195,7 +196,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 				info->max_extent = max_t(u64,
 					info->max_extent, root->sectorsize);
 				printk(KERN_INFO "btrfs: max_extent at %llu\n",
-				       info->max_extent);
+				       (unsigned long long)info->max_extent);
 			}
 			break;
 		case Opt_max_inline:
@@ -210,7 +211,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 						root->sectorsize);
 				}
 				printk(KERN_INFO "btrfs: max_inline at %llu\n",
-					info->max_inline);
+					(unsigned long long)info->max_inline);
 			}
 			break;
 		case Opt_alloc_start:
@@ -220,7 +221,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 				kfree(num);
 				printk(KERN_INFO
 					"btrfs: allocations start at %llu\n",
-					info->alloc_start);
+					(unsigned long long)info->alloc_start);
 			}
 			break;
 		case Opt_noacl:
@@ -234,6 +235,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			printk(KERN_INFO "btrfs: turning on flush-on-commit\n");
 			btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT);
 			break;
+		case Opt_ratio:
+			intarg = 0;
+			match_int(&args[0], &intarg);
+			if (intarg) {
+				info->metadata_ratio = intarg;
+				printk(KERN_INFO "btrfs: metadata ratio %d\n",
+				       info->metadata_ratio);
+			}
+			break;
 		default:
 			break;
 		}
@@ -410,11 +420,14 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (btrfs_test_opt(root, NOBARRIER))
 		seq_puts(seq, ",nobarrier");
 	if (info->max_extent != (u64)-1)
-		seq_printf(seq, ",max_extent=%llu", info->max_extent);
+		seq_printf(seq, ",max_extent=%llu",
+			   (unsigned long long)info->max_extent);
 	if (info->max_inline != 8192 * 1024)
-		seq_printf(seq, ",max_inline=%llu", info->max_inline);
+		seq_printf(seq, ",max_inline=%llu",
+			   (unsigned long long)info->max_inline);
 	if (info->alloc_start != 0)
-		seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
+		seq_printf(seq, ",alloc_start=%llu",
+			   (unsigned long long)info->alloc_start);
 	if (info->thread_pool_size != min_t(unsigned long,
 					    num_online_cpus() + 2, 8))
 		seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
@@ -635,14 +648,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	vol = kmalloc(sizeof(*vol), GFP_KERNEL);
-	if (!vol)
-		return -ENOMEM;
-
-	if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
-		ret = -EFAULT;
-		goto out;
-	}
+	vol = memdup_user((void __user *)arg, sizeof(*vol));
+	if (IS_ERR(vol))
+		return PTR_ERR(vol);
 
 	switch (cmd) {
 	case BTRFS_IOC_SCAN_DEV:
@@ -650,7 +658,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 					    &btrfs_fs_type, &fs_devices);
 		break;
 	}
-out:
+
 	kfree(vol);
 	return ret;
 }
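/*
 * Usage of the mount option added above (the value 4 is purely
 * illustrative):
 *
 *   mount -t btrfs -o metadata_ratio=4 /dev/sdb /mnt
 *
 * A nonzero value is stored in fs_info->metadata_ratio and biases the
 * allocator toward reserving metadata chunks more aggressively; the
 * exact ratio semantics live in the allocator changes elsewhere in
 * this merge.
 */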
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2869b3361eb6..01b143605ec1 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -687,7 +687,13 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
 		prepare_to_wait(&info->transaction_wait, &wait,
 				TASK_UNINTERRUPTIBLE);
 		mutex_unlock(&info->trans_mutex);
+
+		atomic_dec(&info->throttles);
+		wake_up(&info->transaction_throttle);
+
 		schedule();
+
+		atomic_inc(&info->throttles);
 		mutex_lock(&info->trans_mutex);
 		finish_wait(&info->transaction_wait, &wait);
 	}
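/*
 * Shape of the fix above: a sleeper that is itself counted in
 * info->throttles must drop that count (and wake any committer waiting
 * on it) before scheduling, or the two can deadlock.  Skeleton, assuming
 * the caller has already done prepare_to_wait() and dropped trans_mutex:
 */
static void sleep_without_throttle(struct btrfs_fs_info *info)
{
	atomic_dec(&info->throttles);
	wake_up(&info->transaction_throttle);

	schedule();

	atomic_inc(&info->throttles);
}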
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 25f20ea11f27..db5e212e8445 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -536,7 +536,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 	saved_nbytes = inode_get_bytes(inode);
 	/* drop any overlapping extents */
 	ret = btrfs_drop_extents(trans, root, inode,
-			 start, extent_end, start, &alloc_hint);
+			 start, extent_end, extent_end, start, &alloc_hint);
 	BUG_ON(ret);
 
 	if (found_type == BTRFS_FILE_EXTENT_REG ||
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e0913e469728..5f01dad4b696 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -125,6 +125,20 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
 	return NULL;
 }
 
+static void requeue_list(struct btrfs_pending_bios *pending_bios,
+			 struct bio *head, struct bio *tail)
+{
+
+	struct bio *old_head;
+
+	old_head = pending_bios->head;
+	pending_bios->head = head;
+	if (pending_bios->tail)
+		tail->bi_next = old_head;
+	else
+		pending_bios->tail = tail;
+}
+
 /*
  * we try to collect pending bios for a device so we don't get a large
  * number of procs sending bios down to the same device.  This greatly
@@ -141,10 +155,12 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 	struct bio *pending;
 	struct backing_dev_info *bdi;
 	struct btrfs_fs_info *fs_info;
+	struct btrfs_pending_bios *pending_bios;
 	struct bio *tail;
 	struct bio *cur;
 	int again = 0;
-	unsigned long num_run = 0;
+	unsigned long num_run;
+	unsigned long num_sync_run;
 	unsigned long limit;
 	unsigned long last_waited = 0;
 
@@ -153,20 +169,30 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 	limit = btrfs_async_submit_limit(fs_info);
 	limit = limit * 2 / 3;
 
+	/* we want to make sure that every time we switch from the sync
+	 * list to the normal list, we unplug
+	 */
+	num_sync_run = 0;
+
 loop:
 	spin_lock(&device->io_lock);
+	num_run = 0;
 
 loop_lock:
+
 	/* take all the bios off the list at once and process them
 	 * later on (without the lock held).  But, remember the
 	 * tail and other pointers so the bios can be properly reinserted
 	 * into the list if we hit congestion
 	 */
-	pending = device->pending_bios;
-	tail = device->pending_bio_tail;
+	if (device->pending_sync_bios.head)
+		pending_bios = &device->pending_sync_bios;
+	else
+		pending_bios = &device->pending_bios;
+
+	pending = pending_bios->head;
+	tail = pending_bios->tail;
 	WARN_ON(pending && !tail);
-	device->pending_bios = NULL;
-	device->pending_bio_tail = NULL;
 
 	/*
 	 * if pending was null this time around, no bios need processing
@@ -176,16 +202,41 @@ loop_lock:
 	 * device->running_pending is used to synchronize with the
 	 * schedule_bio code.
	 */
-	if (pending) {
-		again = 1;
-		device->running_pending = 1;
-	} else {
+	if (device->pending_sync_bios.head == NULL &&
+	    device->pending_bios.head == NULL) {
 		again = 0;
 		device->running_pending = 0;
+	} else {
+		again = 1;
+		device->running_pending = 1;
 	}
+
+	pending_bios->head = NULL;
+	pending_bios->tail = NULL;
+
 	spin_unlock(&device->io_lock);
 
+	/*
+	 * if we're doing the regular priority list, make sure we unplug
+	 * for any high prio bios we've sent down
+	 */
+	if (pending_bios == &device->pending_bios && num_sync_run > 0) {
+		num_sync_run = 0;
+		blk_run_backing_dev(bdi, NULL);
+	}
+
 	while (pending) {
+
+		rmb();
+		if (pending_bios != &device->pending_sync_bios &&
+		    device->pending_sync_bios.head &&
+		    num_run > 16) {
+			cond_resched();
+			spin_lock(&device->io_lock);
+			requeue_list(pending_bios, pending, tail);
+			goto loop_lock;
+		}
+
 		cur = pending;
 		pending = pending->bi_next;
 		cur->bi_next = NULL;
@@ -196,10 +247,18 @@ loop_lock:
 			wake_up(&fs_info->async_submit_wait);
 
 		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
-		bio_get(cur);
 		submit_bio(cur->bi_rw, cur);
-		bio_put(cur);
 		num_run++;
+		if (bio_sync(cur))
+			num_sync_run++;
+
+		if (need_resched()) {
+			if (num_sync_run) {
+				blk_run_backing_dev(bdi, NULL);
+				num_sync_run = 0;
+			}
+			cond_resched();
+		}
 
 		/*
 		 * we made progress, there is more work to do and the bdi
@@ -208,7 +267,6 @@ loop_lock:
 		 */
 		if (pending && bdi_write_congested(bdi) && num_run > 16 &&
 		    fs_info->fs_devices->open_devices > 1) {
-			struct bio *old_head;
 			struct io_context *ioc;
 
 			ioc = current->io_context;
@@ -233,17 +291,17 @@ loop_lock:
 			 * against it before looping
 			 */
 			last_waited = ioc->last_waited;
+			if (need_resched()) {
+				if (num_sync_run) {
+					blk_run_backing_dev(bdi, NULL);
+					num_sync_run = 0;
+				}
+				cond_resched();
+			}
 			continue;
 		}
 		spin_lock(&device->io_lock);
-
-		old_head = device->pending_bios;
-		device->pending_bios = pending;
-		if (device->pending_bio_tail)
-			tail->bi_next = old_head;
-		else
-			device->pending_bio_tail = tail;
-
+		requeue_list(pending_bios, pending, tail);
 		device->running_pending = 1;
 
 		spin_unlock(&device->io_lock);
@@ -251,11 +309,18 @@ loop_lock:
 			goto done;
 		}
 	}
+
+	if (num_sync_run) {
+		num_sync_run = 0;
+		blk_run_backing_dev(bdi, NULL);
+	}
+
+	cond_resched();
 	if (again)
 		goto loop;
 
 	spin_lock(&device->io_lock);
-	if (device->pending_bios)
+	if (device->pending_bios.head || device->pending_sync_bios.head)
 		goto loop_lock;
 	spin_unlock(&device->io_lock);
 
@@ -1478,7 +1543,7 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
 	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
 	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
-	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
+	btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
 	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
 	btrfs_mark_buffer_dirty(leaf);
 
@@ -1875,14 +1940,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	device->total_bytes = new_size;
 	if (device->writeable)
 		device->fs_devices->total_rw_bytes -= diff;
-	ret = btrfs_update_device(trans, device);
-	if (ret) {
-		unlock_chunks(root);
-		btrfs_end_transaction(trans, root);
-		goto done;
-	}
-	WARN_ON(diff > old_total);
-	btrfs_set_super_total_bytes(super_copy, old_total - diff);
 	unlock_chunks(root);
 	btrfs_end_transaction(trans, root);
 
@@ -1914,7 +1971,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 		length = btrfs_dev_extent_length(l, dev_extent);
 
 		if (key.offset + length <= new_size)
-			goto done;
+			break;
 
 		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
 		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
@@ -1927,6 +1984,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 			goto done;
 	}
 
+	/* Shrinking succeeded, else we would be at "done". */
+	trans = btrfs_start_transaction(root, 1);
+	if (!trans) {
+		ret = -ENOMEM;
+		goto done;
+	}
+	lock_chunks(root);
+
+	device->disk_total_bytes = new_size;
+	/* Now btrfs_update_device() will change the on-disk size. */
+	ret = btrfs_update_device(trans, device);
+	if (ret) {
+		unlock_chunks(root);
+		btrfs_end_transaction(trans, root);
+		goto done;
+	}
+	WARN_ON(diff > old_total);
+	btrfs_set_super_total_bytes(super_copy, old_total - diff);
+	unlock_chunks(root);
+	btrfs_end_transaction(trans, root);
 done:
 	btrfs_free_path(path);
 	return ret;
@@ -2497,7 +2574,7 @@ again:
 			max_errors = 1;
 		}
 	}
-	if (multi_ret && rw == WRITE &&
+	if (multi_ret && (rw & (1 << BIO_RW)) &&
 	    stripes_allocated < stripes_required) {
 		stripes_allocated = map->num_stripes;
 		free_extent_map(em);
@@ -2762,6 +2839,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
 				 int rw, struct bio *bio)
 {
 	int should_queue = 1;
+	struct btrfs_pending_bios *pending_bios;
 
 	/* don't bother with additional async steps for reads, right now */
 	if (!(rw & (1 << BIO_RW))) {
@@ -2783,13 +2861,17 @@ static noinline int schedule_bio(struct btrfs_root *root,
 	bio->bi_rw |= rw;
 
 	spin_lock(&device->io_lock);
+	if (bio_sync(bio))
+		pending_bios = &device->pending_sync_bios;
+	else
+		pending_bios = &device->pending_bios;
 
-	if (device->pending_bio_tail)
-		device->pending_bio_tail->bi_next = bio;
+	if (pending_bios->tail)
+		pending_bios->tail->bi_next = bio;
 
-	device->pending_bio_tail = bio;
-	if (!device->pending_bios)
-		device->pending_bios = bio;
+	pending_bios->tail = bio;
+	if (!pending_bios->head)
+		pending_bios->head = bio;
 	if (device->running_pending)
 		should_queue = 0;
 
@@ -3006,7 +3088,8 @@ static int fill_device_from_item(struct extent_buffer *leaf,
 	unsigned long ptr;
 
 	device->devid = btrfs_device_id(leaf, dev_item);
-	device->total_bytes = btrfs_device_total_bytes(leaf, dev_item);
+	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
+	device->total_bytes = device->disk_total_bytes;
 	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
 	device->type = btrfs_device_type(leaf, dev_item);
 	device->io_align = btrfs_device_io_align(leaf, dev_item);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2185de72ff7d..5c3ff6d02fd7 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -23,13 +23,22 @@
 #include "async-thread.h"
 
 struct buffer_head;
+struct btrfs_pending_bios {
+	struct bio *head;
+	struct bio *tail;
+};
+
 struct btrfs_device {
	struct list_head dev_list;
	struct list_head dev_alloc_list;
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_root *dev_root;
-	struct bio *pending_bios;
-	struct bio *pending_bio_tail;
+
+	/* regular prio bios */
+	struct btrfs_pending_bios pending_bios;
+	/* WRITE_SYNC bios */
+	struct btrfs_pending_bios pending_sync_bios;
+
	int running_pending;
	u64 generation;
 
@@ -52,6 +61,9 @@ struct btrfs_device {
	/* size of the device */
	u64 total_bytes;
 
+	/* size of the disk */
+	u64 disk_total_bytes;
+
	/* bytes used */
	u64 bytes_used;
 
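The volumes.{c,h} hunks above replace the single per-device bio list with two head/tail FIFOs, so WRITE_SYNC bios are dispatched from their own queue instead of waiting behind bulk async writeback. A minimal sketch of the append logic follows, with simplified stand-in types ("struct sbio" is not the kernel's struct bio):

/*
 * Minimal sketch of the two-queue append; simplified stand-in types.
 */
struct sbio {
	struct sbio *bi_next;
	int sync;			/* would be bio_sync(bio) */
};

struct pending_bios {
	struct sbio *head;
	struct sbio *tail;
};

struct sdevice {
	struct pending_bios pending_bios;	/* regular prio bios */
	struct pending_bios pending_sync_bios;	/* WRITE_SYNC bios */
};

static void queue_bio(struct sdevice *device, struct sbio *bio)
{
	struct pending_bios *q;

	q = bio->sync ? &device->pending_sync_bios : &device->pending_bios;
	bio->bi_next = NULL;
	if (q->tail)			/* O(1) append at the tail */
		q->tail->bi_next = bio;
	q->tail = bio;
	if (!q->head)			/* first entry also becomes head */
		q->head = bio;
}

Keeping an explicit tail pointer makes each append constant-time; the head is only set when the queue was previously empty.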
diff --git a/fs/buffer.c b/fs/buffer.c
index 13edf7ad3ff1..aed297739eb0 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -360,7 +360,7 @@ still_busy:
  * Completion handler for block_write_full_page() - pages which are unlocked
  * during I/O, and which have PageWriteback cleared upon I/O completion.
  */
-static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
+void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 {
	char b[BDEVNAME_SIZE];
	unsigned long flags;
@@ -438,11 +438,17 @@ static void mark_buffer_async_read(struct buffer_head *bh)
	set_buffer_async_read(bh);
 }
 
-void mark_buffer_async_write(struct buffer_head *bh)
+void mark_buffer_async_write_endio(struct buffer_head *bh,
+				   bh_end_io_t *handler)
 {
-	bh->b_end_io = end_buffer_async_write;
+	bh->b_end_io = handler;
	set_buffer_async_write(bh);
 }
+
+void mark_buffer_async_write(struct buffer_head *bh)
+{
+	mark_buffer_async_write_endio(bh, end_buffer_async_write);
+}
 EXPORT_SYMBOL(mark_buffer_async_write);
 
 
@@ -547,7 +553,7 @@ repeat:
	return err;
 }
 
-void do_thaw_all(unsigned long unused)
+void do_thaw_all(struct work_struct *work)
 {
	struct super_block *sb;
	char b[BDEVNAME_SIZE];
@@ -567,6 +573,7 @@ restart:
		goto restart;
	}
	spin_unlock(&sb_lock);
+	kfree(work);
	printk(KERN_WARNING "Emergency Thaw complete\n");
 }
 
@@ -577,7 +584,13 @@ restart:
  */
 void emergency_thaw_all(void)
 {
-	pdflush_operation(do_thaw_all, 0);
+	struct work_struct *work;
+
+	work = kmalloc(sizeof(*work), GFP_ATOMIC);
+	if (work) {
+		INIT_WORK(work, do_thaw_all);
+		schedule_work(work);
+	}
 }
 
 /**
@@ -1608,7 +1621,8 @@ EXPORT_SYMBOL(unmap_underlying_metadata);
  * unplugging the device queue.
  */
 static int __block_write_full_page(struct inode *inode, struct page *page,
-			get_block_t *get_block, struct writeback_control *wbc)
+			get_block_t *get_block, struct writeback_control *wbc,
+			bh_end_io_t *handler)
 {
	int err;
	sector_t block;
@@ -1693,7 +1707,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
			continue;
		}
		if (test_clear_buffer_dirty(bh)) {
-			mark_buffer_async_write(bh);
+			mark_buffer_async_write_endio(bh, handler);
		} else {
			unlock_buffer(bh);
		}
@@ -1746,7 +1760,7 @@ recover:
		if (buffer_mapped(bh) && buffer_dirty(bh) &&
		    !buffer_delay(bh)) {
			lock_buffer(bh);
-			mark_buffer_async_write(bh);
+			mark_buffer_async_write_endio(bh, handler);
		} else {
			/*
			 * The buffer may have been set dirty during
@@ -2383,7 +2397,8 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
	if ((page->mapping != inode->i_mapping) ||
	    (page_offset(page) > size)) {
		/* page got truncated out from underneath us */
-		goto out_unlock;
+		unlock_page(page);
+		goto out;
	}
 
	/* page is wholly or partially inside EOF */
@@ -2397,14 +2412,15 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
	ret = block_commit_write(page, 0, end);
 
	if (unlikely(ret)) {
+		unlock_page(page);
		if (ret == -ENOMEM)
			ret = VM_FAULT_OOM;
		else /* -ENOSPC, -EIO, etc */
			ret = VM_FAULT_SIGBUS;
-	}
+	} else
+		ret = VM_FAULT_LOCKED;
 
-out_unlock:
-	unlock_page(page);
+out:
	return ret;
 }
 
@@ -2672,7 +2688,8 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
 out:
	ret = mpage_writepage(page, get_block, wbc);
	if (ret == -EAGAIN)
-		ret = __block_write_full_page(inode, page, get_block, wbc);
+		ret = __block_write_full_page(inode, page, get_block, wbc,
+					      end_buffer_async_write);
	return ret;
 }
 EXPORT_SYMBOL(nobh_writepage);
@@ -2830,9 +2847,10 @@ out:
 
 /*
  * The generic ->writepage function for buffer-backed address_spaces
+ * this form passes in the end_io handler used to finish the IO.
  */
-int block_write_full_page(struct page *page, get_block_t *get_block,
-			struct writeback_control *wbc)
+int block_write_full_page_endio(struct page *page, get_block_t *get_block,
+			struct writeback_control *wbc, bh_end_io_t *handler)
 {
	struct inode * const inode = page->mapping->host;
	loff_t i_size = i_size_read(inode);
@@ -2841,7 +2859,8 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
 
	/* Is the page fully inside i_size? */
	if (page->index < end_index)
-		return __block_write_full_page(inode, page, get_block, wbc);
+		return __block_write_full_page(inode, page, get_block, wbc,
+					       handler);
 
	/* Is the page fully outside i_size? (truncate in progress) */
	offset = i_size & (PAGE_CACHE_SIZE-1);
@@ -2864,9 +2883,20 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
	 * writes to that region are not written out to the file."
	 */
	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
-	return __block_write_full_page(inode, page, get_block, wbc);
+	return __block_write_full_page(inode, page, get_block, wbc, handler);
+}
+
+/*
+ * The generic ->writepage function for buffer-backed address_spaces
+ */
+int block_write_full_page(struct page *page, get_block_t *get_block,
+			struct writeback_control *wbc)
+{
+	return block_write_full_page_endio(page, get_block, wbc,
+					   end_buffer_async_write);
 }
 
+
 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
			    get_block_t *get_block)
 {
@@ -3335,9 +3365,11 @@ EXPORT_SYMBOL(block_read_full_page);
 EXPORT_SYMBOL(block_sync_page);
 EXPORT_SYMBOL(block_truncate_page);
 EXPORT_SYMBOL(block_write_full_page);
+EXPORT_SYMBOL(block_write_full_page_endio);
 EXPORT_SYMBOL(cont_write_begin);
 EXPORT_SYMBOL(end_buffer_read_sync);
 EXPORT_SYMBOL(end_buffer_write_sync);
+EXPORT_SYMBOL(end_buffer_async_write);
 EXPORT_SYMBOL(file_fsync);
 EXPORT_SYMBOL(generic_block_bmap);
 EXPORT_SYMBOL(generic_cont_expand_simple);
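The buffer.c changes export end_buffer_async_write() and thread a bh_end_io_t handler through the generic write-out path, so a filesystem can run its own per-buffer completion code while reusing block_write_full_page()'s page handling. A hypothetical caller might look like the sketch below (the myfs_* names are illustrative, not from this patch; myfs_get_block is assumed to have the usual get_block_t shape):

/*
 * Hypothetical filesystem glue using the new hook.
 */
#include <linux/buffer_head.h>
#include <linux/writeback.h>

static int myfs_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create);

static void myfs_end_buffer_write(struct buffer_head *bh, int uptodate)
{
	/* per-buffer bookkeeping would go here ... */
	end_buffer_async_write(bh, uptodate);	/* then the stock completion */
}

static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
	/* like block_write_full_page(), but with our completion handler */
	return block_write_full_page_endio(page, myfs_get_block, wbc,
					   myfs_end_buffer_write);
}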
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 65984006192c..9d1fb6ec8a5a 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -15,7 +15,8 @@ Posix file open support added (turned off after one attempt if server
 fails to support it properly, as with Samba server versions prior to 3.3.2)
 Fix "redzone overwritten" bug in cifs_put_tcon (CIFSTcon may allocate too
 little memory for the "nativeFileSystem" field returned by the server
-during mount).
+during mount). Endian convert inode numbers if necessary (makes it easier
+to compare inode numbers on network files from big endian systems).
 
 Version 1.56
 ------------
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 3fd3a9df043a..67bf93a40d2e 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -41,7 +41,7 @@ cifs_spnego_key_instantiate(struct key *key, const void *data, size_t datalen)
 
	/* attach the data */
	memcpy(payload, data, datalen);
-	rcu_assign_pointer(key->payload.data, payload);
+	key->payload.data = payload;
	ret = 0;
 
 error:
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 38491fd3871d..0d6d8b573652 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -66,9 +66,6 @@ unsigned int sign_CIFS_PDUs = 1;
 extern struct task_struct *oplockThread; /* remove sparse warning */
 struct task_struct *oplockThread = NULL;
 /* extern struct task_struct * dnotifyThread; remove sparse warning */
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-static struct task_struct *dnotifyThread = NULL;
-#endif
 static const struct super_operations cifs_super_ops;
 unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
 module_param(CIFSMaxBufSize, int, 0);
@@ -316,6 +313,7 @@ cifs_alloc_inode(struct super_block *sb)
	cifs_inode->clientCanCacheAll = false;
	cifs_inode->delete_pending = false;
	cifs_inode->vfs_inode.i_blkbits = 14;  /* 2**14 = CIFS_MAX_MSGSIZE */
+	cifs_inode->server_eof = 0;
 
	/* Can not set i_flags here - they get immediately overwritten
	   to zero by the VFS */
@@ -1040,34 +1038,6 @@ static int cifs_oplock_thread(void *dummyarg)
	return 0;
 }
 
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-static int cifs_dnotify_thread(void *dummyarg)
-{
-	struct list_head *tmp;
-	struct TCP_Server_Info *server;
-
-	do {
-		if (try_to_freeze())
-			continue;
-		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(15*HZ);
-		/* check if any stuck requests that need
-		   to be woken up and wakeq so the
-		   thread can wake up and error out */
-		read_lock(&cifs_tcp_ses_lock);
-		list_for_each(tmp, &cifs_tcp_ses_list) {
-			server = list_entry(tmp, struct TCP_Server_Info,
-					    tcp_ses_list);
-			if (atomic_read(&server->inFlight))
-				wake_up_all(&server->response_q);
-		}
-		read_unlock(&cifs_tcp_ses_lock);
-	} while (!kthread_should_stop());
-
-	return 0;
-}
-#endif
-
 static int __init
 init_cifs(void)
 {
@@ -1144,21 +1114,8 @@ init_cifs(void)
		goto out_unregister_dfs_key_type;
	}
 
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-	dnotifyThread = kthread_run(cifs_dnotify_thread, NULL, "cifsdnotifyd");
-	if (IS_ERR(dnotifyThread)) {
-		rc = PTR_ERR(dnotifyThread);
-		cERROR(1, ("error %d create dnotify thread", rc));
-		goto out_stop_oplock_thread;
-	}
-#endif
-
	return 0;
 
-#ifdef CONFIG_CIFS_EXPERIMENTAL
- out_stop_oplock_thread:
-#endif
-	kthread_stop(oplockThread);
  out_unregister_dfs_key_type:
 #ifdef CONFIG_CIFS_DFS_UPCALL
	unregister_key_type(&key_type_dns_resolver);
@@ -1196,9 +1153,6 @@ exit_cifs(void)
	cifs_destroy_inodecache();
	cifs_destroy_mids();
	cifs_destroy_request_bufs();
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-	kthread_stop(dnotifyThread);
-#endif
	kthread_stop(oplockThread);
 }
 
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 9fbf4dff5da6..df40ab64cd95 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -350,7 +350,7 @@ struct cifsFileInfo {
	bool invalidHandle:1;	/* file closed via session abend */
	bool messageMode:1;	/* for pipes: message vs byte mode */
	atomic_t wrtPending;   /* handle in use - defer close */
-	struct semaphore fh_sem; /* prevents reopen race after dead ses*/
+	struct mutex fh_mutex; /* prevents reopen race after dead ses*/
	struct cifs_search_info srch_inf;
 };
 
@@ -370,6 +370,7 @@ struct cifsInodeInfo {
	bool clientCanCacheAll:1; /* read and writebehind oplock */
	bool oplockPending:1;
	bool delete_pending:1; /* DELETE_ON_CLOSE is set */
+	u64 server_eof;		/* current file size on server */
	struct inode vfs_inode;
 };
 
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index b370489c8da5..a785f69dbc9f 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -2163,7 +2163,7 @@ typedef struct {
	__le32 Type;
	__le64 DevMajor;
	__le64 DevMinor;
-	__u64 UniqueId;
+	__le64 UniqueId;
	__le64 Permissions;
	__le64 Nlinks;
 } __attribute__((packed)) FILE_UNIX_BASIC_INFO; /* level 0x200 QPathInfo */
@@ -2308,7 +2308,7 @@ struct unlink_psx_rq { /* level 0x20a SetPathInfo */
 } __attribute__((packed));
 
 struct file_internal_info {
-	__u64  UniqueId; /* inode number */
+	__le64  UniqueId; /* inode number */
 } __attribute__((packed));      /* level 0x3ee */
 
 struct file_mode_info {
@@ -2338,7 +2338,7 @@ typedef struct {
	__le32 Type;
	__le64 DevMajor;
	__le64 DevMinor;
-	__u64 UniqueId;
+	__le64 UniqueId;
	__le64 Permissions;
	__le64 Nlinks;
	char FileName[1];
@@ -2386,7 +2386,7 @@ typedef struct {
	__le32 FileNameLength;
	__le32 EaSize; /* EA size */
	__le32 Reserved;
-	__u64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/
+	__le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/
	char FileName[1];
 } __attribute__((packed)) SEARCH_ID_FULL_DIR_INFO; /* level 0x105 FF rsp data */
 
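Retyping UniqueId from __u64 to __le64 documents that the field arrives in SMB wire (little-endian) byte order and lets sparse flag any use site that forgets the conversion; on a big-endian client the raw value would otherwise yield byte-swapped inode numbers. A small sketch of the convention, assuming the usual kernel byteorder helpers:

/* Sketch of the wire/host split that the __le64 annotation enforces. */
#include <linux/types.h>
#include <asm/byteorder.h>

struct unix_info_on_wire {
	__le64 UniqueId;		/* little-endian on the wire */
};

static inline u64 unique_id_to_host(const struct unix_info_on_wire *w)
{
	/* no-op on little-endian CPUs, byte swap on big-endian ones */
	return le64_to_cpu(w->UniqueId);
}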
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index bc09c998631f..a0845dc7b8a9 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1626,6 +1626,8 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
	int smb_hdr_len;
	int resp_buf_type = 0;
 
+	*nbytes = 0;
+
	cFYI(1, ("write2 at %lld %d bytes", (long long)offset, count));
 
	if (tcon->ses->capabilities & CAP_LARGE_FILES) {
@@ -1682,11 +1684,9 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
	cifs_stats_inc(&tcon->num_writes);
	if (rc) {
		cFYI(1, ("Send error Write2 = %d", rc));
-		*nbytes = 0;
	} else if (resp_buf_type == 0) {
		/* presumably this can not happen, but best to be safe */
		rc = -EIO;
-		*nbytes = 0;
	} else {
		WRITE_RSP *pSMBr = (WRITE_RSP *)iov[0].iov_base;
		*nbytes = le16_to_cpu(pSMBr->CountHigh);
@@ -3918,7 +3918,7 @@ GetInodeNumberRetry:
		}
		pfinfo = (struct file_internal_info *)
			(data_offset + (char *) &pSMBr->hdr.Protocol);
-		*inode_number = pfinfo->UniqueId;
+		*inode_number = le64_to_cpu(pfinfo->UniqueId);
	}
	}
 GetInodeNumOut:
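Zeroing *nbytes once at the top of CIFSSMBWrite2() replaces the per-branch resets on the error paths, so any early return now leaves a well-defined byte count. A generic sketch of the pattern (send_write() and do_send() are made-up names, not CIFS functions):

/* Illustrative shape of the change: set the out-param once at entry. */
static int do_send(void);

static int send_write(unsigned int count, unsigned int *nbytes)
{
	int rc;

	*nbytes = 0;		/* every early exit now reports 0 bytes */
	rc = do_send();
	if (rc)
		return rc;	/* no per-branch "*nbytes = 0" needed */
	*nbytes = count;
	return 0;
}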
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 0de3b5615a22..bacdef1546b7 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2214,9 +2214,58 @@ is_path_accessible(int xid, struct cifsTconInfo *tcon,
	return rc;
 }
 
+static void
+cleanup_volume_info(struct smb_vol **pvolume_info)
+{
+	struct smb_vol *volume_info;
+
+	if (!pvolume_info && !*pvolume_info)
+		return;
+
+	volume_info = *pvolume_info;
+	kzfree(volume_info->password);
+	kfree(volume_info->UNC);
+	kfree(volume_info->prepath);
+	kfree(volume_info);
+	*pvolume_info = NULL;
+	return;
+}
+
+#ifdef CONFIG_CIFS_DFS_UPCALL
+/* build_path_to_root returns full path to root when
+ * we do not have an exiting connection (tcon) */
+static char *
+build_unc_path_to_root(const struct smb_vol *volume_info,
+		const struct cifs_sb_info *cifs_sb)
+{
+	char *full_path;
+
+	int unc_len = strnlen(volume_info->UNC, MAX_TREE_SIZE + 1);
+	full_path = kmalloc(unc_len + cifs_sb->prepathlen + 1, GFP_KERNEL);
+	if (full_path == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	strncpy(full_path, volume_info->UNC, unc_len);
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
+		int i;
+		for (i = 0; i < unc_len; i++) {
+			if (full_path[i] == '\\')
+				full_path[i] = '/';
+		}
+	}
+
+	if (cifs_sb->prepathlen)
+		strncpy(full_path + unc_len, cifs_sb->prepath,
+				cifs_sb->prepathlen);
+
+	full_path[unc_len + cifs_sb->prepathlen] = 0; /* add trailing null */
+	return full_path;
+}
+#endif
+
 int
 cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
-	   char *mount_data, const char *devname)
+	   char *mount_data_global, const char *devname)
 {
	int rc = 0;
	int xid;
@@ -2225,6 +2274,13 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
	struct cifsTconInfo *tcon = NULL;
	struct TCP_Server_Info *srvTcp = NULL;
	char *full_path;
+	char *mount_data = mount_data_global;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	struct dfs_info3_param *referrals = NULL;
+	unsigned int num_referrals = 0;
+try_mount_again:
+#endif
+	full_path = NULL;
 
	xid = GetXid();
 
@@ -2371,11 +2427,9 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
		}
	}
 
-	/* check for null share name ie connect to dfs root */
	if ((strchr(volume_info->UNC + 3, '\\') == NULL)
	    && (strchr(volume_info->UNC + 3, '/') == NULL)) {
-		/* rc = connect_to_dfs_path(...) */
-		cFYI(1, ("DFS root not supported"));
+		cERROR(1, ("Missing share name"));
		rc = -ENODEV;
		goto mount_fail_check;
	} else {
@@ -2392,7 +2446,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
		}
	}
	if (rc)
-		goto mount_fail_check;
+		goto remote_path_check;
	tcon->seal = volume_info->seal;
	write_lock(&cifs_tcp_ses_lock);
	list_add(&tcon->tcon_list, &pSesInfo->tcon_list);
@@ -2417,19 +2471,9 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
	/* BB FIXME fix time_gran to be larger for LANMAN sessions */
	sb->s_time_gran = 100;
 
-mount_fail_check:
-	/* on error free sesinfo and tcon struct if needed */
-	if (rc) {
-		/* If find_unc succeeded then rc == 0 so we can not end */
-		/* up accidently freeing someone elses tcon struct */
-		if (tcon)
-			cifs_put_tcon(tcon);
-		else if (pSesInfo)
-			cifs_put_smb_ses(pSesInfo);
-		else
-			cifs_put_tcp_session(srvTcp);
-		goto out;
-	}
+	if (rc)
+		goto remote_path_check;
+
	cifs_sb->tcon = tcon;
 
	/* do not care if following two calls succeed - informational */
@@ -2461,7 +2505,9 @@ mount_fail_check:
	cifs_sb->rsize = min(cifs_sb->rsize,
			     (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
 
-	if (!rc && cifs_sb->prepathlen) {
+remote_path_check:
+	/* check if a whole path (including prepath) is not remote */
+	if (!rc && cifs_sb->prepathlen && tcon) {
		/* build_path_to_root works only when we have a valid tcon */
		full_path = cifs_build_path_to_root(cifs_sb);
		if (full_path == NULL) {
@@ -2469,31 +2515,79 @@ mount_fail_check:
			goto mount_fail_check;
		}
		rc = is_path_accessible(xid, tcon, cifs_sb, full_path);
-		if (rc) {
-			cERROR(1, ("Path %s in not accessible: %d",
-				full_path, rc));
+		if (rc != -EREMOTE) {
			kfree(full_path);
			goto mount_fail_check;
		}
		kfree(full_path);
	}
 
+	/* get referral if needed */
+	if (rc == -EREMOTE) {
+#ifdef CONFIG_CIFS_DFS_UPCALL
+		/* convert forward to back slashes in prepath here if needed */
+		if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) == 0)
+			convert_delimiter(cifs_sb->prepath,
+					CIFS_DIR_SEP(cifs_sb));
+		full_path = build_unc_path_to_root(volume_info, cifs_sb);
+		if (IS_ERR(full_path)) {
+			rc = PTR_ERR(full_path);
+			goto mount_fail_check;
+		}
+
+		cFYI(1, ("Getting referral for: %s", full_path));
+		rc = get_dfs_path(xid, pSesInfo , full_path + 1,
+			cifs_sb->local_nls, &num_referrals, &referrals,
+			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (!rc && num_referrals > 0) {
+			char *fake_devname = NULL;
+
+			if (mount_data != mount_data_global)
+				kfree(mount_data);
+			mount_data = cifs_compose_mount_options(
+					cifs_sb->mountdata, full_path + 1,
+					referrals, &fake_devname);
+			kfree(fake_devname);
+			free_dfs_info_array(referrals, num_referrals);
+
+			if (tcon)
+				cifs_put_tcon(tcon);
+			else if (pSesInfo)
+				cifs_put_smb_ses(pSesInfo);
+
+			cleanup_volume_info(&volume_info);
+			FreeXid(xid);
+			kfree(full_path);
+			goto try_mount_again;
+		}
+#else /* No DFS support, return error on mount */
+		rc = -EOPNOTSUPP;
+#endif
+	}
+
+mount_fail_check:
+	/* on error free sesinfo and tcon struct if needed */
+	if (rc) {
+		if (mount_data != mount_data_global)
+			kfree(mount_data);
+		/* If find_unc succeeded then rc == 0 so we can not end */
+		/* up accidently freeing someone elses tcon struct */
+		if (tcon)
+			cifs_put_tcon(tcon);
+		else if (pSesInfo)
+			cifs_put_smb_ses(pSesInfo);
+		else
+			cifs_put_tcp_session(srvTcp);
+		goto out;
+	}
+
	/* volume_info->password is freed above when existing session found
	   (in which case it is not needed anymore) but when new sesion is created
	   the password ptr is put in the new session structure (in which case the
	   password will be freed at unmount time) */
 out:
-	/* zero out password before freeing */
-	if (volume_info) {
-		if (volume_info->password != NULL) {
-			memset(volume_info->password, 0,
-			       strlen(volume_info->password));
-			kfree(volume_info->password);
-		}
-		kfree(volume_info->UNC);
-		kfree(volume_info->prepath);
-		kfree(volume_info);
-	}
+	cleanup_volume_info(&volume_info);
	FreeXid(xid);
	return rc;
 }
@@ -2673,8 +2767,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 /* We look for obvious messed up bcc or strings in response so we do not go off
    the end since (at least) WIN2K and Windows XP have a major bug in not null
    terminating last Unicode string in response */
-				if (ses->serverOS)
-					kfree(ses->serverOS);
+				kfree(ses->serverOS);
				ses->serverOS = kzalloc(2 * (len + 1),
							GFP_KERNEL);
				if (ses->serverOS == NULL)
@@ -2710,8 +2803,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
			len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words);
			/* last string is not always null terminated
			   (for e.g. for Windows XP & 2000) */
-			if (ses->serverDomain)
-				kfree(ses->serverDomain);
+			kfree(ses->serverDomain);
			ses->serverDomain =
				kzalloc(2*(len+1),
					GFP_KERNEL);
@@ -2725,8 +2817,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
			ses->serverDomain[1+(2*len)] = 0;
		} else { /* else no more room so create
			  dummy domain string */
-			if (ses->serverDomain)
-				kfree(ses->serverDomain);
+			kfree(ses->serverDomain);
			ses->serverDomain =
				kzalloc(2, GFP_KERNEL);
		}
@@ -2772,8 +2863,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
			bcc_ptr++;
 
			len = strnlen(bcc_ptr, 1024);
-			if (ses->serverDomain)
-				kfree(ses->serverDomain);
+			kfree(ses->serverDomain);
			ses->serverDomain = kzalloc(len + 1,
						    GFP_KERNEL);
			if (ses->serverDomain == NULL)
@@ -3013,8 +3103,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 /* We look for obvious messed up bcc or strings in response so we do not go off
    the end since (at least) WIN2K and Windows XP have a major bug in not null
    terminating last Unicode string in response */
-			if (ses->serverOS)
-				kfree(ses->serverOS);
+			kfree(ses->serverOS);
			ses->serverOS =
				kzalloc(2 * (len + 1), GFP_KERNEL);
			cifs_strfromUCS_le(ses->serverOS,
@@ -3086,8 +3175,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
			if (((long) bcc_ptr + len) - (long)
			    pByteArea(smb_buffer_response)
			    <= BCC(smb_buffer_response)) {
-				if (ses->serverOS)
-					kfree(ses->serverOS);
+				kfree(ses->serverOS);
				ses->serverOS =
					kzalloc(len + 1,
						GFP_KERNEL);
@@ -3414,8 +3502,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 /* We look for obvious messed up bcc or strings in response so we do not go off
    the end since (at least) WIN2K and Windows XP have a major bug in not null
    terminating last Unicode string in response */
-			if (ses->serverOS)
-				kfree(ses->serverOS);
+			kfree(ses->serverOS);
			ses->serverOS =
				kzalloc(2 * (len + 1), GFP_KERNEL);
			cifs_strfromUCS_le(ses->serverOS,
@@ -3448,8 +3535,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
			if (remaining_words > 0) {
				len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words);
				/* last string not always null terminated (e.g. for Windows XP & 2000) */
-				if (ses->serverDomain)
-					kfree(ses->serverDomain);
+				kfree(ses->serverDomain);
				ses->serverDomain =
					kzalloc(2 *
						(len +
@@ -3476,13 +3562,11 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
						= 0;
				} /* else no more room so create dummy domain string */
				else {
-					if (ses->serverDomain)
-						kfree(ses->serverDomain);
+					kfree(ses->serverDomain);
					ses->serverDomain = kzalloc(2,GFP_KERNEL);
				}
			} else { /* no room so create dummy domain and NOS string */
-				if (ses->serverDomain)
-					kfree(ses->serverDomain);
+				kfree(ses->serverDomain);
				ses->serverDomain = kzalloc(2, GFP_KERNEL);
				kfree(ses->serverNOS);
				ses->serverNOS = kzalloc(2, GFP_KERNEL);
@@ -3492,8 +3576,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
			if (((long) bcc_ptr + len) -
			    (long) pByteArea(smb_buffer_response)
			    <= BCC(smb_buffer_response)) {
-				if (ses->serverOS)
-					kfree(ses->serverOS);
+				kfree(ses->serverOS);
				ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
				strncpy(ses->serverOS,bcc_ptr, len);
 
@@ -3512,8 +3595,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
			bcc_ptr++;
 
			len = strnlen(bcc_ptr, 1024);
-			if (ses->serverDomain)
-				kfree(ses->serverDomain);
+			kfree(ses->serverDomain);
			ses->serverDomain =
				kzalloc(len+1,
					GFP_KERNEL);
@@ -3674,16 +3756,15 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
		    BCC(smb_buffer_response)) {
			kfree(tcon->nativeFileSystem);
			tcon->nativeFileSystem =
-				kzalloc(2*(length + 1), GFP_KERNEL);
-			if (tcon->nativeFileSystem)
+				kzalloc((4 * length) + 2, GFP_KERNEL);
+			if (tcon->nativeFileSystem) {
				cifs_strfromUCS_le(
					tcon->nativeFileSystem,
					(__le16 *) bcc_ptr,
					length, nls_codepage);
-				bcc_ptr += 2 * length;
-				bcc_ptr[0] = 0;	/* null terminate the string */
-				bcc_ptr[1] = 0;
-				bcc_ptr += 2;
+				cFYI(1, ("nativeFileSystem=%s",
+					tcon->nativeFileSystem));
+			}
		}
		/* else do not bother copying these information fields*/
	} else {
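Taken together, the cifs_mount() hunks above implement a retry loop: when the share (or the prepath below it) turns out to be a DFS referral (-EREMOTE), the code fetches the referral, composes fresh mount options for the referred-to server, tears down the half-built session, and jumps back to try_mount_again. A stripped-down sketch of that control flow follows; do_mount() and compose_from_referral() are hypothetical stand-ins for get_dfs_path()/cifs_compose_mount_options():

/*
 * Stripped-down control flow of the referral retry.
 */
#include <linux/slab.h>
#include <linux/errno.h>

extern int do_mount(const char *mount_data);
extern char *compose_from_referral(const char *old_data);

int mount_with_referrals(char *mount_data_global)
{
	char *mount_data = mount_data_global;
	int rc;

try_mount_again:
	rc = do_mount(mount_data);
	if (rc == -EREMOTE) {
		char *new_data = compose_from_referral(mount_data);

		if (new_data) {
			if (mount_data != mount_data_global)
				kfree(mount_data);	/* drop prior retry copy */
			mount_data = new_data;
			goto try_mount_again;
		}
	}
	if (mount_data != mount_data_global)
		kfree(mount_data);
	return rc;
}

The mount_data/mount_data_global comparison is the same trick used in the real code: it keeps the caller's original options buffer from ever being freed while still releasing each composed copy.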
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 54dce78fbb73..461750e01364 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -129,12 +129,62 @@ cifs_bp_rename_retry:
	return full_path;
 }
 
+static void
+cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle,
+		struct cifsTconInfo *tcon, bool write_only)
+{
+	int oplock = 0;
+	struct cifsFileInfo *pCifsFile;
+	struct cifsInodeInfo *pCifsInode;
+
+	pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+
+	if (pCifsFile == NULL)
+		return;
+
+	if (oplockEnabled)
+		oplock = REQ_OPLOCK;
+
+	pCifsFile->netfid = fileHandle;
+	pCifsFile->pid = current->tgid;
+	pCifsFile->pInode = newinode;
+	pCifsFile->invalidHandle = false;
+	pCifsFile->closePend = false;
+	mutex_init(&pCifsFile->fh_mutex);
+	mutex_init(&pCifsFile->lock_mutex);
+	INIT_LIST_HEAD(&pCifsFile->llist);
+	atomic_set(&pCifsFile->wrtPending, 0);
+
+	/* set the following in open now
+	pCifsFile->pfile = file; */
+	write_lock(&GlobalSMBSeslock);
+	list_add(&pCifsFile->tlist, &tcon->openFileList);
+	pCifsInode = CIFS_I(newinode);
+	if (pCifsInode) {
+		/* if readable file instance put first in list*/
+		if (write_only)
+			list_add_tail(&pCifsFile->flist,
+					&pCifsInode->openFileList);
+		else
+			list_add(&pCifsFile->flist, &pCifsInode->openFileList);
+
+		if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
+			pCifsInode->clientCanCacheAll = true;
+			pCifsInode->clientCanCacheRead = true;
+			cFYI(1, ("Exclusive Oplock inode %p", newinode));
+		} else if ((oplock & 0xF) == OPLOCK_READ)
+			pCifsInode->clientCanCacheRead = true;
+	}
+	write_unlock(&GlobalSMBSeslock);
+}
+
 int cifs_posix_open(char *full_path, struct inode **pinode,
		struct super_block *sb, int mode, int oflags,
		int *poplock, __u16 *pnetfid, int xid)
 {
	int rc;
	__u32 oplock;
+	bool write_only = false;
	FILE_UNIX_BASIC_INFO *presp_data;
	__u32 posix_flags = 0;
	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
@@ -172,6 +222,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
	if (oflags & O_DIRECT)
		posix_flags |= SMB_O_DIRECT;
 
+	if (!(oflags & FMODE_READ))
+		write_only = true;
 
	rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode,
			pnetfid, presp_data, &oplock, full_path,
@@ -187,8 +239,10 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
	if (!pinode)
		goto posix_open_ret; /* caller does not need info */
 
-	if (*pinode == NULL)
-		*pinode = cifs_new_inode(sb, &presp_data->UniqueId);
+	if (*pinode == NULL) {
+		__u64 unique_id = le64_to_cpu(presp_data->UniqueId);
+		*pinode = cifs_new_inode(sb, &unique_id);
+	}
	/* else an inode was passed in. Update its info, don't create one */
 
	/* We do not need to close the file if new_inode fails since
@@ -198,6 +252,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 
	posix_fill_in_inode(*pinode, presp_data, 1);
 
+	cifs_fill_fileinfo(*pinode, *pnetfid, cifs_sb->tcon, write_only);
+
 posix_open_ret:
	kfree(presp_data);
	return rc;
@@ -239,7 +295,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
	char *full_path = NULL;
	FILE_ALL_INFO *buf = NULL;
	struct inode *newinode = NULL;
-	struct cifsInodeInfo *pCifsInode;
	int disposition = FILE_OVERWRITE_IF;
	bool write_only = false;
 
@@ -410,44 +465,8 @@ cifs_create_set_dentry:
		/* mknod case - do not leave file open */
		CIFSSMBClose(xid, tcon, fileHandle);
	} else if (newinode) {
-		struct cifsFileInfo *pCifsFile =
-		   kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
-
-		if (pCifsFile == NULL)
-			goto cifs_create_out;
-		pCifsFile->netfid = fileHandle;
-		pCifsFile->pid = current->tgid;
-		pCifsFile->pInode = newinode;
-		pCifsFile->invalidHandle = false;
-		pCifsFile->closePend = false;
-		init_MUTEX(&pCifsFile->fh_sem);
-		mutex_init(&pCifsFile->lock_mutex);
-		INIT_LIST_HEAD(&pCifsFile->llist);
-		atomic_set(&pCifsFile->wrtPending, 0);
-
-		/* set the following in open now
-		pCifsFile->pfile = file; */
-		write_lock(&GlobalSMBSeslock);
-		list_add(&pCifsFile->tlist, &tcon->openFileList);
-		pCifsInode = CIFS_I(newinode);
-		if (pCifsInode) {
-			/* if readable file instance put first in list*/
-			if (write_only) {
-				list_add_tail(&pCifsFile->flist,
-					      &pCifsInode->openFileList);
-			} else {
-				list_add(&pCifsFile->flist,
-					 &pCifsInode->openFileList);
-			}
-			if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
-				pCifsInode->clientCanCacheAll = true;
-				pCifsInode->clientCanCacheRead = true;
-				cFYI(1, ("Exclusive Oplock inode %p",
-					newinode));
-			} else if ((oplock & 0xF) == OPLOCK_READ)
-				pCifsInode->clientCanCacheRead = true;
-		}
-		write_unlock(&GlobalSMBSeslock);
+		cifs_fill_fileinfo(newinode, fileHandle,
+					cifs_sb->tcon, write_only);
	}
 cifs_create_out:
	kfree(buf);
@@ -580,17 +599,21 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
	return rc;
 }
 
-
 struct dentry *
 cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
	    struct nameidata *nd)
 {
	int xid;
	int rc = 0; /* to get around spurious gcc warning, set to zero here */
+	int oplock = 0;
+	int mode;
+	__u16 fileHandle = 0;
+	bool posix_open = false;
	struct cifs_sb_info *cifs_sb;
	struct cifsTconInfo *pTcon;
	struct inode *newInode = NULL;
	char *full_path = NULL;
+	struct file *filp;
 
	xid = GetXid();
 
@@ -632,12 +655,37 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
	}
	cFYI(1, ("Full path: %s inode = 0x%p", full_path, direntry->d_inode));
 
-	if (pTcon->unix_ext)
-		rc = cifs_get_inode_info_unix(&newInode, full_path,
-					      parent_dir_inode->i_sb, xid);
-	else
+	if (pTcon->unix_ext) {
+		if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
+		     (nd->flags & LOOKUP_OPEN)) {
+			if (!((nd->intent.open.flags & O_CREAT) &&
+			      (nd->intent.open.flags & O_EXCL))) {
+				mode = nd->intent.open.create_mode &
+				       ~current_umask();
+				rc = cifs_posix_open(full_path, &newInode,
+					parent_dir_inode->i_sb, mode,
+					nd->intent.open.flags, &oplock,
+					&fileHandle, xid);
+				/*
+				 * This code works around a bug in
+				 * samba posix open in samba versions 3.3.1
+				 * and earlier where create works
+				 * but open fails with invalid parameter.
+				 * If either of these error codes are
+				 * returned, follow the normal lookup.
+				 * Otherwise, the error during posix open
+				 * is handled.
+				 */
+				if ((rc != -EINVAL) && (rc != -EOPNOTSUPP))
+					posix_open = true;
+			}
+		}
+		if (!posix_open)
+			rc = cifs_get_inode_info_unix(&newInode, full_path,
						parent_dir_inode->i_sb, xid);
+	} else
		rc = cifs_get_inode_info(&newInode, full_path, NULL,
					 parent_dir_inode->i_sb, xid, NULL);
 
	if ((rc == 0) && (newInode != NULL)) {
		if (pTcon->nocase)
@@ -645,7 +693,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
		else
			direntry->d_op = &cifs_dentry_ops;
		d_add(direntry, newInode);
-
+		if (posix_open)
+			filp = lookup_instantiate_filp(nd, direntry, NULL);
		/* since paths are not looked up by component - the parent
		   directories are presumed to be good here */
		renew_parental_timestamps(direntry);
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 1e0c1bd8f2e4..df4a306f697e 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -78,7 +78,7 @@ dns_resolver_instantiate(struct key *key, const void *data,
	}
 
	key->type_data.x[0] = datalen;
-	rcu_assign_pointer(key->payload.data, ip);
+	key->payload.data = ip;
 
	return rc;
 }
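Both key-instantiation hunks (here and in cifs_spnego.c above) drop rcu_assign_pointer() in favour of a plain store. The likely rationale: the publication barrier in rcu_assign_pointer() only matters once other CPUs can already reach the pointer, and a key that is still being instantiated is not yet linked into any keyring, so no concurrent reader exists. Schematically (simplified types, not the real keys API):

/* Simplified illustration of store-before-publication. */
struct payload { void *data; };

static void instantiate(struct payload *p, void *data)
{
	/*
	 * The key is not linked into any keyring yet, so no other CPU
	 * can reach p->data: a plain store suffices, no write barrier.
	 */
	p->data = data;
}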
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 81747acca4c4..50ca088d8860 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -46,7 +46,7 @@ static inline struct cifsFileInfo *cifs_init_private(
	memset(private_data, 0, sizeof(struct cifsFileInfo));
	private_data->netfid = netfid;
	private_data->pid = current->tgid;
-	init_MUTEX(&private_data->fh_sem);
+	mutex_init(&private_data->fh_mutex);
	mutex_init(&private_data->lock_mutex);
	INIT_LIST_HEAD(&private_data->llist);
	private_data->pfile = file; /* needed for writepage */
@@ -284,35 +284,32 @@ int cifs_open(struct inode *inode, struct file *file)
	cifs_sb = CIFS_SB(inode->i_sb);
	tcon = cifs_sb->tcon;
 
-	if (file->f_flags & O_CREAT) {
-		/* search inode for this file and fill in file->private_data */
-		pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
-		read_lock(&GlobalSMBSeslock);
-		list_for_each(tmp, &pCifsInode->openFileList) {
-			pCifsFile = list_entry(tmp, struct cifsFileInfo,
-					       flist);
-			if ((pCifsFile->pfile == NULL) &&
-			    (pCifsFile->pid == current->tgid)) {
-				/* mode set in cifs_create */
-
-				/* needed for writepage */
-				pCifsFile->pfile = file;
-
-				file->private_data = pCifsFile;
-				break;
-			}
-		}
-		read_unlock(&GlobalSMBSeslock);
-		if (file->private_data != NULL) {
-			rc = 0;
-			FreeXid(xid);
-			return rc;
-		} else {
-			if (file->f_flags & O_EXCL)
-				cERROR(1, ("could not find file instance for "
-					   "new file %p", file));
-		}
-	}
+	/* search inode for this file and fill in file->private_data */
+	pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
+	read_lock(&GlobalSMBSeslock);
+	list_for_each(tmp, &pCifsInode->openFileList) {
+		pCifsFile = list_entry(tmp, struct cifsFileInfo,
+				       flist);
+		if ((pCifsFile->pfile == NULL) &&
+		    (pCifsFile->pid == current->tgid)) {
+			/* mode set in cifs_create */
+
+			/* needed for writepage */
+			pCifsFile->pfile = file;
+
+			file->private_data = pCifsFile;
+			break;
+		}
+	}
+	read_unlock(&GlobalSMBSeslock);
+
+	if (file->private_data != NULL) {
+		rc = 0;
+		FreeXid(xid);
+		return rc;
+	} else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL))
+		cERROR(1, ("could not find file instance for "
+			   "new file %p", file));
 
	full_path = build_path_from_dentry(file->f_path.dentry);
	if (full_path == NULL) {
@@ -500,9 +497,9 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
		return -EBADF;
 
	xid = GetXid();
-	down(&pCifsFile->fh_sem);
+	mutex_unlock(&pCifsFile->fh_mutex);
	if (!pCifsFile->invalidHandle) {
-		up(&pCifsFile->fh_sem);
+		mutex_lock(&pCifsFile->fh_mutex);
		FreeXid(xid);
		return 0;
	}
@@ -533,7 +530,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
	if (full_path == NULL) {
		rc = -ENOMEM;
 reopen_error_exit:
-		up(&pCifsFile->fh_sem);
+		mutex_lock(&pCifsFile->fh_mutex);
		FreeXid(xid);
		return rc;
	}
@@ -575,14 +572,14 @@ reopen_error_exit:
			cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
				CIFS_MOUNT_MAP_SPECIAL_CHR);
	if (rc) {
-		up(&pCifsFile->fh_sem);
+		mutex_lock(&pCifsFile->fh_mutex);
		cFYI(1, ("cifs_open returned 0x%x", rc));
		cFYI(1, ("oplock: %d", oplock));
	} else {
 reopen_success:
		pCifsFile->netfid = netfid;
		pCifsFile->invalidHandle = false;
-		up(&pCifsFile->fh_sem);
+		mutex_lock(&pCifsFile->fh_mutex);
		pCifsInode = CIFS_I(inode);
		if (pCifsInode) {
			if (can_flush) {
@@ -971,6 +968,40 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
971 return rc; 968 return rc;
972} 969}
973 970
971/*
972 * Set the timeout on write requests past EOF. For some servers (Windows)
973 * these calls can be very long.
974 *
975 * If we're writing >10M past the EOF we give a 180s timeout. Anything less
976 * than that gets a 45s timeout. Writes not past EOF get 15s timeouts.
977 * The 10M cutoff is totally arbitrary. A better scheme for this would be
978 * welcome if someone wants to suggest one.
979 *
980 * We may be able to do a better job with this if there were some way to
981 * declare that a file should be sparse.
982 */
983static int
984cifs_write_timeout(struct cifsInodeInfo *cifsi, loff_t offset)
985{
986 if (offset <= cifsi->server_eof)
987 return CIFS_STD_OP;
988 else if (offset > (cifsi->server_eof + (10 * 1024 * 1024)))
989 return CIFS_VLONG_OP;
990 else
991 return CIFS_LONG_OP;
992}
993
994/* update the file size (if needed) after a write */
995static void
996cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
997 unsigned int bytes_written)
998{
999 loff_t end_of_write = offset + bytes_written;
1000
1001 if (end_of_write > cifsi->server_eof)
1002 cifsi->server_eof = end_of_write;
1003}
1004
974ssize_t cifs_user_write(struct file *file, const char __user *write_data, 1005ssize_t cifs_user_write(struct file *file, const char __user *write_data,
975 size_t write_size, loff_t *poffset) 1006 size_t write_size, loff_t *poffset)
976{ 1007{
@@ -981,6 +1012,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
981 struct cifsTconInfo *pTcon; 1012 struct cifsTconInfo *pTcon;
982 int xid, long_op; 1013 int xid, long_op;
983 struct cifsFileInfo *open_file; 1014 struct cifsFileInfo *open_file;
1015 struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode);
984 1016
985 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 1017 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
986 1018
@@ -1000,11 +1032,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
1000 1032
1001 xid = GetXid(); 1033 xid = GetXid();
1002 1034
1003 if (*poffset > file->f_path.dentry->d_inode->i_size) 1035 long_op = cifs_write_timeout(cifsi, *poffset);
1004 long_op = CIFS_VLONG_OP; /* writes past EOF take long time */
1005 else
1006 long_op = CIFS_LONG_OP;
1007
1008 for (total_written = 0; write_size > total_written; 1036 for (total_written = 0; write_size > total_written;
1009 total_written += bytes_written) { 1037 total_written += bytes_written) {
1010 rc = -EAGAIN; 1038 rc = -EAGAIN;
@@ -1048,8 +1076,10 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
1048 FreeXid(xid); 1076 FreeXid(xid);
1049 return rc; 1077 return rc;
1050 } 1078 }
1051 } else 1079 } else {
1080 cifs_update_eof(cifsi, *poffset, bytes_written);
1052 *poffset += bytes_written; 1081 *poffset += bytes_written;
1082 }
1053 long_op = CIFS_STD_OP; /* subsequent writes fast - 1083 long_op = CIFS_STD_OP; /* subsequent writes fast -
1054 15 seconds is plenty */ 1084 15 seconds is plenty */
1055 } 1085 }
@@ -1085,6 +1115,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1085 struct cifsTconInfo *pTcon; 1115 struct cifsTconInfo *pTcon;
1086 int xid, long_op; 1116 int xid, long_op;
1087 struct cifsFileInfo *open_file; 1117 struct cifsFileInfo *open_file;
1118 struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode);
1088 1119
1089 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 1120 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1090 1121
@@ -1099,11 +1130,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1099 1130
1100 xid = GetXid(); 1131 xid = GetXid();
1101 1132
1102 if (*poffset > file->f_path.dentry->d_inode->i_size) 1133 long_op = cifs_write_timeout(cifsi, *poffset);
1103 long_op = CIFS_VLONG_OP; /* writes past EOF can be slow */
1104 else
1105 long_op = CIFS_LONG_OP;
1106
1107 for (total_written = 0; write_size > total_written; 1134 for (total_written = 0; write_size > total_written;
1108 total_written += bytes_written) { 1135 total_written += bytes_written) {
1109 rc = -EAGAIN; 1136 rc = -EAGAIN;
@@ -1166,8 +1193,10 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1166 FreeXid(xid); 1193 FreeXid(xid);
1167 return rc; 1194 return rc;
1168 } 1195 }
1169 } else 1196 } else {
1197 cifs_update_eof(cifsi, *poffset, bytes_written);
1170 *poffset += bytes_written; 1198 *poffset += bytes_written;
1199 }
1171 long_op = CIFS_STD_OP; /* subsequent writes fast - 1200 long_op = CIFS_STD_OP; /* subsequent writes fast -
1172 15 seconds is plenty */ 1201 15 seconds is plenty */
1173 } 1202 }
@@ -1380,11 +1409,12 @@ static int cifs_writepages(struct address_space *mapping,
1380 int nr_pages; 1409 int nr_pages;
1381 __u64 offset = 0; 1410 __u64 offset = 0;
1382 struct cifsFileInfo *open_file; 1411 struct cifsFileInfo *open_file;
1412 struct cifsInodeInfo *cifsi = CIFS_I(mapping->host);
1383 struct page *page; 1413 struct page *page;
1384 struct pagevec pvec; 1414 struct pagevec pvec;
1385 int rc = 0; 1415 int rc = 0;
1386 int scanned = 0; 1416 int scanned = 0;
1387 int xid; 1417 int xid, long_op;
1388 1418
1389 cifs_sb = CIFS_SB(mapping->host->i_sb); 1419 cifs_sb = CIFS_SB(mapping->host->i_sb);
1390 1420
@@ -1528,12 +1558,15 @@ retry:
1528 cERROR(1, ("No writable handles for inode")); 1558 cERROR(1, ("No writable handles for inode"));
1529 rc = -EBADF; 1559 rc = -EBADF;
1530 } else { 1560 } else {
1561 long_op = cifs_write_timeout(cifsi, offset);
1531 rc = CIFSSMBWrite2(xid, cifs_sb->tcon, 1562 rc = CIFSSMBWrite2(xid, cifs_sb->tcon,
1532 open_file->netfid, 1563 open_file->netfid,
1533 bytes_to_write, offset, 1564 bytes_to_write, offset,
1534 &bytes_written, iov, n_iov, 1565 &bytes_written, iov, n_iov,
1535 CIFS_LONG_OP); 1566 long_op);
1536 atomic_dec(&open_file->wrtPending); 1567 atomic_dec(&open_file->wrtPending);
1568 cifs_update_eof(cifsi, offset, bytes_written);
1569
1537 if (rc || bytes_written < bytes_to_write) { 1570 if (rc || bytes_written < bytes_to_write) {
1538 cERROR(1, ("Write2 ret %d, wrote %d", 1571 cERROR(1, ("Write2 ret %d, wrote %d",
1539 rc, bytes_written)); 1572 rc, bytes_written));
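
The three fs/cifs/file.c hunks above replace an open-coded "writes past EOF are slow" test with cifs_write_timeout() and add cifs_update_eof() calls as writes complete. Neither helper's definition is part of this excerpt; a minimal sketch inferred from the call sites (the real helpers likely also take the inode spinlock, so treat this as illustration only):

static int cifs_write_timeout(struct cifsInodeInfo *cifsi, loff_t offset)
{
	/* writes past the server's end of file can take a long time */
	if (offset > cifsi->server_eof)
		return CIFS_VLONG_OP;
	return CIFS_LONG_OP;
}

static void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
			    unsigned int bytes_written)
{
	loff_t end_of_write = offset + bytes_written;

	/* only ever move the cached server EOF forward */
	if (end_of_write > cifsi->server_eof)
		cifsi->server_eof = end_of_write;
}

Keying the timeout off cifsi->server_eof rather than i_size matters because i_size reflects the client's view, which local truncates and cached writes can move independently of what the server has acknowledged.
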
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index f121a80fdd6f..f36b4e40e443 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -143,6 +143,7 @@ static void cifs_unix_info_to_inode(struct inode *inode,
143 143
144 inode->i_nlink = le64_to_cpu(info->Nlinks); 144 inode->i_nlink = le64_to_cpu(info->Nlinks);
145 145
146 cifsInfo->server_eof = end_of_file;
146 spin_lock(&inode->i_lock); 147 spin_lock(&inode->i_lock);
147 if (is_size_safe_to_change(cifsInfo, end_of_file)) { 148 if (is_size_safe_to_change(cifsInfo, end_of_file)) {
148 /* 149 /*
@@ -276,7 +277,8 @@ int cifs_get_inode_info_unix(struct inode **pinode,
276 277
277 /* get new inode */ 278 /* get new inode */
278 if (*pinode == NULL) { 279 if (*pinode == NULL) {
279 *pinode = cifs_new_inode(sb, &find_data.UniqueId); 280 __u64 unique_id = le64_to_cpu(find_data.UniqueId);
281 *pinode = cifs_new_inode(sb, &unique_id);
280 if (*pinode == NULL) { 282 if (*pinode == NULL) {
281 rc = -ENOMEM; 283 rc = -ENOMEM;
282 goto cgiiu_exit; 284 goto cgiiu_exit;
@@ -605,12 +607,12 @@ int cifs_get_inode_info(struct inode **pinode,
605 inode->i_mode |= S_IFREG; 607 inode->i_mode |= S_IFREG;
606 } 608 }
607 609
610 cifsInfo->server_eof = le64_to_cpu(pfindData->EndOfFile);
608 spin_lock(&inode->i_lock); 611 spin_lock(&inode->i_lock);
609 if (is_size_safe_to_change(cifsInfo, 612 if (is_size_safe_to_change(cifsInfo, cifsInfo->server_eof)) {
610 le64_to_cpu(pfindData->EndOfFile))) {
611 /* can not safely shrink the file size here if the 613 /* can not safely shrink the file size here if the
612 client is writing to it due to potential races */ 614 client is writing to it due to potential races */
613 i_size_write(inode, le64_to_cpu(pfindData->EndOfFile)); 615 i_size_write(inode, cifsInfo->server_eof);
614 616
615 /* 512 bytes (2**9) is the fake blocksize that must be 617 /* 512 bytes (2**9) is the fake blocksize that must be
616 used for this calculation */ 618 used for this calculation */
@@ -1138,6 +1140,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1138 cFYI(1, ("posix mkdir returned 0x%x", rc)); 1140 cFYI(1, ("posix mkdir returned 0x%x", rc));
1139 d_drop(direntry); 1141 d_drop(direntry);
1140 } else { 1142 } else {
1143 __u64 unique_id;
1141 if (pInfo->Type == cpu_to_le32(-1)) { 1144 if (pInfo->Type == cpu_to_le32(-1)) {
1142 /* no return info, go query for it */ 1145 /* no return info, go query for it */
1143 kfree(pInfo); 1146 kfree(pInfo);
@@ -1151,8 +1154,8 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1151 else 1154 else
1152 direntry->d_op = &cifs_dentry_ops; 1155 direntry->d_op = &cifs_dentry_ops;
1153 1156
1154 newinode = cifs_new_inode(inode->i_sb, 1157 unique_id = le64_to_cpu(pInfo->UniqueId);
1155 &pInfo->UniqueId); 1158 newinode = cifs_new_inode(inode->i_sb, &unique_id);
1156 if (newinode == NULL) { 1159 if (newinode == NULL) {
1157 kfree(pInfo); 1160 kfree(pInfo);
1158 goto mkdir_get_info; 1161 goto mkdir_get_info;
@@ -1450,7 +1453,8 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
1450 checking the UniqueId via FILE_INTERNAL_INFO */ 1453 checking the UniqueId via FILE_INTERNAL_INFO */
1451 1454
1452 unlink_target: 1455 unlink_target:
1453 if ((rc == -EACCES) || (rc == -EEXIST)) { 1456 /* Try unlinking the target dentry if it's not negative */
1457 if (target_dentry->d_inode && (rc == -EACCES || rc == -EEXIST)) {
1454 tmprc = cifs_unlink(target_dir, target_dentry); 1458 tmprc = cifs_unlink(target_dir, target_dentry);
1455 if (tmprc) 1459 if (tmprc)
1456 goto cifs_rename_exit; 1460 goto cifs_rename_exit;
@@ -1753,6 +1757,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1753 } 1757 }
1754 1758
1755 if (rc == 0) { 1759 if (rc == 0) {
1760 cifsInode->server_eof = attrs->ia_size;
1756 rc = cifs_vmtruncate(inode, attrs->ia_size); 1761 rc = cifs_vmtruncate(inode, attrs->ia_size);
1757 cifs_truncate_page(inode->i_mapping, inode->i_size); 1762 cifs_truncate_page(inode->i_mapping, inode->i_size);
1758 } 1763 }
@@ -1792,20 +1797,21 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1792 goto out; 1797 goto out;
1793 } 1798 }
1794 1799
1795 if ((attrs->ia_valid & ATTR_MTIME) || (attrs->ia_valid & ATTR_SIZE)) { 1800 /*
1796 /* 1801 * Attempt to flush data before changing attributes. We need to do
1797 Flush data before changing file size or changing the last 1802 * this for ATTR_SIZE and ATTR_MTIME for sure, and if we change the
1798 write time of the file on the server. If the 1803 * ownership or mode then we may also need to do this. Here, we take
1799 flush returns error, store it to report later and continue. 1804 * the safe way out and just do the flush on all setattr requests. If
1800 BB: This should be smarter. Why bother flushing pages that 1805 * the flush returns error, store it to report later and continue.
1801 will be truncated anyway? Also, should we error out here if 1806 *
1802 the flush returns error? 1807 * BB: This should be smarter. Why bother flushing pages that
1803 */ 1808 * will be truncated anyway? Also, should we error out here if
1804 rc = filemap_write_and_wait(inode->i_mapping); 1809 * the flush returns error?
1805 if (rc != 0) { 1810 */
1806 cifsInode->write_behind_rc = rc; 1811 rc = filemap_write_and_wait(inode->i_mapping);
1807 rc = 0; 1812 if (rc != 0) {
1808 } 1813 cifsInode->write_behind_rc = rc;
1814 rc = 0;
1809 } 1815 }
1810 1816
1811 if (attrs->ia_valid & ATTR_SIZE) { 1817 if (attrs->ia_valid & ATTR_SIZE) {
@@ -1903,20 +1909,21 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
1903 return -ENOMEM; 1909 return -ENOMEM;
1904 } 1910 }
1905 1911
1906 if ((attrs->ia_valid & ATTR_MTIME) || (attrs->ia_valid & ATTR_SIZE)) { 1912 /*
1907 /* 1913 * Attempt to flush data before changing attributes. We need to do
1908 Flush data before changing file size or changing the last 1914 * this for ATTR_SIZE and ATTR_MTIME for sure, and if we change the
1909 write time of the file on the server. If the 1915 * ownership or mode then we may also need to do this. Here, we take
1910 flush returns error, store it to report later and continue. 1916 * the safe way out and just do the flush on all setattr requests. If
1911 BB: This should be smarter. Why bother flushing pages that 1917 * the flush returns error, store it to report later and continue.
1912 will be truncated anyway? Also, should we error out here if 1918 *
1913 the flush returns error? 1919 * BB: This should be smarter. Why bother flushing pages that
1914 */ 1920 * will be truncated anyway? Also, should we error out here if
1915 rc = filemap_write_and_wait(inode->i_mapping); 1921 * the flush returns error?
1916 if (rc != 0) { 1922 */
1917 cifsInode->write_behind_rc = rc; 1923 rc = filemap_write_and_wait(inode->i_mapping);
1918 rc = 0; 1924 if (rc != 0) {
1919 } 1925 cifsInode->write_behind_rc = rc;
1926 rc = 0;
1920 } 1927 }
1921 1928
1922 if (attrs->ia_valid & ATTR_SIZE) { 1929 if (attrs->ia_valid & ATTR_SIZE) {
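
Two threads run through the fs/cifs/inode.c hunks: each place that learns the file size from the server now also records it in cifsInfo->server_eof, and the wire-format UniqueId is converted with le64_to_cpu() before being handed to cifs_new_inode(). The conversion is the real fix; the SMB field is little-endian, so on a big-endian host the old code produced byte-swapped inode numbers. A hypothetical one-liner capturing the pattern (not code from the tree):

#include <linux/types.h>
#include <asm/byteorder.h>

/* hypothetical helper: wire UniqueId (little-endian) to host order;
 * a no-op on LE machines, a byte swap on BE machines */
static inline __u64 cifs_unique_id_to_host(__le64 wire_id)
{
	return le64_to_cpu(wire_id);
}
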
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index c2c01ff4c32c..1a8be6228333 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -239,6 +239,7 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
239 if (atomic_read(&cifsInfo->inUse) == 0) 239 if (atomic_read(&cifsInfo->inUse) == 0)
240 atomic_set(&cifsInfo->inUse, 1); 240 atomic_set(&cifsInfo->inUse, 1);
241 241
242 cifsInfo->server_eof = end_of_file;
242 spin_lock(&tmp_inode->i_lock); 243 spin_lock(&tmp_inode->i_lock);
243 if (is_size_safe_to_change(cifsInfo, end_of_file)) { 244 if (is_size_safe_to_change(cifsInfo, end_of_file)) {
244 /* can not safely change the file size here if the 245 /* can not safely change the file size here if the
@@ -375,6 +376,7 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
375 tmp_inode->i_gid = le64_to_cpu(pfindData->Gid); 376 tmp_inode->i_gid = le64_to_cpu(pfindData->Gid);
376 tmp_inode->i_nlink = le64_to_cpu(pfindData->Nlinks); 377 tmp_inode->i_nlink = le64_to_cpu(pfindData->Nlinks);
377 378
379 cifsInfo->server_eof = end_of_file;
378 spin_lock(&tmp_inode->i_lock); 380 spin_lock(&tmp_inode->i_lock);
379 if (is_size_safe_to_change(cifsInfo, end_of_file)) { 381 if (is_size_safe_to_change(cifsInfo, end_of_file)) {
380 /* can not safely change the file size here if the 382 /* can not safely change the file size here if the
@@ -840,7 +842,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
840 len = strnlen(filename, PATH_MAX); 842 len = strnlen(filename, PATH_MAX);
841 } 843 }
842 844
843 *pinum = pFindData->UniqueId; 845 *pinum = le64_to_cpu(pFindData->UniqueId);
844 } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { 846 } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
845 FILE_DIRECTORY_INFO *pFindData = 847 FILE_DIRECTORY_INFO *pFindData =
846 (FILE_DIRECTORY_INFO *)current_entry; 848 (FILE_DIRECTORY_INFO *)current_entry;
@@ -856,7 +858,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
856 (SEARCH_ID_FULL_DIR_INFO *)current_entry; 858 (SEARCH_ID_FULL_DIR_INFO *)current_entry;
857 filename = &pFindData->FileName[0]; 859 filename = &pFindData->FileName[0];
858 len = le32_to_cpu(pFindData->FileNameLength); 860 len = le32_to_cpu(pFindData->FileNameLength);
859 *pinum = pFindData->UniqueId; 861 *pinum = le64_to_cpu(pFindData->UniqueId);
860 } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { 862 } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
861 FILE_BOTH_DIRECTORY_INFO *pFindData = 863 FILE_BOTH_DIRECTORY_INFO *pFindData =
862 (FILE_BOTH_DIRECTORY_INFO *)current_entry; 864 (FILE_BOTH_DIRECTORY_INFO *)current_entry;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 5c68b4282be9..c652c73760dd 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -285,35 +285,36 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft,
285 int words_left, len; 285 int words_left, len;
286 char *data = *pbcc_area; 286 char *data = *pbcc_area;
287 287
288
289
290 cFYI(1, ("bleft %d", bleft)); 288 cFYI(1, ("bleft %d", bleft));
291 289
292 290 /*
293 /* SMB header is unaligned, so cifs servers word align start of 291 * Windows servers do not always double null terminate their final
294 Unicode strings */ 292 * Unicode string. Check to see if there is an uneven number of bytes
295 data++; 293 * left. If so, then add an extra NULL pad byte to the end of the
296 bleft--; /* Windows servers do not always double null terminate 294 * response.
297 their final Unicode string - in which case we 295 *
298 now will not attempt to decode the byte of junk 296 * See section 2.7.2 in "Implementing CIFS" for details
299 which follows it */ 297 */
298 if (bleft % 2) {
299 data[bleft] = 0;
300 ++bleft;
301 }
300 302
301 words_left = bleft / 2; 303 words_left = bleft / 2;
302 304
303 /* save off server operating system */ 305 /* save off server operating system */
304 len = UniStrnlen((wchar_t *) data, words_left); 306 len = UniStrnlen((wchar_t *) data, words_left);
305 307
306 /* We look for obvious messed up bcc or strings in response so we do not go off
307 the end since (at least) WIN2K and Windows XP have a major bug in not null
308 terminating last Unicode string in response */
309 if (len >= words_left) 308 if (len >= words_left)
310 return rc; 309 return rc;
311 310
312 kfree(ses->serverOS); 311 kfree(ses->serverOS);
313 /* UTF-8 string will not grow more than four times as big as UCS-16 */ 312 /* UTF-8 string will not grow more than four times as big as UCS-16 */
314 ses->serverOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL); 313 ses->serverOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL);
315 if (ses->serverOS != NULL) 314 if (ses->serverOS != NULL) {
316 cifs_strfromUCS_le(ses->serverOS, (__le16 *)data, len, nls_cp); 315 cifs_strfromUCS_le(ses->serverOS, (__le16 *)data, len, nls_cp);
316 cFYI(1, ("serverOS=%s", ses->serverOS));
317 }
317 data += 2 * (len + 1); 318 data += 2 * (len + 1);
318 words_left -= len + 1; 319 words_left -= len + 1;
319 320
@@ -328,6 +329,7 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft,
328 if (ses->serverNOS != NULL) { 329 if (ses->serverNOS != NULL) {
329 cifs_strfromUCS_le(ses->serverNOS, (__le16 *)data, len, 330 cifs_strfromUCS_le(ses->serverNOS, (__le16 *)data, len,
330 nls_cp); 331 nls_cp);
332 cFYI(1, ("serverNOS=%s", ses->serverNOS));
331 if (strncmp(ses->serverNOS, "NT LAN Manager 4", 16) == 0) { 333 if (strncmp(ses->serverNOS, "NT LAN Manager 4", 16) == 0) {
332 cFYI(1, ("NT4 server")); 334 cFYI(1, ("NT4 server"));
333 ses->flags |= CIFS_SES_NT4; 335 ses->flags |= CIFS_SES_NT4;
@@ -343,12 +345,11 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft,
343 return rc; 345 return rc;
344 346
345 kfree(ses->serverDomain); 347 kfree(ses->serverDomain);
346 ses->serverDomain = kzalloc(2 * (len + 1), GFP_KERNEL); /* BB FIXME wrong length */ 348 ses->serverDomain = kzalloc((4 * len) + 2, GFP_KERNEL);
347 if (ses->serverDomain != NULL) { 349 if (ses->serverDomain != NULL) {
348 cifs_strfromUCS_le(ses->serverDomain, (__le16 *)data, len, 350 cifs_strfromUCS_le(ses->serverDomain, (__le16 *)data, len,
349 nls_cp); 351 nls_cp);
350 ses->serverDomain[2*len] = 0; 352 cFYI(1, ("serverDomain=%s", ses->serverDomain));
351 ses->serverDomain[(2*len) + 1] = 0;
352 } 353 }
353 data += 2 * (len + 1); 354 data += 2 * (len + 1);
354 words_left -= len + 1; 355 words_left -= len + 1;
@@ -702,12 +703,18 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
702 } 703 }
703 704
704 /* BB check if Unicode and decode strings */ 705 /* BB check if Unicode and decode strings */
705 if (smb_buf->Flags2 & SMBFLG2_UNICODE) 706 if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
707 /* unicode string area must be word-aligned */
708 if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
709 ++bcc_ptr;
710 --bytes_remaining;
711 }
706 rc = decode_unicode_ssetup(&bcc_ptr, bytes_remaining, 712 rc = decode_unicode_ssetup(&bcc_ptr, bytes_remaining,
707 ses, nls_cp); 713 ses, nls_cp);
708 else 714 } else {
709 rc = decode_ascii_ssetup(&bcc_ptr, bytes_remaining, 715 rc = decode_ascii_ssetup(&bcc_ptr, bytes_remaining,
710 ses, nls_cp); 716 ses, nls_cp);
717 }
711 718
712 ssetup_exit: 719 ssetup_exit:
713 if (spnego_key) { 720 if (spnego_key) {
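
The fs/cifs/sess.c changes split the old unconditional data++ skip into two explicit steps: CIFS_SessSetup() aligns the string area to an even offset before calling the decoder, and decode_unicode_ssetup() pads an odd-length remainder with a NUL byte instead of discarding the trailing byte. A sketch of why the alignment test is relative rather than absolute (variable names as in the hunk; this is illustration, not tree code):

/* the buffer may sit at any kmalloc alignment, so "even address" and
 * "even offset within the SMB" are different questions; the protocol
 * pads strings to an even offset from the header */
unsigned long off = (unsigned long)bcc_ptr - (unsigned long)smb_buf;

if (off % 2) {
	++bcc_ptr;		/* consume the server's pad byte */
	--bytes_remaining;
}
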
diff --git a/fs/compat.c b/fs/compat.c
index 3f84d5f15889..681ed81e6be0 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -181,22 +181,24 @@ asmlinkage long compat_sys_newstat(char __user * filename,
181 struct compat_stat __user *statbuf) 181 struct compat_stat __user *statbuf)
182{ 182{
183 struct kstat stat; 183 struct kstat stat;
184 int error = vfs_stat_fd(AT_FDCWD, filename, &stat); 184 int error;
185 185
186 if (!error) 186 error = vfs_stat(filename, &stat);
187 error = cp_compat_stat(&stat, statbuf); 187 if (error)
188 return error; 188 return error;
189 return cp_compat_stat(&stat, statbuf);
189} 190}
190 191
191 asmlinkage long compat_sys_newlstat(char __user * filename, 192 asmlinkage long compat_sys_newlstat(char __user * filename,
192 struct compat_stat __user *statbuf) 193 struct compat_stat __user *statbuf)
193{ 194{
194 struct kstat stat; 195 struct kstat stat;
195 int error = vfs_lstat_fd(AT_FDCWD, filename, &stat); 196 int error;
196 197
197 if (!error) 198 error = vfs_lstat(filename, &stat);
198 error = cp_compat_stat(&stat, statbuf); 199 if (error)
199 return error; 200 return error;
201 return cp_compat_stat(&stat, statbuf);
200} 202}
201 203
202 #ifndef __ARCH_WANT_STAT64 204 #ifndef __ARCH_WANT_STAT64
@@ -204,21 +206,12 @@ asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user *filename,
204 struct compat_stat __user *statbuf, int flag) 206 struct compat_stat __user *statbuf, int flag)
205{ 207{
206 struct kstat stat; 208 struct kstat stat;
207 int error = -EINVAL; 209 int error;
208
209 if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
210 goto out;
211
212 if (flag & AT_SYMLINK_NOFOLLOW)
213 error = vfs_lstat_fd(dfd, filename, &stat);
214 else
215 error = vfs_stat_fd(dfd, filename, &stat);
216
217 if (!error)
218 error = cp_compat_stat(&stat, statbuf);
219 210
220 out: 211 error = vfs_fstatat(dfd, filename, &stat, flag);
221 return error; 212 if (error)
213 return error;
214 return cp_compat_stat(&stat, statbuf);
222} 215}
223 #endif 216 #endif
224 217
@@ -1483,6 +1476,7 @@ int compat_do_execve(char * filename,
1483 struct linux_binprm *bprm; 1476 struct linux_binprm *bprm;
1484 struct file *file; 1477 struct file *file;
1485 struct files_struct *displaced; 1478 struct files_struct *displaced;
1479 bool clear_in_exec;
1486 int retval; 1480 int retval;
1487 1481
1488 retval = unshare_files(&displaced); 1482 retval = unshare_files(&displaced);
@@ -1505,8 +1499,9 @@ int compat_do_execve(char * filename,
1505 goto out_unlock; 1499 goto out_unlock;
1506 1500
1507 retval = check_unsafe_exec(bprm); 1501 retval = check_unsafe_exec(bprm);
1508 if (retval) 1502 if (retval < 0)
1509 goto out_unlock; 1503 goto out_unlock;
1504 clear_in_exec = retval;
1510 1505
1511 file = open_exec(filename); 1506 file = open_exec(filename);
1512 retval = PTR_ERR(file); 1507 retval = PTR_ERR(file);
@@ -1553,9 +1548,7 @@ int compat_do_execve(char * filename,
1553 goto out; 1548 goto out;
1554 1549
1555 /* execve succeeded */ 1550 /* execve succeeded */
1556 write_lock(&current->fs->lock);
1557 current->fs->in_exec = 0; 1551 current->fs->in_exec = 0;
1558 write_unlock(&current->fs->lock);
1559 current->in_execve = 0; 1552 current->in_execve = 0;
1560 mutex_unlock(&current->cred_exec_mutex); 1553 mutex_unlock(&current->cred_exec_mutex);
1561 acct_update_integrals(current); 1554 acct_update_integrals(current);
@@ -1575,9 +1568,8 @@ out_file:
1575 } 1568 }
1576 1569
1577 out_unmark: 1570 out_unmark:
1578 write_lock(&current->fs->lock); 1571 if (clear_in_exec)
1579 current->fs->in_exec = 0; 1572 current->fs->in_exec = 0;
1580 write_unlock(&current->fs->lock);
1581 1573
1582 out_unlock: 1574 out_unlock:
1583 current->in_execve = 0; 1575 current->in_execve = 0;
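
The rewritten compat stat entry points delegate the AT_* flag handling to vfs_fstatat(), whose definition is not in this diff. Judging purely from the code deleted here, it is presumably equivalent to the following sketch (the real helper lives in fs/stat.c and may differ in detail):

int vfs_fstatat(int dfd, char __user *filename, struct kstat *stat,
		int flag)
{
	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
		return -EINVAL;

	if (flag & AT_SYMLINK_NOFOLLOW)
		return vfs_lstat_fd(dfd, filename, stat);
	return vfs_stat_fd(dfd, filename, stat);
}

With the flag check centralized, each compat syscall shrinks to "fill a kstat, then cp_compat_stat() it", which is what the three rewritten bodies above show.
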
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 3e87ce443ea2..b83f6bcfa51a 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -58,7 +58,6 @@
58 #include <linux/i2c.h> 58 #include <linux/i2c.h>
59 #include <linux/i2c-dev.h> 59 #include <linux/i2c-dev.h>
60 #include <linux/atalk.h> 60 #include <linux/atalk.h>
61 #include <linux/loop.h>
62 61
63 #include <net/bluetooth/bluetooth.h> 62 #include <net/bluetooth/bluetooth.h>
64 #include <net/bluetooth/hci.h> 63 #include <net/bluetooth/hci.h>
@@ -68,6 +67,7 @@
68 #include <linux/gigaset_dev.h> 67 #include <linux/gigaset_dev.h>
69 68
70 #ifdef CONFIG_BLOCK 69 #ifdef CONFIG_BLOCK
70 #include <linux/loop.h>
71 #include <scsi/scsi.h> 71 #include <scsi/scsi.h>
72 #include <scsi/scsi_ioctl.h> 72 #include <scsi/scsi_ioctl.h>
73 #include <scsi/sg.h> 73 #include <scsi/sg.h>
@@ -2660,6 +2660,8 @@ HANDLE_IOCTL(SONET_GETFRAMING, do_atm_ioctl)
2660 HANDLE_IOCTL(SONET_GETFRSENSE, do_atm_ioctl) 2660 HANDLE_IOCTL(SONET_GETFRSENSE, do_atm_ioctl)
2661 /* block stuff */ 2661 /* block stuff */
2662 #ifdef CONFIG_BLOCK 2662 #ifdef CONFIG_BLOCK
2663 /* loop */
2664 IGNORE_IOCTL(LOOP_CLR_FD)
2663 /* Raw devices */ 2665 /* Raw devices */
2664 HANDLE_IOCTL(RAW_SETBIND, raw_ioctl) 2666 HANDLE_IOCTL(RAW_SETBIND, raw_ioctl)
2665 HANDLE_IOCTL(RAW_GETBIND, raw_ioctl) 2667 HANDLE_IOCTL(RAW_GETBIND, raw_ioctl)
@@ -2728,9 +2730,6 @@ HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans)
2728 IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32) 2730 IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32)
2729 IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32) 2731 IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32)
2730 2732
2731 /* loop */
2732 IGNORE_IOCTL(LOOP_CLR_FD)
2733
2734 #ifdef CONFIG_SPARC 2733 #ifdef CONFIG_SPARC
2735 /* Sparc framebuffers, handled in sbusfb_compat_ioctl() */ 2734 /* Sparc framebuffers, handled in sbusfb_compat_ioctl() */
2736 IGNORE_IOCTL(FBIOGTYPE) 2735 IGNORE_IOCTL(FBIOGTYPE)
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 932a92b31483..c8afa6b1d91d 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -135,7 +135,7 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
135 struct path path; 135 struct path path;
136 struct configfs_dirent *sd; 136 struct configfs_dirent *sd;
137 struct config_item *parent_item; 137 struct config_item *parent_item;
138 struct config_item *target_item; 138 struct config_item *target_item = NULL;
139 struct config_item_type *type; 139 struct config_item_type *type;
140 140
141 ret = -EPERM; /* What lack-of-symlink returns */ 141 ret = -EPERM; /* What lack-of-symlink returns */
diff --git a/fs/dcache.c b/fs/dcache.c
index 761d30be2683..1fcffebfb44f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2149,7 +2149,6 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
2149 int result; 2149 int result;
2150 unsigned long seq; 2150 unsigned long seq;
2151 2151
2152 /* FIXME: This is old behavior, needed? Please check callers. */
2153 if (new_dentry == old_dentry) 2152 if (new_dentry == old_dentry)
2154 return 1; 2153 return 1;
2155 2154
diff --git a/fs/direct-io.c b/fs/direct-io.c
index da258e7249cc..05763bbc2050 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -307,8 +307,6 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
307 struct bio *bio; 307 struct bio *bio;
308 308
309 bio = bio_alloc(GFP_KERNEL, nr_vecs); 309 bio = bio_alloc(GFP_KERNEL, nr_vecs);
310 if (bio == NULL)
311 return -ENOMEM;
312 310
313 bio->bi_bdev = bdev; 311 bio->bi_bdev = bdev;
314 bio->bi_sector = first_sector; 312 bio->bi_sector = first_sector;
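
This hunk, like the fs/ext4/extents.c one further down, deletes a NULL check after bio_alloc(). The check was dead code: bio_alloc() allocates from a dedicated mempool, and when the gfp mask allows the caller to sleep (GFP_KERNEL here, GFP_NOIO in ext4) the mempool waits for a bio to be returned rather than failing. In short:

/* may sleep under memory pressure, but with a sleeping gfp mask and
 * a mempool behind it, bio_alloc() never returns NULL */
struct bio *bio = bio_alloc(GFP_KERNEL, nr_vecs);

bio->bi_bdev = bdev;	/* safe to dereference immediately */
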
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 8b65f289ee00..b91851f1cda3 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -483,15 +483,7 @@ int ecryptfs_encrypt_page(struct page *page)
483 ecryptfs_inode = page->mapping->host; 483 ecryptfs_inode = page->mapping->host;
484 crypt_stat = 484 crypt_stat =
485 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); 485 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
486 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { 486 BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
487 rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page,
488 0, PAGE_CACHE_SIZE);
489 if (rc)
490 printk(KERN_ERR "%s: Error attempting to copy "
491 "page at index [%ld]\n", __func__,
492 page->index);
493 goto out;
494 }
495 enc_extent_page = alloc_page(GFP_USER); 487 enc_extent_page = alloc_page(GFP_USER);
496 if (!enc_extent_page) { 488 if (!enc_extent_page) {
497 rc = -ENOMEM; 489 rc = -ENOMEM;
@@ -620,16 +612,7 @@ int ecryptfs_decrypt_page(struct page *page)
620 ecryptfs_inode = page->mapping->host; 612 ecryptfs_inode = page->mapping->host;
621 crypt_stat = 613 crypt_stat =
622 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); 614 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
623 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { 615 BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
624 rc = ecryptfs_read_lower_page_segment(page, page->index, 0,
625 PAGE_CACHE_SIZE,
626 ecryptfs_inode);
627 if (rc)
628 printk(KERN_ERR "%s: Error attempting to copy "
629 "page at index [%ld]\n", __func__,
630 page->index);
631 goto out;
632 }
633 enc_extent_page = alloc_page(GFP_USER); 616 enc_extent_page = alloc_page(GFP_USER);
634 if (!enc_extent_page) { 617 if (!enc_extent_page) {
635 rc = -ENOMEM; 618 rc = -ENOMEM;
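
The BUG_ON() substitutions above encode an invariant rather than dropping a feature: elsewhere in this series (see the fs/ecryptfs/mmap.c and read_write.c hunks below) callers start checking ECRYPTFS_ENCRYPTED themselves and route plaintext inodes directly to the lower filesystem, so reaching the page crypto routines with the flag clear becomes a programming error. The caller-side dispatch, as a sketch:

if (crypt_stat->flags & ECRYPTFS_ENCRYPTED)
	/* crypto path */
	rc = ecryptfs_encrypt_page(page);
else
	/* plaintext passthrough to the lower file */
	rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page,
					       0, PAGE_CACHE_SIZE);
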
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 064c5820e4e5..00b30a2d5466 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -269,6 +269,7 @@ struct ecryptfs_crypt_stat {
269 #define ECRYPTFS_ENCRYPT_FILENAMES 0x00000800 269 #define ECRYPTFS_ENCRYPT_FILENAMES 0x00000800
270 #define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000 270 #define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000
271 #define ECRYPTFS_ENCFN_USE_FEK 0x00002000 271 #define ECRYPTFS_ENCFN_USE_FEK 0x00002000
272 #define ECRYPTFS_UNLINK_SIGS 0x00004000
272 u32 flags; 273 u32 flags;
273 unsigned int file_version; 274 unsigned int file_version;
274 size_t iv_bytes; 275 size_t iv_bytes;
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 55b3145b8072..2f0945d63297 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -379,9 +379,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
379 goto out_d_drop; 379 goto out_d_drop;
380 } 380 }
381 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); 381 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
382 mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
382 lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, 383 lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
383 lower_dir_dentry, 384 lower_dir_dentry,
384 ecryptfs_dentry->d_name.len); 385 ecryptfs_dentry->d_name.len);
386 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
385 if (IS_ERR(lower_dentry)) { 387 if (IS_ERR(lower_dentry)) {
386 rc = PTR_ERR(lower_dentry); 388 rc = PTR_ERR(lower_dentry);
387 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " 389 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
@@ -406,9 +408,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
406 "filename; rc = [%d]\n", __func__, rc); 408 "filename; rc = [%d]\n", __func__, rc);
407 goto out_d_drop; 409 goto out_d_drop;
408 } 410 }
411 mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
409 lower_dentry = lookup_one_len(encrypted_and_encoded_name, 412 lower_dentry = lookup_one_len(encrypted_and_encoded_name,
410 lower_dir_dentry, 413 lower_dir_dentry,
411 encrypted_and_encoded_name_size - 1); 414 encrypted_and_encoded_name_size - 1);
415 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
412 if (IS_ERR(lower_dentry)) { 416 if (IS_ERR(lower_dentry)) {
413 rc = PTR_ERR(lower_dentry); 417 rc = PTR_ERR(lower_dentry);
414 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " 418 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
@@ -636,8 +640,9 @@ static int
636 ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) 640 ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
637{ 641{
638 char *lower_buf; 642 char *lower_buf;
643 size_t lower_bufsiz;
639 struct dentry *lower_dentry; 644 struct dentry *lower_dentry;
640 struct ecryptfs_crypt_stat *crypt_stat; 645 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
641 char *plaintext_name; 646 char *plaintext_name;
642 size_t plaintext_name_size; 647 size_t plaintext_name_size;
643 mm_segment_t old_fs; 648 mm_segment_t old_fs;
@@ -648,12 +653,21 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
648 rc = -EINVAL; 653 rc = -EINVAL;
649 goto out; 654 goto out;
650 } 655 }
651 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; 656 mount_crypt_stat = &ecryptfs_superblock_to_private(
657 dentry->d_sb)->mount_crypt_stat;
658 /*
659 * If the lower filename is encrypted, it will result in a significantly
660 * longer name. If needed, truncate the name after decode and decrypt.
661 */
662 if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
663 lower_bufsiz = PATH_MAX;
664 else
665 lower_bufsiz = bufsiz;
652 /* Released in this function */ 666 /* Released in this function */
653 lower_buf = kmalloc(bufsiz, GFP_KERNEL); 667 lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL);
654 if (lower_buf == NULL) { 668 if (lower_buf == NULL) {
655 printk(KERN_ERR "%s: Out of memory whilst attempting to " 669 printk(KERN_ERR "%s: Out of memory whilst attempting to "
656 "kmalloc [%d] bytes\n", __func__, bufsiz); 670 "kmalloc [%zd] bytes\n", __func__, lower_bufsiz);
657 rc = -ENOMEM; 671 rc = -ENOMEM;
658 goto out; 672 goto out;
659 } 673 }
@@ -661,7 +675,7 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
661 set_fs(get_ds()); 675 set_fs(get_ds());
662 rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, 676 rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
663 (char __user *)lower_buf, 677 (char __user *)lower_buf,
664 bufsiz); 678 lower_bufsiz);
665 set_fs(old_fs); 679 set_fs(old_fs);
666 if (rc >= 0) { 680 if (rc >= 0) {
667 rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name, 681 rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name,
@@ -674,7 +688,9 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
674 rc); 688 rc);
675 goto out_free_lower_buf; 689 goto out_free_lower_buf;
676 } 690 }
677 rc = copy_to_user(buf, plaintext_name, plaintext_name_size); 691 /* Check for bufsiz <= 0 done in sys_readlinkat() */
692 rc = copy_to_user(buf, plaintext_name,
693 min((size_t) bufsiz, plaintext_name_size));
678 if (rc) 694 if (rc)
679 rc = -EFAULT; 695 rc = -EFAULT;
680 else 696 else
@@ -814,6 +830,13 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
814 size_t num_zeros = (PAGE_CACHE_SIZE 830 size_t num_zeros = (PAGE_CACHE_SIZE
815 - (new_length & ~PAGE_CACHE_MASK)); 831 - (new_length & ~PAGE_CACHE_MASK));
816 832
833 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
834 rc = vmtruncate(inode, new_length);
835 if (rc)
836 goto out_free;
837 rc = vmtruncate(lower_dentry->d_inode, new_length);
838 goto out_free;
839 }
817 if (num_zeros) { 840 if (num_zeros) {
818 char *zeros_virt; 841 char *zeros_virt;
819 842
@@ -915,8 +938,6 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
915 } 938 }
916 rc = 0; 939 rc = 0;
917 crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); 940 crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
918 mutex_unlock(&crypt_stat->cs_mutex);
919 goto out;
920 } 941 }
921 } 942 }
922 mutex_unlock(&crypt_stat->cs_mutex); 943 mutex_unlock(&crypt_stat->cs_mutex);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index aed56c25539b..ccabd5faa04d 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -190,14 +190,14 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
190 init_special_inode(inode, lower_inode->i_mode, 190 init_special_inode(inode, lower_inode->i_mode,
191 lower_inode->i_rdev); 191 lower_inode->i_rdev);
192 dentry->d_op = &ecryptfs_dops; 192 dentry->d_op = &ecryptfs_dops;
193 if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
194 d_add(dentry, inode);
195 else
196 d_instantiate(dentry, inode);
197 fsstack_copy_attr_all(inode, lower_inode, NULL); 193 fsstack_copy_attr_all(inode, lower_inode, NULL);
198 /* This size will be overwritten for real files w/ headers and 194 /* This size will be overwritten for real files w/ headers and
199 * other metadata */ 195 * other metadata */
200 fsstack_copy_inode_size(inode, lower_inode); 196 fsstack_copy_inode_size(inode, lower_inode);
197 if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
198 d_add(dentry, inode);
199 else
200 d_instantiate(dentry, inode);
201 out: 201 out:
202 return rc; 202 return rc;
203} 203}
@@ -208,7 +208,7 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
208 ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, 208 ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
209 ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, 209 ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig,
210 ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, 210 ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes,
211 ecryptfs_opt_err }; 211 ecryptfs_opt_unlink_sigs, ecryptfs_opt_err };
212 212
213static const match_table_t tokens = { 213static const match_table_t tokens = {
214 {ecryptfs_opt_sig, "sig=%s"}, 214 {ecryptfs_opt_sig, "sig=%s"},
@@ -222,6 +222,7 @@ static const match_table_t tokens = {
222 {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"}, 222 {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"},
223 {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"}, 223 {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"},
224 {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, 224 {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"},
225 {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"},
225 {ecryptfs_opt_err, NULL} 226 {ecryptfs_opt_err, NULL}
226}; 227};
227 228
@@ -402,6 +403,9 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
402 fn_cipher_key_bytes; 403 fn_cipher_key_bytes;
403 fn_cipher_key_bytes_set = 1; 404 fn_cipher_key_bytes_set = 1;
404 break; 405 break;
406 case ecryptfs_opt_unlink_sigs:
407 mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS;
408 break;
405 case ecryptfs_opt_err: 409 case ecryptfs_opt_err:
406 default: 410 default:
407 printk(KERN_WARNING 411 printk(KERN_WARNING
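
The reordering in ecryptfs_interpose() is a race fix, not a cleanup: d_add() and d_instantiate() make the inode visible to concurrent path walks, so the attribute and size copies have to happen first. Schematically, the window the old order left open (not tree code):

d_add(dentry, inode);		/* inode becomes reachable here */
/* a racing lookup can observe stale attributes and a zero size */
fsstack_copy_attr_all(inode, lower_inode, NULL);
fsstack_copy_inode_size(inode, lower_inode);

The same file also registers the new ecryptfs_unlink_sigs mount option, whose effect shows up in the fs/ecryptfs/super.c show_options hunk below.
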
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 295e7fa56755..f1c17e87c5fb 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -133,45 +133,6 @@ out:
133 return rc; 133 return rc;
134} 134}
135 135
136 static int
137 ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type,
138 struct ecryptfs_msg_ctx **msg_ctx);
139
140 /**
141 * ecryptfs_send_raw_message
142 * @msg_type: Message type
143 * @daemon: Daemon struct for recipient of message
144 *
145 * A raw message is one that does not include an ecryptfs_message
146 * struct. It simply has a type.
147 *
148 * Must be called with ecryptfs_daemon_hash_mux held.
149 *
150 * Returns zero on success; non-zero otherwise
151 */
152 static int ecryptfs_send_raw_message(u8 msg_type,
153 struct ecryptfs_daemon *daemon)
154 {
155 struct ecryptfs_msg_ctx *msg_ctx;
156 int rc;
157
158 rc = ecryptfs_send_message_locked(NULL, 0, msg_type, &msg_ctx);
159 if (rc) {
160 printk(KERN_ERR "%s: Error whilst attempting to send "
161 "message to ecryptfsd; rc = [%d]\n", __func__, rc);
162 goto out;
163 }
164 /* Raw messages are logically context-free (e.g., no
165 * reply is expected), so we set the state of the
166 * ecryptfs_msg_ctx object to indicate that it should
167 * be freed as soon as the message is sent. */
168 mutex_lock(&msg_ctx->mux);
169 msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_NO_REPLY;
170 mutex_unlock(&msg_ctx->mux);
171 out:
172 return rc;
173 }
174
175 /** 136 /**
176 * ecryptfs_spawn_daemon - Create and initialize a new daemon struct 137 * ecryptfs_spawn_daemon - Create and initialize a new daemon struct
177 * @daemon: Pointer to set to newly allocated daemon struct 138 * @daemon: Pointer to set to newly allocated daemon struct
@@ -212,49 +173,6 @@ out:
212 } 173 }
213 174
214 /** 175 /**
215 * ecryptfs_process_helo
216 * @euid: The user ID owner of the message
217 * @user_ns: The namespace in which @euid applies
218 * @pid: The process ID for the userspace program that sent the
219 * message
220 *
221 * Adds the euid and pid values to the daemon euid hash. If an euid
222 * already has a daemon pid registered, the daemon will be
223 * unregistered before the new daemon is put into the hash list.
224 * Returns zero after adding a new daemon to the hash list;
225 * non-zero otherwise.
226 */
227 int ecryptfs_process_helo(uid_t euid, struct user_namespace *user_ns,
228 struct pid *pid)
229 {
230 struct ecryptfs_daemon *new_daemon;
231 struct ecryptfs_daemon *old_daemon;
232 int rc;
233
234 mutex_lock(&ecryptfs_daemon_hash_mux);
235 rc = ecryptfs_find_daemon_by_euid(&old_daemon, euid, user_ns);
236 if (rc != 0) {
237 printk(KERN_WARNING "Received request from user [%d] "
238 "to register daemon [0x%p]; unregistering daemon "
239 "[0x%p]\n", euid, pid, old_daemon->pid);
240 rc = ecryptfs_send_raw_message(ECRYPTFS_MSG_QUIT, old_daemon);
241 if (rc)
242 printk(KERN_WARNING "Failed to send QUIT "
243 "message to daemon [0x%p]; rc = [%d]\n",
244 old_daemon->pid, rc);
245 hlist_del(&old_daemon->euid_chain);
246 kfree(old_daemon);
247 }
248 rc = ecryptfs_spawn_daemon(&new_daemon, euid, user_ns, pid);
249 if (rc)
250 printk(KERN_ERR "%s: The gods are displeased with this attempt "
251 "to create a new daemon object for euid [%d]; pid "
252 "[0x%p]; rc = [%d]\n", __func__, euid, pid, rc);
253 mutex_unlock(&ecryptfs_daemon_hash_mux);
254 return rc;
255 }
256
257 /**
258 * ecryptfs_exorcise_daemon - Destroy the daemon struct 176 * ecryptfs_exorcise_daemon - Destroy the daemon struct
259 * 177 *
260 * Must be called ceremoniously while in possession of 178 * Must be called ceremoniously while in possession of
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index a67fea655f49..4ec8f61ccf5a 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -193,26 +193,20 @@ int ecryptfs_send_miscdev(char *data, size_t data_size,
193 int rc = 0; 193 int rc = 0;
194 194
195 mutex_lock(&msg_ctx->mux); 195 mutex_lock(&msg_ctx->mux);
196 if (data) { 196 msg_ctx->msg = kmalloc((sizeof(*msg_ctx->msg) + data_size),
197 msg_ctx->msg = kmalloc((sizeof(*msg_ctx->msg) + data_size), 197 GFP_KERNEL);
198 GFP_KERNEL); 198 if (!msg_ctx->msg) {
199 if (!msg_ctx->msg) { 199 rc = -ENOMEM;
200 rc = -ENOMEM; 200 printk(KERN_ERR "%s: Out of memory whilst attempting "
201 printk(KERN_ERR "%s: Out of memory whilst attempting " 201 "to kmalloc(%zd, GFP_KERNEL)\n", __func__,
202 "to kmalloc(%zd, GFP_KERNEL)\n", __func__, 202 (sizeof(*msg_ctx->msg) + data_size));
203 (sizeof(*msg_ctx->msg) + data_size)); 203 goto out_unlock;
204 goto out_unlock; 204 }
205 }
206 } else
207 msg_ctx->msg = NULL;
208 msg_ctx->msg->index = msg_ctx->index; 205 msg_ctx->msg->index = msg_ctx->index;
209 msg_ctx->msg->data_len = data_size; 206 msg_ctx->msg->data_len = data_size;
210 msg_ctx->type = msg_type; 207 msg_ctx->type = msg_type;
211 if (data) { 208 memcpy(msg_ctx->msg->data, data, data_size);
212 memcpy(msg_ctx->msg->data, data, data_size); 209 msg_ctx->msg_size = (sizeof(*msg_ctx->msg) + data_size);
213 msg_ctx->msg_size = (sizeof(*msg_ctx->msg) + data_size);
214 } else
215 msg_ctx->msg_size = 0;
216 mutex_lock(&daemon->mux); 210 mutex_lock(&daemon->mux);
217 list_add_tail(&msg_ctx->daemon_out_list, &daemon->msg_ctx_out_queue); 211 list_add_tail(&msg_ctx->daemon_out_list, &daemon->msg_ctx_out_queue);
218 daemon->num_queued_msg_ctx++; 212 daemon->num_queued_msg_ctx++;
@@ -418,18 +412,13 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
418 412
419 if (count == 0) 413 if (count == 0)
420 goto out; 414 goto out;
421 data = kmalloc(count, GFP_KERNEL); 415
422 if (!data) { 416 data = memdup_user(buf, count);
423 printk(KERN_ERR "%s: Out of memory whilst attempting to " 417 if (IS_ERR(data)) {
424 "kmalloc([%zd], GFP_KERNEL)\n", __func__, count); 418 printk(KERN_ERR "%s: memdup_user returned error [%ld]\n",
419 __func__, PTR_ERR(data));
425 goto out; 420 goto out;
426 } 421 }
427 rc = copy_from_user(data, buf, count);
428 if (rc) {
429 printk(KERN_ERR "%s: copy_from_user returned error [%d]\n",
430 __func__, rc);
431 goto out_free;
432 }
433 sz = count; 422 sz = count;
434 i = 0; 423 i = 0;
435 switch (data[i++]) { 424 switch (data[i++]) {
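
memdup_user() replaces the kmalloc() + copy_from_user() pair that ecryptfs_miscdev_write() used to open-code. For reference, the helper behaves roughly like this sketch of its mm/util.c semantics (not its exact text):

void *memdup_user(const void __user *src, size_t len)
{
	void *p = kmalloc(len, GFP_KERNEL);

	if (!p)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kfree(p);
		return ERR_PTR(-EFAULT);
	}

	return p;
}

The error convention changes with it: callers must test IS_ERR() instead of NULL, which is why the hunk above reports PTR_ERR(data) rather than a copy_from_user() return value.
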
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 46cec2b69796..5c6bab9786e3 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -449,6 +449,7 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode)
449 struct ecryptfs_crypt_stat *crypt_stat; 449 struct ecryptfs_crypt_stat *crypt_stat;
450 450
451 crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; 451 crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
452 BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
452 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) 453 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
453 return ecryptfs_write_inode_size_to_xattr(ecryptfs_inode); 454 return ecryptfs_write_inode_size_to_xattr(ecryptfs_inode);
454 else 455 else
@@ -490,6 +491,16 @@ static int ecryptfs_write_end(struct file *file,
490 ecryptfs_printk(KERN_DEBUG, "Not a new file\n"); 491 ecryptfs_printk(KERN_DEBUG, "Not a new file\n");
491 ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" 492 ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
492 "(page w/ index = [0x%.16x], to = [%d])\n", index, to); 493 "(page w/ index = [0x%.16x], to = [%d])\n", index, to);
494 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
495 rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0,
496 to);
497 if (!rc) {
498 rc = copied;
499 fsstack_copy_inode_size(ecryptfs_inode,
500 ecryptfs_inode_to_lower(ecryptfs_inode));
501 }
502 goto out;
503 }
493 /* Fills in zeros if 'to' goes beyond inode size */ 504 /* Fills in zeros if 'to' goes beyond inode size */
494 rc = fill_zeros_to_end_of_page(page, to); 505 rc = fill_zeros_to_end_of_page(page, to);
495 if (rc) { 506 if (rc) {
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 75c2ea9fee35..a137c6ea2fee 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -117,13 +117,15 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
117 size_t size) 117 size_t size)
118{ 118{
119 struct page *ecryptfs_page; 119 struct page *ecryptfs_page;
120 struct ecryptfs_crypt_stat *crypt_stat;
121 struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
120 char *ecryptfs_page_virt; 122 char *ecryptfs_page_virt;
121 loff_t ecryptfs_file_size = 123 loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
122 i_size_read(ecryptfs_file->f_dentry->d_inode);
123 loff_t data_offset = 0; 124 loff_t data_offset = 0;
124 loff_t pos; 125 loff_t pos;
125 int rc = 0; 126 int rc = 0;
126 127
128 crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
127 /* 129 /*
128 * if we are writing beyond current size, then start pos 130 * if we are writing beyond current size, then start pos
129 * at the current size - we'll fill in zeros from there. 131 * at the current size - we'll fill in zeros from there.
@@ -184,7 +186,13 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
184 flush_dcache_page(ecryptfs_page); 186 flush_dcache_page(ecryptfs_page);
185 SetPageUptodate(ecryptfs_page); 187 SetPageUptodate(ecryptfs_page);
186 unlock_page(ecryptfs_page); 188 unlock_page(ecryptfs_page);
187 rc = ecryptfs_encrypt_page(ecryptfs_page); 189 if (crypt_stat->flags & ECRYPTFS_ENCRYPTED)
190 rc = ecryptfs_encrypt_page(ecryptfs_page);
191 else
192 rc = ecryptfs_write_lower_page_segment(ecryptfs_inode,
193 ecryptfs_page,
194 start_offset_in_page,
195 data_offset);
188 page_cache_release(ecryptfs_page); 196 page_cache_release(ecryptfs_page);
189 if (rc) { 197 if (rc) {
190 printk(KERN_ERR "%s: Error encrypting " 198 printk(KERN_ERR "%s: Error encrypting "
@@ -194,14 +202,16 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
194 pos += num_bytes; 202 pos += num_bytes;
195 } 203 }
196 if ((offset + size) > ecryptfs_file_size) { 204 if ((offset + size) > ecryptfs_file_size) {
197 i_size_write(ecryptfs_file->f_dentry->d_inode, (offset + size)); 205 i_size_write(ecryptfs_inode, (offset + size));
198 rc = ecryptfs_write_inode_size_to_metadata( 206 if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) {
199 ecryptfs_file->f_dentry->d_inode); 207 rc = ecryptfs_write_inode_size_to_metadata(
200 if (rc) { 208 ecryptfs_inode);
201 printk(KERN_ERR "Problem with " 209 if (rc) {
202 "ecryptfs_write_inode_size_to_metadata; " 210 printk(KERN_ERR "Problem with "
203 "rc = [%d]\n", rc); 211 "ecryptfs_write_inode_size_to_metadata; "
204 goto out; 212 "rc = [%d]\n", rc);
213 goto out;
214 }
205 } 215 }
206 } 216 }
207 out: 217 out:
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index c27ac2b358a1..fa4c7e7d15d9 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -170,7 +170,10 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt)
170 list_for_each_entry(walker, 170 list_for_each_entry(walker,
171 &mount_crypt_stat->global_auth_tok_list, 171 &mount_crypt_stat->global_auth_tok_list,
172 mount_crypt_stat_list) { 172 mount_crypt_stat_list) {
173 seq_printf(m, ",ecryptfs_sig=%s", walker->sig); 173 if (walker->flags & ECRYPTFS_AUTH_TOK_FNEK)
174 seq_printf(m, ",ecryptfs_fnek_sig=%s", walker->sig);
175 else
176 seq_printf(m, ",ecryptfs_sig=%s", walker->sig);
174 } 177 }
175 mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); 178 mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
176 179
@@ -186,6 +189,8 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt)
186 seq_printf(m, ",ecryptfs_xattr_metadata"); 189 seq_printf(m, ",ecryptfs_xattr_metadata");
187 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) 190 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
188 seq_printf(m, ",ecryptfs_encrypted_view"); 191 seq_printf(m, ",ecryptfs_encrypted_view");
192 if (mount_crypt_stat->flags & ECRYPTFS_UNLINK_SIGS)
193 seq_printf(m, ",ecryptfs_unlink_sigs");
189 194
190 return 0; 195 return 0;
191} 196}
diff --git a/fs/exec.c b/fs/exec.c
index 052a961e41aa..639177b0eeac 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -69,17 +69,18 @@ int suid_dumpable = 0;
69 static LIST_HEAD(formats); 69 static LIST_HEAD(formats);
70 static DEFINE_RWLOCK(binfmt_lock); 70 static DEFINE_RWLOCK(binfmt_lock);
71 71
72 int register_binfmt(struct linux_binfmt * fmt) 72 int __register_binfmt(struct linux_binfmt * fmt, int insert)
73{ 73{
74 if (!fmt) 74 if (!fmt)
75 return -EINVAL; 75 return -EINVAL;
76 write_lock(&binfmt_lock); 76 write_lock(&binfmt_lock);
77 list_add(&fmt->lh, &formats); 77 insert ? list_add(&fmt->lh, &formats) :
78 list_add_tail(&fmt->lh, &formats);
78 write_unlock(&binfmt_lock); 79 write_unlock(&binfmt_lock);
79 return 0; 80 return 0;
80 } 81 }
81 82
82 EXPORT_SYMBOL(register_binfmt); 83 EXPORT_SYMBOL(__register_binfmt);
83 84
84 void unregister_binfmt(struct linux_binfmt * fmt) 85 void unregister_binfmt(struct linux_binfmt * fmt)
85 { 86 {
@@ -1060,7 +1061,6 @@ EXPORT_SYMBOL(install_exec_creds);
1060 int check_unsafe_exec(struct linux_binprm *bprm) 1061 int check_unsafe_exec(struct linux_binprm *bprm)
1061 { 1062 {
1062 struct task_struct *p = current, *t; 1063 struct task_struct *p = current, *t;
1063 unsigned long flags;
1064 unsigned n_fs; 1064 unsigned n_fs;
1065 int res = 0; 1065 int res = 0;
1066 1066
@@ -1068,21 +1068,22 @@ int check_unsafe_exec(struct linux_binprm *bprm)
1068 1068
1069 n_fs = 1; 1069 n_fs = 1;
1070 write_lock(&p->fs->lock); 1070 write_lock(&p->fs->lock);
1071 lock_task_sighand(p, &flags); 1071 rcu_read_lock();
1072 for (t = next_thread(p); t != p; t = next_thread(t)) { 1072 for (t = next_thread(p); t != p; t = next_thread(t)) {
1073 if (t->fs == p->fs) 1073 if (t->fs == p->fs)
1074 n_fs++; 1074 n_fs++;
1075 } 1075 }
1076 rcu_read_unlock();
1076 1077
1077 if (p->fs->users > n_fs) { 1078 if (p->fs->users > n_fs) {
1078 bprm->unsafe |= LSM_UNSAFE_SHARE; 1079 bprm->unsafe |= LSM_UNSAFE_SHARE;
1079 } else { 1080 } else {
1080 if (p->fs->in_exec) 1081 res = -EAGAIN;
1081 res = -EAGAIN; 1082 if (!p->fs->in_exec) {
1082 p->fs->in_exec = 1; 1083 p->fs->in_exec = 1;
1084 res = 1;
1085 }
1083 } 1086 }
1084
1085 unlock_task_sighand(p, &flags);
1086 write_unlock(&p->fs->lock); 1087 write_unlock(&p->fs->lock);
1087 1088
1088 return res; 1089 return res;
@@ -1284,6 +1285,7 @@ int do_execve(char * filename,
1284 struct linux_binprm *bprm; 1285 struct linux_binprm *bprm;
1285 struct file *file; 1286 struct file *file;
1286 struct files_struct *displaced; 1287 struct files_struct *displaced;
1288 bool clear_in_exec;
1287 int retval; 1289 int retval;
1288 1290
1289 retval = unshare_files(&displaced); 1291 retval = unshare_files(&displaced);
@@ -1306,8 +1308,9 @@ int do_execve(char * filename,
1306 goto out_unlock; 1308 goto out_unlock;
1307 1309
1308 retval = check_unsafe_exec(bprm); 1310 retval = check_unsafe_exec(bprm);
1309 if (retval) 1311 if (retval < 0)
1310 goto out_unlock; 1312 goto out_unlock;
1313 clear_in_exec = retval;
1311 1314
1312 file = open_exec(filename); 1315 file = open_exec(filename);
1313 retval = PTR_ERR(file); 1316 retval = PTR_ERR(file);
@@ -1355,9 +1358,7 @@ int do_execve(char * filename,
1355 goto out; 1358 goto out;
1356 1359
1357 /* execve succeeded */ 1360 /* execve succeeded */
1358 write_lock(&current->fs->lock);
1359 current->fs->in_exec = 0; 1361 current->fs->in_exec = 0;
1360 write_unlock(&current->fs->lock);
1361 current->in_execve = 0; 1362 current->in_execve = 0;
1362 mutex_unlock(&current->cred_exec_mutex); 1363 mutex_unlock(&current->cred_exec_mutex);
1363 acct_update_integrals(current); 1364 acct_update_integrals(current);
@@ -1377,9 +1378,8 @@ out_file:
1377 } 1378 }
1378 1379
1379 out_unmark: 1380 out_unmark:
1380 write_lock(&current->fs->lock); 1381 if (clear_in_exec)
1381 current->fs->in_exec = 0; 1382 current->fs->in_exec = 0;
1382 write_unlock(&current->fs->lock);
1383 1383
1384 out_unlock: 1384 out_unlock:
1385 current->in_execve = 0; 1385 current->in_execve = 0;
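
Two fs/exec.c interfaces change shape in this diff. __register_binfmt() gains an insert flag; the old entry point presumably survives as a wrapper in include/linux/binfmts.h, roughly (a sketch, the header is not part of this diff):

static inline int register_binfmt(struct linux_binfmt *fmt)
{
	return __register_binfmt(fmt, 0);	/* append: tried last */
}

static inline int insert_binfmt(struct linux_binfmt *fmt)
{
	return __register_binfmt(fmt, 1);	/* prepend: tried first */
}

check_unsafe_exec() meanwhile becomes tri-state: negative on error, 1 if this caller set fs->in_exec, 0 if the flag was already held. The clear_in_exec bookkeeping in do_execve() and compat_do_execve() then guarantees that only the execve which claimed the flag clears it on failure, and the success path no longer needs the fs->lock round trip at all.
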
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index b43b95563663..acf678831103 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -590,9 +590,8 @@ static int ext2_get_blocks(struct inode *inode,
590 590
591 if (depth == 0) 591 if (depth == 0)
592 return (err); 592 return (err);
593 reread:
594 partial = ext2_get_branch(inode, depth, offsets, chain, &err);
595 593
594 partial = ext2_get_branch(inode, depth, offsets, chain, &err);
596 /* Simplest case - block found, no allocation needed */ 595 /* Simplest case - block found, no allocation needed */
597 if (!partial) { 596 if (!partial) {
598 first_block = le32_to_cpu(chain[depth - 1].key); 597 first_block = le32_to_cpu(chain[depth - 1].key);
@@ -602,15 +601,16 @@ reread:
602 while (count < maxblocks && count <= blocks_to_boundary) { 601 while (count < maxblocks && count <= blocks_to_boundary) {
603 ext2_fsblk_t blk; 602 ext2_fsblk_t blk;
604 603
605 if (!verify_chain(chain, partial)) { 604 if (!verify_chain(chain, chain + depth - 1)) {
606 /* 605 /*
607 * Indirect block might be removed by 606 * Indirect block might be removed by
608 * truncate while we were reading it. 607 * truncate while we were reading it.
609 * Handling of that case: forget what we've 608 * Handling of that case: forget what we've
610 * got now, go to reread. 609 * got now, go to reread.
611 */ 610 */
611 err = -EAGAIN;
612 count = 0; 612 count = 0;
613 goto changed; 613 break;
614 } 614 }
615 blk = le32_to_cpu(*(chain[depth-1].p + count)); 615 blk = le32_to_cpu(*(chain[depth-1].p + count));
616 if (blk == first_block + count) 616 if (blk == first_block + count)
@@ -618,7 +618,8 @@ reread:
618 else 618 else
619 break; 619 break;
620 } 620 }
621 goto got_it; 621 if (err != -EAGAIN)
622 goto got_it;
622 } 623 }
623 624
624 /* Next simple case - plain lookup or failed read of indirect block */ 625 /* Next simple case - plain lookup or failed read of indirect block */
@@ -626,6 +627,33 @@ reread:
626 goto cleanup; 627 goto cleanup;
627 628
628 mutex_lock(&ei->truncate_mutex); 629 mutex_lock(&ei->truncate_mutex);
630 /*
631 * If the indirect block is missing while we are reading
632 * the chain(ext3_get_branch() returns -EAGAIN err), or
633 * if the chain has been changed after we grab the semaphore,
634 * (either because another process truncated this branch, or
635 * another get_block allocated this branch) re-grab the chain to see if
636 * the request block has been allocated or not.
637 *
638 * Since we already block the truncate/other get_block
639 * at this point, we will have the current copy of the chain when we
640 * splice the branch into the tree.
641 */
642 if (err == -EAGAIN || !verify_chain(chain, partial)) {
643 while (partial > chain) {
644 brelse(partial->bh);
645 partial--;
646 }
647 partial = ext2_get_branch(inode, depth, offsets, chain, &err);
648 if (!partial) {
649 count++;
650 mutex_unlock(&ei->truncate_mutex);
651 if (err)
652 goto cleanup;
653 clear_buffer_new(bh_result);
654 goto got_it;
655 }
656 }
629 657
630 /* 658 /*
631 * Okay, we need to do block allocation. Lazily initialize the block 659 * Okay, we need to do block allocation. Lazily initialize the block
@@ -683,12 +711,6 @@ cleanup:
683 partial--; 711 partial--;
684 } 712 }
685 return err; 713 return err;
686 changed:
687 while (partial > chain) {
688 brelse(partial->bh);
689 partial--;
690 }
691 goto reread;
692} 714}
693 715
694 int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) 716 int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
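
For context on the ext2_get_blocks() rework: verify_chain() checks that every indirect block in the lookup chain still contains the key that was read from it, i.e. that no truncate ran underneath. The existing static helper is roughly (a sketch; it predates this diff and is unchanged by it):

static inline int verify_chain(Indirect *from, Indirect *to)
{
	while (from <= to && from->key == *from->p)
		from++;
	return (from > to);
}

The old code looped back to reread on any mismatch, with no bound on the number of retries. The new code records -EAGAIN and re-reads the chain exactly once, under truncate_mutex, where truncates and competing allocators are excluded, so the re-read chain is guaranteed to stay valid until the branch is spliced in.
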
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index f983225266dc..5c4afe652245 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1395,8 +1395,10 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,
1395 blk++; 1395 blk++;
1396 } 1396 }
1397 out: 1397 out:
1398 if (len == towrite) 1398 if (len == towrite) {
1399 mutex_unlock(&inode->i_mutex);
1399 return err; 1400 return err;
1401 }
1400 if (inode->i_size < off+len-towrite) 1402 if (inode->i_size < off+len-towrite)
1401 i_size_write(inode, off+len-towrite); 1403 i_size_write(inode, off+len-towrite);
1402 inode->i_version++; 1404 inode->i_version++;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 6132353dcf62..e40332158340 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -326,11 +326,14 @@ ext4_ext_max_entries(struct inode *inode, int depth)
326 326
327 static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) 327 static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
328{ 328{
329 ext4_fsblk_t block = ext_pblock(ext); 329 ext4_fsblk_t block = ext_pblock(ext), valid_block;
330 int len = ext4_ext_get_actual_len(ext); 330 int len = ext4_ext_get_actual_len(ext);
331 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; 331 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
332 if (unlikely(block < le32_to_cpu(es->s_first_data_block) || 332
333 ((block + len) > ext4_blocks_count(es)))) 333 valid_block = le32_to_cpu(es->s_first_data_block) +
334 EXT4_SB(inode->i_sb)->s_gdb_count;
335 if (unlikely(block <= valid_block ||
336 ((block + len) > ext4_blocks_count(es))))
334 return 0; 337 return 0;
335 else 338 else
336 return 1; 339 return 1;
@@ -339,10 +342,13 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
339 static int ext4_valid_extent_idx(struct inode *inode, 342 static int ext4_valid_extent_idx(struct inode *inode,
340 struct ext4_extent_idx *ext_idx) 343 struct ext4_extent_idx *ext_idx)
341{ 344{
342 ext4_fsblk_t block = idx_pblock(ext_idx); 345 ext4_fsblk_t block = idx_pblock(ext_idx), valid_block;
343 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; 346 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
344 if (unlikely(block < le32_to_cpu(es->s_first_data_block) || 347
345 (block >= ext4_blocks_count(es)))) 348 valid_block = le32_to_cpu(es->s_first_data_block) +
349 EXT4_SB(inode->i_sb)->s_gdb_count;
350 if (unlikely(block <= valid_block ||
351 (block >= ext4_blocks_count(es))))
346 return 0; 352 return 0;
347 else 353 else
348 return 1; 354 return 1;
@@ -2416,8 +2422,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2416 len = ee_len; 2422 len = ee_len;
2417 2423
2418 bio = bio_alloc(GFP_NOIO, len); 2424 bio = bio_alloc(GFP_NOIO, len);
2419 if (!bio)
2420 return -ENOMEM;
2421 bio->bi_sector = ee_pblock; 2425 bio->bi_sector = ee_pblock;
2422 bio->bi_bdev = inode->i_sb->s_bdev; 2426 bio->bi_bdev = inode->i_sb->s_bdev;
2423 2427
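
The tightened validators share one bound: the first s_first_data_block + s_gdb_count blocks hold the superblock and group descriptors, so no extent or extent index may point at or below them. Folding the two checks into one hypothetical helper (the diff itself keeps them inline):

static int ext4_valid_pblock(struct super_block *sb, ext4_fsblk_t block,
			     unsigned int len)
{
	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
	ext4_fsblk_t valid_block = le32_to_cpu(es->s_first_data_block) +
				   EXT4_SB(sb)->s_gdb_count;

	/* the metadata area at the front can never be file data, and
	 * nothing may extend past the end of the filesystem */
	if (block <= valid_block)
		return 0;
	return (block + len) <= ext4_blocks_count(es);
}

For an index entry, which names a single block, this corresponds to calling with len == 1.
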
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 47b84e8df568..f18e0a08a6b5 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -585,6 +585,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
585 fallback: 585 fallback:
586 ngroups = sbi->s_groups_count; 586 ngroups = sbi->s_groups_count;
587 avefreei = freei / ngroups; 587 avefreei = freei / ngroups;
588 fallback_retry:
588 parent_group = EXT4_I(parent)->i_block_group; 589 parent_group = EXT4_I(parent)->i_block_group;
589 for (i = 0; i < ngroups; i++) { 590 for (i = 0; i < ngroups; i++) {
590 grp = (parent_group + i) % ngroups; 591 grp = (parent_group + i) % ngroups;
@@ -602,7 +603,7 @@ fallback:
602 * filesystems the above test can fail to find any blockgroups 603 * filesystems the above test can fail to find any blockgroups
603 */ 604 */
604 avefreei = 0; 605 avefreei = 0;
605 goto fallback; 606 goto fallback_retry;
606 } 607 }
607 608
608 return -1; 609 return -1;
@@ -831,11 +832,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
831 ret2 = find_group_flex(sb, dir, &group); 832 ret2 = find_group_flex(sb, dir, &group);
832 if (ret2 == -1) { 833 if (ret2 == -1) {
833 ret2 = find_group_other(sb, dir, &group, mode); 834 ret2 = find_group_other(sb, dir, &group, mode);
834 if (ret2 == 0 && once) 835 if (ret2 == 0 && once) {
835 once = 0; 836 once = 0;
836 printk(KERN_NOTICE "ext4: find_group_flex " 837 printk(KERN_NOTICE "ext4: find_group_flex "
837 "failed, fallback succeeded dir %lu\n", 838 "failed, fallback succeeded dir %lu\n",
838 dir->i_ino); 839 dir->i_ino);
840 }
839 } 841 }
840 goto got_group; 842 goto got_group;
841 } 843 }
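The ext4_new_inode() hunk is a pure brace fix: without the braces only `once = 0` was governed by the condition, so the notice printed on every fallback instead of once. A compilable sketch of the intended print-once pattern (names are illustrative, not the kernel's):

#include <stdio.h>

static int once = 1;

static void note_fallback(unsigned long ino)
{
	if (once) {		/* both statements belong to the guard */
		once = 0;
		printf("find_group_flex failed, fallback succeeded dir %lu\n",
		       ino);
	}
}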
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c6bd6ced3bb7..e91f978c7f12 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4357,11 +4357,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
 	inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
 	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
-	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
-	    cpu_to_le32(EXT4_OS_HURD)) {
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
 		ei->i_file_acl |=
 			((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
-	}
 	inode->i_size = ext4_isize(raw_inode);
 	ei->i_disksize = inode->i_size;
 	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
@@ -4409,9 +4407,23 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 			(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
 	}
 
-	if (ei->i_flags & EXT4_EXTENTS_FL) {
-		/* Validate extent which is part of inode */
-		ret = ext4_ext_check_inode(inode);
+	ret = 0;
+	if (ei->i_file_acl &&
+	    ((ei->i_file_acl <
+	      (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
+	       EXT4_SB(sb)->s_gdb_count)) ||
+	     (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
+		ext4_error(sb, __func__,
+			   "bad extended attribute block %llu in inode #%lu",
+			   ei->i_file_acl, inode->i_ino);
+		ret = -EIO;
+		goto bad_inode;
+	} else if (ei->i_flags & EXT4_EXTENTS_FL) {
+		if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+		    (S_ISLNK(inode->i_mode) &&
+		     !ext4_inode_is_fast_symlink(inode)))
+			/* Validate extent which is part of inode */
+			ret = ext4_ext_check_inode(inode);
 	} else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 		   (S_ISLNK(inode->i_mode) &&
 		    !ext4_inode_is_fast_symlink(inode))) {
diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig
index d0a69ff25375..182f9ffe2b51 100644
--- a/fs/fat/Kconfig
+++ b/fs/fat/Kconfig
@@ -95,3 +95,6 @@ config FAT_DEFAULT_IOCHARSET
 	  Note that "utf8" is not recommended for FAT filesystems.
 	  If unsure, you shouldn't set "utf8" here.
 	  See <file:Documentation/filesystems/vfat.txt> for more information.
+
+	  Enable any character sets you need in File Systems/Native Language
+	  Support.
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 1aa70260e6d1..a24c58e181db 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -199,7 +199,7 @@ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2)
 	return retval;
 }
 
-int get_filesystem_list(char * buf)
+int __init get_filesystem_list(char *buf)
 {
 	int len = 0;
 	struct file_system_type * tmp;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 2b25133524a3..06f30e965676 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -938,9 +938,9 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
 }
 
 static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
-			       unsigned *nbytesp, int write)
+			       size_t *nbytesp, int write)
 {
-	unsigned nbytes = *nbytesp;
+	size_t nbytes = *nbytesp;
 	unsigned long user_addr = (unsigned long) buf;
 	unsigned offset = user_addr & ~PAGE_MASK;
 	int npages;
@@ -955,7 +955,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
 		return 0;
 	}
 
-	nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
+	nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
 	npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
 	down_read(&current->mm->mmap_sem);
@@ -1298,6 +1298,8 @@ static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
 	if (vma->vm_flags & VM_MAYSHARE)
 		return -ENODEV;
 
+	invalidate_inode_pages2(file->f_mapping);
+
 	return generic_file_mmap(file, vma);
 }
 
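Switching *nbytesp to size_t is why the diff reaches for min_t(): it compares both operands in one named type instead of truncating through the old `(unsigned)` cast on 64-bit kernels. A userspace sketch with a simplified min_t (the kernel macro is equivalent in effect but avoids double evaluation):

#include <stddef.h>

#define min_t(type, x, y) \
	((type)(x) < (type)(y) ? (type)(x) : (type)(y))

enum { FUSE_MAX_PAGES_PER_REQ = 32, PAGE_SHIFT = 12 };

static size_t cap_request(size_t nbytes)
{
	/* Compare in size_t so an oversized request is capped, not wrapped. */
	return min_t(size_t, nbytes,
		     (size_t)FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
}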
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 3984e47d1d33..1afd9f26bcb1 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -597,7 +597,6 @@ __acquires(&gl->gl_spin)
 
 	GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags));
 
-	down_read(&gfs2_umount_flush_sem);
 	if (test_bit(GLF_DEMOTE, &gl->gl_flags) &&
 	    gl->gl_demote_state != gl->gl_state) {
 		if (find_first_holder(gl))
@@ -614,15 +613,14 @@ __acquires(&gl->gl_spin)
 		if (ret == 0)
 			goto out_unlock;
 		if (ret == 2)
-			goto out_sem;
+			goto out;
 		gh = find_first_waiter(gl);
 		gl->gl_target = gh->gh_state;
 		if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
 			do_error(gl, 0); /* Fail queued try locks */
 	}
 	do_xmote(gl, gh, gl->gl_target);
-out_sem:
-	up_read(&gfs2_umount_flush_sem);
+out:
 	return;
 
 out_sched:
@@ -631,7 +629,7 @@ out_sched:
 	gfs2_glock_put(gl);
 out_unlock:
 	clear_bit(GLF_LOCK, &gl->gl_flags);
-	goto out_sem;
+	goto out;
 }
 
 static void glock_work_func(struct work_struct *work)
@@ -641,6 +639,7 @@ static void glock_work_func(struct work_struct *work)
 
 	if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags))
 		finish_xmote(gl, gl->gl_reply);
+	down_read(&gfs2_umount_flush_sem);
 	spin_lock(&gl->gl_spin);
 	if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
 	    gl->gl_state != LM_ST_UNLOCKED &&
@@ -653,6 +652,7 @@ static void glock_work_func(struct work_struct *work)
 	}
 	run_queue(gl, 0);
 	spin_unlock(&gl->gl_spin);
+	up_read(&gfs2_umount_flush_sem);
 	if (!delay ||
 	    queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
 		gfs2_glock_put(gl);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index bf23a62aa925..70f87f43afa2 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -156,6 +156,12 @@ static void inode_go_sync(struct gfs2_glock *gl)
 	error = filemap_fdatawait(metamapping);
 	mapping_set_error(metamapping, error);
 	gfs2_ail_empty_gl(gl);
+	/*
+	 * Writeback of the data mapping may cause the dirty flag to be set
+	 * so we have to clear it again here.
+	 */
+	smp_mb__before_clear_bit();
+	clear_bit(GLF_DIRTY, &gl->gl_flags);
 }
 
 /**
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 7b277d449155..5a31d426116f 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -137,15 +137,15 @@ void gfs2_set_iop(struct inode *inode)
 	if (S_ISREG(mode)) {
 		inode->i_op = &gfs2_file_iops;
 		if (gfs2_localflocks(sdp))
-			inode->i_fop = gfs2_file_fops_nolock;
+			inode->i_fop = &gfs2_file_fops_nolock;
 		else
-			inode->i_fop = gfs2_file_fops;
+			inode->i_fop = &gfs2_file_fops;
 	} else if (S_ISDIR(mode)) {
 		inode->i_op = &gfs2_dir_iops;
 		if (gfs2_localflocks(sdp))
-			inode->i_fop = gfs2_dir_fops_nolock;
+			inode->i_fop = &gfs2_dir_fops_nolock;
 		else
-			inode->i_fop = gfs2_dir_fops;
+			inode->i_fop = &gfs2_dir_fops;
 	} else if (S_ISLNK(mode)) {
 		inode->i_op = &gfs2_symlink_iops;
 	} else {
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index dca4fee3078b..c30be2b66580 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -101,21 +101,23 @@ void gfs2_dinode_print(const struct gfs2_inode *ip);
 extern const struct inode_operations gfs2_file_iops;
 extern const struct inode_operations gfs2_dir_iops;
 extern const struct inode_operations gfs2_symlink_iops;
-extern const struct file_operations *gfs2_file_fops_nolock;
-extern const struct file_operations *gfs2_dir_fops_nolock;
+extern const struct file_operations gfs2_file_fops_nolock;
+extern const struct file_operations gfs2_dir_fops_nolock;
 
 extern void gfs2_set_inode_flags(struct inode *inode);
 
 #ifdef CONFIG_GFS2_FS_LOCKING_DLM
-extern const struct file_operations *gfs2_file_fops;
-extern const struct file_operations *gfs2_dir_fops;
+extern const struct file_operations gfs2_file_fops;
+extern const struct file_operations gfs2_dir_fops;
+
 static inline int gfs2_localflocks(const struct gfs2_sbd *sdp)
 {
 	return sdp->sd_args.ar_localflocks;
 }
 #else /* Single node only */
-#define gfs2_file_fops NULL
-#define gfs2_dir_fops NULL
+#define gfs2_file_fops gfs2_file_fops_nolock
+#define gfs2_dir_fops gfs2_dir_fops_nolock
+
 static inline int gfs2_localflocks(const struct gfs2_sbd *sdp)
 {
 	return 1;
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 70b9b8548945..5d82e91887e3 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -413,7 +413,9 @@ out_unlock:
 	gfs2_glock_dq(&gh);
 out:
 	gfs2_holder_uninit(&gh);
-	if (ret)
+	if (ret == -ENOMEM)
+		ret = VM_FAULT_OOM;
+	else if (ret)
 		ret = VM_FAULT_SIGBUS;
 	return ret;
 }
@@ -705,7 +707,7 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
 	}
 }
 
-const struct file_operations *gfs2_file_fops = &(const struct file_operations){
+const struct file_operations gfs2_file_fops = {
 	.llseek		= gfs2_llseek,
 	.read		= do_sync_read,
 	.aio_read	= generic_file_aio_read,
@@ -723,7 +725,7 @@ const struct file_operations *gfs2_file_fops = &(const struct file_operations){
 	.setlease	= gfs2_setlease,
 };
 
-const struct file_operations *gfs2_dir_fops = &(const struct file_operations){
+const struct file_operations gfs2_dir_fops = {
 	.readdir	= gfs2_readdir,
 	.unlocked_ioctl	= gfs2_ioctl,
 	.open		= gfs2_open,
@@ -735,7 +737,7 @@ const struct file_operations *gfs2_dir_fops = &(const struct file_operations){
 
 #endif /* CONFIG_GFS2_FS_LOCKING_DLM */
 
-const struct file_operations *gfs2_file_fops_nolock = &(const struct file_operations){
+const struct file_operations gfs2_file_fops_nolock = {
 	.llseek		= gfs2_llseek,
 	.read		= do_sync_read,
 	.aio_read	= generic_file_aio_read,
@@ -751,7 +753,7 @@ const struct file_operations *gfs2_file_fops_nolock = &(const struct file_operat
 	.setlease	= generic_setlease,
 };
 
-const struct file_operations *gfs2_dir_fops_nolock = &(const struct file_operations){
+const struct file_operations gfs2_dir_fops_nolock = {
 	.readdir	= gfs2_readdir,
 	.unlocked_ioctl	= gfs2_ioctl,
 	.open		= gfs2_open,
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 51883b3ad89c..650a730707b7 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -272,11 +272,6 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
 	lock_page(page);
 
 	bio = bio_alloc(GFP_NOFS, 1);
-	if (unlikely(!bio)) {
-		__free_page(page);
-		return -ENOBUFS;
-	}
-
 	bio->bi_sector = sector * (sb->s_blocksize >> 9);
 	bio->bi_bdev = sb->s_bdev;
 	bio_add_page(bio, page, PAGE_SIZE, 0);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index abd5429ae285..1c70fa5168d6 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -371,6 +371,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
 	ip = ghs[1].gh_gl->gl_object;
 
 	ip->i_disksize = size;
+	i_size_write(inode, size);
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 8d53f66b5bcc..152e6c4a0dca 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -81,7 +81,7 @@ struct gfs2_quota_change_host {
 
 static LIST_HEAD(qd_lru_list);
 static atomic_t qd_lru_count = ATOMIC_INIT(0);
-static spinlock_t qd_lru_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(qd_lru_lock);
 
 int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask)
 {
@@ -1364,7 +1364,7 @@ int gfs2_quotad(void *data)
 			refrigerator();
 		t = min(quotad_timeo, statfs_timeo);
 
-		prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE);
 		spin_lock(&sdp->sd_trunc_lock);
 		empty = list_empty(&sdp->sd_trunc_list);
 		spin_unlock(&sdp->sd_trunc_lock);
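SPIN_LOCK_UNLOCKED was being phased out around this merge because a shared static initializer gives every such lock the same lockdep class; DEFINE_SPINLOCK() declares and initializes the lock with a class of its own. A sketch of the two sanctioned forms (kernel API, shown for reference rather than as a standalone program):

#include <linux/spinlock.h>

/* static scope: declare and initialize in one statement */
static DEFINE_SPINLOCK(qd_lru_lock);

/* embedded in a structure: initialize at runtime instead */
static void setup_lock(spinlock_t *lock)
{
	spin_lock_init(lock);
}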
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index f03d024038ea..565038243fa2 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -212,8 +212,7 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
 	if (tmp == 0)
 		return BFITNOENT;
 	ptr--;
-	bit = fls64(tmp);
-	bit--;		/* fls64 always adds one to the bit count */
+	bit = __ffs64(tmp);
 	bit /= 2;	/* two bits per entry in the bitmap */
 	return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
 }
@@ -1445,10 +1444,12 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head *dibh;
 	struct gfs2_alloc *al = ip->i_alloc;
 	struct gfs2_rgrpd *rgd = al->al_rgd;
 	u32 goal, blk;
 	u64 block;
+	int error;
 
 	if (rgrp_contains_block(rgd, ip->i_goal))
 		goal = ip->i_goal - rgd->rd_data0;
@@ -1461,7 +1462,13 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
 	rgd->rd_last_alloc = blk;
 	block = rgd->rd_data0 + blk;
 	ip->i_goal = block;
-
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error == 0) {
+		struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal);
+		brelse(dibh);
+	}
 	gfs2_assert_withdraw(sdp, rgd->rd_free >= *n);
 	rgd->rd_free -= *n;
 
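On the gfs2_bitfit() change: fls64() reports the highest set bit, 1-based (hence the old `bit--`), while __ffs64() reports the lowest set bit, 0-based, which is the entry a find-first search over a two-bits-per-entry bitmap actually wants. A compilable userspace analogue built on the GCC/Clang builtin:

#include <stdint.h>

/* Lowest set bit, 0-based: a userspace stand-in for __ffs64().
 * The caller must guarantee x != 0, as gfs2_bitfit() does. */
static inline unsigned int ffs64_low(uint64_t x)
{
	return (unsigned int)__builtin_ctzll(x);
}

static unsigned int first_matching_entry(uint64_t tmp)
{
	return ffs64_low(tmp) / 2;	/* two bits per bitmap entry */
}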
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 9435dda8f1e0..a1cbff2b4d99 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -70,6 +70,10 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
 		BUG();
 		return 0;
 	}
+
+	if (!tree)
+		return 0;
+
 	if (tree->node_size >= PAGE_CACHE_SIZE) {
 		nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT);
 		spin_lock(&tree->hash_lock);
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 36ca2e1a4fa3..7b6165f25fbe 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -349,6 +349,7 @@ void hfs_mdb_put(struct super_block *sb)
 	if (HFS_SB(sb)->nls_disk)
 		unload_nls(HFS_SB(sb)->nls_disk);
 
+	free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0);
 	kfree(HFS_SB(sb));
 	sb->s_fs_info = NULL;
 }
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 23a3c76711e0..153d9681192b 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -26,7 +26,6 @@
 #include <linux/pagevec.h>
 #include <linux/parser.h>
 #include <linux/mman.h>
-#include <linux/quotaops.h>
 #include <linux/slab.h>
 #include <linux/dnotify.h>
 #include <linux/statfs.h>
@@ -842,7 +841,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 bad_val:
 	printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n",
 	       args[0].from, p);
-	return 1;
+	return -EINVAL;
 }
 
 static int
diff --git a/fs/inode.c b/fs/inode.c
index d06d6d268de9..6ad14a1cd8c9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1470,42 +1470,6 @@ static void __wait_on_freeing_inode(struct inode *inode)
 	spin_lock(&inode_lock);
 }
 
-/*
- * We rarely want to lock two inodes that do not have a parent/child
- * relationship (such as directory, child inode) simultaneously. The
- * vast majority of file systems should be able to get along fine
- * without this. Do not use these functions except as a last resort.
- */
-void inode_double_lock(struct inode *inode1, struct inode *inode2)
-{
-	if (inode1 == NULL || inode2 == NULL || inode1 == inode2) {
-		if (inode1)
-			mutex_lock(&inode1->i_mutex);
-		else if (inode2)
-			mutex_lock(&inode2->i_mutex);
-		return;
-	}
-
-	if (inode1 < inode2) {
-		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
-		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
-	} else {
-		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
-		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
-	}
-}
-EXPORT_SYMBOL(inode_double_lock);
-
-void inode_double_unlock(struct inode *inode1, struct inode *inode2)
-{
-	if (inode1)
-		mutex_unlock(&inode1->i_mutex);
-
-	if (inode2 && inode2 != inode1)
-		mutex_unlock(&inode2->i_mutex);
-}
-EXPORT_SYMBOL(inode_double_unlock);
-
 static __initdata unsigned long ihash_entries;
 static int __init set_ihash_entries(char *str)
 {
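The deleted helpers remain a useful reference for the generic rule they encoded: when two locks have no parent/child relationship, take them in a stable global order (here, by address) so two callers passing the same pair in opposite orders cannot deadlock. The same idea in portable, compilable pthreads:

#include <pthread.h>

static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {			/* degenerate pair: lock once */
		pthread_mutex_lock(a);
		return;
	}
	if (a > b) {			/* canonical order: by address */
		pthread_mutex_t *t = a;
		a = b;
		b = t;
	}
	pthread_mutex_lock(a);
	pthread_mutex_lock(b);
}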
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index a8e8513a78a9..06560c520f49 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -502,7 +502,7 @@ void journal_commit_transaction(journal_t *journal)
 		err = 0;
 	}
 
-	journal_write_revoke_records(journal, commit_transaction);
+	journal_write_revoke_records(journal, commit_transaction, write_op);
 
 	/*
 	 * If we found any dirty or locked buffers, then we should have
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index c7bd649bbbdc..da6cd9bdaabc 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -55,6 +55,25 @@
  *	need do nothing.
  * RevokeValid set, Revoked set:
  *	buffer has been revoked.
+ *
+ * Locking rules:
+ * We keep two hash tables of revoke records. One hashtable belongs to the
+ * running transaction (is pointed to by journal->j_revoke), the other one
+ * belongs to the committing transaction. Accesses to the second hash table
+ * happen only from the kjournald and no other thread touches this table. Also
+ * journal_switch_revoke_table() which switches which hashtable belongs to the
+ * running and which to the committing transaction is called only from
+ * kjournald. Therefore we need no locks when accessing the hashtable belonging
+ * to the committing transaction.
+ *
+ * All users operating on the hash table belonging to the running transaction
+ * have a handle to the transaction. Therefore they are safe from kjournald
+ * switching hash tables under them. For operations on the lists of entries in
+ * the hash table j_revoke_lock is used.
+ *
+ * Finally, also replay code uses the hash tables but at this moment noone else
+ * can touch them (filesystem isn't mounted yet) and hence no locking is
+ * needed.
  */
@@ -67,6 +86,7 @@
 #include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/init.h>
+#include <linux/bio.h>
 #endif
 #include <linux/log2.h>
 
@@ -99,8 +119,8 @@ struct jbd_revoke_table_s
 #ifdef __KERNEL__
 static void write_one_revoke_record(journal_t *, transaction_t *,
 				    struct journal_head **, int *,
-				    struct jbd_revoke_record_s *);
-static void flush_descriptor(journal_t *, struct journal_head *, int);
+				    struct jbd_revoke_record_s *, int);
+static void flush_descriptor(journal_t *, struct journal_head *, int, int);
 #endif
 
 /* Utility functions to maintain the revoke table */
@@ -402,8 +422,6 @@ int journal_revoke(handle_t *handle, unsigned long blocknr,
  * the second time we would still have a pending revoke to cancel. So,
  * do not trust the Revoked bit on buffers unless RevokeValid is also
  * set.
- *
- * The caller must have the journal locked.
 */
int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 {
@@ -481,12 +499,9 @@ void journal_switch_revoke_table(journal_t *journal)
 /*
  * Write revoke records to the journal for all entries in the current
  * revoke hash, deleting the entries as we go.
- *
- * Called with the journal lock held.
  */
-
 void journal_write_revoke_records(journal_t *journal,
-				  transaction_t *transaction)
+				  transaction_t *transaction, int write_op)
 {
 	struct journal_head *descriptor;
 	struct jbd_revoke_record_s *record;
@@ -510,14 +525,14 @@ void journal_write_revoke_records(journal_t *journal,
 				hash_list->next;
 			write_one_revoke_record(journal, transaction,
 						&descriptor, &offset,
-						record);
+						record, write_op);
 			count++;
 			list_del(&record->hash);
 			kmem_cache_free(revoke_record_cache, record);
 		}
 	}
 	if (descriptor)
-		flush_descriptor(journal, descriptor, offset);
+		flush_descriptor(journal, descriptor, offset, write_op);
 	jbd_debug(1, "Wrote %d revoke records\n", count);
 }
 
@@ -530,7 +545,8 @@ static void write_one_revoke_record(journal_t *journal,
 				    transaction_t *transaction,
 				    struct journal_head **descriptorp,
 				    int *offsetp,
-				    struct jbd_revoke_record_s *record)
+				    struct jbd_revoke_record_s *record,
+				    int write_op)
 {
 	struct journal_head *descriptor;
 	int offset;
@@ -549,7 +565,7 @@ static void write_one_revoke_record(journal_t *journal,
 	/* Make sure we have a descriptor with space left for the record */
 	if (descriptor) {
 		if (offset == journal->j_blocksize) {
-			flush_descriptor(journal, descriptor, offset);
+			flush_descriptor(journal, descriptor, offset, write_op);
 			descriptor = NULL;
 		}
 	}
@@ -586,7 +602,7 @@ static void write_one_revoke_record(journal_t *journal,
 
 static void flush_descriptor(journal_t *journal,
 			     struct journal_head *descriptor,
-			     int offset)
+			     int offset, int write_op)
 {
 	journal_revoke_header_t *header;
 	struct buffer_head *bh = jh2bh(descriptor);
@@ -601,7 +617,7 @@ static void flush_descriptor(journal_t *journal,
 	set_buffer_jwrite(bh);
 	BUFFER_TRACE(bh, "write");
 	set_buffer_dirty(bh);
-	ll_rw_block(SWRITE, 1, &bh);
+	ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh);
 }
 #endif
 
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 073c8c3df7cd..0b7d3b8226fd 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -506,7 +506,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	if (err)
 		jbd2_journal_abort(journal, err);
 
-	jbd2_journal_write_revoke_records(journal, commit_transaction);
+	jbd2_journal_write_revoke_records(journal, commit_transaction,
+					  write_op);
 
 	jbd_debug(3, "JBD: commit phase 2\n");
 
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index bbe6d592d8b3..a360b06af2e3 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -86,6 +86,7 @@
 #include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/init.h>
+#include <linux/bio.h>
 #endif
 #include <linux/log2.h>
 
@@ -118,8 +119,8 @@ struct jbd2_revoke_table_s
 #ifdef __KERNEL__
 static void write_one_revoke_record(journal_t *, transaction_t *,
 				    struct journal_head **, int *,
-				    struct jbd2_revoke_record_s *);
-static void flush_descriptor(journal_t *, struct journal_head *, int);
+				    struct jbd2_revoke_record_s *, int);
+static void flush_descriptor(journal_t *, struct journal_head *, int, int);
 #endif
 
 /* Utility functions to maintain the revoke table */
@@ -499,7 +500,8 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
  * revoke hash, deleting the entries as we go.
 */
void jbd2_journal_write_revoke_records(journal_t *journal,
-				       transaction_t *transaction)
+				       transaction_t *transaction,
+				       int write_op)
 {
 	struct journal_head *descriptor;
 	struct jbd2_revoke_record_s *record;
@@ -523,14 +525,14 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
 				hash_list->next;
 			write_one_revoke_record(journal, transaction,
 						&descriptor, &offset,
-						record);
+						record, write_op);
 			count++;
 			list_del(&record->hash);
 			kmem_cache_free(jbd2_revoke_record_cache, record);
 		}
 	}
 	if (descriptor)
-		flush_descriptor(journal, descriptor, offset);
+		flush_descriptor(journal, descriptor, offset, write_op);
 	jbd_debug(1, "Wrote %d revoke records\n", count);
 }
 
@@ -543,7 +545,8 @@ static void write_one_revoke_record(journal_t *journal,
 				    transaction_t *transaction,
 				    struct journal_head **descriptorp,
 				    int *offsetp,
-				    struct jbd2_revoke_record_s *record)
+				    struct jbd2_revoke_record_s *record,
+				    int write_op)
 {
 	struct journal_head *descriptor;
 	int offset;
@@ -562,7 +565,7 @@ static void write_one_revoke_record(journal_t *journal,
 	/* Make sure we have a descriptor with space left for the record */
 	if (descriptor) {
 		if (offset == journal->j_blocksize) {
-			flush_descriptor(journal, descriptor, offset);
+			flush_descriptor(journal, descriptor, offset, write_op);
 			descriptor = NULL;
 		}
 	}
@@ -607,7 +610,7 @@ static void write_one_revoke_record(journal_t *journal,
 
 static void flush_descriptor(journal_t *journal,
 			     struct journal_head *descriptor,
-			     int offset)
+			     int offset, int write_op)
 {
 	jbd2_journal_revoke_header_t *header;
 	struct buffer_head *bh = jh2bh(descriptor);
@@ -622,7 +625,7 @@ static void flush_descriptor(journal_t *journal,
 	set_buffer_jwrite(bh);
 	BUFFER_TRACE(bh, "write");
 	set_buffer_dirty(bh);
-	ll_rw_block(SWRITE, 1, &bh);
+	ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh);
 }
 #endif
 
diff --git a/fs/namei.c b/fs/namei.c
index b8433ebfae05..78f253cd2d4f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1248,6 +1248,8 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 	int err;
 	struct qstr this;
 
+	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
+
 	err = __lookup_one_len(name, &this, base, len);
 	if (err)
 		return ERR_PTR(err);
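The new WARN_ON_ONCE makes a long-standing contract explicit: lookup_one_len() may call into the filesystem's ->lookup() method and therefore needs the parent's i_mutex held. The calling pattern the nfsd changes further down adopt, sketched (dir and name stand for a caller's parent dentry and component name; not a self-contained program):

	mutex_lock(&dir->d_inode->i_mutex);
	dentry = lookup_one_len(name, dir, strlen(name));
	if (!IS_ERR(dentry)) {
		/* ... use dentry while the parent is locked ... */
		dput(dentry);
	}
	mutex_unlock(&dir->d_inode->i_mutex);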
diff --git a/fs/namespace.c b/fs/namespace.c
index c6f54e4c4290..41196209a906 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1377,7 +1377,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
 	if (parent_path) {
 		detach_mnt(source_mnt, parent_path);
 		attach_mnt(source_mnt, path);
-		touch_mnt_namespace(current->nsproxy->mnt_ns);
+		touch_mnt_namespace(parent_path->mnt->mnt_ns);
 	} else {
 		mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
 		commit_tree(source_mnt);
@@ -1920,8 +1920,9 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 	if (data_page)
 		((char *)data_page)[PAGE_SIZE - 1] = 0;
 
-	/* Default to relatime */
-	mnt_flags |= MNT_RELATIME;
+	/* Default to relatime unless overriden */
+	if (!(flags & MS_NOATIME))
+		mnt_flags |= MNT_RELATIME;
 
 	/* Separate the per-mountpoint flags */
 	if (flags & MS_NOSUID)
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index f54360f50a9c..fa038df63ac8 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -660,13 +660,10 @@ outrel:
 		if (user.object_name_len > NCP_OBJECT_NAME_MAX_LEN)
 			return -ENOMEM;
 		if (user.object_name_len) {
-			newname = kmalloc(user.object_name_len, GFP_USER);
-			if (!newname)
-				return -ENOMEM;
-			if (copy_from_user(newname, user.object_name, user.object_name_len)) {
-				kfree(newname);
-				return -EFAULT;
-			}
+			newname = memdup_user(user.object_name,
+					      user.object_name_len);
+			if (IS_ERR(newname))
+				return PTR_ERR(newname);
 		} else {
 			newname = NULL;
 		}
@@ -760,13 +757,9 @@ outrel:
 		if (user.len > NCP_PRIVATE_DATA_MAX_LEN)
 			return -ENOMEM;
 		if (user.len) {
-			new = kmalloc(user.len, GFP_USER);
-			if (!new)
-				return -ENOMEM;
-			if (copy_from_user(new, user.data, user.len)) {
-				kfree(new);
-				return -EFAULT;
-			}
+			new = memdup_user(user.data, user.len);
+			if (IS_ERR(new))
+				return PTR_ERR(new);
 		} else {
 			new = NULL;
 		}
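memdup_user() folds the kmalloc/copy_from_user/kfree-on-failure dance into one call that yields either the populated buffer or an ERR_PTR. Roughly what it does internally (a sketch; the in-tree version differs in allocator details such as allocation-site tracking):

void *memdup_user(const void __user *src, size_t len)
{
	void *p = kmalloc(len, GFP_KERNEL);

	if (!p)
		return ERR_PTR(-ENOMEM);
	if (copy_from_user(p, src, len)) {
		kfree(p);
		return ERR_PTR(-EFAULT);
	}
	return p;
}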
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 5a97bcfe03e5..ec7e27d00bc6 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -517,10 +517,10 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 	ret = nfs_updatepage(filp, page, 0, pagelen);
 out_unlock:
+	if (!ret)
+		return VM_FAULT_LOCKED;
 	unlock_page(page);
-	if (ret)
-		ret = VM_FAULT_SIGBUS;
-	return ret;
+	return VM_FAULT_SIGBUS;
 }
 
 static struct vm_operations_struct nfs_file_vm_ops = {
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index e6a1932c7110..35869a4921f1 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -713,7 +713,8 @@ nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p,
 	if (args->npages != 0)
 		xdr_encode_pages(buf, args->pages, 0, args->len);
 	else
-		req->rq_slen += args->len;
+		req->rq_slen = xdr_adjust_iovec(req->rq_svec,
+				p + XDR_QUADLEN(args->len));
 
 	err = nfsacl_encode(buf, base, args->inode,
 			    (args->mask & NFS_ACL) ?
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 3444c0052a87..5275097a7565 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -229,21 +229,23 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
 		goto out;
 	status = vfs_readdir(filp, nfsd4_build_namelist, &names);
 	fput(filp);
+	mutex_lock(&dir->d_inode->i_mutex);
 	while (!list_empty(&names)) {
 		entry = list_entry(names.next, struct name_list, list);
 
 		dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
 		if (IS_ERR(dentry)) {
 			status = PTR_ERR(dentry);
-			goto out;
+			break;
 		}
 		status = f(dir, dentry);
 		dput(dentry);
 		if (status)
-			goto out;
+			break;
 		list_del(&entry->list);
 		kfree(entry);
 	}
+	mutex_unlock(&dir->d_inode->i_mutex);
 out:
 	while (!list_empty(&names)) {
 		entry = list_entry(names.next, struct name_list, list);
@@ -255,36 +257,6 @@ out:
 }
 
 static int
-nfsd4_remove_clid_file(struct dentry *dir, struct dentry *dentry)
-{
-	int status;
-
-	if (!S_ISREG(dir->d_inode->i_mode)) {
-		printk("nfsd4: non-file found in client recovery directory\n");
-		return -EINVAL;
-	}
-	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
-	status = vfs_unlink(dir->d_inode, dentry);
-	mutex_unlock(&dir->d_inode->i_mutex);
-	return status;
-}
-
-static int
-nfsd4_clear_clid_dir(struct dentry *dir, struct dentry *dentry)
-{
-	int status;
-
-	/* For now this directory should already be empty, but we empty it of
-	 * any regular files anyway, just in case the directory was created by
-	 * a kernel from the future.... */
-	nfsd4_list_rec_dir(dentry, nfsd4_remove_clid_file);
-	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
-	status = vfs_rmdir(dir->d_inode, dentry);
-	mutex_unlock(&dir->d_inode->i_mutex);
-	return status;
-}
-
-static int
 nfsd4_unlink_clid_dir(char *name, int namlen)
 {
 	struct dentry *dentry;
@@ -294,18 +266,18 @@ nfsd4_unlink_clid_dir(char *name, int namlen)
 
 	mutex_lock(&rec_dir.dentry->d_inode->i_mutex);
 	dentry = lookup_one_len(name, rec_dir.dentry, namlen);
-	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
 	if (IS_ERR(dentry)) {
 		status = PTR_ERR(dentry);
-		return status;
+		goto out_unlock;
 	}
 	status = -ENOENT;
 	if (!dentry->d_inode)
 		goto out;
-
-	status = nfsd4_clear_clid_dir(rec_dir.dentry, dentry);
+	status = vfs_rmdir(rec_dir.dentry->d_inode, dentry);
 out:
 	dput(dentry);
+out_unlock:
+	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
 	return status;
 }
 
@@ -348,7 +320,7 @@ purge_old(struct dentry *parent, struct dentry *child)
 	if (nfs4_has_reclaimed_state(child->d_name.name, false))
 		return 0;
 
-	status = nfsd4_clear_clid_dir(parent, child);
+	status = vfs_rmdir(parent->d_inode, child);
 	if (status)
 		printk("failed to remove client recovery directory %s\n",
 				child->d_name.name);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index ab93fcfef254..6c68ffd6b4bb 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -116,10 +116,15 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
 	}
 	if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) {
 		/* successfully crossed mount point */
-		exp_put(exp);
-		*expp = exp2;
+		/*
+		 * This is subtle: dentry is *not* under mnt at this point.
+		 * The only reason we are safe is that original mnt is pinned
+		 * down by exp, so we should dput before putting exp.
+		 */
 		dput(dentry);
 		*dpp = mounts;
+		exp_put(exp);
+		*expp = exp2;
 	} else {
 		exp_put(exp2);
 		dput(mounts);
@@ -1885,8 +1890,8 @@ static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen,
 	return 0;
 }
 
-static int nfsd_buffered_readdir(struct file *file, filldir_t func,
-				 struct readdir_cd *cdp, loff_t *offsetp)
+static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func,
+				    struct readdir_cd *cdp, loff_t *offsetp)
 {
 	struct readdir_data buf;
 	struct buffered_dirent *de;
@@ -1896,11 +1901,12 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func,
 
 	buf.dirent = (void *)__get_free_page(GFP_KERNEL);
 	if (!buf.dirent)
-		return -ENOMEM;
+		return nfserrno(-ENOMEM);
 
 	offset = *offsetp;
 
 	while (1) {
+		struct inode *dir_inode = file->f_path.dentry->d_inode;
 		unsigned int reclen;
 
 		cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1919,26 +1925,38 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func,
 		if (!size)
 			break;
 
+		/*
+		 * Various filldir functions may end up calling back into
+		 * lookup_one_len() and the file system's ->lookup() method.
+		 * These expect i_mutex to be held, as it would within readdir.
+		 */
+		host_err = mutex_lock_killable(&dir_inode->i_mutex);
+		if (host_err)
+			break;
+
 		de = (struct buffered_dirent *)buf.dirent;
 		while (size > 0) {
 			offset = de->offset;
 
 			if (func(cdp, de->name, de->namlen, de->offset,
 				 de->ino, de->d_type))
-				goto done;
+				break;
 
 			if (cdp->err != nfs_ok)
-				goto done;
+				break;
 
 			reclen = ALIGN(sizeof(*de) + de->namlen,
 				       sizeof(u64));
 			size -= reclen;
 			de = (struct buffered_dirent *)((char *)de + reclen);
 		}
+		mutex_unlock(&dir_inode->i_mutex);
+		if (size > 0) /* We bailed out early */
+			break;
+
 		offset = vfs_llseek(file, 0, SEEK_CUR);
 	}
 
-done:
 	free_page((unsigned long)(buf.dirent));
 
 	if (host_err)
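Note the return-type change above: nfsd keeps host errnos (int) and on-the-wire status codes (__be32) as distinct types, so once nfsd_buffered_readdir() returns __be32 the -ENOMEM has to be translated with nfserrno(). A sketch of the convention (kernel types, for reference; the helper name get_scratch_page is illustrative only):

static __be32 get_scratch_page(unsigned long *out)
{
	unsigned long p = __get_free_page(GFP_KERNEL);

	if (!p)
		return nfserrno(-ENOMEM);	/* host errno -> wire status */
	*out = p;
	return nfs_ok;
}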
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 24638e059bf3..064279e33bbb 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -688,6 +688,8 @@ static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_gc = {
 	.bpop_translate		= NULL,
 };
 
+static struct lock_class_key nilfs_bmap_dat_lock_key;
+
 /**
  * nilfs_bmap_read - read a bmap from an inode
  * @bmap: bmap
@@ -715,6 +717,7 @@ int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
 		bmap->b_pops = &nilfs_bmap_ptr_ops_p;
 		bmap->b_last_allocated_key = 0;	/* XXX: use macro */
 		bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
+		lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
 		break;
 	case NILFS_CPFILE_INO:
 	case NILFS_SUFILE_INO:
@@ -772,6 +775,7 @@ void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
 {
 	memcpy(gcbmap, bmap, sizeof(union nilfs_bmap_union));
 	init_rwsem(&gcbmap->b_sem);
+	lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
 	gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode;
 }
 
@@ -779,5 +783,6 @@ void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
 {
 	memcpy(bmap, gcbmap, sizeof(union nilfs_bmap_union));
 	init_rwsem(&bmap->b_sem);
+	lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
 	bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
 }
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 7558c977db02..3d0c18a16db1 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -35,11 +35,6 @@
 #include "bmap_union.h"
 
 /*
- * NILFS filesystem version
- */
-#define NILFS_VERSION		"2.0.5"
-
-/*
  * nilfs inode data in memory
 */
struct nilfs_inode_info {
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 6ade0963fc1d..4fc081e47d70 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -413,7 +413,6 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
 	struct nilfs_segment_entry *ent, *n;
 	struct inode *sufile = nilfs->ns_sufile;
 	__u64 segnum[4];
-	time_t mtime;
 	int err;
 	int i;
 
@@ -442,24 +441,13 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
 	 * Collecting segments written after the latest super root.
 	 * These are marked dirty to avoid being reallocated in the next write.
 	 */
-	mtime = get_seconds();
 	list_for_each_entry_safe(ent, n, head, list) {
-		if (ent->segnum == segnum[0]) {
-			list_del(&ent->list);
-			nilfs_free_segment_entry(ent);
-			continue;
-		}
-		err = nilfs_open_segment_entry(ent, sufile);
-		if (unlikely(err))
-			goto failed;
-		if (!nilfs_segment_usage_dirty(ent->raw_su)) {
-			/* make the segment garbage */
-			ent->raw_su->su_nblocks = cpu_to_le32(0);
-			ent->raw_su->su_lastmod = cpu_to_le32(mtime);
-			nilfs_segment_usage_set_dirty(ent->raw_su);
+		if (ent->segnum != segnum[0]) {
+			err = nilfs_sufile_scrap(sufile, ent->segnum);
+			if (unlikely(err))
+				goto failed;
 		}
 		list_del(&ent->list);
-		nilfs_close_segment_entry(ent, sufile);
 		nilfs_free_segment_entry(ent);
 	}
 
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index c774cf397e2f..98e68677f045 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -93,6 +93,52 @@ nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum,
 			create, NULL, bhp);
 }
 
+static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
+				     u64 ncleanadd, u64 ndirtyadd)
+{
+	struct nilfs_sufile_header *header;
+	void *kaddr;
+
+	kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+	header = kaddr + bh_offset(header_bh);
+	le64_add_cpu(&header->sh_ncleansegs, ncleanadd);
+	le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	nilfs_mdt_mark_buffer_dirty(header_bh);
+}
+
+int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create,
+			void (*dofunc)(struct inode *, __u64,
+				       struct buffer_head *,
+				       struct buffer_head *))
+{
+	struct buffer_head *header_bh, *bh;
+	int ret;
+
+	if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) {
+		printk(KERN_WARNING "%s: invalid segment number: %llu\n",
+		       __func__, (unsigned long long)segnum);
+		return -EINVAL;
+	}
+	down_write(&NILFS_MDT(sufile)->mi_sem);
+
+	ret = nilfs_sufile_get_header_block(sufile, &header_bh);
+	if (ret < 0)
+		goto out_sem;
+
+	ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, create, &bh);
+	if (!ret) {
+		dofunc(sufile, segnum, header_bh, bh);
+		brelse(bh);
+	}
+	brelse(header_bh);
+
+ out_sem:
+	up_write(&NILFS_MDT(sufile)->mi_sem);
+	return ret;
+}
+
 /**
  * nilfs_sufile_alloc - allocate a segment
  * @sufile: inode of segment usage file
@@ -113,7 +159,6 @@ nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum,
 int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
 {
 	struct buffer_head *header_bh, *su_bh;
-	struct the_nilfs *nilfs;
 	struct nilfs_sufile_header *header;
 	struct nilfs_segment_usage *su;
 	size_t susz = NILFS_MDT(sufile)->mi_entry_size;
@@ -124,8 +169,6 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
 
 	down_write(&NILFS_MDT(sufile)->mi_sem);
 
-	nilfs = NILFS_MDT(sufile)->mi_nilfs;
-
 	ret = nilfs_sufile_get_header_block(sufile, &header_bh);
 	if (ret < 0)
 		goto out_sem;
@@ -192,165 +235,84 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
 	return ret;
 }
 
-/**
- * nilfs_sufile_cancel_free -
- * @sufile: inode of segment usage file
- * @segnum: segment number
- *
- * Description:
- *
- * Return Value: On success, 0 is returned. On error, one of the following
- * negative error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
- */
-int nilfs_sufile_cancel_free(struct inode *sufile, __u64 segnum)
+void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,
+				 struct buffer_head *header_bh,
+				 struct buffer_head *su_bh)
 {
-	struct buffer_head *header_bh, *su_bh;
-	struct the_nilfs *nilfs;
-	struct nilfs_sufile_header *header;
 	struct nilfs_segment_usage *su;
 	void *kaddr;
-	int ret;
-
-	down_write(&NILFS_MDT(sufile)->mi_sem);
-
-	nilfs = NILFS_MDT(sufile)->mi_nilfs;
-
-	ret = nilfs_sufile_get_header_block(sufile, &header_bh);
-	if (ret < 0)
-		goto out_sem;
-
-	ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &su_bh);
-	if (ret < 0)
-		goto out_header;
 
 	kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
-	su = nilfs_sufile_block_get_segment_usage(
-		sufile, segnum, su_bh, kaddr);
+	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
 	if (unlikely(!nilfs_segment_usage_clean(su))) {
 		printk(KERN_WARNING "%s: segment %llu must be clean\n",
 		       __func__, (unsigned long long)segnum);
 		kunmap_atomic(kaddr, KM_USER0);
-		goto out_su_bh;
+		return;
 	}
 	nilfs_segment_usage_set_dirty(su);
 	kunmap_atomic(kaddr, KM_USER0);
 
-	kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
-	header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
-	le64_add_cpu(&header->sh_ncleansegs, -1);
-	le64_add_cpu(&header->sh_ndirtysegs, 1);
-	kunmap_atomic(kaddr, KM_USER0);
-
-	nilfs_mdt_mark_buffer_dirty(header_bh);
+	nilfs_sufile_mod_counter(header_bh, -1, 1);
 	nilfs_mdt_mark_buffer_dirty(su_bh);
 	nilfs_mdt_mark_dirty(sufile);
-
- out_su_bh:
-	brelse(su_bh);
- out_header:
-	brelse(header_bh);
- out_sem:
-	up_write(&NILFS_MDT(sufile)->mi_sem);
-	return ret;
 }
 
-/**
- * nilfs_sufile_freev - free segments
- * @sufile: inode of segment usage file
- * @segnum: array of segment numbers
- * @nsegs: number of segments
- *
- * Description: nilfs_sufile_freev() frees segments specified by @segnum and
- * @nsegs, which must have been returned by a previous call to
- * nilfs_sufile_alloc().
- *
- * Return Value: On success, 0 is returned. On error, one of the following
- * negative error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
- */
-#define NILFS_SUFILE_FREEV_PREALLOC	16
-int nilfs_sufile_freev(struct inode *sufile, __u64 *segnum, size_t nsegs)
+void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
+			   struct buffer_head *header_bh,
+			   struct buffer_head *su_bh)
 {
-	struct buffer_head *header_bh, **su_bh,
-		*su_bh_prealloc[NILFS_SUFILE_FREEV_PREALLOC];
-	struct the_nilfs *nilfs;
-	struct nilfs_sufile_header *header;
 	struct nilfs_segment_usage *su;
 	void *kaddr;
-	int ret, i;
+	int clean, dirty;
 
-	down_write(&NILFS_MDT(sufile)->mi_sem);
-
-	nilfs = NILFS_MDT(sufile)->mi_nilfs;
-
-	/* prepare resources */
-	if (nsegs <= NILFS_SUFILE_FREEV_PREALLOC)
-		su_bh = su_bh_prealloc;
-	else {
-		su_bh = kmalloc(sizeof(*su_bh) * nsegs, GFP_NOFS);
-		if (su_bh == NULL) {
-			ret = -ENOMEM;
-			goto out_sem;
-		}
-	}
-
-	ret = nilfs_sufile_get_header_block(sufile, &header_bh);
-	if (ret < 0)
-		goto out_su_bh;
-	for (i = 0; i < nsegs; i++) {
-		ret = nilfs_sufile_get_segment_usage_block(sufile, segnum[i],
-							   0, &su_bh[i]);
-		if (ret < 0)
-			goto out_bh;
-	}
-
-	/* free segments */
-	for (i = 0; i < nsegs; i++) {
-		kaddr = kmap_atomic(su_bh[i]->b_page, KM_USER0);
-		su = nilfs_sufile_block_get_segment_usage(
-			sufile, segnum[i], su_bh[i], kaddr);
-		WARN_ON(nilfs_segment_usage_error(su));
-		nilfs_segment_usage_set_clean(su);
+	kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
+	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
+	if (su->su_flags == cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY) &&
+	    su->su_nblocks == cpu_to_le32(0)) {
 		kunmap_atomic(kaddr, KM_USER0);
-		nilfs_mdt_mark_buffer_dirty(su_bh[i]);
+		return;
 	}
-	kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
-	header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
-	le64_add_cpu(&header->sh_ncleansegs, nsegs);
-	le64_add_cpu(&header->sh_ndirtysegs, -(u64)nsegs);
+	clean = nilfs_segment_usage_clean(su);
+	dirty = nilfs_segment_usage_dirty(su);
+
+	/* make the segment garbage */
+	su->su_lastmod = cpu_to_le64(0);
+	su->su_nblocks = cpu_to_le32(0);
+	su->su_flags = cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY);
 	kunmap_atomic(kaddr, KM_USER0);
-	nilfs_mdt_mark_buffer_dirty(header_bh);
+
+	nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1);
+	nilfs_mdt_mark_buffer_dirty(su_bh);
 	nilfs_mdt_mark_dirty(sufile);
+}
 
- out_bh:
-	for (i--; i >= 0; i--)
-		brelse(su_bh[i]);
-	brelse(header_bh);
+void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
+			  struct buffer_head *header_bh,
+			  struct buffer_head *su_bh)
+{
+	struct nilfs_segment_usage *su;
+	void *kaddr;
+	int sudirty;
 
- out_su_bh:
+	kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
338 if (su_bh != su_bh_prealloc) 299 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
339 kfree(su_bh); 300 if (nilfs_segment_usage_clean(su)) {
301 printk(KERN_WARNING "%s: segment %llu is already clean\n",
302 __func__, (unsigned long long)segnum);
303 kunmap_atomic(kaddr, KM_USER0);
304 return;
305 }
306 WARN_ON(nilfs_segment_usage_error(su));
307 WARN_ON(!nilfs_segment_usage_dirty(su));
340 308
341 out_sem: 309 sudirty = nilfs_segment_usage_dirty(su);
342 up_write(&NILFS_MDT(sufile)->mi_sem); 310 nilfs_segment_usage_set_clean(su);
343 return ret; 311 kunmap_atomic(kaddr, KM_USER0);
344} 312 nilfs_mdt_mark_buffer_dirty(su_bh);
345 313
346/** 314 nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0);
347 * nilfs_sufile_free - 315 nilfs_mdt_mark_dirty(sufile);
348 * @sufile:
349 * @segnum:
350 */
351int nilfs_sufile_free(struct inode *sufile, __u64 segnum)
352{
353 return nilfs_sufile_freev(sufile, &segnum, 1);
354} 316}
355 317
356/** 318/**
@@ -500,72 +462,28 @@ int nilfs_sufile_get_ncleansegs(struct inode *sufile, unsigned long *nsegsp)
500 return ret; 462 return ret;
501} 463}
502 464
503/** 465void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
504 * nilfs_sufile_set_error - mark a segment as erroneous 466 struct buffer_head *header_bh,
505 * @sufile: inode of segment usage file 467 struct buffer_head *su_bh)
506 * @segnum: segment number
507 *
508 * Description: nilfs_sufile_set_error() marks the segment specified by
509 * @segnum as erroneous. The error segment will never be used again.
510 *
511 * Return Value: On success, 0 is returned. On error, one of the following
512 * negative error codes is returned.
513 *
514 * %-EIO - I/O error.
515 *
516 * %-ENOMEM - Insufficient amount of memory available.
517 *
518 * %-EINVAL - Invalid segment usage number.
519 */
520int nilfs_sufile_set_error(struct inode *sufile, __u64 segnum)
521{ 468{
522 struct buffer_head *header_bh, *su_bh;
523 struct nilfs_segment_usage *su; 469 struct nilfs_segment_usage *su;
524 struct nilfs_sufile_header *header;
525 void *kaddr; 470 void *kaddr;
526 int ret; 471 int suclean;
527
528 if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) {
529 printk(KERN_WARNING "%s: invalid segment number: %llu\n",
530 __func__, (unsigned long long)segnum);
531 return -EINVAL;
532 }
533 down_write(&NILFS_MDT(sufile)->mi_sem);
534
535 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
536 if (ret < 0)
537 goto out_sem;
538 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &su_bh);
539 if (ret < 0)
540 goto out_header;
541 472
542 kaddr = kmap_atomic(su_bh->b_page, KM_USER0); 473 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
543 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); 474 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
544 if (nilfs_segment_usage_error(su)) { 475 if (nilfs_segment_usage_error(su)) {
545 kunmap_atomic(kaddr, KM_USER0); 476 kunmap_atomic(kaddr, KM_USER0);
546 brelse(su_bh); 477 return;
547 goto out_header;
548 } 478 }
549 479 suclean = nilfs_segment_usage_clean(su);
550 nilfs_segment_usage_set_error(su); 480 nilfs_segment_usage_set_error(su);
551 kunmap_atomic(kaddr, KM_USER0); 481 kunmap_atomic(kaddr, KM_USER0);
552 brelse(su_bh);
553 482
554 kaddr = kmap_atomic(header_bh->b_page, KM_USER0); 483 if (suclean)
555 header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr); 484 nilfs_sufile_mod_counter(header_bh, -1, 0);
556 le64_add_cpu(&header->sh_ndirtysegs, -1);
557 kunmap_atomic(kaddr, KM_USER0);
558 nilfs_mdt_mark_buffer_dirty(header_bh);
559 nilfs_mdt_mark_buffer_dirty(su_bh); 485 nilfs_mdt_mark_buffer_dirty(su_bh);
560 nilfs_mdt_mark_dirty(sufile); 486 nilfs_mdt_mark_dirty(sufile);
561 brelse(su_bh);
562
563 out_header:
564 brelse(header_bh);
565
566 out_sem:
567 up_write(&NILFS_MDT(sufile)->mi_sem);
568 return ret;
569} 487}
570 488
571/** 489/**
@@ -625,7 +543,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum,
625 si[i + j].sui_nblocks = le32_to_cpu(su->su_nblocks); 543 si[i + j].sui_nblocks = le32_to_cpu(su->su_nblocks);
626 si[i + j].sui_flags = le32_to_cpu(su->su_flags) & 544 si[i + j].sui_flags = le32_to_cpu(su->su_flags) &
627 ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE); 545 ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
628 if (nilfs_segment_is_active(nilfs, segnum + i + j)) 546 if (nilfs_segment_is_active(nilfs, segnum + j))
629 si[i + j].sui_flags |= 547 si[i + j].sui_flags |=
630 (1UL << NILFS_SEGMENT_USAGE_ACTIVE); 548 (1UL << NILFS_SEGMENT_USAGE_ACTIVE);
631 } 549 }
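
The hunks above strip the locking and block lookup out of each per-operation function, leaving bare do-functions. The shared dispatcher they rely on, nilfs_sufile_update(), is declared in sufile.h below, but its body falls outside the quoted hunks. A hedged reconstruction from the declaration and the locking pattern in the removed code, not necessarily the verbatim kernel source:

int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create,
			void (*dofunc)(struct inode *, __u64,
				       struct buffer_head *,
				       struct buffer_head *))
{
	struct buffer_head *header_bh, *bh;
	int ret;

	if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) {
		printk(KERN_WARNING "%s: invalid segment number: %llu\n",
		       __func__, (unsigned long long)segnum);
		return -EINVAL;
	}
	down_write(&NILFS_MDT(sufile)->mi_sem);

	ret = nilfs_sufile_get_header_block(sufile, &header_bh);
	if (ret < 0)
		goto out_sem;
	ret = nilfs_sufile_get_segment_usage_block(sufile, segnum,
						   create, &bh);
	if (!ret) {
		dofunc(sufile, segnum, header_bh, bh);	/* the real work */
		brelse(bh);
	}
	brelse(header_bh);

 out_sem:
	up_write(&NILFS_MDT(sufile)->mi_sem);
	return ret;
}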
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index d595f33a768d..a2e2efd4ade1 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -36,9 +36,6 @@ static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
36} 36}
37 37
38int nilfs_sufile_alloc(struct inode *, __u64 *); 38int nilfs_sufile_alloc(struct inode *, __u64 *);
39int nilfs_sufile_cancel_free(struct inode *, __u64);
40int nilfs_sufile_freev(struct inode *, __u64 *, size_t);
41int nilfs_sufile_free(struct inode *, __u64);
42int nilfs_sufile_get_segment_usage(struct inode *, __u64, 39int nilfs_sufile_get_segment_usage(struct inode *, __u64,
43 struct nilfs_segment_usage **, 40 struct nilfs_segment_usage **,
44 struct buffer_head **); 41 struct buffer_head **);
@@ -46,9 +43,83 @@ void nilfs_sufile_put_segment_usage(struct inode *, __u64,
46 struct buffer_head *); 43 struct buffer_head *);
47int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); 44int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
48int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *); 45int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *);
49int nilfs_sufile_set_error(struct inode *, __u64);
50ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, struct nilfs_suinfo *, 46ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, struct nilfs_suinfo *,
51 size_t); 47 size_t);
52 48
49int nilfs_sufile_update(struct inode *, __u64, int,
50 void (*dofunc)(struct inode *, __u64,
51 struct buffer_head *,
52 struct buffer_head *));
53void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
54 struct buffer_head *);
55void nilfs_sufile_do_scrap(struct inode *, __u64, struct buffer_head *,
56 struct buffer_head *);
57void nilfs_sufile_do_free(struct inode *, __u64, struct buffer_head *,
58 struct buffer_head *);
59void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
60 struct buffer_head *);
61
62/**
63 * nilfs_sufile_cancel_free - cancel freeing of a segment
64 * @sufile: inode of segment usage file
65 * @segnum: segment number
66 *
67 * Description: nilfs_sufile_cancel_free() takes the segment specified
68 * by @segnum back from the clean state to the dirty state.
69 * Return Value: On success, 0 is returned. On error, one of the following
70 * negative error codes is returned.
71 *
72 * %-EIO - I/O error.
73 *
74 * %-ENOMEM - Insufficient amount of memory available.
75 */
76static inline int nilfs_sufile_cancel_free(struct inode *sufile, __u64 segnum)
77{
78 return nilfs_sufile_update(sufile, segnum, 0,
79 nilfs_sufile_do_cancel_free);
80}
81
82/**
83 * nilfs_sufile_scrap - make a segment garbage
84 * @sufile: inode of segment usage file
85 * @segnum: segment number to be scrapped
86 */
87static inline int nilfs_sufile_scrap(struct inode *sufile, __u64 segnum)
88{
89 return nilfs_sufile_update(sufile, segnum, 1, nilfs_sufile_do_scrap);
90}
91
92/**
93 * nilfs_sufile_free - free segment
94 * @sufile: inode of segment usage file
95 * @segnum: segment number to be freed
96 */
97static inline int nilfs_sufile_free(struct inode *sufile, __u64 segnum)
98{
99 return nilfs_sufile_update(sufile, segnum, 0, nilfs_sufile_do_free);
100}
101
102/**
103 * nilfs_sufile_set_error - mark a segment as erroneous
104 * @sufile: inode of segment usage file
105 * @segnum: segment number
106 *
107 * Description: nilfs_sufile_set_error() marks the segment specified by
108 * @segnum as erroneous. The error segment will never be used again.
109 *
110 * Return Value: On success, 0 is returned. On error, one of the following
111 * negative error codes is returned.
112 *
113 * %-EIO - I/O error.
114 *
115 * %-ENOMEM - Insufficient amount of memory available.
116 *
117 * %-EINVAL - Invalid segment usage number.
118 */
119static inline int nilfs_sufile_set_error(struct inode *sufile, __u64 segnum)
120{
121 return nilfs_sufile_update(sufile, segnum, 0,
122 nilfs_sufile_do_set_error);
123}
53 124
54#endif /* _NILFS_SUFILE_H */ 125#endif /* _NILFS_SUFILE_H */
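
With the inline wrappers in place, callers pick an operation by name and the guarded update path does the rest. A hypothetical caller, for illustration only (example_mark_bad is not part of the patch):

static void example_mark_bad(struct inode *sufile, __u64 segnum)
{
	if (nilfs_sufile_set_error(sufile, segnum))
		printk(KERN_WARNING "could not mark segment %llu bad\n",
		       (unsigned long long)segnum);
}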
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index e117e1ea9bff..6989b03e97ab 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -63,7 +63,6 @@
63MODULE_AUTHOR("NTT Corp."); 63MODULE_AUTHOR("NTT Corp.");
64MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem " 64MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
65 "(NILFS)"); 65 "(NILFS)");
66MODULE_VERSION(NILFS_VERSION);
67MODULE_LICENSE("GPL"); 66MODULE_LICENSE("GPL");
68 67
69static int nilfs_remount(struct super_block *sb, int *flags, char *data); 68static int nilfs_remount(struct super_block *sb, int *flags, char *data);
@@ -476,11 +475,12 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
476{ 475{
477 struct super_block *sb = dentry->d_sb; 476 struct super_block *sb = dentry->d_sb;
478 struct nilfs_sb_info *sbi = NILFS_SB(sb); 477 struct nilfs_sb_info *sbi = NILFS_SB(sb);
478 struct the_nilfs *nilfs = sbi->s_nilfs;
479 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
479 unsigned long long blocks; 480 unsigned long long blocks;
480 unsigned long overhead; 481 unsigned long overhead;
481 unsigned long nrsvblocks; 482 unsigned long nrsvblocks;
482 sector_t nfreeblocks; 483 sector_t nfreeblocks;
483 struct the_nilfs *nilfs = sbi->s_nilfs;
484 int err; 484 int err;
485 485
486 /* 486 /*
@@ -514,6 +514,9 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
514 buf->f_files = atomic_read(&sbi->s_inodes_count); 514 buf->f_files = atomic_read(&sbi->s_inodes_count);
515 buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */ 515 buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */
516 buf->f_namelen = NILFS_NAME_LEN; 516 buf->f_namelen = NILFS_NAME_LEN;
517 buf->f_fsid.val[0] = (u32)id;
518 buf->f_fsid.val[1] = (u32)(id >> 32);
519
517 return 0; 520 return 0;
518} 521}
519 522
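
The statfs hunk derives a filesystem id from the backing device with huge_encode_dev() and splits it across the two 32-bit slots of f_fsid, low word first. The same split pulled out into a helper for clarity (example_set_fsid is illustrative, not kernel API):

#include <linux/statfs.h>

static void example_set_fsid(struct kstatfs *buf, u64 id)
{
	buf->f_fsid.val[0] = (u32)id;		/* low 32 bits */
	buf->f_fsid.val[1] = (u32)(id >> 32);	/* high 32 bits */
}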
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 33400cf0bbe2..7f65b3be4aa9 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -115,6 +115,7 @@ void put_nilfs(struct the_nilfs *nilfs)
115static int nilfs_load_super_root(struct the_nilfs *nilfs, 115static int nilfs_load_super_root(struct the_nilfs *nilfs,
116 struct nilfs_sb_info *sbi, sector_t sr_block) 116 struct nilfs_sb_info *sbi, sector_t sr_block)
117{ 117{
118 static struct lock_class_key dat_lock_key;
118 struct buffer_head *bh_sr; 119 struct buffer_head *bh_sr;
119 struct nilfs_super_root *raw_sr; 120 struct nilfs_super_root *raw_sr;
120 struct nilfs_super_block **sbp = nilfs->ns_sbp; 121 struct nilfs_super_block **sbp = nilfs->ns_sbp;
@@ -163,6 +164,9 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs,
163 if (unlikely(err)) 164 if (unlikely(err))
164 goto failed_sufile; 165 goto failed_sufile;
165 166
167 lockdep_set_class(&NILFS_MDT(nilfs->ns_dat)->mi_sem, &dat_lock_key);
168 lockdep_set_class(&NILFS_MDT(nilfs->ns_gc_dat)->mi_sem, &dat_lock_key);
169
166 nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat); 170 nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat);
167 nilfs_mdt_set_entry_size(nilfs->ns_cpfile, checkpoint_size, 171 nilfs_mdt_set_entry_size(nilfs->ns_cpfile, checkpoint_size,
168 sizeof(struct nilfs_cpfile_header)); 172 sizeof(struct nilfs_cpfile_header));
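
The lockdep hunk gives the DAT and GC-DAT semaphores a class key of their own, presumably so that taking a DAT semaphore while another metadata-file semaphore of the default class is held is not misreported as recursive locking. The general pattern, sketched with hypothetical names:

#include <linux/lockdep.h>
#include <linux/rwsem.h>

static struct lock_class_key example_nested_key;	/* hypothetical */

/* Locks initialised by common code all share one lockdep class; an
 * instance that legitimately nests inside its siblings gets a key of
 * its own so lockdep can tell the two levels apart. */
static void example_distinguish(struct rw_semaphore *nested_sem)
{
	lockdep_set_class(nested_sem, &example_nested_key);
}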
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 7d604480557a..b574431a031d 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -290,6 +290,21 @@ out_attach:
290 else 290 else
291 mlog_errno(ret); 291 mlog_errno(ret);
292 292
293 /*
294 * In case of error, manually free the allocation and do the iput().
295 * We need to do this because error here means no d_instantiate(),
296 * which means iput() will not be called during dput(dentry).
297 */
298 if (ret < 0 && !alias) {
299 ocfs2_lock_res_free(&dl->dl_lockres);
300 BUG_ON(dl->dl_count != 1);
301 spin_lock(&dentry_attach_lock);
302 dentry->d_fsdata = NULL;
303 spin_unlock(&dentry_attach_lock);
304 kfree(dl);
305 iput(inode);
306 }
307
293 dput(alias); 308 dput(alias);
294 309
295 return ret; 310 return ret;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index e71160cda110..c5752305627c 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2697,7 +2697,7 @@ static int ocfs2_dx_dir_index_block(struct inode *dir,
2697 u32 *num_dx_entries, 2697 u32 *num_dx_entries,
2698 struct buffer_head *dirent_bh) 2698 struct buffer_head *dirent_bh)
2699{ 2699{
2700 int ret, namelen, i; 2700 int ret = 0, namelen, i;
2701 char *de_buf, *limit; 2701 char *de_buf, *limit;
2702 struct ocfs2_dir_entry *de; 2702 struct ocfs2_dir_entry *de;
2703 struct buffer_head *dx_leaf_bh; 2703 struct buffer_head *dx_leaf_bh;
@@ -2934,7 +2934,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
2934 */ 2934 */
2935 BUG_ON(alloc > 2); 2935 BUG_ON(alloc > 2);
2936 2936
2937 ret = ocfs2_reserve_clusters(osb, alloc, &data_ac); 2937 ret = ocfs2_reserve_clusters(osb, alloc + dx_alloc, &data_ac);
2938 if (ret) { 2938 if (ret) {
2939 mlog_errno(ret); 2939 mlog_errno(ret);
2940 goto out; 2940 goto out;
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index de3da8eb558c..15713cbb865c 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -100,7 +100,8 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
100 100
101 /* If the inode allocator bit is clear, this inode must be stale */ 101 /* If the inode allocator bit is clear, this inode must be stale */
102 if (!set) { 102 if (!set) {
103 mlog(0, "inode %llu suballoc bit is clear\n", blkno); 103 mlog(0, "inode %llu suballoc bit is clear\n",
104 (unsigned long long)blkno);
104 status = -ESTALE; 105 status = -ESTALE;
105 goto unlock_nfs_sync; 106 goto unlock_nfs_sync;
106 } 107 }
@@ -114,7 +115,7 @@ check_err:
114 if (status < 0) { 115 if (status < 0) {
115 if (status == -ESTALE) { 116 if (status == -ESTALE) {
116 mlog(0, "stale inode ino: %llu generation: %u\n", 117 mlog(0, "stale inode ino: %llu generation: %u\n",
117 blkno, handle->ih_generation); 118 (unsigned long long)blkno, handle->ih_generation);
118 } 119 }
119 result = ERR_PTR(status); 120 result = ERR_PTR(status);
120 goto bail; 121 goto bail;
@@ -129,8 +130,8 @@ check_err:
129check_gen: 130check_gen:
130 if (handle->ih_generation != inode->i_generation) { 131 if (handle->ih_generation != inode->i_generation) {
131 iput(inode); 132 iput(inode);
132 mlog(0, "stale inode ino: %llu generation: %u\n", blkno, 133 mlog(0, "stale inode ino: %llu generation: %u\n",
133 handle->ih_generation); 134 (unsigned long long)blkno, handle->ih_generation);
134 result = ERR_PTR(-ESTALE); 135 result = ERR_PTR(-ESTALE);
135 goto bail; 136 goto bail;
136 } 137 }
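
All three export.c hunks are the same fix: u64 may be unsigned long on 64-bit configurations, so handing it to a %llu conversion without a cast is not portable. A minimal userspace demonstration:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t blkno = 123456789ULL;	/* u64 in kernel terms */

	/* without the cast this is undefined where uint64_t is
	 * unsigned long rather than unsigned long long */
	printf("inode %llu suballoc bit is clear\n",
	       (unsigned long long)blkno);
	return 0;
}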
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 8672b9536039..c2a87c885b73 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1912,6 +1912,22 @@ out_sems:
1912 return written ? written : ret; 1912 return written ? written : ret;
1913} 1913}
1914 1914
1915static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
1916 struct file *out,
1917 struct splice_desc *sd)
1918{
1919 int ret;
1920
1921 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos,
1922 sd->total_len, 0, NULL);
1923 if (ret < 0) {
1924 mlog_errno(ret);
1925 return ret;
1926 }
1927
1928 return splice_from_pipe_feed(pipe, sd, pipe_to_file);
1929}
1930
1915static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, 1931static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1916 struct file *out, 1932 struct file *out,
1917 loff_t *ppos, 1933 loff_t *ppos,
@@ -1919,38 +1935,76 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1919 unsigned int flags) 1935 unsigned int flags)
1920{ 1936{
1921 int ret; 1937 int ret;
1922 struct inode *inode = out->f_path.dentry->d_inode; 1938 struct address_space *mapping = out->f_mapping;
1939 struct inode *inode = mapping->host;
1940 struct splice_desc sd = {
1941 .total_len = len,
1942 .flags = flags,
1943 .pos = *ppos,
1944 .u.file = out,
1945 };
1923 1946
1924 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, 1947 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
1925 (unsigned int)len, 1948 (unsigned int)len,
1926 out->f_path.dentry->d_name.len, 1949 out->f_path.dentry->d_name.len,
1927 out->f_path.dentry->d_name.name); 1950 out->f_path.dentry->d_name.name);
1928 1951
1929 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); 1952 if (pipe->inode)
1953 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);
1930 1954
1931 ret = ocfs2_rw_lock(inode, 1); 1955 splice_from_pipe_begin(&sd);
1932 if (ret < 0) { 1956 do {
1933 mlog_errno(ret); 1957 ret = splice_from_pipe_next(pipe, &sd);
1934 goto out; 1958 if (ret <= 0)
1935 } 1959 break;
1936 1960
1937 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, 1961 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
1938 NULL); 1962 ret = ocfs2_rw_lock(inode, 1);
1939 if (ret < 0) { 1963 if (ret < 0)
1940 mlog_errno(ret); 1964 mlog_errno(ret);
1941 goto out_unlock; 1965 else {
1942 } 1966 ret = ocfs2_splice_to_file(pipe, out, &sd);
1967 ocfs2_rw_unlock(inode, 1);
1968 }
1969 mutex_unlock(&inode->i_mutex);
1970 } while (ret > 0);
1971 splice_from_pipe_end(pipe, &sd);
1943 1972
1944 if (pipe->inode) 1973 if (pipe->inode)
1945 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);
1946 ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags);
1947 if (pipe->inode)
1948 mutex_unlock(&pipe->inode->i_mutex); 1974 mutex_unlock(&pipe->inode->i_mutex);
1949 1975
1950out_unlock: 1976 if (sd.num_spliced)
1951 ocfs2_rw_unlock(inode, 1); 1977 ret = sd.num_spliced;
1952out: 1978
1953 mutex_unlock(&inode->i_mutex); 1979 if (ret > 0) {
1980 unsigned long nr_pages;
1981
1982 *ppos += ret;
1983 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1984
1985 /*
1986 * If file or inode is SYNC and we actually wrote some data,
1987 * sync it.
1988 */
1989 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
1990 int err;
1991
1992 mutex_lock(&inode->i_mutex);
1993 err = ocfs2_rw_lock(inode, 1);
1994 if (err < 0) {
1995 mlog_errno(err);
1996 } else {
1997 err = generic_osync_inode(inode, mapping,
1998 OSYNC_METADATA|OSYNC_DATA);
1999 ocfs2_rw_unlock(inode, 1);
2000 }
2001 mutex_unlock(&inode->i_mutex);
2002
2003 if (err)
2004 ret = err;
2005 }
2006 balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
2007 }
1954 2008
1955 mlog_exit(ret); 2009 mlog_exit(ret);
1956 return ret; 2010 return ret;
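
ocfs2_file_splice_write() above drives the new splice_from_pipe_{begin,next,feed,end} helpers itself so it can take i_mutex and its cluster lock around each batch of pipe buffers. The loop's shape, distilled (illustrative; the actor and the locks are placeholders):

static ssize_t example_splice_write(struct pipe_inode_info *pipe,
				    struct splice_desc *sd,
				    splice_actor *actor)
{
	ssize_t ret;

	splice_from_pipe_begin(sd);
	do {
		ret = splice_from_pipe_next(pipe, sd);	/* wait for buffers */
		if (ret <= 0)
			break;
		/* take i_mutex / cluster locks here ... */
		ret = splice_from_pipe_feed(pipe, sd, actor);
		/* ... and drop them before sleeping again */
	} while (ret > 0);
	splice_from_pipe_end(pipe, sd);

	return sd->num_spliced ? sd->num_spliced : ret;
}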
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 619dd7f6c053..eb7b76331eb7 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -437,8 +437,9 @@ static inline int ocfs2_unlink_credits(struct super_block *sb)
437} 437}
438 438
439/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry + 439/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
440 * inode alloc group descriptor + orphan dir index leaf */ 440 * inode alloc group descriptor + orphan dir index root +
441#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3) 441 * orphan dir index leaf */
442#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4)
442 443
443/* dinode update, old dir dinode update, new dir dinode update, old 444/* dinode update, old dir dinode update, new dir dinode update, old
444 * dir dir entry, new dir dir entry, dir entry update for renaming 445 * dir dir entry, new dir dir entry, dir entry update for renaming
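
The comment gains "orphan dir index root" and the constant grows from +3 to +4 to match. Spelled out, with the value of the inode-update constant assumed purely for the example:

#define EXAMPLE_INODE_UPDATE_CREDITS 2	/* assumption, not from the patch */
#define EXAMPLE_DELETE_INODE_CREDITS (3 * EXAMPLE_INODE_UPDATE_CREDITS + 4)
/* 3 inode updates: dinode, orphan dir dinode, inode alloc dinode;
 * 4 block updates: orphan dir entry, inode alloc group descriptor,
 *		    orphan dir index root, orphan dir index leaf */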
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2220f93f668b..33464c6b60a2 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1025,10 +1025,8 @@ static int ocfs2_rename(struct inode *old_dir,
1025 struct inode *orphan_dir = NULL; 1025 struct inode *orphan_dir = NULL;
1026 struct ocfs2_dinode *newfe = NULL; 1026 struct ocfs2_dinode *newfe = NULL;
1027 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; 1027 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
1028 struct buffer_head *orphan_entry_bh = NULL;
1029 struct buffer_head *newfe_bh = NULL; 1028 struct buffer_head *newfe_bh = NULL;
1030 struct buffer_head *old_inode_bh = NULL; 1029 struct buffer_head *old_inode_bh = NULL;
1031 struct buffer_head *insert_entry_bh = NULL;
1032 struct ocfs2_super *osb = NULL; 1030 struct ocfs2_super *osb = NULL;
1033 u64 newfe_blkno, old_de_ino; 1031 u64 newfe_blkno, old_de_ino;
1034 handle_t *handle = NULL; 1032 handle_t *handle = NULL;
@@ -1455,8 +1453,6 @@ bail:
1455 brelse(old_inode_bh); 1453 brelse(old_inode_bh);
1456 brelse(old_dir_bh); 1454 brelse(old_dir_bh);
1457 brelse(new_dir_bh); 1455 brelse(new_dir_bh);
1458 brelse(orphan_entry_bh);
1459 brelse(insert_entry_bh);
1460 1456
1461 mlog_exit(status); 1457 mlog_exit(status);
1462 1458
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index b4ca5911caaf..8439f6b324b9 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -2197,26 +2197,29 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2197 struct buffer_head *inode_bh = NULL; 2197 struct buffer_head *inode_bh = NULL;
2198 struct ocfs2_dinode *inode_fe; 2198 struct ocfs2_dinode *inode_fe;
2199 2199
2200 mlog_entry("blkno: %llu\n", blkno); 2200 mlog_entry("blkno: %llu\n", (unsigned long long)blkno);
2201 2201
2202 /* dirty read disk */ 2202 /* dirty read disk */
2203 status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh); 2203 status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
2204 if (status < 0) { 2204 if (status < 0) {
2205 mlog(ML_ERROR, "read block %llu failed %d\n", blkno, status); 2205 mlog(ML_ERROR, "read block %llu failed %d\n",
2206 (unsigned long long)blkno, status);
2206 goto bail; 2207 goto bail;
2207 } 2208 }
2208 2209
2209 inode_fe = (struct ocfs2_dinode *) inode_bh->b_data; 2210 inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
2210 if (!OCFS2_IS_VALID_DINODE(inode_fe)) { 2211 if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
2211 mlog(ML_ERROR, "invalid inode %llu requested\n", blkno); 2212 mlog(ML_ERROR, "invalid inode %llu requested\n",
2213 (unsigned long long)blkno);
2212 status = -EINVAL; 2214 status = -EINVAL;
2213 goto bail; 2215 goto bail;
2214 } 2216 }
2215 2217
2216 if (le16_to_cpu(inode_fe->i_suballoc_slot) != OCFS2_INVALID_SLOT && 2218 if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
2217 (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) { 2219 (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
2218 mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n", 2220 mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
2219 blkno, (u32)le16_to_cpu(inode_fe->i_suballoc_slot)); 2221 (unsigned long long)blkno,
2222 (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
2220 status = -EINVAL; 2223 status = -EINVAL;
2221 goto bail; 2224 goto bail;
2222 } 2225 }
@@ -2251,7 +2254,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2251 u64 bg_blkno; 2254 u64 bg_blkno;
2252 int status; 2255 int status;
2253 2256
2254 mlog_entry("blkno: %llu bit: %u\n", blkno, (unsigned int)bit); 2257 mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
2258 (unsigned int)bit);
2255 2259
2256 alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data; 2260 alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
2257 if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) { 2261 if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) {
@@ -2266,7 +2270,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2266 status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno, 2270 status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno,
2267 &group_bh); 2271 &group_bh);
2268 if (status < 0) { 2272 if (status < 0) {
2269 mlog(ML_ERROR, "read group %llu failed %d\n", bg_blkno, status); 2273 mlog(ML_ERROR, "read group %llu failed %d\n",
2274 (unsigned long long)bg_blkno, status);
2270 goto bail; 2275 goto bail;
2271 } 2276 }
2272 2277
@@ -2300,7 +2305,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2300 struct inode *inode_alloc_inode; 2305 struct inode *inode_alloc_inode;
2301 struct buffer_head *alloc_bh = NULL; 2306 struct buffer_head *alloc_bh = NULL;
2302 2307
2303 mlog_entry("blkno: %llu", blkno); 2308 mlog_entry("blkno: %llu", (unsigned long long)blkno);
2304 2309
2305 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, 2310 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2306 &suballoc_bit); 2311 &suballoc_bit);
diff --git a/fs/pipe.c b/fs/pipe.c
index 4af7aa521813..13414ec45b8d 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -37,6 +37,42 @@
37 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09 37 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
38 */ 38 */
39 39
40static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
41{
42 if (pipe->inode)
43 mutex_lock_nested(&pipe->inode->i_mutex, subclass);
44}
45
46void pipe_lock(struct pipe_inode_info *pipe)
47{
48 /*
49 * pipe_lock() nests non-pipe inode locks (for writing to a file)
50 */
51 pipe_lock_nested(pipe, I_MUTEX_PARENT);
52}
53EXPORT_SYMBOL(pipe_lock);
54
55void pipe_unlock(struct pipe_inode_info *pipe)
56{
57 if (pipe->inode)
58 mutex_unlock(&pipe->inode->i_mutex);
59}
60EXPORT_SYMBOL(pipe_unlock);
61
62void pipe_double_lock(struct pipe_inode_info *pipe1,
63 struct pipe_inode_info *pipe2)
64{
65 BUG_ON(pipe1 == pipe2);
66
67 if (pipe1 < pipe2) {
68 pipe_lock_nested(pipe1, I_MUTEX_PARENT);
69 pipe_lock_nested(pipe2, I_MUTEX_CHILD);
70 } else {
71 pipe_lock_nested(pipe2, I_MUTEX_CHILD);
72 pipe_lock_nested(pipe1, I_MUTEX_PARENT);
73 }
74}
75
40/* Drop the inode semaphore and wait for a pipe event, atomically */ 76/* Drop the inode semaphore and wait for a pipe event, atomically */
41void pipe_wait(struct pipe_inode_info *pipe) 77void pipe_wait(struct pipe_inode_info *pipe)
42{ 78{
@@ -47,12 +83,10 @@ void pipe_wait(struct pipe_inode_info *pipe)
47 * is considered a noninteractive wait: 83 * is considered a noninteractive wait:
48 */ 84 */
49 prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); 85 prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
50 if (pipe->inode) 86 pipe_unlock(pipe);
51 mutex_unlock(&pipe->inode->i_mutex);
52 schedule(); 87 schedule();
53 finish_wait(&pipe->wait, &wait); 88 finish_wait(&pipe->wait, &wait);
54 if (pipe->inode) 89 pipe_lock(pipe);
55 mutex_lock(&pipe->inode->i_mutex);
56} 90}
57 91
58static int 92static int
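
pipe_double_lock() fixes the order of two same-class locks by comparing their addresses, the standard way to rule out ABBA deadlock when no natural parent/child relation exists. The same idea in plain pthreads:

#include <assert.h>
#include <pthread.h>

/* Taking two locks of the same kind in address order gives every
 * caller one global order, so no two callers can deadlock on them. */
static void example_double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
	assert(a != b);		/* mirrors the BUG_ON() above */
	if (a < b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}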
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 7e4877d9dcb5..725a650bbbb8 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -80,6 +80,7 @@
80#include <linux/delayacct.h> 80#include <linux/delayacct.h>
81#include <linux/seq_file.h> 81#include <linux/seq_file.h>
82#include <linux/pid_namespace.h> 82#include <linux/pid_namespace.h>
83#include <linux/ptrace.h>
83#include <linux/tracehook.h> 84#include <linux/tracehook.h>
84 85
85#include <asm/pgtable.h> 86#include <asm/pgtable.h>
@@ -352,6 +353,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
352 char state; 353 char state;
353 pid_t ppid = 0, pgid = -1, sid = -1; 354 pid_t ppid = 0, pgid = -1, sid = -1;
354 int num_threads = 0; 355 int num_threads = 0;
356 int permitted;
355 struct mm_struct *mm; 357 struct mm_struct *mm;
356 unsigned long long start_time; 358 unsigned long long start_time;
357 unsigned long cmin_flt = 0, cmaj_flt = 0; 359 unsigned long cmin_flt = 0, cmaj_flt = 0;
@@ -364,11 +366,14 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
364 366
365 state = *get_task_state(task); 367 state = *get_task_state(task);
366 vsize = eip = esp = 0; 368 vsize = eip = esp = 0;
369 permitted = ptrace_may_access(task, PTRACE_MODE_READ);
367 mm = get_task_mm(task); 370 mm = get_task_mm(task);
368 if (mm) { 371 if (mm) {
369 vsize = task_vsize(mm); 372 vsize = task_vsize(mm);
370 eip = KSTK_EIP(task); 373 if (permitted) {
371 esp = KSTK_ESP(task); 374 eip = KSTK_EIP(task);
375 esp = KSTK_ESP(task);
376 }
372 } 377 }
373 378
374 get_task_comm(tcomm, task); 379 get_task_comm(tcomm, task);
@@ -424,7 +429,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
424 unlock_task_sighand(task, &flags); 429 unlock_task_sighand(task, &flags);
425 } 430 }
426 431
427 if (!whole || num_threads < 2) 432 if (permitted && (!whole || num_threads < 2))
428 wchan = get_wchan(task); 433 wchan = get_wchan(task);
429 if (!whole) { 434 if (!whole) {
430 min_flt = task->min_flt; 435 min_flt = task->min_flt;
@@ -476,7 +481,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
476 rsslim, 481 rsslim,
477 mm ? mm->start_code : 0, 482 mm ? mm->start_code : 0,
478 mm ? mm->end_code : 0, 483 mm ? mm->end_code : 0,
479 mm ? mm->start_stack : 0, 484 (permitted && mm) ? mm->start_stack : 0,
480 esp, 485 esp,
481 eip, 486 eip,
482 /* The signal information here is obsolete. 487 /* The signal information here is obsolete.
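
The do_task_stat() hunks all apply one rule: fields that leak the task's register state or stack layout are computed only when the reader passes the same check ptrace would apply, and read as zero otherwise, so the read itself still succeeds. The rule distilled (example_fill_regs is not in the patch):

static void example_fill_regs(struct task_struct *task,
			      unsigned long *eip, unsigned long *esp)
{
	int permitted = ptrace_may_access(task, PTRACE_MODE_READ);

	*eip = permitted ? KSTK_EIP(task) : 0;	/* 0, not -EPERM: the */
	*esp = permitted ? KSTK_ESP(task) : 0;	/* read still succeeds */
}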
diff --git a/fs/proc/base.c b/fs/proc/base.c
index f71559784bfb..fb45615943c2 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -322,7 +322,10 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
322 wchan = get_wchan(task); 322 wchan = get_wchan(task);
323 323
324 if (lookup_symbol_name(wchan, symname) < 0) 324 if (lookup_symbol_name(wchan, symname) < 0)
325 return sprintf(buffer, "%lu", wchan); 325 if (!ptrace_may_access(task, PTRACE_MODE_READ))
326 return 0;
327 else
328 return sprintf(buffer, "%lu", wchan);
326 else 329 else
327 return sprintf(buffer, "%s", symname); 330 return sprintf(buffer, "%s", symname);
328} 331}
@@ -648,14 +651,14 @@ static unsigned mounts_poll(struct file *file, poll_table *wait)
648{ 651{
649 struct proc_mounts *p = file->private_data; 652 struct proc_mounts *p = file->private_data;
650 struct mnt_namespace *ns = p->ns; 653 struct mnt_namespace *ns = p->ns;
651 unsigned res = 0; 654 unsigned res = POLLIN | POLLRDNORM;
652 655
653 poll_wait(file, &ns->poll, wait); 656 poll_wait(file, &ns->poll, wait);
654 657
655 spin_lock(&vfsmount_lock); 658 spin_lock(&vfsmount_lock);
656 if (p->event != ns->event) { 659 if (p->event != ns->event) {
657 p->event = ns->event; 660 p->event = ns->event;
658 res = POLLERR; 661 res |= POLLERR | POLLPRI;
659 } 662 }
660 spin_unlock(&vfsmount_lock); 663 spin_unlock(&vfsmount_lock);
661 664
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 74ea974f5ca6..c6b0302af4c4 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -35,7 +35,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
35#define K(x) ((x) << (PAGE_SHIFT - 10)) 35#define K(x) ((x) << (PAGE_SHIFT - 10))
36 si_meminfo(&i); 36 si_meminfo(&i);
37 si_swapinfo(&i); 37 si_swapinfo(&i);
38 committed = atomic_long_read(&vm_committed_space); 38 committed = percpu_counter_read_positive(&vm_committed_as);
39 allowed = ((totalram_pages - hugetlb_total_pages()) 39 allowed = ((totalram_pages - hugetlb_total_pages())
40 * sysctl_overcommit_ratio / 100) + total_swap_pages; 40 * sysctl_overcommit_ratio / 100) + total_swap_pages;
41 41
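
vm_committed_space becomes the per-CPU counter vm_committed_as, trading an exact global atomic for a scalable approximate one; _read_positive() clamps the momentarily negative sums that per-CPU batching can produce. The API in miniature, a hedged sketch with error handling elided:

#include <linux/percpu_counter.h>

static struct percpu_counter example_committed;	/* hypothetical counter */

static void example(void)
{
	percpu_counter_init(&example_committed, 0);	/* can fail; elided */
	percpu_counter_add(&example_committed, 128);	/* cheap, per-CPU */
	printk(KERN_INFO "committed: %lld\n",
	       (long long)percpu_counter_read_positive(&example_committed));
	percpu_counter_destroy(&example_committed);
}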
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index f75efa22df5e..81e4eb60972e 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -18,6 +18,9 @@
18#ifndef arch_irq_stat 18#ifndef arch_irq_stat
19#define arch_irq_stat() 0 19#define arch_irq_stat() 0
20#endif 20#endif
21#ifndef arch_idle_time
22#define arch_idle_time(cpu) 0
23#endif
21 24
22static int show_stat(struct seq_file *p, void *v) 25static int show_stat(struct seq_file *p, void *v)
23{ 26{
@@ -40,6 +43,7 @@ static int show_stat(struct seq_file *p, void *v)
40 nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); 43 nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
41 system = cputime64_add(system, kstat_cpu(i).cpustat.system); 44 system = cputime64_add(system, kstat_cpu(i).cpustat.system);
42 idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle); 45 idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle);
46 idle = cputime64_add(idle, arch_idle_time(i));
43 iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait); 47 iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait);
44 irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); 48 irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
45 softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); 49 softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
@@ -69,6 +73,7 @@ static int show_stat(struct seq_file *p, void *v)
69 nice = kstat_cpu(i).cpustat.nice; 73 nice = kstat_cpu(i).cpustat.nice;
70 system = kstat_cpu(i).cpustat.system; 74 system = kstat_cpu(i).cpustat.system;
71 idle = kstat_cpu(i).cpustat.idle; 75 idle = kstat_cpu(i).cpustat.idle;
76 idle = cputime64_add(idle, arch_idle_time(i));
72 iowait = kstat_cpu(i).cpustat.iowait; 77 iowait = kstat_cpu(i).cpustat.iowait;
73 irq = kstat_cpu(i).cpustat.irq; 78 irq = kstat_cpu(i).cpustat.irq;
74 softirq = kstat_cpu(i).cpustat.softirq; 79 softirq = kstat_cpu(i).cpustat.softirq;
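
arch_idle_time() follows the same convention as arch_irq_stat() just above it: the generic file supplies a zero default that an architecture header may pre-define. The pattern in miniature, as a standalone program:

#include <stdio.h>

#ifndef arch_idle_time
#define arch_idle_time(cpu) 0	/* default unless the arch overrides it */
#endif

int main(void)
{
	long idle = 100;

	idle += arch_idle_time(0);	/* no-op with the default */
	printf("idle = %ld\n", idle);
	return 0;
}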
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 39e4ad4f59f4..6f61b7cc32e0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -665,6 +665,10 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
665 goto out_task; 665 goto out_task;
666 666
667 ret = 0; 667 ret = 0;
668
669 if (!count)
670 goto out_task;
671
668 mm = get_task_mm(task); 672 mm = get_task_mm(task);
669 if (!mm) 673 if (!mm)
670 goto out_task; 674 goto out_task;
diff --git a/fs/quota/Makefile b/fs/quota/Makefile
index 385a0831cc99..68d4f6dc0578 100644
--- a/fs/quota/Makefile
+++ b/fs/quota/Makefile
@@ -1,12 +1,3 @@
1#
2# Makefile for the Linux filesystems.
3#
4# 14 Sep 2000, Christoph Hellwig <hch@infradead.org>
5# Rewritten to use lists instead of if-statements.
6#
7
8obj-y :=
9
10obj-$(CONFIG_QUOTA) += dquot.o 1obj-$(CONFIG_QUOTA) += dquot.o
11obj-$(CONFIG_QFMT_V1) += quota_v1.o 2obj-$(CONFIG_QFMT_V1) += quota_v1.o
12obj-$(CONFIG_QFMT_V2) += quota_v2.o 3obj-$(CONFIG_QFMT_V2) += quota_v2.o
diff --git a/fs/romfs/internal.h b/fs/romfs/internal.h
index 06044a9dc62d..95217b830118 100644
--- a/fs/romfs/internal.h
+++ b/fs/romfs/internal.h
@@ -43,5 +43,5 @@ extern int romfs_dev_read(struct super_block *sb, unsigned long pos,
43 void *buf, size_t buflen); 43 void *buf, size_t buflen);
44extern ssize_t romfs_dev_strnlen(struct super_block *sb, 44extern ssize_t romfs_dev_strnlen(struct super_block *sb,
45 unsigned long pos, size_t maxlen); 45 unsigned long pos, size_t maxlen);
46extern int romfs_dev_strncmp(struct super_block *sb, unsigned long pos, 46extern int romfs_dev_strcmp(struct super_block *sb, unsigned long pos,
47 const char *str, size_t size); 47 const char *str, size_t size);
diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c
index 7e3e1e12a081..b3208adf8e71 100644
--- a/fs/romfs/storage.c
+++ b/fs/romfs/storage.c
@@ -67,26 +67,35 @@ static ssize_t romfs_mtd_strnlen(struct super_block *sb,
67 * compare a string to one in a romfs image on MTD 67 * compare a string to one in a romfs image on MTD
68 * - return 1 if matched, 0 if differ, -ve if error 68 * - return 1 if matched, 0 if differ, -ve if error
69 */ 69 */
70static int romfs_mtd_strncmp(struct super_block *sb, unsigned long pos, 70static int romfs_mtd_strcmp(struct super_block *sb, unsigned long pos,
71 const char *str, size_t size) 71 const char *str, size_t size)
72{ 72{
73 u_char buf[16]; 73 u_char buf[17];
74 size_t len, segment; 74 size_t len, segment;
75 int ret; 75 int ret;
76 76
77 /* scan the string up to 16 bytes at a time */ 77 /* scan the string up to 16 bytes at a time, and attempt to grab the
78 * trailing NUL whilst we're at it */
79 buf[0] = 0xff;
80
78 while (size > 0) { 81 while (size > 0) {
79 segment = min_t(size_t, size, 16); 82 segment = min_t(size_t, size + 1, 17);
80 ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf); 83 ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf);
81 if (ret < 0) 84 if (ret < 0)
82 return ret; 85 return ret;
86 len--;
83 if (memcmp(buf, str, len) != 0) 87 if (memcmp(buf, str, len) != 0)
84 return 0; 88 return 0;
89 buf[0] = buf[len];
85 size -= len; 90 size -= len;
86 pos += len; 91 pos += len;
87 str += len; 92 str += len;
88 } 93 }
89 94
 95 /* check that the trailing NUL was read */
96 if (buf[0])
97 return 0;
98
90 return 1; 99 return 1;
91} 100}
92#endif /* CONFIG_ROMFS_ON_MTD */ 101#endif /* CONFIG_ROMFS_ON_MTD */
@@ -111,6 +120,7 @@ static int romfs_blk_read(struct super_block *sb, unsigned long pos,
111 return -EIO; 120 return -EIO;
112 memcpy(buf, bh->b_data + offset, segment); 121 memcpy(buf, bh->b_data + offset, segment);
113 brelse(bh); 122 brelse(bh);
123 buf += segment;
114 buflen -= segment; 124 buflen -= segment;
115 pos += segment; 125 pos += segment;
116 } 126 }
@@ -154,28 +164,48 @@ static ssize_t romfs_blk_strnlen(struct super_block *sb,
154 * compare a string to one in a romfs image on a block device 164 * compare a string to one in a romfs image on a block device
155 * - return 1 if matched, 0 if differ, -ve if error 165 * - return 1 if matched, 0 if differ, -ve if error
156 */ 166 */
157static int romfs_blk_strncmp(struct super_block *sb, unsigned long pos, 167static int romfs_blk_strcmp(struct super_block *sb, unsigned long pos,
158 const char *str, size_t size) 168 const char *str, size_t size)
159{ 169{
160 struct buffer_head *bh; 170 struct buffer_head *bh;
161 unsigned long offset; 171 unsigned long offset;
162 size_t segment; 172 size_t segment;
163 bool x; 173 bool matched, terminated = false;
164 174
165 /* scan the string up to 16 bytes at a time */ 175 /* compare string up to a block at a time */
166 while (size > 0) { 176 while (size > 0) {
167 offset = pos & (ROMBSIZE - 1); 177 offset = pos & (ROMBSIZE - 1);
168 segment = min_t(size_t, size, ROMBSIZE - offset); 178 segment = min_t(size_t, size, ROMBSIZE - offset);
169 bh = sb_bread(sb, pos >> ROMBSBITS); 179 bh = sb_bread(sb, pos >> ROMBSBITS);
170 if (!bh) 180 if (!bh)
171 return -EIO; 181 return -EIO;
172 x = (memcmp(bh->b_data + offset, str, segment) != 0); 182 matched = (memcmp(bh->b_data + offset, str, segment) == 0);
173 brelse(bh); 183
174 if (x)
175 return 0;
176 size -= segment; 184 size -= segment;
177 pos += segment; 185 pos += segment;
178 str += segment; 186 str += segment;
187 if (matched && size == 0 && offset + segment < ROMBSIZE) {
188 if (!bh->b_data[offset + segment])
189 terminated = true;
190 else
191 matched = false;
192 }
193 brelse(bh);
194 if (!matched)
195 return 0;
196 }
197
198 if (!terminated) {
199 /* the terminating NUL must be on the first byte of the next
200 * block */
201 BUG_ON((pos & (ROMBSIZE - 1)) != 0);
202 bh = sb_bread(sb, pos >> ROMBSBITS);
203 if (!bh)
204 return -EIO;
205 matched = !bh->b_data[0];
206 brelse(bh);
207 if (!matched)
208 return 0;
179 } 209 }
180 210
181 return 1; 211 return 1;
@@ -234,10 +264,12 @@ ssize_t romfs_dev_strnlen(struct super_block *sb,
234 264
235/* 265/*
236 * compare a string to one in romfs 266 * compare a string to one in romfs
267 * - the string to be compared to, str, may not be NUL-terminated; instead the
268 * string is of the specified size
237 * - return 1 if matched, 0 if differ, -ve if error 269 * - return 1 if matched, 0 if differ, -ve if error
238 */ 270 */
239int romfs_dev_strncmp(struct super_block *sb, unsigned long pos, 271int romfs_dev_strcmp(struct super_block *sb, unsigned long pos,
240 const char *str, size_t size) 272 const char *str, size_t size)
241{ 273{
242 size_t limit; 274 size_t limit;
243 275
@@ -246,16 +278,16 @@ int romfs_dev_strncmp(struct super_block *sb, unsigned long pos,
246 return -EIO; 278 return -EIO;
247 if (size > ROMFS_MAXFN) 279 if (size > ROMFS_MAXFN)
248 return -ENAMETOOLONG; 280 return -ENAMETOOLONG;
249 if (size > limit - pos) 281 if (size + 1 > limit - pos)
250 return -EIO; 282 return -EIO;
251 283
252#ifdef CONFIG_ROMFS_ON_MTD 284#ifdef CONFIG_ROMFS_ON_MTD
253 if (sb->s_mtd) 285 if (sb->s_mtd)
254 return romfs_mtd_strncmp(sb, pos, str, size); 286 return romfs_mtd_strcmp(sb, pos, str, size);
255#endif 287#endif
256#ifdef CONFIG_ROMFS_ON_BLOCK 288#ifdef CONFIG_ROMFS_ON_BLOCK
257 if (sb->s_bdev) 289 if (sb->s_bdev)
258 return romfs_blk_strncmp(sb, pos, str, size); 290 return romfs_blk_strcmp(sb, pos, str, size);
259#endif 291#endif
260 return -EIO; 292 return -EIO;
261} 293}
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 10ca7d984a8b..c53b5ef8a02f 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -240,8 +240,8 @@ static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry,
240 goto error; 240 goto error;
241 241
242 /* try to match the first 16 bytes of name */ 242 /* try to match the first 16 bytes of name */
243 ret = romfs_dev_strncmp(dir->i_sb, offset + ROMFH_SIZE, name, 243 ret = romfs_dev_strcmp(dir->i_sb, offset + ROMFH_SIZE, name,
244 len); 244 len);
245 if (ret < 0) 245 if (ret < 0)
246 goto error; 246 goto error;
247 if (ret == 1) 247 if (ret == 1)
diff --git a/fs/splice.c b/fs/splice.c
index c18aa7e03e2b..666953d59a35 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -182,8 +182,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
182 do_wakeup = 0; 182 do_wakeup = 0;
183 page_nr = 0; 183 page_nr = 0;
184 184
185 if (pipe->inode) 185 pipe_lock(pipe);
186 mutex_lock(&pipe->inode->i_mutex);
187 186
188 for (;;) { 187 for (;;) {
189 if (!pipe->readers) { 188 if (!pipe->readers) {
@@ -245,15 +244,13 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
245 pipe->waiting_writers--; 244 pipe->waiting_writers--;
246 } 245 }
247 246
248 if (pipe->inode) { 247 pipe_unlock(pipe);
249 mutex_unlock(&pipe->inode->i_mutex);
250 248
251 if (do_wakeup) { 249 if (do_wakeup) {
252 smp_mb(); 250 smp_mb();
253 if (waitqueue_active(&pipe->wait)) 251 if (waitqueue_active(&pipe->wait))
254 wake_up_interruptible(&pipe->wait); 252 wake_up_interruptible(&pipe->wait);
255 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 253 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
256 }
257 } 254 }
258 255
259 while (page_nr < spd_pages) 256 while (page_nr < spd_pages)
@@ -555,8 +552,8 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
555 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create 552 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
556 * a new page in the output file page cache and fill/dirty that. 553 * a new page in the output file page cache and fill/dirty that.
557 */ 554 */
558static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 555int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
559 struct splice_desc *sd) 556 struct splice_desc *sd)
560{ 557{
561 struct file *file = sd->u.file; 558 struct file *file = sd->u.file;
562 struct address_space *mapping = file->f_mapping; 559 struct address_space *mapping = file->f_mapping;
@@ -600,108 +597,177 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
600out: 597out:
601 return ret; 598 return ret;
602} 599}
600EXPORT_SYMBOL(pipe_to_file);
601
602static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
603{
604 smp_mb();
605 if (waitqueue_active(&pipe->wait))
606 wake_up_interruptible(&pipe->wait);
607 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
608}
603 609
604/** 610/**
605 * __splice_from_pipe - splice data from a pipe to given actor 611 * splice_from_pipe_feed - feed available data from a pipe to a file
606 * @pipe: pipe to splice from 612 * @pipe: pipe to splice from
607 * @sd: information to @actor 613 * @sd: information to @actor
608 * @actor: handler that splices the data 614 * @actor: handler that splices the data
609 * 615 *
610 * Description: 616 * Description:
611 * This function does little more than loop over the pipe and call 617 * This function loops over the pipe and calls @actor to do the
612 * @actor to do the actual moving of a single struct pipe_buffer to 618 * actual moving of a single struct pipe_buffer to the desired
613 the desired destination. See pipe_to_file, pipe_to_sendpage, or 619 destination. It returns when there are no more buffers left in
614 * pipe_to_user. 620 * the pipe or if the requested number of bytes (@sd->total_len)
621 * have been copied. It returns a positive number (one) if the
622 * pipe needs to be filled with more data, zero if the required
623 * number of bytes have been copied and -errno on error.
615 * 624 *
625 * This, together with splice_from_pipe_{begin,end,next}, may be
626 * used to implement the functionality of __splice_from_pipe() when
627 * locking is required around copying the pipe buffers to the
628 * destination.
616 */ 629 */
617ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, 630int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
618 splice_actor *actor) 631 splice_actor *actor)
619{ 632{
620 int ret, do_wakeup, err; 633 int ret;
621
622 ret = 0;
623 do_wakeup = 0;
624
625 for (;;) {
626 if (pipe->nrbufs) {
627 struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
628 const struct pipe_buf_operations *ops = buf->ops;
629 634
630 sd->len = buf->len; 635 while (pipe->nrbufs) {
631 if (sd->len > sd->total_len) 636 struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
632 sd->len = sd->total_len; 637 const struct pipe_buf_operations *ops = buf->ops;
633 638
634 err = actor(pipe, buf, sd); 639 sd->len = buf->len;
635 if (err <= 0) { 640 if (sd->len > sd->total_len)
636 if (!ret && err != -ENODATA) 641 sd->len = sd->total_len;
637 ret = err;
638 642
639 break; 643 ret = actor(pipe, buf, sd);
640 } 644 if (ret <= 0) {
645 if (ret == -ENODATA)
646 ret = 0;
647 return ret;
648 }
649 buf->offset += ret;
650 buf->len -= ret;
641 651
642 ret += err; 652 sd->num_spliced += ret;
643 buf->offset += err; 653 sd->len -= ret;
644 buf->len -= err; 654 sd->pos += ret;
655 sd->total_len -= ret;
645 656
646 sd->len -= err; 657 if (!buf->len) {
647 sd->pos += err; 658 buf->ops = NULL;
648 sd->total_len -= err; 659 ops->release(pipe, buf);
649 if (sd->len) 660 pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
650 continue; 661 pipe->nrbufs--;
662 if (pipe->inode)
663 sd->need_wakeup = true;
664 }
651 665
652 if (!buf->len) { 666 if (!sd->total_len)
653 buf->ops = NULL; 667 return 0;
654 ops->release(pipe, buf); 668 }
655 pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
656 pipe->nrbufs--;
657 if (pipe->inode)
658 do_wakeup = 1;
659 }
660 669
661 if (!sd->total_len) 670 return 1;
662 break; 671}
663 } 672EXPORT_SYMBOL(splice_from_pipe_feed);
664 673
665 if (pipe->nrbufs) 674/**
666 continue; 675 * splice_from_pipe_next - wait for some data to splice from
676 * @pipe: pipe to splice from
677 * @sd: information about the splice operation
678 *
679 * Description:
680 * This function will wait for some data and return a positive
681 * value (one) if pipe buffers are available. It will return zero
682 * or -errno if no more data needs to be spliced.
683 */
684int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
685{
686 while (!pipe->nrbufs) {
667 if (!pipe->writers) 687 if (!pipe->writers)
668 break; 688 return 0;
669 if (!pipe->waiting_writers) {
670 if (ret)
671 break;
672 }
673 689
674 if (sd->flags & SPLICE_F_NONBLOCK) { 690 if (!pipe->waiting_writers && sd->num_spliced)
675 if (!ret) 691 return 0;
676 ret = -EAGAIN;
677 break;
678 }
679 692
680 if (signal_pending(current)) { 693 if (sd->flags & SPLICE_F_NONBLOCK)
681 if (!ret) 694 return -EAGAIN;
682 ret = -ERESTARTSYS;
683 break;
684 }
685 695
686 if (do_wakeup) { 696 if (signal_pending(current))
687 smp_mb(); 697 return -ERESTARTSYS;
688 if (waitqueue_active(&pipe->wait)) 698
689 wake_up_interruptible_sync(&pipe->wait); 699 if (sd->need_wakeup) {
690 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 700 wakeup_pipe_writers(pipe);
691 do_wakeup = 0; 701 sd->need_wakeup = false;
692 } 702 }
693 703
694 pipe_wait(pipe); 704 pipe_wait(pipe);
695 } 705 }
696 706
697 if (do_wakeup) { 707 return 1;
698 smp_mb(); 708}
699 if (waitqueue_active(&pipe->wait)) 709EXPORT_SYMBOL(splice_from_pipe_next);
700 wake_up_interruptible(&pipe->wait);
701 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
702 }
703 710
704 return ret; 711/**
712 * splice_from_pipe_begin - start splicing from pipe
713 * @sd: information about the splice operation
714 *
715 * Description:
716 * This function should be called before a loop containing
717 * splice_from_pipe_next() and splice_from_pipe_feed() to
718 * initialize the necessary fields of @sd.
719 */
720void splice_from_pipe_begin(struct splice_desc *sd)
721{
722 sd->num_spliced = 0;
723 sd->need_wakeup = false;
724}
725EXPORT_SYMBOL(splice_from_pipe_begin);
726
727/**
728 * splice_from_pipe_end - finish splicing from pipe
729 * @pipe: pipe to splice from
730 * @sd: information about the splice operation
731 *
732 * Description:
733 * This function will wake up pipe writers if necessary. It should
734 * be called after a loop containing splice_from_pipe_next() and
735 * splice_from_pipe_feed().
736 */
737void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
738{
739 if (sd->need_wakeup)
740 wakeup_pipe_writers(pipe);
741}
742EXPORT_SYMBOL(splice_from_pipe_end);
743
744/**
745 * __splice_from_pipe - splice data from a pipe to given actor
746 * @pipe: pipe to splice from
747 * @sd: information to @actor
748 * @actor: handler that splices the data
749 *
750 * Description:
751 * This function does little more than loop over the pipe and call
752 * @actor to do the actual moving of a single struct pipe_buffer to
753 * the desired destination. See pipe_to_file, pipe_to_sendpage, or
754 * pipe_to_user.
755 *
756 */
757ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
758 splice_actor *actor)
759{
760 int ret;
761
762 splice_from_pipe_begin(sd);
763 do {
764 ret = splice_from_pipe_next(pipe, sd);
765 if (ret > 0)
766 ret = splice_from_pipe_feed(pipe, sd, actor);
767 } while (ret > 0);
768 splice_from_pipe_end(pipe, sd);
769
770 return sd->num_spliced ? sd->num_spliced : ret;
705} 771}
706EXPORT_SYMBOL(__splice_from_pipe); 772EXPORT_SYMBOL(__splice_from_pipe);
707 773
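
__splice_from_pipe() keeps its contract that the caller holds the pipe lock; with pipe_lock()/pipe_unlock() exported, callers now take it directly instead of open-coding the i_mutex dance. A hypothetical caller with a deliberately trivial actor (count_actor consumes bytes without copying; real actors would call buf->ops->confirm() before touching the data):

static int count_actor(struct pipe_inode_info *pipe,
		       struct pipe_buffer *buf, struct splice_desc *sd)
{
	return sd->len;		/* pretend every byte was consumed */
}

static ssize_t example_drain(struct pipe_inode_info *pipe, size_t len,
			     unsigned int flags)
{
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
	};
	ssize_t ret;

	pipe_lock(pipe);	/* the caller, not __splice_from_pipe(), locks */
	ret = __splice_from_pipe(pipe, &sd, count_actor);
	pipe_unlock(pipe);
	return ret;
}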
@@ -715,7 +781,7 @@ EXPORT_SYMBOL(__splice_from_pipe);
  * @actor:	handler that splices the data
  *
  * Description:
- *    See __splice_from_pipe. This function locks the input and output inodes,
+ *    See __splice_from_pipe. This function locks the pipe inode,
  *    otherwise it's identical to __splice_from_pipe().
  *
  */
@@ -724,7 +790,6 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 		 splice_actor *actor)
 {
 	ssize_t ret;
-	struct inode *inode = out->f_mapping->host;
 	struct splice_desc sd = {
 		.total_len = len,
 		.flags = flags,
@@ -732,30 +797,15 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 		.u.file = out,
 	};
 
-	/*
-	 * The actor worker might be calling ->write_begin and
-	 * ->write_end. Most of the time, these expect i_mutex to
-	 * be held. Since this may result in an ABBA deadlock with
-	 * pipe->inode, we have to order lock acquiry here.
-	 *
-	 * Outer lock must be inode->i_mutex, as pipe_wait() will
-	 * release and reacquire pipe->inode->i_mutex, AND inode must
-	 * never be a pipe.
-	 */
-	WARN_ON(S_ISFIFO(inode->i_mode));
-	mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
-	if (pipe->inode)
-		mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);
+	pipe_lock(pipe);
 	ret = __splice_from_pipe(pipe, &sd, actor);
-	if (pipe->inode)
-		mutex_unlock(&pipe->inode->i_mutex);
-	mutex_unlock(&inode->i_mutex);
+	pipe_unlock(pipe);
 
 	return ret;
 }
 
 /**
- * generic_file_splice_write_nolock - generic_file_splice_write without mutexes
+ * generic_file_splice_write - splice data from a pipe to a file
  * @pipe:	pipe info
  * @out:	file to write to
  * @ppos:	position in @out
@@ -764,13 +814,12 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
  *
  * Description:
  *    Will either move or copy pages (determined by @flags options) from
- *    the given pipe inode to the given file. The caller is responsible
- *    for acquiring i_mutex on both inodes.
+ *    the given pipe inode to the given file.
  *
  */
 ssize_t
-generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
-				 loff_t *ppos, size_t len, unsigned int flags)
+generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
+			  loff_t *ppos, size_t len, unsigned int flags)
 {
 	struct address_space *mapping = out->f_mapping;
 	struct inode *inode = mapping->host;
@@ -781,76 +830,28 @@ generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
 		.u.file = out,
 	};
 	ssize_t ret;
-	int err;
-
-	err = file_remove_suid(out);
-	if (unlikely(err))
-		return err;
-
-	ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
-	if (ret > 0) {
-		unsigned long nr_pages;
 
-		*ppos += ret;
-		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-
-		/*
-		 * If file or inode is SYNC and we actually wrote some data,
-		 * sync it.
-		 */
-		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
-			err = generic_osync_inode(inode, mapping,
-						  OSYNC_METADATA|OSYNC_DATA);
-
-			if (err)
-				ret = err;
-		}
-		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
-	}
+	pipe_lock(pipe);
 
-	return ret;
-}
+	splice_from_pipe_begin(&sd);
+	do {
+		ret = splice_from_pipe_next(pipe, &sd);
+		if (ret <= 0)
+			break;
 
-EXPORT_SYMBOL(generic_file_splice_write_nolock);
+		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+		ret = file_remove_suid(out);
+		if (!ret)
+			ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
+		mutex_unlock(&inode->i_mutex);
+	} while (ret > 0);
+	splice_from_pipe_end(pipe, &sd);
 
-/**
- * generic_file_splice_write - splice data from a pipe to a file
- * @pipe:	pipe info
- * @out:	file to write to
- * @ppos:	position in @out
- * @len:	number of bytes to splice
- * @flags:	splice modifier flags
- *
- * Description:
- *    Will either move or copy pages (determined by @flags options) from
- *    the given pipe inode to the given file.
- *
- */
-ssize_t
-generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
-			  loff_t *ppos, size_t len, unsigned int flags)
-{
-	struct address_space *mapping = out->f_mapping;
-	struct inode *inode = mapping->host;
-	struct splice_desc sd = {
-		.total_len = len,
-		.flags = flags,
-		.pos = *ppos,
-		.u.file = out,
-	};
-	ssize_t ret;
+	pipe_unlock(pipe);
 
-	WARN_ON(S_ISFIFO(inode->i_mode));
-	mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
-	ret = file_remove_suid(out);
-	if (likely(!ret)) {
-		if (pipe->inode)
-			mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);
-		ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
-		if (pipe->inode)
-			mutex_unlock(&pipe->inode->i_mutex);
-	}
-	mutex_unlock(&inode->i_mutex);
+	if (sd.num_spliced)
+		ret = sd.num_spliced;
+
 	if (ret > 0) {
 		unsigned long nr_pages;
 
@@ -1339,8 +1340,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
 	if (!pipe)
 		return -EBADF;
 
-	if (pipe->inode)
-		mutex_lock(&pipe->inode->i_mutex);
+	pipe_lock(pipe);
 
 	error = ret = 0;
 	while (nr_segs) {
@@ -1395,8 +1395,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
 		iov++;
 	}
 
-	if (pipe->inode)
-		mutex_unlock(&pipe->inode->i_mutex);
+	pipe_unlock(pipe);
 
 	if (!ret)
 		ret = error;
@@ -1524,7 +1523,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 		return 0;
 
 	ret = 0;
-	mutex_lock(&pipe->inode->i_mutex);
+	pipe_lock(pipe);
 
 	while (!pipe->nrbufs) {
 		if (signal_pending(current)) {
@@ -1542,7 +1541,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 		pipe_wait(pipe);
 	}
 
-	mutex_unlock(&pipe->inode->i_mutex);
+	pipe_unlock(pipe);
 	return ret;
 }
 
@@ -1562,7 +1561,7 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 		return 0;
 
 	ret = 0;
-	mutex_lock(&pipe->inode->i_mutex);
+	pipe_lock(pipe);
 
 	while (pipe->nrbufs >= PIPE_BUFFERS) {
 		if (!pipe->readers) {
@@ -1583,7 +1582,7 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 		pipe->waiting_writers--;
 	}
 
-	mutex_unlock(&pipe->inode->i_mutex);
+	pipe_unlock(pipe);
 	return ret;
 }
 
@@ -1599,10 +1598,10 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 
 	/*
 	 * Potential ABBA deadlock, work around it by ordering lock
-	 * grabbing by inode address. Otherwise two different processes
+	 * grabbing by pipe info address. Otherwise two different processes
 	 * could deadlock (one doing tee from A -> B, the other from B -> A).
 	 */
-	inode_double_lock(ipipe->inode, opipe->inode);
+	pipe_double_lock(ipipe, opipe);
 
 	do {
 		if (!opipe->readers) {
@@ -1653,7 +1652,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 	if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
 		ret = -EAGAIN;
 
-	inode_double_unlock(ipipe->inode, opipe->inode);
+	pipe_unlock(ipipe);
+	pipe_unlock(opipe);
 
 	/*
 	 * If we put data in the output pipe, wakeup any potential readers.
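The comment in link_pipe() explains the ABBA workaround: always take the two pipe mutexes in a fixed (address) order. A sketch of the shape pipe_double_lock() must therefore have (assumed simplification; the real helper lives in fs/pipe.c and also passes lockdep subclasses):

	/* Sketch only: lock two pipes in address order so that tee(A, B)
	 * and tee(B, A) callers agree on lock ordering. */
	static inline void pipe_double_lock_sketch(struct pipe_inode_info *pipe1,
						   struct pipe_inode_info *pipe2)
	{
		if (pipe1 < pipe2) {
			pipe_lock(pipe1);
			pipe_lock(pipe2);
		} else {
			pipe_lock(pipe2);
			pipe_lock(pipe1);
		}
	}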
diff --git a/fs/stat.c b/fs/stat.c
index 2db740a0cfb5..075694e31d8b 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -55,59 +55,54 @@ int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 
 EXPORT_SYMBOL(vfs_getattr);
 
-int vfs_stat_fd(int dfd, char __user *name, struct kstat *stat)
+int vfs_fstat(unsigned int fd, struct kstat *stat)
 {
-	struct path path;
-	int error;
+	struct file *f = fget(fd);
+	int error = -EBADF;
 
-	error = user_path_at(dfd, name, LOOKUP_FOLLOW, &path);
-	if (!error) {
-		error = vfs_getattr(path.mnt, path.dentry, stat);
-		path_put(&path);
+	if (f) {
+		error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat);
+		fput(f);
 	}
 	return error;
 }
+EXPORT_SYMBOL(vfs_fstat);
 
-int vfs_stat(char __user *name, struct kstat *stat)
+int vfs_fstatat(int dfd, char __user *filename, struct kstat *stat, int flag)
 {
-	return vfs_stat_fd(AT_FDCWD, name, stat);
-}
+	struct path path;
+	int error = -EINVAL;
+	int lookup_flags = 0;
 
-EXPORT_SYMBOL(vfs_stat);
+	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
+		goto out;
 
-int vfs_lstat_fd(int dfd, char __user *name, struct kstat *stat)
-{
-	struct path path;
-	int error;
+	if (!(flag & AT_SYMLINK_NOFOLLOW))
+		lookup_flags |= LOOKUP_FOLLOW;
 
-	error = user_path_at(dfd, name, 0, &path);
-	if (!error) {
-		error = vfs_getattr(path.mnt, path.dentry, stat);
-		path_put(&path);
-	}
+	error = user_path_at(dfd, filename, lookup_flags, &path);
+	if (error)
+		goto out;
+
+	error = vfs_getattr(path.mnt, path.dentry, stat);
+	path_put(&path);
+out:
 	return error;
 }
+EXPORT_SYMBOL(vfs_fstatat);
 
-int vfs_lstat(char __user *name, struct kstat *stat)
+int vfs_stat(char __user *name, struct kstat *stat)
 {
-	return vfs_lstat_fd(AT_FDCWD, name, stat);
+	return vfs_fstatat(AT_FDCWD, name, stat, 0);
 }
+EXPORT_SYMBOL(vfs_stat);
 
-EXPORT_SYMBOL(vfs_lstat);
-
-int vfs_fstat(unsigned int fd, struct kstat *stat)
+int vfs_lstat(char __user *name, struct kstat *stat)
 {
-	struct file *f = fget(fd);
-	int error = -EBADF;
-
-	if (f) {
-		error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat);
-		fput(f);
-	}
-	return error;
+	return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW);
 }
+EXPORT_SYMBOL(vfs_lstat);
 
-EXPORT_SYMBOL(vfs_fstat);
 
 #ifdef __ARCH_WANT_OLD_STAT
 
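With vfs_stat() and vfs_lstat() now both funnelled through vfs_fstatat(), the kernel-side split mirrors the userspace fstatat(2) interface, where the only flag this helper accepts is AT_SYMLINK_NOFOLLOW. For reference, the same distinction seen from userspace (standard POSIX calls; "somelink" is just a placeholder path):

	#include <fcntl.h>
	#include <sys/stat.h>
	#include <stdio.h>

	int main(void)
	{
		struct stat st;

		/* follows symlinks, like stat(2) */
		if (fstatat(AT_FDCWD, "somelink", &st, 0) == 0)
			printf("target: %lld bytes\n", (long long)st.st_size);

		/* does not follow, like lstat(2) */
		if (fstatat(AT_FDCWD, "somelink", &st, AT_SYMLINK_NOFOLLOW) == 0)
			printf("link:   %lld bytes\n", (long long)st.st_size);
		return 0;
	}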
@@ -155,23 +150,25 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * statbuf)
 SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
 {
 	struct kstat stat;
-	int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
+	int error;
 
-	if (!error)
-		error = cp_old_stat(&stat, statbuf);
+	error = vfs_stat(filename, &stat);
+	if (error)
+		return error;
 
-	return error;
+	return cp_old_stat(&stat, statbuf);
 }
 
 SYSCALL_DEFINE2(lstat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
 {
 	struct kstat stat;
-	int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
+	int error;
 
-	if (!error)
-		error = cp_old_stat(&stat, statbuf);
+	error = vfs_lstat(filename, &stat);
+	if (error)
+		return error;
 
-	return error;
+	return cp_old_stat(&stat, statbuf);
 }
 
 SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf)
@@ -240,23 +237,23 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
 SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf)
 {
 	struct kstat stat;
-	int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
-
-	if (!error)
-		error = cp_new_stat(&stat, statbuf);
+	int error = vfs_stat(filename, &stat);
 
-	return error;
+	if (error)
+		return error;
+	return cp_new_stat(&stat, statbuf);
 }
 
 SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf)
 {
 	struct kstat stat;
-	int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
+	int error;
 
-	if (!error)
-		error = cp_new_stat(&stat, statbuf);
+	error = vfs_lstat(filename, &stat);
+	if (error)
+		return error;
 
-	return error;
+	return cp_new_stat(&stat, statbuf);
 }
 
 #if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT)
@@ -264,21 +261,12 @@ SYSCALL_DEFINE4(newfstatat, int, dfd, char __user *, filename,
 		struct stat __user *, statbuf, int, flag)
 {
 	struct kstat stat;
-	int error = -EINVAL;
-
-	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
-		goto out;
-
-	if (flag & AT_SYMLINK_NOFOLLOW)
-		error = vfs_lstat_fd(dfd, filename, &stat);
-	else
-		error = vfs_stat_fd(dfd, filename, &stat);
-
-	if (!error)
-		error = cp_new_stat(&stat, statbuf);
+	int error;
 
-out:
-	return error;
+	error = vfs_fstatat(dfd, filename, &stat, flag);
+	if (error)
+		return error;
+	return cp_new_stat(&stat, statbuf);
 }
 #endif
 
@@ -404,21 +392,12 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename,
 		struct stat64 __user *, statbuf, int, flag)
 {
 	struct kstat stat;
-	int error = -EINVAL;
-
-	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
-		goto out;
-
-	if (flag & AT_SYMLINK_NOFOLLOW)
-		error = vfs_lstat_fd(dfd, filename, &stat);
-	else
-		error = vfs_stat_fd(dfd, filename, &stat);
-
-	if (!error)
-		error = cp_new_stat64(&stat, statbuf);
+	int error;
 
-out:
-	return error;
+	error = vfs_fstatat(dfd, filename, &stat, flag);
+	if (error)
+		return error;
+	return cp_new_stat64(&stat, statbuf);
 }
 #endif /* __ARCH_WANT_STAT64 */
 
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 93e0c0281d45..9345806c8853 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -157,14 +157,9 @@ static ssize_t write(struct file *file, const char __user *userbuf,
 		count = size - offs;
 	}
 
-	temp = kmalloc(count, GFP_KERNEL);
-	if (!temp)
-		return -ENOMEM;
-
-	if (copy_from_user(temp, userbuf, count)) {
-		count = -EFAULT;
-		goto out_free;
-	}
+	temp = memdup_user(userbuf, count);
+	if (IS_ERR(temp))
+		return PTR_ERR(temp);
 
 	mutex_lock(&bb->mutex);
 
@@ -176,8 +171,6 @@ static ssize_t write(struct file *file, const char __user *userbuf,
 	if (count > 0)
 		*off = offs + count;
 
-out_free:
-	kfree(temp);
 	return count;
 }
 
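This hunk, and the fs/xattr.c and XFS ioctl hunks further down, all collapse the same kmalloc()-plus-copy_from_user() boilerplate into memdup_user(), which reports failure as an ERR_PTR-encoded errno rather than NULL. Its behaviour is roughly the following (simplified sketch; the real helper lives in mm/util.c and uses kmalloc_track_caller()):

	/* Sketch of memdup_user(): duplicate a userspace buffer into
	 * kernel memory, returning ERR_PTR(-errno) on failure. */
	void *memdup_user_sketch(const void __user *src, size_t len)
	{
		void *p = kmalloc(len, GFP_KERNEL);

		if (!p)
			return ERR_PTR(-ENOMEM);
		if (copy_from_user(p, src, len)) {
			kfree(p);
			return ERR_PTR(-EFAULT);
		}
		return p;
	}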
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 289c43a47263..b1606e07b7a3 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -446,11 +446,11 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
 	if (buffer->event != atomic_read(&od->event))
 		goto trigger;
 
-	return 0;
+	return DEFAULT_POLLMASK;
 
  trigger:
 	buffer->needs_read_fill = 1;
-	return POLLERR|POLLPRI;
+	return DEFAULT_POLLMASK|POLLERR|POLLPRI;
 }
 
 void sysfs_notify_dirent(struct sysfs_dirent *sd)
@@ -667,6 +667,7 @@ struct sysfs_schedule_callback_struct {
 	struct work_struct	work;
 };
 
+static struct workqueue_struct *sysfs_workqueue;
 static DEFINE_MUTEX(sysfs_workq_mutex);
 static LIST_HEAD(sysfs_workq);
 static void sysfs_schedule_callback_work(struct work_struct *work)
@@ -715,11 +716,20 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
 	mutex_lock(&sysfs_workq_mutex);
 	list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list)
 		if (ss->kobj == kobj) {
+			module_put(owner);
 			mutex_unlock(&sysfs_workq_mutex);
 			return -EAGAIN;
 		}
 	mutex_unlock(&sysfs_workq_mutex);
 
+	if (sysfs_workqueue == NULL) {
+		sysfs_workqueue = create_workqueue("sysfsd");
+		if (sysfs_workqueue == NULL) {
+			module_put(owner);
+			return -ENOMEM;
+		}
+	}
+
 	ss = kmalloc(sizeof(*ss), GFP_KERNEL);
 	if (!ss) {
 		module_put(owner);
@@ -735,7 +745,7 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
 	mutex_lock(&sysfs_workq_mutex);
 	list_add_tail(&ss->workq_list, &sysfs_workq);
 	mutex_unlock(&sysfs_workq_mutex);
-	schedule_work(&ss->work);
+	queue_work(sysfs_workqueue, &ss->work);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(sysfs_schedule_callback);
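The callback path now runs on a dedicated "sysfsd" workqueue, created lazily on first use, instead of the shared keventd queue, and every new early-exit path must drop the module reference taken earlier. The create-on-demand shape, in isolation (assumed simplification; the hunk above shows no extra locking around the creation either):

	/* Lazy singleton workqueue, as in the hunk above. */
	static struct workqueue_struct *lazy_wq;

	static int lazy_queue_work(struct work_struct *work)
	{
		if (!lazy_wq) {
			lazy_wq = create_workqueue("sysfsd");
			if (!lazy_wq)
				return -ENOMEM;
		}
		queue_work(lazy_wq, work);
		return 0;
	}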
diff --git a/fs/xattr.c b/fs/xattr.c
index 197c4fcac032..d51b8f9db921 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -237,13 +237,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
 	if (size) {
 		if (size > XATTR_SIZE_MAX)
 			return -E2BIG;
-		kvalue = kmalloc(size, GFP_KERNEL);
-		if (!kvalue)
-			return -ENOMEM;
-		if (copy_from_user(kvalue, value, size)) {
-			kfree(kvalue);
-			return -EFAULT;
-		}
+		kvalue = memdup_user(value, size);
+		if (IS_ERR(kvalue))
+			return PTR_ERR(kvalue);
 	}
 
 	error = vfs_setxattr(d, kname, kvalue, size, flags);
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c13f67300fe7..7ec89fc05b2b 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -153,23 +153,6 @@ xfs_find_bdev_for_inode(
 }
 
 /*
- * Schedule IO completion handling on a xfsdatad if this was
- * the final hold on this ioend. If we are asked to wait,
- * flush the workqueue.
- */
-STATIC void
-xfs_finish_ioend(
-	xfs_ioend_t	*ioend,
-	int		wait)
-{
-	if (atomic_dec_and_test(&ioend->io_remaining)) {
-		queue_work(xfsdatad_workqueue, &ioend->io_work);
-		if (wait)
-			flush_workqueue(xfsdatad_workqueue);
-	}
-}
-
-/*
  * We're now finished for good with this ioend structure.
  * Update the page state via the associated buffer_heads,
  * release holds on the inode and bio, and finally free
@@ -310,6 +293,27 @@ xfs_end_bio_read(
 }
 
 /*
+ * Schedule IO completion handling on a xfsdatad if this was
+ * the final hold on this ioend. If we are asked to wait,
+ * flush the workqueue.
+ */
+STATIC void
+xfs_finish_ioend(
+	xfs_ioend_t	*ioend,
+	int		wait)
+{
+	if (atomic_dec_and_test(&ioend->io_remaining)) {
+		struct workqueue_struct *wq = xfsdatad_workqueue;
+		if (ioend->io_work.func == xfs_end_bio_unwritten)
+			wq = xfsconvertd_workqueue;
+
+		queue_work(wq, &ioend->io_work);
+		if (wait)
+			flush_workqueue(wq);
+	}
+}
+
+/*
  * Allocate and initialise an IO completion structure.
  * We need to track unwritten extent write completion here initially.
  * We'll need to extend this for updating the ondisk inode size later
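xfs_finish_ioend() is moved below xfs_end_bio_read() so it can reference xfs_end_bio_unwritten, and unwritten-extent conversions are now routed to the new xfsconvertd workqueue instead of xfsdatad. Reduced to its core, the dispatch introduced above is (sketch of the same logic, not a separate helper in the patch):

	/* Route an ioend by its work function: conversion work goes to
	 * xfsconvertd, ordinary data completion stays on xfsdatad. */
	static struct workqueue_struct *ioend_queue(xfs_ioend_t *ioend)
	{
		if (ioend->io_work.func == xfs_end_bio_unwritten)
			return xfsconvertd_workqueue;
		return xfsdatad_workqueue;
	}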
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 1dd528849755..221b3e66ceef 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -19,6 +19,7 @@
 #define __XFS_AOPS_H__
 
 extern struct workqueue_struct *xfsdatad_workqueue;
+extern struct workqueue_struct *xfsconvertd_workqueue;
 extern mempool_t *xfs_ioend_pool;
 
 /*
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index aa1016bb9134..e28800a9f2b5 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -51,6 +51,7 @@ static struct shrinker xfs_buf_shake = {
 
 static struct workqueue_struct *xfslogd_workqueue;
 struct workqueue_struct *xfsdatad_workqueue;
+struct workqueue_struct *xfsconvertd_workqueue;
 
 #ifdef XFS_BUF_TRACE
 void
@@ -1775,6 +1776,7 @@ xfs_flush_buftarg(
 	xfs_buf_t	*bp, *n;
 	int		pincount = 0;
 
+	xfs_buf_runall_queues(xfsconvertd_workqueue);
 	xfs_buf_runall_queues(xfsdatad_workqueue);
 	xfs_buf_runall_queues(xfslogd_workqueue);
 
@@ -1831,9 +1833,15 @@ xfs_buf_init(void)
 	if (!xfsdatad_workqueue)
 		goto out_destroy_xfslogd_workqueue;
 
+	xfsconvertd_workqueue = create_workqueue("xfsconvertd");
+	if (!xfsconvertd_workqueue)
+		goto out_destroy_xfsdatad_workqueue;
+
 	register_shrinker(&xfs_buf_shake);
 	return 0;
 
+ out_destroy_xfsdatad_workqueue:
+	destroy_workqueue(xfsdatad_workqueue);
  out_destroy_xfslogd_workqueue:
 	destroy_workqueue(xfslogd_workqueue);
  out_free_buf_zone:
@@ -1849,6 +1857,7 @@ void
 xfs_buf_terminate(void)
 {
 	unregister_shrinker(&xfs_buf_shake);
+	destroy_workqueue(xfsconvertd_workqueue);
 	destroy_workqueue(xfsdatad_workqueue);
 	destroy_workqueue(xfslogd_workqueue);
 	kmem_zone_destroy(xfs_buf_zone);
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 5aeb77776961..08be36d7326c 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -74,14 +74,14 @@ xfs_flush_pages(
 
 	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
 		xfs_iflags_clear(ip, XFS_ITRUNCATED);
-		ret = filemap_fdatawrite(mapping);
-		if (flags & XFS_B_ASYNC)
-			return -ret;
-		ret2 = filemap_fdatawait(mapping);
-		if (!ret)
-			ret = ret2;
+		ret = -filemap_fdatawrite(mapping);
 	}
-	return -ret;
+	if (flags & XFS_B_ASYNC)
+		return ret;
+	ret2 = xfs_wait_on_pages(ip, first, last);
+	if (!ret)
+		ret = ret2;
+	return ret;
 }
 
 int
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index d0b499418a7d..34eaab608e6e 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -489,17 +489,12 @@ xfs_attrmulti_attr_set(
 	if (len > XATTR_SIZE_MAX)
 		return EINVAL;
 
-	kbuf = kmalloc(len, GFP_KERNEL);
-	if (!kbuf)
-		return ENOMEM;
-
-	if (copy_from_user(kbuf, ubuf, len))
-		goto out_kfree;
+	kbuf = memdup_user(ubuf, len);
+	if (IS_ERR(kbuf))
+		return PTR_ERR(kbuf);
 
 	error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
 
- out_kfree:
-	kfree(kbuf);
 	return error;
 }
 
@@ -540,20 +535,16 @@ xfs_attrmulti_by_handle(
 	if (!size || size > 16 * PAGE_SIZE)
 		goto out_dput;
 
-	error = ENOMEM;
-	ops = kmalloc(size, GFP_KERNEL);
-	if (!ops)
-		goto out_dput;
-
-	error = EFAULT;
-	if (copy_from_user(ops, am_hreq.ops, size))
-		goto out_kfree_ops;
+	ops = memdup_user(am_hreq.ops, size);
+	if (IS_ERR(ops)) {
+		error = PTR_ERR(ops);
+		goto out_dput;
+	}
 
 	attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
 	if (!attr_name)
 		goto out_kfree_ops;
 
-
 	error = 0;
 	for (i = 0; i < am_hreq.opcount; i++) {
 		ops[i].am_error = strncpy_from_user(attr_name,
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index c70c4e3db790..0882d166239a 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -427,20 +427,16 @@ xfs_compat_attrmulti_by_handle(
 	if (!size || size > 16 * PAGE_SIZE)
 		goto out_dput;
 
-	error = ENOMEM;
-	ops = kmalloc(size, GFP_KERNEL);
-	if (!ops)
-		goto out_dput;
-
-	error = EFAULT;
-	if (copy_from_user(ops, compat_ptr(am_hreq.ops), size))
-		goto out_kfree_ops;
+	ops = memdup_user(compat_ptr(am_hreq.ops), size);
+	if (IS_ERR(ops)) {
+		error = PTR_ERR(ops);
+		goto out_dput;
+	}
 
 	attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
 	if (!attr_name)
 		goto out_kfree_ops;
 
-
 	error = 0;
 	for (i = 0; i < am_hreq.opcount; i++) {
 		ops[i].am_error = strncpy_from_user(attr_name,
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 7e90daa0d1d1..9142192ccbe6 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -751,10 +751,26 @@ start:
 			goto relock;
 		}
 	} else {
+		int enospc = 0;
+		ssize_t ret2 = 0;
+
+write_retry:
 		xfs_rw_enter_trace(XFS_WRITE_ENTER, xip, (void *)iovp, segs,
 				*offset, ioflags);
-		ret = generic_file_buffered_write(iocb, iovp, segs,
+		ret2 = generic_file_buffered_write(iocb, iovp, segs,
 				pos, offset, count, ret);
+		/*
+		 * if we just got an ENOSPC, flush the inode now we
+		 * aren't holding any page locks and retry *once*
+		 */
+		if (ret2 == -ENOSPC && !enospc) {
+			error = xfs_flush_pages(xip, 0, -1, 0, FI_NONE);
+			if (error)
+				goto out_unlock_internal;
+			enospc = 1;
+			goto write_retry;
+		}
+		ret = ret2;
 	}
 
 	current->backing_dev_info = NULL;
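The buffered-write path now retries exactly once on ENOSPC, flushing delalloc pages in between while no page locks are held. The generic shape of the pattern as a standalone sketch (buffered_write() and flush_delalloc() are hypothetical stand-ins for the XFS calls above):

	#include <errno.h>

	extern long buffered_write(void *ctx);	/* stand-in, returns -errno */
	extern int flush_delalloc(void *ctx);	/* stand-in, 0 or positive errno */

	long write_retry_once(void *ctx)
	{
		long ret;
		int enospc = 0;

	retry:
		ret = buffered_write(ctx);
		if (ret == -ENOSPC && !enospc) {
			int error = flush_delalloc(ctx);
			if (error)
				return -error;
			enospc = 1;	/* only ever retry once */
			goto retry;
		}
		return ret;
	}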
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index a608e72fa405..f7ba76633c29 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -62,12 +62,6 @@ xfs_sync_inodes_ag(
 	uint32_t	first_index = 0;
 	int		error = 0;
 	int		last_error = 0;
-	int		fflag = XFS_B_ASYNC;
-
-	if (flags & SYNC_DELWRI)
-		fflag = XFS_B_DELWRI;
-	if (flags & SYNC_WAIT)
-		fflag = 0;		/* synchronous overrides all */
 
 	do {
 		struct inode *inode;
@@ -128,11 +122,23 @@ xfs_sync_inodes_ag(
 		 * If we have to flush data or wait for I/O completion
 		 * we need to hold the iolock.
 		 */
-		if ((flags & SYNC_DELWRI) && VN_DIRTY(inode)) {
-			xfs_ilock(ip, XFS_IOLOCK_SHARED);
-			lock_flags |= XFS_IOLOCK_SHARED;
-			error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE);
-			if (flags & SYNC_IOWAIT)
+		if (flags & SYNC_DELWRI) {
+			if (VN_DIRTY(inode)) {
+				if (flags & SYNC_TRYLOCK) {
+					if (xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
+						lock_flags |= XFS_IOLOCK_SHARED;
+				} else {
+					xfs_ilock(ip, XFS_IOLOCK_SHARED);
+					lock_flags |= XFS_IOLOCK_SHARED;
+				}
+				if (lock_flags & XFS_IOLOCK_SHARED) {
+					error = xfs_flush_pages(ip, 0, -1,
+							(flags & SYNC_WAIT) ? 0
+							: XFS_B_ASYNC,
+							FI_NONE);
+				}
+			}
+			if (VN_CACHED(inode) && (flags & SYNC_IOWAIT))
 				xfs_ioend_wait(ip);
 		}
 		xfs_ilock(ip, XFS_ILOCK_SHARED);
@@ -398,15 +404,17 @@ STATIC void
 xfs_syncd_queue_work(
 	struct xfs_mount *mp,
 	void		*data,
-	void		(*syncer)(struct xfs_mount *, void *))
+	void		(*syncer)(struct xfs_mount *, void *),
+	struct completion *completion)
 {
-	struct bhv_vfs_sync_work *work;
+	struct xfs_sync_work *work;
 
-	work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
+	work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP);
 	INIT_LIST_HEAD(&work->w_list);
 	work->w_syncer = syncer;
 	work->w_data = data;
 	work->w_mount = mp;
+	work->w_completion = completion;
 	spin_lock(&mp->m_sync_lock);
 	list_add_tail(&work->w_list, &mp->m_sync_list);
 	spin_unlock(&mp->m_sync_lock);
@@ -420,49 +428,26 @@ xfs_syncd_queue_work(
  * heads, looking about for more room...
  */
 STATIC void
-xfs_flush_inode_work(
-	struct xfs_mount *mp,
-	void		*arg)
-{
-	struct inode	*inode = arg;
-	filemap_flush(inode->i_mapping);
-	iput(inode);
-}
-
-void
-xfs_flush_inode(
-	xfs_inode_t	*ip)
-{
-	struct inode	*inode = VFS_I(ip);
-
-	igrab(inode);
-	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
-	delay(msecs_to_jiffies(500));
-}
-
-/*
- * This is the "bigger hammer" version of xfs_flush_inode_work...
- * (IOW, "If at first you don't succeed, use a Bigger Hammer").
- */
-STATIC void
-xfs_flush_device_work(
+xfs_flush_inodes_work(
 	struct xfs_mount *mp,
 	void		*arg)
 {
 	struct inode	*inode = arg;
-	sync_blockdev(mp->m_super->s_bdev);
+	xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK);
+	xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK | SYNC_IOWAIT);
 	iput(inode);
 }
 
 void
-xfs_flush_device(
+xfs_flush_inodes(
 	xfs_inode_t	*ip)
 {
 	struct inode	*inode = VFS_I(ip);
+	DECLARE_COMPLETION_ONSTACK(completion);
 
 	igrab(inode);
-	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
-	delay(msecs_to_jiffies(500));
+	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
+	wait_for_completion(&completion);
 	xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
 }
 
@@ -497,7 +482,7 @@ xfssyncd(
 {
 	struct xfs_mount *mp = arg;
 	long		timeleft;
-	bhv_vfs_sync_work_t *work, *n;
+	xfs_sync_work_t	*work, *n;
 	LIST_HEAD	(tmp);
 
 	set_freezable();
@@ -532,6 +517,8 @@ xfssyncd(
 			list_del(&work->w_list);
 			if (work == &mp->m_sync_work)
 				continue;
+			if (work->w_completion)
+				complete(work->w_completion);
 			kmem_free(work);
 		}
 	}
@@ -545,6 +532,7 @@ xfs_syncd_init(
 {
 	mp->m_sync_work.w_syncer = xfs_sync_worker;
 	mp->m_sync_work.w_mount = mp;
+	mp->m_sync_work.w_completion = NULL;
 	mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
 	if (IS_ERR(mp->m_sync_task))
 		return -PTR_ERR(mp->m_sync_task);
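xfs_flush_inodes() replaces the old fixed delay(msecs_to_jiffies(500)) with a completion: the caller now blocks until xfssyncd has actually run the work item rather than sleeping an arbitrary half second. The handshake pattern in isolation (sketch using the standard kernel primitives shown above):

	#include <linux/completion.h>
	#include <linux/workqueue.h>

	struct sync_work {
		struct work_struct	work;
		struct completion	*done;
	};

	static void sync_worker(struct work_struct *work)
	{
		struct sync_work *sw = container_of(work, struct sync_work, work);

		/* ... perform the flush ... */
		if (sw->done)
			complete(sw->done);
	}

	static void queue_and_wait(struct workqueue_struct *wq, struct sync_work *sw)
	{
		DECLARE_COMPLETION_ONSTACK(done);

		sw->done = &done;
		INIT_WORK(&sw->work, sync_worker);
		queue_work(wq, &sw->work);
		wait_for_completion(&done);	/* no more arbitrary sleeps */
	}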
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 04f058c848ae..308d5bf6dfbd 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -21,18 +21,20 @@
 struct xfs_mount;
 struct xfs_perag;
 
-typedef struct bhv_vfs_sync_work {
+typedef struct xfs_sync_work {
 	struct list_head	w_list;
 	struct xfs_mount	*w_mount;
 	void			*w_data;	/* syncer routine argument */
 	void			(*w_syncer)(struct xfs_mount *, void *);
-} bhv_vfs_sync_work_t;
+	struct completion	*w_completion;
+} xfs_sync_work_t;
 
 #define SYNC_ATTR		0x0001	/* sync attributes */
 #define SYNC_DELWRI		0x0002	/* look at delayed writes */
 #define SYNC_WAIT		0x0004	/* wait for i/o to complete */
 #define SYNC_BDFLUSH		0x0008	/* BDFLUSH is calling -- don't block */
 #define SYNC_IOWAIT		0x0010	/* wait for all I/O to complete */
+#define SYNC_TRYLOCK		0x0020	/* only try to lock inodes */
 
 int xfs_syncd_init(struct xfs_mount *mp);
 void xfs_syncd_stop(struct xfs_mount *mp);
@@ -43,8 +45,7 @@ int xfs_sync_fsdata(struct xfs_mount *mp, int flags);
 int xfs_quiesce_data(struct xfs_mount *mp);
 void xfs_quiesce_attr(struct xfs_mount *mp);
 
-void xfs_flush_inode(struct xfs_inode *ip);
-void xfs_flush_device(struct xfs_inode *ip);
+void xfs_flush_inodes(struct xfs_inode *ip);
 
 int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
 int xfs_reclaim_inodes(struct xfs_mount *mp, int noblock, int mode);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 3a6ed426327a..ca7c6005a487 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5880,7 +5880,7 @@ xfs_getbmap(
 	void			*arg)		/* formatter arg */
 {
 	__int64_t		bmvend;		/* last block requested */
-	int			error;		/* return value */
+	int			error = 0;	/* return value */
 	__int64_t		fixlen;		/* length for -1 case */
 	int			i;		/* extent number */
 	int			lock;		/* lock state */
@@ -5890,39 +5890,18 @@ xfs_getbmap(
 	int			nexleft;	/* # of user extents left */
 	int			subnex;		/* # of bmapi's can do */
 	int			nmap;		/* number of map entries */
-	struct getbmapx		out;		/* output structure */
+	struct getbmapx		*out;		/* output structure */
 	int			whichfork;	/* data or attr fork */
 	int			prealloced;	/* this is a file with
 						 * preallocated data space */
 	int			iflags;		/* interface flags */
 	int			bmapi_flags;	/* flags for xfs_bmapi */
+	int			cur_ext = 0;
 
 	mp = ip->i_mount;
 	iflags = bmv->bmv_iflags;
-
 	whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
 
-	/*	If the BMV_IF_NO_DMAPI_READ interface bit specified, do not
-	 *	generate a DMAPI read event.  Otherwise, if the DM_EVENT_READ
-	 *	bit is set for the file, generate a read event in order
-	 *	that the DMAPI application may do its thing before we return
-	 *	the extents.  Usually this means restoring user file data to
-	 *	regions of the file that look like holes.
-	 *
-	 *	The "old behavior" (from XFS_IOC_GETBMAP) is to not specify
-	 *	BMV_IF_NO_DMAPI_READ so that read events are generated.
-	 *	If this were not true, callers of ioctl( XFS_IOC_GETBMAP )
-	 *	could misinterpret holes in a DMAPI file as true holes,
-	 *	when in fact they may represent offline user data.
-	 */
-	if ((iflags & BMV_IF_NO_DMAPI_READ) == 0 &&
-	    DM_EVENT_ENABLED(ip, DM_EVENT_READ) &&
-	    whichfork == XFS_DATA_FORK) {
-		error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL);
-		if (error)
-			return XFS_ERROR(error);
-	}
-
 	if (whichfork == XFS_ATTR_FORK) {
 		if (XFS_IFORK_Q(ip)) {
 			if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
@@ -5936,11 +5915,37 @@ xfs_getbmap(
 					   ip->i_mount);
 			return XFS_ERROR(EFSCORRUPTED);
 		}
-	} else if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
-		   ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
-		   ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
-		return XFS_ERROR(EINVAL);
-	if (whichfork == XFS_DATA_FORK) {
+
+		prealloced = 0;
+		fixlen = 1LL << 32;
+	} else {
+		/*
+		 * If the BMV_IF_NO_DMAPI_READ interface bit specified, do
+		 * not generate a DMAPI read event.  Otherwise, if the
+		 * DM_EVENT_READ bit is set for the file, generate a read
+		 * event in order that the DMAPI application may do its thing
+		 * before we return the extents.  Usually this means restoring
+		 * user file data to regions of the file that look like holes.
+		 *
+		 * The "old behavior" (from XFS_IOC_GETBMAP) is to not specify
+		 * BMV_IF_NO_DMAPI_READ so that read events are generated.
+		 * If this were not true, callers of ioctl(XFS_IOC_GETBMAP)
+		 * could misinterpret holes in a DMAPI file as true holes,
+		 * when in fact they may represent offline user data.
		 */
+		if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) &&
+		    !(iflags & BMV_IF_NO_DMAPI_READ)) {
+			error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip,
+					      0, 0, 0, NULL);
+			if (error)
+				return XFS_ERROR(error);
+		}
+
+		if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
+		    ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
+		    ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
+			return XFS_ERROR(EINVAL);
+
 		if (xfs_get_extsz_hint(ip) ||
 		    ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
 			prealloced = 1;
@@ -5949,42 +5954,41 @@ xfs_getbmap(
 			prealloced = 0;
 			fixlen = ip->i_size;
 		}
-	} else {
-		prealloced = 0;
-		fixlen = 1LL << 32;
 	}
 
 	if (bmv->bmv_length == -1) {
 		fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
-		bmv->bmv_length = MAX( (__int64_t)(fixlen - bmv->bmv_offset),
-					(__int64_t)0);
-	} else if (bmv->bmv_length < 0)
-		return XFS_ERROR(EINVAL);
-	if (bmv->bmv_length == 0) {
+		bmv->bmv_length =
+			max_t(__int64_t, fixlen - bmv->bmv_offset, 0);
+	} else if (bmv->bmv_length == 0) {
 		bmv->bmv_entries = 0;
 		return 0;
+	} else if (bmv->bmv_length < 0) {
+		return XFS_ERROR(EINVAL);
 	}
+
 	nex = bmv->bmv_count - 1;
 	if (nex <= 0)
 		return XFS_ERROR(EINVAL);
 	bmvend = bmv->bmv_offset + bmv->bmv_length;
 
-	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 
-	if (((iflags & BMV_IF_DELALLOC) == 0) &&
-	    (whichfork == XFS_DATA_FORK) &&
-	    (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size)) {
-		/* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */
-		error = xfs_flush_pages(ip, (xfs_off_t)0,
-					       -1, 0, FI_REMAPF);
-		if (error) {
-			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-			return error;
+	if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
+		return XFS_ERROR(ENOMEM);
+	out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL);
+	if (!out)
+		return XFS_ERROR(ENOMEM);
+
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
+	if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
+		if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) {
+			error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF);
+			if (error)
+				goto out_unlock_iolock;
 		}
-	}
 
-	ASSERT(whichfork == XFS_ATTR_FORK || (iflags & BMV_IF_DELALLOC) ||
-	       ip->i_delayed_blks == 0);
+		ASSERT(ip->i_delayed_blks == 0);
+	}
 
 	lock = xfs_ilock_map_shared(ip);
 
@@ -5995,23 +5999,25 @@ xfs_getbmap(
 	if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
 		nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
 
-	bmapi_flags = xfs_bmapi_aflag(whichfork) |
-			((iflags & BMV_IF_PREALLOC) ? 0 : XFS_BMAPI_IGSTATE);
+	bmapi_flags = xfs_bmapi_aflag(whichfork);
+	if (!(iflags & BMV_IF_PREALLOC))
+		bmapi_flags |= XFS_BMAPI_IGSTATE;
 
 	/*
 	 * Allocate enough space to handle "subnex" maps at a time.
 	 */
+	error = ENOMEM;
 	subnex = 16;
-	map = kmem_alloc(subnex * sizeof(*map), KM_SLEEP);
+	map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL);
+	if (!map)
+		goto out_unlock_ilock;
 
 	bmv->bmv_entries = 0;
 
-	if ((XFS_IFORK_NEXTENTS(ip, whichfork) == 0)) {
-		if (((iflags & BMV_IF_DELALLOC) == 0) ||
-		    whichfork == XFS_ATTR_FORK) {
-			error = 0;
-			goto unlock_and_return;
-		}
+	if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
+	    (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
+		error = 0;
+		goto out_free_map;
 	}
 
 	nexleft = nex;
@@ -6023,53 +6029,61 @@ xfs_getbmap(
 				  bmapi_flags, NULL, 0, map, &nmap,
 				  NULL, NULL);
 		if (error)
-			goto unlock_and_return;
+			goto out_free_map;
 		ASSERT(nmap <= subnex);
 
 		for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
-			out.bmv_oflags = 0;
+			out[cur_ext].bmv_oflags = 0;
 			if (map[i].br_state == XFS_EXT_UNWRITTEN)
-				out.bmv_oflags |= BMV_OF_PREALLOC;
+				out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
 			else if (map[i].br_startblock == DELAYSTARTBLOCK)
-				out.bmv_oflags |= BMV_OF_DELALLOC;
-			out.bmv_offset = XFS_FSB_TO_BB(mp, map[i].br_startoff);
-			out.bmv_length = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
-			out.bmv_unused1 = out.bmv_unused2 = 0;
+				out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
+			out[cur_ext].bmv_offset =
+				XFS_FSB_TO_BB(mp, map[i].br_startoff);
+			out[cur_ext].bmv_length =
+				XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+			out[cur_ext].bmv_unused1 = 0;
+			out[cur_ext].bmv_unused2 = 0;
 			ASSERT(((iflags & BMV_IF_DELALLOC) != 0) ||
 			      (map[i].br_startblock != DELAYSTARTBLOCK));
 			if (map[i].br_startblock == HOLESTARTBLOCK &&
 			    whichfork == XFS_ATTR_FORK) {
 				/* came to the end of attribute fork */
-				out.bmv_oflags |= BMV_OF_LAST;
-				goto unlock_and_return;
-			} else {
-				int full = 0;	/* user array is full */
-
-				if (!xfs_getbmapx_fix_eof_hole(ip, &out,
-							prealloced, bmvend,
-							map[i].br_startblock)) {
-					goto unlock_and_return;
-				}
-
-				/* format results & advance arg */
-				error = formatter(&arg, &out, &full);
-				if (error || full)
-					goto unlock_and_return;
-				nexleft--;
-				bmv->bmv_offset =
-					out.bmv_offset + out.bmv_length;
-				bmv->bmv_length = MAX((__int64_t)0,
-					(__int64_t)(bmvend - bmv->bmv_offset));
-				bmv->bmv_entries++;
+				out[cur_ext].bmv_oflags |= BMV_OF_LAST;
+				goto out_free_map;
 			}
+
+			if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext],
+					prealloced, bmvend,
+					map[i].br_startblock))
+				goto out_free_map;
+
+			nexleft--;
+			bmv->bmv_offset =
+				out[cur_ext].bmv_offset +
+				out[cur_ext].bmv_length;
+			bmv->bmv_length =
+				max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
+			bmv->bmv_entries++;
+			cur_ext++;
 		}
 	} while (nmap && nexleft && bmv->bmv_length);
 
-unlock_and_return:
+ out_free_map:
+	kmem_free(map);
+ out_unlock_ilock:
 	xfs_iunlock_map_shared(ip, lock);
+ out_unlock_iolock:
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
-	kmem_free(map);
+	for (i = 0; i < cur_ext; i++) {
+		int full = 0;	/* user array is full */
+
+		/* format results & advance arg */
+		error = formatter(&arg, &out[i], &full);
+		if (error || full)
+			break;
+	}
 
 	return error;
 }
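The xfs_getbmap() rework buffers all getbmapx records in a kernel array and only invokes the formatter, which may fault on user memory, after every lock has been dropped. The pattern reduced to its essentials (the callback type and helper are illustrative stand-ins, reusing the getbmapx type from the hunk):

	/* Collect under the lock, format with no locks held (sketch). */
	typedef int (*format_fn)(void **arg, struct getbmapx *rec, int *full);

	static int drain_records(struct getbmapx *out, int nrecs,
				 format_fn formatter, void **arg)
	{
		int i, full = 0, error = 0;

		for (i = 0; i < nrecs; i++) {
			error = formatter(arg, &out[i], &full);
			if (error || full)
				break;
		}
		return error;
	}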
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 478e587087fe..89b81eedce6a 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -69,15 +69,6 @@ xfs_inode_alloc(
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
 	ASSERT(completion_done(&ip->i_flush));
 
-	/*
-	 * initialise the VFS inode here to get failures
-	 * out of the way early.
-	 */
-	if (!inode_init_always(mp->m_super, VFS_I(ip))) {
-		kmem_zone_free(xfs_inode_zone, ip);
-		return NULL;
-	}
-
 	/* initialise the xfs inode */
 	ip->i_ino = ino;
 	ip->i_mount = mp;
@@ -113,6 +104,20 @@ xfs_inode_alloc(
 #ifdef XFS_DIR2_TRACE
 	ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
 #endif
+	/*
+	 * Now initialise the VFS inode. We do this after the xfs_inode
+	 * initialisation as internal failures will result in ->destroy_inode
+	 * being called and that will pass down through the reclaim path and
+	 * free the XFS inode. This path requires the XFS inode to already be
+	 * initialised. Hence if this call fails, the xfs_inode has already
+	 * been freed and we should not reference it at all in the error
+	 * handling.
+	 */
+	if (!inode_init_always(mp->m_super, VFS_I(ip)))
+		return NULL;
+
+	/* prevent anyone from using this yet */
+	VFS_I(ip)->i_state = I_NEW|I_LOCK;
 
 	return ip;
 }
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index e7ae08d1df48..123b20c8cbf2 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1258,8 +1258,10 @@ xfs_file_last_byte(
 	 * necessary.
 	 */
 	if (ip->i_df.if_flags & XFS_IFEXTENTS) {
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
 		error = xfs_bmap_last_offset(NULL, ip, &last_block,
 			XFS_DATA_FORK);
+		xfs_iunlock(ip, XFS_ILOCK_SHARED);
 		if (error) {
 			last_block = 0;
 		}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 08ce72316bfe..5aaa2d7ec155 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -338,38 +338,6 @@ xfs_iomap_eof_align_last_fsb(
 }
 
 STATIC int
-xfs_flush_space(
-	xfs_inode_t	*ip,
-	int		*fsynced,
-	int		*ioflags)
-{
-	switch (*fsynced) {
-	case 0:
-		if (ip->i_delayed_blks) {
-			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-			xfs_flush_inode(ip);
-			xfs_ilock(ip, XFS_ILOCK_EXCL);
-			*fsynced = 1;
-		} else {
-			*ioflags |= BMAPI_SYNC;
-			*fsynced = 2;
-		}
-		return 0;
-	case 1:
-		*fsynced = 2;
-		*ioflags |= BMAPI_SYNC;
-		return 0;
-	case 2:
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		xfs_flush_device(ip);
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		*fsynced = 3;
-		return 0;
-	}
-	return 1;
-}
-
-STATIC int
 xfs_cmn_err_fsblock_zero(
 	xfs_inode_t	*ip,
 	xfs_bmbt_irec_t	*imap)
@@ -538,15 +506,9 @@ error_out:
 }
 
 /*
- * If the caller is doing a write at the end of the file,
- * then extend the allocation out to the file system's write
- * iosize. We clean up any extra space left over when the
- * file is closed in xfs_inactive().
- *
- * For sync writes, we are flushing delayed allocate space to
- * try to make additional space available for allocation near
- * the filesystem full boundary - preallocation hurts in that
- * situation, of course.
+ * If the caller is doing a write at the end of the file, then extend the
+ * allocation out to the file system's write iosize. We clean up any extra
+ * space left over when the file is closed in xfs_inactive().
  */
 STATIC int
 xfs_iomap_eof_want_preallocate(
@@ -565,7 +527,7 @@ xfs_iomap_eof_want_preallocate(
 	int		n, error, imaps;
 
 	*prealloc = 0;
-	if ((ioflag & BMAPI_SYNC) || (offset + count) <= ip->i_size)
+	if ((offset + count) <= ip->i_size)
 		return 0;
 
 	/*
@@ -611,7 +573,7 @@ xfs_iomap_write_delay(
 	xfs_extlen_t	extsz;
 	int		nimaps;
 	xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
-	int		prealloc, fsynced = 0;
+	int		prealloc, flushed = 0;
 	int		error;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -627,12 +589,12 @@ xfs_iomap_write_delay(
 	extsz = xfs_get_extsz_hint(ip);
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 
-retry:
 	error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
 				ioflag, imap, XFS_WRITE_IMAPS, &prealloc);
 	if (error)
 		return error;
 
+retry:
 	if (prealloc) {
 		aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
 		ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
@@ -659,15 +621,22 @@ retry:
 
 	/*
 	 * If bmapi returned us nothing, and if we didn't get back EDQUOT,
-	 * then we must have run out of space - flush delalloc, and retry..
+	 * then we must have run out of space - flush all other inodes with
+	 * delalloc blocks and retry without EOF preallocation.
 	 */
 	if (nimaps == 0) {
 		xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE,
 					ip, offset, count);
-		if (xfs_flush_space(ip, &fsynced, &ioflag))
+		if (flushed)
 			return XFS_ERROR(ENOSPC);
 
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		xfs_flush_inodes(ip);
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+		flushed = 1;
 		error = 0;
+		prealloc = 0;
 		goto retry;
 	}
 
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index a1cc1322fc0f..fdcf7b82747f 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -40,8 +40,7 @@ typedef enum {
 	BMAPI_IGNSTATE = (1 << 4),	/* ignore unwritten state on read */
 	BMAPI_DIRECT = (1 << 5),	/* direct instead of buffered write */
 	BMAPI_MMAP = (1 << 6),		/* allocate for mmap write */
-	BMAPI_SYNC = (1 << 7),		/* sync write to flush delalloc space */
-	BMAPI_TRYLOCK = (1 << 8),	/* non-blocking request */
+	BMAPI_TRYLOCK = (1 << 7),	/* non-blocking request */
 } bmapi_flags_t;
 
 
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index f76c6d7cea21..3750f04ede0b 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -562,9 +562,8 @@ xfs_log_mount(
562 } 562 }
563 563
564 mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); 564 mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
565 if (!mp->m_log) { 565 if (IS_ERR(mp->m_log)) {
566 cmn_err(CE_WARN, "XFS: Log allocation failed: No memory!"); 566 error = -PTR_ERR(mp->m_log);
567 error = ENOMEM;
568 goto out; 567 goto out;
569 } 568 }
570 569
@@ -1180,10 +1179,13 @@ xlog_alloc_log(xfs_mount_t *mp,
1180 xfs_buf_t *bp; 1179 xfs_buf_t *bp;
1181 int i; 1180 int i;
1182 int iclogsize; 1181 int iclogsize;
1182 int error = ENOMEM;
1183 1183
1184 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); 1184 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
1185 if (!log) 1185 if (!log) {
1186 return NULL; 1186 xlog_warn("XFS: Log allocation failed: No memory!");
1187 goto out;
1188 }
1187 1189
1188 log->l_mp = mp; 1190 log->l_mp = mp;
1189 log->l_targ = log_target; 1191 log->l_targ = log_target;
@@ -1201,19 +1203,35 @@ xlog_alloc_log(xfs_mount_t *mp,
1201 log->l_grant_reserve_cycle = 1; 1203 log->l_grant_reserve_cycle = 1;
1202 log->l_grant_write_cycle = 1; 1204 log->l_grant_write_cycle = 1;
1203 1205
1206 error = EFSCORRUPTED;
1204 if (xfs_sb_version_hassector(&mp->m_sb)) { 1207 if (xfs_sb_version_hassector(&mp->m_sb)) {
1205 log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT; 1208 log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT;
1206 ASSERT(log->l_sectbb_log <= mp->m_sectbb_log); 1209 if (log->l_sectbb_log < 0 ||
1210 log->l_sectbb_log > mp->m_sectbb_log) {
1211 xlog_warn("XFS: Log sector size (0x%x) out of range.",
1212 log->l_sectbb_log);
1213 goto out_free_log;
1214 }
1215
1207 /* for larger sector sizes, must have v2 or external log */ 1216 /* for larger sector sizes, must have v2 or external log */
1208 ASSERT(log->l_sectbb_log == 0 || 1217 if (log->l_sectbb_log != 0 &&
1209 log->l_logBBstart == 0 || 1218 (log->l_logBBstart != 0 &&
1210 xfs_sb_version_haslogv2(&mp->m_sb)); 1219 !xfs_sb_version_haslogv2(&mp->m_sb))) {
1211 ASSERT(mp->m_sb.sb_logsectlog >= BBSHIFT); 1220 xlog_warn("XFS: log sector size (0x%x) invalid "
1221 "for configuration.", log->l_sectbb_log);
1222 goto out_free_log;
1223 }
1224 if (mp->m_sb.sb_logsectlog < BBSHIFT) {
1225 xlog_warn("XFS: Log sector log (0x%x) too small.",
1226 mp->m_sb.sb_logsectlog);
1227 goto out_free_log;
1228 }
1212 } 1229 }
1213 log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1; 1230 log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1;
1214 1231
1215 xlog_get_iclog_buffer_size(mp, log); 1232 xlog_get_iclog_buffer_size(mp, log);
1216 1233
1234 error = ENOMEM;
1217 bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp); 1235 bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp);
1218 if (!bp) 1236 if (!bp)
1219 goto out_free_log; 1237 goto out_free_log;
@@ -1313,7 +1331,8 @@ out_free_iclog:
 	xfs_buf_free(log->l_xbuf);
 out_free_log:
 	kmem_free(log);
-	return NULL;
+out:
+	return ERR_PTR(-error);
 }	/* xlog_alloc_log */
 
 
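[Editorial note] The xfs_log.c hunks above switch xlog_alloc_log() from returning NULL on every failure to returning ERR_PTR(-error), so the xfs_log_mount() caller can distinguish ENOMEM from EFSCORRUPTED. Below is a minimal userspace sketch of that kernel convention; the three helpers mirror include/linux/err.h, while struct widget and alloc_widget() are hypothetical stand-ins for illustration only.

/* Sketch of the ERR_PTR error-return convention adopted above. */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO	4095

/* A negative errno encoded as a pointer into the top (invalid) page. */
static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct widget { int id; };

/* Like the patched xlog_alloc_log(): each failure path sets 'error'
 * and the shared exit returns ERR_PTR(-error) instead of NULL. */
static struct widget *alloc_widget(int id)
{
	int error = ENOMEM;
	struct widget *w = malloc(sizeof(*w));

	if (!w)
		goto out;
	error = EINVAL;			/* reuse 'error' for validation */
	if (id < 0) {
		free(w);
		goto out;
	}
	w->id = id;
	return w;
out:
	return ERR_PTR(-error);
}

int main(void)
{
	struct widget *w = alloc_widget(-1);

	if (IS_ERR(w)) {		/* mirrors the xfs_log_mount() caller */
		fprintf(stderr, "alloc failed: errno %ld\n", -PTR_ERR(w));
		return 1;
	}
	printf("widget %d\n", w->id);
	free(w);
	return 0;
}

The payoff is visible in the xfs_log_mount() hunk: the caller no longer hard-codes ENOMEM and prints its own warning, it just propagates whichever errno the allocator chose.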
@@ -2541,18 +2560,19 @@ redo:
 			xlog_ins_ticketq(&log->l_reserve_headq, tic);
 		xlog_trace_loggrant(log, tic,
 				    "xlog_grant_log_space: sleep 2");
+		spin_unlock(&log->l_grant_lock);
+		xlog_grant_push_ail(log->l_mp, need_bytes);
+		spin_lock(&log->l_grant_lock);
+
 		XFS_STATS_INC(xs_sleep_logspace);
 		sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
 
-		if (XLOG_FORCED_SHUTDOWN(log)) {
-			spin_lock(&log->l_grant_lock);
+		spin_lock(&log->l_grant_lock);
+		if (XLOG_FORCED_SHUTDOWN(log))
 			goto error_return;
-		}
 
 		xlog_trace_loggrant(log, tic,
 				    "xlog_grant_log_space: wake 2");
-		xlog_grant_push_ail(log->l_mp, need_bytes);
-		spin_lock(&log->l_grant_lock);
 		goto redo;
 	} else if (tic->t_flags & XLOG_TIC_IN_Q)
 		xlog_del_ticketq(&log->l_reserve_headq, tic);
@@ -2631,7 +2651,7 @@ xlog_regrant_write_log_space(xlog_t *log,
 	 * for more free space, otherwise try to get some space for
 	 * this transaction.
 	 */
-
+	need_bytes = tic->t_unit_res;
 	if ((ntic = log->l_write_headq)) {
 		free_bytes = xlog_space_left(log, log->l_grant_write_cycle,
 					     log->l_grant_write_bytes);
@@ -2651,26 +2671,25 @@ xlog_regrant_write_log_space(xlog_t *log,
 
 			xlog_trace_loggrant(log, tic,
 				    "xlog_regrant_write_log_space: sleep 1");
+			spin_unlock(&log->l_grant_lock);
+			xlog_grant_push_ail(log->l_mp, need_bytes);
+			spin_lock(&log->l_grant_lock);
+
 			XFS_STATS_INC(xs_sleep_logspace);
 			sv_wait(&tic->t_wait, PINOD|PLTWAIT,
 				&log->l_grant_lock, s);
 
 			/* If we're shutting down, this tic is already
 			 * off the queue */
-			if (XLOG_FORCED_SHUTDOWN(log)) {
-				spin_lock(&log->l_grant_lock);
+			spin_lock(&log->l_grant_lock);
+			if (XLOG_FORCED_SHUTDOWN(log))
 				goto error_return;
-			}
 
 			xlog_trace_loggrant(log, tic,
 				"xlog_regrant_write_log_space: wake 1");
-			xlog_grant_push_ail(log->l_mp, tic->t_unit_res);
-			spin_lock(&log->l_grant_lock);
 		}
 	}
 
-	need_bytes = tic->t_unit_res;
-
 redo:
 	if (XLOG_FORCED_SHUTDOWN(log))
 		goto error_return;
@@ -2680,19 +2699,20 @@ redo:
 	if (free_bytes < need_bytes) {
 		if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
 			xlog_ins_ticketq(&log->l_write_headq, tic);
+		spin_unlock(&log->l_grant_lock);
+		xlog_grant_push_ail(log->l_mp, need_bytes);
+		spin_lock(&log->l_grant_lock);
+
 		XFS_STATS_INC(xs_sleep_logspace);
 		sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
 
 		/* If we're shutting down, this tic is already off the queue */
-		if (XLOG_FORCED_SHUTDOWN(log)) {
-			spin_lock(&log->l_grant_lock);
+		spin_lock(&log->l_grant_lock);
+		if (XLOG_FORCED_SHUTDOWN(log))
 			goto error_return;
-		}
 
 		xlog_trace_loggrant(log, tic,
 				    "xlog_regrant_write_log_space: wake 2");
-		xlog_grant_push_ail(log->l_mp, need_bytes);
-		spin_lock(&log->l_grant_lock);
 		goto redo;
 	} else if (tic->t_flags & XLOG_TIC_IN_Q)
 		xlog_del_ticketq(&log->l_write_headq, tic);
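[Editorial note] All three grant-path hunks above share one shape: xlog_grant_push_ail() moves to before the sleep and is now called with l_grant_lock dropped, and the lock is retaken unconditionally after sv_wait() before the shutdown recheck, instead of only on the shutdown branch. A minimal pthreads sketch of that drop-call-retake discipline, assuming a hypothetical push_work() in place of xlog_grant_push_ail() (sv_wait() itself atomically releases the lock while sleeping, approximated here with an explicit unlock/lock pair):

/* Sketch of the lock discipline used in the grant-path hunks. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t grant_lock = PTHREAD_MUTEX_INITIALIZER;
static int shutting_down;

static void push_work(void)
{
	/* May block or take other locks, so grant_lock must be dropped. */
}

static int wait_for_space(void)
{
	pthread_mutex_lock(&grant_lock);

	/* As in the patch: drop the lock around the helper, retake it. */
	pthread_mutex_unlock(&grant_lock);
	push_work();
	pthread_mutex_lock(&grant_lock);

	/* sv_wait() would atomically release grant_lock and sleep here;
	 * afterwards the lock is retaken unconditionally. */
	pthread_mutex_unlock(&grant_lock);
	/* (asleep, lock not held) */
	pthread_mutex_lock(&grant_lock);

	if (shutting_down) {		/* recheck only after relocking */
		pthread_mutex_unlock(&grant_lock);
		return -1;
	}
	pthread_mutex_unlock(&grant_lock);
	return 0;
}

int main(void)
{
	printf("wait_for_space: %d\n", wait_for_space());
	return 0;
}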
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b101990df027..65a99725d0cc 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -291,14 +291,17 @@ xfs_mount_validate_sb(
 	    sbp->sb_sectsize > XFS_MAX_SECTORSIZE ||
 	    sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG ||
 	    sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG ||
+	    sbp->sb_sectsize != (1 << sbp->sb_sectlog) ||
 	    sbp->sb_blocksize < XFS_MIN_BLOCKSIZE ||
 	    sbp->sb_blocksize > XFS_MAX_BLOCKSIZE ||
 	    sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG ||
 	    sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG ||
+	    sbp->sb_blocksize != (1 << sbp->sb_blocklog) ||
 	    sbp->sb_inodesize < XFS_DINODE_MIN_SIZE ||
 	    sbp->sb_inodesize > XFS_DINODE_MAX_SIZE ||
 	    sbp->sb_inodelog < XFS_DINODE_MIN_LOG ||
 	    sbp->sb_inodelog > XFS_DINODE_MAX_LOG ||
+	    sbp->sb_inodesize != (1 << sbp->sb_inodelog) ||
 	    (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) ||
 	    (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
 	    (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
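[Editorial note] The xfs_mount_validate_sb() additions cross-check each superblock size field against its log2 counterpart, rejecting superblocks where, say, sb_blocksize is within range but disagrees with sb_blocklog. A self-contained sketch of the same check, with an illustrative struct rather than the real on-disk XFS layout:

/* Sketch of the size-vs-log2 consistency check added above. */
#include <stdio.h>

struct fake_sb {
	unsigned int	blocksize;	/* bytes per block */
	unsigned int	blocklog;	/* log2(blocksize) */
};

/* Returns nonzero when the pair disagrees, mirroring
 * sbp->sb_blocksize != (1 << sbp->sb_blocklog). */
static int blocksize_inconsistent(const struct fake_sb *sbp)
{
	if (sbp->blocklog > 31)		/* avoid undefined shift */
		return 1;
	return sbp->blocksize != (1U << sbp->blocklog);
}

int main(void)
{
	struct fake_sb good = { 4096, 12 };
	struct fake_sb bad  = { 4096, 13 };	/* 1 << 13 == 8192 */

	printf("good: %s\n", blocksize_inconsistent(&good) ? "corrupt" : "ok");
	printf("bad:  %s\n", blocksize_inconsistent(&bad)  ? "corrupt" : "ok");
	return 0;
}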
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7af44adffc8f..d6a64392f983 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -313,7 +313,7 @@ typedef struct xfs_mount {
 #endif
 	struct xfs_mru_cache	*m_filestream;	/* per-mount filestream data */
 	struct task_struct	*m_sync_task;	/* generalised sync thread */
-	bhv_vfs_sync_work_t	m_sync_work;	/* work item for VFS_SYNC */
+	xfs_sync_work_t		m_sync_work;	/* work item for VFS_SYNC */
 	struct list_head	m_sync_list;	/* sync thread work item list */
 	spinlock_t		m_sync_lock;	/* work item list lock */
 	int			m_sync_seq;	/* sync thread generation no. */
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 7394c7af5de5..19cf90a9c762 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1457,6 +1457,13 @@ xfs_create(
 	error = xfs_trans_reserve(tp, resblks, log_res, 0,
 			XFS_TRANS_PERM_LOG_RES, log_count);
 	if (error == ENOSPC) {
+		/* flush outstanding delalloc blocks and retry */
+		xfs_flush_inodes(dp);
+		error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+	}
+	if (error == ENOSPC) {
+		/* No space at all so try a "no-allocation" reservation */
 		resblks = 0;
 		error = xfs_trans_reserve(tp, 0, log_res, 0,
 			XFS_TRANS_PERM_LOG_RES, log_count);
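[Editorial note] The xfs_create() hunk turns a single ENOSPC fallback into a two-step retry ladder: first flush outstanding delalloc blocks (which can convert reserved space into real free space) and retry the full reservation, and only then fall back to a zero-block "no-allocation" reservation. A hedged sketch of that escalation, with reserve_space() and flush_delalloc() as hypothetical stand-ins for xfs_trans_reserve() and xfs_flush_inodes():

/* Sketch of the ENOSPC retry ladder added to xfs_create(). */
#include <errno.h>
#include <stdio.h>

static long free_blocks = 2;	/* simulated free space */
static long reclaimable = 8;	/* simulated delalloc backlog */

static int reserve_space(long blocks)
{
	if (blocks > free_blocks)
		return ENOSPC;
	free_blocks -= blocks;
	return 0;
}

static void flush_delalloc(void)
{
	/* Converts the delalloc backlog into allocatable space. */
	free_blocks += reclaimable;
	reclaimable = 0;
}

int main(void)
{
	long resblks = 5;
	int error = reserve_space(resblks);

	if (error == ENOSPC) {
		/* flush outstanding delalloc blocks and retry */
		flush_delalloc();
		error = reserve_space(resblks);
	}
	if (error == ENOSPC) {
		/* no space at all: try a "no-allocation" reservation */
		resblks = 0;
		error = reserve_space(0);
	}
	printf("reservation %s (resblks=%ld)\n",
	       error ? "failed" : "succeeded", resblks);
	return error;
}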