33 files changed, 467 insertions, 374 deletions
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
index bf8c8af98004..4eb4d8dfb2f1 100644
--- a/fs/autofs/dirhash.c
+++ b/fs/autofs/dirhash.c
@@ -39,10 +39,12 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
 {
        struct autofs_dirhash *dh = &sbi->dirhash;
        struct autofs_dir_ent *ent;
-        struct dentry *dentry;
        unsigned long timeout = sbi->exp_timeout;
        while (1) {
+                struct path path;
+                int umount_ok;
                if ( list_empty(&dh->expiry_head) || sbi->catatonic )
                        return NULL;    /* No entries */
                /* We keep the list sorted by last_usage and want old stuff */
@@ -57,17 +59,17 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
                        return ent; /* Symlinks are always expirable */
                /* Get the dentry for the autofs subdirectory */
-                dentry = ent->dentry;
+                path.dentry = ent->dentry;
-                if ( !dentry ) {
+                if (!path.dentry) {
                        /* Should only happen in catatonic mode */
                        printk("autofs: dentry == NULL but inode range is directory, entry %s\n", ent->name);
                        autofs_delete_usage(ent);
                        continue;
                }
-                if ( !dentry->d_inode ) {
+                if (!path.dentry->d_inode) {
-                        dput(dentry);
+                        dput(path.dentry);
                        printk("autofs: negative dentry on expiry queue: %s\n",
                               ent->name);
                        autofs_delete_usage(ent);
@@ -76,29 +78,29 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
                /* Make sure entry is mounted and unused; note that dentry will
                   point to the mounted-on-top root. */
-                if (!S_ISDIR(dentry->d_inode->i_mode)||!d_mountpoint(dentry)) {
+                if (!S_ISDIR(path.dentry->d_inode->i_mode) ||
+                    !d_mountpoint(path.dentry)) {
                        DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
                        continue;
                }
-                mntget(mnt);
+                path.mnt = mnt;
-                dget(dentry);
+                path_get(&path);
-                if (!follow_down(&mnt, &dentry)) {
+                if (!follow_down(&path.mnt, &path.dentry)) {
-                        dput(dentry);
+                        path_put(&path);
-                        mntput(mnt);
                        DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
                        continue;
                }
-                while (d_mountpoint(dentry) && follow_down(&mnt, &dentry))
+                while (d_mountpoint(path.dentry) &&
+                       follow_down(&path.mnt, &path.dentry))
                        ;
-                dput(dentry);
+                umount_ok = may_umount(path.mnt);
+                path_put(&path);
-                if ( may_umount(mnt) ) {
+                if (umount_ok) {
-                        mntput(mnt);
                        DPRINTK(("autofs: signaling expire on %s\n", ent->name));
                        return ent; /* Expirable! */
                }
                DPRINTK(("autofs: didn't expire due to may_umount: %s\n", ent->name));
-                mntput(mnt);
        }
        return NULL;            /* No expirable entries */
 }
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 9e5ae8a4f5c8..84168c0dcc2d 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -54,11 +54,10 @@ static int check_name(const char *name)
 * Check a string doesn't overrun the chunk of
 * memory we copied from user land.
 */
-static int invalid_str(char *str, void *end)
+static int invalid_str(char *str, size_t size)
 {
-        while ((void *) str <= end)
+        if (memchr(str, 0, size))
-                if (!*str++)
+                return 0;
-                        return 0;
        return -EINVAL;
 }
@@ -138,8 +137,7 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
        }
        if (param->size > sizeof(*param)) {
-                err = invalid_str(param->path,
+                err = invalid_str(param->path, param->size - sizeof(*param));
-                                 (void *) ((size_t) param + param->size));
                if (err) {
                        AUTOFS_WARN(
                          "path string terminator missing for cmd(0x%08x)",
@@ -488,7 +486,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
        }
        path = param->path;
-        devid = sbi->sb->s_dev;
+        devid = new_encode_dev(sbi->sb->s_dev);
        param->requester.uid = param->requester.gid = -1;
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 51bfdfc8fcda..502c3d61de62 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -25,6 +25,7 @@
 #define WORK_QUEUED_BIT 0
 #define WORK_DONE_BIT 1
 #define WORK_ORDER_DONE_BIT 2
+#define WORK_HIGH_PRIO_BIT 3
 /*
 * container for the kthread task pointer and the list of pending work
@@ -36,6 +37,7 @@ struct btrfs_worker_thread {
        /* list of struct btrfs_work that are waiting for service */
        struct list_head pending;
+        struct list_head prio_pending;
        /* list of worker threads from struct btrfs_workers */
        struct list_head worker_list;
@@ -103,10 +105,16 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
        spin_lock_irqsave(&workers->lock, flags);
-        while (!list_empty(&workers->order_list)) {
+        while (1) {
-                work = list_entry(workers->order_list.next,
+                if (!list_empty(&workers->prio_order_list)) {
-                                  struct btrfs_work, order_list);
+                        work = list_entry(workers->prio_order_list.next,
+                                          struct btrfs_work, order_list);
+                } else if (!list_empty(&workers->order_list)) {
+                        work = list_entry(workers->order_list.next,
+                                          struct btrfs_work, order_list);
+                } else {
+                        break;
+                }
                if (!test_bit(WORK_DONE_BIT, &work->flags))
                        break;
@@ -143,8 +151,14 @@ static int worker_loop(void *arg)
        do {
                spin_lock_irq(&worker->lock);
 again_locked:
-                while (!list_empty(&worker->pending)) {
+                while (1) {
-                        cur = worker->pending.next;
+                        if (!list_empty(&worker->prio_pending))
+                                cur = worker->prio_pending.next;
+                        else if (!list_empty(&worker->pending))
+                                cur = worker->pending.next;
+                        else
+                                break;
                        work = list_entry(cur, struct btrfs_work, list);
                        list_del(&work->list);
                        clear_bit(WORK_QUEUED_BIT, &work->flags);
@@ -163,7 +177,6 @@ again_locked:
                        spin_lock_irq(&worker->lock);
                        check_idle_worker(worker);
                }
                if (freezing(current)) {
                        worker->working = 0;
@@ -178,7 +191,8 @@ again_locked:
                                 * jump_in?
                                 */
                                smp_mb();
-                                if (!list_empty(&worker->pending))
+                                if (!list_empty(&worker->pending) ||
+                                    !list_empty(&worker->prio_pending))
                                        continue;
                                /*
@@ -191,7 +205,8 @@ again_locked:
                                 */
                                schedule_timeout(1);
                                smp_mb();
-                                if (!list_empty(&worker->pending))
+                                if (!list_empty(&worker->pending) ||
+                                    !list_empty(&worker->prio_pending))
                                        continue;
                                if (kthread_should_stop())
@@ -200,7 +215,8 @@ again_locked:
                                /* still no more work?, sleep for real */
                                spin_lock_irq(&worker->lock);
                                set_current_state(TASK_INTERRUPTIBLE);
-                                if (!list_empty(&worker->pending))
+                                if (!list_empty(&worker->pending) ||
+                                    !list_empty(&worker->prio_pending))
                                        goto again_locked;
                                /*
@@ -248,6 +264,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
        INIT_LIST_HEAD(&workers->worker_list);
        INIT_LIST_HEAD(&workers->idle_list);
        INIT_LIST_HEAD(&workers->order_list);
+        INIT_LIST_HEAD(&workers->prio_order_list);
        spin_lock_init(&workers->lock);
        workers->max_workers = max;
        workers->idle_thresh = 32;
@@ -273,6 +290,7 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
                }
                INIT_LIST_HEAD(&worker->pending);
+                INIT_LIST_HEAD(&worker->prio_pending);
                INIT_LIST_HEAD(&worker->worker_list);
                spin_lock_init(&worker->lock);
                atomic_set(&worker->num_pending, 0);
@@ -396,7 +414,10 @@ int btrfs_requeue_work(struct btrfs_work *work)
                goto out;
        spin_lock_irqsave(&worker->lock, flags);
-        list_add_tail(&work->list, &worker->pending);
+        if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
+                list_add_tail(&work->list, &worker->prio_pending);
+        else
+                list_add_tail(&work->list, &worker->pending);
        atomic_inc(&worker->num_pending);
        /* by definition we're busy, take ourselves off the idle
@@ -422,6 +443,11 @@ out:
        return 0;
 }
+void btrfs_set_work_high_prio(struct btrfs_work *work)
+{
+        set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+}
 /*
 * places a struct btrfs_work into the pending queue of one of the kthreads
 */
@@ -438,7 +464,12 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
        worker = find_worker(workers);
        if (workers->ordered) {
                spin_lock_irqsave(&workers->lock, flags);
-                list_add_tail(&work->order_list, &workers->order_list);
+                if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
+                        list_add_tail(&work->order_list,
+                                      &workers->prio_order_list);
+                } else {
+                        list_add_tail(&work->order_list, &workers->order_list);
+                }
                spin_unlock_irqrestore(&workers->lock, flags);
        } else {
                INIT_LIST_HEAD(&work->order_list);
@@ -446,7 +477,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
        spin_lock_irqsave(&worker->lock, flags);
-        list_add_tail(&work->list, &worker->pending);
+        if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
+                list_add_tail(&work->list, &worker->prio_pending);
+        else
+                list_add_tail(&work->list, &worker->pending);
        atomic_inc(&worker->num_pending);
        check_busy_worker(worker);
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 31be4ed8b63e..1b511c109db6 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -85,6 +85,7 @@ struct btrfs_workers {
         * of work items waiting for completion
         */
        struct list_head order_list;
+        struct list_head prio_order_list;
        /* lock for finding the next worker thread to queue on */
        spinlock_t lock;
@@ -98,4 +99,5 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
 int btrfs_stop_workers(struct btrfs_workers *workers);
 void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
 int btrfs_requeue_work(struct btrfs_work *work);
+void btrfs_set_work_high_prio(struct btrfs_work *work);
 #endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index e5b2533b691a..a99f1c2a710d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1325,12 +1325,12 @@ static noinline int reada_for_balance(struct btrfs_root *root,
        int ret = 0;
        int blocksize;
-        parent = path->nodes[level - 1];
+        parent = path->nodes[level + 1];
        if (!parent)
                return 0;
        nritems = btrfs_header_nritems(parent);
-        slot = path->slots[level];
+        slot = path->slots[level + 1];
        blocksize = btrfs_level_size(root, level);
        if (slot > 0) {
@@ -1341,7 +1341,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
                        block1 = 0;
                free_extent_buffer(eb);
        }
-        if (slot < nritems) {
+        if (slot + 1 < nritems) {
                block2 = btrfs_node_blockptr(parent, slot + 1);
                gen = btrfs_node_ptr_generation(parent, slot + 1);
                eb = btrfs_find_tree_block(root, block2, blocksize);
@@ -1351,7 +1351,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
        }
        if (block1 || block2) {
                ret = -EAGAIN;
+                /* release the whole path */
                btrfs_release_path(root, path);
+                /* read the blocks */
                if (block1)
                        readahead_tree_block(root, block1, blocksize, 0);
                if (block2)
@@ -1361,7 +1365,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
                        eb = read_tree_block(root, block1, blocksize, 0);
                        free_extent_buffer(eb);
                }
-                if (block1) {
+                if (block2) {
                        eb = read_tree_block(root, block2, blocksize, 0);
                        free_extent_buffer(eb);
                }
@@ -1481,12 +1485,15 @@ read_block_for_search(struct btrfs_trans_handle *trans,
         * of the btree by dropping locks before
         * we read.
         */
-        btrfs_release_path(NULL, p);
+        btrfs_unlock_up_safe(p, level + 1);
+        btrfs_set_path_blocking(p);
        if (tmp)
                free_extent_buffer(tmp);
        if (p->reada)
                reada_for_search(root, p, level, slot, key->objectid);
+        btrfs_release_path(NULL, p);
        tmp = read_tree_block(root, blocknr, blocksize, gen);
        if (tmp)
                free_extent_buffer(tmp);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 92caa8035f36..a6b83744b05d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -579,6 +579,10 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
        async->bio_flags = bio_flags;
        atomic_inc(&fs_info->nr_async_submits);
+        if (rw & (1 << BIO_RW_SYNCIO))
+                btrfs_set_work_high_prio(&async->work);
        btrfs_queue_worker(&fs_info->workers, &async->work);
 #if 0
        int limit = btrfs_async_submit_limit(fs_info);
@@ -656,6 +660,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
                return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
                                     mirror_num, 0);
        }
        /*
         * kthread helpers are used to submit writes so that checksumming
         * can happen in parallel across all CPUs
@@ -2095,10 +2100,10 @@ static int write_dev_supers(struct btrfs_device *device,
                                device->barriers = 0;
                                get_bh(bh);
                                lock_buffer(bh);
-                                ret = submit_bh(WRITE, bh);
+                                ret = submit_bh(WRITE_SYNC, bh);
                        }
                } else {
-                        ret = submit_bh(WRITE, bh);
+                        ret = submit_bh(WRITE_SYNC, bh);
                }
                if (!ret && wait) {
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index eb2bee8b7fbf..05a1c42e25bf 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -50,7 +50,10 @@ struct extent_page_data {
        /* tells writepage not to lock the state bits for this range
         * it still does the unlocking
         */
-        int extent_locked;
+        unsigned int extent_locked:1;
+        /* tells the submit_bio code to use a WRITE_SYNC */
+        unsigned int sync_io:1;
 };
 int __init extent_io_init(void)
@@ -2101,6 +2104,16 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
        return ret;
 }
+static noinline void update_nr_written(struct page *page,
+                                      struct writeback_control *wbc,
+                                      unsigned long nr_written)
+{
+        wbc->nr_to_write -= nr_written;
+        if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
+            wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
+                page->mapping->writeback_index = page->index + nr_written;
+}
 /*
 * the writepage semantics are similar to regular writepage.  extent
 * records are inserted to lock ranges in the tree, and as dirty areas
@@ -2136,8 +2149,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
        u64 delalloc_end;
        int page_started;
        int compressed;
+        int write_flags;
        unsigned long nr_written = 0;
+        if (wbc->sync_mode == WB_SYNC_ALL)
+                write_flags = WRITE_SYNC_PLUG;
+        else
+                write_flags = WRITE;
        WARN_ON(!PageLocked(page));
        pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
        if (page->index > end_index ||
@@ -2164,6 +2183,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
        delalloc_end = 0;
        page_started = 0;
        if (!epd->extent_locked) {
+                /*
+                 * make sure the mapping's writeback index is at least
+                 * updated to this page.
+                 */
+                update_nr_written(page, wbc, 0);
                while (delalloc_end < page_end) {
                        nr_delalloc = find_lock_delalloc_range(inode, tree,
                                                       page,
@@ -2185,7 +2210,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                 */
                if (page_started) {
                        ret = 0;
-                        goto update_nr_written;
+                        /*
+                         * we've unlocked the page, so we can't update
+                         * the mapping's writeback index; only
+                         * nr_to_write is updated here.
+                         */
+                        wbc->nr_to_write -= nr_written;
+                        goto done_unlocked;
                }
        }
        lock_extent(tree, start, page_end, GFP_NOFS);
@@ -2198,13 +2229,18 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                if (ret == -EAGAIN) {
                        unlock_extent(tree, start, page_end, GFP_NOFS);
                        redirty_page_for_writepage(wbc, page);
+                        update_nr_written(page, wbc, nr_written);
                        unlock_page(page);
                        ret = 0;
-                        goto update_nr_written;
+                        goto done_unlocked;
                }
        }
-        nr_written++;
+        /*
+         * we don't want to touch the inode after unlocking the page,
+         * so we update the mapping writeback index now
+         */
+        update_nr_written(page, wbc, nr_written + 1);
        end = page_end;
        if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
@@ -2314,9 +2350,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                                       (unsigned long long)end);
                        }
-                        ret = submit_extent_page(WRITE, tree, page, sector,
+                        ret = submit_extent_page(write_flags, tree, page,
-                                                 iosize, pg_offset, bdev,
+                                                 sector, iosize, pg_offset,
-                                                 &epd->bio, max_nr,
+                                                 bdev, &epd->bio, max_nr,
                                                 end_bio_extent_writepage,
                                                 0, 0, 0);
                        if (ret)
@@ -2336,11 +2372,8 @@ done:
                unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
        unlock_page(page);
-update_nr_written:
+done_unlocked:
-        wbc->nr_to_write -= nr_written;
-        if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
-            wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
-                page->mapping->writeback_index = page->index + nr_written;
        return 0;
 }
@@ -2460,15 +2493,23 @@ retry:
        return ret;
 }
-static noinline void flush_write_bio(void *data)
+static void flush_epd_write_bio(struct extent_page_data *epd)
 {
-        struct extent_page_data *epd = data;
        if (epd->bio) {
-                submit_one_bio(WRITE, epd->bio, 0, 0);
+                if (epd->sync_io)
+                        submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
+                else
+                        submit_one_bio(WRITE, epd->bio, 0, 0);
                epd->bio = NULL;
        }
 }
+static noinline void flush_write_bio(void *data)
+{
+        struct extent_page_data *epd = data;
+        flush_epd_write_bio(epd);
+}
 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
                          get_extent_t *get_extent,
                          struct writeback_control *wbc)
@@ -2480,23 +2521,22 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
                .tree = tree,
                .get_extent = get_extent,
                .extent_locked = 0,
+                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
        };
        struct writeback_control wbc_writepages = {
                .bdi            = wbc->bdi,
-                .sync_mode      = WB_SYNC_NONE,
+                .sync_mode      = wbc->sync_mode,
                .older_than_this = NULL,
                .nr_to_write    = 64,
                .range_start    = page_offset(page) + PAGE_CACHE_SIZE,
                .range_end      = (loff_t)-1,
        };
        ret = __extent_writepage(page, wbc, &epd);
        extent_write_cache_pages(tree, mapping, &wbc_writepages,
                                 __extent_writepage, &epd, flush_write_bio);
-        if (epd.bio)
+        flush_epd_write_bio(&epd);
-                submit_one_bio(WRITE, epd.bio, 0, 0);
        return ret;
 }
@@ -2515,6 +2555,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
                .tree = tree,
                .get_extent = get_extent,
                .extent_locked = 1,
+                .sync_io = mode == WB_SYNC_ALL,
        };
        struct writeback_control wbc_writepages = {
                .bdi            = inode->i_mapping->backing_dev_info,
@@ -2540,8 +2581,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
                start += PAGE_CACHE_SIZE;
        }
-        if (epd.bio)
+        flush_epd_write_bio(&epd);
-                submit_one_bio(WRITE, epd.bio, 0, 0);
        return ret;
 }
@@ -2556,13 +2596,13 @@ int extent_writepages(struct extent_io_tree *tree,
                .tree = tree,
                .get_extent = get_extent,
                .extent_locked = 0,
+                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
        };
        ret = extent_write_cache_pages(tree, mapping, wbc,
                                       __extent_writepage, &epd,
                                       flush_write_bio);
-        if (epd.bio)
+        flush_epd_write_bio(&epd);
-                submit_one_bio(WRITE, epd.bio, 0, 0);
        return ret;
 }
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9c9fb46ccd08..482f8db2cfd0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -830,7 +830,7 @@ again:
                ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
                BUG_ON(ret);
-                goto done;
+                goto release;
        } else if (split == start) {
                if (locked_end < extent_end) {
                        ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
@@ -926,6 +926,8 @@ again:
        }
 done:
        btrfs_mark_buffer_dirty(leaf);
+release:
        btrfs_release_path(root, path);
        if (split_end && split == start) {
                split = end;
@@ -1131,7 +1133,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
                if (will_write) {
                        btrfs_fdatawrite_range(inode->i_mapping, pos,
                                               pos + write_bytes - 1,
-                                               WB_SYNC_NONE);
+                                               WB_SYNC_ALL);
                } else {
                        balance_dirty_pages_ratelimited_nr(inode->i_mapping,
                                                           num_pages);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a0d1dd492a58..65219f6a16a1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4970,10 +4970,10 @@ out_fail:
        return err;
 }
-static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
+static int prealloc_file_range(struct btrfs_trans_handle *trans,
+                               struct inode *inode, u64 start, u64 end,
                               u64 alloc_hint, int mode)
 {
-        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_key ins;
        u64 alloc_size;
@@ -4981,10 +4981,6 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
        u64 num_bytes = end - start;
        int ret = 0;
-        trans = btrfs_join_transaction(root, 1);
-        BUG_ON(!trans);
-        btrfs_set_trans_block_group(trans, inode);
        while (num_bytes > 0) {
                alloc_size = min(num_bytes, root->fs_info->max_extent);
                ret = btrfs_reserve_extent(trans, root, alloc_size,
@@ -5015,7 +5011,6 @@ out:
                BUG_ON(ret);
        }
-        btrfs_end_transaction(trans, root);
        return ret;
 }
@@ -5029,11 +5024,18 @@ static long btrfs_fallocate(struct inode *inode, int mode,
        u64 alloc_hint = 0;
        u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
        struct extent_map *em;
+        struct btrfs_trans_handle *trans;
        int ret;
        alloc_start = offset & ~mask;
        alloc_end =  (offset + len + mask) & ~mask;
+        /*
+         * wait for ordered IO before we have any locks.  We'll loop again
+         * below with the locks held.
+         */
+        btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
        mutex_lock(&inode->i_mutex);
        if (alloc_start > inode->i_size) {
                ret = btrfs_cont_expand(inode, alloc_start);
@@ -5043,6 +5045,16 @@ static long btrfs_fallocate(struct inode *inode, int mode,
        while (1) {
                struct btrfs_ordered_extent *ordered;
+                trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
+                if (!trans) {
+                        ret = -EIO;
+                        goto out;
+                }
+                /* lock ordering: the transaction is started first and
+                 * the extent lock is taken inside it
+                 */
                lock_extent(&BTRFS_I(inode)->io_tree, alloc_start,
                            alloc_end - 1, GFP_NOFS);
                ordered = btrfs_lookup_first_ordered_extent(inode,
@@ -5053,6 +5065,12 @@ static long btrfs_fallocate(struct inode *inode, int mode,
                        btrfs_put_ordered_extent(ordered);
                        unlock_extent(&BTRFS_I(inode)->io_tree,
                                      alloc_start, alloc_end - 1, GFP_NOFS);
+                        btrfs_end_transaction(trans, BTRFS_I(inode)->root);
+                        /*
+                         * we can't wait on the range with the transaction
+                         * running or with the extent lock held
+                         */
                        btrfs_wait_ordered_range(inode, alloc_start,
                                                 alloc_end - alloc_start);
                } else {
@@ -5070,7 +5088,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
                last_byte = min(extent_map_end(em), alloc_end);
                last_byte = (last_byte + mask) & ~mask;
                if (em->block_start == EXTENT_MAP_HOLE) {
-                        ret = prealloc_file_range(inode, cur_offset,
+                        ret = prealloc_file_range(trans, inode, cur_offset,
                                        last_byte, alloc_hint, mode);
                        if (ret < 0) {
                                free_extent_map(em);
@@ -5089,6 +5107,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
        }
        unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1,
                      GFP_NOFS);
+        btrfs_end_transaction(trans, BTRFS_I(inode)->root);
 out:
        mutex_unlock(&inode->i_mutex);
        return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7594bec1be10..9f135e878507 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -461,15 +461,9 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
-        vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
+        vol_args = memdup_user(arg, sizeof(*vol_args));
+        if (IS_ERR(vol_args))
-        if (!vol_args)
+                return PTR_ERR(vol_args);
-                return -ENOMEM;
-        if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
-                ret = -EFAULT;
-                goto out;
-        }
        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
        namelen = strlen(vol_args->name);
@@ -545,7 +539,6 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 out_unlock:
        mutex_unlock(&root->fs_info->volume_mutex);
-out:
        kfree(vol_args);
        return ret;
 }
@@ -565,15 +558,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
        if (root->fs_info->sb->s_flags & MS_RDONLY)
                return -EROFS;
-        vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
+        vol_args = memdup_user(arg, sizeof(*vol_args));
+        if (IS_ERR(vol_args))
-        if (!vol_args)
+                return PTR_ERR(vol_args);
-                return -ENOMEM;
-        if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
-                ret = -EFAULT;
-                goto out;
-        }
        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
        namelen = strlen(vol_args->name);
@@ -675,19 +662,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
-        vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
+        vol_args = memdup_user(arg, sizeof(*vol_args));
+        if (IS_ERR(vol_args))
+                return PTR_ERR(vol_args);
-        if (!vol_args)
-                return -ENOMEM;
-        if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
-                ret = -EFAULT;
-                goto out;
-        }
        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
        ret = btrfs_init_new_device(root, vol_args->name);
-out:
        kfree(vol_args);
        return ret;
 }
@@ -703,19 +684,13 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
        if (root->fs_info->sb->s_flags & MS_RDONLY)
                return -EROFS;
-        vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
+        vol_args = memdup_user(arg, sizeof(*vol_args));
+        if (IS_ERR(vol_args))
+                return PTR_ERR(vol_args);
-        if (!vol_args)
-                return -ENOMEM;
-        if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
-                ret = -EFAULT;
-                goto out;
-        }
        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
        ret = btrfs_rm_device(root, vol_args->name);
-out:
        kfree(vol_args);
        return ret;
 }
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 53c87b197d70..d6f0806c682f 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -489,7 +489,7 @@ again:
        /* start IO across the range first to instantiate any delalloc
         * extents
         */
-        btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
+        btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
        /* The compression code will leave pages locked but return from
         * writepage without setting the page writeback.  Starting again
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9744af9d71e9..a7acfe639a44 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -635,14 +635,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
-        vol = kmalloc(sizeof(*vol), GFP_KERNEL);
+        vol = memdup_user((void __user *)arg, sizeof(*vol));
-        if (!vol)
+        if (IS_ERR(vol))
-                return -ENOMEM;
+                return PTR_ERR(vol);
-        if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
-                ret = -EFAULT;
-                goto out;
-        }
        switch (cmd) {
        case BTRFS_IOC_SCAN_DEV:
@@ -650,7 +645,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
                                            &btrfs_fs_type, &fs_devices);
                break;
        }
-out:
        kfree(vol);
        return ret;
 }
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e0913e469728..e53835b88594 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -125,6 +125,20 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
        return NULL;
 }
+/* Splice the unprocessed chain head..tail back onto the front of the list. */
+static void requeue_list(struct btrfs_pending_bios *pending_bios,
+                        struct bio *head, struct bio *tail)
+{
+        struct bio *prev_head = pending_bios->head;
+        pending_bios->head = head;
+        if (!pending_bios->tail)
+                pending_bios->tail = tail;
+        else
+                tail->bi_next = prev_head;
+}
 /*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
@@ -141,10 +155,12 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_fs_info *fs_info;
+        struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
-        unsigned long num_run = 0;
+        unsigned long num_run;
+        unsigned long num_sync_run;
        unsigned long limit;
        unsigned long last_waited = 0;
@@ -153,20 +169,30 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
        limit = btrfs_async_submit_limit(fs_info);
        limit = limit * 2 / 3;
+        /* we want to make sure that every time we switch from the sync
+         * list to the normal list, we unplug
+         */
+        num_sync_run = 0;
 loop:
        spin_lock(&device->io_lock);
+        num_run = 0;
 loop_lock:
        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         */
-        pending = device->pending_bios;
+        if (device->pending_sync_bios.head)
-        tail = device->pending_bio_tail;
+                pending_bios = &device->pending_sync_bios;
+        else
+                pending_bios = &device->pending_bios;
+        pending = pending_bios->head;
+        tail = pending_bios->tail;
        WARN_ON(pending && !tail);
-        device->pending_bios = NULL;
-        device->pending_bio_tail = NULL;
        /*
         * if pending was null this time around, no bios need processing
@@ -176,16 +202,41 @@ loop_lock:
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
         */
-        if (pending) {
+        if (device->pending_sync_bios.head == NULL &&
-                again = 1;
+            device->pending_bios.head == NULL) {
-                device->running_pending = 1;
-        } else {
                again = 0;
                device->running_pending = 0;
+        } else {
+                again = 1;
+                device->running_pending = 1;
        }
+        pending_bios->head = NULL;
+        pending_bios->tail = NULL;
        spin_unlock(&device->io_lock);
+        /*
+         * if we're doing the regular priority list, make sure we unplug
+         * for any high prio bios we've sent down
+         */
+        if (pending_bios == &device->pending_bios && num_sync_run > 0) {
+                num_sync_run = 0;
+                blk_run_backing_dev(bdi, NULL);
+        }
        while (pending) {
+                rmb();
+                if (pending_bios != &device->pending_sync_bios &&
+                    device->pending_sync_bios.head &&
+                    num_run > 16) {
+                        cond_resched();
+                        spin_lock(&device->io_lock);
+                        requeue_list(pending_bios, pending, tail);
+                        goto loop_lock;
+                }
                cur = pending;
                pending = pending->bi_next;
                cur->bi_next = NULL;
@@ -196,10 +247,23 @@ loop_lock:
                        wake_up(&fs_info->async_submit_wait);
                BUG_ON(atomic_read(&cur->bi_cnt) == 0);
-                bio_get(cur);
+                /*
+                 * Sample the sync flag before submit_bio(): without the
+                 * bio_get()/bio_put() pair we hold no reference of our own,
+                 * so cur must not be touched once it has been submitted.
+                 */
+                if (bio_sync(cur))
+                        num_sync_run++;
                submit_bio(cur->bi_rw, cur);
-                bio_put(cur);
                num_run++;
+                if (need_resched()) {
+                        if (num_sync_run) {
+                                blk_run_backing_dev(bdi, NULL);
+                                num_sync_run = 0;
+                        }
+                        cond_resched();
+                }
                /*
                 * we made progress, there is more work to do and the bdi
@@ -208,7 +267,6 @@ loop_lock:
                 */
                if (pending && bdi_write_congested(bdi) && num_run > 16 &&
                    fs_info->fs_devices->open_devices > 1) {
-                        struct bio *old_head;
                        struct io_context *ioc;
                        ioc = current->io_context;
@@ -233,17 +291,17 @@ loop_lock:
                                 * against it before looping
                                 */
                                last_waited = ioc->last_waited;
+                                if (need_resched()) {
+                                        if (num_sync_run) {
+                                                blk_run_backing_dev(bdi, NULL);
+                                                num_sync_run = 0;
+                                        }
+                                        cond_resched();
+                                }
                                continue;
                        }
                        spin_lock(&device->io_lock);
+                        requeue_list(pending_bios, pending, tail);
-                        old_head = device->pending_bios;
-                        device->pending_bios = pending;
-                        if (device->pending_bio_tail)
-                                tail->bi_next = old_head;
-                        else
-                                device->pending_bio_tail = tail;
                        device->running_pending = 1;
                        spin_unlock(&device->io_lock);
@@ -251,11 +309,18 @@ loop_lock:
                        goto done;
                }
        }
+        if (num_sync_run) {
+                num_sync_run = 0;
+                blk_run_backing_dev(bdi, NULL);
+        }
+        cond_resched();
        if (again)
                goto loop;
        spin_lock(&device->io_lock);
-        if (device->pending_bios)
+        if (device->pending_bios.head || device->pending_sync_bios.head)
                goto loop_lock;
        spin_unlock(&device->io_lock);
@@ -2497,7 +2562,7 @@ again:
                        max_errors = 1;
                }
        }
-        if (multi_ret && rw == WRITE &&
+        if (multi_ret && (rw & (1 << BIO_RW)) &&
            stripes_allocated < stripes_required) {
                stripes_allocated = map->num_stripes;
                free_extent_map(em);
@@ -2762,6 +2827,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
                                 int rw, struct bio *bio)
 {
        int should_queue = 1;
+        struct btrfs_pending_bios *pending_bios;
        /* don't bother with additional async steps for reads, right now */
        if (!(rw & (1 << BIO_RW))) {
@@ -2783,13 +2849,17 @@ static noinline int schedule_bio(struct btrfs_root *root,
        bio->bi_rw |= rw;
        spin_lock(&device->io_lock);
+        if (bio_sync(bio))
+                pending_bios = &device->pending_sync_bios;
+        else
+                pending_bios = &device->pending_bios;
-        if (device->pending_bio_tail)
+        if (pending_bios->tail)
-                device->pending_bio_tail->bi_next = bio;
+                pending_bios->tail->bi_next = bio;
-        device->pending_bio_tail = bio;
+        pending_bios->tail = bio;
-        if (!device->pending_bios)
+        if (!pending_bios->head)
-                device->pending_bios = bio;
+                pending_bios->head = bio;
        if (device->running_pending)
                should_queue = 0;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2185de72ff7d..5836327ba5dd 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -23,13 +23,22 @@
 #include "async-thread.h"
 struct buffer_head;
+struct btrfs_pending_bios {     /* list of queued bios, chained via bi_next */
+        struct bio *head;       /* first bio to be run; NULL when empty */
+        struct bio *tail;       /* last bio; newly scheduled bios append here */
+};
 struct btrfs_device {
        struct list_head dev_list;
        struct list_head dev_alloc_list;
        struct btrfs_fs_devices *fs_devices;
        struct btrfs_root *dev_root;
-        struct bio *pending_bios;
-        struct bio *pending_bio_tail;
+        /* regular prio bios */
+        struct btrfs_pending_bios pending_bios;
+        /* WRITE_SYNC bios */
+        struct btrfs_pending_bios pending_sync_bios;
        int running_pending;
        u64 generation;
diff --git a/fs/compat.c b/fs/compat.c
index 3f84d5f15889..379a399bf5c3 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -181,22 +181,24 @@ asmlinkage long compat_sys_newstat(char __user * filename,
                struct compat_stat __user *statbuf)
 {
        struct kstat stat;
-        int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
+        int error;
-        if (!error)
+        error = vfs_stat(filename, &stat);
-                error = cp_compat_stat(&stat, statbuf);
+        if (error)
-        return error;
+                return error;
+        return cp_compat_stat(&stat, statbuf);
 }
 asmlinkage long compat_sys_newlstat(char __user * filename,
                struct compat_stat __user *statbuf)
 {
        struct kstat stat;
-        int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
+        int error;
-        if (!error)
+        error = vfs_lstat(filename, &stat);
-                error = cp_compat_stat(&stat, statbuf);
+        if (error)
-        return error;
+                return error;
+        return cp_compat_stat(&stat, statbuf);
 }
 #ifndef __ARCH_WANT_STAT64
@@ -204,21 +206,12 @@ asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user *filename,
                struct compat_stat __user *statbuf, int flag)
 {
        struct kstat stat;
-        int error = -EINVAL;
+        int error;
-        if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
-                goto out;
-        if (flag & AT_SYMLINK_NOFOLLOW)
-                error = vfs_lstat_fd(dfd, filename, &stat);
-        else
-                error = vfs_stat_fd(dfd, filename, &stat);
-        if (!error)
-                error = cp_compat_stat(&stat, statbuf);
-out:
+        error = vfs_fstatat(dfd, filename, &stat, flag);
-        return error;
+        if (error)
+                return error;
+        return cp_compat_stat(&stat, statbuf);
 }
 #endif
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 3e87ce443ea2..b83f6bcfa51a 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -58,7 +58,6 @@
 #include <linux/i2c.h>
 #include <linux/i2c-dev.h>
 #include <linux/atalk.h>
-#include <linux/loop.h>
 #include <net/bluetooth/bluetooth.h>
 #include <net/bluetooth/hci.h>
@@ -68,6 +67,7 @@
 #include <linux/gigaset_dev.h>
 #ifdef CONFIG_BLOCK
+#include <linux/loop.h>
 #include <scsi/scsi.h>
 #include <scsi/scsi_ioctl.h>
 #include <scsi/sg.h>
@@ -2660,6 +2660,8 @@ HANDLE_IOCTL(SONET_GETFRAMING, do_atm_ioctl)
 HANDLE_IOCTL(SONET_GETFRSENSE, do_atm_ioctl)
 /* block stuff */
 #ifdef CONFIG_BLOCK
+/* loop */
+IGNORE_IOCTL(LOOP_CLR_FD)
 /* Raw devices */
 HANDLE_IOCTL(RAW_SETBIND, raw_ioctl)
 HANDLE_IOCTL(RAW_GETBIND, raw_ioctl)
@@ -2728,9 +2730,6 @@ HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans)
 IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32)
 IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32)
-/* loop */
-IGNORE_IOCTL(LOOP_CLR_FD)
 #ifdef CONFIG_SPARC
 /* Sparc framebuffers, handled in sbusfb_compat_ioctl() */
 IGNORE_IOCTL(FBIOGTYPE)
diff --git a/fs/dcache.c b/fs/dcache.c
index 761d30be2683..1fcffebfb44f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2149,7 +2149,6 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
        int result;
        unsigned long seq;
-        /* FIXME: This is old behavior, needed? Please check callers. */
        if (new_dentry == old_dentry)
                return 1;
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index a67fea655f49..dda3c58eefc0 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -418,18 +418,13 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
        if (count == 0)
                goto out;
-        data = kmalloc(count, GFP_KERNEL);
-        if (!data) {
+        data = memdup_user(buf, count);
-                printk(KERN_ERR "%s: Out of memory whilst attempting to "
+        if (IS_ERR(data)) {
-                       "kmalloc([%zd], GFP_KERNEL)\n", __func__, count);
+                printk(KERN_ERR "%s: memdup_user returned error [%ld]\n",
+                       __func__, PTR_ERR(data));
                goto out;
        }
-        rc = copy_from_user(data, buf, count);
-        if (rc) {
-                printk(KERN_ERR "%s: copy_from_user returned error [%d]\n",
-                       __func__, rc);
-                goto out_free;
-        }
        sz = count;
        i = 0;
        switch (data[i++]) {
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 1aa70260e6d1..a24c58e181db 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -199,7 +199,7 @@ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2)
        return retval;
 }
-int get_filesystem_list(char * buf)
+int __init get_filesystem_list(char *buf)
 {
        int len = 0;
        struct file_system_type * tmp;
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index bf23a62aa925..70f87f43afa2 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -156,6 +156,12 @@ static void inode_go_sync(struct gfs2_glock *gl)
        error = filemap_fdatawait(metamapping);
        mapping_set_error(metamapping, error);
        gfs2_ail_empty_gl(gl);
+        /*
+         * Writeback of the data mapping may cause the dirty flag to be set
+         * so we have to clear it again here.
+         */
+        smp_mb__before_clear_bit();
+        clear_bit(GLF_DIRTY, &gl->gl_flags);
 }
 /**
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 101caf3ee861..5d82e91887e3 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -413,7 +413,9 @@ out_unlock:
        gfs2_glock_dq(&gh);
 out:
        gfs2_holder_uninit(&gh);
-        if (ret)
+        if (ret == -ENOMEM)
+                ret = VM_FAULT_OOM;
+        else if (ret)
                ret = VM_FAULT_SIGBUS;
        return ret;
 }
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 23a3c76711e0..153d9681192b 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -26,7 +26,6 @@
 #include <linux/pagevec.h>
 #include <linux/parser.h>
 #include <linux/mman.h>
-#include <linux/quotaops.h>
 #include <linux/slab.h>
 #include <linux/dnotify.h>
 #include <linux/statfs.h>
@@ -842,7 +841,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 bad_val:
        printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n",
               args[0].from, p);
-        return 1;
+        return -EINVAL;
 }
 static int
diff --git a/fs/namei.c b/fs/namei.c
index b8433ebfae05..78f253cd2d4f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1248,6 +1248,8 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
        int err;
        struct qstr this;
+        WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
        err = __lookup_one_len(name, &this, base, len);
        if (err)
                return ERR_PTR(err);
diff --git a/fs/namespace.c b/fs/namespace.c
index d9138f81ec10..41196209a906 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1377,7 +1377,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
        if (parent_path) {
                detach_mnt(source_mnt, parent_path);
                attach_mnt(source_mnt, path);
-                touch_mnt_namespace(current->nsproxy->mnt_ns);
+                touch_mnt_namespace(parent_path->mnt->mnt_ns);
        } else {
                mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
                commit_tree(source_mnt);
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index f54360f50a9c..fa038df63ac8 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -660,13 +660,10 @@ outrel:
                        if (user.object_name_len > NCP_OBJECT_NAME_MAX_LEN)
                                return -ENOMEM;
                        if (user.object_name_len) {
-                                newname = kmalloc(user.object_name_len, GFP_USER);
+                                newname = memdup_user(user.object_name,
-                                if (!newname)
+                                                      user.object_name_len);
-                                        return -ENOMEM;
+                                if (IS_ERR(newname))
-                                if (copy_from_user(newname, user.object_name, user.object_name_len)) {
+                                        return PTR_ERR(newname);
-                                        kfree(newname);
-                                        return -EFAULT;
-                                }
                        } else {
                                newname = NULL;
                        }
@@ -760,13 +757,9 @@ outrel:
                        if (user.len > NCP_PRIVATE_DATA_MAX_LEN)
                                return -ENOMEM;
                        if (user.len) {
-                                new = kmalloc(user.len, GFP_USER);
+                                new = memdup_user(user.data, user.len);
-                                if (!new)
+                                if (IS_ERR(new))
-                                        return -ENOMEM;
+                                        return PTR_ERR(new);
-                                if (copy_from_user(new, user.data, user.len)) {
-                                        kfree(new);
-                                        return -EFAULT;
-                                }
                        } else {
                                new = NULL;
                        }
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index e6a1932c7110..35869a4921f1 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -713,7 +713,8 @@ nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p,
        if (args->npages != 0)
                xdr_encode_pages(buf, args->pages, 0, args->len);
        else
-                req->rq_slen += args->len;
+                req->rq_slen = xdr_adjust_iovec(req->rq_svec,
+                                p + XDR_QUADLEN(args->len));
        err = nfsacl_encode(buf, base, args->inode,
                            (args->mask & NFS_ACL) ?
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 3444c0052a87..5275097a7565 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -229,21 +229,23 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
                goto out;
        status = vfs_readdir(filp, nfsd4_build_namelist, &names);
        fput(filp);
+        mutex_lock(&dir->d_inode->i_mutex);
        while (!list_empty(&names)) {
                entry = list_entry(names.next, struct name_list, list);
                dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
                if (IS_ERR(dentry)) {
                        status = PTR_ERR(dentry);
-                        goto out;
+                        break;
                }
                status = f(dir, dentry);
                dput(dentry);
                if (status)
-                        goto out;
+                        break;
                list_del(&entry->list);
                kfree(entry);
        }
+        mutex_unlock(&dir->d_inode->i_mutex);
 out:
        while (!list_empty(&names)) {
                entry = list_entry(names.next, struct name_list, list);
@@ -255,36 +257,6 @@ out:
 }
 static int
-nfsd4_remove_clid_file(struct dentry *dir, struct dentry *dentry)
-{
-        int status;
-        if (!S_ISREG(dir->d_inode->i_mode)) {
-                printk("nfsd4: non-file found in client recovery directory\n");
-                return -EINVAL;
-        }
-        mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
-        status = vfs_unlink(dir->d_inode, dentry);
-        mutex_unlock(&dir->d_inode->i_mutex);
-        return status;
-}
-static int
-nfsd4_clear_clid_dir(struct dentry *dir, struct dentry *dentry)
-{
-        int status;
-        /* For now this directory should already be empty, but we empty it of
-         * any regular files anyway, just in case the directory was created by
-         * a kernel from the future.... */
-        nfsd4_list_rec_dir(dentry, nfsd4_remove_clid_file);
-        mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
-        status = vfs_rmdir(dir->d_inode, dentry);
-        mutex_unlock(&dir->d_inode->i_mutex);
-        return status;
-}
-static int
 nfsd4_unlink_clid_dir(char *name, int namlen)
 {
        struct dentry *dentry;
@@ -294,18 +266,18 @@ nfsd4_unlink_clid_dir(char *name, int namlen)
        mutex_lock(&rec_dir.dentry->d_inode->i_mutex);
        dentry = lookup_one_len(name, rec_dir.dentry, namlen);
-        mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
        if (IS_ERR(dentry)) {
                status = PTR_ERR(dentry);
-                return status;
+                goto out_unlock;
        }
        status = -ENOENT;
        if (!dentry->d_inode)
                goto out;
+        status = vfs_rmdir(rec_dir.dentry->d_inode, dentry);
-        status = nfsd4_clear_clid_dir(rec_dir.dentry, dentry);
 out:
        dput(dentry);
+out_unlock:
+        mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
        return status;
 }
@@ -348,7 +320,7 @@ purge_old(struct dentry *parent, struct dentry *child)
        if (nfs4_has_reclaimed_state(child->d_name.name, false))
                return 0;
-        status = nfsd4_clear_clid_dir(parent, child);
+        status = vfs_rmdir(parent->d_inode, child);
        if (status)
                printk("failed to remove client recovery directory %s\n",
                                child->d_name.name);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index ab93fcfef254..6c68ffd6b4bb 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -116,10 +116,15 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
        }
        if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) {
                /* successfully crossed mount point */
-                exp_put(exp);
+                /*
-                *expp = exp2;
+                 * This is subtle: dentry is *not* under mnt at this point.
+                 * The only reason we are safe is that original mnt is pinned
+                 * down by exp, so we should dput before putting exp.
+                 */
                dput(dentry);
                *dpp = mounts;
+                exp_put(exp);
+                *expp = exp2;
        } else {
                exp_put(exp2);
                dput(mounts);
@@ -1885,8 +1890,8 @@ static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen,
        return 0;
 }
-static int nfsd_buffered_readdir(struct file *file, filldir_t func,
+static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func,
-                                 struct readdir_cd *cdp, loff_t *offsetp)
+                                    struct readdir_cd *cdp, loff_t *offsetp)
 {
        struct readdir_data buf;
        struct buffered_dirent *de;
@@ -1896,11 +1901,12 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func,
        buf.dirent = (void *)__get_free_page(GFP_KERNEL);
        if (!buf.dirent)
-                return -ENOMEM;
+                return nfserrno(-ENOMEM);
        offset = *offsetp;
        while (1) {
+                struct inode *dir_inode = file->f_path.dentry->d_inode;
                unsigned int reclen;
                cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1919,26 +1925,38 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func,
                if (!size)
                        break;
+                /*
+                 * Various filldir functions may end up calling back into
+                 * lookup_one_len() and the file system's ->lookup() method.
+                 * These expect i_mutex to be held, as it would within readdir.
+                 */
+                host_err = mutex_lock_killable(&dir_inode->i_mutex);
+                if (host_err)
+                        break;
                de = (struct buffered_dirent *)buf.dirent;
                while (size > 0) {
                        offset = de->offset;
                        if (func(cdp, de->name, de->namlen, de->offset,
                                 de->ino, de->d_type))
-                                goto done;
+                                break;
                        if (cdp->err != nfs_ok)
-                                goto done;
+                                break;
                        reclen = ALIGN(sizeof(*de) + de->namlen,
                                       sizeof(u64));
                        size -= reclen;
                        de = (struct buffered_dirent *)((char *)de + reclen);
                }
+                mutex_unlock(&dir_inode->i_mutex);
+                if (size > 0) /* We bailed out early */
+                        break;
                offset = vfs_llseek(file, 0, SEEK_CUR);
        }
- done:
        free_page((unsigned long)(buf.dirent));
        if (host_err)
diff --git a/fs/stat.c b/fs/stat.c
index 2db740a0cfb5..075694e31d8b 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -55,59 +55,54 @@ int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 EXPORT_SYMBOL(vfs_getattr);
-int vfs_stat_fd(int dfd, char __user *name, struct kstat *stat)
+int vfs_fstat(unsigned int fd, struct kstat *stat)
 {
-        struct path path;
+        struct file *f = fget(fd);
-        int error;
+        int error = -EBADF;
-        error = user_path_at(dfd, name, LOOKUP_FOLLOW, &path);
+        if (f) {
-        if (!error) {
+                error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat);
-                error = vfs_getattr(path.mnt, path.dentry, stat);
+                fput(f);
-                path_put(&path);
        }
        return error;
 }
+EXPORT_SYMBOL(vfs_fstat);
-int vfs_stat(char __user *name, struct kstat *stat)
+int vfs_fstatat(int dfd, char __user *filename, struct kstat *stat, int flag)
 {
-        return vfs_stat_fd(AT_FDCWD, name, stat);
+        struct path path;
-}
+        int error = -EINVAL;
+        int lookup_flags = 0;
-EXPORT_SYMBOL(vfs_stat);
+        if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
+                goto out;
-int vfs_lstat_fd(int dfd, char __user *name, struct kstat *stat)
+        if (!(flag & AT_SYMLINK_NOFOLLOW))
-{
+                lookup_flags |= LOOKUP_FOLLOW;
-        struct path path;
-        int error;
-        error = user_path_at(dfd, name, 0, &path);
+        error = user_path_at(dfd, filename, lookup_flags, &path);
-        if (!error) {
+        if (error)
-                error = vfs_getattr(path.mnt, path.dentry, stat);
+                goto out;
-                path_put(&path);
-        }
+        error = vfs_getattr(path.mnt, path.dentry, stat);
+        path_put(&path);
+out:
        return error;
 }
+EXPORT_SYMBOL(vfs_fstatat);
-int vfs_lstat(char __user *name, struct kstat *stat)
+int vfs_stat(char __user *name, struct kstat *stat)
 {
-        return vfs_lstat_fd(AT_FDCWD, name, stat);
+        return vfs_fstatat(AT_FDCWD, name, stat, 0);
 }
+EXPORT_SYMBOL(vfs_stat);
-EXPORT_SYMBOL(vfs_lstat);
+int vfs_lstat(char __user *name, struct kstat *stat)
-int vfs_fstat(unsigned int fd, struct kstat *stat)
 {
-        struct file *f = fget(fd);
+        return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW);
-        int error = -EBADF;
-        if (f) {
-                error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat);
-                fput(f);
-        }
-        return error;
 }
+EXPORT_SYMBOL(vfs_lstat);
-EXPORT_SYMBOL(vfs_fstat);
 #ifdef __ARCH_WANT_OLD_STAT
@@ -155,23 +150,25 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta
 SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
 {
        struct kstat stat;
-        int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
+        int error;
-        if (!error)
+        error = vfs_stat(filename, &stat);
-                error = cp_old_stat(&stat, statbuf);
+        if (error)
+                return error;
-        return error;
+        return cp_old_stat(&stat, statbuf);
 }
 SYSCALL_DEFINE2(lstat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
 {
        struct kstat stat;
-        int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
+        int error;
-        if (!error)
+        error = vfs_lstat(filename, &stat);
-                error = cp_old_stat(&stat, statbuf);
+        if (error)
+                return error;
-        return error;
+        return cp_old_stat(&stat, statbuf);
 }
 SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf)
@@ -240,23 +237,23 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
 SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf)
 {
        struct kstat stat;
-        int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
+        int error = vfs_stat(filename, &stat);
-        if (!error)
-                error = cp_new_stat(&stat, statbuf);
-        return error;
+        if (error)
+                return error;
+        return cp_new_stat(&stat, statbuf);
 }
 SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf)
 {
        struct kstat stat;
-        int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
+        int error;
-        if (!error)
+        error = vfs_lstat(filename, &stat);
-                error = cp_new_stat(&stat, statbuf);
+        if (error)
+                return error;
-        return error;
+        return cp_new_stat(&stat, statbuf);
 }
 #if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT)
@@ -264,21 +261,12 @@ SYSCALL_DEFINE4(newfstatat, int, dfd, char __user *, filename,
                struct stat __user *, statbuf, int, flag)
 {
        struct kstat stat;
-        int error = -EINVAL;
+        int error;
-        if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
-                goto out;
-        if (flag & AT_SYMLINK_NOFOLLOW)
-                error = vfs_lstat_fd(dfd, filename, &stat);
-        else
-                error = vfs_stat_fd(dfd, filename, &stat);
-        if (!error)
-                error = cp_new_stat(&stat, statbuf);
-out:
+        error = vfs_fstatat(dfd, filename, &stat, flag);
-        return error;
+        if (error)
+                return error;
+        return cp_new_stat(&stat, statbuf);
 }
 #endif
@@ -404,21 +392,12 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename,
                struct stat64 __user *, statbuf, int, flag)
 {
        struct kstat stat;
-        int error = -EINVAL;
+        int error;
-        if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
-                goto out;
-        if (flag & AT_SYMLINK_NOFOLLOW)
-                error = vfs_lstat_fd(dfd, filename, &stat);
-        else
-                error = vfs_stat_fd(dfd, filename, &stat);
-        if (!error)
-                error = cp_new_stat64(&stat, statbuf);
-out:
+        error = vfs_fstatat(dfd, filename, &stat, flag);
-        return error;
+        if (error)
+                return error;
+        return cp_new_stat64(&stat, statbuf);
 }
 #endif /* __ARCH_WANT_STAT64 */
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 93e0c0281d45..9345806c8853 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -157,14 +157,9 @@ static ssize_t write(struct file *file, const char __user *userbuf,
                        count = size - offs;
        }
-        temp = kmalloc(count, GFP_KERNEL);
+        temp = memdup_user(userbuf, count);
-        if (!temp)
+        if (IS_ERR(temp))
-                return -ENOMEM;
+                return PTR_ERR(temp);
-        if (copy_from_user(temp, userbuf, count)) {
-                count = -EFAULT;
-                goto out_free;
-        }
        mutex_lock(&bb->mutex);
@@ -176,8 +171,6 @@ static ssize_t write(struct file *file, const char __user *userbuf,
        if (count > 0)
                *off = offs + count;
-out_free:
-        kfree(temp);
        return count;
 }
diff --git a/fs/xattr.c b/fs/xattr.c
index 197c4fcac032..d51b8f9db921 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -237,13 +237,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
        if (size) {
                if (size > XATTR_SIZE_MAX)
                        return -E2BIG;
-                kvalue = kmalloc(size, GFP_KERNEL);
+                kvalue = memdup_user(value, size);
-                if (!kvalue)
+                if (IS_ERR(kvalue))
-                        return -ENOMEM;
+                        return PTR_ERR(kvalue);
-                if (copy_from_user(kvalue, value, size)) {
-                        kfree(kvalue);
-                        return -EFAULT;
-                }
        }
        error = vfs_setxattr(d, kname, kvalue, size, flags);
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index d0b499418a7d..34eaab608e6e 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -489,17 +489,12 @@ xfs_attrmulti_attr_set(
        if (len > XATTR_SIZE_MAX)
                return EINVAL;
-        kbuf = kmalloc(len, GFP_KERNEL);
+        kbuf = memdup_user(ubuf, len);
-        if (!kbuf)
+        if (IS_ERR(kbuf))
-                return ENOMEM;
+                return PTR_ERR(kbuf);
-        if (copy_from_user(kbuf, ubuf, len))
-                goto out_kfree;
        error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
- out_kfree:
-        kfree(kbuf);
        return error;
 }
@@ -540,20 +535,16 @@ xfs_attrmulti_by_handle(
        if (!size || size > 16 * PAGE_SIZE)
                goto out_dput;
-        error = ENOMEM;
+        ops = memdup_user(am_hreq.ops, size);
-        ops = kmalloc(size, GFP_KERNEL);
+        if (IS_ERR(ops)) {
-        if (!ops)
+                error = PTR_ERR(ops);
                goto out_dput;
+        }
-        error = EFAULT;
-        if (copy_from_user(ops, am_hreq.ops, size))
-                goto out_kfree_ops;
        attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
        if (!attr_name)
                goto out_kfree_ops;
        error = 0;
        for (i = 0; i < am_hreq.opcount; i++) {
                ops[i].am_error = strncpy_from_user(attr_name,
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index c70c4e3db790..0882d166239a 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -427,20 +427,16 @@ xfs_compat_attrmulti_by_handle(
        if (!size || size > 16 * PAGE_SIZE)
                goto out_dput;
-        error = ENOMEM;
+        ops = memdup_user(compat_ptr(am_hreq.ops), size);
-        ops = kmalloc(size, GFP_KERNEL);
+        if (IS_ERR(ops)) {
-        if (!ops)
+                error = PTR_ERR(ops);
                goto out_dput;
+        }
-        error = EFAULT;
-        if (copy_from_user(ops, compat_ptr(am_hreq.ops), size))
-                goto out_kfree_ops;
        attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
        if (!attr_name)
                goto out_kfree_ops;
        error = 0;
        for (i = 0; i < am_hreq.opcount; i++) {
                ops[i].am_error = strncpy_from_user(attr_name,