Diffstat (limited to 'fs')
52 files changed, 984 insertions, 754 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 7ec14097fef1..cb97174e2366 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -64,6 +64,8 @@ struct btrfs_worker_thread {
 	int idle;
 };
 
+static int __btrfs_start_workers(struct btrfs_workers *workers);
+
 /*
  * btrfs_start_workers uses kthread_run, which can block waiting for memory
  * for a very long time. It will actually throttle on page writeback,
@@ -88,27 +90,10 @@ static void start_new_worker_func(struct btrfs_work *work)
 {
 	struct worker_start *start;
 	start = container_of(work, struct worker_start, work);
-	btrfs_start_workers(start->queue, 1);
+	__btrfs_start_workers(start->queue);
 	kfree(start);
 }
 
-static int start_new_worker(struct btrfs_workers *queue)
-{
-	struct worker_start *start;
-	int ret;
-
-	start = kzalloc(sizeof(*start), GFP_NOFS);
-	if (!start)
-		return -ENOMEM;
-
-	start->work.func = start_new_worker_func;
-	start->queue = queue;
-	ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
-	if (ret)
-		kfree(start);
-	return ret;
-}
-
 /*
  * helper function to move a thread onto the idle list after it
  * has finished some requests.
@@ -153,12 +138,20 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
 static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
 {
 	struct btrfs_workers *workers = worker->workers;
+	struct worker_start *start;
 	unsigned long flags;
 
 	rmb();
 	if (!workers->atomic_start_pending)
 		return;
 
+	start = kzalloc(sizeof(*start), GFP_NOFS);
+	if (!start)
+		return;
+
+	start->work.func = start_new_worker_func;
+	start->queue = workers;
+
 	spin_lock_irqsave(&workers->lock, flags);
 	if (!workers->atomic_start_pending)
 		goto out;
@@ -170,10 +163,11 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
 
 	workers->num_workers_starting += 1;
 	spin_unlock_irqrestore(&workers->lock, flags);
-	start_new_worker(workers);
+	btrfs_queue_worker(workers->atomic_worker_start, &start->work);
 	return;
 
 out:
+	kfree(start);
 	spin_unlock_irqrestore(&workers->lock, flags);
 }
 
@@ -331,7 +325,7 @@ again:
 			run_ordered_completions(worker->workers, work);
 
 			check_pending_worker_creates(worker);
-
+			cond_resched();
 		}
 
 		spin_lock_irq(&worker->lock);
@@ -462,56 +456,55 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
  * starts new worker threads. This does not enforce the max worker
  * count in case you need to temporarily go past it.
  */
-static int __btrfs_start_workers(struct btrfs_workers *workers,
-				 int num_workers)
+static int __btrfs_start_workers(struct btrfs_workers *workers)
 {
 	struct btrfs_worker_thread *worker;
 	int ret = 0;
-	int i;
 
-	for (i = 0; i < num_workers; i++) {
-		worker = kzalloc(sizeof(*worker), GFP_NOFS);
-		if (!worker) {
-			ret = -ENOMEM;
-			goto fail;
-		}
+	worker = kzalloc(sizeof(*worker), GFP_NOFS);
+	if (!worker) {
+		ret = -ENOMEM;
+		goto fail;
+	}
 
-		INIT_LIST_HEAD(&worker->pending);
-		INIT_LIST_HEAD(&worker->prio_pending);
-		INIT_LIST_HEAD(&worker->worker_list);
-		spin_lock_init(&worker->lock);
+	INIT_LIST_HEAD(&worker->pending);
+	INIT_LIST_HEAD(&worker->prio_pending);
+	INIT_LIST_HEAD(&worker->worker_list);
+	spin_lock_init(&worker->lock);
 
-		atomic_set(&worker->num_pending, 0);
-		atomic_set(&worker->refs, 1);
-		worker->workers = workers;
-		worker->task = kthread_run(worker_loop, worker,
-					   "btrfs-%s-%d", workers->name,
-					   workers->num_workers + i);
-		if (IS_ERR(worker->task)) {
-			ret = PTR_ERR(worker->task);
-			kfree(worker);
-			goto fail;
-		}
-		spin_lock_irq(&workers->lock);
-		list_add_tail(&worker->worker_list, &workers->idle_list);
-		worker->idle = 1;
-		workers->num_workers++;
-		workers->num_workers_starting--;
-		WARN_ON(workers->num_workers_starting < 0);
-		spin_unlock_irq(&workers->lock);
+	atomic_set(&worker->num_pending, 0);
+	atomic_set(&worker->refs, 1);
+	worker->workers = workers;
+	worker->task = kthread_run(worker_loop, worker,
+				   "btrfs-%s-%d", workers->name,
+				   workers->num_workers + 1);
+	if (IS_ERR(worker->task)) {
+		ret = PTR_ERR(worker->task);
+		kfree(worker);
+		goto fail;
 	}
+	spin_lock_irq(&workers->lock);
+	list_add_tail(&worker->worker_list, &workers->idle_list);
+	worker->idle = 1;
+	workers->num_workers++;
+	workers->num_workers_starting--;
+	WARN_ON(workers->num_workers_starting < 0);
+	spin_unlock_irq(&workers->lock);
+
 	return 0;
 fail:
-	btrfs_stop_workers(workers);
+	spin_lock_irq(&workers->lock);
+	workers->num_workers_starting--;
+	spin_unlock_irq(&workers->lock);
 	return ret;
 }
 
-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+int btrfs_start_workers(struct btrfs_workers *workers)
 {
 	spin_lock_irq(&workers->lock);
-	workers->num_workers_starting += num_workers;
+	workers->num_workers_starting++;
 	spin_unlock_irq(&workers->lock);
-	return __btrfs_start_workers(workers, num_workers);
+	return __btrfs_start_workers(workers);
 }
 
 /*
@@ -568,6 +561,7 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
 	struct btrfs_worker_thread *worker;
 	unsigned long flags;
 	struct list_head *fallback;
+	int ret;
 
 again:
 	spin_lock_irqsave(&workers->lock, flags);
@@ -584,7 +578,9 @@ again:
 			workers->num_workers_starting++;
 			spin_unlock_irqrestore(&workers->lock, flags);
 			/* we're below the limit, start another worker */
-			__btrfs_start_workers(workers, 1);
+			ret = __btrfs_start_workers(workers);
+			if (ret)
+				goto fallback;
 			goto again;
 		}
 	}
@@ -665,7 +661,7 @@ void btrfs_set_work_high_prio(struct btrfs_work *work)
 /*
  * places a struct btrfs_work into the pending queue of one of the kthreads
  */
-int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
+void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 {
 	struct btrfs_worker_thread *worker;
 	unsigned long flags;
@@ -673,7 +669,7 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 
 	/* don't requeue something already on a list */
 	if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
-		goto out;
+		return;
 
 	worker = find_worker(workers);
 	if (workers->ordered) {
@@ -712,7 +708,4 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 	if (wake)
 		wake_up_process(worker->task);
 	spin_unlock_irqrestore(&worker->lock, flags);
-
-out:
-	return 0;
 }
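The shape of the fix above: the `worker_start` allocation moves in front of the locked section, the bail-out label now owns freeing it, and every start path can report failure. A minimal userspace sketch of that allocate-first, decide-under-the-lock pattern, with a pthread mutex standing in for the kernel spinlock; every identifier in it is made up for illustration, none are btrfs symbols:

```c
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct work { int payload; };

struct pool {
	pthread_mutex_t lock;
	int start_pending;          /* stands in for atomic_start_pending */
};

static void queue_work(struct work *w)
{
	printf("queued work %d\n", w->payload);
	free(w);
}

/*
 * Allocate before taking the lock, decide under the lock, and make
 * the bail-out path own the allocation.
 */
static void check_pending(struct pool *p)
{
	struct work *w = calloc(1, sizeof(*w));  /* never allocate under the lock */
	if (!w)
		return;                          /* pending flag stays set; retry later */
	w->payload = 42;

	pthread_mutex_lock(&p->lock);
	if (!p->start_pending) {
		pthread_mutex_unlock(&p->lock);
		free(w);                         /* nothing to do: free and leave */
		return;
	}
	p->start_pending = 0;
	pthread_mutex_unlock(&p->lock);

	queue_work(w);                           /* hand off outside the lock */
}

int main(void)
{
	struct pool p = { PTHREAD_MUTEX_INITIALIZER, 1 };
	check_pending(&p);
	return 0;
}
```

If the allocation fails the function simply returns with `start_pending` still set, so a later pass can retry, which appears to be the same recovery the patched `check_pending_worker_creates()` relies on.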
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 5077746cf85e..f34cc31fa3c9 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -109,8 +109,8 @@ struct btrfs_workers {
 	char *name;
 };
 
-int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
+void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
+int btrfs_start_workers(struct btrfs_workers *workers);
 int btrfs_stop_workers(struct btrfs_workers *workers);
 void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
 			struct btrfs_workers *async_starter);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 50634abef9b4..67385033323d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2692,7 +2692,8 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_evict_inode(struct inode *inode);
 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
-void btrfs_dirty_inode(struct inode *inode, int flags);
+int btrfs_dirty_inode(struct inode *inode);
+int btrfs_update_time(struct file *file);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
 int btrfs_drop_inode(struct inode *inode);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 5b163572e0ca..9c1eccc2c503 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -640,8 +640,8 @@ static int btrfs_delayed_inode_reserve_metadata(
 	 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
 	 * we're accounted for.
 	 */
-	if (!trans->bytes_reserved &&
-	    src_rsv != &root->fs_info->delalloc_block_rsv) {
+	if (!src_rsv || (!trans->bytes_reserved &&
+	    src_rsv != &root->fs_info->delalloc_block_rsv)) {
 		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
 		/*
 		 * Since we're under a transaction reserve_metadata_bytes could
 		 * try to commit the transaction which will make it return
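The one-line change above guards the whole reservation decision with `!src_rsv ||`, so a transaction with no source reserve always falls into the fresh no-flush reservation instead of reaching the migration logic. A tiny sketch of the short-circuit shape, under invented names (`rsv`, `add_noflush` and friends are stand-ins, not the real btrfs types):

```c
#include <stdio.h>

struct rsv { long bytes; };

static int add_noflush(struct rsv *dst, long n)
{
	dst->bytes += n;
	return 0;
}

static int reserve_metadata(struct rsv *src, struct rsv *dst,
			    struct rsv *delalloc, long bytes, long reserved)
{
	/* The || short-circuits, so a NULL src never reaches the
	 * migration path below. */
	if (!src || (!reserved && src != delalloc))
		return add_noflush(dst, bytes);

	/* otherwise migrate the bytes from the source reserve */
	src->bytes -= bytes;
	dst->bytes += bytes;
	return 0;
}

int main(void)
{
	struct rsv dst = { 0 }, delalloc = { 0 };

	reserve_metadata(NULL, &dst, &delalloc, 100, 0);
	printf("dst now holds %ld bytes\n", dst.bytes);
	return 0;
}
```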
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 632f8f3cc9db..f44b3928dc2d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2194,19 +2194,27 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->endio_meta_write_workers.idle_thresh = 2;
 	fs_info->readahead_workers.idle_thresh = 2;
 
-	btrfs_start_workers(&fs_info->workers, 1);
-	btrfs_start_workers(&fs_info->generic_worker, 1);
-	btrfs_start_workers(&fs_info->submit_workers, 1);
-	btrfs_start_workers(&fs_info->delalloc_workers, 1);
-	btrfs_start_workers(&fs_info->fixup_workers, 1);
-	btrfs_start_workers(&fs_info->endio_workers, 1);
-	btrfs_start_workers(&fs_info->endio_meta_workers, 1);
-	btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
-	btrfs_start_workers(&fs_info->endio_write_workers, 1);
-	btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
-	btrfs_start_workers(&fs_info->delayed_workers, 1);
-	btrfs_start_workers(&fs_info->caching_workers, 1);
-	btrfs_start_workers(&fs_info->readahead_workers, 1);
+	/*
+	 * btrfs_start_workers can really only fail because of ENOMEM so just
+	 * return -ENOMEM if any of these fail.
+	 */
+	ret = btrfs_start_workers(&fs_info->workers);
+	ret |= btrfs_start_workers(&fs_info->generic_worker);
+	ret |= btrfs_start_workers(&fs_info->submit_workers);
+	ret |= btrfs_start_workers(&fs_info->delalloc_workers);
+	ret |= btrfs_start_workers(&fs_info->fixup_workers);
+	ret |= btrfs_start_workers(&fs_info->endio_workers);
+	ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
+	ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
+	ret |= btrfs_start_workers(&fs_info->endio_write_workers);
+	ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
+	ret |= btrfs_start_workers(&fs_info->delayed_workers);
+	ret |= btrfs_start_workers(&fs_info->caching_workers);
+	ret |= btrfs_start_workers(&fs_info->readahead_workers);
+	if (ret) {
+		ret = -ENOMEM;
+		goto fail_sb_buffer;
+	}
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
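A note on the `ret |= ...` idiom used above: each `btrfs_start_workers()` call returns 0 or a negative errno, so OR-ing the results only tells you whether something failed; the bit pattern of two different errnos OR-ed together is meaningless, which is presumably why the hunk collapses any nonzero result to a single `-ENOMEM`. A compilable miniature of the idiom (names invented):

```c
#include <errno.h>
#include <stdio.h>

/* Each starter returns 0 on success or a negative errno on failure. */
static int start_one(int fail)
{
	return fail ? -ENOMEM : 0;
}

int main(void)
{
	int ret;

	ret = start_one(0);
	ret |= start_one(1);    /* any failure makes ret nonzero */
	ret |= start_one(0);

	if (ret)                /* ret itself may be a bit-soup of errnos... */
		ret = -ENOMEM;  /* ...so normalize before returning it */

	printf("ret = %d\n", ret);
	return 0;
}
```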
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f0d5718d2587..f5fbe576d2ba 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2822,7 +2822,7 @@ out_free:
 	btrfs_release_path(path);
 out:
 	spin_lock(&block_group->lock);
-	if (!ret)
+	if (!ret && dcs == BTRFS_DC_SETUP)
 		block_group->cache_generation = trans->transid;
 	block_group->disk_cache_state = dcs;
 	spin_unlock(&block_group->lock);
@@ -4204,12 +4204,17 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
 	u64 to_reserve = 0;
+	u64 csum_bytes;
 	unsigned nr_extents = 0;
+	int extra_reserve = 0;
 	int flush = 1;
 	int ret;
 
+	/* Need to be holding the i_mutex here if we aren't free space cache */
 	if (btrfs_is_free_space_inode(root, inode))
 		flush = 0;
+	else
+		WARN_ON(!mutex_is_locked(&inode->i_mutex));
 
 	if (flush && btrfs_transaction_in_commit(root->fs_info))
 		schedule_timeout(1);
@@ -4220,11 +4225,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 		BTRFS_I(inode)->outstanding_extents++;
 
 	if (BTRFS_I(inode)->outstanding_extents >
-	    BTRFS_I(inode)->reserved_extents) {
+	    BTRFS_I(inode)->reserved_extents)
 		nr_extents = BTRFS_I(inode)->outstanding_extents -
 			BTRFS_I(inode)->reserved_extents;
-		BTRFS_I(inode)->reserved_extents += nr_extents;
-	}
 
 	/*
 	 * Add an item to reserve for updating the inode when we complete the
@@ -4232,11 +4235,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	 */
 	if (!BTRFS_I(inode)->delalloc_meta_reserved) {
 		nr_extents++;
-		BTRFS_I(inode)->delalloc_meta_reserved = 1;
+		extra_reserve = 1;
 	}
 
 	to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
 	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
+	csum_bytes = BTRFS_I(inode)->csum_bytes;
 	spin_unlock(&BTRFS_I(inode)->lock);
 
 	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
@@ -4246,22 +4250,35 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 
 		spin_lock(&BTRFS_I(inode)->lock);
 		dropped = drop_outstanding_extent(inode);
-		to_free = calc_csum_metadata_size(inode, num_bytes, 0);
-		spin_unlock(&BTRFS_I(inode)->lock);
-		to_free += btrfs_calc_trans_metadata_size(root, dropped);
-
 		/*
-		 * Somebody could have come in and twiddled with the
-		 * reservation, so if we have to free more than we would have
-		 * reserved from this reservation go ahead and release those
-		 * bytes.
+		 * If the inodes csum_bytes is the same as the original
+		 * csum_bytes then we know we haven't raced with any free()ers
+		 * so we can just reduce our inodes csum bytes and carry on.
+		 * Otherwise we have to do the normal free thing to account for
+		 * the case that the free side didn't free up its reserve
+		 * because of this outstanding reservation.
 		 */
-		to_free -= to_reserve;
+		if (BTRFS_I(inode)->csum_bytes == csum_bytes)
+			calc_csum_metadata_size(inode, num_bytes, 0);
+		else
+			to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+		spin_unlock(&BTRFS_I(inode)->lock);
+		if (dropped)
+			to_free += btrfs_calc_trans_metadata_size(root, dropped);
+
 		if (to_free)
 			btrfs_block_rsv_release(root, block_rsv, to_free);
 		return ret;
 	}
 
+	spin_lock(&BTRFS_I(inode)->lock);
+	if (extra_reserve) {
+		BTRFS_I(inode)->delalloc_meta_reserved = 1;
+		nr_extents--;
+	}
+	BTRFS_I(inode)->reserved_extents += nr_extents;
+	spin_unlock(&BTRFS_I(inode)->lock);
+
 	block_rsv_add_bytes(block_rsv, to_reserve, 1);
 
 	return 0;
@@ -5107,11 +5124,11 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_root *root = orig_root->fs_info->extent_root;
 	struct btrfs_free_cluster *last_ptr = NULL;
 	struct btrfs_block_group_cache *block_group = NULL;
+	struct btrfs_block_group_cache *used_block_group;
 	int empty_cluster = 2 * 1024 * 1024;
 	int allowed_chunk_alloc = 0;
 	int done_chunk_alloc = 0;
 	struct btrfs_space_info *space_info;
-	int last_ptr_loop = 0;
 	int loop = 0;
 	int index = 0;
 	int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
@@ -5173,6 +5190,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 ideal_cache:
 		block_group = btrfs_lookup_block_group(root->fs_info,
 						       search_start);
+		used_block_group = block_group;
 		/*
 		 * we don't want to use the block group if it doesn't match our
 		 * allocation bits, or if its not cached.
@@ -5210,6 +5228,7 @@ search:
 		u64 offset;
 		int cached;
 
+		used_block_group = block_group;
 		btrfs_get_block_group(block_group);
 		search_start = block_group->key.objectid;
 
@@ -5286,71 +5305,62 @@ alloc:
 		spin_unlock(&block_group->free_space_ctl->tree_lock);
 
 		/*
-		 * Ok we want to try and use the cluster allocator, so lets look
-		 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
-		 * have tried the cluster allocator plenty of times at this
-		 * point and not have found anything, so we are likely way too
-		 * fragmented for the clustering stuff to find anything, so lets
-		 * just skip it and let the allocator find whatever block it can
-		 * find
+		 * Ok we want to try and use the cluster allocator, so
+		 * lets look there
 		 */
-		if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
+		if (last_ptr) {
 			/*
 			 * the refill lock keeps out other
 			 * people trying to start a new cluster
 			 */
 			spin_lock(&last_ptr->refill_lock);
-			if (!last_ptr->block_group ||
-			    last_ptr->block_group->ro ||
-			    !block_group_bits(last_ptr->block_group, data))
+			used_block_group = last_ptr->block_group;
+			if (used_block_group != block_group &&
+			    (!used_block_group ||
+			     used_block_group->ro ||
+			     !block_group_bits(used_block_group, data))) {
+				used_block_group = block_group;
 				goto refill_cluster;
+			}
+
+			if (used_block_group != block_group)
+				btrfs_get_block_group(used_block_group);
 
-			offset = btrfs_alloc_from_cluster(block_group, last_ptr,
-						 num_bytes, search_start);
+			offset = btrfs_alloc_from_cluster(used_block_group,
+			  last_ptr, num_bytes, used_block_group->key.objectid);
 			if (offset) {
 				/* we have a block, we're done */
 				spin_unlock(&last_ptr->refill_lock);
 				goto checks;
 			}
 
-			spin_lock(&last_ptr->lock);
-			/*
-			 * whoops, this cluster doesn't actually point to
-			 * this block group.  Get a ref on the block
-			 * group is does point to and try again
-			 */
-			if (!last_ptr_loop && last_ptr->block_group &&
-			    last_ptr->block_group != block_group &&
-			    index <=
-				 get_block_group_index(last_ptr->block_group)) {
-
-				btrfs_put_block_group(block_group);
-				block_group = last_ptr->block_group;
-				btrfs_get_block_group(block_group);
-				spin_unlock(&last_ptr->lock);
-				spin_unlock(&last_ptr->refill_lock);
-
-				last_ptr_loop = 1;
-				search_start = block_group->key.objectid;
-				/*
-				 * we know this block group is properly
-				 * in the list because
-				 * btrfs_remove_block_group, drops the
-				 * cluster before it removes the block
-				 * group from the list
-				 */
-				goto have_block_group;
+			WARN_ON(last_ptr->block_group != used_block_group);
+			if (used_block_group != block_group) {
+				btrfs_put_block_group(used_block_group);
+				used_block_group = block_group;
 			}
-			spin_unlock(&last_ptr->lock);
 refill_cluster:
+			BUG_ON(used_block_group != block_group);
+			/* If we are on LOOP_NO_EMPTY_SIZE, we can't
+			 * set up a new clusters, so lets just skip it
+			 * and let the allocator find whatever block
+			 * it can find. If we reach this point, we
+			 * will have tried the cluster allocator
+			 * plenty of times and not have found
+			 * anything, so we are likely way too
+			 * fragmented for the clustering stuff to find
+			 * anything. */
+			if (loop >= LOOP_NO_EMPTY_SIZE) {
+				spin_unlock(&last_ptr->refill_lock);
+				goto unclustered_alloc;
+			}
+
 			/*
 			 * this cluster didn't work out, free it and
 			 * start over
 			 */
 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
 
-			last_ptr_loop = 0;
-
 			/* allocate a cluster in this block group */
 			ret = btrfs_find_space_cluster(trans, root,
 					       block_group, last_ptr,
@@ -5390,6 +5400,7 @@ refill_cluster:
 				goto loop;
 			}
 
+unclustered_alloc:
 		offset = btrfs_find_space_for_alloc(block_group, search_start,
 						    num_bytes, empty_size);
 		/*
@@ -5416,14 +5427,14 @@ checks:
 		search_start = stripe_align(root, offset);
 		/* move on to the next group */
 		if (search_start + num_bytes >= search_end) {
-			btrfs_add_free_space(block_group, offset, num_bytes);
+			btrfs_add_free_space(used_block_group, offset, num_bytes);
 			goto loop;
 		}
 
 		/* move on to the next group */
 		if (search_start + num_bytes >
-		    block_group->key.objectid + block_group->key.offset) {
-			btrfs_add_free_space(block_group, offset, num_bytes);
+		    used_block_group->key.objectid + used_block_group->key.offset) {
+			btrfs_add_free_space(used_block_group, offset, num_bytes);
 			goto loop;
 		}
 
@@ -5431,14 +5442,14 @@ checks:
 		ins->offset = num_bytes;
 
 		if (offset < search_start)
-			btrfs_add_free_space(block_group, offset,
+			btrfs_add_free_space(used_block_group, offset,
 					     search_start - offset);
 		BUG_ON(offset > search_start);
 
-		ret = btrfs_update_reserved_bytes(block_group, num_bytes,
+		ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
 						  alloc_type);
 		if (ret == -EAGAIN) {
-			btrfs_add_free_space(block_group, offset, num_bytes);
+			btrfs_add_free_space(used_block_group, offset, num_bytes);
 			goto loop;
 		}
 
@@ -5447,15 +5458,19 @@ checks:
 		ins->offset = num_bytes;
 
 		if (offset < search_start)
-			btrfs_add_free_space(block_group, offset,
+			btrfs_add_free_space(used_block_group, offset,
 					     search_start - offset);
 		BUG_ON(offset > search_start);
+		if (used_block_group != block_group)
+			btrfs_put_block_group(used_block_group);
 		btrfs_put_block_group(block_group);
 		break;
 loop:
 		failed_cluster_refill = false;
 		failed_alloc = false;
 		BUG_ON(index != get_block_group_index(block_group));
+		if (used_block_group != block_group)
+			btrfs_put_block_group(used_block_group);
 		btrfs_put_block_group(block_group);
 	}
 	up_read(&space_info->groups_sem);
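The `used_block_group` changes above follow one discipline: whenever the allocator ends up carving space out of a block group other than the one the search loop currently pins, that other group gets its own reference, and every exit path (`checks`, `loop`, the final break) drops it only when the two differ. A small refcounting sketch of that borrow pattern; `struct group`, `get_group()` and `put_group()` are stand-ins, not kernel API:

```c
#include <stdio.h>

struct group { int refs; };

static void get_group(struct group *g) { g->refs++; }
static void put_group(struct group *g) { g->refs--; }

/*
 * If we allocate from a group other than the one the loop iterator
 * holds, pin it with its own reference and drop that reference on
 * every exit path.
 */
static void alloc_from(struct group *block_group, struct group *cluster_group)
{
	struct group *used = block_group;

	if (cluster_group && cluster_group != block_group) {
		used = cluster_group;
		get_group(used);         /* extra ref only for the borrowed group */
	}

	/* ... try the allocation against "used" ... */

	if (used != block_group)
		put_group(used);         /* pairs with the get above */

	printf("refs: block=%d cluster=%d\n",
	       block_group->refs, cluster_group ? cluster_group->refs : 0);
}

int main(void)
{
	struct group a = { .refs = 1 }, b = { .refs = 1 };
	alloc_from(&a, &b);
	return 0;
}
```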
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index be1bf627a14b..49f3c9dc09f4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -935,8 +935,10 @@ again:
 	node = tree_search(tree, start);
 	if (!node) {
 		prealloc = alloc_extent_state_atomic(prealloc);
-		if (!prealloc)
-			return -ENOMEM;
+		if (!prealloc) {
+			err = -ENOMEM;
+			goto out;
+		}
 		err = insert_state(tree, prealloc, start, end, &bits);
 		prealloc = NULL;
 		BUG_ON(err == -EEXIST);
@@ -992,8 +994,10 @@ hit_next:
 	 */
 	if (state->start < start) {
 		prealloc = alloc_extent_state_atomic(prealloc);
-		if (!prealloc)
-			return -ENOMEM;
+		if (!prealloc) {
+			err = -ENOMEM;
+			goto out;
+		}
 		err = split_state(tree, state, prealloc, start);
 		BUG_ON(err == -EEXIST);
 		prealloc = NULL;
@@ -1024,8 +1028,10 @@ hit_next:
 		this_end = last_start - 1;
 
 		prealloc = alloc_extent_state_atomic(prealloc);
-		if (!prealloc)
-			return -ENOMEM;
+		if (!prealloc) {
+			err = -ENOMEM;
+			goto out;
+		}
 
 		/*
 		 * Avoid to free 'prealloc' if it can be merged with
@@ -1051,8 +1057,10 @@ hit_next:
 	 */
 	if (state->start <= end && state->end > end) {
 		prealloc = alloc_extent_state_atomic(prealloc);
-		if (!prealloc)
-			return -ENOMEM;
+		if (!prealloc) {
+			err = -ENOMEM;
+			goto out;
+		}
 
 		err = split_state(tree, state, prealloc, end + 1);
 		BUG_ON(err == -EEXIST);
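All four hunks above make the same substitution: an early `return -ENOMEM` becomes `err = -ENOMEM; goto out;`, so the function's shared exit path still runs, which in these functions is what drops the tree lock and releases an unused preallocation. A self-contained sketch of the single-exit cleanup idiom, assuming a cleanup path of that shape (the names are hypothetical):

```c
#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

static int convert_bits(void)
{
	int err = 0;
	void *prealloc = NULL;

	pthread_mutex_lock(&tree_lock);

	prealloc = malloc(64);
	if (!prealloc) {
		err = -1;
		goto out;       /* not "return": the lock must be dropped */
	}

	/* ... work that may consume prealloc and set it to NULL ... */

out:
	pthread_mutex_unlock(&tree_lock);
	free(prealloc);         /* free(NULL) is a harmless no-op */
	return err;
}

int main(void)
{
	return convert_bits() ? 1 : 0;
}
```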
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index dafdfa059bf6..97fbe939c050 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1167,6 +1167,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 	nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
 		     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
 		     (sizeof(struct page *)));
+	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
+	nrptrs = max(nrptrs, 8);
 	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
 	if (!pages)
 		return -ENOMEM;
@@ -1387,7 +1389,11 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 		goto out;
 	}
 
-	file_update_time(file);
+	err = btrfs_update_time(file);
+	if (err) {
+		mutex_unlock(&inode->i_mutex);
+		goto out;
+	}
 	BTRFS_I(inode)->sequence++;
 
 	start_pos = round_down(pos, root->sectorsize);
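The new `nrptrs` clamp above bounds each buffered-write batch by roughly how many more pages the task may dirty before writeback throttling kicks in (`nr_dirtied_pause - nr_dirtied`), while keeping a floor of 8 pages so a tiny budget still makes forward progress. The arithmetic in miniature, as a userspace toy:

```c
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

/*
 * Cap the batch at the remaining dirty-page budget, but never go
 * below a floor of 8 pages per iteration.
 */
static int clamp_batch(int want, int dirtied_pause, int dirtied)
{
	int n = want;

	n = MIN(n, dirtied_pause - dirtied);
	n = MAX(n, 8);
	return n;
}

int main(void)
{
	printf("%d\n", clamp_batch(256, 32, 30));  /* budget nearly spent -> 8 */
	printf("%d\n", clamp_batch(256, 512, 16)); /* plenty of budget -> 256 */
	return 0;
}
```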
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2c984f7d4c2a..0a6b928813a4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -38,6 +38,7 @@
 #include <linux/falloc.h>
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
+#include <linux/mount.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -2031,7 +2032,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 	/* insert an orphan item to track this unlinked/truncated file */
 	if (insert >= 1) {
 		ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
-		BUG_ON(ret);
+		BUG_ON(ret && ret != -EEXIST);
 	}
 
 	/* insert an orphan item to track subvolume contains orphan files */
@@ -2158,6 +2159,38 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 		if (ret && ret != -ESTALE)
 			goto out;
 
+		if (ret == -ESTALE && root == root->fs_info->tree_root) {
+			struct btrfs_root *dead_root;
+			struct btrfs_fs_info *fs_info = root->fs_info;
+			int is_dead_root = 0;
+
+			/*
+			 * this is an orphan in the tree root. Currently these
+			 * could come from 2 sources:
+			 *  a) a snapshot deletion in progress
+			 *  b) a free space cache inode
+			 * We need to distinguish those two, as the snapshot
+			 * orphan must not get deleted.
+			 * find_dead_roots already ran before us, so if this
+			 * is a snapshot deletion, we should find the root
+			 * in the dead_roots list
+			 */
+			spin_lock(&fs_info->trans_lock);
+			list_for_each_entry(dead_root, &fs_info->dead_roots,
+					    root_list) {
+				if (dead_root->root_key.objectid ==
+				    found_key.objectid) {
+					is_dead_root = 1;
+					break;
+				}
+			}
+			spin_unlock(&fs_info->trans_lock);
+			if (is_dead_root) {
+				/* prevent this orphan from being found again */
+				key.offset = found_key.objectid - 1;
+				continue;
+			}
+		}
 		/*
 		 * Inode is already gone but the orphan item is still there,
 		 * kill the orphan item.
@@ -2191,7 +2224,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 				continue;
 			}
 			nr_truncate++;
+			/*
+			 * Need to hold the imutex for reservation purposes, not
+			 * a huge deal here but I have a WARN_ON in
+			 * btrfs_delalloc_reserve_space to catch offenders.
+			 */
+			mutex_lock(&inode->i_mutex);
 			ret = btrfs_truncate(inode);
+			mutex_unlock(&inode->i_mutex);
 		} else {
 			nr_unlink++;
 		}
@@ -3327,7 +3367,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 			u64 hint_byte = 0;
 			hole_size = last_byte - cur_offset;
 
-			trans = btrfs_start_transaction(root, 2);
+			trans = btrfs_start_transaction(root, 3);
 			if (IS_ERR(trans)) {
 				err = PTR_ERR(trans);
 				break;
@@ -3337,6 +3377,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 					cur_offset + hole_size,
 					&hint_byte, 1);
 			if (err) {
+				btrfs_update_inode(trans, root, inode);
 				btrfs_end_transaction(trans, root);
 				break;
 			}
@@ -3346,6 +3387,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 					0, hole_size, 0, hole_size,
 					0, 0, 0);
 			if (err) {
+				btrfs_update_inode(trans, root, inode);
 				btrfs_end_transaction(trans, root);
 				break;
 			}
@@ -3353,6 +3395,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 			btrfs_drop_extent_cache(inode, hole_start,
 					last_byte - 1, 0);
 
+			btrfs_update_inode(trans, root, inode);
 			btrfs_end_transaction(trans, root);
 		}
 		free_extent_map(em);
@@ -3370,6 +3413,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 
 static int btrfs_setsize(struct inode *inode, loff_t newsize)
 {
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
 	loff_t oldsize = i_size_read(inode);
 	int ret;
 
@@ -3377,16 +3422,19 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
 		return 0;
 
 	if (newsize > oldsize) {
-		i_size_write(inode, newsize);
-		btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
 		truncate_pagecache(inode, oldsize, newsize);
 		ret = btrfs_cont_expand(inode, oldsize, newsize);
-		if (ret) {
-			btrfs_setsize(inode, oldsize);
+		if (ret)
 			return ret;
-		}
 
-		mark_inode_dirty(inode);
+		trans = btrfs_start_transaction(root, 1);
+		if (IS_ERR(trans))
+			return PTR_ERR(trans);
+
+		i_size_write(inode, newsize);
+		btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
+		ret = btrfs_update_inode(trans, root, inode);
+		btrfs_end_transaction_throttle(trans, root);
 	} else {
 
 		/*
@@ -3426,9 +3474,9 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 
 	if (attr->ia_valid) {
 		setattr_copy(inode, attr);
-		mark_inode_dirty(inode);
+		err = btrfs_dirty_inode(inode);
 
-		if (attr->ia_valid & ATTR_MODE)
+		if (!err && attr->ia_valid & ATTR_MODE)
 			err = btrfs_acl_chmod(inode);
 	}
 
@@ -4204,42 +4252,80 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
  * FIXME, needs more benchmarking...there are no reasons other than performance
  * to keep or drop this code.
  */
-void btrfs_dirty_inode(struct inode *inode, int flags)
+int btrfs_dirty_inode(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	int ret;
 
 	if (BTRFS_I(inode)->dummy_inode)
-		return;
+		return 0;
 
 	trans = btrfs_join_transaction(root);
-	BUG_ON(IS_ERR(trans));
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
 	ret = btrfs_update_inode(trans, root, inode);
 	if (ret && ret == -ENOSPC) {
 		/* whoops, lets try again with the full transaction */
 		btrfs_end_transaction(trans, root);
 		trans = btrfs_start_transaction(root, 1);
-		if (IS_ERR(trans)) {
-			printk_ratelimited(KERN_ERR "btrfs: fail to "
-				       "dirty inode %llu error %ld\n",
-				       (unsigned long long)btrfs_ino(inode),
-				       PTR_ERR(trans));
-			return;
-		}
+		if (IS_ERR(trans))
+			return PTR_ERR(trans);
 
 		ret = btrfs_update_inode(trans, root, inode);
-		if (ret) {
-			printk_ratelimited(KERN_ERR "btrfs: fail to "
-				       "dirty inode %llu error %d\n",
-				       (unsigned long long)btrfs_ino(inode),
-				       ret);
-		}
 	}
 	btrfs_end_transaction(trans, root);
 	if (BTRFS_I(inode)->delayed_node)
 		btrfs_balance_delayed_items(root);
+
+	return ret;
+}
+
+/*
+ * This is a copy of file_update_time. We need this so we can return error on
+ * ENOSPC for updating the inode in the case of file write and mmap writes.
+ */
+int btrfs_update_time(struct file *file)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct timespec now;
+	int ret;
+	enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
+
+	/* First try to exhaust all avenues to not sync */
+	if (IS_NOCMTIME(inode))
+		return 0;
+
+	now = current_fs_time(inode->i_sb);
+	if (!timespec_equal(&inode->i_mtime, &now))
+		sync_it = S_MTIME;
+
+	if (!timespec_equal(&inode->i_ctime, &now))
+		sync_it |= S_CTIME;
+
+	if (IS_I_VERSION(inode))
+		sync_it |= S_VERSION;
+
+	if (!sync_it)
+		return 0;
+
+	/* Finally allowed to write? Takes lock. */
+	if (mnt_want_write_file(file))
+		return 0;
+
+	/* Only change inode inside the lock region */
+	if (sync_it & S_VERSION)
+		inode_inc_iversion(inode);
+	if (sync_it & S_CTIME)
+		inode->i_ctime = now;
+	if (sync_it & S_MTIME)
+		inode->i_mtime = now;
+	ret = btrfs_dirty_inode(inode);
+	if (!ret)
+		mark_inode_dirty_sync(inode);
+	mnt_drop_write(file->f_path.mnt);
+	return ret;
 }
 
 /*
@@ -4555,11 +4641,18 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
+	/*
+	 * If the active LSM wants to access the inode during
+	 * d_instantiate it needs these. Smack checks to see
+	 * if the filesystem supports xattrs by looking at the
+	 * ops vector.
+	 */
+
+	inode->i_op = &btrfs_special_inode_operations;
 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
 	if (err)
 		drop_inode = 1;
 	else {
-		inode->i_op = &btrfs_special_inode_operations;
 		init_special_inode(inode, inode->i_mode, rdev);
 		btrfs_update_inode(trans, root, inode);
 	}
@@ -4613,14 +4706,21 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
+	/*
+	 * If the active LSM wants to access the inode during
+	 * d_instantiate it needs these. Smack checks to see
+	 * if the filesystem supports xattrs by looking at the
+	 * ops vector.
+	 */
+	inode->i_fop = &btrfs_file_operations;
+	inode->i_op = &btrfs_file_inode_operations;
+
 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
 	if (err)
 		drop_inode = 1;
 	else {
 		inode->i_mapping->a_ops = &btrfs_aops;
 		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
-		inode->i_fop = &btrfs_file_operations;
-		inode->i_op = &btrfs_file_inode_operations;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 out_unlock:
@@ -6303,7 +6403,12 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	u64 page_start;
 	u64 page_end;
 
+	/* Need this to keep space reservations serialized */
+	mutex_lock(&inode->i_mutex);
 	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+	mutex_unlock(&inode->i_mutex);
+	if (!ret)
+		ret = btrfs_update_time(vma->vm_file);
 	if (ret) {
 		if (ret == -ENOMEM)
 			ret = VM_FAULT_OOM;
@@ -6515,8 +6620,9 @@ static int btrfs_truncate(struct inode *inode)
 			/* Just need the 1 for updating the inode */
 			trans = btrfs_start_transaction(root, 1);
 			if (IS_ERR(trans)) {
-				err = PTR_ERR(trans);
-				goto out;
+				ret = err = PTR_ERR(trans);
+				trans = NULL;
+				break;
 			}
 		}
 
@@ -7076,14 +7182,21 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
+	/*
+	 * If the active LSM wants to access the inode during
+	 * d_instantiate it needs these. Smack checks to see
+	 * if the filesystem supports xattrs by looking at the
+	 * ops vector.
+	 */
+	inode->i_fop = &btrfs_file_operations;
+	inode->i_op = &btrfs_file_inode_operations;
+
 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
 	if (err)
 		drop_inode = 1;
 	else {
 		inode->i_mapping->a_ops = &btrfs_aops;
 		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
-		inode->i_fop = &btrfs_file_operations;
-		inode->i_op = &btrfs_file_inode_operations;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 	if (drop_inode)
@@ -7353,6 +7466,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
 	.getattr	= btrfs_getattr,
+	.setattr	= btrfs_setattr,
 	.permission	= btrfs_permission,
 	.setxattr	= btrfs_setxattr,
 	.getxattr	= btrfs_getxattr,
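`btrfs_update_time()` above is, as its comment says, a copy of the VFS `file_update_time()` whose one behavioral difference is that the dirtying step can fail and the error is propagated, so a file write or mmap fault can see `-ENOSPC` instead of having it swallowed. A toy userspace model of that control flow; `toy_inode`, `update_time` and the helper names are all invented here:

```c
#include <stdio.h>
#include <string.h>
#include <time.h>

/* Toy inode with only the fields the time update cares about. */
struct toy_inode {
	struct timespec mtime, ctime;
	unsigned long version;
};

enum { S_MTIME = 1, S_CTIME = 2 };

static int ts_equal(const struct timespec *a, const struct timespec *b)
{
	return a->tv_sec == b->tv_sec && a->tv_nsec == b->tv_nsec;
}

/*
 * First work out the cheapest possible answer (nothing to sync), only
 * then mutate the inode, and let the dirtying callback report failure
 * to the caller instead of swallowing it.
 */
static int update_time(struct toy_inode *inode,
		       int (*dirty)(struct toy_inode *))
{
	struct timespec now;
	int sync_it = 0;

	clock_gettime(CLOCK_REALTIME, &now);
	if (!ts_equal(&inode->mtime, &now))
		sync_it |= S_MTIME;
	if (!ts_equal(&inode->ctime, &now))
		sync_it |= S_CTIME;
	if (!sync_it)
		return 0;                 /* exhausted all avenues to not sync */

	if (sync_it & S_CTIME)
		inode->ctime = now;
	if (sync_it & S_MTIME)
		inode->mtime = now;
	inode->version++;
	return dirty(inode);              /* may fail; propagate the error */
}

static int dirty_ok(struct toy_inode *i) { (void)i; return 0; }

int main(void)
{
	struct toy_inode ino;

	memset(&ino, 0, sizeof(ino));
	printf("update_time -> %d\n", update_time(&ino, dirty_ok));
	return 0;
}
```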
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 72d461656f60..c04f02c7d5bb 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -252,11 +252,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 	trans = btrfs_join_transaction(root);
 	BUG_ON(IS_ERR(trans));
 
+	btrfs_update_iflags(inode);
+	inode->i_ctime = CURRENT_TIME;
 	ret = btrfs_update_inode(trans, root, inode);
 	BUG_ON(ret);
 
-	btrfs_update_iflags(inode);
-	inode->i_ctime = CURRENT_TIME;
 	btrfs_end_transaction(trans, root);
 
 	mnt_drop_write(file->f_path.mnt);
@@ -858,8 +858,10 @@ static int cluster_pages_for_defrag(struct inode *inode,
 		return 0;
 	file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
 
+	mutex_lock(&inode->i_mutex);
 	ret = btrfs_delalloc_reserve_space(inode,
 					   num_pages << PAGE_CACHE_SHIFT);
+	mutex_unlock(&inode->i_mutex);
 	if (ret)
 		return ret;
 again:
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index dff29d5e151a..cfb55434a469 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c | |||
| @@ -2947,7 +2947,9 @@ static int relocate_file_extent_cluster(struct inode *inode, | |||
| 2947 | index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; | 2947 | index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; |
| 2948 | last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; | 2948 | last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; |
| 2949 | while (index <= last_index) { | 2949 | while (index <= last_index) { |
| 2950 | mutex_lock(&inode->i_mutex); | ||
| 2950 | ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); | 2951 | ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); |
| 2952 | mutex_unlock(&inode->i_mutex); | ||
| 2951 | if (ret) | 2953 | if (ret) |
| 2952 | goto out; | 2954 | goto out; |
| 2953 | 2955 | ||
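Editor's note: this is the third call site (after page_mkwrite and the defrag clustering path) that now brackets the delalloc reservation with i_mutex; per the new comment in the page_mkwrite hunk, the mutex keeps per-inode space reservations serialized so concurrent reservers cannot race on the same accounting. A minimal sketch of the common pattern, shown with the page_mkwrite variant of the 3.2-era API:

        mutex_lock(&inode->i_mutex);    /* serialize reservations */
        ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
        mutex_unlock(&inode->i_mutex);  /* drop before touching pages */
        if (ret)
                goto out;               /* nothing reserved, nothing to free */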
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index c27bcb67f330..ddf2c90d3fc0 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c | |||
| @@ -1535,18 +1535,22 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) | |||
| 1535 | static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) | 1535 | static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) |
| 1536 | { | 1536 | { |
| 1537 | struct btrfs_fs_info *fs_info = root->fs_info; | 1537 | struct btrfs_fs_info *fs_info = root->fs_info; |
| 1538 | int ret = 0; | ||
| 1538 | 1539 | ||
| 1539 | mutex_lock(&fs_info->scrub_lock); | 1540 | mutex_lock(&fs_info->scrub_lock); |
| 1540 | if (fs_info->scrub_workers_refcnt == 0) { | 1541 | if (fs_info->scrub_workers_refcnt == 0) { |
| 1541 | btrfs_init_workers(&fs_info->scrub_workers, "scrub", | 1542 | btrfs_init_workers(&fs_info->scrub_workers, "scrub", |
| 1542 | fs_info->thread_pool_size, &fs_info->generic_worker); | 1543 | fs_info->thread_pool_size, &fs_info->generic_worker); |
| 1543 | fs_info->scrub_workers.idle_thresh = 4; | 1544 | fs_info->scrub_workers.idle_thresh = 4; |
| 1544 | btrfs_start_workers(&fs_info->scrub_workers, 1); | 1545 | ret = btrfs_start_workers(&fs_info->scrub_workers); |
| 1546 | if (ret) | ||
| 1547 | goto out; | ||
| 1545 | } | 1548 | } |
| 1546 | ++fs_info->scrub_workers_refcnt; | 1549 | ++fs_info->scrub_workers_refcnt; |
| 1550 | out: | ||
| 1547 | mutex_unlock(&fs_info->scrub_lock); | 1551 | mutex_unlock(&fs_info->scrub_lock); |
| 1548 | 1552 | ||
| 1549 | return 0; | 1553 | return ret; |
| 1550 | } | 1554 | } |
| 1551 | 1555 | ||
| 1552 | static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) | 1556 | static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) |
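Editor's note: btrfs_start_workers() now takes only the pool and can fail, and scrub_workers_get() propagates that failure instead of returning 0 unconditionally; note that the refcount is bumped only after the pool actually started. A minimal caller-side sketch of the resulting contract:

        ret = scrub_workers_get(root);
        if (ret)
                return ret;     /* no workers started, refcnt untouched */
        /* ... run the scrub ... */
        scrub_workers_put(root);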
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e28ad4baf483..200f63bc6675 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c | |||
| @@ -41,6 +41,7 @@ | |||
| 41 | #include <linux/slab.h> | 41 | #include <linux/slab.h> |
| 42 | #include <linux/cleancache.h> | 42 | #include <linux/cleancache.h> |
| 43 | #include <linux/mnt_namespace.h> | 43 | #include <linux/mnt_namespace.h> |
| 44 | #include <linux/ratelimit.h> | ||
| 44 | #include "compat.h" | 45 | #include "compat.h" |
| 45 | #include "delayed-inode.h" | 46 | #include "delayed-inode.h" |
| 46 | #include "ctree.h" | 47 | #include "ctree.h" |
| @@ -1053,7 +1054,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) | |||
| 1053 | u64 avail_space; | 1054 | u64 avail_space; |
| 1054 | u64 used_space; | 1055 | u64 used_space; |
| 1055 | u64 min_stripe_size; | 1056 | u64 min_stripe_size; |
| 1056 | int min_stripes = 1; | 1057 | int min_stripes = 1, num_stripes = 1; |
| 1057 | int i = 0, nr_devices; | 1058 | int i = 0, nr_devices; |
| 1058 | int ret; | 1059 | int ret; |
| 1059 | 1060 | ||
| @@ -1067,12 +1068,16 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) | |||
| 1067 | 1068 | ||
| 1068 | /* calc min stripe number for data space allocation */ | 1069 | /* calc min stripe number for data space allocation */ |
| 1069 | type = btrfs_get_alloc_profile(root, 1); | 1070 | type = btrfs_get_alloc_profile(root, 1); |
| 1070 | if (type & BTRFS_BLOCK_GROUP_RAID0) | 1071 | if (type & BTRFS_BLOCK_GROUP_RAID0) { |
| 1071 | min_stripes = 2; | 1072 | min_stripes = 2; |
| 1072 | else if (type & BTRFS_BLOCK_GROUP_RAID1) | 1073 | num_stripes = nr_devices; |
| 1074 | } else if (type & BTRFS_BLOCK_GROUP_RAID1) { | ||
| 1073 | min_stripes = 2; | 1075 | min_stripes = 2; |
| 1074 | else if (type & BTRFS_BLOCK_GROUP_RAID10) | 1076 | num_stripes = 2; |
| 1077 | } else if (type & BTRFS_BLOCK_GROUP_RAID10) { | ||
| 1075 | min_stripes = 4; | 1078 | min_stripes = 4; |
| 1079 | num_stripes = 4; | ||
| 1080 | } | ||
| 1076 | 1081 | ||
| 1077 | if (type & BTRFS_BLOCK_GROUP_DUP) | 1082 | if (type & BTRFS_BLOCK_GROUP_DUP) |
| 1078 | min_stripe_size = 2 * BTRFS_STRIPE_LEN; | 1083 | min_stripe_size = 2 * BTRFS_STRIPE_LEN; |
| @@ -1141,13 +1146,16 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) | |||
| 1141 | i = nr_devices - 1; | 1146 | i = nr_devices - 1; |
| 1142 | avail_space = 0; | 1147 | avail_space = 0; |
| 1143 | while (nr_devices >= min_stripes) { | 1148 | while (nr_devices >= min_stripes) { |
| 1149 | if (num_stripes > nr_devices) | ||
| 1150 | num_stripes = nr_devices; | ||
| 1151 | |||
| 1144 | if (devices_info[i].max_avail >= min_stripe_size) { | 1152 | if (devices_info[i].max_avail >= min_stripe_size) { |
| 1145 | int j; | 1153 | int j; |
| 1146 | u64 alloc_size; | 1154 | u64 alloc_size; |
| 1147 | 1155 | ||
| 1148 | avail_space += devices_info[i].max_avail * min_stripes; | 1156 | avail_space += devices_info[i].max_avail * num_stripes; |
| 1149 | alloc_size = devices_info[i].max_avail; | 1157 | alloc_size = devices_info[i].max_avail; |
| 1150 | for (j = i + 1 - min_stripes; j <= i; j++) | 1158 | for (j = i + 1 - num_stripes; j <= i; j++) |
| 1151 | devices_info[j].max_avail -= alloc_size; | 1159 | devices_info[j].max_avail -= alloc_size; |
| 1152 | } | 1160 | } |
| 1153 | i--; | 1161 | i--; |
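Editor's note on why num_stripes matters: the old code multiplied each device's free space by min_stripes even for RAID0, where data really stripes across every device with room. A toy userspace model of the loop (hypothetical sizes; devices sorted descending by free space, as the estimator expects) shows the effect for RAID0 over devices with 10, 10, 10 and 4 GiB free: the old math (times min_stripes = 2) estimates 28 GiB, while striping over num_stripes devices yields the true 34 GiB:

        #include <stdio.h>
        #include <stdint.h>

        int main(void)
        {
                uint64_t max_avail[] = { 10, 10, 10, 4 };  /* GiB, descending */
                int nr_devices = 4;
                int min_stripes = 2;            /* RAID0 */
                int num_stripes = nr_devices;   /* fixed: use all devices */
                uint64_t avail_space = 0;
                int i = nr_devices - 1;

                while (nr_devices >= min_stripes) {
                        if (num_stripes > nr_devices)
                                num_stripes = nr_devices;
                        if (max_avail[i] > 0) {
                                uint64_t alloc_size = max_avail[i];
                                int j;

                                avail_space += alloc_size * num_stripes;
                                for (j = i + 1 - num_stripes; j <= i; j++)
                                        max_avail[j] -= alloc_size;
                        }
                        i--;
                        nr_devices--;
                }
                printf("estimated: %llu GiB\n",
                       (unsigned long long)avail_space);   /* prints 34 */
                return 0;
        }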
| @@ -1264,6 +1272,16 @@ static int btrfs_unfreeze(struct super_block *sb) | |||
| 1264 | return 0; | 1272 | return 0; |
| 1265 | } | 1273 | } |
| 1266 | 1274 | ||
| 1275 | static void btrfs_fs_dirty_inode(struct inode *inode, int flags) | ||
| 1276 | { | ||
| 1277 | int ret; | ||
| 1278 | |||
| 1279 | ret = btrfs_dirty_inode(inode); | ||
| 1280 | if (ret) | ||
| 1281 | printk_ratelimited(KERN_ERR "btrfs: failed to dirty inode %Lu " | ||
| 1282 | "error %d\n", btrfs_ino(inode), ret); | ||
| 1283 | } | ||
| 1284 | |||
| 1267 | static const struct super_operations btrfs_super_ops = { | 1285 | static const struct super_operations btrfs_super_ops = { |
| 1268 | .drop_inode = btrfs_drop_inode, | 1286 | .drop_inode = btrfs_drop_inode, |
| 1269 | .evict_inode = btrfs_evict_inode, | 1287 | .evict_inode = btrfs_evict_inode, |
| @@ -1271,7 +1289,7 @@ static const struct super_operations btrfs_super_ops = { | |||
| 1271 | .sync_fs = btrfs_sync_fs, | 1289 | .sync_fs = btrfs_sync_fs, |
| 1272 | .show_options = btrfs_show_options, | 1290 | .show_options = btrfs_show_options, |
| 1273 | .write_inode = btrfs_write_inode, | 1291 | .write_inode = btrfs_write_inode, |
| 1274 | .dirty_inode = btrfs_dirty_inode, | 1292 | .dirty_inode = btrfs_fs_dirty_inode, |
| 1275 | .alloc_inode = btrfs_alloc_inode, | 1293 | .alloc_inode = btrfs_alloc_inode, |
| 1276 | .destroy_inode = btrfs_destroy_inode, | 1294 | .destroy_inode = btrfs_destroy_inode, |
| 1277 | .statfs = btrfs_statfs, | 1295 | .statfs = btrfs_statfs, |
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c37433d3cd82..f4b839fd3c9d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c | |||
| @@ -295,6 +295,12 @@ loop_lock: | |||
| 295 | btrfs_requeue_work(&device->work); | 295 | btrfs_requeue_work(&device->work); |
| 296 | goto done; | 296 | goto done; |
| 297 | } | 297 | } |
| 298 | /* unplug every 64 requests just for good measure */ | ||
| 299 | if (batch_run % 64 == 0) { | ||
| 300 | blk_finish_plug(&plug); | ||
| 301 | blk_start_plug(&plug); | ||
| 302 | sync_pending = 0; | ||
| 303 | } | ||
| 298 | } | 304 | } |
| 299 | 305 | ||
| 300 | cond_resched(); | 306 | cond_resched(); |
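Editor's note: blk_finish_plug() flushes whatever the plug has batched to the device and blk_start_plug() opens a fresh one, so cycling the pair every 64 requests bounds how long submitted bios can sit batched during a long worker run. A minimal sketch of the pattern (the loop scaffolding here is hypothetical):

        struct blk_plug plug;
        int batch_run = 0;

        blk_start_plug(&plug);
        while (have_pending_bios()) {           /* hypothetical helper */
                submit_next_bio();              /* hypothetical helper */
                if (++batch_run % 64 == 0) {
                        blk_finish_plug(&plug); /* push the batch out */
                        blk_start_plug(&plug);  /* resume batching */
                }
        }
        blk_finish_plug(&plug);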
| @@ -1611,7 +1617,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) | |||
| 1611 | if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) | 1617 | if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) |
| 1612 | return -EINVAL; | 1618 | return -EINVAL; |
| 1613 | 1619 | ||
| 1614 | bdev = blkdev_get_by_path(device_path, FMODE_EXCL, | 1620 | bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, |
| 1615 | root->fs_info->bdev_holder); | 1621 | root->fs_info->bdev_holder); |
| 1616 | if (IS_ERR(bdev)) | 1622 | if (IS_ERR(bdev)) |
| 1617 | return PTR_ERR(bdev); | 1623 | return PTR_ERR(bdev); |
| @@ -3258,7 +3264,7 @@ static void btrfs_end_bio(struct bio *bio, int err) | |||
| 3258 | */ | 3264 | */ |
| 3259 | if (atomic_read(&bbio->error) > bbio->max_errors) { | 3265 | if (atomic_read(&bbio->error) > bbio->max_errors) { |
| 3260 | err = -EIO; | 3266 | err = -EIO; |
| 3261 | } else if (err) { | 3267 | } else { |
| 3262 | /* | 3268 | /* |
| 3263 | * this bio is actually up to date, we didn't | 3269 | * this bio is actually up to date, we didn't |
| 3264 | * go over the max number of errors | 3270 | * go over the max number of errors |
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 4144caf2f9d3..173b1d22e59b 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c | |||
| @@ -87,7 +87,7 @@ static int ceph_set_page_dirty(struct page *page) | |||
| 87 | snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); | 87 | snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); |
| 88 | 88 | ||
| 89 | /* dirty the head */ | 89 | /* dirty the head */ |
| 90 | spin_lock(&inode->i_lock); | 90 | spin_lock(&ci->i_ceph_lock); |
| 91 | if (ci->i_head_snapc == NULL) | 91 | if (ci->i_head_snapc == NULL) |
| 92 | ci->i_head_snapc = ceph_get_snap_context(snapc); | 92 | ci->i_head_snapc = ceph_get_snap_context(snapc); |
| 93 | ++ci->i_wrbuffer_ref_head; | 93 | ++ci->i_wrbuffer_ref_head; |
| @@ -100,7 +100,7 @@ static int ceph_set_page_dirty(struct page *page) | |||
| 100 | ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1, | 100 | ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1, |
| 101 | ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, | 101 | ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, |
| 102 | snapc, snapc->seq, snapc->num_snaps); | 102 | snapc, snapc->seq, snapc->num_snaps); |
| 103 | spin_unlock(&inode->i_lock); | 103 | spin_unlock(&ci->i_ceph_lock); |
| 104 | 104 | ||
| 105 | /* now adjust page */ | 105 | /* now adjust page */ |
| 106 | spin_lock_irq(&mapping->tree_lock); | 106 | spin_lock_irq(&mapping->tree_lock); |
| @@ -391,7 +391,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode, | |||
| 391 | struct ceph_snap_context *snapc = NULL; | 391 | struct ceph_snap_context *snapc = NULL; |
| 392 | struct ceph_cap_snap *capsnap = NULL; | 392 | struct ceph_cap_snap *capsnap = NULL; |
| 393 | 393 | ||
| 394 | spin_lock(&inode->i_lock); | 394 | spin_lock(&ci->i_ceph_lock); |
| 395 | list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { | 395 | list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { |
| 396 | dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, | 396 | dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, |
| 397 | capsnap->context, capsnap->dirty_pages); | 397 | capsnap->context, capsnap->dirty_pages); |
| @@ -407,7 +407,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode, | |||
| 407 | dout(" head snapc %p has %d dirty pages\n", | 407 | dout(" head snapc %p has %d dirty pages\n", |
| 408 | snapc, ci->i_wrbuffer_ref_head); | 408 | snapc, ci->i_wrbuffer_ref_head); |
| 409 | } | 409 | } |
| 410 | spin_unlock(&inode->i_lock); | 410 | spin_unlock(&ci->i_ceph_lock); |
| 411 | return snapc; | 411 | return snapc; |
| 412 | } | 412 | } |
| 413 | 413 | ||
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 0f327c6c9679..8b53193e4f7c 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c | |||
| @@ -309,7 +309,7 @@ void ceph_reservation_status(struct ceph_fs_client *fsc, | |||
| 309 | /* | 309 | /* |
| 310 | * Find ceph_cap for given mds, if any. | 310 | * Find ceph_cap for given mds, if any. |
| 311 | * | 311 | * |
| 312 | * Called with i_lock held. | 312 | * Called with i_ceph_lock held. |
| 313 | */ | 313 | */ |
| 314 | static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) | 314 | static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) |
| 315 | { | 315 | { |
| @@ -332,9 +332,9 @@ struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) | |||
| 332 | { | 332 | { |
| 333 | struct ceph_cap *cap; | 333 | struct ceph_cap *cap; |
| 334 | 334 | ||
| 335 | spin_lock(&ci->vfs_inode.i_lock); | 335 | spin_lock(&ci->i_ceph_lock); |
| 336 | cap = __get_cap_for_mds(ci, mds); | 336 | cap = __get_cap_for_mds(ci, mds); |
| 337 | spin_unlock(&ci->vfs_inode.i_lock); | 337 | spin_unlock(&ci->i_ceph_lock); |
| 338 | return cap; | 338 | return cap; |
| 339 | } | 339 | } |
| 340 | 340 | ||
| @@ -361,15 +361,16 @@ static int __ceph_get_cap_mds(struct ceph_inode_info *ci) | |||
| 361 | 361 | ||
| 362 | int ceph_get_cap_mds(struct inode *inode) | 362 | int ceph_get_cap_mds(struct inode *inode) |
| 363 | { | 363 | { |
| 364 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
| 364 | int mds; | 365 | int mds; |
| 365 | spin_lock(&inode->i_lock); | 366 | spin_lock(&ci->i_ceph_lock); |
| 366 | mds = __ceph_get_cap_mds(ceph_inode(inode)); | 367 | mds = __ceph_get_cap_mds(ceph_inode(inode)); |
| 367 | spin_unlock(&inode->i_lock); | 368 | spin_unlock(&ci->i_ceph_lock); |
| 368 | return mds; | 369 | return mds; |
| 369 | } | 370 | } |
| 370 | 371 | ||
| 371 | /* | 372 | /* |
| 372 | * Called under i_lock. | 373 | * Called under i_ceph_lock. |
| 373 | */ | 374 | */ |
| 374 | static void __insert_cap_node(struct ceph_inode_info *ci, | 375 | static void __insert_cap_node(struct ceph_inode_info *ci, |
| 375 | struct ceph_cap *new) | 376 | struct ceph_cap *new) |
| @@ -415,7 +416,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc, | |||
| 415 | * | 416 | * |
| 416 | * If I_FLUSH is set, leave the inode at the front of the list. | 417 | * If I_FLUSH is set, leave the inode at the front of the list. |
| 417 | * | 418 | * |
| 418 | * Caller holds i_lock | 419 | * Caller holds i_ceph_lock |
| 419 | * -> we take mdsc->cap_delay_lock | 420 | * -> we take mdsc->cap_delay_lock |
| 420 | */ | 421 | */ |
| 421 | static void __cap_delay_requeue(struct ceph_mds_client *mdsc, | 422 | static void __cap_delay_requeue(struct ceph_mds_client *mdsc, |
| @@ -457,7 +458,7 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, | |||
| 457 | /* | 458 | /* |
| 458 | * Cancel delayed work on cap. | 459 | * Cancel delayed work on cap. |
| 459 | * | 460 | * |
| 460 | * Caller must hold i_lock. | 461 | * Caller must hold i_ceph_lock. |
| 461 | */ | 462 | */ |
| 462 | static void __cap_delay_cancel(struct ceph_mds_client *mdsc, | 463 | static void __cap_delay_cancel(struct ceph_mds_client *mdsc, |
| 463 | struct ceph_inode_info *ci) | 464 | struct ceph_inode_info *ci) |
| @@ -532,14 +533,14 @@ int ceph_add_cap(struct inode *inode, | |||
| 532 | wanted |= ceph_caps_for_mode(fmode); | 533 | wanted |= ceph_caps_for_mode(fmode); |
| 533 | 534 | ||
| 534 | retry: | 535 | retry: |
| 535 | spin_lock(&inode->i_lock); | 536 | spin_lock(&ci->i_ceph_lock); |
| 536 | cap = __get_cap_for_mds(ci, mds); | 537 | cap = __get_cap_for_mds(ci, mds); |
| 537 | if (!cap) { | 538 | if (!cap) { |
| 538 | if (new_cap) { | 539 | if (new_cap) { |
| 539 | cap = new_cap; | 540 | cap = new_cap; |
| 540 | new_cap = NULL; | 541 | new_cap = NULL; |
| 541 | } else { | 542 | } else { |
| 542 | spin_unlock(&inode->i_lock); | 543 | spin_unlock(&ci->i_ceph_lock); |
| 543 | new_cap = get_cap(mdsc, caps_reservation); | 544 | new_cap = get_cap(mdsc, caps_reservation); |
| 544 | if (new_cap == NULL) | 545 | if (new_cap == NULL) |
| 545 | return -ENOMEM; | 546 | return -ENOMEM; |
| @@ -625,7 +626,7 @@ retry: | |||
| 625 | 626 | ||
| 626 | if (fmode >= 0) | 627 | if (fmode >= 0) |
| 627 | __ceph_get_fmode(ci, fmode); | 628 | __ceph_get_fmode(ci, fmode); |
| 628 | spin_unlock(&inode->i_lock); | 629 | spin_unlock(&ci->i_ceph_lock); |
| 629 | wake_up_all(&ci->i_cap_wq); | 630 | wake_up_all(&ci->i_cap_wq); |
| 630 | return 0; | 631 | return 0; |
| 631 | } | 632 | } |
| @@ -792,7 +793,7 @@ int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) | |||
| 792 | struct rb_node *p; | 793 | struct rb_node *p; |
| 793 | int ret = 0; | 794 | int ret = 0; |
| 794 | 795 | ||
| 795 | spin_lock(&inode->i_lock); | 796 | spin_lock(&ci->i_ceph_lock); |
| 796 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { | 797 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { |
| 797 | cap = rb_entry(p, struct ceph_cap, ci_node); | 798 | cap = rb_entry(p, struct ceph_cap, ci_node); |
| 798 | if (__cap_is_valid(cap) && | 799 | if (__cap_is_valid(cap) && |
| @@ -801,7 +802,7 @@ int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) | |||
| 801 | break; | 802 | break; |
| 802 | } | 803 | } |
| 803 | } | 804 | } |
| 804 | spin_unlock(&inode->i_lock); | 805 | spin_unlock(&ci->i_ceph_lock); |
| 805 | dout("ceph_caps_revoking %p %s = %d\n", inode, | 806 | dout("ceph_caps_revoking %p %s = %d\n", inode, |
| 806 | ceph_cap_string(mask), ret); | 807 | ceph_cap_string(mask), ret); |
| 807 | return ret; | 808 | return ret; |
| @@ -855,7 +856,7 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) | |||
| 855 | } | 856 | } |
| 856 | 857 | ||
| 857 | /* | 858 | /* |
| 858 | * called under i_lock | 859 | * called under i_ceph_lock |
| 859 | */ | 860 | */ |
| 860 | static int __ceph_is_any_caps(struct ceph_inode_info *ci) | 861 | static int __ceph_is_any_caps(struct ceph_inode_info *ci) |
| 861 | { | 862 | { |
| @@ -865,7 +866,7 @@ static int __ceph_is_any_caps(struct ceph_inode_info *ci) | |||
| 865 | /* | 866 | /* |
| 866 | * Remove a cap. Take steps to deal with a racing iterate_session_caps. | 867 | * Remove a cap. Take steps to deal with a racing iterate_session_caps. |
| 867 | * | 868 | * |
| 868 | * caller should hold i_lock. | 869 | * caller should hold i_ceph_lock. |
| 869 | * caller will not hold session s_mutex if called from destroy_inode. | 870 | * caller will not hold session s_mutex if called from destroy_inode. |
| 870 | */ | 871 | */ |
| 871 | void __ceph_remove_cap(struct ceph_cap *cap) | 872 | void __ceph_remove_cap(struct ceph_cap *cap) |
| @@ -1028,7 +1029,7 @@ static void __queue_cap_release(struct ceph_mds_session *session, | |||
| 1028 | 1029 | ||
| 1029 | /* | 1030 | /* |
| 1030 | * Queue cap releases when an inode is dropped from our cache. Since | 1031 | * Queue cap releases when an inode is dropped from our cache. Since |
| 1031 | * inode is about to be destroyed, there is no need for i_lock. | 1032 | * inode is about to be destroyed, there is no need for i_ceph_lock. |
| 1032 | */ | 1033 | */ |
| 1033 | void ceph_queue_caps_release(struct inode *inode) | 1034 | void ceph_queue_caps_release(struct inode *inode) |
| 1034 | { | 1035 | { |
| @@ -1049,7 +1050,7 @@ void ceph_queue_caps_release(struct inode *inode) | |||
| 1049 | 1050 | ||
| 1050 | /* | 1051 | /* |
| 1051 | * Send a cap msg on the given inode. Update our caps state, then | 1052 | * Send a cap msg on the given inode. Update our caps state, then |
| 1052 | * drop i_lock and send the message. | 1053 | * drop i_ceph_lock and send the message. |
| 1053 | * | 1054 | * |
| 1054 | * Make note of max_size reported/requested from mds, revoked caps | 1055 | * Make note of max_size reported/requested from mds, revoked caps |
| 1055 | * that have now been implemented. | 1056 | * that have now been implemented. |
| @@ -1061,13 +1062,13 @@ void ceph_queue_caps_release(struct inode *inode) | |||
| 1061 | * Return non-zero if delayed release, or we experienced an error | 1062 | * Return non-zero if delayed release, or we experienced an error |
| 1062 | * such that the caller should requeue + retry later. | 1063 | * such that the caller should requeue + retry later. |
| 1063 | * | 1064 | * |
| 1064 | * called with i_lock, then drops it. | 1065 | * called with i_ceph_lock, then drops it. |
| 1065 | * caller should hold snap_rwsem (read), s_mutex. | 1066 | * caller should hold snap_rwsem (read), s_mutex. |
| 1066 | */ | 1067 | */ |
| 1067 | static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, | 1068 | static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, |
| 1068 | int op, int used, int want, int retain, int flushing, | 1069 | int op, int used, int want, int retain, int flushing, |
| 1069 | unsigned *pflush_tid) | 1070 | unsigned *pflush_tid) |
| 1070 | __releases(cap->ci->vfs_inode->i_lock) | 1071 | __releases(cap->ci->i_ceph_lock) |
| 1071 | { | 1072 | { |
| 1072 | struct ceph_inode_info *ci = cap->ci; | 1073 | struct ceph_inode_info *ci = cap->ci; |
| 1073 | struct inode *inode = &ci->vfs_inode; | 1074 | struct inode *inode = &ci->vfs_inode; |
| @@ -1170,7 +1171,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, | |||
| 1170 | xattr_version = ci->i_xattrs.version; | 1171 | xattr_version = ci->i_xattrs.version; |
| 1171 | } | 1172 | } |
| 1172 | 1173 | ||
| 1173 | spin_unlock(&inode->i_lock); | 1174 | spin_unlock(&ci->i_ceph_lock); |
| 1174 | 1175 | ||
| 1175 | ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, | 1176 | ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, |
| 1176 | op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, | 1177 | op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, |
| @@ -1198,13 +1199,13 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, | |||
| 1198 | * Unless @again is true, skip cap_snaps that were already sent to | 1199 | * Unless @again is true, skip cap_snaps that were already sent to |
| 1199 | * the MDS (i.e., during this session). | 1200 | * the MDS (i.e., during this session). |
| 1200 | * | 1201 | * |
| 1201 | * Called under i_lock. Takes s_mutex as needed. | 1202 | * Called under i_ceph_lock. Takes s_mutex as needed. |
| 1202 | */ | 1203 | */ |
| 1203 | void __ceph_flush_snaps(struct ceph_inode_info *ci, | 1204 | void __ceph_flush_snaps(struct ceph_inode_info *ci, |
| 1204 | struct ceph_mds_session **psession, | 1205 | struct ceph_mds_session **psession, |
| 1205 | int again) | 1206 | int again) |
| 1206 | __releases(ci->vfs_inode->i_lock) | 1207 | __releases(ci->i_ceph_lock) |
| 1207 | __acquires(ci->vfs_inode->i_lock) | 1208 | __acquires(ci->i_ceph_lock) |
| 1208 | { | 1209 | { |
| 1209 | struct inode *inode = &ci->vfs_inode; | 1210 | struct inode *inode = &ci->vfs_inode; |
| 1210 | int mds; | 1211 | int mds; |
| @@ -1261,7 +1262,7 @@ retry: | |||
| 1261 | session = NULL; | 1262 | session = NULL; |
| 1262 | } | 1263 | } |
| 1263 | if (!session) { | 1264 | if (!session) { |
| 1264 | spin_unlock(&inode->i_lock); | 1265 | spin_unlock(&ci->i_ceph_lock); |
| 1265 | mutex_lock(&mdsc->mutex); | 1266 | mutex_lock(&mdsc->mutex); |
| 1266 | session = __ceph_lookup_mds_session(mdsc, mds); | 1267 | session = __ceph_lookup_mds_session(mdsc, mds); |
| 1267 | mutex_unlock(&mdsc->mutex); | 1268 | mutex_unlock(&mdsc->mutex); |
| @@ -1275,7 +1276,7 @@ retry: | |||
| 1275 | * deletion or migration. retry, and we'll | 1276 | * deletion or migration. retry, and we'll |
| 1276 | * get a better @mds value next time. | 1277 | * get a better @mds value next time. |
| 1277 | */ | 1278 | */ |
| 1278 | spin_lock(&inode->i_lock); | 1279 | spin_lock(&ci->i_ceph_lock); |
| 1279 | goto retry; | 1280 | goto retry; |
| 1280 | } | 1281 | } |
| 1281 | 1282 | ||
| @@ -1285,7 +1286,7 @@ retry: | |||
| 1285 | list_del_init(&capsnap->flushing_item); | 1286 | list_del_init(&capsnap->flushing_item); |
| 1286 | list_add_tail(&capsnap->flushing_item, | 1287 | list_add_tail(&capsnap->flushing_item, |
| 1287 | &session->s_cap_snaps_flushing); | 1288 | &session->s_cap_snaps_flushing); |
| 1288 | spin_unlock(&inode->i_lock); | 1289 | spin_unlock(&ci->i_ceph_lock); |
| 1289 | 1290 | ||
| 1290 | dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", | 1291 | dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", |
| 1291 | inode, capsnap, capsnap->follows, capsnap->flush_tid); | 1292 | inode, capsnap, capsnap->follows, capsnap->flush_tid); |
| @@ -1302,7 +1303,7 @@ retry: | |||
| 1302 | next_follows = capsnap->follows + 1; | 1303 | next_follows = capsnap->follows + 1; |
| 1303 | ceph_put_cap_snap(capsnap); | 1304 | ceph_put_cap_snap(capsnap); |
| 1304 | 1305 | ||
| 1305 | spin_lock(&inode->i_lock); | 1306 | spin_lock(&ci->i_ceph_lock); |
| 1306 | goto retry; | 1307 | goto retry; |
| 1307 | } | 1308 | } |
| 1308 | 1309 | ||
| @@ -1322,11 +1323,9 @@ out: | |||
| 1322 | 1323 | ||
| 1323 | static void ceph_flush_snaps(struct ceph_inode_info *ci) | 1324 | static void ceph_flush_snaps(struct ceph_inode_info *ci) |
| 1324 | { | 1325 | { |
| 1325 | struct inode *inode = &ci->vfs_inode; | 1326 | spin_lock(&ci->i_ceph_lock); |
| 1326 | |||
| 1327 | spin_lock(&inode->i_lock); | ||
| 1328 | __ceph_flush_snaps(ci, NULL, 0); | 1327 | __ceph_flush_snaps(ci, NULL, 0); |
| 1329 | spin_unlock(&inode->i_lock); | 1328 | spin_unlock(&ci->i_ceph_lock); |
| 1330 | } | 1329 | } |
| 1331 | 1330 | ||
| 1332 | /* | 1331 | /* |
| @@ -1373,7 +1372,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) | |||
| 1373 | * Add dirty inode to the flushing list. Assigned a seq number so we | 1372 | * Add dirty inode to the flushing list. Assigned a seq number so we |
| 1374 | * can wait for caps to flush without starving. | 1373 | * can wait for caps to flush without starving. |
| 1375 | * | 1374 | * |
| 1376 | * Called under i_lock. | 1375 | * Called under i_ceph_lock. |
| 1377 | */ | 1376 | */ |
| 1378 | static int __mark_caps_flushing(struct inode *inode, | 1377 | static int __mark_caps_flushing(struct inode *inode, |
| 1379 | struct ceph_mds_session *session) | 1378 | struct ceph_mds_session *session) |
| @@ -1421,9 +1420,9 @@ static int try_nonblocking_invalidate(struct inode *inode) | |||
| 1421 | struct ceph_inode_info *ci = ceph_inode(inode); | 1420 | struct ceph_inode_info *ci = ceph_inode(inode); |
| 1422 | u32 invalidating_gen = ci->i_rdcache_gen; | 1421 | u32 invalidating_gen = ci->i_rdcache_gen; |
| 1423 | 1422 | ||
| 1424 | spin_unlock(&inode->i_lock); | 1423 | spin_unlock(&ci->i_ceph_lock); |
| 1425 | invalidate_mapping_pages(&inode->i_data, 0, -1); | 1424 | invalidate_mapping_pages(&inode->i_data, 0, -1); |
| 1426 | spin_lock(&inode->i_lock); | 1425 | spin_lock(&ci->i_ceph_lock); |
| 1427 | 1426 | ||
| 1428 | if (inode->i_data.nrpages == 0 && | 1427 | if (inode->i_data.nrpages == 0 && |
| 1429 | invalidating_gen == ci->i_rdcache_gen) { | 1428 | invalidating_gen == ci->i_rdcache_gen) { |
| @@ -1470,7 +1469,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, | |||
| 1470 | if (mdsc->stopping) | 1469 | if (mdsc->stopping) |
| 1471 | is_delayed = 1; | 1470 | is_delayed = 1; |
| 1472 | 1471 | ||
| 1473 | spin_lock(&inode->i_lock); | 1472 | spin_lock(&ci->i_ceph_lock); |
| 1474 | 1473 | ||
| 1475 | if (ci->i_ceph_flags & CEPH_I_FLUSH) | 1474 | if (ci->i_ceph_flags & CEPH_I_FLUSH) |
| 1476 | flags |= CHECK_CAPS_FLUSH; | 1475 | flags |= CHECK_CAPS_FLUSH; |
| @@ -1480,7 +1479,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, | |||
| 1480 | __ceph_flush_snaps(ci, &session, 0); | 1479 | __ceph_flush_snaps(ci, &session, 0); |
| 1481 | goto retry_locked; | 1480 | goto retry_locked; |
| 1482 | retry: | 1481 | retry: |
| 1483 | spin_lock(&inode->i_lock); | 1482 | spin_lock(&ci->i_ceph_lock); |
| 1484 | retry_locked: | 1483 | retry_locked: |
| 1485 | file_wanted = __ceph_caps_file_wanted(ci); | 1484 | file_wanted = __ceph_caps_file_wanted(ci); |
| 1486 | used = __ceph_caps_used(ci); | 1485 | used = __ceph_caps_used(ci); |
| @@ -1634,7 +1633,7 @@ ack: | |||
| 1634 | if (mutex_trylock(&session->s_mutex) == 0) { | 1633 | if (mutex_trylock(&session->s_mutex) == 0) { |
| 1635 | dout("inverting session/ino locks on %p\n", | 1634 | dout("inverting session/ino locks on %p\n", |
| 1636 | session); | 1635 | session); |
| 1637 | spin_unlock(&inode->i_lock); | 1636 | spin_unlock(&ci->i_ceph_lock); |
| 1638 | if (took_snap_rwsem) { | 1637 | if (took_snap_rwsem) { |
| 1639 | up_read(&mdsc->snap_rwsem); | 1638 | up_read(&mdsc->snap_rwsem); |
| 1640 | took_snap_rwsem = 0; | 1639 | took_snap_rwsem = 0; |
| @@ -1648,7 +1647,7 @@ ack: | |||
| 1648 | if (down_read_trylock(&mdsc->snap_rwsem) == 0) { | 1647 | if (down_read_trylock(&mdsc->snap_rwsem) == 0) { |
| 1649 | dout("inverting snap/in locks on %p\n", | 1648 | dout("inverting snap/in locks on %p\n", |
| 1650 | inode); | 1649 | inode); |
| 1651 | spin_unlock(&inode->i_lock); | 1650 | spin_unlock(&ci->i_ceph_lock); |
| 1652 | down_read(&mdsc->snap_rwsem); | 1651 | down_read(&mdsc->snap_rwsem); |
| 1653 | took_snap_rwsem = 1; | 1652 | took_snap_rwsem = 1; |
| 1654 | goto retry; | 1653 | goto retry; |
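Editor's note: the "inverting ... locks" paths above encode the lock ranking: s_mutex and snap_rwsem must be taken before i_ceph_lock, so when a trylock fails the code drops the spinlock, sleeps on the heavier lock, and restarts the scan. A simplified sketch of that retry shape (cap selection elided):

        retry:
                spin_lock(&ci->i_ceph_lock);
                /* ... pick a cap that needs an update ... */
                if (mutex_trylock(&session->s_mutex) == 0) {
                        spin_unlock(&ci->i_ceph_lock);  /* can't sleep under it */
                        mutex_lock(&session->s_mutex);  /* may sleep */
                        goto retry;                     /* revalidate state */
                }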
| @@ -1664,10 +1663,10 @@ ack: | |||
| 1664 | mds = cap->mds; /* remember mds, so we don't repeat */ | 1663 | mds = cap->mds; /* remember mds, so we don't repeat */ |
| 1665 | sent++; | 1664 | sent++; |
| 1666 | 1665 | ||
| 1667 | /* __send_cap drops i_lock */ | 1666 | /* __send_cap drops i_ceph_lock */ |
| 1668 | delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, | 1667 | delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, |
| 1669 | retain, flushing, NULL); | 1668 | retain, flushing, NULL); |
| 1670 | goto retry; /* retake i_lock and restart our cap scan. */ | 1669 | goto retry; /* retake i_ceph_lock and restart our cap scan. */ |
| 1671 | } | 1670 | } |
| 1672 | 1671 | ||
| 1673 | /* | 1672 | /* |
| @@ -1681,7 +1680,7 @@ ack: | |||
| 1681 | else if (!is_delayed || force_requeue) | 1680 | else if (!is_delayed || force_requeue) |
| 1682 | __cap_delay_requeue(mdsc, ci); | 1681 | __cap_delay_requeue(mdsc, ci); |
| 1683 | 1682 | ||
| 1684 | spin_unlock(&inode->i_lock); | 1683 | spin_unlock(&ci->i_ceph_lock); |
| 1685 | 1684 | ||
| 1686 | if (queue_invalidate) | 1685 | if (queue_invalidate) |
| 1687 | ceph_queue_invalidate(inode); | 1686 | ceph_queue_invalidate(inode); |
| @@ -1704,7 +1703,7 @@ static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, | |||
| 1704 | int flushing = 0; | 1703 | int flushing = 0; |
| 1705 | 1704 | ||
| 1706 | retry: | 1705 | retry: |
| 1707 | spin_lock(&inode->i_lock); | 1706 | spin_lock(&ci->i_ceph_lock); |
| 1708 | if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { | 1707 | if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { |
| 1709 | dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); | 1708 | dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); |
| 1710 | goto out; | 1709 | goto out; |
| @@ -1716,7 +1715,7 @@ retry: | |||
| 1716 | int delayed; | 1715 | int delayed; |
| 1717 | 1716 | ||
| 1718 | if (!session) { | 1717 | if (!session) { |
| 1719 | spin_unlock(&inode->i_lock); | 1718 | spin_unlock(&ci->i_ceph_lock); |
| 1720 | session = cap->session; | 1719 | session = cap->session; |
| 1721 | mutex_lock(&session->s_mutex); | 1720 | mutex_lock(&session->s_mutex); |
| 1722 | goto retry; | 1721 | goto retry; |
| @@ -1727,18 +1726,18 @@ retry: | |||
| 1727 | 1726 | ||
| 1728 | flushing = __mark_caps_flushing(inode, session); | 1727 | flushing = __mark_caps_flushing(inode, session); |
| 1729 | 1728 | ||
| 1730 | /* __send_cap drops i_lock */ | 1729 | /* __send_cap drops i_ceph_lock */ |
| 1731 | delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, | 1730 | delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, |
| 1732 | cap->issued | cap->implemented, flushing, | 1731 | cap->issued | cap->implemented, flushing, |
| 1733 | flush_tid); | 1732 | flush_tid); |
| 1734 | if (!delayed) | 1733 | if (!delayed) |
| 1735 | goto out_unlocked; | 1734 | goto out_unlocked; |
| 1736 | 1735 | ||
| 1737 | spin_lock(&inode->i_lock); | 1736 | spin_lock(&ci->i_ceph_lock); |
| 1738 | __cap_delay_requeue(mdsc, ci); | 1737 | __cap_delay_requeue(mdsc, ci); |
| 1739 | } | 1738 | } |
| 1740 | out: | 1739 | out: |
| 1741 | spin_unlock(&inode->i_lock); | 1740 | spin_unlock(&ci->i_ceph_lock); |
| 1742 | out_unlocked: | 1741 | out_unlocked: |
| 1743 | if (session && unlock_session) | 1742 | if (session && unlock_session) |
| 1744 | mutex_unlock(&session->s_mutex); | 1743 | mutex_unlock(&session->s_mutex); |
| @@ -1753,7 +1752,7 @@ static int caps_are_flushed(struct inode *inode, unsigned tid) | |||
| 1753 | struct ceph_inode_info *ci = ceph_inode(inode); | 1752 | struct ceph_inode_info *ci = ceph_inode(inode); |
| 1754 | int i, ret = 1; | 1753 | int i, ret = 1; |
| 1755 | 1754 | ||
| 1756 | spin_lock(&inode->i_lock); | 1755 | spin_lock(&ci->i_ceph_lock); |
| 1757 | for (i = 0; i < CEPH_CAP_BITS; i++) | 1756 | for (i = 0; i < CEPH_CAP_BITS; i++) |
| 1758 | if ((ci->i_flushing_caps & (1 << i)) && | 1757 | if ((ci->i_flushing_caps & (1 << i)) && |
| 1759 | ci->i_cap_flush_tid[i] <= tid) { | 1758 | ci->i_cap_flush_tid[i] <= tid) { |
| @@ -1761,7 +1760,7 @@ static int caps_are_flushed(struct inode *inode, unsigned tid) | |||
| 1761 | ret = 0; | 1760 | ret = 0; |
| 1762 | break; | 1761 | break; |
| 1763 | } | 1762 | } |
| 1764 | spin_unlock(&inode->i_lock); | 1763 | spin_unlock(&ci->i_ceph_lock); |
| 1765 | return ret; | 1764 | return ret; |
| 1766 | } | 1765 | } |
| 1767 | 1766 | ||
| @@ -1868,10 +1867,10 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) | |||
| 1868 | struct ceph_mds_client *mdsc = | 1867 | struct ceph_mds_client *mdsc = |
| 1869 | ceph_sb_to_client(inode->i_sb)->mdsc; | 1868 | ceph_sb_to_client(inode->i_sb)->mdsc; |
| 1870 | 1869 | ||
| 1871 | spin_lock(&inode->i_lock); | 1870 | spin_lock(&ci->i_ceph_lock); |
| 1872 | if (__ceph_caps_dirty(ci)) | 1871 | if (__ceph_caps_dirty(ci)) |
| 1873 | __cap_delay_requeue_front(mdsc, ci); | 1872 | __cap_delay_requeue_front(mdsc, ci); |
| 1874 | spin_unlock(&inode->i_lock); | 1873 | spin_unlock(&ci->i_ceph_lock); |
| 1875 | } | 1874 | } |
| 1876 | return err; | 1875 | return err; |
| 1877 | } | 1876 | } |
| @@ -1894,7 +1893,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, | |||
| 1894 | struct inode *inode = &ci->vfs_inode; | 1893 | struct inode *inode = &ci->vfs_inode; |
| 1895 | struct ceph_cap *cap; | 1894 | struct ceph_cap *cap; |
| 1896 | 1895 | ||
| 1897 | spin_lock(&inode->i_lock); | 1896 | spin_lock(&ci->i_ceph_lock); |
| 1898 | cap = ci->i_auth_cap; | 1897 | cap = ci->i_auth_cap; |
| 1899 | if (cap && cap->session == session) { | 1898 | if (cap && cap->session == session) { |
| 1900 | dout("kick_flushing_caps %p cap %p capsnap %p\n", inode, | 1899 | dout("kick_flushing_caps %p cap %p capsnap %p\n", inode, |
| @@ -1904,7 +1903,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, | |||
| 1904 | pr_err("%p auth cap %p not mds%d ???\n", inode, | 1903 | pr_err("%p auth cap %p not mds%d ???\n", inode, |
| 1905 | cap, session->s_mds); | 1904 | cap, session->s_mds); |
| 1906 | } | 1905 | } |
| 1907 | spin_unlock(&inode->i_lock); | 1906 | spin_unlock(&ci->i_ceph_lock); |
| 1908 | } | 1907 | } |
| 1909 | } | 1908 | } |
| 1910 | 1909 | ||
| @@ -1921,7 +1920,7 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, | |||
| 1921 | struct ceph_cap *cap; | 1920 | struct ceph_cap *cap; |
| 1922 | int delayed = 0; | 1921 | int delayed = 0; |
| 1923 | 1922 | ||
| 1924 | spin_lock(&inode->i_lock); | 1923 | spin_lock(&ci->i_ceph_lock); |
| 1925 | cap = ci->i_auth_cap; | 1924 | cap = ci->i_auth_cap; |
| 1926 | if (cap && cap->session == session) { | 1925 | if (cap && cap->session == session) { |
| 1927 | dout("kick_flushing_caps %p cap %p %s\n", inode, | 1926 | dout("kick_flushing_caps %p cap %p %s\n", inode, |
| @@ -1932,14 +1931,14 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, | |||
| 1932 | cap->issued | cap->implemented, | 1931 | cap->issued | cap->implemented, |
| 1933 | ci->i_flushing_caps, NULL); | 1932 | ci->i_flushing_caps, NULL); |
| 1934 | if (delayed) { | 1933 | if (delayed) { |
| 1935 | spin_lock(&inode->i_lock); | 1934 | spin_lock(&ci->i_ceph_lock); |
| 1936 | __cap_delay_requeue(mdsc, ci); | 1935 | __cap_delay_requeue(mdsc, ci); |
| 1937 | spin_unlock(&inode->i_lock); | 1936 | spin_unlock(&ci->i_ceph_lock); |
| 1938 | } | 1937 | } |
| 1939 | } else { | 1938 | } else { |
| 1940 | pr_err("%p auth cap %p not mds%d ???\n", inode, | 1939 | pr_err("%p auth cap %p not mds%d ???\n", inode, |
| 1941 | cap, session->s_mds); | 1940 | cap, session->s_mds); |
| 1942 | spin_unlock(&inode->i_lock); | 1941 | spin_unlock(&ci->i_ceph_lock); |
| 1943 | } | 1942 | } |
| 1944 | } | 1943 | } |
| 1945 | } | 1944 | } |
| @@ -1952,7 +1951,7 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, | |||
| 1952 | struct ceph_cap *cap; | 1951 | struct ceph_cap *cap; |
| 1953 | int delayed = 0; | 1952 | int delayed = 0; |
| 1954 | 1953 | ||
| 1955 | spin_lock(&inode->i_lock); | 1954 | spin_lock(&ci->i_ceph_lock); |
| 1956 | cap = ci->i_auth_cap; | 1955 | cap = ci->i_auth_cap; |
| 1957 | dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, | 1956 | dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, |
| 1958 | ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); | 1957 | ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); |
| @@ -1964,12 +1963,12 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, | |||
| 1964 | cap->issued | cap->implemented, | 1963 | cap->issued | cap->implemented, |
| 1965 | ci->i_flushing_caps, NULL); | 1964 | ci->i_flushing_caps, NULL); |
| 1966 | if (delayed) { | 1965 | if (delayed) { |
| 1967 | spin_lock(&inode->i_lock); | 1966 | spin_lock(&ci->i_ceph_lock); |
| 1968 | __cap_delay_requeue(mdsc, ci); | 1967 | __cap_delay_requeue(mdsc, ci); |
| 1969 | spin_unlock(&inode->i_lock); | 1968 | spin_unlock(&ci->i_ceph_lock); |
| 1970 | } | 1969 | } |
| 1971 | } else { | 1970 | } else { |
| 1972 | spin_unlock(&inode->i_lock); | 1971 | spin_unlock(&ci->i_ceph_lock); |
| 1973 | } | 1972 | } |
| 1974 | } | 1973 | } |
| 1975 | 1974 | ||
| @@ -1978,7 +1977,7 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, | |||
| 1978 | * Take references to capabilities we hold, so that we don't release | 1977 | * Take references to capabilities we hold, so that we don't release |
| 1979 | * them to the MDS prematurely. | 1978 | * them to the MDS prematurely. |
| 1980 | * | 1979 | * |
| 1981 | * Protected by i_lock. | 1980 | * Protected by i_ceph_lock. |
| 1982 | */ | 1981 | */ |
| 1983 | static void __take_cap_refs(struct ceph_inode_info *ci, int got) | 1982 | static void __take_cap_refs(struct ceph_inode_info *ci, int got) |
| 1984 | { | 1983 | { |
| @@ -2016,7 +2015,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, | |||
| 2016 | 2015 | ||
| 2017 | dout("get_cap_refs %p need %s want %s\n", inode, | 2016 | dout("get_cap_refs %p need %s want %s\n", inode, |
| 2018 | ceph_cap_string(need), ceph_cap_string(want)); | 2017 | ceph_cap_string(need), ceph_cap_string(want)); |
| 2019 | spin_lock(&inode->i_lock); | 2018 | spin_lock(&ci->i_ceph_lock); |
| 2020 | 2019 | ||
| 2021 | /* make sure file is actually open */ | 2020 | /* make sure file is actually open */ |
| 2022 | file_wanted = __ceph_caps_file_wanted(ci); | 2021 | file_wanted = __ceph_caps_file_wanted(ci); |
| @@ -2077,7 +2076,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, | |||
| 2077 | ceph_cap_string(have), ceph_cap_string(need)); | 2076 | ceph_cap_string(have), ceph_cap_string(need)); |
| 2078 | } | 2077 | } |
| 2079 | out: | 2078 | out: |
| 2080 | spin_unlock(&inode->i_lock); | 2079 | spin_unlock(&ci->i_ceph_lock); |
| 2081 | dout("get_cap_refs %p ret %d got %s\n", inode, | 2080 | dout("get_cap_refs %p ret %d got %s\n", inode, |
| 2082 | ret, ceph_cap_string(*got)); | 2081 | ret, ceph_cap_string(*got)); |
| 2083 | return ret; | 2082 | return ret; |
| @@ -2094,7 +2093,7 @@ static void check_max_size(struct inode *inode, loff_t endoff) | |||
| 2094 | int check = 0; | 2093 | int check = 0; |
| 2095 | 2094 | ||
| 2096 | /* do we need to explicitly request a larger max_size? */ | 2095 | /* do we need to explicitly request a larger max_size? */ |
| 2097 | spin_lock(&inode->i_lock); | 2096 | spin_lock(&ci->i_ceph_lock); |
| 2098 | if ((endoff >= ci->i_max_size || | 2097 | if ((endoff >= ci->i_max_size || |
| 2099 | endoff > (inode->i_size << 1)) && | 2098 | endoff > (inode->i_size << 1)) && |
| 2100 | endoff > ci->i_wanted_max_size) { | 2099 | endoff > ci->i_wanted_max_size) { |
| @@ -2103,7 +2102,7 @@ static void check_max_size(struct inode *inode, loff_t endoff) | |||
| 2103 | ci->i_wanted_max_size = endoff; | 2102 | ci->i_wanted_max_size = endoff; |
| 2104 | check = 1; | 2103 | check = 1; |
| 2105 | } | 2104 | } |
| 2106 | spin_unlock(&inode->i_lock); | 2105 | spin_unlock(&ci->i_ceph_lock); |
| 2107 | if (check) | 2106 | if (check) |
| 2108 | ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); | 2107 | ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); |
| 2109 | } | 2108 | } |
| @@ -2140,9 +2139,9 @@ retry: | |||
| 2140 | */ | 2139 | */ |
| 2141 | void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) | 2140 | void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) |
| 2142 | { | 2141 | { |
| 2143 | spin_lock(&ci->vfs_inode.i_lock); | 2142 | spin_lock(&ci->i_ceph_lock); |
| 2144 | __take_cap_refs(ci, caps); | 2143 | __take_cap_refs(ci, caps); |
| 2145 | spin_unlock(&ci->vfs_inode.i_lock); | 2144 | spin_unlock(&ci->i_ceph_lock); |
| 2146 | } | 2145 | } |
| 2147 | 2146 | ||
| 2148 | /* | 2147 | /* |
| @@ -2160,7 +2159,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) | |||
| 2160 | int last = 0, put = 0, flushsnaps = 0, wake = 0; | 2159 | int last = 0, put = 0, flushsnaps = 0, wake = 0; |
| 2161 | struct ceph_cap_snap *capsnap; | 2160 | struct ceph_cap_snap *capsnap; |
| 2162 | 2161 | ||
| 2163 | spin_lock(&inode->i_lock); | 2162 | spin_lock(&ci->i_ceph_lock); |
| 2164 | if (had & CEPH_CAP_PIN) | 2163 | if (had & CEPH_CAP_PIN) |
| 2165 | --ci->i_pin_ref; | 2164 | --ci->i_pin_ref; |
| 2166 | if (had & CEPH_CAP_FILE_RD) | 2165 | if (had & CEPH_CAP_FILE_RD) |
| @@ -2193,7 +2192,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) | |||
| 2193 | } | 2192 | } |
| 2194 | } | 2193 | } |
| 2195 | } | 2194 | } |
| 2196 | spin_unlock(&inode->i_lock); | 2195 | spin_unlock(&ci->i_ceph_lock); |
| 2197 | 2196 | ||
| 2198 | dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), | 2197 | dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), |
| 2199 | last ? " last" : "", put ? " put" : ""); | 2198 | last ? " last" : "", put ? " put" : ""); |
| @@ -2225,7 +2224,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, | |||
| 2225 | int found = 0; | 2224 | int found = 0; |
| 2226 | struct ceph_cap_snap *capsnap = NULL; | 2225 | struct ceph_cap_snap *capsnap = NULL; |
| 2227 | 2226 | ||
| 2228 | spin_lock(&inode->i_lock); | 2227 | spin_lock(&ci->i_ceph_lock); |
| 2229 | ci->i_wrbuffer_ref -= nr; | 2228 | ci->i_wrbuffer_ref -= nr; |
| 2230 | last = !ci->i_wrbuffer_ref; | 2229 | last = !ci->i_wrbuffer_ref; |
| 2231 | 2230 | ||
| @@ -2274,7 +2273,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, | |||
| 2274 | } | 2273 | } |
| 2275 | } | 2274 | } |
| 2276 | 2275 | ||
| 2277 | spin_unlock(&inode->i_lock); | 2276 | spin_unlock(&ci->i_ceph_lock); |
| 2278 | 2277 | ||
| 2279 | if (last) { | 2278 | if (last) { |
| 2280 | ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); | 2279 | ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); |
| @@ -2291,7 +2290,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, | |||
| 2291 | * Handle a cap GRANT message from the MDS. (Note that a GRANT may | 2290 | * Handle a cap GRANT message from the MDS. (Note that a GRANT may |
| 2292 | * actually be a revocation if it specifies a smaller cap set.) | 2291 | * actually be a revocation if it specifies a smaller cap set.) |
| 2293 | * | 2292 | * |
| 2294 | * caller holds s_mutex and i_lock, we drop both. | 2293 | * caller holds s_mutex and i_ceph_lock, we drop both. |
| 2295 | * | 2294 | * |
| 2296 | * return value: | 2295 | * return value: |
| 2297 | * 0 - ok | 2296 | * 0 - ok |
| @@ -2302,7 +2301,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, | |||
| 2302 | struct ceph_mds_session *session, | 2301 | struct ceph_mds_session *session, |
| 2303 | struct ceph_cap *cap, | 2302 | struct ceph_cap *cap, |
| 2304 | struct ceph_buffer *xattr_buf) | 2303 | struct ceph_buffer *xattr_buf) |
| 2305 | __releases(inode->i_lock) | 2304 | __releases(ci->i_ceph_lock) |
| 2306 | { | 2305 | { |
| 2307 | struct ceph_inode_info *ci = ceph_inode(inode); | 2306 | struct ceph_inode_info *ci = ceph_inode(inode); |
| 2308 | int mds = session->s_mds; | 2307 | int mds = session->s_mds; |
| @@ -2453,7 +2452,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, | |||
| 2453 | } | 2452 | } |
| 2454 | BUG_ON(cap->issued & ~cap->implemented); | 2453 | BUG_ON(cap->issued & ~cap->implemented); |
| 2455 | 2454 | ||
| 2456 | spin_unlock(&inode->i_lock); | 2455 | spin_unlock(&ci->i_ceph_lock); |
| 2457 | if (writeback) | 2456 | if (writeback) |
| 2458 | /* | 2457 | /* |
| 2459 | * queue inode for writeback: we can't actually call | 2458 | * queue inode for writeback: we can't actually call |
| @@ -2483,7 +2482,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, | |||
| 2483 | struct ceph_mds_caps *m, | 2482 | struct ceph_mds_caps *m, |
| 2484 | struct ceph_mds_session *session, | 2483 | struct ceph_mds_session *session, |
| 2485 | struct ceph_cap *cap) | 2484 | struct ceph_cap *cap) |
| 2486 | __releases(inode->i_lock) | 2485 | __releases(ci->i_ceph_lock) |
| 2487 | { | 2486 | { |
| 2488 | struct ceph_inode_info *ci = ceph_inode(inode); | 2487 | struct ceph_inode_info *ci = ceph_inode(inode); |
| 2489 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; | 2488 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; |
| @@ -2539,7 +2538,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, | |||
| 2539 | wake_up_all(&ci->i_cap_wq); | 2538 | wake_up_all(&ci->i_cap_wq); |
| 2540 | 2539 | ||
| 2541 | out: | 2540 | out: |
| 2542 | spin_unlock(&inode->i_lock); | 2541 | spin_unlock(&ci->i_ceph_lock); |
| 2543 | if (drop) | 2542 | if (drop) |
| 2544 | iput(inode); | 2543 | iput(inode); |
| 2545 | } | 2544 | } |
| @@ -2562,7 +2561,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, | |||
| 2562 | dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n", | 2561 | dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n", |
| 2563 | inode, ci, session->s_mds, follows); | 2562 | inode, ci, session->s_mds, follows); |
| 2564 | 2563 | ||
| 2565 | spin_lock(&inode->i_lock); | 2564 | spin_lock(&ci->i_ceph_lock); |
| 2566 | list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { | 2565 | list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { |
| 2567 | if (capsnap->follows == follows) { | 2566 | if (capsnap->follows == follows) { |
| 2568 | if (capsnap->flush_tid != flush_tid) { | 2567 | if (capsnap->flush_tid != flush_tid) { |
| @@ -2585,7 +2584,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, | |||
| 2585 | capsnap, capsnap->follows); | 2584 | capsnap, capsnap->follows); |
| 2586 | } | 2585 | } |
| 2587 | } | 2586 | } |
| 2588 | spin_unlock(&inode->i_lock); | 2587 | spin_unlock(&ci->i_ceph_lock); |
| 2589 | if (drop) | 2588 | if (drop) |
| 2590 | iput(inode); | 2589 | iput(inode); |
| 2591 | } | 2590 | } |
| @@ -2598,7 +2597,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, | |||
| 2598 | static void handle_cap_trunc(struct inode *inode, | 2597 | static void handle_cap_trunc(struct inode *inode, |
| 2599 | struct ceph_mds_caps *trunc, | 2598 | struct ceph_mds_caps *trunc, |
| 2600 | struct ceph_mds_session *session) | 2599 | struct ceph_mds_session *session) |
| 2601 | __releases(inode->i_lock) | 2600 | __releases(ci->i_ceph_lock) |
| 2602 | { | 2601 | { |
| 2603 | struct ceph_inode_info *ci = ceph_inode(inode); | 2602 | struct ceph_inode_info *ci = ceph_inode(inode); |
| 2604 | int mds = session->s_mds; | 2603 | int mds = session->s_mds; |
| @@ -2617,7 +2616,7 @@ static void handle_cap_trunc(struct inode *inode, | |||
| 2617 | inode, mds, seq, truncate_size, truncate_seq); | 2616 | inode, mds, seq, truncate_size, truncate_seq); |
| 2618 | queue_trunc = ceph_fill_file_size(inode, issued, | 2617 | queue_trunc = ceph_fill_file_size(inode, issued, |
| 2619 | truncate_seq, truncate_size, size); | 2618 | truncate_seq, truncate_size, size); |
| 2620 | spin_unlock(&inode->i_lock); | 2619 | spin_unlock(&ci->i_ceph_lock); |
| 2621 | 2620 | ||
| 2622 | if (queue_trunc) | 2621 | if (queue_trunc) |
| 2623 | ceph_queue_vmtruncate(inode); | 2622 | ceph_queue_vmtruncate(inode); |
| @@ -2646,7 +2645,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, | |||
| 2646 | dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", | 2645 | dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", |
| 2647 | inode, ci, mds, mseq); | 2646 | inode, ci, mds, mseq); |
| 2648 | 2647 | ||
| 2649 | spin_lock(&inode->i_lock); | 2648 | spin_lock(&ci->i_ceph_lock); |
| 2650 | 2649 | ||
| 2651 | /* make sure we haven't seen a higher mseq */ | 2650 | /* make sure we haven't seen a higher mseq */ |
| 2652 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { | 2651 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { |
| @@ -2690,7 +2689,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, | |||
| 2690 | } | 2689 | } |
| 2691 | /* else, we already released it */ | 2690 | /* else, we already released it */ |
| 2692 | 2691 | ||
| 2693 | spin_unlock(&inode->i_lock); | 2692 | spin_unlock(&ci->i_ceph_lock); |
| 2694 | } | 2693 | } |
| 2695 | 2694 | ||
| 2696 | /* | 2695 | /* |
| @@ -2745,9 +2744,9 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, | |||
| 2745 | up_read(&mdsc->snap_rwsem); | 2744 | up_read(&mdsc->snap_rwsem); |
| 2746 | 2745 | ||
| 2747 | /* make sure we re-request max_size, if necessary */ | 2746 | /* make sure we re-request max_size, if necessary */ |
| 2748 | spin_lock(&inode->i_lock); | 2747 | spin_lock(&ci->i_ceph_lock); |
| 2749 | ci->i_requested_max_size = 0; | 2748 | ci->i_requested_max_size = 0; |
| 2750 | spin_unlock(&inode->i_lock); | 2749 | spin_unlock(&ci->i_ceph_lock); |
| 2751 | } | 2750 | } |
| 2752 | 2751 | ||
| 2753 | /* | 2752 | /* |
| @@ -2762,6 +2761,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, | |||
| 2762 | struct ceph_mds_client *mdsc = session->s_mdsc; | 2761 | struct ceph_mds_client *mdsc = session->s_mdsc; |
| 2763 | struct super_block *sb = mdsc->fsc->sb; | 2762 | struct super_block *sb = mdsc->fsc->sb; |
| 2764 | struct inode *inode; | 2763 | struct inode *inode; |
| 2764 | struct ceph_inode_info *ci; | ||
| 2765 | struct ceph_cap *cap; | 2765 | struct ceph_cap *cap; |
| 2766 | struct ceph_mds_caps *h; | 2766 | struct ceph_mds_caps *h; |
| 2767 | int mds = session->s_mds; | 2767 | int mds = session->s_mds; |
| @@ -2815,6 +2815,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, | |||
| 2815 | 2815 | ||
| 2816 | /* lookup ino */ | 2816 | /* lookup ino */ |
| 2817 | inode = ceph_find_inode(sb, vino); | 2817 | inode = ceph_find_inode(sb, vino); |
| 2818 | ci = ceph_inode(inode); | ||
| 2818 | dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, | 2819 | dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, |
| 2819 | vino.snap, inode); | 2820 | vino.snap, inode); |
| 2820 | if (!inode) { | 2821 | if (!inode) { |
| @@ -2844,16 +2845,16 @@ void ceph_handle_caps(struct ceph_mds_session *session, | |||
| 2844 | } | 2845 | } |
| 2845 | 2846 | ||
| 2846 | /* the rest require a cap */ | 2847 | /* the rest require a cap */ |
| 2847 | spin_lock(&inode->i_lock); | 2848 | spin_lock(&ci->i_ceph_lock); |
| 2848 | cap = __get_cap_for_mds(ceph_inode(inode), mds); | 2849 | cap = __get_cap_for_mds(ceph_inode(inode), mds); |
| 2849 | if (!cap) { | 2850 | if (!cap) { |
| 2850 | dout(" no cap on %p ino %llx.%llx from mds%d\n", | 2851 | dout(" no cap on %p ino %llx.%llx from mds%d\n", |
| 2851 | inode, ceph_ino(inode), ceph_snap(inode), mds); | 2852 | inode, ceph_ino(inode), ceph_snap(inode), mds); |
| 2852 | spin_unlock(&inode->i_lock); | 2853 | spin_unlock(&ci->i_ceph_lock); |
| 2853 | goto flush_cap_releases; | 2854 | goto flush_cap_releases; |
| 2854 | } | 2855 | } |
| 2855 | 2856 | ||
| 2856 | /* note that each of these drops i_lock for us */ | 2857 | /* note that each of these drops i_ceph_lock for us */ |
| 2857 | switch (op) { | 2858 | switch (op) { |
| 2858 | case CEPH_CAP_OP_REVOKE: | 2859 | case CEPH_CAP_OP_REVOKE: |
| 2859 | case CEPH_CAP_OP_GRANT: | 2860 | case CEPH_CAP_OP_GRANT: |
| @@ -2869,7 +2870,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, | |||
| 2869 | break; | 2870 | break; |
| 2870 | 2871 | ||
| 2871 | default: | 2872 | default: |
| 2872 | spin_unlock(&inode->i_lock); | 2873 | spin_unlock(&ci->i_ceph_lock); |
| 2873 | pr_err("ceph_handle_caps: unknown cap op %d %s\n", op, | 2874 | pr_err("ceph_handle_caps: unknown cap op %d %s\n", op, |
| 2874 | ceph_cap_op_name(op)); | 2875 | ceph_cap_op_name(op)); |
| 2875 | } | 2876 | } |
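Editor's note: the switch above hands i_ceph_lock to the handler: each op is entered with the spinlock held and, per the sparse __releases() annotations in the earlier hunks, drops it itself before doing anything sleepable. A minimal sketch of that convention with a hypothetical handler:

        static void handle_cap_foo(struct inode *inode,        /* hypothetical */
                                   struct ceph_mds_session *session)
                __releases(ci->i_ceph_lock)
        {
                struct ceph_inode_info *ci = ceph_inode(inode);

                /* ... consume message state under the lock ... */
                spin_unlock(&ci->i_ceph_lock);  /* drop the caller's lock */
                /* ... writeback, replies, iput(), etc. (may sleep) ... */
        }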
| @@ -2962,13 +2963,13 @@ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) | |||
| 2962 | struct inode *inode = &ci->vfs_inode; | 2963 | struct inode *inode = &ci->vfs_inode; |
| 2963 | int last = 0; | 2964 | int last = 0; |
| 2964 | 2965 | ||
| 2965 | spin_lock(&inode->i_lock); | 2966 | spin_lock(&ci->i_ceph_lock); |
| 2966 | dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode, | 2967 | dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode, |
| 2967 | ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1); | 2968 | ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1); |
| 2968 | BUG_ON(ci->i_nr_by_mode[fmode] == 0); | 2969 | BUG_ON(ci->i_nr_by_mode[fmode] == 0); |
| 2969 | if (--ci->i_nr_by_mode[fmode] == 0) | 2970 | if (--ci->i_nr_by_mode[fmode] == 0) |
| 2970 | last++; | 2971 | last++; |
| 2971 | spin_unlock(&inode->i_lock); | 2972 | spin_unlock(&ci->i_ceph_lock); |
| 2972 | 2973 | ||
| 2973 | if (last && ci->i_vino.snap == CEPH_NOSNAP) | 2974 | if (last && ci->i_vino.snap == CEPH_NOSNAP) |
| 2974 | ceph_check_caps(ci, 0, NULL); | 2975 | ceph_check_caps(ci, 0, NULL); |
| @@ -2991,7 +2992,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, | |||
| 2991 | int used, dirty; | 2992 | int used, dirty; |
| 2992 | int ret = 0; | 2993 | int ret = 0; |
| 2993 | 2994 | ||
| 2994 | spin_lock(&inode->i_lock); | 2995 | spin_lock(&ci->i_ceph_lock); |
| 2995 | used = __ceph_caps_used(ci); | 2996 | used = __ceph_caps_used(ci); |
| 2996 | dirty = __ceph_caps_dirty(ci); | 2997 | dirty = __ceph_caps_dirty(ci); |
| 2997 | 2998 | ||
| @@ -3046,7 +3047,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, | |||
| 3046 | inode, cap, ceph_cap_string(cap->issued)); | 3047 | inode, cap, ceph_cap_string(cap->issued)); |
| 3047 | } | 3048 | } |
| 3048 | } | 3049 | } |
| 3049 | spin_unlock(&inode->i_lock); | 3050 | spin_unlock(&ci->i_ceph_lock); |
| 3050 | return ret; | 3051 | return ret; |
| 3051 | } | 3052 | } |
| 3052 | 3053 | ||
| @@ -3061,7 +3062,7 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, | |||
| 3061 | 3062 | ||
| 3062 | /* | 3063 | /* |
| 3063 | * force a record for the directory caps if we have a dentry lease. | 3064 | * force a record for the directory caps if we have a dentry lease. |
| 3064 | * this is racy (can't take i_lock and d_lock together), but it | 3065 | * this is racy (can't take i_ceph_lock and d_lock together), but it |
| 3065 | * doesn't have to be perfect; the mds will revoke anything we don't | 3066 | * doesn't have to be perfect; the mds will revoke anything we don't |
| 3066 | * release. | 3067 | * release. |
| 3067 | */ | 3068 | */ |
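The common thread in the fs/ceph hunks above and below is a lock swap: per-inode ceph state (caps, flags, xattr versions) moves from the VFS-owned inode->i_lock to a new ceph-private spinlock, ci->i_ceph_lock, initialized in ceph_alloc_inode(). A minimal sketch of the conversion pattern; ceph_touch_caps() is a hypothetical helper name, not a function in this patch:

	/* Hedged sketch of the conversion applied throughout fs/ceph;
	 * the helper is hypothetical, the locking pattern is not. */
	static void ceph_touch_caps(struct inode *inode)
	{
		struct ceph_inode_info *ci = ceph_inode(inode);

		/* before this series: spin_lock(&inode->i_lock); */
		spin_lock(&ci->i_ceph_lock);
		/* ... inspect/modify ci->i_caps, ci->i_ceph_flags, ... */
		spin_unlock(&ci->i_ceph_lock);
	}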
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index bca3948e9dbf..3eeb97661262 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c | |||
| @@ -281,18 +281,18 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 281 | } | 281 | } |
| 282 | 282 | ||
| 283 | /* can we use the dcache? */ | 283 | /* can we use the dcache? */ |
| 284 | spin_lock(&inode->i_lock); | 284 | spin_lock(&ci->i_ceph_lock); |
| 285 | if ((filp->f_pos == 2 || fi->dentry) && | 285 | if ((filp->f_pos == 2 || fi->dentry) && |
| 286 | !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && | 286 | !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && |
| 287 | ceph_snap(inode) != CEPH_SNAPDIR && | 287 | ceph_snap(inode) != CEPH_SNAPDIR && |
| 288 | ceph_dir_test_complete(inode) && | 288 | ceph_dir_test_complete(inode) && |
| 289 | __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { | 289 | __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { |
| 290 | spin_unlock(&inode->i_lock); | 290 | spin_unlock(&ci->i_ceph_lock); |
| 291 | err = __dcache_readdir(filp, dirent, filldir); | 291 | err = __dcache_readdir(filp, dirent, filldir); |
| 292 | if (err != -EAGAIN) | 292 | if (err != -EAGAIN) |
| 293 | return err; | 293 | return err; |
| 294 | } else { | 294 | } else { |
| 295 | spin_unlock(&inode->i_lock); | 295 | spin_unlock(&ci->i_ceph_lock); |
| 296 | } | 296 | } |
| 297 | if (fi->dentry) { | 297 | if (fi->dentry) { |
| 298 | err = note_last_dentry(fi, fi->dentry->d_name.name, | 298 | err = note_last_dentry(fi, fi->dentry->d_name.name, |
| @@ -428,12 +428,12 @@ more: | |||
| 428 | * were released during the whole readdir, and we should have | 428 | * were released during the whole readdir, and we should have |
| 429 | * the complete dir contents in our cache. | 429 | * the complete dir contents in our cache. |
| 430 | */ | 430 | */ |
| 431 | spin_lock(&inode->i_lock); | 431 | spin_lock(&ci->i_ceph_lock); |
| 432 | if (ci->i_release_count == fi->dir_release_count) { | 432 | if (ci->i_release_count == fi->dir_release_count) { |
| 433 | ceph_dir_set_complete(inode); | 433 | ceph_dir_set_complete(inode); |
| 434 | ci->i_max_offset = filp->f_pos; | 434 | ci->i_max_offset = filp->f_pos; |
| 435 | } | 435 | } |
| 436 | spin_unlock(&inode->i_lock); | 436 | spin_unlock(&ci->i_ceph_lock); |
| 437 | 437 | ||
| 438 | dout("readdir %p filp %p done.\n", inode, filp); | 438 | dout("readdir %p filp %p done.\n", inode, filp); |
| 439 | return 0; | 439 | return 0; |
| @@ -607,7 +607,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, | |||
| 607 | struct ceph_inode_info *ci = ceph_inode(dir); | 607 | struct ceph_inode_info *ci = ceph_inode(dir); |
| 608 | struct ceph_dentry_info *di = ceph_dentry(dentry); | 608 | struct ceph_dentry_info *di = ceph_dentry(dentry); |
| 609 | 609 | ||
| 610 | spin_lock(&dir->i_lock); | 610 | spin_lock(&ci->i_ceph_lock); |
| 611 | dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); | 611 | dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); |
| 612 | if (strncmp(dentry->d_name.name, | 612 | if (strncmp(dentry->d_name.name, |
| 613 | fsc->mount_options->snapdir_name, | 613 | fsc->mount_options->snapdir_name, |
| @@ -615,13 +615,13 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, | |||
| 615 | !is_root_ceph_dentry(dir, dentry) && | 615 | !is_root_ceph_dentry(dir, dentry) && |
| 616 | ceph_dir_test_complete(dir) && | 616 | ceph_dir_test_complete(dir) && |
| 617 | (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { | 617 | (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { |
| 618 | spin_unlock(&dir->i_lock); | 618 | spin_unlock(&ci->i_ceph_lock); |
| 619 | dout(" dir %p complete, -ENOENT\n", dir); | 619 | dout(" dir %p complete, -ENOENT\n", dir); |
| 620 | d_add(dentry, NULL); | 620 | d_add(dentry, NULL); |
| 621 | di->lease_shared_gen = ci->i_shared_gen; | 621 | di->lease_shared_gen = ci->i_shared_gen; |
| 622 | return NULL; | 622 | return NULL; |
| 623 | } | 623 | } |
| 624 | spin_unlock(&dir->i_lock); | 624 | spin_unlock(&ci->i_ceph_lock); |
| 625 | } | 625 | } |
| 626 | 626 | ||
| 627 | op = ceph_snap(dir) == CEPH_SNAPDIR ? | 627 | op = ceph_snap(dir) == CEPH_SNAPDIR ? |
| @@ -841,12 +841,12 @@ static int drop_caps_for_unlink(struct inode *inode) | |||
| 841 | struct ceph_inode_info *ci = ceph_inode(inode); | 841 | struct ceph_inode_info *ci = ceph_inode(inode); |
| 842 | int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; | 842 | int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; |
| 843 | 843 | ||
| 844 | spin_lock(&inode->i_lock); | 844 | spin_lock(&ci->i_ceph_lock); |
| 845 | if (inode->i_nlink == 1) { | 845 | if (inode->i_nlink == 1) { |
| 846 | drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); | 846 | drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); |
| 847 | ci->i_ceph_flags |= CEPH_I_NODELAY; | 847 | ci->i_ceph_flags |= CEPH_I_NODELAY; |
| 848 | } | 848 | } |
| 849 | spin_unlock(&inode->i_lock); | 849 | spin_unlock(&ci->i_ceph_lock); |
| 850 | return drop; | 850 | return drop; |
| 851 | } | 851 | } |
| 852 | 852 | ||
| @@ -1015,10 +1015,10 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) | |||
| 1015 | struct ceph_dentry_info *di = ceph_dentry(dentry); | 1015 | struct ceph_dentry_info *di = ceph_dentry(dentry); |
| 1016 | int valid = 0; | 1016 | int valid = 0; |
| 1017 | 1017 | ||
| 1018 | spin_lock(&dir->i_lock); | 1018 | spin_lock(&ci->i_ceph_lock); |
| 1019 | if (ci->i_shared_gen == di->lease_shared_gen) | 1019 | if (ci->i_shared_gen == di->lease_shared_gen) |
| 1020 | valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); | 1020 | valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); |
| 1021 | spin_unlock(&dir->i_lock); | 1021 | spin_unlock(&ci->i_ceph_lock); |
| 1022 | dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n", | 1022 | dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n", |
| 1023 | dir, (unsigned)ci->i_shared_gen, dentry, | 1023 | dir, (unsigned)ci->i_shared_gen, dentry, |
| 1024 | (unsigned)di->lease_shared_gen, valid); | 1024 | (unsigned)di->lease_shared_gen, valid); |
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ce549d31eeb7..ed72428d9c75 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c | |||
| @@ -147,9 +147,9 @@ int ceph_open(struct inode *inode, struct file *file) | |||
| 147 | 147 | ||
| 148 | /* trivially open snapdir */ | 148 | /* trivially open snapdir */ |
| 149 | if (ceph_snap(inode) == CEPH_SNAPDIR) { | 149 | if (ceph_snap(inode) == CEPH_SNAPDIR) { |
| 150 | spin_lock(&inode->i_lock); | 150 | spin_lock(&ci->i_ceph_lock); |
| 151 | __ceph_get_fmode(ci, fmode); | 151 | __ceph_get_fmode(ci, fmode); |
| 152 | spin_unlock(&inode->i_lock); | 152 | spin_unlock(&ci->i_ceph_lock); |
| 153 | return ceph_init_file(inode, file, fmode); | 153 | return ceph_init_file(inode, file, fmode); |
| 154 | } | 154 | } |
| 155 | 155 | ||
| @@ -158,7 +158,7 @@ int ceph_open(struct inode *inode, struct file *file) | |||
| 158 | * write) or any MDS (for read). Update wanted set | 158 | * write) or any MDS (for read). Update wanted set |
| 159 | * asynchronously. | 159 | * asynchronously. |
| 160 | */ | 160 | */ |
| 161 | spin_lock(&inode->i_lock); | 161 | spin_lock(&ci->i_ceph_lock); |
| 162 | if (__ceph_is_any_real_caps(ci) && | 162 | if (__ceph_is_any_real_caps(ci) && |
| 163 | (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { | 163 | (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { |
| 164 | int mds_wanted = __ceph_caps_mds_wanted(ci); | 164 | int mds_wanted = __ceph_caps_mds_wanted(ci); |
| @@ -168,7 +168,7 @@ int ceph_open(struct inode *inode, struct file *file) | |||
| 168 | inode, fmode, ceph_cap_string(wanted), | 168 | inode, fmode, ceph_cap_string(wanted), |
| 169 | ceph_cap_string(issued)); | 169 | ceph_cap_string(issued)); |
| 170 | __ceph_get_fmode(ci, fmode); | 170 | __ceph_get_fmode(ci, fmode); |
| 171 | spin_unlock(&inode->i_lock); | 171 | spin_unlock(&ci->i_ceph_lock); |
| 172 | 172 | ||
| 173 | /* adjust wanted? */ | 173 | /* adjust wanted? */ |
| 174 | if ((issued & wanted) != wanted && | 174 | if ((issued & wanted) != wanted && |
| @@ -180,10 +180,10 @@ int ceph_open(struct inode *inode, struct file *file) | |||
| 180 | } else if (ceph_snap(inode) != CEPH_NOSNAP && | 180 | } else if (ceph_snap(inode) != CEPH_NOSNAP && |
| 181 | (ci->i_snap_caps & wanted) == wanted) { | 181 | (ci->i_snap_caps & wanted) == wanted) { |
| 182 | __ceph_get_fmode(ci, fmode); | 182 | __ceph_get_fmode(ci, fmode); |
| 183 | spin_unlock(&inode->i_lock); | 183 | spin_unlock(&ci->i_ceph_lock); |
| 184 | return ceph_init_file(inode, file, fmode); | 184 | return ceph_init_file(inode, file, fmode); |
| 185 | } | 185 | } |
| 186 | spin_unlock(&inode->i_lock); | 186 | spin_unlock(&ci->i_ceph_lock); |
| 187 | 187 | ||
| 188 | dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); | 188 | dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); |
| 189 | req = prepare_open_request(inode->i_sb, flags, 0); | 189 | req = prepare_open_request(inode->i_sb, flags, 0); |
| @@ -743,9 +743,9 @@ retry_snap: | |||
| 743 | */ | 743 | */ |
| 744 | int dirty; | 744 | int dirty; |
| 745 | 745 | ||
| 746 | spin_lock(&inode->i_lock); | 746 | spin_lock(&ci->i_ceph_lock); |
| 747 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); | 747 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); |
| 748 | spin_unlock(&inode->i_lock); | 748 | spin_unlock(&ci->i_ceph_lock); |
| 749 | ceph_put_cap_refs(ci, got); | 749 | ceph_put_cap_refs(ci, got); |
| 750 | 750 | ||
| 751 | ret = generic_file_aio_write(iocb, iov, nr_segs, pos); | 751 | ret = generic_file_aio_write(iocb, iov, nr_segs, pos); |
| @@ -764,9 +764,9 @@ retry_snap: | |||
| 764 | 764 | ||
| 765 | if (ret >= 0) { | 765 | if (ret >= 0) { |
| 766 | int dirty; | 766 | int dirty; |
| 767 | spin_lock(&inode->i_lock); | 767 | spin_lock(&ci->i_ceph_lock); |
| 768 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); | 768 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); |
| 769 | spin_unlock(&inode->i_lock); | 769 | spin_unlock(&ci->i_ceph_lock); |
| 770 | if (dirty) | 770 | if (dirty) |
| 771 | __mark_inode_dirty(inode, dirty); | 771 | __mark_inode_dirty(inode, dirty); |
| 772 | } | 772 | } |
| @@ -797,7 +797,8 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int origin) | |||
| 797 | 797 | ||
| 798 | mutex_lock(&inode->i_mutex); | 798 | mutex_lock(&inode->i_mutex); |
| 799 | __ceph_do_pending_vmtruncate(inode); | 799 | __ceph_do_pending_vmtruncate(inode); |
| 800 | if (origin != SEEK_CUR || origin != SEEK_SET) { | 800 | |
| 801 | if (origin == SEEK_END || origin == SEEK_DATA || origin == SEEK_HOLE) { | ||
| 801 | ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); | 802 | ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); |
| 802 | if (ret < 0) { | 803 | if (ret < 0) { |
| 803 | offset = ret; | 804 | offset = ret; |
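The ceph_llseek hunk above fixes a boolean tautology: for any origin, origin != SEEK_CUR || origin != SEEK_SET is true (no value equals both), so every llseek issued the size getattr, even for plain SEEK_SET/SEEK_CUR. The replacement enumerates the origins that actually need an up-to-date size. A standalone demonstration (ordinary userspace C, not kernel code):

	#include <stdio.h>	/* printf, SEEK_SET/SEEK_CUR/SEEK_END */

	int main(void)
	{
		/* the removed condition fires for every origin */
		for (int origin = SEEK_SET; origin <= SEEK_END; origin++)
			if (origin != SEEK_CUR || origin != SEEK_SET)
				printf("origin %d: getattr would be issued\n", origin);
		return 0;	/* prints for all three origins */
	}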
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 116f36502f17..87fb132fb330 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c | |||
| @@ -297,6 +297,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) | |||
| 297 | 297 | ||
| 298 | dout("alloc_inode %p\n", &ci->vfs_inode); | 298 | dout("alloc_inode %p\n", &ci->vfs_inode); |
| 299 | 299 | ||
| 300 | spin_lock_init(&ci->i_ceph_lock); | ||
| 301 | |||
| 300 | ci->i_version = 0; | 302 | ci->i_version = 0; |
| 301 | ci->i_time_warp_seq = 0; | 303 | ci->i_time_warp_seq = 0; |
| 302 | ci->i_ceph_flags = 0; | 304 | ci->i_ceph_flags = 0; |
| @@ -583,7 +585,7 @@ static int fill_inode(struct inode *inode, | |||
| 583 | iinfo->xattr_len); | 585 | iinfo->xattr_len); |
| 584 | } | 586 | } |
| 585 | 587 | ||
| 586 | spin_lock(&inode->i_lock); | 588 | spin_lock(&ci->i_ceph_lock); |
| 587 | 589 | ||
| 588 | /* | 590 | /* |
| 589 | * provided version will be odd if inode value is projected, | 591 | * provided version will be odd if inode value is projected, |
| @@ -680,7 +682,7 @@ static int fill_inode(struct inode *inode, | |||
| 680 | char *sym; | 682 | char *sym; |
| 681 | 683 | ||
| 682 | BUG_ON(symlen != inode->i_size); | 684 | BUG_ON(symlen != inode->i_size); |
| 683 | spin_unlock(&inode->i_lock); | 685 | spin_unlock(&ci->i_ceph_lock); |
| 684 | 686 | ||
| 685 | err = -ENOMEM; | 687 | err = -ENOMEM; |
| 686 | sym = kmalloc(symlen+1, GFP_NOFS); | 688 | sym = kmalloc(symlen+1, GFP_NOFS); |
| @@ -689,7 +691,7 @@ static int fill_inode(struct inode *inode, | |||
| 689 | memcpy(sym, iinfo->symlink, symlen); | 691 | memcpy(sym, iinfo->symlink, symlen); |
| 690 | sym[symlen] = 0; | 692 | sym[symlen] = 0; |
| 691 | 693 | ||
| 692 | spin_lock(&inode->i_lock); | 694 | spin_lock(&ci->i_ceph_lock); |
| 693 | if (!ci->i_symlink) | 695 | if (!ci->i_symlink) |
| 694 | ci->i_symlink = sym; | 696 | ci->i_symlink = sym; |
| 695 | else | 697 | else |
| @@ -715,7 +717,7 @@ static int fill_inode(struct inode *inode, | |||
| 715 | } | 717 | } |
| 716 | 718 | ||
| 717 | no_change: | 719 | no_change: |
| 718 | spin_unlock(&inode->i_lock); | 720 | spin_unlock(&ci->i_ceph_lock); |
| 719 | 721 | ||
| 720 | /* queue truncate if we saw i_size decrease */ | 722 | /* queue truncate if we saw i_size decrease */ |
| 721 | if (queue_trunc) | 723 | if (queue_trunc) |
| @@ -750,13 +752,13 @@ no_change: | |||
| 750 | info->cap.flags, | 752 | info->cap.flags, |
| 751 | caps_reservation); | 753 | caps_reservation); |
| 752 | } else { | 754 | } else { |
| 753 | spin_lock(&inode->i_lock); | 755 | spin_lock(&ci->i_ceph_lock); |
| 754 | dout(" %p got snap_caps %s\n", inode, | 756 | dout(" %p got snap_caps %s\n", inode, |
| 755 | ceph_cap_string(le32_to_cpu(info->cap.caps))); | 757 | ceph_cap_string(le32_to_cpu(info->cap.caps))); |
| 756 | ci->i_snap_caps |= le32_to_cpu(info->cap.caps); | 758 | ci->i_snap_caps |= le32_to_cpu(info->cap.caps); |
| 757 | if (cap_fmode >= 0) | 759 | if (cap_fmode >= 0) |
| 758 | __ceph_get_fmode(ci, cap_fmode); | 760 | __ceph_get_fmode(ci, cap_fmode); |
| 759 | spin_unlock(&inode->i_lock); | 761 | spin_unlock(&ci->i_ceph_lock); |
| 760 | } | 762 | } |
| 761 | } else if (cap_fmode >= 0) { | 763 | } else if (cap_fmode >= 0) { |
| 762 | pr_warning("mds issued no caps on %llx.%llx\n", | 764 | pr_warning("mds issued no caps on %llx.%llx\n", |
| @@ -849,19 +851,20 @@ static void ceph_set_dentry_offset(struct dentry *dn) | |||
| 849 | { | 851 | { |
| 850 | struct dentry *dir = dn->d_parent; | 852 | struct dentry *dir = dn->d_parent; |
| 851 | struct inode *inode = dir->d_inode; | 853 | struct inode *inode = dir->d_inode; |
| 854 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
| 852 | struct ceph_dentry_info *di; | 855 | struct ceph_dentry_info *di; |
| 853 | 856 | ||
| 854 | BUG_ON(!inode); | 857 | BUG_ON(!inode); |
| 855 | 858 | ||
| 856 | di = ceph_dentry(dn); | 859 | di = ceph_dentry(dn); |
| 857 | 860 | ||
| 858 | spin_lock(&inode->i_lock); | 861 | spin_lock(&ci->i_ceph_lock); |
| 859 | if (!ceph_dir_test_complete(inode)) { | 862 | if (!ceph_dir_test_complete(inode)) { |
| 860 | spin_unlock(&inode->i_lock); | 863 | spin_unlock(&ci->i_ceph_lock); |
| 861 | return; | 864 | return; |
| 862 | } | 865 | } |
| 863 | di->offset = ceph_inode(inode)->i_max_offset++; | 866 | di->offset = ceph_inode(inode)->i_max_offset++; |
| 864 | spin_unlock(&inode->i_lock); | 867 | spin_unlock(&ci->i_ceph_lock); |
| 865 | 868 | ||
| 866 | spin_lock(&dir->d_lock); | 869 | spin_lock(&dir->d_lock); |
| 867 | spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); | 870 | spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); |
| @@ -1308,7 +1311,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) | |||
| 1308 | struct ceph_inode_info *ci = ceph_inode(inode); | 1311 | struct ceph_inode_info *ci = ceph_inode(inode); |
| 1309 | int ret = 0; | 1312 | int ret = 0; |
| 1310 | 1313 | ||
| 1311 | spin_lock(&inode->i_lock); | 1314 | spin_lock(&ci->i_ceph_lock); |
| 1312 | dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); | 1315 | dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); |
| 1313 | inode->i_size = size; | 1316 | inode->i_size = size; |
| 1314 | inode->i_blocks = (size + (1 << 9) - 1) >> 9; | 1317 | inode->i_blocks = (size + (1 << 9) - 1) >> 9; |
| @@ -1318,7 +1321,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) | |||
| 1318 | (ci->i_reported_size << 1) < ci->i_max_size) | 1321 | (ci->i_reported_size << 1) < ci->i_max_size) |
| 1319 | ret = 1; | 1322 | ret = 1; |
| 1320 | 1323 | ||
| 1321 | spin_unlock(&inode->i_lock); | 1324 | spin_unlock(&ci->i_ceph_lock); |
| 1322 | return ret; | 1325 | return ret; |
| 1323 | } | 1326 | } |
| 1324 | 1327 | ||
| @@ -1376,20 +1379,20 @@ static void ceph_invalidate_work(struct work_struct *work) | |||
| 1376 | u32 orig_gen; | 1379 | u32 orig_gen; |
| 1377 | int check = 0; | 1380 | int check = 0; |
| 1378 | 1381 | ||
| 1379 | spin_lock(&inode->i_lock); | 1382 | spin_lock(&ci->i_ceph_lock); |
| 1380 | dout("invalidate_pages %p gen %d revoking %d\n", inode, | 1383 | dout("invalidate_pages %p gen %d revoking %d\n", inode, |
| 1381 | ci->i_rdcache_gen, ci->i_rdcache_revoking); | 1384 | ci->i_rdcache_gen, ci->i_rdcache_revoking); |
| 1382 | if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { | 1385 | if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { |
| 1383 | /* nevermind! */ | 1386 | /* nevermind! */ |
| 1384 | spin_unlock(&inode->i_lock); | 1387 | spin_unlock(&ci->i_ceph_lock); |
| 1385 | goto out; | 1388 | goto out; |
| 1386 | } | 1389 | } |
| 1387 | orig_gen = ci->i_rdcache_gen; | 1390 | orig_gen = ci->i_rdcache_gen; |
| 1388 | spin_unlock(&inode->i_lock); | 1391 | spin_unlock(&ci->i_ceph_lock); |
| 1389 | 1392 | ||
| 1390 | truncate_inode_pages(&inode->i_data, 0); | 1393 | truncate_inode_pages(&inode->i_data, 0); |
| 1391 | 1394 | ||
| 1392 | spin_lock(&inode->i_lock); | 1395 | spin_lock(&ci->i_ceph_lock); |
| 1393 | if (orig_gen == ci->i_rdcache_gen && | 1396 | if (orig_gen == ci->i_rdcache_gen && |
| 1394 | orig_gen == ci->i_rdcache_revoking) { | 1397 | orig_gen == ci->i_rdcache_revoking) { |
| 1395 | dout("invalidate_pages %p gen %d successful\n", inode, | 1398 | dout("invalidate_pages %p gen %d successful\n", inode, |
| @@ -1401,7 +1404,7 @@ static void ceph_invalidate_work(struct work_struct *work) | |||
| 1401 | inode, orig_gen, ci->i_rdcache_gen, | 1404 | inode, orig_gen, ci->i_rdcache_gen, |
| 1402 | ci->i_rdcache_revoking); | 1405 | ci->i_rdcache_revoking); |
| 1403 | } | 1406 | } |
| 1404 | spin_unlock(&inode->i_lock); | 1407 | spin_unlock(&ci->i_ceph_lock); |
| 1405 | 1408 | ||
| 1406 | if (check) | 1409 | if (check) |
| 1407 | ceph_check_caps(ci, 0, NULL); | 1410 | ceph_check_caps(ci, 0, NULL); |
| @@ -1460,10 +1463,10 @@ void __ceph_do_pending_vmtruncate(struct inode *inode) | |||
| 1460 | int wrbuffer_refs, wake = 0; | 1463 | int wrbuffer_refs, wake = 0; |
| 1461 | 1464 | ||
| 1462 | retry: | 1465 | retry: |
| 1463 | spin_lock(&inode->i_lock); | 1466 | spin_lock(&ci->i_ceph_lock); |
| 1464 | if (ci->i_truncate_pending == 0) { | 1467 | if (ci->i_truncate_pending == 0) { |
| 1465 | dout("__do_pending_vmtruncate %p none pending\n", inode); | 1468 | dout("__do_pending_vmtruncate %p none pending\n", inode); |
| 1466 | spin_unlock(&inode->i_lock); | 1469 | spin_unlock(&ci->i_ceph_lock); |
| 1467 | return; | 1470 | return; |
| 1468 | } | 1471 | } |
| 1469 | 1472 | ||
| @@ -1474,7 +1477,7 @@ retry: | |||
| 1474 | if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { | 1477 | if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { |
| 1475 | dout("__do_pending_vmtruncate %p flushing snaps first\n", | 1478 | dout("__do_pending_vmtruncate %p flushing snaps first\n", |
| 1476 | inode); | 1479 | inode); |
| 1477 | spin_unlock(&inode->i_lock); | 1480 | spin_unlock(&ci->i_ceph_lock); |
| 1478 | filemap_write_and_wait_range(&inode->i_data, 0, | 1481 | filemap_write_and_wait_range(&inode->i_data, 0, |
| 1479 | inode->i_sb->s_maxbytes); | 1482 | inode->i_sb->s_maxbytes); |
| 1480 | goto retry; | 1483 | goto retry; |
| @@ -1484,15 +1487,15 @@ retry: | |||
| 1484 | wrbuffer_refs = ci->i_wrbuffer_ref; | 1487 | wrbuffer_refs = ci->i_wrbuffer_ref; |
| 1485 | dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode, | 1488 | dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode, |
| 1486 | ci->i_truncate_pending, to); | 1489 | ci->i_truncate_pending, to); |
| 1487 | spin_unlock(&inode->i_lock); | 1490 | spin_unlock(&ci->i_ceph_lock); |
| 1488 | 1491 | ||
| 1489 | truncate_inode_pages(inode->i_mapping, to); | 1492 | truncate_inode_pages(inode->i_mapping, to); |
| 1490 | 1493 | ||
| 1491 | spin_lock(&inode->i_lock); | 1494 | spin_lock(&ci->i_ceph_lock); |
| 1492 | ci->i_truncate_pending--; | 1495 | ci->i_truncate_pending--; |
| 1493 | if (ci->i_truncate_pending == 0) | 1496 | if (ci->i_truncate_pending == 0) |
| 1494 | wake = 1; | 1497 | wake = 1; |
| 1495 | spin_unlock(&inode->i_lock); | 1498 | spin_unlock(&ci->i_ceph_lock); |
| 1496 | 1499 | ||
| 1497 | if (wrbuffer_refs == 0) | 1500 | if (wrbuffer_refs == 0) |
| 1498 | ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); | 1501 | ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); |
| @@ -1547,7 +1550,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) | |||
| 1547 | if (IS_ERR(req)) | 1550 | if (IS_ERR(req)) |
| 1548 | return PTR_ERR(req); | 1551 | return PTR_ERR(req); |
| 1549 | 1552 | ||
| 1550 | spin_lock(&inode->i_lock); | 1553 | spin_lock(&ci->i_ceph_lock); |
| 1551 | issued = __ceph_caps_issued(ci, NULL); | 1554 | issued = __ceph_caps_issued(ci, NULL); |
| 1552 | dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); | 1555 | dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); |
| 1553 | 1556 | ||
| @@ -1695,7 +1698,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) | |||
| 1695 | } | 1698 | } |
| 1696 | 1699 | ||
| 1697 | release &= issued; | 1700 | release &= issued; |
| 1698 | spin_unlock(&inode->i_lock); | 1701 | spin_unlock(&ci->i_ceph_lock); |
| 1699 | 1702 | ||
| 1700 | if (inode_dirty_flags) | 1703 | if (inode_dirty_flags) |
| 1701 | __mark_inode_dirty(inode, inode_dirty_flags); | 1704 | __mark_inode_dirty(inode, inode_dirty_flags); |
| @@ -1717,7 +1720,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) | |||
| 1717 | __ceph_do_pending_vmtruncate(inode); | 1720 | __ceph_do_pending_vmtruncate(inode); |
| 1718 | return err; | 1721 | return err; |
| 1719 | out: | 1722 | out: |
| 1720 | spin_unlock(&inode->i_lock); | 1723 | spin_unlock(&ci->i_ceph_lock); |
| 1721 | ceph_mdsc_put_request(req); | 1724 | ceph_mdsc_put_request(req); |
| 1722 | return err; | 1725 | return err; |
| 1723 | } | 1726 | } |
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 5a14c29cbba6..790914a598dd 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c | |||
| @@ -241,11 +241,11 @@ static long ceph_ioctl_lazyio(struct file *file) | |||
| 241 | struct ceph_inode_info *ci = ceph_inode(inode); | 241 | struct ceph_inode_info *ci = ceph_inode(inode); |
| 242 | 242 | ||
| 243 | if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { | 243 | if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { |
| 244 | spin_lock(&inode->i_lock); | 244 | spin_lock(&ci->i_ceph_lock); |
| 245 | ci->i_nr_by_mode[fi->fmode]--; | 245 | ci->i_nr_by_mode[fi->fmode]--; |
| 246 | fi->fmode |= CEPH_FILE_MODE_LAZY; | 246 | fi->fmode |= CEPH_FILE_MODE_LAZY; |
| 247 | ci->i_nr_by_mode[fi->fmode]++; | 247 | ci->i_nr_by_mode[fi->fmode]++; |
| 248 | spin_unlock(&inode->i_lock); | 248 | spin_unlock(&ci->i_ceph_lock); |
| 249 | dout("ioctl_layzio: file %p marked lazy\n", file); | 249 | dout("ioctl_layzio: file %p marked lazy\n", file); |
| 250 | 250 | ||
| 251 | ceph_check_caps(ci, 0, NULL); | 251 | ceph_check_caps(ci, 0, NULL); |
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 264ab701154f..6203d805eb45 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c | |||
| @@ -732,21 +732,21 @@ static int __choose_mds(struct ceph_mds_client *mdsc, | |||
| 732 | } | 732 | } |
| 733 | } | 733 | } |
| 734 | 734 | ||
| 735 | spin_lock(&inode->i_lock); | 735 | spin_lock(&ci->i_ceph_lock); |
| 736 | cap = NULL; | 736 | cap = NULL; |
| 737 | if (mode == USE_AUTH_MDS) | 737 | if (mode == USE_AUTH_MDS) |
| 738 | cap = ci->i_auth_cap; | 738 | cap = ci->i_auth_cap; |
| 739 | if (!cap && !RB_EMPTY_ROOT(&ci->i_caps)) | 739 | if (!cap && !RB_EMPTY_ROOT(&ci->i_caps)) |
| 740 | cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); | 740 | cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); |
| 741 | if (!cap) { | 741 | if (!cap) { |
| 742 | spin_unlock(&inode->i_lock); | 742 | spin_unlock(&ci->i_ceph_lock); |
| 743 | goto random; | 743 | goto random; |
| 744 | } | 744 | } |
| 745 | mds = cap->session->s_mds; | 745 | mds = cap->session->s_mds; |
| 746 | dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n", | 746 | dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n", |
| 747 | inode, ceph_vinop(inode), mds, | 747 | inode, ceph_vinop(inode), mds, |
| 748 | cap == ci->i_auth_cap ? "auth " : "", cap); | 748 | cap == ci->i_auth_cap ? "auth " : "", cap); |
| 749 | spin_unlock(&inode->i_lock); | 749 | spin_unlock(&ci->i_ceph_lock); |
| 750 | return mds; | 750 | return mds; |
| 751 | 751 | ||
| 752 | random: | 752 | random: |
| @@ -951,7 +951,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
| 951 | 951 | ||
| 952 | dout("removing cap %p, ci is %p, inode is %p\n", | 952 | dout("removing cap %p, ci is %p, inode is %p\n", |
| 953 | cap, ci, &ci->vfs_inode); | 953 | cap, ci, &ci->vfs_inode); |
| 954 | spin_lock(&inode->i_lock); | 954 | spin_lock(&ci->i_ceph_lock); |
| 955 | __ceph_remove_cap(cap); | 955 | __ceph_remove_cap(cap); |
| 956 | if (!__ceph_is_any_real_caps(ci)) { | 956 | if (!__ceph_is_any_real_caps(ci)) { |
| 957 | struct ceph_mds_client *mdsc = | 957 | struct ceph_mds_client *mdsc = |
| @@ -984,7 +984,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
| 984 | } | 984 | } |
| 985 | spin_unlock(&mdsc->cap_dirty_lock); | 985 | spin_unlock(&mdsc->cap_dirty_lock); |
| 986 | } | 986 | } |
| 987 | spin_unlock(&inode->i_lock); | 987 | spin_unlock(&ci->i_ceph_lock); |
| 988 | while (drop--) | 988 | while (drop--) |
| 989 | iput(inode); | 989 | iput(inode); |
| 990 | return 0; | 990 | return 0; |
| @@ -1015,10 +1015,10 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, | |||
| 1015 | 1015 | ||
| 1016 | wake_up_all(&ci->i_cap_wq); | 1016 | wake_up_all(&ci->i_cap_wq); |
| 1017 | if (arg) { | 1017 | if (arg) { |
| 1018 | spin_lock(&inode->i_lock); | 1018 | spin_lock(&ci->i_ceph_lock); |
| 1019 | ci->i_wanted_max_size = 0; | 1019 | ci->i_wanted_max_size = 0; |
| 1020 | ci->i_requested_max_size = 0; | 1020 | ci->i_requested_max_size = 0; |
| 1021 | spin_unlock(&inode->i_lock); | 1021 | spin_unlock(&ci->i_ceph_lock); |
| 1022 | } | 1022 | } |
| 1023 | return 0; | 1023 | return 0; |
| 1024 | } | 1024 | } |
| @@ -1151,7 +1151,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) | |||
| 1151 | if (session->s_trim_caps <= 0) | 1151 | if (session->s_trim_caps <= 0) |
| 1152 | return -1; | 1152 | return -1; |
| 1153 | 1153 | ||
| 1154 | spin_lock(&inode->i_lock); | 1154 | spin_lock(&ci->i_ceph_lock); |
| 1155 | mine = cap->issued | cap->implemented; | 1155 | mine = cap->issued | cap->implemented; |
| 1156 | used = __ceph_caps_used(ci); | 1156 | used = __ceph_caps_used(ci); |
| 1157 | oissued = __ceph_caps_issued_other(ci, cap); | 1157 | oissued = __ceph_caps_issued_other(ci, cap); |
| @@ -1170,7 +1170,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) | |||
| 1170 | __ceph_remove_cap(cap); | 1170 | __ceph_remove_cap(cap); |
| 1171 | } else { | 1171 | } else { |
| 1172 | /* try to drop referring dentries */ | 1172 | /* try to drop referring dentries */ |
| 1173 | spin_unlock(&inode->i_lock); | 1173 | spin_unlock(&ci->i_ceph_lock); |
| 1174 | d_prune_aliases(inode); | 1174 | d_prune_aliases(inode); |
| 1175 | dout("trim_caps_cb %p cap %p pruned, count now %d\n", | 1175 | dout("trim_caps_cb %p cap %p pruned, count now %d\n", |
| 1176 | inode, cap, atomic_read(&inode->i_count)); | 1176 | inode, cap, atomic_read(&inode->i_count)); |
| @@ -1178,7 +1178,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) | |||
| 1178 | } | 1178 | } |
| 1179 | 1179 | ||
| 1180 | out: | 1180 | out: |
| 1181 | spin_unlock(&inode->i_lock); | 1181 | spin_unlock(&ci->i_ceph_lock); |
| 1182 | return 0; | 1182 | return 0; |
| 1183 | } | 1183 | } |
| 1184 | 1184 | ||
| @@ -1296,7 +1296,7 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) | |||
| 1296 | i_flushing_item); | 1296 | i_flushing_item); |
| 1297 | struct inode *inode = &ci->vfs_inode; | 1297 | struct inode *inode = &ci->vfs_inode; |
| 1298 | 1298 | ||
| 1299 | spin_lock(&inode->i_lock); | 1299 | spin_lock(&ci->i_ceph_lock); |
| 1300 | if (ci->i_cap_flush_seq <= want_flush_seq) { | 1300 | if (ci->i_cap_flush_seq <= want_flush_seq) { |
| 1301 | dout("check_cap_flush still flushing %p " | 1301 | dout("check_cap_flush still flushing %p " |
| 1302 | "seq %lld <= %lld to mds%d\n", inode, | 1302 | "seq %lld <= %lld to mds%d\n", inode, |
| @@ -1304,7 +1304,7 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) | |||
| 1304 | session->s_mds); | 1304 | session->s_mds); |
| 1305 | ret = 0; | 1305 | ret = 0; |
| 1306 | } | 1306 | } |
| 1307 | spin_unlock(&inode->i_lock); | 1307 | spin_unlock(&ci->i_ceph_lock); |
| 1308 | } | 1308 | } |
| 1309 | mutex_unlock(&session->s_mutex); | 1309 | mutex_unlock(&session->s_mutex); |
| 1310 | ceph_put_mds_session(session); | 1310 | ceph_put_mds_session(session); |
| @@ -1495,6 +1495,7 @@ retry: | |||
| 1495 | pos, temp); | 1495 | pos, temp); |
| 1496 | } else if (stop_on_nosnap && inode && | 1496 | } else if (stop_on_nosnap && inode && |
| 1497 | ceph_snap(inode) == CEPH_NOSNAP) { | 1497 | ceph_snap(inode) == CEPH_NOSNAP) { |
| 1498 | spin_unlock(&temp->d_lock); | ||
| 1498 | break; | 1499 | break; |
| 1499 | } else { | 1500 | } else { |
| 1500 | pos -= temp->d_name.len; | 1501 | pos -= temp->d_name.len; |
| @@ -2011,10 +2012,10 @@ void ceph_invalidate_dir_request(struct ceph_mds_request *req) | |||
| 2011 | struct ceph_inode_info *ci = ceph_inode(inode); | 2012 | struct ceph_inode_info *ci = ceph_inode(inode); |
| 2012 | 2013 | ||
| 2013 | dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode); | 2014 | dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode); |
| 2014 | spin_lock(&inode->i_lock); | 2015 | spin_lock(&ci->i_ceph_lock); |
| 2015 | ceph_dir_clear_complete(inode); | 2016 | ceph_dir_clear_complete(inode); |
| 2016 | ci->i_release_count++; | 2017 | ci->i_release_count++; |
| 2017 | spin_unlock(&inode->i_lock); | 2018 | spin_unlock(&ci->i_ceph_lock); |
| 2018 | 2019 | ||
| 2019 | if (req->r_dentry) | 2020 | if (req->r_dentry) |
| 2020 | ceph_invalidate_dentry_lease(req->r_dentry); | 2021 | ceph_invalidate_dentry_lease(req->r_dentry); |
| @@ -2422,7 +2423,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
| 2422 | if (err) | 2423 | if (err) |
| 2423 | goto out_free; | 2424 | goto out_free; |
| 2424 | 2425 | ||
| 2425 | spin_lock(&inode->i_lock); | 2426 | spin_lock(&ci->i_ceph_lock); |
| 2426 | cap->seq = 0; /* reset cap seq */ | 2427 | cap->seq = 0; /* reset cap seq */ |
| 2427 | cap->issue_seq = 0; /* and issue_seq */ | 2428 | cap->issue_seq = 0; /* and issue_seq */ |
| 2428 | 2429 | ||
| @@ -2445,7 +2446,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
| 2445 | rec.v1.pathbase = cpu_to_le64(pathbase); | 2446 | rec.v1.pathbase = cpu_to_le64(pathbase); |
| 2446 | reclen = sizeof(rec.v1); | 2447 | reclen = sizeof(rec.v1); |
| 2447 | } | 2448 | } |
| 2448 | spin_unlock(&inode->i_lock); | 2449 | spin_unlock(&ci->i_ceph_lock); |
| 2449 | 2450 | ||
| 2450 | if (recon_state->flock) { | 2451 | if (recon_state->flock) { |
| 2451 | int num_fcntl_locks, num_flock_locks; | 2452 | int num_fcntl_locks, num_flock_locks; |
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 4bb239921dbd..a50ca0e39475 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h | |||
| @@ -20,7 +20,7 @@ | |||
| 20 | * | 20 | * |
| 21 | * mdsc->snap_rwsem | 21 | * mdsc->snap_rwsem |
| 22 | * | 22 | * |
| 23 | * inode->i_lock | 23 | * ci->i_ceph_lock |
| 24 | * mdsc->snap_flush_lock | 24 | * mdsc->snap_flush_lock |
| 25 | * mdsc->cap_delay_lock | 25 | * mdsc->cap_delay_lock |
| 26 | * | 26 | * |
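The mds_client.h hunk above updates the documented lock ordering so i_ceph_lock takes the slot i_lock used to occupy, below mdsc->snap_rwsem and above snap_flush_lock/cap_delay_lock. A hedged sketch of the outer-to-inner order for the top two entries (hypothetical call site, not code from this patch):

	/* Ordering sketch only: per the comment, snap_rwsem is taken
	 * before, and released after, i_ceph_lock. */
	down_read(&mdsc->snap_rwsem);
	spin_lock(&ci->i_ceph_lock);
	/* ... per-inode cap/snap work ... */
	spin_unlock(&ci->i_ceph_lock);
	up_read(&mdsc->snap_rwsem);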
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index e26437191333..a559c80f127a 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c | |||
| @@ -446,7 +446,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) | |||
| 446 | return; | 446 | return; |
| 447 | } | 447 | } |
| 448 | 448 | ||
| 449 | spin_lock(&inode->i_lock); | 449 | spin_lock(&ci->i_ceph_lock); |
| 450 | used = __ceph_caps_used(ci); | 450 | used = __ceph_caps_used(ci); |
| 451 | dirty = __ceph_caps_dirty(ci); | 451 | dirty = __ceph_caps_dirty(ci); |
| 452 | 452 | ||
| @@ -528,7 +528,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) | |||
| 528 | kfree(capsnap); | 528 | kfree(capsnap); |
| 529 | } | 529 | } |
| 530 | 530 | ||
| 531 | spin_unlock(&inode->i_lock); | 531 | spin_unlock(&ci->i_ceph_lock); |
| 532 | } | 532 | } |
| 533 | 533 | ||
| 534 | /* | 534 | /* |
| @@ -537,7 +537,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) | |||
| 537 | * | 537 | * |
| 538 | * If capsnap can now be flushed, add to snap_flush list, and return 1. | 538 | * If capsnap can now be flushed, add to snap_flush list, and return 1. |
| 539 | * | 539 | * |
| 540 | * Caller must hold i_lock. | 540 | * Caller must hold i_ceph_lock. |
| 541 | */ | 541 | */ |
| 542 | int __ceph_finish_cap_snap(struct ceph_inode_info *ci, | 542 | int __ceph_finish_cap_snap(struct ceph_inode_info *ci, |
| 543 | struct ceph_cap_snap *capsnap) | 543 | struct ceph_cap_snap *capsnap) |
| @@ -739,9 +739,9 @@ static void flush_snaps(struct ceph_mds_client *mdsc) | |||
| 739 | inode = &ci->vfs_inode; | 739 | inode = &ci->vfs_inode; |
| 740 | ihold(inode); | 740 | ihold(inode); |
| 741 | spin_unlock(&mdsc->snap_flush_lock); | 741 | spin_unlock(&mdsc->snap_flush_lock); |
| 742 | spin_lock(&inode->i_lock); | 742 | spin_lock(&ci->i_ceph_lock); |
| 743 | __ceph_flush_snaps(ci, &session, 0); | 743 | __ceph_flush_snaps(ci, &session, 0); |
| 744 | spin_unlock(&inode->i_lock); | 744 | spin_unlock(&ci->i_ceph_lock); |
| 745 | iput(inode); | 745 | iput(inode); |
| 746 | spin_lock(&mdsc->snap_flush_lock); | 746 | spin_lock(&mdsc->snap_flush_lock); |
| 747 | } | 747 | } |
| @@ -847,7 +847,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, | |||
| 847 | continue; | 847 | continue; |
| 848 | ci = ceph_inode(inode); | 848 | ci = ceph_inode(inode); |
| 849 | 849 | ||
| 850 | spin_lock(&inode->i_lock); | 850 | spin_lock(&ci->i_ceph_lock); |
| 851 | if (!ci->i_snap_realm) | 851 | if (!ci->i_snap_realm) |
| 852 | goto skip_inode; | 852 | goto skip_inode; |
| 853 | /* | 853 | /* |
| @@ -876,7 +876,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, | |||
| 876 | oldrealm = ci->i_snap_realm; | 876 | oldrealm = ci->i_snap_realm; |
| 877 | ci->i_snap_realm = realm; | 877 | ci->i_snap_realm = realm; |
| 878 | spin_unlock(&realm->inodes_with_caps_lock); | 878 | spin_unlock(&realm->inodes_with_caps_lock); |
| 879 | spin_unlock(&inode->i_lock); | 879 | spin_unlock(&ci->i_ceph_lock); |
| 880 | 880 | ||
| 881 | ceph_get_snap_realm(mdsc, realm); | 881 | ceph_get_snap_realm(mdsc, realm); |
| 882 | ceph_put_snap_realm(mdsc, oldrealm); | 882 | ceph_put_snap_realm(mdsc, oldrealm); |
| @@ -885,7 +885,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, | |||
| 885 | continue; | 885 | continue; |
| 886 | 886 | ||
| 887 | skip_inode: | 887 | skip_inode: |
| 888 | spin_unlock(&inode->i_lock); | 888 | spin_unlock(&ci->i_ceph_lock); |
| 889 | iput(inode); | 889 | iput(inode); |
| 890 | } | 890 | } |
| 891 | 891 | ||
diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 8dc73a594a90..b48f15f101a0 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c | |||
| @@ -383,7 +383,7 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) | |||
| 383 | if (fsopt->rsize != CEPH_RSIZE_DEFAULT) | 383 | if (fsopt->rsize != CEPH_RSIZE_DEFAULT) |
| 384 | seq_printf(m, ",rsize=%d", fsopt->rsize); | 384 | seq_printf(m, ",rsize=%d", fsopt->rsize); |
| 385 | if (fsopt->rasize != CEPH_RASIZE_DEFAULT) | 385 | if (fsopt->rasize != CEPH_RASIZE_DEFAULT) |
| 386 | seq_printf(m, ",rasize=%d", fsopt->rsize); | 386 | seq_printf(m, ",rasize=%d", fsopt->rasize); |
| 387 | if (fsopt->congestion_kb != default_congestion_kb()) | 387 | if (fsopt->congestion_kb != default_congestion_kb()) |
| 388 | seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); | 388 | seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); |
| 389 | if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) | 389 | if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) |
diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 01bf189e08a9..edcbf3774a56 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h | |||
| @@ -220,7 +220,7 @@ struct ceph_dentry_info { | |||
| 220 | * The locking for D_COMPLETE is a bit odd: | 220 | * The locking for D_COMPLETE is a bit odd: |
| 221 | * - we can clear it at almost any time (see ceph_d_prune) | 221 | * - we can clear it at almost any time (see ceph_d_prune) |
| 222 | * - it is only meaningful if: | 222 | * - it is only meaningful if: |
| 223 | * - we hold dir inode i_lock | 223 | * - we hold dir inode i_ceph_lock |
| 224 | * - we hold dir FILE_SHARED caps | 224 | * - we hold dir FILE_SHARED caps |
| 225 | * - the dentry D_COMPLETE is set | 225 | * - the dentry D_COMPLETE is set |
| 226 | */ | 226 | */ |
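Per the rules above, D_COMPLETE may only be trusted while holding the directory's i_ceph_lock with FILE_SHARED caps issued; ceph_readdir() and ceph_lookup() in this series gate on exactly that. A condensed sketch with a hypothetical helper name:

	/* dir_is_complete() is hypothetical; the gating mirrors ceph_readdir(). */
	static bool dir_is_complete(struct inode *dir)
	{
		struct ceph_inode_info *ci = ceph_inode(dir);
		bool ok;

		spin_lock(&ci->i_ceph_lock);		/* rule: hold dir i_ceph_lock */
		ok = ceph_dir_test_complete(dir) &&	/* rule: D_COMPLETE is set */
		     __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); /* rule: caps */
		spin_unlock(&ci->i_ceph_lock);
		return ok;
	}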
| @@ -250,6 +250,8 @@ struct ceph_inode_xattrs_info { | |||
| 250 | struct ceph_inode_info { | 250 | struct ceph_inode_info { |
| 251 | struct ceph_vino i_vino; /* ceph ino + snap */ | 251 | struct ceph_vino i_vino; /* ceph ino + snap */ |
| 252 | 252 | ||
| 253 | spinlock_t i_ceph_lock; | ||
| 254 | |||
| 253 | u64 i_version; | 255 | u64 i_version; |
| 254 | u32 i_time_warp_seq; | 256 | u32 i_time_warp_seq; |
| 255 | 257 | ||
| @@ -271,7 +273,7 @@ struct ceph_inode_info { | |||
| 271 | 273 | ||
| 272 | struct ceph_inode_xattrs_info i_xattrs; | 274 | struct ceph_inode_xattrs_info i_xattrs; |
| 273 | 275 | ||
| 274 | /* capabilities. protected _both_ by i_lock and cap->session's | 276 | /* capabilities. protected _both_ by i_ceph_lock and cap->session's |
| 275 | * s_mutex. */ | 277 | * s_mutex. */ |
| 276 | struct rb_root i_caps; /* cap list */ | 278 | struct rb_root i_caps; /* cap list */ |
| 277 | struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ | 279 | struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ |
| @@ -437,18 +439,18 @@ static inline void ceph_i_clear(struct inode *inode, unsigned mask) | |||
| 437 | { | 439 | { |
| 438 | struct ceph_inode_info *ci = ceph_inode(inode); | 440 | struct ceph_inode_info *ci = ceph_inode(inode); |
| 439 | 441 | ||
| 440 | spin_lock(&inode->i_lock); | 442 | spin_lock(&ci->i_ceph_lock); |
| 441 | ci->i_ceph_flags &= ~mask; | 443 | ci->i_ceph_flags &= ~mask; |
| 442 | spin_unlock(&inode->i_lock); | 444 | spin_unlock(&ci->i_ceph_lock); |
| 443 | } | 445 | } |
| 444 | 446 | ||
| 445 | static inline void ceph_i_set(struct inode *inode, unsigned mask) | 447 | static inline void ceph_i_set(struct inode *inode, unsigned mask) |
| 446 | { | 448 | { |
| 447 | struct ceph_inode_info *ci = ceph_inode(inode); | 449 | struct ceph_inode_info *ci = ceph_inode(inode); |
| 448 | 450 | ||
| 449 | spin_lock(&inode->i_lock); | 451 | spin_lock(&ci->i_ceph_lock); |
| 450 | ci->i_ceph_flags |= mask; | 452 | ci->i_ceph_flags |= mask; |
| 451 | spin_unlock(&inode->i_lock); | 453 | spin_unlock(&ci->i_ceph_lock); |
| 452 | } | 454 | } |
| 453 | 455 | ||
| 454 | static inline bool ceph_i_test(struct inode *inode, unsigned mask) | 456 | static inline bool ceph_i_test(struct inode *inode, unsigned mask) |
| @@ -456,9 +458,9 @@ static inline bool ceph_i_test(struct inode *inode, unsigned mask) | |||
| 456 | struct ceph_inode_info *ci = ceph_inode(inode); | 458 | struct ceph_inode_info *ci = ceph_inode(inode); |
| 457 | bool r; | 459 | bool r; |
| 458 | 460 | ||
| 459 | spin_lock(&inode->i_lock); | 461 | spin_lock(&ci->i_ceph_lock); |
| 460 | r = (ci->i_ceph_flags & mask) == mask; | 462 | r = (ci->i_ceph_flags & mask) == mask; |
| 461 | spin_unlock(&inode->i_lock); | 463 | spin_unlock(&ci->i_ceph_lock); |
| 462 | return r; | 464 | return r; |
| 463 | } | 465 | } |
| 464 | 466 | ||
| @@ -508,9 +510,9 @@ extern int __ceph_caps_issued_other(struct ceph_inode_info *ci, | |||
| 508 | static inline int ceph_caps_issued(struct ceph_inode_info *ci) | 510 | static inline int ceph_caps_issued(struct ceph_inode_info *ci) |
| 509 | { | 511 | { |
| 510 | int issued; | 512 | int issued; |
| 511 | spin_lock(&ci->vfs_inode.i_lock); | 513 | spin_lock(&ci->i_ceph_lock); |
| 512 | issued = __ceph_caps_issued(ci, NULL); | 514 | issued = __ceph_caps_issued(ci, NULL); |
| 513 | spin_unlock(&ci->vfs_inode.i_lock); | 515 | spin_unlock(&ci->i_ceph_lock); |
| 514 | return issued; | 516 | return issued; |
| 515 | } | 517 | } |
| 516 | 518 | ||
| @@ -518,9 +520,9 @@ static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, | |||
| 518 | int touch) | 520 | int touch) |
| 519 | { | 521 | { |
| 520 | int r; | 522 | int r; |
| 521 | spin_lock(&ci->vfs_inode.i_lock); | 523 | spin_lock(&ci->i_ceph_lock); |
| 522 | r = __ceph_caps_issued_mask(ci, mask, touch); | 524 | r = __ceph_caps_issued_mask(ci, mask, touch); |
| 523 | spin_unlock(&ci->vfs_inode.i_lock); | 525 | spin_unlock(&ci->i_ceph_lock); |
| 524 | return r; | 526 | return r; |
| 525 | } | 527 | } |
| 526 | 528 | ||
| @@ -743,10 +745,9 @@ extern int ceph_add_cap(struct inode *inode, | |||
| 743 | extern void __ceph_remove_cap(struct ceph_cap *cap); | 745 | extern void __ceph_remove_cap(struct ceph_cap *cap); |
| 744 | static inline void ceph_remove_cap(struct ceph_cap *cap) | 746 | static inline void ceph_remove_cap(struct ceph_cap *cap) |
| 745 | { | 747 | { |
| 746 | struct inode *inode = &cap->ci->vfs_inode; | 748 | spin_lock(&cap->ci->i_ceph_lock); |
| 747 | spin_lock(&inode->i_lock); | ||
| 748 | __ceph_remove_cap(cap); | 749 | __ceph_remove_cap(cap); |
| 749 | spin_unlock(&inode->i_lock); | 750 | spin_unlock(&cap->ci->i_ceph_lock); |
| 750 | } | 751 | } |
| 751 | extern void ceph_put_cap(struct ceph_mds_client *mdsc, | 752 | extern void ceph_put_cap(struct ceph_mds_client *mdsc, |
| 752 | struct ceph_cap *cap); | 753 | struct ceph_cap *cap); |
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 96c6739a0280..a5e36e4488a7 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c | |||
| @@ -343,8 +343,8 @@ void __ceph_destroy_xattrs(struct ceph_inode_info *ci) | |||
| 343 | } | 343 | } |
| 344 | 344 | ||
| 345 | static int __build_xattrs(struct inode *inode) | 345 | static int __build_xattrs(struct inode *inode) |
| 346 | __releases(inode->i_lock) | 346 | __releases(ci->i_ceph_lock) |
| 347 | __acquires(inode->i_lock) | 347 | __acquires(ci->i_ceph_lock) |
| 348 | { | 348 | { |
| 349 | u32 namelen; | 349 | u32 namelen; |
| 350 | u32 numattr = 0; | 350 | u32 numattr = 0; |
| @@ -372,7 +372,7 @@ start: | |||
| 372 | end = p + ci->i_xattrs.blob->vec.iov_len; | 372 | end = p + ci->i_xattrs.blob->vec.iov_len; |
| 373 | ceph_decode_32_safe(&p, end, numattr, bad); | 373 | ceph_decode_32_safe(&p, end, numattr, bad); |
| 374 | xattr_version = ci->i_xattrs.version; | 374 | xattr_version = ci->i_xattrs.version; |
| 375 | spin_unlock(&inode->i_lock); | 375 | spin_unlock(&ci->i_ceph_lock); |
| 376 | 376 | ||
| 377 | xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *), | 377 | xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *), |
| 378 | GFP_NOFS); | 378 | GFP_NOFS); |
| @@ -387,7 +387,7 @@ start: | |||
| 387 | goto bad_lock; | 387 | goto bad_lock; |
| 388 | } | 388 | } |
| 389 | 389 | ||
| 390 | spin_lock(&inode->i_lock); | 390 | spin_lock(&ci->i_ceph_lock); |
| 391 | if (ci->i_xattrs.version != xattr_version) { | 391 | if (ci->i_xattrs.version != xattr_version) { |
| 392 | /* lost a race, retry */ | 392 | /* lost a race, retry */ |
| 393 | for (i = 0; i < numattr; i++) | 393 | for (i = 0; i < numattr; i++) |
| @@ -418,7 +418,7 @@ start: | |||
| 418 | 418 | ||
| 419 | return err; | 419 | return err; |
| 420 | bad_lock: | 420 | bad_lock: |
| 421 | spin_lock(&inode->i_lock); | 421 | spin_lock(&ci->i_ceph_lock); |
| 422 | bad: | 422 | bad: |
| 423 | if (xattrs) { | 423 | if (xattrs) { |
| 424 | for (i = 0; i < numattr; i++) | 424 | for (i = 0; i < numattr; i++) |
| @@ -512,7 +512,7 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, | |||
| 512 | if (vxattrs) | 512 | if (vxattrs) |
| 513 | vxattr = ceph_match_vxattr(vxattrs, name); | 513 | vxattr = ceph_match_vxattr(vxattrs, name); |
| 514 | 514 | ||
| 515 | spin_lock(&inode->i_lock); | 515 | spin_lock(&ci->i_ceph_lock); |
| 516 | dout("getxattr %p ver=%lld index_ver=%lld\n", inode, | 516 | dout("getxattr %p ver=%lld index_ver=%lld\n", inode, |
| 517 | ci->i_xattrs.version, ci->i_xattrs.index_version); | 517 | ci->i_xattrs.version, ci->i_xattrs.index_version); |
| 518 | 518 | ||
| @@ -520,14 +520,14 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, | |||
| 520 | (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { | 520 | (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { |
| 521 | goto get_xattr; | 521 | goto get_xattr; |
| 522 | } else { | 522 | } else { |
| 523 | spin_unlock(&inode->i_lock); | 523 | spin_unlock(&ci->i_ceph_lock); |
| 524 | /* get xattrs from mds (if we don't already have them) */ | 524 | /* get xattrs from mds (if we don't already have them) */ |
| 525 | err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); | 525 | err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); |
| 526 | if (err) | 526 | if (err) |
| 527 | return err; | 527 | return err; |
| 528 | } | 528 | } |
| 529 | 529 | ||
| 530 | spin_lock(&inode->i_lock); | 530 | spin_lock(&ci->i_ceph_lock); |
| 531 | 531 | ||
| 532 | if (vxattr && vxattr->readonly) { | 532 | if (vxattr && vxattr->readonly) { |
| 533 | err = vxattr->getxattr_cb(ci, value, size); | 533 | err = vxattr->getxattr_cb(ci, value, size); |
| @@ -558,7 +558,7 @@ get_xattr: | |||
| 558 | memcpy(value, xattr->val, xattr->val_len); | 558 | memcpy(value, xattr->val, xattr->val_len); |
| 559 | 559 | ||
| 560 | out: | 560 | out: |
| 561 | spin_unlock(&inode->i_lock); | 561 | spin_unlock(&ci->i_ceph_lock); |
| 562 | return err; | 562 | return err; |
| 563 | } | 563 | } |
| 564 | 564 | ||
| @@ -573,7 +573,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) | |||
| 573 | u32 len; | 573 | u32 len; |
| 574 | int i; | 574 | int i; |
| 575 | 575 | ||
| 576 | spin_lock(&inode->i_lock); | 576 | spin_lock(&ci->i_ceph_lock); |
| 577 | dout("listxattr %p ver=%lld index_ver=%lld\n", inode, | 577 | dout("listxattr %p ver=%lld index_ver=%lld\n", inode, |
| 578 | ci->i_xattrs.version, ci->i_xattrs.index_version); | 578 | ci->i_xattrs.version, ci->i_xattrs.index_version); |
| 579 | 579 | ||
| @@ -581,13 +581,13 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) | |||
| 581 | (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { | 581 | (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { |
| 582 | goto list_xattr; | 582 | goto list_xattr; |
| 583 | } else { | 583 | } else { |
| 584 | spin_unlock(&inode->i_lock); | 584 | spin_unlock(&ci->i_ceph_lock); |
| 585 | err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); | 585 | err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); |
| 586 | if (err) | 586 | if (err) |
| 587 | return err; | 587 | return err; |
| 588 | } | 588 | } |
| 589 | 589 | ||
| 590 | spin_lock(&inode->i_lock); | 590 | spin_lock(&ci->i_ceph_lock); |
| 591 | 591 | ||
| 592 | err = __build_xattrs(inode); | 592 | err = __build_xattrs(inode); |
| 593 | if (err < 0) | 593 | if (err < 0) |
| @@ -619,7 +619,7 @@ list_xattr: | |||
| 619 | } | 619 | } |
| 620 | 620 | ||
| 621 | out: | 621 | out: |
| 622 | spin_unlock(&inode->i_lock); | 622 | spin_unlock(&ci->i_ceph_lock); |
| 623 | return err; | 623 | return err; |
| 624 | } | 624 | } |
| 625 | 625 | ||
| @@ -739,7 +739,7 @@ int ceph_setxattr(struct dentry *dentry, const char *name, | |||
| 739 | if (!xattr) | 739 | if (!xattr) |
| 740 | goto out; | 740 | goto out; |
| 741 | 741 | ||
| 742 | spin_lock(&inode->i_lock); | 742 | spin_lock(&ci->i_ceph_lock); |
| 743 | retry: | 743 | retry: |
| 744 | issued = __ceph_caps_issued(ci, NULL); | 744 | issued = __ceph_caps_issued(ci, NULL); |
| 745 | if (!(issued & CEPH_CAP_XATTR_EXCL)) | 745 | if (!(issued & CEPH_CAP_XATTR_EXCL)) |
| @@ -752,12 +752,12 @@ retry: | |||
| 752 | required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { | 752 | required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { |
| 753 | struct ceph_buffer *blob = NULL; | 753 | struct ceph_buffer *blob = NULL; |
| 754 | 754 | ||
| 755 | spin_unlock(&inode->i_lock); | 755 | spin_unlock(&ci->i_ceph_lock); |
| 756 | dout(" preaallocating new blob size=%d\n", required_blob_size); | 756 | dout(" preaallocating new blob size=%d\n", required_blob_size); |
| 757 | blob = ceph_buffer_new(required_blob_size, GFP_NOFS); | 757 | blob = ceph_buffer_new(required_blob_size, GFP_NOFS); |
| 758 | if (!blob) | 758 | if (!blob) |
| 759 | goto out; | 759 | goto out; |
| 760 | spin_lock(&inode->i_lock); | 760 | spin_lock(&ci->i_ceph_lock); |
| 761 | if (ci->i_xattrs.prealloc_blob) | 761 | if (ci->i_xattrs.prealloc_blob) |
| 762 | ceph_buffer_put(ci->i_xattrs.prealloc_blob); | 762 | ceph_buffer_put(ci->i_xattrs.prealloc_blob); |
| 763 | ci->i_xattrs.prealloc_blob = blob; | 763 | ci->i_xattrs.prealloc_blob = blob; |
| @@ -770,13 +770,13 @@ retry: | |||
| 770 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); | 770 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); |
| 771 | ci->i_xattrs.dirty = true; | 771 | ci->i_xattrs.dirty = true; |
| 772 | inode->i_ctime = CURRENT_TIME; | 772 | inode->i_ctime = CURRENT_TIME; |
| 773 | spin_unlock(&inode->i_lock); | 773 | spin_unlock(&ci->i_ceph_lock); |
| 774 | if (dirty) | 774 | if (dirty) |
| 775 | __mark_inode_dirty(inode, dirty); | 775 | __mark_inode_dirty(inode, dirty); |
| 776 | return err; | 776 | return err; |
| 777 | 777 | ||
| 778 | do_sync: | 778 | do_sync: |
| 779 | spin_unlock(&inode->i_lock); | 779 | spin_unlock(&ci->i_ceph_lock); |
| 780 | err = ceph_sync_setxattr(dentry, name, value, size, flags); | 780 | err = ceph_sync_setxattr(dentry, name, value, size, flags); |
| 781 | out: | 781 | out: |
| 782 | kfree(newname); | 782 | kfree(newname); |
| @@ -833,7 +833,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name) | |||
| 833 | return -EOPNOTSUPP; | 833 | return -EOPNOTSUPP; |
| 834 | } | 834 | } |
| 835 | 835 | ||
| 836 | spin_lock(&inode->i_lock); | 836 | spin_lock(&ci->i_ceph_lock); |
| 837 | __build_xattrs(inode); | 837 | __build_xattrs(inode); |
| 838 | issued = __ceph_caps_issued(ci, NULL); | 838 | issued = __ceph_caps_issued(ci, NULL); |
| 839 | dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); | 839 | dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); |
| @@ -846,12 +846,12 @@ int ceph_removexattr(struct dentry *dentry, const char *name) | |||
| 846 | ci->i_xattrs.dirty = true; | 846 | ci->i_xattrs.dirty = true; |
| 847 | inode->i_ctime = CURRENT_TIME; | 847 | inode->i_ctime = CURRENT_TIME; |
| 848 | 848 | ||
| 849 | spin_unlock(&inode->i_lock); | 849 | spin_unlock(&ci->i_ceph_lock); |
| 850 | if (dirty) | 850 | if (dirty) |
| 851 | __mark_inode_dirty(inode, dirty); | 851 | __mark_inode_dirty(inode, dirty); |
| 852 | return err; | 852 | return err; |
| 853 | do_sync: | 853 | do_sync: |
| 854 | spin_unlock(&inode->i_lock); | 854 | spin_unlock(&ci->i_ceph_lock); |
| 855 | err = ceph_send_removexattr(dentry, name); | 855 | err = ceph_send_removexattr(dentry, name); |
| 856 | return err; | 856 | return err; |
| 857 | } | 857 | } |
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d6a972df0338..8cd4b52d4217 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c | |||
| @@ -441,6 +441,8 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, | |||
| 441 | smb_msg.msg_controllen = 0; | 441 | smb_msg.msg_controllen = 0; |
| 442 | 442 | ||
| 443 | for (total_read = 0; to_read; total_read += length, to_read -= length) { | 443 | for (total_read = 0; to_read; total_read += length, to_read -= length) { |
| 444 | try_to_freeze(); | ||
| 445 | |||
| 444 | if (server_unresponsive(server)) { | 446 | if (server_unresponsive(server)) { |
| 445 | total_read = -EAGAIN; | 447 | total_read = -EAGAIN; |
| 446 | break; | 448 | break; |
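The cifs change above is a single call: cifs_readv_from_socket() now invokes try_to_freeze() at the top of its read loop, so the demultiplex thread parks itself when a suspend/hibernate freeze is requested instead of stalling the freezer while blocked on the socket. Shape of the resulting loop, condensed from the hunk:

	for (total_read = 0; to_read; total_read += length, to_read -= length) {
		try_to_freeze();	/* cooperate with suspend/hibernate */

		if (server_unresponsive(server)) {
			total_read = -EAGAIN;
			break;
		}
		/* ... receive up to to_read bytes into iov, set length ... */
	}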
diff --git a/fs/cifs/file.c b/fs/cifs/file.c index cf0b1539b321..4dd9283885e7 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c | |||
| @@ -702,6 +702,13 @@ cifs_find_lock_conflict(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock, | |||
| 702 | lock->type, lock->netfid, conf_lock); | 702 | lock->type, lock->netfid, conf_lock); |
| 703 | } | 703 | } |
| 704 | 704 | ||
| 705 | /* | ||
| 706 | * Check if there is another lock that prevents us from setting the lock | ||
| 707 | * (mandatory style). If such a lock exists, update the flock structure with | ||
| 708 | * its properties. Otherwise, set the flock type to F_UNLCK if we can cache | ||
| 709 | * brlocks, or leave it unchanged if we can't. Returns 0 if no request to | ||
| 710 | * the server is needed, or 1 otherwise. | ||
| 711 | */ | ||
| 705 | static int | 712 | static int |
| 706 | cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, | 713 | cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, |
| 707 | __u8 type, __u16 netfid, struct file_lock *flock) | 714 | __u8 type, __u16 netfid, struct file_lock *flock) |
| @@ -739,6 +746,12 @@ cifs_lock_add(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock) | |||
| 739 | mutex_unlock(&cinode->lock_mutex); | 746 | mutex_unlock(&cinode->lock_mutex); |
| 740 | } | 747 | } |
| 741 | 748 | ||
| 749 | /* | ||
| 750 | * Set the byte-range lock (mandatory style). Returns: | ||
| 751 | * 1) 0, if we set the lock and don't need to send a request to the server; | ||
| 752 | * 2) 1, if no locks prevent us but we need to send a request to the server; | ||
| 753 | * 3) -EACCES, if there is a lock that prevents us and wait is false. | ||
| 754 | */ | ||
| 742 | static int | 755 | static int |
| 743 | cifs_lock_add_if(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock, | 756 | cifs_lock_add_if(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock, |
| 744 | bool wait) | 757 | bool wait) |
| @@ -778,6 +791,13 @@ try_again: | |||
| 778 | return rc; | 791 | return rc; |
| 779 | } | 792 | } |
| 780 | 793 | ||
| 794 | /* | ||
| 795 | * Check if there is another lock that prevents us from setting the lock | ||
| 796 | * (posix style). If such a lock exists, update the flock structure with | ||
| 797 | * its properties. Otherwise, set the flock type to F_UNLCK if we can cache | ||
| 798 | * brlocks, or leave it unchanged if we can't. Returns 0 if no request to | ||
| 799 | * the server is needed, or 1 otherwise. | ||
| 800 | */ | ||
| 781 | static int | 801 | static int |
| 782 | cifs_posix_lock_test(struct file *file, struct file_lock *flock) | 802 | cifs_posix_lock_test(struct file *file, struct file_lock *flock) |
| 783 | { | 803 | { |
| @@ -800,6 +820,12 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock) | |||
| 800 | return rc; | 820 | return rc; |
| 801 | } | 821 | } |
| 802 | 822 | ||
| 823 | /* | ||
| 824 | * Set the byte-range lock (posix style). Returns: | ||
| 825 | * 1) 0, if we set the lock and don't need to ask the server; | ||
| 826 | * 2) 1, if we need to ask the server; | ||
| 827 | * 3) <0, if an error occurred while setting the lock. | ||
| 828 | */ | ||
| 803 | static int | 829 | static int |
| 804 | cifs_posix_lock_set(struct file *file, struct file_lock *flock) | 830 | cifs_posix_lock_set(struct file *file, struct file_lock *flock) |
| 805 | { | 831 | { |
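Taken together, the comments added above pin down one contract for all four lock helpers: 0 means the request was resolved from the local lock cache, 1 means the server must still be asked, and a negative value is a hard error. A hedged sketch of a caller consuming that contract (control flow only, names as in the hunks above):

    rc = cifs_posix_lock_set(file, flock);
    if (rc < 0)
            return rc;      /* error while setting the lock locally */
    if (rc == 0)
            return 0;       /* satisfied from the local lock cache */
    /* rc == 1: only now do we pay for a round trip to the server */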
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 5de03ec20144..a090bbe6ee29 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c | |||
| @@ -554,7 +554,10 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon, | |||
| 554 | rc); | 554 | rc); |
| 555 | return rc; | 555 | return rc; |
| 556 | } | 556 | } |
| 557 | cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); | 557 | /* FindFirst/Next set last_entry to NULL on malformed reply */ |
| 558 | if (cifsFile->srch_inf.last_entry) | ||
| 559 | cifs_save_resume_key(cifsFile->srch_inf.last_entry, | ||
| 560 | cifsFile); | ||
| 558 | } | 561 | } |
| 559 | 562 | ||
| 560 | while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && | 563 | while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && |
| @@ -562,7 +565,10 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon, | |||
| 562 | cFYI(1, "calling findnext2"); | 565 | cFYI(1, "calling findnext2"); |
| 563 | rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, | 566 | rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, |
| 564 | &cifsFile->srch_inf); | 567 | &cifsFile->srch_inf); |
| 565 | cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); | 568 | /* FindFirst/Next set last_entry to NULL on malformed reply */ |
| 569 | if (cifsFile->srch_inf.last_entry) | ||
| 570 | cifs_save_resume_key(cifsFile->srch_inf.last_entry, | ||
| 571 | cifsFile); | ||
| 566 | if (rc) | 572 | if (rc) |
| 567 | return -ENOENT; | 573 | return -ENOENT; |
| 568 | } | 574 | } |
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 7cacba12b8f1..80d850881938 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c | |||
| @@ -209,7 +209,7 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16, | |||
| 209 | { | 209 | { |
| 210 | int rc; | 210 | int rc; |
| 211 | int len; | 211 | int len; |
| 212 | __u16 wpwd[129]; | 212 | __le16 wpwd[129]; |
| 213 | 213 | ||
| 214 | /* Password cannot be longer than 128 characters */ | 214 | /* Password cannot be longer than 128 characters */ |
| 215 | if (passwd) /* Password must be converted to NT unicode */ | 215 | if (passwd) /* Password must be converted to NT unicode */ |
| @@ -219,8 +219,8 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16, | |||
| 219 | *wpwd = 0; /* Ensure string is null terminated */ | 219 | *wpwd = 0; /* Ensure string is null terminated */ |
| 220 | } | 220 | } |
| 221 | 221 | ||
| 222 | rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__u16)); | 222 | rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__le16)); |
| 223 | memset(wpwd, 0, 129 * sizeof(__u16)); | 223 | memset(wpwd, 0, 129 * sizeof(__le16)); |
| 224 | 224 | ||
| 225 | return rc; | 225 | return rc; |
| 226 | } | 226 | } |
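The __u16 -> __le16 change is more than annotation on big-endian hosts: the NT hash is defined over the little-endian UTF-16 bytes of the password, so the buffer must hold wire-order values. A sketch of the invariant, assuming an ASCII-only password for simplicity (the real code goes through a full charset conversion):

    __le16 wpwd[129];
    size_t i, len = min(strlen(passwd), (size_t)128);

    for (i = 0; i < len; i++)
            wpwd[i] = cpu_to_le16((__u16)passwd[i]); /* host -> LE16 */
    wpwd[i] = 0;                                     /* NUL terminate */
    rc = mdfour(p16, (unsigned char *)wpwd, len * sizeof(__le16));
    memset(wpwd, 0, sizeof(wpwd));                   /* scrub the secret */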
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index ca418aaf6352..9d8715c45f25 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c | |||
| @@ -292,7 +292,7 @@ int __init configfs_inode_init(void) | |||
| 292 | return bdi_init(&configfs_backing_dev_info); | 292 | return bdi_init(&configfs_backing_dev_info); |
| 293 | } | 293 | } |
| 294 | 294 | ||
| 295 | void __exit configfs_inode_exit(void) | 295 | void configfs_inode_exit(void) |
| 296 | { | 296 | { |
| 297 | bdi_destroy(&configfs_backing_dev_info); | 297 | bdi_destroy(&configfs_backing_dev_info); |
| 298 | } | 298 | } |
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index ecc62178beda..276e15cafd58 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c | |||
| @@ -143,28 +143,26 @@ static int __init configfs_init(void) | |||
| 143 | goto out; | 143 | goto out; |
| 144 | 144 | ||
| 145 | config_kobj = kobject_create_and_add("config", kernel_kobj); | 145 | config_kobj = kobject_create_and_add("config", kernel_kobj); |
| 146 | if (!config_kobj) { | 146 | if (!config_kobj) |
| 147 | kmem_cache_destroy(configfs_dir_cachep); | 147 | goto out2; |
| 148 | configfs_dir_cachep = NULL; | 148 | |
| 149 | goto out; | 149 | err = configfs_inode_init(); |
| 150 | } | 150 | if (err) |
| 151 | goto out3; | ||
| 151 | 152 | ||
| 152 | err = register_filesystem(&configfs_fs_type); | 153 | err = register_filesystem(&configfs_fs_type); |
| 153 | if (err) { | 154 | if (err) |
| 154 | printk(KERN_ERR "configfs: Unable to register filesystem!\n"); | 155 | goto out4; |
| 155 | kobject_put(config_kobj); | ||
| 156 | kmem_cache_destroy(configfs_dir_cachep); | ||
| 157 | configfs_dir_cachep = NULL; | ||
| 158 | goto out; | ||
| 159 | } | ||
| 160 | 156 | ||
| 161 | err = configfs_inode_init(); | 157 | return 0; |
| 162 | if (err) { | 158 | out4: |
| 163 | unregister_filesystem(&configfs_fs_type); | 159 | printk(KERN_ERR "configfs: Unable to register filesystem!\n"); |
| 164 | kobject_put(config_kobj); | 160 | configfs_inode_exit(); |
| 165 | kmem_cache_destroy(configfs_dir_cachep); | 161 | out3: |
| 166 | configfs_dir_cachep = NULL; | 162 | kobject_put(config_kobj); |
| 167 | } | 163 | out2: |
| 164 | kmem_cache_destroy(configfs_dir_cachep); | ||
| 165 | configfs_dir_cachep = NULL; | ||
| 168 | out: | 166 | out: |
| 169 | return err; | 167 | return err; |
| 170 | } | 168 | } |
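The rewritten configfs_init() is the standard kernel unwind idiom: acquire resources in order and, on failure, jump to a label that releases everything acquired so far, in reverse order, through a single return. A generic sketch of the shape (setup_*/teardown_* are illustrative names, not configfs functions):

    static int __init example_init(void)
    {
            int err;

            err = setup_a();
            if (err)
                    goto out;
            err = setup_b();
            if (err)
                    goto out_a;
            err = setup_c();
            if (err)
                    goto out_b;
            return 0;
    out_b:
            teardown_b();   /* undo in reverse order of setup */
    out_a:
            teardown_a();
    out:
            return err;
    }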
diff --git a/fs/dcache.c b/fs/dcache.c index 10ba92def3f6..89509b5a090e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c | |||
| @@ -2439,16 +2439,14 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name) | |||
| 2439 | /** | 2439 | /** |
| 2440 | * prepend_path - Prepend path string to a buffer | 2440 | * prepend_path - Prepend path string to a buffer |
| 2441 | * @path: the dentry/vfsmount to report | 2441 | * @path: the dentry/vfsmount to report |
| 2442 | * @root: root vfsmnt/dentry (may be modified by this function) | 2442 | * @root: root vfsmnt/dentry |
| 2443 | * @buffer: pointer to the end of the buffer | 2443 | * @buffer: pointer to the end of the buffer |
| 2444 | * @buflen: pointer to buffer length | 2444 | * @buflen: pointer to buffer length |
| 2445 | * | 2445 | * |
| 2446 | * Caller holds the rename_lock. | 2446 | * Caller holds the rename_lock. |
| 2447 | * | ||
| 2448 | * If path is not reachable from the supplied root, then the value of | ||
| 2449 | * root is changed (without modifying refcounts). | ||
| 2450 | */ | 2447 | */ |
| 2451 | static int prepend_path(const struct path *path, struct path *root, | 2448 | static int prepend_path(const struct path *path, |
| 2449 | const struct path *root, | ||
| 2452 | char **buffer, int *buflen) | 2450 | char **buffer, int *buflen) |
| 2453 | { | 2451 | { |
| 2454 | struct dentry *dentry = path->dentry; | 2452 | struct dentry *dentry = path->dentry; |
| @@ -2483,10 +2481,10 @@ static int prepend_path(const struct path *path, struct path *root, | |||
| 2483 | dentry = parent; | 2481 | dentry = parent; |
| 2484 | } | 2482 | } |
| 2485 | 2483 | ||
| 2486 | out: | ||
| 2487 | if (!error && !slash) | 2484 | if (!error && !slash) |
| 2488 | error = prepend(buffer, buflen, "/", 1); | 2485 | error = prepend(buffer, buflen, "/", 1); |
| 2489 | 2486 | ||
| 2487 | out: | ||
| 2490 | br_read_unlock(vfsmount_lock); | 2488 | br_read_unlock(vfsmount_lock); |
| 2491 | return error; | 2489 | return error; |
| 2492 | 2490 | ||
| @@ -2500,15 +2498,17 @@ global_root: | |||
| 2500 | WARN(1, "Root dentry has weird name <%.*s>\n", | 2498 | WARN(1, "Root dentry has weird name <%.*s>\n", |
| 2501 | (int) dentry->d_name.len, dentry->d_name.name); | 2499 | (int) dentry->d_name.len, dentry->d_name.name); |
| 2502 | } | 2500 | } |
| 2503 | root->mnt = vfsmnt; | 2501 | if (!slash) |
| 2504 | root->dentry = dentry; | 2502 | error = prepend(buffer, buflen, "/", 1); |
| 2503 | if (!error) | ||
| 2504 | error = vfsmnt->mnt_ns ? 1 : 2; | ||
| 2505 | goto out; | 2505 | goto out; |
| 2506 | } | 2506 | } |
| 2507 | 2507 | ||
| 2508 | /** | 2508 | /** |
| 2509 | * __d_path - return the path of a dentry | 2509 | * __d_path - return the path of a dentry |
| 2510 | * @path: the dentry/vfsmount to report | 2510 | * @path: the dentry/vfsmount to report |
| 2511 | * @root: root vfsmnt/dentry (may be modified by this function) | 2511 | * @root: root vfsmnt/dentry |
| 2512 | * @buf: buffer to return value in | 2512 | * @buf: buffer to return value in |
| 2513 | * @buflen: buffer length | 2513 | * @buflen: buffer length |
| 2514 | * | 2514 | * |
| @@ -2519,10 +2519,10 @@ global_root: | |||
| 2519 | * | 2519 | * |
| 2520 | * "buflen" should be positive. | 2520 | * "buflen" should be positive. |
| 2521 | * | 2521 | * |
| 2522 | * If path is not reachable from the supplied root, then the value of | 2522 | * If the path is not reachable from the supplied root, return %NULL. |
| 2523 | * root is changed (without modifying refcounts). | ||
| 2524 | */ | 2523 | */ |
| 2525 | char *__d_path(const struct path *path, struct path *root, | 2524 | char *__d_path(const struct path *path, |
| 2525 | const struct path *root, | ||
| 2526 | char *buf, int buflen) | 2526 | char *buf, int buflen) |
| 2527 | { | 2527 | { |
| 2528 | char *res = buf + buflen; | 2528 | char *res = buf + buflen; |
| @@ -2533,7 +2533,28 @@ char *__d_path(const struct path *path, struct path *root, | |||
| 2533 | error = prepend_path(path, root, &res, &buflen); | 2533 | error = prepend_path(path, root, &res, &buflen); |
| 2534 | write_sequnlock(&rename_lock); | 2534 | write_sequnlock(&rename_lock); |
| 2535 | 2535 | ||
| 2536 | if (error) | 2536 | if (error < 0) |
| 2537 | return ERR_PTR(error); | ||
| 2538 | if (error > 0) | ||
| 2539 | return NULL; | ||
| 2540 | return res; | ||
| 2541 | } | ||
| 2542 | |||
| 2543 | char *d_absolute_path(const struct path *path, | ||
| 2544 | char *buf, int buflen) | ||
| 2545 | { | ||
| 2546 | struct path root = {}; | ||
| 2547 | char *res = buf + buflen; | ||
| 2548 | int error; | ||
| 2549 | |||
| 2550 | prepend(&res, &buflen, "\0", 1); | ||
| 2551 | write_seqlock(&rename_lock); | ||
| 2552 | error = prepend_path(path, &root, &res, &buflen); | ||
| 2553 | write_sequnlock(&rename_lock); | ||
| 2554 | |||
| 2555 | if (error > 1) | ||
| 2556 | error = -EINVAL; | ||
| 2557 | if (error < 0) | ||
| 2537 | return ERR_PTR(error); | 2558 | return ERR_PTR(error); |
| 2538 | return res; | 2559 | return res; |
| 2539 | } | 2560 | } |
| @@ -2541,8 +2562,9 @@ char *__d_path(const struct path *path, struct path *root, | |||
| 2541 | /* | 2562 | /* |
| 2542 | * same as __d_path but appends "(deleted)" for unlinked files. | 2563 | * same as __d_path but appends "(deleted)" for unlinked files. |
| 2543 | */ | 2564 | */ |
| 2544 | static int path_with_deleted(const struct path *path, struct path *root, | 2565 | static int path_with_deleted(const struct path *path, |
| 2545 | char **buf, int *buflen) | 2566 | const struct path *root, |
| 2567 | char **buf, int *buflen) | ||
| 2546 | { | 2568 | { |
| 2547 | prepend(buf, buflen, "\0", 1); | 2569 | prepend(buf, buflen, "\0", 1); |
| 2548 | if (d_unlinked(path->dentry)) { | 2570 | if (d_unlinked(path->dentry)) { |
| @@ -2579,7 +2601,6 @@ char *d_path(const struct path *path, char *buf, int buflen) | |||
| 2579 | { | 2601 | { |
| 2580 | char *res = buf + buflen; | 2602 | char *res = buf + buflen; |
| 2581 | struct path root; | 2603 | struct path root; |
| 2582 | struct path tmp; | ||
| 2583 | int error; | 2604 | int error; |
| 2584 | 2605 | ||
| 2585 | /* | 2606 | /* |
| @@ -2594,9 +2615,8 @@ char *d_path(const struct path *path, char *buf, int buflen) | |||
| 2594 | 2615 | ||
| 2595 | get_fs_root(current->fs, &root); | 2616 | get_fs_root(current->fs, &root); |
| 2596 | write_seqlock(&rename_lock); | 2617 | write_seqlock(&rename_lock); |
| 2597 | tmp = root; | 2618 | error = path_with_deleted(path, &root, &res, &buflen); |
| 2598 | error = path_with_deleted(path, &tmp, &res, &buflen); | 2619 | if (error < 0) |
| 2599 | if (error) | ||
| 2600 | res = ERR_PTR(error); | 2620 | res = ERR_PTR(error); |
| 2601 | write_sequnlock(&rename_lock); | 2621 | write_sequnlock(&rename_lock); |
| 2602 | path_put(&root); | 2622 | path_put(&root); |
| @@ -2617,7 +2637,6 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen) | |||
| 2617 | { | 2637 | { |
| 2618 | char *res = buf + buflen; | 2638 | char *res = buf + buflen; |
| 2619 | struct path root; | 2639 | struct path root; |
| 2620 | struct path tmp; | ||
| 2621 | int error; | 2640 | int error; |
| 2622 | 2641 | ||
| 2623 | if (path->dentry->d_op && path->dentry->d_op->d_dname) | 2642 | if (path->dentry->d_op && path->dentry->d_op->d_dname) |
| @@ -2625,9 +2644,8 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen) | |||
| 2625 | 2644 | ||
| 2626 | get_fs_root(current->fs, &root); | 2645 | get_fs_root(current->fs, &root); |
| 2627 | write_seqlock(&rename_lock); | 2646 | write_seqlock(&rename_lock); |
| 2628 | tmp = root; | 2647 | error = path_with_deleted(path, &root, &res, &buflen); |
| 2629 | error = path_with_deleted(path, &tmp, &res, &buflen); | 2648 | if (error > 0) |
| 2630 | if (!error && !path_equal(&tmp, &root)) | ||
| 2631 | error = prepend_unreachable(&res, &buflen); | 2649 | error = prepend_unreachable(&res, &buflen); |
| 2632 | write_sequnlock(&rename_lock); | 2650 | write_sequnlock(&rename_lock); |
| 2633 | path_put(&root); | 2651 | path_put(&root); |
| @@ -2758,19 +2776,18 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) | |||
| 2758 | write_seqlock(&rename_lock); | 2776 | write_seqlock(&rename_lock); |
| 2759 | if (!d_unlinked(pwd.dentry)) { | 2777 | if (!d_unlinked(pwd.dentry)) { |
| 2760 | unsigned long len; | 2778 | unsigned long len; |
| 2761 | struct path tmp = root; | ||
| 2762 | char *cwd = page + PAGE_SIZE; | 2779 | char *cwd = page + PAGE_SIZE; |
| 2763 | int buflen = PAGE_SIZE; | 2780 | int buflen = PAGE_SIZE; |
| 2764 | 2781 | ||
| 2765 | prepend(&cwd, &buflen, "\0", 1); | 2782 | prepend(&cwd, &buflen, "\0", 1); |
| 2766 | error = prepend_path(&pwd, &tmp, &cwd, &buflen); | 2783 | error = prepend_path(&pwd, &root, &cwd, &buflen); |
| 2767 | write_sequnlock(&rename_lock); | 2784 | write_sequnlock(&rename_lock); |
| 2768 | 2785 | ||
| 2769 | if (error) | 2786 | if (error < 0) |
| 2770 | goto out; | 2787 | goto out; |
| 2771 | 2788 | ||
| 2772 | /* Unreachable from current root */ | 2789 | /* Unreachable from current root */ |
| 2773 | if (!path_equal(&tmp, &root)) { | 2790 | if (error > 0) { |
| 2774 | error = prepend_unreachable(&cwd, &buflen); | 2791 | error = prepend_unreachable(&cwd, &buflen); |
| 2775 | if (error) | 2792 | if (error) |
| 2776 | goto out; | 2793 | goto out; |
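After this change __d_path() no longer mutates its root argument and instead has three distinct outcomes: an ERR_PTR() for hard errors, NULL when the path is not reachable from the given root, and a pointer into the buffer on success (the new d_absolute_path() additionally maps the no-global-root case to -EINVAL). A sketch of the calling convention, with the unreachable-path policy left to the caller:

    char *p = __d_path(&path, &root, buf, buflen);

    if (IS_ERR(p))
            return PTR_ERR(p);      /* e.g. -ENAMETOOLONG */
    if (!p)
            return -EPERM;          /* unreachable; policy is caller's choice */
    /* p points into buf at the start of the rendered path */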
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 61fa9e1614af..607b1557d292 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c | |||
| @@ -1095,7 +1095,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, | |||
| 1095 | le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), | 1095 | le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), |
| 1096 | ext4_idx_pblock(EXT_FIRST_INDEX(neh))); | 1096 | ext4_idx_pblock(EXT_FIRST_INDEX(neh))); |
| 1097 | 1097 | ||
| 1098 | neh->eh_depth = cpu_to_le16(neh->eh_depth + 1); | 1098 | neh->eh_depth = cpu_to_le16(le16_to_cpu(neh->eh_depth) + 1); |
| 1099 | ext4_mark_inode_dirty(handle, inode); | 1099 | ext4_mark_inode_dirty(handle, inode); |
| 1100 | out: | 1100 | out: |
| 1101 | brelse(bh); | 1101 | brelse(bh); |
| @@ -2955,7 +2955,6 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, | |||
| 2955 | /* Pre-conditions */ | 2955 | /* Pre-conditions */ |
| 2956 | BUG_ON(!ext4_ext_is_uninitialized(ex)); | 2956 | BUG_ON(!ext4_ext_is_uninitialized(ex)); |
| 2957 | BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); | 2957 | BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); |
| 2958 | BUG_ON(map->m_lblk + map->m_len > ee_block + ee_len); | ||
| 2959 | 2958 | ||
| 2960 | /* | 2959 | /* |
| 2961 | * Attempt to transfer newly initialized blocks from the currently | 2960 | * Attempt to transfer newly initialized blocks from the currently |
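The eh_depth hunk fixes a classic on-disk endianness bug: eh_depth is a little-endian field, so doing host-order arithmetic on it directly only happens to work on little-endian CPUs. The two forms side by side:

    /* buggy on big-endian: treats an LE16 value as host order */
    neh->eh_depth = cpu_to_le16(neh->eh_depth + 1);

    /* correct everywhere: LE -> cpu, increment, cpu -> LE */
    neh->eh_depth = cpu_to_le16(le16_to_cpu(neh->eh_depth) + 1);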
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 848f436df29f..92655fd89657 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
| @@ -1339,8 +1339,11 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, | |||
| 1339 | clear_buffer_unwritten(bh); | 1339 | clear_buffer_unwritten(bh); |
| 1340 | } | 1340 | } |
| 1341 | 1341 | ||
| 1342 | /* skip page if block allocation undone */ | 1342 | /* |
| 1343 | if (buffer_delay(bh) || buffer_unwritten(bh)) | 1343 | * skip page if block allocation undone and |
| 1344 | * block is dirty | ||
| 1345 | */ | ||
| 1346 | if (ext4_bh_delay_or_unwritten(NULL, bh)) | ||
| 1344 | skip_page = 1; | 1347 | skip_page = 1; |
| 1345 | bh = bh->b_this_page; | 1348 | bh = bh->b_this_page; |
| 1346 | block_start += bh->b_size; | 1349 | block_start += bh->b_size; |
| @@ -2387,7 +2390,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, | |||
| 2387 | pgoff_t index; | 2390 | pgoff_t index; |
| 2388 | struct inode *inode = mapping->host; | 2391 | struct inode *inode = mapping->host; |
| 2389 | handle_t *handle; | 2392 | handle_t *handle; |
| 2390 | loff_t page_len; | ||
| 2391 | 2393 | ||
| 2392 | index = pos >> PAGE_CACHE_SHIFT; | 2394 | index = pos >> PAGE_CACHE_SHIFT; |
| 2393 | 2395 | ||
| @@ -2434,13 +2436,6 @@ retry: | |||
| 2434 | */ | 2436 | */ |
| 2435 | if (pos + len > inode->i_size) | 2437 | if (pos + len > inode->i_size) |
| 2436 | ext4_truncate_failed_write(inode); | 2438 | ext4_truncate_failed_write(inode); |
| 2437 | } else { | ||
| 2438 | page_len = pos & (PAGE_CACHE_SIZE - 1); | ||
| 2439 | if (page_len > 0) { | ||
| 2440 | ret = ext4_discard_partial_page_buffers_no_lock(handle, | ||
| 2441 | inode, page, pos - page_len, page_len, | ||
| 2442 | EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED); | ||
| 2443 | } | ||
| 2444 | } | 2439 | } |
| 2445 | 2440 | ||
| 2446 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | 2441 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) |
| @@ -2483,7 +2478,6 @@ static int ext4_da_write_end(struct file *file, | |||
| 2483 | loff_t new_i_size; | 2478 | loff_t new_i_size; |
| 2484 | unsigned long start, end; | 2479 | unsigned long start, end; |
| 2485 | int write_mode = (int)(unsigned long)fsdata; | 2480 | int write_mode = (int)(unsigned long)fsdata; |
| 2486 | loff_t page_len; | ||
| 2487 | 2481 | ||
| 2488 | if (write_mode == FALL_BACK_TO_NONDELALLOC) { | 2482 | if (write_mode == FALL_BACK_TO_NONDELALLOC) { |
| 2489 | if (ext4_should_order_data(inode)) { | 2483 | if (ext4_should_order_data(inode)) { |
| @@ -2508,7 +2502,7 @@ static int ext4_da_write_end(struct file *file, | |||
| 2508 | */ | 2502 | */ |
| 2509 | 2503 | ||
| 2510 | new_i_size = pos + copied; | 2504 | new_i_size = pos + copied; |
| 2511 | if (new_i_size > EXT4_I(inode)->i_disksize) { | 2505 | if (copied && new_i_size > EXT4_I(inode)->i_disksize) { |
| 2512 | if (ext4_da_should_update_i_disksize(page, end)) { | 2506 | if (ext4_da_should_update_i_disksize(page, end)) { |
| 2513 | down_write(&EXT4_I(inode)->i_data_sem); | 2507 | down_write(&EXT4_I(inode)->i_data_sem); |
| 2514 | if (new_i_size > EXT4_I(inode)->i_disksize) { | 2508 | if (new_i_size > EXT4_I(inode)->i_disksize) { |
| @@ -2532,16 +2526,6 @@ static int ext4_da_write_end(struct file *file, | |||
| 2532 | } | 2526 | } |
| 2533 | ret2 = generic_write_end(file, mapping, pos, len, copied, | 2527 | ret2 = generic_write_end(file, mapping, pos, len, copied, |
| 2534 | page, fsdata); | 2528 | page, fsdata); |
| 2535 | |||
| 2536 | page_len = PAGE_CACHE_SIZE - | ||
| 2537 | ((pos + copied - 1) & (PAGE_CACHE_SIZE - 1)); | ||
| 2538 | |||
| 2539 | if (page_len > 0) { | ||
| 2540 | ret = ext4_discard_partial_page_buffers_no_lock(handle, | ||
| 2541 | inode, page, pos + copied - 1, page_len, | ||
| 2542 | EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED); | ||
| 2543 | } | ||
| 2544 | |||
| 2545 | copied = ret2; | 2529 | copied = ret2; |
| 2546 | if (ret2 < 0) | 2530 | if (ret2 < 0) |
| 2547 | ret = ret2; | 2531 | ret = ret2; |
| @@ -2781,10 +2765,11 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | |||
| 2781 | iocb->private, io_end->inode->i_ino, iocb, offset, | 2765 | iocb->private, io_end->inode->i_ino, iocb, offset, |
| 2782 | size); | 2766 | size); |
| 2783 | 2767 | ||
| 2768 | iocb->private = NULL; | ||
| 2769 | |||
| 2784 | /* if not aio dio with unwritten extents, just free io and return */ | 2770 | /* if not aio dio with unwritten extents, just free io and return */ |
| 2785 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { | 2771 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { |
| 2786 | ext4_free_io_end(io_end); | 2772 | ext4_free_io_end(io_end); |
| 2787 | iocb->private = NULL; | ||
| 2788 | out: | 2773 | out: |
| 2789 | if (is_async) | 2774 | if (is_async) |
| 2790 | aio_complete(iocb, ret, 0); | 2775 | aio_complete(iocb, ret, 0); |
| @@ -2807,7 +2792,6 @@ out: | |||
| 2807 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); | 2792 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); |
| 2808 | 2793 | ||
| 2809 | /* queue the work to convert unwritten extents to written */ | 2794 | /* queue the work to convert unwritten extents to written */ |
| 2810 | iocb->private = NULL; | ||
| 2811 | queue_work(wq, &io_end->work); | 2795 | queue_work(wq, &io_end->work); |
| 2812 | 2796 | ||
| 2813 | /* XXX: probably should move into the real I/O completion handler */ | 2797 | /* XXX: probably should move into the real I/O completion handler */ |
| @@ -3203,26 +3187,8 @@ int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, | |||
| 3203 | 3187 | ||
| 3204 | iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); | 3188 | iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); |
| 3205 | 3189 | ||
| 3206 | if (!page_has_buffers(page)) { | 3190 | if (!page_has_buffers(page)) |
| 3207 | /* | 3191 | create_empty_buffers(page, blocksize, 0); |
| 3208 | * If the range to be discarded covers a partial block | ||
| 3209 | * we need to get the page buffers. This is because | ||
| 3210 | * partial blocks cannot be released and the page needs | ||
| 3211 | * to be updated with the contents of the block before | ||
| 3212 | * we write the zeros on top of it. | ||
| 3213 | */ | ||
| 3214 | if ((from & (blocksize - 1)) || | ||
| 3215 | ((from + length) & (blocksize - 1))) { | ||
| 3216 | create_empty_buffers(page, blocksize, 0); | ||
| 3217 | } else { | ||
| 3218 | /* | ||
| 3219 | * If there are no partial blocks, | ||
| 3220 | * there is nothing to update, | ||
| 3221 | * so we can return now | ||
| 3222 | */ | ||
| 3223 | return 0; | ||
| 3224 | } | ||
| 3225 | } | ||
| 3226 | 3192 | ||
| 3227 | /* Find the buffer that contains "offset" */ | 3193 | /* Find the buffer that contains "offset" */ |
| 3228 | bh = page_buffers(page); | 3194 | bh = page_buffers(page); |
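Hoisting iocb->private = NULL to the top of ext4_end_io_dio() enforces a detach-before-complete rule: once the iocb may be completed (and thus reused), it must no longer point at an io_end that is about to be freed or handed to the workqueue. A sketch of the rule as the hunk leaves it:

    ext4_io_end_t *io_end = iocb->private;

    iocb->private = NULL;               /* detach before any completion path */
    if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
            ext4_free_io_end(io_end);   /* the iocb can no longer reach it */
            if (is_async)
                    aio_complete(iocb, ret, 0);
            return;
    }
    queue_work(wq, &io_end->work);      /* conversion now owns io_end */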
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 7ce1d0b19c94..7e106c810c62 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c | |||
| @@ -385,6 +385,18 @@ int ext4_bio_write_page(struct ext4_io_submit *io, | |||
| 385 | 385 | ||
| 386 | block_end = block_start + blocksize; | 386 | block_end = block_start + blocksize; |
| 387 | if (block_start >= len) { | 387 | if (block_start >= len) { |
| 388 | /* | ||
| 389 | * Comments copied from block_write_full_page_endio: | ||
| 390 | * | ||
| 391 | * The page straddles i_size. It must be zeroed out on | ||
| 392 | * each and every writepage invocation because it may | ||
| 393 | * be mmapped. "A file is mapped in multiples of the | ||
| 394 | * page size. For a file that is not a multiple of | ||
| 395 | * the page size, the remaining memory is zeroed when | ||
| 396 | * mapped, and writes to that region are not written | ||
| 397 | * out to the file." | ||
| 398 | */ | ||
| 399 | zero_user_segment(page, block_start, block_end); | ||
| 388 | clear_buffer_dirty(bh); | 400 | clear_buffer_dirty(bh); |
| 389 | set_buffer_uptodate(bh); | 401 | set_buffer_uptodate(bh); |
| 390 | continue; | 402 | continue; |
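The zeroing added above handles a page that straddles i_size: mmap guarantees the tail of the last page reads back as zeroes, so before that page is written out any bytes beyond EOF must be cleared, or stale user data could reach the disk. The guard in context (len is the number of valid bytes in the page):

    if (block_start >= len) {
            /* buffer lies entirely beyond EOF: zero it, keep it clean */
            zero_user_segment(page, block_start, block_end);
            clear_buffer_dirty(bh);
            set_buffer_uptodate(bh);
            continue;
    }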
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3858767ec672..3e1329e2f826 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
| @@ -1155,9 +1155,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
| 1155 | seq_puts(seq, ",block_validity"); | 1155 | seq_puts(seq, ",block_validity"); |
| 1156 | 1156 | ||
| 1157 | if (!test_opt(sb, INIT_INODE_TABLE)) | 1157 | if (!test_opt(sb, INIT_INODE_TABLE)) |
| 1158 | seq_puts(seq, ",noinit_inode_table"); | 1158 | seq_puts(seq, ",noinit_itable"); |
| 1159 | else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT) | 1159 | else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT) |
| 1160 | seq_printf(seq, ",init_inode_table=%u", | 1160 | seq_printf(seq, ",init_itable=%u", |
| 1161 | (unsigned) sbi->s_li_wait_mult); | 1161 | (unsigned) sbi->s_li_wait_mult); |
| 1162 | 1162 | ||
| 1163 | ext4_show_quota_options(seq, sb); | 1163 | ext4_show_quota_options(seq, sb); |
| @@ -1333,8 +1333,7 @@ enum { | |||
| 1333 | Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, | 1333 | Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, |
| 1334 | Opt_inode_readahead_blks, Opt_journal_ioprio, | 1334 | Opt_inode_readahead_blks, Opt_journal_ioprio, |
| 1335 | Opt_dioread_nolock, Opt_dioread_lock, | 1335 | Opt_dioread_nolock, Opt_dioread_lock, |
| 1336 | Opt_discard, Opt_nodiscard, | 1336 | Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, |
| 1337 | Opt_init_inode_table, Opt_noinit_inode_table, | ||
| 1338 | }; | 1337 | }; |
| 1339 | 1338 | ||
| 1340 | static const match_table_t tokens = { | 1339 | static const match_table_t tokens = { |
| @@ -1407,9 +1406,9 @@ static const match_table_t tokens = { | |||
| 1407 | {Opt_dioread_lock, "dioread_lock"}, | 1406 | {Opt_dioread_lock, "dioread_lock"}, |
| 1408 | {Opt_discard, "discard"}, | 1407 | {Opt_discard, "discard"}, |
| 1409 | {Opt_nodiscard, "nodiscard"}, | 1408 | {Opt_nodiscard, "nodiscard"}, |
| 1410 | {Opt_init_inode_table, "init_itable=%u"}, | 1409 | {Opt_init_itable, "init_itable=%u"}, |
| 1411 | {Opt_init_inode_table, "init_itable"}, | 1410 | {Opt_init_itable, "init_itable"}, |
| 1412 | {Opt_noinit_inode_table, "noinit_itable"}, | 1411 | {Opt_noinit_itable, "noinit_itable"}, |
| 1413 | {Opt_err, NULL}, | 1412 | {Opt_err, NULL}, |
| 1414 | }; | 1413 | }; |
| 1415 | 1414 | ||
| @@ -1892,7 +1891,7 @@ set_qf_format: | |||
| 1892 | case Opt_dioread_lock: | 1891 | case Opt_dioread_lock: |
| 1893 | clear_opt(sb, DIOREAD_NOLOCK); | 1892 | clear_opt(sb, DIOREAD_NOLOCK); |
| 1894 | break; | 1893 | break; |
| 1895 | case Opt_init_inode_table: | 1894 | case Opt_init_itable: |
| 1896 | set_opt(sb, INIT_INODE_TABLE); | 1895 | set_opt(sb, INIT_INODE_TABLE); |
| 1897 | if (args[0].from) { | 1896 | if (args[0].from) { |
| 1898 | if (match_int(&args[0], &option)) | 1897 | if (match_int(&args[0], &option)) |
| @@ -1903,7 +1902,7 @@ set_qf_format: | |||
| 1903 | return 0; | 1902 | return 0; |
| 1904 | sbi->s_li_wait_mult = option; | 1903 | sbi->s_li_wait_mult = option; |
| 1905 | break; | 1904 | break; |
| 1906 | case Opt_noinit_inode_table: | 1905 | case Opt_noinit_itable: |
| 1907 | clear_opt(sb, INIT_INODE_TABLE); | 1906 | clear_opt(sb, INIT_INODE_TABLE); |
| 1908 | break; | 1907 | break; |
| 1909 | default: | 1908 | default: |
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 73c3992b2bb4..ac86f8b3e3cb 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
| @@ -156,6 +156,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, | |||
| 156 | * bdi_start_writeback - start writeback | 156 | * bdi_start_writeback - start writeback |
| 157 | * @bdi: the backing device to write from | 157 | * @bdi: the backing device to write from |
| 158 | * @nr_pages: the number of pages to write | 158 | * @nr_pages: the number of pages to write |
| 159 | * @reason: reason why some writeback work was initiated | ||
| 159 | * | 160 | * |
| 160 | * Description: | 161 | * Description: |
| 161 | * This does WB_SYNC_NONE opportunistic writeback. The IO is only | 162 | * This does WB_SYNC_NONE opportunistic writeback. The IO is only |
| @@ -1223,6 +1224,7 @@ static void wait_sb_inodes(struct super_block *sb) | |||
| 1223 | * writeback_inodes_sb_nr - writeback dirty inodes from given super_block | 1224 | * writeback_inodes_sb_nr - writeback dirty inodes from given super_block |
| 1224 | * @sb: the superblock | 1225 | * @sb: the superblock |
| 1225 | * @nr: the number of pages to write | 1226 | * @nr: the number of pages to write |
| 1227 | * @reason: reason why some writeback work initiated | ||
| 1226 | * | 1228 | * |
| 1227 | * Start writeback on some inodes on this super_block. No guarantees are made | 1229 | * Start writeback on some inodes on this super_block. No guarantees are made |
| 1228 | * on how many (if any) will be written, and this function does not wait | 1230 | * on how many (if any) will be written, and this function does not wait |
| @@ -1251,6 +1253,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr); | |||
| 1251 | /** | 1253 | /** |
| 1252 | * writeback_inodes_sb - writeback dirty inodes from given super_block | 1254 | * writeback_inodes_sb - writeback dirty inodes from given super_block |
| 1253 | * @sb: the superblock | 1255 | * @sb: the superblock |
| 1256 | * @reason: reason why some writeback work was initiated | ||
| 1254 | * | 1257 | * |
| 1255 | * Start writeback on some inodes on this super_block. No guarantees are made | 1258 | * Start writeback on some inodes on this super_block. No guarantees are made |
| 1256 | * on how many (if any) will be written, and this function does not wait | 1259 | * on how many (if any) will be written, and this function does not wait |
| @@ -1265,6 +1268,7 @@ EXPORT_SYMBOL(writeback_inodes_sb); | |||
| 1265 | /** | 1268 | /** |
| 1266 | * writeback_inodes_sb_if_idle - start writeback if none underway | 1269 | * writeback_inodes_sb_if_idle - start writeback if none underway |
| 1267 | * @sb: the superblock | 1270 | * @sb: the superblock |
| 1271 | * @reason: reason why some writeback work was initiated | ||
| 1268 | * | 1272 | * |
| 1269 | * Invoke writeback_inodes_sb if no writeback is currently underway. | 1273 | * Invoke writeback_inodes_sb if no writeback is currently underway. |
| 1270 | * Returns 1 if writeback was started, 0 if not. | 1274 | * Returns 1 if writeback was started, 0 if not. |
| @@ -1285,6 +1289,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle); | |||
| 1285 | * writeback_inodes_sb_if_idle - start writeback if none underway | 1289 | * writeback_inodes_sb_if_idle - start writeback if none underway |
| 1286 | * @sb: the superblock | 1290 | * @sb: the superblock |
| 1287 | * @nr: the number of pages to write | 1291 | * @nr: the number of pages to write |
| 1292 | * @reason: reason why some writeback work was initiated | ||
| 1288 | * | 1293 | * |
| 1289 | * Invoke writeback_inodes_sb if no writeback is currently underway. | 1294 | * Invoke writeback_inodes_sb if no writeback is currently underway. |
| 1290 | * Returns 1 if writeback was started, 0 if not. | 1295 | * Returns 1 if writeback was started, 0 if not. |
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 5cb8614508c3..2aaf3eaaf13d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c | |||
| @@ -1512,7 +1512,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, | |||
| 1512 | else if (outarg->offset + num > file_size) | 1512 | else if (outarg->offset + num > file_size) |
| 1513 | num = file_size - outarg->offset; | 1513 | num = file_size - outarg->offset; |
| 1514 | 1514 | ||
| 1515 | while (num) { | 1515 | while (num && req->num_pages < FUSE_MAX_PAGES_PER_REQ) { |
| 1516 | struct page *page; | 1516 | struct page *page; |
| 1517 | unsigned int this_num; | 1517 | unsigned int this_num; |
| 1518 | 1518 | ||
| @@ -1526,6 +1526,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, | |||
| 1526 | 1526 | ||
| 1527 | num -= this_num; | 1527 | num -= this_num; |
| 1528 | total_len += this_num; | 1528 | total_len += this_num; |
| 1529 | index++; | ||
| 1529 | } | 1530 | } |
| 1530 | req->misc.retrieve_in.offset = outarg->offset; | 1531 | req->misc.retrieve_in.offset = outarg->offset; |
| 1531 | req->misc.retrieve_in.size = total_len; | 1532 | req->misc.retrieve_in.size = total_len; |
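Both fuse_retrieve() fixes guard the same loop: stop once the request's page array is full, and advance the page index so each iteration pins the next page rather than page 0 forever. A simplified sketch of the corrected loop (the this_num computation is reduced to a whole page here):

    while (num && req->num_pages < FUSE_MAX_PAGES_PER_REQ) {
            struct page *page = find_get_page(mapping, index);
            unsigned int this_num;

            if (!page)
                    break;
            this_num = min_t(unsigned, num, PAGE_CACHE_SIZE); /* simplified */
            req->pages[req->num_pages++] = page;
            num -= this_num;
            total_len += this_num;
            index++;        /* without this, every pass re-reads page 0 */
    }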
diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 594f07a81c28..0c84100acd44 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c | |||
| @@ -1556,7 +1556,7 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) | |||
| 1556 | struct inode *inode = file->f_path.dentry->d_inode; | 1556 | struct inode *inode = file->f_path.dentry->d_inode; |
| 1557 | 1557 | ||
| 1558 | mutex_lock(&inode->i_mutex); | 1558 | mutex_lock(&inode->i_mutex); |
| 1559 | if (origin != SEEK_CUR || origin != SEEK_SET) { | 1559 | if (origin != SEEK_CUR && origin != SEEK_SET) { |
| 1560 | retval = fuse_update_attributes(inode, NULL, file, NULL); | 1560 | retval = fuse_update_attributes(inode, NULL, file, NULL); |
| 1561 | if (retval) | 1561 | if (retval) |
| 1562 | goto exit; | 1562 | goto exit; |
| @@ -1567,6 +1567,10 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) | |||
| 1567 | offset += i_size_read(inode); | 1567 | offset += i_size_read(inode); |
| 1568 | break; | 1568 | break; |
| 1569 | case SEEK_CUR: | 1569 | case SEEK_CUR: |
| 1570 | if (offset == 0) { | ||
| 1571 | retval = file->f_pos; | ||
| 1572 | goto exit; | ||
| 1573 | } | ||
| 1570 | offset += file->f_pos; | 1574 | offset += file->f_pos; |
| 1571 | break; | 1575 | break; |
| 1572 | case SEEK_DATA: | 1576 | case SEEK_DATA: |
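The llseek fix is a boolean-logic bug worth spelling out: since no origin can equal both SEEK_CUR and SEEK_SET at once, the || condition was true for every origin, so attributes were fetched even when the result could not depend on i_size. Only && expresses the intent (update_attributes() stands in for the fuse call):

    /* always true, for every possible origin: */
    if (origin != SEEK_CUR || origin != SEEK_SET)
            update_attributes();

    /* intended: only origins that need fresh i_size (e.g. SEEK_END) */
    if (origin != SEEK_CUR && origin != SEEK_SET)
            update_attributes();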
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 3e6d72756479..aa83109b9431 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c | |||
| @@ -1138,28 +1138,28 @@ static int __init fuse_fs_init(void) | |||
| 1138 | { | 1138 | { |
| 1139 | int err; | 1139 | int err; |
| 1140 | 1140 | ||
| 1141 | err = register_filesystem(&fuse_fs_type); | ||
| 1142 | if (err) | ||
| 1143 | goto out; | ||
| 1144 | |||
| 1145 | err = register_fuseblk(); | ||
| 1146 | if (err) | ||
| 1147 | goto out_unreg; | ||
| 1148 | |||
| 1149 | fuse_inode_cachep = kmem_cache_create("fuse_inode", | 1141 | fuse_inode_cachep = kmem_cache_create("fuse_inode", |
| 1150 | sizeof(struct fuse_inode), | 1142 | sizeof(struct fuse_inode), |
| 1151 | 0, SLAB_HWCACHE_ALIGN, | 1143 | 0, SLAB_HWCACHE_ALIGN, |
| 1152 | fuse_inode_init_once); | 1144 | fuse_inode_init_once); |
| 1153 | err = -ENOMEM; | 1145 | err = -ENOMEM; |
| 1154 | if (!fuse_inode_cachep) | 1146 | if (!fuse_inode_cachep) |
| 1155 | goto out_unreg2; | 1147 | goto out; |
| 1148 | |||
| 1149 | err = register_fuseblk(); | ||
| 1150 | if (err) | ||
| 1151 | goto out2; | ||
| 1152 | |||
| 1153 | err = register_filesystem(&fuse_fs_type); | ||
| 1154 | if (err) | ||
| 1155 | goto out3; | ||
| 1156 | 1156 | ||
| 1157 | return 0; | 1157 | return 0; |
| 1158 | 1158 | ||
| 1159 | out_unreg2: | 1159 | out3: |
| 1160 | unregister_fuseblk(); | 1160 | unregister_fuseblk(); |
| 1161 | out_unreg: | 1161 | out2: |
| 1162 | unregister_filesystem(&fuse_fs_type); | 1162 | kmem_cache_destroy(fuse_inode_cachep); |
| 1163 | out: | 1163 | out: |
| 1164 | return err; | 1164 | return err; |
| 1165 | } | 1165 | } |
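The fuse_fs_init() reorder follows the same rule as the configfs and ubifs hunks: register_filesystem() must come last, because the instant it succeeds a mount can arrive and expect every cache and secondary registration to exist; the unwind path then never has to unregister a live filesystem type. A condensed sketch (example_fs_type and friends are illustrative):

    cache = kmem_cache_create("example_inode", size, 0, flags, ctor);
    if (!cache)
            return -ENOMEM;
    err = register_filesystem(&example_fs_type);    /* last step */
    if (err) {
            kmem_cache_destroy(cache);              /* nothing else to undo */
            return err;
    }
    return 0;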
diff --git a/fs/namespace.c b/fs/namespace.c index 6d3a1963879b..cfc6d4448aa5 100644 --- a/fs/namespace.c +++ b/fs/namespace.c | |||
| @@ -1048,15 +1048,12 @@ static int show_mountinfo(struct seq_file *m, void *v) | |||
| 1048 | if (err) | 1048 | if (err) |
| 1049 | goto out; | 1049 | goto out; |
| 1050 | seq_putc(m, ' '); | 1050 | seq_putc(m, ' '); |
| 1051 | seq_path_root(m, &mnt_path, &root, " \t\n\\"); | 1051 | |
| 1052 | if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) { | 1052 | /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ |
| 1053 | /* | 1053 | err = seq_path_root(m, &mnt_path, &root, " \t\n\\"); |
| 1054 | * Mountpoint is outside root, discard that one. Ugly, | 1054 | if (err) |
| 1055 | * but less so than trying to do that in iterator in a | 1055 | goto out; |
| 1056 | * race-free way (due to renames). | 1056 | |
| 1057 | */ | ||
| 1058 | return SEQ_SKIP; | ||
| 1059 | } | ||
| 1060 | seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); | 1057 | seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); |
| 1061 | show_mnt_opts(m, mnt); | 1058 | show_mnt_opts(m, mnt); |
| 1062 | 1059 | ||
| @@ -2776,3 +2773,8 @@ void kern_unmount(struct vfsmount *mnt) | |||
| 2776 | } | 2773 | } |
| 2777 | } | 2774 | } |
| 2778 | EXPORT_SYMBOL(kern_unmount); | 2775 | EXPORT_SYMBOL(kern_unmount); |
| 2776 | |||
| 2777 | bool our_mnt(struct vfsmount *mnt) | ||
| 2778 | { | ||
| 2779 | return check_mnt(mnt); | ||
| 2780 | } | ||
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 5b5fa33b6b9d..cbd1a61c110a 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c | |||
| @@ -548,7 +548,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) | |||
| 548 | 548 | ||
| 549 | error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY); | 549 | error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY); |
| 550 | if (error) | 550 | if (error) |
| 551 | goto out_bdi; | 551 | goto out_fput; |
| 552 | 552 | ||
| 553 | server->ncp_filp = ncp_filp; | 553 | server->ncp_filp = ncp_filp; |
| 554 | server->ncp_sock = sock; | 554 | server->ncp_sock = sock; |
| @@ -559,7 +559,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) | |||
| 559 | error = -EBADF; | 559 | error = -EBADF; |
| 560 | server->info_filp = fget(data.info_fd); | 560 | server->info_filp = fget(data.info_fd); |
| 561 | if (!server->info_filp) | 561 | if (!server->info_filp) |
| 562 | goto out_fput; | 562 | goto out_bdi; |
| 563 | error = -ENOTSOCK; | 563 | error = -ENOTSOCK; |
| 564 | sock_inode = server->info_filp->f_path.dentry->d_inode; | 564 | sock_inode = server->info_filp->f_path.dentry->d_inode; |
| 565 | if (!S_ISSOCK(sock_inode->i_mode)) | 565 | if (!S_ISSOCK(sock_inode->i_mode)) |
| @@ -746,9 +746,9 @@ out_nls: | |||
| 746 | out_fput2: | 746 | out_fput2: |
| 747 | if (server->info_filp) | 747 | if (server->info_filp) |
| 748 | fput(server->info_filp); | 748 | fput(server->info_filp); |
| 749 | out_fput: | ||
| 750 | bdi_destroy(&server->bdi); | ||
| 751 | out_bdi: | 749 | out_bdi: |
| 750 | bdi_destroy(&server->bdi); | ||
| 751 | out_fput: | ||
| 752 | /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>: | 752 | /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>: |
| 753 | * | 753 | * |
| 754 | * The previously used put_filp(ncp_filp); was bogus, since | 754 | * The previously used put_filp(ncp_filp); was bogus, since |
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 586174168e2a..80e4645f7990 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c | |||
| @@ -131,12 +131,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
| 131 | K(i.freeswap), | 131 | K(i.freeswap), |
| 132 | K(global_page_state(NR_FILE_DIRTY)), | 132 | K(global_page_state(NR_FILE_DIRTY)), |
| 133 | K(global_page_state(NR_WRITEBACK)), | 133 | K(global_page_state(NR_WRITEBACK)), |
| 134 | K(global_page_state(NR_ANON_PAGES) | ||
| 135 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 134 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
| 135 | K(global_page_state(NR_ANON_PAGES) | ||
| 136 | + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * | 136 | + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * |
| 137 | HPAGE_PMD_NR | 137 | HPAGE_PMD_NR), |
| 138 | #else | ||
| 139 | K(global_page_state(NR_ANON_PAGES)), | ||
| 138 | #endif | 140 | #endif |
| 139 | ), | ||
| 140 | K(global_page_state(NR_FILE_MAPPED)), | 141 | K(global_page_state(NR_FILE_MAPPED)), |
| 141 | K(global_page_state(NR_SHMEM)), | 142 | K(global_page_state(NR_SHMEM)), |
| 142 | K(global_page_state(NR_SLAB_RECLAIMABLE) + | 143 | K(global_page_state(NR_SLAB_RECLAIMABLE) + |
diff --git a/fs/proc/root.c b/fs/proc/root.c index 9a8a2b77b874..03102d978180 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c | |||
| @@ -91,20 +91,18 @@ static struct file_system_type proc_fs_type = { | |||
| 91 | 91 | ||
| 92 | void __init proc_root_init(void) | 92 | void __init proc_root_init(void) |
| 93 | { | 93 | { |
| 94 | struct vfsmount *mnt; | ||
| 95 | int err; | 94 | int err; |
| 96 | 95 | ||
| 97 | proc_init_inodecache(); | 96 | proc_init_inodecache(); |
| 98 | err = register_filesystem(&proc_fs_type); | 97 | err = register_filesystem(&proc_fs_type); |
| 99 | if (err) | 98 | if (err) |
| 100 | return; | 99 | return; |
| 101 | mnt = kern_mount_data(&proc_fs_type, &init_pid_ns); | 100 | err = pid_ns_prepare_proc(&init_pid_ns); |
| 102 | if (IS_ERR(mnt)) { | 101 | if (err) { |
| 103 | unregister_filesystem(&proc_fs_type); | 102 | unregister_filesystem(&proc_fs_type); |
| 104 | return; | 103 | return; |
| 105 | } | 104 | } |
| 106 | 105 | ||
| 107 | init_pid_ns.proc_mnt = mnt; | ||
| 108 | proc_symlink("mounts", NULL, "self/mounts"); | 106 | proc_symlink("mounts", NULL, "self/mounts"); |
| 109 | 107 | ||
| 110 | proc_net_init(); | 108 | proc_net_init(); |
| @@ -209,5 +207,5 @@ int pid_ns_prepare_proc(struct pid_namespace *ns) | |||
| 209 | 207 | ||
| 210 | void pid_ns_release_proc(struct pid_namespace *ns) | 208 | void pid_ns_release_proc(struct pid_namespace *ns) |
| 211 | { | 209 | { |
| 212 | mntput(ns->proc_mnt); | 210 | kern_unmount(ns->proc_mnt); |
| 213 | } | 211 | } |
diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 42b274da92c3..2a30d67dd6b8 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c | |||
| @@ -32,7 +32,7 @@ static cputime64_t get_idle_time(int cpu) | |||
| 32 | idle = kstat_cpu(cpu).cpustat.idle; | 32 | idle = kstat_cpu(cpu).cpustat.idle; |
| 33 | idle = cputime64_add(idle, arch_idle_time(cpu)); | 33 | idle = cputime64_add(idle, arch_idle_time(cpu)); |
| 34 | } else | 34 | } else |
| 35 | idle = usecs_to_cputime(idle_time); | 35 | idle = nsecs_to_jiffies64(1000 * idle_time); |
| 36 | 36 | ||
| 37 | return idle; | 37 | return idle; |
| 38 | } | 38 | } |
| @@ -46,7 +46,7 @@ static cputime64_t get_iowait_time(int cpu) | |||
| 46 | /* !NO_HZ so we can rely on cpustat.iowait */ | 46 | /* !NO_HZ so we can rely on cpustat.iowait */ |
| 47 | iowait = kstat_cpu(cpu).cpustat.iowait; | 47 | iowait = kstat_cpu(cpu).cpustat.iowait; |
| 48 | else | 48 | else |
| 49 | iowait = usecs_to_cputime(iowait_time); | 49 | iowait = nsecs_to_jiffies64(1000 * iowait_time); |
| 50 | 50 | ||
| 51 | return iowait; | 51 | return iowait; |
| 52 | } | 52 | } |
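The conversion change avoids a 32-bit truncation: usecs_to_cputime() takes an unsigned long, so on 32-bit kernels a 64-bit microsecond count is narrowed and wraps after roughly 71 minutes of accumulated idle time, while nsecs_to_jiffies64() keeps the full width. A worked sketch of the failure and the fix:

    u64 idle_us = 5000000000ULL;    /* ~83 minutes of idle, in microseconds */

    /* 32-bit kernel: argument narrowed to unsigned long first,
     * (unsigned long)idle_us == 705032704 us, i.e. ~12 minutes -- wrong */

    /* full-width path: us -> ns -> 64-bit jiffies */
    u64 idle = nsecs_to_jiffies64(1000 * idle_us);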
diff --git a/fs/seq_file.c b/fs/seq_file.c index 05d6b0e78c95..dba43c3ea3af 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c | |||
| @@ -449,8 +449,6 @@ EXPORT_SYMBOL(seq_path); | |||
| 449 | 449 | ||
| 450 | /* | 450 | /* |
| 451 | * Same as seq_path, but relative to supplied root. | 451 | * Same as seq_path, but relative to supplied root. |
| 452 | * | ||
| 453 | * root may be changed, see __d_path(). | ||
| 454 | */ | 452 | */ |
| 455 | int seq_path_root(struct seq_file *m, struct path *path, struct path *root, | 453 | int seq_path_root(struct seq_file *m, struct path *path, struct path *root, |
| 456 | char *esc) | 454 | char *esc) |
| @@ -463,6 +461,8 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root, | |||
| 463 | char *p; | 461 | char *p; |
| 464 | 462 | ||
| 465 | p = __d_path(path, root, buf, size); | 463 | p = __d_path(path, root, buf, size); |
| 464 | if (!p) | ||
| 465 | return SEQ_SKIP; | ||
| 466 | res = PTR_ERR(p); | 466 | res = PTR_ERR(p); |
| 467 | if (!IS_ERR(p)) { | 467 | if (!IS_ERR(p)) { |
| 468 | char *end = mangle_path(buf, p, esc); | 468 | char *end = mangle_path(buf, p, esc); |
| @@ -474,7 +474,7 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root, | |||
| 474 | } | 474 | } |
| 475 | seq_commit(m, res); | 475 | seq_commit(m, res); |
| 476 | 476 | ||
| 477 | return res < 0 ? res : 0; | 477 | return res < 0 && res != -ENAMETOOLONG ? res : 0; |
| 478 | } | 478 | } |
| 479 | 479 | ||
| 480 | /* | 480 | /* |
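seq_path_root() now has the three-way contract that show_mountinfo() above depends on: SEQ_SKIP when __d_path() returns NULL (the path lies outside the supplied root), a real negative error for hard failures, and 0 on success; -ENAMETOOLONG is deliberately swallowed because seq_file's overflow handling retries with a bigger buffer. The caller side reduces to:

    err = seq_path_root(m, &mnt_path, &root, " \t\n\\");
    if (err)
            goto out;   /* SEQ_SKIP drops the record, <0 aborts the read */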
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 20403dc5d437..ae0e76bb6ebf 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c | |||
| @@ -2264,19 +2264,12 @@ static int __init ubifs_init(void) | |||
| 2264 | return -EINVAL; | 2264 | return -EINVAL; |
| 2265 | } | 2265 | } |
| 2266 | 2266 | ||
| 2267 | err = register_filesystem(&ubifs_fs_type); | ||
| 2268 | if (err) { | ||
| 2269 | ubifs_err("cannot register file system, error %d", err); | ||
| 2270 | return err; | ||
| 2271 | } | ||
| 2272 | |||
| 2273 | err = -ENOMEM; | ||
| 2274 | ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab", | 2267 | ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab", |
| 2275 | sizeof(struct ubifs_inode), 0, | 2268 | sizeof(struct ubifs_inode), 0, |
| 2276 | SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT, | 2269 | SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT, |
| 2277 | &inode_slab_ctor); | 2270 | &inode_slab_ctor); |
| 2278 | if (!ubifs_inode_slab) | 2271 | if (!ubifs_inode_slab) |
| 2279 | goto out_reg; | 2272 | return -ENOMEM; |
| 2280 | 2273 | ||
| 2281 | register_shrinker(&ubifs_shrinker_info); | 2274 | register_shrinker(&ubifs_shrinker_info); |
| 2282 | 2275 | ||
| @@ -2288,15 +2281,20 @@ static int __init ubifs_init(void) | |||
| 2288 | if (err) | 2281 | if (err) |
| 2289 | goto out_compr; | 2282 | goto out_compr; |
| 2290 | 2283 | ||
| 2284 | err = register_filesystem(&ubifs_fs_type); | ||
| 2285 | if (err) { | ||
| 2286 | ubifs_err("cannot register file system, error %d", err); | ||
| 2287 | goto out_dbg; | ||
| 2288 | } | ||
| 2291 | return 0; | 2289 | return 0; |
| 2292 | 2290 | ||
| 2291 | out_dbg: | ||
| 2292 | dbg_debugfs_exit(); | ||
| 2293 | out_compr: | 2293 | out_compr: |
| 2294 | ubifs_compressors_exit(); | 2294 | ubifs_compressors_exit(); |
| 2295 | out_shrinker: | 2295 | out_shrinker: |
| 2296 | unregister_shrinker(&ubifs_shrinker_info); | 2296 | unregister_shrinker(&ubifs_shrinker_info); |
| 2297 | kmem_cache_destroy(ubifs_inode_slab); | 2297 | kmem_cache_destroy(ubifs_inode_slab); |
| 2298 | out_reg: | ||
| 2299 | unregister_filesystem(&ubifs_fs_type); | ||
| 2300 | return err; | 2298 | return err; |
| 2301 | } | 2299 | } |
| 2302 | /* late_initcall to let compressors initialize first */ | 2300 | /* late_initcall to let compressors initialize first */ |
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index c68baeb0974a..d0ab78837057 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c | |||
| @@ -2383,6 +2383,8 @@ xfs_bmap_btalloc( | |||
| 2383 | int tryagain; | 2383 | int tryagain; |
| 2384 | int error; | 2384 | int error; |
| 2385 | 2385 | ||
| 2386 | ASSERT(ap->length); | ||
| 2387 | |||
| 2386 | mp = ap->ip->i_mount; | 2388 | mp = ap->ip->i_mount; |
| 2387 | align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; | 2389 | align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; |
| 2388 | if (unlikely(align)) { | 2390 | if (unlikely(align)) { |
| @@ -4629,6 +4631,8 @@ xfs_bmapi_allocate( | |||
| 4629 | int error; | 4631 | int error; |
| 4630 | int rt; | 4632 | int rt; |
| 4631 | 4633 | ||
| 4634 | ASSERT(bma->length > 0); | ||
| 4635 | |||
| 4632 | rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip); | 4636 | rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip); |
| 4633 | 4637 | ||
| 4634 | /* | 4638 | /* |
| @@ -4849,6 +4853,7 @@ xfs_bmapi_write( | |||
| 4849 | ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); | 4853 | ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); |
| 4850 | ASSERT(!(flags & XFS_BMAPI_IGSTATE)); | 4854 | ASSERT(!(flags & XFS_BMAPI_IGSTATE)); |
| 4851 | ASSERT(tp != NULL); | 4855 | ASSERT(tp != NULL); |
| 4856 | ASSERT(len > 0); | ||
| 4852 | 4857 | ||
| 4853 | whichfork = (flags & XFS_BMAPI_ATTRFORK) ? | 4858 | whichfork = (flags & XFS_BMAPI_ATTRFORK) ? |
| 4854 | XFS_ATTR_FORK : XFS_DATA_FORK; | 4859 | XFS_ATTR_FORK : XFS_DATA_FORK; |
| @@ -4918,9 +4923,22 @@ xfs_bmapi_write( | |||
| 4918 | bma.eof = eof; | 4923 | bma.eof = eof; |
| 4919 | bma.conv = !!(flags & XFS_BMAPI_CONVERT); | 4924 | bma.conv = !!(flags & XFS_BMAPI_CONVERT); |
| 4920 | bma.wasdel = wasdelay; | 4925 | bma.wasdel = wasdelay; |
| 4921 | bma.length = len; | ||
| 4922 | bma.offset = bno; | 4926 | bma.offset = bno; |
| 4923 | 4927 | ||
| 4928 | /* | ||
| 4929 | * There's a 32/64 bit type mismatch between the | ||
| 4930 | * allocation length request (which can be 64 bits in | ||
| 4931 | * length) and the bma length request, which is | ||
| 4932 | * xfs_extlen_t and therefore 32 bits. Hence we have to | ||
| 4933 | * check for 32-bit overflows and handle them here. | ||
| 4934 | */ | ||
| 4935 | if (len > (xfs_filblks_t)MAXEXTLEN) | ||
| 4936 | bma.length = MAXEXTLEN; | ||
| 4937 | else | ||
| 4938 | bma.length = len; | ||
| 4939 | |||
| 4940 | ASSERT(len > 0); | ||
| 4941 | ASSERT(bma.length > 0); | ||
| 4924 | error = xfs_bmapi_allocate(&bma, flags); | 4942 | error = xfs_bmapi_allocate(&bma, flags); |
| 4925 | if (error) | 4943 | if (error) |
| 4926 | goto error0; | 4944 | goto error0; |
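The clamp added to xfs_bmapi_write() is the usual way to narrow a 64-bit request into a 32-bit per-iteration quantity: cap at the type's maximum and let the surrounding loop make repeated passes. An equivalent one-liner with min_t (the diff spells it out as if/else; the behavior should be identical):

    bma.length = min_t(xfs_filblks_t, len, MAXEXTLEN);  /* fits in 32 bits */
    ASSERT(bma.length > 0);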
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index da108977b21f..558910f5e3c0 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c | |||
| @@ -98,22 +98,22 @@ xfs_fs_encode_fh( | |||
| 98 | switch (fileid_type) { | 98 | switch (fileid_type) { |
| 99 | case FILEID_INO32_GEN_PARENT: | 99 | case FILEID_INO32_GEN_PARENT: |
| 100 | spin_lock(&dentry->d_lock); | 100 | spin_lock(&dentry->d_lock); |
| 101 | fid->i32.parent_ino = dentry->d_parent->d_inode->i_ino; | 101 | fid->i32.parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino; |
| 102 | fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation; | 102 | fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation; |
| 103 | spin_unlock(&dentry->d_lock); | 103 | spin_unlock(&dentry->d_lock); |
| 104 | /*FALLTHRU*/ | 104 | /*FALLTHRU*/ |
| 105 | case FILEID_INO32_GEN: | 105 | case FILEID_INO32_GEN: |
| 106 | fid->i32.ino = inode->i_ino; | 106 | fid->i32.ino = XFS_I(inode)->i_ino; |
| 107 | fid->i32.gen = inode->i_generation; | 107 | fid->i32.gen = inode->i_generation; |
| 108 | break; | 108 | break; |
| 109 | case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: | 109 | case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: |
| 110 | spin_lock(&dentry->d_lock); | 110 | spin_lock(&dentry->d_lock); |
| 111 | fid64->parent_ino = dentry->d_parent->d_inode->i_ino; | 111 | fid64->parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino; |
| 112 | fid64->parent_gen = dentry->d_parent->d_inode->i_generation; | 112 | fid64->parent_gen = dentry->d_parent->d_inode->i_generation; |
| 113 | spin_unlock(&dentry->d_lock); | 113 | spin_unlock(&dentry->d_lock); |
| 114 | /*FALLTHRU*/ | 114 | /*FALLTHRU*/ |
| 115 | case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: | 115 | case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: |
| 116 | fid64->ino = inode->i_ino; | 116 | fid64->ino = XFS_I(inode)->i_ino; |
| 117 | fid64->gen = inode->i_generation; | 117 | fid64->gen = inode->i_generation; |
| 118 | break; | 118 | break; |
| 119 | } | 119 | } |
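The switch to XFS_I(inode)->i_ino matters on 32-bit kernels: the VFS field inode->i_ino is an unsigned long and can truncate a 64-bit XFS inode number, while the xfs_inode's own i_ino is a full xfs_ino_t. A file handle built from the truncated value would decode to the wrong inode. The distinction in two lines:

    /* VFS copy: unsigned long, already truncated on a 32-bit kernel */
    fid64->ino = inode->i_ino;

    /* XFS copy: xfs_ino_t, always the full on-disk 64-bit number */
    fid64->ino = XFS_I(inode)->i_ino;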
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index a14cd89fe465..34817adf4b9e 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c | |||
| @@ -150,6 +150,117 @@ xlog_grant_add_space( | |||
| 150 | } while (head_val != old); | 150 | } while (head_val != old); |
| 151 | } | 151 | } |
| 152 | 152 | ||
| 153 | STATIC bool | ||
| 154 | xlog_reserveq_wake( | ||
| 155 | struct log *log, | ||
| 156 | int *free_bytes) | ||
| 157 | { | ||
| 158 | struct xlog_ticket *tic; | ||
| 159 | int need_bytes; | ||
| 160 | |||
| 161 | list_for_each_entry(tic, &log->l_reserveq, t_queue) { | ||
| 162 | if (tic->t_flags & XLOG_TIC_PERM_RESERV) | ||
| 163 | need_bytes = tic->t_unit_res * tic->t_cnt; | ||
| 164 | else | ||
| 165 | need_bytes = tic->t_unit_res; | ||
| 166 | |||
| 167 | if (*free_bytes < need_bytes) | ||
| 168 | return false; | ||
| 169 | *free_bytes -= need_bytes; | ||
| 170 | |||
| 171 | trace_xfs_log_grant_wake_up(log, tic); | ||
| 172 | wake_up(&tic->t_wait); | ||
| 173 | } | ||
| 174 | |||
| 175 | return true; | ||
| 176 | } | ||
| 177 | |||
| 178 | STATIC bool | ||
| 179 | xlog_writeq_wake( | ||
| 180 | struct log *log, | ||
| 181 | int *free_bytes) | ||
| 182 | { | ||
| 183 | struct xlog_ticket *tic; | ||
| 184 | int need_bytes; | ||
| 185 | |||
| 186 | list_for_each_entry(tic, &log->l_writeq, t_queue) { | ||
| 187 | ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); | ||
| 188 | |||
| 189 | need_bytes = tic->t_unit_res; | ||
| 190 | |||
| 191 | if (*free_bytes < need_bytes) | ||
| 192 | return false; | ||
| 193 | *free_bytes -= need_bytes; | ||
| 194 | |||
| 195 | trace_xfs_log_regrant_write_wake_up(log, tic); | ||
| 196 | wake_up(&tic->t_wait); | ||
| 197 | } | ||
| 198 | |||
| 199 | return true; | ||
| 200 | } | ||
| 201 | |||
| 202 | STATIC int | ||
| 203 | xlog_reserveq_wait( | ||
| 204 | struct log *log, | ||
| 205 | struct xlog_ticket *tic, | ||
| 206 | int need_bytes) | ||
| 207 | { | ||
| 208 | list_add_tail(&tic->t_queue, &log->l_reserveq); | ||
| 209 | |||
| 210 | do { | ||
| 211 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
| 212 | goto shutdown; | ||
| 213 | xlog_grant_push_ail(log, need_bytes); | ||
| 214 | |||
| 215 | XFS_STATS_INC(xs_sleep_logspace); | ||
| 216 | trace_xfs_log_grant_sleep(log, tic); | ||
| 217 | |||
| 218 | xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); | ||
| 219 | trace_xfs_log_grant_wake(log, tic); | ||
| 220 | |||
| 221 | spin_lock(&log->l_grant_reserve_lock); | ||
| 222 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
| 223 | goto shutdown; | ||
| 224 | } while (xlog_space_left(log, &log->l_grant_reserve_head) < need_bytes); | ||
| 225 | |||
| 226 | list_del_init(&tic->t_queue); | ||
| 227 | return 0; | ||
| 228 | shutdown: | ||
| 229 | list_del_init(&tic->t_queue); | ||
| 230 | return XFS_ERROR(EIO); | ||
| 231 | } | ||
| 232 | |||
| 233 | STATIC int | ||
| 234 | xlog_writeq_wait( | ||
| 235 | struct log *log, | ||
| 236 | struct xlog_ticket *tic, | ||
| 237 | int need_bytes) | ||
| 238 | { | ||
| 239 | list_add_tail(&tic->t_queue, &log->l_writeq); | ||
| 240 | |||
| 241 | do { | ||
| 242 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
| 243 | goto shutdown; | ||
| 244 | xlog_grant_push_ail(log, need_bytes); | ||
| 245 | |||
| 246 | XFS_STATS_INC(xs_sleep_logspace); | ||
| 247 | trace_xfs_log_regrant_write_sleep(log, tic); | ||
| 248 | |||
| 249 | xlog_wait(&tic->t_wait, &log->l_grant_write_lock); | ||
| 250 | trace_xfs_log_regrant_write_wake(log, tic); | ||
| 251 | |||
| 252 | spin_lock(&log->l_grant_write_lock); | ||
| 253 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
| 254 | goto shutdown; | ||
| 255 | } while (xlog_space_left(log, &log->l_grant_write_head) < need_bytes); | ||
| 256 | |||
| 257 | list_del_init(&tic->t_queue); | ||
| 258 | return 0; | ||
| 259 | shutdown: | ||
| 260 | list_del_init(&tic->t_queue); | ||
| 261 | return XFS_ERROR(EIO); | ||
| 262 | } | ||
| 263 | |||
| 153 | static void | 264 | static void |
| 154 | xlog_tic_reset_res(xlog_ticket_t *tic) | 265 | xlog_tic_reset_res(xlog_ticket_t *tic) |
| 155 | { | 266 | { |
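The four helpers added above separate waking every waiter that now fits from queueing and sleeping until space appears. Their intended composition, a sketch consistent with the rewritten grant path later in this patch (not a verbatim quote of it):

    spin_lock(&log->l_grant_reserve_lock);
    free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
    if (!xlog_reserveq_wake(log, &free_bytes) || free_bytes < need_bytes)
            error = xlog_reserveq_wait(log, tic, need_bytes);
    spin_unlock(&log->l_grant_reserve_lock);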
| @@ -350,8 +461,19 @@ xfs_log_reserve( | |||
| 350 | retval = xlog_grant_log_space(log, internal_ticket); | 461 | retval = xlog_grant_log_space(log, internal_ticket); |
| 351 | } | 462 | } |
| 352 | 463 | ||
| 464 | if (unlikely(retval)) { | ||
| 465 | /* | ||
| 466 | * If we are failing, make sure the ticket doesn't have any | ||
| 467 | * current reservations. We don't want to add this back | ||
| 468 | * when the ticket/transaction gets cancelled. | ||
| 469 | */ | ||
| 470 | internal_ticket->t_curr_res = 0; | ||
| 471 | /* ungrant will give back unit_res * t_cnt. */ | ||
| 472 | internal_ticket->t_cnt = 0; | ||
| 473 | } | ||
| 474 | |||
| 353 | return retval; | 475 | return retval; |
| 354 | } /* xfs_log_reserve */ | 476 | } |
| 355 | 477 | ||
| 356 | 478 | ||
| 357 | /* | 479 | /* |
| @@ -2481,8 +2603,8 @@ restart: | |||
| 2481 | /* | 2603 | /* |
| 2482 | * Atomically get the log space required for a log ticket. | 2604 | * Atomically get the log space required for a log ticket. |
| 2483 | * | 2605 | * |
| 2484 | * Once a ticket gets put onto the reserveq, it will only return after | 2606 | * Once a ticket gets put onto the reserveq, it will only return after the |
| 2485 | * the needed reservation is satisfied. | 2607 | * needed reservation is satisfied. |
| 2486 | * | 2608 | * |
| 2487 | * This function is structured so that it has a lock free fast path. This is | 2609 | * This function is structured so that it has a lock free fast path. This is |
| 2488 | * necessary because every new transaction reservation will come through this | 2610 | * necessary because every new transaction reservation will come through this |
| @@ -2490,113 +2612,53 @@ restart: | |||
| 2490 | * every pass. | 2612 | * every pass. |
| 2491 | * | 2613 | * |
| 2492 | * As tickets are only ever moved on and off the reserveq under the | 2614 | * As tickets are only ever moved on and off the reserveq under the |
| 2493 | * l_grant_reserve_lock, we only need to take that lock if we are going | 2615 | * l_grant_reserve_lock, we only need to take that lock if we are going to add |
| 2494 | * to add the ticket to the queue and sleep. We can avoid taking the lock if the | 2616 | * the ticket to the queue and sleep. We can avoid taking the lock if the ticket |
| 2495 | * ticket was never added to the reserveq because the t_queue list head will be | 2617 | * was never added to the reserveq because the t_queue list head will be empty |
| 2496 | * empty and we hold the only reference to it so it can safely be checked | 2618 | * and we hold the only reference to it so it can safely be checked unlocked. |
| 2497 | * unlocked. | ||
| 2498 | */ | 2619 | */ |
| 2499 | STATIC int | 2620 | STATIC int |
| 2500 | xlog_grant_log_space(xlog_t *log, | 2621 | xlog_grant_log_space( |
| 2501 | xlog_ticket_t *tic) | 2622 | struct log *log, |
| 2623 | struct xlog_ticket *tic) | ||
| 2502 | { | 2624 | { |
| 2503 | int free_bytes; | 2625 | int free_bytes, need_bytes; |
| 2504 | int need_bytes; | 2626 | int error = 0; |
| 2505 | 2627 | ||
| 2506 | #ifdef DEBUG | 2628 | ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); |
| 2507 | if (log->l_flags & XLOG_ACTIVE_RECOVERY) | ||
| 2508 | panic("grant Recovery problem"); | ||
| 2509 | #endif | ||
| 2510 | 2629 | ||
| 2511 | trace_xfs_log_grant_enter(log, tic); | 2630 | trace_xfs_log_grant_enter(log, tic); |
| 2512 | 2631 | ||
| 2632 | /* | ||
| 2633 | * If there are other waiters on the queue then give them a chance at | ||
| 2634 | * logspace before us. Wake up the first waiters, if we do not wake | ||
| 2635 | * up all the waiters then go to sleep waiting for more free space, | ||
| 2636 | * otherwise try to get some space for this transaction. | ||
| 2637 | */ | ||
| 2513 | need_bytes = tic->t_unit_res; | 2638 | need_bytes = tic->t_unit_res; |
| 2514 | if (tic->t_flags & XFS_LOG_PERM_RESERV) | 2639 | if (tic->t_flags & XFS_LOG_PERM_RESERV) |
| 2515 | need_bytes *= tic->t_ocnt; | 2640 | need_bytes *= tic->t_ocnt; |
| 2516 | |||
| 2517 | /* something is already sleeping; insert new transaction at end */ | ||
| 2518 | if (!list_empty_careful(&log->l_reserveq)) { | ||
| 2519 | spin_lock(&log->l_grant_reserve_lock); | ||
| 2520 | /* recheck the queue now we are locked */ | ||
| 2521 | if (list_empty(&log->l_reserveq)) { | ||
| 2522 | spin_unlock(&log->l_grant_reserve_lock); | ||
| 2523 | goto redo; | ||
| 2524 | } | ||
| 2525 | list_add_tail(&tic->t_queue, &log->l_reserveq); | ||
| 2526 | |||
| 2527 | trace_xfs_log_grant_sleep1(log, tic); | ||
| 2528 | |||
| 2529 | /* | ||
| 2530 | * Gotta check this before going to sleep, while we're | ||
| 2531 | * holding the grant lock. | ||
| 2532 | */ | ||
| 2533 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
| 2534 | goto error_return; | ||
| 2535 | |||
| 2536 | XFS_STATS_INC(xs_sleep_logspace); | ||
| 2537 | xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); | ||
| 2538 | |||
| 2539 | /* | ||
| 2540 | * If we got an error, and the filesystem is shutting down, | ||
| 2541 | * we'll catch it down below. So just continue... | ||
| 2542 | */ | ||
| 2543 | trace_xfs_log_grant_wake1(log, tic); | ||
| 2544 | } | ||
| 2545 | |||
| 2546 | redo: | ||
| 2547 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
| 2548 | goto error_return_unlocked; | ||
| 2549 | |||
| 2550 | free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); | 2641 | free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); |
| 2551 | if (free_bytes < need_bytes) { | 2642 | if (!list_empty_careful(&log->l_reserveq)) { |
| 2552 | spin_lock(&log->l_grant_reserve_lock); | 2643 | spin_lock(&log->l_grant_reserve_lock); |
| 2553 | if (list_empty(&tic->t_queue)) | 2644 | if (!xlog_reserveq_wake(log, &free_bytes) || |
| 2554 | list_add_tail(&tic->t_queue, &log->l_reserveq); | 2645 | free_bytes < need_bytes) |
| 2555 | 2646 | error = xlog_reserveq_wait(log, tic, need_bytes); | |
| 2556 | trace_xfs_log_grant_sleep2(log, tic); | 2647 | spin_unlock(&log->l_grant_reserve_lock); |
| 2557 | 2648 | } else if (free_bytes < need_bytes) { | |
| 2558 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
| 2559 | goto error_return; | ||
| 2560 | |||
| 2561 | xlog_grant_push_ail(log, need_bytes); | ||
| 2562 | |||
| 2563 | XFS_STATS_INC(xs_sleep_logspace); | ||
| 2564 | xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); | ||
| 2565 | |||
| 2566 | trace_xfs_log_grant_wake2(log, tic); | ||
| 2567 | goto redo; | ||
| 2568 | } | ||
| 2569 | |||
| 2570 | if (!list_empty(&tic->t_queue)) { | ||
| 2571 | spin_lock(&log->l_grant_reserve_lock); | 2649 | spin_lock(&log->l_grant_reserve_lock); |
| 2572 | list_del_init(&tic->t_queue); | 2650 | error = xlog_reserveq_wait(log, tic, need_bytes); |
| 2573 | spin_unlock(&log->l_grant_reserve_lock); | 2651 | spin_unlock(&log->l_grant_reserve_lock); |
| 2574 | } | 2652 | } |
| 2653 | if (error) | ||
| 2654 | return error; | ||
| 2575 | 2655 | ||
| 2576 | /* we've got enough space */ | ||
| 2577 | xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes); | 2656 | xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes); |
| 2578 | xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); | 2657 | xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); |
| 2579 | trace_xfs_log_grant_exit(log, tic); | 2658 | trace_xfs_log_grant_exit(log, tic); |
| 2580 | xlog_verify_grant_tail(log); | 2659 | xlog_verify_grant_tail(log); |
| 2581 | return 0; | 2660 | return 0; |
| 2582 | 2661 | } | |
| 2583 | error_return_unlocked: | ||
| 2584 | spin_lock(&log->l_grant_reserve_lock); | ||
| 2585 | error_return: | ||
| 2586 | list_del_init(&tic->t_queue); | ||
| 2587 | spin_unlock(&log->l_grant_reserve_lock); | ||
| 2588 | trace_xfs_log_grant_error(log, tic); | ||
| 2589 | |||
| 2590 | /* | ||
| 2591 | * If we are failing, make sure the ticket doesn't have any | ||
| 2592 | * current reservations. We don't want to add this back when | ||
| 2593 | * the ticket/transaction gets cancelled. | ||
| 2594 | */ | ||
| 2595 | tic->t_curr_res = 0; | ||
| 2596 | tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ | ||
| 2597 | return XFS_ERROR(EIO); | ||
| 2598 | } /* xlog_grant_log_space */ | ||
| 2599 | |||
| 2600 | 2662 | ||
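The rewrite replaces the old redo loop with a single unlocked read of the free space followed by three cases. Condensed for illustration (queue, lock, grant_head, wake_fn and wait_fn are stand-ins for the reserve- or write-side variants, not real identifiers):

	free_bytes = xlog_space_left(log, grant_head);
	if (!list_empty_careful(queue)) {
		/* waiters exist: wake in FIFO order, maybe queue behind them */
		spin_lock(lock);
		if (!wake_fn(log, &free_bytes) || free_bytes < need_bytes)
			error = wait_fn(log, tic, need_bytes);
		spin_unlock(lock);
	} else if (free_bytes < need_bytes) {
		/* no waiters but not enough space: sleep until there is */
		spin_lock(lock);
		error = wait_fn(log, tic, need_bytes);
		spin_unlock(lock);
	}
	/* else: lock-free fast path, no lock taken at all */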
| 2601 | /* | 2663 | /* |
| 2602 | * Replenish the byte reservation required by moving the grant write head. | 2664 | * Replenish the byte reservation required by moving the grant write head. |
| @@ -2605,10 +2667,12 @@ error_return: | |||
| 2605 | * free fast path. | 2667 | * free fast path. |
| 2606 | */ | 2668 | */ |
| 2607 | STATIC int | 2669 | STATIC int |
| 2608 | xlog_regrant_write_log_space(xlog_t *log, | 2670 | xlog_regrant_write_log_space( |
| 2609 | xlog_ticket_t *tic) | 2671 | struct log *log, |
| 2672 | struct xlog_ticket *tic) | ||
| 2610 | { | 2673 | { |
| 2611 | int free_bytes, need_bytes; | 2674 | int free_bytes, need_bytes; |
| 2675 | int error = 0; | ||
| 2612 | 2676 | ||
| 2613 | tic->t_curr_res = tic->t_unit_res; | 2677 | tic->t_curr_res = tic->t_unit_res; |
| 2614 | xlog_tic_reset_res(tic); | 2678 | xlog_tic_reset_res(tic); |
| @@ -2616,104 +2680,38 @@ xlog_regrant_write_log_space(xlog_t *log, | |||
| 2616 | if (tic->t_cnt > 0) | 2680 | if (tic->t_cnt > 0) |
| 2617 | return 0; | 2681 | return 0; |
| 2618 | 2682 | ||
| 2619 | #ifdef DEBUG | 2683 | ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); |
| 2620 | if (log->l_flags & XLOG_ACTIVE_RECOVERY) | ||
| 2621 | panic("regrant Recovery problem"); | ||
| 2622 | #endif | ||
| 2623 | 2684 | ||
| 2624 | trace_xfs_log_regrant_write_enter(log, tic); | 2685 | trace_xfs_log_regrant_write_enter(log, tic); |
| 2625 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
| 2626 | goto error_return_unlocked; | ||
| 2627 | 2686 | ||
| 2628 | /* If there are other waiters on the queue then give them a | 2687 | /* |
| 2629 | * chance at logspace before us. Wake up the first waiters, | 2688 | * If there are other waiters on the queue then give them a chance at |
| 2630 | * if we do not wake up all the waiters then go to sleep waiting | 2689 | * logspace before us. Wake up the first waiters; if we do not wake |
| 2631 | * for more free space, otherwise try to get some space for | 2690 | * them all, go to sleep waiting for more free space; otherwise try |
| 2632 | * this transaction. | 2691 | * to get some space for this transaction. |
| 2633 | */ | 2692 | */ |
| 2634 | need_bytes = tic->t_unit_res; | 2693 | need_bytes = tic->t_unit_res; |
| 2635 | if (!list_empty_careful(&log->l_writeq)) { | ||
| 2636 | struct xlog_ticket *ntic; | ||
| 2637 | |||
| 2638 | spin_lock(&log->l_grant_write_lock); | ||
| 2639 | free_bytes = xlog_space_left(log, &log->l_grant_write_head); | ||
| 2640 | list_for_each_entry(ntic, &log->l_writeq, t_queue) { | ||
| 2641 | ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); | ||
| 2642 | |||
| 2643 | if (free_bytes < ntic->t_unit_res) | ||
| 2644 | break; | ||
| 2645 | free_bytes -= ntic->t_unit_res; | ||
| 2646 | wake_up(&ntic->t_wait); | ||
| 2647 | } | ||
| 2648 | |||
| 2649 | if (ntic != list_first_entry(&log->l_writeq, | ||
| 2650 | struct xlog_ticket, t_queue)) { | ||
| 2651 | if (list_empty(&tic->t_queue)) | ||
| 2652 | list_add_tail(&tic->t_queue, &log->l_writeq); | ||
| 2653 | trace_xfs_log_regrant_write_sleep1(log, tic); | ||
| 2654 | |||
| 2655 | xlog_grant_push_ail(log, need_bytes); | ||
| 2656 | |||
| 2657 | XFS_STATS_INC(xs_sleep_logspace); | ||
| 2658 | xlog_wait(&tic->t_wait, &log->l_grant_write_lock); | ||
| 2659 | trace_xfs_log_regrant_write_wake1(log, tic); | ||
| 2660 | } else | ||
| 2661 | spin_unlock(&log->l_grant_write_lock); | ||
| 2662 | } | ||
| 2663 | |||
| 2664 | redo: | ||
| 2665 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
| 2666 | goto error_return_unlocked; | ||
| 2667 | |||
| 2668 | free_bytes = xlog_space_left(log, &log->l_grant_write_head); | 2694 | free_bytes = xlog_space_left(log, &log->l_grant_write_head); |
| 2669 | if (free_bytes < need_bytes) { | 2695 | if (!list_empty_careful(&log->l_writeq)) { |
| 2670 | spin_lock(&log->l_grant_write_lock); | 2696 | spin_lock(&log->l_grant_write_lock); |
| 2671 | if (list_empty(&tic->t_queue)) | 2697 | if (!xlog_writeq_wake(log, &free_bytes) || |
| 2672 | list_add_tail(&tic->t_queue, &log->l_writeq); | 2698 | free_bytes < need_bytes) |
| 2673 | 2699 | error = xlog_writeq_wait(log, tic, need_bytes); | |
| 2674 | if (XLOG_FORCED_SHUTDOWN(log)) | 2700 | spin_unlock(&log->l_grant_write_lock); |
| 2675 | goto error_return; | 2701 | } else if (free_bytes < need_bytes) { |
| 2676 | |||
| 2677 | xlog_grant_push_ail(log, need_bytes); | ||
| 2678 | |||
| 2679 | XFS_STATS_INC(xs_sleep_logspace); | ||
| 2680 | trace_xfs_log_regrant_write_sleep2(log, tic); | ||
| 2681 | xlog_wait(&tic->t_wait, &log->l_grant_write_lock); | ||
| 2682 | |||
| 2683 | trace_xfs_log_regrant_write_wake2(log, tic); | ||
| 2684 | goto redo; | ||
| 2685 | } | ||
| 2686 | |||
| 2687 | if (!list_empty(&tic->t_queue)) { | ||
| 2688 | spin_lock(&log->l_grant_write_lock); | 2702 | spin_lock(&log->l_grant_write_lock); |
| 2689 | list_del_init(&tic->t_queue); | 2703 | error = xlog_writeq_wait(log, tic, need_bytes); |
| 2690 | spin_unlock(&log->l_grant_write_lock); | 2704 | spin_unlock(&log->l_grant_write_lock); |
| 2691 | } | 2705 | } |
| 2692 | 2706 | ||
| 2693 | /* we've got enough space */ | 2707 | if (error) |
| 2708 | return error; | ||
| 2709 | |||
| 2694 | xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); | 2710 | xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); |
| 2695 | trace_xfs_log_regrant_write_exit(log, tic); | 2711 | trace_xfs_log_regrant_write_exit(log, tic); |
| 2696 | xlog_verify_grant_tail(log); | 2712 | xlog_verify_grant_tail(log); |
| 2697 | return 0; | 2713 | return 0; |
| 2698 | 2714 | } | |
| 2699 | |||
| 2700 | error_return_unlocked: | ||
| 2701 | spin_lock(&log->l_grant_write_lock); | ||
| 2702 | error_return: | ||
| 2703 | list_del_init(&tic->t_queue); | ||
| 2704 | spin_unlock(&log->l_grant_write_lock); | ||
| 2705 | trace_xfs_log_regrant_write_error(log, tic); | ||
| 2706 | |||
| 2707 | /* | ||
| 2708 | * If we are failing, make sure the ticket doesn't have any | ||
| 2709 | * current reservations. We don't want to add this back when | ||
| 2710 | * the ticket/transaction gets cancelled. | ||
| 2711 | */ | ||
| 2712 | tic->t_curr_res = 0; | ||
| 2713 | tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ | ||
| 2714 | return XFS_ERROR(EIO); | ||
| 2715 | } /* xlog_regrant_write_log_space */ | ||
| 2716 | |||
| 2717 | 2715 | ||
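The xlog_reserveq_wake()/xlog_writeq_wake() helpers called above are added by this patch outside this excerpt. Judging from their call sites and the open-coded loop deleted above, the write-queue variant plausibly reads as follows (a sketch; the inferred convention is that it returns true only when every waiter was woken):

	STATIC bool
	xlog_writeq_wake(
		struct log		*log,
		int			*free_bytes)
	{
		struct xlog_ticket	*tic;

		list_for_each_entry(tic, &log->l_writeq, t_queue) {
			ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);

			/* stop once the remaining space cannot satisfy a waiter */
			if (*free_bytes < tic->t_unit_res)
				return false;

			*free_bytes -= tic->t_unit_res;
			trace_xfs_log_regrant_write_wake_up(log, tic);
			wake_up(&tic->t_wait);
		}

		return true;
	}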
| 2718 | /* The first cnt-1 times through here we don't need to | 2716 | /* The first cnt-1 times through here we don't need to |
| 2719 | * move the grant write head because the permanent | 2717 | * move the grant write head because the permanent |
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index f1d2802b2f07..494035798873 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h | |||
| @@ -834,18 +834,14 @@ DEFINE_LOGGRANT_EVENT(xfs_log_umount_write); | |||
| 834 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter); | 834 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter); |
| 835 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit); | 835 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit); |
| 836 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_error); | 836 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_error); |
| 837 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1); | 837 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep); |
| 838 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); | 838 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake); |
| 839 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); | ||
| 840 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); | ||
| 841 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up); | 839 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up); |
| 842 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); | 840 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); |
| 843 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); | 841 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); |
| 844 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); | 842 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); |
| 845 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1); | 843 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep); |
| 846 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); | 844 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake); |
| 847 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); | ||
| 848 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); | ||
| 849 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up); | 845 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up); |
| 850 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); | 846 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); |
| 851 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); | 847 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); |
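With a single sleep site per queue, the numbered sleep1/wake1 and sleep2/wake2 tracepoints collapse into one sleep/wake pair each. DEFINE_LOGGRANT_EVENT() instantiates these on a shared event class; its definition in this file follows the usual DEFINE_EVENT pattern, roughly (recalled for context, not part of this diff):

	#define DEFINE_LOGGRANT_EVENT(name) \
	DEFINE_EVENT(xfs_loggrant_class, name, \
		TP_PROTO(struct log *log, struct xlog_ticket *tic), \
		TP_ARGS(log, tic))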
