Diffstat (limited to 'fs')
-rw-r--r--  fs/aio.c                      |  70
-rw-r--r--  fs/btrfs/extent_io.c          |  39
-rw-r--r--  fs/btrfs/extent_io.h          |   3
-rw-r--r--  fs/btrfs/ioctl.c              | 147
-rw-r--r--  fs/btrfs/qgroup.c             |   4
-rw-r--r--  fs/btrfs/reada.c              |   9
-rw-r--r--  fs/btrfs/tests/btrfs-tests.c  |   2
-rw-r--r--  fs/btrfs/tests/qgroup-tests.c |   2
-rw-r--r--  fs/btrfs/transaction.c        |  12
-rw-r--r--  fs/ceph/acl.c                 |   6
-rw-r--r--  fs/ceph/addr.c                |  17
-rw-r--r--  fs/ceph/caps.c                | 246
-rw-r--r--  fs/ceph/export.c              |   2
-rw-r--r--  fs/ceph/inode.c               | 247
-rw-r--r--  fs/ceph/mds_client.c          |   9
-rw-r--r--  fs/ceph/mds_client.h          |   1
-rw-r--r--  fs/ceph/super.h               |  13
-rw-r--r--  fs/dlm/lowcomms.c             |   5
-rw-r--r--  fs/eventpoll.c                |   4
19 files changed, 535 insertions(+), 303 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index 56b28607c32d..4f078c054b41 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -477,7 +477,7 @@ void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
 }
 EXPORT_SYMBOL(kiocb_set_cancel_fn);
 
-static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb)
+static int kiocb_cancel(struct kiocb *kiocb)
 {
 	kiocb_cancel_fn *old, *cancel;
 
@@ -538,7 +538,7 @@ static void free_ioctx_users(struct percpu_ref *ref)
 				       struct kiocb, ki_list);
 
 		list_del_init(&req->ki_list);
-		kiocb_cancel(ctx, req);
+		kiocb_cancel(req);
 	}
 
 	spin_unlock_irq(&ctx->ctx_lock);
@@ -727,42 +727,42 @@ err:
  * when the processes owning a context have all exited to encourage
  * the rapid destruction of the kioctx.
  */
-static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
+static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
 		struct completion *requests_done)
 {
-	if (!atomic_xchg(&ctx->dead, 1)) {
-		struct kioctx_table *table;
-
-		spin_lock(&mm->ioctx_lock);
-		rcu_read_lock();
-		table = rcu_dereference(mm->ioctx_table);
-
-		WARN_ON(ctx != table->table[ctx->id]);
-		table->table[ctx->id] = NULL;
-		rcu_read_unlock();
-		spin_unlock(&mm->ioctx_lock);
-
-		/* percpu_ref_kill() will do the necessary call_rcu() */
-		wake_up_all(&ctx->wait);
-
-		/*
-		 * It'd be more correct to do this in free_ioctx(), after all
-		 * the outstanding kiocbs have finished - but by then io_destroy
-		 * has already returned, so io_setup() could potentially return
-		 * -EAGAIN with no ioctxs actually in use (as far as userspace
-		 * could tell).
-		 */
-		aio_nr_sub(ctx->max_reqs);
-
-		if (ctx->mmap_size)
-			vm_munmap(ctx->mmap_base, ctx->mmap_size);
-
-		ctx->requests_done = requests_done;
-		percpu_ref_kill(&ctx->users);
-	} else {
-		if (requests_done)
-			complete(requests_done);
-	}
+	struct kioctx_table *table;
+
+	if (atomic_xchg(&ctx->dead, 1))
+		return -EINVAL;
+
+
+	spin_lock(&mm->ioctx_lock);
+	rcu_read_lock();
+	table = rcu_dereference(mm->ioctx_table);
+
+	WARN_ON(ctx != table->table[ctx->id]);
+	table->table[ctx->id] = NULL;
+	rcu_read_unlock();
+	spin_unlock(&mm->ioctx_lock);
+
+	/* percpu_ref_kill() will do the necessary call_rcu() */
+	wake_up_all(&ctx->wait);
+
+	/*
+	 * It'd be more correct to do this in free_ioctx(), after all
+	 * the outstanding kiocbs have finished - but by then io_destroy
+	 * has already returned, so io_setup() could potentially return
+	 * -EAGAIN with no ioctxs actually in use (as far as userspace
+	 * could tell).
+	 */
+	aio_nr_sub(ctx->max_reqs);
+
+	if (ctx->mmap_size)
+		vm_munmap(ctx->mmap_base, ctx->mmap_size);
+
+	ctx->requests_done = requests_done;
+	percpu_ref_kill(&ctx->users);
+	return 0;
 }
 
 /* wait_on_sync_kiocb:
@@ -1219,21 +1219,23 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
 	if (likely(NULL != ioctx)) {
 		struct completion requests_done =
 			COMPLETION_INITIALIZER_ONSTACK(requests_done);
+		int ret;
 
 		/* Pass requests_done to kill_ioctx() where it can be set
 		 * in a thread-safe way. If we try to set it here then we have
 		 * a race condition if two io_destroy() called simultaneously.
 		 */
-		kill_ioctx(current->mm, ioctx, &requests_done);
+		ret = kill_ioctx(current->mm, ioctx, &requests_done);
 		percpu_ref_put(&ioctx->users);
 
 		/* Wait until all IO for the context are done. Otherwise kernel
 		 * keep using user-space buffers even if user thinks the context
 		 * is destroyed.
 		 */
-		wait_for_completion(&requests_done);
+		if (!ret)
+			wait_for_completion(&requests_done);
 
-		return 0;
+		return ret;
 	}
 	pr_debug("EINVAL: io_destroy: invalid context id\n");
 	return -EINVAL;
@@ -1595,7 +1597,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 
 	kiocb = lookup_kiocb(ctx, iocb, key);
 	if (kiocb)
-		ret = kiocb_cancel(ctx, kiocb);
+		ret = kiocb_cancel(kiocb);
 	else
 		ret = -EINVAL;
 
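
Note on the fs/aio.c change above: making kill_ioctx() return an error turns a race between two io_destroy() calls on the same context into a visible -EINVAL for the loser of the atomic_xchg(), instead of a premature success while requests are still draining. A minimal userspace sketch of the syscall surface involved (illustrative only, not part of the patch; raw syscall wrappers are assumed since glibc does not expose these calls):

/* Illustrative only: exercises io_setup()/io_destroy() via raw syscalls. */
#include <linux/aio_abi.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <errno.h>
#include <stdio.h>

static long sys_io_setup(unsigned nr, aio_context_t *ctx)
{
	return syscall(SYS_io_setup, nr, ctx);
}

static long sys_io_destroy(aio_context_t ctx)
{
	return syscall(SYS_io_destroy, ctx);
}

int main(void)
{
	aio_context_t ctx = 0;

	if (sys_io_setup(32, &ctx) < 0) {
		perror("io_setup");
		return 1;
	}
	/* The first io_destroy() marks the context dead and, with this
	 * patch, waits for in-flight requests to complete. */
	if (sys_io_destroy(ctx) < 0)
		perror("io_destroy");
	/*
	 * A concurrent io_destroy() that loses the atomic_xchg(&ctx->dead, 1)
	 * race now returns -EINVAL immediately instead of 0; a sequential
	 * repeat like this one fails the context lookup, also with EINVAL.
	 */
	if (sys_io_destroy(ctx) < 0 && errno == EINVAL)
		fprintf(stderr, "second io_destroy: EINVAL as expected\n");
	return 0;
}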
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index f25a9092b946..a389820d158b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2354,7 +2354,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
 {
 	int uptodate = (err == 0);
 	struct extent_io_tree *tree;
-	int ret;
+	int ret = 0;
 
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
 
@@ -5068,6 +5068,43 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 	}
 }
 
+int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
+			       unsigned long start,
+			       unsigned long len)
+{
+	size_t cur;
+	size_t offset;
+	struct page *page;
+	char *kaddr;
+	char __user *dst = (char __user *)dstv;
+	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+	int ret = 0;
+
+	WARN_ON(start > eb->len);
+	WARN_ON(start + len > eb->start + eb->len);
+
+	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+
+	while (len > 0) {
+		page = extent_buffer_page(eb, i);
+
+		cur = min(len, (PAGE_CACHE_SIZE - offset));
+		kaddr = page_address(page);
+		if (copy_to_user(dst, kaddr + offset, cur)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		dst += cur;
+		len -= cur;
+		offset = 0;
+		i++;
+	}
+
+	return ret;
+}
+
 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
 			      unsigned long min_len, char **map,
 			      unsigned long *map_start,
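
The new read_extent_buffer_to_user() mirrors read_extent_buffer() but copies page-by-page with copy_to_user(), so a caller can stream item data to userspace without a kernel bounce buffer. A hedged sketch of the calling pattern (hypothetical names standing in for a handler's own state; the real caller is copy_to_sk() in the fs/btrfs/ioctl.c hunks below):

/*
 * Sketch only, not from this patch: stream one item's bytes from a
 * leaf extent buffer straight to a user buffer. "leaf", "ubuf",
 * "sk_offset", "item_off" and "item_len" are hypothetical locals.
 */
static int copy_item_to_user(struct extent_buffer *leaf, char __user *ubuf,
			     unsigned long *sk_offset,
			     unsigned long item_off, unsigned long item_len)
{
	char __user *up = ubuf + *sk_offset;

	/* returns 0 or -EFAULT; on failure the user buffer is partly written */
	if (read_extent_buffer_to_user(leaf, up, item_off, item_len))
		return -EFAULT;

	*sk_offset += item_len;
	return 0;
}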
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 8b63f2d46518..15ce5f2a2b62 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -304,6 +304,9 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
 void read_extent_buffer(struct extent_buffer *eb, void *dst,
 			unsigned long start,
 			unsigned long len);
+int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dst,
+			       unsigned long start,
+			       unsigned long len);
 void write_extent_buffer(struct extent_buffer *eb, const void *src,
 			 unsigned long start, unsigned long len);
 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 82c18ba12e3f..0d321c23069a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1957,7 +1957,8 @@ static noinline int copy_to_sk(struct btrfs_root *root,
 			       struct btrfs_path *path,
 			       struct btrfs_key *key,
 			       struct btrfs_ioctl_search_key *sk,
-			       char *buf,
+			       size_t *buf_size,
+			       char __user *ubuf,
 			       unsigned long *sk_offset,
 			       int *num_found)
 {
@@ -1989,13 +1990,25 @@ static noinline int copy_to_sk(struct btrfs_root *root,
 		if (!key_in_sk(key, sk))
 			continue;
 
-		if (sizeof(sh) + item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
+		if (sizeof(sh) + item_len > *buf_size) {
+			if (*num_found) {
+				ret = 1;
+				goto out;
+			}
+
+			/*
+			 * return one empty item back for v1, which does not
+			 * handle -EOVERFLOW
+			 */
+
+			*buf_size = sizeof(sh) + item_len;
 			item_len = 0;
+			ret = -EOVERFLOW;
+		}
 
-		if (sizeof(sh) + item_len + *sk_offset >
-		    BTRFS_SEARCH_ARGS_BUFSIZE) {
+		if (sizeof(sh) + item_len + *sk_offset > *buf_size) {
 			ret = 1;
-			goto overflow;
+			goto out;
 		}
 
 		sh.objectid = key->objectid;
@@ -2005,20 +2018,33 @@ static noinline int copy_to_sk(struct btrfs_root *root,
 		sh.transid = found_transid;
 
 		/* copy search result header */
-		memcpy(buf + *sk_offset, &sh, sizeof(sh));
+		if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) {
+			ret = -EFAULT;
+			goto out;
+		}
+
 		*sk_offset += sizeof(sh);
 
 		if (item_len) {
-			char *p = buf + *sk_offset;
+			char __user *up = ubuf + *sk_offset;
 			/* copy the item */
-			read_extent_buffer(leaf, p,
-					   item_off, item_len);
+			if (read_extent_buffer_to_user(leaf, up,
+						       item_off, item_len)) {
+				ret = -EFAULT;
+				goto out;
+			}
+
 			*sk_offset += item_len;
 		}
 		(*num_found)++;
 
-		if (*num_found >= sk->nr_items)
-			break;
+		if (ret) /* -EOVERFLOW from above */
+			goto out;
+
+		if (*num_found >= sk->nr_items) {
+			ret = 1;
+			goto out;
+		}
 	}
 advance_key:
 	ret = 0;
@@ -2033,22 +2059,37 @@ advance_key:
 			key->objectid++;
 	} else
 		ret = 1;
-overflow:
+out:
+	/*
+	 *  0: all items from this leaf copied, continue with next
+	 *  1: * more items can be copied, but unused buffer is too small
+	 *     * all items were found
+	 *     Either way, it will stop the loop which iterates to the next
+	 *     leaf
+	 *  -EOVERFLOW: item was too large for buffer
+	 *  -EFAULT: could not copy extent buffer back to userspace
+	 */
 	return ret;
 }
 
 static noinline int search_ioctl(struct inode *inode,
-				 struct btrfs_ioctl_search_args *args)
+				 struct btrfs_ioctl_search_key *sk,
+				 size_t *buf_size,
+				 char __user *ubuf)
 {
 	struct btrfs_root *root;
 	struct btrfs_key key;
 	struct btrfs_path *path;
-	struct btrfs_ioctl_search_key *sk = &args->key;
 	struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
 	int ret;
 	int num_found = 0;
 	unsigned long sk_offset = 0;
 
+	if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) {
+		*buf_size = sizeof(struct btrfs_ioctl_search_header);
+		return -EOVERFLOW;
+	}
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -2082,14 +2123,15 @@ static noinline int search_ioctl(struct inode *inode,
 			ret = 0;
 			goto err;
 		}
-		ret = copy_to_sk(root, path, &key, sk, args->buf,
+		ret = copy_to_sk(root, path, &key, sk, buf_size, ubuf,
 				 &sk_offset, &num_found);
 		btrfs_release_path(path);
-		if (ret || num_found >= sk->nr_items)
+		if (ret)
 			break;
 
 	}
-	ret = 0;
+	if (ret > 0)
+		ret = 0;
 err:
 	sk->nr_items = num_found;
 	btrfs_free_path(path);
@@ -2099,22 +2141,73 @@ err:
 static noinline int btrfs_ioctl_tree_search(struct file *file,
 					   void __user *argp)
 {
-	struct btrfs_ioctl_search_args *args;
-	struct inode *inode;
-	int ret;
+	struct btrfs_ioctl_search_args __user *uargs;
+	struct btrfs_ioctl_search_key sk;
+	struct inode *inode;
+	int ret;
+	size_t buf_size;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	args = memdup_user(argp, sizeof(*args));
-	if (IS_ERR(args))
-		return PTR_ERR(args);
+	uargs = (struct btrfs_ioctl_search_args __user *)argp;
+
+	if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
+		return -EFAULT;
+
+	buf_size = sizeof(uargs->buf);
 
 	inode = file_inode(file);
-	ret = search_ioctl(inode, args);
-	if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
+	ret = search_ioctl(inode, &sk, &buf_size, uargs->buf);
+
+	/*
+	 * In the original implementation an overflow is handled by returning
+	 * a search header with a len of zero, so reset ret.
+	 */
+	if (ret == -EOVERFLOW)
+		ret = 0;
+
+	if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk)))
 		ret = -EFAULT;
-	kfree(args);
+	return ret;
+}
+
+static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
+					       void __user *argp)
+{
+	struct btrfs_ioctl_search_args_v2 __user *uarg;
+	struct btrfs_ioctl_search_args_v2 args;
+	struct inode *inode;
+	int ret;
+	size_t buf_size;
+	const size_t buf_limit = 16 * 1024 * 1024;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* copy search header and buffer size */
+	uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp;
+	if (copy_from_user(&args, uarg, sizeof(args)))
+		return -EFAULT;
+
+	buf_size = args.buf_size;
+
+	if (buf_size < sizeof(struct btrfs_ioctl_search_header))
+		return -EOVERFLOW;
+
+	/* limit result size to 16MB */
+	if (buf_size > buf_limit)
+		buf_size = buf_limit;
+
+	inode = file_inode(file);
+	ret = search_ioctl(inode, &args.key, &buf_size,
+			   (char *)(&uarg->buf[0]));
+	if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
+		ret = -EFAULT;
+	else if (ret == -EOVERFLOW &&
+		copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size)))
+		ret = -EFAULT;
+
 	return ret;
 }
 
@@ -5198,6 +5291,8 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_trans_end(file);
 	case BTRFS_IOC_TREE_SEARCH:
 		return btrfs_ioctl_tree_search(file, argp);
+	case BTRFS_IOC_TREE_SEARCH_V2:
+		return btrfs_ioctl_tree_search_v2(file, argp);
 	case BTRFS_IOC_INO_LOOKUP:
 		return btrfs_ioctl_ino_lookup(file, argp);
 	case BTRFS_IOC_INO_PATHS:
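
The v2 ioctl added above drops v1's fixed, page-sized result buffer: the caller chooses the buffer size (capped at 16MB), and on EOVERFLOW the kernel writes back the size the oversized item would need. A userspace sketch (assumes the btrfs_ioctl_search_args_v2 definitions from this series are visible in <linux/btrfs.h>; error handling is minimal):

#include <sys/ioctl.h>
#include <linux/btrfs.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Sketch only: run one BTRFS_IOC_TREE_SEARCH_V2 call with a
 * caller-sized result buffer appended after the fixed header.
 */
static int tree_search_v2(int fd, const struct btrfs_ioctl_search_key *key)
{
	size_t buf_size = 64 * 1024;	/* anything up to the 16MB cap */
	struct btrfs_ioctl_search_args_v2 *args;

	args = malloc(sizeof(*args) + buf_size);
	if (!args)
		return -1;

	memcpy(&args->key, key, sizeof(*key));
	args->buf_size = buf_size;

	if (ioctl(fd, BTRFS_IOC_TREE_SEARCH_V2, args) < 0) {
		/* on EOVERFLOW, args->buf_size now holds the needed size */
		perror("BTRFS_IOC_TREE_SEARCH_V2");
		free(args);
		return -1;
	}
	/* the kernel updates nr_items to the number of items copied */
	printf("%u items returned\n", args->key.nr_items);
	free(args);
	return 0;
}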
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index cf5aead95a7f..98cb6b2630f9 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1798,8 +1798,10 @@ static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	tmp = ulist_alloc(GFP_NOFS);
-	if (!tmp)
+	if (!tmp) {
+		ulist_free(qgroups);
 		return -ENOMEM;
+	}
 
 	btrfs_get_tree_mod_seq(fs_info, &elem);
 	ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq,
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 30947f923620..09230cf3a244 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -428,8 +428,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 			continue;
 		}
 		if (!dev->bdev) {
-			/* cannot read ahead on missing device */
-			continue;
+			/*
+			 * cannot read ahead on missing device, but for RAID5/6,
+			 * REQ_GET_READ_MIRRORS returns 1, so don't skip the
+			 * missing device in that case.
+			 */
+			if (nzones > 1)
+				continue;
 		}
 		if (dev_replace_is_ongoing &&
 		    dev == fs_info->dev_replace.tgtdev) {
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index a5dcacb5df9c..9626252ee6b4 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -135,7 +135,7 @@ restart:
 	radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
 		struct extent_buffer *eb;
 
-		eb = radix_tree_deref_slot(slot);
+		eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
 		if (!eb)
 			continue;
 		/* Shouldn't happen but that kind of thinking creates CVE's */
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index fa691b754aaf..ec3dcb202357 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -415,6 +415,8 @@ int btrfs_test_qgroups(void)
 		ret = -ENOMEM;
 		goto out;
 	}
+	btrfs_set_header_level(root->node, 0);
+	btrfs_set_header_nritems(root->node, 0);
 	root->alloc_bytenr += 8192;
 
 	tmp_root = btrfs_alloc_dummy_root();
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9630f10f8e1e..511839c04f11 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1284,11 +1284,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 		goto fail;
 	}
 
-	pending->error = btrfs_qgroup_inherit(trans, fs_info,
-					      root->root_key.objectid,
-					      objectid, pending->inherit);
-	if (pending->error)
-		goto no_free_objectid;
+	ret = btrfs_qgroup_inherit(trans, fs_info,
+				   root->root_key.objectid,
+				   objectid, pending->inherit);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
 
 	/* see comments in should_cow_block() */
 	set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 21887d63dad5..469f2e8657e8 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -104,12 +104,6 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
 	struct dentry *dentry;
 
-	if (acl) {
-		ret = posix_acl_valid(acl);
-		if (ret < 0)
-			goto out;
-	}
-
 	switch (type) {
 	case ACL_TYPE_ACCESS:
 		name = POSIX_ACL_XATTR_ACCESS;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 4f3f69079f36..90b3954d48ed 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -211,18 +211,15 @@ static int readpage_nounlock(struct file *filp, struct page *page)
 		SetPageError(page);
 		ceph_fscache_readpage_cancel(inode, page);
 		goto out;
-	} else {
-		if (err < PAGE_CACHE_SIZE) {
-			/* zero fill remainder of page */
-			zero_user_segment(page, err, PAGE_CACHE_SIZE);
-		} else {
-			flush_dcache_page(page);
-		}
 	}
-	SetPageUptodate(page);
+	if (err < PAGE_CACHE_SIZE)
+		/* zero fill remainder of page */
+		zero_user_segment(page, err, PAGE_CACHE_SIZE);
+	else
+		flush_dcache_page(page);
 
-	if (err >= 0)
-		ceph_readpage_to_fscache(inode, page);
+	SetPageUptodate(page);
+	ceph_readpage_to_fscache(inode, page);
 
 out:
 	return err < 0 ? err : 0;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index c561b628ebce..1fde164b74b5 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -221,8 +221,8 @@ int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
 	return 0;
 }
 
-static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
-				struct ceph_cap_reservation *ctx)
+struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
+			      struct ceph_cap_reservation *ctx)
 {
 	struct ceph_cap *cap = NULL;
 
@@ -508,15 +508,14 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
  * it is < 0.  (This is so we can atomically add the cap and add an
  * open file reference to it.)
  */
-int ceph_add_cap(struct inode *inode,
+void ceph_add_cap(struct inode *inode,
 		 struct ceph_mds_session *session, u64 cap_id,
 		 int fmode, unsigned issued, unsigned wanted,
 		 unsigned seq, unsigned mseq, u64 realmino, int flags,
-		 struct ceph_cap_reservation *caps_reservation)
+		 struct ceph_cap **new_cap)
 {
 	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_cap *new_cap = NULL;
 	struct ceph_cap *cap;
 	int mds = session->s_mds;
 	int actual_wanted;
@@ -531,20 +530,10 @@ int ceph_add_cap(struct inode *inode,
 	if (fmode >= 0)
 		wanted |= ceph_caps_for_mode(fmode);
 
-retry:
-	spin_lock(&ci->i_ceph_lock);
 	cap = __get_cap_for_mds(ci, mds);
 	if (!cap) {
-		if (new_cap) {
-			cap = new_cap;
-			new_cap = NULL;
-		} else {
-			spin_unlock(&ci->i_ceph_lock);
-			new_cap = get_cap(mdsc, caps_reservation);
-			if (new_cap == NULL)
-				return -ENOMEM;
-			goto retry;
-		}
+		cap = *new_cap;
+		*new_cap = NULL;
 
 		cap->issued = 0;
 		cap->implemented = 0;
@@ -562,9 +551,6 @@ retry:
 		session->s_nr_caps++;
 		spin_unlock(&session->s_cap_lock);
 	} else {
-		if (new_cap)
-			ceph_put_cap(mdsc, new_cap);
-
 		/*
 		 * auth mds of the inode changed. we received the cap export
 		 * message, but still haven't received the cap import message.
@@ -626,7 +612,6 @@ retry:
 			ci->i_auth_cap = cap;
 			cap->mds_wanted = wanted;
 		}
-		ci->i_cap_exporting_issued = 0;
 	} else {
 		WARN_ON(ci->i_auth_cap == cap);
 	}
@@ -648,9 +633,6 @@ retry:
 
 	if (fmode >= 0)
 		__ceph_get_fmode(ci, fmode);
-	spin_unlock(&ci->i_ceph_lock);
-	wake_up_all(&ci->i_cap_wq);
-	return 0;
 }
 
 /*
@@ -685,7 +667,7 @@ static int __cap_is_valid(struct ceph_cap *cap)
  */
 int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
 {
-	int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
+	int have = ci->i_snap_caps;
 	struct ceph_cap *cap;
 	struct rb_node *p;
 
@@ -900,7 +882,7 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
  */
 static int __ceph_is_any_caps(struct ceph_inode_info *ci)
 {
-	return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_issued;
+	return !RB_EMPTY_ROOT(&ci->i_caps);
 }
 
 int ceph_is_any_caps(struct inode *inode)
@@ -2397,32 +2379,30 @@ static void invalidate_aliases(struct inode *inode)
  * actually be a revocation if it specifies a smaller cap set.)
  *
  * caller holds s_mutex and i_ceph_lock, we drop both.
- *
- * return value:
- *  0 - ok
- *  1 - check_caps on auth cap only (writeback)
- *  2 - check_caps (ack revoke)
  */
-static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
+static void handle_cap_grant(struct ceph_mds_client *mdsc,
+			     struct inode *inode, struct ceph_mds_caps *grant,
+			     void *snaptrace, int snaptrace_len,
+			     struct ceph_buffer *xattr_buf,
 			     struct ceph_mds_session *session,
-			     struct ceph_cap *cap,
-			     struct ceph_buffer *xattr_buf)
+			     struct ceph_cap *cap, int issued)
 	__releases(ci->i_ceph_lock)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int mds = session->s_mds;
 	int seq = le32_to_cpu(grant->seq);
 	int newcaps = le32_to_cpu(grant->caps);
-	int issued, implemented, used, wanted, dirty;
+	int used, wanted, dirty;
 	u64 size = le64_to_cpu(grant->size);
 	u64 max_size = le64_to_cpu(grant->max_size);
 	struct timespec mtime, atime, ctime;
 	int check_caps = 0;
-	int wake = 0;
-	int writeback = 0;
-	int queue_invalidate = 0;
-	int deleted_inode = 0;
-	int queue_revalidate = 0;
+	bool wake = 0;
+	bool writeback = 0;
+	bool queue_trunc = 0;
+	bool queue_invalidate = 0;
+	bool queue_revalidate = 0;
+	bool deleted_inode = 0;
 
 	dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
 	     inode, cap, mds, seq, ceph_cap_string(newcaps));
@@ -2466,16 +2446,13 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 	}
 
 	/* side effects now are allowed */
-
-	issued = __ceph_caps_issued(ci, &implemented);
-	issued |= implemented | __ceph_caps_dirty(ci);
-
 	cap->cap_gen = session->s_cap_gen;
 	cap->seq = seq;
 
 	__check_cap_issue(ci, cap, newcaps);
 
-	if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
+	if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
+	    (issued & CEPH_CAP_AUTH_EXCL) == 0) {
 		inode->i_mode = le32_to_cpu(grant->mode);
 		inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
 		inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
@@ -2484,7 +2461,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 		     from_kgid(&init_user_ns, inode->i_gid));
 	}
 
-	if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
+	if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
+	    (issued & CEPH_CAP_LINK_EXCL) == 0) {
 		set_nlink(inode, le32_to_cpu(grant->nlink));
 		if (inode->i_nlink == 0 &&
 		    (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
@@ -2511,30 +2489,35 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 	if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1)
 		queue_revalidate = 1;
 
-	/* size/ctime/mtime/atime? */
-	ceph_fill_file_size(inode, issued,
-			    le32_to_cpu(grant->truncate_seq),
-			    le64_to_cpu(grant->truncate_size), size);
-	ceph_decode_timespec(&mtime, &grant->mtime);
-	ceph_decode_timespec(&atime, &grant->atime);
-	ceph_decode_timespec(&ctime, &grant->ctime);
-	ceph_fill_file_time(inode, issued,
-			    le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
-			    &atime);
-
-
-	/* file layout may have changed */
-	ci->i_layout = grant->layout;
-
-	/* max size increase? */
-	if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
-		dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
-		ci->i_max_size = max_size;
-		if (max_size >= ci->i_wanted_max_size) {
-			ci->i_wanted_max_size = 0;	/* reset */
-			ci->i_requested_max_size = 0;
+	if (newcaps & CEPH_CAP_ANY_RD) {
+		/* ctime/mtime/atime? */
+		ceph_decode_timespec(&mtime, &grant->mtime);
+		ceph_decode_timespec(&atime, &grant->atime);
+		ceph_decode_timespec(&ctime, &grant->ctime);
+		ceph_fill_file_time(inode, issued,
+				    le32_to_cpu(grant->time_warp_seq),
+				    &ctime, &mtime, &atime);
+	}
+
+	if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
+		/* file layout may have changed */
+		ci->i_layout = grant->layout;
+		/* size/truncate_seq? */
+		queue_trunc = ceph_fill_file_size(inode, issued,
+					le32_to_cpu(grant->truncate_seq),
+					le64_to_cpu(grant->truncate_size),
+					size);
+		/* max size increase? */
+		if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
+			dout("max_size %lld -> %llu\n",
+			     ci->i_max_size, max_size);
+			ci->i_max_size = max_size;
+			if (max_size >= ci->i_wanted_max_size) {
+				ci->i_wanted_max_size = 0;	/* reset */
+				ci->i_requested_max_size = 0;
+			}
+			wake = 1;
 		}
-		wake = 1;
 	}
 
 	/* check cap bits */
@@ -2595,6 +2578,23 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 
 	spin_unlock(&ci->i_ceph_lock);
 
+	if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
+		down_write(&mdsc->snap_rwsem);
+		ceph_update_snap_trace(mdsc, snaptrace,
+				       snaptrace + snaptrace_len, false);
+		downgrade_write(&mdsc->snap_rwsem);
+		kick_flushing_inode_caps(mdsc, session, inode);
+		up_read(&mdsc->snap_rwsem);
+		if (newcaps & ~issued)
+			wake = 1;
+	}
+
+	if (queue_trunc) {
+		ceph_queue_vmtruncate(inode);
+		ceph_queue_revalidate(inode);
+	} else if (queue_revalidate)
+		ceph_queue_revalidate(inode);
+
 	if (writeback)
 		/*
 		 * queue inode for writeback: we can't actually call
@@ -2606,8 +2606,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 		ceph_queue_invalidate(inode);
 	if (deleted_inode)
 		invalidate_aliases(inode);
-	if (queue_revalidate)
-		ceph_queue_revalidate(inode);
 	if (wake)
 		wake_up_all(&ci->i_cap_wq);
 
@@ -2784,7 +2782,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
 {
 	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	struct ceph_mds_session *tsession = NULL;
-	struct ceph_cap *cap, *tcap;
+	struct ceph_cap *cap, *tcap, *new_cap = NULL;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	u64 t_cap_id;
 	unsigned mseq = le32_to_cpu(ex->migrate_seq);
@@ -2807,7 +2805,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
 retry:
 	spin_lock(&ci->i_ceph_lock);
 	cap = __get_cap_for_mds(ci, mds);
-	if (!cap)
+	if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id))
 		goto out_unlock;
 
 	if (target < 0) {
@@ -2846,15 +2844,14 @@ retry:
 		}
 		__ceph_remove_cap(cap, false);
 		goto out_unlock;
-	}
-
-	if (tsession) {
-		int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
-		spin_unlock(&ci->i_ceph_lock);
+	} else if (tsession) {
 		/* add placeholder for the export tagert */
+		int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
 		ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
-			     t_seq - 1, t_mseq, (u64)-1, flag, NULL);
-		goto retry;
+			     t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
+
+		__ceph_remove_cap(cap, false);
+		goto out_unlock;
 	}
 
 	spin_unlock(&ci->i_ceph_lock);
@@ -2873,6 +2870,7 @@ retry:
 					  SINGLE_DEPTH_NESTING);
 		}
 		ceph_add_cap_releases(mdsc, tsession);
+		new_cap = ceph_get_cap(mdsc, NULL);
 	} else {
 		WARN_ON(1);
 		tsession = NULL;
@@ -2887,24 +2885,27 @@ out_unlock:
 		mutex_unlock(&tsession->s_mutex);
 		ceph_put_mds_session(tsession);
 	}
+	if (new_cap)
+		ceph_put_cap(mdsc, new_cap);
 }
 
 /*
- * Handle cap IMPORT.  If there are temp bits from an older EXPORT,
- * clean them up.
+ * Handle cap IMPORT.
  *
- * caller holds s_mutex.
+ * caller holds s_mutex. acquires i_ceph_lock
 */
 static void handle_cap_import(struct ceph_mds_client *mdsc,
			      struct inode *inode, struct ceph_mds_caps *im,
			      struct ceph_mds_cap_peer *ph,
			      struct ceph_mds_session *session,
-			      void *snaptrace, int snaptrace_len)
+			      struct ceph_cap **target_cap, int *old_issued)
+	__acquires(ci->i_ceph_lock)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_cap *cap;
+	struct ceph_cap *cap, *ocap, *new_cap = NULL;
 	int mds = session->s_mds;
-	unsigned issued = le32_to_cpu(im->caps);
+	int issued;
+	unsigned caps = le32_to_cpu(im->caps);
 	unsigned wanted = le32_to_cpu(im->wanted);
 	unsigned seq = le32_to_cpu(im->seq);
 	unsigned mseq = le32_to_cpu(im->migrate_seq);
@@ -2924,40 +2925,52 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
 	dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n",
 	     inode, ci, mds, mseq, peer);
 
+retry:
 	spin_lock(&ci->i_ceph_lock);
-	cap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
-	if (cap && cap->cap_id == p_cap_id) {
+	cap = __get_cap_for_mds(ci, mds);
+	if (!cap) {
+		if (!new_cap) {
+			spin_unlock(&ci->i_ceph_lock);
+			new_cap = ceph_get_cap(mdsc, NULL);
+			goto retry;
+		}
+		cap = new_cap;
+	} else {
+		if (new_cap) {
+			ceph_put_cap(mdsc, new_cap);
+			new_cap = NULL;
+		}
+	}
+
+	__ceph_caps_issued(ci, &issued);
+	issued |= __ceph_caps_dirty(ci);
+
+	ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq,
+		     realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
+
+	ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
+	if (ocap && ocap->cap_id == p_cap_id) {
 		dout(" remove export cap %p mds%d flags %d\n",
-		     cap, peer, ph->flags);
+		     ocap, peer, ph->flags);
 		if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
-		    (cap->seq != le32_to_cpu(ph->seq) ||
-		     cap->mseq != le32_to_cpu(ph->mseq))) {
+		    (ocap->seq != le32_to_cpu(ph->seq) ||
+		     ocap->mseq != le32_to_cpu(ph->mseq))) {
 			pr_err("handle_cap_import: mismatched seq/mseq: "
 			       "ino (%llx.%llx) mds%d seq %d mseq %d "
 			       "importer mds%d has peer seq %d mseq %d\n",
-			       ceph_vinop(inode), peer, cap->seq,
-			       cap->mseq, mds, le32_to_cpu(ph->seq),
+			       ceph_vinop(inode), peer, ocap->seq,
+			       ocap->mseq, mds, le32_to_cpu(ph->seq),
 			       le32_to_cpu(ph->mseq));
 		}
-		ci->i_cap_exporting_issued = cap->issued;
-		__ceph_remove_cap(cap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
+		__ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
 	}
 
 	/* make sure we re-request max_size, if necessary */
 	ci->i_wanted_max_size = 0;
 	ci->i_requested_max_size = 0;
-	spin_unlock(&ci->i_ceph_lock);
-
-	down_write(&mdsc->snap_rwsem);
-	ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
-			       false);
-	downgrade_write(&mdsc->snap_rwsem);
-	ceph_add_cap(inode, session, cap_id, -1,
-		     issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
-		     NULL /* no caps context */);
-	kick_flushing_inode_caps(mdsc, session, inode);
-	up_read(&mdsc->snap_rwsem);
 
+	*old_issued = issued;
+	*target_cap = cap;
 }
 
 /*
@@ -2977,7 +2990,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	struct ceph_mds_caps *h;
 	struct ceph_mds_cap_peer *peer = NULL;
 	int mds = session->s_mds;
-	int op;
+	int op, issued;
 	u32 seq, mseq;
 	struct ceph_vino vino;
 	u64 cap_id;
@@ -3069,7 +3082,10 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 
 	case CEPH_CAP_OP_IMPORT:
 		handle_cap_import(mdsc, inode, h, peer, session,
-				  snaptrace, snaptrace_len);
+				  &cap, &issued);
+		handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len,
+				 msg->middle, session, cap, issued);
+		goto done_unlocked;
 	}
 
 	/* the rest require a cap */
@@ -3086,8 +3102,10 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	switch (op) {
 	case CEPH_CAP_OP_REVOKE:
 	case CEPH_CAP_OP_GRANT:
-	case CEPH_CAP_OP_IMPORT:
-		handle_cap_grant(inode, h, session, cap, msg->middle);
+		__ceph_caps_issued(ci, &issued);
+		issued |= __ceph_caps_dirty(ci);
+		handle_cap_grant(mdsc, inode, h, NULL, 0, msg->middle,
+				 session, cap, issued);
 		goto done_unlocked;
 
 	case CEPH_CAP_OP_FLUSH_ACK:
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 00d6af6a32ec..8d7d782f4382 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -169,7 +169,7 @@ static struct dentry *__get_parent(struct super_block *sb,
 	return dentry;
 }
 
-struct dentry *ceph_get_parent(struct dentry *child)
+static struct dentry *ceph_get_parent(struct dentry *child)
 {
 	/* don't re-export snaps */
 	if (ceph_snap(child->d_inode) != CEPH_NOSNAP)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e4fff9ff1c27..04c89c266cec 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -10,6 +10,7 @@
 #include <linux/writeback.h>
 #include <linux/vmalloc.h>
 #include <linux/posix_acl.h>
+#include <linux/random.h>
 
 #include "super.h"
 #include "mds_client.h"
@@ -179,9 +180,8 @@ struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
  * specified, copy the frag delegation info to the caller if
 * it is present.
 */
-u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
-		     struct ceph_inode_frag *pfrag,
-		     int *found)
+static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+			      struct ceph_inode_frag *pfrag, int *found)
 {
 	u32 t = ceph_frag_make(0, 0);
 	struct ceph_inode_frag *frag;
@@ -191,7 +191,6 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
 	if (found)
 		*found = 0;
 
-	mutex_lock(&ci->i_fragtree_mutex);
 	while (1) {
 		WARN_ON(!ceph_frag_contains_value(t, v));
 		frag = __ceph_find_frag(ci, t);
@@ -220,10 +219,19 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
 	}
 	dout("choose_frag(%x) = %x\n", v, t);
 
-	mutex_unlock(&ci->i_fragtree_mutex);
 	return t;
 }
 
+u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+		     struct ceph_inode_frag *pfrag, int *found)
+{
+	u32 ret;
+	mutex_lock(&ci->i_fragtree_mutex);
+	ret = __ceph_choose_frag(ci, v, pfrag, found);
+	mutex_unlock(&ci->i_fragtree_mutex);
+	return ret;
+}
+
 /*
  * Process dirfrag (delegation) info from the mds.  Include leaf
  * fragment in tree ONLY if ndist > 0.  Otherwise, only
@@ -237,11 +245,17 @@ static int ceph_fill_dirfrag(struct inode *inode,
 	u32 id = le32_to_cpu(dirinfo->frag);
 	int mds = le32_to_cpu(dirinfo->auth);
 	int ndist = le32_to_cpu(dirinfo->ndist);
+	int diri_auth = -1;
 	int i;
 	int err = 0;
 
+	spin_lock(&ci->i_ceph_lock);
+	if (ci->i_auth_cap)
+		diri_auth = ci->i_auth_cap->mds;
+	spin_unlock(&ci->i_ceph_lock);
+
 	mutex_lock(&ci->i_fragtree_mutex);
-	if (ndist == 0) {
+	if (ndist == 0 && mds == diri_auth) {
 		/* no delegation info needed. */
 		frag = __ceph_find_frag(ci, id);
 		if (!frag)
@@ -286,6 +300,75 @@ out:
 	return err;
 }
 
+static int ceph_fill_fragtree(struct inode *inode,
+			      struct ceph_frag_tree_head *fragtree,
+			      struct ceph_mds_reply_dirfrag *dirinfo)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_inode_frag *frag;
+	struct rb_node *rb_node;
+	int i;
+	u32 id, nsplits;
+	bool update = false;
+
+	mutex_lock(&ci->i_fragtree_mutex);
+	nsplits = le32_to_cpu(fragtree->nsplits);
+	if (nsplits) {
+		i = prandom_u32() % nsplits;
+		id = le32_to_cpu(fragtree->splits[i].frag);
+		if (!__ceph_find_frag(ci, id))
+			update = true;
+	} else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) {
+		rb_node = rb_first(&ci->i_fragtree);
+		frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+		if (frag->frag != ceph_frag_make(0, 0) || rb_next(rb_node))
+			update = true;
+	}
+	if (!update && dirinfo) {
+		id = le32_to_cpu(dirinfo->frag);
+		if (id != __ceph_choose_frag(ci, id, NULL, NULL))
+			update = true;
+	}
+	if (!update)
+		goto out_unlock;
+
+	dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode));
+	rb_node = rb_first(&ci->i_fragtree);
+	for (i = 0; i < nsplits; i++) {
+		id = le32_to_cpu(fragtree->splits[i].frag);
+		frag = NULL;
+		while (rb_node) {
+			frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+			if (ceph_frag_compare(frag->frag, id) >= 0) {
+				if (frag->frag != id)
+					frag = NULL;
+				else
+					rb_node = rb_next(rb_node);
+				break;
+			}
+			rb_node = rb_next(rb_node);
+			rb_erase(&frag->node, &ci->i_fragtree);
+			kfree(frag);
+			frag = NULL;
+		}
+		if (!frag) {
+			frag = __get_or_create_frag(ci, id);
+			if (IS_ERR(frag))
+				continue;
+		}
+		frag->split_by = le32_to_cpu(fragtree->splits[i].by);
+		dout(" frag %x split by %d\n", frag->frag, frag->split_by);
+	}
+	while (rb_node) {
+		frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+		rb_node = rb_next(rb_node);
+		rb_erase(&frag->node, &ci->i_fragtree);
+		kfree(frag);
+	}
+out_unlock:
+	mutex_unlock(&ci->i_fragtree_mutex);
+	return 0;
+}
 
 /*
  * initialize a newly allocated inode.
@@ -341,7 +424,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 	INIT_LIST_HEAD(&ci->i_cap_snaps);
 	ci->i_head_snapc = NULL;
 	ci->i_snap_caps = 0;
-	ci->i_cap_exporting_issued = 0;
 
 	for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
 		ci->i_nr_by_mode[i] = 0;
@@ -407,7 +489,7 @@ void ceph_destroy_inode(struct inode *inode)
 
 	/*
 	 * we may still have a snap_realm reference if there are stray
-	 * caps in i_cap_exporting_issued or i_snap_caps.
+	 * caps in i_snap_caps.
 	 */
 	if (ci->i_snap_realm) {
 		struct ceph_mds_client *mdsc =
@@ -582,22 +664,26 @@ static int fill_inode(struct inode *inode,
 		       unsigned long ttl_from, int cap_fmode,
 		       struct ceph_cap_reservation *caps_reservation)
 {
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	struct ceph_mds_reply_inode *info = iinfo->in;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	int i;
-	int issued = 0, implemented;
+	int issued = 0, implemented, new_issued;
 	struct timespec mtime, atime, ctime;
-	u32 nsplits;
-	struct ceph_inode_frag *frag;
-	struct rb_node *rb_node;
 	struct ceph_buffer *xattr_blob = NULL;
+	struct ceph_cap *new_cap = NULL;
 	int err = 0;
-	int queue_trunc = 0;
+	bool wake = false;
+	bool queue_trunc = false;
+	bool new_version = false;
 
 	dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
 	     inode, ceph_vinop(inode), le64_to_cpu(info->version),
 	     ci->i_version);
 
+	/* prealloc new cap struct */
+	if (info->cap.caps && ceph_snap(inode) == CEPH_NOSNAP)
+		new_cap = ceph_get_cap(mdsc, caps_reservation);
+
 	/*
 	 * prealloc xattr data, if it looks like we'll need it.  only
 	 * if len > 4 (meaning there are actually xattrs; the first 4
@@ -623,19 +709,23 @@ static int fill_inode(struct inode *inode,
  *   3       2     skip
  *   3       3     update
 */
-	if (le64_to_cpu(info->version) > 0 &&
-	    (ci->i_version & ~1) >= le64_to_cpu(info->version))
-		goto no_change;
-
+	if (ci->i_version == 0 ||
+	    ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
+	     le64_to_cpu(info->version) > (ci->i_version & ~1)))
+		new_version = true;
+
 	issued = __ceph_caps_issued(ci, &implemented);
 	issued |= implemented | __ceph_caps_dirty(ci);
+	new_issued = ~issued & le32_to_cpu(info->cap.caps);
 
 	/* update inode */
 	ci->i_version = le64_to_cpu(info->version);
 	inode->i_version++;
 	inode->i_rdev = le32_to_cpu(info->rdev);
+	inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
 
-	if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
+	if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
+	    (issued & CEPH_CAP_AUTH_EXCL) == 0) {
 		inode->i_mode = le32_to_cpu(info->mode);
 		inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid));
 		inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));
@@ -644,23 +734,35 @@ static int fill_inode(struct inode *inode,
 			   from_kgid(&init_user_ns, inode->i_gid));
 	}
 
-	if ((issued & CEPH_CAP_LINK_EXCL) == 0)
+	if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
+	    (issued & CEPH_CAP_LINK_EXCL) == 0)
 		set_nlink(inode, le32_to_cpu(info->nlink));
 
-	/* be careful with mtime, atime, size */
-	ceph_decode_timespec(&atime, &info->atime);
-	ceph_decode_timespec(&mtime, &info->mtime);
-	ceph_decode_timespec(&ctime, &info->ctime);
-	queue_trunc = ceph_fill_file_size(inode, issued,
-					  le32_to_cpu(info->truncate_seq),
-					  le64_to_cpu(info->truncate_size),
-					  le64_to_cpu(info->size));
-	ceph_fill_file_time(inode, issued,
-			    le32_to_cpu(info->time_warp_seq),
-			    &ctime, &mtime, &atime);
-
-	ci->i_layout = info->layout;
-	inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
+	if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
+		/* be careful with mtime, atime, size */
+		ceph_decode_timespec(&atime, &info->atime);
+		ceph_decode_timespec(&mtime, &info->mtime);
+		ceph_decode_timespec(&ctime, &info->ctime);
+		ceph_fill_file_time(inode, issued,
+				    le32_to_cpu(info->time_warp_seq),
+				    &ctime, &mtime, &atime);
+	}
+
+	if (new_version ||
+	    (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+		ci->i_layout = info->layout;
+		queue_trunc = ceph_fill_file_size(inode, issued,
+					le32_to_cpu(info->truncate_seq),
+					le64_to_cpu(info->truncate_size),
+					le64_to_cpu(info->size));
+		/* only update max_size on auth cap */
+		if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
+		    ci->i_max_size != le64_to_cpu(info->max_size)) {
+			dout("max_size %lld -> %llu\n", ci->i_max_size,
+			     le64_to_cpu(info->max_size));
+			ci->i_max_size = le64_to_cpu(info->max_size);
+		}
+	}
 
 	/* xattrs */
 	/* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
@@ -745,58 +847,6 @@ static int fill_inode(struct inode *inode,
745 dout(" marking %p complete (empty)\n", inode); 847 dout(" marking %p complete (empty)\n", inode);
746 __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count)); 848 __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count));
747 } 849 }
748no_change:
749 /* only update max_size on auth cap */
750 if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
751 ci->i_max_size != le64_to_cpu(info->max_size)) {
752 dout("max_size %lld -> %llu\n", ci->i_max_size,
753 le64_to_cpu(info->max_size));
754 ci->i_max_size = le64_to_cpu(info->max_size);
755 }
756
757 spin_unlock(&ci->i_ceph_lock);
758
759 /* queue truncate if we saw i_size decrease */
760 if (queue_trunc)
761 ceph_queue_vmtruncate(inode);
762
763 /* populate frag tree */
764 /* FIXME: move me up, if/when version reflects fragtree changes */
765 nsplits = le32_to_cpu(info->fragtree.nsplits);
766 mutex_lock(&ci->i_fragtree_mutex);
767 rb_node = rb_first(&ci->i_fragtree);
768 for (i = 0; i < nsplits; i++) {
769 u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
770 frag = NULL;
771 while (rb_node) {
772 frag = rb_entry(rb_node, struct ceph_inode_frag, node);
773 if (ceph_frag_compare(frag->frag, id) >= 0) {
774 if (frag->frag != id)
775 frag = NULL;
776 else
777 rb_node = rb_next(rb_node);
778 break;
779 }
780 rb_node = rb_next(rb_node);
781 rb_erase(&frag->node, &ci->i_fragtree);
782 kfree(frag);
783 frag = NULL;
784 }
785 if (!frag) {
786 frag = __get_or_create_frag(ci, id);
787 if (IS_ERR(frag))
788 continue;
789 }
790 frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
791 dout(" frag %x split by %d\n", frag->frag, frag->split_by);
792 }
793 while (rb_node) {
794 frag = rb_entry(rb_node, struct ceph_inode_frag, node);
795 rb_node = rb_next(rb_node);
796 rb_erase(&frag->node, &ci->i_fragtree);
797 kfree(frag);
798 }
799 mutex_unlock(&ci->i_fragtree_mutex);
800 850
801 /* were we issued a capability? */ 851 /* were we issued a capability? */
802 if (info->cap.caps) { 852 if (info->cap.caps) {
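[editor's note] The 58 lines deleted in this hunk are the in-line fragtree merge, which moves into ceph_fill_fragtree() (called later in fill_inode, outside i_ceph_lock). The algorithm is a single-pass merge of two sorted sequences, the MDS's split list and the rbtree, one cursor each: rbtree entries smaller than the next wanted frag are stale and freed, matches are updated in place, misses are created, and anything left after the last wanted frag is purged. A userspace analogue of that merge on a sorted singly linked list, assuming want[] is sorted ascending (all names illustrative):

	#include <stdlib.h>

	struct node { int key; struct node *next; };

	/* make the sorted list match want[]: drop entries not named
	 * there, insert the missing ones, keep the rest */
	static struct node *merge_sorted(struct node *head,
					 const int *want, int n)
	{
		struct node **p = &head;
		int i;

		for (i = 0; i < n; i++) {
			while (*p && (*p)->key < want[i]) {	/* stale */
				struct node *dead = *p;
				*p = dead->next;
				free(dead);
			}
			if (*p && (*p)->key == want[i]) {	/* match */
				p = &(*p)->next;
				continue;
			}
			struct node *nn = malloc(sizeof(*nn));	/* miss */
			if (!nn)
				break;		/* error handling elided */
			nn->key = want[i];
			nn->next = *p;
			*p = nn;
			p = &nn->next;
		}
		while (*p) {			/* trailing stale entries */
			struct node *dead = *p;
			*p = dead->next;
			free(dead);
		}
		return head;
	}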
@@ -809,30 +859,41 @@ no_change:
809 le32_to_cpu(info->cap.seq), 859 le32_to_cpu(info->cap.seq),
810 le32_to_cpu(info->cap.mseq), 860 le32_to_cpu(info->cap.mseq),
811 le64_to_cpu(info->cap.realm), 861 le64_to_cpu(info->cap.realm),
812 info->cap.flags, 862 info->cap.flags, &new_cap);
813 caps_reservation); 863 wake = true;
814 } else { 864 } else {
815 spin_lock(&ci->i_ceph_lock);
816 dout(" %p got snap_caps %s\n", inode, 865 dout(" %p got snap_caps %s\n", inode,
817 ceph_cap_string(le32_to_cpu(info->cap.caps))); 866 ceph_cap_string(le32_to_cpu(info->cap.caps)));
818 ci->i_snap_caps |= le32_to_cpu(info->cap.caps); 867 ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
819 if (cap_fmode >= 0) 868 if (cap_fmode >= 0)
820 __ceph_get_fmode(ci, cap_fmode); 869 __ceph_get_fmode(ci, cap_fmode);
821 spin_unlock(&ci->i_ceph_lock);
822 } 870 }
823 } else if (cap_fmode >= 0) { 871 } else if (cap_fmode >= 0) {
824 pr_warn("mds issued no caps on %llx.%llx\n", 872 pr_warn("mds issued no caps on %llx.%llx\n",
825 ceph_vinop(inode)); 873 ceph_vinop(inode));
826 __ceph_get_fmode(ci, cap_fmode); 874 __ceph_get_fmode(ci, cap_fmode);
827 } 875 }
876 spin_unlock(&ci->i_ceph_lock);
877
878 if (wake)
879 wake_up_all(&ci->i_cap_wq);
880
881 /* queue truncate if we saw i_size decrease */
882 if (queue_trunc)
883 ceph_queue_vmtruncate(inode);
884
885 /* populate frag tree */
886 if (S_ISDIR(inode->i_mode))
887 ceph_fill_fragtree(inode, &info->fragtree, dirinfo);
828 888
829 /* update delegation info? */ 889 /* update delegation info? */
830 if (dirinfo) 890 if (dirinfo)
831 ceph_fill_dirfrag(inode, dirinfo); 891 ceph_fill_dirfrag(inode, dirinfo);
832 892
833 err = 0; 893 err = 0;
834
835out: 894out:
895 if (new_cap)
896 ceph_put_cap(mdsc, new_cap);
836 if (xattr_blob) 897 if (xattr_blob)
837 ceph_buffer_put(xattr_blob); 898 ceph_buffer_put(xattr_blob);
838 return err; 899 return err;
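[editor's note] The new_cap/ceph_put_cap pair added at out: is the caller-side half of the ceph_add_cap() signature change (see the super.h hunk below). The cap-and-inode update now runs entirely under the i_ceph_lock spinlock, where sleeping allocations are forbidden, so the caller preallocates the struct ceph_cap before locking and returns the unused one afterwards. A hedged sketch of the pattern (argument names abbreviated):

	struct ceph_cap *new_cap = NULL;

	if (info->cap.caps)		/* may need a cap record */
		new_cap = ceph_get_cap(mdsc, caps_reservation);	/* can sleep */

	spin_lock(&ci->i_ceph_lock);
	/* ... metadata updates ... */
	ceph_add_cap(inode, session, cap_id, cap_fmode, issued, wanted,
		     cap, seq, realmino, info->cap.flags, &new_cap);
	/* ceph_add_cap() consumes *new_cap only if no cap existed yet */
	spin_unlock(&ci->i_ceph_lock);

	if (new_cap)			/* left over: give it back */
		ceph_put_cap(mdsc, new_cap);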
@@ -1485,7 +1546,7 @@ static void ceph_invalidate_work(struct work_struct *work)
1485 orig_gen = ci->i_rdcache_gen; 1546 orig_gen = ci->i_rdcache_gen;
1486 spin_unlock(&ci->i_ceph_lock); 1547 spin_unlock(&ci->i_ceph_lock);
1487 1548
1488 truncate_inode_pages(inode->i_mapping, 0); 1549 truncate_pagecache(inode, 0);
1489 1550
1490 spin_lock(&ci->i_ceph_lock); 1551 spin_lock(&ci->i_ceph_lock);
1491 if (orig_gen == ci->i_rdcache_gen && 1552 if (orig_gen == ci->i_rdcache_gen &&
@@ -1588,7 +1649,7 @@ retry:
1588 ci->i_truncate_pending, to); 1649 ci->i_truncate_pending, to);
1589 spin_unlock(&ci->i_ceph_lock); 1650 spin_unlock(&ci->i_ceph_lock);
1590 1651
1591 truncate_inode_pages(inode->i_mapping, to); 1652 truncate_pagecache(inode, to);
1592 1653
1593 spin_lock(&ci->i_ceph_lock); 1654 spin_lock(&ci->i_ceph_lock);
1594 if (to == ci->i_truncate_size) { 1655 if (to == ci->i_truncate_size) {
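[editor's note] Both truncate sites switch from truncate_inode_pages() to truncate_pagecache(). The difference matters for mmap: truncate_inode_pages() only drops page-cache pages, so an existing user mapping could keep stale pages reachable past the truncate, while truncate_pagecache() also unmaps the affected range. For reference, the helper is roughly this (paraphrased from mm/truncate.c of this era; the second unmap closes the race with faults that repopulate ptes during the purge):

	void truncate_pagecache(struct inode *inode, loff_t newsize)
	{
		struct address_space *mapping = inode->i_mapping;
		loff_t holebegin = round_up(newsize, PAGE_SIZE);

		unmap_mapping_range(mapping, holebegin, 0, 1);	/* kill ptes */
		truncate_inode_pages(mapping, newsize);		/* drop pages */
		unmap_mapping_range(mapping, holebegin, 0, 1);	/* re-kill racers */
	}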
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 9a33b98cb000..92a2548278fc 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1558,6 +1558,8 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1558 init_completion(&req->r_safe_completion); 1558 init_completion(&req->r_safe_completion);
1559 INIT_LIST_HEAD(&req->r_unsafe_item); 1559 INIT_LIST_HEAD(&req->r_unsafe_item);
1560 1560
1561 req->r_stamp = CURRENT_TIME;
1562
1561 req->r_op = op; 1563 req->r_op = op;
1562 req->r_direct_mode = mode; 1564 req->r_direct_mode = mode;
1563 return req; 1565 return req;
@@ -1783,7 +1785,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1783 } 1785 }
1784 1786
1785 len = sizeof(*head) + 1787 len = sizeof(*head) +
1786 pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)); 1788 pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
1789 sizeof(struct timespec);
1787 1790
1788 /* calculate (max) length for cap releases */ 1791 /* calculate (max) length for cap releases */
1789 len += sizeof(struct ceph_mds_request_release) * 1792 len += sizeof(struct ceph_mds_request_release) *
@@ -1800,6 +1803,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1800 goto out_free2; 1803 goto out_free2;
1801 } 1804 }
1802 1805
1806 msg->hdr.version = 2;
1803 msg->hdr.tid = cpu_to_le64(req->r_tid); 1807 msg->hdr.tid = cpu_to_le64(req->r_tid);
1804 1808
1805 head = msg->front.iov_base; 1809 head = msg->front.iov_base;
@@ -1836,6 +1840,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1836 mds, req->r_old_inode_drop, req->r_old_inode_unless, 0); 1840 mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
1837 head->num_releases = cpu_to_le16(releases); 1841 head->num_releases = cpu_to_le16(releases);
1838 1842
1843 /* time stamp */
1844 ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp));
1845
1839 BUG_ON(p > end); 1846 BUG_ON(p > end);
1840 msg->front.iov_len = p - msg->front.iov_base; 1847 msg->front.iov_len = p - msg->front.iov_base;
1841 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 1848 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
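[editor's note] Taken together, the mds_client.c hunks add a client-side timestamp to every MDS request: r_stamp is captured once at ceph_mdsc_create_request() time (so a resend after MDS failover carries the original time rather than the retry time), the front buffer is sized for one extra struct timespec, and hdr.version is bumped to 2 so the MDS knows the trailing field is present. The v2 body then ends up laid out roughly as below; this is a sketch inferred from the length calculation, where each filepath costs 1 + sizeof(u64) + sizeof(u32) plus the path bytes:

	/*
	 * CEPH_MSG_CLIENT_REQUEST, hdr.version = 2 (inferred layout):
	 *
	 *   struct ceph_mds_request_head head;
	 *   filepath path1;      (u8 + u64 ino + u32 len + len bytes)
	 *   filepath path2;
	 *   release items        (head->num_releases of them)
	 *   struct timespec ts;  (req->r_stamp, new in v2)
	 */
	ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp));
	BUG_ON(p > end);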
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index e90cfccf93bd..e00737cf523c 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -194,6 +194,7 @@ struct ceph_mds_request {
194 int r_fmode; /* file mode, if expecting cap */ 194 int r_fmode; /* file mode, if expecting cap */
195 kuid_t r_uid; 195 kuid_t r_uid;
196 kgid_t r_gid; 196 kgid_t r_gid;
197 struct timespec r_stamp;
197 198
198 /* for choosing which mds to send this request to */ 199 /* for choosing which mds to send this request to */
199 int r_direct_mode; 200 int r_direct_mode;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index ead05cc1f447..12b20744e386 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -292,7 +292,6 @@ struct ceph_inode_info {
292 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or 292 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or
293 dirty|flushing caps */ 293 dirty|flushing caps */
294 unsigned i_snap_caps; /* cap bits for snapped files */ 294 unsigned i_snap_caps; /* cap bits for snapped files */
295 unsigned i_cap_exporting_issued;
296 295
297 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ 296 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
298 297
@@ -775,11 +774,13 @@ static inline void ceph_forget_all_cached_acls(struct inode *inode)
775extern const char *ceph_cap_string(int c); 774extern const char *ceph_cap_string(int c);
776extern void ceph_handle_caps(struct ceph_mds_session *session, 775extern void ceph_handle_caps(struct ceph_mds_session *session,
777 struct ceph_msg *msg); 776 struct ceph_msg *msg);
778extern int ceph_add_cap(struct inode *inode, 777extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
779 struct ceph_mds_session *session, u64 cap_id, 778 struct ceph_cap_reservation *ctx);
780 int fmode, unsigned issued, unsigned wanted, 779extern void ceph_add_cap(struct inode *inode,
781 unsigned cap, unsigned seq, u64 realmino, int flags, 780 struct ceph_mds_session *session, u64 cap_id,
782 struct ceph_cap_reservation *caps_reservation); 781 int fmode, unsigned issued, unsigned wanted,
782 unsigned cap, unsigned seq, u64 realmino, int flags,
783 struct ceph_cap **new_cap);
783extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); 784extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
784extern void ceph_put_cap(struct ceph_mds_client *mdsc, 785extern void ceph_put_cap(struct ceph_mds_client *mdsc,
785 struct ceph_cap *cap); 786 struct ceph_cap *cap);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 1e5b45359509..d08e079ea5d3 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -617,6 +617,11 @@ static void retry_failed_sctp_send(struct connection *recv_con,
617 int nodeid = sn_send_failed->ssf_info.sinfo_ppid; 617 int nodeid = sn_send_failed->ssf_info.sinfo_ppid;
618 618
619 log_print("Retry sending %d bytes to node id %d", len, nodeid); 619 log_print("Retry sending %d bytes to node id %d", len, nodeid);
620
621 if (!nodeid) {
622 log_print("Shouldn't resend data via listening connection.");
623 return;
624 }
620 625
621 con = nodeid2con(nodeid, 0); 626 con = nodeid2con(nodeid, 0);
622 if (!con) { 627 if (!con) {
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index b73e0621ce9e..b10b48c2a7af 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -910,7 +910,7 @@ static const struct file_operations eventpoll_fops = {
910void eventpoll_release_file(struct file *file) 910void eventpoll_release_file(struct file *file)
911{ 911{
912 struct eventpoll *ep; 912 struct eventpoll *ep;
913 struct epitem *epi; 913 struct epitem *epi, *next;
914 914
915 /* 915 /*
916 * We don't want to get "file->f_lock" because it is not 916 * We don't want to get "file->f_lock" because it is not
@@ -926,7 +926,7 @@ void eventpoll_release_file(struct file *file)
926 * Besides, ep_remove() acquires the lock, so we can't hold it here. 926 * Besides, ep_remove() acquires the lock, so we can't hold it here.
927 */ 927 */
928 mutex_lock(&epmutex); 928 mutex_lock(&epmutex);
929 list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) { 929 list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) {
930 ep = epi->ep; 930 ep = epi->ep;
931 mutex_lock_nested(&ep->mtx, 0); 931 mutex_lock_nested(&ep->mtx, 0);
932 ep_remove(ep, epi); 932 ep_remove(ep, epi);
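[editor's note] The eventpoll fix swaps list_for_each_entry_rcu() for list_for_each_entry_safe(): ep_remove() unlinks and frees epi, so the previous iterator had to read epi->fllink.next out of freed memory to take its next step (and the RCU variant was the wrong tool here anyway: the walk runs under epmutex rather than rcu_read_lock, and the entries are freed immediately, not RCU-deferred). The _safe variant caches the successor before the loop body runs, roughly as in <linux/list.h> of this era:

	#define list_for_each_entry_safe(pos, n, head, member)			\
		for (pos = list_first_entry(head, typeof(*pos), member),	\
			n = list_next_entry(pos, member);			\
		     &pos->member != (head);					\
		     pos = n, n = list_next_entry(n, member))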