63 files changed, 1465 insertions, 13877 deletions
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 0fcd2640c23f..9eb134ea6eb2 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -1,9 +1,11 @@
 config CEPH_FS
        tristate "Ceph distributed file system (EXPERIMENTAL)"
        depends on INET && EXPERIMENTAL
+        select CEPH_LIB
        select LIBCRC32C
        select CRYPTO_AES
        select CRYPTO
+        default n
        help
          Choose Y or M here to include support for mounting the
          experimental Ceph distributed file system.  Ceph is an extremely
@@ -14,15 +16,3 @@ config CEPH_FS
          If unsure, say N.
-config CEPH_FS_PRETTYDEBUG
-        bool "Include file:line in ceph debug output"
-        depends on CEPH_FS
-        default n
-        help
-          If you say Y here, debug output will include a filename and
-          line to aid debugging.  This icnreases kernel size and slows
-          execution slightly when debug call sites are enabled (e.g.,
-          via CONFIG_DYNAMIC_DEBUG).
-          If unsure, say N.
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 278e1172600d..bd352125e829 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -2,38 +2,10 @@
 # Makefile for CEPH filesystem.
 #
-ifneq ($(KERNELRELEASE),)
 obj-$(CONFIG_CEPH_FS) += ceph.o
-ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
+ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
        export.o caps.o snap.o xattr.o \
-        messenger.o msgpool.o buffer.o pagelist.o \
+        mds_client.o mdsmap.o strings.o ceph_frag.o \
-        mds_client.o mdsmap.o \
+        debugfs.o
-        mon_client.o \
-        osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
-        debugfs.o \
-        auth.o auth_none.o \
-        crypto.o armor.o \
-        auth_x.o \
-        ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
-else
-#Otherwise we were called directly from the command
-# line; invoke the kernel build system.
-KERNELDIR ?= /lib/modules/$(shell uname -r)/build
-PWD := $(shell pwd)
-default: all
-all:
-        $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
-modules_install:
-        $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
-clean:
-        $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
-endif
diff --git a/fs/ceph/README b/fs/ceph/README
deleted file mode 100644
index 18352fab37c0..000000000000
--- a/fs/ceph/README
+++ /dev/null
@@ -1,20 +0,0 @@
-#
-# The following files are shared by (and manually synchronized
-# between) the Ceph userland and kernel client.
-#
-# userland                  kernel
-src/include/ceph_fs.h       fs/ceph/ceph_fs.h
-src/include/ceph_fs.cc      fs/ceph/ceph_fs.c
-src/include/msgr.h          fs/ceph/msgr.h
-src/include/rados.h         fs/ceph/rados.h
-src/include/ceph_strings.cc fs/ceph/ceph_strings.c
-src/include/ceph_frag.h     fs/ceph/ceph_frag.h
-src/include/ceph_frag.cc    fs/ceph/ceph_frag.c
-src/include/ceph_hash.h     fs/ceph/ceph_hash.h
-src/include/ceph_hash.cc    fs/ceph/ceph_hash.c
-src/crush/crush.c           fs/ceph/crush/crush.c
-src/crush/crush.h           fs/ceph/crush/crush.h
-src/crush/mapper.c          fs/ceph/crush/mapper.c
-src/crush/mapper.h          fs/ceph/crush/mapper.h
-src/crush/hash.h            fs/ceph/crush/hash.h
-src/crush/hash.c            fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index efbc604001c8..561438b6a50c 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 #include <linux/backing-dev.h>
 #include <linux/fs.h>
@@ -10,7 +10,8 @@
 #include <linux/task_io_accounting_ops.h>
 #include "super.h"
-#include "osd_client.h"
+#include "mds_client.h"
+#include <linux/ceph/osd_client.h>
 /*
 * Ceph address space ops.
@@ -193,7 +194,8 @@ static int readpage_nounlock(struct file *filp, struct page *page)
 {
        struct inode *inode = filp->f_dentry->d_inode;
        struct ceph_inode_info *ci = ceph_inode(inode);
-        struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+        struct ceph_osd_client *osdc =
+                &ceph_inode_to_client(inode)->client->osdc;
        int err = 0;
        u64 len = PAGE_CACHE_SIZE;
@@ -202,7 +204,7 @@ static int readpage_nounlock(struct file *filp, struct page *page)
        err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
                                  page->index << PAGE_CACHE_SHIFT, &len,
                                  ci->i_truncate_seq, ci->i_truncate_size,
-                                  &page, 1);
+                                  &page, 1, 0);
        if (err == -ENOENT)
                err = 0;
        if (err < 0) {
@@ -265,7 +267,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
 {
        struct inode *inode = file->f_dentry->d_inode;
        struct ceph_inode_info *ci = ceph_inode(inode);
-        struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+        struct ceph_osd_client *osdc =
+                &ceph_inode_to_client(inode)->client->osdc;
        int rc = 0;
        struct page **pages;
        loff_t offset;
@@ -284,7 +287,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
        rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
                                 offset, &len,
                                 ci->i_truncate_seq, ci->i_truncate_size,
-                                 pages, nr_pages);
+                                 pages, nr_pages, 0);
        if (rc == -ENOENT)
                rc = 0;
        if (rc < 0)
@@ -365,7 +368,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 {
        struct inode *inode;
        struct ceph_inode_info *ci;
-        struct ceph_client *client;
+        struct ceph_fs_client *fsc;
        struct ceph_osd_client *osdc;
        loff_t page_off = page->index << PAGE_CACHE_SHIFT;
        int len = PAGE_CACHE_SIZE;
@@ -383,8 +386,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
        }
        inode = page->mapping->host;
        ci = ceph_inode(inode);
-        client = ceph_inode_to_client(inode);
+        fsc = ceph_inode_to_client(inode);
-        osdc = &client->osdc;
+        osdc = &fsc->client->osdc;
        /* verify this is a writeable snap context */
        snapc = (void *)page->private;
@@ -414,10 +417,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
        dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
             inode, page, page->index, page_off, len, snapc);
-        writeback_stat = atomic_long_inc_return(&client->writeback_count);
+        writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
        if (writeback_stat >
-            CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
+            CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
-                set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
+                set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
        set_page_writeback(page);
        err = ceph_osdc_writepages(osdc, ceph_vino(inode),
@@ -496,7 +499,7 @@ static void writepages_finish(struct ceph_osd_request *req,
        struct address_space *mapping = inode->i_mapping;
        __s32 rc = -EIO;
        u64 bytes = 0;
-        struct ceph_client *client = ceph_inode_to_client(inode);
+        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        long writeback_stat;
        unsigned issued = ceph_caps_issued(ci);
@@ -529,10 +532,10 @@ static void writepages_finish(struct ceph_osd_request *req,
                WARN_ON(!PageUptodate(page));
                writeback_stat =
-                        atomic_long_dec_return(&client->writeback_count);
+                        atomic_long_dec_return(&fsc->writeback_count);
                if (writeback_stat <
-                    CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
+                    CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
-                        clear_bdi_congested(&client->backing_dev_info,
+                        clear_bdi_congested(&fsc->backing_dev_info,
                                            BLK_RW_ASYNC);
                ceph_put_snap_context((void *)page->private);
@@ -569,13 +572,13 @@ static void writepages_finish(struct ceph_osd_request *req,
 * mempool.  we avoid the mempool if we can because req->r_num_pages
 * may be less than the maximum write size.
 */
-static void alloc_page_vec(struct ceph_client *client,
+static void alloc_page_vec(struct ceph_fs_client *fsc,
                           struct ceph_osd_request *req)
 {
        req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
                               GFP_NOFS);
        if (!req->r_pages) {
-                req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
+                req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS);
                req->r_pages_from_pool = 1;
                WARN_ON(!req->r_pages);
        }
@@ -588,9 +591,8 @@ static int ceph_writepages_start(struct address_space *mapping,
                                 struct writeback_control *wbc)
 {
        struct inode *inode = mapping->host;
-        struct backing_dev_info *bdi = mapping->backing_dev_info;
        struct ceph_inode_info *ci = ceph_inode(inode);
-        struct ceph_client *client;
+        struct ceph_fs_client *fsc;
        pgoff_t index, start, end;
        int range_whole = 0;
        int should_loop = 1;
@@ -617,26 +619,19 @@ static int ceph_writepages_start(struct address_space *mapping,
             wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
             (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
-        client = ceph_inode_to_client(inode);
+        fsc = ceph_inode_to_client(inode);
-        if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
+        if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
                pr_warning("writepage_start %p on forced umount\n", inode);
                return -EIO; /* we're in a forced umount, don't write! */
        }
-        if (client->mount_args->wsize && client->mount_args->wsize < wsize)
+        if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
-                wsize = client->mount_args->wsize;
+                wsize = fsc->mount_options->wsize;
        if (wsize < PAGE_CACHE_SIZE)
                wsize = PAGE_CACHE_SIZE;
        max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
        pagevec_init(&pvec, 0);
-        /* ?? */
-        if (wbc->nonblocking && bdi_write_congested(bdi)) {
-                dout(" writepages congested\n");
-                wbc->encountered_congestion = 1;
-                goto out_final;
-        }
        /* where to start/end? */
        if (wbc->range_cyclic) {
                start = mapping->writeback_index; /* Start from prev offset */
@@ -769,7 +764,7 @@ get_more_pages:
                                offset = (unsigned long long)page->index
                                        << PAGE_CACHE_SHIFT;
                                len = wsize;
-                                req = ceph_osdc_new_request(&client->osdc,
+                                req = ceph_osdc_new_request(&fsc->client->osdc,
                                            &ci->i_layout,
                                            ceph_vino(inode),
                                            offset, &len,
@@ -779,10 +774,10 @@ get_more_pages:
                                            snapc, do_sync,
                                            ci->i_truncate_seq,
                                            ci->i_truncate_size,
-                                            &inode->i_mtime, true, 1);
+                                            &inode->i_mtime, true, 1, 0);
                                max_pages = req->r_num_pages;
-                                alloc_page_vec(client, req);
+                                alloc_page_vec(fsc, req);
                                req->r_callback = writepages_finish;
                                req->r_inode = inode;
                        }
@@ -794,10 +789,10 @@ get_more_pages:
                             inode, page, page->index);
                        writeback_stat =
-                               atomic_long_inc_return(&client->writeback_count);
+                               atomic_long_inc_return(&fsc->writeback_count);
                        if (writeback_stat > CONGESTION_ON_THRESH(
-                                    client->mount_args->congestion_kb)) {
+                                    fsc->mount_options->congestion_kb)) {
-                                set_bdi_congested(&client->backing_dev_info,
+                                set_bdi_congested(&fsc->backing_dev_info,
                                                  BLK_RW_ASYNC);
                        }
@@ -846,7 +841,7 @@ get_more_pages:
                op->payload_len = cpu_to_le32(len);
                req->r_request->hdr.data_len = cpu_to_le32(len);
-                ceph_osdc_start_request(&client->osdc, req, true);
+                ceph_osdc_start_request(&fsc->client->osdc, req, true);
                req = NULL;
                /* continue? */
@@ -882,7 +877,6 @@ out:
                rc = 0;  /* vfs expects us to return 0 */
        ceph_put_snap_context(snapc);
        dout("writepages done, rc = %d\n", rc);
-out_final:
        return rc;
 }
@@ -915,7 +909,7 @@ static int ceph_update_writeable_page(struct file *file,
 {
        struct inode *inode = file->f_dentry->d_inode;
        struct ceph_inode_info *ci = ceph_inode(inode);
-        struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+        struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
        loff_t page_off = pos & PAGE_CACHE_MASK;
        int pos_in_page = pos & ~PAGE_CACHE_MASK;
        int end_in_page = pos_in_page + len;
@@ -1053,8 +1047,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
                          struct page *page, void *fsdata)
 {
        struct inode *inode = file->f_dentry->d_inode;
-        struct ceph_client *client = ceph_inode_to_client(inode);
+        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-        struct ceph_mds_client *mdsc = &client->mdsc;
+        struct ceph_mds_client *mdsc = fsc->mdsc;
        unsigned from = pos & (PAGE_CACHE_SIZE - 1);
        int check_cap = 0;
@@ -1123,7 +1117,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
        struct inode *inode = vma->vm_file->f_dentry->d_inode;
        struct page *page = vmf->page;
-        struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+        struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
        loff_t off = page->index << PAGE_CACHE_SHIFT;
        loff_t size, len;
        int ret;
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
deleted file mode 100644
index eb2a666b0be7..000000000000
--- a/fs/ceph/armor.c
+++ /dev/null
@@ -1,103 +0,0 @@
-#include <linux/errno.h>
-int ceph_armor(char *dst, const char *src, const char *end);
-int ceph_unarmor(char *dst, const char *src, const char *end);
-/*
- * base64 encode/decode.
- */
-static const char *pem_key =
-        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-static int encode_bits(int c)
-{
-        return pem_key[c];
-}
-static int decode_bits(char c)
-{
-        if (c >= 'A' && c <= 'Z')
-                return c - 'A';
-        if (c >= 'a' && c <= 'z')
-                return c - 'a' + 26;
-        if (c >= '0' && c <= '9')
-                return c - '0' + 52;
-        if (c == '+')
-                return 62;
-        if (c == '/')
-                return 63;
-        if (c == '=')
-                return 0; /* just non-negative, please */
-        return -EINVAL;
-}
-int ceph_armor(char *dst, const char *src, const char *end)
-{
-        int olen = 0;
-        int line = 0;
-        while (src < end) {
-                unsigned char a, b, c;
-                a = *src++;
-                *dst++ = encode_bits(a >> 2);
-                if (src < end) {
-                        b = *src++;
-                        *dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
-                        if (src < end) {
-                                c = *src++;
-                                *dst++ = encode_bits(((b & 15) << 2) |
-                                                     (c >> 6));
-                                *dst++ = encode_bits(c & 63);
-                        } else {
-                                *dst++ = encode_bits((b & 15) << 2);
-                                *dst++ = '=';
-                        }
-                } else {
-                        *dst++ = encode_bits(((a & 3) << 4));
-                        *dst++ = '=';
-                        *dst++ = '=';
-                }
-                olen += 4;
-                line += 4;
-                if (line == 64) {
-                        line = 0;
-                        *(dst++) = '\n';
-                        olen++;
-                }
-        }
-        return olen;
-}
-int ceph_unarmor(char *dst, const char *src, const char *end)
-{
-        int olen = 0;
-        while (src < end) {
-                int a, b, c, d;
-                if (src < end && src[0] == '\n')
-                        src++;
-                if (src + 4 > end)
-                        return -EINVAL;
-                a = decode_bits(src[0]);
-                b = decode_bits(src[1]);
-                c = decode_bits(src[2]);
-                d = decode_bits(src[3]);
-                if (a < 0 || b < 0 || c < 0 || d < 0)
-                        return -EINVAL;
-                *dst++ = (a << 2) | (b >> 4);
-                if (src[2] == '=')
-                        return olen + 1;
-                *dst++ = ((b & 15) << 4) | (c >> 2);
-                if (src[3] == '=')
-                        return olen + 2;
-                *dst++ = ((c & 3) << 6) | d;
-                olen += 3;
-                src += 4;
-        }
-        return olen;
-}
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
deleted file mode 100644
index 6d2e30600627..000000000000
--- a/fs/ceph/auth.c
+++ /dev/null
@@ -1,259 +0,0 @@
-#include "ceph_debug.h"
-#include <linux/module.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-#include "types.h"
-#include "auth_none.h"
-#include "auth_x.h"
-#include "decode.h"
-#include "super.h"
-#include "messenger.h"
-/*
- * get protocol handler
- */
-static u32 supported_protocols[] = {
-        CEPH_AUTH_NONE,
-        CEPH_AUTH_CEPHX
-};
-static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
-{
-        switch (protocol) {
-        case CEPH_AUTH_NONE:
-                return ceph_auth_none_init(ac);
-        case CEPH_AUTH_CEPHX:
-                return ceph_x_init(ac);
-        default:
-                return -ENOENT;
-        }
-}
-/*
- * setup, teardown.
- */
-struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
-{
-        struct ceph_auth_client *ac;
-        int ret;
-        dout("auth_init name '%s' secret '%s'\n", name, secret);
-        ret = -ENOMEM;
-        ac = kzalloc(sizeof(*ac), GFP_NOFS);
-        if (!ac)
-                goto out;
-        ac->negotiating = true;
-        if (name)
-                ac->name = name;
-        else
-                ac->name = CEPH_AUTH_NAME_DEFAULT;
-        dout("auth_init name %s secret %s\n", ac->name, secret);
-        ac->secret = secret;
-        return ac;
-out:
-        return ERR_PTR(ret);
-}
-void ceph_auth_destroy(struct ceph_auth_client *ac)
-{
-        dout("auth_destroy %p\n", ac);
-        if (ac->ops)
-                ac->ops->destroy(ac);
-        kfree(ac);
-}
-/*
- * Reset occurs when reconnecting to the monitor.
- */
-void ceph_auth_reset(struct ceph_auth_client *ac)
-{
-        dout("auth_reset %p\n", ac);
-        if (ac->ops && !ac->negotiating)
-                ac->ops->reset(ac);
-        ac->negotiating = true;
-}
-int ceph_entity_name_encode(const char *name, void **p, void *end)
-{
-        int len = strlen(name);
-        if (*p + 2*sizeof(u32) + len > end)
-                return -ERANGE;
-        ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
-        ceph_encode_32(p, len);
-        ceph_encode_copy(p, name, len);
-        return 0;
-}
-/*
- * Initiate protocol negotiation with monitor.  Include entity name
- * and list supported protocols.
- */
-int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
-{
-        struct ceph_mon_request_header *monhdr = buf;
-        void *p = monhdr + 1, *end = buf + len, *lenp;
-        int i, num;
-        int ret;
-        dout("auth_build_hello\n");
-        monhdr->have_version = 0;
-        monhdr->session_mon = cpu_to_le16(-1);
-        monhdr->session_mon_tid = 0;
-        ceph_encode_32(&p, 0);  /* no protocol, yet */
-        lenp = p;
-        p += sizeof(u32);
-        ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
-        ceph_encode_8(&p, 1);
-        num = ARRAY_SIZE(supported_protocols);
-        ceph_encode_32(&p, num);
-        ceph_decode_need(&p, end, num * sizeof(u32), bad);
-        for (i = 0; i < num; i++)
-                ceph_encode_32(&p, supported_protocols[i]);
-        ret = ceph_entity_name_encode(ac->name, &p, end);
-        if (ret < 0)
-                return ret;
-        ceph_decode_need(&p, end, sizeof(u64), bad);
-        ceph_encode_64(&p, ac->global_id);
-        ceph_encode_32(&lenp, p - lenp - sizeof(u32));
-        return p - buf;
-bad:
-        return -ERANGE;
-}
-static int ceph_build_auth_request(struct ceph_auth_client *ac,
-                                   void *msg_buf, size_t msg_len)
-{
-        struct ceph_mon_request_header *monhdr = msg_buf;
-        void *p = monhdr + 1;
-        void *end = msg_buf + msg_len;
-        int ret;
-        monhdr->have_version = 0;
-        monhdr->session_mon = cpu_to_le16(-1);
-        monhdr->session_mon_tid = 0;
-        ceph_encode_32(&p, ac->protocol);
-        ret = ac->ops->build_request(ac, p + sizeof(u32), end);
-        if (ret < 0) {
-                pr_err("error %d building auth method %s request\n", ret,
-                       ac->ops->name);
-                return ret;
-        }
-        dout(" built request %d bytes\n", ret);
-        ceph_encode_32(&p, ret);
-        return p + ret - msg_buf;
-}
-/*
- * Handle auth message from monitor.
- */
-int ceph_handle_auth_reply(struct ceph_auth_client *ac,
-                           void *buf, size_t len,
-                           void *reply_buf, size_t reply_len)
-{
-        void *p = buf;
-        void *end = buf + len;
-        int protocol;
-        s32 result;
-        u64 global_id;
-        void *payload, *payload_end;
-        int payload_len;
-        char *result_msg;
-        int result_msg_len;
-        int ret = -EINVAL;
-        dout("handle_auth_reply %p %p\n", p, end);
-        ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
-        protocol = ceph_decode_32(&p);
-        result = ceph_decode_32(&p);
-        global_id = ceph_decode_64(&p);
-        payload_len = ceph_decode_32(&p);
-        payload = p;
-        p += payload_len;
-        ceph_decode_need(&p, end, sizeof(u32), bad);
-        result_msg_len = ceph_decode_32(&p);
-        result_msg = p;
-        p += result_msg_len;
-        if (p != end)
-                goto bad;
-        dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
-             result_msg, global_id, payload_len);
-        payload_end = payload + payload_len;
-        if (global_id && ac->global_id != global_id) {
-                dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
-                ac->global_id = global_id;
-        }
-        if (ac->negotiating) {
-                /* server does not support our protocols? */
-                if (!protocol && result < 0) {
-                        ret = result;
-                        goto out;
-                }
-                /* set up (new) protocol handler? */
-                if (ac->protocol && ac->protocol != protocol) {
-                        ac->ops->destroy(ac);
-                        ac->protocol = 0;
-                        ac->ops = NULL;
-                }
-                if (ac->protocol != protocol) {
-                        ret = ceph_auth_init_protocol(ac, protocol);
-                        if (ret) {
-                                pr_err("error %d on auth protocol %d init\n",
-                                       ret, protocol);
-                                goto out;
-                        }
-                }
-                ac->negotiating = false;
-        }
-        ret = ac->ops->handle_reply(ac, result, payload, payload_end);
-        if (ret == -EAGAIN) {
-                return ceph_build_auth_request(ac, reply_buf, reply_len);
-        } else if (ret) {
-                pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
-                return ret;
-        }
-        return 0;
-bad:
-        pr_err("failed to decode auth msg\n");
-out:
-        return ret;
-}
-int ceph_build_auth(struct ceph_auth_client *ac,
-                    void *msg_buf, size_t msg_len)
-{
-        if (!ac->protocol)
-                return ceph_auth_build_hello(ac, msg_buf, msg_len);
-        BUG_ON(!ac->ops);
-        if (ac->ops->should_authenticate(ac))
-                return ceph_build_auth_request(ac, msg_buf, msg_len);
-        return 0;
-}
-int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
-{
-        if (!ac->ops)
-                return 0;
-        return ac->ops->is_authenticated(ac);
-}
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
deleted file mode 100644
index d38a2fb4a137..000000000000
--- a/fs/ceph/auth.h
+++ /dev/null
@@ -1,92 +0,0 @@
-#ifndef _FS_CEPH_AUTH_H
-#define _FS_CEPH_AUTH_H
-#include "types.h"
-#include "buffer.h"
-/*
- * Abstract interface for communicating with the authenticate module.
- * There is some handshake that takes place between us and the monitor
- * to acquire the necessary keys.  These are used to generate an
- * 'authorizer' that we use when connecting to a service (mds, osd).
- */
-struct ceph_auth_client;
-struct ceph_authorizer;
-struct ceph_auth_client_ops {
-        const char *name;
-        /*
-         * true if we are authenticated and can connect to
-         * services.
-         */
-        int (*is_authenticated)(struct ceph_auth_client *ac);
-        /*
-         * true if we should (re)authenticate, e.g., when our tickets
-         * are getting old and crusty.
-         */
-        int (*should_authenticate)(struct ceph_auth_client *ac);
-        /*
-         * build requests and process replies during monitor
-         * handshake.  if handle_reply returns -EAGAIN, we build
-         * another request.
-         */
-        int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
-        int (*handle_reply)(struct ceph_auth_client *ac, int result,
-                            void *buf, void *end);
-        /*
-         * Create authorizer for connecting to a service, and verify
-         * the response to authenticate the service.
-         */
-        int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
-                                 struct ceph_authorizer **a,
-                                 void **buf, size_t *len,
-                                 void **reply_buf, size_t *reply_len);
-        int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
-                                       struct ceph_authorizer *a, size_t len);
-        void (*destroy_authorizer)(struct ceph_auth_client *ac,
-                                   struct ceph_authorizer *a);
-        void (*invalidate_authorizer)(struct ceph_auth_client *ac,
-                                      int peer_type);
-        /* reset when we (re)connect to a monitor */
-        void (*reset)(struct ceph_auth_client *ac);
-        void (*destroy)(struct ceph_auth_client *ac);
-};
-struct ceph_auth_client {
-        u32 protocol;           /* CEPH_AUTH_* */
-        void *private;          /* for use by protocol implementation */
-        const struct ceph_auth_client_ops *ops;  /* null iff protocol==0 */
-        bool negotiating;       /* true if negotiating protocol */
-        const char *name;       /* entity name */
-        u64 global_id;          /* our unique id in system */
-        const char *secret;     /* our secret key */
-        unsigned want_keys;     /* which services we want */
-};
-extern struct ceph_auth_client *ceph_auth_init(const char *name,
-                                               const char *secret);
-extern void ceph_auth_destroy(struct ceph_auth_client *ac);
-extern void ceph_auth_reset(struct ceph_auth_client *ac);
-extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
-                                 void *buf, size_t len);
-extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
-                                  void *buf, size_t len,
-                                  void *reply_buf, size_t reply_len);
-extern int ceph_entity_name_encode(const char *name, void **p, void *end);
-extern int ceph_build_auth(struct ceph_auth_client *ac,
-                    void *msg_buf, size_t msg_len);
-extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
-#endif
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
deleted file mode 100644
index ad1dc21286c7..000000000000
--- a/fs/ceph/auth_none.c
+++ /dev/null
@@ -1,131 +0,0 @@
-#include "ceph_debug.h"
-#include <linux/err.h>
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-#include "auth_none.h"
-#include "auth.h"
-#include "decode.h"
-static void reset(struct ceph_auth_client *ac)
-{
-        struct ceph_auth_none_info *xi = ac->private;
-        xi->starting = true;
-        xi->built_authorizer = false;
-}
-static void destroy(struct ceph_auth_client *ac)
-{
-        kfree(ac->private);
-        ac->private = NULL;
-}
-static int is_authenticated(struct ceph_auth_client *ac)
-{
-        struct ceph_auth_none_info *xi = ac->private;
-        return !xi->starting;
-}
-static int should_authenticate(struct ceph_auth_client *ac)
-{
-        struct ceph_auth_none_info *xi = ac->private;
-        return xi->starting;
-}
-/*
- * the generic auth code decode the global_id, and we carry no actual
- * authenticate state, so nothing happens here.
- */
-static int handle_reply(struct ceph_auth_client *ac, int result,
-                        void *buf, void *end)
-{
-        struct ceph_auth_none_info *xi = ac->private;
-        xi->starting = false;
-        return result;
-}
-/*
- * build an 'authorizer' with our entity_name and global_id.  we can
- * reuse a single static copy since it is identical for all services
- * we connect to.
- */
-static int ceph_auth_none_create_authorizer(
-        struct ceph_auth_client *ac, int peer_type,
-        struct ceph_authorizer **a,
-        void **buf, size_t *len,
-        void **reply_buf, size_t *reply_len)
-{
-        struct ceph_auth_none_info *ai = ac->private;
-        struct ceph_none_authorizer *au = &ai->au;
-        void *p, *end;
-        int ret;
-        if (!ai->built_authorizer) {
-                p = au->buf;
-                end = p + sizeof(au->buf);
-                ceph_encode_8(&p, 1);
-                ret = ceph_entity_name_encode(ac->name, &p, end - 8);
-                if (ret < 0)
-                        goto bad;
-                ceph_decode_need(&p, end, sizeof(u64), bad2);
-                ceph_encode_64(&p, ac->global_id);
-                au->buf_len = p - (void *)au->buf;
-                ai->built_authorizer = true;
-                dout("built authorizer len %d\n", au->buf_len);
-        }
-        *a = (struct ceph_authorizer *)au;
-        *buf = au->buf;
-        *len = au->buf_len;
-        *reply_buf = au->reply_buf;
-        *reply_len = sizeof(au->reply_buf);
-        return 0;
-bad2:
-        ret = -ERANGE;
-bad:
-        return ret;
-}
-static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
-                                      struct ceph_authorizer *a)
-{
-        /* nothing to do */
-}
-static const struct ceph_auth_client_ops ceph_auth_none_ops = {
-        .name = "none",
-        .reset = reset,
-        .destroy = destroy,
-        .is_authenticated = is_authenticated,
-        .should_authenticate = should_authenticate,
-        .handle_reply = handle_reply,
-        .create_authorizer = ceph_auth_none_create_authorizer,
-        .destroy_authorizer = ceph_auth_none_destroy_authorizer,
-};
-int ceph_auth_none_init(struct ceph_auth_client *ac)
-{
-        struct ceph_auth_none_info *xi;
-        dout("ceph_auth_none_init %p\n", ac);
-        xi = kzalloc(sizeof(*xi), GFP_NOFS);
-        if (!xi)
-                return -ENOMEM;
-        xi->starting = true;
-        xi->built_authorizer = false;
-        ac->protocol = CEPH_AUTH_NONE;
-        ac->private = xi;
-        ac->ops = &ceph_auth_none_ops;
-        return 0;
-}
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
deleted file mode 100644
index 8164df1a08be..000000000000
--- a/fs/ceph/auth_none.h
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef _FS_CEPH_AUTH_NONE_H
-#define _FS_CEPH_AUTH_NONE_H
-#include <linux/slab.h>
-#include "auth.h"
-/*
- * null security mode.
- *
- * we use a single static authorizer that simply encodes our entity name
- * and global id.
- */
-struct ceph_none_authorizer {
-        char buf[128];
-        int buf_len;
-        char reply_buf[0];
-};
-struct ceph_auth_none_info {
-        bool starting;
-        bool built_authorizer;
-        struct ceph_none_authorizer au;   /* we only need one; it's static */
-};
-extern int ceph_auth_none_init(struct ceph_auth_client *ac);
-#endif
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
deleted file mode 100644
index a2d002cbdec2..000000000000
--- a/fs/ceph/auth_x.c
+++ /dev/null
@@ -1,687 +0,0 @@
-#include "ceph_debug.h"
-#include <linux/err.h>
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-#include "auth_x.h"
-#include "auth_x_protocol.h"
-#include "crypto.h"
-#include "auth.h"
-#include "decode.h"
-#define TEMP_TICKET_BUF_LEN     256
-static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
-static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
-{
-        struct ceph_x_info *xi = ac->private;
-        int need;
-        ceph_x_validate_tickets(ac, &need);
-        dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
-             ac->want_keys, need, xi->have_keys);
-        return (ac->want_keys & xi->have_keys) == ac->want_keys;
-}
-static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
-{
-        struct ceph_x_info *xi = ac->private;
-        int need;
-        ceph_x_validate_tickets(ac, &need);
-        dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
-             ac->want_keys, need, xi->have_keys);
-        return need != 0;
-}
-static int ceph_x_encrypt_buflen(int ilen)
-{
-        return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
-                sizeof(u32);
-}
-static int ceph_x_encrypt(struct ceph_crypto_key *secret,
-                          void *ibuf, int ilen, void *obuf, size_t olen)
-{
-        struct ceph_x_encrypt_header head = {
-                .struct_v = 1,
-                .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
-        };
-        size_t len = olen - sizeof(u32);
-        int ret;
-        ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
-                            &head, sizeof(head), ibuf, ilen);
-        if (ret)
-                return ret;
-        ceph_encode_32(&obuf, len);
-        return len + sizeof(u32);
-}
-static int ceph_x_decrypt(struct ceph_crypto_key *secret,
-                          void **p, void *end, void *obuf, size_t olen)
-{
-        struct ceph_x_encrypt_header head;
-        size_t head_len = sizeof(head);
-        int len, ret;
-        len = ceph_decode_32(p);
-        if (*p + len > end)
-                return -EINVAL;
-        dout("ceph_x_decrypt len %d\n", len);
-        ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
-                            *p, len);
-        if (ret)
-                return ret;
-        if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
-                return -EPERM;
-        *p += len;
-        return olen;
-}
-/*
- * get existing (or insert new) ticket handler
- */
-static struct ceph_x_ticket_handler *
-get_ticket_handler(struct ceph_auth_client *ac, int service)
-{
-        struct ceph_x_ticket_handler *th;
-        struct ceph_x_info *xi = ac->private;
-        struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
-        while (*p) {
-                parent = *p;
-                th = rb_entry(parent, struct ceph_x_ticket_handler, node);
-                if (service < th->service)
-                        p = &(*p)->rb_left;
-                else if (service > th->service)
-                        p = &(*p)->rb_right;
-                else
-                        return th;
-        }
-        /* add it */
-        th = kzalloc(sizeof(*th), GFP_NOFS);
-        if (!th)
-                return ERR_PTR(-ENOMEM);
-        th->service = service;
-        rb_link_node(&th->node, parent, p);
-        rb_insert_color(&th->node, &xi->ticket_handlers);
-        return th;
-}
-static void remove_ticket_handler(struct ceph_auth_client *ac,
-                                  struct ceph_x_ticket_handler *th)
-{
-        struct ceph_x_info *xi = ac->private;
-        dout("remove_ticket_handler %p %d\n", th, th->service);
-        rb_erase(&th->node, &xi->ticket_handlers);
-        ceph_crypto_key_destroy(&th->session_key);
-        if (th->ticket_blob)
-                ceph_buffer_put(th->ticket_blob);
-        kfree(th);
-}
-static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
-                                    struct ceph_crypto_key *secret,
-                                    void *buf, void *end)
-{
-        struct ceph_x_info *xi = ac->private;
-        int num;
-        void *p = buf;
-        int ret;
-        char *dbuf;
-        char *ticket_buf;
-        u8 reply_struct_v;
-        dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
-        if (!dbuf)
-                return -ENOMEM;
-        ret = -ENOMEM;
-        ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
-        if (!ticket_buf)
-                goto out_dbuf;
-        ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
-        reply_struct_v = ceph_decode_8(&p);
-        if (reply_struct_v != 1)
-                goto bad;
-        num = ceph_decode_32(&p);
-        dout("%d tickets\n", num);
-        while (num--) {
-                int type;
-                u8 tkt_struct_v, blob_struct_v;
-                struct ceph_x_ticket_handler *th;
-                void *dp, *dend;
-                int dlen;
-                char is_enc;
-                struct timespec validity;
-                struct ceph_crypto_key old_key;
-                void *tp, *tpend;
-                struct ceph_timespec new_validity;
-                struct ceph_crypto_key new_session_key;
-                struct ceph_buffer *new_ticket_blob;
-                unsigned long new_expires, new_renew_after;
-                u64 new_secret_id;
-                ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
-                type = ceph_decode_32(&p);
-                dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
-                tkt_struct_v = ceph_decode_8(&p);
-                if (tkt_struct_v != 1)
-                        goto bad;
-                th = get_ticket_handler(ac, type);
-                if (IS_ERR(th)) {
-                        ret = PTR_ERR(th);
-                        goto out;
-                }
-                /* blob for me */
-                dlen = ceph_x_decrypt(secret, &p, end, dbuf,
-                                      TEMP_TICKET_BUF_LEN);
-                if (dlen <= 0) {
-                        ret = dlen;
-                        goto out;
-                }
-                dout(" decrypted %d bytes\n", dlen);
-                dend = dbuf + dlen;
-                dp = dbuf;
-                tkt_struct_v = ceph_decode_8(&dp);
-                if (tkt_struct_v != 1)
-                        goto bad;
-                memcpy(&old_key, &th->session_key, sizeof(old_key));
-                ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
-                if (ret)
-                        goto out;
-                ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
-                ceph_decode_timespec(&validity, &new_validity);
-                new_expires = get_seconds() + validity.tv_sec;
-                new_renew_after = new_expires - (validity.tv_sec / 4);
-                dout(" expires=%lu renew_after=%lu\n", new_expires,
-                     new_renew_after);
-                /* ticket blob for service */
-                ceph_decode_8_safe(&p, end, is_enc, bad);
-                tp = ticket_buf;
-                if (is_enc) {
-                        /* encrypted */
-                        dout(" encrypted ticket\n");
-                        dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
-                                              TEMP_TICKET_BUF_LEN);
-                        if (dlen < 0) {
-                                ret = dlen;
-                                goto out;
-                        }
-                        dlen = ceph_decode_32(&tp);
-                } else {
-                        /* unencrypted */
-                        ceph_decode_32_safe(&p, end, dlen, bad);
-                        ceph_decode_need(&p, end, dlen, bad);
-                        ceph_decode_copy(&p, ticket_buf, dlen);
-                }
-                tpend = tp + dlen;
-                dout(" ticket blob is %d bytes\n", dlen);
-                ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
-                blob_struct_v = ceph_decode_8(&tp);
-                new_secret_id = ceph_decode_64(&tp);
-                ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
-                if (ret)
-                        goto out;
-                /* all is well, update our ticket */
-                ceph_crypto_key_destroy(&th->session_key);
-                if (th->ticket_blob)
-                        ceph_buffer_put(th->ticket_blob);
-                th->session_key = new_session_key;
-                th->ticket_blob = new_ticket_blob;
-                th->validity = new_validity;
-                th->secret_id = new_secret_id;
-                th->expires = new_expires;
-                th->renew_after = new_renew_after;
-                dout(" got ticket service %d (%s) secret_id %lld len %d\n",
-                     type, ceph_entity_type_name(type), th->secret_id,
-                     (int)th->ticket_blob->vec.iov_len);
-                xi->have_keys |= th->service;
-        }
-        ret = 0;
-out:
-        kfree(ticket_buf);
-out_dbuf:
-        kfree(dbuf);
-        return ret;
-bad:
-        ret = -EINVAL;
-        goto out;
-}
-static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
-                                   struct ceph_x_ticket_handler *th,
-                                   struct ceph_x_authorizer *au)
-{
-        int maxlen;
-        struct ceph_x_authorize_a *msg_a;
-        struct ceph_x_authorize_b msg_b;
-        void *p, *end;
-        int ret;
-        int ticket_blob_len =
-                (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
-        dout("build_authorizer for %s %p\n",
-             ceph_entity_type_name(th->service), au);
-        maxlen = sizeof(*msg_a) + sizeof(msg_b) +
-                ceph_x_encrypt_buflen(ticket_blob_len);
-        dout("  need len %d\n", maxlen);
-        if (au->buf && au->buf->alloc_len < maxlen) {
-                ceph_buffer_put(au->buf);
-                au->buf = NULL;
-        }
-        if (!au->buf) {
-                au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
-                if (!au->buf)
-                        return -ENOMEM;
-        }
-        au->service = th->service;
-        msg_a = au->buf->vec.iov_base;
-        msg_a->struct_v = 1;
-        msg_a->global_id = cpu_to_le64(ac->global_id);
-        msg_a->service_id = cpu_to_le32(th->service);
-        msg_a->ticket_blob.struct_v = 1;
-        msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
-        msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
-        if (ticket_blob_len) {
-                memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
-                       th->ticket_blob->vec.iov_len);
-        }
-        dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
-             le64_to_cpu(msg_a->ticket_blob.secret_id));
-        p = msg_a + 1;
-        p += ticket_blob_len;
-        end = au->buf->vec.iov_base + au->buf->vec.iov_len;
-        get_random_bytes(&au->nonce, sizeof(au->nonce));
-        msg_b.struct_v = 1;
-        msg_b.nonce = cpu_to_le64(au->nonce);
-        ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
-                             p, end - p);
-        if (ret < 0)
-                goto out_buf;
-        p += ret;
-        au->buf->vec.iov_len = p - au->buf->vec.iov_base;
-        dout(" built authorizer nonce %llx len %d\n", au->nonce,
-             (int)au->buf->vec.iov_len);
-        BUG_ON(au->buf->vec.iov_len > maxlen);
-        return 0;
-out_buf:
-        ceph_buffer_put(au->buf);
-        au->buf = NULL;
-        return ret;
-}
-static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
-                                void **p, void *end)
-{
-        ceph_decode_need(p, end, 1 + sizeof(u64), bad);
-        ceph_encode_8(p, 1);
-        ceph_encode_64(p, th->secret_id);
-        if (th->ticket_blob) {
-                const char *buf = th->ticket_blob->vec.iov_base;
-                u32 len = th->ticket_blob->vec.iov_len;
-                ceph_encode_32_safe(p, end, len, bad);
-                ceph_encode_copy_safe(p, end, buf, len, bad);
-        } else {
-                ceph_encode_32_safe(p, end, 0, bad);
-        }
-        return 0;
-bad:
-        return -ERANGE;
-}
-static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
-{
-        int want = ac->want_keys;
-        struct ceph_x_info *xi = ac->private;
-        int service;
-        *pneed = ac->want_keys & ~(xi->have_keys);
-        for (service = 1; service <= want; service <<= 1) {
-                struct ceph_x_ticket_handler *th;
-                if (!(ac->want_keys & service))
-                        continue;
-                if (*pneed & service)
-                        continue;
-                th = get_ticket_handler(ac, service);
-                if (IS_ERR(th)) {
-                        *pneed |= service;
-                        continue;
-                }
-                if (get_seconds() >= th->renew_after)
-                        *pneed |= service;
-                if (get_seconds() >= th->expires)
-                        xi->have_keys &= ~service;
-        }
-}
-static int ceph_x_build_request(struct ceph_auth_client *ac,
-                                void *buf, void *end)
-{
-        struct ceph_x_info *xi = ac->private;
-        int need;
-        struct ceph_x_request_header *head = buf;
-        int ret;
-        struct ceph_x_ticket_handler *th =
-                get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
-        if (IS_ERR(th))
-                return PTR_ERR(th);
-        ceph_x_validate_tickets(ac, &need);
-        dout("build_request want %x have %x need %x\n",
-             ac->want_keys, xi->have_keys, need);
-        if (need & CEPH_ENTITY_TYPE_AUTH) {
-                struct ceph_x_authenticate *auth = (void *)(head + 1);
-                void *p = auth + 1;
-                struct ceph_x_challenge_blob tmp;
-                char tmp_enc[40];
-                u64 *u;
-                if (p > end)
-                        return -ERANGE;
-                dout(" get_auth_session_key\n");
-                head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
-                /* encrypt and hash */
-                get_random_bytes(&auth->client_challenge, sizeof(u64));
-                tmp.client_challenge = auth->client_challenge;
-                tmp.server_challenge = cpu_to_le64(xi->server_challenge);
-                ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
-                                     tmp_enc, sizeof(tmp_enc));
-                if (ret < 0)
-                        return ret;
-                auth->struct_v = 1;
-                auth->key = 0;
-                for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
-                        auth->key ^= *(__le64 *)u;
-                dout(" server_challenge %llx client_challenge %llx key %llx\n",
-                     xi->server_challenge, le64_to_cpu(auth->client_challenge),
-                     le64_to_cpu(auth->key));
-                /* now encode the old ticket if exists */
-                ret = ceph_x_encode_ticket(th, &p, end);
-                if (ret < 0)
-                        return ret;
-                return p - buf;
-        }
-        if (need) {
-                void *p = head + 1;
-                struct ceph_x_service_ticket_request *req;
-                if (p > end)
-                        return -ERANGE;
-                head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
-                ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
-                if (ret)
-                        return ret;
-                ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
-                                 xi->auth_authorizer.buf->vec.iov_len);
-                req = p;
-                req->keys = cpu_to_le32(need);
-                p += sizeof(*req);
-                return p - buf;
-        }
-        return 0;
-}
-static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
-                               void *buf, void *end)
-{
-        struct ceph_x_info *xi = ac->private;
-        struct ceph_x_reply_header *head = buf;
-        struct ceph_x_ticket_handler *th;
-        int len = end - buf;
-        int op;
-        int ret;
-        if (result)
-                return result;  /* XXX hmm? */
-        if (xi->starting) {
-                /* it's a hello */
-                struct ceph_x_server_challenge *sc = buf;
-                if (len != sizeof(*sc))
-                        return -EINVAL;
-                xi->server_challenge = le64_to_cpu(sc->server_challenge);
-                dout("handle_reply got server challenge %llx\n",
-                     xi->server_challenge);
-                xi->starting = false;
-                xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
-                return -EAGAIN;
-        }
-        op = le16_to_cpu(head->op);
-        result = le32_to_cpu(head->result);
-        dout("handle_reply op %d result %d\n", op, result);
-        switch (op) {
-        case CEPHX_GET_AUTH_SESSION_KEY:
-                /* verify auth key */
-                ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
-                                               buf + sizeof(*head), end);
-                break;
-        case CEPHX_GET_PRINCIPAL_SESSION_KEY:
-                th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
-                if (IS_ERR(th))
-                        return PTR_ERR(th);
-                ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
-                                               buf + sizeof(*head), end);
-                break;
-        default:
-                return -EINVAL;
-        }
-        if (ret)
-                return ret;
-        if (ac->want_keys == xi->have_keys)
-                return 0;
-        return -EAGAIN;
-}
-static int ceph_x_create_authorizer(
-        struct ceph_auth_client *ac, int peer_type,
-        struct ceph_authorizer **a,
-        void **buf, size_t *len,
-        void **reply_buf, size_t *reply_len)
-{
-        struct ceph_x_authorizer *au;
-        struct ceph_x_ticket_handler *th;
-        int ret;
-        th = get_ticket_handler(ac, peer_type);
-        if (IS_ERR(th))
-                return PTR_ERR(th);
-        au = kzalloc(sizeof(*au), GFP_NOFS);
-        if (!au)
-                return -ENOMEM;
-        ret = ceph_x_build_authorizer(ac, th, au);
-        if (ret) {
-                kfree(au);
-                return ret;
-        }
-        *a = (struct ceph_authorizer *)au;
-        *buf = au->buf->vec.iov_base;
-        *len = au->buf->vec.iov_len;
-        *reply_buf = au->reply_buf;
-        *reply_len = sizeof(au->reply_buf);
-        return 0;
-}
-static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
-                                          struct ceph_authorizer *a, size_t len)
-{
-        struct ceph_x_authorizer *au = (void *)a;
-        struct ceph_x_ticket_handler *th;
-        int ret = 0;
-        struct ceph_x_authorize_reply reply;
-        void *p = au->reply_buf;
-        void *end = p + sizeof(au->reply_buf);
-        th = get_ticket_handler(ac, au->service);
-        if (IS_ERR(th))
-                return PTR_ERR(th);
-        ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
-        if (ret < 0)
-                return ret;
-        if (ret != sizeof(reply))
-                return -EPERM;
-        if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
-                ret = -EPERM;
-        else
-                ret = 0;
-        dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
-             au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
-        return ret;
-}
-static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
-                                      struct ceph_authorizer *a)
-{
-        struct ceph_x_authorizer *au = (void *)a;
-        ceph_buffer_put(au->buf);
-        kfree(au);
-}
-static void ceph_x_reset(struct ceph_auth_client *ac)
-{
-        struct ceph_x_info *xi = ac->private;
-        dout("reset\n");
-        xi->starting = true;
-        xi->server_challenge = 0;
-}
-static void ceph_x_destroy(struct ceph_auth_client *ac)
-{
-        struct ceph_x_info *xi = ac->private;
-        struct rb_node *p;
-        dout("ceph_x_destroy %p\n", ac);
-        ceph_crypto_key_destroy(&xi->secret);
-        while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
-                struct ceph_x_ticket_handler *th =
-                        rb_entry(p, struct ceph_x_ticket_handler, node);
-                remove_ticket_handler(ac, th);
-        }
-        if (xi->auth_authorizer.buf)
-                ceph_buffer_put(xi->auth_authorizer.buf);
-        kfree(ac->private);
-        ac->private = NULL;
-}
-static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
-                                   int peer_type)
-{
-        struct ceph_x_ticket_handler *th;
-        th = get_ticket_handler(ac, peer_type);
-        if (!IS_ERR(th))
-                remove_ticket_handler(ac, th);
-}
-static const struct ceph_auth_client_ops ceph_x_ops = {
-        .name = "x",
-        .is_authenticated = ceph_x_is_authenticated,
-        .should_authenticate = ceph_x_should_authenticate,
-        .build_request = ceph_x_build_request,
-        .handle_reply = ceph_x_handle_reply,
-        .create_authorizer = ceph_x_create_authorizer,
-        .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
-        .destroy_authorizer = ceph_x_destroy_authorizer,
-        .invalidate_authorizer = ceph_x_invalidate_authorizer,
-        .reset =  ceph_x_reset,
-        .destroy = ceph_x_destroy,
-};
-int ceph_x_init(struct ceph_auth_client *ac)
-{
-        struct ceph_x_info *xi;
-        int ret;
-        dout("ceph_x_init %p\n", ac);
-        ret = -ENOMEM;
-        xi = kzalloc(sizeof(*xi), GFP_NOFS);
-        if (!xi)
-                goto out;
-        ret = -EINVAL;
-        if (!ac->secret) {
-                pr_err("no secret set (for auth_x protocol)\n");
-                goto out_nomem;
-        }
-        ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
-        if (ret)
-                goto out_nomem;
-        xi->starting = true;
-        xi->ticket_handlers = RB_ROOT;
-        ac->protocol = CEPH_AUTH_CEPHX;
-        ac->private = xi;
-        ac->ops = &ceph_x_ops;
-        return 0;
-out_nomem:
-        kfree(xi);
-out:
-        return ret;
-}
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h
deleted file mode 100644
index ff6f8180e681..000000000000
--- a/fs/ceph/auth_x.h
+++ /dev/null
@@ -1,49 +0,0 @@
-#ifndef _FS_CEPH_AUTH_X_H
-#define _FS_CEPH_AUTH_X_H
-#include <linux/rbtree.h>
-#include "crypto.h"
-#include "auth.h"
-#include "auth_x_protocol.h"
-/*
- * Handle ticket for a single service.
- */
-struct ceph_x_ticket_handler {
-        struct rb_node node;
-        unsigned service;
-        struct ceph_crypto_key session_key;
-        struct ceph_timespec validity;
-        u64 secret_id;
-        struct ceph_buffer *ticket_blob;
-        unsigned long renew_after, expires;
-};
-struct ceph_x_authorizer {
-        struct ceph_buffer *buf;
-        unsigned service;
-        u64 nonce;
-        char reply_buf[128];  /* big enough for encrypted blob */
-};
-struct ceph_x_info {
-        struct ceph_crypto_key secret;
-        bool starting;
-        u64 server_challenge;
-        unsigned have_keys;
-        struct rb_root ticket_handlers;
-        struct ceph_x_authorizer auth_authorizer;
-};
-extern int ceph_x_init(struct ceph_auth_client *ac);
-#endif
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h
deleted file mode 100644
index 671d30576c4f..000000000000
--- a/fs/ceph/auth_x_protocol.h
+++ /dev/null
@@ -1,90 +0,0 @@
-#ifndef __FS_CEPH_AUTH_X_PROTOCOL
-#define __FS_CEPH_AUTH_X_PROTOCOL
-#define CEPHX_GET_AUTH_SESSION_KEY      0x0100
-#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
-#define CEPHX_GET_ROTATING_KEY          0x0400
-/* common bits */
-struct ceph_x_ticket_blob {
-        __u8 struct_v;
-        __le64 secret_id;
-        __le32 blob_len;
-        char blob[];
-} __attribute__ ((packed));
-/* common request/reply headers */
-struct ceph_x_request_header {
-        __le16 op;
-} __attribute__ ((packed));
-struct ceph_x_reply_header {
-        __le16 op;
-        __le32 result;
-} __attribute__ ((packed));
-/* authenticate handshake */
-/* initial hello (no reply header) */
-struct ceph_x_server_challenge {
-        __u8 struct_v;
-        __le64 server_challenge;
-} __attribute__ ((packed));
-struct ceph_x_authenticate {
-        __u8 struct_v;
-        __le64 client_challenge;
-        __le64 key;
-        /* ticket blob */
-} __attribute__ ((packed));
-struct ceph_x_service_ticket_request {
-        __u8 struct_v;
-        __le32 keys;
-} __attribute__ ((packed));
-struct ceph_x_challenge_blob {
-        __le64 server_challenge;
-        __le64 client_challenge;
-} __attribute__ ((packed));
-/* authorize handshake */
-/*
- * The authorizer consists of two pieces:
- *  a - service id, ticket blob
- *  b - encrypted with session key
- */
-struct ceph_x_authorize_a {
-        __u8 struct_v;
-        __le64 global_id;
-        __le32 service_id;
-        struct ceph_x_ticket_blob ticket_blob;
-} __attribute__ ((packed));
-struct ceph_x_authorize_b {
-        __u8 struct_v;
-        __le64 nonce;
-} __attribute__ ((packed));
-struct ceph_x_authorize_reply {
-        __u8 struct_v;
-        __le64 nonce_plus_one;
-} __attribute__ ((packed));
-/*
- * encyption bundle
- */
-#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
-struct ceph_x_encrypt_header {
-        __u8 struct_v;
-        __le64 magic;
-} __attribute__ ((packed));
-#endif
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
deleted file mode 100644
index cd39f17021de..000000000000
--- a/fs/ceph/buffer.c
+++ /dev/null
@@ -1,65 +0,0 @@
-#include "ceph_debug.h"
-#include <linux/slab.h>
-#include "buffer.h"
-#include "decode.h"
-struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
-{
-        struct ceph_buffer *b;
-        b = kmalloc(sizeof(*b), gfp);
-        if (!b)
-                return NULL;
-        b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
-        if (b->vec.iov_base) {
-                b->is_vmalloc = false;
-        } else {
-                b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
-                if (!b->vec.iov_base) {
-                        kfree(b);
-                        return NULL;
-                }
-                b->is_vmalloc = true;
-        }
-        kref_init(&b->kref);
-        b->alloc_len = len;
-        b->vec.iov_len = len;
-        dout("buffer_new %p\n", b);
-        return b;
-}
-void ceph_buffer_release(struct kref *kref)
-{
-        struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
-        dout("buffer_release %p\n", b);
-        if (b->vec.iov_base) {
-                if (b->is_vmalloc)
-                        vfree(b->vec.iov_base);
-                else
-                        kfree(b->vec.iov_base);
-        }
-        kfree(b);
-}
-int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
-{
-        size_t len;
-        ceph_decode_need(p, end, sizeof(u32), bad);
-        len = ceph_decode_32(p);
-        dout("decode_buffer len %d\n", (int)len);
-        ceph_decode_need(p, end, len, bad);
-        *b = ceph_buffer_new(len, GFP_NOFS);
-        if (!*b)
-                return -ENOMEM;
-        ceph_decode_copy(p, (*b)->vec.iov_base, len);
-        return 0;
-bad:
-        return -EINVAL;
-}
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
deleted file mode 100644
index 58d19014068f..000000000000
--- a/fs/ceph/buffer.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef __FS_CEPH_BUFFER_H
-#define __FS_CEPH_BUFFER_H
-#include <linux/kref.h>
-#include <linux/mm.h>
-#include <linux/vmalloc.h>
-#include <linux/types.h>
-#include <linux/uio.h>
-/*
- * a simple reference counted buffer.
- *
- * use kmalloc for small sizes (<= one page), vmalloc for larger
- * sizes.
- */
-struct ceph_buffer {
-        struct kref kref;
-        struct kvec vec;
-        size_t alloc_len;
-        bool is_vmalloc;
-};
-extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
-extern void ceph_buffer_release(struct kref *kref);
-static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
-{
-        kref_get(&b->kref);
-        return b;
-}
-static inline void ceph_buffer_put(struct ceph_buffer *b)
-{
-        kref_put(&b->kref, ceph_buffer_release);
-}
-extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
-#endif
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 5e9da996a151..6b61ded701e1 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 #include <linux/fs.h>
 #include <linux/kernel.h>
@@ -9,8 +9,9 @@
 #include <linux/writeback.h>
 #include "super.h"
-#include "decode.h"
+#include "mds_client.h"
-#include "messenger.h"
+#include <linux/ceph/decode.h>
+#include <linux/ceph/messenger.h>
 /*
 * Capability management
@@ -287,11 +288,11 @@ void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
        spin_unlock(&mdsc->caps_list_lock);
 }
-void ceph_reservation_status(struct ceph_client *client,
+void ceph_reservation_status(struct ceph_fs_client *fsc,
                             int *total, int *avail, int *used, int *reserved,
                             int *min)
 {
-        struct ceph_mds_client *mdsc = &client->mdsc;
+        struct ceph_mds_client *mdsc = fsc->mdsc;
        if (total)
                *total = mdsc->caps_total_count;
@@ -399,7 +400,7 @@ static void __insert_cap_node(struct ceph_inode_info *ci,
 static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
                               struct ceph_inode_info *ci)
 {
-        struct ceph_mount_args *ma = mdsc->client->mount_args;
+        struct ceph_mount_options *ma = mdsc->fsc->mount_options;
        ci->i_hold_caps_min = round_jiffies(jiffies +
                                            ma->caps_wanted_delay_min * HZ);
@@ -515,7 +516,7 @@ int ceph_add_cap(struct inode *inode,
                 unsigned seq, unsigned mseq, u64 realmino, int flags,
                 struct ceph_cap_reservation *caps_reservation)
 {
-        struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+        struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_cap *new_cap = NULL;
        struct ceph_cap *cap;
@@ -873,7 +874,7 @@ void __ceph_remove_cap(struct ceph_cap *cap)
        struct ceph_mds_session *session = cap->session;
        struct ceph_inode_info *ci = cap->ci;
        struct ceph_mds_client *mdsc =
-                &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+                ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
        int removed = 0;
        dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -1210,7 +1211,7 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci,
        int mds;
        struct ceph_cap_snap *capsnap;
        u32 mseq;
-        struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+        struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
        struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
                                                    session->s_mutex */
        u64 next_follows = 0;  /* keep track of how far we've gotten through the
@@ -1336,7 +1337,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
 void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 {
        struct ceph_mds_client *mdsc =
-                &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+                ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
        struct inode *inode = &ci->vfs_inode;
        int was = ci->i_dirty_caps;
        int dirty = 0;
@@ -1378,7 +1379,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 static int __mark_caps_flushing(struct inode *inode,
                                 struct ceph_mds_session *session)
 {
-        struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+        struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
        struct ceph_inode_info *ci = ceph_inode(inode);
        int flushing;
@@ -1416,17 +1417,6 @@ static int __mark_caps_flushing(struct inode *inode,
 /*
 * try to invalidate mapping pages without blocking.
 */
-static int mapping_is_empty(struct address_space *mapping)
-{
-        struct page *page = find_get_page(mapping, 0);
-        if (!page)
-                return 1;
-        put_page(page);
-        return 0;
-}
 static int try_nonblocking_invalidate(struct inode *inode)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1436,12 +1426,12 @@ static int try_nonblocking_invalidate(struct inode *inode)
        invalidate_mapping_pages(&inode->i_data, 0, -1);
        spin_lock(&inode->i_lock);
-        if (mapping_is_empty(&inode->i_data) &&
+        if (inode->i_data.nrpages == 0 &&
            invalidating_gen == ci->i_rdcache_gen) {
                /* success. */
                dout("try_nonblocking_invalidate %p success\n", inode);
-                ci->i_rdcache_gen = 0;
+                /* save any racing async invalidate some trouble */
-                ci->i_rdcache_revoking = 0;
+                ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
                return 0;
        }
        dout("try_nonblocking_invalidate %p failed\n", inode);
@@ -1462,8 +1452,8 @@ static int try_nonblocking_invalidate(struct inode *inode)
 void ceph_check_caps(struct ceph_inode_info *ci, int flags,
                     struct ceph_mds_session *session)
 {
-        struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
+        struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
-        struct ceph_mds_client *mdsc = &client->mdsc;
+        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct inode *inode = &ci->vfs_inode;
        struct ceph_cap *cap;
        int file_wanted, used;
@@ -1533,7 +1523,7 @@ retry_locked:
         */
        if ((!is_delayed || mdsc->stopping) &&
            ci->i_wrbuffer_ref == 0 &&               /* no dirty pages... */
-            ci->i_rdcache_gen &&                     /* may have cached pages */
+            inode->i_data.nrpages &&                 /* have cached pages */
            (file_wanted == 0 ||                     /* no open files */
             (revoking & (CEPH_CAP_FILE_CACHE|
                          CEPH_CAP_FILE_LAZYIO))) && /*  or revoking cache */
@@ -1570,9 +1560,10 @@ retry_locked:
                /* NOTE: no side-effects allowed, until we take s_mutex */
                revoking = cap->implemented & ~cap->issued;
-                if (revoking)
+                dout(" mds%d cap %p issued %s implemented %s revoking %s\n",
-                        dout(" mds%d revoking %s\n", cap->mds,
+                     cap->mds, cap, ceph_cap_string(cap->issued),
-                             ceph_cap_string(revoking));
+                     ceph_cap_string(cap->implemented),
+                     ceph_cap_string(revoking));
                if (cap == ci->i_auth_cap &&
                    (cap->issued & CEPH_CAP_FILE_WR)) {
@@ -1668,6 +1659,8 @@ ack:
                if (cap == ci->i_auth_cap && ci->i_dirty_caps)
                        flushing = __mark_caps_flushing(inode, session);
+                else
+                        flushing = 0;
                mds = cap->mds;  /* remember mds, so we don't repeat */
                sent++;
@@ -1706,7 +1699,7 @@ ack:
 static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
                          unsigned *flush_tid)
 {
-        struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+        struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
        struct ceph_inode_info *ci = ceph_inode(inode);
        int unlock_session = session ? 0 : 1;
        int flushing = 0;
@@ -1872,7 +1865,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
                                       caps_are_flushed(inode, flush_tid));
        } else {
                struct ceph_mds_client *mdsc =
-                        &ceph_sb_to_client(inode->i_sb)->mdsc;
+                        ceph_sb_to_client(inode->i_sb)->mdsc;
                spin_lock(&inode->i_lock);
                if (__ceph_caps_dirty(ci))
@@ -1950,6 +1943,35 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
        }
 }
+/*
+ * Re-send the in-progress cap flush for a single inode.
+ *
+ * Under inode->i_lock: looks up the inode's auth cap, flushes pending cap
+ * snaps via __ceph_flush_snaps(), and, if ci->i_flushing_caps is non-zero,
+ * sends a CEPH_CAP_OP_FLUSH message for exactly those caps.
+ *
+ * NOTE(review): ci->i_auth_cap is dereferenced without a NULL check when
+ * i_flushing_caps is set; presumably the caller guarantees an auth cap
+ * exists (handle_cap_import calls this right after ceph_add_cap() with
+ * CEPH_CAP_FLAG_AUTH) -- confirm.
+ */
+static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
+                                     struct ceph_mds_session *session,
+                                     struct inode *inode)
+{
+        struct ceph_inode_info *ci = ceph_inode(inode);
+        struct ceph_cap *cap;
+        int delayed = 0;
+        spin_lock(&inode->i_lock);
+        cap = ci->i_auth_cap;
+        dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
+             ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
+        /* session is passed by address; only this function's local copy can change */
+        __ceph_flush_snaps(ci, &session, 1);
+        if (ci->i_flushing_caps) {
+                /*
+                 * No explicit unlock on this path: __send_cap() appears to
+                 * drop i_lock itself (the lock is re-taken below and only
+                 * the else branch unlocks) -- TODO confirm.
+                 */
+                delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+                                     __ceph_caps_used(ci),
+                                     __ceph_caps_wanted(ci),
+                                     cap->issued | cap->implemented,
+                                     ci->i_flushing_caps, NULL);
+                if (delayed) {
+                        /* non-zero return: requeue the inode, with i_lock held again */
+                        spin_lock(&inode->i_lock);
+                        __cap_delay_requeue(mdsc, ci);
+                        spin_unlock(&inode->i_lock);
+                }
+        } else {
+                /* nothing is being flushed; just release i_lock */
+                spin_unlock(&inode->i_lock);
+        }
+}
 /*
 * Take references to capabilities we hold, so that we don't release
@@ -2283,8 +2305,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
        int mds = session->s_mds;
-        unsigned seq = le32_to_cpu(grant->seq);
+        int seq = le32_to_cpu(grant->seq);
-        unsigned issue_seq = le32_to_cpu(grant->issue_seq);
        int newcaps = le32_to_cpu(grant->caps);
        int issued, implemented, used, wanted, dirty;
        u64 size = le64_to_cpu(grant->size);
@@ -2296,8 +2317,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
        int revoked_rdcache = 0;
        int queue_invalidate = 0;
-        dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n",
+        dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
-             inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps));
+             inode, cap, mds, seq, ceph_cap_string(newcaps));
        dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
                inode->i_size);
@@ -2393,7 +2414,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
        }
        cap->seq = seq;
-        cap->issue_seq = issue_seq;
        /* file layout may have changed */
        ci->i_layout = grant->layout;
@@ -2465,7 +2485,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
        __releases(inode->i_lock)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
-        struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+        struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
        unsigned seq = le32_to_cpu(m->seq);
        int dirty = le32_to_cpu(m->dirty);
        int cleaned = 0;
@@ -2699,8 +2719,13 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
        ceph_add_cap(inode, session, cap_id, -1,
                     issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
                     NULL /* no caps context */);
-        try_flush_caps(inode, session, NULL);
+        kick_flushing_inode_caps(mdsc, session, inode);
        up_read(&mdsc->snap_rwsem);
+        /* make sure we re-request max_size, if necessary */
+        spin_lock(&inode->i_lock);
+        ci->i_requested_max_size = 0;
+        spin_unlock(&inode->i_lock);
 }
 /*
@@ -2713,7 +2738,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
                      struct ceph_msg *msg)
 {
        struct ceph_mds_client *mdsc = session->s_mdsc;
-        struct super_block *sb = mdsc->client->sb;
+        struct super_block *sb = mdsc->fsc->sb;
        struct inode *inode;
        struct ceph_cap *cap;
        struct ceph_mds_caps *h;
@@ -2792,8 +2817,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
        case CEPH_CAP_OP_IMPORT:
                handle_cap_import(mdsc, inode, h, session,
                                  snaptrace, snaptrace_len);
-                ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
+                ceph_check_caps(ceph_inode(inode), 0, session);
-                                session);
                goto done_unlocked;
        }
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
deleted file mode 100644
index 1818c2305610..000000000000
--- a/fs/ceph/ceph_debug.h
+++ /dev/null
@@ -1,37 +0,0 @@
-#ifndef _FS_CEPH_DEBUG_H
-#define _FS_CEPH_DEBUG_H
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
-/*
- * wrap pr_debug to include a filename:lineno prefix on each line.
- * this incurs some overhead (kernel size and execution time) due to
- * the extra function call at each call site.
- */
-# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
-extern const char *ceph_file_part(const char *s, int len);
-#  define dout(fmt, ...)                                                \
-        pr_debug(" %12.12s:%-4d : " fmt,                                \
-                 ceph_file_part(__FILE__, sizeof(__FILE__)),            \
-                 __LINE__, ##__VA_ARGS__)
-# else
-/* faux printk call just to see any compiler warnings. */
-#  define dout(fmt, ...)        do {                            \
-                if (0)                                          \
-                        printk(KERN_DEBUG fmt, ##__VA_ARGS__);  \
-        } while (0)
-# endif
-#else
-/*
- * or, just wrap pr_debug
- */
-# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__)
-#endif
-#endif
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
index ab6cf35c4091..bdce8b1fbd06 100644
--- a/fs/ceph/ceph_frag.c
+++ b/fs/ceph/ceph_frag.c
@@ -1,7 +1,8 @@
 /*
 * Ceph 'frag' type
 */
-#include "types.h"
+#include <linux/module.h>
+#include <linux/ceph/types.h>
 int ceph_frag_compare(__u32 a, __u32 b)
 {
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
deleted file mode 100644
index 5babb8e95352..000000000000
--- a/fs/ceph/ceph_frag.h
+++ /dev/null
@@ -1,109 +0,0 @@
-#ifndef FS_CEPH_FRAG_H
-#define FS_CEPH_FRAG_H
-/*
- * "Frags" are a way to describe a subset of a 32-bit number space,
- * using a mask and a value to match against that mask.  Any given frag
- * (subset of the number space) can be partitioned into 2^n sub-frags.
- *
- * Frags are encoded into a 32-bit word:
- *   8 upper bits = "bits"
- *  24 lower bits = "value"
- * (We could go to 5+27 bits, but who cares.)
- *
- * We use the _most_ significant bits of the 24 bit value.  This makes
- * values logically sort.
- *
- * Unfortunately, because the "bits" field is still in the high bits, we
- * can't sort encoded frags numerically.  However, it does allow you
- * to feed encoded frags as values into frag_contains_value.
- */
-static inline __u32 ceph_frag_make(__u32 b, __u32 v)
-{
-        return (b << 24) |
-                (v & (0xffffffu << (24-b)) & 0xffffffu);
-}
-static inline __u32 ceph_frag_bits(__u32 f)
-{
-        return f >> 24;
-}
-static inline __u32 ceph_frag_value(__u32 f)
-{
-        return f & 0xffffffu;
-}
-static inline __u32 ceph_frag_mask(__u32 f)
-{
-        return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
-}
-static inline __u32 ceph_frag_mask_shift(__u32 f)
-{
-        return 24 - ceph_frag_bits(f);
-}
-static inline int ceph_frag_contains_value(__u32 f, __u32 v)
-{
-        return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
-}
-static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
-{
-        /* is sub as specific as us, and contained by us? */
-        return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
-               (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
-}
-static inline __u32 ceph_frag_parent(__u32 f)
-{
-        return ceph_frag_make(ceph_frag_bits(f) - 1,
-                         ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
-}
-static inline int ceph_frag_is_left_child(__u32 f)
-{
-        return ceph_frag_bits(f) > 0 &&
-                (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
-}
-static inline int ceph_frag_is_right_child(__u32 f)
-{
-        return ceph_frag_bits(f) > 0 &&
-                (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1;
-}
-static inline __u32 ceph_frag_sibling(__u32 f)
-{
-        return ceph_frag_make(ceph_frag_bits(f),
-                      ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
-}
-static inline __u32 ceph_frag_left_child(__u32 f)
-{
-        return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
-}
-static inline __u32 ceph_frag_right_child(__u32 f)
-{
-        return ceph_frag_make(ceph_frag_bits(f)+1,
-              ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
-}
-static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
-{
-        int newbits = ceph_frag_bits(f) + by;
-        return ceph_frag_make(newbits,
-                         ceph_frag_value(f) | (i << (24 - newbits)));
-}
-static inline int ceph_frag_is_leftmost(__u32 f)
-{
-        return ceph_frag_value(f) == 0;
-}
-static inline int ceph_frag_is_rightmost(__u32 f)
-{
-        return ceph_frag_value(f) == ceph_frag_mask(f);
-}
-static inline __u32 ceph_frag_next(__u32 f)
-{
-        return ceph_frag_make(ceph_frag_bits(f),
-                         ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
-}
-/*
- * comparator to sort frags logically, as when traversing the
- * number space in ascending order...
- */
-int ceph_frag_compare(__u32 a, __u32 b);
-#endif
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
deleted file mode 100644
index 3ac6cc7c1156..000000000000
--- a/fs/ceph/ceph_fs.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Some non-inline ceph helpers
- */
-#include "types.h"
-/*
- * return true if @layout appears to be valid
- */
-int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
-{
-        __u32 su = le32_to_cpu(layout->fl_stripe_unit);
-        __u32 sc = le32_to_cpu(layout->fl_stripe_count);
-        __u32 os = le32_to_cpu(layout->fl_object_size);
-        /* stripe unit, object size must be non-zero, 64k increment */
-        if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
-                return 0;
-        if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
-                return 0;
-        /* object size must be a multiple of stripe unit */
-        if (os < su || os % su)
-                return 0;
-        /* stripe count must be non-zero */
-        if (!sc)
-                return 0;
-        return 1;
-}
-int ceph_flags_to_mode(int flags)
-{
-        int mode;
-#ifdef O_DIRECTORY  /* fixme */
-        if ((flags & O_DIRECTORY) == O_DIRECTORY)
-                return CEPH_FILE_MODE_PIN;
-#endif
-        if ((flags & O_APPEND) == O_APPEND)
-                flags |= O_WRONLY;
-        if ((flags & O_ACCMODE) == O_RDWR)
-                mode = CEPH_FILE_MODE_RDWR;
-        else if ((flags & O_ACCMODE) == O_WRONLY)
-                mode = CEPH_FILE_MODE_WR;
-        else
-                mode = CEPH_FILE_MODE_RD;
-#ifdef O_LAZY
-        if (flags & O_LAZY)
-                mode |= CEPH_FILE_MODE_LAZY;
-#endif
-        return mode;
-}
-int ceph_caps_for_mode(int mode)
-{
-        int caps = CEPH_CAP_PIN;
-        if (mode & CEPH_FILE_MODE_RD)
-                caps |= CEPH_CAP_FILE_SHARED |
-                        CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
-        if (mode & CEPH_FILE_MODE_WR)
-                caps |= CEPH_CAP_FILE_EXCL |
-                        CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
-                        CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
-                        CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
-        if (mode & CEPH_FILE_MODE_LAZY)
-                caps |= CEPH_CAP_FILE_LAZYIO;
-        return caps;
-}
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
deleted file mode 100644
index d5619ac86711..000000000000
--- a/fs/ceph/ceph_fs.h
+++ /dev/null
@@ -1,728 +0,0 @@
-/*
- * ceph_fs.h - Ceph constants and data types to share between kernel and
- * user space.
- *
- * Most types in this file are defined as little-endian, and are
- * primarily intended to describe data structures that pass over the
- * wire or that are stored on disk.
- *
- * LGPL2
- */
-#ifndef CEPH_FS_H
-#define CEPH_FS_H
-#include "msgr.h"
-#include "rados.h"
-/*
- * subprotocol versions.  when specific messages types or high-level
- * protocols change, bump the affected components.  we keep rev
- * internal cluster protocols separately from the public,
- * client-facing protocol.
- */
-#define CEPH_OSD_PROTOCOL     8 /* cluster internal */
-#define CEPH_MDS_PROTOCOL    12 /* cluster internal */
-#define CEPH_MON_PROTOCOL     5 /* cluster internal */
-#define CEPH_OSDC_PROTOCOL   24 /* server/client */
-#define CEPH_MDSC_PROTOCOL   32 /* server/client */
-#define CEPH_MONC_PROTOCOL   15 /* server/client */
-#define CEPH_INO_ROOT  1
-#define CEPH_INO_CEPH  2        /* hidden .ceph dir */
-/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
-#define CEPH_MAX_MON   31
-/*
- * feature bits
- */
-#define CEPH_FEATURE_UID            (1<<0)
-#define CEPH_FEATURE_NOSRCADDR      (1<<1)
-#define CEPH_FEATURE_MONCLOCKCHECK  (1<<2)
-#define CEPH_FEATURE_FLOCK          (1<<3)
-/*
- * ceph_file_layout - describe data layout for a file/inode
- */
-struct ceph_file_layout {
-        /* file -> object mapping */
-        __le32 fl_stripe_unit;     /* stripe unit, in bytes.  must be multiple
-                                      of page size. */
-        __le32 fl_stripe_count;    /* over this many objects */
-        __le32 fl_object_size;     /* until objects are this big, then move to
-                                      new objects */
-        __le32 fl_cas_hash;        /* 0 = none; 1 = sha256 */
-        /* pg -> disk layout */
-        __le32 fl_object_stripe_unit;  /* for per-object parity, if any */
-        /* object -> pg layout */
-        __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
-        __le32 fl_pg_pool;      /* namespace, crush ruleset, rep level */
-} __attribute__ ((packed));
-#define CEPH_MIN_STRIPE_UNIT 65536
-int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
-/* crypto algorithms */
-#define CEPH_CRYPTO_NONE 0x0
-#define CEPH_CRYPTO_AES  0x1
-#define CEPH_AES_IV "cephsageyudagreg"
-/* security/authentication protocols */
-#define CEPH_AUTH_UNKNOWN       0x0
-#define CEPH_AUTH_NONE          0x1
-#define CEPH_AUTH_CEPHX         0x2
-#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
-/*********************************************
- * message layer
- */
-/*
- * message types
- */
-/* misc */
-#define CEPH_MSG_SHUTDOWN               1
-#define CEPH_MSG_PING                   2
-/* client <-> monitor */
-#define CEPH_MSG_MON_MAP                4
-#define CEPH_MSG_MON_GET_MAP            5
-#define CEPH_MSG_STATFS                 13
-#define CEPH_MSG_STATFS_REPLY           14
-#define CEPH_MSG_MON_SUBSCRIBE          15
-#define CEPH_MSG_MON_SUBSCRIBE_ACK      16
-#define CEPH_MSG_AUTH                   17
-#define CEPH_MSG_AUTH_REPLY             18
-/* client <-> mds */
-#define CEPH_MSG_MDS_MAP                21
-#define CEPH_MSG_CLIENT_SESSION         22
-#define CEPH_MSG_CLIENT_RECONNECT       23
-#define CEPH_MSG_CLIENT_REQUEST         24
-#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
-#define CEPH_MSG_CLIENT_REPLY           26
-#define CEPH_MSG_CLIENT_CAPS            0x310
-#define CEPH_MSG_CLIENT_LEASE           0x311
-#define CEPH_MSG_CLIENT_SNAP            0x312
-#define CEPH_MSG_CLIENT_CAPRELEASE      0x313
-/* pool ops */
-#define CEPH_MSG_POOLOP_REPLY           48
-#define CEPH_MSG_POOLOP                 49
-/* osd */
-#define CEPH_MSG_OSD_MAP          41
-#define CEPH_MSG_OSD_OP           42
-#define CEPH_MSG_OSD_OPREPLY      43
-/* pool operations */
-enum {
-  POOL_OP_CREATE                        = 0x01,
-  POOL_OP_DELETE                        = 0x02,
-  POOL_OP_AUID_CHANGE                   = 0x03,
-  POOL_OP_CREATE_SNAP                   = 0x11,
-  POOL_OP_DELETE_SNAP                   = 0x12,
-  POOL_OP_CREATE_UNMANAGED_SNAP         = 0x21,
-  POOL_OP_DELETE_UNMANAGED_SNAP         = 0x22,
-};
-struct ceph_mon_request_header {
-        __le64 have_version;
-        __le16 session_mon;
-        __le64 session_mon_tid;
-} __attribute__ ((packed));
-struct ceph_mon_statfs {
-        struct ceph_mon_request_header monhdr;
-        struct ceph_fsid fsid;
-} __attribute__ ((packed));
-struct ceph_statfs {
-        __le64 kb, kb_used, kb_avail;
-        __le64 num_objects;
-} __attribute__ ((packed));
-struct ceph_mon_statfs_reply {
-        struct ceph_fsid fsid;
-        __le64 version;
-        struct ceph_statfs st;
-} __attribute__ ((packed));
-const char *ceph_pool_op_name(int op);
-struct ceph_mon_poolop {
-        struct ceph_mon_request_header monhdr;
-        struct ceph_fsid fsid;
-        __le32 pool;
-        __le32 op;
-        __le64 auid;
-        __le64 snapid;
-        __le32 name_len;
-} __attribute__ ((packed));
-struct ceph_mon_poolop_reply {
-        struct ceph_mon_request_header monhdr;
-        struct ceph_fsid fsid;
-        __le32 reply_code;
-        __le32 epoch;
-        char has_data;
-        char data[0];
-} __attribute__ ((packed));
-struct ceph_mon_unmanaged_snap {
-        __le64 snapid;
-} __attribute__ ((packed));
-struct ceph_osd_getmap {
-        struct ceph_mon_request_header monhdr;
-        struct ceph_fsid fsid;
-        __le32 start;
-} __attribute__ ((packed));
-struct ceph_mds_getmap {
-        struct ceph_mon_request_header monhdr;
-        struct ceph_fsid fsid;
-} __attribute__ ((packed));
-struct ceph_client_mount {
-        struct ceph_mon_request_header monhdr;
-} __attribute__ ((packed));
-struct ceph_mon_subscribe_item {
-        __le64 have_version;    __le64 have;
-        __u8 onetime;
-} __attribute__ ((packed));
-struct ceph_mon_subscribe_ack {
-        __le32 duration;         /* seconds */
-        struct ceph_fsid fsid;
-} __attribute__ ((packed));
-/*
- * mds states
- *   > 0 -> in
- *  <= 0 -> out
- */
-#define CEPH_MDS_STATE_DNE          0  /* down, does not exist. */
-#define CEPH_MDS_STATE_STOPPED     -1  /* down, once existed, but no subtrees.
-                                          empty log. */
-#define CEPH_MDS_STATE_BOOT        -4  /* up, boot announcement. */
-#define CEPH_MDS_STATE_STANDBY     -5  /* up, idle.  waiting for assignment. */
-#define CEPH_MDS_STATE_CREATING    -6  /* up, creating MDS instance. */
-#define CEPH_MDS_STATE_STARTING    -7  /* up, starting previously stopped mds */
-#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
-#define CEPH_MDS_STATE_REPLAY       8  /* up, replaying journal. */
-#define CEPH_MDS_STATE_RESOLVE      9  /* up, disambiguating distributed
-                                          operations (import, rename, etc.) */
-#define CEPH_MDS_STATE_RECONNECT    10 /* up, reconnect to clients */
-#define CEPH_MDS_STATE_REJOIN       11 /* up, rejoining distributed cache */
-#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
-#define CEPH_MDS_STATE_ACTIVE       13 /* up, active */
-#define CEPH_MDS_STATE_STOPPING     14 /* up, but exporting metadata */
-extern const char *ceph_mds_state_name(int s);
-/*
- * metadata lock types.
- *  - these are bitmasks.. we can compose them
- *  - they also define the lock ordering by the MDS
- *  - a few of these are internal to the mds
- */
-#define CEPH_LOCK_DVERSION    1
-#define CEPH_LOCK_DN          2
-#define CEPH_LOCK_ISNAP       16
-#define CEPH_LOCK_IVERSION    32    /* mds internal */
-#define CEPH_LOCK_IFILE       64
-#define CEPH_LOCK_IAUTH       128
-#define CEPH_LOCK_ILINK       256
-#define CEPH_LOCK_IDFT        512   /* dir frag tree */
-#define CEPH_LOCK_INEST       1024  /* mds internal */
-#define CEPH_LOCK_IXATTR      2048
-#define CEPH_LOCK_IFLOCK      4096  /* advisory file locks */
-#define CEPH_LOCK_INO         8192  /* immutable inode bits; not a lock */
-/* client_session ops */
-enum {
-        CEPH_SESSION_REQUEST_OPEN,
-        CEPH_SESSION_OPEN,
-        CEPH_SESSION_REQUEST_CLOSE,
-        CEPH_SESSION_CLOSE,
-        CEPH_SESSION_REQUEST_RENEWCAPS,
-        CEPH_SESSION_RENEWCAPS,
-        CEPH_SESSION_STALE,
-        CEPH_SESSION_RECALL_STATE,
-};
-extern const char *ceph_session_op_name(int op);
-struct ceph_mds_session_head {
-        __le32 op;
-        __le64 seq;
-        struct ceph_timespec stamp;
-        __le32 max_caps, max_leases;
-} __attribute__ ((packed));
-/* client_request */
-/*
- * metadata ops.
- *  & 0x001000 -> write op
- *  & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
- &  & 0x100000 -> use weird ino/path trace
- */
-#define CEPH_MDS_OP_WRITE        0x001000
-enum {
-        CEPH_MDS_OP_LOOKUP     = 0x00100,
-        CEPH_MDS_OP_GETATTR    = 0x00101,
-        CEPH_MDS_OP_LOOKUPHASH = 0x00102,
-        CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
-        CEPH_MDS_OP_SETXATTR   = 0x01105,
-        CEPH_MDS_OP_RMXATTR    = 0x01106,
-        CEPH_MDS_OP_SETLAYOUT  = 0x01107,
-        CEPH_MDS_OP_SETATTR    = 0x01108,
-        CEPH_MDS_OP_SETFILELOCK= 0x01109,
-        CEPH_MDS_OP_GETFILELOCK= 0x00110,
-        CEPH_MDS_OP_MKNOD      = 0x01201,
-        CEPH_MDS_OP_LINK       = 0x01202,
-        CEPH_MDS_OP_UNLINK     = 0x01203,
-        CEPH_MDS_OP_RENAME     = 0x01204,
-        CEPH_MDS_OP_MKDIR      = 0x01220,
-        CEPH_MDS_OP_RMDIR      = 0x01221,
-        CEPH_MDS_OP_SYMLINK    = 0x01222,
-        CEPH_MDS_OP_CREATE     = 0x01301,
-        CEPH_MDS_OP_OPEN       = 0x00302,
-        CEPH_MDS_OP_READDIR    = 0x00305,
-        CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
-        CEPH_MDS_OP_MKSNAP     = 0x01400,
-        CEPH_MDS_OP_RMSNAP     = 0x01401,
-        CEPH_MDS_OP_LSSNAP     = 0x00402,
-};
-extern const char *ceph_mds_op_name(int op);
-#define CEPH_SETATTR_MODE   1
-#define CEPH_SETATTR_UID    2
-#define CEPH_SETATTR_GID    4
-#define CEPH_SETATTR_MTIME  8
-#define CEPH_SETATTR_ATIME 16
-#define CEPH_SETATTR_SIZE  32
-#define CEPH_SETATTR_CTIME 64
-union ceph_mds_request_args {
-        struct {
-                __le32 mask;                 /* CEPH_CAP_* */
-        } __attribute__ ((packed)) getattr;
-        struct {
-                __le32 mode;
-                __le32 uid;
-                __le32 gid;
-                struct ceph_timespec mtime;
-                struct ceph_timespec atime;
-                __le64 size, old_size;       /* old_size needed by truncate */
-                __le32 mask;                 /* CEPH_SETATTR_* */
-        } __attribute__ ((packed)) setattr;
-        struct {
-                __le32 frag;                 /* which dir fragment */
-                __le32 max_entries;          /* how many dentries to grab */
-                __le32 max_bytes;
-        } __attribute__ ((packed)) readdir;
-        struct {
-                __le32 mode;
-                __le32 rdev;
-        } __attribute__ ((packed)) mknod;
-        struct {
-                __le32 mode;
-        } __attribute__ ((packed)) mkdir;
-        struct {
-                __le32 flags;
-                __le32 mode;
-                __le32 stripe_unit;          /* layout for newly created file */
-                __le32 stripe_count;         /* ... */
-                __le32 object_size;
-                __le32 file_replication;
-                __le32 preferred;
-        } __attribute__ ((packed)) open;
-        struct {
-                __le32 flags;
-        } __attribute__ ((packed)) setxattr;
-        struct {
-                struct ceph_file_layout layout;
-        } __attribute__ ((packed)) setlayout;
-        struct {
-                __u8 rule; /* currently fcntl or flock */
-                __u8 type; /* shared, exclusive, remove*/
-                __le64 pid; /* process id requesting the lock */
-                __le64 pid_namespace;
-                __le64 start; /* initial location to lock */
-                __le64 length; /* num bytes to lock from start */
-                __u8 wait; /* will caller wait for lock to become available? */
-        } __attribute__ ((packed)) filelock_change;
-} __attribute__ ((packed));
-#define CEPH_MDS_FLAG_REPLAY        1  /* this is a replayed op */
-#define CEPH_MDS_FLAG_WANT_DENTRY   2  /* want dentry in reply */
-struct ceph_mds_request_head {
-        __le64 oldest_client_tid;
-        __le32 mdsmap_epoch;           /* on client */
-        __le32 flags;                  /* CEPH_MDS_FLAG_* */
-        __u8 num_retry, num_fwd;       /* count retry, fwd attempts */
-        __le16 num_releases;           /* # include cap/lease release records */
-        __le32 op;                     /* mds op code */
-        __le32 caller_uid, caller_gid;
-        __le64 ino;                    /* use this ino for openc, mkdir, mknod,
-                                          etc. (if replaying) */
-        union ceph_mds_request_args args;
-} __attribute__ ((packed));
-/* cap/lease release record */
-struct ceph_mds_request_release {
-        __le64 ino, cap_id;            /* ino and unique cap id */
-        __le32 caps, wanted;           /* new issued, wanted */
-        __le32 seq, issue_seq, mseq;
-        __le32 dname_seq;              /* if releasing a dentry lease, a */
-        __le32 dname_len;              /* string follows. */
-} __attribute__ ((packed));
-/* client reply */
-struct ceph_mds_reply_head {
-        __le32 op;
-        __le32 result;
-        __le32 mdsmap_epoch;
-        __u8 safe;                     /* true if committed to disk */
-        __u8 is_dentry, is_target;     /* true if dentry, target inode records
-                                          are included with reply */
-} __attribute__ ((packed));
-/* one for each node split */
-struct ceph_frag_tree_split {
-        __le32 frag;                   /* this frag splits... */
-        __le32 by;                     /* ...by this many bits */
-} __attribute__ ((packed));
-struct ceph_frag_tree_head {
-        __le32 nsplits;                /* num ceph_frag_tree_split records */
-        struct ceph_frag_tree_split splits[];
-} __attribute__ ((packed));
-/* capability issue, for bundling with mds reply */
-struct ceph_mds_reply_cap {
-        __le32 caps, wanted;           /* caps issued, wanted */
-        __le64 cap_id;
-        __le32 seq, mseq;
-        __le64 realm;                  /* snap realm */
-        __u8 flags;                    /* CEPH_CAP_FLAG_* */
-} __attribute__ ((packed));
-#define CEPH_CAP_FLAG_AUTH  1          /* cap is issued by auth mds */
-/* inode record, for bundling with mds reply */
-struct ceph_mds_reply_inode {
-        __le64 ino;
-        __le64 snapid;
-        __le32 rdev;
-        __le64 version;                /* inode version */
-        __le64 xattr_version;          /* version for xattr blob */
-        struct ceph_mds_reply_cap cap; /* caps issued for this inode */
-        struct ceph_file_layout layout;
-        struct ceph_timespec ctime, mtime, atime;
-        __le32 time_warp_seq;
-        __le64 size, max_size, truncate_size;
-        __le32 truncate_seq;
-        __le32 mode, uid, gid;
-        __le32 nlink;
-        __le64 files, subdirs, rbytes, rfiles, rsubdirs;  /* dir stats */
-        struct ceph_timespec rctime;
-        struct ceph_frag_tree_head fragtree;  /* (must be at end of struct) */
-} __attribute__ ((packed));
-/* followed by frag array, then symlink string, then xattr blob */
-/* reply_lease follows dname, and reply_inode */
-struct ceph_mds_reply_lease {
-        __le16 mask;            /* lease type(s) */
-        __le32 duration_ms;     /* lease duration */
-        __le32 seq;
-} __attribute__ ((packed));
-struct ceph_mds_reply_dirfrag {
-        __le32 frag;            /* fragment */
-        __le32 auth;            /* auth mds, if this is a delegation point */
-        __le32 ndist;           /* number of mds' this is replicated on */
-        __le32 dist[];
-} __attribute__ ((packed));
-#define CEPH_LOCK_FCNTL    1
-#define CEPH_LOCK_FLOCK    2
-#define CEPH_LOCK_SHARED   1
-#define CEPH_LOCK_EXCL     2
-#define CEPH_LOCK_UNLOCK   4
-struct ceph_filelock {
-        __le64 start;/* file offset to start lock at */
-        __le64 length; /* num bytes to lock; 0 for all following start */
-        __le64 client; /* which client holds the lock */
-        __le64 pid; /* process id holding the lock on the client */
-        __le64 pid_namespace;
-        __u8 type; /* shared lock, exclusive lock, or unlock */
-} __attribute__ ((packed));
-/* file access modes */
-#define CEPH_FILE_MODE_PIN        0
-#define CEPH_FILE_MODE_RD         1
-#define CEPH_FILE_MODE_WR         2
-#define CEPH_FILE_MODE_RDWR       3  /* RD | WR */
-#define CEPH_FILE_MODE_LAZY       4  /* lazy io */
-#define CEPH_FILE_MODE_NUM        8  /* bc these are bit fields.. mostly */
-int ceph_flags_to_mode(int flags);
-/* capability bits */
-#define CEPH_CAP_PIN         1  /* no specific capabilities beyond the pin */
-/* generic cap bits */
-#define CEPH_CAP_GSHARED     1  /* client can reads */
-#define CEPH_CAP_GEXCL       2  /* client can read and update */
-#define CEPH_CAP_GCACHE      4  /* (file) client can cache reads */
-#define CEPH_CAP_GRD         8  /* (file) client can read */
-#define CEPH_CAP_GWR        16  /* (file) client can write */
-#define CEPH_CAP_GBUFFER    32  /* (file) client can buffer writes */
-#define CEPH_CAP_GWREXTEND  64  /* (file) client can extend EOF */
-#define CEPH_CAP_GLAZYIO   128  /* (file) client can perform lazy io */
-/* per-lock shift */
-#define CEPH_CAP_SAUTH      2
-#define CEPH_CAP_SLINK      4
-#define CEPH_CAP_SXATTR     6
-#define CEPH_CAP_SFILE      8
-#define CEPH_CAP_SFLOCK    20 
-#define CEPH_CAP_BITS       22
-/* composed values */
-#define CEPH_CAP_AUTH_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SAUTH)
-#define CEPH_CAP_AUTH_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SAUTH)
-#define CEPH_CAP_LINK_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SLINK)
-#define CEPH_CAP_LINK_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SLINK)
-#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED  << CEPH_CAP_SXATTR)
-#define CEPH_CAP_XATTR_EXCL    (CEPH_CAP_GEXCL     << CEPH_CAP_SXATTR)
-#define CEPH_CAP_FILE(x)    (x << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_SHARED   (CEPH_CAP_GSHARED   << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_CACHE    (CEPH_CAP_GCACHE    << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_RD       (CEPH_CAP_GRD       << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_WR       (CEPH_CAP_GWR       << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_BUFFER   (CEPH_CAP_GBUFFER   << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_LAZYIO   (CEPH_CAP_GLAZYIO   << CEPH_CAP_SFILE)
-#define CEPH_CAP_FLOCK_SHARED  (CEPH_CAP_GSHARED   << CEPH_CAP_SFLOCK)
-#define CEPH_CAP_FLOCK_EXCL    (CEPH_CAP_GEXCL     << CEPH_CAP_SFLOCK)
-/* cap masks (for getattr) */
-#define CEPH_STAT_CAP_INODE    CEPH_CAP_PIN
-#define CEPH_STAT_CAP_TYPE     CEPH_CAP_PIN  /* mode >> 12 */
-#define CEPH_STAT_CAP_SYMLINK  CEPH_CAP_PIN
-#define CEPH_STAT_CAP_UID      CEPH_CAP_AUTH_SHARED
-#define CEPH_STAT_CAP_GID      CEPH_CAP_AUTH_SHARED
-#define CEPH_STAT_CAP_MODE     CEPH_CAP_AUTH_SHARED
-#define CEPH_STAT_CAP_NLINK    CEPH_CAP_LINK_SHARED
-#define CEPH_STAT_CAP_LAYOUT   CEPH_CAP_FILE_SHARED
-#define CEPH_STAT_CAP_MTIME    CEPH_CAP_FILE_SHARED
-#define CEPH_STAT_CAP_SIZE     CEPH_CAP_FILE_SHARED
-#define CEPH_STAT_CAP_ATIME    CEPH_CAP_FILE_SHARED  /* fixme */
-#define CEPH_STAT_CAP_XATTR    CEPH_CAP_XATTR_SHARED
-#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN |                 \
-                                 CEPH_CAP_AUTH_SHARED | \
-                                 CEPH_CAP_LINK_SHARED | \
-                                 CEPH_CAP_FILE_SHARED | \
-                                 CEPH_CAP_XATTR_SHARED)
-#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED |                     \
-                              CEPH_CAP_LINK_SHARED |                    \
-                              CEPH_CAP_XATTR_SHARED |                   \
-                              CEPH_CAP_FILE_SHARED)
-#define CEPH_CAP_ANY_RD   (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD |     \
-                           CEPH_CAP_FILE_CACHE)
-#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL |         \
-                           CEPH_CAP_LINK_EXCL |         \
-                           CEPH_CAP_XATTR_EXCL |        \
-                           CEPH_CAP_FILE_EXCL)
-#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
-                              CEPH_CAP_FILE_EXCL)
-#define CEPH_CAP_ANY_WR   (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
-#define CEPH_CAP_ANY      (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
-                           CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
-                           CEPH_CAP_PIN)
-#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
-                        CEPH_LOCK_IXATTR)
-int ceph_caps_for_mode(int mode);
-enum {
-        CEPH_CAP_OP_GRANT,         /* mds->client grant */
-        CEPH_CAP_OP_REVOKE,        /* mds->client revoke */
-        CEPH_CAP_OP_TRUNC,         /* mds->client trunc notify */
-        CEPH_CAP_OP_EXPORT,        /* mds has exported the cap */
-        CEPH_CAP_OP_IMPORT,        /* mds has imported the cap */
-        CEPH_CAP_OP_UPDATE,        /* client->mds update */
-        CEPH_CAP_OP_DROP,          /* client->mds drop cap bits */
-        CEPH_CAP_OP_FLUSH,         /* client->mds cap writeback */
-        CEPH_CAP_OP_FLUSH_ACK,     /* mds->client flushed */
-        CEPH_CAP_OP_FLUSHSNAP,     /* client->mds flush snapped metadata */
-        CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
-        CEPH_CAP_OP_RELEASE,       /* client->mds release (clean) cap */
-        CEPH_CAP_OP_RENEW,         /* client->mds renewal request */
-};
-extern const char *ceph_cap_op_name(int op);
-/*
- * caps message, used for capability callbacks, acks, requests, etc.
- */
-struct ceph_mds_caps {
-        __le32 op;                  /* CEPH_CAP_OP_* */
-        __le64 ino, realm;
-        __le64 cap_id;
-        __le32 seq, issue_seq;
-        __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
-        __le32 migrate_seq;
-        __le64 snap_follows;
-        __le32 snap_trace_len;
-        /* authlock */
-        __le32 uid, gid, mode;
-        /* linklock */
-        __le32 nlink;
-        /* xattrlock */
-        __le32 xattr_len;
-        __le64 xattr_version;
-        /* filelock */
-        __le64 size, max_size, truncate_size;
-        __le32 truncate_seq;
-        struct ceph_timespec mtime, atime, ctime;
-        struct ceph_file_layout layout;
-        __le32 time_warp_seq;
-} __attribute__ ((packed));
-/* cap release msg head */
-struct ceph_mds_cap_release {
-        __le32 num;                /* number of cap_items that follow */
-} __attribute__ ((packed));
-struct ceph_mds_cap_item {
-        __le64 ino;
-        __le64 cap_id;
-        __le32 migrate_seq, seq;
-} __attribute__ ((packed));
-#define CEPH_MDS_LEASE_REVOKE           1  /*    mds  -> client */
-#define CEPH_MDS_LEASE_RELEASE          2  /* client  -> mds    */
-#define CEPH_MDS_LEASE_RENEW            3  /* client <-> mds    */
-#define CEPH_MDS_LEASE_REVOKE_ACK       4  /* client  -> mds    */
-extern const char *ceph_lease_op_name(int o);
-/* lease msg header */
-struct ceph_mds_lease {
-        __u8 action;            /* CEPH_MDS_LEASE_* */
-        __le16 mask;            /* which lease */
-        __le64 ino;
-        __le64 first, last;     /* snap range */
-        __le32 seq;
-        __le32 duration_ms;     /* duration of renewal */
-} __attribute__ ((packed));
-/* followed by a __le32+string for dname */
-/* client reconnect */
-struct ceph_mds_cap_reconnect {
-        __le64 cap_id;
-        __le32 wanted;
-        __le32 issued;
-        __le64 snaprealm;
-        __le64 pathbase;        /* base ino for our path to this ino */
-        __le32 flock_len;       /* size of flock state blob, if any */
-} __attribute__ ((packed));
-/* followed by flock blob */
-struct ceph_mds_cap_reconnect_v1 {
-        __le64 cap_id;
-        __le32 wanted;
-        __le32 issued;
-        __le64 size;
-        struct ceph_timespec mtime, atime;
-        __le64 snaprealm;
-        __le64 pathbase;        /* base ino for our path to this ino */
-} __attribute__ ((packed));
-struct ceph_mds_snaprealm_reconnect {
-        __le64 ino;     /* snap realm base */
-        __le64 seq;     /* snap seq for this snap realm */
-        __le64 parent;  /* parent realm */
-} __attribute__ ((packed));
-/*
- * snaps
- */
-enum {
-        CEPH_SNAP_OP_UPDATE,  /* CREATE or DESTROY */
-        CEPH_SNAP_OP_CREATE,
-        CEPH_SNAP_OP_DESTROY,
-        CEPH_SNAP_OP_SPLIT,
-};
-extern const char *ceph_snap_op_name(int o);
-/* snap msg header */
-struct ceph_mds_snap_head {
-        __le32 op;                /* CEPH_SNAP_OP_* */
-        __le64 split;             /* ino to split off, if any */
-        __le32 num_split_inos;    /* # inos belonging to new child realm */
-        __le32 num_split_realms;  /* # child realms udner new child realm */
-        __le32 trace_len;         /* size of snap trace blob */
-} __attribute__ ((packed));
-/* followed by split ino list, then split realms, then the trace blob */
-/*
- * encode info about a snaprealm, as viewed by a client
- */
-struct ceph_mds_snap_realm {
-        __le64 ino;           /* ino */
-        __le64 created;       /* snap: when created */
-        __le64 parent;        /* ino: parent realm */
-        __le64 parent_since;  /* snap: same parent since */
-        __le64 seq;           /* snap: version */
-        __le32 num_snaps;
-        __le32 num_prior_parent_snaps;
-} __attribute__ ((packed));
-/* followed by my snap list, then prior parent snap list */
-#endif
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
deleted file mode 100644
index bd570015d147..000000000000
--- a/fs/ceph/ceph_hash.c
+++ /dev/null
@@ -1,118 +0,0 @@
-#include "types.h"
-/*
- * Robert Jenkin's hash function.
- * http://burtleburtle.net/bob/hash/evahash.html
- * This is in the public domain.
- */
-#define mix(a, b, c)                                            \
-        do {                                                    \
-                a = a - b;  a = a - c;  a = a ^ (c >> 13);      \
-                b = b - c;  b = b - a;  b = b ^ (a << 8);       \
-                c = c - a;  c = c - b;  c = c ^ (b >> 13);      \
-                a = a - b;  a = a - c;  a = a ^ (c >> 12);      \
-                b = b - c;  b = b - a;  b = b ^ (a << 16);      \
-                c = c - a;  c = c - b;  c = c ^ (b >> 5);       \
-                a = a - b;  a = a - c;  a = a ^ (c >> 3);       \
-                b = b - c;  b = b - a;  b = b ^ (a << 10);      \
-                c = c - a;  c = c - b;  c = c ^ (b >> 15);      \
-        } while (0)
-unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
-{
-        const unsigned char *k = (const unsigned char *)str;
-        __u32 a, b, c;  /* the internal state */
-        __u32 len;      /* how many key bytes still need mixing */
-        /* Set up the internal state */
-        len = length;
-        a = 0x9e3779b9;      /* the golden ratio; an arbitrary value */
-        b = a;
-        c = 0;               /* variable initialization of internal state */
-        /* handle most of the key */
-        while (len >= 12) {
-                a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
-                         ((__u32)k[3] << 24));
-                b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
-                         ((__u32)k[7] << 24));
-                c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
-                         ((__u32)k[11] << 24));
-                mix(a, b, c);
-                k = k + 12;
-                len = len - 12;
-        }
-        /* handle the last 11 bytes */
-        c = c + length;
-        switch (len) {            /* all the case statements fall through */
-        case 11:
-                c = c + ((__u32)k[10] << 24);
-        case 10:
-                c = c + ((__u32)k[9] << 16);
-        case 9:
-                c = c + ((__u32)k[8] << 8);
-                /* the first byte of c is reserved for the length */
-        case 8:
-                b = b + ((__u32)k[7] << 24);
-        case 7:
-                b = b + ((__u32)k[6] << 16);
-        case 6:
-                b = b + ((__u32)k[5] << 8);
-        case 5:
-                b = b + k[4];
-        case 4:
-                a = a + ((__u32)k[3] << 24);
-        case 3:
-                a = a + ((__u32)k[2] << 16);
-        case 2:
-                a = a + ((__u32)k[1] << 8);
-        case 1:
-                a = a + k[0];
-                /* case 0: nothing left to add */
-        }
-        mix(a, b, c);
-        return c;
-}
-/*
- * linux dcache hash
- */
-unsigned ceph_str_hash_linux(const char *str, unsigned length)
-{
-        unsigned long hash = 0;
-        unsigned char c;
-        while (length--) {
-                c = *str++;
-                hash = (hash + (c << 4) + (c >> 4)) * 11;
-        }
-        return hash;
-}
-unsigned ceph_str_hash(int type, const char *s, unsigned len)
-{
-        switch (type) {
-        case CEPH_STR_HASH_LINUX:
-                return ceph_str_hash_linux(s, len);
-        case CEPH_STR_HASH_RJENKINS:
-                return ceph_str_hash_rjenkins(s, len);
-        default:
-                return -1;
-        }
-}
-const char *ceph_str_hash_name(int type)
-{
-        switch (type) {
-        case CEPH_STR_HASH_LINUX:
-                return "linux";
-        case CEPH_STR_HASH_RJENKINS:
-                return "rjenkins";
-        default:
-                return "unknown";
-        }
-}
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
deleted file mode 100644
index d099c3f90236..000000000000
--- a/fs/ceph/ceph_hash.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef FS_CEPH_HASH_H
-#define FS_CEPH_HASH_H
-#define CEPH_STR_HASH_LINUX      0x1  /* linux dcache hash */
-#define CEPH_STR_HASH_RJENKINS   0x2  /* robert jenkins' */
-extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
-extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
-extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
-extern const char *ceph_str_hash_name(int type);
-#endif
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
deleted file mode 100644
index fabd302e5779..000000000000
--- a/fs/ceph/crush/crush.c
+++ /dev/null
@@ -1,151 +0,0 @@
-#ifdef __KERNEL__
-# include <linux/slab.h>
-#else
-# include <stdlib.h>
-# include <assert.h>
-# define kfree(x) do { if (x) free(x); } while (0)
-# define BUG_ON(x) assert(!(x))
-#endif
-#include "crush.h"
-const char *crush_bucket_alg_name(int alg)
-{
-        switch (alg) {
-        case CRUSH_BUCKET_UNIFORM: return "uniform";
-        case CRUSH_BUCKET_LIST: return "list";
-        case CRUSH_BUCKET_TREE: return "tree";
-        case CRUSH_BUCKET_STRAW: return "straw";
-        default: return "unknown";
-        }
-}
-/**
- * crush_get_bucket_item_weight - Get weight of an item in given bucket
- * @b: bucket pointer
- * @p: item index in bucket
- */
-int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
-{
-        if (p >= b->size)
-                return 0;
-        switch (b->alg) {
-        case CRUSH_BUCKET_UNIFORM:
-                return ((struct crush_bucket_uniform *)b)->item_weight;
-        case CRUSH_BUCKET_LIST:
-                return ((struct crush_bucket_list *)b)->item_weights[p];
-        case CRUSH_BUCKET_TREE:
-                if (p & 1)
-                        return ((struct crush_bucket_tree *)b)->node_weights[p];
-                return 0;
-        case CRUSH_BUCKET_STRAW:
-                return ((struct crush_bucket_straw *)b)->item_weights[p];
-        }
-        return 0;
-}
-/**
- * crush_calc_parents - Calculate parent vectors for the given crush map.
- * @map: crush_map pointer
- */
-void crush_calc_parents(struct crush_map *map)
-{
-        int i, b, c;
-        for (b = 0; b < map->max_buckets; b++) {
-                if (map->buckets[b] == NULL)
-                        continue;
-                for (i = 0; i < map->buckets[b]->size; i++) {
-                        c = map->buckets[b]->items[i];
-                        BUG_ON(c >= map->max_devices ||
-                               c < -map->max_buckets);
-                        if (c >= 0)
-                                map->device_parents[c] = map->buckets[b]->id;
-                        else
-                                map->bucket_parents[-1-c] = map->buckets[b]->id;
-                }
-        }
-}
-void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
-{
-        kfree(b->h.perm);
-        kfree(b->h.items);
-        kfree(b);
-}
-void crush_destroy_bucket_list(struct crush_bucket_list *b)
-{
-        kfree(b->item_weights);
-        kfree(b->sum_weights);
-        kfree(b->h.perm);
-        kfree(b->h.items);
-        kfree(b);
-}
-void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
-{
-        kfree(b->node_weights);
-        kfree(b);
-}
-void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
-{
-        kfree(b->straws);
-        kfree(b->item_weights);
-        kfree(b->h.perm);
-        kfree(b->h.items);
-        kfree(b);
-}
-void crush_destroy_bucket(struct crush_bucket *b)
-{
-        switch (b->alg) {
-        case CRUSH_BUCKET_UNIFORM:
-                crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
-                break;
-        case CRUSH_BUCKET_LIST:
-                crush_destroy_bucket_list((struct crush_bucket_list *)b);
-                break;
-        case CRUSH_BUCKET_TREE:
-                crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
-                break;
-        case CRUSH_BUCKET_STRAW:
-                crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
-                break;
-        }
-}
-/**
- * crush_destroy - Destroy a crush_map
- * @map: crush_map pointer
- */
-void crush_destroy(struct crush_map *map)
-{
-        int b;
-        /* buckets */
-        if (map->buckets) {
-                for (b = 0; b < map->max_buckets; b++) {
-                        if (map->buckets[b] == NULL)
-                                continue;
-                        crush_destroy_bucket(map->buckets[b]);
-                }
-                kfree(map->buckets);
-        }
-        /* rules */
-        if (map->rules) {
-                for (b = 0; b < map->max_rules; b++)
-                        kfree(map->rules[b]);
-                kfree(map->rules);
-        }
-        kfree(map->bucket_parents);
-        kfree(map->device_parents);
-        kfree(map);
-}
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
deleted file mode 100644
index 97e435b191f4..000000000000
--- a/fs/ceph/crush/crush.h
+++ /dev/null
@@ -1,180 +0,0 @@
-#ifndef CEPH_CRUSH_CRUSH_H
-#define CEPH_CRUSH_CRUSH_H
-#include <linux/types.h>
-/*
- * CRUSH is a pseudo-random data distribution algorithm that
- * efficiently distributes input values (typically, data objects)
- * across a heterogeneous, structured storage cluster.
- *
- * The algorithm was originally described in detail in this paper
- * (although the algorithm has evolved somewhat since then):
- *
- *     http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
- *
- * LGPL2
- */
-#define CRUSH_MAGIC 0x00010000ul   /* for detecting algorithm revisions */
-#define CRUSH_MAX_DEPTH 10  /* max crush hierarchy depth */
-#define CRUSH_MAX_SET   10  /* max size of a mapping result */
-/*
- * CRUSH uses user-defined "rules" to describe how inputs should be
- * mapped to devices.  A rule consists of sequence of steps to perform
- * to generate the set of output devices.
- */
-struct crush_rule_step {
-        __u32 op;
-        __s32 arg1;
-        __s32 arg2;
-};
-/* step op codes */
-enum {
-        CRUSH_RULE_NOOP = 0,
-        CRUSH_RULE_TAKE = 1,          /* arg1 = value to start with */
-        CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
-                                      /* arg2 = type */
-        CRUSH_RULE_CHOOSE_INDEP = 3,  /* same */
-        CRUSH_RULE_EMIT = 4,          /* no args */
-        CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
-        CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
-};
-/*
- * for specifying choose num (arg1) relative to the max parameter
- * passed to do_rule
- */
-#define CRUSH_CHOOSE_N            0
-#define CRUSH_CHOOSE_N_MINUS(x)   (-(x))
-/*
- * The rule mask is used to describe what the rule is intended for.
- * Given a ruleset and size of output set, we search through the
- * rule list for a matching rule_mask.
- */
-struct crush_rule_mask {
-        __u8 ruleset;
-        __u8 type;
-        __u8 min_size;
-        __u8 max_size;
-};
-struct crush_rule {
-        __u32 len;
-        struct crush_rule_mask mask;
-        struct crush_rule_step steps[0];
-};
-#define crush_rule_size(len) (sizeof(struct crush_rule) + \
-                              (len)*sizeof(struct crush_rule_step))
-/*
- * A bucket is a named container of other items (either devices or
- * other buckets).  Items within a bucket are chosen using one of a
- * few different algorithms.  The table summarizes how the speed of
- * each option measures up against mapping stability when items are
- * added or removed.
- *
- *  Bucket Alg     Speed       Additions    Removals
- *  ------------------------------------------------
- *  uniform         O(1)       poor         poor
- *  list            O(n)       optimal      poor
- *  tree            O(log n)   good         good
- *  straw           O(n)       optimal      optimal
- */
-enum {
-        CRUSH_BUCKET_UNIFORM = 1,
-        CRUSH_BUCKET_LIST = 2,
-        CRUSH_BUCKET_TREE = 3,
-        CRUSH_BUCKET_STRAW = 4
-};
-extern const char *crush_bucket_alg_name(int alg);
-struct crush_bucket {
-        __s32 id;        /* this'll be negative */
-        __u16 type;      /* non-zero; type=0 is reserved for devices */
-        __u8 alg;        /* one of CRUSH_BUCKET_* */
-        __u8 hash;       /* which hash function to use, CRUSH_HASH_* */
-        __u32 weight;    /* 16-bit fixed point */
-        __u32 size;      /* num items */
-        __s32 *items;
-        /*
-         * cached random permutation: used for uniform bucket and for
-         * the linear search fallback for the other bucket types.
-         */
-        __u32 perm_x;  /* @x for which *perm is defined */
-        __u32 perm_n;  /* num elements of *perm that are permuted/defined */
-        __u32 *perm;
-};
-struct crush_bucket_uniform {
-        struct crush_bucket h;
-        __u32 item_weight;  /* 16-bit fixed point; all items equally weighted */
-};
-struct crush_bucket_list {
-        struct crush_bucket h;
-        __u32 *item_weights;  /* 16-bit fixed point */
-        __u32 *sum_weights;   /* 16-bit fixed point.  element i is sum
-                                 of weights 0..i, inclusive */
-};
-struct crush_bucket_tree {
-        struct crush_bucket h;  /* note: h.size is _tree_ size, not number of
-                                   actual items */
-        __u8 num_nodes;
-        __u32 *node_weights;
-};
-struct crush_bucket_straw {
-        struct crush_bucket h;
-        __u32 *item_weights;   /* 16-bit fixed point */
-        __u32 *straws;         /* 16-bit fixed point */
-};
-/*
- * CRUSH map includes all buckets, rules, etc.
- */
-struct crush_map {
-        struct crush_bucket **buckets;
-        struct crush_rule **rules;
-        /*
-         * Parent pointers to identify the parent bucket a device or
-         * bucket in the hierarchy.  If an item appears more than
-         * once, this is the _last_ time it appeared (where buckets
-         * are processed in bucket id order, from -1 on down to
-         * -max_buckets.
-         */
-        __u32 *bucket_parents;
-        __u32 *device_parents;
-        __s32 max_buckets;
-        __u32 max_rules;
-        __s32 max_devices;
-};
-/* crush.c */
-extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
-extern void crush_calc_parents(struct crush_map *map);
-extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
-extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
-extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
-extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
-extern void crush_destroy_bucket(struct crush_bucket *b);
-extern void crush_destroy(struct crush_map *map);
-#endif
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
deleted file mode 100644
index 5873aed694bf..000000000000
--- a/fs/ceph/crush/hash.c
+++ /dev/null
@@ -1,149 +0,0 @@
-#include <linux/types.h>
-#include "hash.h"
-/*
- * Robert Jenkins' function for mixing 32-bit values
- * http://burtleburtle.net/bob/hash/evahash.html
- * a, b = random bits, c = input and output
- */
-#define crush_hashmix(a, b, c) do {                     \
-                a = a-b;  a = a-c;  a = a^(c>>13);      \
-                b = b-c;  b = b-a;  b = b^(a<<8);       \
-                c = c-a;  c = c-b;  c = c^(b>>13);      \
-                a = a-b;  a = a-c;  a = a^(c>>12);      \
-                b = b-c;  b = b-a;  b = b^(a<<16);      \
-                c = c-a;  c = c-b;  c = c^(b>>5);       \
-                a = a-b;  a = a-c;  a = a^(c>>3);       \
-                b = b-c;  b = b-a;  b = b^(a<<10);      \
-                c = c-a;  c = c-b;  c = c^(b>>15);      \
-        } while (0)
-#define crush_hash_seed 1315423911
-static __u32 crush_hash32_rjenkins1(__u32 a)
-{
-        __u32 hash = crush_hash_seed ^ a;
-        __u32 b = a;
-        __u32 x = 231232;
-        __u32 y = 1232;
-        crush_hashmix(b, x, hash);
-        crush_hashmix(y, a, hash);
-        return hash;
-}
-static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
-{
-        __u32 hash = crush_hash_seed ^ a ^ b;
-        __u32 x = 231232;
-        __u32 y = 1232;
-        crush_hashmix(a, b, hash);
-        crush_hashmix(x, a, hash);
-        crush_hashmix(b, y, hash);
-        return hash;
-}
-static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
-{
-        __u32 hash = crush_hash_seed ^ a ^ b ^ c;
-        __u32 x = 231232;
-        __u32 y = 1232;
-        crush_hashmix(a, b, hash);
-        crush_hashmix(c, x, hash);
-        crush_hashmix(y, a, hash);
-        crush_hashmix(b, x, hash);
-        crush_hashmix(y, c, hash);
-        return hash;
-}
-static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
-{
-        __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
-        __u32 x = 231232;
-        __u32 y = 1232;
-        crush_hashmix(a, b, hash);
-        crush_hashmix(c, d, hash);
-        crush_hashmix(a, x, hash);
-        crush_hashmix(y, b, hash);
-        crush_hashmix(c, x, hash);
-        crush_hashmix(y, d, hash);
-        return hash;
-}
-static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
-                                      __u32 e)
-{
-        __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
-        __u32 x = 231232;
-        __u32 y = 1232;
-        crush_hashmix(a, b, hash);
-        crush_hashmix(c, d, hash);
-        crush_hashmix(e, x, hash);
-        crush_hashmix(y, a, hash);
-        crush_hashmix(b, x, hash);
-        crush_hashmix(y, c, hash);
-        crush_hashmix(d, x, hash);
-        crush_hashmix(y, e, hash);
-        return hash;
-}
-__u32 crush_hash32(int type, __u32 a)
-{
-        switch (type) {
-        case CRUSH_HASH_RJENKINS1:
-                return crush_hash32_rjenkins1(a);
-        default:
-                return 0;
-        }
-}
-__u32 crush_hash32_2(int type, __u32 a, __u32 b)
-{
-        switch (type) {
-        case CRUSH_HASH_RJENKINS1:
-                return crush_hash32_rjenkins1_2(a, b);
-        default:
-                return 0;
-        }
-}
-__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
-{
-        switch (type) {
-        case CRUSH_HASH_RJENKINS1:
-                return crush_hash32_rjenkins1_3(a, b, c);
-        default:
-                return 0;
-        }
-}
-__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
-{
-        switch (type) {
-        case CRUSH_HASH_RJENKINS1:
-                return crush_hash32_rjenkins1_4(a, b, c, d);
-        default:
-                return 0;
-        }
-}
-__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
-{
-        switch (type) {
-        case CRUSH_HASH_RJENKINS1:
-                return crush_hash32_rjenkins1_5(a, b, c, d, e);
-        default:
-                return 0;
-        }
-}
-const char *crush_hash_name(int type)
-{
-        switch (type) {
-        case CRUSH_HASH_RJENKINS1:
-                return "rjenkins1";
-        default:
-                return "unknown";
-        }
-}
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
deleted file mode 100644
index 91e884230d5d..000000000000
--- a/fs/ceph/crush/hash.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef CEPH_CRUSH_HASH_H
-#define CEPH_CRUSH_HASH_H
-#define CRUSH_HASH_RJENKINS1   0
-#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
-extern const char *crush_hash_name(int type);
-extern __u32 crush_hash32(int type, __u32 a);
-extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
-extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
-extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
-extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
-                            __u32 e);
-#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
deleted file mode 100644
index a4eec133258e..000000000000
--- a/fs/ceph/crush/mapper.c
+++ /dev/null
@@ -1,609 +0,0 @@
-#ifdef __KERNEL__
-# include <linux/string.h>
-# include <linux/slab.h>
-# include <linux/bug.h>
-# include <linux/kernel.h>
-# ifndef dprintk
-#  define dprintk(args...)
-# endif
-#else
-# include <string.h>
-# include <stdio.h>
-# include <stdlib.h>
-# include <assert.h>
-# define BUG_ON(x) assert(!(x))
-# define dprintk(args...) /* printf(args) */
-# define kmalloc(x, f) malloc(x)
-# define kfree(x) free(x)
-#endif
-#include "crush.h"
-#include "hash.h"
-/*
- * Implement the core CRUSH mapping algorithm.
- */
-/**
- * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
- * @map: the crush_map
- * @ruleset: the storage ruleset id (user defined)
- * @type: storage ruleset type (user defined)
- * @size: output set size
- */
-int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
-{
-        int i;
-        for (i = 0; i < map->max_rules; i++) {
-                if (map->rules[i] &&
-                    map->rules[i]->mask.ruleset == ruleset &&
-                    map->rules[i]->mask.type == type &&
-                    map->rules[i]->mask.min_size <= size &&
-                    map->rules[i]->mask.max_size >= size)
-                        return i;
-        }
-        return -1;
-}
-/*
- * bucket choose methods
- *
- * For each bucket algorithm, we have a "choose" method that, given a
- * crush input @x and replica position (usually, position in output set) @r,
- * will produce an item in the bucket.
- */
-/*
- * Choose based on a random permutation of the bucket.
- *
- * We used to use some prime number arithmetic to do this, but it
- * wasn't very random, and had some other bad behaviors.  Instead, we
- * calculate an actual random permutation of the bucket members.
- * Since this is expensive, we optimize for the r=0 case, which
- * captures the vast majority of calls.
- */
-static int bucket_perm_choose(struct crush_bucket *bucket,
-                              int x, int r)
-{
-        unsigned pr = r % bucket->size;
-        unsigned i, s;
-        /* start a new permutation if @x has changed */
-        if (bucket->perm_x != x || bucket->perm_n == 0) {
-                dprintk("bucket %d new x=%d\n", bucket->id, x);
-                bucket->perm_x = x;
-                /* optimize common r=0 case */
-                if (pr == 0) {
-                        s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
-                                bucket->size;
-                        bucket->perm[0] = s;
-                        bucket->perm_n = 0xffff;   /* magic value, see below */
-                        goto out;
-                }
-                for (i = 0; i < bucket->size; i++)
-                        bucket->perm[i] = i;
-                bucket->perm_n = 0;
-        } else if (bucket->perm_n == 0xffff) {
-                /* clean up after the r=0 case above */
-                for (i = 1; i < bucket->size; i++)
-                        bucket->perm[i] = i;
-                bucket->perm[bucket->perm[0]] = 0;
-                bucket->perm_n = 1;
-        }
-        /* calculate permutation up to pr */
-        for (i = 0; i < bucket->perm_n; i++)
-                dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
-        while (bucket->perm_n <= pr) {
-                unsigned p = bucket->perm_n;
-                /* no point in swapping the final entry */
-                if (p < bucket->size - 1) {
-                        i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
-                                (bucket->size - p);
-                        if (i) {
-                                unsigned t = bucket->perm[p + i];
-                                bucket->perm[p + i] = bucket->perm[p];
-                                bucket->perm[p] = t;
-                        }
-                        dprintk(" perm_choose swap %d with %d\n", p, p+i);
-                }
-                bucket->perm_n++;
-        }
-        for (i = 0; i < bucket->size; i++)
-                dprintk(" perm_choose  %d: %d\n", i, bucket->perm[i]);
-        s = bucket->perm[pr];
-out:
-        dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
-                bucket->size, x, r, pr, s);
-        return bucket->items[s];
-}
-/* uniform */
-static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
-                                 int x, int r)
-{
-        return bucket_perm_choose(&bucket->h, x, r);
-}
-/* list */
-static int bucket_list_choose(struct crush_bucket_list *bucket,
-                              int x, int r)
-{
-        int i;
-        for (i = bucket->h.size-1; i >= 0; i--) {
-                __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
-                                         r, bucket->h.id);
-                w &= 0xffff;
-                dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
-                        "sw %x rand %llx",
-                        i, x, r, bucket->h.items[i], bucket->item_weights[i],
-                        bucket->sum_weights[i], w);
-                w *= bucket->sum_weights[i];
-                w = w >> 16;
-                /*dprintk(" scaled %llx\n", w);*/
-                if (w < bucket->item_weights[i])
-                        return bucket->h.items[i];
-        }
-        BUG_ON(1);
-        return 0;
-}
-/* (binary) tree */
-static int height(int n)
-{
-        int h = 0;
-        while ((n & 1) == 0) {
-                h++;
-                n = n >> 1;
-        }
-        return h;
-}
-static int left(int x)
-{
-        int h = height(x);
-        return x - (1 << (h-1));
-}
-static int right(int x)
-{
-        int h = height(x);
-        return x + (1 << (h-1));
-}
-static int terminal(int x)
-{
-        return x & 1;
-}
-static int bucket_tree_choose(struct crush_bucket_tree *bucket,
-                              int x, int r)
-{
-        int n, l;
-        __u32 w;
-        __u64 t;
-        /* start at root */
-        n = bucket->num_nodes >> 1;
-        while (!terminal(n)) {
-                /* pick point in [0, w) */
-                w = bucket->node_weights[n];
-                t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
-                                          bucket->h.id) * (__u64)w;
-                t = t >> 32;
-                /* descend to the left or right? */
-                l = left(n);
-                if (t < bucket->node_weights[l])
-                        n = l;
-                else
-                        n = right(n);
-        }
-        return bucket->h.items[n >> 1];
-}
-/* straw */
-static int bucket_straw_choose(struct crush_bucket_straw *bucket,
-                               int x, int r)
-{
-        int i;
-        int high = 0;
-        __u64 high_draw = 0;
-        __u64 draw;
-        for (i = 0; i < bucket->h.size; i++) {
-                draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
-                draw &= 0xffff;
-                draw *= bucket->straws[i];
-                if (i == 0 || draw > high_draw) {
-                        high = i;
-                        high_draw = draw;
-                }
-        }
-        return bucket->h.items[high];
-}
-static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
-{
-        dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
-        switch (in->alg) {
-        case CRUSH_BUCKET_UNIFORM:
-                return bucket_uniform_choose((struct crush_bucket_uniform *)in,
-                                          x, r);
-        case CRUSH_BUCKET_LIST:
-                return bucket_list_choose((struct crush_bucket_list *)in,
-                                          x, r);
-        case CRUSH_BUCKET_TREE:
-                return bucket_tree_choose((struct crush_bucket_tree *)in,
-                                          x, r);
-        case CRUSH_BUCKET_STRAW:
-                return bucket_straw_choose((struct crush_bucket_straw *)in,
-                                           x, r);
-        default:
-                BUG_ON(1);
-                return in->items[0];
-        }
-}
-/*
- * true if device is marked "out" (failed, fully offloaded)
- * of the cluster
- */
-static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
-{
-        if (weight[item] >= 0x10000)
-                return 0;
-        if (weight[item] == 0)
-                return 1;
-        if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
-            < weight[item])
-                return 0;
-        return 1;
-}
-/**
- * crush_choose - choose numrep distinct items of given type
- * @map: the crush_map
- * @bucket: the bucket we are choose an item from
- * @x: crush input value
- * @numrep: the number of items to choose
- * @type: the type of item to choose
- * @out: pointer to output vector
- * @outpos: our position in that vector
- * @firstn: true if choosing "first n" items, false if choosing "indep"
- * @recurse_to_leaf: true if we want one device under each item of given type
- * @out2: second output vector for leaf items (if @recurse_to_leaf)
- */
-static int crush_choose(struct crush_map *map,
-                        struct crush_bucket *bucket,
-                        __u32 *weight,
-                        int x, int numrep, int type,
-                        int *out, int outpos,
-                        int firstn, int recurse_to_leaf,
-                        int *out2)
-{
-        int rep;
-        int ftotal, flocal;
-        int retry_descent, retry_bucket, skip_rep;
-        struct crush_bucket *in = bucket;
-        int r;
-        int i;
-        int item = 0;
-        int itemtype;
-        int collide, reject;
-        const int orig_tries = 5; /* attempts before we fall back to search */
-        dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
-                bucket->id, x, outpos, numrep);
-        for (rep = outpos; rep < numrep; rep++) {
-                /* keep trying until we get a non-out, non-colliding item */
-                ftotal = 0;
-                skip_rep = 0;
-                do {
-                        retry_descent = 0;
-                        in = bucket;               /* initial bucket */
-                        /* choose through intervening buckets */
-                        flocal = 0;
-                        do {
-                                collide = 0;
-                                retry_bucket = 0;
-                                r = rep;
-                                if (in->alg == CRUSH_BUCKET_UNIFORM) {
-                                        /* be careful */
-                                        if (firstn || numrep >= in->size)
-                                                /* r' = r + f_total */
-                                                r += ftotal;
-                                        else if (in->size % numrep == 0)
-                                                /* r'=r+(n+1)*f_local */
-                                                r += (numrep+1) *
-                                                        (flocal+ftotal);
-                                        else
-                                                /* r' = r + n*f_local */
-                                                r += numrep * (flocal+ftotal);
-                                } else {
-                                        if (firstn)
-                                                /* r' = r + f_total */
-                                                r += ftotal;
-                                        else
-                                                /* r' = r + n*f_local */
-                                                r += numrep * (flocal+ftotal);
-                                }
-                                /* bucket choose */
-                                if (in->size == 0) {
-                                        reject = 1;
-                                        goto reject;
-                                }
-                                if (flocal >= (in->size>>1) &&
-                                    flocal > orig_tries)
-                                        item = bucket_perm_choose(in, x, r);
-                                else
-                                        item = crush_bucket_choose(in, x, r);
-                                BUG_ON(item >= map->max_devices);
-                                /* desired type? */
-                                if (item < 0)
-                                        itemtype = map->buckets[-1-item]->type;
-                                else
-                                        itemtype = 0;
-                                dprintk("  item %d type %d\n", item, itemtype);
-                                /* keep going? */
-                                if (itemtype != type) {
-                                        BUG_ON(item >= 0 ||
-                                               (-1-item) >= map->max_buckets);
-                                        in = map->buckets[-1-item];
-                                        retry_bucket = 1;
-                                        continue;
-                                }
-                                /* collision? */
-                                for (i = 0; i < outpos; i++) {
-                                        if (out[i] == item) {
-                                                collide = 1;
-                                                break;
-                                        }
-                                }
-                                reject = 0;
-                                if (recurse_to_leaf) {
-                                        if (item < 0) {
-                                                if (crush_choose(map,
-                                                         map->buckets[-1-item],
-                                                         weight,
-                                                         x, outpos+1, 0,
-                                                         out2, outpos,
-                                                         firstn, 0,
-                                                         NULL) <= outpos)
-                                                        /* didn't get leaf */
-                                                        reject = 1;
-                                        } else {
-                                                /* we already have a leaf! */
-                                                out2[outpos] = item;
-                                        }
-                                }
-                                if (!reject) {
-                                        /* out? */
-                                        if (itemtype == 0)
-                                                reject = is_out(map, weight,
-                                                                item, x);
-                                        else
-                                                reject = 0;
-                                }
-reject:
-                                if (reject || collide) {
-                                        ftotal++;
-                                        flocal++;
-                                        if (collide && flocal < 3)
-                                                /* retry locally a few times */
-                                                retry_bucket = 1;
-                                        else if (flocal < in->size + orig_tries)
-                                                /* exhaustive bucket search */
-                                                retry_bucket = 1;
-                                        else if (ftotal < 20)
-                                                /* then retry descent */
-                                                retry_descent = 1;
-                                        else
-                                                /* else give up */
-                                                skip_rep = 1;
-                                        dprintk("  reject %d  collide %d  "
-                                                "ftotal %d  flocal %d\n",
-                                                reject, collide, ftotal,
-                                                flocal);
-                                }
-                        } while (retry_bucket);
-                } while (retry_descent);
-                if (skip_rep) {
-                        dprintk("skip rep\n");
-                        continue;
-                }
-                dprintk("CHOOSE got %d\n", item);
-                out[outpos] = item;
-                outpos++;
-        }
-        dprintk("CHOOSE returns %d\n", outpos);
-        return outpos;
-}
-/**
- * crush_do_rule - calculate a mapping with the given input and rule
- * @map: the crush_map
- * @ruleno: the rule id
- * @x: hash input
- * @result: pointer to result vector
- * @result_max: maximum result size
- * @force: force initial replica choice; -1 for none
- */
-int crush_do_rule(struct crush_map *map,
-                  int ruleno, int x, int *result, int result_max,
-                  int force, __u32 *weight)
-{
-        int result_len;
-        int force_context[CRUSH_MAX_DEPTH];
-        int force_pos = -1;
-        int a[CRUSH_MAX_SET];
-        int b[CRUSH_MAX_SET];
-        int c[CRUSH_MAX_SET];
-        int recurse_to_leaf;
-        int *w;
-        int wsize = 0;
-        int *o;
-        int osize;
-        int *tmp;
-        struct crush_rule *rule;
-        int step;
-        int i, j;
-        int numrep;
-        int firstn;
-        int rc = -1;
-        BUG_ON(ruleno >= map->max_rules);
-        rule = map->rules[ruleno];
-        result_len = 0;
-        w = a;
-        o = b;
-        /*
-         * determine hierarchical context of force, if any.  note
-         * that this may or may not correspond to the specific types
-         * referenced by the crush rule.
-         */
-        if (force >= 0) {
-                if (force >= map->max_devices ||
-                    map->device_parents[force] == 0) {
-                        /*dprintk("CRUSH: forcefed device dne\n");*/
-                        rc = -1;  /* force fed device dne */
-                        goto out;
-                }
-                if (!is_out(map, weight, force, x)) {
-                        while (1) {
-                                force_context[++force_pos] = force;
-                                if (force >= 0)
-                                        force = map->device_parents[force];
-                                else
-                                        force = map->bucket_parents[-1-force];
-                                if (force == 0)
-                                        break;
-                        }
-                }
-        }
-        for (step = 0; step < rule->len; step++) {
-                firstn = 0;
-                switch (rule->steps[step].op) {
-                case CRUSH_RULE_TAKE:
-                        w[0] = rule->steps[step].arg1;
-                        if (force_pos >= 0) {
-                                BUG_ON(force_context[force_pos] != w[0]);
-                                force_pos--;
-                        }
-                        wsize = 1;
-                        break;
-                case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
-                case CRUSH_RULE_CHOOSE_FIRSTN:
-                        firstn = 1;
-                case CRUSH_RULE_CHOOSE_LEAF_INDEP:
-                case CRUSH_RULE_CHOOSE_INDEP:
-                        BUG_ON(wsize == 0);
-                        recurse_to_leaf =
-                                rule->steps[step].op ==
-                                 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
-                                rule->steps[step].op ==
-                                CRUSH_RULE_CHOOSE_LEAF_INDEP;
-                        /* reset output */
-                        osize = 0;
-                        for (i = 0; i < wsize; i++) {
-                                /*
-                                 * see CRUSH_N, CRUSH_N_MINUS macros.
-                                 * basically, numrep <= 0 means relative to
-                                 * the provided result_max
-                                 */
-                                numrep = rule->steps[step].arg1;
-                                if (numrep <= 0) {
-                                        numrep += result_max;
-                                        if (numrep <= 0)
-                                                continue;
-                                }
-                                j = 0;
-                                if (osize == 0 && force_pos >= 0) {
-                                        /* skip any intermediate types */
-                                        while (force_pos &&
-                                               force_context[force_pos] < 0 &&
-                                               rule->steps[step].arg2 !=
-                                               map->buckets[-1 -
-                                               force_context[force_pos]]->type)
-                                                force_pos--;
-                                        o[osize] = force_context[force_pos];
-                                        if (recurse_to_leaf)
-                                                c[osize] = force_context[0];
-                                        j++;
-                                        force_pos--;
-                                }
-                                osize += crush_choose(map,
-                                                      map->buckets[-1-w[i]],
-                                                      weight,
-                                                      x, numrep,
-                                                      rule->steps[step].arg2,
-                                                      o+osize, j,
-                                                      firstn,
-                                                      recurse_to_leaf, c+osize);
-                        }
-                        if (recurse_to_leaf)
-                                /* copy final _leaf_ values to output set */
-                                memcpy(o, c, osize*sizeof(*o));
-                        /* swap t and w arrays */
-                        tmp = o;
-                        o = w;
-                        w = tmp;
-                        wsize = osize;
-                        break;
-                case CRUSH_RULE_EMIT:
-                        for (i = 0; i < wsize && result_len < result_max; i++) {
-                                result[result_len] = w[i];
-                                result_len++;
-                        }
-                        wsize = 0;
-                        break;
-                default:
-                        BUG_ON(1);
-                }
-        }
-        rc = result_len;
-out:
-        return rc;
-}
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
deleted file mode 100644
index c46b99c18bb0..000000000000
--- a/fs/ceph/crush/mapper.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef CEPH_CRUSH_MAPPER_H
-#define CEPH_CRUSH_MAPPER_H
-/*
- * CRUSH functions for find rules and then mapping an input to an
- * output set.
- *
- * LGPL2
- */
-#include "crush.h"
-extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
-extern int crush_do_rule(struct crush_map *map,
-                         int ruleno,
-                         int x, int *result, int result_max,
-                         int forcefeed,    /* -1 for none */
-                         __u32 *weights);
-#endif
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
deleted file mode 100644
index a3e627f63293..000000000000
--- a/fs/ceph/crypto.c
+++ /dev/null
@@ -1,412 +0,0 @@
-#include "ceph_debug.h"
-#include <linux/err.h>
-#include <linux/scatterlist.h>
-#include <linux/slab.h>
-#include <crypto/hash.h>
-#include "crypto.h"
-#include "decode.h"
-int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
-{
-        if (*p + sizeof(u16) + sizeof(key->created) +
-            sizeof(u16) + key->len > end)
-                return -ERANGE;
-        ceph_encode_16(p, key->type);
-        ceph_encode_copy(p, &key->created, sizeof(key->created));
-        ceph_encode_16(p, key->len);
-        ceph_encode_copy(p, key->key, key->len);
-        return 0;
-}
-int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
-{
-        ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
-        key->type = ceph_decode_16(p);
-        ceph_decode_copy(p, &key->created, sizeof(key->created));
-        key->len = ceph_decode_16(p);
-        ceph_decode_need(p, end, key->len, bad);
-        key->key = kmalloc(key->len, GFP_NOFS);
-        if (!key->key)
-                return -ENOMEM;
-        ceph_decode_copy(p, key->key, key->len);
-        return 0;
-bad:
-        dout("failed to decode crypto key\n");
-        return -EINVAL;
-}
-int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
-{
-        int inlen = strlen(inkey);
-        int blen = inlen * 3 / 4;
-        void *buf, *p;
-        int ret;
-        dout("crypto_key_unarmor %s\n", inkey);
-        buf = kmalloc(blen, GFP_NOFS);
-        if (!buf)
-                return -ENOMEM;
-        blen = ceph_unarmor(buf, inkey, inkey+inlen);
-        if (blen < 0) {
-                kfree(buf);
-                return blen;
-        }
-        p = buf;
-        ret = ceph_crypto_key_decode(key, &p, p + blen);
-        kfree(buf);
-        if (ret)
-                return ret;
-        dout("crypto_key_unarmor key %p type %d len %d\n", key,
-             key->type, key->len);
-        return 0;
-}
-#define AES_KEY_SIZE 16
-static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
-{
-        return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
-}
-static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
-static int ceph_aes_encrypt(const void *key, int key_len,
-                            void *dst, size_t *dst_len,
-                            const void *src, size_t src_len)
-{
-        struct scatterlist sg_in[2], sg_out[1];
-        struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
-        struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
-        int ret;
-        void *iv;
-        int ivsize;
-        size_t zero_padding = (0x10 - (src_len & 0x0f));
-        char pad[16];
-        if (IS_ERR(tfm))
-                return PTR_ERR(tfm);
-        memset(pad, zero_padding, zero_padding);
-        *dst_len = src_len + zero_padding;
-        crypto_blkcipher_setkey((void *)tfm, key, key_len);
-        sg_init_table(sg_in, 2);
-        sg_set_buf(&sg_in[0], src, src_len);
-        sg_set_buf(&sg_in[1], pad, zero_padding);
-        sg_init_table(sg_out, 1);
-        sg_set_buf(sg_out, dst, *dst_len);
-        iv = crypto_blkcipher_crt(tfm)->iv;
-        ivsize = crypto_blkcipher_ivsize(tfm);
-        memcpy(iv, aes_iv, ivsize);
-        /*
-        print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
-                       key, key_len, 1);
-        print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
-                        src, src_len, 1);
-        print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
-                        pad, zero_padding, 1);
-        */
-        ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
-                                     src_len + zero_padding);
-        crypto_free_blkcipher(tfm);
-        if (ret < 0)
-                pr_err("ceph_aes_crypt failed %d\n", ret);
-        /*
-        print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
-                       dst, *dst_len, 1);
-        */
-        return 0;
-}
-static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
-                             size_t *dst_len,
-                             const void *src1, size_t src1_len,
-                             const void *src2, size_t src2_len)
-{
-        struct scatterlist sg_in[3], sg_out[1];
-        struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
-        struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
-        int ret;
-        void *iv;
-        int ivsize;
-        size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
-        char pad[16];
-        if (IS_ERR(tfm))
-                return PTR_ERR(tfm);
-        memset(pad, zero_padding, zero_padding);
-        *dst_len = src1_len + src2_len + zero_padding;
-        crypto_blkcipher_setkey((void *)tfm, key, key_len);
-        sg_init_table(sg_in, 3);
-        sg_set_buf(&sg_in[0], src1, src1_len);
-        sg_set_buf(&sg_in[1], src2, src2_len);
-        sg_set_buf(&sg_in[2], pad, zero_padding);
-        sg_init_table(sg_out, 1);
-        sg_set_buf(sg_out, dst, *dst_len);
-        iv = crypto_blkcipher_crt(tfm)->iv;
-        ivsize = crypto_blkcipher_ivsize(tfm);
-        memcpy(iv, aes_iv, ivsize);
-        /*
-        print_hex_dump(KERN_ERR, "enc  key: ", DUMP_PREFIX_NONE, 16, 1,
-                       key, key_len, 1);
-        print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
-                        src1, src1_len, 1);
-        print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
-                        src2, src2_len, 1);
-        print_hex_dump(KERN_ERR, "enc  pad: ", DUMP_PREFIX_NONE, 16, 1,
-                        pad, zero_padding, 1);
-        */
-        ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
-                                     src1_len + src2_len + zero_padding);
-        crypto_free_blkcipher(tfm);
-        if (ret < 0)
-                pr_err("ceph_aes_crypt2 failed %d\n", ret);
-        /*
-        print_hex_dump(KERN_ERR, "enc  out: ", DUMP_PREFIX_NONE, 16, 1,
-                       dst, *dst_len, 1);
-        */
-        return 0;
-}
-static int ceph_aes_decrypt(const void *key, int key_len,
-                            void *dst, size_t *dst_len,
-                            const void *src, size_t src_len)
-{
-        struct scatterlist sg_in[1], sg_out[2];
-        struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
-        struct blkcipher_desc desc = { .tfm = tfm };
-        char pad[16];
-        void *iv;
-        int ivsize;
-        int ret;
-        int last_byte;
-        if (IS_ERR(tfm))
-                return PTR_ERR(tfm);
-        crypto_blkcipher_setkey((void *)tfm, key, key_len);
-        sg_init_table(sg_in, 1);
-        sg_init_table(sg_out, 2);
-        sg_set_buf(sg_in, src, src_len);
-        sg_set_buf(&sg_out[0], dst, *dst_len);
-        sg_set_buf(&sg_out[1], pad, sizeof(pad));
-        iv = crypto_blkcipher_crt(tfm)->iv;
-        ivsize = crypto_blkcipher_ivsize(tfm);
-        memcpy(iv, aes_iv, ivsize);
-        /*
-        print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
-                       key, key_len, 1);
-        print_hex_dump(KERN_ERR, "dec  in: ", DUMP_PREFIX_NONE, 16, 1,
-                       src, src_len, 1);
-        */
-        ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
-        crypto_free_blkcipher(tfm);
-        if (ret < 0) {
-                pr_err("ceph_aes_decrypt failed %d\n", ret);
-                return ret;
-        }
-        if (src_len <= *dst_len)
-                last_byte = ((char *)dst)[src_len - 1];
-        else
-                last_byte = pad[src_len - *dst_len - 1];
-        if (last_byte <= 16 && src_len >= last_byte) {
-                *dst_len = src_len - last_byte;
-        } else {
-                pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
-                       last_byte, (int)src_len);
-                return -EPERM;  /* bad padding */
-        }
-        /*
-        print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
-                       dst, *dst_len, 1);
-        */
-        return 0;
-}
-static int ceph_aes_decrypt2(const void *key, int key_len,
-                             void *dst1, size_t *dst1_len,
-                             void *dst2, size_t *dst2_len,
-                             const void *src, size_t src_len)
-{
-        struct scatterlist sg_in[1], sg_out[3];
-        struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
-        struct blkcipher_desc desc = { .tfm = tfm };
-        char pad[16];
-        void *iv;
-        int ivsize;
-        int ret;
-        int last_byte;
-        if (IS_ERR(tfm))
-                return PTR_ERR(tfm);
-        sg_init_table(sg_in, 1);
-        sg_set_buf(sg_in, src, src_len);
-        sg_init_table(sg_out, 3);
-        sg_set_buf(&sg_out[0], dst1, *dst1_len);
-        sg_set_buf(&sg_out[1], dst2, *dst2_len);
-        sg_set_buf(&sg_out[2], pad, sizeof(pad));
-        crypto_blkcipher_setkey((void *)tfm, key, key_len);
-        iv = crypto_blkcipher_crt(tfm)->iv;
-        ivsize = crypto_blkcipher_ivsize(tfm);
-        memcpy(iv, aes_iv, ivsize);
-        /*
-        print_hex_dump(KERN_ERR, "dec  key: ", DUMP_PREFIX_NONE, 16, 1,
-                       key, key_len, 1);
-        print_hex_dump(KERN_ERR, "dec   in: ", DUMP_PREFIX_NONE, 16, 1,
-                       src, src_len, 1);
-        */
-        ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
-        crypto_free_blkcipher(tfm);
-        if (ret < 0) {
-                pr_err("ceph_aes_decrypt failed %d\n", ret);
-                return ret;
-        }
-        if (src_len <= *dst1_len)
-                last_byte = ((char *)dst1)[src_len - 1];
-        else if (src_len <= *dst1_len + *dst2_len)
-                last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
-        else
-                last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
-        if (last_byte <= 16 && src_len >= last_byte) {
-                src_len -= last_byte;
-        } else {
-                pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
-                       last_byte, (int)src_len);
-                return -EPERM;  /* bad padding */
-        }
-        if (src_len < *dst1_len) {
-                *dst1_len = src_len;
-                *dst2_len = 0;
-        } else {
-                *dst2_len = src_len - *dst1_len;
-        }
-        /*
-        print_hex_dump(KERN_ERR, "dec  out1: ", DUMP_PREFIX_NONE, 16, 1,
-                       dst1, *dst1_len, 1);
-        print_hex_dump(KERN_ERR, "dec  out2: ", DUMP_PREFIX_NONE, 16, 1,
-                       dst2, *dst2_len, 1);
-        */
-        return 0;
-}
-int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
-                 const void *src, size_t src_len)
-{
-        switch (secret->type) {
-        case CEPH_CRYPTO_NONE:
-                if (*dst_len < src_len)
-                        return -ERANGE;
-                memcpy(dst, src, src_len);
-                *dst_len = src_len;
-                return 0;
-        case CEPH_CRYPTO_AES:
-                return ceph_aes_decrypt(secret->key, secret->len, dst,
-                                        dst_len, src, src_len);
-        default:
-                return -EINVAL;
-        }
-}
-int ceph_decrypt2(struct ceph_crypto_key *secret,
-                        void *dst1, size_t *dst1_len,
-                        void *dst2, size_t *dst2_len,
-                        const void *src, size_t src_len)
-{
-        size_t t;
-        switch (secret->type) {
-        case CEPH_CRYPTO_NONE:
-                if (*dst1_len + *dst2_len < src_len)
-                        return -ERANGE;
-                t = min(*dst1_len, src_len);
-                memcpy(dst1, src, t);
-                *dst1_len = t;
-                src += t;
-                src_len -= t;
-                if (src_len) {
-                        t = min(*dst2_len, src_len);
-                        memcpy(dst2, src, t);
-                        *dst2_len = t;
-                }
-                return 0;
-        case CEPH_CRYPTO_AES:
-                return ceph_aes_decrypt2(secret->key, secret->len,
-                                         dst1, dst1_len, dst2, dst2_len,
-                                         src, src_len);
-        default:
-                return -EINVAL;
-        }
-}
-int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
-                 const void *src, size_t src_len)
-{
-        switch (secret->type) {
-        case CEPH_CRYPTO_NONE:
-                if (*dst_len < src_len)
-                        return -ERANGE;
-                memcpy(dst, src, src_len);
-                *dst_len = src_len;
-                return 0;
-        case CEPH_CRYPTO_AES:
-                return ceph_aes_encrypt(secret->key, secret->len, dst,
-                                        dst_len, src, src_len);
-        default:
-                return -EINVAL;
-        }
-}
-int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
-                  const void *src1, size_t src1_len,
-                  const void *src2, size_t src2_len)
-{
-        switch (secret->type) {
-        case CEPH_CRYPTO_NONE:
-                if (*dst_len < src1_len + src2_len)
-                        return -ERANGE;
-                memcpy(dst, src1, src1_len);
-                memcpy(dst + src1_len, src2, src2_len);
-                *dst_len = src1_len + src2_len;
-                return 0;
-        case CEPH_CRYPTO_AES:
-                return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
-                                         src1, src1_len, src2, src2_len);
-        default:
-                return -EINVAL;
-        }
-}
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
deleted file mode 100644
index bdf38607323c..000000000000
--- a/fs/ceph/crypto.h
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifndef _FS_CEPH_CRYPTO_H
-#define _FS_CEPH_CRYPTO_H
-#include "types.h"
-#include "buffer.h"
-/*
- * cryptographic secret
- */
-struct ceph_crypto_key {
-        int type;
-        struct ceph_timespec created;
-        int len;
-        void *key;
-};
-static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
-{
-        kfree(key->key);
-}
-extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
-                                  void **p, void *end);
-extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
-                                  void **p, void *end);
-extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
-/* crypto.c */
-extern int ceph_decrypt(struct ceph_crypto_key *secret,
-                        void *dst, size_t *dst_len,
-                        const void *src, size_t src_len);
-extern int ceph_encrypt(struct ceph_crypto_key *secret,
-                        void *dst, size_t *dst_len,
-                        const void *src, size_t src_len);
-extern int ceph_decrypt2(struct ceph_crypto_key *secret,
-                        void *dst1, size_t *dst1_len,
-                        void *dst2, size_t *dst2_len,
-                        const void *src, size_t src_len);
-extern int ceph_encrypt2(struct ceph_crypto_key *secret,
-                         void *dst, size_t *dst_len,
-                         const void *src1, size_t src1_len,
-                         const void *src2, size_t src2_len);
-/* armor.c */
-extern int ceph_armor(char *dst, const char *src, const char *end);
-extern int ceph_unarmor(char *dst, const char *src, const char *end);
-#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 6fd8b20a8611..08f65faac112 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 #include <linux/device.h>
 #include <linux/slab.h>
@@ -7,143 +7,49 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
 #include "super.h"
-#include "mds_client.h"
-#include "mon_client.h"
-#include "auth.h"
 #ifdef CONFIG_DEBUG_FS
-/*
+#include "mds_client.h"
- * Implement /sys/kernel/debug/ceph fun
- *
- * /sys/kernel/debug/ceph/client*  - an instance of the ceph client
- *      .../osdmap      - current osdmap
- *      .../mdsmap      - current mdsmap
- *      .../monmap      - current monmap
- *      .../osdc        - active osd requests
- *      .../mdsc        - active mds requests
- *      .../monc        - mon client state
- *      .../dentry_lru  - dump contents of dentry lru
- *      .../caps        - expose cap (reservation) stats
- *      .../bdi         - symlink to ../../bdi/something
- */
-static struct dentry *ceph_debugfs_dir;
-static int monmap_show(struct seq_file *s, void *p)
-{
-        int i;
-        struct ceph_client *client = s->private;
-        if (client->monc.monmap == NULL)
-                return 0;
-        seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
-        for (i = 0; i < client->monc.monmap->num_mon; i++) {
-                struct ceph_entity_inst *inst =
-                        &client->monc.monmap->mon_inst[i];
-                seq_printf(s, "\t%s%lld\t%s\n",
-                           ENTITY_NAME(inst->name),
-                           pr_addr(&inst->addr.in_addr));
-        }
-        return 0;
-}
 static int mdsmap_show(struct seq_file *s, void *p)
 {
        int i;
-        struct ceph_client *client = s->private;
+        struct ceph_fs_client *fsc = s->private;
-        if (client->mdsc.mdsmap == NULL)
+        if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL)
                return 0;
-        seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
+        seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch);
-        seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
+        seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root);
        seq_printf(s, "session_timeout %d\n",
-                       client->mdsc.mdsmap->m_session_timeout);
+                       fsc->mdsc->mdsmap->m_session_timeout);
        seq_printf(s, "session_autoclose %d\n",
-                       client->mdsc.mdsmap->m_session_autoclose);
+                       fsc->mdsc->mdsmap->m_session_autoclose);
-        for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
+        for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) {
                struct ceph_entity_addr *addr =
-                        &client->mdsc.mdsmap->m_info[i].addr;
+                        &fsc->mdsc->mdsmap->m_info[i].addr;
-                int state = client->mdsc.mdsmap->m_info[i].state;
+                int state = fsc->mdsc->mdsmap->m_info[i].state;
-                seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
+                seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
+                               ceph_pr_addr(&addr->in_addr),
                               ceph_mds_state_name(state));
        }
        return 0;
 }
-static int osdmap_show(struct seq_file *s, void *p)
+/*
-{
+ * mdsc debugfs
-        int i;
+ */
-        struct ceph_client *client = s->private;
-        struct rb_node *n;
-        if (client->osdc.osdmap == NULL)
-                return 0;
-        seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
-        seq_printf(s, "flags%s%s\n",
-                   (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
-                   " NEARFULL" : "",
-                   (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
-                   " FULL" : "");
-        for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
-                struct ceph_pg_pool_info *pool =
-                        rb_entry(n, struct ceph_pg_pool_info, node);
-                seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
-                           pool->id, pool->v.pg_num, pool->pg_num_mask,
-                           pool->v.lpg_num, pool->lpg_num_mask);
-        }
-        for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
-                struct ceph_entity_addr *addr =
-                        &client->osdc.osdmap->osd_addr[i];
-                int state = client->osdc.osdmap->osd_state[i];
-                char sb[64];
-                seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
-                           i, pr_addr(&addr->in_addr),
-                           ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
-                           ceph_osdmap_state_str(sb, sizeof(sb), state));
-        }
-        return 0;
-}
-static int monc_show(struct seq_file *s, void *p)
-{
-        struct ceph_client *client = s->private;
-        struct ceph_mon_generic_request *req;
-        struct ceph_mon_client *monc = &client->monc;
-        struct rb_node *rp;
-        mutex_lock(&monc->mutex);
-        if (monc->have_mdsmap)
-                seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
-        if (monc->have_osdmap)
-                seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
-        if (monc->want_next_osdmap)
-                seq_printf(s, "want next osdmap\n");
-        for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
-                __u16 op;
-                req = rb_entry(rp, struct ceph_mon_generic_request, node);
-                op = le16_to_cpu(req->request->hdr.type);
-                if (op == CEPH_MSG_STATFS)
-                        seq_printf(s, "%lld statfs\n", req->tid);
-                else
-                        seq_printf(s, "%lld unknown\n", req->tid);
-        }
-        mutex_unlock(&monc->mutex);
-        return 0;
-}
 static int mdsc_show(struct seq_file *s, void *p)
 {
-        struct ceph_client *client = s->private;
+        struct ceph_fs_client *fsc = s->private;
-        struct ceph_mds_client *mdsc = &client->mdsc;
+        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct ceph_mds_request *req;
        struct rb_node *rp;
        int pathlen;
@@ -154,10 +60,13 @@ static int mdsc_show(struct seq_file *s, void *p)
        for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
                req = rb_entry(rp, struct ceph_mds_request, r_node);
-                if (req->r_request)
+                if (req->r_request && req->r_session)
-                        seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds);
+                        seq_printf(s, "%lld\tmds%d\t", req->r_tid,
-                else
+                                   req->r_session->s_mds);
+                else if (!req->r_request)
                        seq_printf(s, "%lld\t(no request)\t", req->r_tid);
+                else
+                        seq_printf(s, "%lld\t(no session)\t", req->r_tid);
                seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
@@ -214,61 +123,12 @@ static int mdsc_show(struct seq_file *s, void *p)
        return 0;
 }
-static int osdc_show(struct seq_file *s, void *pp)
-{
-        struct ceph_client *client = s->private;
-        struct ceph_osd_client *osdc = &client->osdc;
-        struct rb_node *p;
-        mutex_lock(&osdc->request_mutex);
-        for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
-                struct ceph_osd_request *req;
-                struct ceph_osd_request_head *head;
-                struct ceph_osd_op *op;
-                int num_ops;
-                int opcode, olen;
-                int i;
-                req = rb_entry(p, struct ceph_osd_request, r_node);
-                seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
-                           req->r_osd ? req->r_osd->o_osd : -1,
-                           le32_to_cpu(req->r_pgid.pool),
-                           le16_to_cpu(req->r_pgid.ps));
-                head = req->r_request->front.iov_base;
-                op = (void *)(head + 1);
-                num_ops = le16_to_cpu(head->num_ops);
-                olen = le32_to_cpu(head->object_len);
-                seq_printf(s, "%.*s", olen,
-                           (const char *)(head->ops + num_ops));
-                if (req->r_reassert_version.epoch)
-                        seq_printf(s, "\t%u'%llu",
-                           (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
-                           le64_to_cpu(req->r_reassert_version.version));
-                else
-                        seq_printf(s, "\t");
-                for (i = 0; i < num_ops; i++) {
-                        opcode = le16_to_cpu(op->op);
-                        seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
-                        op++;
-                }
-                seq_printf(s, "\n");
-        }
-        mutex_unlock(&osdc->request_mutex);
-        return 0;
-}
 static int caps_show(struct seq_file *s, void *p)
 {
-        struct ceph_client *client = s->private;
+        struct ceph_fs_client *fsc = s->private;
        int total, avail, used, reserved, min;
-        ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
+        ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
        seq_printf(s, "total\t\t%d\n"
                   "avail\t\t%d\n"
                   "used\t\t%d\n"
@@ -280,8 +140,8 @@ static int caps_show(struct seq_file *s, void *p)
 static int dentry_lru_show(struct seq_file *s, void *ptr)
 {
-        struct ceph_client *client = s->private;
+        struct ceph_fs_client *fsc = s->private;
-        struct ceph_mds_client *mdsc = &client->mdsc;
+        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct ceph_dentry_info *di;
        spin_lock(&mdsc->dentry_lru_lock);
@@ -295,199 +155,124 @@ static int dentry_lru_show(struct seq_file *s, void *ptr)
        return 0;
 }
-#define DEFINE_SHOW_FUNC(name)                                          \
+CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
-static int name##_open(struct inode *inode, struct file *file)          \
+CEPH_DEFINE_SHOW_FUNC(mdsc_show)
-{                                                                       \
+CEPH_DEFINE_SHOW_FUNC(caps_show)
-        struct seq_file *sf;                                            \
+CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
-        int ret;                                                        \
-                                                                        \
-        ret = single_open(file, name, NULL);                            \
-        sf = file->private_data;                                        \
-        sf->private = inode->i_private;                                 \
-        return ret;                                                     \
-}                                                                       \
-                                                                        \
-static const struct file_operations name##_fops = {                     \
-        .open           = name##_open,                                  \
-        .read           = seq_read,                                     \
-        .llseek         = seq_lseek,                                    \
-        .release        = single_release,                               \
-};
-DEFINE_SHOW_FUNC(monmap_show)
-DEFINE_SHOW_FUNC(mdsmap_show)
-DEFINE_SHOW_FUNC(osdmap_show)
-DEFINE_SHOW_FUNC(monc_show)
-DEFINE_SHOW_FUNC(mdsc_show)
-DEFINE_SHOW_FUNC(osdc_show)
-DEFINE_SHOW_FUNC(dentry_lru_show)
-DEFINE_SHOW_FUNC(caps_show)
+/*
+ * debugfs
+ */
 static int congestion_kb_set(void *data, u64 val)
 {
-        struct ceph_client *client = (struct ceph_client *)data;
+        struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
-        if (client)
-                client->mount_args->congestion_kb = (int)val;
+        fsc->mount_options->congestion_kb = (int)val;
        return 0;
 }
 static int congestion_kb_get(void *data, u64 *val)
 {
-        struct ceph_client *client = (struct ceph_client *)data;
+        struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
-        if (client)
-                *val = (u64)client->mount_args->congestion_kb;
+        *val = (u64)fsc->mount_options->congestion_kb;
        return 0;
 }
 DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
                        congestion_kb_set, "%llu\n");
-int __init ceph_debugfs_init(void)
-{
-        ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
-        if (!ceph_debugfs_dir)
-                return -ENOMEM;
-        return 0;
-}
-void ceph_debugfs_cleanup(void)
+void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
 {
-        debugfs_remove(ceph_debugfs_dir);
+        dout("ceph_fs_debugfs_cleanup\n");
+        debugfs_remove(fsc->debugfs_bdi);
+        debugfs_remove(fsc->debugfs_congestion_kb);
+        debugfs_remove(fsc->debugfs_mdsmap);
+        debugfs_remove(fsc->debugfs_caps);
+        debugfs_remove(fsc->debugfs_mdsc);
+        debugfs_remove(fsc->debugfs_dentry_lru);
 }
-int ceph_debugfs_client_init(struct ceph_client *client)
+int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 {
-        int ret = 0;
+        char name[100];
-        char name[80];
+        int err = -ENOMEM;
-        snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
-                 client->monc.auth->global_id);
-        client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
-        if (!client->debugfs_dir)
-                goto out;
-        client->monc.debugfs_file = debugfs_create_file("monc",
+        dout("ceph_fs_debugfs_init\n");
-                                                      0600,
+        fsc->debugfs_congestion_kb =
-                                                      client->debugfs_dir,
+                debugfs_create_file("writeback_congestion_kb",
-                                                      client,
+                                    0600,
-                                                      &monc_show_fops);
+                                    fsc->client->debugfs_dir,
-        if (!client->monc.debugfs_file)
+                                    fsc,
-                goto out;
+                                    &congestion_kb_fops);
+        if (!fsc->debugfs_congestion_kb)
-        client->mdsc.debugfs_file = debugfs_create_file("mdsc",
-                                                      0600,
-                                                      client->debugfs_dir,
-                                                      client,
-                                                      &mdsc_show_fops);
-        if (!client->mdsc.debugfs_file)
                goto out;
-        client->osdc.debugfs_file = debugfs_create_file("osdc",
+        dout("a\n");
-                                                      0600,
-                                                      client->debugfs_dir,
-                                                      client,
-                                                      &osdc_show_fops);
-        if (!client->osdc.debugfs_file)
-                goto out;
-        client->debugfs_monmap = debugfs_create_file("monmap",
+        snprintf(name, sizeof(name), "../../bdi/%s",
-                                        0600,
+                 dev_name(fsc->backing_dev_info.dev));
-                                        client->debugfs_dir,
+        fsc->debugfs_bdi =
-                                        client,
+                debugfs_create_symlink("bdi",
-                                        &monmap_show_fops);
+                                       fsc->client->debugfs_dir,
-        if (!client->debugfs_monmap)
+                                       name);
+        if (!fsc->debugfs_bdi)
                goto out;
-        client->debugfs_mdsmap = debugfs_create_file("mdsmap",
+        dout("b\n");
+        fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
                                        0600,
-                                        client->debugfs_dir,
+                                        fsc->client->debugfs_dir,
-                                        client,
+                                        fsc,
                                        &mdsmap_show_fops);
-        if (!client->debugfs_mdsmap)
+        if (!fsc->debugfs_mdsmap)
                goto out;
-        client->debugfs_osdmap = debugfs_create_file("osdmap",
+        dout("ca\n");
-                                        0600,
+        fsc->debugfs_mdsc = debugfs_create_file("mdsc",
-                                        client->debugfs_dir,
+                                                0600,
-                                        client,
+                                                fsc->client->debugfs_dir,
-                                        &osdmap_show_fops);
+                                                fsc,
-        if (!client->debugfs_osdmap)
+                                                &mdsc_show_fops);
+        if (!fsc->debugfs_mdsc)
                goto out;
-        client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
+        dout("da\n");
-                                        0600,
+        fsc->debugfs_caps = debugfs_create_file("caps",
-                                        client->debugfs_dir,
-                                        client,
-                                        &dentry_lru_show_fops);
-        if (!client->debugfs_dentry_lru)
-                goto out;
-        client->debugfs_caps = debugfs_create_file("caps",
                                                   0400,
-                                                   client->debugfs_dir,
+                                                   fsc->client->debugfs_dir,
-                                                   client,
+                                                   fsc,
                                                   &caps_show_fops);
-        if (!client->debugfs_caps)
+        if (!fsc->debugfs_caps)
                goto out;
-        client->debugfs_congestion_kb =
+        dout("ea\n");
-                debugfs_create_file("writeback_congestion_kb",
+        fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
-                                    0600,
+                                        0600,
-                                    client->debugfs_dir,
+                                        fsc->client->debugfs_dir,
-                                    client,
+                                        fsc,
-                                    &congestion_kb_fops);
+                                        &dentry_lru_show_fops);
-        if (!client->debugfs_congestion_kb)
+        if (!fsc->debugfs_dentry_lru)
                goto out;
-        sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
-        client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
-                                                     name);
        return 0;
 out:
-        ceph_debugfs_client_cleanup(client);
+        ceph_fs_debugfs_cleanup(fsc);
-        return ret;
+        return err;
 }
-void ceph_debugfs_client_cleanup(struct ceph_client *client)
-{
-        debugfs_remove(client->debugfs_bdi);
-        debugfs_remove(client->debugfs_caps);
-        debugfs_remove(client->debugfs_dentry_lru);
-        debugfs_remove(client->debugfs_osdmap);
-        debugfs_remove(client->debugfs_mdsmap);
-        debugfs_remove(client->debugfs_monmap);
-        debugfs_remove(client->osdc.debugfs_file);
-        debugfs_remove(client->mdsc.debugfs_file);
-        debugfs_remove(client->monc.debugfs_file);
-        debugfs_remove(client->debugfs_congestion_kb);
-        debugfs_remove(client->debugfs_dir);
-}
 #else  /* CONFIG_DEBUG_FS */
-int __init ceph_debugfs_init(void)
+int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
-{
-        return 0;
-}
-void ceph_debugfs_cleanup(void)
-{
-}
-int ceph_debugfs_client_init(struct ceph_client *client)
 {
        return 0;
 }
-void ceph_debugfs_client_cleanup(struct ceph_client *client)
+void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
 {
 }
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
deleted file mode 100644
index 3d25415afe63..000000000000
--- a/fs/ceph/decode.h
+++ /dev/null
@@ -1,196 +0,0 @@
-#ifndef __CEPH_DECODE_H
-#define __CEPH_DECODE_H
-#include <asm/unaligned.h>
-#include <linux/time.h>
-#include "types.h"
-/*
- * in all cases,
- *   void **p     pointer to position pointer
- *   void *end    pointer to end of buffer (last byte + 1)
- */
-static inline u64 ceph_decode_64(void **p)
-{
-        u64 v = get_unaligned_le64(*p);
-        *p += sizeof(u64);
-        return v;
-}
-static inline u32 ceph_decode_32(void **p)
-{
-        u32 v = get_unaligned_le32(*p);
-        *p += sizeof(u32);
-        return v;
-}
-static inline u16 ceph_decode_16(void **p)
-{
-        u16 v = get_unaligned_le16(*p);
-        *p += sizeof(u16);
-        return v;
-}
-static inline u8 ceph_decode_8(void **p)
-{
-        u8 v = *(u8 *)*p;
-        (*p)++;
-        return v;
-}
-static inline void ceph_decode_copy(void **p, void *pv, size_t n)
-{
-        memcpy(pv, *p, n);
-        *p += n;
-}
-/*
- * bounds check input.
- */
-#define ceph_decode_need(p, end, n, bad)                \
-        do {                                            \
-                if (unlikely(*(p) + (n) > (end)))       \
-                        goto bad;                       \
-        } while (0)
-#define ceph_decode_64_safe(p, end, v, bad)                     \
-        do {                                                    \
-                ceph_decode_need(p, end, sizeof(u64), bad);     \
-                v = ceph_decode_64(p);                          \
-        } while (0)
-#define ceph_decode_32_safe(p, end, v, bad)                     \
-        do {                                                    \
-                ceph_decode_need(p, end, sizeof(u32), bad);     \
-                v = ceph_decode_32(p);                          \
-        } while (0)
-#define ceph_decode_16_safe(p, end, v, bad)                     \
-        do {                                                    \
-                ceph_decode_need(p, end, sizeof(u16), bad);     \
-                v = ceph_decode_16(p);                          \
-        } while (0)
-#define ceph_decode_8_safe(p, end, v, bad)                      \
-        do {                                                    \
-                ceph_decode_need(p, end, sizeof(u8), bad);      \
-                v = ceph_decode_8(p);                           \
-        } while (0)
-#define ceph_decode_copy_safe(p, end, pv, n, bad)               \
-        do {                                                    \
-                ceph_decode_need(p, end, n, bad);               \
-                ceph_decode_copy(p, pv, n);                     \
-        } while (0)
-/*
- * struct ceph_timespec <-> struct timespec
- */
-static inline void ceph_decode_timespec(struct timespec *ts,
-                                        const struct ceph_timespec *tv)
-{
-        ts->tv_sec = le32_to_cpu(tv->tv_sec);
-        ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
-}
-static inline void ceph_encode_timespec(struct ceph_timespec *tv,
-                                        const struct timespec *ts)
-{
-        tv->tv_sec = cpu_to_le32(ts->tv_sec);
-        tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
-}
-/*
- * sockaddr_storage <-> ceph_sockaddr
- */
-static inline void ceph_encode_addr(struct ceph_entity_addr *a)
-{
-        __be16 ss_family = htons(a->in_addr.ss_family);
-        a->in_addr.ss_family = *(__u16 *)&ss_family;
-}
-static inline void ceph_decode_addr(struct ceph_entity_addr *a)
-{
-        __be16 ss_family = *(__be16 *)&a->in_addr.ss_family;
-        a->in_addr.ss_family = ntohs(ss_family);
-        WARN_ON(a->in_addr.ss_family == 512);
-}
-/*
- * encoders
- */
-static inline void ceph_encode_64(void **p, u64 v)
-{
-        put_unaligned_le64(v, (__le64 *)*p);
-        *p += sizeof(u64);
-}
-static inline void ceph_encode_32(void **p, u32 v)
-{
-        put_unaligned_le32(v, (__le32 *)*p);
-        *p += sizeof(u32);
-}
-static inline void ceph_encode_16(void **p, u16 v)
-{
-        put_unaligned_le16(v, (__le16 *)*p);
-        *p += sizeof(u16);
-}
-static inline void ceph_encode_8(void **p, u8 v)
-{
-        *(u8 *)*p = v;
-        (*p)++;
-}
-static inline void ceph_encode_copy(void **p, const void *s, int len)
-{
-        memcpy(*p, s, len);
-        *p += len;
-}
-/*
- * filepath, string encoders
- */
-static inline void ceph_encode_filepath(void **p, void *end,
-                                        u64 ino, const char *path)
-{
-        u32 len = path ? strlen(path) : 0;
-        BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
-        ceph_encode_8(p, 1);
-        ceph_encode_64(p, ino);
-        ceph_encode_32(p, len);
-        if (len)
-                memcpy(*p, path, len);
-        *p += len;
-}
-static inline void ceph_encode_string(void **p, void *end,
-                                      const char *s, u32 len)
-{
-        BUG_ON(*p + sizeof(len) + len > end);
-        ceph_encode_32(p, len);
-        if (len)
-                memcpy(*p, s, len);
-        *p += len;
-}
-#define ceph_encode_need(p, end, n, bad)                \
-        do {                                            \
-                if (unlikely(*(p) + (n) > (end)))       \
-                        goto bad;                       \
-        } while (0)
-#define ceph_encode_64_safe(p, end, v, bad)                     \
-        do {                                                    \
-                ceph_encode_need(p, end, sizeof(u64), bad);     \
-                ceph_encode_64(p, v);                           \
-        } while (0)
-#define ceph_encode_32_safe(p, end, v, bad)                     \
-        do {                                                    \
-                ceph_encode_need(p, end, sizeof(u32), bad);     \
-                ceph_encode_32(p, v);                   \
-        } while (0)
-#define ceph_encode_16_safe(p, end, v, bad)                     \
-        do {                                                    \
-                ceph_encode_need(p, end, sizeof(u16), bad);     \
-                ceph_encode_16(p, v);                   \
-        } while (0)
-#define ceph_encode_copy_safe(p, end, pv, n, bad)               \
-        do {                                                    \
-                ceph_encode_need(p, end, n, bad);               \
-                ceph_encode_copy(p, pv, n);                     \
-        } while (0)
-#endif
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index a1986eb52045..0bc68de8edd7 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 #include <linux/spinlock.h>
 #include <linux/fs_struct.h>
@@ -7,6 +7,7 @@
 #include <linux/sched.h>
 #include "super.h"
+#include "mds_client.h"
 /*
 * Directory operations: readdir, lookup, create, link, unlink,
@@ -39,12 +40,13 @@ int ceph_init_dentry(struct dentry *dentry)
        if (dentry->d_fsdata)
                return 0;
-        if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
+        if (dentry->d_parent == NULL ||   /* nfs fh_to_dentry */
-                dentry->d_op = &ceph_dentry_ops;
+            ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
+                d_set_d_op(dentry, &ceph_dentry_ops);
        else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
-                dentry->d_op = &ceph_snapdir_dentry_ops;
+                d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
        else
-                dentry->d_op = &ceph_snap_dentry_ops;
+                d_set_d_op(dentry, &ceph_snap_dentry_ops);
        di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
        if (!di)
@@ -94,10 +96,7 @@ static unsigned fpos_off(loff_t p)
 */
 static int __dcache_readdir(struct file *filp,
                            void *dirent, filldir_t filldir)
-                __releases(inode->i_lock)
-                __acquires(inode->i_lock)
 {
-        struct inode *inode = filp->f_dentry->d_inode;
        struct ceph_file_info *fi = filp->private_data;
        struct dentry *parent = filp->f_dentry;
        struct inode *dir = parent->d_inode;
@@ -113,11 +112,11 @@ static int __dcache_readdir(struct file *filp,
        dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
             last);
-        spin_lock(&dcache_lock);
+        spin_lock(&parent->d_lock);
        /* start at beginning? */
-        if (filp->f_pos == 2 || (last &&
+        if (filp->f_pos == 2 || last == NULL ||
-                                 filp->f_pos < ceph_dentry(last)->offset)) {
+            filp->f_pos < ceph_dentry(last)->offset) {
                if (list_empty(&parent->d_subdirs))
                        goto out_unlock;
                p = parent->d_subdirs.prev;
@@ -137,6 +136,7 @@ more:
                        fi->at_end = 1;
                        goto out_unlock;
                }
+                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
                if (!d_unhashed(dentry) && dentry->d_inode &&
                    ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
                    ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
@@ -146,14 +146,15 @@ more:
                     dentry->d_name.len, dentry->d_name.name, di->offset,
                     filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
                     !dentry->d_inode ? " null" : "");
+                spin_unlock(&dentry->d_lock);
                p = p->prev;
                dentry = list_entry(p, struct dentry, d_u.d_child);
                di = ceph_dentry(dentry);
        }
-        atomic_inc(&dentry->d_count);
+        dget_dlock(dentry);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&dentry->d_lock);
-        spin_unlock(&inode->i_lock);
+        spin_unlock(&parent->d_lock);
        dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
             dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
@@ -171,35 +172,30 @@ more:
                } else {
                        dput(last);
                }
-                last = NULL;
        }
-        spin_lock(&inode->i_lock);
-        spin_lock(&dcache_lock);
        last = dentry;
        if (err < 0)
-                goto out_unlock;
+                goto out;
-        p = p->prev;
        filp->f_pos++;
-        /* make sure a dentry wasn't dropped while we didn't have dcache_lock */
+        /* make sure a dentry wasn't dropped while we didn't have parent lock */
-        if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
+        if (!ceph_i_test(dir, CEPH_I_COMPLETE)) {
-                goto more;
+                dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
-        dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
+                err = -EAGAIN;
-        err = -EAGAIN;
+                goto out;
+        }
-out_unlock:
+        spin_lock(&parent->d_lock);
-        spin_unlock(&dcache_lock);
+        p = p->prev;    /* advance to next dentry */
+        goto more;
-        if (last) {
+out_unlock:
-                spin_unlock(&inode->i_lock);
+        spin_unlock(&parent->d_lock);
+out:
+        if (last)
                dput(last);
-                spin_lock(&inode->i_lock);
-        }
        return err;
 }
@@ -227,15 +223,15 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
        struct ceph_file_info *fi = filp->private_data;
        struct inode *inode = filp->f_dentry->d_inode;
        struct ceph_inode_info *ci = ceph_inode(inode);
-        struct ceph_client *client = ceph_inode_to_client(inode);
+        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-        struct ceph_mds_client *mdsc = &client->mdsc;
+        struct ceph_mds_client *mdsc = fsc->mdsc;
        unsigned frag = fpos_frag(filp->f_pos);
        int off = fpos_off(filp->f_pos);
        int err;
        u32 ftype;
        struct ceph_mds_reply_info_parsed *rinfo;
-        const int max_entries = client->mount_args->max_readdir;
+        const int max_entries = fsc->mount_options->max_readdir;
-        const int max_bytes = client->mount_args->max_readdir_bytes;
+        const int max_bytes = fsc->mount_options->max_readdir_bytes;
        dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
        if (fi->at_end)
@@ -267,17 +263,17 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
        /* can we use the dcache? */
        spin_lock(&inode->i_lock);
        if ((filp->f_pos == 2 || fi->dentry) &&
-            !ceph_test_opt(client, NOASYNCREADDIR) &&
+            !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
            ceph_snap(inode) != CEPH_SNAPDIR &&
            (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
            __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
+                spin_unlock(&inode->i_lock);
                err = __dcache_readdir(filp, dirent, filldir);
-                if (err != -EAGAIN) {
+                if (err != -EAGAIN)
-                        spin_unlock(&inode->i_lock);
                        return err;
-                }
+        } else {
+                spin_unlock(&inode->i_lock);
        }
-        spin_unlock(&inode->i_lock);
        if (fi->dentry) {
                err = note_last_dentry(fi, fi->dentry->d_name.name,
                                       fi->dentry->d_name.len);
@@ -344,7 +340,10 @@ more:
                if (req->r_reply_info.dir_end) {
                        kfree(fi->last_name);
                        fi->last_name = NULL;
-                        fi->next_offset = 2;
+                        if (ceph_frag_is_rightmost(frag))
+                                fi->next_offset = 2;
+                        else
+                                fi->next_offset = 0;
                } else {
                        rinfo = &req->r_reply_info;
                        err = note_last_dentry(fi,
@@ -363,18 +362,22 @@ more:
                u64 pos = ceph_make_fpos(frag, off);
                struct ceph_mds_reply_inode *in =
                        rinfo->dir_in[off - fi->offset].in;
+                struct ceph_vino vino;
+                ino_t ino;
                dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
                     off, off - fi->offset, rinfo->dir_nr, pos,
                     rinfo->dir_dname_len[off - fi->offset],
                     rinfo->dir_dname[off - fi->offset], in);
                BUG_ON(!in);
                ftype = le32_to_cpu(in->mode) >> 12;
+                vino.ino = le64_to_cpu(in->ino);
+                vino.snap = le64_to_cpu(in->snapid);
+                ino = ceph_vino_to_ino(vino);
                if (filldir(dirent,
                            rinfo->dir_dname[off - fi->offset],
                            rinfo->dir_dname_len[off - fi->offset],
-                            pos,
+                            pos, ino, ftype) < 0) {
-                            le64_to_cpu(in->ino),
-                            ftype) < 0) {
                        dout("filldir stopping us...\n");
                        return 0;
                }
@@ -422,6 +425,7 @@ static void reset_readdir(struct ceph_file_info *fi)
                fi->last_readdir = NULL;
        }
        kfree(fi->last_name);
+        fi->last_name = NULL;
        fi->next_offset = 2;  /* compensate for . and .. */
        if (fi->dentry) {
                dput(fi->dentry);
@@ -487,14 +491,13 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
 struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
                                  struct dentry *dentry, int err)
 {
-        struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
+        struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
        struct inode *parent = dentry->d_parent->d_inode;
        /* .snap dir? */
        if (err == -ENOENT &&
-            ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
            strcmp(dentry->d_name.name,
-                   client->mount_args->snapdir_name) == 0) {
+                   fsc->mount_options->snapdir_name) == 0) {
                struct inode *inode = ceph_get_snapdir(parent);
                dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
                     dentry, dentry->d_name.len, dentry->d_name.name, inode);
@@ -539,8 +542,8 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
 static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
                                  struct nameidata *nd)
 {
-        struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+        struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
-        struct ceph_mds_client *mdsc = &client->mdsc;
+        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct ceph_mds_request *req;
        int op;
        int err;
@@ -572,7 +575,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
                spin_lock(&dir->i_lock);
                dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
                if (strncmp(dentry->d_name.name,
-                            client->mount_args->snapdir_name,
+                            fsc->mount_options->snapdir_name,
                            dentry->d_name.len) &&
                    !is_root_ceph_dentry(dir, dentry) &&
                    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
@@ -629,8 +632,8 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
 static int ceph_mknod(struct inode *dir, struct dentry *dentry,
                      int mode, dev_t rdev)
 {
-        struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+        struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
-        struct ceph_mds_client *mdsc = &client->mdsc;
+        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct ceph_mds_request *req;
        int err;
@@ -685,8 +688,8 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
 static int ceph_symlink(struct inode *dir, struct dentry *dentry,
                            const char *dest)
 {
-        struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+        struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
-        struct ceph_mds_client *mdsc = &client->mdsc;
+        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct ceph_mds_request *req;
        int err;
@@ -716,8 +719,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
 static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
-        struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+        struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
-        struct ceph_mds_client *mdsc = &client->mdsc;
+        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct ceph_mds_request *req;
        int err = -EROFS;
        int op;
@@ -758,8 +761,8 @@ out:
 static int ceph_link(struct dentry *old_dentry, struct inode *dir,
                     struct dentry *dentry)
 {
-        struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+        struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
-        struct ceph_mds_client *mdsc = &client->mdsc;
+        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct ceph_mds_request *req;
        int err;
@@ -813,8 +816,8 @@ static int drop_caps_for_unlink(struct inode *inode)
 */
 static int ceph_unlink(struct inode *dir, struct dentry *dentry)
 {
-        struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+        struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
-        struct ceph_mds_client *mdsc = &client->mdsc;
+        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct inode *inode = dentry->d_inode;
        struct ceph_mds_request *req;
        int err = -EROFS;
@@ -854,8 +857,8 @@ out:
 static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
                       struct inode *new_dir, struct dentry *new_dentry)
 {
-        struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
+        struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
-        struct ceph_mds_client *mdsc = &client->mdsc;
+        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct ceph_mds_request *req;
        int err;
@@ -987,7 +990,12 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
 */
 static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-        struct inode *dir = dentry->d_parent->d_inode;
+        struct inode *dir;
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
+        dir = dentry->d_parent->d_inode;
        dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
             dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
@@ -1076,7 +1084,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
        struct ceph_inode_info *ci = ceph_inode(inode);
        int left;
-        if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
+        if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
                return -EISDIR;
        if (!cf->dir_info) {
@@ -1177,7 +1185,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
        dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
             dn->d_name.len, dn->d_name.name);
        if (di) {
-                mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+                mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
                spin_lock(&mdsc->dentry_lru_lock);
                list_add_tail(&di->lru, &mdsc->dentry_lru);
                mdsc->num_dentry++;
@@ -1193,7 +1201,7 @@ void ceph_dentry_lru_touch(struct dentry *dn)
        dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
             dn->d_name.len, dn->d_name.name, di->offset);
        if (di) {
-                mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+                mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
                spin_lock(&mdsc->dentry_lru_lock);
                list_move_tail(&di->lru, &mdsc->dentry_lru);
                spin_unlock(&mdsc->dentry_lru_lock);
@@ -1208,7 +1216,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
        dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
             dn->d_name.len, dn->d_name.name);
        if (di) {
-                mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+                mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
                spin_lock(&mdsc->dentry_lru_lock);
                list_del_init(&di->lru);
                mdsc->num_dentry--;
@@ -1216,6 +1224,26 @@ void ceph_dentry_lru_del(struct dentry *dn)
        }
 }
+/*
+ * Return name hash for a given dentry.  This is dependent on
+ * the parent directory's hash function.
+ */
+unsigned ceph_dentry_hash(struct dentry *dn)
+{
+        struct inode *dir = dn->d_parent->d_inode;
+        struct ceph_inode_info *dci = ceph_inode(dir);
+        switch (dci->i_dir_layout.dl_dir_hash) {
+        case 0: /* for backward compat */
+        case CEPH_STR_HASH_LINUX:
+                return dn->d_name.hash;
+        default:
+                return ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
+                                     dn->d_name.name, dn->d_name.len);
+        }
+}
 const struct file_operations ceph_dir_fops = {
        .read = ceph_read_dir,
        .readdir = ceph_readdir,
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index e38423e82f2e..e41056174bf8 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -1,10 +1,11 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 #include <linux/exportfs.h>
 #include <linux/slab.h>
 #include <asm/unaligned.h>
 #include "super.h"
+#include "mds_client.h"
 /*
 * NFS export support
@@ -58,7 +59,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
                dout("encode_fh %p connectable\n", dentry);
                cfh->ino = ceph_ino(dentry->d_inode);
                cfh->parent_ino = ceph_ino(parent->d_inode);
-                cfh->parent_name_hash = parent->d_name.hash;
+                cfh->parent_name_hash = ceph_dentry_hash(parent);
                *max_len = connected_handle_length;
                type = 2;
        } else if (*max_len >= handle_length) {
@@ -120,7 +121,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
 static struct dentry *__cfh_to_dentry(struct super_block *sb,
                                      struct ceph_nfs_confh *cfh)
 {
-        struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
+        struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
        struct inode *inode;
        struct dentry *dentry;
        struct ceph_vino vino;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 66e4da6dba22..7d0e4a82d898 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1,5 +1,6 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
+#include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/file.h>
@@ -38,8 +39,8 @@
 static struct ceph_mds_request *
 prepare_open_request(struct super_block *sb, int flags, int create_mode)
 {
-        struct ceph_client *client = ceph_sb_to_client(sb);
+        struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
-        struct ceph_mds_client *mdsc = &client->mdsc;
+        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct ceph_mds_request *req;
        int want_auth = USE_ANY_MDS;
        int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
@@ -117,8 +118,8 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
 int ceph_open(struct inode *inode, struct file *file)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
-        struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
+        struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
-        struct ceph_mds_client *mdsc = &client->mdsc;
+        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct ceph_mds_request *req;
        struct ceph_file_info *cf = file->private_data;
        struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
@@ -153,11 +154,13 @@ int ceph_open(struct inode *inode, struct file *file)
        }
        /*
-         * No need to block if we have any caps.  Update wanted set
+         * No need to block if we have caps on the auth MDS (for
+         * write) or any MDS (for read).  Update wanted set
         * asynchronously.
         */
        spin_lock(&inode->i_lock);
-        if (__ceph_is_any_real_caps(ci)) {
+        if (__ceph_is_any_real_caps(ci) &&
+            (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
                int mds_wanted = __ceph_caps_mds_wanted(ci);
                int issued = __ceph_caps_issued(ci, NULL);
@@ -216,8 +219,8 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
                                struct nameidata *nd, int mode,
                                int locked_dir)
 {
-        struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+        struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
-        struct ceph_mds_client *mdsc = &client->mdsc;
+        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct file *file = nd->intent.open.file;
        struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
        struct ceph_mds_request *req;
@@ -270,163 +273,6 @@ int ceph_release(struct inode *inode, struct file *file)
 }
 /*
- * build a vector of user pages
- */
-static struct page **get_direct_page_vector(const char __user *data,
-                                            int num_pages,
-                                            loff_t off, size_t len)
-{
-        struct page **pages;
-        int rc;
-        pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
-        if (!pages)
-                return ERR_PTR(-ENOMEM);
-        down_read(&current->mm->mmap_sem);
-        rc = get_user_pages(current, current->mm, (unsigned long)data,
-                            num_pages, 0, 0, pages, NULL);
-        up_read(&current->mm->mmap_sem);
-        if (rc < 0)
-                goto fail;
-        return pages;
-fail:
-        kfree(pages);
-        return ERR_PTR(rc);
-}
-static void put_page_vector(struct page **pages, int num_pages)
-{
-        int i;
-        for (i = 0; i < num_pages; i++)
-                put_page(pages[i]);
-        kfree(pages);
-}
-void ceph_release_page_vector(struct page **pages, int num_pages)
-{
-        int i;
-        for (i = 0; i < num_pages; i++)
-                __free_pages(pages[i], 0);
-        kfree(pages);
-}
-/*
- * allocate a vector new pages
- */
-static struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
-{
-        struct page **pages;
-        int i;
-        pages = kmalloc(sizeof(*pages) * num_pages, flags);
-        if (!pages)
-                return ERR_PTR(-ENOMEM);
-        for (i = 0; i < num_pages; i++) {
-                pages[i] = __page_cache_alloc(flags);
-                if (pages[i] == NULL) {
-                        ceph_release_page_vector(pages, i);
-                        return ERR_PTR(-ENOMEM);
-                }
-        }
-        return pages;
-}
-/*
- * copy user data into a page vector
- */
-static int copy_user_to_page_vector(struct page **pages,
-                                    const char __user *data,
-                                    loff_t off, size_t len)
-{
-        int i = 0;
-        int po = off & ~PAGE_CACHE_MASK;
-        int left = len;
-        int l, bad;
-        while (left > 0) {
-                l = min_t(int, PAGE_CACHE_SIZE-po, left);
-                bad = copy_from_user(page_address(pages[i]) + po, data, l);
-                if (bad == l)
-                        return -EFAULT;
-                data += l - bad;
-                left -= l - bad;
-                po += l - bad;
-                if (po == PAGE_CACHE_SIZE) {
-                        po = 0;
-                        i++;
-                }
-        }
-        return len;
-}
-/*
- * copy user data from a page vector into a user pointer
- */
-static int copy_page_vector_to_user(struct page **pages, char __user *data,
-                                    loff_t off, size_t len)
-{
-        int i = 0;
-        int po = off & ~PAGE_CACHE_MASK;
-        int left = len;
-        int l, bad;
-        while (left > 0) {
-                l = min_t(int, left, PAGE_CACHE_SIZE-po);
-                bad = copy_to_user(data, page_address(pages[i]) + po, l);
-                if (bad == l)
-                        return -EFAULT;
-                data += l - bad;
-                left -= l - bad;
-                if (po) {
-                        po += l - bad;
-                        if (po == PAGE_CACHE_SIZE)
-                                po = 0;
-                }
-                i++;
-        }
-        return len;
-}
-/*
- * Zero an extent within a page vector.  Offset is relative to the
- * start of the first page.
- */
-static void zero_page_vector_range(int off, int len, struct page **pages)
-{
-        int i = off >> PAGE_CACHE_SHIFT;
-        off &= ~PAGE_CACHE_MASK;
-        dout("zero_page_vector_page %u~%u\n", off, len);
-        /* leading partial page? */
-        if (off) {
-                int end = min((int)PAGE_CACHE_SIZE, off + len);
-                dout("zeroing %d %p head from %d\n", i, pages[i],
-                     (int)off);
-                zero_user_segment(pages[i], off, end);
-                len -= (end - off);
-                i++;
-        }
-        while (len >= PAGE_CACHE_SIZE) {
-                dout("zeroing %d %p len=%d\n", i, pages[i], len);
-                zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
-                len -= PAGE_CACHE_SIZE;
-                i++;
-        }
-        /* trailing partial page? */
-        if (len) {
-                dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
-                zero_user_segment(pages[i], 0, len);
-        }
-}
-/*
 * Read a range of bytes striped over one or more objects.  Iterate over
 * objects we stripe over.  (That's not atomic, but good enough for now.)
 *
@@ -436,11 +282,13 @@ static void zero_page_vector_range(int off, int len, struct page **pages)
 static int striped_read(struct inode *inode,
                        u64 off, u64 len,
                        struct page **pages, int num_pages,
-                        int *checkeof)
+                        int *checkeof, bool align_to_pages,
+                        unsigned long buf_align)
 {
-        struct ceph_client *client = ceph_inode_to_client(inode);
+        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        struct ceph_inode_info *ci = ceph_inode(inode);
        u64 pos, this_len;
+        int io_align, page_align;
        int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
        int left, pages_left;
        int read;
@@ -456,14 +304,19 @@ static int striped_read(struct inode *inode,
        page_pos = pages;
        pages_left = num_pages;
        read = 0;
+        io_align = off & ~PAGE_MASK;
 more:
+        if (align_to_pages)
+                page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
+        else
+                page_align = pos & ~PAGE_MASK;
        this_len = left;
-        ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
+        ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
                                  &ci->i_layout, pos, &this_len,
                                  ci->i_truncate_seq,
                                  ci->i_truncate_size,
-                                  page_pos, pages_left);
+                                  page_pos, pages_left, page_align);
        hit_stripe = this_len < left;
        was_short = ret >= 0 && ret < this_len;
        if (ret == -ENOENT)
@@ -477,8 +330,8 @@ more:
                if (read < pos - off) {
                        dout(" zero gap %llu to %llu\n", off + read, pos);
-                        zero_page_vector_range(page_off + read,
+                        ceph_zero_page_vector_range(page_off + read,
-                                               pos - off - read, pages);
+                                                    pos - off - read, pages);
                }
                pos += ret;
                read = pos - off;
@@ -495,8 +348,8 @@ more:
                /* was original extent fully inside i_size? */
                if (pos + left <= inode->i_size) {
                        dout("zero tail\n");
-                        zero_page_vector_range(page_off + read, len - read,
+                        ceph_zero_page_vector_range(page_off + read, len - read,
-                                               pages);
+                                                    pages);
                        read = len;
                        goto out;
                }
@@ -524,41 +377,43 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
        struct inode *inode = file->f_dentry->d_inode;
        struct page **pages;
        u64 off = *poff;
-        int num_pages = calc_pages_for(off, len);
+        int num_pages, ret;
-        int ret;
        dout("sync_read on file %p %llu~%u %s\n", file, off, len,
             (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
        if (file->f_flags & O_DIRECT) {
-                pages = get_direct_page_vector(data, num_pages, off, len);
+                num_pages = calc_pages_for((unsigned long)data, len);
+                pages = ceph_get_direct_page_vector(data, num_pages, true);
-                /*
-                 * flush any page cache pages in this range.  this
-                 * will make concurrent normal and O_DIRECT io slow,
-                 * but it will at least behave sensibly when they are
-                 * in sequence.
-                 */
        } else {
+                num_pages = calc_pages_for(off, len);
                pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
        }
        if (IS_ERR(pages))
                return PTR_ERR(pages);
+        /*
+         * flush any page cache pages in this range.  this
+         * will make concurrent normal and sync io slow,
+         * but it will at least behave sensibly when they are
+         * in sequence.
+         */
        ret = filemap_write_and_wait(inode->i_mapping);
        if (ret < 0)
                goto done;
-        ret = striped_read(inode, off, len, pages, num_pages, checkeof);
+        ret = striped_read(inode, off, len, pages, num_pages, checkeof,
+                           file->f_flags & O_DIRECT,
+                           (unsigned long)data & ~PAGE_MASK);
        if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
-                ret = copy_page_vector_to_user(pages, data, off, ret);
+                ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
        if (ret >= 0)
                *poff = off + ret;
 done:
        if (file->f_flags & O_DIRECT)
-                put_page_vector(pages, num_pages);
+                ceph_put_page_vector(pages, num_pages, true);
        else
                ceph_release_page_vector(pages, num_pages);
        dout("sync_read result %d\n", ret);
@@ -594,7 +449,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
 {
        struct inode *inode = file->f_dentry->d_inode;
        struct ceph_inode_info *ci = ceph_inode(inode);
-        struct ceph_client *client = ceph_inode_to_client(inode);
+        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        struct ceph_osd_request *req;
        struct page **pages;
        int num_pages;
@@ -604,6 +459,8 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
        int flags;
        int do_sync = 0;
        int check_caps = 0;
+        int page_align, io_align;
+        unsigned long buf_align;
        int ret;
        struct timespec mtime = CURRENT_TIME;
@@ -618,6 +475,9 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
        else
                pos = *offset;
+        io_align = pos & ~PAGE_MASK;
+        buf_align = (unsigned long)data & ~PAGE_MASK;
        ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
        if (ret < 0)
                return ret;
@@ -642,20 +502,27 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
         */
 more:
        len = left;
-        req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
+        if (file->f_flags & O_DIRECT) {
+                /* write from beginning of first page, regardless of
+                   io alignment */
+                page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
+                num_pages = calc_pages_for((unsigned long)data, len);
+        } else {
+                page_align = pos & ~PAGE_MASK;
+                num_pages = calc_pages_for(pos, len);
+        }
+        req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
                                    ceph_vino(inode), pos, &len,
                                    CEPH_OSD_OP_WRITE, flags,
                                    ci->i_snap_realm->cached_context,
                                    do_sync,
                                    ci->i_truncate_seq, ci->i_truncate_size,
-                                    &mtime, false, 2);
+                                    &mtime, false, 2, page_align);
        if (!req)
                return -ENOMEM;
-        num_pages = calc_pages_for(pos, len);
        if (file->f_flags & O_DIRECT) {
-                pages = get_direct_page_vector(data, num_pages, pos, len);
+                pages = ceph_get_direct_page_vector(data, num_pages, false);
                if (IS_ERR(pages)) {
                        ret = PTR_ERR(pages);
                        goto out;
@@ -673,7 +540,7 @@ more:
                        ret = PTR_ERR(pages);
                        goto out;
                }
-                ret = copy_user_to_page_vector(pages, data, pos, len);
+                ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
                if (ret < 0) {
                        ceph_release_page_vector(pages, num_pages);
                        goto out;
@@ -689,7 +556,7 @@ more:
        req->r_num_pages = num_pages;
        req->r_inode = inode;
-        ret = ceph_osdc_start_request(&client->osdc, req, false);
+        ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
        if (!ret) {
                if (req->r_safe_callback) {
                        /*
@@ -701,11 +568,11 @@ more:
                        spin_unlock(&ci->i_unsafe_lock);
                        ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
                }
-                ret = ceph_osdc_wait_request(&client->osdc, req);
+                ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
        }
        if (file->f_flags & O_DIRECT)
-                put_page_vector(pages, num_pages);
+                ceph_put_page_vector(pages, num_pages, false);
        else if (file->f_flags & O_SYNC)
                ceph_release_page_vector(pages, num_pages);
@@ -814,7 +681,8 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
        struct ceph_file_info *fi = file->private_data;
        struct inode *inode = file->f_dentry->d_inode;
        struct ceph_inode_info *ci = ceph_inode(inode);
-        struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
+        struct ceph_osd_client *osdc =
+                &ceph_sb_to_client(inode->i_sb)->client->osdc;
        loff_t endoff = pos + iov->iov_len;
        int want, got = 0;
        int ret, err;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 62377ec37edf..5625463aa479 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1,8 +1,7 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 #include <linux/module.h>
 #include <linux/fs.h>
-#include <linux/smp_lock.h>
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/uaccess.h>
@@ -13,7 +12,8 @@
 #include <linux/pagevec.h>
 #include "super.h"
-#include "decode.h"
+#include "mds_client.h"
+#include <linux/ceph/decode.h>
 /*
 * Ceph inode operations
@@ -297,6 +297,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
        ci->i_release_count = 0;
        ci->i_symlink = NULL;
+        memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
        ci->i_fragtree = RB_ROOT;
        mutex_init(&ci->i_fragtree_mutex);
@@ -368,6 +370,15 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
        return &ci->vfs_inode;
 }
+/*
+ * RCU callback queued by ceph_destroy_inode(): reset the inode's
+ * i_dentry list head and hand the ceph_inode_info that embeds it back
+ * to ceph_inode_cachep.
+ */
+static void ceph_i_callback(struct rcu_head *head)
+{
+        struct inode *vfs_inode = container_of(head, struct inode, i_rcu);
+        struct ceph_inode_info *cinfo = ceph_inode(vfs_inode);
+        INIT_LIST_HEAD(&vfs_inode->i_dentry);
+        kmem_cache_free(ceph_inode_cachep, cinfo);
+}
 void ceph_destroy_inode(struct inode *inode)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
@@ -384,7 +395,7 @@ void ceph_destroy_inode(struct inode *inode)
         */
        if (ci->i_snap_realm) {
                struct ceph_mds_client *mdsc =
-                        &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+                        ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
                struct ceph_snap_realm *realm = ci->i_snap_realm;
                dout(" dropping residual ref to snap realm %p\n", realm);
@@ -407,7 +418,7 @@ void ceph_destroy_inode(struct inode *inode)
        if (ci->i_xattrs.prealloc_blob)
                ceph_buffer_put(ci->i_xattrs.prealloc_blob);
-        kmem_cache_free(ceph_inode_cachep, ci);
+        call_rcu(&inode->i_rcu, ceph_i_callback);
 }
@@ -470,7 +481,9 @@ void ceph_fill_file_time(struct inode *inode, int issued,
        if (issued & (CEPH_CAP_FILE_EXCL|
                      CEPH_CAP_FILE_WR|
-                      CEPH_CAP_FILE_BUFFER)) {
+                      CEPH_CAP_FILE_BUFFER|
+                      CEPH_CAP_AUTH_EXCL|
+                      CEPH_CAP_XATTR_EXCL)) {
                if (timespec_compare(ctime, &inode->i_ctime) > 0) {
                        dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
                             inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
@@ -510,7 +523,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
                        warn = 1;
                }
        } else {
-                /* we have no write caps; whatever the MDS says is true */
+                /* we have no write|excl caps; whatever the MDS says is true */
                if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
                        inode->i_ctime = *ctime;
                        inode->i_mtime = *mtime;
@@ -566,12 +579,17 @@ static int fill_inode(struct inode *inode,
        /*
         * provided version will be odd if inode value is projected,
-         * even if stable.  skip the update if we have a newer info
+         * even if stable.  skip the update if we have newer stable
-         * (e.g., due to inode info racing form multiple MDSs), or if
+         * info (ours>=theirs, e.g. due to racing mds replies), unless
-         * we are getting projected (unstable) inode info.
+         * we are getting projected (unstable) info (in which case the
+         * version is odd, and we want ours>theirs).
+         *   us   them
+         *   2    2     skip
+         *   3    2     skip
+         *   3    3     update
         */
        if (le64_to_cpu(info->version) > 0 &&
-            (ci->i_version & ~1) > le64_to_cpu(info->version))
+            (ci->i_version & ~1) >= le64_to_cpu(info->version))
                goto no_change;
        issued = __ceph_caps_issued(ci, &implemented);
@@ -605,7 +623,14 @@ static int fill_inode(struct inode *inode,
                            le32_to_cpu(info->time_warp_seq),
                            &ctime, &mtime, &atime);
-        ci->i_max_size = le64_to_cpu(info->max_size);
+        /* only update max_size on auth cap */
+        if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
+            ci->i_max_size != le64_to_cpu(info->max_size)) {
+                dout("max_size %lld -> %llu\n", ci->i_max_size,
+                     le64_to_cpu(info->max_size));
+                ci->i_max_size = le64_to_cpu(info->max_size);
+        }
        ci->i_layout = info->layout;
        inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
@@ -666,6 +691,8 @@ static int fill_inode(struct inode *inode,
                inode->i_op = &ceph_dir_iops;
                inode->i_fop = &ceph_dir_fops;
+                ci->i_dir_layout = iinfo->dir_layout;
                ci->i_files = le64_to_cpu(info->files);
                ci->i_subdirs = le64_to_cpu(info->subdirs);
                ci->i_rbytes = le64_to_cpu(info->rbytes);
@@ -683,10 +710,6 @@ static int fill_inode(struct inode *inode,
                        ci->i_ceph_flags |= CEPH_I_COMPLETE;
                        ci->i_max_offset = 2;
                }
-                /* it may be better to set st_size in getattr instead? */
-                if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
-                        inode->i_size = ci->i_rbytes;
                break;
        default:
                pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
@@ -827,13 +850,13 @@ static void ceph_set_dentry_offset(struct dentry *dn)
        di->offset = ceph_inode(inode)->i_max_offset++;
        spin_unlock(&inode->i_lock);
-        spin_lock(&dcache_lock);
+        spin_lock(&dir->d_lock);
-        spin_lock(&dn->d_lock);
+        spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
        list_move(&dn->d_u.d_child, &dir->d_subdirs);
        dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
             dn->d_u.d_child.prev, dn->d_u.d_child.next);
        spin_unlock(&dn->d_lock);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&dir->d_lock);
 }
 /*
@@ -865,8 +888,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
        } else if (realdn) {
                dout("dn %p (%d) spliced with %p (%d) "
                     "inode %p ino %llx.%llx\n",
-                     dn, atomic_read(&dn->d_count),
+                     dn, dn->d_count,
-                     realdn, atomic_read(&realdn->d_count),
+                     realdn, realdn->d_count,
                     realdn->d_inode, ceph_vinop(realdn->d_inode));
                dput(dn);
                dn = realdn;
@@ -901,7 +924,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
        struct inode *in = NULL;
        struct ceph_mds_reply_inode *ininfo;
        struct ceph_vino vino;
-        struct ceph_client *client = ceph_sb_to_client(sb);
+        struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
        int i = 0;
        int err = 0;
@@ -965,7 +988,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
         */
        if (rinfo->head->is_dentry && !req->r_aborted &&
            (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
-                                               client->mount_args->snapdir_name,
+                                               fsc->mount_options->snapdir_name,
                                               req->r_dentry->d_name.len))) {
                /*
                 * lookup link rename   : null -> possibly existing inode
@@ -1054,7 +1077,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
                ininfo = rinfo->targeti.in;
                vino.ino = le64_to_cpu(ininfo->ino);
                vino.snap = le64_to_cpu(ininfo->snapid);
-                if (!dn->d_inode) {
+                in = dn->d_inode;
+                if (!in) {
                        in = ceph_get_inode(sb, vino);
                        if (IS_ERR(in)) {
                                pr_err("fill_trace bad get_inode "
@@ -1216,11 +1240,11 @@ retry_lookup:
                        goto retry_lookup;
                } else {
                        /* reorder parent's d_subdirs */
-                        spin_lock(&dcache_lock);
+                        spin_lock(&parent->d_lock);
-                        spin_lock(&dn->d_lock);
+                        spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
                        list_move(&dn->d_u.d_child, &parent->d_subdirs);
                        spin_unlock(&dn->d_lock);
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&parent->d_lock);
                }
                di = dn->d_fsdata;
@@ -1385,11 +1409,8 @@ static void ceph_invalidate_work(struct work_struct *work)
        spin_lock(&inode->i_lock);
        dout("invalidate_pages %p gen %d revoking %d\n", inode,
             ci->i_rdcache_gen, ci->i_rdcache_revoking);
-        if (ci->i_rdcache_gen == 0 ||
+        if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
-            ci->i_rdcache_revoking != ci->i_rdcache_gen) {
-                BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
                /* nevermind! */
-                ci->i_rdcache_revoking = 0;
                spin_unlock(&inode->i_lock);
                goto out;
        }
@@ -1399,15 +1420,16 @@ static void ceph_invalidate_work(struct work_struct *work)
        ceph_invalidate_nondirty_pages(inode->i_mapping);
        spin_lock(&inode->i_lock);
-        if (orig_gen == ci->i_rdcache_gen) {
+        if (orig_gen == ci->i_rdcache_gen &&
+            orig_gen == ci->i_rdcache_revoking) {
                dout("invalidate_pages %p gen %d successful\n", inode,
                     ci->i_rdcache_gen);
-                ci->i_rdcache_gen = 0;
+                ci->i_rdcache_revoking--;
-                ci->i_rdcache_revoking = 0;
                check = 1;
        } else {
-                dout("invalidate_pages %p gen %d raced, gen now %d\n",
+                dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
-                     inode, orig_gen, ci->i_rdcache_gen);
+                     inode, orig_gen, ci->i_rdcache_gen,
+                     ci->i_rdcache_revoking);
        }
        spin_unlock(&inode->i_lock);
@@ -1533,7 +1555,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
        struct inode *parent_inode = dentry->d_parent->d_inode;
        const unsigned int ia_valid = attr->ia_valid;
        struct ceph_mds_request *req;
-        struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc;
+        struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
        int issued;
        int release = 0, dirtied = 0;
        int mask = 0;
@@ -1728,8 +1750,8 @@ out:
 */
 int ceph_do_getattr(struct inode *inode, int mask)
 {
-        struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
+        struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
-        struct ceph_mds_client *mdsc = &client->mdsc;
+        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct ceph_mds_request *req;
        int err;
@@ -1738,7 +1760,7 @@ int ceph_do_getattr(struct inode *inode, int mask)
                return 0;
        }
-        dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask));
+        dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode);
        if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
                return 0;
@@ -1759,12 +1781,17 @@ int ceph_do_getattr(struct inode *inode, int mask)
 * Check inode permissions.  We verify we have a valid value for
 * the AUTH cap, then call the generic handler.
 */
-int ceph_permission(struct inode *inode, int mask)
+int ceph_permission(struct inode *inode, int mask, unsigned int flags)
 {
-        int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
+        int err;
+        if (flags & IPERM_FLAG_RCU)
+                return -ECHILD;
+        err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
        if (!err)
-                err = generic_permission(inode, mask, NULL);
+                err = generic_permission(inode, mask, flags, NULL);
        return err;
 }
@@ -1788,7 +1815,11 @@ int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
                else
                        stat->dev = 0;
                if (S_ISDIR(inode->i_mode)) {
-                        stat->size = ci->i_rbytes;
+                        if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
+                                                RBYTES))
+                                stat->size = ci->i_rbytes;
+                        else
+                                stat->size = ci->i_files + ci->i_subdirs;
                        stat->blocks = 0;
                        stat->blksize = 65536;
                }
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 76e307d2aba1..8888c9ba68db 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -1,8 +1,10 @@
 #include <linux/in.h>
-#include "ioctl.h"
 #include "super.h"
-#include "ceph_debug.h"
+#include "mds_client.h"
+#include <linux/ceph/ceph_debug.h>
+#include "ioctl.h"
 /*
@@ -37,7 +39,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
 {
        struct inode *inode = file->f_dentry->d_inode;
        struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
-        struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+        struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
        struct ceph_mds_request *req;
        struct ceph_ioctl_layout l;
        int err, i;
@@ -90,6 +92,68 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
 }
 /*
+ * Set a layout policy on a directory inode. All items subsequently
+ * created in the tree rooted at this inode will inherit this layout,
+ * unless a subdirectory has its own layout policy.  The policy does
+ * not apply retroactively to items that already exist.
+ */
+static long ceph_ioctl_set_layout_policy(struct file *file, void __user *arg)
+{
+        struct inode *inode = file->f_dentry->d_inode;
+        struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+        struct ceph_mds_request *req;
+        struct ceph_ioctl_layout l;
+        int err, i;
+        /* fetch the requested layout from userspace */
+        if (copy_from_user(&l, arg, sizeof(l)))
+                return -EFAULT;
+        /* stripe unit: non-zero, with no bits set below the page boundary */
+        if (!l.stripe_unit || (l.stripe_unit & ~PAGE_MASK))
+                return -EINVAL;
+        /* object size: page aligned and, when set, a multiple of stripe unit */
+        if (l.object_size & ~PAGE_MASK)
+                return -EINVAL;
+        if (l.object_size &&
+            (unsigned)l.object_size % (unsigned)l.stripe_unit)
+                return -EINVAL;
+        /* a positive data_pool must be listed among the mdsmap's data pools */
+        if (l.data_pool > 0) {
+                err = -EINVAL;
+                mutex_lock(&mdsc->mutex);
+                for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++) {
+                        if (mdsc->mdsmap->m_data_pg_pools[i] != l.data_pool)
+                                continue;
+                        err = 0;
+                        break;
+                }
+                mutex_unlock(&mdsc->mutex);
+                if (err)
+                        return err;
+        }
+        req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT,
+                                       USE_AUTH_MDS);
+        if (IS_ERR(req))
+                return PTR_ERR(req);
+        req->r_inode = igrab(inode);
+        /* layout fields are stored in the request as little-endian 32-bit */
+        req->r_args.setlayout.layout.fl_stripe_unit =
+                        cpu_to_le32(l.stripe_unit);
+        req->r_args.setlayout.layout.fl_stripe_count =
+                        cpu_to_le32(l.stripe_count);
+        req->r_args.setlayout.layout.fl_object_size =
+                        cpu_to_le32(l.object_size);
+        req->r_args.setlayout.layout.fl_pg_pool =
+                        cpu_to_le32(l.data_pool);
+        req->r_args.setlayout.layout.fl_pg_preferred =
+                        cpu_to_le32(l.preferred_osd);
+        err = ceph_mdsc_do_request(mdsc, inode, req);
+        ceph_mdsc_put_request(req);
+        return err;
+}
+/*
 * Return object name, size/offset information, and location (OSD
 * number, network address) for a given file offset.
 */
@@ -98,7 +162,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
        struct ceph_ioctl_dataloc dl;
        struct inode *inode = file->f_dentry->d_inode;
        struct ceph_inode_info *ci = ceph_inode(inode);
-        struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
+        struct ceph_osd_client *osdc =
+                &ceph_sb_to_client(inode->i_sb)->client->osdc;
        u64 len = 1, olen;
        u64 tmp;
        struct ceph_object_layout ol;
@@ -174,11 +239,15 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
        case CEPH_IOC_SET_LAYOUT:
                return ceph_ioctl_set_layout(file, (void __user *)arg);
+        case CEPH_IOC_SET_LAYOUT_POLICY:
+                return ceph_ioctl_set_layout_policy(file, (void __user *)arg);
        case CEPH_IOC_GET_DATALOC:
                return ceph_ioctl_get_dataloc(file, (void __user *)arg);
        case CEPH_IOC_LAZYIO:
                return ceph_ioctl_lazyio(file);
        }
        return -ENOTTY;
 }
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index 88451a3b6857..52e8fd74d450 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -17,6 +17,8 @@ struct ceph_ioctl_layout {
                                   struct ceph_ioctl_layout)
 #define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2,           \
                                   struct ceph_ioctl_layout)
+#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5,    \
+                                   struct ceph_ioctl_layout)
 /*
 * Extract identity, address of the OSD and object storing a given
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index ff4e753aae92..476b329867d4 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -1,50 +1,78 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 #include <linux/file.h>
 #include <linux/namei.h>
 #include "super.h"
 #include "mds_client.h"
-#include "pagelist.h"
+#include <linux/ceph/pagelist.h>
 /**
 * Implement fcntl and flock locking functions.
 */
 static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
-                             u64 pid, u64 pid_ns,
+                             int cmd, u8 wait, struct file_lock *fl)
-                             int cmd, u64 start, u64 length, u8 wait)
 {
        struct inode *inode = file->f_dentry->d_inode;
        struct ceph_mds_client *mdsc =
-                &ceph_sb_to_client(inode->i_sb)->mdsc;
+                ceph_sb_to_client(inode->i_sb)->mdsc;
        struct ceph_mds_request *req;
        int err;
+        u64 length = 0;
        req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
        if (IS_ERR(req))
                return PTR_ERR(req);
        req->r_inode = igrab(inode);
+        /* mds requires start and length rather than start and end */
+        if (LLONG_MAX == fl->fl_end)
+                length = 0;
+        else
+                length = fl->fl_end - fl->fl_start + 1;
        dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
             "length: %llu, wait: %d, type`: %d", (int)lock_type,
-             (int)operation, pid, start, length, wait, cmd);
+             (int)operation, (u64)fl->fl_pid, fl->fl_start,
+             length, wait, fl->fl_type);
        req->r_args.filelock_change.rule = lock_type;
        req->r_args.filelock_change.type = cmd;
-        req->r_args.filelock_change.pid = cpu_to_le64(pid);
+        req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
        /* This should be adjusted, but I'm not sure if
           namespaces actually get id numbers*/
        req->r_args.filelock_change.pid_namespace =
-                cpu_to_le64((u64)pid_ns);
+                cpu_to_le64((u64)(unsigned long)fl->fl_nspid);
-        req->r_args.filelock_change.start = cpu_to_le64(start);
+        req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
        req->r_args.filelock_change.length = cpu_to_le64(length);
        req->r_args.filelock_change.wait = wait;
        err = ceph_mdsc_do_request(mdsc, inode, req);
+        if ( operation == CEPH_MDS_OP_GETFILELOCK){
+                fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid);
+                if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
+                        fl->fl_type = F_RDLCK;
+                else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
+                        fl->fl_type = F_WRLCK;
+                else
+                        fl->fl_type = F_UNLCK;
+                fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
+                length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
+                                                 le64_to_cpu(req->r_reply_info.filelock_reply->length);
+                if (length >= 1)
+                        fl->fl_end = length -1;
+                else
+                        fl->fl_end = 0;
+        }
        ceph_mdsc_put_request(req);
        dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
-             "length: %llu, wait: %d, type`: %d err code %d", (int)lock_type,
+             "length: %llu, wait: %d, type`: %d, err code %d", (int)lock_type,
-             (int)operation, pid, start, length, wait, cmd, err);
+             (int)operation, (u64)fl->fl_pid, fl->fl_start,
+             length, wait, fl->fl_type, err);
        return err;
 }
@@ -54,7 +82,6 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
 */
 int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
 {
-        u64 length;
        u8 lock_cmd;
        int err;
        u8 wait = 0;
@@ -76,29 +103,20 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
        else
                lock_cmd = CEPH_LOCK_UNLOCK;
-        if (LLONG_MAX == fl->fl_end)
+        err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl);
-                length = 0;
-        else
-                length = fl->fl_end - fl->fl_start + 1;
-        err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
-                                (u64)fl->fl_pid,
-                                (u64)(unsigned long)fl->fl_nspid,
-                                lock_cmd, fl->fl_start,
-                                length, wait);
        if (!err) {
-                dout("mds locked, locking locally");
+                if ( op != CEPH_MDS_OP_GETFILELOCK ){
-                err = posix_lock_file(file, fl, NULL);
+                        dout("mds locked, locking locally");
-                if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
+                        err = posix_lock_file(file, fl, NULL);
-                        /* undo! This should only happen if the kernel detects
+                        if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
-                         * local deadlock. */
+                                /* undo! This should only happen if the kernel detects
-                        ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
+                                 * local deadlock. */
-                                          (u64)fl->fl_pid,
+                                ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
-                                          (u64)(unsigned long)fl->fl_nspid,
+                                                  CEPH_LOCK_UNLOCK, 0, fl);
-                                          CEPH_LOCK_UNLOCK, fl->fl_start,
+                                dout("got %d on posix_lock_file, undid lock", err);
-                                          length, 0);
+                        }
-                        dout("got %d on posix_lock_file, undid lock", err);
                }
        } else {
                dout("mds returned error code %d", err);
        }
@@ -107,7 +125,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
 int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
 {
-        u64 length;
        u8 lock_cmd;
        int err;
        u8 wait = 1;
@@ -127,26 +144,15 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
                lock_cmd = CEPH_LOCK_EXCL;
        else
                lock_cmd = CEPH_LOCK_UNLOCK;
-        /* mds requires start and length rather than start and end */
-        if (LLONG_MAX == fl->fl_end)
-                length = 0;
-        else
-                length = fl->fl_end - fl->fl_start + 1;
        err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
-                                file, (u64)fl->fl_pid,
+                                file, lock_cmd, wait, fl);
-                                (u64)(unsigned long)fl->fl_nspid,
-                                lock_cmd, fl->fl_start,
-                                length, wait);
        if (!err) {
                err = flock_lock_file_wait(file, fl);
                if (err) {
                        ceph_lock_message(CEPH_LOCK_FLOCK,
                                          CEPH_MDS_OP_SETFILELOCK,
-                                          file, (u64)fl->fl_pid,
+                                          file, CEPH_LOCK_UNLOCK, 0, fl);
-                                          (u64)(unsigned long)fl->fl_nspid,
-                                          CEPH_LOCK_UNLOCK, fl->fl_start,
-                                          length, 0);
                        dout("got %d on flock_lock_file_wait, undid lock", err);
                }
        } else {
@@ -181,8 +187,9 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
 * Encode the flock and fcntl locks for the given inode into the pagelist.
 * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
 * sequential flock locks.
- * Must be called with BLK already held, and the lock numbers should have
+ * Must be called with lock_flocks() already held.
- * been gathered under the same lock holding window.
+ * If we encounter more of a specific lock type than expected,
+ * we stop encoding and bail out with err set to -ENOSPC.
 */
 int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
                      int num_fcntl_locks, int num_flock_locks)
@@ -190,6 +197,8 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
        struct file_lock *lock;
        struct ceph_filelock cephlock;
        int err = 0;
+        int seen_fcntl = 0;
+        int seen_flock = 0;
        dout("encoding %d flock and %d fcntl locks", num_flock_locks,
             num_fcntl_locks);
@@ -198,6 +207,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
                goto fail;
        for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
                if (lock->fl_flags & FL_POSIX) {
+                        ++seen_fcntl;
+                        if (seen_fcntl > num_fcntl_locks) {
+                                err = -ENOSPC;
+                                goto fail;
+                        }
                        err = lock_to_ceph_filelock(lock, &cephlock);
                        if (err)
                                goto fail;
@@ -213,6 +227,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
                goto fail;
        for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
                if (lock->fl_flags & FL_FLOCK) {
+                        ++seen_flock;
+                        if (seen_flock > num_flock_locks) {
+                                err = -ENOSPC;
+                                goto fail;
+                        }
                        err = lock_to_ceph_filelock(lock, &cephlock);
                        if (err)
                                goto fail;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index fad95f8f2608..a1ee8fa3a8e7 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1,17 +1,20 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
+#include <linux/fs.h>
 #include <linux/wait.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
-#include <linux/smp_lock.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
-#include "mds_client.h"
-#include "mon_client.h"
 #include "super.h"
-#include "messenger.h"
+#include "mds_client.h"
-#include "decode.h"
-#include "auth.h"
+#include <linux/ceph/messenger.h>
-#include "pagelist.h"
+#include <linux/ceph/decode.h>
+#include <linux/ceph/pagelist.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
 /*
 * A cluster of MDS (metadata server) daemons is responsible for
@@ -57,7 +60,8 @@ static const struct ceph_connection_operations mds_con_ops;
 * parse individual inode info
 */
 static int parse_reply_info_in(void **p, void *end,
-                               struct ceph_mds_reply_info_in *info)
+                               struct ceph_mds_reply_info_in *info,
+                               int features)
 {
        int err = -EIO;
@@ -71,6 +75,12 @@ static int parse_reply_info_in(void **p, void *end,
        info->symlink = *p;
        *p += info->symlink_len;
+        if (features & CEPH_FEATURE_DIRLAYOUTHASH)
+                ceph_decode_copy_safe(p, end, &info->dir_layout,
+                                      sizeof(info->dir_layout), bad);
+        else
+                memset(&info->dir_layout, 0, sizeof(info->dir_layout));
        ceph_decode_32_safe(p, end, info->xattr_len, bad);
        ceph_decode_need(p, end, info->xattr_len, bad);
        info->xattr_data = *p;
@@ -85,12 +95,13 @@ bad:
 * target inode.
 */
 static int parse_reply_info_trace(void **p, void *end,
-                                  struct ceph_mds_reply_info_parsed *info)
+                                  struct ceph_mds_reply_info_parsed *info,
+                                  int features)
 {
        int err;
        if (info->head->is_dentry) {
-                err = parse_reply_info_in(p, end, &info->diri);
+                err = parse_reply_info_in(p, end, &info->diri, features);
                if (err < 0)
                        goto out_bad;
@@ -111,7 +122,7 @@ static int parse_reply_info_trace(void **p, void *end,
        }
        if (info->head->is_target) {
-                err = parse_reply_info_in(p, end, &info->targeti);
+                err = parse_reply_info_in(p, end, &info->targeti, features);
                if (err < 0)
                        goto out_bad;
        }
@@ -131,7 +142,8 @@ out_bad:
 * parse readdir results
 */
 static int parse_reply_info_dir(void **p, void *end,
-                                struct ceph_mds_reply_info_parsed *info)
+                                struct ceph_mds_reply_info_parsed *info,
+                                int features)
 {
        u32 num, i = 0;
        int err;
@@ -179,7 +191,7 @@ static int parse_reply_info_dir(void **p, void *end,
                *p += sizeof(struct ceph_mds_reply_lease);
                /* inode */
-                err = parse_reply_info_in(p, end, &info->dir_in[i]);
+                err = parse_reply_info_in(p, end, &info->dir_in[i], features);
                if (err < 0)
                        goto out_bad;
                i++;
@@ -199,10 +211,45 @@ out_bad:
 }
 /*
+ * parse fcntl F_GETLK results: exactly one struct ceph_filelock
+ */
+static int parse_reply_info_filelock(void **p, void *end,
+                                     struct ceph_mds_reply_info_parsed *info,
+                                     int features) /* unused here */
+{
+        if (*p + sizeof(*info->filelock_reply) > end) /* payload too short */
+                goto bad;
+        info->filelock_reply = *p;      /* points into parsed buffer, no copy */
+        *p += sizeof(*info->filelock_reply);
+        if (unlikely(*p != end))        /* trailing bytes are an error too */
+                goto bad;
+        return 0;
+bad:
+        return -EIO;
+}
+/*
+ * parse extra results: F_GETLK reply for GETFILELOCK, else readdir contents
+ */
+static int parse_reply_info_extra(void **p, void *end,
+                                  struct ceph_mds_reply_info_parsed *info,
+                                  int features)
+{
+        if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
+                return parse_reply_info_filelock(p, end, info, features);
+        else    /* every other op is parsed as a directory listing */
+                return parse_reply_info_dir(p, end, info, features);
+}
+/*
 * parse entire mds reply
 */
 static int parse_reply_info(struct ceph_msg *msg,
-                            struct ceph_mds_reply_info_parsed *info)
+                            struct ceph_mds_reply_info_parsed *info,
+                            int features)
 {
        void *p, *end;
        u32 len;
@@ -215,15 +262,15 @@ static int parse_reply_info(struct ceph_msg *msg,
        /* trace */
        ceph_decode_32_safe(&p, end, len, bad);
        if (len > 0) {
-                err = parse_reply_info_trace(&p, p+len, info);
+                err = parse_reply_info_trace(&p, p+len, info, features);
                if (err < 0)
                        goto out_bad;
        }
-        /* dir content */
+        /* extra */
        ceph_decode_32_safe(&p, end, len, bad);
        if (len > 0) {
-                err = parse_reply_info_dir(&p, p+len, info);
+                err = parse_reply_info_extra(&p, p+len, info, features);
                if (err < 0)
                        goto out_bad;
        }
@@ -286,8 +333,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
             atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
        if (atomic_dec_and_test(&s->s_ref)) {
                if (s->s_authorizer)
-                        s->s_mdsc->client->monc.auth->ops->destroy_authorizer(
+                     s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer(
-                                s->s_mdsc->client->monc.auth, s->s_authorizer);
+                             s->s_mdsc->fsc->client->monc.auth,
+                             s->s_authorizer);
                kfree(s);
        }
 }
@@ -344,7 +392,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
        s->s_seq = 0;
        mutex_init(&s->s_mutex);
-        ceph_con_init(mdsc->client->msgr, &s->s_con);
+        ceph_con_init(mdsc->fsc->client->msgr, &s->s_con);
        s->s_con.private = s;
        s->s_con.ops = &mds_con_ops;
        s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
@@ -524,6 +572,9 @@ static void __register_request(struct ceph_mds_client *mdsc,
        ceph_mdsc_get_request(req);
        __insert_request(mdsc, req);
+        req->r_uid = current_fsuid();
+        req->r_gid = current_fsgid();
        if (dir) {
                struct ceph_inode_info *ci = ceph_inode(dir);
@@ -599,7 +650,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
        } else if (req->r_dentry) {
                struct inode *dir = req->r_dentry->d_parent->d_inode;
-                if (dir->i_sb != mdsc->client->sb) {
+                if (dir->i_sb != mdsc->fsc->sb) {
                        /* not this fs! */
                        inode = req->r_dentry->d_inode;
                } else if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -615,7 +666,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
                } else {
                        /* dir + name */
                        inode = dir;
-                        hash = req->r_dentry->d_name.hash;
+                        hash = ceph_dentry_hash(req->r_dentry);
                        is_hash = true;
                }
        }
@@ -642,9 +693,11 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
                                dout("choose_mds %p %llx.%llx "
                                     "frag %u mds%d (%d/%d)\n",
                                     inode, ceph_vinop(inode),
-                                     frag.frag, frag.mds,
+                                     frag.frag, mds,
                                     (int)r, frag.ndist);
-                                return mds;
+                                if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
+                                    CEPH_MDS_STATE_ACTIVE)
+                                        return mds;
                        }
                        /* since this file/dir wasn't known to be
@@ -657,7 +710,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
                                dout("choose_mds %p %llx.%llx "
                                     "frag %u mds%d (auth)\n",
                                     inode, ceph_vinop(inode), frag.frag, mds);
-                                return mds;
+                                if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
+                                    CEPH_MDS_STATE_ACTIVE)
+                                        return mds;
                        }
                }
        }
@@ -884,7 +939,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
        __ceph_remove_cap(cap);
        if (!__ceph_is_any_real_caps(ci)) {
                struct ceph_mds_client *mdsc =
-                        &ceph_sb_to_client(inode->i_sb)->mdsc;
+                        ceph_sb_to_client(inode->i_sb)->mdsc;
                spin_lock(&mdsc->cap_dirty_lock);
                if (!list_empty(&ci->i_dirty_item)) {
@@ -1146,7 +1201,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
        struct ceph_msg *msg, *partial = NULL;
        struct ceph_mds_cap_release *head;
        int err = -ENOMEM;
-        int extra = mdsc->client->mount_args->cap_release_safety;
+        int extra = mdsc->fsc->mount_options->cap_release_safety;
        int num;
        dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
@@ -1447,7 +1502,7 @@ retry:
        *base = ceph_ino(temp->d_inode);
        *plen = len;
        dout("build_path on %p %d built %llx '%.*s'\n",
-             dentry, atomic_read(&dentry->d_count), *base, len, path);
+             dentry, dentry->d_count, *base, len, path);
        return path;
 }
@@ -1583,8 +1638,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
        head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
        head->op = cpu_to_le32(req->r_op);
-        head->caller_uid = cpu_to_le32(current_fsuid());
+        head->caller_uid = cpu_to_le32(req->r_uid);
-        head->caller_gid = cpu_to_le32(current_fsgid());
+        head->caller_gid = cpu_to_le32(req->r_gid);
        head->args = req->r_args;
        ceph_encode_filepath(&p, end, ino1, path1);
@@ -1654,7 +1709,6 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
        struct ceph_msg *msg;
        int flags = 0;
-        req->r_mds = mds;
        req->r_attempts++;
        if (req->r_inode) {
                struct ceph_cap *cap =
@@ -1741,6 +1795,8 @@ static int __do_request(struct ceph_mds_client *mdsc,
                goto finish;
        }
+        put_request_session(req);
        mds = __choose_mds(mdsc, req);
        if (mds < 0 ||
            ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
@@ -1758,6 +1814,8 @@ static int __do_request(struct ceph_mds_client *mdsc,
                        goto finish;
                }
        }
+        req->r_session = get_session(session);
        dout("do_request mds%d session %p state %s\n", mds, session,
             session_state_name(session->s_state));
        if (session->s_state != CEPH_MDS_SESSION_OPEN &&
@@ -1770,7 +1828,6 @@ static int __do_request(struct ceph_mds_client *mdsc,
        }
        /* send request */
-        req->r_session = get_session(session);
        req->r_resend_mds = -1;   /* forget any previous mds hint */
        if (req->r_request_started == 0)   /* note request start time */
@@ -1824,7 +1881,6 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
                if (req->r_session &&
                    req->r_session->s_mds == mds) {
                        dout(" kicking tid %llu\n", req->r_tid);
-                        put_request_session(req);
                        __do_request(mdsc, req);
                }
        }
@@ -2017,8 +2073,11 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
                        goto out;
                } else  {
                        struct ceph_inode_info *ci = ceph_inode(req->r_inode);
-                        struct ceph_cap *cap =
+                        struct ceph_cap *cap = NULL;
-                                ceph_get_cap_for_mds(ci, req->r_mds);;
+                        if (req->r_session)
+                                cap = ceph_get_cap_for_mds(ci,
+                                                   req->r_session->s_mds);
                        dout("already using auth");
                        if ((!cap || cap != ci->i_auth_cap) ||
@@ -2062,12 +2121,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        dout("handle_reply tid %lld result %d\n", tid, result);
        rinfo = &req->r_reply_info;
-        err = parse_reply_info(msg, rinfo);
+        err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
        mutex_unlock(&mdsc->mutex);
        mutex_lock(&session->s_mutex);
        if (err < 0) {
-                pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds);
+                pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
                ceph_msg_dump(msg);
                goto out_err;
        }
@@ -2085,9 +2144,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        /* insert trace into our cache */
        mutex_lock(&req->r_fill_mutex);
-        err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
+        err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
        if (err == 0) {
-                if (result == 0 && rinfo->dir_nr)
+                if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK &&
+                    rinfo->dir_nr)
                        ceph_readdir_prepopulate(req, req->r_session);
                ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
        }
@@ -2361,19 +2421,35 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
        if (recon_state->flock) {
                int num_fcntl_locks, num_flock_locks;
+                struct ceph_pagelist_cursor trunc_point;
-                lock_kernel();
-                ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
+                ceph_pagelist_set_cursor(pagelist, &trunc_point);
-                rec.v2.flock_len = (2*sizeof(u32) +
+                do {
-                                    (num_fcntl_locks+num_flock_locks) *
+                        lock_flocks();
-                                    sizeof(struct ceph_filelock));
+                        ceph_count_locks(inode, &num_fcntl_locks,
+                                         &num_flock_locks);
-                err = ceph_pagelist_append(pagelist, &rec, reclen);
+                        rec.v2.flock_len = (2*sizeof(u32) +
-                if (!err)
+                                            (num_fcntl_locks+num_flock_locks) *
-                        err = ceph_encode_locks(inode, pagelist,
+                                            sizeof(struct ceph_filelock));
-                                                num_fcntl_locks,
+                        unlock_flocks();
-                                                num_flock_locks);
-                unlock_kernel();
+                        /* pre-alloc pagelist */
+                        ceph_pagelist_truncate(pagelist, &trunc_point);
+                        err = ceph_pagelist_append(pagelist, &rec, reclen);
+                        if (!err)
+                                err = ceph_pagelist_reserve(pagelist,
+                                                            rec.v2.flock_len);
+                        /* encode locks */
+                        if (!err) {
+                                lock_flocks();
+                                err = ceph_encode_locks(inode,
+                                                        pagelist,
+                                                        num_fcntl_locks,
+                                                        num_flock_locks);
+                                unlock_flocks();
+                        }
+                } while (err == -ENOSPC);
        } else {
                err = ceph_pagelist_append(pagelist, &rec, reclen);
        }
@@ -2613,7 +2689,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
                         struct ceph_mds_session *session,
                         struct ceph_msg *msg)
 {
-        struct super_block *sb = mdsc->client->sb;
+        struct super_block *sb = mdsc->fsc->sb;
        struct inode *inode;
        struct ceph_inode_info *ci;
        struct dentry *parent, *dentry;
@@ -2891,10 +2967,16 @@ static void delayed_work(struct work_struct *work)
        schedule_delayed(mdsc);
 }
+int ceph_mdsc_init(struct ceph_fs_client *fsc)
-int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
 {
-        mdsc->client = client;
+        struct ceph_mds_client *mdsc;
+        mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
+        if (!mdsc)
+                return -ENOMEM;
+        mdsc->fsc = fsc;
+        fsc->mdsc = mdsc;
        mutex_init(&mdsc->mutex);
        mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
        if (mdsc->mdsmap == NULL)
@@ -2927,7 +3009,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
        INIT_LIST_HEAD(&mdsc->dentry_lru);
        ceph_caps_init(mdsc);
-        ceph_adjust_min_caps(mdsc, client->min_caps);
+        ceph_adjust_min_caps(mdsc, fsc->min_caps);
        return 0;
 }
@@ -2939,7 +3021,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
 static void wait_requests(struct ceph_mds_client *mdsc)
 {
        struct ceph_mds_request *req;
-        struct ceph_client *client = mdsc->client;
+        struct ceph_fs_client *fsc = mdsc->fsc;
        mutex_lock(&mdsc->mutex);
        if (__get_oldest_req(mdsc)) {
@@ -2947,7 +3029,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
                dout("wait_requests waiting for requests\n");
                wait_for_completion_timeout(&mdsc->safe_umount_waiters,
-                                    client->mount_args->mount_timeout * HZ);
+                                    fsc->client->options->mount_timeout * HZ);
                /* tear down remaining requests */
                mutex_lock(&mdsc->mutex);
@@ -3030,7 +3112,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 {
        u64 want_tid, want_flush;
-        if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+        if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
                return;
        dout("sync\n");
@@ -3053,7 +3135,7 @@ bool done_closing_sessions(struct ceph_mds_client *mdsc)
 {
        int i, n = 0;
-        if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+        if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
                return true;
        mutex_lock(&mdsc->mutex);
@@ -3071,8 +3153,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 {
        struct ceph_mds_session *session;
        int i;
-        struct ceph_client *client = mdsc->client;
+        struct ceph_fs_client *fsc = mdsc->fsc;
-        unsigned long timeout = client->mount_args->mount_timeout * HZ;
+        unsigned long timeout = fsc->client->options->mount_timeout * HZ;
        dout("close_sessions\n");
@@ -3119,7 +3201,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
        dout("stopped\n");
 }
-void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
+static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
 {
        dout("stop\n");
        cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
@@ -3129,6 +3211,15 @@ void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
        ceph_caps_finalize(mdsc);
 }
+void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
+{       /* counterpart of ceph_mdsc_init(): stop the client, then free it */
+        struct ceph_mds_client *mdsc = fsc->mdsc;
+        ceph_mdsc_stop(mdsc);
+        fsc->mdsc = NULL;       /* drop fsc's pointer before freeing */
+        kfree(mdsc);
+}
 /*
 * handle mds map update.
@@ -3145,14 +3236,14 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
        ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
        ceph_decode_copy(&p, &fsid, sizeof(fsid));
-        if (ceph_check_fsid(mdsc->client, &fsid) < 0)
+        if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
                return;
        epoch = ceph_decode_32(&p);
        maplen = ceph_decode_32(&p);
        dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
        /* do we need it? */
-        ceph_monc_got_mdsmap(&mdsc->client->monc, epoch);
+        ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
        mutex_lock(&mdsc->mutex);
        if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
                dout("handle_map epoch %u <= our %u\n",
@@ -3176,7 +3267,7 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
        } else {
                mdsc->mdsmap = newmap;  /* first mds map */
        }
-        mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
+        mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
        __wake_requests(mdsc, &mdsc->waiting_for_map);
@@ -3277,7 +3368,7 @@ static int get_authorizer(struct ceph_connection *con,
 {
        struct ceph_mds_session *s = con->private;
        struct ceph_mds_client *mdsc = s->s_mdsc;
-        struct ceph_auth_client *ac = mdsc->client->monc.auth;
+        struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
        int ret = 0;
        if (force_new && s->s_authorizer) {
@@ -3311,7 +3402,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
 {
        struct ceph_mds_session *s = con->private;
        struct ceph_mds_client *mdsc = s->s_mdsc;
-        struct ceph_auth_client *ac = mdsc->client->monc.auth;
+        struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
        return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
 }
@@ -3320,12 +3411,12 @@ static int invalidate_authorizer(struct ceph_connection *con)
 {
        struct ceph_mds_session *s = con->private;
        struct ceph_mds_client *mdsc = s->s_mdsc;
-        struct ceph_auth_client *ac = mdsc->client->monc.auth;
+        struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
        if (ac->ops->invalidate_authorizer)
                ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
-        return ceph_monc_validate_auth(&mdsc->client->monc);
+        return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
 }
 static const struct ceph_connection_operations mds_con_ops = {
@@ -3338,7 +3429,4 @@ static const struct ceph_connection_operations mds_con_ops = {
        .peer_reset = peer_reset,
 };
 /* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index c98267ce6d2a..4e3a9cc0bba6 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -8,9 +8,9 @@
 #include <linux/rbtree.h>
 #include <linux/spinlock.h>
-#include "types.h"
+#include <linux/ceph/types.h>
-#include "messenger.h"
+#include <linux/ceph/messenger.h>
-#include "mdsmap.h"
+#include <linux/ceph/mdsmap.h>
 /*
 * Some lock dependencies:
@@ -26,7 +26,7 @@
 *
 */
-struct ceph_client;
+struct ceph_fs_client;
 struct ceph_cap;
 /*
@@ -35,6 +35,7 @@ struct ceph_cap;
 */
 struct ceph_mds_reply_info_in {
        struct ceph_mds_reply_inode *in;
+        struct ceph_dir_layout dir_layout;
        u32 symlink_len;
        char *symlink;
        u32 xattr_len;
@@ -42,26 +43,37 @@ struct ceph_mds_reply_info_in {
 };
 /*
- * parsed info about an mds reply, including information about the
+ * parsed info about an mds reply, including information about
- * target inode and/or its parent directory and dentry, and directory
+ * either: 1) the target inode and/or its parent directory and dentry,
- * contents (for readdir results).
+ * and directory contents (for readdir results), or
+ * 2) the file range lock info (for fcntl F_GETLK results).
 */
 struct ceph_mds_reply_info_parsed {
        struct ceph_mds_reply_head    *head;
+        /* trace */
        struct ceph_mds_reply_info_in diri, targeti;
        struct ceph_mds_reply_dirfrag *dirfrag;
        char                          *dname;
        u32                           dname_len;
        struct ceph_mds_reply_lease   *dlease;
-        struct ceph_mds_reply_dirfrag *dir_dir;
+        /* extra */
-        int                           dir_nr;
+        union {
-        char                          **dir_dname;
+                /* for fcntl F_GETLK results */
-        u32                           *dir_dname_len;
+                struct ceph_filelock *filelock_reply;
-        struct ceph_mds_reply_lease   **dir_dlease;
-        struct ceph_mds_reply_info_in *dir_in;
+                /* for readdir results */
-        u8                            dir_complete, dir_end;
+                struct {
+                        struct ceph_mds_reply_dirfrag *dir_dir;
+                        int                           dir_nr;
+                        char                          **dir_dname;
+                        u32                           *dir_dname_len;
+                        struct ceph_mds_reply_lease   **dir_dlease;
+                        struct ceph_mds_reply_info_in *dir_in;
+                        u8                            dir_complete, dir_end;
+                };
+        };
        /* encoded blob describing snapshot contexts for certain
           operations (e.g., open) */
@@ -154,7 +166,6 @@ struct ceph_mds_request {
        struct ceph_mds_client *r_mdsc;
        int r_op;                    /* mds op code */
-        int r_mds;
        /* operation on what? */
        struct inode *r_inode;              /* arg1 */
@@ -170,6 +181,8 @@ struct ceph_mds_request {
        union ceph_mds_request_args r_args;
        int r_fmode;        /* file mode, if expecting cap */
+        uid_t r_uid;
+        gid_t r_gid;
        /* for choosing which mds to send this request to */
        int r_direct_mode;
@@ -230,7 +243,7 @@ struct ceph_mds_request {
 * mds client state
 */
 struct ceph_mds_client {
-        struct ceph_client      *client;
+        struct ceph_fs_client  *fsc;
        struct mutex            mutex;         /* all nested structures */
        struct ceph_mdsmap      *mdsmap;
@@ -289,11 +302,6 @@ struct ceph_mds_client {
        int             caps_avail_count;    /* unused, unreserved */
        int             caps_min_count;      /* keep at least this many
                                                (unreserved) */
-#ifdef CONFIG_DEBUG_FS
-        struct dentry     *debugfs_file;
-#endif
        spinlock_t        dentry_lru_lock;
        struct list_head  dentry_lru;
        int               num_dentry;
@@ -316,10 +324,9 @@ extern void ceph_put_mds_session(struct ceph_mds_session *s);
 extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
                             struct ceph_msg *msg, int mds);
-extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
+extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
-                           struct ceph_client *client);
 extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
-extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
 extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 040be6d1150b..73b7d44e8a35 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 #include <linux/bug.h>
 #include <linux/err.h>
@@ -6,9 +6,9 @@
 #include <linux/slab.h>
 #include <linux/types.h>
-#include "mdsmap.h"
+#include <linux/ceph/mdsmap.h>
-#include "messenger.h"
+#include <linux/ceph/messenger.h>
-#include "decode.h"
+#include <linux/ceph/decode.h>
 #include "super.h"
@@ -117,7 +117,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
                }
                dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
-                     i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr),
+                     i+1, n, global_id, mds, inc,
+                     ceph_pr_addr(&addr.in_addr),
                     ceph_mds_state_name(state));
                if (mds >= 0 && mds < m->m_max_mds && state > 0) {
                        m->m_info[mds].global_id = global_id;
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
deleted file mode 100644
index 4c5cb0880bba..000000000000
--- a/fs/ceph/mdsmap.h
+++ /dev/null
@@ -1,62 +0,0 @@
-#ifndef _FS_CEPH_MDSMAP_H
-#define _FS_CEPH_MDSMAP_H
-#include "types.h"
-/*
- * mds map - describe servers in the mds cluster.
- *
- * we limit fields to those the client actually xcares about
- */
-struct ceph_mds_info {
-        u64 global_id;
-        struct ceph_entity_addr addr;
-        s32 state;
-        int num_export_targets;
-        bool laggy;
-        u32 *export_targets;
-};
-struct ceph_mdsmap {
-        u32 m_epoch, m_client_epoch, m_last_failure;
-        u32 m_root;
-        u32 m_session_timeout;          /* seconds */
-        u32 m_session_autoclose;        /* seconds */
-        u64 m_max_file_size;
-        u32 m_max_mds;                  /* size of m_addr, m_state arrays */
-        struct ceph_mds_info *m_info;
-        /* which object pools file data can be stored in */
-        int m_num_data_pg_pools;
-        u32 *m_data_pg_pools;
-        u32 m_cas_pg_pool;
-};
-static inline struct ceph_entity_addr *
-ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
-{
-        if (w >= m->m_max_mds)
-                return NULL;
-        return &m->m_info[w].addr;
-}
-static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
-{
-        BUG_ON(w < 0);
-        if (w >= m->m_max_mds)
-                return CEPH_MDS_STATE_DNE;
-        return m->m_info[w].state;
-}
-static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
-{
-        if (w >= 0 && w < m->m_max_mds)
-                return m->m_info[w].laggy;
-        return false;
-}
-extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
-extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
-extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
-#endif
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
deleted file mode 100644
index 2502d76fcec1..000000000000
--- a/fs/ceph/messenger.c
+++ /dev/null
@@ -1,2277 +0,0 @@
-#include "ceph_debug.h"
-#include <linux/crc32c.h>
-#include <linux/ctype.h>
-#include <linux/highmem.h>
-#include <linux/inet.h>
-#include <linux/kthread.h>
-#include <linux/net.h>
-#include <linux/slab.h>
-#include <linux/socket.h>
-#include <linux/string.h>
-#include <net/tcp.h>
-#include "super.h"
-#include "messenger.h"
-#include "decode.h"
-#include "pagelist.h"
-/*
- * Ceph uses the messenger to exchange ceph_msg messages with other
- * hosts in the system.  The messenger provides ordered and reliable
- * delivery.  We tolerate TCP disconnects by reconnecting (with
- * exponential backoff) in the case of a fault (disconnection, bad
- * crc, protocol error).  Acks allow sent messages to be discarded by
- * the sender.
- */
-/* static tag bytes (protocol control messages) */
-static char tag_msg = CEPH_MSGR_TAG_MSG;
-static char tag_ack = CEPH_MSGR_TAG_ACK;
-static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
-#ifdef CONFIG_LOCKDEP
-static struct lock_class_key socket_class;
-#endif
-static void queue_con(struct ceph_connection *con);
-static void con_work(struct work_struct *);
-static void ceph_fault(struct ceph_connection *con);
-/*
- * nicely render a sockaddr as a string.
- */
-#define MAX_ADDR_STR 20
-#define MAX_ADDR_STR_LEN 60
-static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
-static DEFINE_SPINLOCK(addr_str_lock);
-static int last_addr_str;
-const char *pr_addr(const struct sockaddr_storage *ss)
-{
-        int i;
-        char *s;
-        struct sockaddr_in *in4 = (void *)ss;
-        struct sockaddr_in6 *in6 = (void *)ss;
-        spin_lock(&addr_str_lock);
-        i = last_addr_str++;
-        if (last_addr_str == MAX_ADDR_STR)
-                last_addr_str = 0;
-        spin_unlock(&addr_str_lock);
-        s = addr_str[i];
-        switch (ss->ss_family) {
-        case AF_INET:
-                snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
-                         (unsigned int)ntohs(in4->sin_port));
-                break;
-        case AF_INET6:
-                snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
-                         (unsigned int)ntohs(in6->sin6_port));
-                break;
-        default:
-                sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
-        }
-        return s;
-}
-static void encode_my_addr(struct ceph_messenger *msgr)
-{
-        memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
-        ceph_encode_addr(&msgr->my_enc_addr);
-}
-/*
- * work queue for all reading and writing to/from the socket.
- */
-struct workqueue_struct *ceph_msgr_wq;
-int __init ceph_msgr_init(void)
-{
-        ceph_msgr_wq = create_workqueue("ceph-msgr");
-        if (IS_ERR(ceph_msgr_wq)) {
-                int ret = PTR_ERR(ceph_msgr_wq);
-                pr_err("msgr_init failed to create workqueue: %d\n", ret);
-                ceph_msgr_wq = NULL;
-                return ret;
-        }
-        return 0;
-}
-void ceph_msgr_exit(void)
-{
-        destroy_workqueue(ceph_msgr_wq);
-}
-void ceph_msgr_flush(void)
-{
-        flush_workqueue(ceph_msgr_wq);
-}
-/*
- * socket callback functions
- */
-/* data available on socket, or listen socket received a connect */
-static void ceph_data_ready(struct sock *sk, int count_unused)
-{
-        struct ceph_connection *con =
-                (struct ceph_connection *)sk->sk_user_data;
-        if (sk->sk_state != TCP_CLOSE_WAIT) {
-                dout("ceph_data_ready on %p state = %lu, queueing work\n",
-                     con, con->state);
-                queue_con(con);
-        }
-}
-/* socket has buffer space for writing */
-static void ceph_write_space(struct sock *sk)
-{
-        struct ceph_connection *con =
-                (struct ceph_connection *)sk->sk_user_data;
-        /* only queue to workqueue if there is data we want to write. */
-        if (test_bit(WRITE_PENDING, &con->state)) {
-                dout("ceph_write_space %p queueing write work\n", con);
-                queue_con(con);
-        } else {
-                dout("ceph_write_space %p nothing to write\n", con);
-        }
-        /* since we have our own write_space, clear the SOCK_NOSPACE flag */
-        clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-}
-/* socket's state has changed */
-static void ceph_state_change(struct sock *sk)
-{
-        struct ceph_connection *con =
-                (struct ceph_connection *)sk->sk_user_data;
-        dout("ceph_state_change %p state = %lu sk_state = %u\n",
-             con, con->state, sk->sk_state);
-        if (test_bit(CLOSED, &con->state))
-                return;
-        switch (sk->sk_state) {
-        case TCP_CLOSE:
-                dout("ceph_state_change TCP_CLOSE\n");
-        case TCP_CLOSE_WAIT:
-                dout("ceph_state_change TCP_CLOSE_WAIT\n");
-                if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
-                        if (test_bit(CONNECTING, &con->state))
-                                con->error_msg = "connection failed";
-                        else
-                                con->error_msg = "socket closed";
-                        queue_con(con);
-                }
-                break;
-        case TCP_ESTABLISHED:
-                dout("ceph_state_change TCP_ESTABLISHED\n");
-                queue_con(con);
-                break;
-        }
-}
-/*
- * set up socket callbacks
- */
-static void set_sock_callbacks(struct socket *sock,
-                               struct ceph_connection *con)
-{
-        struct sock *sk = sock->sk;
-        sk->sk_user_data = (void *)con;
-        sk->sk_data_ready = ceph_data_ready;
-        sk->sk_write_space = ceph_write_space;
-        sk->sk_state_change = ceph_state_change;
-}
-/*
- * socket helpers
- */
-/*
- * initiate connection to a remote socket.
- */
-static struct socket *ceph_tcp_connect(struct ceph_connection *con)
-{
-        struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
-        struct socket *sock;
-        int ret;
-        BUG_ON(con->sock);
-        ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
-                               IPPROTO_TCP, &sock);
-        if (ret)
-                return ERR_PTR(ret);
-        con->sock = sock;
-        sock->sk->sk_allocation = GFP_NOFS;
-#ifdef CONFIG_LOCKDEP
-        lockdep_set_class(&sock->sk->sk_lock, &socket_class);
-#endif
-        set_sock_callbacks(sock, con);
-        dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
-        ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
-                                 O_NONBLOCK);
-        if (ret == -EINPROGRESS) {
-                dout("connect %s EINPROGRESS sk_state = %u\n",
-                     pr_addr(&con->peer_addr.in_addr),
-                     sock->sk->sk_state);
-                ret = 0;
-        }
-        if (ret < 0) {
-                pr_err("connect %s error %d\n",
-                       pr_addr(&con->peer_addr.in_addr), ret);
-                sock_release(sock);
-                con->sock = NULL;
-                con->error_msg = "connect error";
-        }
-        if (ret < 0)
-                return ERR_PTR(ret);
-        return sock;
-}
-static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
-{
-        struct kvec iov = {buf, len};
-        struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
-        return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
-}
-/*
- * write something.  @more is true if caller will be sending more data
- * shortly.
- */
-static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
-                     size_t kvlen, size_t len, int more)
-{
-        struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
-        if (more)
-                msg.msg_flags |= MSG_MORE;
-        else
-                msg.msg_flags |= MSG_EOR;  /* superfluous, but what the hell */
-        return kernel_sendmsg(sock, &msg, iov, kvlen, len);
-}
-/*
- * Shutdown/close the socket for the given connection.
- */
-static int con_close_socket(struct ceph_connection *con)
-{
-        int rc;
-        dout("con_close_socket on %p sock %p\n", con, con->sock);
-        if (!con->sock)
-                return 0;
-        set_bit(SOCK_CLOSED, &con->state);
-        rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
-        sock_release(con->sock);
-        con->sock = NULL;
-        clear_bit(SOCK_CLOSED, &con->state);
-        return rc;
-}
-/*
- * Reset a connection.  Discard all incoming and outgoing messages
- * and clear *_seq state.
- */
-static void ceph_msg_remove(struct ceph_msg *msg)
-{
-        list_del_init(&msg->list_head);
-        ceph_msg_put(msg);
-}
-static void ceph_msg_remove_list(struct list_head *head)
-{
-        while (!list_empty(head)) {
-                struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
-                                                        list_head);
-                ceph_msg_remove(msg);
-        }
-}
-static void reset_connection(struct ceph_connection *con)
-{
-        /* reset connection, out_queue, msg_ and connect_seq */
-        /* discard existing out_queue and msg_seq */
-        ceph_msg_remove_list(&con->out_queue);
-        ceph_msg_remove_list(&con->out_sent);
-        if (con->in_msg) {
-                ceph_msg_put(con->in_msg);
-                con->in_msg = NULL;
-        }
-        con->connect_seq = 0;
-        con->out_seq = 0;
-        if (con->out_msg) {
-                ceph_msg_put(con->out_msg);
-                con->out_msg = NULL;
-        }
-        con->out_keepalive_pending = false;
-        con->in_seq = 0;
-        con->in_seq_acked = 0;
-}
-/*
- * mark a peer down.  drop any open connections.
- */
-void ceph_con_close(struct ceph_connection *con)
-{
-        dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
-        set_bit(CLOSED, &con->state);  /* in case there's queued work */
-        clear_bit(STANDBY, &con->state);  /* avoid connect_seq bump */
-        clear_bit(LOSSYTX, &con->state);  /* so we retry next connect */
-        clear_bit(KEEPALIVE_PENDING, &con->state);
-        clear_bit(WRITE_PENDING, &con->state);
-        mutex_lock(&con->mutex);
-        reset_connection(con);
-        con->peer_global_seq = 0;
-        cancel_delayed_work(&con->work);
-        mutex_unlock(&con->mutex);
-        queue_con(con);
-}
-/*
- * Reopen a closed connection, with a new peer address.
- */
-void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
-{
-        dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
-        set_bit(OPENING, &con->state);
-        clear_bit(CLOSED, &con->state);
-        memcpy(&con->peer_addr, addr, sizeof(*addr));
-        con->delay = 0;      /* reset backoff memory */
-        queue_con(con);
-}
-/*
- * return true if this connection ever successfully opened
- */
-bool ceph_con_opened(struct ceph_connection *con)
-{
-        return con->connect_seq > 0;
-}
-/*
- * generic get/put
- */
-struct ceph_connection *ceph_con_get(struct ceph_connection *con)
-{
-        dout("con_get %p nref = %d -> %d\n", con,
-             atomic_read(&con->nref), atomic_read(&con->nref) + 1);
-        if (atomic_inc_not_zero(&con->nref))
-                return con;
-        return NULL;
-}
-void ceph_con_put(struct ceph_connection *con)
-{
-        dout("con_put %p nref = %d -> %d\n", con,
-             atomic_read(&con->nref), atomic_read(&con->nref) - 1);
-        BUG_ON(atomic_read(&con->nref) == 0);
-        if (atomic_dec_and_test(&con->nref)) {
-                BUG_ON(con->sock);
-                kfree(con);
-        }
-}
-/*
- * initialize a new connection.
- */
-void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
-{
-        dout("con_init %p\n", con);
-        memset(con, 0, sizeof(*con));
-        atomic_set(&con->nref, 1);
-        con->msgr = msgr;
-        mutex_init(&con->mutex);
-        INIT_LIST_HEAD(&con->out_queue);
-        INIT_LIST_HEAD(&con->out_sent);
-        INIT_DELAYED_WORK(&con->work, con_work);
-}
-/*
- * We maintain a global counter to order connection attempts.  Get
- * a unique seq greater than @gt.
- */
-static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
-{
-        u32 ret;
-        spin_lock(&msgr->global_seq_lock);
-        if (msgr->global_seq < gt)
-                msgr->global_seq = gt;
-        ret = ++msgr->global_seq;
-        spin_unlock(&msgr->global_seq_lock);
-        return ret;
-}
-/*
- * Prepare footer for currently outgoing message, and finish things
- * off.  Assumes out_kvec* are already valid.. we just add on to the end.
- */
-static void prepare_write_message_footer(struct ceph_connection *con, int v)
-{
-        struct ceph_msg *m = con->out_msg;
-        dout("prepare_write_message_footer %p\n", con);
-        con->out_kvec_is_msg = true;
-        con->out_kvec[v].iov_base = &m->footer;
-        con->out_kvec[v].iov_len = sizeof(m->footer);
-        con->out_kvec_bytes += sizeof(m->footer);
-        con->out_kvec_left++;
-        con->out_more = m->more_to_follow;
-        con->out_msg_done = true;
-}
-/*
- * Prepare headers for the next outgoing message.
- */
-static void prepare_write_message(struct ceph_connection *con)
-{
-        struct ceph_msg *m;
-        int v = 0;
-        con->out_kvec_bytes = 0;
-        con->out_kvec_is_msg = true;
-        con->out_msg_done = false;
-        /* Sneak an ack in there first?  If we can get it into the same
-         * TCP packet that's a good thing. */
-        if (con->in_seq > con->in_seq_acked) {
-                con->in_seq_acked = con->in_seq;
-                con->out_kvec[v].iov_base = &tag_ack;
-                con->out_kvec[v++].iov_len = 1;
-                con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
-                con->out_kvec[v].iov_base = &con->out_temp_ack;
-                con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
-                con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
-        }
-        m = list_first_entry(&con->out_queue,
-                       struct ceph_msg, list_head);
-        con->out_msg = m;
-        if (test_bit(LOSSYTX, &con->state)) {
-                list_del_init(&m->list_head);
-        } else {
-                /* put message on sent list */
-                ceph_msg_get(m);
-                list_move_tail(&m->list_head, &con->out_sent);
-        }
-        /*
-         * only assign outgoing seq # if we haven't sent this message
-         * yet.  if it is requeued, resend with it's original seq.
-         */
-        if (m->needs_out_seq) {
-                m->hdr.seq = cpu_to_le64(++con->out_seq);
-                m->needs_out_seq = false;
-        }
-        dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
-             m, con->out_seq, le16_to_cpu(m->hdr.type),
-             le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
-             le32_to_cpu(m->hdr.data_len),
-             m->nr_pages);
-        BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
-        /* tag + hdr + front + middle */
-        con->out_kvec[v].iov_base = &tag_msg;
-        con->out_kvec[v++].iov_len = 1;
-        con->out_kvec[v].iov_base = &m->hdr;
-        con->out_kvec[v++].iov_len = sizeof(m->hdr);
-        con->out_kvec[v++] = m->front;
-        if (m->middle)
-                con->out_kvec[v++] = m->middle->vec;
-        con->out_kvec_left = v;
-        con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
-                (m->middle ? m->middle->vec.iov_len : 0);
-        con->out_kvec_cur = con->out_kvec;
-        /* fill in crc (except data pages), footer */
-        con->out_msg->hdr.crc =
-                cpu_to_le32(crc32c(0, (void *)&m->hdr,
-                                      sizeof(m->hdr) - sizeof(m->hdr.crc)));
-        con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
-        con->out_msg->footer.front_crc =
-                cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
-        if (m->middle)
-                con->out_msg->footer.middle_crc =
-                        cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
-                                           m->middle->vec.iov_len));
-        else
-                con->out_msg->footer.middle_crc = 0;
-        con->out_msg->footer.data_crc = 0;
-        dout("prepare_write_message front_crc %u data_crc %u\n",
-             le32_to_cpu(con->out_msg->footer.front_crc),
-             le32_to_cpu(con->out_msg->footer.middle_crc));
-        /* is there a data payload? */
-        if (le32_to_cpu(m->hdr.data_len) > 0) {
-                /* initialize page iterator */
-                con->out_msg_pos.page = 0;
-                con->out_msg_pos.page_pos =
-                        le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
-                con->out_msg_pos.data_pos = 0;
-                con->out_msg_pos.did_page_crc = 0;
-                con->out_more = 1;  /* data + footer will follow */
-        } else {
-                /* no, queue up footer too and be done */
-                prepare_write_message_footer(con, v);
-        }
-        set_bit(WRITE_PENDING, &con->state);
-}
-/*
- * Prepare an ack.
- */
-static void prepare_write_ack(struct ceph_connection *con)
-{
-        dout("prepare_write_ack %p %llu -> %llu\n", con,
-             con->in_seq_acked, con->in_seq);
-        con->in_seq_acked = con->in_seq;
-        con->out_kvec[0].iov_base = &tag_ack;
-        con->out_kvec[0].iov_len = 1;
-        con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
-        con->out_kvec[1].iov_base = &con->out_temp_ack;
-        con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
-        con->out_kvec_left = 2;
-        con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
-        con->out_kvec_cur = con->out_kvec;
-        con->out_more = 1;  /* more will follow.. eventually.. */
-        set_bit(WRITE_PENDING, &con->state);
-}
-/*
- * Prepare to write keepalive byte.
- */
-static void prepare_write_keepalive(struct ceph_connection *con)
-{
-        dout("prepare_write_keepalive %p\n", con);
-        con->out_kvec[0].iov_base = &tag_keepalive;
-        con->out_kvec[0].iov_len = 1;
-        con->out_kvec_left = 1;
-        con->out_kvec_bytes = 1;
-        con->out_kvec_cur = con->out_kvec;
-        set_bit(WRITE_PENDING, &con->state);
-}
-/*
- * Connection negotiation.
- */
-static void prepare_connect_authorizer(struct ceph_connection *con)
-{
-        void *auth_buf;
-        int auth_len = 0;
-        int auth_protocol = 0;
-        mutex_unlock(&con->mutex);
-        if (con->ops->get_authorizer)
-                con->ops->get_authorizer(con, &auth_buf, &auth_len,
-                                         &auth_protocol, &con->auth_reply_buf,
-                                         &con->auth_reply_buf_len,
-                                         con->auth_retry);
-        mutex_lock(&con->mutex);
-        con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
-        con->out_connect.authorizer_len = cpu_to_le32(auth_len);
-        con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
-        con->out_kvec[con->out_kvec_left].iov_len = auth_len;
-        con->out_kvec_left++;
-        con->out_kvec_bytes += auth_len;
-}
-/*
- * We connected to a peer and are saying hello.
- */
-static void prepare_write_banner(struct ceph_messenger *msgr,
-                                 struct ceph_connection *con)
-{
-        int len = strlen(CEPH_BANNER);
-        con->out_kvec[0].iov_base = CEPH_BANNER;
-        con->out_kvec[0].iov_len = len;
-        con->out_kvec[1].iov_base = &msgr->my_enc_addr;
-        con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
-        con->out_kvec_left = 2;
-        con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
-        con->out_kvec_cur = con->out_kvec;
-        con->out_more = 0;
-        set_bit(WRITE_PENDING, &con->state);
-}
-static void prepare_write_connect(struct ceph_messenger *msgr,
-                                  struct ceph_connection *con,
-                                  int after_banner)
-{
-        unsigned global_seq = get_global_seq(con->msgr, 0);
-        int proto;
-        switch (con->peer_name.type) {
-        case CEPH_ENTITY_TYPE_MON:
-                proto = CEPH_MONC_PROTOCOL;
-                break;
-        case CEPH_ENTITY_TYPE_OSD:
-                proto = CEPH_OSDC_PROTOCOL;
-                break;
-        case CEPH_ENTITY_TYPE_MDS:
-                proto = CEPH_MDSC_PROTOCOL;
-                break;
-        default:
-                BUG();
-        }
-        dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
-             con->connect_seq, global_seq, proto);
-        con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED);
-        con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
-        con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
-        con->out_connect.global_seq = cpu_to_le32(global_seq);
-        con->out_connect.protocol_version = cpu_to_le32(proto);
-        con->out_connect.flags = 0;
-        if (!after_banner) {
-                con->out_kvec_left = 0;
-                con->out_kvec_bytes = 0;
-        }
-        con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
-        con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
-        con->out_kvec_left++;
-        con->out_kvec_bytes += sizeof(con->out_connect);
-        con->out_kvec_cur = con->out_kvec;
-        con->out_more = 0;
-        set_bit(WRITE_PENDING, &con->state);
-        prepare_connect_authorizer(con);
-}
-/*
- * write as much of pending kvecs to the socket as we can.
- *  1 -> done
- *  0 -> socket full, but more to do
- * <0 -> error
- */
-static int write_partial_kvec(struct ceph_connection *con)
-{
-        int ret;
-        dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
-        while (con->out_kvec_bytes > 0) {
-                ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
-                                       con->out_kvec_left, con->out_kvec_bytes,
-                                       con->out_more);
-                if (ret <= 0)
-                        goto out;
-                con->out_kvec_bytes -= ret;
-                if (con->out_kvec_bytes == 0)
-                        break;            /* done */
-                while (ret > 0) {
-                        if (ret >= con->out_kvec_cur->iov_len) {
-                                ret -= con->out_kvec_cur->iov_len;
-                                con->out_kvec_cur++;
-                                con->out_kvec_left--;
-                        } else {
-                                con->out_kvec_cur->iov_len -= ret;
-                                con->out_kvec_cur->iov_base += ret;
-                                ret = 0;
-                                break;
-                        }
-                }
-        }
-        con->out_kvec_left = 0;
-        con->out_kvec_is_msg = false;
-        ret = 1;
-out:
-        dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
-             con->out_kvec_bytes, con->out_kvec_left, ret);
-        return ret;  /* done! */
-}
-/*
- * Write as much message data payload as we can.  If we finish, queue
- * up the footer.
- *  1 -> done, footer is now queued in out_kvec[].
- *  0 -> socket full, but more to do
- * <0 -> error
- */
-static int write_partial_msg_pages(struct ceph_connection *con)
-{
-        struct ceph_msg *msg = con->out_msg;
-        unsigned data_len = le32_to_cpu(msg->hdr.data_len);
-        size_t len;
-        int crc = con->msgr->nocrc;
-        int ret;
-        dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
-             con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
-             con->out_msg_pos.page_pos);
-        while (con->out_msg_pos.page < con->out_msg->nr_pages) {
-                struct page *page = NULL;
-                void *kaddr = NULL;
-                /*
-                 * if we are calculating the data crc (the default), we need
-                 * to map the page.  if our pages[] has been revoked, use the
-                 * zero page.
-                 */
-                if (msg->pages) {
-                        page = msg->pages[con->out_msg_pos.page];
-                        if (crc)
-                                kaddr = kmap(page);
-                } else if (msg->pagelist) {
-                        page = list_first_entry(&msg->pagelist->head,
-                                                struct page, lru);
-                        if (crc)
-                                kaddr = kmap(page);
-                } else {
-                        page = con->msgr->zero_page;
-                        if (crc)
-                                kaddr = page_address(con->msgr->zero_page);
-                }
-                len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
-                          (int)(data_len - con->out_msg_pos.data_pos));
-                if (crc && !con->out_msg_pos.did_page_crc) {
-                        void *base = kaddr + con->out_msg_pos.page_pos;
-                        u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
-                        BUG_ON(kaddr == NULL);
-                        con->out_msg->footer.data_crc =
-                                cpu_to_le32(crc32c(tmpcrc, base, len));
-                        con->out_msg_pos.did_page_crc = 1;
-                }
-                ret = kernel_sendpage(con->sock, page,
-                                      con->out_msg_pos.page_pos, len,
-                                      MSG_DONTWAIT | MSG_NOSIGNAL |
-                                      MSG_MORE);
-                if (crc && (msg->pages || msg->pagelist))
-                        kunmap(page);
-                if (ret <= 0)
-                        goto out;
-                con->out_msg_pos.data_pos += ret;
-                con->out_msg_pos.page_pos += ret;
-                if (ret == len) {
-                        con->out_msg_pos.page_pos = 0;
-                        con->out_msg_pos.page++;
-                        con->out_msg_pos.did_page_crc = 0;
-                        if (msg->pagelist)
-                                list_move_tail(&page->lru,
-                                               &msg->pagelist->head);
-                }
-        }
-        dout("write_partial_msg_pages %p msg %p done\n", con, msg);
-        /* prepare and queue up footer, too */
-        if (!crc)
-                con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
-        con->out_kvec_bytes = 0;
-        con->out_kvec_left = 0;
-        con->out_kvec_cur = con->out_kvec;
-        prepare_write_message_footer(con, 0);
-        ret = 1;
-out:
-        return ret;
-}
-/*
- * write some zeros
- */
-static int write_partial_skip(struct ceph_connection *con)
-{
-        int ret;
-        while (con->out_skip > 0) {
-                struct kvec iov = {
-                        .iov_base = page_address(con->msgr->zero_page),
-                        .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
-                };
-                ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
-                if (ret <= 0)
-                        goto out;
-                con->out_skip -= ret;
-        }
-        ret = 1;
-out:
-        return ret;
-}
-/*
- * Prepare to read connection handshake, or an ack.
- */
-static void prepare_read_banner(struct ceph_connection *con)
-{
-        dout("prepare_read_banner %p\n", con);
-        con->in_base_pos = 0;
-}
-static void prepare_read_connect(struct ceph_connection *con)
-{
-        dout("prepare_read_connect %p\n", con);
-        con->in_base_pos = 0;
-}
-static void prepare_read_ack(struct ceph_connection *con)
-{
-        dout("prepare_read_ack %p\n", con);
-        con->in_base_pos = 0;
-}
-static void prepare_read_tag(struct ceph_connection *con)
-{
-        dout("prepare_read_tag %p\n", con);
-        con->in_base_pos = 0;
-        con->in_tag = CEPH_MSGR_TAG_READY;
-}
-/*
- * Prepare to read a message.
- */
-static int prepare_read_message(struct ceph_connection *con)
-{
-        dout("prepare_read_message %p\n", con);
-        BUG_ON(con->in_msg != NULL);
-        con->in_base_pos = 0;
-        con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
-        return 0;
-}
-static int read_partial(struct ceph_connection *con,
-                        int *to, int size, void *object)
-{
-        *to += size;
-        while (con->in_base_pos < *to) {
-                int left = *to - con->in_base_pos;
-                int have = size - left;
-                int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
-                if (ret <= 0)
-                        return ret;
-                con->in_base_pos += ret;
-        }
-        return 1;
-}
-/*
- * Read all or part of the connect-side handshake on a new connection
- */
-static int read_partial_banner(struct ceph_connection *con)
-{
-        int ret, to = 0;
-        dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
-        /* peer's banner */
-        ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
-        if (ret <= 0)
-                goto out;
-        ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
-                           &con->actual_peer_addr);
-        if (ret <= 0)
-                goto out;
-        ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
-                           &con->peer_addr_for_me);
-        if (ret <= 0)
-                goto out;
-out:
-        return ret;
-}
-static int read_partial_connect(struct ceph_connection *con)
-{
-        int ret, to = 0;
-        dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
-        ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
-        if (ret <= 0)
-                goto out;
-        ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
-                           con->auth_reply_buf);
-        if (ret <= 0)
-                goto out;
-        dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
-             con, (int)con->in_reply.tag,
-             le32_to_cpu(con->in_reply.connect_seq),
-             le32_to_cpu(con->in_reply.global_seq));
-out:
-        return ret;
-}
-/*
- * Verify the hello banner looks okay.
- */
-static int verify_hello(struct ceph_connection *con)
-{
-        if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
-                pr_err("connect to %s got bad banner\n",
-                       pr_addr(&con->peer_addr.in_addr));
-                con->error_msg = "protocol error, bad banner";
-                return -1;
-        }
-        return 0;
-}
-static bool addr_is_blank(struct sockaddr_storage *ss)
-{
-        switch (ss->ss_family) {
-        case AF_INET:
-                return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
-        case AF_INET6:
-                return
-                     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
-                     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
-                     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
-                     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
-        }
-        return false;
-}
-static int addr_port(struct sockaddr_storage *ss)
-{
-        switch (ss->ss_family) {
-        case AF_INET:
-                return ntohs(((struct sockaddr_in *)ss)->sin_port);
-        case AF_INET6:
-                return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
-        }
-        return 0;
-}
-/*
- * Store port @p (host byte order) into @ss in network byte order.
- * Nothing is written when the address family is neither AF_INET nor
- * AF_INET6.
- */
-static void addr_set_port(struct sockaddr_storage *ss, int p)
-{
-        switch (ss->ss_family) {
-        case AF_INET:
-                ((struct sockaddr_in *)ss)->sin_port = htons(p);
-                break;  /* do not fall through into the IPv6 store */
-        case AF_INET6:
-                ((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
-                break;
-        }
-}
-/*
- * Parse an ip[:port] list into an addr array.  Use the default
- * monitor port if a port isn't specified.
- *
- * @c, @end:   the text to parse is the half-open range [c, end); it need
- *             not be NUL-terminated, so nothing at or beyond @end is read.
- * @addr:      output array with room for @max_count entries.
- * @max_count: capacity of @addr.
- * @count:     if non-NULL, receives the number of addresses parsed.
- *
- * Entries are separated by ','.  An address may be wrapped in '[' ']'
- * (needed to put a port after an IPv6 address).  A port must be in the
- * range 1..65535.
- *
- * Returns 0 on success, or -EINVAL if the text is malformed, ends with a
- * separator, or holds more than @max_count addresses.
- */
-int ceph_parse_ips(const char *c, const char *end,
-                   struct ceph_entity_addr *addr,
-                   int max_count, int *count)
-{
-        int i;
-        const char *p = c;
-        dout("parse_ips on '%.*s'\n", (int)(end-c), c);
-        for (i = 0; i < max_count; i++) {
-                const char *ipend;
-                struct sockaddr_storage *ss = &addr[i].in_addr;
-                struct sockaddr_in *in4 = (void *)ss;
-                struct sockaddr_in6 *in6 = (void *)ss;
-                int port;
-                char delim = ',';
-                /* p may equal end here (e.g. after a trailing ',') */
-                if (p < end && *p == '[') {
-                        delim = ']';
-                        p++;
-                }
-                memset(ss, 0, sizeof(*ss));
-                if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
-                             delim, &ipend))
-                        ss->ss_family = AF_INET;
-                else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
-                                  delim, &ipend))
-                        ss->ss_family = AF_INET6;
-                else
-                        goto bad;
-                p = ipend;
-                if (delim == ']') {
-                        /* the parser may have stopped at end, not at ']' */
-                        if (p >= end || *p != ']') {
-                                dout("missing matching ']'\n");
-                                goto bad;
-                        }
-                        p++;
-                }
-                /* port? */
-                if (p < end && *p == ':') {
-                        port = 0;
-                        p++;
-                        while (p < end && *p >= '0' && *p <= '9') {
-                                port = (port * 10) + (*p - '0');
-                                /*
-                                 * Reject as soon as the value is out of
-                                 * range so a long run of digits cannot
-                                 * overflow the accumulator.
-                                 */
-                                if (port > 65535)
-                                        goto bad;
-                                p++;
-                        }
-                        if (port == 0)
-                                goto bad;
-                } else {
-                        port = CEPH_MON_PORT;
-                }
-                addr_set_port(ss, port);
-                dout("parse_ips got %s\n", pr_addr(ss));
-                if (p == end) {
-                        /* consumed everything: i + 1 entries are filled in */
-                        if (count)
-                                *count = i + 1;
-                        return 0;
-                }
-                if (*p != ',')
-                        goto bad;
-                p++;
-        }
-        /*
-         * The loop only runs out when a separator was followed by no free
-         * slot in @addr (or @max_count was not positive); either way the
-         * list cannot be represented, so fall into the error path.
-         */
-bad:
-        pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
-        return -EINVAL;
-}
-/*
- * Handle a fully received banner: validate it, decode the two addresses
- * that came with it, make sure we reached the intended peer, adopt the
- * address the peer saw us as if ours is still blank, and then move on to
- * reading the connect reply.
- *
- * Returns 0 on success, -1 if the banner is bad or the peer is not the
- * one we meant to reach.
- */
-static int process_banner(struct ceph_connection *con)
-{
-        struct sockaddr_storage *my_ss = &con->msgr->inst.addr.in_addr;
-        bool same_peer;
-        bool blank_same_nonce;
-        dout("process_banner on %p\n", con);
-        if (verify_hello(con) < 0)
-                return -1;
-        ceph_decode_addr(&con->actual_peer_addr);
-        ceph_decode_addr(&con->peer_addr_for_me);
-        /*
-         * The peer must be who we asked for.  It may not know its own ip
-         * yet, though: a blank address with a matching nonce is accepted.
-         */
-        same_peer = memcmp(&con->peer_addr, &con->actual_peer_addr,
-                           sizeof(con->peer_addr)) == 0;
-        blank_same_nonce = addr_is_blank(&con->actual_peer_addr.in_addr) &&
-                con->actual_peer_addr.nonce == con->peer_addr.nonce;
-        if (!same_peer && !blank_same_nonce) {
-                pr_warning("wrong peer, want %s/%d, got %s/%d\n",
-                           pr_addr(&con->peer_addr.in_addr),
-                           (int)le32_to_cpu(con->peer_addr.nonce),
-                           pr_addr(&con->actual_peer_addr.in_addr),
-                           (int)le32_to_cpu(con->actual_peer_addr.nonce));
-                con->error_msg = "wrong peer at address";
-                return -1;
-        }
-        /* our own address is still blank: take the one the peer reported */
-        if (addr_is_blank(my_ss)) {
-                /* keep whatever port we already had */
-                int saved_port = addr_port(my_ss);
-                memcpy(my_ss, &con->peer_addr_for_me.in_addr,
-                       sizeof(con->peer_addr_for_me.in_addr));
-                addr_set_port(my_ss, saved_port);
-                encode_my_addr(con->msgr);
-                dout("process_banner learned my addr is %s\n",
-                     pr_addr(my_ss));
-        }
-        set_bit(NEGOTIATING, &con->state);
-        prepare_read_connect(con);
-        return 0;
-}
-/*
- * Give up on a connection after a protocol-level failure: reset it, mark
- * it CLOSED, and invoke the optional bad_proto callback.
- *
- * con->mutex is released around the callback and re-taken before
- * returning.
- * NOTE(review): this implies the caller holds con->mutex on entry --
- * confirm against the callers.
- */
-static void fail_protocol(struct ceph_connection *con)
-{
-        reset_connection(con);
-        set_bit(CLOSED, &con->state);  /* in case there's queued work */
-        mutex_unlock(&con->mutex);
-        if (con->ops->bad_proto)
-                con->ops->bad_proto(con);
-        mutex_lock(&con->mutex);
-}
-/*
- * Act on the connect reply held in con->in_reply, dispatching on its tag.
- *
- * Returns 0 when the reply was handled (either the connection is ready or
- * another connect attempt has been prepared), and -1 on a fatal mismatch,
- * repeated authorizer failure, WAIT, or an unrecognised tag.
- *
- * Some branches drop and re-take con->mutex around a callback.
- * NOTE(review): this implies the caller holds con->mutex -- confirm.
- */
-static int process_connect(struct ceph_connection *con)
-{
-        u64 sup_feat = CEPH_FEATURE_SUPPORTED;
-        u64 req_feat = CEPH_FEATURE_REQUIRED;
-        u64 server_feat = le64_to_cpu(con->in_reply.features);
-        dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
-        switch (con->in_reply.tag) {
-        case CEPH_MSGR_TAG_FEATURES:
-                pr_err("%s%lld %s feature set mismatch,"
-                       " my %llx < server's %llx, missing %llx\n",
-                       ENTITY_NAME(con->peer_name),
-                       pr_addr(&con->peer_addr.in_addr),
-                       sup_feat, server_feat, server_feat & ~sup_feat);
-                con->error_msg = "missing required protocol features";
-                fail_protocol(con);
-                return -1;
-        case CEPH_MSGR_TAG_BADPROTOVER:
-                pr_err("%s%lld %s protocol version mismatch,"
-                       " my %d != server's %d\n",
-                       ENTITY_NAME(con->peer_name),
-                       pr_addr(&con->peer_addr.in_addr),
-                       le32_to_cpu(con->out_connect.protocol_version),
-                       le32_to_cpu(con->in_reply.protocol_version));
-                con->error_msg = "protocol version mismatch";
-                fail_protocol(con);
-                return -1;
-        case CEPH_MSGR_TAG_BADAUTHORIZER:
-                con->auth_retry++;
-                dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
-                     con->auth_retry);
-                /* second rejection in a row: give up and close */
-                if (con->auth_retry == 2) {
-                        con->error_msg = "connect authorization failure";
-                        reset_connection(con);
-                        set_bit(CLOSED, &con->state);
-                        return -1;
-                }
-                con->auth_retry = 1;
-                prepare_write_connect(con->msgr, con, 0);
-                prepare_read_connect(con);
-                break;
-        case CEPH_MSGR_TAG_RESETSESSION:
-                /*
-                 * If we connected with a large connect_seq but the peer
-                 * has no record of a session with us (no connection, or
-                 * connect_seq == 0), they will send RESETSESSION to indicate
-                 * that they must have reset their session, and may have
-                 * dropped messages.
-                 */
-                dout("process_connect got RESET peer seq %u\n",
-                     le32_to_cpu(con->in_connect.connect_seq));
-                pr_err("%s%lld %s connection reset\n",
-                       ENTITY_NAME(con->peer_name),
-                       pr_addr(&con->peer_addr.in_addr));
-                reset_connection(con);
-                prepare_write_connect(con->msgr, con, 0);
-                prepare_read_connect(con);
-                /* Tell ceph about it. */
-                mutex_unlock(&con->mutex);
-                pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
-                if (con->ops->peer_reset)
-                        con->ops->peer_reset(con);
-                mutex_lock(&con->mutex);
-                break;
-        case CEPH_MSGR_TAG_RETRY_SESSION:
-                /*
-                 * If we sent a smaller connect_seq than the peer has, try
-                 * again with a larger value.
-                 */
-                dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
-                     le32_to_cpu(con->out_connect.connect_seq),
-                     le32_to_cpu(con->in_connect.connect_seq));
-                con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
-                prepare_write_connect(con->msgr, con, 0);
-                prepare_read_connect(con);
-                break;
-        case CEPH_MSGR_TAG_RETRY_GLOBAL:
-                /*
-                 * If we sent a smaller global_seq than the peer has, try
-                 * again with a larger value.
-                 */
-                dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
-                     con->peer_global_seq,
-                     le32_to_cpu(con->in_connect.global_seq));
-                get_global_seq(con->msgr,
-                               le32_to_cpu(con->in_connect.global_seq));
-                prepare_write_connect(con->msgr, con, 0);
-                prepare_read_connect(con);
-                break;
-        case CEPH_MSGR_TAG_READY:
-                /* the peer must offer every feature we require */
-                if (req_feat & ~server_feat) {
-                        pr_err("%s%lld %s protocol feature mismatch,"
-                               " my required %llx > server's %llx, need %llx\n",
-                               ENTITY_NAME(con->peer_name),
-                               pr_addr(&con->peer_addr.in_addr),
-                               req_feat, server_feat, req_feat & ~server_feat);
-                        con->error_msg = "missing required protocol features";
-                        fail_protocol(con);
-                        return -1;
-                }
-                clear_bit(CONNECTING, &con->state);
-                con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
-                con->connect_seq++;
-                con->peer_features = server_feat;
-                dout("process_connect got READY gseq %d cseq %d (%d)\n",
-                     con->peer_global_seq,
-                     le32_to_cpu(con->in_reply.connect_seq),
-                     con->connect_seq);
-                WARN_ON(con->connect_seq !=
-                        le32_to_cpu(con->in_reply.connect_seq));
-                if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
-                        set_bit(LOSSYTX, &con->state);
-                prepare_read_tag(con);
-                break;
-        case CEPH_MSGR_TAG_WAIT:
-                /*
-                 * If there is a connection race (we are opening
-                 * connections to each other), one of us may just have
-                 * to WAIT.  This shouldn't happen if we are the
-                 * client.
-                 */
-                pr_err("process_connect peer connecting WAIT\n");
-                /* no break: WAIT continues into the default error path */
-        default:
-                pr_err("connect protocol error, will retry\n");
-                con->error_msg = "protocol error, garbage tag during connect";
-                return -1;
-        }
-        return 0;
-}
-/*
- * Read (part of) an ack into con->in_temp_ack.  The result of
- * read_partial() is passed straight back to the caller.
- */
-static int read_partial_ack(struct ceph_connection *con)
-{
-        int end = 0;
-        int ret;
-        ret = read_partial(con, &end, sizeof(con->in_temp_ack),
-                           &con->in_temp_ack);
-        return ret;
-}
-/*
- * The peer acknowledged everything up to con->in_temp_ack: drop each
- * message at the head of out_sent whose seq is covered by the ack, then
- * get ready to read the next tag.
- */
-static void process_ack(struct ceph_connection *con)
-{
-        u64 acked = le64_to_cpu(con->in_temp_ack);
-        for (;;) {
-                struct ceph_msg *sent;
-                u64 sent_seq;
-                if (list_empty(&con->out_sent))
-                        break;
-                sent = list_first_entry(&con->out_sent, struct ceph_msg,
-                                        list_head);
-                sent_seq = le64_to_cpu(sent->hdr.seq);
-                /* first message not covered by the ack: stop here */
-                if (sent_seq > acked)
-                        break;
-                dout("got ack for seq %llu type %d at %p\n", sent_seq,
-                     le16_to_cpu(sent->hdr.type), sent);
-                ceph_msg_remove(sent);
-        }
-        prepare_read_tag(con);
-}
-/*
- * Fill @section from the socket until it holds @sec_len bytes.
- * section->iov_len tracks how much has arrived so far, so the call can be
- * repeated after a short read.  Once the section is complete its crc32c
- * is stored in *@crc.
- *
- * Returns 1 when the section is complete, otherwise the (<= 0) result of
- * ceph_tcp_recvmsg().
- */
-static int read_partial_message_section(struct ceph_connection *con,
-                                        struct kvec *section,
-                                        unsigned int sec_len, u32 *crc)
-{
-        BUG_ON(!section);
-        for (;;) {
-                int want;
-                int got;
-                if (section->iov_len >= sec_len)
-                        return 1;
-                BUG_ON(section->iov_base == NULL);
-                want = sec_len - section->iov_len;
-                got = ceph_tcp_recvmsg(con->sock,
-                                       (char *)section->iov_base +
-                                       section->iov_len, want);
-                if (got <= 0)
-                        return got;
-                section->iov_len += got;
-                /* the crc is computed once, over the finished section */
-                if (section->iov_len == sec_len)
-                        *crc = crc32c(0, section->iov_base,
-                                      section->iov_len);
-        }
-}
-/* forward declaration: the definition is not in this part of the file */
-static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
-                                struct ceph_msg_header *hdr,
-                                int *skip);
-/*
- * read (part of) a message.
- *
- * Progress is kept on the connection (in_base_pos, in_msg, in_msg_pos and
- * the iov_len of each section), so the function can be called again after
- * a short read and resumes where it stopped.
- *
- * Returns:
- *   1        the whole message, footer included, arrived and its CRCs match
- *   0        nothing more to read right now, or the message is being
- *            skipped (in_base_pos is then negative and in_tag is READY)
- *   -EBADMSG header/front/middle/data crc mismatch, or a sequence gap
- *   -EIO     a length in the header exceeds the allowed maximum
- *   -ENOMEM  no message could be allocated
- *   other <0 whatever ceph_tcp_recvmsg() returned
- */
-static int read_partial_message(struct ceph_connection *con)
-{
-        struct ceph_msg *m = con->in_msg;
-        void *p;
-        int ret;
-        int to, left;
-        unsigned front_len, middle_len, data_len, data_off;
-        /* data crcs are computed and checked unless nocrc was requested */
-        int datacrc = !con->msgr->nocrc;
-        int skip;
-        u64 seq;
-        dout("read_partial_message con %p msg %p\n", con, m);
-        /* header */
-        while (con->in_base_pos < sizeof(con->in_hdr)) {
-                left = sizeof(con->in_hdr) - con->in_base_pos;
-                ret = ceph_tcp_recvmsg(con->sock,
-                                       (char *)&con->in_hdr + con->in_base_pos,
-                                       left);
-                if (ret <= 0)
-                        return ret;
-                con->in_base_pos += ret;
-                if (con->in_base_pos == sizeof(con->in_hdr)) {
-                        /* the crc covers the header minus its own crc field */
-                        u32 crc = crc32c(0, (void *)&con->in_hdr,
-                                 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
-                        if (crc != le32_to_cpu(con->in_hdr.crc)) {
-                                pr_err("read_partial_message bad hdr "
-                                       " crc %u != expected %u\n",
-                                       crc, le32_to_cpu(con->in_hdr.crc));
-                                return -EBADMSG;
-                        }
-                }
-        }
-        front_len = le32_to_cpu(con->in_hdr.front_len);
-        if (front_len > CEPH_MSG_MAX_FRONT_LEN)
-                return -EIO;
-        middle_len = le32_to_cpu(con->in_hdr.middle_len);
-        if (middle_len > CEPH_MSG_MAX_DATA_LEN)
-                return -EIO;
-        data_len = le32_to_cpu(con->in_hdr.data_len);
-        if (data_len > CEPH_MSG_MAX_DATA_LEN)
-                return -EIO;
-        data_off = le16_to_cpu(con->in_hdr.data_off);
-        /* verify seq# */
-        seq = le64_to_cpu(con->in_hdr.seq);
-        if ((s64)seq - (s64)con->in_seq < 1) {
-                /* not newer than what we already have: discard the payload */
-                pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
-                        ENTITY_NAME(con->peer_name),
-                        pr_addr(&con->peer_addr.in_addr),
-                        seq, con->in_seq + 1);
-                /* a negative in_base_pos is the number of bytes to discard */
-                con->in_base_pos = -front_len - middle_len - data_len -
-                        sizeof(m->footer);
-                con->in_tag = CEPH_MSGR_TAG_READY;
-                con->in_seq++;
-                return 0;
-        } else if ((s64)seq - (s64)con->in_seq > 1) {
-                pr_err("read_partial_message bad seq %lld expected %lld\n",
-                       seq, con->in_seq + 1);
-                con->error_msg = "bad message sequence # for incoming message";
-                return -EBADMSG;
-        }
-        /* allocate message? */
-        if (!con->in_msg) {
-                dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
-                     con->in_hdr.front_len, con->in_hdr.data_len);
-                skip = 0;
-                con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
-                if (skip) {
-                        /* skip this message */
-                        dout("alloc_msg said skip message\n");
-                        BUG_ON(con->in_msg);
-                        con->in_base_pos = -front_len - middle_len - data_len -
-                                sizeof(m->footer);
-                        con->in_tag = CEPH_MSGR_TAG_READY;
-                        con->in_seq++;
-                        return 0;
-                }
-                if (!con->in_msg) {
-                        con->error_msg =
-                                "error allocating memory for incoming message";
-                        return -ENOMEM;
-                }
-                m = con->in_msg;
-                m->front.iov_len = 0;    /* haven't read it yet */
-                if (m->middle)
-                        m->middle->vec.iov_len = 0;
-                con->in_msg_pos.page = 0;
-                con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
-                con->in_msg_pos.data_pos = 0;
-        }
-        /* front */
-        ret = read_partial_message_section(con, &m->front, front_len,
-                                           &con->in_front_crc);
-        if (ret <= 0)
-                return ret;
-        /* middle */
-        if (m->middle) {
-                ret = read_partial_message_section(con, &m->middle->vec,
-                                                   middle_len,
-                                                   &con->in_middle_crc);
-                if (ret <= 0)
-                        return ret;
-        }
-        /* (page) data */
-        while (con->in_msg_pos.data_pos < data_len) {
-                /* read at most up to the end of the current page */
-                left = min((int)(data_len - con->in_msg_pos.data_pos),
-                           (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
-                BUG_ON(m->pages == NULL);
-                p = kmap(m->pages[con->in_msg_pos.page]);
-                ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
-                                       left);
-                /* fold the new bytes into the crc while the page is mapped */
-                if (ret > 0 && datacrc)
-                        con->in_data_crc =
-                                crc32c(con->in_data_crc,
-                                          p + con->in_msg_pos.page_pos, ret);
-                kunmap(m->pages[con->in_msg_pos.page]);
-                if (ret <= 0)
-                        return ret;
-                con->in_msg_pos.data_pos += ret;
-                con->in_msg_pos.page_pos += ret;
-                if (con->in_msg_pos.page_pos == PAGE_SIZE) {
-                        con->in_msg_pos.page_pos = 0;
-                        con->in_msg_pos.page++;
-                }
-        }
-        /* footer */
-        to = sizeof(m->hdr) + sizeof(m->footer);
-        while (con->in_base_pos < to) {
-                left = to - con->in_base_pos;
-                ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
-                                       (con->in_base_pos - sizeof(m->hdr)),
-                                       left);
-                if (ret <= 0)
-                        return ret;
-                con->in_base_pos += ret;
-        }
-        dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
-             m, front_len, m->footer.front_crc, middle_len,
-             m->footer.middle_crc, data_len, m->footer.data_crc);
-        /* crc ok? */
-        if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
-                pr_err("read_partial_message %p front crc %u != exp. %u\n",
-                       m, con->in_front_crc,
-                       le32_to_cpu(m->footer.front_crc));
-                return -EBADMSG;
-        }
-        if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
-                pr_err("read_partial_message %p middle crc %u != exp %u\n",
-                       m, con->in_middle_crc,
-                       le32_to_cpu(m->footer.middle_crc));
-                return -EBADMSG;
-        }
-        /* the sender may also opt out of the data crc via the footer flag */
-        if (datacrc &&
-            (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
-            con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
-                pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
-                       con->in_data_crc, le32_to_cpu(m->footer.data_crc));
-                return -EBADMSG;
-        }
-        return 1; /* done! */
-}
-/*
- * Hand a completely received message to the dispatch callback.  This
- * runs in the worker thread, so the callback must not wait for other
- * incoming messages or it may deadlock.
- *
- * The message is detached from the connection and in_seq is advanced
- * before con->mutex is dropped; the mutex is re-taken once dispatch
- * returns and the connection is set up to read the next tag.
- */
-static void process_message(struct ceph_connection *con)
-{
-        struct ceph_msg *incoming = con->in_msg;
-        con->in_msg = NULL;
-        /* the first message we see tells us the peer's name */
-        if (!con->peer_name.type)
-                con->peer_name = incoming->hdr.src;
-        con->in_seq++;
-        mutex_unlock(&con->mutex);
-        dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
-             incoming, le64_to_cpu(incoming->hdr.seq),
-             ENTITY_NAME(incoming->hdr.src),
-             le16_to_cpu(incoming->hdr.type),
-             ceph_msg_type_name(le16_to_cpu(incoming->hdr.type)),
-             le32_to_cpu(incoming->hdr.front_len),
-             le32_to_cpu(incoming->hdr.data_len),
-             con->in_front_crc, con->in_middle_crc, con->in_data_crc);
-        con->ops->dispatch(con, incoming);
-        mutex_lock(&con->mutex);
-        prepare_read_tag(con);
-}
-/*
- * Write something to the socket.  Called in a worker thread when the
- * socket appears to be writeable and we have something ready to send.
- *
- * Opens the socket first if there is none, then pushes out, in order:
- * pending skip bytes, queued kvec data, and the pages of the current
- * message.  When those are drained (and we are not CONNECTING) the next
- * queued message, an ack, or a keepalive is prepared and the loop starts
- * over.
- *
- * Returns -1 only when ceph_tcp_connect() fails.  Every other path leaves
- * through the "done" label, which sets ret to 0.
- * NOTE(review): as a consequence negative results from the
- * write_partial_*() helpers are not reported to the caller -- confirm
- * that this is intended.
- */
-static int try_write(struct ceph_connection *con)
-{
-        struct ceph_messenger *msgr = con->msgr;
-        int ret = 1;
-        dout("try_write start %p state %lu nref %d\n", con, con->state,
-             atomic_read(&con->nref));
-more:
-        dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
-        /* open the socket first? */
-        if (con->sock == NULL) {
-                /*
-                 * if we were STANDBY and are reconnecting _this_
-                 * connection, bump connect_seq now.  Always bump
-                 * global_seq.
-                 */
-                if (test_and_clear_bit(STANDBY, &con->state))
-                        con->connect_seq++;
-                prepare_write_banner(msgr, con);
-                prepare_write_connect(msgr, con, 1);
-                prepare_read_banner(con);
-                set_bit(CONNECTING, &con->state);
-                clear_bit(NEGOTIATING, &con->state);
-                BUG_ON(con->in_msg);
-                con->in_tag = CEPH_MSGR_TAG_READY;
-                dout("try_write initiating connect on %p new state %lu\n",
-                     con, con->state);
-                con->sock = ceph_tcp_connect(con);
-                if (IS_ERR(con->sock)) {
-                        con->sock = NULL;
-                        con->error_msg = "connect error";
-                        ret = -1;
-                        goto out;
-                }
-        }
-more_kvec:
-        /* kvec data queued? */
-        if (con->out_skip) {
-                ret = write_partial_skip(con);
-                if (ret <= 0)
-                        goto done;
-                /*
-                 * NOTE(review): unreachable -- ret <= 0 has already left
-                 * through the goto above.
-                 */
-                if (ret < 0) {
-                        dout("try_write write_partial_skip err %d\n", ret);
-                        goto done;
-                }
-        }
-        if (con->out_kvec_left) {
-                ret = write_partial_kvec(con);
-                if (ret <= 0)
-                        goto done;
-        }
-        /* msg pages? */
-        if (con->out_msg) {
-                if (con->out_msg_done) {
-                        ceph_msg_put(con->out_msg);
-                        con->out_msg = NULL;   /* we're done with this one */
-                        goto do_next;
-                }
-                ret = write_partial_msg_pages(con);
-                if (ret == 1)
-                        goto more_kvec;  /* we need to send the footer, too! */
-                if (ret == 0)
-                        goto done;
-                if (ret < 0) {
-                        dout("try_write write_partial_msg_pages err %d\n",
-                             ret);
-                        goto done;
-                }
-        }
-do_next:
-        /* while still CONNECTING nothing new is prepared */
-        if (!test_bit(CONNECTING, &con->state)) {
-                /* is anything else pending? */
-                if (!list_empty(&con->out_queue)) {
-                        prepare_write_message(con);
-                        goto more;
-                }
-                if (con->in_seq > con->in_seq_acked) {
-                        prepare_write_ack(con);
-                        goto more;
-                }
-                if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
-                        prepare_write_keepalive(con);
-                        goto more;
-                }
-        }
-        /* Nothing to do! */
-        clear_bit(WRITE_PENDING, &con->state);
-        dout("try_write nothing else to write.\n");
-done:
-        ret = 0;        /* discards whatever value ret held on the way here */
-out:
-        dout("try_write done on %p\n", con);
-        return ret;
-}
-/*
- * Read what we can from the socket.
- *
- * Does nothing (returns 0) when there is no socket or the connection is
- * STANDBY.  Otherwise it loops: while CONNECTING it reads and processes
- * the banner and the connect reply; afterwards it discards bytes that are
- * being skipped (negative in_base_pos), reads the next tag, and reads and
- * processes messages and acks.
- *
- * Returns 0 when no more progress can be made right now, -EIO for a bad
- * crc or an io error reported by read_partial_message(), and -1 for a
- * failed banner/connect step or an unknown tag.  con->error_msg is set on
- * the -EIO and unknown-tag paths here.
- */
-static int try_read(struct ceph_connection *con)
-{
-        int ret = -1;
-        if (!con->sock)
-                return 0;
-        if (test_bit(STANDBY, &con->state))
-                return 0;
-        dout("try_read start on %p\n", con);
-more:
-        dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
-             con->in_base_pos);
-        if (test_bit(CONNECTING, &con->state)) {
-                /* the banner comes first; NEGOTIATING means it is done */
-                if (!test_bit(NEGOTIATING, &con->state)) {
-                        dout("try_read connecting\n");
-                        ret = read_partial_banner(con);
-                        if (ret <= 0)
-                                goto done;
-                        if (process_banner(con) < 0) {
-                                ret = -1;
-                                goto out;
-                        }
-                }
-                ret = read_partial_connect(con);
-                if (ret <= 0)
-                        goto done;
-                if (process_connect(con) < 0) {
-                        ret = -1;
-                        goto out;
-                }
-                goto more;
-        }
-        if (con->in_base_pos < 0) {
-                /*
-                 * skipping + discarding content.
-                 *
-                 * FIXME: there must be a better way to do this!
-                 */
-                static char buf[1024];  /* scratch buffer, contents unused */
-                int skip = min(1024, -con->in_base_pos);
-                dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
-                ret = ceph_tcp_recvmsg(con->sock, buf, skip);
-                if (ret <= 0)
-                        goto done;
-                con->in_base_pos += ret;
-                if (con->in_base_pos)
-                        goto more;
-        }
-        if (con->in_tag == CEPH_MSGR_TAG_READY) {
-                /*
-                 * what's next?
-                 */
-                ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
-                if (ret <= 0)
-                        goto done;
-                dout("try_read got tag %d\n", (int)con->in_tag);
-                switch (con->in_tag) {
-                case CEPH_MSGR_TAG_MSG:
-                        prepare_read_message(con);
-                        break;
-                case CEPH_MSGR_TAG_ACK:
-                        prepare_read_ack(con);
-                        break;
-                case CEPH_MSGR_TAG_CLOSE:
-                        set_bit(CLOSED, &con->state);   /* fixme */
-                        goto done;
-                default:
-                        goto bad_tag;
-                }
-        }
-        if (con->in_tag == CEPH_MSGR_TAG_MSG) {
-                ret = read_partial_message(con);
-                if (ret <= 0) {
-                        switch (ret) {
-                        case -EBADMSG:
-                                con->error_msg = "bad crc";
-                                ret = -EIO;
-                                goto out;
-                        case -EIO:
-                                con->error_msg = "io error";
-                                goto out;
-                        default:
-                                /* everything else ends up as a 0 return */
-                                goto done;
-                        }
-                }
-                /* tag went back to READY: nothing to dispatch this round */
-                if (con->in_tag == CEPH_MSGR_TAG_READY)
-                        goto more;
-                process_message(con);
-                goto more;
-        }
-        if (con->in_tag == CEPH_MSGR_TAG_ACK) {
-                ret = read_partial_ack(con);
-                if (ret <= 0)
-                        goto done;
-                process_ack(con);
-                goto more;
-        }
-done:
-        ret = 0;
-out:
-        dout("try_read done on %p\n", con);
-        return ret;
-bad_tag:
-        pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
-        con->error_msg = "protocol error, garbage tag";
-        ret = -1;
-        goto out;
-}
-/*
- * Atomically queue work on a connection.  A reference on @con is taken
- * first so the connection cannot be torn down underneath the worker.
- *
- * QUEUED and BUSY together make sure that only a _single_ thread works
- * on a given connection at any time while still letting all CPUs be
- * used:
- *
- *  - The worker only proceeds if it can atomically set BUSY.  It then
- *    clears QUEUED and does its thing.  When it thinks it is done it
- *    clears BUSY and rechecks QUEUED; if that was set again it loops
- *    (tries to set BUSY again).
- *
- *  - To queue work we set QUEUED _first_ and only then, if BUSY is not
- *    set, try to queue the work item.  If the connection is BUSY or the
- *    item was already queued we give up and drop our reference, but
- *    leave QUEUED set so that the worker loops if necessary.
- */
-static void queue_con(struct ceph_connection *con)
-{
-        if (test_bit(DEAD, &con->state)) {
-                dout("queue_con %p ignoring: DEAD\n",
-                     con);
-                return;
-        }
-        if (!con->ops->get(con)) {
-                dout("queue_con %p ref count 0\n", con);
-                return;
-        }
-        set_bit(QUEUED, &con->state);
-        if (test_bit(BUSY, &con->state)) {
-                /* the running worker will notice QUEUED and loop */
-                dout("queue_con %p - already BUSY\n", con);
-                con->ops->put(con);
-                return;
-        }
-        if (queue_work(ceph_msgr_wq, &con->work.work)) {
-                /* the reference taken above now belongs to the work item */
-                dout("queue_con %p\n", con);
-                return;
-        }
-        dout("queue_con %p - already queued\n", con);
-        con->ops->put(con);
-}
-/*
- * Do some work on a connection.  Drop a connection ref when we're done.
- *
- * Work-item handler: @work is embedded in a struct ceph_connection
- * (work.work).  Takes the BUSY bit, clears QUEUED, and with con->mutex
- * held closes the socket if the connection is CLOSED or OPENING, then
- * runs try_read() and try_write().  A SOCK_CLOSED flag or a negative
- * result from either goes to ceph_fault() with the mutex released.
- * After clearing BUSY it loops if QUEUED was set again in the meantime,
- * unless it has just faulted and OPENING is not set.
- */
-static void con_work(struct work_struct *work)
-{
-        struct ceph_connection *con = container_of(work, struct ceph_connection,
-                                                   work.work);
-        int backoff = 0;        /* set to 1 once this run has faulted */
-more:
-        /* someone else is already working on this connection */
-        if (test_and_set_bit(BUSY, &con->state) != 0) {
-                dout("con_work %p BUSY already set\n", con);
-                goto out;
-        }
-        dout("con_work %p start, clearing QUEUED\n", con);
-        clear_bit(QUEUED, &con->state);
-        mutex_lock(&con->mutex);
-        if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
-                dout("con_work CLOSED\n");
-                con_close_socket(con);
-                goto done;
-        }
-        if (test_and_clear_bit(OPENING, &con->state)) {
-                /* reopen w/ new peer */
-                dout("con_work OPENING\n");
-                con_close_socket(con);
-        }
-        /* short-circuit: reading/writing is skipped once one step fails */
-        if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
-            try_read(con) < 0 ||
-            try_write(con) < 0) {
-                mutex_unlock(&con->mutex);
-                backoff = 1;
-                ceph_fault(con);     /* error/fault path */
-                goto done_unlocked;
-        }
-done:
-        mutex_unlock(&con->mutex);
-done_unlocked:
-        clear_bit(BUSY, &con->state);
-        dout("con->state=%lu\n", con->state);
-        if (test_bit(QUEUED, &con->state)) {
-                /* after a fault only an OPENING request makes us loop */
-                if (!backoff || test_bit(OPENING, &con->state)) {
-                        dout("con_work %p QUEUED reset, looping\n", con);
-                        goto more;
-                }
-                dout("con_work %p QUEUED reset, but just faulted\n", con);
-                clear_bit(QUEUED, &con->state);
-        }
-        dout("con_work %p done\n", con);
-out:
-        con->ops->put(con);
-}
-/*
- * Generic error/fault handler.  A retry mechanism is used with
- * exponential backoff
- *
- * Logs the error, and unless the connection is LOSSYTX or already CLOSED,
- * closes the socket, drops any partially received message, moves sent but
- * unacked messages back onto the out queue, and then either parks the
- * connection in STANDBY or schedules a delayed retry.  In every case the
- * optional invalidate_authorizer and fault callbacks run last, without
- * con->mutex held.
- *
- * Takes con->mutex itself; the caller must not hold it.
- */
-static void ceph_fault(struct ceph_connection *con)
-{
-        pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
-               pr_addr(&con->peer_addr.in_addr), con->error_msg);
-        dout("fault %p state %lu to peer %s\n",
-             con, con->state, pr_addr(&con->peer_addr.in_addr));
-        /* lossy channels skip the requeue/retry logic entirely */
-        if (test_bit(LOSSYTX, &con->state)) {
-                dout("fault on LOSSYTX channel\n");
-                goto out;
-        }
-        mutex_lock(&con->mutex);
-        if (test_bit(CLOSED, &con->state))
-                goto out_unlock;
-        con_close_socket(con);
-        /* drop the reference on a message we were in the middle of reading */
-        if (con->in_msg) {
-                ceph_msg_put(con->in_msg);
-                con->in_msg = NULL;
-        }
-        /* Requeue anything that hasn't been acked */
-        list_splice_init(&con->out_sent, &con->out_queue);
-        /* If there are no messages in the queue, place the connection
-         * in a STANDBY state (i.e., don't try to reconnect just yet). */
-        if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
-                dout("fault setting STANDBY\n");
-                set_bit(STANDBY, &con->state);
-        } else {
-                /* retry after a delay. */
-                /* start at BASE_DELAY_INTERVAL, then double for as long as the
-                 * current delay is still below MAX_DELAY_INTERVAL */
-                if (con->delay == 0)
-                        con->delay = BASE_DELAY_INTERVAL;
-                else if (con->delay < MAX_DELAY_INTERVAL)
-                        con->delay *= 2;
-                dout("fault queueing %p delay %lu\n", con, con->delay);
-                /* reference for the delayed work; given back right away if
-                 * queue_delayed_work() returns 0 (presumably: work was
-                 * already pending -- confirm) */
-                con->ops->get(con);
-                if (queue_delayed_work(ceph_msgr_wq, &con->work,
-                                       round_jiffies_relative(con->delay)) == 0)
-                        con->ops->put(con);
-        }
-out_unlock:
-        mutex_unlock(&con->mutex);
-out:
-        /*
-         * in case we faulted due to authentication, invalidate our
-         * current tickets so that we can get new ones.
-         */
-        if (con->auth_retry && con->ops->invalidate_authorizer) {
-                dout("calling invalidate_authorizer()\n");
-                con->ops->invalidate_authorizer(con);
-        }
-        /* finally notify the owner of the connection, if it asked for it */
-        if (con->ops->fault)
-                con->ops->fault(con);
-}
-/*
- * Allocate and set up a new messenger instance.
- *
- * @myaddr: optional local address copied into the instance; may be NULL.
- *
- * Returns the messenger, or ERR_PTR(-ENOMEM) if either the messenger or
- * its zero page cannot be allocated.
- */
-struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
-{
-        struct ceph_messenger *msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
-        if (!msgr)
-                goto enomem;
-        spin_lock_init(&msgr->global_seq_lock);
-        /* the zero page is needed if a request is "canceled" while the message
-         * is being written over the socket */
-        msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
-        if (!msgr->zero_page)
-                goto free_msgr;
-        kmap(msgr->zero_page);
-        if (myaddr != NULL)
-                msgr->inst.addr = *myaddr;
-        /* the type is always reset and the nonce is always random, whether
-         * or not an address was supplied */
-        msgr->inst.addr.type = 0;
-        get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
-        encode_my_addr(msgr);
-        dout("messenger_create %p\n", msgr);
-        return msgr;
-free_msgr:
-        kfree(msgr);
-enomem:
-        return ERR_PTR(-ENOMEM);
-}
-/*
- * Tear down a messenger: unmap and free its zero page, then free the
- * messenger structure itself.
- */
-void ceph_messenger_destroy(struct ceph_messenger *msgr)
-{
-        struct page *zero = msgr->zero_page;
-        dout("destroy %p\n", msgr);
-        kunmap(zero);
-        __free_page(zero);
-        kfree(msgr);
-        /* msgr is only printed as a pointer value here, not dereferenced */
-        dout("destroyed messenger %p\n", msgr);
-}
-/*
- * Queue up an outgoing message on the given connection.
- *
- * The caller's reference on @msg is consumed: it is either dropped right
- * here (connection CLOSED) or carried by the message's place on
- * con->out_queue.  If nothing was pending for write before, the
- * connection is queued for work.
- */
-void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
-{
-        if (test_bit(CLOSED, &con->state)) {
-                dout("con_send %p closed, dropping %p\n", con, msg);
-                ceph_msg_put(msg);
-                return;
-        }
-        /* set src+dst */
-        msg->hdr.src = con->msgr->inst.name;
-        /* the header must already describe the front exactly */
-        BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
-        msg->needs_out_seq = true;
-        /* queue */
-        mutex_lock(&con->mutex);
-        /* a message may only sit on one list at a time */
-        BUG_ON(!list_empty(&msg->list_head));
-        list_add_tail(&msg->list_head, &con->out_queue);
-        dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
-             ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
-             ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
-             le32_to_cpu(msg->hdr.front_len),
-             le32_to_cpu(msg->hdr.middle_len),
-             le32_to_cpu(msg->hdr.data_len));
-        mutex_unlock(&con->mutex);
-        /* if there wasn't anything waiting to send before, queue
-         * new work */
-        if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
-                queue_con(con);
-}
-/*
- * Revoke a message that was previously queued for send
- *
- * If @msg is still linked on one of the connection's lists it is unlinked
- * and the reference held for that is dropped.  If it is the message
- * currently being sent, con->out_msg is cleared (and, when the pending
- * kvec refers to the message, the remaining kvec bytes are marked to be
- * skipped) and that reference is dropped as well.
- *
- * hdr.seq is reset *before* each ceph_msg_put(): the put may release the
- * last reference, after which @msg must no longer be written to.
- */
-void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
-{
-        mutex_lock(&con->mutex);
-        if (!list_empty(&msg->list_head)) {
-                dout("con_revoke %p msg %p - was on queue\n", con, msg);
-                list_del_init(&msg->list_head);
-                msg->hdr.seq = 0;
-                ceph_msg_put(msg);
-        }
-        if (con->out_msg == msg) {
-                dout("con_revoke %p msg %p - was sending\n", con, msg);
-                con->out_msg = NULL;
-                if (con->out_kvec_is_msg) {
-                        con->out_skip = con->out_kvec_bytes;
-                        con->out_kvec_is_msg = false;
-                }
-                msg->hdr.seq = 0;
-                ceph_msg_put(msg);
-        }
-        mutex_unlock(&con->mutex);
-}
-/*
- * Revoke a message that we may be reading data into
- *
- * Only acts when @msg is the connection's current incoming message: the
- * read position is rewound by the full size of the message (header,
- * front, middle, data and footer, as announced in con->in_hdr), the
- * reference on the message is dropped, and the connection goes back to
- * expecting a tag.  Otherwise this is a no-op.
- */
-void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
-{
-        mutex_lock(&con->mutex);
-        if (con->in_msg && con->in_msg == msg) {
-                unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
-                unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
-                unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
-                /* skip rest of message */
-                dout("con_revoke_pages %p msg %p revoked\n", con, msg);
-                        /* NOTE(review): presumably a negative in_base_pos makes
-                         * the reader discard that many bytes -- confirm
-                         * against try_read() */
-                        con->in_base_pos = con->in_base_pos -
-                                sizeof(struct ceph_msg_header) -
-                                front_len -
-                                middle_len -
-                                data_len -
-                                sizeof(struct ceph_msg_footer);
-                ceph_msg_put(con->in_msg);
-                con->in_msg = NULL;
-                con->in_tag = CEPH_MSGR_TAG_READY;
-                /* the revoked message still counts as received */
-                con->in_seq++;
-        } else {
-                dout("con_revoke_pages %p msg %p pages %p no-op\n",
-                     con, con->in_msg, msg);
-        }
-        mutex_unlock(&con->mutex);
-}
-/*
- * Request that a keepalive byte be sent, so that a dead tcp connection
- * gets noticed.  Work is only queued when neither a keepalive nor any
- * other write was already pending.
- */
-void ceph_con_keepalive(struct ceph_connection *con)
-{
-        /* a keepalive is already on its way */
-        if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) != 0)
-                return;
-        /* a write was already pending before we set the bit */
-        if (test_and_set_bit(WRITE_PENDING, &con->state) != 0)
-                return;
-        queue_con(con);
-}
-/*
- * construct a new message with given type, size
- * the new msg has a ref count of 1.
- *
- * @type:      message type, stored little-endian in the header
- * @front_len: size of the front buffer; fronts larger than
- *             PAGE_CACHE_SIZE are vmalloc'd, smaller ones kmalloc'd
- * @flags:     allocation flags for both the message and its front
- *
- * Returns the message, or NULL if an allocation fails.
- */
-struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
-{
-        struct ceph_msg *m;
-        m = kmalloc(sizeof(*m), flags);
-        if (m == NULL)
-                goto out;
-        kref_init(&m->kref);
-        INIT_LIST_HEAD(&m->list_head);
-        m->hdr.tid = 0;
-        m->hdr.type = cpu_to_le16(type);
-        m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
-        m->hdr.version = 0;
-        m->hdr.front_len = cpu_to_le32(front_len);
-        m->hdr.middle_len = 0;
-        m->hdr.data_len = 0;
-        m->hdr.data_off = 0;
-        m->hdr.reserved = 0;
-        m->footer.front_crc = 0;
-        m->footer.middle_crc = 0;
-        m->footer.data_crc = 0;
-        m->footer.flags = 0;
-        m->front_max = front_len;
-        m->front_is_vmalloc = false;
-        m->more_to_follow = false;
-        m->pool = NULL;
-        /*
-         * middle and data: these must be valid before anything below can
-         * fail, because the out2 path ends up in ceph_msg_last_put(),
-         * which inspects ->middle and ->pagelist.  The message comes from
-         * kmalloc(), so they hold garbage until assigned.
-         */
-        m->middle = NULL;
-        m->nr_pages = 0;
-        m->pages = NULL;
-        m->pagelist = NULL;
-        /* front */
-        if (front_len) {
-                if (front_len > PAGE_CACHE_SIZE) {
-                        m->front.iov_base = __vmalloc(front_len, flags,
-                                                      PAGE_KERNEL);
-                        m->front_is_vmalloc = true;
-                } else {
-                        m->front.iov_base = kmalloc(front_len, flags);
-                }
-                if (m->front.iov_base == NULL) {
-                        pr_err("msg_new can't allocate %d bytes\n",
-                             front_len);
-                        goto out2;
-                }
-        } else {
-                m->front.iov_base = NULL;
-        }
-        m->front.iov_len = front_len;
-        dout("ceph_msg_new %p front %d\n", m, front_len);
-        return m;
-out2:
-        /* drops the only reference, which frees the half-built message */
-        ceph_msg_put(m);
-out:
-        pr_err("msg_new can't create type %d front %d\n", type, front_len);
-        return NULL;
-}
-/*
- * Attach a "middle" buffer to a message whose header announces one but
- * which did not get one from alloc_msg.  Doing this separately lets us
- * read the small fixed-size per-type header in the front first and then
- * fail gracefully (the caller can report the error based on the front)
- * when the middle is too large.
- *
- * Returns 0 on success or -ENOMEM.
- */
-static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
-{
-        int middle_len = le32_to_cpu(msg->hdr.middle_len);
-        int type = le16_to_cpu(msg->hdr.type);
-        dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
-             ceph_msg_type_name(type), middle_len);
-        /* callers only get here with a non-zero length and no middle yet */
-        BUG_ON(!middle_len);
-        BUG_ON(msg->middle);
-        msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
-        return msg->middle ? 0 : -ENOMEM;
-}
-/*
- * Generic message allocator, for incoming messages.
- *
- * Uses the connection's alloc_msg callback when there is one and falls
- * back to ceph_msg_new() otherwise.  The received header is copied into
- * the message and a middle buffer is allocated if the header announces
- * one and the message does not have it yet.
- *
- * Called with con->mutex held; the mutex is dropped around the callback.
- * Returns NULL on allocation failure or when the callback asks for the
- * message to be skipped (*skip set).
- */
-static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
-                                struct ceph_msg_header *hdr,
-                                int *skip)
-{
-        int type = le16_to_cpu(hdr->type);
-        int front_len = le32_to_cpu(hdr->front_len);
-        int middle_len = le32_to_cpu(hdr->middle_len);
-        struct ceph_msg *msg = NULL;
-        int ret;
-        if (con->ops->alloc_msg) {
-                /* the callback runs without con->mutex held */
-                mutex_unlock(&con->mutex);
-                msg = con->ops->alloc_msg(con, hdr, skip);
-                mutex_lock(&con->mutex);
-                /* NOTE(review): a non-NULL msg combined with *skip is
-                 * returned as NULL without a put -- confirm callbacks never
-                 * do that */
-                if (!msg || *skip)
-                        return NULL;
-        }
-        /* only reached with msg == NULL when there is no callback */
-        if (!msg) {
-                *skip = 0;
-                msg = ceph_msg_new(type, front_len, GFP_NOFS);
-                if (!msg) {
-                        pr_err("unable to allocate msg type %d len %d\n",
-                               type, front_len);
-                        return NULL;
-                }
-        }
-        /* NOTE(review): copies con->in_hdr rather than *hdr; presumably the
-         * two are the same object at the call site -- confirm */
-        memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
-        if (middle_len && !msg->middle) {
-                ret = ceph_alloc_middle(con, msg);
-                if (ret < 0) {
-                        ceph_msg_put(msg);
-                        return NULL;
-                }
-        }
-        return msg;
-}
-/*
- * Release a message that was allocated generically: free the front with
- * the allocator that produced it, then free the message itself.
- */
-void ceph_msg_kfree(struct ceph_msg *m)
-{
-        void *front = m->front.iov_base;
-        dout("msg_kfree %p\n", m);
-        if (!m->front_is_vmalloc)
-                kfree(front);
-        else
-                vfree(front);
-        kfree(m);
-}
-/*
- * kref release callback for a message: runs when the last reference is
- * dropped.  Releases the middle buffer and pagelist, forgets the page
- * array, and then either hands the message back to its pool or frees it.
- */
-void ceph_msg_last_put(struct kref *kref)
-{
-        struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
-        dout("ceph_msg_put last one on %p\n", m);
-        /* a message still linked on a list should not lose its last ref */
-        WARN_ON(!list_empty(&m->list_head));
-        /* drop middle, data, if any */
-        if (m->middle) {
-                ceph_buffer_put(m->middle);
-                m->middle = NULL;
-        }
-        /* the page array is only forgotten here, never freed */
-        m->nr_pages = 0;
-        m->pages = NULL;
-        if (m->pagelist) {
-                ceph_pagelist_release(m->pagelist);
-                kfree(m->pagelist);
-                m->pagelist = NULL;
-        }
-        if (!m->pool) {
-                ceph_msg_kfree(m);
-                return;
-        }
-        ceph_msgpool_put(m->pool, m);
-}
-/*
- * Debugging aid: hex-dump a message's header, front, middle (if it has
- * one) and footer at KERN_DEBUG level.  The data payload is not dumped.
- */
-void ceph_msg_dump(struct ceph_msg *msg)
-{
-        pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
-                 msg->front_max, msg->nr_pages);
-        print_hex_dump(KERN_DEBUG, "header: ",
-                       DUMP_PREFIX_OFFSET, 16, 1,
-                       &msg->hdr, sizeof(msg->hdr), true);
-        print_hex_dump(KERN_DEBUG, " front: ",
-                       DUMP_PREFIX_OFFSET, 16, 1,
-                       msg->front.iov_base, msg->front.iov_len, true);
-        if (msg->middle)
-                print_hex_dump(KERN_DEBUG, "middle: ",
-                               DUMP_PREFIX_OFFSET, 16, 1,
-                               msg->middle->vec.iov_base,
-                               msg->middle->vec.iov_len, true);
-        print_hex_dump(KERN_DEBUG, "footer: ",
-                       DUMP_PREFIX_OFFSET, 16, 1,
-                       &msg->footer, sizeof(msg->footer), true);
-}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
deleted file mode 100644
index 76fbc957bc13..000000000000
--- a/fs/ceph/messenger.h
+++ /dev/null
@@ -1,253 +0,0 @@
-#ifndef __FS_CEPH_MESSENGER_H
-#define __FS_CEPH_MESSENGER_H
-#include <linux/kref.h>
-#include <linux/mutex.h>
-#include <linux/net.h>
-#include <linux/radix-tree.h>
-#include <linux/uio.h>
-#include <linux/version.h>
-#include <linux/workqueue.h>
-#include "types.h"
-#include "buffer.h"
-struct ceph_msg;
-struct ceph_connection;
-extern struct workqueue_struct *ceph_msgr_wq;       /* receive work queue */
-/*
- * Ceph defines these callbacks for handling connection events.
- *
- * The messenger checks invalidate_authorizer, fault and alloc_msg for
- * NULL before calling them; get and put are called unconditionally.
- */
-struct ceph_connection_operations {
-        /* take/drop a reference on the connection.  A NULL return from
-         * get() is treated by the messenger as "ref count 0". */
-        struct ceph_connection *(*get)(struct ceph_connection *);
-        void (*put)(struct ceph_connection *);
-        /* handle an incoming message. */
-        void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
-        /* authorize an outgoing connection */
-        int (*get_authorizer) (struct ceph_connection *con,
-                               void **buf, int *len, int *proto,
-                               void **reply_buf, int *reply_len, int force_new);
-        int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
-        /* called from the fault path when con->auth_retry is set */
-        int (*invalidate_authorizer)(struct ceph_connection *con);
-        /* protocol version mismatch */
-        void (*bad_proto) (struct ceph_connection *con);
-        /* there was some error on the socket (disconnect, whatever) */
-        void (*fault) (struct ceph_connection *con);
-        /* a remote host has terminated a message exchange session, and messages
-         * we sent (or they tried to send us) may be lost. */
-        void (*peer_reset) (struct ceph_connection *con);
-        /* allocate the message for an incoming header; may set *skip to have
-         * the message skipped instead.  Called without con->mutex held. */
-        struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
-                                        struct ceph_msg_header *hdr,
-                                        int *skip);
-};
-/* use format string %s%lld */
-#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
-/*
- * Messenger instance, created by ceph_messenger_create() and released by
- * ceph_messenger_destroy().
- */
-struct ceph_messenger {
-        struct ceph_entity_inst inst;    /* my name+address */
-        /* presumably the encoded form of inst.addr, filled in by
-         * encode_my_addr() -- confirm */
-        struct ceph_entity_addr my_enc_addr;
-        struct page *zero_page;          /* used in certain error cases */
-        bool nocrc;
-        /*
-         * the global_seq counts connections I (attempt to) initiate
-         * in order to disambiguate certain connect race conditions.
-         */
-        u32 global_seq;
-        spinlock_t global_seq_lock;
-};
-/*
- * a single message.  it contains a header (src, dest, message type, etc.),
- * footer (crc values, mainly), a "front" message body, an optional
- * "middle" buffer, and possibly a data payload (stored in some number of
- * pages, or in a pagelist).
- */
-struct ceph_msg {
-        struct ceph_msg_header hdr;     /* header */
-        struct ceph_msg_footer footer;  /* footer */
-        struct kvec front;              /* unaligned blobs of message */
-        struct ceph_buffer *middle;     /* released on the last put */
-        struct page **pages;            /* data payload.  NOT OWNER. */
-        unsigned nr_pages;              /* size of page array */
-        struct ceph_pagelist *pagelist; /* instead of pages */
-        struct list_head list_head;     /* linkage on a connection's queues */
-        struct kref kref;               /* last put runs ceph_msg_last_put() */
-        bool front_is_vmalloc;          /* front is freed with vfree() */
-        bool more_to_follow;
-        bool needs_out_seq;             /* set by ceph_con_send() */
-        int front_max;                  /* front size given to ceph_msg_new() */
-        struct ceph_msgpool *pool;      /* if set, last put returns msg here */
-};
-/*
- * Position within a message's data payload; a connection keeps one for
- * the outgoing and one for the incoming message.
- */
-struct ceph_msg_pos {
-        int page, page_pos;  /* which page; offset in page */
-        int data_pos;        /* offset in data payload */
-        int did_page_crc;    /* true if we've calculated crc for current page */
-};
-/* ceph connection fault delay defaults, for exponential backoff */
-#define BASE_DELAY_INTERVAL     (HZ/2)
-#define MAX_DELAY_INTERVAL      (5 * 60 * HZ)
-/*
- * ceph_connection state bit flags
- *
- * QUEUED and BUSY are used together to ensure that only a single
- * thread is currently opening, reading or writing data to the socket.
- */
-#define LOSSYTX         0  /* we can close channel or drop messages on errors */
-#define CONNECTING      1
-#define NEGOTIATING     2
-#define KEEPALIVE_PENDING      3
-#define WRITE_PENDING   4  /* we have data ready to send */
-#define QUEUED          5  /* there is work queued on this connection */
-#define BUSY            6  /* work is being done */
-#define STANDBY         8  /* no outgoing messages, socket closed.  we keep
-                            * the ceph_connection around to maintain shared
-                            * state with the peer. */
-#define CLOSED          10 /* we've closed the connection */
-#define SOCK_CLOSED     11 /* socket state changed to closed */
-#define OPENING         13 /* open connection w/ (possibly new) peer */
-#define DEAD            14 /* dead, about to kfree */
-/*
- * A single connection with another host.
- *
- * We maintain a queue of outgoing messages, and some session state to
- * ensure that we can preserve the lossless, ordered delivery of
- * messages in the case of a TCP disconnect.
- */
-struct ceph_connection {
-        void *private;
-        /* presumably the count behind ceph_con_get()/ceph_con_put() -- confirm */
-        atomic_t nref;
-        const struct ceph_connection_operations *ops;
-        struct ceph_messenger *msgr;
-        struct socket *sock;
-        unsigned long state;    /* connection state (see flags above) */
-        const char *error_msg;  /* error message, if any */
-        struct ceph_entity_addr peer_addr; /* peer address */
-        struct ceph_entity_name peer_name; /* peer name */
-        struct ceph_entity_addr peer_addr_for_me;
-        unsigned peer_features;
-        u32 connect_seq;      /* identify the most recent connection
-                                 attempt for this connection, client */
-        u32 peer_global_seq;  /* peer's global seq for this connection */
-        int auth_retry;       /* true if we need a newer authorizer */
-        void *auth_reply_buf;   /* where to put the authorizer reply */
-        int auth_reply_buf_len;
-        /* taken around changes to the in/out message state below, e.g. in
-         * ceph_con_send(), ceph_con_revoke() and the work function */
-        struct mutex mutex;
-        /* out queue */
-        struct list_head out_queue;
-        struct list_head out_sent;   /* sending or sent but unacked */
-        u64 out_seq;                 /* last message queued for send */
-        bool out_keepalive_pending;
-        u64 in_seq, in_seq_acked;  /* last message received, acked */
-        /* connection negotiation temps */
-        char in_banner[CEPH_BANNER_MAX_LEN];
-        union {
-                struct {  /* outgoing connection */
-                        struct ceph_msg_connect out_connect;
-                        struct ceph_msg_connect_reply in_reply;
-                };
-                struct {  /* incoming */
-                        struct ceph_msg_connect in_connect;
-                        struct ceph_msg_connect_reply out_reply;
-                };
-        };
-        struct ceph_entity_addr actual_peer_addr;
-        /* message out temps */
-        struct ceph_msg *out_msg;        /* sending message (== tail of
-                                            out_sent) */
-        bool out_msg_done;
-        struct ceph_msg_pos out_msg_pos;
-        struct kvec out_kvec[8],         /* sending header/footer data */
-                *out_kvec_cur;
-        int out_kvec_left;   /* kvec's left in out_kvec */
-        int out_skip;        /* skip this many bytes */
-        int out_kvec_bytes;  /* total bytes left */
-        bool out_kvec_is_msg; /* kvec refers to out_msg */
-        int out_more;        /* there is more data after the kvecs */
-        __le64 out_temp_ack; /* for writing an ack */
-        /* message in temps */
-        struct ceph_msg_header in_hdr;
-        struct ceph_msg *in_msg;
-        struct ceph_msg_pos in_msg_pos;
-        u32 in_front_crc, in_middle_crc, in_data_crc;  /* calculated crc */
-        char in_tag;         /* protocol control byte */
-        int in_base_pos;     /* bytes read */
-        __le64 in_temp_ack;  /* for reading an ack */
-        struct delayed_work work;           /* send|recv work */
-        /* current delay interval; starts at BASE_DELAY_INTERVAL and is
-         * doubled by the fault handler on repeated faults */
-        unsigned long       delay;          /* current delay interval */
-};
-extern const char *pr_addr(const struct sockaddr_storage *ss);
-extern int ceph_parse_ips(const char *c, const char *end,
-                          struct ceph_entity_addr *addr,
-                          int max_count, int *count);
-extern int ceph_msgr_init(void);
-extern void ceph_msgr_exit(void);
-extern void ceph_msgr_flush(void);
-extern struct ceph_messenger *ceph_messenger_create(
-        struct ceph_entity_addr *myaddr);
-extern void ceph_messenger_destroy(struct ceph_messenger *);
-extern void ceph_con_init(struct ceph_messenger *msgr,
-                          struct ceph_connection *con);
-extern void ceph_con_open(struct ceph_connection *con,
-                          struct ceph_entity_addr *addr);
-extern bool ceph_con_opened(struct ceph_connection *con);
-extern void ceph_con_close(struct ceph_connection *con);
-extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
-extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
-extern void ceph_con_revoke_message(struct ceph_connection *con,
-                                  struct ceph_msg *msg);
-extern void ceph_con_keepalive(struct ceph_connection *con);
-extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
-extern void ceph_con_put(struct ceph_connection *con);
-extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
-extern void ceph_msg_kfree(struct ceph_msg *m);
-/*
- * Take an additional reference on a message.  Returns @msg so the call
- * can be used inline as an argument.
- */
-static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
-{
-        kref_get(&msg->kref);
-        return msg;
-}
-extern void ceph_msg_last_put(struct kref *kref);
-/*
- * Drop a reference on a message; the last put runs ceph_msg_last_put().
- * The caller must not touch @msg afterwards unless it holds another ref.
- */
-static inline void ceph_msg_put(struct ceph_msg *msg)
-{
-        kref_put(&msg->kref, ceph_msg_last_put);
-}
-extern void ceph_msg_dump(struct ceph_msg *msg);
-#endif
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
deleted file mode 100644
index b2a5a3e4a671..000000000000
--- a/fs/ceph/mon_client.c
+++ /dev/null
@@ -1,1018 +0,0 @@
-#include "ceph_debug.h"
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/sched.h>
-#include "mon_client.h"
-#include "super.h"
-#include "auth.h"
-#include "decode.h"
-/*
- * Interact with Ceph monitor cluster.  Handle requests for new map
- * versions, and periodically resend as needed.  Also implement
- * statfs() and umount().
- *
- * A small cluster of Ceph "monitors" are responsible for managing critical
- * cluster configuration and state information.  An odd number (e.g., 3, 5)
- * of cmon daemons use a modified version of the Paxos part-time parliament
- * algorithm to manage the MDS map (mds cluster membership), OSD map, and
- * list of clients who have mounted the file system.
- *
- * We maintain an open, active session with a monitor at all times in order to
- * receive timely MDSMap updates.  We periodically send a keepalive byte on the
- * TCP socket to ensure we detect a failure.  If the connection does break, we
- * randomly hunt for a new monitor.  Once the connection is reestablished, we
- * resend any outstanding requests.
- */
-static const struct ceph_connection_operations mon_con_ops;
-static int __validate_auth(struct ceph_mon_client *monc);
-/*
- * Decode a monmap blob (e.g., during mount).
- *
- * @p:   start of the encoded blob
- * @end: one past its last byte
- *
- * Returns a newly kmalloc'd monmap, ERR_PTR(-ENOMEM) if that allocation
- * fails, or ERR_PTR(-EINVAL) via the "bad" label, which is the label
- * handed to the bounds-checking decode macros.
- */
-struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
-{
-        struct ceph_monmap *m = NULL;
-        int i, err = -EINVAL;
-        struct ceph_fsid fsid;
-        u32 epoch, num_mon;
-        u16 version;
-        u32 len;
-        /* length prefix, then make sure that many bytes are present */
-        ceph_decode_32_safe(&p, end, len, bad);
-        ceph_decode_need(&p, end, len, bad);
-        dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
-        /* version is decoded to advance p but not otherwise used */
-        ceph_decode_16_safe(&p, end, version, bad);
-        ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
-        ceph_decode_copy(&p, &fsid, sizeof(fsid));
-        epoch = ceph_decode_32(&p);
-        num_mon = ceph_decode_32(&p);
-        ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
-        /* reject oversized maps before sizing the allocation from num_mon */
-        if (num_mon >= CEPH_MAX_MON)
-                goto bad;
-        m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
-        if (m == NULL)
-                return ERR_PTR(-ENOMEM);
-        m->fsid = fsid;
-        m->epoch = epoch;
-        m->num_mon = num_mon;
-        /* copy the monitor array in one go, then fix up each address */
-        ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
-        for (i = 0; i < num_mon; i++)
-                ceph_decode_addr(&m->mon_inst[i].addr);
-        dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
-             m->num_mon);
-        for (i = 0; i < m->num_mon; i++)
-                dout("monmap_decode  mon%d is %s\n", i,
-                     pr_addr(&m->mon_inst[i].addr.in_addr));
-        return m;
-bad:
-        dout("monmap_decode failed with %d\n", err);
-        /* m is still NULL on every path that reaches here */
-        kfree(m);
-        return ERR_PTR(err);
-}
-/*
- * Report whether *addr matches the address of any monitor in the map.
- * Addresses are compared bytewise.  Returns 1 on a match, 0 otherwise.
- */
-int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
-{
-        int i = 0;
-        while (i < m->num_mon) {
-                if (!memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)))
-                        return 1;
-                i++;
-        }
-        return 0;
-}
-/*
- * Send the auth request that has already been built in monc->m_auth.
- *
- * @len: number of bytes of the message front that are in use.
- */
-static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
-{
-        struct ceph_msg *auth = monc->m_auth;
-        monc->pending_auth = 1;
-        auth->front.iov_len = len;
-        auth->hdr.front_len = cpu_to_le32(len);
-        /* pull back a copy that is still queued or being sent, if any */
-        ceph_con_revoke(monc->con, auth);
-        /* the connection gets its own reference; we keep ours */
-        ceph_con_send(monc->con, ceph_msg_get(auth));
-}
-/*
- * Close the monitor session, if there is a connection: revoke the auth
- * message, close the connection and reset the session and auth state.
- */
-static void __close_session(struct ceph_mon_client *monc)
-{
-        if (!monc->con)
-                return;
-        dout("__close_session closing mon%d\n", monc->cur_mon);
-        ceph_con_revoke(monc->con, monc->m_auth);
-        ceph_con_close(monc->con);
-        /* cur_mon < 0 makes the next __open_session() pick a monitor */
-        monc->cur_mon = -1;
-        monc->pending_auth = 0;
-        ceph_auth_reset(monc->auth);
-}
-/*
- * Open a session with a (new) monitor.
- *
- * If no monitor is currently selected (cur_mon < 0), pick one at random,
- * reset the subscription state, open the connection to it and send the
- * initial auth "hello".  Otherwise nothing is done.  Always returns 0.
- */
-static int __open_session(struct ceph_mon_client *monc)
-{
-        /* unsigned on purpose: plain char may be signed, and the remainder
-         * computed below is used as an index into the monitor array */
-        unsigned char r;
-        int ret;
-        if (monc->cur_mon < 0) {
-                get_random_bytes(&r, 1);
-                monc->cur_mon = r % monc->monmap->num_mon;
-                dout("open_session num=%d r=%d -> mon%d\n",
-                     monc->monmap->num_mon, r, monc->cur_mon);
-                monc->sub_sent = 0;
-                monc->sub_renew_after = jiffies;  /* i.e., expired */
-                /* collapse "requested" (2) back to "wanted" (1) so the
-                 * osdmap subscription is sent again on the new session */
-                monc->want_next_osdmap = !!monc->want_next_osdmap;
-                dout("open_session mon%d opening\n", monc->cur_mon);
-                monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
-                monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
-                ceph_con_open(monc->con,
-                              &monc->monmap->mon_inst[monc->cur_mon].addr);
-                /* initiate authentication handshake */
-                ret = ceph_auth_build_hello(monc->auth,
-                                            monc->m_auth->front.iov_base,
-                                            monc->m_auth->front_max);
-                /* NOTE(review): ret is used as the message length without a
-                 * check for a negative value -- confirm that
-                 * ceph_auth_build_hello() cannot fail here */
-                __send_prepared_auth_request(monc, ret);
-        } else {
-                dout("open_session mon%d already open\n", monc->cur_mon);
-        }
-        return 0;
-}
-/*
- * True once jiffies has reached monc->sub_renew_after, i.e. the current
- * subscription is due for renewal.
- */
-static bool __sub_expired(struct ceph_mon_client *monc)
-{
-        return time_after_eq(jiffies, monc->sub_renew_after);
-}
-/*
- * (Re)arm the monitor client's delayed work: after 10 seconds while there
- * is no current monitor or the subscription has expired, after 20 seconds
- * otherwise.
- */
-static void __schedule_delayed(struct ceph_mon_client *monc)
-{
-        unsigned delay = (monc->cur_mon < 0 || __sub_expired(monc)) ?
-                10 * HZ : 20 * HZ;
-        dout("__schedule_delayed after %u\n", delay);
-        schedule_delayed_work(&monc->delayed_work, delay);
-}
-/*
- * Send subscribe request for mdsmap and/or osdmap.
- *
- * A request goes out when the subscription has expired and none is
- * outstanding, or when an osdmap is wanted but not yet requested
- * (want_next_osdmap == 1).  The request always covers "mdsmap" and
- * "monmap", plus a one-time "osdmap" entry when one is wanted.
- */
-static void __send_subscribe(struct ceph_mon_client *monc)
-{
-        dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
-             (unsigned)monc->sub_sent, __sub_expired(monc),
-             monc->want_next_osdmap);
-        if ((__sub_expired(monc) && !monc->sub_sent) ||
-            monc->want_next_osdmap == 1) {
-                struct ceph_msg *msg = monc->m_subscribe;
-                struct ceph_mon_subscribe_item *i;
-                void *p, *end;
-                /* the preallocated subscribe message is rebuilt in place */
-                p = msg->front.iov_base;
-                end = p + msg->front_max;
-                dout("__send_subscribe to 'mdsmap' %u+\n",
-                     (unsigned)monc->have_mdsmap);
-                /* the leading 32-bit value is the number of entries that follow */
-                if (monc->want_next_osdmap) {
-                        dout("__send_subscribe to 'osdmap' %u\n",
-                             (unsigned)monc->have_osdmap);
-                        ceph_encode_32(&p, 3);
-                        ceph_encode_string(&p, end, "osdmap", 6);
-                        i = p;
-                        i->have = cpu_to_le64(monc->have_osdmap);
-                        i->onetime = 1;
-                        p += sizeof(*i);
-                        monc->want_next_osdmap = 2;  /* requested */
-                } else {
-                        ceph_encode_32(&p, 2);
-                }
-                ceph_encode_string(&p, end, "mdsmap", 6);
-                i = p;
-                i->have = cpu_to_le64(monc->have_mdsmap);
-                i->onetime = 0;
-                p += sizeof(*i);
-                ceph_encode_string(&p, end, "monmap", 6);
-                i = p;
-                i->have = 0;
-                i->onetime = 0;
-                p += sizeof(*i);
-                /* the front length is however much was encoded above */
-                msg->front.iov_len = p - msg->front.iov_base;
-                msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
-                /* pull back an earlier copy still queued, then send with a
-                 * fresh reference so that we keep our own */
-                ceph_con_revoke(monc->con, msg);
-                ceph_con_send(monc->con, ceph_msg_get(msg));
-                monc->sub_sent = jiffies | 1;  /* never 0 */
-        }
-}
-static void handle_subscribe_ack(struct ceph_mon_client *monc,
-                                 struct ceph_msg *msg)
-{
-        unsigned seconds;
-        struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
-        if (msg->front.iov_len < sizeof(*h))
-                goto bad;
-        seconds = le32_to_cpu(h->duration);
-        mutex_lock(&monc->mutex);
-        if (monc->hunting) {
-                pr_info("mon%d %s session established\n",
-                        monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
-                monc->hunting = false;
-        }
-        dout("handle_subscribe_ack after %d seconds\n", seconds);
-        monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
-        monc->sub_sent = 0;
-        mutex_unlock(&monc->mutex);
-        return;
-bad:
-        pr_err("got corrupt subscribe-ack msg\n");
-        ceph_msg_dump(msg);
-}
-/*
- * Keep track of which maps we have
- */
-int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
-{
-        mutex_lock(&monc->mutex);
-        monc->have_mdsmap = got;
-        mutex_unlock(&monc->mutex);
-        return 0;
-}
-int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
-{
-        mutex_lock(&monc->mutex);
-        monc->have_osdmap = got;
-        monc->want_next_osdmap = 0;
-        mutex_unlock(&monc->mutex);
-        return 0;
-}
-/*
- * Register interest in the next osdmap
- */
-void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
-{
-        dout("request_next_osdmap have %u\n", monc->have_osdmap);
-        mutex_lock(&monc->mutex);
-        if (!monc->want_next_osdmap)
-                monc->want_next_osdmap = 1;
-        if (monc->want_next_osdmap < 2)
-                __send_subscribe(monc);
-        mutex_unlock(&monc->mutex);
-}
-/*
- *
- */
-int ceph_monc_open_session(struct ceph_mon_client *monc)
-{
-        if (!monc->con) {
-                monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
-                if (!monc->con)
-                        return -ENOMEM;
-                ceph_con_init(monc->client->msgr, monc->con);
-                monc->con->private = monc;
-                monc->con->ops = &mon_con_ops;
-        }
-        mutex_lock(&monc->mutex);
-        __open_session(monc);
-        __schedule_delayed(monc);
-        mutex_unlock(&monc->mutex);
-        return 0;
-}
-/*
- * The monitor responds with mount ack indicate mount success.  The
- * included client ticket allows the client to talk to MDSs and OSDs.
- */
-static void ceph_monc_handle_map(struct ceph_mon_client *monc,
-                                 struct ceph_msg *msg)
-{
-        struct ceph_client *client = monc->client;
-        struct ceph_monmap *monmap = NULL, *old = monc->monmap;
-        void *p, *end;
-        mutex_lock(&monc->mutex);
-        dout("handle_monmap\n");
-        p = msg->front.iov_base;
-        end = p + msg->front.iov_len;
-        monmap = ceph_monmap_decode(p, end);
-        if (IS_ERR(monmap)) {
-                pr_err("problem decoding monmap, %d\n",
-                       (int)PTR_ERR(monmap));
-                goto out;
-        }
-        if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
-                kfree(monmap);
-                goto out;
-        }
-        client->monc.monmap = monmap;
-        kfree(old);
-out:
-        mutex_unlock(&monc->mutex);
-        wake_up_all(&client->auth_wq);
-}
-/*
- * generic requests (e.g., statfs, poolop)
- */
-static struct ceph_mon_generic_request *__lookup_generic_req(
-        struct ceph_mon_client *monc, u64 tid)
-{
-        struct ceph_mon_generic_request *req;
-        struct rb_node *n = monc->generic_request_tree.rb_node;
-        while (n) {
-                req = rb_entry(n, struct ceph_mon_generic_request, node);
-                if (tid < req->tid)
-                        n = n->rb_left;
-                else if (tid > req->tid)
-                        n = n->rb_right;
-                else
-                        return req;
-        }
-        return NULL;
-}
-static void __insert_generic_request(struct ceph_mon_client *monc,
-                            struct ceph_mon_generic_request *new)
-{
-        struct rb_node **p = &monc->generic_request_tree.rb_node;
-        struct rb_node *parent = NULL;
-        struct ceph_mon_generic_request *req = NULL;
-        while (*p) {
-                parent = *p;
-                req = rb_entry(parent, struct ceph_mon_generic_request, node);
-                if (new->tid < req->tid)
-                        p = &(*p)->rb_left;
-                else if (new->tid > req->tid)
-                        p = &(*p)->rb_right;
-                else
-                        BUG();
-        }
-        rb_link_node(&new->node, parent, p);
-        rb_insert_color(&new->node, &monc->generic_request_tree);
-}
-static void release_generic_request(struct kref *kref)
-{
-        struct ceph_mon_generic_request *req =
-                container_of(kref, struct ceph_mon_generic_request, kref);
-        if (req->reply)
-                ceph_msg_put(req->reply);
-        if (req->request)
-                ceph_msg_put(req->request);
-        kfree(req);
-}
-static void put_generic_request(struct ceph_mon_generic_request *req)
-{
-        kref_put(&req->kref, release_generic_request);
-}
-static void get_generic_request(struct ceph_mon_generic_request *req)
-{
-        kref_get(&req->kref);
-}
-static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
-                                         struct ceph_msg_header *hdr,
-                                         int *skip)
-{
-        struct ceph_mon_client *monc = con->private;
-        struct ceph_mon_generic_request *req;
-        u64 tid = le64_to_cpu(hdr->tid);
-        struct ceph_msg *m;
-        mutex_lock(&monc->mutex);
-        req = __lookup_generic_req(monc, tid);
-        if (!req) {
-                dout("get_generic_reply %lld dne\n", tid);
-                *skip = 1;
-                m = NULL;
-        } else {
-                dout("get_generic_reply %lld got %p\n", tid, req->reply);
-                m = ceph_msg_get(req->reply);
-                /*
-                 * we don't need to track the connection reading into
-                 * this reply because we only have one open connection
-                 * at a time, ever.
-                 */
-        }
-        mutex_unlock(&monc->mutex);
-        return m;
-}
-static int do_generic_request(struct ceph_mon_client *monc,
-                              struct ceph_mon_generic_request *req)
-{
-        int err;
-        /* register request */
-        mutex_lock(&monc->mutex);
-        req->tid = ++monc->last_tid;
-        req->request->hdr.tid = cpu_to_le64(req->tid);
-        __insert_generic_request(monc, req);
-        monc->num_generic_requests++;
-        ceph_con_send(monc->con, ceph_msg_get(req->request));
-        mutex_unlock(&monc->mutex);
-        err = wait_for_completion_interruptible(&req->completion);
-        mutex_lock(&monc->mutex);
-        rb_erase(&req->node, &monc->generic_request_tree);
-        monc->num_generic_requests--;
-        mutex_unlock(&monc->mutex);
-        if (!err)
-                err = req->result;
-        return err;
-}
-/*
- * statfs
- */
-static void handle_statfs_reply(struct ceph_mon_client *monc,
-                                struct ceph_msg *msg)
-{
-        struct ceph_mon_generic_request *req;
-        struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
-        u64 tid = le64_to_cpu(msg->hdr.tid);
-        if (msg->front.iov_len != sizeof(*reply))
-                goto bad;
-        dout("handle_statfs_reply %p tid %llu\n", msg, tid);
-        mutex_lock(&monc->mutex);
-        req = __lookup_generic_req(monc, tid);
-        if (req) {
-                *(struct ceph_statfs *)req->buf = reply->st;
-                req->result = 0;
-                get_generic_request(req);
-        }
-        mutex_unlock(&monc->mutex);
-        if (req) {
-                complete_all(&req->completion);
-                put_generic_request(req);
-        }
-        return;
-bad:
-        pr_err("corrupt generic reply, tid %llu\n", tid);
-        ceph_msg_dump(msg);
-}
-/*
- * Do a synchronous statfs().
- */
-int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
-{
-        struct ceph_mon_generic_request *req;
-        struct ceph_mon_statfs *h;
-        int err;
-        req = kzalloc(sizeof(*req), GFP_NOFS);
-        if (!req)
-                return -ENOMEM;
-        kref_init(&req->kref);
-        req->buf = buf;
-        req->buf_len = sizeof(*buf);
-        init_completion(&req->completion);
-        err = -ENOMEM;
-        req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
-        if (!req->request)
-                goto out;
-        req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
-        if (!req->reply)
-                goto out;
-        /* fill out request */
-        h = req->request->front.iov_base;
-        h->monhdr.have_version = 0;
-        h->monhdr.session_mon = cpu_to_le16(-1);
-        h->monhdr.session_mon_tid = 0;
-        h->fsid = monc->monmap->fsid;
-        err = do_generic_request(monc, req);
-out:
-        kref_put(&req->kref, release_generic_request);
-        return err;
-}
-/*
- * pool ops
- */
-static int get_poolop_reply_buf(const char *src, size_t src_len,
-                                char *dst, size_t dst_len)
-{
-        u32 buf_len;
-        if (src_len != sizeof(u32) + dst_len)
-                return -EINVAL;
-        buf_len = le32_to_cpu(*(u32 *)src);
-        if (buf_len != dst_len)
-                return -EINVAL;
-        memcpy(dst, src + sizeof(u32), dst_len);
-        return 0;
-}
-static void handle_poolop_reply(struct ceph_mon_client *monc,
-                                struct ceph_msg *msg)
-{
-        struct ceph_mon_generic_request *req;
-        struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
-        u64 tid = le64_to_cpu(msg->hdr.tid);
-        if (msg->front.iov_len < sizeof(*reply))
-                goto bad;
-        dout("handle_poolop_reply %p tid %llu\n", msg, tid);
-        mutex_lock(&monc->mutex);
-        req = __lookup_generic_req(monc, tid);
-        if (req) {
-                if (req->buf_len &&
-                    get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
-                                     msg->front.iov_len - sizeof(*reply),
-                                     req->buf, req->buf_len) < 0) {
-                        mutex_unlock(&monc->mutex);
-                        goto bad;
-                }
-                req->result = le32_to_cpu(reply->reply_code);
-                get_generic_request(req);
-        }
-        mutex_unlock(&monc->mutex);
-        if (req) {
-                complete(&req->completion);
-                put_generic_request(req);
-        }
-        return;
-bad:
-        pr_err("corrupt generic reply, tid %llu\n", tid);
-        ceph_msg_dump(msg);
-}
-/*
- * Do a synchronous pool op.
- */
-int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
-                        u32 pool, u64 snapid,
-                        char *buf, int len)
-{
-        struct ceph_mon_generic_request *req;
-        struct ceph_mon_poolop *h;
-        int err;
-        req = kzalloc(sizeof(*req), GFP_NOFS);
-        if (!req)
-                return -ENOMEM;
-        kref_init(&req->kref);
-        req->buf = buf;
-        req->buf_len = len;
-        init_completion(&req->completion);
-        err = -ENOMEM;
-        req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS);
-        if (!req->request)
-                goto out;
-        req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS);
-        if (!req->reply)
-                goto out;
-        /* fill out request */
-        req->request->hdr.version = cpu_to_le16(2);
-        h = req->request->front.iov_base;
-        h->monhdr.have_version = 0;
-        h->monhdr.session_mon = cpu_to_le16(-1);
-        h->monhdr.session_mon_tid = 0;
-        h->fsid = monc->monmap->fsid;
-        h->pool = cpu_to_le32(pool);
-        h->op = cpu_to_le32(op);
-        h->auid = 0;
-        h->snapid = cpu_to_le64(snapid);
-        h->name_len = 0;
-        err = do_generic_request(monc, req);
-out:
-        kref_put(&req->kref, release_generic_request);
-        return err;
-}
-int ceph_monc_create_snapid(struct ceph_mon_client *monc,
-                            u32 pool, u64 *snapid)
-{
-        return ceph_monc_do_poolop(monc,  POOL_OP_CREATE_UNMANAGED_SNAP,
-                                   pool, 0, (char *)snapid, sizeof(*snapid));
-}
-int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
-                            u32 pool, u64 snapid)
-{
-        return ceph_monc_do_poolop(monc,  POOL_OP_CREATE_UNMANAGED_SNAP,
-                                   pool, snapid, 0, 0);
-}
-/*
- * Resend pending generic requests.
- */
-static void __resend_generic_request(struct ceph_mon_client *monc)
-{
-        struct ceph_mon_generic_request *req;
-        struct rb_node *p;
-        for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
-                req = rb_entry(p, struct ceph_mon_generic_request, node);
-                ceph_con_revoke(monc->con, req->request);
-                ceph_con_send(monc->con, ceph_msg_get(req->request));
-        }
-}
-/*
- * Delayed work.  If we haven't mounted yet, retry.  Otherwise,
- * renew/retry subscription as needed (in case it is timing out, or we
- * got an ENOMEM).  And keep the monitor connection alive.
- */
-static void delayed_work(struct work_struct *work)
-{
-        struct ceph_mon_client *monc =
-                container_of(work, struct ceph_mon_client, delayed_work.work);
-        dout("monc delayed_work\n");
-        mutex_lock(&monc->mutex);
-        if (monc->hunting) {
-                __close_session(monc);
-                __open_session(monc);  /* continue hunting */
-        } else {
-                ceph_con_keepalive(monc->con);
-                __validate_auth(monc);
-                if (monc->auth->ops->is_authenticated(monc->auth))
-                        __send_subscribe(monc);
-        }
-        __schedule_delayed(monc);
-        mutex_unlock(&monc->mutex);
-}
-/*
- * On startup, we build a temporary monmap populated with the IPs
- * provided by mount(2).
- */
-static int build_initial_monmap(struct ceph_mon_client *monc)
-{
-        struct ceph_mount_args *args = monc->client->mount_args;
-        struct ceph_entity_addr *mon_addr = args->mon_addr;
-        int num_mon = args->num_mon;
-        int i;
-        /* build initial monmap */
-        monc->monmap = kzalloc(sizeof(*monc->monmap) +
-                               num_mon*sizeof(monc->monmap->mon_inst[0]),
-                               GFP_KERNEL);
-        if (!monc->monmap)
-                return -ENOMEM;
-        for (i = 0; i < num_mon; i++) {
-                monc->monmap->mon_inst[i].addr = mon_addr[i];
-                monc->monmap->mon_inst[i].addr.nonce = 0;
-                monc->monmap->mon_inst[i].name.type =
-                        CEPH_ENTITY_TYPE_MON;
-                monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
-        }
-        monc->monmap->num_mon = num_mon;
-        monc->have_fsid = false;
-        /* release addr memory */
-        kfree(args->mon_addr);
-        args->mon_addr = NULL;
-        args->num_mon = 0;
-        return 0;
-}
-int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
-{
-        int err = 0;
-        dout("init\n");
-        memset(monc, 0, sizeof(*monc));
-        monc->client = cl;
-        monc->monmap = NULL;
-        mutex_init(&monc->mutex);
-        err = build_initial_monmap(monc);
-        if (err)
-                goto out;
-        monc->con = NULL;
-        /* authentication */
-        monc->auth = ceph_auth_init(cl->mount_args->name,
-                                    cl->mount_args->secret);
-        if (IS_ERR(monc->auth))
-                return PTR_ERR(monc->auth);
-        monc->auth->want_keys =
-                CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
-                CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
-        /* msgs */
-        err = -ENOMEM;
-        monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
-                                     sizeof(struct ceph_mon_subscribe_ack),
-                                     GFP_NOFS);
-        if (!monc->m_subscribe_ack)
-                goto out_monmap;
-        monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
-        if (!monc->m_subscribe)
-                goto out_subscribe_ack;
-        monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
-        if (!monc->m_auth_reply)
-                goto out_subscribe;
-        monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
-        monc->pending_auth = 0;
-        if (!monc->m_auth)
-                goto out_auth_reply;
-        monc->cur_mon = -1;
-        monc->hunting = true;
-        monc->sub_renew_after = jiffies;
-        monc->sub_sent = 0;
-        INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
-        monc->generic_request_tree = RB_ROOT;
-        monc->num_generic_requests = 0;
-        monc->last_tid = 0;
-        monc->have_mdsmap = 0;
-        monc->have_osdmap = 0;
-        monc->want_next_osdmap = 1;
-        return 0;
-out_auth_reply:
-        ceph_msg_put(monc->m_auth_reply);
-out_subscribe:
-        ceph_msg_put(monc->m_subscribe);
-out_subscribe_ack:
-        ceph_msg_put(monc->m_subscribe_ack);
-out_monmap:
-        kfree(monc->monmap);
-out:
-        return err;
-}
-void ceph_monc_stop(struct ceph_mon_client *monc)
-{
-        dout("stop\n");
-        cancel_delayed_work_sync(&monc->delayed_work);
-        mutex_lock(&monc->mutex);
-        __close_session(monc);
-        if (monc->con) {
-                monc->con->private = NULL;
-                monc->con->ops->put(monc->con);
-                monc->con = NULL;
-        }
-        mutex_unlock(&monc->mutex);
-        ceph_auth_destroy(monc->auth);
-        ceph_msg_put(monc->m_auth);
-        ceph_msg_put(monc->m_auth_reply);
-        ceph_msg_put(monc->m_subscribe);
-        ceph_msg_put(monc->m_subscribe_ack);
-        kfree(monc->monmap);
-}
-static void handle_auth_reply(struct ceph_mon_client *monc,
-                              struct ceph_msg *msg)
-{
-        int ret;
-        int was_auth = 0;
-        mutex_lock(&monc->mutex);
-        if (monc->auth->ops)
-                was_auth = monc->auth->ops->is_authenticated(monc->auth);
-        monc->pending_auth = 0;
-        ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
-                                     msg->front.iov_len,
-                                     monc->m_auth->front.iov_base,
-                                     monc->m_auth->front_max);
-        if (ret < 0) {
-                monc->client->auth_err = ret;
-                wake_up_all(&monc->client->auth_wq);
-        } else if (ret > 0) {
-                __send_prepared_auth_request(monc, ret);
-        } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
-                dout("authenticated, starting session\n");
-                monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
-                monc->client->msgr->inst.name.num =
-                                        cpu_to_le64(monc->auth->global_id);
-                __send_subscribe(monc);
-                __resend_generic_request(monc);
-        }
-        mutex_unlock(&monc->mutex);
-}
-static int __validate_auth(struct ceph_mon_client *monc)
-{
-        int ret;
-        if (monc->pending_auth)
-                return 0;
-        ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
-                              monc->m_auth->front_max);
-        if (ret <= 0)
-                return ret; /* either an error, or no need to authenticate */
-        __send_prepared_auth_request(monc, ret);
-        return 0;
-}
-int ceph_monc_validate_auth(struct ceph_mon_client *monc)
-{
-        int ret;
-        mutex_lock(&monc->mutex);
-        ret = __validate_auth(monc);
-        mutex_unlock(&monc->mutex);
-        return ret;
-}
-/*
- * handle incoming message
- */
-static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
-{
-        struct ceph_mon_client *monc = con->private;
-        int type = le16_to_cpu(msg->hdr.type);
-        if (!monc)
-                return;
-        switch (type) {
-        case CEPH_MSG_AUTH_REPLY:
-                handle_auth_reply(monc, msg);
-                break;
-        case CEPH_MSG_MON_SUBSCRIBE_ACK:
-                handle_subscribe_ack(monc, msg);
-                break;
-        case CEPH_MSG_STATFS_REPLY:
-                handle_statfs_reply(monc, msg);
-                break;
-        case CEPH_MSG_POOLOP_REPLY:
-                handle_poolop_reply(monc, msg);
-                break;
-        case CEPH_MSG_MON_MAP:
-                ceph_monc_handle_map(monc, msg);
-                break;
-        case CEPH_MSG_MDS_MAP:
-                ceph_mdsc_handle_map(&monc->client->mdsc, msg);
-                break;
-        case CEPH_MSG_OSD_MAP:
-                ceph_osdc_handle_map(&monc->client->osdc, msg);
-                break;
-        default:
-                pr_err("received unknown message type %d %s\n", type,
-                       ceph_msg_type_name(type));
-        }
-        ceph_msg_put(msg);
-}
-/*
- * Allocate memory for incoming message
- */
-static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
-                                      struct ceph_msg_header *hdr,
-                                      int *skip)
-{
-        struct ceph_mon_client *monc = con->private;
-        int type = le16_to_cpu(hdr->type);
-        int front_len = le32_to_cpu(hdr->front_len);
-        struct ceph_msg *m = NULL;
-        *skip = 0;
-        switch (type) {
-        case CEPH_MSG_MON_SUBSCRIBE_ACK:
-                m = ceph_msg_get(monc->m_subscribe_ack);
-                break;
-        case CEPH_MSG_POOLOP_REPLY:
-        case CEPH_MSG_STATFS_REPLY:
-                return get_generic_reply(con, hdr, skip);
-        case CEPH_MSG_AUTH_REPLY:
-                m = ceph_msg_get(monc->m_auth_reply);
-                break;
-        case CEPH_MSG_MON_MAP:
-        case CEPH_MSG_MDS_MAP:
-        case CEPH_MSG_OSD_MAP:
-                m = ceph_msg_new(type, front_len, GFP_NOFS);
-                break;
-        }
-        if (!m) {
-                pr_info("alloc_msg unknown type %d\n", type);
-                *skip = 1;
-        }
-        return m;
-}
-/*
- * If the monitor connection resets, pick a new monitor and resubmit
- * any pending requests.
- */
-static void mon_fault(struct ceph_connection *con)
-{
-        struct ceph_mon_client *monc = con->private;
-        if (!monc)
-                return;
-        dout("mon_fault\n");
-        mutex_lock(&monc->mutex);
-        if (!con->private)
-                goto out;
-        if (monc->con && !monc->hunting)
-                pr_info("mon%d %s session lost, "
-                        "hunting for new mon\n", monc->cur_mon,
-                        pr_addr(&monc->con->peer_addr.in_addr));
-        __close_session(monc);
-        if (!monc->hunting) {
-                /* start hunting */
-                monc->hunting = true;
-                __open_session(monc);
-        } else {
-                /* already hunting, let's wait a bit */
-                __schedule_delayed(monc);
-        }
-out:
-        mutex_unlock(&monc->mutex);
-}
-static const struct ceph_connection_operations mon_con_ops = {
-        .get = ceph_con_get,
-        .put = ceph_con_put,
-        .dispatch = dispatch,
-        .fault = mon_fault,
-        .alloc_msg = mon_alloc_msg,
-};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
deleted file mode 100644
index 8e396f2c0963..000000000000
--- a/fs/ceph/mon_client.h
+++ /dev/null
@@ -1,121 +0,0 @@
-#ifndef _FS_CEPH_MON_CLIENT_H
-#define _FS_CEPH_MON_CLIENT_H
-#include <linux/completion.h>
-#include <linux/kref.h>
-#include <linux/rbtree.h>
-#include "messenger.h"
-struct ceph_client;
-struct ceph_mount_args;
-struct ceph_auth_client;
-/*
- * The monitor map enumerates the set of all monitors.
- */
-struct ceph_monmap {
-        struct ceph_fsid fsid;
-        u32 epoch;
-        u32 num_mon;
-        struct ceph_entity_inst mon_inst[0];
-};
-struct ceph_mon_client;
-struct ceph_mon_generic_request;
-/*
- * Generic mechanism for resending monitor requests.
- */
-typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
-                                         int newmon);
-/* a pending monitor request */
-struct ceph_mon_request {
-        struct ceph_mon_client *monc;
-        struct delayed_work delayed_work;
-        unsigned long delay;
-        ceph_monc_request_func_t do_request;
-};
-/*
- * ceph_mon_generic_request is being used for the statfs and poolop requests
- * which are bening done a bit differently because we need to get data back
- * to the caller
- */
-struct ceph_mon_generic_request {
-        struct kref kref;
-        u64 tid;
-        struct rb_node node;
-        int result;
-        void *buf;
-        int buf_len;
-        struct completion completion;
-        struct ceph_msg *request;  /* original request */
-        struct ceph_msg *reply;    /* and reply */
-};
-struct ceph_mon_client {
-        struct ceph_client *client;
-        struct ceph_monmap *monmap;
-        struct mutex mutex;
-        struct delayed_work delayed_work;
-        struct ceph_auth_client *auth;
-        struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
-        int pending_auth;
-        bool hunting;
-        int cur_mon;                       /* last monitor i contacted */
-        unsigned long sub_sent, sub_renew_after;
-        struct ceph_connection *con;
-        bool have_fsid;
-        /* pending generic requests */
-        struct rb_root generic_request_tree;
-        int num_generic_requests;
-        u64 last_tid;
-        /* mds/osd map */
-        int want_next_osdmap; /* 1 = want, 2 = want+asked */
-        u32 have_osdmap, have_mdsmap;
-#ifdef CONFIG_DEBUG_FS
-        struct dentry *debugfs_file;
-#endif
-};
-extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
-extern int ceph_monmap_contains(struct ceph_monmap *m,
-                                struct ceph_entity_addr *addr);
-extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
-extern void ceph_monc_stop(struct ceph_mon_client *monc);
-/*
- * The model here is to indicate that we need a new map of at least
- * epoch @want, and also call in when we receive a map.  We will
- * periodically rerequest the map from the monitor cluster until we
- * get what we want.
- */
-extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
-extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
-extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
-extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
-                               struct ceph_statfs *buf);
-extern int ceph_monc_open_session(struct ceph_mon_client *monc);
-extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
-extern int ceph_monc_create_snapid(struct ceph_mon_client *monc,
-                                   u32 pool, u64 *snapid);
-extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
-                                   u32 pool, u64 snapid);
-#endif
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
deleted file mode 100644
index dd65a6438131..000000000000
--- a/fs/ceph/msgpool.c
+++ /dev/null
@@ -1,64 +0,0 @@
-#include "ceph_debug.h"
-#include <linux/err.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-#include <linux/vmalloc.h>
-#include "msgpool.h"
-static void *alloc_fn(gfp_t gfp_mask, void *arg)
-{
-        struct ceph_msgpool *pool = arg;
-        void *p;
-        p = ceph_msg_new(0, pool->front_len, gfp_mask);
-        if (!p)
-                pr_err("msgpool %s alloc failed\n", pool->name);
-        return p;
-}
-static void free_fn(void *element, void *arg)
-{
-        ceph_msg_put(element);
-}
-int ceph_msgpool_init(struct ceph_msgpool *pool,
-                      int front_len, int size, bool blocking, const char *name)
-{
-        pool->front_len = front_len;
-        pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
-        if (!pool->pool)
-                return -ENOMEM;
-        pool->name = name;
-        return 0;
-}
-void ceph_msgpool_destroy(struct ceph_msgpool *pool)
-{
-        mempool_destroy(pool->pool);
-}
-struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
-                                  int front_len)
-{
-        if (front_len > pool->front_len) {
-                pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
-                       pool->name, front_len, pool->front_len);
-                WARN_ON(1);
-                /* try to alloc a fresh message */
-                return ceph_msg_new(0, front_len, GFP_NOFS);
-        }
-        return mempool_alloc(pool->pool, GFP_NOFS);
-}
-void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
-{
-        /* reset msg front_len; user may have changed it */
-        msg->front.iov_len = pool->front_len;
-        msg->hdr.front_len = cpu_to_le32(pool->front_len);
-        kref_init(&msg->kref);  /* retake single ref */
-}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
deleted file mode 100644
index a362605f9368..000000000000
--- a/fs/ceph/msgpool.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef _FS_CEPH_MSGPOOL
-#define _FS_CEPH_MSGPOOL
-#include <linux/mempool.h>
-#include "messenger.h"
-/*
- * we use memory pools for preallocating messages we may receive, to
- * avoid unexpected OOM conditions.
- */
-struct ceph_msgpool {
-        const char *name;
-        mempool_t *pool;
-        int front_len;          /* preallocated payload size */
-};
-extern int ceph_msgpool_init(struct ceph_msgpool *pool,
-                             int front_len, int size, bool blocking,
-                             const char *name);
-extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
-extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
-                                         int front_len);
-extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
-#endif
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
deleted file mode 100644
index 680d3d648cac..000000000000
--- a/fs/ceph/msgr.h
+++ /dev/null
@@ -1,175 +0,0 @@
-#ifndef CEPH_MSGR_H
-#define CEPH_MSGR_H
-/*
- * Data types for message passing layer used by Ceph.
- */
-#define CEPH_MON_PORT    6789  /* default monitor port */
-/*
- * client-side processes will try to bind to ports in this
- * range, simply for the benefit of tools like nmap or wireshark
- * that would like to identify the protocol.
- */
-#define CEPH_PORT_FIRST  6789
-#define CEPH_PORT_START  6800  /* non-monitors start here */
-#define CEPH_PORT_LAST   6900
-/*
- * tcp connection banner.  include a protocol version. and adjust
- * whenever the wire protocol changes.  try to keep this string length
- * constant.
- */
-#define CEPH_BANNER "ceph v027"
-#define CEPH_BANNER_MAX_LEN 30
-/*
- * Rollover-safe type and comparator for 32-bit sequence numbers.
- * Comparator returns -1, 0, or 1.
- */
-typedef __u32 ceph_seq_t;
-static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
-{
-       return (__s32)a - (__s32)b;
-}
-/*
- * entity_name -- logical name for a process participating in the
- * network, e.g. 'mds0' or 'osd3'.
- */
-struct ceph_entity_name {
-        __u8 type;      /* CEPH_ENTITY_TYPE_* */
-        __le64 num;
-} __attribute__ ((packed));
-#define CEPH_ENTITY_TYPE_MON    0x01
-#define CEPH_ENTITY_TYPE_MDS    0x02
-#define CEPH_ENTITY_TYPE_OSD    0x04
-#define CEPH_ENTITY_TYPE_CLIENT 0x08
-#define CEPH_ENTITY_TYPE_AUTH   0x20
-#define CEPH_ENTITY_TYPE_ANY    0xFF
-extern const char *ceph_entity_type_name(int type);
-/*
- * entity_addr -- network address
- */
-struct ceph_entity_addr {
-        __le32 type;
-        __le32 nonce;  /* unique id for process (e.g. pid) */
-        struct sockaddr_storage in_addr;
-} __attribute__ ((packed));
-struct ceph_entity_inst {
-        struct ceph_entity_name name;
-        struct ceph_entity_addr addr;
-} __attribute__ ((packed));
-/* used by message exchange protocol */
-#define CEPH_MSGR_TAG_READY         1  /* server->client: ready for messages */
-#define CEPH_MSGR_TAG_RESETSESSION  2  /* server->client: reset, try again */
-#define CEPH_MSGR_TAG_WAIT          3  /* server->client: wait for racing
-                                          incoming connection */
-#define CEPH_MSGR_TAG_RETRY_SESSION 4  /* server->client + cseq: try again
-                                          with higher cseq */
-#define CEPH_MSGR_TAG_RETRY_GLOBAL  5  /* server->client + gseq: try again
-                                          with higher gseq */
-#define CEPH_MSGR_TAG_CLOSE         6  /* closing pipe */
-#define CEPH_MSGR_TAG_MSG           7  /* message */
-#define CEPH_MSGR_TAG_ACK           8  /* message ack */
-#define CEPH_MSGR_TAG_KEEPALIVE     9  /* just a keepalive byte! */
-#define CEPH_MSGR_TAG_BADPROTOVER  10  /* bad protocol version */
-#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
-#define CEPH_MSGR_TAG_FEATURES      12 /* insufficient features */
-/*
- * connection negotiation
- */
-struct ceph_msg_connect {
-        __le64 features;     /* supported feature bits */
-        __le32 host_type;    /* CEPH_ENTITY_TYPE_* */
-        __le32 global_seq;   /* count connections initiated by this host */
-        __le32 connect_seq;  /* count connections initiated in this session */
-        __le32 protocol_version;
-        __le32 authorizer_protocol;
-        __le32 authorizer_len;
-        __u8  flags;         /* CEPH_MSG_CONNECT_* */
-} __attribute__ ((packed));
-struct ceph_msg_connect_reply {
-        __u8 tag;
-        __le64 features;     /* feature bits for this session */
-        __le32 global_seq;
-        __le32 connect_seq;
-        __le32 protocol_version;
-        __le32 authorizer_len;
-        __u8 flags;
-} __attribute__ ((packed));
-#define CEPH_MSG_CONNECT_LOSSY  1  /* messages i send may be safely dropped */
-/*
- * message header
- */
-struct ceph_msg_header_old {
-        __le64 seq;       /* message seq# for this session */
-        __le64 tid;       /* transaction id */
-        __le16 type;      /* message type */
-        __le16 priority;  /* priority.  higher value == higher priority */
-        __le16 version;   /* version of message encoding */
-        __le32 front_len; /* bytes in main payload */
-        __le32 middle_len;/* bytes in middle payload */
-        __le32 data_len;  /* bytes of data payload */
-        __le16 data_off;  /* sender: include full offset;
-                             receiver: mask against ~PAGE_MASK */
-        struct ceph_entity_inst src, orig_src;
-        __le32 reserved;
-        __le32 crc;       /* header crc32c */
-} __attribute__ ((packed));
-struct ceph_msg_header {
-        __le64 seq;       /* message seq# for this session */
-        __le64 tid;       /* transaction id */
-        __le16 type;      /* message type */
-        __le16 priority;  /* priority.  higher value == higher priority */
-        __le16 version;   /* version of message encoding */
-        __le32 front_len; /* bytes in main payload */
-        __le32 middle_len;/* bytes in middle payload */
-        __le32 data_len;  /* bytes of data payload */
-        __le16 data_off;  /* sender: include full offset;
-                             receiver: mask against ~PAGE_MASK */
-        struct ceph_entity_name src;
-        __le32 reserved;
-        __le32 crc;       /* header crc32c */
-} __attribute__ ((packed));
-#define CEPH_MSG_PRIO_LOW     64
-#define CEPH_MSG_PRIO_DEFAULT 127
-#define CEPH_MSG_PRIO_HIGH    196
-#define CEPH_MSG_PRIO_HIGHEST 255
-/*
- * follows data payload
- */
-struct ceph_msg_footer {
-        __le32 front_crc, middle_crc, data_crc;
-        __u8 flags;
-} __attribute__ ((packed));
-#define CEPH_MSG_FOOTER_COMPLETE  (1<<0)   /* msg wasn't aborted */
-#define CEPH_MSG_FOOTER_NOCRC     (1<<1)   /* no data crc */
-#endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
deleted file mode 100644
index 3b5571b8ce22..000000000000
--- a/fs/ceph/osd_client.c
+++ /dev/null
@@ -1,1539 +0,0 @@
-#include "ceph_debug.h"
-#include <linux/err.h>
-#include <linux/highmem.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-#include "super.h"
-#include "osd_client.h"
-#include "messenger.h"
-#include "decode.h"
-#include "auth.h"
-#define OSD_OP_FRONT_LEN        4096
-#define OSD_OPREPLY_FRONT_LEN   512
-static const struct ceph_connection_operations osd_con_ops;
-static int __kick_requests(struct ceph_osd_client *osdc,
-                          struct ceph_osd *kickosd);
-static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
-/*
- * Implement client access to distributed object storage cluster.
- *
- * All data objects are stored within a cluster/cloud of OSDs, or
- * "object storage devices."  (Note that Ceph OSDs have _nothing_ to
- * do with the T10 OSD extensions to SCSI.)  Ceph OSDs are simply
- * remote daemons serving up and coordinating consistent and safe
- * access to storage.
- *
- * Cluster membership and the mapping of data objects onto storage devices
- * are described by the osd map.
- *
- * We keep track of pending OSD requests (read, write), resubmit
- * requests to different OSDs when the cluster topology/data layout
- * change, or retry the affected requests when the communications
- * channel with an OSD is reset.
- */
-/*
- * calculate the mapping of a file extent onto an object, and fill out the
- * request accordingly.  shorten extent as necessary if it crosses an
- * object boundary.
- *
- * fill osd op in request message.
- */
-static void calc_layout(struct ceph_osd_client *osdc,
-                        struct ceph_vino vino, struct ceph_file_layout *layout,
-                        u64 off, u64 *plen,
-                        struct ceph_osd_request *req)
-{
-        struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
-        struct ceph_osd_op *op = (void *)(reqhead + 1);
-        u64 orig_len = *plen;
-        u64 objoff, objlen;    /* extent in object */
-        u64 bno;
-        reqhead->snapid = cpu_to_le64(vino.snap);
-        /* object extent? */
-        ceph_calc_file_object_mapping(layout, off, plen, &bno,
-                                      &objoff, &objlen);
-        if (*plen < orig_len)
-                dout(" skipping last %llu, final file extent %llu~%llu\n",
-                     orig_len - *plen, off, *plen);
-        sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
-        req->r_oid_len = strlen(req->r_oid);
-        op->extent.offset = cpu_to_le64(objoff);
-        op->extent.length = cpu_to_le64(objlen);
-        req->r_num_pages = calc_pages_for(off, *plen);
-        dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
-             req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
-}
-/*
- * requests
- */
-void ceph_osdc_release_request(struct kref *kref)
-{
-        struct ceph_osd_request *req = container_of(kref,
-                                                    struct ceph_osd_request,
-                                                    r_kref);
-        if (req->r_request)
-                ceph_msg_put(req->r_request);
-        if (req->r_reply)
-                ceph_msg_put(req->r_reply);
-        if (req->r_con_filling_msg) {
-                dout("release_request revoking pages %p from con %p\n",
-                     req->r_pages, req->r_con_filling_msg);
-                ceph_con_revoke_message(req->r_con_filling_msg,
-                                      req->r_reply);
-                ceph_con_put(req->r_con_filling_msg);
-        }
-        if (req->r_own_pages)
-                ceph_release_page_vector(req->r_pages,
-                                         req->r_num_pages);
-        ceph_put_snap_context(req->r_snapc);
-        if (req->r_mempool)
-                mempool_free(req, req->r_osdc->req_mempool);
-        else
-                kfree(req);
-}
-/*
- * build new request AND message, calculate layout, and adjust file
- * extent as needed.
- *
- * if the file was recently truncated, we include information about its
- * old and new size so that the object can be updated appropriately.  (we
- * avoid synchronously deleting truncated objects because it's slow.)
- *
- * if @do_sync, include a 'startsync' command so that the osd will flush
- * data quickly.
- */
-struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
-                                               struct ceph_file_layout *layout,
-                                               struct ceph_vino vino,
-                                               u64 off, u64 *plen,
-                                               int opcode, int flags,
-                                               struct ceph_snap_context *snapc,
-                                               int do_sync,
-                                               u32 truncate_seq,
-                                               u64 truncate_size,
-                                               struct timespec *mtime,
-                                               bool use_mempool, int num_reply)
-{
-        struct ceph_osd_request *req;
-        struct ceph_msg *msg;
-        struct ceph_osd_request_head *head;
-        struct ceph_osd_op *op;
-        void *p;
-        int num_op = 1 + do_sync;
-        size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
-        int i;
-        if (use_mempool) {
-                req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
-                memset(req, 0, sizeof(*req));
-        } else {
-                req = kzalloc(sizeof(*req), GFP_NOFS);
-        }
-        if (req == NULL)
-                return NULL;
-        req->r_osdc = osdc;
-        req->r_mempool = use_mempool;
-        kref_init(&req->r_kref);
-        init_completion(&req->r_completion);
-        init_completion(&req->r_safe_completion);
-        INIT_LIST_HEAD(&req->r_unsafe_item);
-        req->r_flags = flags;
-        WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
-        /* create reply message */
-        if (use_mempool)
-                msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
-        else
-                msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
-                                   OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
-        if (!msg) {
-                ceph_osdc_put_request(req);
-                return NULL;
-        }
-        req->r_reply = msg;
-        /* create request message; allow space for oid */
-        msg_size += 40;
-        if (snapc)
-                msg_size += sizeof(u64) * snapc->num_snaps;
-        if (use_mempool)
-                msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
-        else
-                msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
-        if (!msg) {
-                ceph_osdc_put_request(req);
-                return NULL;
-        }
-        msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
-        memset(msg->front.iov_base, 0, msg->front.iov_len);
-        head = msg->front.iov_base;
-        op = (void *)(head + 1);
-        p = (void *)(op + num_op);
-        req->r_request = msg;
-        req->r_snapc = ceph_get_snap_context(snapc);
-        head->client_inc = cpu_to_le32(1); /* always, for now. */
-        head->flags = cpu_to_le32(flags);
-        if (flags & CEPH_OSD_FLAG_WRITE)
-                ceph_encode_timespec(&head->mtime, mtime);
-        head->num_ops = cpu_to_le16(num_op);
-        op->op = cpu_to_le16(opcode);
-        /* calculate max write size */
-        calc_layout(osdc, vino, layout, off, plen, req);
-        req->r_file_layout = *layout;  /* keep a copy */
-        if (flags & CEPH_OSD_FLAG_WRITE) {
-                req->r_request->hdr.data_off = cpu_to_le16(off);
-                req->r_request->hdr.data_len = cpu_to_le32(*plen);
-                op->payload_len = cpu_to_le32(*plen);
-        }
-        op->extent.truncate_size = cpu_to_le64(truncate_size);
-        op->extent.truncate_seq = cpu_to_le32(truncate_seq);
-        /* fill in oid */
-        head->object_len = cpu_to_le32(req->r_oid_len);
-        memcpy(p, req->r_oid, req->r_oid_len);
-        p += req->r_oid_len;
-        if (do_sync) {
-                op++;
-                op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
-        }
-        if (snapc) {
-                head->snap_seq = cpu_to_le64(snapc->seq);
-                head->num_snaps = cpu_to_le32(snapc->num_snaps);
-                for (i = 0; i < snapc->num_snaps; i++) {
-                        put_unaligned_le64(snapc->snaps[i], p);
-                        p += sizeof(u64);
-                }
-        }
-        BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
-        msg_size = p - msg->front.iov_base;
-        msg->front.iov_len = msg_size;
-        msg->hdr.front_len = cpu_to_le32(msg_size);
-        return req;
-}
-/*
- * We keep osd requests in an rbtree, sorted by ->r_tid.
- */
-static void __insert_request(struct ceph_osd_client *osdc,
-                             struct ceph_osd_request *new)
-{
-        struct rb_node **p = &osdc->requests.rb_node;
-        struct rb_node *parent = NULL;
-        struct ceph_osd_request *req = NULL;
-        while (*p) {
-                parent = *p;
-                req = rb_entry(parent, struct ceph_osd_request, r_node);
-                if (new->r_tid < req->r_tid)
-                        p = &(*p)->rb_left;
-                else if (new->r_tid > req->r_tid)
-                        p = &(*p)->rb_right;
-                else
-                        BUG();
-        }
-        rb_link_node(&new->r_node, parent, p);
-        rb_insert_color(&new->r_node, &osdc->requests);
-}
-static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
-                                                 u64 tid)
-{
-        struct ceph_osd_request *req;
-        struct rb_node *n = osdc->requests.rb_node;
-        while (n) {
-                req = rb_entry(n, struct ceph_osd_request, r_node);
-                if (tid < req->r_tid)
-                        n = n->rb_left;
-                else if (tid > req->r_tid)
-                        n = n->rb_right;
-                else
-                        return req;
-        }
-        return NULL;
-}
-static struct ceph_osd_request *
-__lookup_request_ge(struct ceph_osd_client *osdc,
-                    u64 tid)
-{
-        struct ceph_osd_request *req;
-        struct rb_node *n = osdc->requests.rb_node;
-        while (n) {
-                req = rb_entry(n, struct ceph_osd_request, r_node);
-                if (tid < req->r_tid) {
-                        if (!n->rb_left)
-                                return req;
-                        n = n->rb_left;
-                } else if (tid > req->r_tid) {
-                        n = n->rb_right;
-                } else {
-                        return req;
-                }
-        }
-        return NULL;
-}
-/*
- * If the osd connection drops, we need to resubmit all requests.
- */
-static void osd_reset(struct ceph_connection *con)
-{
-        struct ceph_osd *osd = con->private;
-        struct ceph_osd_client *osdc;
-        if (!osd)
-                return;
-        dout("osd_reset osd%d\n", osd->o_osd);
-        osdc = osd->o_osdc;
-        down_read(&osdc->map_sem);
-        kick_requests(osdc, osd);
-        up_read(&osdc->map_sem);
-}
-/*
- * Track open sessions with osds.
- */
-static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
-{
-        struct ceph_osd *osd;
-        osd = kzalloc(sizeof(*osd), GFP_NOFS);
-        if (!osd)
-                return NULL;
-        atomic_set(&osd->o_ref, 1);
-        osd->o_osdc = osdc;
-        INIT_LIST_HEAD(&osd->o_requests);
-        INIT_LIST_HEAD(&osd->o_osd_lru);
-        osd->o_incarnation = 1;
-        ceph_con_init(osdc->client->msgr, &osd->o_con);
-        osd->o_con.private = osd;
-        osd->o_con.ops = &osd_con_ops;
-        osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
-        INIT_LIST_HEAD(&osd->o_keepalive_item);
-        return osd;
-}
-static struct ceph_osd *get_osd(struct ceph_osd *osd)
-{
-        if (atomic_inc_not_zero(&osd->o_ref)) {
-                dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
-                     atomic_read(&osd->o_ref));
-                return osd;
-        } else {
-                dout("get_osd %p FAIL\n", osd);
-                return NULL;
-        }
-}
-static void put_osd(struct ceph_osd *osd)
-{
-        dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
-             atomic_read(&osd->o_ref) - 1);
-        if (atomic_dec_and_test(&osd->o_ref)) {
-                struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
-                if (osd->o_authorizer)
-                        ac->ops->destroy_authorizer(ac, osd->o_authorizer);
-                kfree(osd);
-        }
-}
-/*
- * remove an osd from our map
- */
-static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
-        dout("__remove_osd %p\n", osd);
-        BUG_ON(!list_empty(&osd->o_requests));
-        rb_erase(&osd->o_node, &osdc->osds);
-        list_del_init(&osd->o_osd_lru);
-        ceph_con_close(&osd->o_con);
-        put_osd(osd);
-}
-static void __move_osd_to_lru(struct ceph_osd_client *osdc,
-                              struct ceph_osd *osd)
-{
-        dout("__move_osd_to_lru %p\n", osd);
-        BUG_ON(!list_empty(&osd->o_osd_lru));
-        list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
-        osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
-}
-static void __remove_osd_from_lru(struct ceph_osd *osd)
-{
-        dout("__remove_osd_from_lru %p\n", osd);
-        if (!list_empty(&osd->o_osd_lru))
-                list_del_init(&osd->o_osd_lru);
-}
-static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
-{
-        struct ceph_osd *osd, *nosd;
-        dout("__remove_old_osds %p\n", osdc);
-        mutex_lock(&osdc->request_mutex);
-        list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
-                if (!remove_all && time_before(jiffies, osd->lru_ttl))
-                        break;
-                __remove_osd(osdc, osd);
-        }
-        mutex_unlock(&osdc->request_mutex);
-}
-/*
- * reset osd connect
- */
-static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
-        struct ceph_osd_request *req;
-        int ret = 0;
-        dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
-        if (list_empty(&osd->o_requests)) {
-                __remove_osd(osdc, osd);
-        } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
-                          &osd->o_con.peer_addr,
-                          sizeof(osd->o_con.peer_addr)) == 0 &&
-                   !ceph_con_opened(&osd->o_con)) {
-                dout(" osd addr hasn't changed and connection never opened,"
-                     " letting msgr retry");
-                /* touch each r_stamp for handle_timeout()'s benfit */
-                list_for_each_entry(req, &osd->o_requests, r_osd_item)
-                        req->r_stamp = jiffies;
-                ret = -EAGAIN;
-        } else {
-                ceph_con_close(&osd->o_con);
-                ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
-                osd->o_incarnation++;
-        }
-        return ret;
-}
-static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
-{
-        struct rb_node **p = &osdc->osds.rb_node;
-        struct rb_node *parent = NULL;
-        struct ceph_osd *osd = NULL;
-        while (*p) {
-                parent = *p;
-                osd = rb_entry(parent, struct ceph_osd, o_node);
-                if (new->o_osd < osd->o_osd)
-                        p = &(*p)->rb_left;
-                else if (new->o_osd > osd->o_osd)
-                        p = &(*p)->rb_right;
-                else
-                        BUG();
-        }
-        rb_link_node(&new->o_node, parent, p);
-        rb_insert_color(&new->o_node, &osdc->osds);
-}
-static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
-{
-        struct ceph_osd *osd;
-        struct rb_node *n = osdc->osds.rb_node;
-        while (n) {
-                osd = rb_entry(n, struct ceph_osd, o_node);
-                if (o < osd->o_osd)
-                        n = n->rb_left;
-                else if (o > osd->o_osd)
-                        n = n->rb_right;
-                else
-                        return osd;
-        }
-        return NULL;
-}
-static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
-{
-        schedule_delayed_work(&osdc->timeout_work,
-                        osdc->client->mount_args->osd_keepalive_timeout * HZ);
-}
-static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
-{
-        cancel_delayed_work(&osdc->timeout_work);
-}
-/*
- * Register request, assign tid.  If this is the first request, set up
- * the timeout event.
- */
-static void register_request(struct ceph_osd_client *osdc,
-                             struct ceph_osd_request *req)
-{
-        mutex_lock(&osdc->request_mutex);
-        req->r_tid = ++osdc->last_tid;
-        req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
-        INIT_LIST_HEAD(&req->r_req_lru_item);
-        dout("register_request %p tid %lld\n", req, req->r_tid);
-        __insert_request(osdc, req);
-        ceph_osdc_get_request(req);
-        osdc->num_requests++;
-        if (osdc->num_requests == 1) {
-                dout(" first request, scheduling timeout\n");
-                __schedule_osd_timeout(osdc);
-        }
-        mutex_unlock(&osdc->request_mutex);
-}
-/*
- * called under osdc->request_mutex
- */
-static void __unregister_request(struct ceph_osd_client *osdc,
-                                 struct ceph_osd_request *req)
-{
-        dout("__unregister_request %p tid %lld\n", req, req->r_tid);
-        rb_erase(&req->r_node, &osdc->requests);
-        osdc->num_requests--;
-        if (req->r_osd) {
-                /* make sure the original request isn't in flight. */
-                ceph_con_revoke(&req->r_osd->o_con, req->r_request);
-                list_del_init(&req->r_osd_item);
-                if (list_empty(&req->r_osd->o_requests))
-                        __move_osd_to_lru(osdc, req->r_osd);
-                req->r_osd = NULL;
-        }
-        ceph_osdc_put_request(req);
-        list_del_init(&req->r_req_lru_item);
-        if (osdc->num_requests == 0) {
-                dout(" no requests, canceling timeout\n");
-                __cancel_osd_timeout(osdc);
-        }
-}
-/*
- * Cancel a previously queued request message
- */
-static void __cancel_request(struct ceph_osd_request *req)
-{
-        if (req->r_sent && req->r_osd) {
-                ceph_con_revoke(&req->r_osd->o_con, req->r_request);
-                req->r_sent = 0;
-        }
-        list_del_init(&req->r_req_lru_item);
-}
-/*
- * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
- * (as needed), and set the request r_osd appropriately.  If there is
- * no up osd, set r_osd to NULL.
- *
- * Return 0 if unchanged, 1 if changed, or negative on error.
- *
- * Caller should hold map_sem for read and request_mutex.
- */
-static int __map_osds(struct ceph_osd_client *osdc,
-                      struct ceph_osd_request *req)
-{
-        struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
-        struct ceph_pg pgid;
-        int acting[CEPH_PG_MAX_SIZE];
-        int o = -1, num = 0;
-        int err;
-        dout("map_osds %p tid %lld\n", req, req->r_tid);
-        err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
-                                      &req->r_file_layout, osdc->osdmap);
-        if (err)
-                return err;
-        pgid = reqhead->layout.ol_pgid;
-        req->r_pgid = pgid;
-        err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
-        if (err > 0) {
-                o = acting[0];
-                num = err;
-        }
-        if ((req->r_osd && req->r_osd->o_osd == o &&
-             req->r_sent >= req->r_osd->o_incarnation &&
-             req->r_num_pg_osds == num &&
-             memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
-            (req->r_osd == NULL && o == -1))
-                return 0;  /* no change */
-        dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
-             req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
-             req->r_osd ? req->r_osd->o_osd : -1);
-        /* record full pg acting set */
-        memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
-        req->r_num_pg_osds = num;
-        if (req->r_osd) {
-                __cancel_request(req);
-                list_del_init(&req->r_osd_item);
-                req->r_osd = NULL;
-        }
-        req->r_osd = __lookup_osd(osdc, o);
-        if (!req->r_osd && o >= 0) {
-                err = -ENOMEM;
-                req->r_osd = create_osd(osdc);
-                if (!req->r_osd)
-                        goto out;
-                dout("map_osds osd %p is osd%d\n", req->r_osd, o);
-                req->r_osd->o_osd = o;
-                req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
-                __insert_osd(osdc, req->r_osd);
-                ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
-        }
-        if (req->r_osd) {
-                __remove_osd_from_lru(req->r_osd);
-                list_add(&req->r_osd_item, &req->r_osd->o_requests);
-        }
-        err = 1;   /* osd or pg changed */
-out:
-        return err;
-}
-/*
- * caller should hold map_sem (for read) and request_mutex
- */
-static int __send_request(struct ceph_osd_client *osdc,
-                          struct ceph_osd_request *req)
-{
-        struct ceph_osd_request_head *reqhead;
-        int err;
-        err = __map_osds(osdc, req);
-        if (err < 0)
-                return err;
-        if (req->r_osd == NULL) {
-                dout("send_request %p no up osds in pg\n", req);
-                ceph_monc_request_next_osdmap(&osdc->client->monc);
-                return 0;
-        }
-        dout("send_request %p tid %llu to osd%d flags %d\n",
-             req, req->r_tid, req->r_osd->o_osd, req->r_flags);
-        reqhead = req->r_request->front.iov_base;
-        reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
-        reqhead->flags |= cpu_to_le32(req->r_flags);  /* e.g., RETRY */
-        reqhead->reassert_version = req->r_reassert_version;
-        req->r_stamp = jiffies;
-        list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
-        ceph_msg_get(req->r_request); /* send consumes a ref */
-        ceph_con_send(&req->r_osd->o_con, req->r_request);
-        req->r_sent = req->r_osd->o_incarnation;
-        return 0;
-}
-/*
- * Timeout callback, called every N seconds when 1 or more osd
- * requests has been active for more than N seconds.  When this
- * happens, we ping all OSDs with requests who have timed out to
- * ensure any communications channel reset is detected.  Reset the
- * request timeouts another N seconds in the future as we go.
- * Reschedule the timeout event another N seconds in future (unless
- * there are no open requests).
- */
-static void handle_timeout(struct work_struct *work)
-{
-        struct ceph_osd_client *osdc =
-                container_of(work, struct ceph_osd_client, timeout_work.work);
-        struct ceph_osd_request *req, *last_req = NULL;
-        struct ceph_osd *osd;
-        unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
-        unsigned long keepalive =
-                osdc->client->mount_args->osd_keepalive_timeout * HZ;
-        unsigned long last_stamp = 0;
-        struct rb_node *p;
-        struct list_head slow_osds;
-        dout("timeout\n");
-        down_read(&osdc->map_sem);
-        ceph_monc_request_next_osdmap(&osdc->client->monc);
-        mutex_lock(&osdc->request_mutex);
-        for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
-                req = rb_entry(p, struct ceph_osd_request, r_node);
-                if (req->r_resend) {
-                        int err;
-                        dout("osdc resending prev failed %lld\n", req->r_tid);
-                        err = __send_request(osdc, req);
-                        if (err)
-                                dout("osdc failed again on %lld\n", req->r_tid);
-                        else
-                                req->r_resend = false;
-                        continue;
-                }
-        }
-        /*
-         * reset osds that appear to be _really_ unresponsive.  this
-         * is a failsafe measure.. we really shouldn't be getting to
-         * this point if the system is working properly.  the monitors
-         * should mark the osd as failed and we should find out about
-         * it from an updated osd map.
-         */
-        while (timeout && !list_empty(&osdc->req_lru)) {
-                req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
-                                 r_req_lru_item);
-                if (time_before(jiffies, req->r_stamp + timeout))
-                        break;
-                BUG_ON(req == last_req && req->r_stamp == last_stamp);
-                last_req = req;
-                last_stamp = req->r_stamp;
-                osd = req->r_osd;
-                BUG_ON(!osd);
-                pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
-                           req->r_tid, osd->o_osd);
-                __kick_requests(osdc, osd);
-        }
-        /*
-         * ping osds that are a bit slow.  this ensures that if there
-         * is a break in the TCP connection we will notice, and reopen
-         * a connection with that osd (from the fault callback).
-         */
-        INIT_LIST_HEAD(&slow_osds);
-        list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
-                if (time_before(jiffies, req->r_stamp + keepalive))
-                        break;
-                osd = req->r_osd;
-                BUG_ON(!osd);
-                dout(" tid %llu is slow, will send keepalive on osd%d\n",
-                     req->r_tid, osd->o_osd);
-                list_move_tail(&osd->o_keepalive_item, &slow_osds);
-        }
-        while (!list_empty(&slow_osds)) {
-                osd = list_entry(slow_osds.next, struct ceph_osd,
-                                 o_keepalive_item);
-                list_del_init(&osd->o_keepalive_item);
-                ceph_con_keepalive(&osd->o_con);
-        }
-        __schedule_osd_timeout(osdc);
-        mutex_unlock(&osdc->request_mutex);
-        up_read(&osdc->map_sem);
-}
-static void handle_osds_timeout(struct work_struct *work)
-{
-        struct ceph_osd_client *osdc =
-                container_of(work, struct ceph_osd_client,
-                             osds_timeout_work.work);
-        unsigned long delay =
-                osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
-        dout("osds timeout\n");
-        down_read(&osdc->map_sem);
-        remove_old_osds(osdc, 0);
-        up_read(&osdc->map_sem);
-        schedule_delayed_work(&osdc->osds_timeout_work,
-                              round_jiffies_relative(delay));
-}
-/*
- * handle osd op reply.  either call the callback if it is specified,
- * or do the completion to wake up the waiting thread.
- */
-static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
-                         struct ceph_connection *con)
-{
-        struct ceph_osd_reply_head *rhead = msg->front.iov_base;
-        struct ceph_osd_request *req;
-        u64 tid;
-        int numops, object_len, flags;
-        s32 result;
-        tid = le64_to_cpu(msg->hdr.tid);
-        if (msg->front.iov_len < sizeof(*rhead))
-                goto bad;
-        numops = le32_to_cpu(rhead->num_ops);
-        object_len = le32_to_cpu(rhead->object_len);
-        result = le32_to_cpu(rhead->result);
-        if (msg->front.iov_len != sizeof(*rhead) + object_len +
-            numops * sizeof(struct ceph_osd_op))
-                goto bad;
-        dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
-        /* lookup */
-        mutex_lock(&osdc->request_mutex);
-        req = __lookup_request(osdc, tid);
-        if (req == NULL) {
-                dout("handle_reply tid %llu dne\n", tid);
-                mutex_unlock(&osdc->request_mutex);
-                return;
-        }
-        ceph_osdc_get_request(req);
-        flags = le32_to_cpu(rhead->flags);
-        /*
-         * if this connection filled our message, drop our reference now, to
-         * avoid a (safe but slower) revoke later.
-         */
-        if (req->r_con_filling_msg == con && req->r_reply == msg) {
-                dout(" dropping con_filling_msg ref %p\n", con);
-                req->r_con_filling_msg = NULL;
-                ceph_con_put(con);
-        }
-        if (!req->r_got_reply) {
-                unsigned bytes;
-                req->r_result = le32_to_cpu(rhead->result);
-                bytes = le32_to_cpu(msg->hdr.data_len);
-                dout("handle_reply result %d bytes %d\n", req->r_result,
-                     bytes);
-                if (req->r_result == 0)
-                        req->r_result = bytes;
-                /* in case this is a write and we need to replay, */
-                req->r_reassert_version = rhead->reassert_version;
-                req->r_got_reply = 1;
-        } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
-                dout("handle_reply tid %llu dup ack\n", tid);
-                mutex_unlock(&osdc->request_mutex);
-                goto done;
-        }
-        dout("handle_reply tid %llu flags %d\n", tid, flags);
-        /* either this is a read, or we got the safe response */
-        if (result < 0 ||
-            (flags & CEPH_OSD_FLAG_ONDISK) ||
-            ((flags & CEPH_OSD_FLAG_WRITE) == 0))
-                __unregister_request(osdc, req);
-        mutex_unlock(&osdc->request_mutex);
-        if (req->r_callback)
-                req->r_callback(req, msg);
-        else
-                complete_all(&req->r_completion);
-        if (flags & CEPH_OSD_FLAG_ONDISK) {
-                if (req->r_safe_callback)
-                        req->r_safe_callback(req, msg);
-                complete_all(&req->r_safe_completion);  /* fsync waiter */
-        }
-done:
-        ceph_osdc_put_request(req);
-        return;
-bad:
-        pr_err("corrupt osd_op_reply got %d %d expected %d\n",
-               (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
-               (int)sizeof(*rhead));
-        ceph_msg_dump(msg);
-}
-static int __kick_requests(struct ceph_osd_client *osdc,
-                          struct ceph_osd *kickosd)
-{
-        struct ceph_osd_request *req;
-        struct rb_node *p, *n;
-        int needmap = 0;
-        int err;
-        dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
-        if (kickosd) {
-                err = __reset_osd(osdc, kickosd);
-                if (err == -EAGAIN)
-                        return 1;
-        } else {
-                for (p = rb_first(&osdc->osds); p; p = n) {
-                        struct ceph_osd *osd =
-                                rb_entry(p, struct ceph_osd, o_node);
-                        n = rb_next(p);
-                        if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
-                            memcmp(&osd->o_con.peer_addr,
-                                   ceph_osd_addr(osdc->osdmap,
-                                                 osd->o_osd),
-                                   sizeof(struct ceph_entity_addr)) != 0)
-                                __reset_osd(osdc, osd);
-                }
-        }
-        for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
-                req = rb_entry(p, struct ceph_osd_request, r_node);
-                if (req->r_resend) {
-                        dout(" r_resend set on tid %llu\n", req->r_tid);
-                        __cancel_request(req);
-                        goto kick;
-                }
-                if (req->r_osd && kickosd == req->r_osd) {
-                        __cancel_request(req);
-                        goto kick;
-                }
-                err = __map_osds(osdc, req);
-                if (err == 0)
-                        continue;  /* no change */
-                if (err < 0) {
-                        /*
-                         * FIXME: really, we should set the request
-                         * error and fail if this isn't a 'nofail'
-                         * request, but that's a fair bit more
-                         * complicated to do.  So retry!
-                         */
-                        dout(" setting r_resend on %llu\n", req->r_tid);
-                        req->r_resend = true;
-                        continue;
-                }
-                if (req->r_osd == NULL) {
-                        dout("tid %llu maps to no valid osd\n", req->r_tid);
-                        needmap++;  /* request a newer map */
-                        continue;
-                }
-kick:
-                dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
-                     req->r_osd ? req->r_osd->o_osd : -1);
-                req->r_flags |= CEPH_OSD_FLAG_RETRY;
-                err = __send_request(osdc, req);
-                if (err) {
-                        dout(" setting r_resend on %llu\n", req->r_tid);
-                        req->r_resend = true;
-                }
-        }
-        return needmap;
-}
-/*
- * Resubmit osd requests whose osd or osd address has changed.  Request
- * a new osd map if osds are down, or we are otherwise unable to determine
- * how to direct a request.
- *
- * Close connections to down osds.
- *
- * If @who is specified, resubmit requests for that specific osd.
- *
- * Caller should hold map_sem for read and request_mutex.
- */
-static void kick_requests(struct ceph_osd_client *osdc,
-                          struct ceph_osd *kickosd)
-{
-        int needmap;
-        mutex_lock(&osdc->request_mutex);
-        needmap = __kick_requests(osdc, kickosd);
-        mutex_unlock(&osdc->request_mutex);
-        if (needmap) {
-                dout("%d requests for down osds, need new map\n", needmap);
-                ceph_monc_request_next_osdmap(&osdc->client->monc);
-        }
-}
-/*
- * Process updated osd map.
- *
- * The message contains any number of incremental and full maps, normally
- * indicating some sort of topology change in the cluster.  Kick requests
- * off to different OSDs as needed.
- */
-void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
-{
-        void *p, *end, *next;
-        u32 nr_maps, maplen;
-        u32 epoch;
-        struct ceph_osdmap *newmap = NULL, *oldmap;
-        int err;
-        struct ceph_fsid fsid;
-        dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
-        p = msg->front.iov_base;
-        end = p + msg->front.iov_len;
-        /* verify fsid */
-        ceph_decode_need(&p, end, sizeof(fsid), bad);
-        ceph_decode_copy(&p, &fsid, sizeof(fsid));
-        if (ceph_check_fsid(osdc->client, &fsid) < 0)
-                return;
-        down_write(&osdc->map_sem);
-        /* incremental maps */
-        ceph_decode_32_safe(&p, end, nr_maps, bad);
-        dout(" %d inc maps\n", nr_maps);
-        while (nr_maps > 0) {
-                ceph_decode_need(&p, end, 2*sizeof(u32), bad);
-                epoch = ceph_decode_32(&p);
-                maplen = ceph_decode_32(&p);
-                ceph_decode_need(&p, end, maplen, bad);
-                next = p + maplen;
-                if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
-                        dout("applying incremental map %u len %d\n",
-                             epoch, maplen);
-                        newmap = osdmap_apply_incremental(&p, next,
-                                                          osdc->osdmap,
-                                                          osdc->client->msgr);
-                        if (IS_ERR(newmap)) {
-                                err = PTR_ERR(newmap);
-                                goto bad;
-                        }
-                        BUG_ON(!newmap);
-                        if (newmap != osdc->osdmap) {
-                                ceph_osdmap_destroy(osdc->osdmap);
-                                osdc->osdmap = newmap;
-                        }
-                } else {
-                        dout("ignoring incremental map %u len %d\n",
-                             epoch, maplen);
-                }
-                p = next;
-                nr_maps--;
-        }
-        if (newmap)
-                goto done;
-        /* full maps */
-        ceph_decode_32_safe(&p, end, nr_maps, bad);
-        dout(" %d full maps\n", nr_maps);
-        while (nr_maps) {
-                ceph_decode_need(&p, end, 2*sizeof(u32), bad);
-                epoch = ceph_decode_32(&p);
-                maplen = ceph_decode_32(&p);
-                ceph_decode_need(&p, end, maplen, bad);
-                if (nr_maps > 1) {
-                        dout("skipping non-latest full map %u len %d\n",
-                             epoch, maplen);
-                } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
-                        dout("skipping full map %u len %d, "
-                             "older than our %u\n", epoch, maplen,
-                             osdc->osdmap->epoch);
-                } else {
-                        dout("taking full map %u len %d\n", epoch, maplen);
-                        newmap = osdmap_decode(&p, p+maplen);
-                        if (IS_ERR(newmap)) {
-                                err = PTR_ERR(newmap);
-                                goto bad;
-                        }
-                        BUG_ON(!newmap);
-                        oldmap = osdc->osdmap;
-                        osdc->osdmap = newmap;
-                        if (oldmap)
-                                ceph_osdmap_destroy(oldmap);
-                }
-                p += maplen;
-                nr_maps--;
-        }
-done:
-        downgrade_write(&osdc->map_sem);
-        ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
-        if (newmap)
-                kick_requests(osdc, NULL);
-        up_read(&osdc->map_sem);
-        wake_up_all(&osdc->client->auth_wq);
-        return;
-bad:
-        pr_err("osdc handle_map corrupt msg\n");
-        ceph_msg_dump(msg);
-        up_write(&osdc->map_sem);
-        return;
-}
-/*
- * Register request, send initial attempt.
- */
-int ceph_osdc_start_request(struct ceph_osd_client *osdc,
-                            struct ceph_osd_request *req,
-                            bool nofail)
-{
-        int rc = 0;
-        req->r_request->pages = req->r_pages;
-        req->r_request->nr_pages = req->r_num_pages;
-        register_request(osdc, req);
-        down_read(&osdc->map_sem);
-        mutex_lock(&osdc->request_mutex);
-        /*
-         * a racing kick_requests() may have sent the message for us
-         * while we dropped request_mutex above, so only send now if
-         * the request still han't been touched yet.
-         */
-        if (req->r_sent == 0) {
-                rc = __send_request(osdc, req);
-                if (rc) {
-                        if (nofail) {
-                                dout("osdc_start_request failed send, "
-                                     " marking %lld\n", req->r_tid);
-                                req->r_resend = true;
-                                rc = 0;
-                        } else {
-                                __unregister_request(osdc, req);
-                        }
-                }
-        }
-        mutex_unlock(&osdc->request_mutex);
-        up_read(&osdc->map_sem);
-        return rc;
-}
-/*
- * wait for a request to complete
- */
-int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
-                           struct ceph_osd_request *req)
-{
-        int rc;
-        rc = wait_for_completion_interruptible(&req->r_completion);
-        if (rc < 0) {
-                mutex_lock(&osdc->request_mutex);
-                __cancel_request(req);
-                __unregister_request(osdc, req);
-                mutex_unlock(&osdc->request_mutex);
-                dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
-                return rc;
-        }
-        dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
-        return req->r_result;
-}
-/*
- * sync - wait for all in-flight requests to flush.  avoid starvation.
- */
-void ceph_osdc_sync(struct ceph_osd_client *osdc)
-{
-        struct ceph_osd_request *req;
-        u64 last_tid, next_tid = 0;
-        mutex_lock(&osdc->request_mutex);
-        last_tid = osdc->last_tid;
-        while (1) {
-                req = __lookup_request_ge(osdc, next_tid);
-                if (!req)
-                        break;
-                if (req->r_tid > last_tid)
-                        break;
-                next_tid = req->r_tid + 1;
-                if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
-                        continue;
-                ceph_osdc_get_request(req);
-                mutex_unlock(&osdc->request_mutex);
-                dout("sync waiting on tid %llu (last is %llu)\n",
-                     req->r_tid, last_tid);
-                wait_for_completion(&req->r_safe_completion);
-                mutex_lock(&osdc->request_mutex);
-                ceph_osdc_put_request(req);
-        }
-        mutex_unlock(&osdc->request_mutex);
-        dout("sync done (thru tid %llu)\n", last_tid);
-}
-/*
- * init, shutdown
- */
-int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
-{
-        int err;
-        dout("init\n");
-        osdc->client = client;
-        osdc->osdmap = NULL;
-        init_rwsem(&osdc->map_sem);
-        init_completion(&osdc->map_waiters);
-        osdc->last_requested_map = 0;
-        mutex_init(&osdc->request_mutex);
-        osdc->last_tid = 0;
-        osdc->osds = RB_ROOT;
-        INIT_LIST_HEAD(&osdc->osd_lru);
-        osdc->requests = RB_ROOT;
-        INIT_LIST_HEAD(&osdc->req_lru);
-        osdc->num_requests = 0;
-        INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
-        INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
-        schedule_delayed_work(&osdc->osds_timeout_work,
-           round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
-        err = -ENOMEM;
-        osdc->req_mempool = mempool_create_kmalloc_pool(10,
-                                        sizeof(struct ceph_osd_request));
-        if (!osdc->req_mempool)
-                goto out;
-        err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
-                                "osd_op");
-        if (err < 0)
-                goto out_mempool;
-        err = ceph_msgpool_init(&osdc->msgpool_op_reply,
-                                OSD_OPREPLY_FRONT_LEN, 10, true,
-                                "osd_op_reply");
-        if (err < 0)
-                goto out_msgpool;
-        return 0;
-out_msgpool:
-        ceph_msgpool_destroy(&osdc->msgpool_op);
-out_mempool:
-        mempool_destroy(osdc->req_mempool);
-out:
-        return err;
-}
-void ceph_osdc_stop(struct ceph_osd_client *osdc)
-{
-        cancel_delayed_work_sync(&osdc->timeout_work);
-        cancel_delayed_work_sync(&osdc->osds_timeout_work);
-        if (osdc->osdmap) {
-                ceph_osdmap_destroy(osdc->osdmap);
-                osdc->osdmap = NULL;
-        }
-        remove_old_osds(osdc, 1);
-        mempool_destroy(osdc->req_mempool);
-        ceph_msgpool_destroy(&osdc->msgpool_op);
-        ceph_msgpool_destroy(&osdc->msgpool_op_reply);
-}
-/*
- * Read some contiguous pages.  If we cross a stripe boundary, shorten
- * *plen.  Return number of bytes read, or error.
- */
-int ceph_osdc_readpages(struct ceph_osd_client *osdc,
-                        struct ceph_vino vino, struct ceph_file_layout *layout,
-                        u64 off, u64 *plen,
-                        u32 truncate_seq, u64 truncate_size,
-                        struct page **pages, int num_pages)
-{
-        struct ceph_osd_request *req;
-        int rc = 0;
-        dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
-             vino.snap, off, *plen);
-        req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
-                                    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
-                                    NULL, 0, truncate_seq, truncate_size, NULL,
-                                    false, 1);
-        if (!req)
-                return -ENOMEM;
-        /* it may be a short read due to an object boundary */
-        req->r_pages = pages;
-        dout("readpages  final extent is %llu~%llu (%d pages)\n",
-             off, *plen, req->r_num_pages);
-        rc = ceph_osdc_start_request(osdc, req, false);
-        if (!rc)
-                rc = ceph_osdc_wait_request(osdc, req);
-        ceph_osdc_put_request(req);
-        dout("readpages result %d\n", rc);
-        return rc;
-}
-/*
- * do a synchronous write on N pages
- */
-int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
-                         struct ceph_file_layout *layout,
-                         struct ceph_snap_context *snapc,
-                         u64 off, u64 len,
-                         u32 truncate_seq, u64 truncate_size,
-                         struct timespec *mtime,
-                         struct page **pages, int num_pages,
-                         int flags, int do_sync, bool nofail)
-{
-        struct ceph_osd_request *req;
-        int rc = 0;
-        BUG_ON(vino.snap != CEPH_NOSNAP);
-        req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
-                                    CEPH_OSD_OP_WRITE,
-                                    flags | CEPH_OSD_FLAG_ONDISK |
-                                            CEPH_OSD_FLAG_WRITE,
-                                    snapc, do_sync,
-                                    truncate_seq, truncate_size, mtime,
-                                    nofail, 1);
-        if (!req)
-                return -ENOMEM;
-        /* it may be a short write due to an object boundary */
-        req->r_pages = pages;
-        dout("writepages %llu~%llu (%d pages)\n", off, len,
-             req->r_num_pages);
-        rc = ceph_osdc_start_request(osdc, req, nofail);
-        if (!rc)
-                rc = ceph_osdc_wait_request(osdc, req);
-        ceph_osdc_put_request(req);
-        if (rc == 0)
-                rc = len;
-        dout("writepages result %d\n", rc);
-        return rc;
-}
-/*
- * handle incoming message
- */
-static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
-{
-        struct ceph_osd *osd = con->private;
-        struct ceph_osd_client *osdc;
-        int type = le16_to_cpu(msg->hdr.type);
-        if (!osd)
-                goto out;
-        osdc = osd->o_osdc;
-        switch (type) {
-        case CEPH_MSG_OSD_MAP:
-                ceph_osdc_handle_map(osdc, msg);
-                break;
-        case CEPH_MSG_OSD_OPREPLY:
-                handle_reply(osdc, msg, con);
-                break;
-        default:
-                pr_err("received unknown message type %d %s\n", type,
-                       ceph_msg_type_name(type));
-        }
-out:
-        ceph_msg_put(msg);
-}
-/*
- * lookup and return message for incoming reply.  set up reply message
- * pages.
- */
-static struct ceph_msg *get_reply(struct ceph_connection *con,
-                                  struct ceph_msg_header *hdr,
-                                  int *skip)
-{
-        struct ceph_osd *osd = con->private;
-        struct ceph_osd_client *osdc = osd->o_osdc;
-        struct ceph_msg *m;
-        struct ceph_osd_request *req;
-        int front = le32_to_cpu(hdr->front_len);
-        int data_len = le32_to_cpu(hdr->data_len);
-        u64 tid;
-        tid = le64_to_cpu(hdr->tid);
-        mutex_lock(&osdc->request_mutex);
-        req = __lookup_request(osdc, tid);
-        if (!req) {
-                *skip = 1;
-                m = NULL;
-                pr_info("get_reply unknown tid %llu from osd%d\n", tid,
-                        osd->o_osd);
-                goto out;
-        }
-        if (req->r_con_filling_msg) {
-                dout("get_reply revoking msg %p from old con %p\n",
-                     req->r_reply, req->r_con_filling_msg);
-                ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
-                ceph_con_put(req->r_con_filling_msg);
-                req->r_con_filling_msg = NULL;
-        }
-        if (front > req->r_reply->front.iov_len) {
-                pr_warning("get_reply front %d > preallocated %d\n",
-                           front, (int)req->r_reply->front.iov_len);
-                m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
-                if (!m)
-                        goto out;
-                ceph_msg_put(req->r_reply);
-                req->r_reply = m;
-        }
-        m = ceph_msg_get(req->r_reply);
-        if (data_len > 0) {
-                unsigned data_off = le16_to_cpu(hdr->data_off);
-                int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
-                if (unlikely(req->r_num_pages < want)) {
-                        pr_warning("tid %lld reply %d > expected %d pages\n",
-                                   tid, want, m->nr_pages);
-                        *skip = 1;
-                        ceph_msg_put(m);
-                        m = NULL;
-                        goto out;
-                }
-                m->pages = req->r_pages;
-                m->nr_pages = req->r_num_pages;
-        }
-        *skip = 0;
-        req->r_con_filling_msg = ceph_con_get(con);
-        dout("get_reply tid %lld %p\n", tid, m);
-out:
-        mutex_unlock(&osdc->request_mutex);
-        return m;
-}
-static struct ceph_msg *alloc_msg(struct ceph_connection *con,
-                                  struct ceph_msg_header *hdr,
-                                  int *skip)
-{
-        struct ceph_osd *osd = con->private;
-        int type = le16_to_cpu(hdr->type);
-        int front = le32_to_cpu(hdr->front_len);
-        switch (type) {
-        case CEPH_MSG_OSD_MAP:
-                return ceph_msg_new(type, front, GFP_NOFS);
-        case CEPH_MSG_OSD_OPREPLY:
-                return get_reply(con, hdr, skip);
-        default:
-                pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
-                        osd->o_osd);
-                *skip = 1;
-                return NULL;
-        }
-}
-/*
- * Wrappers to refcount containing ceph_osd struct
- */
-static struct ceph_connection *get_osd_con(struct ceph_connection *con)
-{
-        struct ceph_osd *osd = con->private;
-        if (get_osd(osd))
-                return con;
-        return NULL;
-}
-static void put_osd_con(struct ceph_connection *con)
-{
-        struct ceph_osd *osd = con->private;
-        put_osd(osd);
-}
-/*
- * authentication
- */
-static int get_authorizer(struct ceph_connection *con,
-                          void **buf, int *len, int *proto,
-                          void **reply_buf, int *reply_len, int force_new)
-{
-        struct ceph_osd *o = con->private;
-        struct ceph_osd_client *osdc = o->o_osdc;
-        struct ceph_auth_client *ac = osdc->client->monc.auth;
-        int ret = 0;
-        if (force_new && o->o_authorizer) {
-                ac->ops->destroy_authorizer(ac, o->o_authorizer);
-                o->o_authorizer = NULL;
-        }
-        if (o->o_authorizer == NULL) {
-                ret = ac->ops->create_authorizer(
-                        ac, CEPH_ENTITY_TYPE_OSD,
-                        &o->o_authorizer,
-                        &o->o_authorizer_buf,
-                        &o->o_authorizer_buf_len,
-                        &o->o_authorizer_reply_buf,
-                        &o->o_authorizer_reply_buf_len);
-                if (ret)
-                        return ret;
-        }
-        *proto = ac->protocol;
-        *buf = o->o_authorizer_buf;
-        *len = o->o_authorizer_buf_len;
-        *reply_buf = o->o_authorizer_reply_buf;
-        *reply_len = o->o_authorizer_reply_buf_len;
-        return 0;
-}
-static int verify_authorizer_reply(struct ceph_connection *con, int len)
-{
-        struct ceph_osd *o = con->private;
-        struct ceph_osd_client *osdc = o->o_osdc;
-        struct ceph_auth_client *ac = osdc->client->monc.auth;
-        return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
-}
-static int invalidate_authorizer(struct ceph_connection *con)
-{
-        struct ceph_osd *o = con->private;
-        struct ceph_osd_client *osdc = o->o_osdc;
-        struct ceph_auth_client *ac = osdc->client->monc.auth;
-        if (ac->ops->invalidate_authorizer)
-                ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
-        return ceph_monc_validate_auth(&osdc->client->monc);
-}
-static const struct ceph_connection_operations osd_con_ops = {
-        .get = get_osd_con,
-        .put = put_osd_con,
-        .dispatch = dispatch,
-        .get_authorizer = get_authorizer,
-        .verify_authorizer_reply = verify_authorizer_reply,
-        .invalidate_authorizer = invalidate_authorizer,
-        .alloc_msg = alloc_msg,
-        .fault = osd_reset,
-};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
deleted file mode 100644
index ce776989ef6a..000000000000
--- a/fs/ceph/osd_client.h
+++ /dev/null
@@ -1,167 +0,0 @@
-#ifndef _FS_CEPH_OSD_CLIENT_H
-#define _FS_CEPH_OSD_CLIENT_H
-#include <linux/completion.h>
-#include <linux/kref.h>
-#include <linux/mempool.h>
-#include <linux/rbtree.h>
-#include "types.h"
-#include "osdmap.h"
-#include "messenger.h"
-struct ceph_msg;
-struct ceph_snap_context;
-struct ceph_osd_request;
-struct ceph_osd_client;
-struct ceph_authorizer;
-/*
- * completion callback for async writepages
- */
-typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
-                                     struct ceph_msg *);
-/* a given osd we're communicating with */
-struct ceph_osd {
-        atomic_t o_ref;
-        struct ceph_osd_client *o_osdc;
-        int o_osd;
-        int o_incarnation;
-        struct rb_node o_node;
-        struct ceph_connection o_con;
-        struct list_head o_requests;
-        struct list_head o_osd_lru;
-        struct ceph_authorizer *o_authorizer;
-        void *o_authorizer_buf, *o_authorizer_reply_buf;
-        size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
-        unsigned long lru_ttl;
-        int o_marked_for_keepalive;
-        struct list_head o_keepalive_item;
-};
-/* an in-flight request */
-struct ceph_osd_request {
-        u64             r_tid;              /* unique for this client */
-        struct rb_node  r_node;
-        struct list_head r_req_lru_item;
-        struct list_head r_osd_item;
-        struct ceph_osd *r_osd;
-        struct ceph_pg   r_pgid;
-        int              r_pg_osds[CEPH_PG_MAX_SIZE];
-        int              r_num_pg_osds;
-        struct ceph_connection *r_con_filling_msg;
-        struct ceph_msg  *r_request, *r_reply;
-        int               r_result;
-        int               r_flags;     /* any additional flags for the osd */
-        u32               r_sent;      /* >0 if r_request is sending/sent */
-        int               r_got_reply;
-        struct ceph_osd_client *r_osdc;
-        struct kref       r_kref;
-        bool              r_mempool;
-        struct completion r_completion, r_safe_completion;
-        ceph_osdc_callback_t r_callback, r_safe_callback;
-        struct ceph_eversion r_reassert_version;
-        struct list_head  r_unsafe_item;
-        struct inode *r_inode;                /* for use by callbacks */
-        char              r_oid[40];          /* object name */
-        int               r_oid_len;
-        unsigned long     r_stamp;            /* send OR check time */
-        bool              r_resend;           /* msg send failed, needs retry */
-        struct ceph_file_layout r_file_layout;
-        struct ceph_snap_context *r_snapc;    /* snap context for writes */
-        unsigned          r_num_pages;        /* size of page array (follows) */
-        struct page     **r_pages;            /* pages for data payload */
-        int               r_pages_from_pool;
-        int               r_own_pages;        /* if true, i own page list */
-};
-struct ceph_osd_client {
-        struct ceph_client     *client;
-        struct ceph_osdmap     *osdmap;       /* current map */
-        struct rw_semaphore    map_sem;
-        struct completion      map_waiters;
-        u64                    last_requested_map;
-        struct mutex           request_mutex;
-        struct rb_root         osds;          /* osds */
-        struct list_head       osd_lru;       /* idle osds */
-        u64                    timeout_tid;   /* tid of timeout triggering rq */
-        u64                    last_tid;      /* tid of last request */
-        struct rb_root         requests;      /* pending requests */
-        struct list_head       req_lru;       /* pending requests lru */
-        int                    num_requests;
-        struct delayed_work    timeout_work;
-        struct delayed_work    osds_timeout_work;
-#ifdef CONFIG_DEBUG_FS
-        struct dentry          *debugfs_file;
-#endif
-        mempool_t              *req_mempool;
-        struct ceph_msgpool     msgpool_op;
-        struct ceph_msgpool     msgpool_op_reply;
-};
-extern int ceph_osdc_init(struct ceph_osd_client *osdc,
-                          struct ceph_client *client);
-extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
-extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
-                                   struct ceph_msg *msg);
-extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
-                                 struct ceph_msg *msg);
-extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
-                                      struct ceph_file_layout *layout,
-                                      struct ceph_vino vino,
-                                      u64 offset, u64 *len, int op, int flags,
-                                      struct ceph_snap_context *snapc,
-                                      int do_sync, u32 truncate_seq,
-                                      u64 truncate_size,
-                                      struct timespec *mtime,
-                                      bool use_mempool, int num_reply);
-static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
-{
-        kref_get(&req->r_kref);
-}
-extern void ceph_osdc_release_request(struct kref *kref);
-static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
-{
-        kref_put(&req->r_kref, ceph_osdc_release_request);
-}
-extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
-                                   struct ceph_osd_request *req,
-                                   bool nofail);
-extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
-                                  struct ceph_osd_request *req);
-extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
-extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
-                               struct ceph_vino vino,
-                               struct ceph_file_layout *layout,
-                               u64 off, u64 *plen,
-                               u32 truncate_seq, u64 truncate_size,
-                               struct page **pages, int nr_pages);
-extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
-                                struct ceph_vino vino,
-                                struct ceph_file_layout *layout,
-                                struct ceph_snap_context *sc,
-                                u64 off, u64 len,
-                                u32 truncate_seq, u64 truncate_size,
-                                struct timespec *mtime,
-                                struct page **pages, int nr_pages,
-                                int flags, int do_sync, bool nofail);
-#endif
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
deleted file mode 100644
index e31f118f1392..000000000000
--- a/fs/ceph/osdmap.c
+++ /dev/null
@@ -1,1110 +0,0 @@
-#include "ceph_debug.h"
-#include <linux/slab.h>
-#include <asm/div64.h>
-#include "super.h"
-#include "osdmap.h"
-#include "crush/hash.h"
-#include "crush/mapper.h"
-#include "decode.h"
-char *ceph_osdmap_state_str(char *str, int len, int state)
-{
-        int flag = 0;
-        if (!len)
-                goto done;
-        *str = '\0';
-        if (state) {
-                if (state & CEPH_OSD_EXISTS) {
-                        snprintf(str, len, "exists");
-                        flag = 1;
-                }
-                if (state & CEPH_OSD_UP) {
-                        snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""),
-                                 "up");
-                        flag = 1;
-                }
-        } else {
-                snprintf(str, len, "doesn't exist");
-        }
-done:
-        return str;
-}
-/* maps */
-static int calc_bits_of(unsigned t)
-{
-        int b = 0;
-        while (t) {
-                t = t >> 1;
-                b++;
-        }
-        return b;
-}
-/*
- * the foo_mask is the smallest value 2^n-1 that is >= foo.
- */
-static void calc_pg_masks(struct ceph_pg_pool_info *pi)
-{
-        pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
-        pi->pgp_num_mask =
-                (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
-        pi->lpg_num_mask =
-                (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
-        pi->lpgp_num_mask =
-                (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
-}
-/*
- * decode crush map
- */
-static int crush_decode_uniform_bucket(void **p, void *end,
-                                       struct crush_bucket_uniform *b)
-{
-        dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
-        ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
-        b->item_weight = ceph_decode_32(p);
-        return 0;
-bad:
-        return -EINVAL;
-}
-static int crush_decode_list_bucket(void **p, void *end,
-                                    struct crush_bucket_list *b)
-{
-        int j;
-        dout("crush_decode_list_bucket %p to %p\n", *p, end);
-        b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
-        if (b->item_weights == NULL)
-                return -ENOMEM;
-        b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
-        if (b->sum_weights == NULL)
-                return -ENOMEM;
-        ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
-        for (j = 0; j < b->h.size; j++) {
-                b->item_weights[j] = ceph_decode_32(p);
-                b->sum_weights[j] = ceph_decode_32(p);
-        }
-        return 0;
-bad:
-        return -EINVAL;
-}
-static int crush_decode_tree_bucket(void **p, void *end,
-                                    struct crush_bucket_tree *b)
-{
-        int j;
-        dout("crush_decode_tree_bucket %p to %p\n", *p, end);
-        ceph_decode_32_safe(p, end, b->num_nodes, bad);
-        b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
-        if (b->node_weights == NULL)
-                return -ENOMEM;
-        ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
-        for (j = 0; j < b->num_nodes; j++)
-                b->node_weights[j] = ceph_decode_32(p);
-        return 0;
-bad:
-        return -EINVAL;
-}
-static int crush_decode_straw_bucket(void **p, void *end,
-                                     struct crush_bucket_straw *b)
-{
-        int j;
-        dout("crush_decode_straw_bucket %p to %p\n", *p, end);
-        b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
-        if (b->item_weights == NULL)
-                return -ENOMEM;
-        b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
-        if (b->straws == NULL)
-                return -ENOMEM;
-        ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
-        for (j = 0; j < b->h.size; j++) {
-                b->item_weights[j] = ceph_decode_32(p);
-                b->straws[j] = ceph_decode_32(p);
-        }
-        return 0;
-bad:
-        return -EINVAL;
-}
-static struct crush_map *crush_decode(void *pbyval, void *end)
-{
-        struct crush_map *c;
-        int err = -EINVAL;
-        int i, j;
-        void **p = &pbyval;
-        void *start = pbyval;
-        u32 magic;
-        dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
-        c = kzalloc(sizeof(*c), GFP_NOFS);
-        if (c == NULL)
-                return ERR_PTR(-ENOMEM);
-        ceph_decode_need(p, end, 4*sizeof(u32), bad);
-        magic = ceph_decode_32(p);
-        if (magic != CRUSH_MAGIC) {
-                pr_err("crush_decode magic %x != current %x\n",
-                       (unsigned)magic, (unsigned)CRUSH_MAGIC);
-                goto bad;
-        }
-        c->max_buckets = ceph_decode_32(p);
-        c->max_rules = ceph_decode_32(p);
-        c->max_devices = ceph_decode_32(p);
-        c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
-        if (c->device_parents == NULL)
-                goto badmem;
-        c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
-        if (c->bucket_parents == NULL)
-                goto badmem;
-        c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
-        if (c->buckets == NULL)
-                goto badmem;
-        c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
-        if (c->rules == NULL)
-                goto badmem;
-        /* buckets */
-        for (i = 0; i < c->max_buckets; i++) {
-                int size = 0;
-                u32 alg;
-                struct crush_bucket *b;
-                ceph_decode_32_safe(p, end, alg, bad);
-                if (alg == 0) {
-                        c->buckets[i] = NULL;
-                        continue;
-                }
-                dout("crush_decode bucket %d off %x %p to %p\n",
-                     i, (int)(*p-start), *p, end);
-                switch (alg) {
-                case CRUSH_BUCKET_UNIFORM:
-                        size = sizeof(struct crush_bucket_uniform);
-                        break;
-                case CRUSH_BUCKET_LIST:
-                        size = sizeof(struct crush_bucket_list);
-                        break;
-                case CRUSH_BUCKET_TREE:
-                        size = sizeof(struct crush_bucket_tree);
-                        break;
-                case CRUSH_BUCKET_STRAW:
-                        size = sizeof(struct crush_bucket_straw);
-                        break;
-                default:
-                        err = -EINVAL;
-                        goto bad;
-                }
-                BUG_ON(size == 0);
-                b = c->buckets[i] = kzalloc(size, GFP_NOFS);
-                if (b == NULL)
-                        goto badmem;
-                ceph_decode_need(p, end, 4*sizeof(u32), bad);
-                b->id = ceph_decode_32(p);
-                b->type = ceph_decode_16(p);
-                b->alg = ceph_decode_8(p);
-                b->hash = ceph_decode_8(p);
-                b->weight = ceph_decode_32(p);
-                b->size = ceph_decode_32(p);
-                dout("crush_decode bucket size %d off %x %p to %p\n",
-                     b->size, (int)(*p-start), *p, end);
-                b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
-                if (b->items == NULL)
-                        goto badmem;
-                b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
-                if (b->perm == NULL)
-                        goto badmem;
-                b->perm_n = 0;
-                ceph_decode_need(p, end, b->size*sizeof(u32), bad);
-                for (j = 0; j < b->size; j++)
-                        b->items[j] = ceph_decode_32(p);
-                switch (b->alg) {
-                case CRUSH_BUCKET_UNIFORM:
-                        err = crush_decode_uniform_bucket(p, end,
-                                  (struct crush_bucket_uniform *)b);
-                        if (err < 0)
-                                goto bad;
-                        break;
-                case CRUSH_BUCKET_LIST:
-                        err = crush_decode_list_bucket(p, end,
-                               (struct crush_bucket_list *)b);
-                        if (err < 0)
-                                goto bad;
-                        break;
-                case CRUSH_BUCKET_TREE:
-                        err = crush_decode_tree_bucket(p, end,
-                                (struct crush_bucket_tree *)b);
-                        if (err < 0)
-                                goto bad;
-                        break;
-                case CRUSH_BUCKET_STRAW:
-                        err = crush_decode_straw_bucket(p, end,
-                                (struct crush_bucket_straw *)b);
-                        if (err < 0)
-                                goto bad;
-                        break;
-                }
-        }
-        /* rules */
-        dout("rule vec is %p\n", c->rules);
-        for (i = 0; i < c->max_rules; i++) {
-                u32 yes;
-                struct crush_rule *r;
-                ceph_decode_32_safe(p, end, yes, bad);
-                if (!yes) {
-                        dout("crush_decode NO rule %d off %x %p to %p\n",
-                             i, (int)(*p-start), *p, end);
-                        c->rules[i] = NULL;
-                        continue;
-                }
-                dout("crush_decode rule %d off %x %p to %p\n",
-                     i, (int)(*p-start), *p, end);
-                /* len */
-                ceph_decode_32_safe(p, end, yes, bad);
-#if BITS_PER_LONG == 32
-                err = -EINVAL;
-                if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
-                        goto bad;
-#endif
-                r = c->rules[i] = kmalloc(sizeof(*r) +
-                                          yes*sizeof(struct crush_rule_step),
-                                          GFP_NOFS);
-                if (r == NULL)
-                        goto badmem;
-                dout(" rule %d is at %p\n", i, r);
-                r->len = yes;
-                ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
-                ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
-                for (j = 0; j < r->len; j++) {
-                        r->steps[j].op = ceph_decode_32(p);
-                        r->steps[j].arg1 = ceph_decode_32(p);
-                        r->steps[j].arg2 = ceph_decode_32(p);
-                }
-        }
-        /* ignore trailing name maps. */
-        dout("crush_decode success\n");
-        return c;
-badmem:
-        err = -ENOMEM;
-bad:
-        dout("crush_decode fail %d\n", err);
-        crush_destroy(c);
-        return ERR_PTR(err);
-}
-/*
- * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
- * to a set of osds)
- */
-static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
-{
-        u64 a = *(u64 *)&l;
-        u64 b = *(u64 *)&r;
-        if (a < b)
-                return -1;
-        if (a > b)
-                return 1;
-        return 0;
-}
-static int __insert_pg_mapping(struct ceph_pg_mapping *new,
-                               struct rb_root *root)
-{
-        struct rb_node **p = &root->rb_node;
-        struct rb_node *parent = NULL;
-        struct ceph_pg_mapping *pg = NULL;
-        int c;
-        while (*p) {
-                parent = *p;
-                pg = rb_entry(parent, struct ceph_pg_mapping, node);
-                c = pgid_cmp(new->pgid, pg->pgid);
-                if (c < 0)
-                        p = &(*p)->rb_left;
-                else if (c > 0)
-                        p = &(*p)->rb_right;
-                else
-                        return -EEXIST;
-        }
-        rb_link_node(&new->node, parent, p);
-        rb_insert_color(&new->node, root);
-        return 0;
-}
-static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
-                                                   struct ceph_pg pgid)
-{
-        struct rb_node *n = root->rb_node;
-        struct ceph_pg_mapping *pg;
-        int c;
-        while (n) {
-                pg = rb_entry(n, struct ceph_pg_mapping, node);
-                c = pgid_cmp(pgid, pg->pgid);
-                if (c < 0)
-                        n = n->rb_left;
-                else if (c > 0)
-                        n = n->rb_right;
-                else
-                        return pg;
-        }
-        return NULL;
-}
-/*
- * rbtree of pg pool info
- */
-static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
-{
-        struct rb_node **p = &root->rb_node;
-        struct rb_node *parent = NULL;
-        struct ceph_pg_pool_info *pi = NULL;
-        while (*p) {
-                parent = *p;
-                pi = rb_entry(parent, struct ceph_pg_pool_info, node);
-                if (new->id < pi->id)
-                        p = &(*p)->rb_left;
-                else if (new->id > pi->id)
-                        p = &(*p)->rb_right;
-                else
-                        return -EEXIST;
-        }
-        rb_link_node(&new->node, parent, p);
-        rb_insert_color(&new->node, root);
-        return 0;
-}
-static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
-{
-        struct ceph_pg_pool_info *pi;
-        struct rb_node *n = root->rb_node;
-        while (n) {
-                pi = rb_entry(n, struct ceph_pg_pool_info, node);
-                if (id < pi->id)
-                        n = n->rb_left;
-                else if (id > pi->id)
-                        n = n->rb_right;
-                else
-                        return pi;
-        }
-        return NULL;
-}
-static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
-{
-        rb_erase(&pi->node, root);
-        kfree(pi->name);
-        kfree(pi);
-}
-static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
-{
-        unsigned n, m;
-        ceph_decode_copy(p, &pi->v, sizeof(pi->v));
-        calc_pg_masks(pi);
-        /* num_snaps * snap_info_t */
-        n = le32_to_cpu(pi->v.num_snaps);
-        while (n--) {
-                ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) +
-                                 sizeof(struct ceph_timespec), bad);
-                *p += sizeof(u64) +       /* key */
-                        1 + sizeof(u64) + /* u8, snapid */
-                        sizeof(struct ceph_timespec);
-                m = ceph_decode_32(p);    /* snap name */
-                *p += m;
-        }
-        *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
-        return 0;
-bad:
-        return -EINVAL;
-}
-static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
-{
-        struct ceph_pg_pool_info *pi;
-        u32 num, len, pool;
-        ceph_decode_32_safe(p, end, num, bad);
-        dout(" %d pool names\n", num);
-        while (num--) {
-                ceph_decode_32_safe(p, end, pool, bad);
-                ceph_decode_32_safe(p, end, len, bad);
-                dout("  pool %d len %d\n", pool, len);
-                pi = __lookup_pg_pool(&map->pg_pools, pool);
-                if (pi) {
-                        kfree(pi->name);
-                        pi->name = kmalloc(len + 1, GFP_NOFS);
-                        if (pi->name) {
-                                memcpy(pi->name, *p, len);
-                                pi->name[len] = '\0';
-                                dout("  name is %s\n", pi->name);
-                        }
-                }
-                *p += len;
-        }
-        return 0;
-bad:
-        return -EINVAL;
-}
-/*
- * osd map
- */
-void ceph_osdmap_destroy(struct ceph_osdmap *map)
-{
-        dout("osdmap_destroy %p\n", map);
-        if (map->crush)
-                crush_destroy(map->crush);
-        while (!RB_EMPTY_ROOT(&map->pg_temp)) {
-                struct ceph_pg_mapping *pg =
-                        rb_entry(rb_first(&map->pg_temp),
-                                 struct ceph_pg_mapping, node);
-                rb_erase(&pg->node, &map->pg_temp);
-                kfree(pg);
-        }
-        while (!RB_EMPTY_ROOT(&map->pg_pools)) {
-                struct ceph_pg_pool_info *pi =
-                        rb_entry(rb_first(&map->pg_pools),
-                                 struct ceph_pg_pool_info, node);
-                __remove_pg_pool(&map->pg_pools, pi);
-        }
-        kfree(map->osd_state);
-        kfree(map->osd_weight);
-        kfree(map->osd_addr);
-        kfree(map);
-}
-/*
- * adjust max osd value.  reallocate arrays.
- */
-static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
-{
-        u8 *state;
-        struct ceph_entity_addr *addr;
-        u32 *weight;
-        state = kcalloc(max, sizeof(*state), GFP_NOFS);
-        addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
-        weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
-        if (state == NULL || addr == NULL || weight == NULL) {
-                kfree(state);
-                kfree(addr);
-                kfree(weight);
-                return -ENOMEM;
-        }
-        /* copy old? */
-        if (map->osd_state) {
-                memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
-                memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
-                memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
-                kfree(map->osd_state);
-                kfree(map->osd_addr);
-                kfree(map->osd_weight);
-        }
-        map->osd_state = state;
-        map->osd_weight = weight;
-        map->osd_addr = addr;
-        map->max_osd = max;
-        return 0;
-}
-/*
- * decode a full map.
- */
-struct ceph_osdmap *osdmap_decode(void **p, void *end)
-{
-        struct ceph_osdmap *map;
-        u16 version;
-        u32 len, max, i;
-        u8 ev;
-        int err = -EINVAL;
-        void *start = *p;
-        struct ceph_pg_pool_info *pi;
-        dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
-        map = kzalloc(sizeof(*map), GFP_NOFS);
-        if (map == NULL)
-                return ERR_PTR(-ENOMEM);
-        map->pg_temp = RB_ROOT;
-        ceph_decode_16_safe(p, end, version, bad);
-        if (version > CEPH_OSDMAP_VERSION) {
-                pr_warning("got unknown v %d > %d of osdmap\n", version,
-                           CEPH_OSDMAP_VERSION);
-                goto bad;
-        }
-        ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
-        ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
-        map->epoch = ceph_decode_32(p);
-        ceph_decode_copy(p, &map->created, sizeof(map->created));
-        ceph_decode_copy(p, &map->modified, sizeof(map->modified));
-        ceph_decode_32_safe(p, end, max, bad);
-        while (max--) {
-                ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
-                pi = kzalloc(sizeof(*pi), GFP_NOFS);
-                if (!pi)
-                        goto bad;
-                pi->id = ceph_decode_32(p);
-                ev = ceph_decode_8(p); /* encoding version */
-                if (ev > CEPH_PG_POOL_VERSION) {
-                        pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
-                                   ev, CEPH_PG_POOL_VERSION);
-                        kfree(pi);
-                        goto bad;
-                }
-                err = __decode_pool(p, end, pi);
-                if (err < 0)
-                        goto bad;
-                __insert_pg_pool(&map->pg_pools, pi);
-        }
-        if (version >= 5 && __decode_pool_names(p, end, map) < 0)
-                goto bad;
-        ceph_decode_32_safe(p, end, map->pool_max, bad);
-        ceph_decode_32_safe(p, end, map->flags, bad);
-        max = ceph_decode_32(p);
-        /* (re)alloc osd arrays */
-        err = osdmap_set_max_osd(map, max);
-        if (err < 0)
-                goto bad;
-        dout("osdmap_decode max_osd = %d\n", map->max_osd);
-        /* osds */
-        err = -EINVAL;
-        ceph_decode_need(p, end, 3*sizeof(u32) +
-                         map->max_osd*(1 + sizeof(*map->osd_weight) +
-                                       sizeof(*map->osd_addr)), bad);
-        *p += 4; /* skip length field (should match max) */
-        ceph_decode_copy(p, map->osd_state, map->max_osd);
-        *p += 4; /* skip length field (should match max) */
-        for (i = 0; i < map->max_osd; i++)
-                map->osd_weight[i] = ceph_decode_32(p);
-        *p += 4; /* skip length field (should match max) */
-        ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
-        for (i = 0; i < map->max_osd; i++)
-                ceph_decode_addr(&map->osd_addr[i]);
-        /* pg_temp */
-        ceph_decode_32_safe(p, end, len, bad);
-        for (i = 0; i < len; i++) {
-                int n, j;
-                struct ceph_pg pgid;
-                struct ceph_pg_mapping *pg;
-                ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
-                ceph_decode_copy(p, &pgid, sizeof(pgid));
-                n = ceph_decode_32(p);
-                ceph_decode_need(p, end, n * sizeof(u32), bad);
-                err = -ENOMEM;
-                pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
-                if (!pg)
-                        goto bad;
-                pg->pgid = pgid;
-                pg->len = n;
-                for (j = 0; j < n; j++)
-                        pg->osds[j] = ceph_decode_32(p);
-                err = __insert_pg_mapping(pg, &map->pg_temp);
-                if (err)
-                        goto bad;
-                dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
-        }
-        /* crush */
-        ceph_decode_32_safe(p, end, len, bad);
-        dout("osdmap_decode crush len %d from off 0x%x\n", len,
-             (int)(*p - start));
-        ceph_decode_need(p, end, len, bad);
-        map->crush = crush_decode(*p, end);
-        *p += len;
-        if (IS_ERR(map->crush)) {
-                err = PTR_ERR(map->crush);
-                map->crush = NULL;
-                goto bad;
-        }
-        /* ignore the rest of the map */
-        *p = end;
-        dout("osdmap_decode done %p %p\n", *p, end);
-        return map;
-bad:
-        dout("osdmap_decode fail\n");
-        ceph_osdmap_destroy(map);
-        return ERR_PTR(err);
-}
-/*
- * decode and apply an incremental map update.
- */
-struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
-                                             struct ceph_osdmap *map,
-                                             struct ceph_messenger *msgr)
-{
-        struct crush_map *newcrush = NULL;
-        struct ceph_fsid fsid;
-        u32 epoch = 0;
-        struct ceph_timespec modified;
-        u32 len, pool;
-        __s32 new_pool_max, new_flags, max;
-        void *start = *p;
-        int err = -EINVAL;
-        u16 version;
-        struct rb_node *rbp;
-        ceph_decode_16_safe(p, end, version, bad);
-        if (version > CEPH_OSDMAP_INC_VERSION) {
-                pr_warning("got unknown v %d > %d of inc osdmap\n", version,
-                           CEPH_OSDMAP_INC_VERSION);
-                goto bad;
-        }
-        ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
-                         bad);
-        ceph_decode_copy(p, &fsid, sizeof(fsid));
-        epoch = ceph_decode_32(p);
-        BUG_ON(epoch != map->epoch+1);
-        ceph_decode_copy(p, &modified, sizeof(modified));
-        new_pool_max = ceph_decode_32(p);
-        new_flags = ceph_decode_32(p);
-        /* full map? */
-        ceph_decode_32_safe(p, end, len, bad);
-        if (len > 0) {
-                dout("apply_incremental full map len %d, %p to %p\n",
-                     len, *p, end);
-                return osdmap_decode(p, min(*p+len, end));
-        }
-        /* new crush? */
-        ceph_decode_32_safe(p, end, len, bad);
-        if (len > 0) {
-                dout("apply_incremental new crush map len %d, %p to %p\n",
-                     len, *p, end);
-                newcrush = crush_decode(*p, min(*p+len, end));
-                if (IS_ERR(newcrush))
-                        return ERR_CAST(newcrush);
-                *p += len;
-        }
-        /* new flags? */
-        if (new_flags >= 0)
-                map->flags = new_flags;
-        if (new_pool_max >= 0)
-                map->pool_max = new_pool_max;
-        ceph_decode_need(p, end, 5*sizeof(u32), bad);
-        /* new max? */
-        max = ceph_decode_32(p);
-        if (max >= 0) {
-                err = osdmap_set_max_osd(map, max);
-                if (err < 0)
-                        goto bad;
-        }
-        map->epoch++;
-        map->modified = map->modified;
-        if (newcrush) {
-                if (map->crush)
-                        crush_destroy(map->crush);
-                map->crush = newcrush;
-                newcrush = NULL;
-        }
-        /* new_pool */
-        ceph_decode_32_safe(p, end, len, bad);
-        while (len--) {
-                __u8 ev;
-                struct ceph_pg_pool_info *pi;
-                ceph_decode_32_safe(p, end, pool, bad);
-                ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
-                ev = ceph_decode_8(p);  /* encoding version */
-                if (ev > CEPH_PG_POOL_VERSION) {
-                        pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
-                                   ev, CEPH_PG_POOL_VERSION);
-                        goto bad;
-                }
-                pi = __lookup_pg_pool(&map->pg_pools, pool);
-                if (!pi) {
-                        pi = kzalloc(sizeof(*pi), GFP_NOFS);
-                        if (!pi) {
-                                err = -ENOMEM;
-                                goto bad;
-                        }
-                        pi->id = pool;
-                        __insert_pg_pool(&map->pg_pools, pi);
-                }
-                err = __decode_pool(p, end, pi);
-                if (err < 0)
-                        goto bad;
-        }
-        if (version >= 5 && __decode_pool_names(p, end, map) < 0)
-                goto bad;
-        /* old_pool */
-        ceph_decode_32_safe(p, end, len, bad);
-        while (len--) {
-                struct ceph_pg_pool_info *pi;
-                ceph_decode_32_safe(p, end, pool, bad);
-                pi = __lookup_pg_pool(&map->pg_pools, pool);
-                if (pi)
-                        __remove_pg_pool(&map->pg_pools, pi);
-        }
-        /* new_up */
-        err = -EINVAL;
-        ceph_decode_32_safe(p, end, len, bad);
-        while (len--) {
-                u32 osd;
-                struct ceph_entity_addr addr;
-                ceph_decode_32_safe(p, end, osd, bad);
-                ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
-                ceph_decode_addr(&addr);
-                pr_info("osd%d up\n", osd);
-                BUG_ON(osd >= map->max_osd);
-                map->osd_state[osd] |= CEPH_OSD_UP;
-                map->osd_addr[osd] = addr;
-        }
-        /* new_down */
-        ceph_decode_32_safe(p, end, len, bad);
-        while (len--) {
-                u32 osd;
-                ceph_decode_32_safe(p, end, osd, bad);
-                (*p)++;  /* clean flag */
-                pr_info("osd%d down\n", osd);
-                if (osd < map->max_osd)
-                        map->osd_state[osd] &= ~CEPH_OSD_UP;
-        }
-        /* new_weight */
-        ceph_decode_32_safe(p, end, len, bad);
-        while (len--) {
-                u32 osd, off;
-                ceph_decode_need(p, end, sizeof(u32)*2, bad);
-                osd = ceph_decode_32(p);
-                off = ceph_decode_32(p);
-                pr_info("osd%d weight 0x%x %s\n", osd, off,
-                     off == CEPH_OSD_IN ? "(in)" :
-                     (off == CEPH_OSD_OUT ? "(out)" : ""));
-                if (osd < map->max_osd)
-                        map->osd_weight[osd] = off;
-        }
-        /* new_pg_temp */
-        rbp = rb_first(&map->pg_temp);
-        ceph_decode_32_safe(p, end, len, bad);
-        while (len--) {
-                struct ceph_pg_mapping *pg;
-                int j;
-                struct ceph_pg pgid;
-                u32 pglen;
-                ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
-                ceph_decode_copy(p, &pgid, sizeof(pgid));
-                pglen = ceph_decode_32(p);
-                /* remove any? */
-                while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
-                                                node)->pgid, pgid) <= 0) {
-                        struct ceph_pg_mapping *cur =
-                                rb_entry(rbp, struct ceph_pg_mapping, node);
-                        rbp = rb_next(rbp);
-                        dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
-                        rb_erase(&cur->node, &map->pg_temp);
-                        kfree(cur);
-                }
-                if (pglen) {
-                        /* insert */
-                        ceph_decode_need(p, end, pglen*sizeof(u32), bad);
-                        pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
-                        if (!pg) {
-                                err = -ENOMEM;
-                                goto bad;
-                        }
-                        pg->pgid = pgid;
-                        pg->len = pglen;
-                        for (j = 0; j < pglen; j++)
-                                pg->osds[j] = ceph_decode_32(p);
-                        err = __insert_pg_mapping(pg, &map->pg_temp);
-                        if (err) {
-                                kfree(pg);
-                                goto bad;
-                        }
-                        dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
-                             pglen);
-                }
-        }
-        while (rbp) {
-                struct ceph_pg_mapping *cur =
-                        rb_entry(rbp, struct ceph_pg_mapping, node);
-                rbp = rb_next(rbp);
-                dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
-                rb_erase(&cur->node, &map->pg_temp);
-                kfree(cur);
-        }
-        /* ignore the rest */
-        *p = end;
-        return map;
-bad:
-        pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
-               epoch, (int)(*p - start), *p, start, end);
-        print_hex_dump(KERN_DEBUG, "osdmap: ",
-                       DUMP_PREFIX_OFFSET, 16, 1,
-                       start, end - start, true);
-        if (newcrush)
-                crush_destroy(newcrush);
-        return ERR_PTR(err);
-}
-/*
- * calculate file layout from given offset, length.
- * fill in correct oid, logical length, and object extent
- * offset, length.
- *
- * for now, we write only a single su, until we can
- * pass a stride back to the caller.
- */
-void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
-                                   u64 off, u64 *plen,
-                                   u64 *ono,
-                                   u64 *oxoff, u64 *oxlen)
-{
-        u32 osize = le32_to_cpu(layout->fl_object_size);
-        u32 su = le32_to_cpu(layout->fl_stripe_unit);
-        u32 sc = le32_to_cpu(layout->fl_stripe_count);
-        u32 bl, stripeno, stripepos, objsetno;
-        u32 su_per_object;
-        u64 t, su_offset;
-        dout("mapping %llu~%llu  osize %u fl_su %u\n", off, *plen,
-             osize, su);
-        su_per_object = osize / su;
-        dout("osize %u / su %u = su_per_object %u\n", osize, su,
-             su_per_object);
-        BUG_ON((su & ~PAGE_MASK) != 0);
-        /* bl = *off / su; */
-        t = off;
-        do_div(t, su);
-        bl = t;
-        dout("off %llu / su %u = bl %u\n", off, su, bl);
-        stripeno = bl / sc;
-        stripepos = bl % sc;
-        objsetno = stripeno / su_per_object;
-        *ono = objsetno * sc + stripepos;
-        dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
-        /* *oxoff = *off % layout->fl_stripe_unit;  # offset in su */
-        t = off;
-        su_offset = do_div(t, su);
-        *oxoff = su_offset + (stripeno % su_per_object) * su;
-        /*
-         * Calculate the length of the extent being written to the selected
-         * object. This is the minimum of the full length requested (plen) or
-         * the remainder of the current stripe being written to.
-         */
-        *oxlen = min_t(u64, *plen, su - su_offset);
-        *plen = *oxlen;
-        dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
-}
-/*
- * calculate an object layout (i.e. pgid) from an oid,
- * file_layout, and osdmap
- */
-int ceph_calc_object_layout(struct ceph_object_layout *ol,
-                            const char *oid,
-                            struct ceph_file_layout *fl,
-                            struct ceph_osdmap *osdmap)
-{
-        unsigned num, num_mask;
-        struct ceph_pg pgid;
-        s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
-        int poolid = le32_to_cpu(fl->fl_pg_pool);
-        struct ceph_pg_pool_info *pool;
-        unsigned ps;
-        BUG_ON(!osdmap);
-        pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
-        if (!pool)
-                return -EIO;
-        ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
-        if (preferred >= 0) {
-                ps += preferred;
-                num = le32_to_cpu(pool->v.lpg_num);
-                num_mask = pool->lpg_num_mask;
-        } else {
-                num = le32_to_cpu(pool->v.pg_num);
-                num_mask = pool->pg_num_mask;
-        }
-        pgid.ps = cpu_to_le16(ps);
-        pgid.preferred = cpu_to_le16(preferred);
-        pgid.pool = fl->fl_pg_pool;
-        if (preferred >= 0)
-                dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
-                     (int)preferred);
-        else
-                dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
-        ol->ol_pgid = pgid;
-        ol->ol_stripe_unit = fl->fl_object_stripe_unit;
-        return 0;
-}
-/*
- * Calculate raw osd vector for the given pgid.  Return pointer to osd
- * array, or NULL on failure.
- */
-static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
-                        int *osds, int *num)
-{
-        struct ceph_pg_mapping *pg;
-        struct ceph_pg_pool_info *pool;
-        int ruleno;
-        unsigned poolid, ps, pps;
-        int preferred;
-        /* pg_temp? */
-        pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
-        if (pg) {
-                *num = pg->len;
-                return pg->osds;
-        }
-        /* crush */
-        poolid = le32_to_cpu(pgid.pool);
-        ps = le16_to_cpu(pgid.ps);
-        preferred = (s16)le16_to_cpu(pgid.preferred);
-        /* don't forcefeed bad device ids to crush */
-        if (preferred >= osdmap->max_osd ||
-            preferred >= osdmap->crush->max_devices)
-                preferred = -1;
-        pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
-        if (!pool)
-                return NULL;
-        ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
-                                 pool->v.type, pool->v.size);
-        if (ruleno < 0) {
-                pr_err("no crush rule pool %d ruleset %d type %d size %d\n",
-                       poolid, pool->v.crush_ruleset, pool->v.type,
-                       pool->v.size);
-                return NULL;
-        }
-        if (preferred >= 0)
-                pps = ceph_stable_mod(ps,
-                                      le32_to_cpu(pool->v.lpgp_num),
-                                      pool->lpgp_num_mask);
-        else
-                pps = ceph_stable_mod(ps,
-                                      le32_to_cpu(pool->v.pgp_num),
-                                      pool->pgp_num_mask);
-        pps += poolid;
-        *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
-                             min_t(int, pool->v.size, *num),
-                             preferred, osdmap->osd_weight);
-        return osds;
-}
-/*
- * Return acting set for given pgid.
- */
-int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
-                        int *acting)
-{
-        int rawosds[CEPH_PG_MAX_SIZE], *osds;
-        int i, o, num = CEPH_PG_MAX_SIZE;
-        osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
-        if (!osds)
-                return -1;
-        /* primary is first up osd */
-        o = 0;
-        for (i = 0; i < num; i++)
-                if (ceph_osd_is_up(osdmap, osds[i]))
-                        acting[o++] = osds[i];
-        return o;
-}
-/*
- * Return primary osd for given pgid, or -1 if none.
- */
-int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
-{
-        int rawosds[CEPH_PG_MAX_SIZE], *osds;
-        int i, num = CEPH_PG_MAX_SIZE;
-        osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
-        if (!osds)
-                return -1;
-        /* primary is first up osd */
-        for (i = 0; i < num; i++)
-                if (ceph_osd_is_up(osdmap, osds[i]))
-                        return osds[i];
-        return -1;
-}
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
deleted file mode 100644
index 970b547e510d..000000000000
--- a/fs/ceph/osdmap.h
+++ /dev/null
@@ -1,128 +0,0 @@
-#ifndef _FS_CEPH_OSDMAP_H
-#define _FS_CEPH_OSDMAP_H
-#include <linux/rbtree.h>
-#include "types.h"
-#include "ceph_fs.h"
-#include "crush/crush.h"
-/*
- * The osd map describes the current membership of the osd cluster and
- * specifies the mapping of objects to placement groups and placement
- * groups to (sets of) osds.  That is, it completely specifies the
- * (desired) distribution of all data objects in the system at some
- * point in time.
- *
- * Each map version is identified by an epoch, which increases monotonically.
- *
- * The map can be updated either via an incremental map (diff) describing
- * the change between two successive epochs, or as a fully encoded map.
- */
-struct ceph_pg_pool_info {
-        struct rb_node node;
-        int id;
-        struct ceph_pg_pool v;
-        int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
-        char *name;
-};
-struct ceph_pg_mapping {
-        struct rb_node node;
-        struct ceph_pg pgid;
-        int len;
-        int osds[];
-};
-struct ceph_osdmap {
-        struct ceph_fsid fsid;
-        u32 epoch;
-        u32 mkfs_epoch;
-        struct ceph_timespec created, modified;
-        u32 flags;         /* CEPH_OSDMAP_* */
-        u32 max_osd;       /* size of osd_state, _offload, _addr arrays */
-        u8 *osd_state;     /* CEPH_OSD_* */
-        u32 *osd_weight;   /* 0 = failed, 0x10000 = 100% normal */
-        struct ceph_entity_addr *osd_addr;
-        struct rb_root pg_temp;
-        struct rb_root pg_pools;
-        u32 pool_max;
-        /* the CRUSH map specifies the mapping of placement groups to
-         * the list of osds that store+replicate them. */
-        struct crush_map *crush;
-};
-/*
- * file layout helpers
- */
-#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
-#define ceph_file_layout_stripe_count(l) \
-        ((__s32)le32_to_cpu((l).fl_stripe_count))
-#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
-#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
-#define ceph_file_layout_object_su(l) \
-        ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
-#define ceph_file_layout_pg_preferred(l) \
-        ((__s32)le32_to_cpu((l).fl_pg_preferred))
-#define ceph_file_layout_pg_pool(l) \
-        ((__s32)le32_to_cpu((l).fl_pg_pool))
-static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
-{
-        return le32_to_cpu(l->fl_stripe_unit) *
-                le32_to_cpu(l->fl_stripe_count);
-}
-/* "period" == bytes before i start on a new set of objects */
-static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
-{
-        return le32_to_cpu(l->fl_object_size) *
-                le32_to_cpu(l->fl_stripe_count);
-}
-static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
-{
-        return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
-}
-static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
-{
-        return map && (map->flags & flag);
-}
-extern char *ceph_osdmap_state_str(char *str, int len, int state);
-static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
-                                                     int osd)
-{
-        if (osd >= map->max_osd)
-                return NULL;
-        return &map->osd_addr[osd];
-}
-extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
-extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
-                                            struct ceph_osdmap *map,
-                                            struct ceph_messenger *msgr);
-extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
-/* calculate mapping of a file extent to an object */
-extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
-                                          u64 off, u64 *plen,
-                                          u64 *bno, u64 *oxoff, u64 *oxlen);
-/* calculate mapping of object to a placement group */
-extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
-                                   const char *oid,
-                                   struct ceph_file_layout *fl,
-                                   struct ceph_osdmap *osdmap);
-extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
-                               int *acting);
-extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
-                                struct ceph_pg pgid);
-#endif
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
deleted file mode 100644
index 46a368b6dce5..000000000000
--- a/fs/ceph/pagelist.c
+++ /dev/null
@@ -1,63 +0,0 @@
-#include <linux/gfp.h>
-#include <linux/pagemap.h>
-#include <linux/highmem.h>
-#include "pagelist.h"
-static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
-{
-        struct page *page = list_entry(pl->head.prev, struct page,
-                                       lru);
-        kunmap(page);
-}
-int ceph_pagelist_release(struct ceph_pagelist *pl)
-{
-        if (pl->mapped_tail)
-                ceph_pagelist_unmap_tail(pl);
-        while (!list_empty(&pl->head)) {
-                struct page *page = list_first_entry(&pl->head, struct page,
-                                                     lru);
-                list_del(&page->lru);
-                __free_page(page);
-        }
-        return 0;
-}
-static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
-{
-        struct page *page = __page_cache_alloc(GFP_NOFS);
-        if (!page)
-                return -ENOMEM;
-        pl->room += PAGE_SIZE;
-        list_add_tail(&page->lru, &pl->head);
-        if (pl->mapped_tail)
-                ceph_pagelist_unmap_tail(pl);
-        pl->mapped_tail = kmap(page);
-        return 0;
-}
-int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
-{
-        while (pl->room < len) {
-                size_t bit = pl->room;
-                int ret;
-                memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
-                       buf, bit);
-                pl->length += bit;
-                pl->room -= bit;
-                buf += bit;
-                len -= bit;
-                ret = ceph_pagelist_addpage(pl);
-                if (ret)
-                        return ret;
-        }
-        memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
-        pl->length += len;
-        pl->room -= len;
-        return 0;
-}
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h
deleted file mode 100644
index e8a4187e1087..000000000000
--- a/fs/ceph/pagelist.h
+++ /dev/null
@@ -1,54 +0,0 @@
-#ifndef __FS_CEPH_PAGELIST_H
-#define __FS_CEPH_PAGELIST_H
-#include <linux/list.h>
-struct ceph_pagelist {
-        struct list_head head;
-        void *mapped_tail;
-        size_t length;
-        size_t room;
-};
-static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
-{
-        INIT_LIST_HEAD(&pl->head);
-        pl->mapped_tail = NULL;
-        pl->length = 0;
-        pl->room = 0;
-}
-extern int ceph_pagelist_release(struct ceph_pagelist *pl);
-extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
-static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
-{
-        __le64 ev = cpu_to_le64(v);
-        return ceph_pagelist_append(pl, &ev, sizeof(ev));
-}
-static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
-{
-        __le32 ev = cpu_to_le32(v);
-        return ceph_pagelist_append(pl, &ev, sizeof(ev));
-}
-static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
-{
-        __le16 ev = cpu_to_le16(v);
-        return ceph_pagelist_append(pl, &ev, sizeof(ev));
-}
-static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
-{
-        return ceph_pagelist_append(pl, &v, 1);
-}
-static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
-                                              char *s, size_t len)
-{
-        int ret = ceph_pagelist_encode_32(pl, len);
-        if (ret)
-                return ret;
-        if (len)
-                return ceph_pagelist_append(pl, s, len);
-        return 0;
-}
-#endif
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
deleted file mode 100644
index 6d5247f2e81b..000000000000
--- a/fs/ceph/rados.h
+++ /dev/null
@@ -1,405 +0,0 @@
-#ifndef CEPH_RADOS_H
-#define CEPH_RADOS_H
-/*
- * Data types for the Ceph distributed object storage layer RADOS
- * (Reliable Autonomic Distributed Object Store).
- */
-#include "msgr.h"
-/*
- * osdmap encoding versions
- */
-#define CEPH_OSDMAP_INC_VERSION     5
-#define CEPH_OSDMAP_INC_VERSION_EXT 5
-#define CEPH_OSDMAP_VERSION         5
-#define CEPH_OSDMAP_VERSION_EXT     5
-/*
- * fs id
- */
-struct ceph_fsid {
-        unsigned char fsid[16];
-};
-static inline int ceph_fsid_compare(const struct ceph_fsid *a,
-                                    const struct ceph_fsid *b)
-{
-        return memcmp(a, b, sizeof(*a));
-}
-/*
- * ino, object, etc.
- */
-typedef __le64 ceph_snapid_t;
-#define CEPH_SNAPDIR ((__u64)(-1))  /* reserved for hidden .snap dir */
-#define CEPH_NOSNAP  ((__u64)(-2))  /* "head", "live" revision */
-#define CEPH_MAXSNAP ((__u64)(-3))  /* largest valid snapid */
-struct ceph_timespec {
-        __le32 tv_sec;
-        __le32 tv_nsec;
-} __attribute__ ((packed));
-/*
- * object layout - how objects are mapped into PGs
- */
-#define CEPH_OBJECT_LAYOUT_HASH     1
-#define CEPH_OBJECT_LAYOUT_LINEAR   2
-#define CEPH_OBJECT_LAYOUT_HASHINO  3
-/*
- * pg layout -- how PGs are mapped onto (sets of) OSDs
- */
-#define CEPH_PG_LAYOUT_CRUSH  0
-#define CEPH_PG_LAYOUT_HASH   1
-#define CEPH_PG_LAYOUT_LINEAR 2
-#define CEPH_PG_LAYOUT_HYBRID 3
-#define CEPH_PG_MAX_SIZE      16  /* max # osds in a single pg */
-/*
- * placement group.
- * we encode this into one __le64.
- */
-struct ceph_pg {
-        __le16 preferred; /* preferred primary osd */
-        __le16 ps;        /* placement seed */
-        __le32 pool;      /* object pool */
-} __attribute__ ((packed));
-/*
- * pg_pool is a set of pgs storing a pool of objects
- *
- *  pg_num -- base number of pseudorandomly placed pgs
- *
- *  pgp_num -- effective number when calculating pg placement.  this
- * is used for pg_num increases.  new pgs result in data being "split"
- * into new pgs.  for this to proceed smoothly, new pgs are intiially
- * colocated with their parents; that is, pgp_num doesn't increase
- * until the new pgs have successfully split.  only _then_ are the new
- * pgs placed independently.
- *
- *  lpg_num -- localized pg count (per device).  replicas are randomly
- * selected.
- *
- *  lpgp_num -- as above.
- */
-#define CEPH_PG_TYPE_REP     1
-#define CEPH_PG_TYPE_RAID4   2
-#define CEPH_PG_POOL_VERSION 2
-struct ceph_pg_pool {
-        __u8 type;                /* CEPH_PG_TYPE_* */
-        __u8 size;                /* number of osds in each pg */
-        __u8 crush_ruleset;       /* crush placement rule */
-        __u8 object_hash;         /* hash mapping object name to ps */
-        __le32 pg_num, pgp_num;   /* number of pg's */
-        __le32 lpg_num, lpgp_num; /* number of localized pg's */
-        __le32 last_change;       /* most recent epoch changed */
-        __le64 snap_seq;          /* seq for per-pool snapshot */
-        __le32 snap_epoch;        /* epoch of last snap */
-        __le32 num_snaps;
-        __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
-        __le64 auid;               /* who owns the pg */
-} __attribute__ ((packed));
-/*
- * stable_mod func is used to control number of placement groups.
- * similar to straight-up modulo, but produces a stable mapping as b
- * increases over time.  b is the number of bins, and bmask is the
- * containing power of 2 minus 1.
- *
- * b <= bmask and bmask=(2**n)-1
- * e.g., b=12 -> bmask=15, b=123 -> bmask=127
- */
-static inline int ceph_stable_mod(int x, int b, int bmask)
-{
-        if ((x & bmask) < b)
-                return x & bmask;
-        else
-                return x & (bmask >> 1);
-}
-/*
- * object layout - how a given object should be stored.
- */
-struct ceph_object_layout {
-        struct ceph_pg ol_pgid;   /* raw pg, with _full_ ps precision. */
-        __le32 ol_stripe_unit;    /* for per-object parity, if any */
-} __attribute__ ((packed));
-/*
- * compound epoch+version, used by storage layer to serialize mutations
- */
-struct ceph_eversion {
-        __le32 epoch;
-        __le64 version;
-} __attribute__ ((packed));
-/*
- * osd map bits
- */
-/* status bits */
-#define CEPH_OSD_EXISTS 1
-#define CEPH_OSD_UP     2
-/* osd weights.  fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
-#define CEPH_OSD_IN  0x10000
-#define CEPH_OSD_OUT 0
-/*
- * osd map flag bits
- */
-#define CEPH_OSDMAP_NEARFULL (1<<0)  /* sync writes (near ENOSPC) */
-#define CEPH_OSDMAP_FULL     (1<<1)  /* no data writes (ENOSPC) */
-#define CEPH_OSDMAP_PAUSERD  (1<<2)  /* pause all reads */
-#define CEPH_OSDMAP_PAUSEWR  (1<<3)  /* pause all writes */
-#define CEPH_OSDMAP_PAUSEREC (1<<4)  /* pause recovery */
-/*
- * osd ops
- */
-#define CEPH_OSD_OP_MODE       0xf000
-#define CEPH_OSD_OP_MODE_RD    0x1000
-#define CEPH_OSD_OP_MODE_WR    0x2000
-#define CEPH_OSD_OP_MODE_RMW   0x3000
-#define CEPH_OSD_OP_MODE_SUB   0x4000
-#define CEPH_OSD_OP_TYPE       0x0f00
-#define CEPH_OSD_OP_TYPE_LOCK  0x0100
-#define CEPH_OSD_OP_TYPE_DATA  0x0200
-#define CEPH_OSD_OP_TYPE_ATTR  0x0300
-#define CEPH_OSD_OP_TYPE_EXEC  0x0400
-#define CEPH_OSD_OP_TYPE_PG    0x0500
-enum {
-        /** data **/
-        /* read */
-        CEPH_OSD_OP_READ      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
-        CEPH_OSD_OP_STAT      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
-        /* fancy read */
-        CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
-        /* write */
-        CEPH_OSD_OP_WRITE     = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
-        CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
-        CEPH_OSD_OP_TRUNCATE  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
-        CEPH_OSD_OP_ZERO      = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
-        CEPH_OSD_OP_DELETE    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
-        /* fancy write */
-        CEPH_OSD_OP_APPEND    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
-        CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
-        CEPH_OSD_OP_SETTRUNC  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
-        CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
-        CEPH_OSD_OP_TMAPUP  = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
-        CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
-        CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
-        CEPH_OSD_OP_CREATE  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
-        CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14,
-        /** attrs **/
-        /* read */
-        CEPH_OSD_OP_GETXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
-        CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
-        CEPH_OSD_OP_CMPXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
-        /* write */
-        CEPH_OSD_OP_SETXATTR  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
-        CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
-        CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
-        CEPH_OSD_OP_RMXATTR   = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
-        /** subop **/
-        CEPH_OSD_OP_PULL           = CEPH_OSD_OP_MODE_SUB | 1,
-        CEPH_OSD_OP_PUSH           = CEPH_OSD_OP_MODE_SUB | 2,
-        CEPH_OSD_OP_BALANCEREADS   = CEPH_OSD_OP_MODE_SUB | 3,
-        CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
-        CEPH_OSD_OP_SCRUB          = CEPH_OSD_OP_MODE_SUB | 5,
-        /** lock **/
-        CEPH_OSD_OP_WRLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
-        CEPH_OSD_OP_WRUNLOCK  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
-        CEPH_OSD_OP_RDLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
-        CEPH_OSD_OP_RDUNLOCK  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
-        CEPH_OSD_OP_UPLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
-        CEPH_OSD_OP_DNLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
-        /** exec **/
-        CEPH_OSD_OP_CALL    = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
-        /** pg **/
-        CEPH_OSD_OP_PGLS      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
-};
-static inline int ceph_osd_op_type_lock(int op)
-{
-        return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
-}
-static inline int ceph_osd_op_type_data(int op)
-{
-        return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
-}
-static inline int ceph_osd_op_type_attr(int op)
-{
-        return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
-}
-static inline int ceph_osd_op_type_exec(int op)
-{
-        return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
-}
-static inline int ceph_osd_op_type_pg(int op)
-{
-        return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
-}
-static inline int ceph_osd_op_mode_subop(int op)
-{
-        return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
-}
-static inline int ceph_osd_op_mode_read(int op)
-{
-        return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
-}
-static inline int ceph_osd_op_mode_modify(int op)
-{
-        return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
-}
-/*
- * note that the following tmap stuff is also defined in the ceph librados.h
- * any modification here needs to be updated there
- */
-#define CEPH_OSD_TMAP_HDR 'h'
-#define CEPH_OSD_TMAP_SET 's'
-#define CEPH_OSD_TMAP_RM  'r'
-extern const char *ceph_osd_op_name(int op);
-/*
- * osd op flags
- *
- * An op may be READ, WRITE, or READ|WRITE.
- */
-enum {
-        CEPH_OSD_FLAG_ACK = 1,          /* want (or is) "ack" ack */
-        CEPH_OSD_FLAG_ONNVRAM = 2,      /* want (or is) "onnvram" ack */
-        CEPH_OSD_FLAG_ONDISK = 4,       /* want (or is) "ondisk" ack */
-        CEPH_OSD_FLAG_RETRY = 8,        /* resend attempt */
-        CEPH_OSD_FLAG_READ = 16,        /* op may read */
-        CEPH_OSD_FLAG_WRITE = 32,       /* op may write */
-        CEPH_OSD_FLAG_ORDERSNAP = 64,   /* EOLDSNAP if snapc is out of order */
-        CEPH_OSD_FLAG_PEERSTAT = 128,   /* msg includes osd_peer_stat */
-        CEPH_OSD_FLAG_BALANCE_READS = 256,
-        CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
-        CEPH_OSD_FLAG_PGOP = 1024,      /* pg op, no object */
-        CEPH_OSD_FLAG_EXEC = 2048,      /* op may exec */
-        CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */
-};
-enum {
-        CEPH_OSD_OP_FLAG_EXCL = 1,      /* EXCL object create */
-};
-#define EOLDSNAPC    ERESTART  /* ORDERSNAP flag set; writer has old snapc*/
-#define EBLACKLISTED ESHUTDOWN /* blacklisted */
-/* xattr comparison */
-enum {
-        CEPH_OSD_CMPXATTR_OP_NOP = 0,
-        CEPH_OSD_CMPXATTR_OP_EQ  = 1,
-        CEPH_OSD_CMPXATTR_OP_NE  = 2,
-        CEPH_OSD_CMPXATTR_OP_GT  = 3,
-        CEPH_OSD_CMPXATTR_OP_GTE = 4,
-        CEPH_OSD_CMPXATTR_OP_LT  = 5,
-        CEPH_OSD_CMPXATTR_OP_LTE = 6
-};
-enum {
-        CEPH_OSD_CMPXATTR_MODE_STRING = 1,
-        CEPH_OSD_CMPXATTR_MODE_U64    = 2
-};
-/*
- * an individual object operation.  each may be accompanied by some data
- * payload
- */
-struct ceph_osd_op {
-        __le16 op;           /* CEPH_OSD_OP_* */
-        __le32 flags;        /* CEPH_OSD_FLAG_* */
-        union {
-                struct {
-                        __le64 offset, length;
-                        __le64 truncate_size;
-                        __le32 truncate_seq;
-                } __attribute__ ((packed)) extent;
-                struct {
-                        __le32 name_len;
-                        __le32 value_len;
-                        __u8 cmp_op;       /* CEPH_OSD_CMPXATTR_OP_* */
-                        __u8 cmp_mode;     /* CEPH_OSD_CMPXATTR_MODE_* */
-                } __attribute__ ((packed)) xattr;
-                struct {
-                        __u8 class_len;
-                        __u8 method_len;
-                        __u8 argc;
-                        __le32 indata_len;
-                } __attribute__ ((packed)) cls;
-                struct {
-                        __le64 cookie, count;
-                } __attribute__ ((packed)) pgls;
-                struct {
-                        __le64 snapid;
-                } __attribute__ ((packed)) snap;
-        };
-        __le32 payload_len;
-} __attribute__ ((packed));
-/*
- * osd request message header.  each request may include multiple
- * ceph_osd_op object operations.
- */
-struct ceph_osd_request_head {
-        __le32 client_inc;                 /* client incarnation */
-        struct ceph_object_layout layout;  /* pgid */
-        __le32 osdmap_epoch;               /* client's osdmap epoch */
-        __le32 flags;
-        struct ceph_timespec mtime;        /* for mutations only */
-        struct ceph_eversion reassert_version; /* if we are replaying op */
-        __le32 object_len;     /* length of object name */
-        __le64 snapid;         /* snapid to read */
-        __le64 snap_seq;       /* writer's snap context */
-        __le32 num_snaps;
-        __le16 num_ops;
-        struct ceph_osd_op ops[];  /* followed by ops[], obj, ticket, snaps */
-} __attribute__ ((packed));
-struct ceph_osd_reply_head {
-        __le32 client_inc;                /* client incarnation */
-        __le32 flags;
-        struct ceph_object_layout layout;
-        __le32 osdmap_epoch;
-        struct ceph_eversion reassert_version; /* for replaying uncommitted */
-        __le32 result;                    /* result code */
-        __le32 object_len;                /* length of object name */
-        __le32 num_ops;
-        struct ceph_osd_op ops[0];  /* ops[], object */
-} __attribute__ ((packed));
-#endif
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 190b6c4a6f2b..39c243acd062 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1,10 +1,12 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 #include <linux/sort.h>
 #include <linux/slab.h>
 #include "super.h"
-#include "decode.h"
+#include "mds_client.h"
+#include <linux/ceph/decode.h>
 /*
 * Snapshots in ceph are driven in large part by cooperation from the
@@ -526,7 +528,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
                            struct ceph_cap_snap *capsnap)
 {
        struct inode *inode = &ci->vfs_inode;
-        struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+        struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
        BUG_ON(capsnap->writing);
        capsnap->size = inode->i_size;
@@ -747,7 +749,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
                      struct ceph_mds_session *session,
                      struct ceph_msg *msg)
 {
-        struct super_block *sb = mdsc->client->sb;
+        struct super_block *sb = mdsc->fsc->sb;
        int mds = session->s_mds;
        u64 split;
        int op;
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/strings.c
index c6179d3a26a2..cd5097d7c804 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/strings.c
@@ -1,71 +1,9 @@
 /*
- * Ceph string constants
+ * Ceph fs string constants
 */
-#include "types.h"
+#include <linux/module.h>
+#include <linux/ceph/types.h>
-const char *ceph_entity_type_name(int type)
-{
-        switch (type) {
-        case CEPH_ENTITY_TYPE_MDS: return "mds";
-        case CEPH_ENTITY_TYPE_OSD: return "osd";
-        case CEPH_ENTITY_TYPE_MON: return "mon";
-        case CEPH_ENTITY_TYPE_CLIENT: return "client";
-        case CEPH_ENTITY_TYPE_AUTH: return "auth";
-        default: return "unknown";
-        }
-}
-const char *ceph_osd_op_name(int op)
-{
-        switch (op) {
-        case CEPH_OSD_OP_READ: return "read";
-        case CEPH_OSD_OP_STAT: return "stat";
-        case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
-        case CEPH_OSD_OP_WRITE: return "write";
-        case CEPH_OSD_OP_DELETE: return "delete";
-        case CEPH_OSD_OP_TRUNCATE: return "truncate";
-        case CEPH_OSD_OP_ZERO: return "zero";
-        case CEPH_OSD_OP_WRITEFULL: return "writefull";
-        case CEPH_OSD_OP_ROLLBACK: return "rollback";
-        case CEPH_OSD_OP_APPEND: return "append";
-        case CEPH_OSD_OP_STARTSYNC: return "startsync";
-        case CEPH_OSD_OP_SETTRUNC: return "settrunc";
-        case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
-        case CEPH_OSD_OP_TMAPUP: return "tmapup";
-        case CEPH_OSD_OP_TMAPGET: return "tmapget";
-        case CEPH_OSD_OP_TMAPPUT: return "tmapput";
-        case CEPH_OSD_OP_GETXATTR: return "getxattr";
-        case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
-        case CEPH_OSD_OP_SETXATTR: return "setxattr";
-        case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
-        case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
-        case CEPH_OSD_OP_RMXATTR: return "rmxattr";
-        case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
-        case CEPH_OSD_OP_PULL: return "pull";
-        case CEPH_OSD_OP_PUSH: return "push";
-        case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
-        case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
-        case CEPH_OSD_OP_SCRUB: return "scrub";
-        case CEPH_OSD_OP_WRLOCK: return "wrlock";
-        case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
-        case CEPH_OSD_OP_RDLOCK: return "rdlock";
-        case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
-        case CEPH_OSD_OP_UPLOCK: return "uplock";
-        case CEPH_OSD_OP_DNLOCK: return "dnlock";
-        case CEPH_OSD_OP_CALL: return "call";
-        case CEPH_OSD_OP_PGLS: return "pgls";
-        }
-        return "???";
-}
 const char *ceph_mds_state_name(int s)
 {
@@ -177,17 +115,3 @@ const char *ceph_snap_op_name(int o)
        }
        return "???";
 }
-const char *ceph_pool_op_name(int op)
-{
-        switch (op) {
-        case POOL_OP_CREATE: return "create";
-        case POOL_OP_DELETE: return "delete";
-        case POOL_OP_AUID_CHANGE: return "auid change";
-        case POOL_OP_CREATE_SNAP: return "create snap";
-        case POOL_OP_DELETE_SNAP: return "delete snap";
-        case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
-        case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
-        }
-        return "???";
-}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9922628532b2..9c5085465a63 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1,5 +1,5 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 #include <linux/backing-dev.h>
 #include <linux/ctype.h>
@@ -15,10 +15,13 @@
 #include <linux/statfs.h>
 #include <linux/string.h>
-#include "decode.h"
 #include "super.h"
-#include "mon_client.h"
+#include "mds_client.h"
-#include "auth.h"
+#include <linux/ceph/decode.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
 /*
 * Ceph superblock operations
@@ -26,36 +29,22 @@
 * Handle the basics of mounting, unmounting.
 */
-/*
- * find filename portion of a path (/foo/bar/baz -> baz)
- */
-const char *ceph_file_part(const char *s, int len)
-{
-        const char *e = s + len;
-        while (e != s && *(e-1) != '/')
-                e--;
-        return e;
-}
 /*
 * super ops
 */
 static void ceph_put_super(struct super_block *s)
 {
-        struct ceph_client *client = ceph_sb_to_client(s);
+        struct ceph_fs_client *fsc = ceph_sb_to_client(s);
        dout("put_super\n");
-        ceph_mdsc_close_sessions(&client->mdsc);
+        ceph_mdsc_close_sessions(fsc->mdsc);
        /*
         * ensure we release the bdi before put_anon_super releases
         * the device name.
         */
-        if (s->s_bdi == &client->backing_dev_info) {
+        if (s->s_bdi == &fsc->backing_dev_info) {
-                bdi_unregister(&client->backing_dev_info);
+                bdi_unregister(&fsc->backing_dev_info);
                s->s_bdi = NULL;
        }
@@ -64,14 +53,14 @@ static void ceph_put_super(struct super_block *s)
 static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-        struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
+        struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode);
-        struct ceph_monmap *monmap = client->monc.monmap;
+        struct ceph_monmap *monmap = fsc->client->monc.monmap;
        struct ceph_statfs st;
        u64 fsid;
        int err;
        dout("statfs\n");
-        err = ceph_monc_do_statfs(&client->monc, &st);
+        err = ceph_monc_do_statfs(&fsc->client->monc, &st);
        if (err < 0)
                return err;
@@ -104,238 +93,28 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 static int ceph_sync_fs(struct super_block *sb, int wait)
 {
-        struct ceph_client *client = ceph_sb_to_client(sb);
+        struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
        if (!wait) {
                dout("sync_fs (non-blocking)\n");
-                ceph_flush_dirty_caps(&client->mdsc);
+                ceph_flush_dirty_caps(fsc->mdsc);
                dout("sync_fs (non-blocking) done\n");
                return 0;
        }
        dout("sync_fs (blocking)\n");
-        ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
+        ceph_osdc_sync(&fsc->client->osdc);
-        ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
+        ceph_mdsc_sync(fsc->mdsc);
        dout("sync_fs (blocking) done\n");
        return 0;
 }
-static int default_congestion_kb(void)
-{
-        int congestion_kb;
-        /*
-         * Copied from NFS
-         *
-         * congestion size, scale with available memory.
-         *
-         *  64MB:    8192k
-         * 128MB:   11585k
-         * 256MB:   16384k
-         * 512MB:   23170k
-         *   1GB:   32768k
-         *   2GB:   46340k
-         *   4GB:   65536k
-         *   8GB:   92681k
-         *  16GB:  131072k
-         *
-         * This allows larger machines to have larger/more transfers.
-         * Limit the default to 256M
-         */
-        congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
-        if (congestion_kb > 256*1024)
-                congestion_kb = 256*1024;
-        return congestion_kb;
-}
-/**
- * ceph_show_options - Show mount options in /proc/mounts
- * @m: seq_file to write to
- * @mnt: mount descriptor
- */
-static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
-{
-        struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
-        struct ceph_mount_args *args = client->mount_args;
-        if (args->flags & CEPH_OPT_FSID)
-                seq_printf(m, ",fsid=%pU", &args->fsid);
-        if (args->flags & CEPH_OPT_NOSHARE)
-                seq_puts(m, ",noshare");
-        if (args->flags & CEPH_OPT_DIRSTAT)
-                seq_puts(m, ",dirstat");
-        if ((args->flags & CEPH_OPT_RBYTES) == 0)
-                seq_puts(m, ",norbytes");
-        if (args->flags & CEPH_OPT_NOCRC)
-                seq_puts(m, ",nocrc");
-        if (args->flags & CEPH_OPT_NOASYNCREADDIR)
-                seq_puts(m, ",noasyncreaddir");
-        if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
-                seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
-        if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
-                seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
-        if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
-                seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
-        if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
-                seq_printf(m, ",osdkeepalivetimeout=%d",
-                         args->osd_keepalive_timeout);
-        if (args->wsize)
-                seq_printf(m, ",wsize=%d", args->wsize);
-        if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
-                seq_printf(m, ",rsize=%d", args->rsize);
-        if (args->congestion_kb != default_congestion_kb())
-                seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
-        if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
-                seq_printf(m, ",caps_wanted_delay_min=%d",
-                         args->caps_wanted_delay_min);
-        if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
-                seq_printf(m, ",caps_wanted_delay_max=%d",
-                           args->caps_wanted_delay_max);
-        if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
-                seq_printf(m, ",cap_release_safety=%d",
-                           args->cap_release_safety);
-        if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
-                seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
-        if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
-                seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
-        if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
-                seq_printf(m, ",snapdirname=%s", args->snapdir_name);
-        if (args->name)
-                seq_printf(m, ",name=%s", args->name);
-        if (args->secret)
-                seq_puts(m, ",secret=<hidden>");
-        return 0;
-}
-/*
- * caches
- */
-struct kmem_cache *ceph_inode_cachep;
-struct kmem_cache *ceph_cap_cachep;
-struct kmem_cache *ceph_dentry_cachep;
-struct kmem_cache *ceph_file_cachep;
-static void ceph_inode_init_once(void *foo)
-{
-        struct ceph_inode_info *ci = foo;
-        inode_init_once(&ci->vfs_inode);
-}
-static int __init init_caches(void)
-{
-        ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
-                                      sizeof(struct ceph_inode_info),
-                                      __alignof__(struct ceph_inode_info),
-                                      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
-                                      ceph_inode_init_once);
-        if (ceph_inode_cachep == NULL)
-                return -ENOMEM;
-        ceph_cap_cachep = KMEM_CACHE(ceph_cap,
-                                     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-        if (ceph_cap_cachep == NULL)
-                goto bad_cap;
-        ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
-                                        SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-        if (ceph_dentry_cachep == NULL)
-                goto bad_dentry;
-        ceph_file_cachep = KMEM_CACHE(ceph_file_info,
-                                      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-        if (ceph_file_cachep == NULL)
-                goto bad_file;
-        return 0;
-bad_file:
-        kmem_cache_destroy(ceph_dentry_cachep);
-bad_dentry:
-        kmem_cache_destroy(ceph_cap_cachep);
-bad_cap:
-        kmem_cache_destroy(ceph_inode_cachep);
-        return -ENOMEM;
-}
-static void destroy_caches(void)
-{
-        kmem_cache_destroy(ceph_inode_cachep);
-        kmem_cache_destroy(ceph_cap_cachep);
-        kmem_cache_destroy(ceph_dentry_cachep);
-        kmem_cache_destroy(ceph_file_cachep);
-}
-/*
- * ceph_umount_begin - initiate forced umount.  Tear down down the
- * mount, skipping steps that may hang while waiting for server(s).
- */
-static void ceph_umount_begin(struct super_block *sb)
-{
-        struct ceph_client *client = ceph_sb_to_client(sb);
-        dout("ceph_umount_begin - starting forced umount\n");
-        if (!client)
-                return;
-        client->mount_state = CEPH_MOUNT_SHUTDOWN;
-        return;
-}
-static const struct super_operations ceph_super_ops = {
-        .alloc_inode    = ceph_alloc_inode,
-        .destroy_inode  = ceph_destroy_inode,
-        .write_inode    = ceph_write_inode,
-        .sync_fs        = ceph_sync_fs,
-        .put_super      = ceph_put_super,
-        .show_options   = ceph_show_options,
-        .statfs         = ceph_statfs,
-        .umount_begin   = ceph_umount_begin,
-};
-const char *ceph_msg_type_name(int type)
-{
-        switch (type) {
-        case CEPH_MSG_SHUTDOWN: return "shutdown";
-        case CEPH_MSG_PING: return "ping";
-        case CEPH_MSG_AUTH: return "auth";
-        case CEPH_MSG_AUTH_REPLY: return "auth_reply";
-        case CEPH_MSG_MON_MAP: return "mon_map";
-        case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
-        case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
-        case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
-        case CEPH_MSG_STATFS: return "statfs";
-        case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
-        case CEPH_MSG_MDS_MAP: return "mds_map";
-        case CEPH_MSG_CLIENT_SESSION: return "client_session";
-        case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
-        case CEPH_MSG_CLIENT_REQUEST: return "client_request";
-        case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
-        case CEPH_MSG_CLIENT_REPLY: return "client_reply";
-        case CEPH_MSG_CLIENT_CAPS: return "client_caps";
-        case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
-        case CEPH_MSG_CLIENT_SNAP: return "client_snap";
-        case CEPH_MSG_CLIENT_LEASE: return "client_lease";
-        case CEPH_MSG_OSD_MAP: return "osd_map";
-        case CEPH_MSG_OSD_OP: return "osd_op";
-        case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
-        default: return "unknown";
-        }
-}
 /*
 * mount options
 */
 enum {
        Opt_wsize,
        Opt_rsize,
-        Opt_osdtimeout,
-        Opt_osdkeepalivetimeout,
-        Opt_mount_timeout,
-        Opt_osd_idle_ttl,
        Opt_caps_wanted_delay_min,
        Opt_caps_wanted_delay_max,
        Opt_cap_release_safety,
@@ -344,29 +123,19 @@ enum {
        Opt_congestion_kb,
        Opt_last_int,
        /* int args above */
-        Opt_fsid,
        Opt_snapdirname,
-        Opt_name,
-        Opt_secret,
        Opt_last_string,
        /* string args above */
-        Opt_ip,
-        Opt_noshare,
        Opt_dirstat,
        Opt_nodirstat,
        Opt_rbytes,
        Opt_norbytes,
-        Opt_nocrc,
        Opt_noasyncreaddir,
 };
-static match_table_t arg_tokens = {
+static match_table_t fsopt_tokens = {
        {Opt_wsize, "wsize=%d"},
        {Opt_rsize, "rsize=%d"},
-        {Opt_osdtimeout, "osdtimeout=%d"},
-        {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
-        {Opt_mount_timeout, "mount_timeout=%d"},
-        {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
        {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
        {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
        {Opt_cap_release_safety, "cap_release_safety=%d"},
@@ -374,403 +143,466 @@ static match_table_t arg_tokens = {
        {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
        {Opt_congestion_kb, "write_congestion_kb=%d"},
        /* int args above */
-        {Opt_fsid, "fsid=%s"},
        {Opt_snapdirname, "snapdirname=%s"},
-        {Opt_name, "name=%s"},
-        {Opt_secret, "secret=%s"},
        /* string args above */
-        {Opt_ip, "ip=%s"},
-        {Opt_noshare, "noshare"},
        {Opt_dirstat, "dirstat"},
        {Opt_nodirstat, "nodirstat"},
        {Opt_rbytes, "rbytes"},
        {Opt_norbytes, "norbytes"},
-        {Opt_nocrc, "nocrc"},
        {Opt_noasyncreaddir, "noasyncreaddir"},
        {-1, NULL}
 };
-static int parse_fsid(const char *str, struct ceph_fsid *fsid)
+static int parse_fsopt_token(char *c, void *private)
 {
-        int i = 0;
+        struct ceph_mount_options *fsopt = private;
-        char tmp[3];
+        substring_t argstr[MAX_OPT_ARGS];
-        int err = -EINVAL;
+        int token, intval, ret;
-        int d;
+        token = match_token((char *)c, fsopt_tokens, argstr);
-        dout("parse_fsid '%s'\n", str);
+        if (token < 0)
-        tmp[2] = 0;
+                return -EINVAL;
-        while (*str && i < 16) {
-                if (ispunct(*str)) {
+        if (token < Opt_last_int) {
-                        str++;
+                ret = match_int(&argstr[0], &intval);
-                        continue;
+                if (ret < 0) {
+                        pr_err("bad mount option arg (not int) "
+                               "at '%s'\n", c);
+                        return ret;
                }
-                if (!isxdigit(str[0]) || !isxdigit(str[1]))
+                dout("got int token %d val %d\n", token, intval);
-                        break;
+        } else if (token > Opt_last_int && token < Opt_last_string) {
-                tmp[0] = str[0];
+                dout("got string token %d val %s\n", token,
-                tmp[1] = str[1];
+                     argstr[0].from);
-                if (sscanf(tmp, "%x", &d) < 1)
+        } else {
-                        break;
+                dout("got token %d\n", token);
-                fsid->fsid[i] = d & 0xff;
-                i++;
-                str += 2;
        }
-        if (i == 16)
+        switch (token) {
-                err = 0;
+        case Opt_snapdirname:
-        dout("parse_fsid ret %d got fsid %pU", err, fsid);
+                kfree(fsopt->snapdir_name);
-        return err;
+                fsopt->snapdir_name = kstrndup(argstr[0].from,
+                                               argstr[0].to-argstr[0].from,
+                                               GFP_KERNEL);
+                if (!fsopt->snapdir_name)
+                        return -ENOMEM;
+                break;
+                /* misc */
+        case Opt_wsize:
+                fsopt->wsize = intval;
+                break;
+        case Opt_rsize:
+                fsopt->rsize = intval;
+                break;
+        case Opt_caps_wanted_delay_min:
+                fsopt->caps_wanted_delay_min = intval;
+                break;
+        case Opt_caps_wanted_delay_max:
+                fsopt->caps_wanted_delay_max = intval;
+                break;
+        case Opt_readdir_max_entries:
+                fsopt->max_readdir = intval;
+                break;
+        case Opt_readdir_max_bytes:
+                fsopt->max_readdir_bytes = intval;
+                break;
+        case Opt_congestion_kb:
+                fsopt->congestion_kb = intval;
+                break;
+        case Opt_dirstat:
+                fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
+                break;
+        case Opt_nodirstat:
+                fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
+                break;
+        case Opt_rbytes:
+                fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
+                break;
+        case Opt_norbytes:
+                fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
+                break;
+        case Opt_noasyncreaddir:
+                fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
+                break;
+        default:
+                BUG_ON(token);
+        }
+        return 0;
 }
-static struct ceph_mount_args *parse_mount_args(int flags, char *options,
+static void destroy_mount_options(struct ceph_mount_options *args)
-                                                const char *dev_name,
-                                                const char **path)
 {
-        struct ceph_mount_args *args;
+        dout("destroy_mount_options %p\n", args);
-        const char *c;
+        kfree(args->snapdir_name);
-        int err = -ENOMEM;
+        kfree(args);
-        substring_t argstr[MAX_OPT_ARGS];
+}
-        args = kzalloc(sizeof(*args), GFP_KERNEL);
+static int strcmp_null(const char *s1, const char *s2)
-        if (!args)
+{
-                return ERR_PTR(-ENOMEM);
+        if (!s1 && !s2)
-        args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr),
+                return 0;
-                                 GFP_KERNEL);
+        if (s1 && !s2)
-        if (!args->mon_addr)
+                return -1;
-                goto out;
+        if (!s1 && s2)
+                return 1;
+        return strcmp(s1, s2);
+}
-        dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name);
+static int compare_mount_options(struct ceph_mount_options *new_fsopt,
+                                 struct ceph_options *new_opt,
-        /* start with defaults */
+                                 struct ceph_fs_client *fsc)
-        args->sb_flags = flags;
+{
-        args->flags = CEPH_OPT_DEFAULT;
+        struct ceph_mount_options *fsopt1 = new_fsopt;
-        args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
+        struct ceph_mount_options *fsopt2 = fsc->mount_options;
-        args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
+        int ofs = offsetof(struct ceph_mount_options, snapdir_name);
-        args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
+        int ret;
-        args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;   /* seconds */
-        args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
-        args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
-        args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
-        args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
-        args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
-        args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
-        args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
-        args->congestion_kb = default_congestion_kb();
-        /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
-        err = -EINVAL;
-        if (!dev_name)
-                goto out;
-        *path = strstr(dev_name, ":/");
-        if (*path == NULL) {
-                pr_err("device name is missing path (no :/ in %s)\n",
-                       dev_name);
-                goto out;
-        }
-        /* get mon ip(s) */
+        ret = memcmp(fsopt1, fsopt2, ofs);
-        err = ceph_parse_ips(dev_name, *path, args->mon_addr,
+        if (ret)
-                             CEPH_MAX_MON, &args->num_mon);
+                return ret;
-        if (err < 0)
-                goto out;
+        ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
+        if (ret)
+                return ret;
+        return ceph_compare_options(new_opt, fsc->client);
+}
+static int parse_mount_options(struct ceph_mount_options **pfsopt,
+                               struct ceph_options **popt,
+                               int flags, char *options,
+                               const char *dev_name,
+                               const char **path)
+{
+        struct ceph_mount_options *fsopt;
+        const char *dev_name_end;
+        int err = -ENOMEM;
+        fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
+        if (!fsopt)
+                return -ENOMEM;
+        dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
+        fsopt->sb_flags = flags;
+        fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
+        fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
+        fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
+        fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
+        fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
+        fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
+        fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
+        fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
+        fsopt->congestion_kb = default_congestion_kb();
+        
+        /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
+        err = -EINVAL;
+        if (!dev_name)
+                goto out;
+        *path = strstr(dev_name, ":/");
+        if (*path == NULL) {
+                pr_err("device name is missing path (no :/ in %s)\n",
+                       dev_name);
+                goto out;
+        }
+        dev_name_end = *path;
+        dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
        /* path on server */
        *path += 2;
        dout("server path '%s'\n", *path);
-        /* parse mount options */
+        err = ceph_parse_options(popt, options, dev_name, dev_name_end,
-        while ((c = strsep(&options, ",")) != NULL) {
+                                 parse_fsopt_token, (void *)fsopt);
-                int token, intval, ret;
+        if (err)
-                if (!*c)
+                goto out;
-                        continue;
-                err = -EINVAL;
+        /* success */
-                token = match_token((char *)c, arg_tokens, argstr);
+        *pfsopt = fsopt;
-                if (token < 0) {
+        return 0;
-                        pr_err("bad mount option at '%s'\n", c);
-                        goto out;
-                }
-                if (token < Opt_last_int) {
-                        ret = match_int(&argstr[0], &intval);
-                        if (ret < 0) {
-                                pr_err("bad mount option arg (not int) "
-                                       "at '%s'\n", c);
-                                continue;
-                        }
-                        dout("got int token %d val %d\n", token, intval);
-                } else if (token > Opt_last_int && token < Opt_last_string) {
-                        dout("got string token %d val %s\n", token,
-                             argstr[0].from);
-                } else {
-                        dout("got token %d\n", token);
-                }
-                switch (token) {
-                case Opt_ip:
-                        err = ceph_parse_ips(argstr[0].from,
-                                             argstr[0].to,
-                                             &args->my_addr,
-                                             1, NULL);
-                        if (err < 0)
-                                goto out;
-                        args->flags |= CEPH_OPT_MYIP;
-                        break;
-                case Opt_fsid:
-                        err = parse_fsid(argstr[0].from, &args->fsid);
-                        if (err == 0)
-                                args->flags |= CEPH_OPT_FSID;
-                        break;
-                case Opt_snapdirname:
-                        kfree(args->snapdir_name);
-                        args->snapdir_name = kstrndup(argstr[0].from,
-                                              argstr[0].to-argstr[0].from,
-                                              GFP_KERNEL);
-                        break;
-                case Opt_name:
-                        args->name = kstrndup(argstr[0].from,
-                                              argstr[0].to-argstr[0].from,
-                                              GFP_KERNEL);
-                        break;
-                case Opt_secret:
-                        args->secret = kstrndup(argstr[0].from,
-                                                argstr[0].to-argstr[0].from,
-                                                GFP_KERNEL);
-                        break;
-                        /* misc */
-                case Opt_wsize:
-                        args->wsize = intval;
-                        break;
-                case Opt_rsize:
-                        args->rsize = intval;
-                        break;
-                case Opt_osdtimeout:
-                        args->osd_timeout = intval;
-                        break;
-                case Opt_osdkeepalivetimeout:
-                        args->osd_keepalive_timeout = intval;
-                        break;
-                case Opt_osd_idle_ttl:
-                        args->osd_idle_ttl = intval;
-                        break;
-                case Opt_mount_timeout:
-                        args->mount_timeout = intval;
-                        break;
-                case Opt_caps_wanted_delay_min:
-                        args->caps_wanted_delay_min = intval;
-                        break;
-                case Opt_caps_wanted_delay_max:
-                        args->caps_wanted_delay_max = intval;
-                        break;
-                case Opt_readdir_max_entries:
-                        args->max_readdir = intval;
-                        break;
-                case Opt_readdir_max_bytes:
-                        args->max_readdir_bytes = intval;
-                        break;
-                case Opt_congestion_kb:
-                        args->congestion_kb = intval;
-                        break;
-                case Opt_noshare:
-                        args->flags |= CEPH_OPT_NOSHARE;
-                        break;
-                case Opt_dirstat:
-                        args->flags |= CEPH_OPT_DIRSTAT;
-                        break;
-                case Opt_nodirstat:
-                        args->flags &= ~CEPH_OPT_DIRSTAT;
-                        break;
-                case Opt_rbytes:
-                        args->flags |= CEPH_OPT_RBYTES;
-                        break;
-                case Opt_norbytes:
-                        args->flags &= ~CEPH_OPT_RBYTES;
-                        break;
-                case Opt_nocrc:
-                        args->flags |= CEPH_OPT_NOCRC;
-                        break;
-                case Opt_noasyncreaddir:
-                        args->flags |= CEPH_OPT_NOASYNCREADDIR;
-                        break;
-                default:
-                        BUG_ON(token);
-                }
-        }
-        return args;
 out:
-        kfree(args->mon_addr);
+        destroy_mount_options(fsopt);
-        kfree(args);
+        return err;
-        return ERR_PTR(err);
 }
-static void destroy_mount_args(struct ceph_mount_args *args)
+/**
+ * ceph_show_options - Show mount options in /proc/mounts
+ * @m: seq_file to write to
+ * @mnt: mount descriptor
+ */
+static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
 {
-        dout("destroy_mount_args %p\n", args);
+        struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb);
-        kfree(args->snapdir_name);
+        struct ceph_mount_options *fsopt = fsc->mount_options;
-        args->snapdir_name = NULL;
+        struct ceph_options *opt = fsc->client->options;
-        kfree(args->name);
-        args->name = NULL;
+        if (opt->flags & CEPH_OPT_FSID)
-        kfree(args->secret);
+                seq_printf(m, ",fsid=%pU", &opt->fsid);
-        args->secret = NULL;
+        if (opt->flags & CEPH_OPT_NOSHARE)
-        kfree(args);
+                seq_puts(m, ",noshare");
+        if (opt->flags & CEPH_OPT_NOCRC)
+                seq_puts(m, ",nocrc");
+        if (opt->name)
+                seq_printf(m, ",name=%s", opt->name);
+        if (opt->secret)
+                seq_puts(m, ",secret=<hidden>");
+        if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
+                seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
+        if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
+                seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
+        if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
+                seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
+        if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
+                seq_printf(m, ",osdkeepalivetimeout=%d",
+                           opt->osd_keepalive_timeout);
+        if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
+                seq_puts(m, ",dirstat");
+        if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0)
+                seq_puts(m, ",norbytes");
+        if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
+                seq_puts(m, ",noasyncreaddir");
+        if (fsopt->wsize)
+                seq_printf(m, ",wsize=%d", fsopt->wsize);
+        if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
+                seq_printf(m, ",rsize=%d", fsopt->rsize);
+        if (fsopt->congestion_kb != default_congestion_kb())
+                seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
+        if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
+                seq_printf(m, ",caps_wanted_delay_min=%d",
+                         fsopt->caps_wanted_delay_min);
+        if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
+                seq_printf(m, ",caps_wanted_delay_max=%d",
+                           fsopt->caps_wanted_delay_max);
+        if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
+                seq_printf(m, ",cap_release_safety=%d",
+                           fsopt->cap_release_safety);
+        if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
+                seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
+        if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
+                seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
+        if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
+                seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
+        return 0;
 }
 /*
- * create a fresh client instance
+ * handle any mon messages the standard library doesn't understand.
+ * return error if we don't either.
 */
-static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
+static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
 {
-        struct ceph_client *client;
+        struct ceph_fs_client *fsc = client->private;
+        int type = le16_to_cpu(msg->hdr.type);
+        switch (type) {
+        case CEPH_MSG_MDS_MAP:
+                ceph_mdsc_handle_map(fsc->mdsc, msg);
+                return 0;
+        default:
+                return -1;
+        }
+}
+/*
+ * create a new fs client
+ */
+struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
+                                        struct ceph_options *opt)
+{
+        struct ceph_fs_client *fsc;
        int err = -ENOMEM;
-        client = kzalloc(sizeof(*client), GFP_KERNEL);
+        fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
-        if (client == NULL)
+        if (!fsc)
                return ERR_PTR(-ENOMEM);
-        mutex_init(&client->mount_mutex);
+        fsc->client = ceph_create_client(opt, fsc);
+        if (IS_ERR(fsc->client)) {
-        init_waitqueue_head(&client->auth_wq);
+                err = PTR_ERR(fsc->client);
+                goto fail;
+        }
+        fsc->client->extra_mon_dispatch = extra_mon_dispatch;
+        fsc->client->supported_features |= CEPH_FEATURE_FLOCK |
+                CEPH_FEATURE_DIRLAYOUTHASH;
+        fsc->client->monc.want_mdsmap = 1;
-        client->sb = NULL;
+        fsc->mount_options = fsopt;
-        client->mount_state = CEPH_MOUNT_MOUNTING;
-        client->mount_args = args;
-        client->msgr = NULL;
+        fsc->sb = NULL;
+        fsc->mount_state = CEPH_MOUNT_MOUNTING;
-        client->auth_err = 0;
+        atomic_long_set(&fsc->writeback_count, 0);
-        atomic_long_set(&client->writeback_count, 0);
-        err = bdi_init(&client->backing_dev_info);
+        err = bdi_init(&fsc->backing_dev_info);
        if (err < 0)
-                goto fail;
+                goto fail_client;
        err = -ENOMEM;
-        client->wb_wq = create_workqueue("ceph-writeback");
+        /*
-        if (client->wb_wq == NULL)
+         * The number of concurrent works can be high but they don't need
+         * to be processed in parallel, limit concurrency.
+         */
+        fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
+        if (fsc->wb_wq == NULL)
                goto fail_bdi;
-        client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
+        fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
-        if (client->pg_inv_wq == NULL)
+        if (fsc->pg_inv_wq == NULL)
                goto fail_wb_wq;
-        client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
+        fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
-        if (client->trunc_wq == NULL)
+        if (fsc->trunc_wq == NULL)
                goto fail_pg_inv_wq;
        /* set up mempools */
        err = -ENOMEM;
-        client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
+        fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
-                              client->mount_args->wsize >> PAGE_CACHE_SHIFT);
+                              fsc->mount_options->wsize >> PAGE_CACHE_SHIFT);
-        if (!client->wb_pagevec_pool)
+        if (!fsc->wb_pagevec_pool)
                goto fail_trunc_wq;
        /* caps */
-        client->min_caps = args->max_readdir;
+        fsc->min_caps = fsopt->max_readdir;
+        return fsc;
-        /* subsystems */
-        err = ceph_monc_init(&client->monc, client);
-        if (err < 0)
-                goto fail_mempool;
-        err = ceph_osdc_init(&client->osdc, client);
-        if (err < 0)
-                goto fail_monc;
-        err = ceph_mdsc_init(&client->mdsc, client);
-        if (err < 0)
-                goto fail_osdc;
-        return client;
-fail_osdc:
-        ceph_osdc_stop(&client->osdc);
-fail_monc:
-        ceph_monc_stop(&client->monc);
-fail_mempool:
-        mempool_destroy(client->wb_pagevec_pool);
 fail_trunc_wq:
-        destroy_workqueue(client->trunc_wq);
+        destroy_workqueue(fsc->trunc_wq);
 fail_pg_inv_wq:
-        destroy_workqueue(client->pg_inv_wq);
+        destroy_workqueue(fsc->pg_inv_wq);
 fail_wb_wq:
-        destroy_workqueue(client->wb_wq);
+        destroy_workqueue(fsc->wb_wq);
 fail_bdi:
-        bdi_destroy(&client->backing_dev_info);
+        bdi_destroy(&fsc->backing_dev_info);
+fail_client:
+        ceph_destroy_client(fsc->client);
 fail:
-        kfree(client);
+        kfree(fsc);
        return ERR_PTR(err);
 }
-static void ceph_destroy_client(struct ceph_client *client)
+void destroy_fs_client(struct ceph_fs_client *fsc)
 {
-        dout("destroy_client %p\n", client);
+        dout("destroy_fs_client %p\n", fsc);
-        /* unmount */
+        destroy_workqueue(fsc->wb_wq);
-        ceph_mdsc_stop(&client->mdsc);
+        destroy_workqueue(fsc->pg_inv_wq);
-        ceph_osdc_stop(&client->osdc);
+        destroy_workqueue(fsc->trunc_wq);
-        /*
+        bdi_destroy(&fsc->backing_dev_info);
-         * make sure mds and osd connections close out before destroying
-         * the auth module, which is needed to free those connections'
-         * ceph_authorizers.
-         */
-        ceph_msgr_flush();
-        ceph_monc_stop(&client->monc);
-        ceph_debugfs_client_cleanup(client);
+        mempool_destroy(fsc->wb_pagevec_pool);
-        destroy_workqueue(client->wb_wq);
-        destroy_workqueue(client->pg_inv_wq);
-        destroy_workqueue(client->trunc_wq);
-        bdi_destroy(&client->backing_dev_info);
+        destroy_mount_options(fsc->mount_options);
-        if (client->msgr)
+        ceph_fs_debugfs_cleanup(fsc);
-                ceph_messenger_destroy(client->msgr);
-        mempool_destroy(client->wb_pagevec_pool);
-        destroy_mount_args(client->mount_args);
+        ceph_destroy_client(fsc->client);
-        kfree(client);
+        kfree(fsc);
-        dout("destroy_client %p done\n", client);
+        dout("destroy_fs_client %p done\n", fsc);
 }
 /*
- * Initially learn our fsid, or verify an fsid matches.
+ * caches
 */
-int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
+struct kmem_cache *ceph_inode_cachep;
+struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_dentry_cachep;
+struct kmem_cache *ceph_file_cachep;
+static void ceph_inode_init_once(void *foo)
 {
-        if (client->have_fsid) {
+        struct ceph_inode_info *ci = foo;
-                if (ceph_fsid_compare(&client->fsid, fsid)) {
+        inode_init_once(&ci->vfs_inode);
-                        pr_err("bad fsid, had %pU got %pU",
+}
-                               &client->fsid, fsid);
-                        return -1;
+static int __init init_caches(void)
-                }
+{
-        } else {
+        ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
-                pr_info("client%lld fsid %pU\n", client->monc.auth->global_id,
+                                      sizeof(struct ceph_inode_info),
-                        fsid);
+                                      __alignof__(struct ceph_inode_info),
-                memcpy(&client->fsid, fsid, sizeof(*fsid));
+                                      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
-                ceph_debugfs_client_init(client);
+                                      ceph_inode_init_once);
-                client->have_fsid = true;
+        if (ceph_inode_cachep == NULL)
-        }
+                return -ENOMEM;
+        ceph_cap_cachep = KMEM_CACHE(ceph_cap,
+                                     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+        if (ceph_cap_cachep == NULL)
+                goto bad_cap;
+        ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
+                                        SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+        if (ceph_dentry_cachep == NULL)
+                goto bad_dentry;
+        ceph_file_cachep = KMEM_CACHE(ceph_file_info,
+                                      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+        if (ceph_file_cachep == NULL)
+                goto bad_file;
        return 0;
+bad_file:
+        kmem_cache_destroy(ceph_dentry_cachep);
+bad_dentry:
+        kmem_cache_destroy(ceph_cap_cachep);
+bad_cap:
+        kmem_cache_destroy(ceph_inode_cachep);
+        return -ENOMEM;
 }
+static void destroy_caches(void)
+{
+        kmem_cache_destroy(ceph_inode_cachep);
+        kmem_cache_destroy(ceph_cap_cachep);
+        kmem_cache_destroy(ceph_dentry_cachep);
+        kmem_cache_destroy(ceph_file_cachep);
+}
 /*
- * true if we have the mon map (and have thus joined the cluster)
+ * ceph_umount_begin - initiate forced umount.  Tear down the
+ * mount, skipping steps that may hang while waiting for server(s).
 */
-static int have_mon_and_osd_map(struct ceph_client *client)
+static void ceph_umount_begin(struct super_block *sb)
 {
-        return client->monc.monmap && client->monc.monmap->epoch &&
+        struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
-               client->osdc.osdmap && client->osdc.osdmap->epoch;
+        dout("ceph_umount_begin - starting forced umount\n");
+        if (!fsc)
+                return;
+        fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
+        return;
 }
+static const struct super_operations ceph_super_ops = {
+        .alloc_inode    = ceph_alloc_inode,
+        .destroy_inode  = ceph_destroy_inode,
+        .write_inode    = ceph_write_inode,
+        .sync_fs        = ceph_sync_fs,
+        .put_super      = ceph_put_super,
+        .show_options   = ceph_show_options,
+        .statfs         = ceph_statfs,
+        .umount_begin   = ceph_umount_begin,
+};
 /*
 * Bootstrap mount by opening the root directory.  Note the mount
 * @started time from caller, and time out if this takes too long.
 */
-static struct dentry *open_root_dentry(struct ceph_client *client,
+static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
                                       const char *path,
                                       unsigned long started)
 {
-        struct ceph_mds_client *mdsc = &client->mdsc;
+        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct ceph_mds_request *req = NULL;
        int err;
        struct dentry *root;
@@ -784,14 +616,14 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
        req->r_ino1.ino = CEPH_INO_ROOT;
        req->r_ino1.snap = CEPH_NOSNAP;
        req->r_started = started;
-        req->r_timeout = client->mount_args->mount_timeout * HZ;
+        req->r_timeout = fsc->client->options->mount_timeout * HZ;
        req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
        req->r_num_caps = 2;
        err = ceph_mdsc_do_request(mdsc, NULL, req);
        if (err == 0) {
                dout("open_root_inode success\n");
                if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
-                    client->sb->s_root == NULL)
+                    fsc->sb->s_root == NULL)
                        root = d_alloc_root(req->r_target_inode);
                else
                        root = d_obtain_alias(req->r_target_inode);
@@ -804,105 +636,84 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
        return root;
 }
 /*
 * mount: join the ceph cluster, and open root directory.
 */
-static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
+static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
                      const char *path)
 {
-        struct ceph_entity_addr *myaddr = NULL;
        int err;
-        unsigned long timeout = client->mount_args->mount_timeout * HZ;
        unsigned long started = jiffies;  /* note the start time */
        struct dentry *root;
+        int first = 0;   /* first vfsmount for this super_block */
        dout("mount start\n");
-        mutex_lock(&client->mount_mutex);
+        mutex_lock(&fsc->client->mount_mutex);
-        /* initialize the messenger */
-        if (client->msgr == NULL) {
-                if (ceph_test_opt(client, MYIP))
-                        myaddr = &client->mount_args->my_addr;
-                client->msgr = ceph_messenger_create(myaddr);
-                if (IS_ERR(client->msgr)) {
-                        err = PTR_ERR(client->msgr);
-                        client->msgr = NULL;
-                        goto out;
-                }
-                client->msgr->nocrc = ceph_test_opt(client, NOCRC);
-        }
-        /* open session, and wait for mon, mds, and osd maps */
+        err = __ceph_open_session(fsc->client, started);
-        err = ceph_monc_open_session(&client->monc);
        if (err < 0)
                goto out;
-        while (!have_mon_and_osd_map(client)) {
-                err = -EIO;
-                if (timeout && time_after_eq(jiffies, started + timeout))
-                        goto out;
-                /* wait */
-                dout("mount waiting for mon_map\n");
-                err = wait_event_interruptible_timeout(client->auth_wq,
-                       have_mon_and_osd_map(client) || (client->auth_err < 0),
-                       timeout);
-                if (err == -EINTR || err == -ERESTARTSYS)
-                        goto out;
-                if (client->auth_err < 0) {
-                        err = client->auth_err;
-                        goto out;
-                }
-        }
        dout("mount opening root\n");
-        root = open_root_dentry(client, "", started);
+        root = open_root_dentry(fsc, "", started);
        if (IS_ERR(root)) {
                err = PTR_ERR(root);
                goto out;
        }
-        if (client->sb->s_root)
+        if (fsc->sb->s_root) {
                dput(root);
-        else
+        } else {
-                client->sb->s_root = root;
+                fsc->sb->s_root = root;
+                first = 1;
+                err = ceph_fs_debugfs_init(fsc);
+                if (err < 0)
+                        goto fail;
+        }
        if (path[0] == 0) {
                dget(root);
        } else {
                dout("mount opening base mountpoint\n");
-                root = open_root_dentry(client, path, started);
+                root = open_root_dentry(fsc, path, started);
                if (IS_ERR(root)) {
                        err = PTR_ERR(root);
-                        dput(client->sb->s_root);
+                        goto fail;
-                        client->sb->s_root = NULL;
-                        goto out;
                }
        }
-        mnt->mnt_root = root;
+        fsc->mount_state = CEPH_MOUNT_MOUNTED;
-        mnt->mnt_sb = client->sb;
-        client->mount_state = CEPH_MOUNT_MOUNTED;
        dout("mount success\n");
-        err = 0;
+        mutex_unlock(&fsc->client->mount_mutex);
+        return root;
 out:
-        mutex_unlock(&client->mount_mutex);
+        mutex_unlock(&fsc->client->mount_mutex);
-        return err;
+        return ERR_PTR(err);
+fail:
+        if (first) {
+                dput(fsc->sb->s_root);
+                fsc->sb->s_root = NULL;
+        }
+        goto out;
 }
 static int ceph_set_super(struct super_block *s, void *data)
 {
-        struct ceph_client *client = data;
+        struct ceph_fs_client *fsc = data;
        int ret;
        dout("set_super %p data %p\n", s, data);
-        s->s_flags = client->mount_args->sb_flags;
+        s->s_flags = fsc->mount_options->sb_flags;
        s->s_maxbytes = 1ULL << 40;  /* temp value until we get mdsmap */
-        s->s_fs_info = client;
+        s->s_fs_info = fsc;
-        client->sb = s;
+        fsc->sb = s;
        s->s_op = &ceph_super_ops;
        s->s_export_op = &ceph_export_ops;
@@ -917,7 +728,7 @@ static int ceph_set_super(struct super_block *s, void *data)
 fail:
        s->s_fs_info = NULL;
-        client->sb = NULL;
+        fsc->sb = NULL;
        return ret;
 }
@@ -926,30 +737,23 @@ fail:
 */
 static int ceph_compare_super(struct super_block *sb, void *data)
 {
-        struct ceph_client *new = data;
+        struct ceph_fs_client *new = data;
-        struct ceph_mount_args *args = new->mount_args;
+        struct ceph_mount_options *fsopt = new->mount_options;
-        struct ceph_client *other = ceph_sb_to_client(sb);
+        struct ceph_options *opt = new->client->options;
-        int i;
+        struct ceph_fs_client *other = ceph_sb_to_client(sb);
        dout("ceph_compare_super %p\n", sb);
-        if (args->flags & CEPH_OPT_FSID) {
-                if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
+        if (compare_mount_options(fsopt, opt, other)) {
-                        dout("fsid doesn't match\n");
+                dout("monitor(s)/mount options don't match\n");
-                        return 0;
+                return 0;
-                }
+        }
-        } else {
+        if ((opt->flags & CEPH_OPT_FSID) &&
-                /* do we share (a) monitor? */
+            ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
-                for (i = 0; i < new->monc.monmap->num_mon; i++)
+                dout("fsid doesn't match\n");
-                        if (ceph_monmap_contains(other->monc.monmap,
+                return 0;
-                                         &new->monc.monmap->mon_inst[i].addr))
-                                break;
-                if (i == new->monc.monmap->num_mon) {
-                        dout("mon ip not part of monmap\n");
-                        return 0;
-                }
-                dout("mon ip matches existing sb %p\n", sb);
        }
-        if (args->sb_flags != other->mount_args->sb_flags) {
+        if (fsopt->sb_flags != other->mount_options->sb_flags) {
                dout("flags differ\n");
                return 0;
        }
@@ -961,98 +765,113 @@ static int ceph_compare_super(struct super_block *sb, void *data)
 */
 static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
-static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
+static int ceph_register_bdi(struct super_block *sb,
+                             struct ceph_fs_client *fsc)
 {
        int err;
        /* set ra_pages based on rsize mount option? */
-        if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
+        if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
-                client->backing_dev_info.ra_pages =
+                fsc->backing_dev_info.ra_pages =
-                        (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
+                        (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
                        >> PAGE_SHIFT;
-        err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d",
+        err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
                           atomic_long_inc_return(&bdi_seq));
        if (!err)
-                sb->s_bdi = &client->backing_dev_info;
+                sb->s_bdi = &fsc->backing_dev_info;
        return err;
 }
-static int ceph_get_sb(struct file_system_type *fs_type,
+static struct dentry *ceph_mount(struct file_system_type *fs_type,
-                       int flags, const char *dev_name, void *data,
+                       int flags, const char *dev_name, void *data)
-                       struct vfsmount *mnt)
 {
        struct super_block *sb;
-        struct ceph_client *client;
+        struct ceph_fs_client *fsc;
+        struct dentry *res;
        int err;
        int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
        const char *path = NULL;
-        struct ceph_mount_args *args;
+        struct ceph_mount_options *fsopt = NULL;
+        struct ceph_options *opt = NULL;
-        dout("ceph_get_sb\n");
+        dout("ceph_mount\n");
-        args = parse_mount_args(flags, data, dev_name, &path);
+        err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
-        if (IS_ERR(args)) {
+        if (err < 0) {
-                err = PTR_ERR(args);
+                res = ERR_PTR(err);
                goto out_final;
        }
        /* create client (which we may/may not use) */
-        client = ceph_create_client(args);
+        fsc = create_fs_client(fsopt, opt);
-        if (IS_ERR(client)) {
+        if (IS_ERR(fsc)) {
-                err = PTR_ERR(client);
+                res = ERR_CAST(fsc);
+                kfree(fsopt);
+                kfree(opt);
                goto out_final;
        }
-        if (client->mount_args->flags & CEPH_OPT_NOSHARE)
+        err = ceph_mdsc_init(fsc);
+        if (err < 0) {
+                res = ERR_PTR(err);
+                goto out;
+        }
+        if (ceph_test_opt(fsc->client, NOSHARE))
                compare_super = NULL;
-        sb = sget(fs_type, compare_super, ceph_set_super, client);
+        sb = sget(fs_type, compare_super, ceph_set_super, fsc);
        if (IS_ERR(sb)) {
-                err = PTR_ERR(sb);
+                res = ERR_CAST(sb);
                goto out;
        }
-        if (ceph_sb_to_client(sb) != client) {
+        if (ceph_sb_to_client(sb) != fsc) {
-                ceph_destroy_client(client);
+                ceph_mdsc_destroy(fsc);
-                client = ceph_sb_to_client(sb);
+                destroy_fs_client(fsc);
-                dout("get_sb got existing client %p\n", client);
+                fsc = ceph_sb_to_client(sb);
+                dout("get_sb got existing client %p\n", fsc);
        } else {
-                dout("get_sb using new client %p\n", client);
+                dout("get_sb using new client %p\n", fsc);
-                err = ceph_register_bdi(sb, client);
+                err = ceph_register_bdi(sb, fsc);
-                if (err < 0)
+                if (err < 0) {
+                        res = ERR_PTR(err);
                        goto out_splat;
+                }
        }
-        err = ceph_mount(client, mnt, path);
+        res = ceph_real_mount(fsc, path);
-        if (err < 0)
+        if (IS_ERR(res))
                goto out_splat;
-        dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
+        dout("root %p inode %p ino %llx.%llx\n", res,
-             mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode));
+             res->d_inode, ceph_vinop(res->d_inode));
-        return 0;
+        return res;
 out_splat:
-        ceph_mdsc_close_sessions(&client->mdsc);
+        ceph_mdsc_close_sessions(fsc->mdsc);
        deactivate_locked_super(sb);
        goto out_final;
 out:
-        ceph_destroy_client(client);
+        ceph_mdsc_destroy(fsc);
+        destroy_fs_client(fsc);
 out_final:
-        dout("ceph_get_sb fail %d\n", err);
+        dout("ceph_mount fail %ld\n", PTR_ERR(res));
-        return err;
+        return res;
 }
 static void ceph_kill_sb(struct super_block *s)
 {
-        struct ceph_client *client = ceph_sb_to_client(s);
+        struct ceph_fs_client *fsc = ceph_sb_to_client(s);
        dout("kill_sb %p\n", s);
-        ceph_mdsc_pre_umount(&client->mdsc);
+        ceph_mdsc_pre_umount(fsc->mdsc);
        kill_anon_super(s);    /* will call put_super after sb is r/o */
-        ceph_destroy_client(client);
+        ceph_mdsc_destroy(fsc);
+        destroy_fs_client(fsc);
 }
 static struct file_system_type ceph_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ceph",
-        .get_sb         = ceph_get_sb,
+        .mount          = ceph_mount,
        .kill_sb        = ceph_kill_sb,
        .fs_flags       = FS_RENAME_DOES_D_MOVE,
 };
@@ -1062,36 +881,20 @@ static struct file_system_type ceph_fs_type = {
 static int __init init_ceph(void)
 {
-        int ret = 0;
+        int ret = init_caches();
-        ret = ceph_debugfs_init();
-        if (ret < 0)
-                goto out;
-        ret = ceph_msgr_init();
-        if (ret < 0)
-                goto out_debugfs;
-        ret = init_caches();
        if (ret)
-                goto out_msgr;
+                goto out;
        ret = register_filesystem(&ceph_fs_type);
        if (ret)
                goto out_icache;
-        pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n",
+        pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
-                CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL,
-                CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
-                CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
        return 0;
 out_icache:
        destroy_caches();
-out_msgr:
-        ceph_msgr_exit();
-out_debugfs:
-        ceph_debugfs_cleanup();
 out:
        return ret;
 }
@@ -1101,8 +904,6 @@ static void __exit exit_ceph(void)
        dout("exit_ceph\n");
        unregister_filesystem(&ceph_fs_type);
        destroy_caches();
-        ceph_msgr_exit();
-        ceph_debugfs_cleanup();
 }
 module_init(init_ceph);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index b87638e84c4b..20b907d76ae2 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1,7 +1,7 @@
 #ifndef _FS_CEPH_SUPER_H
 #define _FS_CEPH_SUPER_H
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 #include <asm/unaligned.h>
 #include <linux/backing-dev.h>
@@ -14,13 +14,7 @@
 #include <linux/writeback.h>
 #include <linux/slab.h>
-#include "types.h"
+#include <linux/ceph/libceph.h>
-#include "messenger.h"
-#include "msgpool.h"
-#include "mon_client.h"
-#include "mds_client.h"
-#include "osd_client.h"
-#include "ceph_fs.h"
 /* f_type in struct statfs */
 #define CEPH_SUPER_MAGIC 0x00c36400
@@ -30,42 +24,25 @@
 #define CEPH_BLOCK_SHIFT   20  /* 1 MB */
 #define CEPH_BLOCK         (1 << CEPH_BLOCK_SHIFT)
-/*
+#define CEPH_MOUNT_OPT_DIRSTAT         (1<<4) /* `cat dirname` for stats */
- * Supported features
+#define CEPH_MOUNT_OPT_RBYTES          (1<<5) /* dir st_bytes = rbytes */
- */
+#define CEPH_MOUNT_OPT_NOASYNCREADDIR  (1<<7) /* no dcache readdir */
-#define CEPH_FEATURE_SUPPORTED CEPH_FEATURE_NOSRCADDR | CEPH_FEATURE_FLOCK
-#define CEPH_FEATURE_REQUIRED  CEPH_FEATURE_NOSRCADDR
-/*
+#define CEPH_MOUNT_OPT_DEFAULT    (CEPH_MOUNT_OPT_RBYTES)
- * mount options
- */
-#define CEPH_OPT_FSID             (1<<0)
-#define CEPH_OPT_NOSHARE          (1<<1) /* don't share client with other sbs */
-#define CEPH_OPT_MYIP             (1<<2) /* specified my ip */
-#define CEPH_OPT_DIRSTAT          (1<<4) /* funky `cat dirname` for stats */
-#define CEPH_OPT_RBYTES           (1<<5) /* dir st_bytes = rbytes */
-#define CEPH_OPT_NOCRC            (1<<6) /* no data crc on writes */
-#define CEPH_OPT_NOASYNCREADDIR   (1<<7) /* no dcache readdir */
-#define CEPH_OPT_DEFAULT   (CEPH_OPT_RBYTES)
+/* Set a CEPH_MOUNT_OPT_* flag; expands to an expression, caller adds ';'. */
+#define ceph_set_mount_opt(fsc, opt) \
+        ((fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt)
+#define ceph_test_mount_opt(fsc, opt) \
+        (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
-#define ceph_set_opt(client, opt) \
+#define CEPH_MAX_READDIR_DEFAULT        1024
-        (client)->mount_args->flags |= CEPH_OPT_##opt;
+#define CEPH_MAX_READDIR_BYTES_DEFAULT  (512*1024)
-#define ceph_test_opt(client, opt) \
+#define CEPH_SNAPDIRNAME_DEFAULT        ".snap"
-        (!!((client)->mount_args->flags & CEPH_OPT_##opt))
+struct ceph_mount_options {
-struct ceph_mount_args {
-        int sb_flags;
        int flags;
-        struct ceph_fsid fsid;
+        int sb_flags;
-        struct ceph_entity_addr my_addr;
-        int num_mon;
-        struct ceph_entity_addr *mon_addr;
-        int mount_timeout;
-        int osd_idle_ttl;
-        int osd_timeout;
-        int osd_keepalive_timeout;
        int wsize;
        int rsize;            /* max readahead */
        int congestion_kb;    /* max writeback in flight */
@@ -73,82 +50,25 @@ struct ceph_mount_args {
        int cap_release_safety;
        int max_readdir;       /* max readdir result (entires) */
        int max_readdir_bytes; /* max readdir result (bytes) */
-        char *snapdir_name;   /* default ".snap" */
-        char *name;
-        char *secret;
-};
-/*
+        /*
- * defaults
+         * everything above this point can be memcmp'd; everything below
- */
+         * is handled in compare_mount_options()
-#define CEPH_MOUNT_TIMEOUT_DEFAULT  60
+         */
-#define CEPH_OSD_TIMEOUT_DEFAULT    60  /* seconds */
-#define CEPH_OSD_KEEPALIVE_DEFAULT  5
-#define CEPH_OSD_IDLE_TTL_DEFAULT    60
-#define CEPH_MOUNT_RSIZE_DEFAULT    (512*1024) /* readahead */
-#define CEPH_MAX_READDIR_DEFAULT    1024
-#define CEPH_MAX_READDIR_BYTES_DEFAULT    (512*1024)
-#define CEPH_MSG_MAX_FRONT_LEN  (16*1024*1024)
-#define CEPH_MSG_MAX_DATA_LEN   (16*1024*1024)
-#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
-#define CEPH_AUTH_NAME_DEFAULT   "guest"
-/*
- * Delay telling the MDS we no longer want caps, in case we reopen
- * the file.  Delay a minimum amount of time, even if we send a cap
- * message for some other reason.  Otherwise, take the oppotunity to
- * update the mds to avoid sending another message later.
- */
-#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT      5  /* cap release delay */
-#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT     60  /* cap release delay */
-#define CEPH_CAP_RELEASE_SAFETY_DEFAULT        (CEPH_CAPS_PER_RELEASE * 4)
-/* mount state */
-enum {
-        CEPH_MOUNT_MOUNTING,
-        CEPH_MOUNT_MOUNTED,
-        CEPH_MOUNT_UNMOUNTING,
-        CEPH_MOUNT_UNMOUNTED,
-        CEPH_MOUNT_SHUTDOWN,
-};
-/*
- * subtract jiffies
- */
-static inline unsigned long time_sub(unsigned long a, unsigned long b)
-{
-        BUG_ON(time_after(b, a));
-        return (long)a - (long)b;
-}
-/*
- * per-filesystem client state
- *
- * possibly shared by multiple mount points, if they are
- * mounting the same ceph filesystem/cluster.
- */
-struct ceph_client {
-        struct ceph_fsid fsid;
-        bool have_fsid;
-        struct mutex mount_mutex;       /* serialize mount attempts */
+        char *snapdir_name;   /* default ".snap" */
-        struct ceph_mount_args *mount_args;
+};
+struct ceph_fs_client {
        struct super_block *sb;
-        unsigned long mount_state;
+        struct ceph_mount_options *mount_options;
-        wait_queue_head_t auth_wq;
+        struct ceph_client *client;
-        int auth_err;
+        unsigned long mount_state;
        int min_caps;                  /* min caps i added */
-        struct ceph_messenger *msgr;   /* messenger instance */
+        struct ceph_mds_client *mdsc;
-        struct ceph_mon_client monc;
-        struct ceph_mds_client mdsc;
-        struct ceph_osd_client osdc;
        /* writeback */
        mempool_t *wb_pagevec_pool;
@@ -160,14 +80,14 @@ struct ceph_client {
        struct backing_dev_info backing_dev_info;
 #ifdef CONFIG_DEBUG_FS
-        struct dentry *debugfs_monmap;
+        struct dentry *debugfs_dentry_lru, *debugfs_caps;
-        struct dentry *debugfs_mdsmap, *debugfs_osdmap;
-        struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
        struct dentry *debugfs_congestion_kb;
        struct dentry *debugfs_bdi;
+        struct dentry *debugfs_mdsc, *debugfs_mdsmap;
 #endif
 };
 /*
 * File i/o capability.  This tracks shared state with the metadata
 * server that allows us to cache or writeback attributes or to read
@@ -275,6 +195,20 @@ struct ceph_inode_xattr {
        int should_free_val;
 };
+/*
+ * Ceph dentry state
+ */
+struct ceph_dentry_info {
+        struct ceph_mds_session *lease_session;
+        u32 lease_gen, lease_shared_gen;
+        u32 lease_seq;
+        unsigned long lease_renew_after, lease_renew_from;
+        struct list_head lru;
+        struct dentry *dentry;
+        u64 time;
+        u64 offset;
+};
 struct ceph_inode_xattrs_info {
        /*
         * (still encoded) xattr blob. we avoid the overhead of parsing
@@ -296,11 +230,6 @@ struct ceph_inode_xattrs_info {
 /*
 * Ceph inode.
 */
-#define CEPH_I_COMPLETE  1  /* we have complete directory cached */
-#define CEPH_I_NODELAY   4  /* do not delay cap release */
-#define CEPH_I_FLUSH     8  /* do not delay flush of dirty metadata */
-#define CEPH_I_NOFLUSH  16  /* do not flush dirty caps */
 struct ceph_inode_info {
        struct ceph_vino i_vino;   /* ceph ino + snap */
@@ -310,6 +239,7 @@ struct ceph_inode_info {
        unsigned i_ceph_flags;
        unsigned long i_release_count;
+        struct ceph_dir_layout i_dir_layout;
        struct ceph_file_layout i_layout;
        char *i_symlink;
@@ -364,9 +294,7 @@ struct ceph_inode_info {
        int i_rd_ref, i_rdcache_ref, i_wr_ref;
        int i_wrbuffer_ref, i_wrbuffer_ref_head;
        u32 i_shared_gen;       /* increment each time we get FILE_SHARED */
-        u32 i_rdcache_gen;      /* we increment this each time we get
+        u32 i_rdcache_gen;      /* incremented each time we get FILE_CACHE. */
-                                   FILE_CACHE.  If it's non-zero, we
-                                   _may_ have cached pages. */
        u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
        struct list_head i_unsafe_writes; /* uncommitted sync writes */
@@ -391,6 +319,63 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
        return container_of(inode, struct ceph_inode_info, vfs_inode);
 }
+static inline struct ceph_vino ceph_vino(struct inode *inode)
+{
+        return ceph_inode(inode)->i_vino;
+}
+/*
+ * ino_t is <64 bits on many architectures, blech.
+ *
+ * don't include snap in ino hash, at least for now.
+ */
+static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
+{
+        ino_t ino = (ino_t)vino.ino;  /* ^ (vino.snap << 20); */
+#if BITS_PER_LONG == 32
+        ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
+        if (!ino)
+                ino = 1;
+#endif
+        return ino;
+}
+/* expands to two arguments (ino, snap) for printf-style formatting */
+#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
+static inline u64 ceph_ino(struct inode *inode)
+{
+        return ceph_inode(inode)->i_vino.ino;
+}
+static inline u64 ceph_snap(struct inode *inode)
+{
+        return ceph_inode(inode)->i_vino.snap;
+}
+static inline int ceph_ino_compare(struct inode *inode, void *data)
+{
+        struct ceph_vino *pvino = (struct ceph_vino *)data;
+        struct ceph_inode_info *ci = ceph_inode(inode);
+        return ci->i_vino.ino == pvino->ino &&
+                ci->i_vino.snap == pvino->snap;
+}
+static inline struct inode *ceph_find_inode(struct super_block *sb,
+                                            struct ceph_vino vino)
+{
+        ino_t t = ceph_vino_to_ino(vino);
+        return ilookup5(sb, t, ceph_ino_compare, &vino);
+}
+/*
+ * Ceph inode flags (bit values tested against i_ceph_flags).
+ */
+#define CEPH_I_COMPLETE  1  /* we have complete directory cached */
+#define CEPH_I_NODELAY   4  /* do not delay cap release */
+#define CEPH_I_FLUSH     8  /* do not delay flush of dirty metadata */
+#define CEPH_I_NOFLUSH  16  /* do not flush dirty caps */
 static inline void ceph_i_clear(struct inode *inode, unsigned mask)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
@@ -414,8 +399,9 @@ static inline bool ceph_i_test(struct inode *inode, unsigned mask)
        struct ceph_inode_info *ci = ceph_inode(inode);
        bool r;
-        smp_mb();
+        spin_lock(&inode->i_lock);
        r = (ci->i_ceph_flags & mask) == mask;
+        spin_unlock(&inode->i_lock);
        return r;
 }
@@ -432,20 +418,6 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
                            struct ceph_inode_frag *pfrag,
                            int *found);
-/*
- * Ceph dentry state
- */
-struct ceph_dentry_info {
-        struct ceph_mds_session *lease_session;
-        u32 lease_gen, lease_shared_gen;
-        u32 lease_seq;
-        unsigned long lease_renew_after, lease_renew_from;
-        struct list_head lru;
-        struct dentry *dentry;
-        u64 time;
-        u64 offset;
-};
 static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
 {
        return (struct ceph_dentry_info *)dentry->d_fsdata;
@@ -456,22 +428,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
        return ((loff_t)frag << 32) | (loff_t)off;
 }
-/*
- * ino_t is <64 bits on many architectures, blech.
- *
- * don't include snap in ino hash, at least for now.
- */
-static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
-{
-        ino_t ino = (ino_t)vino.ino;  /* ^ (vino.snap << 20); */
-#if BITS_PER_LONG == 32
-        ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
-        if (!ino)
-                ino = 1;
-#endif
-        return ino;
-}
 static inline int ceph_set_ino_cb(struct inode *inode, void *data)
 {
        ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
@@ -479,39 +435,6 @@ static inline int ceph_set_ino_cb(struct inode *inode, void *data)
        return 0;
 }
-static inline struct ceph_vino ceph_vino(struct inode *inode)
-{
-        return ceph_inode(inode)->i_vino;
-}
-/* for printf-style formatting */
-#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
-static inline u64 ceph_ino(struct inode *inode)
-{
-        return ceph_inode(inode)->i_vino.ino;
-}
-static inline u64 ceph_snap(struct inode *inode)
-{
-        return ceph_inode(inode)->i_vino.snap;
-}
-static inline int ceph_ino_compare(struct inode *inode, void *data)
-{
-        struct ceph_vino *pvino = (struct ceph_vino *)data;
-        struct ceph_inode_info *ci = ceph_inode(inode);
-        return ci->i_vino.ino == pvino->ino &&
-                ci->i_vino.snap == pvino->snap;
-}
-static inline struct inode *ceph_find_inode(struct super_block *sb,
-                                            struct ceph_vino vino)
-{
-        ino_t t = ceph_vino_to_ino(vino);
-        return ilookup5(sb, t, ceph_ino_compare, &vino);
-}
 /*
 * caps helpers
 */
@@ -576,18 +499,18 @@ extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
                             struct ceph_cap_reservation *ctx, int need);
 extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
                               struct ceph_cap_reservation *ctx);
-extern void ceph_reservation_status(struct ceph_client *client,
+extern void ceph_reservation_status(struct ceph_fs_client *client,
                                    int *total, int *avail, int *used,
                                    int *reserved, int *min);
-static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
+static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
 {
-        return (struct ceph_client *)inode->i_sb->s_fs_info;
+        return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
 }
-static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
+static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
 {
-        return (struct ceph_client *)sb->s_fs_info;
+        return (struct ceph_fs_client *)sb->s_fs_info;
 }
@@ -617,51 +540,6 @@ struct ceph_file_info {
 /*
- * snapshots
- */
-/*
- * A "snap context" is the set of existing snapshots when we
- * write data.  It is used by the OSD to guide its COW behavior.
- *
- * The ceph_snap_context is refcounted, and attached to each dirty
- * page, indicating which context the dirty data belonged when it was
- * dirtied.
- */
-struct ceph_snap_context {
-        atomic_t nref;
-        u64 seq;
-        int num_snaps;
-        u64 snaps[];
-};
-static inline struct ceph_snap_context *
-ceph_get_snap_context(struct ceph_snap_context *sc)
-{
-        /*
-        printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
-               atomic_read(&sc->nref)+1);
-        */
-        if (sc)
-                atomic_inc(&sc->nref);
-        return sc;
-}
-static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
-{
-        if (!sc)
-                return;
-        /*
-        printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
-               atomic_read(&sc->nref)-1);
-        */
-        if (atomic_dec_and_test(&sc->nref)) {
-                /*printk(" deleting snap_context %p\n", sc);*/
-                kfree(sc);
-        }
-}
-/*
 * A "snap realm" describes a subset of the file hierarchy sharing
 * the same set of snapshots that apply to it.  The realms themselves
 * are organized into a hierarchy, such that children inherit (some of)
@@ -699,16 +577,33 @@ struct ceph_snap_realm {
        spinlock_t inodes_with_caps_lock;
 };
+static inline int default_congestion_kb(void)
-/*
- * calculate the number of pages a given length and offset map onto,
- * if we align the data.
- */
-static inline int calc_pages_for(u64 off, u64 len)
 {
-        return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
+        int congestion_kb;
-                (off >> PAGE_CACHE_SHIFT);
+        /*
+         * Copied from NFS
+         *
+         * Congestion size (in KB) scales with the square root of total RAM:
+         *
+         *  64MB:    8192k
+         * 128MB:   11585k
+         * 256MB:   16384k
+         * 512MB:   23170k
+         *   1GB:   32768k
+         *   2GB:   46340k
+         *   4GB:   65536k
+         *   8GB:   92681k
+         *  16GB:  131072k
+         *
+         * This allows larger machines to have larger/more transfers.
+         * Cap the default at 256MB (256*1024 KB).
+         */
+        congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+        if (congestion_kb > 256*1024)
+                congestion_kb = 256*1024;
+        return congestion_kb;
 }
@@ -741,16 +636,6 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
                           ci_item)->writing;
 }
-/* super.c */
-extern struct kmem_cache *ceph_inode_cachep;
-extern struct kmem_cache *ceph_cap_cachep;
-extern struct kmem_cache *ceph_dentry_cachep;
-extern struct kmem_cache *ceph_file_cachep;
-extern const char *ceph_msg_type_name(int type);
-extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
 /* inode.c */
 extern const struct inode_operations ceph_file_iops;
@@ -781,7 +666,7 @@ extern void ceph_queue_invalidate(struct inode *inode);
 extern void ceph_queue_writeback(struct inode *inode);
 extern int ceph_do_getattr(struct inode *inode, int mask);
-extern int ceph_permission(struct inode *inode, int mask);
+extern int ceph_permission(struct inode *inode, int mask, unsigned int flags);
 extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
 extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
                        struct kstat *stat);
@@ -857,12 +742,18 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
 /* file.c */
 extern const struct file_operations ceph_file_fops;
 extern const struct address_space_operations ceph_aops;
+extern int ceph_copy_to_page_vector(struct page **pages,
+                                    const char *data,
+                                    loff_t off, size_t len);
+extern int ceph_copy_from_page_vector(struct page **pages,
+                                    char *data,
+                                    loff_t off, size_t len);
+extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
 extern int ceph_open(struct inode *inode, struct file *file);
 extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
                                       struct nameidata *nd, int mode,
                                       int locked_dir);
 extern int ceph_release(struct inode *inode, struct file *filp);
-extern void ceph_release_page_vector(struct page **pages, int num_pages);
 /* dir.c */
 extern const struct file_operations ceph_dir_fops;
@@ -878,6 +769,7 @@ extern void ceph_dentry_lru_add(struct dentry *dn);
 extern void ceph_dentry_lru_touch(struct dentry *dn);
 extern void ceph_dentry_lru_del(struct dentry *dn);
 extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
+extern unsigned ceph_dentry_hash(struct dentry *dn);
 /*
 * our d_ops vary depending on whether the inode is live,
@@ -892,12 +784,6 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 /* export.c */
 extern const struct export_operations ceph_export_ops;
-/* debugfs.c */
-extern int ceph_debugfs_init(void);
-extern void ceph_debugfs_cleanup(void);
-extern int ceph_debugfs_client_init(struct ceph_client *client);
-extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
 /* locks.c */
 extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
 extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
@@ -914,4 +800,8 @@ static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
        return NULL;
 }
+/* debugfs.c */
+extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
+extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
 #endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
deleted file mode 100644
index 28b35a005ec2..000000000000
--- a/fs/ceph/types.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef _FS_CEPH_TYPES_H
-#define _FS_CEPH_TYPES_H
-/* needed before including ceph_fs.h */
-#include <linux/in.h>
-#include <linux/types.h>
-#include <linux/fcntl.h>
-#include <linux/string.h>
-#include "ceph_fs.h"
-#include "ceph_frag.h"
-#include "ceph_hash.h"
-/*
- * Identify inodes by both their ino AND snapshot id (a u64).
- */
-struct ceph_vino {
-        u64 ino;
-        u64 snap;
-};
-/* context for the caps reservation mechanism */
-struct ceph_cap_reservation {
-        int count;
-};
-#endif
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 9578af610b73..8c9eba6ef9df 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1,6 +1,9 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 #include "super.h"
-#include "decode.h"
+#include "mds_client.h"
+#include <linux/ceph/decode.h>
 #include <linux/xattr.h>
 #include <linux/slab.h>
@@ -216,6 +219,7 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
        struct rb_node **p;
        struct rb_node *parent = NULL;
        struct ceph_inode_xattr *xattr = NULL;
+        int name_len = strlen(name);
        int c;
        p = &ci->i_xattrs.index.rb_node;
@@ -223,6 +227,8 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
                parent = *p;
                xattr = rb_entry(parent, struct ceph_inode_xattr, node);
                c = strncmp(name, xattr->name, xattr->name_len);
+                if (c == 0 && name_len > xattr->name_len)
+                        c = 1;
                if (c < 0)
                        p = &(*p)->rb_left;
                else if (c > 0)
@@ -620,12 +626,12 @@ out:
 static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
                              const char *value, size_t size, int flags)
 {
-        struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
+        struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
        struct inode *inode = dentry->d_inode;
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct inode *parent_inode = dentry->d_parent->d_inode;
        struct ceph_mds_request *req;
-        struct ceph_mds_client *mdsc = &client->mdsc;
+        struct ceph_mds_client *mdsc = fsc->mdsc;
        int err;
        int i, nr_pages;
        struct page **pages = NULL;
@@ -713,10 +719,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
        /* preallocate memory for xattr name, value, index node */
        err = -ENOMEM;
-        newname = kmalloc(name_len + 1, GFP_NOFS);
+        newname = kmemdup(name, name_len + 1, GFP_NOFS);
        if (!newname)
                goto out;
-        memcpy(newname, name, name_len + 1);
        if (val_len) {
                newval = kmalloc(val_len + 1, GFP_NOFS);
@@ -777,8 +782,8 @@ out:
 static int ceph_send_removexattr(struct dentry *dentry, const char *name)
 {
-        struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
+        struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
-        struct ceph_mds_client *mdsc = &client->mdsc;
+        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct inode *inode = dentry->d_inode;
        struct inode *parent_inode = dentry->d_parent->d_inode;
        struct ceph_mds_request *req;