author	Linus Torvalds <torvalds@linux-foundation.org>	2016-05-26 17:10:32 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-05-26 17:10:32 -0400
commit	a10c38a4f385f5d7c173a263ff6bb2d36021b3bb (patch)
tree	3cbaa916940b36a9fdb27c8a231e1488fbc352d6
parent	ea8ea737c46cffa5d0ee74309f81e55a7e5e9c2a (diff)
parent	e536030934aebf049fe6aaebc58dd37aeee21840 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull Ceph updates from Sage Weil:
 "This changeset has a few main parts:

  - Ilya has finished a huge refactoring effort to sync up the
    client-side logic in libceph with the user-space client code, which
    has evolved significantly over the last couple years, with lots of
    additional behaviors (e.g., how requests are handled when cluster
    is full and transitions from full to non-full).

    This structure of the code is more closely aligned with userspace
    now such that it will be much easier to maintain going forward when
    behavior changes take place.  There are some locking improvements
    bundled in as well.

  - Zheng adds multi-filesystem support (multiple namespaces within the
    same Ceph cluster)

  - Zheng has changed the readdir offsets and directory enumeration so
    that dentry offsets are hash-based and therefore stable across
    directory fragmentation events on the MDS.

  - Zheng has a smorgasbord of bug fixes across fs/ceph"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (71 commits)
  ceph: fix wake_up_session_cb()
  ceph: don't use truncate_pagecache() to invalidate read cache
  ceph: SetPageError() for writeback pages if writepages fails
  ceph: handle interrupted ceph_writepage()
  ceph: make ceph_update_writeable_page() uninterruptible
  libceph: make ceph_osdc_wait_request() uninterruptible
  ceph: handle -EAGAIN returned by ceph_update_writeable_page()
  ceph: make fault/page_mkwrite return VM_FAULT_OOM for -ENOMEM
  ceph: block non-fatal signals for fault/page_mkwrite
  ceph: make logical calculation functions return bool
  ceph: tolerate bad i_size for symlink inode
  ceph: improve fragtree change detection
  ceph: keep leaf frag when updating fragtree
  ceph: fix dir_auth check in ceph_fill_dirfrag()
  ceph: don't assume frag tree splits in mds reply are sorted
  ceph: fix inode reference leak
  ceph: using hash value to compose dentry offset
  ceph: don't forbid marking directory complete after forward seek
  ceph: record 'offset' for each entry of readdir result
  ceph: define 'end/complete' in readdir reply as bit flags
  ...
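The hash-based readdir offsets mentioned above pack either a directory frag value or a 24-bit name hash into the high bits of f_pos, with the top byte marking hash-order offsets. Below is a minimal user-space sketch of that encoding, mirroring the ceph_make_fpos()/fpos_off() helpers added in the fs/ceph/dir.c hunk later in this diff (a standalone reimplementation for illustration, not the kernel code itself):

/*
 * Sketch of the readdir f_pos encoding from fs/ceph/dir.c (see the
 * dir.c hunk below); illustrative user-space code, not kernel code.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define OFFSET_BITS	28
#define OFFSET_MASK	((1 << OFFSET_BITS) - 1)
#define HASH_ORDER	(0xffull << (OFFSET_BITS + 24))

/* mirrors ceph_make_fpos(): frag value or name hash in the high bits */
static int64_t make_fpos(unsigned high, unsigned off, bool hash_order)
{
	int64_t fpos = ((int64_t)high << OFFSET_BITS) | (int64_t)off;

	if (hash_order)
		fpos |= HASH_ORDER;	/* top byte flags hash-order offsets */
	return fpos;
}

static bool is_hash_order(int64_t p)
{
	return (p & HASH_ORDER) == HASH_ORDER;
}

static unsigned fpos_off(int64_t p)
{
	return p & OFFSET_MASK;		/* nth entry within the frag/hash */
}

int main(void)
{
	/* frag+name order: offset depends on the current fragmentation */
	int64_t p1 = make_fpos(0x1234, 7, false);
	/* hash order: a 24-bit name hash, stable across MDS refragmentation */
	int64_t p2 = make_fpos(0xabcdef, 7, true);

	printf("%#llx hash_order=%d off=%u\n",
	       (unsigned long long)p1, is_hash_order(p1), fpos_off(p1));
	printf("%#llx hash_order=%d off=%u\n",
	       (unsigned long long)p2, is_hash_order(p2), fpos_off(p2));
	return 0;
}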
-rw-r--r--	drivers/block/rbd.c	305
-rw-r--r--	fs/ceph/addr.c	214
-rw-r--r--	fs/ceph/cache.c	2
-rw-r--r--	fs/ceph/caps.c	51
-rw-r--r--	fs/ceph/debugfs.c	2
-rw-r--r--	fs/ceph/dir.c	376
-rw-r--r--	fs/ceph/file.c	89
-rw-r--r--	fs/ceph/inode.c	159
-rw-r--r--	fs/ceph/ioctl.c	14
-rw-r--r--	fs/ceph/mds_client.c	140
-rw-r--r--	fs/ceph/mds_client.h	17
-rw-r--r--	fs/ceph/mdsmap.c	43
-rw-r--r--	fs/ceph/super.c	47
-rw-r--r--	fs/ceph/super.h	12
-rw-r--r--	fs/ceph/xattr.c	25
-rw-r--r--	include/linux/ceph/ceph_frag.h	4
-rw-r--r--	include/linux/ceph/ceph_fs.h	20
-rw-r--r--	include/linux/ceph/decode.h	2
-rw-r--r--	include/linux/ceph/libceph.h	57
-rw-r--r--	include/linux/ceph/mon_client.h	23
-rw-r--r--	include/linux/ceph/osd_client.h	231
-rw-r--r--	include/linux/ceph/osdmap.h	158
-rw-r--r--	include/linux/ceph/rados.h	34
-rw-r--r--	net/ceph/ceph_common.c	2
-rw-r--r--	net/ceph/ceph_strings.c	16
-rw-r--r--	net/ceph/debugfs.c	147
-rw-r--r--	net/ceph/mon_client.c	393
-rw-r--r--	net/ceph/osd_client.c	4032
-rw-r--r--	net/ceph/osdmap.c	651
29 files changed, 4758 insertions, 2508 deletions
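Much of the rbd.c delta in the diffstat above comes from the libceph watch/notify rework: the old watch_event/watch_request pair becomes one opaque linger handle returned by ceph_osdc_watch() and released by ceph_osdc_unwatch(). The sketch below models that handle-based pattern in user space, including the unwatch-then-rewatch recovery that rbd_watch_errcb() performs in the diff; watch_register()/watch_unregister() and struct dev are hypothetical stand-ins, not the kernel API:

/*
 * User-space analogue of the linger-handle watch pattern adopted by
 * rbd.c below.  watch_register()/watch_unregister() are hypothetical
 * stand-ins for ceph_osdc_watch()/ceph_osdc_unwatch().
 */
#include <stdio.h>
#include <stdlib.h>

struct watch_handle {
	void (*errcb)(void *arg, int err);
	void *arg;
};

static struct watch_handle *watch_register(void (*errcb)(void *, int),
					    void *arg)
{
	struct watch_handle *h = malloc(sizeof(*h));

	if (!h)
		return NULL;
	h->errcb = errcb;
	h->arg = arg;
	return h;
}

static void watch_unregister(struct watch_handle *h)
{
	free(h);
}

struct dev {
	struct watch_handle *watch_handle;	/* opaque handle, like rbd */
};

static void dev_errcb(void *arg, int err);

static int dev_watch(struct dev *d)
{
	d->watch_handle = watch_register(dev_errcb, d);
	return d->watch_handle ? 0 : -1;
}

/* mirrors rbd_watch_errcb(): tear the watch down, then re-register it */
static void dev_errcb(void *arg, int err)
{
	struct dev *d = arg;

	fprintf(stderr, "watch error %d, reregistering\n", err);
	watch_unregister(d->watch_handle);
	d->watch_handle = NULL;
	if (dev_watch(d))
		fprintf(stderr, "failed to reregister watch\n");
}

int main(void)
{
	struct dev d = { NULL };

	if (dev_watch(&d))
		return 1;
	d.watch_handle->errcb(d.watch_handle->arg, -5); /* simulate an error */
	watch_unregister(d.watch_handle);
	return 0;
}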
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 0ede6d7e2568..81666a56415e 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -350,12 +350,12 @@ struct rbd_device {
 	struct rbd_spec		*spec;
 	struct rbd_options	*opts;
 
-	char			*header_name;
+	struct ceph_object_id	header_oid;
+	struct ceph_object_locator header_oloc;
 
 	struct ceph_file_layout	layout;
 
-	struct ceph_osd_event   *watch_event;
-	struct rbd_obj_request	*watch_request;
+	struct ceph_osd_linger_request *watch_handle;
 
 	struct rbd_spec		*parent_spec;
 	u64			parent_overlap;
@@ -1596,12 +1596,6 @@ static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
 	return __rbd_obj_request_wait(obj_request, 0);
 }
 
-static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request,
-					unsigned long timeout)
-{
-	return __rbd_obj_request_wait(obj_request, timeout);
-}
-
 static void rbd_img_request_complete(struct rbd_img_request *img_request)
 {
 
@@ -1751,12 +1745,6 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
 	complete_all(&obj_request->completion);
 }
 
-static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
-{
-	dout("%s: obj %p\n", __func__, obj_request);
-	obj_request_done_set(obj_request);
-}
-
 static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
 {
 	struct rbd_img_request *img_request = NULL;
@@ -1828,13 +1816,12 @@ static void rbd_osd_call_callback(struct rbd_obj_request *obj_request)
 	obj_request_done_set(obj_request);
 }
 
-static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
-				 struct ceph_msg *msg)
+static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
 {
 	struct rbd_obj_request *obj_request = osd_req->r_priv;
 	u16 opcode;
 
-	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
+	dout("%s: osd_req %p\n", __func__, osd_req);
 	rbd_assert(osd_req == obj_request->osd_req);
 	if (obj_request_img_data_test(obj_request)) {
 		rbd_assert(obj_request->img_request);
@@ -1878,10 +1865,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
 	case CEPH_OSD_OP_CALL:
 		rbd_osd_call_callback(obj_request);
 		break;
-	case CEPH_OSD_OP_NOTIFY_ACK:
-	case CEPH_OSD_OP_WATCH:
-		rbd_osd_trivial_callback(obj_request);
-		break;
 	default:
 		rbd_warn(NULL, "%s: unsupported op %hu",
 			 obj_request->object_name, (unsigned short) opcode);
@@ -1896,27 +1879,17 @@ static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
 {
 	struct rbd_img_request *img_request = obj_request->img_request;
 	struct ceph_osd_request *osd_req = obj_request->osd_req;
-	u64 snap_id;
 
-	rbd_assert(osd_req != NULL);
-
-	snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
-	ceph_osdc_build_request(osd_req, obj_request->offset,
-			NULL, snap_id, NULL);
+	if (img_request)
+		osd_req->r_snapid = img_request->snap_id;
 }
 
 static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
 {
-	struct rbd_img_request *img_request = obj_request->img_request;
 	struct ceph_osd_request *osd_req = obj_request->osd_req;
-	struct ceph_snap_context *snapc;
-	struct timespec mtime = CURRENT_TIME;
 
-	rbd_assert(osd_req != NULL);
-
-	snapc = img_request ? img_request->snapc : NULL;
-	ceph_osdc_build_request(osd_req, obj_request->offset,
-			snapc, CEPH_NOSNAP, &mtime);
+	osd_req->r_mtime = CURRENT_TIME;
+	osd_req->r_data_offset = obj_request->offset;
 }
 
 /*
@@ -1954,7 +1927,7 @@ static struct ceph_osd_request *rbd_osd_req_create(
 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
 					  GFP_NOIO);
 	if (!osd_req)
-		return NULL;	/* ENOMEM */
+		goto fail;
 
 	if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
@@ -1965,9 +1938,18 @@ static struct ceph_osd_request *rbd_osd_req_create(
 	osd_req->r_priv = obj_request;
 
 	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
-	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
+	if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
+			     obj_request->object_name))
+		goto fail;
+
+	if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
+		goto fail;
 
 	return osd_req;
+
+fail:
+	ceph_osdc_put_request(osd_req);
+	return NULL;
 }
 
 /*
@@ -2003,16 +1985,25 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops,
 					  false, GFP_NOIO);
 	if (!osd_req)
-		return NULL;	/* ENOMEM */
+		goto fail;
 
 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
 	osd_req->r_callback = rbd_osd_req_callback;
 	osd_req->r_priv = obj_request;
 
 	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
-	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
+	if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
+			     obj_request->object_name))
+		goto fail;
+
+	if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
+		goto fail;
 
 	return osd_req;
+
+fail:
+	ceph_osdc_put_request(osd_req);
+	return NULL;
 }
 
 
@@ -2973,17 +2964,20 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request)
 {
 	struct rbd_obj_request *obj_request;
 	struct rbd_obj_request *next_obj_request;
+	int ret = 0;
 
 	dout("%s: img %p\n", __func__, img_request);
-	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
-		int ret;
 
+	rbd_img_request_get(img_request);
+	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
 		ret = rbd_img_obj_request_submit(obj_request);
 		if (ret)
-			return ret;
+			goto out_put_ireq;
 	}
 
-	return 0;
+out_put_ireq:
+	rbd_img_request_put(img_request);
+	return ret;
 }
 
 static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
@@ -3090,45 +3084,18 @@ out_err:
 	obj_request_done_set(obj_request);
 }
 
-static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
-{
-	struct rbd_obj_request *obj_request;
-	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
-	int ret;
-
-	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
-					     OBJ_REQUEST_NODATA);
-	if (!obj_request)
-		return -ENOMEM;
-
-	ret = -ENOMEM;
-	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
-						  obj_request);
-	if (!obj_request->osd_req)
-		goto out;
-
-	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
-			      notify_id, 0, 0);
-	rbd_osd_req_format_read(obj_request);
+static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev);
+static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev);
 
-	ret = rbd_obj_request_submit(osdc, obj_request);
-	if (ret)
-		goto out;
-	ret = rbd_obj_request_wait(obj_request);
-out:
-	rbd_obj_request_put(obj_request);
-
-	return ret;
-}
-
-static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
+static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
+			 u64 notifier_id, void *data, size_t data_len)
 {
-	struct rbd_device *rbd_dev = (struct rbd_device *)data;
+	struct rbd_device *rbd_dev = arg;
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
 	int ret;
 
-	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
-	     rbd_dev->header_name, (unsigned long long)notify_id,
-	     (unsigned int)opcode);
+	dout("%s rbd_dev %p cookie %llu notify_id %llu\n", __func__, rbd_dev,
+	     cookie, notify_id);
 
 	/*
 	 * Until adequate refresh error handling is in place, there is
@@ -3140,63 +3107,31 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 	if (ret)
 		rbd_warn(rbd_dev, "refresh failed: %d", ret);
 
-	ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id);
+	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
+				   &rbd_dev->header_oloc, notify_id, cookie,
+				   NULL, 0);
 	if (ret)
 		rbd_warn(rbd_dev, "notify_ack ret %d", ret);
 }
 
-/*
- * Send a (un)watch request and wait for the ack.  Return a request
- * with a ref held on success or error.
- */
-static struct rbd_obj_request *rbd_obj_watch_request_helper(
-						struct rbd_device *rbd_dev,
-						bool watch)
+static void rbd_watch_errcb(void *arg, u64 cookie, int err)
 {
-	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
-	struct ceph_options *opts = osdc->client->options;
-	struct rbd_obj_request *obj_request;
+	struct rbd_device *rbd_dev = arg;
 	int ret;
 
-	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
-					     OBJ_REQUEST_NODATA);
-	if (!obj_request)
-		return ERR_PTR(-ENOMEM);
-
-	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1,
-						  obj_request);
-	if (!obj_request->osd_req) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
-			      rbd_dev->watch_event->cookie, 0, watch);
-	rbd_osd_req_format_write(obj_request);
+	rbd_warn(rbd_dev, "encountered watch error: %d", err);
 
-	if (watch)
-		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
-
-	ret = rbd_obj_request_submit(osdc, obj_request);
-	if (ret)
-		goto out;
+	__rbd_dev_header_unwatch_sync(rbd_dev);
 
-	ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout);
-	if (ret)
-		goto out;
-
-	ret = obj_request->result;
+	ret = rbd_dev_header_watch_sync(rbd_dev);
 	if (ret) {
-		if (watch)
-			rbd_obj_request_end(obj_request);
-		goto out;
+		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
+		return;
 	}
 
-	return obj_request;
-
-out:
-	rbd_obj_request_put(obj_request);
-	return ERR_PTR(ret);
+	ret = rbd_dev_refresh(rbd_dev);
+	if (ret)
+		rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret);
 }
 
 /*
@@ -3205,35 +3140,33 @@ out:
 static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
 {
 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
-	struct rbd_obj_request *obj_request;
-	int ret;
+	struct ceph_osd_linger_request *handle;
 
-	rbd_assert(!rbd_dev->watch_event);
-	rbd_assert(!rbd_dev->watch_request);
+	rbd_assert(!rbd_dev->watch_handle);
 
-	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
-				     &rbd_dev->watch_event);
-	if (ret < 0)
-		return ret;
+	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
+				 &rbd_dev->header_oloc, rbd_watch_cb,
+				 rbd_watch_errcb, rbd_dev);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
 
-	obj_request = rbd_obj_watch_request_helper(rbd_dev, true);
-	if (IS_ERR(obj_request)) {
-		ceph_osdc_cancel_event(rbd_dev->watch_event);
-		rbd_dev->watch_event = NULL;
-		return PTR_ERR(obj_request);
-	}
+	rbd_dev->watch_handle = handle;
+	return 0;
+}
 
-	/*
-	 * A watch request is set to linger, so the underlying osd
-	 * request won't go away until we unregister it.  We retain
-	 * a pointer to the object request during that time (in
-	 * rbd_dev->watch_request), so we'll keep a reference to it.
-	 * We'll drop that reference after we've unregistered it in
-	 * rbd_dev_header_unwatch_sync().
-	 */
-	rbd_dev->watch_request = obj_request;
+static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	int ret;
 
-	return 0;
+	if (!rbd_dev->watch_handle)
+		return;
+
+	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
+	if (ret)
+		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
+
+	rbd_dev->watch_handle = NULL;
 }
 
 /*
@@ -3241,24 +3174,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
  */
 static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
 {
-	struct rbd_obj_request *obj_request;
-
-	rbd_assert(rbd_dev->watch_event);
-	rbd_assert(rbd_dev->watch_request);
-
-	rbd_obj_request_end(rbd_dev->watch_request);
-	rbd_obj_request_put(rbd_dev->watch_request);
-	rbd_dev->watch_request = NULL;
-
-	obj_request = rbd_obj_watch_request_helper(rbd_dev, false);
-	if (!IS_ERR(obj_request))
-		rbd_obj_request_put(obj_request);
-	else
-		rbd_warn(rbd_dev, "unable to tear down watch request (%ld)",
-			 PTR_ERR(obj_request));
-
-	ceph_osdc_cancel_event(rbd_dev->watch_event);
-	rbd_dev->watch_event = NULL;
+	__rbd_dev_header_unwatch_sync(rbd_dev);
 
 	dout("%s flushing notifies\n", __func__);
 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
@@ -3591,7 +3507,7 @@ static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
 	if (!ondisk)
 		return -ENOMEM;
 
-	ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_oid.name,
 				0, size, ondisk);
 	if (ret < 0)
 		goto out;
@@ -4033,6 +3949,8 @@ static void rbd_dev_release(struct device *dev)
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 	bool need_put = !!rbd_dev->opts;
 
+	ceph_oid_destroy(&rbd_dev->header_oid);
+
 	rbd_put_client(rbd_dev->rbd_client);
 	rbd_spec_put(rbd_dev->spec);
 	kfree(rbd_dev->opts);
@@ -4063,6 +3981,9 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
 	INIT_LIST_HEAD(&rbd_dev->node);
 	init_rwsem(&rbd_dev->header_rwsem);
 
+	ceph_oid_init(&rbd_dev->header_oid);
+	ceph_oloc_init(&rbd_dev->header_oloc);
+
 	rbd_dev->dev.bus = &rbd_bus_type;
 	rbd_dev->dev.type = &rbd_device_type;
 	rbd_dev->dev.parent = &rbd_root_dev;
@@ -4111,7 +4032,7 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
 		__le64 size;
 	} __attribute__ ((packed)) size_buf = { 0 };
 
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				  "rbd", "get_size",
 				  &snapid, sizeof (snapid),
 				  &size_buf, sizeof (size_buf));
@@ -4151,7 +4072,7 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
 	if (!reply_buf)
 		return -ENOMEM;
 
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				  "rbd", "get_object_prefix", NULL, 0,
 				  reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
@@ -4186,7 +4107,7 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 	u64 unsup;
 	int ret;
 
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				  "rbd", "get_features",
 				  &snapid, sizeof (snapid),
 				  &features_buf, sizeof (features_buf));
@@ -4248,7 +4169,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
 	}
 
 	snapid = cpu_to_le64(rbd_dev->spec->snap_id);
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				  "rbd", "get_parent",
 				  &snapid, sizeof (snapid),
 				  reply_buf, size);
@@ -4351,7 +4272,7 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
 	u64 stripe_count;
 	int ret;
 
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				  "rbd", "get_stripe_unit_count", NULL, 0,
 				  (char *)&striping_info_buf, size);
 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
@@ -4599,7 +4520,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
 	if (!reply_buf)
 		return -ENOMEM;
 
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				  "rbd", "get_snapcontext", NULL, 0,
 				  reply_buf, size);
 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
@@ -4664,7 +4585,7 @@ static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
 		return ERR_PTR(-ENOMEM);
 
 	snapid = cpu_to_le64(snap_id);
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				  "rbd", "get_snapshot_name",
 				  &snapid, sizeof (snapid),
 				  reply_buf, size);
@@ -4975,13 +4896,13 @@ static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
 again:
 	ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
 	if (ret == -ENOENT && tries++ < 1) {
-		ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap",
-					       &newest_epoch);
+		ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap",
+					    &newest_epoch);
 		if (ret < 0)
 			return ret;
 
 		if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
-			ceph_monc_request_next_osdmap(&rbdc->client->monc);
+			ceph_osdc_maybe_request_map(&rbdc->client->osdc);
 			(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
 						     newest_epoch,
 						     opts->mount_timeout);
@@ -5260,35 +5181,26 @@ err_out_unlock:
 static int rbd_dev_header_name(struct rbd_device *rbd_dev)
 {
 	struct rbd_spec *spec = rbd_dev->spec;
-	size_t size;
+	int ret;
 
 	/* Record the header object name for this rbd image. */
 
 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 
+	rbd_dev->header_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
 	if (rbd_dev->image_format == 1)
-		size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
+		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
+				       spec->image_name, RBD_SUFFIX);
 	else
-		size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
-
-	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
-	if (!rbd_dev->header_name)
-		return -ENOMEM;
+		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
+				       RBD_HEADER_PREFIX, spec->image_id);
 
-	if (rbd_dev->image_format == 1)
-		sprintf(rbd_dev->header_name, "%s%s",
-			spec->image_name, RBD_SUFFIX);
-	else
-		sprintf(rbd_dev->header_name, "%s%s",
-			RBD_HEADER_PREFIX, spec->image_id);
-	return 0;
+	return ret;
 }
 
 static void rbd_dev_image_release(struct rbd_device *rbd_dev)
 {
 	rbd_dev_unprobe(rbd_dev);
-	kfree(rbd_dev->header_name);
-	rbd_dev->header_name = NULL;
 	rbd_dev->image_format = 0;
 	kfree(rbd_dev->spec->image_id);
 	rbd_dev->spec->image_id = NULL;
@@ -5327,7 +5239,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
 			pr_info("image %s/%s does not exist\n",
 				rbd_dev->spec->pool_name,
 				rbd_dev->spec->image_name);
-			goto out_header_name;
+			goto err_out_format;
 		}
 	}
 
@@ -5373,7 +5285,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
 		goto err_out_probe;
 
 	dout("discovered format %u image, header name is %s\n",
-		rbd_dev->image_format, rbd_dev->header_name);
+		rbd_dev->image_format, rbd_dev->header_oid.name);
 	return 0;
 
 err_out_probe:
@@ -5381,9 +5293,6 @@ err_out_probe:
 err_out_watch:
 	if (!depth)
 		rbd_dev_header_unwatch_sync(rbd_dev);
-out_header_name:
-	kfree(rbd_dev->header_name);
-	rbd_dev->header_name = NULL;
 err_out_format:
 	rbd_dev->image_format = 0;
 	kfree(rbd_dev->spec->image_id);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 43098cd9602b..eeb71e5de27a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -257,12 +257,12 @@ static int ceph_readpage(struct file *filp, struct page *page)
 /*
  * Finish an async read(ahead) op.
  */
-static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
+static void finish_read(struct ceph_osd_request *req)
 {
 	struct inode *inode = req->r_inode;
 	struct ceph_osd_data *osd_data;
-	int rc = req->r_result;
-	int bytes = le32_to_cpu(msg->hdr.data_len);
+	int rc = req->r_result <= 0 ? req->r_result : 0;
+	int bytes = req->r_result >= 0 ? req->r_result : 0;
 	int num_pages;
 	int i;
 
@@ -376,8 +376,6 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
 	req->r_callback = finish_read;
 	req->r_inode = inode;
 
-	ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
-
 	dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
 	ret = ceph_osdc_start_request(osdc, req, false);
 	if (ret < 0)
@@ -546,11 +544,21 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 				   truncate_seq, truncate_size,
 				   &inode->i_mtime, &page, 1);
 	if (err < 0) {
-		dout("writepage setting page/mapping error %d %p\n", err, page);
+		struct writeback_control tmp_wbc;
+		if (!wbc)
+			wbc = &tmp_wbc;
+		if (err == -ERESTARTSYS) {
+			/* killed by SIGKILL */
+			dout("writepage interrupted page %p\n", page);
+			redirty_page_for_writepage(wbc, page);
+			end_page_writeback(page);
+			goto out;
+		}
+		dout("writepage setting page/mapping error %d %p\n",
+		     err, page);
 		SetPageError(page);
 		mapping_set_error(&inode->i_data, err);
-		if (wbc)
-			wbc->pages_skipped++;
+		wbc->pages_skipped++;
 	} else {
 		dout("writepage cleaned page %p\n", page);
 		err = 0;  /* vfs expects us to return 0 */
@@ -571,12 +579,16 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
 	BUG_ON(!inode);
 	ihold(inode);
 	err = writepage_nounlock(page, wbc);
+	if (err == -ERESTARTSYS) {
+		/* direct memory reclaimer was killed by SIGKILL. return 0
+		 * to prevent caller from setting mapping/page error */
+		err = 0;
+	}
 	unlock_page(page);
 	iput(inode);
 	return err;
 }
 
-
 /*
  * lame release_pages helper.  release_pages() isn't exported to
  * modules.
@@ -600,8 +612,7 @@ static void ceph_release_pages(struct page **pages, int num)
  * If we get an error, set the mapping error bit, but not the individual
  * page error bits.
  */
-static void writepages_finish(struct ceph_osd_request *req,
-			      struct ceph_msg *msg)
+static void writepages_finish(struct ceph_osd_request *req)
 {
 	struct inode *inode = req->r_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
@@ -615,7 +626,6 @@ static void writepages_finish(struct ceph_osd_request *req,
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	bool remove_page;
 
-
 	dout("writepages_finish %p rc %d\n", inode, rc);
 	if (rc < 0)
 		mapping_set_error(mapping, rc);
@@ -650,6 +660,9 @@ static void writepages_finish(struct ceph_osd_request *req,
 			clear_bdi_congested(&fsc->backing_dev_info,
 					    BLK_RW_ASYNC);
 
+		if (rc < 0)
+			SetPageError(page);
+
 		ceph_put_snap_context(page_snap_context(page));
 		page->private = 0;
 		ClearPagePrivate(page);
@@ -718,8 +731,11 @@ static int ceph_writepages_start(struct address_space *mapping,
 	       (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
 
 	if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
-		pr_warn("writepage_start %p on forced umount\n", inode);
-		truncate_pagecache(inode, 0);
+		if (ci->i_wrbuffer_ref > 0) {
+			pr_warn_ratelimited(
+				"writepage_start %p %lld forced umount\n",
+				inode, ceph_ino(inode));
+		}
 		mapping_set_error(mapping, -EIO);
 		return -EIO; /* we're in a forced umount, don't write! */
 	}
@@ -1063,10 +1079,7 @@ new_request:
 			pages = NULL;
 		}
 
-		vino = ceph_vino(inode);
-		ceph_osdc_build_request(req, offset, snapc, vino.snap,
-					&inode->i_mtime);
-
+		req->r_mtime = inode->i_mtime;
 		rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
 		BUG_ON(rc);
 		req = NULL;
@@ -1099,8 +1112,7 @@ release_pvec_pages:
 		mapping->writeback_index = index;
 
 out:
-	if (req)
-		ceph_osdc_put_request(req);
+	ceph_osdc_put_request(req);
 	ceph_put_snap_context(snapc);
 	dout("writepages done, rc = %d\n", rc);
 	return rc;
@@ -1134,6 +1146,7 @@ static int ceph_update_writeable_page(struct file *file,
 			    struct page *page)
 {
 	struct inode *inode = file_inode(file);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	loff_t page_off = pos & PAGE_MASK;
 	int pos_in_page = pos & ~PAGE_MASK;
@@ -1142,6 +1155,12 @@ static int ceph_update_writeable_page(struct file *file,
 	int r;
 	struct ceph_snap_context *snapc, *oldest;
 
+	if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+		dout(" page %p forced umount\n", page);
+		unlock_page(page);
+		return -EIO;
+	}
+
 retry_locked:
 	/* writepages currently holds page lock, but if we change that later, */
 	wait_on_page_writeback(page);
@@ -1165,7 +1184,7 @@ retry_locked:
 			snapc = ceph_get_snap_context(snapc);
 			unlock_page(page);
 			ceph_queue_writeback(inode);
-			r = wait_event_interruptible(ci->i_cap_wq,
+			r = wait_event_killable(ci->i_cap_wq,
 			       context_is_writeable_or_written(inode, snapc));
 			ceph_put_snap_context(snapc);
 			if (r == -ERESTARTSYS)
@@ -1311,6 +1330,17 @@ const struct address_space_operations ceph_aops = {
 	.direct_IO = ceph_direct_io,
 };
 
+static void ceph_block_sigs(sigset_t *oldset)
+{
+	sigset_t mask;
+	siginitsetinv(&mask, sigmask(SIGKILL));
+	sigprocmask(SIG_BLOCK, &mask, oldset);
+}
+
+static void ceph_restore_sigs(sigset_t *oldset)
+{
+	sigprocmask(SIG_SETMASK, oldset, NULL);
+}
 
 /*
  * vm ops
@@ -1323,6 +1353,9 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct page *pinned_page = NULL;
 	loff_t off = vmf->pgoff << PAGE_SHIFT;
 	int want, got, ret;
+	sigset_t oldset;
+
+	ceph_block_sigs(&oldset);
 
 	dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n",
 	     inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE);
@@ -1330,17 +1363,12 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
 	else
 		want = CEPH_CAP_FILE_CACHE;
-	while (1) {
-		got = 0;
-		ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want,
-				    -1, &got, &pinned_page);
-		if (ret == 0)
-			break;
-		if (ret != -ERESTARTSYS) {
-			WARN_ON(1);
-			return VM_FAULT_SIGBUS;
-		}
-	}
+
+	got = 0;
+	ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page);
+	if (ret < 0)
+		goto out_restore;
+
 	dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
 	     inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got));
 
@@ -1357,7 +1385,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	ceph_put_cap_refs(ci, got);
 
 	if (ret != -EAGAIN)
-		return ret;
+		goto out_restore;
 
 	/* read inline data */
 	if (off >= PAGE_SIZE) {
@@ -1371,15 +1399,18 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 					 ~__GFP_FS));
 		if (!page) {
 			ret = VM_FAULT_OOM;
-			goto out;
+			goto out_inline;
 		}
 		ret1 = __ceph_do_getattr(inode, page,
 				 CEPH_STAT_CAP_INLINE_DATA, true);
 		if (ret1 < 0 || off >= i_size_read(inode)) {
 			unlock_page(page);
 			put_page(page);
-			ret = VM_FAULT_SIGBUS;
-			goto out;
+			if (ret1 < 0)
+				ret = ret1;
+			else
+				ret = VM_FAULT_SIGBUS;
+			goto out_inline;
 		}
 		if (ret1 < PAGE_SIZE)
 			zero_user_segment(page, ret1, PAGE_SIZE);
@@ -1388,10 +1419,15 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		SetPageUptodate(page);
 		vmf->page = page;
 		ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
+out_inline:
+		dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
+		     inode, off, (size_t)PAGE_SIZE, ret);
 	}
-out:
-	dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
-	     inode, off, (size_t)PAGE_SIZE, ret);
+out_restore:
+	ceph_restore_sigs(&oldset);
+	if (ret < 0)
+		ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
+
 	return ret;
 }
 
@@ -1409,10 +1445,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	loff_t size = i_size_read(inode);
 	size_t len;
 	int want, got, ret;
+	sigset_t oldset;
 
 	prealloc_cf = ceph_alloc_cap_flush();
 	if (!prealloc_cf)
-		return VM_FAULT_SIGBUS;
+		return VM_FAULT_OOM;
+
+	ceph_block_sigs(&oldset);
 
 	if (ci->i_inline_version != CEPH_INLINE_NONE) {
 		struct page *locked_page = NULL;
@@ -1423,10 +1462,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		ret = ceph_uninline_data(vma->vm_file, locked_page);
 		if (locked_page)
 			unlock_page(locked_page);
-		if (ret < 0) {
-			ret = VM_FAULT_SIGBUS;
+		if (ret < 0)
 			goto out_free;
-		}
 	}
 
 	if (off + PAGE_SIZE <= size)
@@ -1440,45 +1477,36 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
 	else
 		want = CEPH_CAP_FILE_BUFFER;
-	while (1) {
-		got = 0;
-		ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
-				    &got, NULL);
-		if (ret == 0)
-			break;
-		if (ret != -ERESTARTSYS) {
-			WARN_ON(1);
-			ret = VM_FAULT_SIGBUS;
-			goto out_free;
-		}
-	}
+
+	got = 0;
+	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
+			    &got, NULL);
+	if (ret < 0)
+		goto out_free;
+
 	dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
 	     inode, off, len, ceph_cap_string(got));
 
 	/* Update time before taking page lock */
 	file_update_time(vma->vm_file);
 
-	lock_page(page);
+	do {
+		lock_page(page);
 
-	ret = VM_FAULT_NOPAGE;
-	if ((off > size) ||
-	    (page->mapping != inode->i_mapping)) {
-		unlock_page(page);
-		goto out;
-	}
+		if ((off > size) || (page->mapping != inode->i_mapping)) {
+			unlock_page(page);
+			ret = VM_FAULT_NOPAGE;
+			break;
+		}
+
+		ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
+		if (ret >= 0) {
+			/* success.  we'll keep the page locked. */
+			set_page_dirty(page);
+			ret = VM_FAULT_LOCKED;
+		}
+	} while (ret == -EAGAIN);
 
-	ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
-	if (ret >= 0) {
-		/* success.  we'll keep the page locked. */
-		set_page_dirty(page);
-		ret = VM_FAULT_LOCKED;
-	} else {
-		if (ret == -ENOMEM)
-			ret = VM_FAULT_OOM;
-		else
-			ret = VM_FAULT_SIGBUS;
-	}
-out:
 	if (ret == VM_FAULT_LOCKED ||
 	    ci->i_inline_version != CEPH_INLINE_NONE) {
 		int dirty;
@@ -1495,8 +1523,10 @@ out:
 		inode, off, len, ceph_cap_string(got), ret);
 	ceph_put_cap_refs(ci, got);
 out_free:
+	ceph_restore_sigs(&oldset);
 	ceph_free_cap_flush(prealloc_cf);
-
+	if (ret < 0)
+		ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
 	return ret;
 }
 
@@ -1614,7 +1644,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
 		goto out;
 	}
 
-	ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
+	req->r_mtime = inode->i_mtime;
 	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
 	if (!err)
 		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1657,7 +1687,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
 		goto out_put;
 	}
 
-	ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
+	req->r_mtime = inode->i_mtime;
 	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
 	if (!err)
 		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1758,9 +1788,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 	rd_req->r_flags = CEPH_OSD_FLAG_READ;
 	osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
 	rd_req->r_base_oloc.pool = pool;
-	snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name),
-		 "%llx.00000000", ci->i_vino.ino);
-	rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name);
+	ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino);
+
+	err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS);
+	if (err)
+		goto out_unlock;
 
 	wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
 					 1, false, GFP_NOFS);
@@ -1769,11 +1801,14 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 		goto out_unlock;
 	}
 
-	wr_req->r_flags = CEPH_OSD_FLAG_WRITE |
-			  CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+	wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK;
 	osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
-	wr_req->r_base_oloc.pool = pool;
-	wr_req->r_base_oid = rd_req->r_base_oid;
+	ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
+	ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
+
+	err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS);
+	if (err)
+		goto out_unlock;
 
 	/* one page should be large enough for STAT data */
 	pages = ceph_alloc_page_vector(1, GFP_KERNEL);
@@ -1784,12 +1819,9 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 
 	osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
 				     0, false, true);
-	ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP,
-				&ci->vfs_inode.i_mtime);
 	err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
 
-	ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP,
-				&ci->vfs_inode.i_mtime);
+	wr_req->r_mtime = ci->vfs_inode.i_mtime;
 	err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
 
 	if (!err)
@@ -1823,10 +1855,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 out_unlock:
 	up_write(&mdsc->pool_perm_rwsem);
 
-	if (rd_req)
-		ceph_osdc_put_request(rd_req);
-	if (wr_req)
-		ceph_osdc_put_request(wr_req);
+	ceph_osdc_put_request(rd_req);
+	ceph_osdc_put_request(wr_req);
 out:
 	if (!err)
 		err = have;
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index a351480dbabc..c052b5bf219b 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -236,7 +236,7 @@ static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int
 	unlock_page(page);
 }
 
-static inline int cache_valid(struct ceph_inode_info *ci)
+static inline bool cache_valid(struct ceph_inode_info *ci)
 {
 	return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) &&
 		(ci->i_fscache_gen == ci->i_rdcache_gen));
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index cfaeef18cbca..c17b5d76d75e 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1656,7 +1656,7 @@ retry_locked:
 	 */
 	if ((!is_delayed || mdsc->stopping) &&
 	    !S_ISDIR(inode->i_mode) &&		/* ignore readdir cache */
-	    ci->i_wrbuffer_ref == 0 &&		/* no dirty pages... */
+	    !(ci->i_wb_ref || ci->i_wrbuffer_ref) &&   /* no dirty pages... */
 	    inode->i_data.nrpages &&		/* have cached pages */
 	    (revoking & (CEPH_CAP_FILE_CACHE|
 			 CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */
@@ -1698,8 +1698,8 @@ retry_locked:
 
 		revoking = cap->implemented & ~cap->issued;
 		dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
-		     cap->mds, cap, ceph_cap_string(cap->issued),
-		     ceph_cap_string(cap_used),
+		     cap->mds, cap, ceph_cap_string(cap_used),
+		     ceph_cap_string(cap->issued),
 		     ceph_cap_string(cap->implemented),
 		     ceph_cap_string(revoking));
 
@@ -2317,7 +2317,7 @@ again:
 
 	/* make sure file is actually open */
 	file_wanted = __ceph_caps_file_wanted(ci);
-	if ((file_wanted & need) == 0) {
+	if ((file_wanted & need) != need) {
 		dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
 		     ceph_cap_string(need), ceph_cap_string(file_wanted));
 		*err = -EBADF;
@@ -2412,12 +2412,26 @@ again:
 		goto out_unlock;
 	}
 
-	if (!__ceph_is_any_caps(ci) &&
-	    ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
-		dout("get_cap_refs %p forced umount\n", inode);
-		*err = -EIO;
-		ret = 1;
-		goto out_unlock;
+	if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
+		int mds_wanted;
+		if (ACCESS_ONCE(mdsc->fsc->mount_state) ==
+		    CEPH_MOUNT_SHUTDOWN) {
+			dout("get_cap_refs %p forced umount\n", inode);
+			*err = -EIO;
+			ret = 1;
+			goto out_unlock;
+		}
+		mds_wanted = __ceph_caps_mds_wanted(ci);
+		if ((mds_wanted & need) != need) {
+			dout("get_cap_refs %p caps were dropped"
+			     " (session killed?)\n", inode);
+			*err = -ESTALE;
+			ret = 1;
+			goto out_unlock;
+		}
+		if ((mds_wanted & file_wanted) ==
+		    (file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
+			ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
 	}
 
 	dout("get_cap_refs %p have %s needed %s\n", inode,
@@ -2487,7 +2501,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
 			if (err == -EAGAIN)
 				continue;
 			if (err < 0)
-				return err;
+				ret = err;
 		} else {
 			ret = wait_event_interruptible(ci->i_cap_wq,
 			       try_get_cap_refs(ci, need, want, endoff,
@@ -2496,8 +2510,15 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
 				continue;
 			if (err < 0)
 				ret = err;
-			if (ret < 0)
-				return ret;
+		}
+		if (ret < 0) {
+			if (err == -ESTALE) {
+				/* session was killed, try renew caps */
+				ret = ceph_renew_caps(&ci->vfs_inode);
+				if (ret == 0)
+					continue;
+			}
+			return ret;
 		}
 
 		if (ci->i_inline_version != CEPH_INLINE_NONE &&
@@ -2807,7 +2828,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
 	if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
 	    ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
 	    (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
-	    !ci->i_wrbuffer_ref) {
+	    !(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
 		if (try_nonblocking_invalidate(inode)) {
 			/* there were locked pages.. invalidate later
 			   in a separate thread. */
@@ -3226,6 +3247,8 @@ retry:
 
 	if (target < 0) {
 		__ceph_remove_cap(cap, false);
+		if (!ci->i_auth_cap)
+			ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
 		goto out_unlock;
 	}
 
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 31f831471ed2..39ff678e567f 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -109,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p)
 			   path ? path : "");
 		spin_unlock(&req->r_old_dentry->d_lock);
 		kfree(path);
-	} else if (req->r_path2) {
+	} else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) {
 		if (req->r_ino2.ino)
 			seq_printf(s, " #%llx/%s", req->r_ino2.ino,
 				   req->r_path2);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 3ab1192d2029..6e0fedf6713b 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -70,16 +70,42 @@ out_unlock:
70} 70}
71 71
72/* 72/*
73 * for readdir, we encode the directory frag and offset within that 73 * f_pos encoding for readdir:
74 * frag into f_pos. 74 * - hash order:
75 * (0xff << 52) | ((24 bits hash) << 28) |
76 * (the nth entry with that hash, disambiguating collisions);
77 * - frag+name order:
78 * ((frag value) << 28) | (the nth entry in frag);
75 */ 79 */
80#define OFFSET_BITS 28
81#define OFFSET_MASK ((1 << OFFSET_BITS) - 1)
82#define HASH_ORDER (0xffull << (OFFSET_BITS + 24))
83loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order)
84{
85 loff_t fpos = ((loff_t)high << OFFSET_BITS) | (loff_t)off;
86 if (hash_order)
87 fpos |= HASH_ORDER;
88 return fpos;
89}
90
91static bool is_hash_order(loff_t p)
92{
93 return (p & HASH_ORDER) == HASH_ORDER;
94}
95
76static unsigned fpos_frag(loff_t p) 96static unsigned fpos_frag(loff_t p)
77{ 97{
78 return p >> 32; 98 return p >> OFFSET_BITS;
79} 99}
100
101static unsigned fpos_hash(loff_t p)
102{
103 return ceph_frag_value(fpos_frag(p));
104}
105
80static unsigned fpos_off(loff_t p) 106static unsigned fpos_off(loff_t p)
81{ 107{
82 return p & 0xffffffff; 108 return p & OFFSET_MASK;
83} 109}
84 110
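
A minimal userspace sketch of the f_pos packing above, with the constants copied from this patch and invented sample values:

    #include <stdio.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define OFFSET_BITS 28
    #define OFFSET_MASK ((1 << OFFSET_BITS) - 1)
    #define HASH_ORDER  (0xffull << (OFFSET_BITS + 24))

    /* pack a (frag-or-hash, per-entry index) pair into an f_pos value */
    static int64_t make_fpos(unsigned high, unsigned off, bool hash_order)
    {
        int64_t fpos = ((int64_t)high << OFFSET_BITS) | (int64_t)off;
        if (hash_order)
            fpos |= HASH_ORDER;
        return fpos;
    }

    int main(void)
    {
        int64_t p = make_fpos(0x2aaaaa, 5, true); /* 24-bit hash, 6th entry */
        printf("hash_order=%d hash=0x%x off=%u\n",
               (p & HASH_ORDER) == HASH_ORDER,
               (unsigned)((p >> OFFSET_BITS) & 0xffffff),
               (unsigned)(p & OFFSET_MASK));
        return 0;
    }

The 0xff marker in bits 52-59 presumably cannot collide with a frag-order position: the top byte of a frag there is its split count, which the fragtree code below caps at 24.
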
85static int fpos_cmp(loff_t l, loff_t r) 111static int fpos_cmp(loff_t l, loff_t r)
@@ -111,6 +137,50 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name,
111 return 0; 137 return 0;
112} 138}
113 139
140
141static struct dentry *
142__dcache_find_get_entry(struct dentry *parent, u64 idx,
143 struct ceph_readdir_cache_control *cache_ctl)
144{
145 struct inode *dir = d_inode(parent);
146 struct dentry *dentry;
147 unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1;
148 loff_t ptr_pos = idx * sizeof(struct dentry *);
149 pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT;
150
151 if (ptr_pos >= i_size_read(dir))
152 return NULL;
153
154 if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) {
155 ceph_readdir_cache_release(cache_ctl);
156 cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff);
157 if (!cache_ctl->page) {
158 dout(" page %lu not found\n", ptr_pgoff);
159 return ERR_PTR(-EAGAIN);
160 }
161 /* reading/filling the cache are serialized by
162 i_mutex, no need to use page lock */
163 unlock_page(cache_ctl->page);
164 cache_ctl->dentries = kmap(cache_ctl->page);
165 }
166
167 cache_ctl->index = idx & idx_mask;
168
169 rcu_read_lock();
170 spin_lock(&parent->d_lock);
171 /* check i_size again here, because an empty directory can be
172 * marked as complete while not holding the i_mutex. */
173 if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir))
174 dentry = cache_ctl->dentries[cache_ctl->index];
175 else
176 dentry = NULL;
177 spin_unlock(&parent->d_lock);
178 if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
179 dentry = NULL;
180 rcu_read_unlock();
181 return dentry ? : ERR_PTR(-EAGAIN);
182}
183
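
The helper above treats the directory inode's page cache as one flat array of dentry pointers; a cache index maps to a page number and an in-page slot as sketched below (a standalone illustration assuming 4 KiB pages and 8-byte pointers):

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12                /* assume 4 KiB pages */
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    int main(void)
    {
        uint64_t ptr_size = sizeof(void *);            /* 8 on 64-bit */
        uint64_t per_page = PAGE_SIZE / ptr_size;      /* 512 slots/page */
        uint64_t idx = 1000;                           /* cache index */
        uint64_t page_no = (idx * ptr_size) >> PAGE_SHIFT;  /* -> page 1 */
        uint64_t slot = idx & (per_page - 1);          /* -> slot 488 */

        printf("idx %llu -> page %llu slot %llu\n",
               (unsigned long long)idx, (unsigned long long)page_no,
               (unsigned long long)slot);
        return 0;
    }
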
114/* 184/*
115 * When possible, we try to satisfy a readdir by peeking at the 185 * When possible, we try to satisfy a readdir by peeking at the
116 * dcache. We make this work by carefully ordering dentries on 186 * dcache. We make this work by carefully ordering dentries on
@@ -130,75 +200,68 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
130 struct inode *dir = d_inode(parent); 200 struct inode *dir = d_inode(parent);
131 struct dentry *dentry, *last = NULL; 201 struct dentry *dentry, *last = NULL;
132 struct ceph_dentry_info *di; 202 struct ceph_dentry_info *di;
133 unsigned nsize = PAGE_SIZE / sizeof(struct dentry *);
134 int err = 0;
135 loff_t ptr_pos = 0;
136 struct ceph_readdir_cache_control cache_ctl = {}; 203 struct ceph_readdir_cache_control cache_ctl = {};
204 u64 idx = 0;
205 int err = 0;
137 206
138 dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos); 207 dout("__dcache_readdir %p v%u at %llx\n", dir, shared_gen, ctx->pos);
208
209 /* search start position */
210 if (ctx->pos > 2) {
211 u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *));
212 while (count > 0) {
213 u64 step = count >> 1;
214 dentry = __dcache_find_get_entry(parent, idx + step,
215 &cache_ctl);
216 if (!dentry) {
217 /* fall back to linear search */
218 idx = 0;
219 break;
220 }
221 if (IS_ERR(dentry)) {
222 err = PTR_ERR(dentry);
223 goto out;
224 }
225 di = ceph_dentry(dentry);
226 spin_lock(&dentry->d_lock);
227 if (fpos_cmp(di->offset, ctx->pos) < 0) {
228 idx += step + 1;
229 count -= step + 1;
230 } else {
231 count = step;
232 }
233 spin_unlock(&dentry->d_lock);
234 dput(dentry);
235 }
139 236
140 /* we can calculate cache index for the first dirfrag */ 237 dout("__dcache_readdir %p cache idx %llu\n", dir, idx);
141 if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) {
142 cache_ctl.index = fpos_off(ctx->pos) - 2;
143 BUG_ON(cache_ctl.index < 0);
144 ptr_pos = cache_ctl.index * sizeof(struct dentry *);
145 } 238 }
146 239
147 while (true) {
148 pgoff_t pgoff;
149 bool emit_dentry;
150 240
151 if (ptr_pos >= i_size_read(dir)) { 241 for (;;) {
242 bool emit_dentry = false;
243 dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl);
244 if (!dentry) {
152 fi->flags |= CEPH_F_ATEND; 245 fi->flags |= CEPH_F_ATEND;
153 err = 0; 246 err = 0;
154 break; 247 break;
155 } 248 }
156 249 if (IS_ERR(dentry)) {
157 err = -EAGAIN; 250 err = PTR_ERR(dentry);
158 pgoff = ptr_pos >> PAGE_SHIFT; 251 goto out;
159 if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) {
160 ceph_readdir_cache_release(&cache_ctl);
161 cache_ctl.page = find_lock_page(&dir->i_data, pgoff);
162 if (!cache_ctl.page) {
163 dout(" page %lu not found\n", pgoff);
164 break;
165 }
166 /* reading/filling the cache are serialized by
167 * i_mutex, no need to use page lock */
168 unlock_page(cache_ctl.page);
169 cache_ctl.dentries = kmap(cache_ctl.page);
170 } 252 }
171 253
172 rcu_read_lock();
173 spin_lock(&parent->d_lock);
174 /* check i_size again here, because empty directory can be
175 * marked as complete while not holding the i_mutex. */
176 if (ceph_dir_is_complete_ordered(dir) &&
177 ptr_pos < i_size_read(dir))
178 dentry = cache_ctl.dentries[cache_ctl.index % nsize];
179 else
180 dentry = NULL;
181 spin_unlock(&parent->d_lock);
182 if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
183 dentry = NULL;
184 rcu_read_unlock();
185 if (!dentry)
186 break;
187
188 emit_dentry = false;
189 di = ceph_dentry(dentry); 254 di = ceph_dentry(dentry);
190 spin_lock(&dentry->d_lock); 255 spin_lock(&dentry->d_lock);
191 if (di->lease_shared_gen == shared_gen && 256 if (di->lease_shared_gen == shared_gen &&
192 d_really_is_positive(dentry) && 257 d_really_is_positive(dentry) &&
193 ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR &&
194 ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH &&
195 fpos_cmp(ctx->pos, di->offset) <= 0) { 258 fpos_cmp(ctx->pos, di->offset) <= 0) {
196 emit_dentry = true; 259 emit_dentry = true;
197 } 260 }
198 spin_unlock(&dentry->d_lock); 261 spin_unlock(&dentry->d_lock);
199 262
200 if (emit_dentry) { 263 if (emit_dentry) {
201 dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, 264 dout(" %llx dentry %p %pd %p\n", di->offset,
202 dentry, dentry, d_inode(dentry)); 265 dentry, dentry, d_inode(dentry));
203 ctx->pos = di->offset; 266 ctx->pos = di->offset;
204 if (!dir_emit(ctx, dentry->d_name.name, 267 if (!dir_emit(ctx, dentry->d_name.name,
@@ -218,10 +281,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
218 } else { 281 } else {
219 dput(dentry); 282 dput(dentry);
220 } 283 }
221
222 cache_ctl.index++;
223 ptr_pos += sizeof(struct dentry *);
224 } 284 }
285out:
225 ceph_readdir_cache_release(&cache_ctl); 286 ceph_readdir_cache_release(&cache_ctl);
226 if (last) { 287 if (last) {
227 int ret; 288 int ret;
@@ -235,6 +296,16 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
235 return err; 296 return err;
236} 297}
237 298
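
The start-position scan above, and the matching scan over the reply chunk in ceph_readdir() further down, are both the classic lower-bound binary search: find the first element whose offset is not less than the target. A standalone sketch of the pattern (array contents arbitrary):

    #include <stdio.h>
    #include <stdint.h>

    /* index of the first element >= target, or len if none */
    static size_t lower_bound(const int64_t *a, size_t len, int64_t target)
    {
        size_t i = 0, nr = len;
        while (nr > 0) {
            size_t step = nr >> 1;
            if (a[i + step] < target) {
                i += step + 1;
                nr -= step + 1;
            } else {
                nr = step;
            }
        }
        return i;
    }

    int main(void)
    {
        int64_t offsets[] = { 2, 5, 9, 14, 20 };
        printf("%zu %zu\n",
               lower_bound(offsets, 5, 9),    /* -> 2 (exact hit) */
               lower_bound(offsets, 5, 10));  /* -> 3 (next larger) */
        return 0;
    }
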
299static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos)
300{
301 if (!fi->last_readdir)
302 return true;
303 if (is_hash_order(pos))
304 return !ceph_frag_contains_value(fi->frag, fpos_hash(pos));
305 else
306 return fi->frag != fpos_frag(pos);
307}
308
238static int ceph_readdir(struct file *file, struct dir_context *ctx) 309static int ceph_readdir(struct file *file, struct dir_context *ctx)
239{ 310{
240 struct ceph_file_info *fi = file->private_data; 311 struct ceph_file_info *fi = file->private_data;
@@ -242,13 +313,12 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
242 struct ceph_inode_info *ci = ceph_inode(inode); 313 struct ceph_inode_info *ci = ceph_inode(inode);
243 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 314 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
244 struct ceph_mds_client *mdsc = fsc->mdsc; 315 struct ceph_mds_client *mdsc = fsc->mdsc;
245 unsigned frag = fpos_frag(ctx->pos); 316 int i;
246 int off = fpos_off(ctx->pos);
247 int err; 317 int err;
248 u32 ftype; 318 u32 ftype;
249 struct ceph_mds_reply_info_parsed *rinfo; 319 struct ceph_mds_reply_info_parsed *rinfo;
250 320
251 dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off); 321 dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
252 if (fi->flags & CEPH_F_ATEND) 322 if (fi->flags & CEPH_F_ATEND)
253 return 0; 323 return 0;
254 324
@@ -260,7 +330,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
260 inode->i_mode >> 12)) 330 inode->i_mode >> 12))
261 return 0; 331 return 0;
262 ctx->pos = 1; 332 ctx->pos = 1;
263 off = 1;
264 } 333 }
265 if (ctx->pos == 1) { 334 if (ctx->pos == 1) {
266 ino_t ino = parent_ino(file->f_path.dentry); 335 ino_t ino = parent_ino(file->f_path.dentry);
@@ -270,7 +339,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
270 inode->i_mode >> 12)) 339 inode->i_mode >> 12))
271 return 0; 340 return 0;
272 ctx->pos = 2; 341 ctx->pos = 2;
273 off = 2;
274 } 342 }
275 343
276 /* can we use the dcache? */ 344 /* can we use the dcache? */
@@ -285,8 +353,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
285 err = __dcache_readdir(file, ctx, shared_gen); 353 err = __dcache_readdir(file, ctx, shared_gen);
286 if (err != -EAGAIN) 354 if (err != -EAGAIN)
287 return err; 355 return err;
288 frag = fpos_frag(ctx->pos);
289 off = fpos_off(ctx->pos);
290 } else { 356 } else {
291 spin_unlock(&ci->i_ceph_lock); 357 spin_unlock(&ci->i_ceph_lock);
292 } 358 }
@@ -294,8 +360,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
294 /* proceed with a normal readdir */ 360 /* proceed with a normal readdir */
295more: 361more:
296 /* do we have the correct frag content buffered? */ 362 /* do we have the correct frag content buffered? */
297 if (fi->frag != frag || fi->last_readdir == NULL) { 363 if (need_send_readdir(fi, ctx->pos)) {
298 struct ceph_mds_request *req; 364 struct ceph_mds_request *req;
365 unsigned frag;
299 int op = ceph_snap(inode) == CEPH_SNAPDIR ? 366 int op = ceph_snap(inode) == CEPH_SNAPDIR ?
300 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; 367 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
301 368
@@ -305,6 +372,13 @@ more:
305 fi->last_readdir = NULL; 372 fi->last_readdir = NULL;
306 } 373 }
307 374
375 if (is_hash_order(ctx->pos)) {
376 frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
377 NULL, NULL);
378 } else {
379 frag = fpos_frag(ctx->pos);
380 }
381
308 dout("readdir fetching %llx.%llx frag %x offset '%s'\n", 382 dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
309 ceph_vinop(inode), frag, fi->last_name); 383 ceph_vinop(inode), frag, fi->last_name);
310 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); 384 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
@@ -331,6 +405,8 @@ more:
331 req->r_readdir_cache_idx = fi->readdir_cache_idx; 405 req->r_readdir_cache_idx = fi->readdir_cache_idx;
332 req->r_readdir_offset = fi->next_offset; 406 req->r_readdir_offset = fi->next_offset;
333 req->r_args.readdir.frag = cpu_to_le32(frag); 407 req->r_args.readdir.frag = cpu_to_le32(frag);
408 req->r_args.readdir.flags =
409 cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);
334 410
335 req->r_inode = inode; 411 req->r_inode = inode;
336 ihold(inode); 412 ihold(inode);
@@ -340,22 +416,26 @@ more:
340 ceph_mdsc_put_request(req); 416 ceph_mdsc_put_request(req);
341 return err; 417 return err;
342 } 418 }
343 dout("readdir got and parsed readdir result=%d" 419 dout("readdir got and parsed readdir result=%d on "
344 " on frag %x, end=%d, complete=%d\n", err, frag, 420 "frag %x, end=%d, complete=%d, hash_order=%d\n",
421 err, frag,
345 (int)req->r_reply_info.dir_end, 422 (int)req->r_reply_info.dir_end,
346 (int)req->r_reply_info.dir_complete); 423 (int)req->r_reply_info.dir_complete,
347 424 (int)req->r_reply_info.hash_order);
348 425
349 /* note next offset and last dentry name */
350 rinfo = &req->r_reply_info; 426 rinfo = &req->r_reply_info;
351 if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { 427 if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
352 frag = le32_to_cpu(rinfo->dir_dir->frag); 428 frag = le32_to_cpu(rinfo->dir_dir->frag);
353 off = req->r_readdir_offset; 429 if (!rinfo->hash_order) {
354 fi->next_offset = off; 430 fi->next_offset = req->r_readdir_offset;
431 /* adjust ctx->pos to beginning of frag */
432 ctx->pos = ceph_make_fpos(frag,
433 fi->next_offset,
434 false);
435 }
355 } 436 }
356 437
357 fi->frag = frag; 438 fi->frag = frag;
358 fi->offset = fi->next_offset;
359 fi->last_readdir = req; 439 fi->last_readdir = req;
360 440
361 if (req->r_did_prepopulate) { 441 if (req->r_did_prepopulate) {
@@ -363,7 +443,8 @@ more:
363 if (fi->readdir_cache_idx < 0) { 443 if (fi->readdir_cache_idx < 0) {
364 /* preclude from marking dir ordered */ 444 /* preclude from marking dir ordered */
365 fi->dir_ordered_count = 0; 445 fi->dir_ordered_count = 0;
366 } else if (ceph_frag_is_leftmost(frag) && off == 2) { 446 } else if (ceph_frag_is_leftmost(frag) &&
447 fi->next_offset == 2) {
367 /* note dir version at start of readdir so 448 /* note dir version at start of readdir so
368 * we can tell if any dentries get dropped */ 449 * we can tell if any dentries get dropped */
369 fi->dir_release_count = req->r_dir_release_cnt; 450 fi->dir_release_count = req->r_dir_release_cnt;
@@ -377,65 +458,87 @@ more:
377 fi->dir_release_count = 0; 458 fi->dir_release_count = 0;
378 } 459 }
379 460
380 if (req->r_reply_info.dir_end) { 461 /* note next offset and last dentry name */
381 kfree(fi->last_name); 462 if (rinfo->dir_nr > 0) {
382 fi->last_name = NULL; 463 struct ceph_mds_reply_dir_entry *rde =
383 if (ceph_frag_is_rightmost(frag)) 464 rinfo->dir_entries + (rinfo->dir_nr-1);
384 fi->next_offset = 2; 465 unsigned next_offset = req->r_reply_info.dir_end ?
385 else 466 2 : (fpos_off(rde->offset) + 1);
386 fi->next_offset = 0; 467 err = note_last_dentry(fi, rde->name, rde->name_len,
387 } else { 468 next_offset);
388 err = note_last_dentry(fi,
389 rinfo->dir_dname[rinfo->dir_nr-1],
390 rinfo->dir_dname_len[rinfo->dir_nr-1],
391 fi->next_offset + rinfo->dir_nr);
392 if (err) 469 if (err)
393 return err; 470 return err;
471 } else if (req->r_reply_info.dir_end) {
472 fi->next_offset = 2;
473 /* keep last name */
394 } 474 }
395 } 475 }
396 476
397 rinfo = &fi->last_readdir->r_reply_info; 477 rinfo = &fi->last_readdir->r_reply_info;
398 dout("readdir frag %x num %d off %d chunkoff %d\n", frag, 478 dout("readdir frag %x num %d pos %llx chunk first %llx\n",
399 rinfo->dir_nr, off, fi->offset); 479 fi->frag, rinfo->dir_nr, ctx->pos,
400 480 rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);
401 ctx->pos = ceph_make_fpos(frag, off); 481
402 while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { 482 i = 0;
403 struct ceph_mds_reply_inode *in = 483 /* search start position */
404 rinfo->dir_in[off - fi->offset].in; 484 if (rinfo->dir_nr > 0) {
485 int step, nr = rinfo->dir_nr;
486 while (nr > 0) {
487 step = nr >> 1;
488 if (rinfo->dir_entries[i + step].offset < ctx->pos) {
489 i += step + 1;
490 nr -= step + 1;
491 } else {
492 nr = step;
493 }
494 }
495 }
496 for (; i < rinfo->dir_nr; i++) {
497 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
405 struct ceph_vino vino; 498 struct ceph_vino vino;
406 ino_t ino; 499 ino_t ino;
407 500
408 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", 501 BUG_ON(rde->offset < ctx->pos);
409 off, off - fi->offset, rinfo->dir_nr, ctx->pos, 502
410 rinfo->dir_dname_len[off - fi->offset], 503 ctx->pos = rde->offset;
411 rinfo->dir_dname[off - fi->offset], in); 504 dout("readdir (%d/%d) -> %llx '%.*s' %p\n",
412 BUG_ON(!in); 505 i, rinfo->dir_nr, ctx->pos,
413 ftype = le32_to_cpu(in->mode) >> 12; 506 rde->name_len, rde->name, &rde->inode.in);
414 vino.ino = le64_to_cpu(in->ino); 507
415 vino.snap = le64_to_cpu(in->snapid); 508 BUG_ON(!rde->inode.in);
509 ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
510 vino.ino = le64_to_cpu(rde->inode.in->ino);
511 vino.snap = le64_to_cpu(rde->inode.in->snapid);
416 ino = ceph_vino_to_ino(vino); 512 ino = ceph_vino_to_ino(vino);
417 if (!dir_emit(ctx, 513
418 rinfo->dir_dname[off - fi->offset], 514 if (!dir_emit(ctx, rde->name, rde->name_len,
419 rinfo->dir_dname_len[off - fi->offset], 515 ceph_translate_ino(inode->i_sb, ino), ftype)) {
420 ceph_translate_ino(inode->i_sb, ino), ftype)) {
421 dout("filldir stopping us...\n"); 516 dout("filldir stopping us...\n");
422 return 0; 517 return 0;
423 } 518 }
424 off++;
425 ctx->pos++; 519 ctx->pos++;
426 } 520 }
427 521
428 if (fi->last_name) { 522 if (fi->next_offset > 2) {
429 ceph_mdsc_put_request(fi->last_readdir); 523 ceph_mdsc_put_request(fi->last_readdir);
430 fi->last_readdir = NULL; 524 fi->last_readdir = NULL;
431 goto more; 525 goto more;
432 } 526 }
433 527
434 /* more frags? */ 528 /* more frags? */
435 if (!ceph_frag_is_rightmost(frag)) { 529 if (!ceph_frag_is_rightmost(fi->frag)) {
436 frag = ceph_frag_next(frag); 530 unsigned frag = ceph_frag_next(fi->frag);
437 off = 0; 531 if (is_hash_order(ctx->pos)) {
438 ctx->pos = ceph_make_fpos(frag, off); 532 loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
533 fi->next_offset, true);
534 if (new_pos > ctx->pos)
535 ctx->pos = new_pos;
536 /* keep last_name */
537 } else {
538 ctx->pos = ceph_make_fpos(frag, fi->next_offset, false);
539 kfree(fi->last_name);
540 fi->last_name = NULL;
541 }
439 dout("readdir next frag is %x\n", frag); 542 dout("readdir next frag is %x\n", frag);
440 goto more; 543 goto more;
441 } 544 }
@@ -467,7 +570,7 @@ more:
467 return 0; 570 return 0;
468} 571}
469 572
470static void reset_readdir(struct ceph_file_info *fi, unsigned frag) 573static void reset_readdir(struct ceph_file_info *fi)
471{ 574{
472 if (fi->last_readdir) { 575 if (fi->last_readdir) {
473 ceph_mdsc_put_request(fi->last_readdir); 576 ceph_mdsc_put_request(fi->last_readdir);
@@ -477,18 +580,38 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
477 fi->last_name = NULL; 580 fi->last_name = NULL;
478 fi->dir_release_count = 0; 581 fi->dir_release_count = 0;
479 fi->readdir_cache_idx = -1; 582 fi->readdir_cache_idx = -1;
480 if (ceph_frag_is_leftmost(frag)) 583 fi->next_offset = 2; /* compensate for . and .. */
481 fi->next_offset = 2; /* compensate for . and .. */
482 else
483 fi->next_offset = 0;
484 fi->flags &= ~CEPH_F_ATEND; 584 fi->flags &= ~CEPH_F_ATEND;
485} 585}
486 586
587/*
588 * discard buffered readdir content on seekdir(0), on a seek to a new
589 * frag, or on a seek prior to the current chunk
590 */
591static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
592{
593 struct ceph_mds_reply_info_parsed *rinfo;
594 loff_t chunk_offset;
595 if (new_pos == 0)
596 return true;
597 if (is_hash_order(new_pos)) {
598 /* no need to reset last_name for a forward seek when
599 * dentries are sorted in hash order */
600 } else if (fi->frag != fpos_frag(new_pos)) {
601 return true;
602 }
603 rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL;
604 if (!rinfo || !rinfo->dir_nr)
605 return true;
606 chunk_offset = rinfo->dir_entries[0].offset;
607 return new_pos < chunk_offset ||
608 is_hash_order(new_pos) != is_hash_order(chunk_offset);
609}
610
487static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) 611static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
488{ 612{
489 struct ceph_file_info *fi = file->private_data; 613 struct ceph_file_info *fi = file->private_data;
490 struct inode *inode = file->f_mapping->host; 614 struct inode *inode = file->f_mapping->host;
491 loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset);
492 loff_t retval; 615 loff_t retval;
493 616
494 inode_lock(inode); 617 inode_lock(inode);
@@ -505,25 +628,22 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
505 } 628 }
506 629
507 if (offset >= 0) { 630 if (offset >= 0) {
631 if (need_reset_readdir(fi, offset)) {
632 dout("dir_llseek dropping %p content\n", file);
633 reset_readdir(fi);
634 } else if (is_hash_order(offset) && offset > file->f_pos) {
635 /* for hash offset, we don't know if a forward seek
636 * is within same frag */
637 fi->dir_release_count = 0;
638 fi->readdir_cache_idx = -1;
639 }
640
508 if (offset != file->f_pos) { 641 if (offset != file->f_pos) {
509 file->f_pos = offset; 642 file->f_pos = offset;
510 file->f_version = 0; 643 file->f_version = 0;
511 fi->flags &= ~CEPH_F_ATEND; 644 fi->flags &= ~CEPH_F_ATEND;
512 } 645 }
513 retval = offset; 646 retval = offset;
514
515 if (offset == 0 ||
516 fpos_frag(offset) != fi->frag ||
517 fpos_off(offset) < fi->offset) {
518 /* discard buffered readdir content on seekdir(0), or
519 * seek to new frag, or seek prior to current chunk */
520 dout("dir_llseek dropping %p content\n", file);
521 reset_readdir(fi, fpos_frag(offset));
522 } else if (fpos_cmp(offset, old_offset) > 0) {
523 /* reset dir_release_count if we did a forward seek */
524 fi->dir_release_count = 0;
525 fi->readdir_cache_idx = -1;
526 }
527 } 647 }
528out: 648out:
529 inode_unlock(inode); 649 inode_unlock(inode);
@@ -591,7 +711,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
591 return dentry; 711 return dentry;
592} 712}
593 713
594static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) 714static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
595{ 715{
596 return ceph_ino(inode) == CEPH_INO_ROOT && 716 return ceph_ino(inode) == CEPH_INO_ROOT &&
597 strncmp(dentry->d_name.name, ".ceph", 5) == 0; 717 strncmp(dentry->d_name.name, ".ceph", 5) == 0;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 4f1dc7120916..a888df6f2d71 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -192,6 +192,59 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
192} 192}
193 193
194/* 194/*
195 * try to renew caps after the session gets killed.
196 */
197int ceph_renew_caps(struct inode *inode)
198{
199 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
200 struct ceph_inode_info *ci = ceph_inode(inode);
201 struct ceph_mds_request *req;
202 int err, flags, wanted;
203
204 spin_lock(&ci->i_ceph_lock);
205 wanted = __ceph_caps_file_wanted(ci);
206 if (__ceph_is_any_real_caps(ci) &&
207 (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
208 int issued = __ceph_caps_issued(ci, NULL);
209 spin_unlock(&ci->i_ceph_lock);
210 dout("renew caps %p want %s issued %s updating mds_wanted\n",
211 inode, ceph_cap_string(wanted), ceph_cap_string(issued));
212 ceph_check_caps(ci, 0, NULL);
213 return 0;
214 }
215 spin_unlock(&ci->i_ceph_lock);
216
217 flags = 0;
218 if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
219 flags = O_RDWR;
220 else if (wanted & CEPH_CAP_FILE_RD)
221 flags = O_RDONLY;
222 else if (wanted & CEPH_CAP_FILE_WR)
223 flags = O_WRONLY;
224#ifdef O_LAZY
225 if (wanted & CEPH_CAP_FILE_LAZYIO)
226 flags |= O_LAZY;
227#endif
228
229 req = prepare_open_request(inode->i_sb, flags, 0);
230 if (IS_ERR(req)) {
231 err = PTR_ERR(req);
232 goto out;
233 }
234
235 req->r_inode = inode;
236 ihold(inode);
237 req->r_num_caps = 1;
238 req->r_fmode = -1;
239
240 err = ceph_mdsc_do_request(mdsc, NULL, req);
241 ceph_mdsc_put_request(req);
242out:
243 dout("renew caps %p open result=%d\n", inode, err);
244 return err < 0 ? err : 0;
245}
246
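
The wanted-to-flags translation in ceph_renew_caps() reduces to a three-way mapping from cap bits to an open mode; a userspace sketch (CAP_RD/CAP_WR are placeholder values, not the kernel's CEPH_CAP_* encoding):

    #include <stdio.h>
    #include <fcntl.h>

    #define CAP_RD 0x1  /* placeholder for CEPH_CAP_FILE_RD */
    #define CAP_WR 0x2  /* placeholder for CEPH_CAP_FILE_WR */

    static int wanted_to_flags(int wanted)
    {
        if ((wanted & CAP_RD) && (wanted & CAP_WR))
            return O_RDWR;
        if (wanted & CAP_RD)
            return O_RDONLY;
        if (wanted & CAP_WR)
            return O_WRONLY;
        return 0;
    }

    int main(void)
    {
        printf("%d %d %d\n",
               wanted_to_flags(CAP_RD),           /* O_RDONLY */
               wanted_to_flags(CAP_WR),           /* O_WRONLY */
               wanted_to_flags(CAP_RD | CAP_WR)); /* O_RDWR */
        return 0;
    }
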
247/*
195 * If we already have the requisite capabilities, we can satisfy 248 * If we already have the requisite capabilities, we can satisfy
196 * the open request locally (no need to request new caps from the 249 * the open request locally (no need to request new caps from the
197 * MDS). We do, however, need to inform the MDS (asynchronously) 250 * MDS). We do, however, need to inform the MDS (asynchronously)
@@ -616,8 +669,7 @@ static void ceph_aio_complete(struct inode *inode,
616 kfree(aio_req); 669 kfree(aio_req);
617} 670}
618 671
619static void ceph_aio_complete_req(struct ceph_osd_request *req, 672static void ceph_aio_complete_req(struct ceph_osd_request *req)
620 struct ceph_msg *msg)
621{ 673{
622 int rc = req->r_result; 674 int rc = req->r_result;
623 struct inode *inode = req->r_inode; 675 struct inode *inode = req->r_inode;
@@ -714,14 +766,21 @@ static void ceph_aio_retry_work(struct work_struct *work)
714 req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | 766 req->r_flags = CEPH_OSD_FLAG_ORDERSNAP |
715 CEPH_OSD_FLAG_ONDISK | 767 CEPH_OSD_FLAG_ONDISK |
716 CEPH_OSD_FLAG_WRITE; 768 CEPH_OSD_FLAG_WRITE;
717 req->r_base_oloc = orig_req->r_base_oloc; 769 ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
718 req->r_base_oid = orig_req->r_base_oid; 770 ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
771
772 ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
773 if (ret) {
774 ceph_osdc_put_request(req);
775 req = orig_req;
776 goto out;
777 }
719 778
720 req->r_ops[0] = orig_req->r_ops[0]; 779 req->r_ops[0] = orig_req->r_ops[0];
721 osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); 780 osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
722 781
723 ceph_osdc_build_request(req, req->r_ops[0].extent.offset, 782 req->r_mtime = aio_req->mtime;
724 snapc, CEPH_NOSNAP, &aio_req->mtime); 783 req->r_data_offset = req->r_ops[0].extent.offset;
725 784
726 ceph_osdc_put_request(orig_req); 785 ceph_osdc_put_request(orig_req);
727 786
@@ -733,7 +792,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
733out: 792out:
734 if (ret < 0) { 793 if (ret < 0) {
735 req->r_result = ret; 794 req->r_result = ret;
736 ceph_aio_complete_req(req, NULL); 795 ceph_aio_complete_req(req);
737 } 796 }
738 797
739 ceph_put_snap_context(snapc); 798 ceph_put_snap_context(snapc);
@@ -764,6 +823,8 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
764 list_add_tail(&req->r_unsafe_item, 823 list_add_tail(&req->r_unsafe_item,
765 &ci->i_unsafe_writes); 824 &ci->i_unsafe_writes);
766 spin_unlock(&ci->i_unsafe_lock); 825 spin_unlock(&ci->i_unsafe_lock);
826
827 complete_all(&req->r_completion);
767 } else { 828 } else {
768 spin_lock(&ci->i_unsafe_lock); 829 spin_lock(&ci->i_unsafe_lock);
769 list_del_init(&req->r_unsafe_item); 830 list_del_init(&req->r_unsafe_item);
@@ -875,14 +936,12 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
875 (pos+len) | (PAGE_SIZE - 1)); 936 (pos+len) | (PAGE_SIZE - 1));
876 937
877 osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); 938 osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
939 req->r_mtime = mtime;
878 } 940 }
879 941
880
881 osd_req_op_extent_osd_data_pages(req, 0, pages, len, start, 942 osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
882 false, false); 943 false, false);
883 944
884 ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
885
886 if (aio_req) { 945 if (aio_req) {
887 aio_req->total_len += len; 946 aio_req->total_len += len;
888 aio_req->num_reqs++; 947 aio_req->num_reqs++;
@@ -956,7 +1015,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
956 req, false); 1015 req, false);
957 if (ret < 0) { 1016 if (ret < 0) {
958 req->r_result = ret; 1017 req->r_result = ret;
959 ceph_aio_complete_req(req, NULL); 1018 ceph_aio_complete_req(req);
960 } 1019 }
961 } 1020 }
962 return -EIOCBQUEUED; 1021 return -EIOCBQUEUED;
@@ -1067,9 +1126,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
1067 osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, 1126 osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
1068 false, true); 1127 false, true);
1069 1128
1070 /* BUG_ON(vino.snap != CEPH_NOSNAP); */ 1129 req->r_mtime = mtime;
1071 ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
1072
1073 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 1130 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
1074 if (!ret) 1131 if (!ret)
1075 ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 1132 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1524,9 +1581,7 @@ static int ceph_zero_partial_object(struct inode *inode,
1524 goto out; 1581 goto out;
1525 } 1582 }
1526 1583
1527 ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap, 1584 req->r_mtime = inode->i_mtime;
1528 &inode->i_mtime);
1529
1530 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 1585 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
1531 if (!ret) { 1586 if (!ret) {
1532 ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 1587 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e669cfa9d793..f059b5997072 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -11,6 +11,7 @@
11#include <linux/xattr.h> 11#include <linux/xattr.h>
12#include <linux/posix_acl.h> 12#include <linux/posix_acl.h>
13#include <linux/random.h> 13#include <linux/random.h>
14#include <linux/sort.h>
14 15
15#include "super.h" 16#include "super.h"
16#include "mds_client.h" 17#include "mds_client.h"
@@ -254,6 +255,9 @@ static int ceph_fill_dirfrag(struct inode *inode,
254 diri_auth = ci->i_auth_cap->mds; 255 diri_auth = ci->i_auth_cap->mds;
255 spin_unlock(&ci->i_ceph_lock); 256 spin_unlock(&ci->i_ceph_lock);
256 257
258 if (mds == -1) /* CDIR_AUTH_PARENT */
259 mds = diri_auth;
260
257 mutex_lock(&ci->i_fragtree_mutex); 261 mutex_lock(&ci->i_fragtree_mutex);
258 if (ndist == 0 && mds == diri_auth) { 262 if (ndist == 0 && mds == diri_auth) {
259 /* no delegation info needed. */ 263 /* no delegation info needed. */
@@ -300,20 +304,38 @@ out:
300 return err; 304 return err;
301} 305}
302 306
307static int frag_tree_split_cmp(const void *l, const void *r)
308{
309 struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l;
310 struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r;
311 return ceph_frag_compare(ls->frag, rs->frag);
312}
313
314static bool is_frag_child(u32 f, struct ceph_inode_frag *frag)
315{
316 if (!frag)
317 return f == ceph_frag_make(0, 0);
318 if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by)
319 return false;
320 return ceph_frag_contains_value(frag->frag, ceph_frag_value(f));
321}
322
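
A frag_t packs the split depth into the top 8 bits and a left-aligned 24-bit value below it; a frag is a direct child of another iff it carries exactly split_by more bits and its value falls inside the parent's mask. A standalone sketch, with the helpers re-derived from include/linux/ceph/ceph_frag.h (simplified, illustrative only):

    #include <stdio.h>
    #include <stdint.h>

    static uint32_t frag_make(uint32_t b, uint32_t v)
    {
        return (b << 24) | (v & (0xffffffu << (24 - b)) & 0xffffffu);
    }
    static uint32_t frag_bits(uint32_t f)  { return f >> 24; }
    static uint32_t frag_value(uint32_t f) { return f & 0xffffffu; }
    static uint32_t frag_mask(uint32_t f)
    {
        return (0xffffffu << (24 - frag_bits(f))) & 0xffffffu;
    }
    static int frag_contains(uint32_t f, uint32_t v)
    {
        return (v & frag_mask(f)) == frag_value(f);
    }

    /* analogue of is_frag_child() above */
    static int is_child(uint32_t f, uint32_t parent, uint32_t split_by)
    {
        if (frag_bits(f) != frag_bits(parent) + split_by)
            return 0;
        return frag_contains(parent, frag_value(f));
    }

    int main(void)
    {
        uint32_t root  = frag_make(0, 0);        /* "*"  */
        uint32_t left  = frag_make(1, 0x000000); /* "0*" */
        uint32_t right = frag_make(1, 0x800000); /* "1*" */
        printf("%d %d %d\n",
               is_child(left, root, 1),   /* 1 */
               is_child(right, root, 1),  /* 1 */
               is_child(right, left, 1)); /* 0 */
        return 0;
    }
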
303static int ceph_fill_fragtree(struct inode *inode, 323static int ceph_fill_fragtree(struct inode *inode,
304 struct ceph_frag_tree_head *fragtree, 324 struct ceph_frag_tree_head *fragtree,
305 struct ceph_mds_reply_dirfrag *dirinfo) 325 struct ceph_mds_reply_dirfrag *dirinfo)
306{ 326{
307 struct ceph_inode_info *ci = ceph_inode(inode); 327 struct ceph_inode_info *ci = ceph_inode(inode);
308 struct ceph_inode_frag *frag; 328 struct ceph_inode_frag *frag, *prev_frag = NULL;
309 struct rb_node *rb_node; 329 struct rb_node *rb_node;
310 int i; 330 unsigned i, split_by, nsplits;
311 u32 id, nsplits; 331 u32 id;
312 bool update = false; 332 bool update = false;
313 333
314 mutex_lock(&ci->i_fragtree_mutex); 334 mutex_lock(&ci->i_fragtree_mutex);
315 nsplits = le32_to_cpu(fragtree->nsplits); 335 nsplits = le32_to_cpu(fragtree->nsplits);
316 if (nsplits) { 336 if (nsplits != ci->i_fragtree_nsplits) {
337 update = true;
338 } else if (nsplits) {
317 i = prandom_u32() % nsplits; 339 i = prandom_u32() % nsplits;
318 id = le32_to_cpu(fragtree->splits[i].frag); 340 id = le32_to_cpu(fragtree->splits[i].frag);
319 if (!__ceph_find_frag(ci, id)) 341 if (!__ceph_find_frag(ci, id))
@@ -332,10 +354,22 @@ static int ceph_fill_fragtree(struct inode *inode,
332 if (!update) 354 if (!update)
333 goto out_unlock; 355 goto out_unlock;
334 356
357 if (nsplits > 1) {
358 sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]),
359 frag_tree_split_cmp, NULL);
360 }
361
335 dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode)); 362 dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode));
336 rb_node = rb_first(&ci->i_fragtree); 363 rb_node = rb_first(&ci->i_fragtree);
337 for (i = 0; i < nsplits; i++) { 364 for (i = 0; i < nsplits; i++) {
338 id = le32_to_cpu(fragtree->splits[i].frag); 365 id = le32_to_cpu(fragtree->splits[i].frag);
366 split_by = le32_to_cpu(fragtree->splits[i].by);
367 if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) {
368 pr_err("fill_fragtree %llx.%llx invalid split %d/%u, "
369 "frag %x split by %d\n", ceph_vinop(inode),
370 i, nsplits, id, split_by);
371 continue;
372 }
339 frag = NULL; 373 frag = NULL;
340 while (rb_node) { 374 while (rb_node) {
341 frag = rb_entry(rb_node, struct ceph_inode_frag, node); 375 frag = rb_entry(rb_node, struct ceph_inode_frag, node);
@@ -347,8 +381,14 @@ static int ceph_fill_fragtree(struct inode *inode,
347 break; 381 break;
348 } 382 }
349 rb_node = rb_next(rb_node); 383 rb_node = rb_next(rb_node);
350 rb_erase(&frag->node, &ci->i_fragtree); 384 /* delete stale split/leaf node */
351 kfree(frag); 385 if (frag->split_by > 0 ||
386 !is_frag_child(frag->frag, prev_frag)) {
387 rb_erase(&frag->node, &ci->i_fragtree);
388 if (frag->split_by > 0)
389 ci->i_fragtree_nsplits--;
390 kfree(frag);
391 }
352 frag = NULL; 392 frag = NULL;
353 } 393 }
354 if (!frag) { 394 if (!frag) {
@@ -356,14 +396,23 @@ static int ceph_fill_fragtree(struct inode *inode,
356 if (IS_ERR(frag)) 396 if (IS_ERR(frag))
357 continue; 397 continue;
358 } 398 }
359 frag->split_by = le32_to_cpu(fragtree->splits[i].by); 399 if (frag->split_by == 0)
400 ci->i_fragtree_nsplits++;
401 frag->split_by = split_by;
360 dout(" frag %x split by %d\n", frag->frag, frag->split_by); 402 dout(" frag %x split by %d\n", frag->frag, frag->split_by);
403 prev_frag = frag;
361 } 404 }
362 while (rb_node) { 405 while (rb_node) {
363 frag = rb_entry(rb_node, struct ceph_inode_frag, node); 406 frag = rb_entry(rb_node, struct ceph_inode_frag, node);
364 rb_node = rb_next(rb_node); 407 rb_node = rb_next(rb_node);
365 rb_erase(&frag->node, &ci->i_fragtree); 408 /* delete stale split/leaf node */
366 kfree(frag); 409 if (frag->split_by > 0 ||
410 !is_frag_child(frag->frag, prev_frag)) {
411 rb_erase(&frag->node, &ci->i_fragtree);
412 if (frag->split_by > 0)
413 ci->i_fragtree_nsplits--;
414 kfree(frag);
415 }
367 } 416 }
368out_unlock: 417out_unlock:
369 mutex_unlock(&ci->i_fragtree_mutex); 418 mutex_unlock(&ci->i_fragtree_mutex);
@@ -513,6 +562,7 @@ void ceph_destroy_inode(struct inode *inode)
513 rb_erase(n, &ci->i_fragtree); 562 rb_erase(n, &ci->i_fragtree);
514 kfree(frag); 563 kfree(frag);
515 } 564 }
565 ci->i_fragtree_nsplits = 0;
516 566
517 __ceph_destroy_xattrs(ci); 567 __ceph_destroy_xattrs(ci);
518 if (ci->i_xattrs.blob) 568 if (ci->i_xattrs.blob)
@@ -533,6 +583,11 @@ int ceph_drop_inode(struct inode *inode)
533 return 1; 583 return 1;
534} 584}
535 585
586static inline blkcnt_t calc_inode_blocks(u64 size)
587{
588 return (size + (1<<9) - 1) >> 9;
589}
590
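
calc_inode_blocks() rounds a byte count up to 512-byte units, matching i_blocks' sector-sized accounting. A quick check:

    #include <stdio.h>
    #include <stdint.h>

    static uint64_t calc_inode_blocks(uint64_t size)
    {
        return (size + (1 << 9) - 1) >> 9;  /* round up to 512-byte units */
    }

    int main(void)
    {
        printf("%llu %llu %llu\n",
               (unsigned long long)calc_inode_blocks(0),    /* 0 */
               (unsigned long long)calc_inode_blocks(512),  /* 1 */
               (unsigned long long)calc_inode_blocks(513)); /* 2 */
        return 0;
    }
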
536/* 591/*
537 * Helpers to fill in size, ctime, mtime, and atime. We have to be 592 * Helpers to fill in size, ctime, mtime, and atime. We have to be
538 * careful because either the client or MDS may have more up to date 593 * careful because either the client or MDS may have more up to date
@@ -555,7 +610,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
555 size = 0; 610 size = 0;
556 } 611 }
557 i_size_write(inode, size); 612 i_size_write(inode, size);
558 inode->i_blocks = (size + (1<<9) - 1) >> 9; 613 inode->i_blocks = calc_inode_blocks(size);
559 ci->i_reported_size = size; 614 ci->i_reported_size = size;
560 if (truncate_seq != ci->i_truncate_seq) { 615 if (truncate_seq != ci->i_truncate_seq) {
561 dout("truncate_seq %u -> %u\n", 616 dout("truncate_seq %u -> %u\n",
@@ -814,9 +869,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
814 869
815 spin_unlock(&ci->i_ceph_lock); 870 spin_unlock(&ci->i_ceph_lock);
816 871
817 err = -EINVAL; 872 if (symlen != i_size_read(inode)) {
818 if (WARN_ON(symlen != i_size_read(inode))) 873 pr_err("fill_inode %llx.%llx BAD symlink "
819 goto out; 874 "size %lld\n", ceph_vinop(inode),
875 i_size_read(inode));
876 i_size_write(inode, symlen);
877 inode->i_blocks = calc_inode_blocks(symlen);
878 }
820 879
821 err = -ENOMEM; 880 err = -ENOMEM;
822 sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); 881 sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
@@ -1309,12 +1368,13 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
1309 int i, err = 0; 1368 int i, err = 0;
1310 1369
1311 for (i = 0; i < rinfo->dir_nr; i++) { 1370 for (i = 0; i < rinfo->dir_nr; i++) {
1371 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
1312 struct ceph_vino vino; 1372 struct ceph_vino vino;
1313 struct inode *in; 1373 struct inode *in;
1314 int rc; 1374 int rc;
1315 1375
1316 vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); 1376 vino.ino = le64_to_cpu(rde->inode.in->ino);
1317 vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); 1377 vino.snap = le64_to_cpu(rde->inode.in->snapid);
1318 1378
1319 in = ceph_get_inode(req->r_dentry->d_sb, vino); 1379 in = ceph_get_inode(req->r_dentry->d_sb, vino);
1320 if (IS_ERR(in)) { 1380 if (IS_ERR(in)) {
@@ -1322,14 +1382,14 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
1322 dout("new_inode badness got %d\n", err); 1382 dout("new_inode badness got %d\n", err);
1323 continue; 1383 continue;
1324 } 1384 }
1325 rc = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, 1385 rc = fill_inode(in, NULL, &rde->inode, NULL, session,
1326 req->r_request_started, -1, 1386 req->r_request_started, -1,
1327 &req->r_caps_reservation); 1387 &req->r_caps_reservation);
1328 if (rc < 0) { 1388 if (rc < 0) {
1329 pr_err("fill_inode badness on %p got %d\n", in, rc); 1389 pr_err("fill_inode badness on %p got %d\n", in, rc);
1330 err = rc; 1390 err = rc;
1331 continue;
1332 } 1391 }
1392 iput(in);
1333 } 1393 }
1334 1394
1335 return err; 1395 return err;
@@ -1387,6 +1447,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1387 struct ceph_mds_session *session) 1447 struct ceph_mds_session *session)
1388{ 1448{
1389 struct dentry *parent = req->r_dentry; 1449 struct dentry *parent = req->r_dentry;
1450 struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
1390 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; 1451 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1391 struct qstr dname; 1452 struct qstr dname;
1392 struct dentry *dn; 1453 struct dentry *dn;
@@ -1394,22 +1455,27 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1394 int err = 0, skipped = 0, ret, i; 1455 int err = 0, skipped = 0, ret, i;
1395 struct inode *snapdir = NULL; 1456 struct inode *snapdir = NULL;
1396 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; 1457 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
1397 struct ceph_dentry_info *di;
1398 u32 frag = le32_to_cpu(rhead->args.readdir.frag); 1458 u32 frag = le32_to_cpu(rhead->args.readdir.frag);
1459 u32 last_hash = 0;
1460 u32 fpos_offset;
1399 struct ceph_readdir_cache_control cache_ctl = {}; 1461 struct ceph_readdir_cache_control cache_ctl = {};
1400 1462
1401 if (req->r_aborted) 1463 if (req->r_aborted)
1402 return readdir_prepopulate_inodes_only(req, session); 1464 return readdir_prepopulate_inodes_only(req, session);
1403 1465
1466 if (rinfo->hash_order && req->r_path2) {
1467 last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
1468 req->r_path2, strlen(req->r_path2));
1469 last_hash = ceph_frag_value(last_hash);
1470 }
1471
1404 if (rinfo->dir_dir && 1472 if (rinfo->dir_dir &&
1405 le32_to_cpu(rinfo->dir_dir->frag) != frag) { 1473 le32_to_cpu(rinfo->dir_dir->frag) != frag) {
1406 dout("readdir_prepopulate got new frag %x -> %x\n", 1474 dout("readdir_prepopulate got new frag %x -> %x\n",
1407 frag, le32_to_cpu(rinfo->dir_dir->frag)); 1475 frag, le32_to_cpu(rinfo->dir_dir->frag));
1408 frag = le32_to_cpu(rinfo->dir_dir->frag); 1476 frag = le32_to_cpu(rinfo->dir_dir->frag);
1409 if (ceph_frag_is_leftmost(frag)) 1477 if (!rinfo->hash_order)
1410 req->r_readdir_offset = 2; 1478 req->r_readdir_offset = 2;
1411 else
1412 req->r_readdir_offset = 0;
1413 } 1479 }
1414 1480
1415 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { 1481 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
@@ -1427,24 +1493,37 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1427 if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) { 1493 if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) {
1428 /* note dir version at start of readdir so we can tell 1494 /* note dir version at start of readdir so we can tell
1429 * if any dentries get dropped */ 1495 * if any dentries get dropped */
1430 struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
1431 req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); 1496 req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
1432 req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count); 1497 req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count);
1433 req->r_readdir_cache_idx = 0; 1498 req->r_readdir_cache_idx = 0;
1434 } 1499 }
1435 1500
1436 cache_ctl.index = req->r_readdir_cache_idx; 1501 cache_ctl.index = req->r_readdir_cache_idx;
1502 fpos_offset = req->r_readdir_offset;
1437 1503
1438 /* FIXME: release caps/leases if error occurs */ 1504 /* FIXME: release caps/leases if error occurs */
1439 for (i = 0; i < rinfo->dir_nr; i++) { 1505 for (i = 0; i < rinfo->dir_nr; i++) {
1506 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
1440 struct ceph_vino vino; 1507 struct ceph_vino vino;
1441 1508
1442 dname.name = rinfo->dir_dname[i]; 1509 dname.name = rde->name;
1443 dname.len = rinfo->dir_dname_len[i]; 1510 dname.len = rde->name_len;
1444 dname.hash = full_name_hash(dname.name, dname.len); 1511 dname.hash = full_name_hash(dname.name, dname.len);
1445 1512
1446 vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); 1513 vino.ino = le64_to_cpu(rde->inode.in->ino);
1447 vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); 1514 vino.snap = le64_to_cpu(rde->inode.in->snapid);
1515
1516 if (rinfo->hash_order) {
1517 u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
1518 rde->name, rde->name_len);
1519 hash = ceph_frag_value(hash);
1520 if (hash != last_hash)
1521 fpos_offset = 2;
1522 last_hash = hash;
1523 rde->offset = ceph_make_fpos(hash, fpos_offset++, true);
1524 } else {
1525 rde->offset = ceph_make_fpos(frag, fpos_offset++, false);
1526 }
1448 1527
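
Under hash order the per-hash index restarts at 2 whenever the name hash changes (offsets 0 and 1 stay reserved for "." and ".."), so names that collide on the same hash get consecutive indices. A toy walk-through with invented hashes:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* invented (name, 24-bit hash) pairs, already in hash order */
        struct { const char *name; uint32_t hash; } ents[] = {
            { "a", 0x111111 },
            { "b", 0x222222 },
            { "c", 0x222222 },  /* collides with "b" */
            { "d", 0x333333 },
        };
        uint32_t last_hash = 0;
        unsigned off = 2;
        for (int i = 0; i < 4; i++) {
            if (ents[i].hash != last_hash)
                off = 2;        /* new hash: restart per-hash index */
            last_hash = ents[i].hash;
            printf("%s -> hash 0x%06x idx %u\n",
                   ents[i].name, (unsigned)ents[i].hash, off++);
        }
        return 0;
    }
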
1449retry_lookup: 1528retry_lookup:
1450 dn = d_lookup(parent, &dname); 1529 dn = d_lookup(parent, &dname);
@@ -1490,7 +1569,7 @@ retry_lookup:
1490 } 1569 }
1491 } 1570 }
1492 1571
1493 ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, 1572 ret = fill_inode(in, NULL, &rde->inode, NULL, session,
1494 req->r_request_started, -1, 1573 req->r_request_started, -1,
1495 &req->r_caps_reservation); 1574 &req->r_caps_reservation);
1496 if (ret < 0) { 1575 if (ret < 0) {
@@ -1523,11 +1602,9 @@ retry_lookup:
1523 dn = realdn; 1602 dn = realdn;
1524 } 1603 }
1525 1604
1526 di = dn->d_fsdata; 1605 ceph_dentry(dn)->offset = rde->offset;
1527 di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
1528 1606
1529 update_dentry_lease(dn, rinfo->dir_dlease[i], 1607 update_dentry_lease(dn, rde->lease, req->r_session,
1530 req->r_session,
1531 req->r_request_started); 1608 req->r_request_started);
1532 1609
1533 if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { 1610 if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
@@ -1562,7 +1639,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
1562 spin_lock(&ci->i_ceph_lock); 1639 spin_lock(&ci->i_ceph_lock);
1563 dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); 1640 dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
1564 i_size_write(inode, size); 1641 i_size_write(inode, size);
1565 inode->i_blocks = (size + (1 << 9) - 1) >> 9; 1642 inode->i_blocks = calc_inode_blocks(size);
1566 1643
1567 /* tell the MDS if we are approaching max_size */ 1644 /* tell the MDS if we are approaching max_size */
1568 if ((size << 1) >= ci->i_max_size && 1645 if ((size << 1) >= ci->i_max_size &&
@@ -1624,10 +1701,21 @@ static void ceph_invalidate_work(struct work_struct *work)
1624 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, 1701 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1625 i_pg_inv_work); 1702 i_pg_inv_work);
1626 struct inode *inode = &ci->vfs_inode; 1703 struct inode *inode = &ci->vfs_inode;
1704 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1627 u32 orig_gen; 1705 u32 orig_gen;
1628 int check = 0; 1706 int check = 0;
1629 1707
1630 mutex_lock(&ci->i_truncate_mutex); 1708 mutex_lock(&ci->i_truncate_mutex);
1709
1710 if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
1711 pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n",
1712 inode, ceph_ino(inode));
1713 mapping_set_error(inode->i_mapping, -EIO);
1714 truncate_pagecache(inode, 0);
1715 mutex_unlock(&ci->i_truncate_mutex);
1716 goto out;
1717 }
1718
1631 spin_lock(&ci->i_ceph_lock); 1719 spin_lock(&ci->i_ceph_lock);
1632 dout("invalidate_pages %p gen %d revoking %d\n", inode, 1720 dout("invalidate_pages %p gen %d revoking %d\n", inode,
1633 ci->i_rdcache_gen, ci->i_rdcache_revoking); 1721 ci->i_rdcache_gen, ci->i_rdcache_revoking);
@@ -1641,7 +1729,9 @@ static void ceph_invalidate_work(struct work_struct *work)
1641 orig_gen = ci->i_rdcache_gen; 1729 orig_gen = ci->i_rdcache_gen;
1642 spin_unlock(&ci->i_ceph_lock); 1730 spin_unlock(&ci->i_ceph_lock);
1643 1731
1644 truncate_pagecache(inode, 0); 1732 if (invalidate_inode_pages2(inode->i_mapping) < 0) {
1733 pr_err("invalidate_pages %p fails\n", inode);
1734 }
1645 1735
1646 spin_lock(&ci->i_ceph_lock); 1736 spin_lock(&ci->i_ceph_lock);
1647 if (orig_gen == ci->i_rdcache_gen && 1737 if (orig_gen == ci->i_rdcache_gen &&
@@ -1920,8 +2010,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
1920 if ((issued & CEPH_CAP_FILE_EXCL) && 2010 if ((issued & CEPH_CAP_FILE_EXCL) &&
1921 attr->ia_size > inode->i_size) { 2011 attr->ia_size > inode->i_size) {
1922 i_size_write(inode, attr->ia_size); 2012 i_size_write(inode, attr->ia_size);
1923 inode->i_blocks = 2013 inode->i_blocks = calc_inode_blocks(attr->ia_size);
1924 (attr->ia_size + (1 << 9) - 1) >> 9;
1925 inode->i_ctime = attr->ia_ctime; 2014 inode->i_ctime = attr->ia_ctime;
1926 ci->i_reported_size = attr->ia_size; 2015 ci->i_reported_size = attr->ia_size;
1927 dirtied |= CEPH_CAP_FILE_EXCL; 2016 dirtied |= CEPH_CAP_FILE_EXCL;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index f851d8d70158..be6b1657b1af 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -193,12 +193,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
193 if (copy_from_user(&dl, arg, sizeof(dl))) 193 if (copy_from_user(&dl, arg, sizeof(dl)))
194 return -EFAULT; 194 return -EFAULT;
195 195
196 down_read(&osdc->map_sem); 196 down_read(&osdc->lock);
197 r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, 197 r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
198 &dl.object_no, &dl.object_offset, 198 &dl.object_no, &dl.object_offset,
199 &olen); 199 &olen);
200 if (r < 0) { 200 if (r < 0) {
201 up_read(&osdc->map_sem); 201 up_read(&osdc->lock);
202 return -EIO; 202 return -EIO;
203 } 203 }
204 dl.file_offset -= dl.object_offset; 204 dl.file_offset -= dl.object_offset;
@@ -213,15 +213,15 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
213 ceph_ino(inode), dl.object_no); 213 ceph_ino(inode), dl.object_no);
214 214
215 oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); 215 oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
216 ceph_oid_set_name(&oid, dl.object_name); 216 ceph_oid_printf(&oid, "%s", dl.object_name);
217 217
218 r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); 218 r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid);
219 if (r < 0) { 219 if (r < 0) {
220 up_read(&osdc->map_sem); 220 up_read(&osdc->lock);
221 return r; 221 return r;
222 } 222 }
223 223
224 dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); 224 dl.osd = ceph_pg_to_acting_primary(osdc->osdmap, &pgid);
225 if (dl.osd >= 0) { 225 if (dl.osd >= 0) {
226 struct ceph_entity_addr *a = 226 struct ceph_entity_addr *a =
227 ceph_osd_addr(osdc->osdmap, dl.osd); 227 ceph_osd_addr(osdc->osdmap, dl.osd);
@@ -230,7 +230,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
230 } else { 230 } else {
231 memset(&dl.osd_addr, 0, sizeof(dl.osd_addr)); 231 memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
232 } 232 }
233 up_read(&osdc->map_sem); 233 up_read(&osdc->lock);
234 234
235 /* send result back to user */ 235 /* send result back to user */
236 if (copy_to_user(arg, &dl, sizeof(dl))) 236 if (copy_to_user(arg, &dl, sizeof(dl)))
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 85b8517f17a0..2103b823bec0 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -181,17 +181,18 @@ static int parse_reply_info_dir(void **p, void *end,
181 181
182 ceph_decode_need(p, end, sizeof(num) + 2, bad); 182 ceph_decode_need(p, end, sizeof(num) + 2, bad);
183 num = ceph_decode_32(p); 183 num = ceph_decode_32(p);
184 info->dir_end = ceph_decode_8(p); 184 {
185 info->dir_complete = ceph_decode_8(p); 185 u16 flags = ceph_decode_16(p);
186 info->dir_end = !!(flags & CEPH_READDIR_FRAG_END);
187 info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
188 info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER);
189 }
186 if (num == 0) 190 if (num == 0)
187 goto done; 191 goto done;
188 192
189 BUG_ON(!info->dir_in); 193 BUG_ON(!info->dir_entries);
190 info->dir_dname = (void *)(info->dir_in + num); 194 if ((unsigned long)(info->dir_entries + num) >
191 info->dir_dname_len = (void *)(info->dir_dname + num); 195 (unsigned long)info->dir_entries + info->dir_buf_size) {
192 info->dir_dlease = (void *)(info->dir_dname_len + num);
193 if ((unsigned long)(info->dir_dlease + num) >
194 (unsigned long)info->dir_in + info->dir_buf_size) {
195 pr_err("dir contents are larger than expected\n"); 196 pr_err("dir contents are larger than expected\n");
196 WARN_ON(1); 197 WARN_ON(1);
197 goto bad; 198 goto bad;
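
The old reply format carried two consecutive u8 fields, dir_end then dir_complete; decoding the same two bytes as one little-endian u16 keeps those at bit 0 and bit 8 and frees the remaining bits for new flags such as hash_order. A decoding sketch (the flag values assume the CEPH_READDIR_* definitions in ceph_fs.h: bits 0, 8 and 9):

    #include <stdio.h>
    #include <stdint.h>

    #define READDIR_FRAG_END      (1 << 0) /* old u8 #1, bit 0 */
    #define READDIR_FRAG_COMPLETE (1 << 8) /* old u8 #2, bit 0 -> bit 8 */
    #define READDIR_HASH_ORDER    (1 << 9) /* new flag */

    int main(void)
    {
        uint16_t flags = READDIR_FRAG_END | READDIR_HASH_ORDER;

        printf("end=%d complete=%d hash_order=%d\n",
               !!(flags & READDIR_FRAG_END),
               !!(flags & READDIR_FRAG_COMPLETE),
               !!(flags & READDIR_HASH_ORDER));
        return 0;
    }
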
@@ -199,21 +200,23 @@ static int parse_reply_info_dir(void **p, void *end,
199 200
200 info->dir_nr = num; 201 info->dir_nr = num;
201 while (num) { 202 while (num) {
203 struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
202 /* dentry */ 204 /* dentry */
203 ceph_decode_need(p, end, sizeof(u32)*2, bad); 205 ceph_decode_need(p, end, sizeof(u32)*2, bad);
204 info->dir_dname_len[i] = ceph_decode_32(p); 206 rde->name_len = ceph_decode_32(p);
205 ceph_decode_need(p, end, info->dir_dname_len[i], bad); 207 ceph_decode_need(p, end, rde->name_len, bad);
206 info->dir_dname[i] = *p; 208 rde->name = *p;
207 *p += info->dir_dname_len[i]; 209 *p += rde->name_len;
208 dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i], 210 dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name);
209 info->dir_dname[i]); 211 rde->lease = *p;
210 info->dir_dlease[i] = *p;
211 *p += sizeof(struct ceph_mds_reply_lease); 212 *p += sizeof(struct ceph_mds_reply_lease);
212 213
213 /* inode */ 214 /* inode */
214 err = parse_reply_info_in(p, end, &info->dir_in[i], features); 215 err = parse_reply_info_in(p, end, &rde->inode, features);
215 if (err < 0) 216 if (err < 0)
216 goto out_bad; 217 goto out_bad;
218 /* ceph_readdir_prepopulate() will update it */
219 rde->offset = 0;
217 i++; 220 i++;
218 num--; 221 num--;
219 } 222 }
@@ -345,9 +348,9 @@ out_bad:
345 348
346static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) 349static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
347{ 350{
348 if (!info->dir_in) 351 if (!info->dir_entries)
349 return; 352 return;
350 free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size)); 353 free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
351} 354}
352 355
353 356
@@ -567,51 +570,23 @@ void ceph_mdsc_release_request(struct kref *kref)
567 kfree(req); 570 kfree(req);
568} 571}
569 572
573DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)
574
570/* 575/*
571 * lookup session, bump ref if found. 576 * lookup session, bump ref if found.
572 * 577 *
573 * called under mdsc->mutex. 578 * called under mdsc->mutex.
574 */ 579 */
575static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc, 580static struct ceph_mds_request *
576 u64 tid) 581lookup_get_request(struct ceph_mds_client *mdsc, u64 tid)
577{ 582{
578 struct ceph_mds_request *req; 583 struct ceph_mds_request *req;
579 struct rb_node *n = mdsc->request_tree.rb_node;
580
581 while (n) {
582 req = rb_entry(n, struct ceph_mds_request, r_node);
583 if (tid < req->r_tid)
584 n = n->rb_left;
585 else if (tid > req->r_tid)
586 n = n->rb_right;
587 else {
588 ceph_mdsc_get_request(req);
589 return req;
590 }
591 }
592 return NULL;
593}
594 584
595static void __insert_request(struct ceph_mds_client *mdsc, 585 req = lookup_request(&mdsc->request_tree, tid);
596 struct ceph_mds_request *new) 586 if (req)
597{ 587 ceph_mdsc_get_request(req);
598 struct rb_node **p = &mdsc->request_tree.rb_node;
599 struct rb_node *parent = NULL;
600 struct ceph_mds_request *req = NULL;
601 588
602 while (*p) { 589 return req;
603 parent = *p;
604 req = rb_entry(parent, struct ceph_mds_request, r_node);
605 if (new->r_tid < req->r_tid)
606 p = &(*p)->rb_left;
607 else if (new->r_tid > req->r_tid)
608 p = &(*p)->rb_right;
609 else
610 BUG();
611 }
612
613 rb_link_node(&new->r_node, parent, p);
614 rb_insert_color(&new->r_node, &mdsc->request_tree);
615} 590}
616 591
617/* 592/*
@@ -630,7 +605,7 @@ static void __register_request(struct ceph_mds_client *mdsc,
630 req->r_num_caps); 605 req->r_num_caps);
631 dout("__register_request %p tid %lld\n", req, req->r_tid); 606 dout("__register_request %p tid %lld\n", req, req->r_tid);
632 ceph_mdsc_get_request(req); 607 ceph_mdsc_get_request(req);
633 __insert_request(mdsc, req); 608 insert_request(&mdsc->request_tree, req);
634 609
635 req->r_uid = current_fsuid(); 610 req->r_uid = current_fsuid();
636 req->r_gid = current_fsgid(); 611 req->r_gid = current_fsgid();
@@ -663,8 +638,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
663 } 638 }
664 } 639 }
665 640
666 rb_erase(&req->r_node, &mdsc->request_tree); 641 erase_request(&mdsc->request_tree, req);
667 RB_CLEAR_NODE(&req->r_node);
668 642
669 if (req->r_unsafe_dir && req->r_got_unsafe) { 643 if (req->r_unsafe_dir && req->r_got_unsafe) {
670 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); 644 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
@@ -868,12 +842,14 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
868 int metadata_bytes = 0; 842 int metadata_bytes = 0;
869 int metadata_key_count = 0; 843 int metadata_key_count = 0;
870 struct ceph_options *opt = mdsc->fsc->client->options; 844 struct ceph_options *opt = mdsc->fsc->client->options;
845 struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
871 void *p; 846 void *p;
872 847
873 const char* metadata[][2] = { 848 const char* metadata[][2] = {
874 {"hostname", utsname()->nodename}, 849 {"hostname", utsname()->nodename},
875 {"kernel_version", utsname()->release}, 850 {"kernel_version", utsname()->release},
876 {"entity_id", opt->name ? opt->name : ""}, 851 {"entity_id", opt->name ? : ""},
852 {"root", fsopt->server_path ? : "/"},
877 {NULL, NULL} 853 {NULL, NULL}
878 }; 854 };
879 855
@@ -1149,9 +1125,11 @@ out:
1149static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, 1125static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1150 void *arg) 1126 void *arg)
1151{ 1127{
1128 struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg;
1152 struct ceph_inode_info *ci = ceph_inode(inode); 1129 struct ceph_inode_info *ci = ceph_inode(inode);
1153 LIST_HEAD(to_remove); 1130 LIST_HEAD(to_remove);
1154 int drop = 0; 1131 bool drop = false;
1132 bool invalidate = false;
1155 1133
1156 dout("removing cap %p, ci is %p, inode is %p\n", 1134 dout("removing cap %p, ci is %p, inode is %p\n",
1157 cap, ci, &ci->vfs_inode); 1135 cap, ci, &ci->vfs_inode);
@@ -1159,8 +1137,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1159 __ceph_remove_cap(cap, false); 1137 __ceph_remove_cap(cap, false);
1160 if (!ci->i_auth_cap) { 1138 if (!ci->i_auth_cap) {
1161 struct ceph_cap_flush *cf; 1139 struct ceph_cap_flush *cf;
1162 struct ceph_mds_client *mdsc = 1140 struct ceph_mds_client *mdsc = fsc->mdsc;
1163 ceph_sb_to_client(inode->i_sb)->mdsc; 1141
1142 ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
1143
1144 if (ci->i_wrbuffer_ref > 0 &&
1145 ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
1146 invalidate = true;
1164 1147
1165 while (true) { 1148 while (true) {
1166 struct rb_node *n = rb_first(&ci->i_cap_flush_tree); 1149 struct rb_node *n = rb_first(&ci->i_cap_flush_tree);
@@ -1183,7 +1166,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1183 inode, ceph_ino(inode)); 1166 inode, ceph_ino(inode));
1184 ci->i_dirty_caps = 0; 1167 ci->i_dirty_caps = 0;
1185 list_del_init(&ci->i_dirty_item); 1168 list_del_init(&ci->i_dirty_item);
1186 drop = 1; 1169 drop = true;
1187 } 1170 }
1188 if (!list_empty(&ci->i_flushing_item)) { 1171 if (!list_empty(&ci->i_flushing_item)) {
1189 pr_warn_ratelimited( 1172 pr_warn_ratelimited(
@@ -1193,7 +1176,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1193 ci->i_flushing_caps = 0; 1176 ci->i_flushing_caps = 0;
1194 list_del_init(&ci->i_flushing_item); 1177 list_del_init(&ci->i_flushing_item);
1195 mdsc->num_cap_flushing--; 1178 mdsc->num_cap_flushing--;
1196 drop = 1; 1179 drop = true;
1197 } 1180 }
1198 spin_unlock(&mdsc->cap_dirty_lock); 1181 spin_unlock(&mdsc->cap_dirty_lock);
1199 1182
@@ -1210,7 +1193,11 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1210 list_del(&cf->list); 1193 list_del(&cf->list);
1211 ceph_free_cap_flush(cf); 1194 ceph_free_cap_flush(cf);
1212 } 1195 }
1213 while (drop--) 1196
1197 wake_up_all(&ci->i_cap_wq);
1198 if (invalidate)
1199 ceph_queue_invalidate(inode);
1200 if (drop)
1214 iput(inode); 1201 iput(inode);
1215 return 0; 1202 return 0;
1216} 1203}
@@ -1220,12 +1207,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1220 */ 1207 */
1221static void remove_session_caps(struct ceph_mds_session *session) 1208static void remove_session_caps(struct ceph_mds_session *session)
1222{ 1209{
1210 struct ceph_fs_client *fsc = session->s_mdsc->fsc;
1211 struct super_block *sb = fsc->sb;
1223 dout("remove_session_caps on %p\n", session); 1212 dout("remove_session_caps on %p\n", session);
1224 iterate_session_caps(session, remove_session_caps_cb, NULL); 1213 iterate_session_caps(session, remove_session_caps_cb, fsc);
1225 1214
1226 spin_lock(&session->s_cap_lock); 1215 spin_lock(&session->s_cap_lock);
1227 if (session->s_nr_caps > 0) { 1216 if (session->s_nr_caps > 0) {
1228 struct super_block *sb = session->s_mdsc->fsc->sb;
1229 struct inode *inode; 1217 struct inode *inode;
1230 struct ceph_cap *cap, *prev = NULL; 1218 struct ceph_cap *cap, *prev = NULL;
1231 struct ceph_vino vino; 1219 struct ceph_vino vino;
@@ -1270,13 +1258,13 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
1270{ 1258{
1271 struct ceph_inode_info *ci = ceph_inode(inode); 1259 struct ceph_inode_info *ci = ceph_inode(inode);
1272 1260
1273 wake_up_all(&ci->i_cap_wq);
1274 if (arg) { 1261 if (arg) {
1275 spin_lock(&ci->i_ceph_lock); 1262 spin_lock(&ci->i_ceph_lock);
1276 ci->i_wanted_max_size = 0; 1263 ci->i_wanted_max_size = 0;
1277 ci->i_requested_max_size = 0; 1264 ci->i_requested_max_size = 0;
1278 spin_unlock(&ci->i_ceph_lock); 1265 spin_unlock(&ci->i_ceph_lock);
1279 } 1266 }
1267 wake_up_all(&ci->i_cap_wq);
1280 return 0; 1268 return 0;
1281} 1269}
1282 1270
@@ -1671,8 +1659,7 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
1671 struct ceph_inode_info *ci = ceph_inode(dir); 1659 struct ceph_inode_info *ci = ceph_inode(dir);
1672 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; 1660 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1673 struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; 1661 struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
1674 size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) + 1662 size_t size = sizeof(struct ceph_mds_reply_dir_entry);
1675 sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease);
1676 int order, num_entries; 1663 int order, num_entries;
1677 1664
1678 spin_lock(&ci->i_ceph_lock); 1665 spin_lock(&ci->i_ceph_lock);
@@ -1683,14 +1670,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
1683 1670
1684 order = get_order(size * num_entries); 1671 order = get_order(size * num_entries);
1685 while (order >= 0) { 1672 while (order >= 0) {
1686 rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL | 1673 rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL |
1687 __GFP_NOWARN, 1674 __GFP_NOWARN,
1688 order); 1675 order);
1689 if (rinfo->dir_in) 1676 if (rinfo->dir_entries)
1690 break; 1677 break;
1691 order--; 1678 order--;
1692 } 1679 }
1693 if (!rinfo->dir_in) 1680 if (!rinfo->dir_entries)
1694 return -ENOMEM; 1681 return -ENOMEM;
1695 1682
1696 num_entries = (PAGE_SIZE << order) / size; 1683 num_entries = (PAGE_SIZE << order) / size;
@@ -1722,6 +1709,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1722 INIT_LIST_HEAD(&req->r_unsafe_target_item); 1709 INIT_LIST_HEAD(&req->r_unsafe_target_item);
1723 req->r_fmode = -1; 1710 req->r_fmode = -1;
1724 kref_init(&req->r_kref); 1711 kref_init(&req->r_kref);
1712 RB_CLEAR_NODE(&req->r_node);
1725 INIT_LIST_HEAD(&req->r_wait); 1713 INIT_LIST_HEAD(&req->r_wait);
1726 init_completion(&req->r_completion); 1714 init_completion(&req->r_completion);
1727 init_completion(&req->r_safe_completion); 1715 init_completion(&req->r_safe_completion);
@@ -2414,7 +2402,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2414 /* get request, session */ 2402 /* get request, session */
2415 tid = le64_to_cpu(msg->hdr.tid); 2403 tid = le64_to_cpu(msg->hdr.tid);
2416 mutex_lock(&mdsc->mutex); 2404 mutex_lock(&mdsc->mutex);
2417 req = __lookup_request(mdsc, tid); 2405 req = lookup_get_request(mdsc, tid);
2418 if (!req) { 2406 if (!req) {
2419 dout("handle_reply on unknown tid %llu\n", tid); 2407 dout("handle_reply on unknown tid %llu\n", tid);
2420 mutex_unlock(&mdsc->mutex); 2408 mutex_unlock(&mdsc->mutex);
@@ -2604,7 +2592,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
2604 fwd_seq = ceph_decode_32(&p); 2592 fwd_seq = ceph_decode_32(&p);
2605 2593
2606 mutex_lock(&mdsc->mutex); 2594 mutex_lock(&mdsc->mutex);
2607 req = __lookup_request(mdsc, tid); 2595 req = lookup_get_request(mdsc, tid);
2608 if (!req) { 2596 if (!req) {
2609 dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); 2597 dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
2610 goto out; /* dup reply? */ 2598 goto out; /* dup reply? */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index ee69a537dba5..e7d38aac7109 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -47,6 +47,14 @@ struct ceph_mds_reply_info_in {
47 u32 pool_ns_len; 47 u32 pool_ns_len;
48}; 48};
49 49
50struct ceph_mds_reply_dir_entry {
51 char *name;
52 u32 name_len;
53 struct ceph_mds_reply_lease *lease;
54 struct ceph_mds_reply_info_in inode;
55 loff_t offset;
56};
57
50/* 58/*
51 * parsed info about an mds reply, including information about 59 * parsed info about an mds reply, including information about
52 * either: 1) the target inode and/or its parent directory and dentry, 60 * either: 1) the target inode and/or its parent directory and dentry,
@@ -73,11 +81,10 @@ struct ceph_mds_reply_info_parsed {
73 struct ceph_mds_reply_dirfrag *dir_dir; 81 struct ceph_mds_reply_dirfrag *dir_dir;
74 size_t dir_buf_size; 82 size_t dir_buf_size;
75 int dir_nr; 83 int dir_nr;
76 char **dir_dname; 84 bool dir_complete;
77 u32 *dir_dname_len; 85 bool dir_end;
78 struct ceph_mds_reply_lease **dir_dlease; 86 bool hash_order;
79 struct ceph_mds_reply_info_in *dir_in; 87 struct ceph_mds_reply_dir_entry *dir_entries;
80 u8 dir_complete, dir_end;
81 }; 88 };
82 89
83 /* for create results */ 90 /* for create results */
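
The four parallel readdir arrays (dir_dname, dir_dname_len, dir_dlease, dir_in) collapse into one dir_entries array, so each readdir result is a single struct and the reply buffer is sized as dir_nr * sizeof(struct ceph_mds_reply_dir_entry). A minimal consumer sketch, using only the fields declared above:

    /* sketch: walk a parsed readdir reply */
    static void walk_readdir(struct ceph_mds_reply_info_parsed *rinfo)
    {
            int i;

            for (i = 0; i < rinfo->dir_nr; i++) {
                    struct ceph_mds_reply_dir_entry *rde = &rinfo->dir_entries[i];

                    pr_debug("%.*s at offset %lld\n",
                             (int)rde->name_len, rde->name, rde->offset);
            }
    }

The per-entry offset, together with the new hash_order flag, is what the readdir code uses to hand out stable f_pos values (see ceph_make_fpos in super.h below).
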
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 261531e55e9d..8c3591a7fbae 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -54,16 +54,21 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
54 const void *start = *p; 54 const void *start = *p;
55 int i, j, n; 55 int i, j, n;
56 int err = -EINVAL; 56 int err = -EINVAL;
57 u16 version; 57 u8 mdsmap_v, mdsmap_cv;
58 58
59 m = kzalloc(sizeof(*m), GFP_NOFS); 59 m = kzalloc(sizeof(*m), GFP_NOFS);
60 if (m == NULL) 60 if (m == NULL)
61 return ERR_PTR(-ENOMEM); 61 return ERR_PTR(-ENOMEM);
62 62
63 ceph_decode_16_safe(p, end, version, bad); 63 ceph_decode_need(p, end, 1 + 1, bad);
64 if (version > 3) { 64 mdsmap_v = ceph_decode_8(p);
65 pr_warn("got mdsmap version %d > 3, failing", version); 65 mdsmap_cv = ceph_decode_8(p);
66 goto bad; 66 if (mdsmap_v >= 4) {
67 u32 mdsmap_len;
68 ceph_decode_32_safe(p, end, mdsmap_len, bad);
69 if (end < *p + mdsmap_len)
70 goto bad;
71 end = *p + mdsmap_len;
67 } 72 }
68 73
69 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); 74 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
@@ -87,16 +92,29 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
87 u32 namelen; 92 u32 namelen;
88 s32 mds, inc, state; 93 s32 mds, inc, state;
89 u64 state_seq; 94 u64 state_seq;
90 u8 infoversion; 95 u8 info_v;
96 void *info_end = NULL;
91 struct ceph_entity_addr addr; 97 struct ceph_entity_addr addr;
92 u32 num_export_targets; 98 u32 num_export_targets;
93 void *pexport_targets = NULL; 99 void *pexport_targets = NULL;
94 struct ceph_timespec laggy_since; 100 struct ceph_timespec laggy_since;
95 struct ceph_mds_info *info; 101 struct ceph_mds_info *info;
96 102
97 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); 103 ceph_decode_need(p, end, sizeof(u64) + 1, bad);
98 global_id = ceph_decode_64(p); 104 global_id = ceph_decode_64(p);
99 infoversion = ceph_decode_8(p); 105 info_v = ceph_decode_8(p);
106 if (info_v >= 4) {
107 u32 info_len;
108 u8 info_cv;
109 ceph_decode_need(p, end, 1 + sizeof(u32), bad);
110 info_cv = ceph_decode_8(p);
111 info_len = ceph_decode_32(p);
112 info_end = *p + info_len;
113 if (info_end > end)
114 goto bad;
115 }
116
117 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
100 *p += sizeof(u64); 118 *p += sizeof(u64);
101 namelen = ceph_decode_32(p); /* skip mds name */ 119 namelen = ceph_decode_32(p); /* skip mds name */
102 *p += namelen; 120 *p += namelen;
@@ -115,7 +133,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
115 *p += sizeof(u32); 133 *p += sizeof(u32);
116 ceph_decode_32_safe(p, end, namelen, bad); 134 ceph_decode_32_safe(p, end, namelen, bad);
117 *p += namelen; 135 *p += namelen;
118 if (infoversion >= 2) { 136 if (info_v >= 2) {
119 ceph_decode_32_safe(p, end, num_export_targets, bad); 137 ceph_decode_32_safe(p, end, num_export_targets, bad);
120 pexport_targets = *p; 138 pexport_targets = *p;
121 *p += num_export_targets * sizeof(u32); 139 *p += num_export_targets * sizeof(u32);
@@ -123,6 +141,12 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
123 num_export_targets = 0; 141 num_export_targets = 0;
124 } 142 }
125 143
144 if (info_end && *p != info_end) {
145 if (*p > info_end)
146 goto bad;
147 *p = info_end;
148 }
149
126 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", 150 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
127 i+1, n, global_id, mds, inc, 151 i+1, n, global_id, mds, inc,
128 ceph_pr_addr(&addr.in_addr), 152 ceph_pr_addr(&addr.in_addr),
@@ -163,6 +187,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
163 m->m_cas_pg_pool = ceph_decode_64(p); 187 m->m_cas_pg_pool = ceph_decode_64(p);
164 188
165 /* ok, we don't care about the rest. */ 189 /* ok, we don't care about the rest. */
190 *p = end;
166 dout("mdsmap_decode success epoch %u\n", m->m_epoch); 191 dout("mdsmap_decode success epoch %u\n", m->m_epoch);
167 return m; 192 return m;
168 193
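
Both the mdsmap header and each per-MDS entry now carry Ceph's versioned-encoding preamble: a version byte, a compat byte and, from v4 on, a payload length that lets the decoder bound its reads and skip trailing fields it does not understand. A sketch of the pattern, built from the same decode helpers used above (function name hypothetical):

    static int decode_versioned_blob(void **p, void *end)
    {
            u8 v, compat;
            void *blob_end = NULL;

            ceph_decode_need(p, end, 1 + 1, bad);
            v = ceph_decode_8(p);
            compat = ceph_decode_8(p);      /* compat version, unused here */
            if (v >= 4) {
                    u32 len;

                    ceph_decode_32_safe(p, end, len, bad);
                    if (end < *p + len)
                            goto bad;       /* truncated buffer */
                    blob_end = *p + len;
            }
            /* ... decode the fields this client knows about ... */
            if (blob_end)
                    *p = blob_end;          /* skip anything newer */
            return 0;
    bad:
            return -EINVAL;
    }

The *p = end at the tail of ceph_mdsmap_decode serves the same purpose for the map as a whole.
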
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index f12d5e2955c2..91e02481ce06 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -108,6 +108,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
108 * mount options 108 * mount options
109 */ 109 */
110enum { 110enum {
111 Opt_mds_namespace,
111 Opt_wsize, 112 Opt_wsize,
112 Opt_rsize, 113 Opt_rsize,
113 Opt_rasize, 114 Opt_rasize,
@@ -143,6 +144,7 @@ enum {
143}; 144};
144 145
145static match_table_t fsopt_tokens = { 146static match_table_t fsopt_tokens = {
147 {Opt_mds_namespace, "mds_namespace=%d"},
146 {Opt_wsize, "wsize=%d"}, 148 {Opt_wsize, "wsize=%d"},
147 {Opt_rsize, "rsize=%d"}, 149 {Opt_rsize, "rsize=%d"},
148 {Opt_rasize, "rasize=%d"}, 150 {Opt_rasize, "rasize=%d"},
@@ -212,6 +214,9 @@ static int parse_fsopt_token(char *c, void *private)
212 break; 214 break;
213 215
214 /* misc */ 216 /* misc */
217 case Opt_mds_namespace:
218 fsopt->mds_namespace = intval;
219 break;
215 case Opt_wsize: 220 case Opt_wsize:
216 fsopt->wsize = intval; 221 fsopt->wsize = intval;
217 break; 222 break;
@@ -297,6 +302,7 @@ static void destroy_mount_options(struct ceph_mount_options *args)
297{ 302{
298 dout("destroy_mount_options %p\n", args); 303 dout("destroy_mount_options %p\n", args);
299 kfree(args->snapdir_name); 304 kfree(args->snapdir_name);
305 kfree(args->server_path);
300 kfree(args); 306 kfree(args);
301} 307}
302 308
@@ -328,14 +334,17 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
328 if (ret) 334 if (ret)
329 return ret; 335 return ret;
330 336
337 ret = strcmp_null(fsopt1->server_path, fsopt2->server_path);
338 if (ret)
339 return ret;
340
331 return ceph_compare_options(new_opt, fsc->client); 341 return ceph_compare_options(new_opt, fsc->client);
332} 342}
333 343
334static int parse_mount_options(struct ceph_mount_options **pfsopt, 344static int parse_mount_options(struct ceph_mount_options **pfsopt,
335 struct ceph_options **popt, 345 struct ceph_options **popt,
336 int flags, char *options, 346 int flags, char *options,
337 const char *dev_name, 347 const char *dev_name)
338 const char **path)
339{ 348{
340 struct ceph_mount_options *fsopt; 349 struct ceph_mount_options *fsopt;
341 const char *dev_name_end; 350 const char *dev_name_end;
@@ -367,6 +376,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
367 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; 376 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
368 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; 377 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
369 fsopt->congestion_kb = default_congestion_kb(); 378 fsopt->congestion_kb = default_congestion_kb();
379 fsopt->mds_namespace = CEPH_FS_CLUSTER_ID_NONE;
370 380
371 /* 381 /*
372 * Distinguish the server list from the path in "dev_name". 382 * Distinguish the server list from the path in "dev_name".
@@ -380,12 +390,13 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
380 */ 390 */
381 dev_name_end = strchr(dev_name, '/'); 391 dev_name_end = strchr(dev_name, '/');
382 if (dev_name_end) { 392 if (dev_name_end) {
383 /* skip over leading '/' for path */ 393 fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL);
384 *path = dev_name_end + 1; 394 if (!fsopt->server_path) {
395 err = -ENOMEM;
396 goto out;
397 }
385 } else { 398 } else {
386 /* path is empty */
387 dev_name_end = dev_name + strlen(dev_name); 399 dev_name_end = dev_name + strlen(dev_name);
388 *path = dev_name_end;
389 } 400 }
390 err = -EINVAL; 401 err = -EINVAL;
391 dev_name_end--; /* back up to ':' separator */ 402 dev_name_end--; /* back up to ':' separator */
@@ -395,7 +406,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
395 goto out; 406 goto out;
396 } 407 }
397 dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); 408 dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
398 dout("server path '%s'\n", *path); 409 if (fsopt->server_path)
410 dout("server path '%s'\n", fsopt->server_path);
399 411
400 *popt = ceph_parse_options(options, dev_name, dev_name_end, 412 *popt = ceph_parse_options(options, dev_name, dev_name_end,
401 parse_fsopt_token, (void *)fsopt); 413 parse_fsopt_token, (void *)fsopt);
@@ -457,6 +469,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
457 seq_puts(m, ",noacl"); 469 seq_puts(m, ",noacl");
458#endif 470#endif
459 471
472 if (fsopt->mds_namespace != CEPH_FS_CLUSTER_ID_NONE)
473 seq_printf(m, ",mds_namespace=%d", fsopt->mds_namespace);
460 if (fsopt->wsize) 474 if (fsopt->wsize)
461 seq_printf(m, ",wsize=%d", fsopt->wsize); 475 seq_printf(m, ",wsize=%d", fsopt->wsize);
462 if (fsopt->rsize != CEPH_RSIZE_DEFAULT) 476 if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
@@ -511,9 +525,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
511{ 525{
512 struct ceph_fs_client *fsc; 526 struct ceph_fs_client *fsc;
513 const u64 supported_features = 527 const u64 supported_features =
514 CEPH_FEATURE_FLOCK | 528 CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH |
515 CEPH_FEATURE_DIRLAYOUTHASH | 529 CEPH_FEATURE_MDSENC | CEPH_FEATURE_MDS_INLINE_DATA;
516 CEPH_FEATURE_MDS_INLINE_DATA;
517 const u64 required_features = 0; 530 const u64 required_features = 0;
518 int page_count; 531 int page_count;
519 size_t size; 532 size_t size;
@@ -530,6 +543,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
530 goto fail; 543 goto fail;
531 } 544 }
532 fsc->client->extra_mon_dispatch = extra_mon_dispatch; 545 fsc->client->extra_mon_dispatch = extra_mon_dispatch;
546 fsc->client->monc.fs_cluster_id = fsopt->mds_namespace;
533 ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true); 547 ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true);
534 548
535 fsc->mount_options = fsopt; 549 fsc->mount_options = fsopt;
@@ -785,8 +799,7 @@ out:
785/* 799/*
786 * mount: join the ceph cluster, and open root directory. 800 * mount: join the ceph cluster, and open root directory.
787 */ 801 */
788static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, 802static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
789 const char *path)
790{ 803{
791 int err; 804 int err;
792 unsigned long started = jiffies; /* note the start time */ 805 unsigned long started = jiffies; /* note the start time */
@@ -815,11 +828,12 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
815 goto fail; 828 goto fail;
816 } 829 }
817 830
818 if (path[0] == 0) { 831 if (!fsc->mount_options->server_path) {
819 root = fsc->sb->s_root; 832 root = fsc->sb->s_root;
820 dget(root); 833 dget(root);
821 } else { 834 } else {
822 dout("mount opening base mountpoint\n"); 835 const char *path = fsc->mount_options->server_path + 1;
836 dout("mount opening path %s\n", path);
823 root = open_root_dentry(fsc, path, started); 837 root = open_root_dentry(fsc, path, started);
824 if (IS_ERR(root)) { 838 if (IS_ERR(root)) {
825 err = PTR_ERR(root); 839 err = PTR_ERR(root);
@@ -935,7 +949,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
935 struct dentry *res; 949 struct dentry *res;
936 int err; 950 int err;
937 int (*compare_super)(struct super_block *, void *) = ceph_compare_super; 951 int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
938 const char *path = NULL;
939 struct ceph_mount_options *fsopt = NULL; 952 struct ceph_mount_options *fsopt = NULL;
940 struct ceph_options *opt = NULL; 953 struct ceph_options *opt = NULL;
941 954
@@ -944,7 +957,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
944#ifdef CONFIG_CEPH_FS_POSIX_ACL 957#ifdef CONFIG_CEPH_FS_POSIX_ACL
945 flags |= MS_POSIXACL; 958 flags |= MS_POSIXACL;
946#endif 959#endif
947 err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); 960 err = parse_mount_options(&fsopt, &opt, flags, data, dev_name);
948 if (err < 0) { 961 if (err < 0) {
949 res = ERR_PTR(err); 962 res = ERR_PTR(err);
950 goto out_final; 963 goto out_final;
@@ -987,7 +1000,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
987 } 1000 }
988 } 1001 }
989 1002
990 res = ceph_real_mount(fsc, path); 1003 res = ceph_real_mount(fsc);
991 if (IS_ERR(res)) 1004 if (IS_ERR(res))
992 goto out_splat; 1005 goto out_splat;
993 dout("root %p inode %p ino %llx.%llx\n", res, 1006 dout("root %p inode %p ino %llx.%llx\n", res,
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 7b99eb756477..0130a8592191 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -62,6 +62,7 @@ struct ceph_mount_options {
62 int cap_release_safety; 62 int cap_release_safety;
63 int max_readdir; /* max readdir result (entries) */ 63 int max_readdir; /* max readdir result (entries) */
64 int max_readdir_bytes; /* max readdir result (bytes) */ 64 int max_readdir_bytes; /* max readdir result (bytes) */
65 int mds_namespace;
65 66
66 /* 67 /*
67 * everything above this point can be memcmp'd; everything below 68 * everything above this point can be memcmp'd; everything below
@@ -69,6 +70,7 @@ struct ceph_mount_options {
69 */ 70 */
70 71
71 char *snapdir_name; /* default ".snap" */ 72 char *snapdir_name; /* default ".snap" */
73 char *server_path; /* default "/" */
72}; 74};
73 75
74struct ceph_fs_client { 76struct ceph_fs_client {
@@ -295,6 +297,7 @@ struct ceph_inode_info {
295 u64 i_files, i_subdirs; 297 u64 i_files, i_subdirs;
296 298
297 struct rb_root i_fragtree; 299 struct rb_root i_fragtree;
300 int i_fragtree_nsplits;
298 struct mutex i_fragtree_mutex; 301 struct mutex i_fragtree_mutex;
299 302
300 struct ceph_inode_xattrs_info i_xattrs; 303 struct ceph_inode_xattrs_info i_xattrs;
@@ -469,6 +472,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
469#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ 472#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */
470#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ 473#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */
471#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ 474#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */
475#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */
472 476
473static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, 477static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
474 long long release_count, 478 long long release_count,
@@ -537,11 +541,6 @@ static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
537 return (struct ceph_dentry_info *)dentry->d_fsdata; 541 return (struct ceph_dentry_info *)dentry->d_fsdata;
538} 542}
539 543
540static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
541{
542 return ((loff_t)frag << 32) | (loff_t)off;
543}
544
545/* 544/*
546 * caps helpers 545 * caps helpers
547 */ 546 */
@@ -632,7 +631,6 @@ struct ceph_file_info {
632 struct ceph_mds_request *last_readdir; 631 struct ceph_mds_request *last_readdir;
633 632
634 /* readdir: position within a frag */ 633 /* readdir: position within a frag */
635 unsigned offset; /* offset of last chunk, adjusted for . and .. */
636 unsigned next_offset; /* offset of next chunk (last_name's + 1) */ 634 unsigned next_offset; /* offset of next chunk (last_name's + 1) */
637 char *last_name; /* last entry in previous chunk */ 635 char *last_name; /* last entry in previous chunk */
638 long long dir_release_count; 636 long long dir_release_count;
@@ -927,6 +925,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
927/* file.c */ 925/* file.c */
928extern const struct file_operations ceph_file_fops; 926extern const struct file_operations ceph_file_fops;
929 927
928extern int ceph_renew_caps(struct inode *inode);
930extern int ceph_open(struct inode *inode, struct file *file); 929extern int ceph_open(struct inode *inode, struct file *file);
931extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, 930extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
932 struct file *file, unsigned flags, umode_t mode, 931 struct file *file, unsigned flags, umode_t mode,
@@ -942,6 +941,7 @@ extern const struct inode_operations ceph_snapdir_iops;
942extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, 941extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
943 ceph_snapdir_dentry_ops; 942 ceph_snapdir_dentry_ops;
944 943
944extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order);
945extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); 945extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
946extern int ceph_handle_snapdir(struct ceph_mds_request *req, 946extern int ceph_handle_snapdir(struct ceph_mds_request *req,
947 struct dentry *dentry, int err); 947 struct dentry *dentry, int err);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 0d66722c6a52..dacc1bd85629 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -77,7 +77,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
77 char buf[128]; 77 char buf[128];
78 78
79 dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); 79 dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
80 down_read(&osdc->map_sem); 80 down_read(&osdc->lock);
81 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); 81 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
82 if (pool_name) { 82 if (pool_name) {
83 size_t len = strlen(pool_name); 83 size_t len = strlen(pool_name);
@@ -109,7 +109,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
109 ret = -ERANGE; 109 ret = -ERANGE;
110 } 110 }
111 } 111 }
112 up_read(&osdc->map_sem); 112 up_read(&osdc->lock);
113 return ret; 113 return ret;
114} 114}
115 115
@@ -143,13 +143,13 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
143 s64 pool = ceph_file_layout_pg_pool(ci->i_layout); 143 s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
144 const char *pool_name; 144 const char *pool_name;
145 145
146 down_read(&osdc->map_sem); 146 down_read(&osdc->lock);
147 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); 147 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
148 if (pool_name) 148 if (pool_name)
149 ret = snprintf(val, size, "%s", pool_name); 149 ret = snprintf(val, size, "%s", pool_name);
150 else 150 else
151 ret = snprintf(val, size, "%lld", (unsigned long long)pool); 151 ret = snprintf(val, size, "%lld", (unsigned long long)pool);
152 up_read(&osdc->map_sem); 152 up_read(&osdc->lock);
153 return ret; 153 return ret;
154} 154}
155 155
@@ -862,6 +862,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
862 struct ceph_mds_request *req; 862 struct ceph_mds_request *req;
863 struct ceph_mds_client *mdsc = fsc->mdsc; 863 struct ceph_mds_client *mdsc = fsc->mdsc;
864 struct ceph_pagelist *pagelist = NULL; 864 struct ceph_pagelist *pagelist = NULL;
865 int op = CEPH_MDS_OP_SETXATTR;
865 int err; 866 int err;
866 867
867 if (size > 0) { 868 if (size > 0) {
@@ -875,20 +876,21 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
875 if (err) 876 if (err)
876 goto out; 877 goto out;
877 } else if (!value) { 878 } else if (!value) {
878 flags |= CEPH_XATTR_REMOVE; 879 if (flags & CEPH_XATTR_REPLACE)
880 op = CEPH_MDS_OP_RMXATTR;
881 else
882 flags |= CEPH_XATTR_REMOVE;
879 } 883 }
880 884
881 dout("setxattr value=%.*s\n", (int)size, value); 885 dout("setxattr value=%.*s\n", (int)size, value);
882 886
883 /* do request */ 887 /* do request */
884 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, 888 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
885 USE_AUTH_MDS);
886 if (IS_ERR(req)) { 889 if (IS_ERR(req)) {
887 err = PTR_ERR(req); 890 err = PTR_ERR(req);
888 goto out; 891 goto out;
889 } 892 }
890 893
891 req->r_args.setxattr.flags = cpu_to_le32(flags);
892 req->r_path2 = kstrdup(name, GFP_NOFS); 894 req->r_path2 = kstrdup(name, GFP_NOFS);
893 if (!req->r_path2) { 895 if (!req->r_path2) {
894 ceph_mdsc_put_request(req); 896 ceph_mdsc_put_request(req);
@@ -896,8 +898,11 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
896 goto out; 898 goto out;
897 } 899 }
898 900
899 req->r_pagelist = pagelist; 901 if (op == CEPH_MDS_OP_SETXATTR) {
900 pagelist = NULL; 902 req->r_args.setxattr.flags = cpu_to_le32(flags);
903 req->r_pagelist = pagelist;
904 pagelist = NULL;
905 }
901 906
902 req->r_inode = inode; 907 req->r_inode = inode;
903 ihold(inode); 908 ihold(inode);
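
Summarizing the new removal paths (a sketch; the fail-if-absent behaviour is an assumption inferred from XATTR_REPLACE semantics, not stated in the patch):

    /*
     * value == NULL, CEPH_XATTR_REPLACE set   -> CEPH_MDS_OP_RMXATTR,
     *     so the MDS can fail if the xattr does not exist
     * value == NULL, CEPH_XATTR_REPLACE clear -> CEPH_MDS_OP_SETXATTR
     *     with CEPH_XATTR_REMOVE (remove-if-present)
     * value != NULL                           -> CEPH_MDS_OP_SETXATTR,
     *     flags and value pagelist attached to the request
     */

Note that r_args.setxattr.flags and r_pagelist are now only filled in on the SETXATTR path, since an RMXATTR request carries neither.
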
diff --git a/include/linux/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h
index b827e066e55a..146507df8650 100644
--- a/include/linux/ceph/ceph_frag.h
+++ b/include/linux/ceph/ceph_frag.h
@@ -51,11 +51,11 @@ static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
51 return ceph_frag_make(newbits, 51 return ceph_frag_make(newbits,
52 ceph_frag_value(f) | (i << (24 - newbits))); 52 ceph_frag_value(f) | (i << (24 - newbits)));
53} 53}
54static inline int ceph_frag_is_leftmost(__u32 f) 54static inline bool ceph_frag_is_leftmost(__u32 f)
55{ 55{
56 return ceph_frag_value(f) == 0; 56 return ceph_frag_value(f) == 0;
57} 57}
58static inline int ceph_frag_is_rightmost(__u32 f) 58static inline bool ceph_frag_is_rightmost(__u32 f)
59{ 59{
60 return ceph_frag_value(f) == ceph_frag_mask(f); 60 return ceph_frag_value(f) == ceph_frag_mask(f);
61} 61}
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 37f28bf55ce4..dfce616002ad 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -153,8 +153,9 @@ struct ceph_dir_layout {
153 153
154/* watch-notify operations */ 154/* watch-notify operations */
155enum { 155enum {
156 WATCH_NOTIFY = 1, /* notifying watcher */ 156 CEPH_WATCH_EVENT_NOTIFY = 1, /* notifying watcher */
157 WATCH_NOTIFY_COMPLETE = 2, /* notifier notified when done */ 157 CEPH_WATCH_EVENT_NOTIFY_COMPLETE = 2, /* notifier notified when done */
158 CEPH_WATCH_EVENT_DISCONNECT = 3, /* we were disconnected */
158}; 159};
159 160
160 161
@@ -207,6 +208,8 @@ struct ceph_mon_subscribe_ack {
207 struct ceph_fsid fsid; 208 struct ceph_fsid fsid;
208} __attribute__ ((packed)); 209} __attribute__ ((packed));
209 210
211#define CEPH_FS_CLUSTER_ID_NONE -1
212
210/* 213/*
211 * mdsmap flags 214 * mdsmap flags
212 */ 215 */
@@ -344,6 +347,18 @@ extern const char *ceph_mds_op_name(int op);
344#define CEPH_XATTR_REPLACE (1 << 1) 347#define CEPH_XATTR_REPLACE (1 << 1)
345#define CEPH_XATTR_REMOVE (1 << 31) 348#define CEPH_XATTR_REMOVE (1 << 31)
346 349
350/*
351 * readdir request flags.
352 */
353#define CEPH_READDIR_REPLY_BITFLAGS (1<<0)
354
355/*
356 * readdir reply flags.
357 */
358#define CEPH_READDIR_FRAG_END (1<<0)
359#define CEPH_READDIR_FRAG_COMPLETE (1<<8)
360#define CEPH_READDIR_HASH_ORDER (1<<9)
361
347union ceph_mds_request_args { 362union ceph_mds_request_args {
348 struct { 363 struct {
349 __le32 mask; /* CEPH_CAP_* */ 364 __le32 mask; /* CEPH_CAP_* */
@@ -361,6 +376,7 @@ union ceph_mds_request_args {
361 __le32 frag; /* which dir fragment */ 376 __le32 frag; /* which dir fragment */
362 __le32 max_entries; /* how many dentries to grab */ 377 __le32 max_entries; /* how many dentries to grab */
363 __le32 max_bytes; 378 __le32 max_bytes;
379 __le16 flags;
364 } __attribute__ ((packed)) readdir; 380 } __attribute__ ((packed)) readdir;
365 struct { 381 struct {
366 __le32 mode; 382 __le32 mode;
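
The reply bits correspond one-to-one with the new booleans in ceph_mds_reply_info_parsed (mds_client.h above). A parsing sketch, assuming flags has already been decoded from the reply:

    /* sketch: map readdir reply flag bits to the parsed-info booleans */
    rinfo->dir_end      = !!(flags & CEPH_READDIR_FRAG_END);
    rinfo->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
    rinfo->hash_order   = !!(flags & CEPH_READDIR_HASH_ORDER);

The client advertises that it understands this layout by setting CEPH_READDIR_REPLY_BITFLAGS in the new __le16 flags member of the readdir request args.
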
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index a6ef9cc267ec..19e9932f3e77 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -47,7 +47,7 @@ static inline void ceph_decode_copy(void **p, void *pv, size_t n)
47/* 47/*
48 * bounds check input. 48 * bounds check input.
49 */ 49 */
50static inline int ceph_has_room(void **p, void *end, size_t n) 50static inline bool ceph_has_room(void **p, void *end, size_t n)
51{ 51{
52 return end >= *p && n <= end - *p; 52 return end >= *p && n <= end - *p;
53} 53}
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index db92a8d4926e..690985daad1c 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -180,6 +180,63 @@ static inline int calc_pages_for(u64 off, u64 len)
180 (off >> PAGE_SHIFT); 180 (off >> PAGE_SHIFT);
181} 181}
182 182
183/*
184 * These are not meant to be generic - an integer key is assumed.
185 */
186#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \
187static void insert_##name(struct rb_root *root, type *t) \
188{ \
189 struct rb_node **n = &root->rb_node; \
190 struct rb_node *parent = NULL; \
191 \
192 BUG_ON(!RB_EMPTY_NODE(&t->nodefld)); \
193 \
194 while (*n) { \
195 type *cur = rb_entry(*n, type, nodefld); \
196 \
197 parent = *n; \
198 if (t->keyfld < cur->keyfld) \
199 n = &(*n)->rb_left; \
200 else if (t->keyfld > cur->keyfld) \
201 n = &(*n)->rb_right; \
202 else \
203 BUG(); \
204 } \
205 \
206 rb_link_node(&t->nodefld, parent, n); \
207 rb_insert_color(&t->nodefld, root); \
208} \
209static void erase_##name(struct rb_root *root, type *t) \
210{ \
211 BUG_ON(RB_EMPTY_NODE(&t->nodefld)); \
212 rb_erase(&t->nodefld, root); \
213 RB_CLEAR_NODE(&t->nodefld); \
214}
215
216#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \
217static type *lookup_##name(struct rb_root *root, \
218 typeof(((type *)0)->keyfld) key) \
219{ \
220 struct rb_node *n = root->rb_node; \
221 \
222 while (n) { \
223 type *cur = rb_entry(n, type, nodefld); \
224 \
225 if (key < cur->keyfld) \
226 n = n->rb_left; \
227 else if (key > cur->keyfld) \
228 n = n->rb_right; \
229 else \
230 return cur; \
231 } \
232 \
233 return NULL; \
234}
235
236#define DEFINE_RB_FUNCS(name, type, keyfld, nodefld) \
237DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \
238DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)
239
183extern struct kmem_cache *ceph_inode_cachep; 240extern struct kmem_cache *ceph_inode_cachep;
184extern struct kmem_cache *ceph_cap_cachep; 241extern struct kmem_cache *ceph_cap_cachep;
185extern struct kmem_cache *ceph_cap_flush_cachep; 242extern struct kmem_cache *ceph_cap_flush_cachep;
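
Each DEFINE_RB_FUNCS(name, ...) instantiation generates insert_name(), erase_name() and lookup_name() for one integer-keyed tree. The mds_client.c hunks above call insert_request/erase_request/lookup_request, which suggests an instantiation like the following (a sketch; the line itself is not shown in this diff):

    /* generates insert_request(), erase_request(), lookup_request() */
    DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)

The BUG_ON(!RB_EMPTY_NODE(...)) in insert_##name is also why ceph_mdsc_create_request now does RB_CLEAR_NODE(&req->r_node) at init time: a node must read as "not on a tree" before its first insertion.
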
diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index e230e7ed60d3..e2a92df08b47 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -39,20 +39,31 @@ struct ceph_mon_request {
39 ceph_monc_request_func_t do_request; 39 ceph_monc_request_func_t do_request;
40}; 40};
41 41
42typedef void (*ceph_monc_callback_t)(struct ceph_mon_generic_request *);
43
42/* 44/*
43 * ceph_mon_generic_request is being used for the statfs and 45 * ceph_mon_generic_request is being used for the statfs and
44 * mon_get_version requests which are being done a bit differently 46 * mon_get_version requests which are being done a bit differently
45 * because we need to get data back to the caller 47 * because we need to get data back to the caller
46 */ 48 */
47struct ceph_mon_generic_request { 49struct ceph_mon_generic_request {
50 struct ceph_mon_client *monc;
48 struct kref kref; 51 struct kref kref;
49 u64 tid; 52 u64 tid;
50 struct rb_node node; 53 struct rb_node node;
51 int result; 54 int result;
52 void *buf; 55
53 struct completion completion; 56 struct completion completion;
57 ceph_monc_callback_t complete_cb;
58 u64 private_data; /* r_tid/linger_id */
59
54 struct ceph_msg *request; /* original request */ 60 struct ceph_msg *request; /* original request */
55 struct ceph_msg *reply; /* and reply */ 61 struct ceph_msg *reply; /* and reply */
62
63 union {
64 struct ceph_statfs *st;
65 u64 newest;
66 } u;
56}; 67};
57 68
58struct ceph_mon_client { 69struct ceph_mon_client {
@@ -77,7 +88,6 @@ struct ceph_mon_client {
77 88
78 /* pending generic requests */ 89 /* pending generic requests */
79 struct rb_root generic_request_tree; 90 struct rb_root generic_request_tree;
80 int num_generic_requests;
81 u64 last_tid; 91 u64 last_tid;
82 92
83 /* subs, indexed with CEPH_SUB_* */ 93 /* subs, indexed with CEPH_SUB_* */
@@ -86,6 +96,7 @@ struct ceph_mon_client {
86 bool want; 96 bool want;
87 u32 have; /* epoch */ 97 u32 have; /* epoch */
88 } subs[3]; 98 } subs[3];
99 int fs_cluster_id; /* "mdsmap.<id>" sub */
89 100
90#ifdef CONFIG_DEBUG_FS 101#ifdef CONFIG_DEBUG_FS
91 struct dentry *debugfs_file; 102 struct dentry *debugfs_file;
@@ -116,16 +127,18 @@ extern const char *ceph_sub_str[];
116bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch, 127bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch,
117 bool continuous); 128 bool continuous);
118void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch); 129void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch);
130void ceph_monc_renew_subs(struct ceph_mon_client *monc);
119 131
120extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
121extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, 132extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
122 unsigned long timeout); 133 unsigned long timeout);
123 134
124extern int ceph_monc_do_statfs(struct ceph_mon_client *monc, 135extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
125 struct ceph_statfs *buf); 136 struct ceph_statfs *buf);
126 137
127extern int ceph_monc_do_get_version(struct ceph_mon_client *monc, 138int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
128 const char *what, u64 *newest); 139 u64 *newest);
140int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
141 ceph_monc_callback_t cb, u64 private_data);
129 142
130extern int ceph_monc_open_session(struct ceph_mon_client *monc); 143extern int ceph_monc_open_session(struct ceph_mon_client *monc);
131 144
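
ceph_monc_get_version_async() keeps the generic-request plumbing but returns immediately; the answer arrives in the completion callback, which reads result and u.newest off the request. A hypothetical caller:

    /* hypothetical callback: runs when the mon answers */
    static void got_newest(struct ceph_mon_generic_request *req)
    {
            if (req->result)
                    pr_err("get_version failed: %d\n", req->result);
            else
                    pr_info("newest epoch %llu (caller data %llu)\n",
                            req->u.newest, req->private_data);
    }

    /* somewhere with a struct ceph_mon_client *monc in scope:
     * ceph_monc_get_version_async(monc, "osdmap", got_newest, 0); */

private_data is the opaque u64 the submitter passed in, per the r_tid/linger_id comment above.
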
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index cbf460927c42..19b14862d3e0 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -20,10 +20,11 @@ struct ceph_osd_client;
20/* 20/*
21 * completion callback for async writepages 21 * completion callback for async writepages
22 */ 22 */
23typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, 23typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
24 struct ceph_msg *);
25typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool); 24typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool);
26 25
26#define CEPH_HOMELESS_OSD -1
27
27/* a given osd we're communicating with */ 28/* a given osd we're communicating with */
28struct ceph_osd { 29struct ceph_osd {
29 atomic_t o_ref; 30 atomic_t o_ref;
@@ -32,16 +33,15 @@ struct ceph_osd {
32 int o_incarnation; 33 int o_incarnation;
33 struct rb_node o_node; 34 struct rb_node o_node;
34 struct ceph_connection o_con; 35 struct ceph_connection o_con;
35 struct list_head o_requests; 36 struct rb_root o_requests;
36 struct list_head o_linger_requests; 37 struct rb_root o_linger_requests;
37 struct list_head o_osd_lru; 38 struct list_head o_osd_lru;
38 struct ceph_auth_handshake o_auth; 39 struct ceph_auth_handshake o_auth;
39 unsigned long lru_ttl; 40 unsigned long lru_ttl;
40 int o_marked_for_keepalive;
41 struct list_head o_keepalive_item; 41 struct list_head o_keepalive_item;
42 struct mutex lock;
42}; 43};
43 44
44
45#define CEPH_OSD_SLAB_OPS 2 45#define CEPH_OSD_SLAB_OPS 2
46#define CEPH_OSD_MAX_OPS 16 46#define CEPH_OSD_MAX_OPS 16
47 47
@@ -104,76 +104,95 @@ struct ceph_osd_req_op {
104 struct ceph_osd_data response_data; 104 struct ceph_osd_data response_data;
105 __u8 class_len; 105 __u8 class_len;
106 __u8 method_len; 106 __u8 method_len;
107 __u8 argc; 107 u32 indata_len;
108 } cls; 108 } cls;
109 struct { 109 struct {
110 u64 cookie; 110 u64 cookie;
111 u64 ver; 111 __u8 op; /* CEPH_OSD_WATCH_OP_ */
112 u32 prot_ver; 112 u32 gen;
113 u32 timeout;
114 __u8 flag;
115 } watch; 113 } watch;
116 struct { 114 struct {
115 struct ceph_osd_data request_data;
116 } notify_ack;
117 struct {
118 u64 cookie;
119 struct ceph_osd_data request_data;
120 struct ceph_osd_data response_data;
121 } notify;
122 struct {
117 u64 expected_object_size; 123 u64 expected_object_size;
118 u64 expected_write_size; 124 u64 expected_write_size;
119 } alloc_hint; 125 } alloc_hint;
120 }; 126 };
121}; 127};
122 128
129struct ceph_osd_request_target {
130 struct ceph_object_id base_oid;
131 struct ceph_object_locator base_oloc;
132 struct ceph_object_id target_oid;
133 struct ceph_object_locator target_oloc;
134
135 struct ceph_pg pgid;
136 u32 pg_num;
137 u32 pg_num_mask;
138 struct ceph_osds acting;
139 struct ceph_osds up;
140 int size;
141 int min_size;
142 bool sort_bitwise;
143
144 unsigned int flags; /* CEPH_OSD_FLAG_* */
145 bool paused;
146
147 int osd;
148};
149
123/* an in-flight request */ 150/* an in-flight request */
124struct ceph_osd_request { 151struct ceph_osd_request {
125 u64 r_tid; /* unique for this client */ 152 u64 r_tid; /* unique for this client */
126 struct rb_node r_node; 153 struct rb_node r_node;
127 struct list_head r_req_lru_item; 154 struct rb_node r_mc_node; /* map check */
128 struct list_head r_osd_item;
129 struct list_head r_linger_item;
130 struct list_head r_linger_osd_item;
131 struct ceph_osd *r_osd; 155 struct ceph_osd *r_osd;
132 struct ceph_pg r_pgid; 156
133 int r_pg_osds[CEPH_PG_MAX_SIZE]; 157 struct ceph_osd_request_target r_t;
134 int r_num_pg_osds; 158#define r_base_oid r_t.base_oid
159#define r_base_oloc r_t.base_oloc
160#define r_flags r_t.flags
135 161
136 struct ceph_msg *r_request, *r_reply; 162 struct ceph_msg *r_request, *r_reply;
137 int r_flags; /* any additional flags for the osd */
138 u32 r_sent; /* >0 if r_request is sending/sent */ 163 u32 r_sent; /* >0 if r_request is sending/sent */
139 164
140 /* request osd ops array */ 165 /* request osd ops array */
141 unsigned int r_num_ops; 166 unsigned int r_num_ops;
142 167
143 /* these are updated on each send */
144 __le32 *r_request_osdmap_epoch;
145 __le32 *r_request_flags;
146 __le64 *r_request_pool;
147 void *r_request_pgid;
148 __le32 *r_request_attempts;
149 bool r_paused;
150 struct ceph_eversion *r_request_reassert_version;
151
152 int r_result; 168 int r_result;
153 int r_got_reply; 169 bool r_got_reply;
154 int r_linger;
155 170
156 struct ceph_osd_client *r_osdc; 171 struct ceph_osd_client *r_osdc;
157 struct kref r_kref; 172 struct kref r_kref;
158 bool r_mempool; 173 bool r_mempool;
159 struct completion r_completion, r_safe_completion; 174 struct completion r_completion;
175 struct completion r_safe_completion; /* fsync waiter */
160 ceph_osdc_callback_t r_callback; 176 ceph_osdc_callback_t r_callback;
161 ceph_osdc_unsafe_callback_t r_unsafe_callback; 177 ceph_osdc_unsafe_callback_t r_unsafe_callback;
162 struct ceph_eversion r_reassert_version;
163 struct list_head r_unsafe_item; 178 struct list_head r_unsafe_item;
164 179
165 struct inode *r_inode; /* for use by callbacks */ 180 struct inode *r_inode; /* for use by callbacks */
166 void *r_priv; /* ditto */ 181 void *r_priv; /* ditto */
167 182
168 struct ceph_object_locator r_base_oloc; 183 /* set by submitter */
169 struct ceph_object_id r_base_oid; 184 u64 r_snapid; /* for reads, CEPH_NOSNAP o/w */
170 struct ceph_object_locator r_target_oloc; 185 struct ceph_snap_context *r_snapc; /* for writes */
171 struct ceph_object_id r_target_oid; 186 struct timespec r_mtime; /* ditto */
172 187 u64 r_data_offset; /* ditto */
173 u64 r_snapid; 188 bool r_linger; /* don't resend on failure */
174 unsigned long r_stamp; /* send OR check time */
175 189
176 struct ceph_snap_context *r_snapc; /* snap context for writes */ 190 /* internal */
191 unsigned long r_stamp; /* jiffies, send or check time */
192 int r_attempts;
193 struct ceph_eversion r_replay_version; /* aka reassert_version */
194 u32 r_last_force_resend;
195 u32 r_map_dne_bound;
177 196
178 struct ceph_osd_req_op r_ops[]; 197 struct ceph_osd_req_op r_ops[];
179}; 198};
@@ -182,44 +201,70 @@ struct ceph_request_redirect {
182 struct ceph_object_locator oloc; 201 struct ceph_object_locator oloc;
183}; 202};
184 203
185struct ceph_osd_event { 204typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie,
186 u64 cookie; 205 u64 notifier_id, void *data, size_t data_len);
187 int one_shot; 206typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err);
207
208struct ceph_osd_linger_request {
188 struct ceph_osd_client *osdc; 209 struct ceph_osd_client *osdc;
189 void (*cb)(u64, u64, u8, void *); 210 u64 linger_id;
190 void *data; 211 bool committed;
191 struct rb_node node; 212 bool is_watch; /* watch or notify */
192 struct list_head osd_node; 213
214 struct ceph_osd *osd;
215 struct ceph_osd_request *reg_req;
216 struct ceph_osd_request *ping_req;
217 unsigned long ping_sent;
218 unsigned long watch_valid_thru;
219 struct list_head pending_lworks;
220
221 struct ceph_osd_request_target t;
222 u32 last_force_resend;
223 u32 map_dne_bound;
224
225 struct timespec mtime;
226
193 struct kref kref; 227 struct kref kref;
194}; 228 struct mutex lock;
229 struct rb_node node; /* osd */
230 struct rb_node osdc_node; /* osdc */
231 struct rb_node mc_node; /* map check */
232 struct list_head scan_item;
233
234 struct completion reg_commit_wait;
235 struct completion notify_finish_wait;
236 int reg_commit_error;
237 int notify_finish_error;
238 int last_error;
239
240 u32 register_gen;
241 u64 notify_id;
242
243 rados_watchcb2_t wcb;
244 rados_watcherrcb_t errcb;
245 void *data;
195 246
196struct ceph_osd_event_work { 247 struct page ***preply_pages;
197 struct work_struct work; 248 size_t *preply_len;
198 struct ceph_osd_event *event;
199 u64 ver;
200 u64 notify_id;
201 u8 opcode;
202}; 249};
203 250
204struct ceph_osd_client { 251struct ceph_osd_client {
205 struct ceph_client *client; 252 struct ceph_client *client;
206 253
207 struct ceph_osdmap *osdmap; /* current map */ 254 struct ceph_osdmap *osdmap; /* current map */
208 struct rw_semaphore map_sem; 255 struct rw_semaphore lock;
209 struct completion map_waiters;
210 u64 last_requested_map;
211 256
212 struct mutex request_mutex;
213 struct rb_root osds; /* osds */ 257 struct rb_root osds; /* osds */
214 struct list_head osd_lru; /* idle osds */ 258 struct list_head osd_lru; /* idle osds */
215 u64 timeout_tid; /* tid of timeout triggering rq */ 259 spinlock_t osd_lru_lock;
216 u64 last_tid; /* tid of last request */ 260 struct ceph_osd homeless_osd;
217 struct rb_root requests; /* pending requests */ 261 atomic64_t last_tid; /* tid of last request */
218 struct list_head req_lru; /* in-flight lru */ 262 u64 last_linger_id;
219 struct list_head req_unsent; /* unsent/need-resend queue */ 263 struct rb_root linger_requests; /* lingering requests */
220 struct list_head req_notarget; /* map to no osd */ 264 struct rb_root map_checks;
221 struct list_head req_linger; /* lingering requests */ 265 struct rb_root linger_map_checks;
222 int num_requests; 266 atomic_t num_requests;
267 atomic_t num_homeless;
223 struct delayed_work timeout_work; 268 struct delayed_work timeout_work;
224 struct delayed_work osds_timeout_work; 269 struct delayed_work osds_timeout_work;
225#ifdef CONFIG_DEBUG_FS 270#ifdef CONFIG_DEBUG_FS
@@ -231,10 +276,6 @@ struct ceph_osd_client {
231 struct ceph_msgpool msgpool_op; 276 struct ceph_msgpool msgpool_op;
232 struct ceph_msgpool msgpool_op_reply; 277 struct ceph_msgpool msgpool_op_reply;
233 278
234 spinlock_t event_lock;
235 struct rb_root event_tree;
236 u64 event_count;
237
238 struct workqueue_struct *notify_wq; 279 struct workqueue_struct *notify_wq;
239}; 280};
240 281
@@ -271,9 +312,6 @@ extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
271extern struct ceph_osd_data *osd_req_op_extent_osd_data( 312extern struct ceph_osd_data *osd_req_op_extent_osd_data(
272 struct ceph_osd_request *osd_req, 313 struct ceph_osd_request *osd_req,
273 unsigned int which); 314 unsigned int which);
274extern struct ceph_osd_data *osd_req_op_cls_response_data(
275 struct ceph_osd_request *osd_req,
276 unsigned int which);
277 315
278extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *, 316extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *,
279 unsigned int which, 317 unsigned int which,
@@ -309,9 +347,6 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
309extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, 347extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
310 u16 opcode, const char *name, const void *value, 348 u16 opcode, const char *name, const void *value,
311 size_t size, u8 cmp_op, u8 cmp_mode); 349 size_t size, u8 cmp_op, u8 cmp_mode);
312extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
313 unsigned int which, u16 opcode,
314 u64 cookie, u64 version, int flag);
315extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, 350extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
316 unsigned int which, 351 unsigned int which,
317 u64 expected_object_size, 352 u64 expected_object_size,
@@ -322,11 +357,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *
322 unsigned int num_ops, 357 unsigned int num_ops,
323 bool use_mempool, 358 bool use_mempool,
324 gfp_t gfp_flags); 359 gfp_t gfp_flags);
325 360int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp);
326extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
327 struct ceph_snap_context *snapc,
328 u64 snap_id,
329 struct timespec *mtime);
330 361
331extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, 362extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
332 struct ceph_file_layout *layout, 363 struct ceph_file_layout *layout,
@@ -338,9 +369,6 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
338 u32 truncate_seq, u64 truncate_size, 369 u32 truncate_seq, u64 truncate_size,
339 bool use_mempool); 370 bool use_mempool);
340 371
341extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
342 struct ceph_osd_request *req);
343
344extern void ceph_osdc_get_request(struct ceph_osd_request *req); 372extern void ceph_osdc_get_request(struct ceph_osd_request *req);
345extern void ceph_osdc_put_request(struct ceph_osd_request *req); 373extern void ceph_osdc_put_request(struct ceph_osd_request *req);
346 374
@@ -353,6 +381,7 @@ extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
353extern void ceph_osdc_sync(struct ceph_osd_client *osdc); 381extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
354 382
355extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc); 383extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc);
384void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc);
356 385
357extern int ceph_osdc_readpages(struct ceph_osd_client *osdc, 386extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
358 struct ceph_vino vino, 387 struct ceph_vino vino,
@@ -371,11 +400,33 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
371 struct timespec *mtime, 400 struct timespec *mtime,
372 struct page **pages, int nr_pages); 401 struct page **pages, int nr_pages);
373 402
374/* watch/notify events */ 403/* watch/notify */
375extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, 404struct ceph_osd_linger_request *
376 void (*event_cb)(u64, u64, u8, void *), 405ceph_osdc_watch(struct ceph_osd_client *osdc,
377 void *data, struct ceph_osd_event **pevent); 406 struct ceph_object_id *oid,
378extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); 407 struct ceph_object_locator *oloc,
379extern void ceph_osdc_put_event(struct ceph_osd_event *event); 408 rados_watchcb2_t wcb,
409 rados_watcherrcb_t errcb,
410 void *data);
411int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
412 struct ceph_osd_linger_request *lreq);
413
414int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
415 struct ceph_object_id *oid,
416 struct ceph_object_locator *oloc,
417 u64 notify_id,
418 u64 cookie,
419 void *payload,
420 size_t payload_len);
421int ceph_osdc_notify(struct ceph_osd_client *osdc,
422 struct ceph_object_id *oid,
423 struct ceph_object_locator *oloc,
424 void *payload,
425 size_t payload_len,
426 u32 timeout,
427 struct page ***preply_pages,
428 size_t *preply_len);
429int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
430 struct ceph_osd_linger_request *lreq);
380#endif 431#endif
381 432
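
The old one-shot event API (ceph_osd_event, create/cancel/put_event) is gone; watches are now long-lived linger requests with notify-v2 style callbacks. A hedged sketch of a watcher that simply acknowledges each notify, using only the declarations above (oid/oloc setup elided):

    static struct ceph_object_id my_oid;        /* hypothetical, initialized elsewhere */
    static struct ceph_object_locator my_oloc;  /* hypothetical, initialized elsewhere */

    static void my_watch_cb(void *arg, u64 notify_id, u64 cookie,
                            u64 notifier_id, void *data, size_t data_len)
    {
            struct ceph_osd_client *osdc = arg;

            /* tell the notifier we saw this event */
            ceph_osdc_notify_ack(osdc, &my_oid, &my_oloc, notify_id, cookie,
                                 NULL, 0);
    }

    static void my_err_cb(void *arg, u64 cookie, int err)
    {
            pr_err("watch cookie %llu error %d\n", cookie, err);
    }

    /* register:  lreq = ceph_osdc_watch(osdc, &my_oid, &my_oloc,
     *                                   my_watch_cb, my_err_cb, osdc);
     * tear down: ceph_osdc_unwatch(osdc, lreq); */
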
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index e55c08bc3a96..ddc426b22d81 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -24,21 +24,29 @@ struct ceph_pg {
24 uint32_t seed; 24 uint32_t seed;
25}; 25};
26 26
27#define CEPH_POOL_FLAG_HASHPSPOOL 1 27int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
28
29#define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id
30 together */
31#define CEPH_POOL_FLAG_FULL (1ULL << 1) /* pool is full */
28 32
29struct ceph_pg_pool_info { 33struct ceph_pg_pool_info {
30 struct rb_node node; 34 struct rb_node node;
31 s64 id; 35 s64 id;
32 u8 type; 36 u8 type; /* CEPH_POOL_TYPE_* */
33 u8 size; 37 u8 size;
38 u8 min_size;
34 u8 crush_ruleset; 39 u8 crush_ruleset;
35 u8 object_hash; 40 u8 object_hash;
41 u32 last_force_request_resend;
36 u32 pg_num, pgp_num; 42 u32 pg_num, pgp_num;
37 int pg_num_mask, pgp_num_mask; 43 int pg_num_mask, pgp_num_mask;
38 s64 read_tier; 44 s64 read_tier;
39 s64 write_tier; /* wins for read+write ops */ 45 s64 write_tier; /* wins for read+write ops */
40 u64 flags; 46 u64 flags; /* CEPH_POOL_FLAG_* */
41 char *name; 47 char *name;
48
49 bool was_full; /* for handle_one_map() */
42}; 50};
43 51
44static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool) 52static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
@@ -57,6 +65,22 @@ struct ceph_object_locator {
57 s64 pool; 65 s64 pool;
58}; 66};
59 67
68static inline void ceph_oloc_init(struct ceph_object_locator *oloc)
69{
70 oloc->pool = -1;
71}
72
73static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc)
74{
75 return oloc->pool == -1;
76}
77
78static inline void ceph_oloc_copy(struct ceph_object_locator *dest,
79 const struct ceph_object_locator *src)
80{
81 dest->pool = src->pool;
82}
83
60/* 84/*
61 * Maximum supported by kernel client object name length 85 * Maximum supported by kernel client object name length
62 * 86 *
@@ -64,11 +88,47 @@ struct ceph_object_locator {
64 */ 88 */
65#define CEPH_MAX_OID_NAME_LEN 100 89#define CEPH_MAX_OID_NAME_LEN 100
66 90
91/*
92 * 51-char inline_name is long enough for all cephfs and all but one
93 * rbd request: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be
94 * arbitrarily long (~PAGE_SIZE). It's done once during rbd map; all
95 * other rbd requests fit into inline_name.
96 *
97 * Makes ceph_object_id 64 bytes on 64-bit.
98 */
99#define CEPH_OID_INLINE_LEN 52
100
101/*
102 * Both inline and external buffers have space for a NUL-terminator,
103 * which is carried around. It's not required though - RADOS object
104 * names don't have to be NUL-terminated and may contain NULs.
105 */
67struct ceph_object_id { 106struct ceph_object_id {
68 char name[CEPH_MAX_OID_NAME_LEN]; 107 char *name;
108 char inline_name[CEPH_OID_INLINE_LEN];
69 int name_len; 109 int name_len;
70}; 110};
71 111
112static inline void ceph_oid_init(struct ceph_object_id *oid)
113{
114 oid->name = oid->inline_name;
115 oid->name_len = 0;
116}
117
118static inline bool ceph_oid_empty(const struct ceph_object_id *oid)
119{
120 return oid->name == oid->inline_name && !oid->name_len;
121}
122
123void ceph_oid_copy(struct ceph_object_id *dest,
124 const struct ceph_object_id *src);
125__printf(2, 3)
126void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...);
127__printf(3, 4)
128int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
129 const char *fmt, ...);
130void ceph_oid_destroy(struct ceph_object_id *oid);
131
72struct ceph_pg_mapping { 132struct ceph_pg_mapping {
73 struct rb_node node; 133 struct rb_node node;
74 struct ceph_pg pgid; 134 struct ceph_pg pgid;
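
ceph_object_id now defaults to its 52-byte inline buffer and falls back to an allocation only for oversized names. The intended lifecycle, as a sketch (set_image_oid and image_name are hypothetical):

    static int set_image_oid(struct ceph_object_id *oid, const char *image_name)
    {
            ceph_oid_init(oid);     /* name points at inline_name, len 0 */
            /* allocates an external buffer only if the formatted name
             * does not fit in CEPH_OID_INLINE_LEN */
            return ceph_oid_aprintf(oid, GFP_KERNEL, "rbd_id.%s", image_name);
    }

    /* ... pair every successful init/aprintf with ceph_oid_destroy(oid),
     * which frees the buffer iff one was allocated ... */

Keeping the common case inline is what lets struct ceph_osd_request_target (osd_client.h above) embed two oids by value without two fixed 100-byte name arrays.
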
@@ -87,7 +147,6 @@ struct ceph_pg_mapping {
87struct ceph_osdmap { 147struct ceph_osdmap {
88 struct ceph_fsid fsid; 148 struct ceph_fsid fsid;
89 u32 epoch; 149 u32 epoch;
90 u32 mkfs_epoch;
91 struct ceph_timespec created, modified; 150 struct ceph_timespec created, modified;
92 151
93 u32 flags; /* CEPH_OSDMAP_* */ 152 u32 flags; /* CEPH_OSDMAP_* */
@@ -113,43 +172,19 @@ struct ceph_osdmap {
113 int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3]; 172 int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
114}; 173};
115 174
116static inline void ceph_oid_set_name(struct ceph_object_id *oid, 175static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
117 const char *name)
118{
119 int len;
120
121 len = strlen(name);
122 if (len > sizeof(oid->name)) {
123 WARN(1, "ceph_oid_set_name '%s' len %d vs %zu, truncating\n",
124 name, len, sizeof(oid->name));
125 len = sizeof(oid->name);
126 }
127
128 memcpy(oid->name, name, len);
129 oid->name_len = len;
130}
131
132static inline void ceph_oid_copy(struct ceph_object_id *dest,
133 struct ceph_object_id *src)
134{
135 BUG_ON(src->name_len > sizeof(dest->name));
136 memcpy(dest->name, src->name, src->name_len);
137 dest->name_len = src->name_len;
138}
139
140static inline int ceph_osd_exists(struct ceph_osdmap *map, int osd)
141{ 176{
142 return osd >= 0 && osd < map->max_osd && 177 return osd >= 0 && osd < map->max_osd &&
143 (map->osd_state[osd] & CEPH_OSD_EXISTS); 178 (map->osd_state[osd] & CEPH_OSD_EXISTS);
144} 179}
145 180
146static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) 181static inline bool ceph_osd_is_up(struct ceph_osdmap *map, int osd)
147{ 182{
148 return ceph_osd_exists(map, osd) && 183 return ceph_osd_exists(map, osd) &&
149 (map->osd_state[osd] & CEPH_OSD_UP); 184 (map->osd_state[osd] & CEPH_OSD_UP);
150} 185}
151 186
152static inline int ceph_osd_is_down(struct ceph_osdmap *map, int osd) 187static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd)
153{ 188{
154 return !ceph_osd_is_up(map, osd); 189 return !ceph_osd_is_up(map, osd);
155} 190}
@@ -192,28 +227,59 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
192 return 0; 227 return 0;
193} 228}
194 229
230struct ceph_osdmap *ceph_osdmap_alloc(void);
195extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end); 231extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end);
196extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, 232struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
197 struct ceph_osdmap *map, 233 struct ceph_osdmap *map);
198 struct ceph_messenger *msgr);
199extern void ceph_osdmap_destroy(struct ceph_osdmap *map); 234extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
200 235
236struct ceph_osds {
237 int osds[CEPH_PG_MAX_SIZE];
238 int size;
239 int primary; /* id, NOT index */
240};
241
242static inline void ceph_osds_init(struct ceph_osds *set)
243{
244 set->size = 0;
245 set->primary = -1;
246}
247
248void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src);
249
250bool ceph_is_new_interval(const struct ceph_osds *old_acting,
251 const struct ceph_osds *new_acting,
252 const struct ceph_osds *old_up,
253 const struct ceph_osds *new_up,
254 int old_size,
255 int new_size,
256 int old_min_size,
257 int new_min_size,
258 u32 old_pg_num,
259 u32 new_pg_num,
260 bool old_sort_bitwise,
261 bool new_sort_bitwise,
262 const struct ceph_pg *pgid);
263bool ceph_osds_changed(const struct ceph_osds *old_acting,
264 const struct ceph_osds *new_acting,
265 bool any_change);
266
201/* calculate mapping of a file extent to an object */ 267/* calculate mapping of a file extent to an object */
202extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, 268extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
203 u64 off, u64 len, 269 u64 off, u64 len,
204 u64 *bno, u64 *oxoff, u64 *oxlen); 270 u64 *bno, u64 *oxoff, u64 *oxlen);
205 271
206/* calculate mapping of object to a placement group */ 272int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
207extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, 273 struct ceph_object_id *oid,
208 struct ceph_object_locator *oloc, 274 struct ceph_object_locator *oloc,
209 struct ceph_object_id *oid, 275 struct ceph_pg *raw_pgid);
210 struct ceph_pg *pg_out); 276
211 277void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
212extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, 278 const struct ceph_pg *raw_pgid,
213 struct ceph_pg pgid, 279 struct ceph_osds *up,
214 int *osds, int *primary); 280 struct ceph_osds *acting);
215extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, 281int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
216 struct ceph_pg pgid); 282 const struct ceph_pg *raw_pgid);
217 283
218extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, 284extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map,
219 u64 id); 285 u64 id);
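
The replacement calculation API splits the old one-shot ceph_oloc_oid_to_pg()/ceph_calc_pg_acting() path into explicit steps. A sketch of the intended flow, assuming osdmap, oid and oloc are already set up:

	struct ceph_pg raw_pgid;
	struct ceph_osds up, acting;
	int primary, ret;

	ret = ceph_object_locator_to_pg(osdmap, oid, oloc, &raw_pgid);
	if (ret)
		return ret;	/* e.g. the pool doesn't exist */

	ceph_pg_to_up_acting_osds(osdmap, &raw_pgid, &up, &acting);
	primary = ceph_pg_to_acting_primary(osdmap, &raw_pgid);
	/* primary is an osd id, presumably -1 when there is none,
	   mirroring ceph_osds_init() above */
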
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 2f822dca1046..5c0da61cb763 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -114,8 +114,8 @@ struct ceph_object_layout {
114 * compound epoch+version, used by storage layer to serialize mutations 114 * compound epoch+version, used by storage layer to serialize mutations
115 */ 115 */
116struct ceph_eversion { 116struct ceph_eversion {
117 __le32 epoch;
118 __le64 version; 117 __le64 version;
118 __le32 epoch;
119} __attribute__ ((packed)); 119} __attribute__ ((packed));
120 120
121/* 121/*
@@ -153,6 +153,11 @@ extern const char *ceph_osd_state_name(int s);
153#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */ 153#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */
154#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */ 154#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */
155#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */ 155#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */
156#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */
157#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */
158#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */
159#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */
160#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */
156 161
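These map-wide flags gate client behavior the same way the existing ones do; SORTBITWISE, for instance, is what feeds the old/new_sort_bitwise arguments of ceph_is_new_interval() in osdmap.h. A trivial sketch:

	bool sort_bitwise = osdmap->flags & CEPH_OSDMAP_SORTBITWISE;
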
157/* 162/*
158 * The error code to return when an OSD can't handle a write 163 * The error code to return when an OSD can't handle a write
@@ -389,6 +394,13 @@ enum {
389 CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */ 394 CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */
390 CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */ 395 CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */
391 CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */ 396 CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */
397 CEPH_OSD_FLAG_MAP_SNAP_CLONE = 0x80000, /* map snap direct to clone id */
398 CEPH_OSD_FLAG_ENFORCE_SNAPC = 0x100000, /* use snapc provided even if
399 pool uses pool snaps */
400 CEPH_OSD_FLAG_REDIRECTED = 0x200000, /* op has been redirected */
401 CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */
402 CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */
403 CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */
392}; 404};
393 405
394enum { 406enum {
@@ -415,7 +427,17 @@ enum {
415 CEPH_OSD_CMPXATTR_MODE_U64 = 2 427 CEPH_OSD_CMPXATTR_MODE_U64 = 2
416}; 428};
417 429
418#define RADOS_NOTIFY_VER 1 430enum {
431 CEPH_OSD_WATCH_OP_UNWATCH = 0,
432 CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1,
433 /* note: use only ODD ids to prevent pre-giant code from
434 interpreting the op as UNWATCH */
435 CEPH_OSD_WATCH_OP_WATCH = 3,
436 CEPH_OSD_WATCH_OP_RECONNECT = 5,
437 CEPH_OSD_WATCH_OP_PING = 7,
438};
439
440const char *ceph_osd_watch_op_name(int o);
419 441
420/* 442/*
421 * an individual object operation. each may be accompanied by some data 443 * an individual object operation. each may be accompanied by some data
@@ -450,10 +472,14 @@ struct ceph_osd_op {
450 } __attribute__ ((packed)) snap; 472 } __attribute__ ((packed)) snap;
451 struct { 473 struct {
452 __le64 cookie; 474 __le64 cookie;
453 __le64 ver; 475 __le64 ver; /* no longer used */
454 __u8 flag; /* 0 = unwatch, 1 = watch */ 476 __u8 op; /* CEPH_OSD_WATCH_OP_* */
477 __le32 gen; /* registration generation */
455 } __attribute__ ((packed)) watch; 478 } __attribute__ ((packed)) watch;
456 struct { 479 struct {
480 __le64 cookie;
481 } __attribute__ ((packed)) notify;
482 struct {
457 __le64 offset, length; 483 __le64 offset, length;
458 __le64 src_offset; 484 __le64 src_offset;
459 } __attribute__ ((packed)) clonerange; 485 } __attribute__ ((packed)) clonerange;
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index dcc18c6f7cf9..55d2bfee16d7 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -651,7 +651,7 @@ EXPORT_SYMBOL(ceph_destroy_client);
651/* 651/*
652 * true if we have the mon map (and have thus joined the cluster) 652 * true if we have the mon map (and have thus joined the cluster)
653 */ 653 */
654static int have_mon_and_osd_map(struct ceph_client *client) 654static bool have_mon_and_osd_map(struct ceph_client *client)
655{ 655{
656 return client->monc.monmap && client->monc.monmap->epoch && 656 return client->monc.monmap && client->monc.monmap->epoch &&
657 client->osdc.osdmap && client->osdc.osdmap->epoch; 657 client->osdc.osdmap && client->osdc.osdmap->epoch;
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
index 139a9cb19b0c..3773a4fa11e3 100644
--- a/net/ceph/ceph_strings.c
+++ b/net/ceph/ceph_strings.c
@@ -27,6 +27,22 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE)
27 } 27 }
28} 28}
29 29
30const char *ceph_osd_watch_op_name(int o)
31{
32 switch (o) {
33 case CEPH_OSD_WATCH_OP_UNWATCH:
34 return "unwatch";
35 case CEPH_OSD_WATCH_OP_WATCH:
36 return "watch";
37 case CEPH_OSD_WATCH_OP_RECONNECT:
38 return "reconnect";
39 case CEPH_OSD_WATCH_OP_PING:
40 return "ping";
41 default:
42 return "???";
43 }
44}
45
30const char *ceph_osd_state_name(int s) 46const char *ceph_osd_state_name(int s)
31{ 47{
32 switch (s) { 48 switch (s) {
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index b902fbc7863e..e77b04ca7802 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -54,24 +54,25 @@ static int osdmap_show(struct seq_file *s, void *p)
54{ 54{
55 int i; 55 int i;
56 struct ceph_client *client = s->private; 56 struct ceph_client *client = s->private;
57 struct ceph_osdmap *map = client->osdc.osdmap; 57 struct ceph_osd_client *osdc = &client->osdc;
58 struct ceph_osdmap *map = osdc->osdmap;
58 struct rb_node *n; 59 struct rb_node *n;
59 60
60 if (map == NULL) 61 if (map == NULL)
61 return 0; 62 return 0;
62 63
63 seq_printf(s, "epoch %d\n", map->epoch); 64 down_read(&osdc->lock);
64 seq_printf(s, "flags%s%s\n", 65 seq_printf(s, "epoch %d flags 0x%x\n", map->epoch, map->flags);
65 (map->flags & CEPH_OSDMAP_NEARFULL) ? " NEARFULL" : "",
66 (map->flags & CEPH_OSDMAP_FULL) ? " FULL" : "");
67 66
68 for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) { 67 for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) {
69 struct ceph_pg_pool_info *pool = 68 struct ceph_pg_pool_info *pi =
70 rb_entry(n, struct ceph_pg_pool_info, node); 69 rb_entry(n, struct ceph_pg_pool_info, node);
71 70
72 seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n", 71 seq_printf(s, "pool %lld '%s' type %d size %d min_size %d pg_num %u pg_num_mask %d flags 0x%llx lfor %u read_tier %lld write_tier %lld\n",
73 pool->id, pool->pg_num, pool->pg_num_mask, 72 pi->id, pi->name, pi->type, pi->size, pi->min_size,
74 pool->read_tier, pool->write_tier); 73 pi->pg_num, pi->pg_num_mask, pi->flags,
74 pi->last_force_request_resend, pi->read_tier,
75 pi->write_tier);
75 } 76 }
76 for (i = 0; i < map->max_osd; i++) { 77 for (i = 0; i < map->max_osd; i++) {
77 struct ceph_entity_addr *addr = &map->osd_addr[i]; 78 struct ceph_entity_addr *addr = &map->osd_addr[i];
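
With the new format strings, the osdmap dump in debugfs might begin like this (all values hypothetical, shaped purely by the seq_printf() calls above):

	epoch 13 flags 0x8000
	pool 1 'rbd' type 1 size 3 min_size 2 pg_num 64 pg_num_mask 63 flags 0x1 lfor 0 read_tier -1 write_tier -1
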
@@ -103,6 +104,7 @@ static int osdmap_show(struct seq_file *s, void *p)
103 pg->pgid.seed, pg->primary_temp.osd); 104 pg->pgid.seed, pg->primary_temp.osd);
104 } 105 }
105 106
107 up_read(&osdc->lock);
106 return 0; 108 return 0;
107} 109}
108 110
@@ -126,6 +128,7 @@ static int monc_show(struct seq_file *s, void *p)
126 CEPH_SUBSCRIBE_ONETIME ? "" : "+")); 128 CEPH_SUBSCRIBE_ONETIME ? "" : "+"));
127 seq_putc(s, '\n'); 129 seq_putc(s, '\n');
128 } 130 }
131 seq_printf(s, "fs_cluster_id %d\n", monc->fs_cluster_id);
129 132
130 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { 133 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
131 __u16 op; 134 __u16 op;
@@ -143,43 +146,113 @@ static int monc_show(struct seq_file *s, void *p)
143 return 0; 146 return 0;
144} 147}
145 148
146static int osdc_show(struct seq_file *s, void *pp) 149static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t)
147{ 150{
148 struct ceph_client *client = s->private; 151 int i;
149 struct ceph_osd_client *osdc = &client->osdc;
150 struct rb_node *p;
151 152
152 mutex_lock(&osdc->request_mutex); 153 seq_printf(s, "osd%d\t%llu.%x\t[", t->osd, t->pgid.pool, t->pgid.seed);
153 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { 154 for (i = 0; i < t->up.size; i++)
154 struct ceph_osd_request *req; 155 seq_printf(s, "%s%d", (!i ? "" : ","), t->up.osds[i]);
155 unsigned int i; 156 seq_printf(s, "]/%d\t[", t->up.primary);
156 int opcode; 157 for (i = 0; i < t->acting.size; i++)
158 seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]);
159 seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary,
160 t->target_oid.name_len, t->target_oid.name, t->flags);
161 if (t->paused)
162 seq_puts(s, "\tP");
163}
157 164
158 req = rb_entry(p, struct ceph_osd_request, r_node); 165static void dump_request(struct seq_file *s, struct ceph_osd_request *req)
166{
167 int i;
159 168
160 seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid, 169 seq_printf(s, "%llu\t", req->r_tid);
161 req->r_osd ? req->r_osd->o_osd : -1, 170 dump_target(s, &req->r_t);
162 req->r_pgid.pool, req->r_pgid.seed);
163 171
164 seq_printf(s, "%.*s", req->r_base_oid.name_len, 172 seq_printf(s, "\t%d\t%u'%llu", req->r_attempts,
165 req->r_base_oid.name); 173 le32_to_cpu(req->r_replay_version.epoch),
174 le64_to_cpu(req->r_replay_version.version));
166 175
167 if (req->r_reassert_version.epoch) 176 for (i = 0; i < req->r_num_ops; i++) {
168 seq_printf(s, "\t%u'%llu", 177 struct ceph_osd_req_op *op = &req->r_ops[i];
169 (unsigned int)le32_to_cpu(req->r_reassert_version.epoch), 178
170 le64_to_cpu(req->r_reassert_version.version)); 179 seq_printf(s, "%s%s", (i == 0 ? "\t" : ","),
171 else 180 ceph_osd_op_name(op->op));
172 seq_printf(s, "\t"); 181 if (op->op == CEPH_OSD_OP_WATCH)
182 seq_printf(s, "-%s",
183 ceph_osd_watch_op_name(op->watch.op));
184 }
185
186 seq_putc(s, '\n');
187}
188
189static void dump_requests(struct seq_file *s, struct ceph_osd *osd)
190{
191 struct rb_node *n;
192
193 mutex_lock(&osd->lock);
194 for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
195 struct ceph_osd_request *req =
196 rb_entry(n, struct ceph_osd_request, r_node);
197
198 dump_request(s, req);
199 }
200
201 mutex_unlock(&osd->lock);
202}
173 203
174 for (i = 0; i < req->r_num_ops; i++) { 204static void dump_linger_request(struct seq_file *s,
175 opcode = req->r_ops[i].op; 205 struct ceph_osd_linger_request *lreq)
176 seq_printf(s, "%s%s", (i == 0 ? "\t" : ","), 206{
177 ceph_osd_op_name(opcode)); 207 seq_printf(s, "%llu\t", lreq->linger_id);
178 } 208 dump_target(s, &lreq->t);
209
210 seq_printf(s, "\t%u\t%s%s/%d\n", lreq->register_gen,
211 lreq->is_watch ? "W" : "N", lreq->committed ? "C" : "",
212 lreq->last_error);
213}
214
215static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd)
216{
217 struct rb_node *n;
218
219 mutex_lock(&osd->lock);
220 for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
221 struct ceph_osd_linger_request *lreq =
222 rb_entry(n, struct ceph_osd_linger_request, node);
223
224 dump_linger_request(s, lreq);
225 }
226
227 mutex_unlock(&osd->lock);
228}
179 229
180 seq_printf(s, "\n"); 230static int osdc_show(struct seq_file *s, void *pp)
231{
232 struct ceph_client *client = s->private;
233 struct ceph_osd_client *osdc = &client->osdc;
234 struct rb_node *n;
235
236 down_read(&osdc->lock);
237 seq_printf(s, "REQUESTS %d homeless %d\n",
238 atomic_read(&osdc->num_requests),
239 atomic_read(&osdc->num_homeless));
240 for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
241 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
242
243 dump_requests(s, osd);
181 } 244 }
182 mutex_unlock(&osdc->request_mutex); 245 dump_requests(s, &osdc->homeless_osd);
246
247 seq_puts(s, "LINGER REQUESTS\n");
248 for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
249 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
250
251 dump_linger_requests(s, osd);
252 }
253 dump_linger_requests(s, &osdc->homeless_osd);
254
255 up_read(&osdc->lock);
183 return 0; 256 return 0;
184} 257}
185 258
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index cf638c009cfa..37c38a7fb5c5 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -260,20 +260,26 @@ static void __send_subscribe(struct ceph_mon_client *monc)
260 BUG_ON(num < 1); /* monmap sub is always there */ 260 BUG_ON(num < 1); /* monmap sub is always there */
261 ceph_encode_32(&p, num); 261 ceph_encode_32(&p, num);
262 for (i = 0; i < ARRAY_SIZE(monc->subs); i++) { 262 for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
263 const char *s = ceph_sub_str[i]; 263 char buf[32];
264 int len;
264 265
265 if (!monc->subs[i].want) 266 if (!monc->subs[i].want)
266 continue; 267 continue;
267 268
268 dout("%s %s start %llu flags 0x%x\n", __func__, s, 269 len = sprintf(buf, "%s", ceph_sub_str[i]);
270 if (i == CEPH_SUB_MDSMAP &&
271 monc->fs_cluster_id != CEPH_FS_CLUSTER_ID_NONE)
272 len += sprintf(buf + len, ".%d", monc->fs_cluster_id);
273
274 dout("%s %s start %llu flags 0x%x\n", __func__, buf,
269 le64_to_cpu(monc->subs[i].item.start), 275 le64_to_cpu(monc->subs[i].item.start),
270 monc->subs[i].item.flags); 276 monc->subs[i].item.flags);
271 ceph_encode_string(&p, end, s, strlen(s)); 277 ceph_encode_string(&p, end, buf, len);
272 memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item)); 278 memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item));
273 p += sizeof(monc->subs[i].item); 279 p += sizeof(monc->subs[i].item);
274 } 280 }
275 281
276 BUG_ON(p != (end - 35 - (ARRAY_SIZE(monc->subs) - num) * 19)); 282 BUG_ON(p > end);
277 msg->front.iov_len = p - msg->front.iov_base; 283 msg->front.iov_len = p - msg->front.iov_base;
278 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 284 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
279 ceph_msg_revoke(msg); 285 ceph_msg_revoke(msg);
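
The net effect is that the mdsmap subscription name carries the filesystem cluster id when one is set, e.g. (hypothetical id):

	fs_cluster_id == 1                        ->  subscribes to "mdsmap.1"
	fs_cluster_id == CEPH_FS_CLUSTER_ID_NONE  ->  subscribes to "mdsmap"
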
@@ -376,19 +382,13 @@ void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch)
376} 382}
377EXPORT_SYMBOL(ceph_monc_got_map); 383EXPORT_SYMBOL(ceph_monc_got_map);
378 384
379/* 385void ceph_monc_renew_subs(struct ceph_mon_client *monc)
380 * Register interest in the next osdmap
381 */
382void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
383{ 386{
384 dout("%s have %u\n", __func__, monc->subs[CEPH_SUB_OSDMAP].have);
385 mutex_lock(&monc->mutex); 387 mutex_lock(&monc->mutex);
386 if (__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP, 388 __send_subscribe(monc);
387 monc->subs[CEPH_SUB_OSDMAP].have + 1, false))
388 __send_subscribe(monc);
389 mutex_unlock(&monc->mutex); 389 mutex_unlock(&monc->mutex);
390} 390}
391EXPORT_SYMBOL(ceph_monc_request_next_osdmap); 391EXPORT_SYMBOL(ceph_monc_renew_subs);
392 392
393/* 393/*
394 * Wait for an osdmap with a given epoch. 394 * Wait for an osdmap with a given epoch.
@@ -478,51 +478,17 @@ out:
478/* 478/*
479 * generic requests (currently statfs, mon_get_version) 479 * generic requests (currently statfs, mon_get_version)
480 */ 480 */
481static struct ceph_mon_generic_request *__lookup_generic_req( 481DEFINE_RB_FUNCS(generic_request, struct ceph_mon_generic_request, tid, node)
482 struct ceph_mon_client *monc, u64 tid)
483{
484 struct ceph_mon_generic_request *req;
485 struct rb_node *n = monc->generic_request_tree.rb_node;
486
487 while (n) {
488 req = rb_entry(n, struct ceph_mon_generic_request, node);
489 if (tid < req->tid)
490 n = n->rb_left;
491 else if (tid > req->tid)
492 n = n->rb_right;
493 else
494 return req;
495 }
496 return NULL;
497}
498
499static void __insert_generic_request(struct ceph_mon_client *monc,
500 struct ceph_mon_generic_request *new)
501{
502 struct rb_node **p = &monc->generic_request_tree.rb_node;
503 struct rb_node *parent = NULL;
504 struct ceph_mon_generic_request *req = NULL;
505
506 while (*p) {
507 parent = *p;
508 req = rb_entry(parent, struct ceph_mon_generic_request, node);
509 if (new->tid < req->tid)
510 p = &(*p)->rb_left;
511 else if (new->tid > req->tid)
512 p = &(*p)->rb_right;
513 else
514 BUG();
515 }
516
517 rb_link_node(&new->node, parent, p);
518 rb_insert_color(&new->node, &monc->generic_request_tree);
519}
520 482
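DEFINE_RB_FUNCS replaces the hand-rolled tree walkers above. Judging by the call sites in this patch, the macro generates static helpers keyed on the named field, roughly:

	static void insert_generic_request(struct rb_root *root,
					   struct ceph_mon_generic_request *req);
	static void erase_generic_request(struct rb_root *root,
					  struct ceph_mon_generic_request *req);
	static struct ceph_mon_generic_request *
	lookup_generic_request(struct rb_root *root, u64 tid);
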
521static void release_generic_request(struct kref *kref) 483static void release_generic_request(struct kref *kref)
522{ 484{
523 struct ceph_mon_generic_request *req = 485 struct ceph_mon_generic_request *req =
524 container_of(kref, struct ceph_mon_generic_request, kref); 486 container_of(kref, struct ceph_mon_generic_request, kref);
525 487
488 dout("%s greq %p request %p reply %p\n", __func__, req, req->request,
489 req->reply);
490 WARN_ON(!RB_EMPTY_NODE(&req->node));
491
526 if (req->reply) 492 if (req->reply)
527 ceph_msg_put(req->reply); 493 ceph_msg_put(req->reply);
528 if (req->request) 494 if (req->request)
@@ -533,7 +499,8 @@ static void release_generic_request(struct kref *kref)
533 499
534static void put_generic_request(struct ceph_mon_generic_request *req) 500static void put_generic_request(struct ceph_mon_generic_request *req)
535{ 501{
536 kref_put(&req->kref, release_generic_request); 502 if (req)
503 kref_put(&req->kref, release_generic_request);
537} 504}
538 505
539static void get_generic_request(struct ceph_mon_generic_request *req) 506static void get_generic_request(struct ceph_mon_generic_request *req)
@@ -541,6 +508,103 @@ static void get_generic_request(struct ceph_mon_generic_request *req)
541 kref_get(&req->kref); 508 kref_get(&req->kref);
542} 509}
543 510
511static struct ceph_mon_generic_request *
512alloc_generic_request(struct ceph_mon_client *monc, gfp_t gfp)
513{
514 struct ceph_mon_generic_request *req;
515
516 req = kzalloc(sizeof(*req), gfp);
517 if (!req)
518 return NULL;
519
520 req->monc = monc;
521 kref_init(&req->kref);
522 RB_CLEAR_NODE(&req->node);
523 init_completion(&req->completion);
524
525 dout("%s greq %p\n", __func__, req);
526 return req;
527}
528
529static void register_generic_request(struct ceph_mon_generic_request *req)
530{
531 struct ceph_mon_client *monc = req->monc;
532
533 WARN_ON(req->tid);
534
535 get_generic_request(req);
536 req->tid = ++monc->last_tid;
537 insert_generic_request(&monc->generic_request_tree, req);
538}
539
540static void send_generic_request(struct ceph_mon_client *monc,
541 struct ceph_mon_generic_request *req)
542{
543 WARN_ON(!req->tid);
544
545 dout("%s greq %p tid %llu\n", __func__, req, req->tid);
546 req->request->hdr.tid = cpu_to_le64(req->tid);
547 ceph_con_send(&monc->con, ceph_msg_get(req->request));
548}
549
550static void __finish_generic_request(struct ceph_mon_generic_request *req)
551{
552 struct ceph_mon_client *monc = req->monc;
553
554 dout("%s greq %p tid %llu\n", __func__, req, req->tid);
555 erase_generic_request(&monc->generic_request_tree, req);
556
557 ceph_msg_revoke(req->request);
558 ceph_msg_revoke_incoming(req->reply);
559}
560
561static void finish_generic_request(struct ceph_mon_generic_request *req)
562{
563 __finish_generic_request(req);
564 put_generic_request(req);
565}
566
567static void complete_generic_request(struct ceph_mon_generic_request *req)
568{
569 if (req->complete_cb)
570 req->complete_cb(req);
571 else
572 complete_all(&req->completion);
573 put_generic_request(req);
574}
575
576void cancel_generic_request(struct ceph_mon_generic_request *req)
577{
578 struct ceph_mon_client *monc = req->monc;
579 struct ceph_mon_generic_request *lookup_req;
580
581 dout("%s greq %p tid %llu\n", __func__, req, req->tid);
582
583 mutex_lock(&monc->mutex);
584 lookup_req = lookup_generic_request(&monc->generic_request_tree,
585 req->tid);
586 if (lookup_req) {
587 WARN_ON(lookup_req != req);
588 finish_generic_request(req);
589 }
590
591 mutex_unlock(&monc->mutex);
592}
593
594static int wait_generic_request(struct ceph_mon_generic_request *req)
595{
596 int ret;
597
598 dout("%s greq %p tid %llu\n", __func__, req, req->tid);
599 ret = wait_for_completion_interruptible(&req->completion);
600 if (ret)
601 cancel_generic_request(req);
602 else
603 ret = req->result; /* completed */
604
605 return ret;
606}
607
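These helpers compose into one canonical sequence, which ceph_monc_do_statfs() below follows. In outline (message allocation and payload encoding elided):

	req = alloc_generic_request(monc, GFP_NOFS);
	/* ... allocate req->request and req->reply messages ... */

	mutex_lock(&monc->mutex);
	register_generic_request(req);	/* assigns tid, inserts into the tree */
	/* ... fill out the request payload ... */
	send_generic_request(monc, req);
	mutex_unlock(&monc->mutex);

	ret = wait_generic_request(req);	/* cancels itself if interrupted */
	put_generic_request(req);
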
544static struct ceph_msg *get_generic_reply(struct ceph_connection *con, 608static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
545 struct ceph_msg_header *hdr, 609 struct ceph_msg_header *hdr,
546 int *skip) 610 int *skip)
@@ -551,7 +615,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
551 struct ceph_msg *m; 615 struct ceph_msg *m;
552 616
553 mutex_lock(&monc->mutex); 617 mutex_lock(&monc->mutex);
554 req = __lookup_generic_req(monc, tid); 618 req = lookup_generic_request(&monc->generic_request_tree, tid);
555 if (!req) { 619 if (!req) {
556 dout("get_generic_reply %lld dne\n", tid); 620 dout("get_generic_reply %lld dne\n", tid);
557 *skip = 1; 621 *skip = 1;
@@ -570,42 +634,6 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
570 return m; 634 return m;
571} 635}
572 636
573static int __do_generic_request(struct ceph_mon_client *monc, u64 tid,
574 struct ceph_mon_generic_request *req)
575{
576 int err;
577
578 /* register request */
579 req->tid = tid != 0 ? tid : ++monc->last_tid;
580 req->request->hdr.tid = cpu_to_le64(req->tid);
581 __insert_generic_request(monc, req);
582 monc->num_generic_requests++;
583 ceph_con_send(&monc->con, ceph_msg_get(req->request));
584 mutex_unlock(&monc->mutex);
585
586 err = wait_for_completion_interruptible(&req->completion);
587
588 mutex_lock(&monc->mutex);
589 rb_erase(&req->node, &monc->generic_request_tree);
590 monc->num_generic_requests--;
591
592 if (!err)
593 err = req->result;
594 return err;
595}
596
597static int do_generic_request(struct ceph_mon_client *monc,
598 struct ceph_mon_generic_request *req)
599{
600 int err;
601
602 mutex_lock(&monc->mutex);
603 err = __do_generic_request(monc, 0, req);
604 mutex_unlock(&monc->mutex);
605
606 return err;
607}
608
609/* 637/*
610 * statfs 638 * statfs
611 */ 639 */
@@ -616,22 +644,24 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
616 struct ceph_mon_statfs_reply *reply = msg->front.iov_base; 644 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
617 u64 tid = le64_to_cpu(msg->hdr.tid); 645 u64 tid = le64_to_cpu(msg->hdr.tid);
618 646
647 dout("%s msg %p tid %llu\n", __func__, msg, tid);
648
619 if (msg->front.iov_len != sizeof(*reply)) 649 if (msg->front.iov_len != sizeof(*reply))
620 goto bad; 650 goto bad;
621 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
622 651
623 mutex_lock(&monc->mutex); 652 mutex_lock(&monc->mutex);
624 req = __lookup_generic_req(monc, tid); 653 req = lookup_generic_request(&monc->generic_request_tree, tid);
625 if (req) { 654 if (!req) {
626 *(struct ceph_statfs *)req->buf = reply->st; 655 mutex_unlock(&monc->mutex);
627 req->result = 0; 656 return;
628 get_generic_request(req);
629 } 657 }
658
659 req->result = 0;
660 *req->u.st = reply->st; /* struct */
661 __finish_generic_request(req);
630 mutex_unlock(&monc->mutex); 662 mutex_unlock(&monc->mutex);
631 if (req) { 663
632 complete_all(&req->completion); 664 complete_generic_request(req);
633 put_generic_request(req);
634 }
635 return; 665 return;
636 666
637bad: 667bad:
@@ -646,38 +676,38 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
646{ 676{
647 struct ceph_mon_generic_request *req; 677 struct ceph_mon_generic_request *req;
648 struct ceph_mon_statfs *h; 678 struct ceph_mon_statfs *h;
649 int err; 679 int ret = -ENOMEM;
650 680
651 req = kzalloc(sizeof(*req), GFP_NOFS); 681 req = alloc_generic_request(monc, GFP_NOFS);
652 if (!req) 682 if (!req)
653 return -ENOMEM; 683 goto out;
654
655 kref_init(&req->kref);
656 req->buf = buf;
657 init_completion(&req->completion);
658 684
659 err = -ENOMEM;
660 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS, 685 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS,
661 true); 686 true);
662 if (!req->request) 687 if (!req->request)
663 goto out; 688 goto out;
664 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS, 689
665 true); 690 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 64, GFP_NOFS, true);
666 if (!req->reply) 691 if (!req->reply)
667 goto out; 692 goto out;
668 693
694 req->u.st = buf;
695
696 mutex_lock(&monc->mutex);
697 register_generic_request(req);
669 /* fill out request */ 698 /* fill out request */
670 h = req->request->front.iov_base; 699 h = req->request->front.iov_base;
671 h->monhdr.have_version = 0; 700 h->monhdr.have_version = 0;
672 h->monhdr.session_mon = cpu_to_le16(-1); 701 h->monhdr.session_mon = cpu_to_le16(-1);
673 h->monhdr.session_mon_tid = 0; 702 h->monhdr.session_mon_tid = 0;
674 h->fsid = monc->monmap->fsid; 703 h->fsid = monc->monmap->fsid;
704 send_generic_request(monc, req);
705 mutex_unlock(&monc->mutex);
675 706
676 err = do_generic_request(monc, req); 707 ret = wait_generic_request(req);
677
678out: 708out:
679 put_generic_request(req); 709 put_generic_request(req);
680 return err; 710 return ret;
681} 711}
682EXPORT_SYMBOL(ceph_monc_do_statfs); 712EXPORT_SYMBOL(ceph_monc_do_statfs);
683 713
@@ -690,7 +720,7 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
690 void *end = p + msg->front_alloc_len; 720 void *end = p + msg->front_alloc_len;
691 u64 handle; 721 u64 handle;
692 722
693 dout("%s %p tid %llu\n", __func__, msg, tid); 723 dout("%s msg %p tid %llu\n", __func__, msg, tid);
694 724
695 ceph_decode_need(&p, end, 2*sizeof(u64), bad); 725 ceph_decode_need(&p, end, 2*sizeof(u64), bad);
696 handle = ceph_decode_64(&p); 726 handle = ceph_decode_64(&p);
@@ -698,77 +728,111 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
698 goto bad; 728 goto bad;
699 729
700 mutex_lock(&monc->mutex); 730 mutex_lock(&monc->mutex);
701 req = __lookup_generic_req(monc, handle); 731 req = lookup_generic_request(&monc->generic_request_tree, handle);
702 if (req) { 732 if (!req) {
703 *(u64 *)req->buf = ceph_decode_64(&p); 733 mutex_unlock(&monc->mutex);
704 req->result = 0; 734 return;
705 get_generic_request(req);
706 } 735 }
736
737 req->result = 0;
738 req->u.newest = ceph_decode_64(&p);
739 __finish_generic_request(req);
707 mutex_unlock(&monc->mutex); 740 mutex_unlock(&monc->mutex);
708 if (req) {
709 complete_all(&req->completion);
710 put_generic_request(req);
711 }
712 741
742 complete_generic_request(req);
713 return; 743 return;
744
714bad: 745bad:
715 pr_err("corrupt mon_get_version reply, tid %llu\n", tid); 746 pr_err("corrupt mon_get_version reply, tid %llu\n", tid);
716 ceph_msg_dump(msg); 747 ceph_msg_dump(msg);
717} 748}
718 749
719/* 750static struct ceph_mon_generic_request *
720 * Send MMonGetVersion and wait for the reply. 751__ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
721 * 752 ceph_monc_callback_t cb, u64 private_data)
722 * @what: one of "mdsmap", "osdmap" or "monmap"
723 */
724int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what,
725 u64 *newest)
726{ 753{
727 struct ceph_mon_generic_request *req; 754 struct ceph_mon_generic_request *req;
728 void *p, *end;
729 u64 tid;
730 int err;
731 755
732 req = kzalloc(sizeof(*req), GFP_NOFS); 756 req = alloc_generic_request(monc, GFP_NOIO);
733 if (!req) 757 if (!req)
734 return -ENOMEM; 758 goto err_put_req;
735
736 kref_init(&req->kref);
737 req->buf = newest;
738 init_completion(&req->completion);
739 759
740 req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION, 760 req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION,
741 sizeof(u64) + sizeof(u32) + strlen(what), 761 sizeof(u64) + sizeof(u32) + strlen(what),
742 GFP_NOFS, true); 762 GFP_NOIO, true);
743 if (!req->request) { 763 if (!req->request)
744 err = -ENOMEM; 764 goto err_put_req;
745 goto out;
746 }
747 765
748 req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024, 766 req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 32, GFP_NOIO,
749 GFP_NOFS, true); 767 true);
750 if (!req->reply) { 768 if (!req->reply)
751 err = -ENOMEM; 769 goto err_put_req;
752 goto out;
753 }
754 770
755 p = req->request->front.iov_base; 771 req->complete_cb = cb;
756 end = p + req->request->front_alloc_len; 772 req->private_data = private_data;
757 773
758 /* fill out request */
759 mutex_lock(&monc->mutex); 774 mutex_lock(&monc->mutex);
760 tid = ++monc->last_tid; 775 register_generic_request(req);
761 ceph_encode_64(&p, tid); /* handle */ 776 {
762 ceph_encode_string(&p, end, what, strlen(what)); 777 void *p = req->request->front.iov_base;
778 void *const end = p + req->request->front_alloc_len;
779
780 ceph_encode_64(&p, req->tid); /* handle */
781 ceph_encode_string(&p, end, what, strlen(what));
782 WARN_ON(p != end);
783 }
784 send_generic_request(monc, req);
785 mutex_unlock(&monc->mutex);
763 786
764 err = __do_generic_request(monc, tid, req); 787 return req;
765 788
766 mutex_unlock(&monc->mutex); 789err_put_req:
767out:
768 put_generic_request(req); 790 put_generic_request(req);
769 return err; 791 return ERR_PTR(-ENOMEM);
792}
793
794/*
795 * Send MMonGetVersion and wait for the reply.
796 *
797 * @what: one of "mdsmap", "osdmap" or "monmap"
798 */
799int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
800 u64 *newest)
801{
802 struct ceph_mon_generic_request *req;
803 int ret;
804
805 req = __ceph_monc_get_version(monc, what, NULL, 0);
806 if (IS_ERR(req))
807 return PTR_ERR(req);
808
809 ret = wait_generic_request(req);
810 if (!ret)
811 *newest = req->u.newest;
812
813 put_generic_request(req);
814 return ret;
770} 815}
771EXPORT_SYMBOL(ceph_monc_do_get_version); 816EXPORT_SYMBOL(ceph_monc_get_version);
817
818/*
819 * Send MMonGetVersion,
820 *
821 * @what: one of "mdsmap", "osdmap" or "monmap"
822 */
823int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
824 ceph_monc_callback_t cb, u64 private_data)
825{
826 struct ceph_mon_generic_request *req;
827
828 req = __ceph_monc_get_version(monc, what, cb, private_data);
829 if (IS_ERR(req))
830 return PTR_ERR(req);
831
832 put_generic_request(req);
833 return 0;
834}
835EXPORT_SYMBOL(ceph_monc_get_version_async);
772 836
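A sketch of the async variant in use. The callback shape is an assumption based on complete_generic_request() above, which invokes it with the finished request; got_version and the pr_info line are hypothetical:

	static void got_version(struct ceph_mon_generic_request *req)
	{
		if (!req->result)
			pr_info("newest osdmap epoch %llu\n", req->u.newest);
		/* req->private_data carries the caller's cookie */
	}

	ret = ceph_monc_get_version_async(monc, "osdmap", got_version, 0);
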
773/* 837/*
774 * Resend pending generic requests. 838 * Resend pending generic requests.
@@ -890,7 +954,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
890 if (!monc->m_subscribe_ack) 954 if (!monc->m_subscribe_ack)
891 goto out_auth; 955 goto out_auth;
892 956
893 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS, 957 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 128, GFP_NOFS,
894 true); 958 true);
895 if (!monc->m_subscribe) 959 if (!monc->m_subscribe)
896 goto out_subscribe_ack; 960 goto out_subscribe_ack;
@@ -914,9 +978,10 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
914 978
915 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); 979 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
916 monc->generic_request_tree = RB_ROOT; 980 monc->generic_request_tree = RB_ROOT;
917 monc->num_generic_requests = 0;
918 monc->last_tid = 0; 981 monc->last_tid = 0;
919 982
983 monc->fs_cluster_id = CEPH_FS_CLUSTER_ID_NONE;
984
920 return 0; 985 return 0;
921 986
922out_auth_reply: 987out_auth_reply:
@@ -954,6 +1019,8 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
954 1019
955 ceph_auth_destroy(monc->auth); 1020 ceph_auth_destroy(monc->auth);
956 1021
1022 WARN_ON(!RB_EMPTY_ROOT(&monc->generic_request_tree));
1023
957 ceph_msg_put(monc->m_auth); 1024 ceph_msg_put(monc->m_auth);
958 ceph_msg_put(monc->m_auth_reply); 1025 ceph_msg_put(monc->m_auth_reply);
959 ceph_msg_put(monc->m_subscribe); 1026 ceph_msg_put(monc->m_subscribe);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 40a53a70efdf..0160d7d09a1e 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -19,25 +19,12 @@
19#include <linux/ceph/auth.h> 19#include <linux/ceph/auth.h>
20#include <linux/ceph/pagelist.h> 20#include <linux/ceph/pagelist.h>
21 21
22#define OSD_OP_FRONT_LEN 4096
23#define OSD_OPREPLY_FRONT_LEN 512 22#define OSD_OPREPLY_FRONT_LEN 512
24 23
25static struct kmem_cache *ceph_osd_request_cache; 24static struct kmem_cache *ceph_osd_request_cache;
26 25
27static const struct ceph_connection_operations osd_con_ops; 26static const struct ceph_connection_operations osd_con_ops;
28 27
29static void __send_queued(struct ceph_osd_client *osdc);
30static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd);
31static void __register_request(struct ceph_osd_client *osdc,
32 struct ceph_osd_request *req);
33static void __unregister_request(struct ceph_osd_client *osdc,
34 struct ceph_osd_request *req);
35static void __unregister_linger_request(struct ceph_osd_client *osdc,
36 struct ceph_osd_request *req);
37static void __enqueue_request(struct ceph_osd_request *req);
38static void __send_request(struct ceph_osd_client *osdc,
39 struct ceph_osd_request *req);
40
41/* 28/*
42 * Implement client access to distributed object storage cluster. 29 * Implement client access to distributed object storage cluster.
43 * 30 *
@@ -56,6 +43,52 @@ static void __send_request(struct ceph_osd_client *osdc,
56 * channel with an OSD is reset. 43 * channel with an OSD is reset.
57 */ 44 */
58 45
46static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req);
47static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req);
48static void link_linger(struct ceph_osd *osd,
49 struct ceph_osd_linger_request *lreq);
50static void unlink_linger(struct ceph_osd *osd,
51 struct ceph_osd_linger_request *lreq);
52
53#if 1
54static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
55{
56 bool wrlocked = true;
57
58 if (unlikely(down_read_trylock(sem))) {
59 wrlocked = false;
60 up_read(sem);
61 }
62
63 return wrlocked;
64}
65static inline void verify_osdc_locked(struct ceph_osd_client *osdc)
66{
67 WARN_ON(!rwsem_is_locked(&osdc->lock));
68}
69static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc)
70{
71 WARN_ON(!rwsem_is_wrlocked(&osdc->lock));
72}
73static inline void verify_osd_locked(struct ceph_osd *osd)
74{
75 struct ceph_osd_client *osdc = osd->o_osdc;
76
77 WARN_ON(!(mutex_is_locked(&osd->lock) &&
78 rwsem_is_locked(&osdc->lock)) &&
79 !rwsem_is_wrlocked(&osdc->lock));
80}
81static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq)
82{
83 WARN_ON(!mutex_is_locked(&lreq->lock));
84}
85#else
86static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { }
87static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { }
88static inline void verify_osd_locked(struct ceph_osd *osd) { }
89static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) { }
90#endif
91
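The wrlocked test exploits a basic rwsem property: down_read_trylock() cannot succeed while a writer holds the semaphore, so a successful trylock proves the lock was not write-held. A sketch of the checks in use:

	down_write(&osdc->lock);
	verify_osdc_wrlocked(osdc);	/* quiet: trylock fails under a writer */
	up_write(&osdc->lock);

	down_read(&osdc->lock);
	verify_osdc_locked(osdc);	/* read- or write-locked both satisfy this */
	up_read(&osdc->lock);
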
59/* 92/*
60 * calculate the mapping of a file extent onto an object, and fill out the 93 * calculate the mapping of a file extent onto an object, and fill out the
61 * request accordingly. shorten extent as necessary if it crosses an 94 * request accordingly. shorten extent as necessary if it crosses an
@@ -144,14 +177,6 @@ osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req,
144} 177}
145EXPORT_SYMBOL(osd_req_op_extent_osd_data); 178EXPORT_SYMBOL(osd_req_op_extent_osd_data);
146 179
147struct ceph_osd_data *
148osd_req_op_cls_response_data(struct ceph_osd_request *osd_req,
149 unsigned int which)
150{
151 return osd_req_op_data(osd_req, which, cls, response_data);
152}
153EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ??? */
154
155void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req, 180void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req,
156 unsigned int which, struct page **pages, 181 unsigned int which, struct page **pages,
157 u64 length, u32 alignment, 182 u64 length, u32 alignment,
@@ -218,6 +243,8 @@ void osd_req_op_cls_request_data_pagelist(
218 243
219 osd_data = osd_req_op_data(osd_req, which, cls, request_data); 244 osd_data = osd_req_op_data(osd_req, which, cls, request_data);
220 ceph_osd_data_pagelist_init(osd_data, pagelist); 245 ceph_osd_data_pagelist_init(osd_data, pagelist);
246 osd_req->r_ops[which].cls.indata_len += pagelist->length;
247 osd_req->r_ops[which].indata_len += pagelist->length;
221} 248}
222EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist); 249EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist);
223 250
@@ -230,6 +257,8 @@ void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
230 osd_data = osd_req_op_data(osd_req, which, cls, request_data); 257 osd_data = osd_req_op_data(osd_req, which, cls, request_data);
231 ceph_osd_data_pages_init(osd_data, pages, length, alignment, 258 ceph_osd_data_pages_init(osd_data, pages, length, alignment,
232 pages_from_pool, own_pages); 259 pages_from_pool, own_pages);
260 osd_req->r_ops[which].cls.indata_len += length;
261 osd_req->r_ops[which].indata_len += length;
233} 262}
234EXPORT_SYMBOL(osd_req_op_cls_request_data_pages); 263EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);
235 264
@@ -302,14 +331,76 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
302 case CEPH_OSD_OP_STAT: 331 case CEPH_OSD_OP_STAT:
303 ceph_osd_data_release(&op->raw_data_in); 332 ceph_osd_data_release(&op->raw_data_in);
304 break; 333 break;
334 case CEPH_OSD_OP_NOTIFY_ACK:
335 ceph_osd_data_release(&op->notify_ack.request_data);
336 break;
337 case CEPH_OSD_OP_NOTIFY:
338 ceph_osd_data_release(&op->notify.request_data);
339 ceph_osd_data_release(&op->notify.response_data);
340 break;
305 default: 341 default:
306 break; 342 break;
307 } 343 }
308} 344}
309 345
310/* 346/*
347 * Assumes @t is zero-initialized.
348 */
349static void target_init(struct ceph_osd_request_target *t)
350{
351 ceph_oid_init(&t->base_oid);
352 ceph_oloc_init(&t->base_oloc);
353 ceph_oid_init(&t->target_oid);
354 ceph_oloc_init(&t->target_oloc);
355
356 ceph_osds_init(&t->acting);
357 ceph_osds_init(&t->up);
358 t->size = -1;
359 t->min_size = -1;
360
361 t->osd = CEPH_HOMELESS_OSD;
362}
363
364static void target_copy(struct ceph_osd_request_target *dest,
365 const struct ceph_osd_request_target *src)
366{
367 ceph_oid_copy(&dest->base_oid, &src->base_oid);
368 ceph_oloc_copy(&dest->base_oloc, &src->base_oloc);
369 ceph_oid_copy(&dest->target_oid, &src->target_oid);
370 ceph_oloc_copy(&dest->target_oloc, &src->target_oloc);
371
372 dest->pgid = src->pgid; /* struct */
373 dest->pg_num = src->pg_num;
374 dest->pg_num_mask = src->pg_num_mask;
375 ceph_osds_copy(&dest->acting, &src->acting);
376 ceph_osds_copy(&dest->up, &src->up);
377 dest->size = src->size;
378 dest->min_size = src->min_size;
379 dest->sort_bitwise = src->sort_bitwise;
380
381 dest->flags = src->flags;
382 dest->paused = src->paused;
383
384 dest->osd = src->osd;
385}
386
387static void target_destroy(struct ceph_osd_request_target *t)
388{
389 ceph_oid_destroy(&t->base_oid);
390 ceph_oid_destroy(&t->target_oid);
391}
392
393/*
311 * requests 394 * requests
312 */ 395 */
396static void request_release_checks(struct ceph_osd_request *req)
397{
398 WARN_ON(!RB_EMPTY_NODE(&req->r_node));
399 WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node));
400 WARN_ON(!list_empty(&req->r_unsafe_item));
401 WARN_ON(req->r_osd);
402}
403
313static void ceph_osdc_release_request(struct kref *kref) 404static void ceph_osdc_release_request(struct kref *kref)
314{ 405{
315 struct ceph_osd_request *req = container_of(kref, 406 struct ceph_osd_request *req = container_of(kref,
@@ -318,24 +409,19 @@ static void ceph_osdc_release_request(struct kref *kref)
318 409
319 dout("%s %p (r_request %p r_reply %p)\n", __func__, req, 410 dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
320 req->r_request, req->r_reply); 411 req->r_request, req->r_reply);
321 WARN_ON(!RB_EMPTY_NODE(&req->r_node)); 412 request_release_checks(req);
322 WARN_ON(!list_empty(&req->r_req_lru_item));
323 WARN_ON(!list_empty(&req->r_osd_item));
324 WARN_ON(!list_empty(&req->r_linger_item));
325 WARN_ON(!list_empty(&req->r_linger_osd_item));
326 WARN_ON(req->r_osd);
327 413
328 if (req->r_request) 414 if (req->r_request)
329 ceph_msg_put(req->r_request); 415 ceph_msg_put(req->r_request);
330 if (req->r_reply) { 416 if (req->r_reply)
331 ceph_msg_revoke_incoming(req->r_reply);
332 ceph_msg_put(req->r_reply); 417 ceph_msg_put(req->r_reply);
333 }
334 418
335 for (which = 0; which < req->r_num_ops; which++) 419 for (which = 0; which < req->r_num_ops; which++)
336 osd_req_op_data_release(req, which); 420 osd_req_op_data_release(req, which);
337 421
422 target_destroy(&req->r_t);
338 ceph_put_snap_context(req->r_snapc); 423 ceph_put_snap_context(req->r_snapc);
424
339 if (req->r_mempool) 425 if (req->r_mempool)
340 mempool_free(req, req->r_osdc->req_mempool); 426 mempool_free(req, req->r_osdc->req_mempool);
341 else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS) 427 else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
@@ -354,12 +440,66 @@ EXPORT_SYMBOL(ceph_osdc_get_request);
354 440
355void ceph_osdc_put_request(struct ceph_osd_request *req) 441void ceph_osdc_put_request(struct ceph_osd_request *req)
356{ 442{
357 dout("%s %p (was %d)\n", __func__, req, 443 if (req) {
358 atomic_read(&req->r_kref.refcount)); 444 dout("%s %p (was %d)\n", __func__, req,
359 kref_put(&req->r_kref, ceph_osdc_release_request); 445 atomic_read(&req->r_kref.refcount));
446 kref_put(&req->r_kref, ceph_osdc_release_request);
447 }
360} 448}
361EXPORT_SYMBOL(ceph_osdc_put_request); 449EXPORT_SYMBOL(ceph_osdc_put_request);
362 450
451static void request_init(struct ceph_osd_request *req)
452{
453 /* req only, each op is zeroed in _osd_req_op_init() */
454 memset(req, 0, sizeof(*req));
455
456 kref_init(&req->r_kref);
457 init_completion(&req->r_completion);
458 init_completion(&req->r_safe_completion);
459 RB_CLEAR_NODE(&req->r_node);
460 RB_CLEAR_NODE(&req->r_mc_node);
461 INIT_LIST_HEAD(&req->r_unsafe_item);
462
463 target_init(&req->r_t);
464}
465
466/*
467 * This is ugly, but it allows us to reuse linger registration and ping
468 * requests, keeping the structure of the code around send_linger{_ping}()
469 * reasonable. Setting up a min_nr=2 mempool for each linger request
470 * and dealing with copying ops (this blasts req only, watch op remains
471 * intact) isn't any better.
472 */
473static void request_reinit(struct ceph_osd_request *req)
474{
475 struct ceph_osd_client *osdc = req->r_osdc;
476 bool mempool = req->r_mempool;
477 unsigned int num_ops = req->r_num_ops;
478 u64 snapid = req->r_snapid;
479 struct ceph_snap_context *snapc = req->r_snapc;
480 bool linger = req->r_linger;
481 struct ceph_msg *request_msg = req->r_request;
482 struct ceph_msg *reply_msg = req->r_reply;
483
484 dout("%s req %p\n", __func__, req);
485 WARN_ON(atomic_read(&req->r_kref.refcount) != 1);
486 request_release_checks(req);
487
488 WARN_ON(atomic_read(&request_msg->kref.refcount) != 1);
489 WARN_ON(atomic_read(&reply_msg->kref.refcount) != 1);
490 target_destroy(&req->r_t);
491
492 request_init(req);
493 req->r_osdc = osdc;
494 req->r_mempool = mempool;
495 req->r_num_ops = num_ops;
496 req->r_snapid = snapid;
497 req->r_snapc = snapc;
498 req->r_linger = linger;
499 req->r_request = request_msg;
500 req->r_reply = reply_msg;
501}
502
363struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, 503struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
364 struct ceph_snap_context *snapc, 504 struct ceph_snap_context *snapc,
365 unsigned int num_ops, 505 unsigned int num_ops,
@@ -367,8 +507,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
367 gfp_t gfp_flags) 507 gfp_t gfp_flags)
368{ 508{
369 struct ceph_osd_request *req; 509 struct ceph_osd_request *req;
370 struct ceph_msg *msg;
371 size_t msg_size;
372 510
373 if (use_mempool) { 511 if (use_mempool) {
374 BUG_ON(num_ops > CEPH_OSD_SLAB_OPS); 512 BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
@@ -383,73 +521,65 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
383 if (unlikely(!req)) 521 if (unlikely(!req))
384 return NULL; 522 return NULL;
385 523
386 /* req only, each op is zeroed in _osd_req_op_init() */ 524 request_init(req);
387 memset(req, 0, sizeof(*req));
388
389 req->r_osdc = osdc; 525 req->r_osdc = osdc;
390 req->r_mempool = use_mempool; 526 req->r_mempool = use_mempool;
391 req->r_num_ops = num_ops; 527 req->r_num_ops = num_ops;
528 req->r_snapid = CEPH_NOSNAP;
529 req->r_snapc = ceph_get_snap_context(snapc);
392 530
393 kref_init(&req->r_kref); 531 dout("%s req %p\n", __func__, req);
394 init_completion(&req->r_completion); 532 return req;
395 init_completion(&req->r_safe_completion); 533}
396 RB_CLEAR_NODE(&req->r_node); 534EXPORT_SYMBOL(ceph_osdc_alloc_request);
397 INIT_LIST_HEAD(&req->r_unsafe_item);
398 INIT_LIST_HEAD(&req->r_linger_item);
399 INIT_LIST_HEAD(&req->r_linger_osd_item);
400 INIT_LIST_HEAD(&req->r_req_lru_item);
401 INIT_LIST_HEAD(&req->r_osd_item);
402
403 req->r_base_oloc.pool = -1;
404 req->r_target_oloc.pool = -1;
405 535
406 msg_size = OSD_OPREPLY_FRONT_LEN; 536int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
407 if (num_ops > CEPH_OSD_SLAB_OPS) { 537{
408 /* ceph_osd_op and rval */ 538 struct ceph_osd_client *osdc = req->r_osdc;
409 msg_size += (num_ops - CEPH_OSD_SLAB_OPS) * 539 struct ceph_msg *msg;
410 (sizeof(struct ceph_osd_op) + 4); 540 int msg_size;
411 }
412 541
413 /* create reply message */ 542 WARN_ON(ceph_oid_empty(&req->r_base_oid));
414 if (use_mempool)
415 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
416 else
417 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size,
418 gfp_flags, true);
419 if (!msg) {
420 ceph_osdc_put_request(req);
421 return NULL;
422 }
423 req->r_reply = msg;
424 543
544 /* create request message */
425 msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */ 545 msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
426 msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */ 546 msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
427 msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ 547 msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
428 msg_size += 1 + 8 + 4 + 4; /* pgid */ 548 msg_size += 1 + 8 + 4 + 4; /* pgid */
429 msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */ 549 msg_size += 4 + req->r_base_oid.name_len; /* oid */
430 msg_size += 2 + num_ops * sizeof(struct ceph_osd_op); 550 msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
431 msg_size += 8; /* snapid */ 551 msg_size += 8; /* snapid */
432 msg_size += 8; /* snap_seq */ 552 msg_size += 8; /* snap_seq */
433 msg_size += 4 + 8 * (snapc ? snapc->num_snaps : 0); /* snaps */ 553 msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0);
434 msg_size += 4; /* retry_attempt */ 554 msg_size += 4; /* retry_attempt */
435 555
436 /* create request message; allow space for oid */ 556 if (req->r_mempool)
437 if (use_mempool)
438 msg = ceph_msgpool_get(&osdc->msgpool_op, 0); 557 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
439 else 558 else
440 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true); 559 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true);
441 if (!msg) { 560 if (!msg)
442 ceph_osdc_put_request(req); 561 return -ENOMEM;
443 return NULL;
444 }
445 562
446 memset(msg->front.iov_base, 0, msg->front.iov_len); 563 memset(msg->front.iov_base, 0, msg->front.iov_len);
447
448 req->r_request = msg; 564 req->r_request = msg;
449 565
450 return req; 566 /* create reply message */
567 msg_size = OSD_OPREPLY_FRONT_LEN;
568 msg_size += req->r_base_oid.name_len;
569 msg_size += req->r_num_ops * sizeof(struct ceph_osd_op);
570
571 if (req->r_mempool)
572 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
573 else
574 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true);
575 if (!msg)
576 return -ENOMEM;
577
578 req->r_reply = msg;
579
580 return 0;
451} 581}
452EXPORT_SYMBOL(ceph_osdc_alloc_request); 582EXPORT_SYMBOL(ceph_osdc_alloc_messages);
453 583
454static bool osd_req_opcode_valid(u16 opcode) 584static bool osd_req_opcode_valid(u16 opcode)
455{ 585{
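
Request setup is now two-phase: allocate the request, fill in r_base_oid/r_base_oloc, then let ceph_osdc_alloc_messages() size the front buffers from the actual name length. A sketch of the new calling convention (pool id and object name hypothetical):

	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
	if (!req)
		return -ENOMEM;

	req->r_base_oloc.pool = pool_id;
	ceph_oid_printf(&req->r_base_oid, "%s", "rbd_header.myimage");

	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
	if (ret) {
		ceph_osdc_put_request(req);
		return ret;
	}
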
@@ -587,8 +717,6 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
587 717
588 osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); 718 osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
589 719
590 op->cls.argc = 0; /* currently unused */
591
592 op->indata_len = payload_len; 720 op->indata_len = payload_len;
593} 721}
594EXPORT_SYMBOL(osd_req_op_cls_init); 722EXPORT_SYMBOL(osd_req_op_cls_init);
@@ -627,21 +755,19 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
627} 755}
628EXPORT_SYMBOL(osd_req_op_xattr_init); 756EXPORT_SYMBOL(osd_req_op_xattr_init);
629 757
630void osd_req_op_watch_init(struct ceph_osd_request *osd_req, 758/*
631 unsigned int which, u16 opcode, 759 * @watch_opcode: CEPH_OSD_WATCH_OP_*
632 u64 cookie, u64 version, int flag) 760 */
761static void osd_req_op_watch_init(struct ceph_osd_request *req, int which,
762 u64 cookie, u8 watch_opcode)
633{ 763{
634 struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, 764 struct ceph_osd_req_op *op;
635 opcode, 0);
636
637 BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH);
638 765
766 op = _osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0);
639 op->watch.cookie = cookie; 767 op->watch.cookie = cookie;
640 op->watch.ver = version; 768 op->watch.op = watch_opcode;
641 if (opcode == CEPH_OSD_OP_WATCH && flag) 769 op->watch.gen = 0;
642 op->watch.flag = (u8)1;
643} 770}
644EXPORT_SYMBOL(osd_req_op_watch_init);
645 771
646void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, 772void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
647 unsigned int which, 773 unsigned int which,
@@ -686,16 +812,9 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
686 } 812 }
687} 813}
688 814
689static u64 osd_req_encode_op(struct ceph_osd_request *req, 815static u32 osd_req_encode_op(struct ceph_osd_op *dst,
690 struct ceph_osd_op *dst, unsigned int which) 816 const struct ceph_osd_req_op *src)
691{ 817{
692 struct ceph_osd_req_op *src;
693 struct ceph_osd_data *osd_data;
694 u64 request_data_len = 0;
695 u64 data_length;
696
697 BUG_ON(which >= req->r_num_ops);
698 src = &req->r_ops[which];
699 if (WARN_ON(!osd_req_opcode_valid(src->op))) { 818 if (WARN_ON(!osd_req_opcode_valid(src->op))) {
700 pr_err("unrecognized osd opcode %d\n", src->op); 819 pr_err("unrecognized osd opcode %d\n", src->op);
701 820
@@ -704,57 +823,36 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
704 823
705 switch (src->op) { 824 switch (src->op) {
706 case CEPH_OSD_OP_STAT: 825 case CEPH_OSD_OP_STAT:
707 osd_data = &src->raw_data_in;
708 ceph_osdc_msg_data_add(req->r_reply, osd_data);
709 break; 826 break;
710 case CEPH_OSD_OP_READ: 827 case CEPH_OSD_OP_READ:
711 case CEPH_OSD_OP_WRITE: 828 case CEPH_OSD_OP_WRITE:
712 case CEPH_OSD_OP_WRITEFULL: 829 case CEPH_OSD_OP_WRITEFULL:
713 case CEPH_OSD_OP_ZERO: 830 case CEPH_OSD_OP_ZERO:
714 case CEPH_OSD_OP_TRUNCATE: 831 case CEPH_OSD_OP_TRUNCATE:
715 if (src->op == CEPH_OSD_OP_WRITE ||
716 src->op == CEPH_OSD_OP_WRITEFULL)
717 request_data_len = src->extent.length;
718 dst->extent.offset = cpu_to_le64(src->extent.offset); 832 dst->extent.offset = cpu_to_le64(src->extent.offset);
719 dst->extent.length = cpu_to_le64(src->extent.length); 833 dst->extent.length = cpu_to_le64(src->extent.length);
720 dst->extent.truncate_size = 834 dst->extent.truncate_size =
721 cpu_to_le64(src->extent.truncate_size); 835 cpu_to_le64(src->extent.truncate_size);
722 dst->extent.truncate_seq = 836 dst->extent.truncate_seq =
723 cpu_to_le32(src->extent.truncate_seq); 837 cpu_to_le32(src->extent.truncate_seq);
724 osd_data = &src->extent.osd_data;
725 if (src->op == CEPH_OSD_OP_WRITE ||
726 src->op == CEPH_OSD_OP_WRITEFULL)
727 ceph_osdc_msg_data_add(req->r_request, osd_data);
728 else
729 ceph_osdc_msg_data_add(req->r_reply, osd_data);
730 break; 838 break;
731 case CEPH_OSD_OP_CALL: 839 case CEPH_OSD_OP_CALL:
732 dst->cls.class_len = src->cls.class_len; 840 dst->cls.class_len = src->cls.class_len;
733 dst->cls.method_len = src->cls.method_len; 841 dst->cls.method_len = src->cls.method_len;
734 osd_data = &src->cls.request_info; 842 dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
735 ceph_osdc_msg_data_add(req->r_request, osd_data);
736 BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST);
737 request_data_len = osd_data->pagelist->length;
738
739 osd_data = &src->cls.request_data;
740 data_length = ceph_osd_data_length(osd_data);
741 if (data_length) {
742 BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE);
743 dst->cls.indata_len = cpu_to_le32(data_length);
744 ceph_osdc_msg_data_add(req->r_request, osd_data);
745 src->indata_len += data_length;
746 request_data_len += data_length;
747 }
748 osd_data = &src->cls.response_data;
749 ceph_osdc_msg_data_add(req->r_reply, osd_data);
750 break; 843 break;
751 case CEPH_OSD_OP_STARTSYNC: 844 case CEPH_OSD_OP_STARTSYNC:
752 break; 845 break;
753 case CEPH_OSD_OP_NOTIFY_ACK:
754 case CEPH_OSD_OP_WATCH: 846 case CEPH_OSD_OP_WATCH:
755 dst->watch.cookie = cpu_to_le64(src->watch.cookie); 847 dst->watch.cookie = cpu_to_le64(src->watch.cookie);
756 dst->watch.ver = cpu_to_le64(src->watch.ver); 848 dst->watch.ver = cpu_to_le64(0);
757 dst->watch.flag = src->watch.flag; 849 dst->watch.op = src->watch.op;
850 dst->watch.gen = cpu_to_le32(src->watch.gen);
851 break;
852 case CEPH_OSD_OP_NOTIFY_ACK:
853 break;
854 case CEPH_OSD_OP_NOTIFY:
855 dst->notify.cookie = cpu_to_le64(src->notify.cookie);
758 break; 856 break;
759 case CEPH_OSD_OP_SETALLOCHINT: 857 case CEPH_OSD_OP_SETALLOCHINT:
760 dst->alloc_hint.expected_object_size = 858 dst->alloc_hint.expected_object_size =
@@ -768,9 +866,6 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
768 dst->xattr.value_len = cpu_to_le32(src->xattr.value_len); 866 dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
769 dst->xattr.cmp_op = src->xattr.cmp_op; 867 dst->xattr.cmp_op = src->xattr.cmp_op;
770 dst->xattr.cmp_mode = src->xattr.cmp_mode; 868 dst->xattr.cmp_mode = src->xattr.cmp_mode;
771 osd_data = &src->xattr.osd_data;
772 ceph_osdc_msg_data_add(req->r_request, osd_data);
773 request_data_len = osd_data->pagelist->length;
774 break; 869 break;
775 case CEPH_OSD_OP_CREATE: 870 case CEPH_OSD_OP_CREATE:
776 case CEPH_OSD_OP_DELETE: 871 case CEPH_OSD_OP_DELETE:
@@ -787,7 +882,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
787 dst->flags = cpu_to_le32(src->flags); 882 dst->flags = cpu_to_le32(src->flags);
788 dst->payload_len = cpu_to_le32(src->indata_len); 883 dst->payload_len = cpu_to_le32(src->indata_len);
789 884
790 return request_data_len; 885 return src->indata_len;
791} 886}
792 887
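With the message-data plumbing gone, osd_req_encode_op() is a pure src -> wire transform. A caller encodes all ops in a simple loop and sums the request data lengths, exactly as encode_request() does below:

	for (i = 0; i < req->r_num_ops; i++) {
		data_len += osd_req_encode_op(p, &req->r_ops[i]);
		p += sizeof(struct ceph_osd_op);
	}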
793/* 888/*
@@ -824,17 +919,15 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
824 919
825 req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool, 920 req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
826 GFP_NOFS); 921 GFP_NOFS);
827 if (!req) 922 if (!req) {
828 return ERR_PTR(-ENOMEM); 923 r = -ENOMEM;
829 924 goto fail;
830 req->r_flags = flags; 925 }
831 926
832 /* calculate max write size */ 927 /* calculate max write size */
833 r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen); 928 r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
834 if (r < 0) { 929 if (r)
835 ceph_osdc_put_request(req); 930 goto fail;
836 return ERR_PTR(r);
837 }
838 931
839 if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) { 932 if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
840 osd_req_op_init(req, which, opcode, 0); 933 osd_req_op_init(req, which, opcode, 0);
@@ -854,194 +947,71 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
854 truncate_size, truncate_seq); 947 truncate_size, truncate_seq);
855 } 948 }
856 949
950 req->r_flags = flags;
857 req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); 951 req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
952 ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);
858 953
859 snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name), 954 req->r_snapid = vino.snap;
860 "%llx.%08llx", vino.ino, objnum); 955 if (flags & CEPH_OSD_FLAG_WRITE)
861 req->r_base_oid.name_len = strlen(req->r_base_oid.name); 956 req->r_data_offset = off;
957
958 r = ceph_osdc_alloc_messages(req, GFP_NOFS);
959 if (r)
960 goto fail;
862 961
863 return req; 962 return req;
963
964fail:
965 ceph_osdc_put_request(req);
966 return ERR_PTR(r);
864} 967}
865EXPORT_SYMBOL(ceph_osdc_new_request); 968EXPORT_SYMBOL(ceph_osdc_new_request);
866 969
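Callers still consume the result as an ERR_PTR. A hedged sketch of a typical single-op read setup (argument values are illustrative only):

	req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
				    NULL, truncate_seq, truncate_size,
				    false);
	if (IS_ERR(req))
		return PTR_ERR(req);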
867/* 970/*
868 * We keep osd requests in an rbtree, sorted by ->r_tid. 971 * We keep osd requests in an rbtree, sorted by ->r_tid.
869 */ 972 */
870static void __insert_request(struct ceph_osd_client *osdc, 973DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node)
871 struct ceph_osd_request *new) 974DEFINE_RB_FUNCS(request_mc, struct ceph_osd_request, r_tid, r_mc_node)
872{
873 struct rb_node **p = &osdc->requests.rb_node;
874 struct rb_node *parent = NULL;
875 struct ceph_osd_request *req = NULL;
876
877 while (*p) {
878 parent = *p;
879 req = rb_entry(parent, struct ceph_osd_request, r_node);
880 if (new->r_tid < req->r_tid)
881 p = &(*p)->rb_left;
882 else if (new->r_tid > req->r_tid)
883 p = &(*p)->rb_right;
884 else
885 BUG();
886 }
887
888 rb_link_node(&new->r_node, parent, p);
889 rb_insert_color(&new->r_node, &osdc->requests);
890}
891
892static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
893 u64 tid)
894{
895 struct ceph_osd_request *req;
896 struct rb_node *n = osdc->requests.rb_node;
897
898 while (n) {
899 req = rb_entry(n, struct ceph_osd_request, r_node);
900 if (tid < req->r_tid)
901 n = n->rb_left;
902 else if (tid > req->r_tid)
903 n = n->rb_right;
904 else
905 return req;
906 }
907 return NULL;
908}
909 975
910static struct ceph_osd_request * 976static bool osd_homeless(struct ceph_osd *osd)
911__lookup_request_ge(struct ceph_osd_client *osdc,
912 u64 tid)
913{ 977{
914 struct ceph_osd_request *req; 978 return osd->o_osd == CEPH_HOMELESS_OSD;
915 struct rb_node *n = osdc->requests.rb_node;
916
917 while (n) {
918 req = rb_entry(n, struct ceph_osd_request, r_node);
919 if (tid < req->r_tid) {
920 if (!n->rb_left)
921 return req;
922 n = n->rb_left;
923 } else if (tid > req->r_tid) {
924 n = n->rb_right;
925 } else {
926 return req;
927 }
928 }
929 return NULL;
930} 979}
931 980
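DEFINE_RB_FUNCS() is assumed to expand to typed insert/lookup/erase helpers keyed on the named field, replacing the open-coded rbtree walkers deleted on the left; roughly:

	static void insert_request(struct rb_root *root,
				   struct ceph_osd_request *req);
	static struct ceph_osd_request *lookup_request(struct rb_root *root,
						       u64 tid);
	static void erase_request(struct rb_root *root,
				  struct ceph_osd_request *req);

plus a parallel *_request_mc set over r_mc_node for the map-check tree.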
932static void __kick_linger_request(struct ceph_osd_request *req) 981static bool osd_registered(struct ceph_osd *osd)
933{ 982{
934 struct ceph_osd_client *osdc = req->r_osdc; 983 verify_osdc_locked(osd->o_osdc);
935 struct ceph_osd *osd = req->r_osd;
936
937 /*
938 * Linger requests need to be resent with a new tid to avoid
939 * the dup op detection logic on the OSDs. Achieve this with
940 * a re-register dance instead of open-coding.
941 */
942 ceph_osdc_get_request(req);
943 if (!list_empty(&req->r_linger_item))
944 __unregister_linger_request(osdc, req);
945 else
946 __unregister_request(osdc, req);
947 __register_request(osdc, req);
948 ceph_osdc_put_request(req);
949
950 /*
951 * Unless request has been registered as both normal and
952 * lingering, __unregister{,_linger}_request clears r_osd.
953 * However, here we need to preserve r_osd to make sure we
954 * requeue on the same OSD.
955 */
956 WARN_ON(req->r_osd || !osd);
957 req->r_osd = osd;
958 984
959 dout("%s requeueing %p tid %llu\n", __func__, req, req->r_tid); 985 return !RB_EMPTY_NODE(&osd->o_node);
960 __enqueue_request(req);
961} 986}
962 987
963/* 988/*
964 * Resubmit requests pending on the given osd. 989 * Assumes @osd is zero-initialized.
965 */ 990 */
966static void __kick_osd_requests(struct ceph_osd_client *osdc, 991static void osd_init(struct ceph_osd *osd)
967 struct ceph_osd *osd)
968{ 992{
969 struct ceph_osd_request *req, *nreq; 993 atomic_set(&osd->o_ref, 1);
970 LIST_HEAD(resend); 994 RB_CLEAR_NODE(&osd->o_node);
971 LIST_HEAD(resend_linger); 995 osd->o_requests = RB_ROOT;
972 int err; 996 osd->o_linger_requests = RB_ROOT;
973 997 INIT_LIST_HEAD(&osd->o_osd_lru);
974 dout("%s osd%d\n", __func__, osd->o_osd); 998 INIT_LIST_HEAD(&osd->o_keepalive_item);
975 err = __reset_osd(osdc, osd); 999 osd->o_incarnation = 1;
976 if (err) 1000 mutex_init(&osd->lock);
977 return;
978
979 /*
980 * Build up a list of requests to resend by traversing the
981 * osd's list of requests. Requests for a given object are
982 * sent in tid order, and that is also the order they're
983 * kept on this list. Therefore all requests that are in
984 * flight will be found first, followed by all requests that
985 * have not yet been sent. And to resend requests while
986 * preserving this order we will want to put any sent
987 * requests back on the front of the osd client's unsent
988 * list.
989 *
990 * So we build a separate ordered list of already-sent
991 * requests for the affected osd and splice it onto the
992 * front of the osd client's unsent list. Once we've seen a
993 * request that has not yet been sent we're done. Those
994 * requests are already sitting right where they belong.
995 */
996 list_for_each_entry(req, &osd->o_requests, r_osd_item) {
997 if (!req->r_sent)
998 break;
999
1000 if (!req->r_linger) {
1001 dout("%s requeueing %p tid %llu\n", __func__, req,
1002 req->r_tid);
1003 list_move_tail(&req->r_req_lru_item, &resend);
1004 req->r_flags |= CEPH_OSD_FLAG_RETRY;
1005 } else {
1006 list_move_tail(&req->r_req_lru_item, &resend_linger);
1007 }
1008 }
1009 list_splice(&resend, &osdc->req_unsent);
1010
1011 /*
1012 * Both registered and not yet registered linger requests are
1013 * enqueued with a new tid on the same OSD. We add/move them
1014 * to req_unsent/o_requests at the end to keep things in tid
1015 * order.
1016 */
1017 list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
1018 r_linger_osd_item) {
1019 WARN_ON(!list_empty(&req->r_req_lru_item));
1020 __kick_linger_request(req);
1021 }
1022
1023 list_for_each_entry_safe(req, nreq, &resend_linger, r_req_lru_item)
1024 __kick_linger_request(req);
1025} 1001}
1026 1002
1027/* 1003static void osd_cleanup(struct ceph_osd *osd)
1028 * If the osd connection drops, we need to resubmit all requests.
1029 */
1030static void osd_reset(struct ceph_connection *con)
1031{ 1004{
1032 struct ceph_osd *osd = con->private; 1005 WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
1033 struct ceph_osd_client *osdc; 1006 WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
1034 1007 WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
1035 if (!osd) 1008 WARN_ON(!list_empty(&osd->o_osd_lru));
1036 return; 1009 WARN_ON(!list_empty(&osd->o_keepalive_item));
1037 dout("osd_reset osd%d\n", osd->o_osd); 1010
1038 osdc = osd->o_osdc; 1011 if (osd->o_auth.authorizer) {
1039 down_read(&osdc->map_sem); 1012 WARN_ON(osd_homeless(osd));
1040 mutex_lock(&osdc->request_mutex); 1013 ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
1041 __kick_osd_requests(osdc, osd); 1014 }
1042 __send_queued(osdc);
1043 mutex_unlock(&osdc->request_mutex);
1044 up_read(&osdc->map_sem);
1045} 1015}
1046 1016
1047/* 1017/*
@@ -1051,22 +1021,15 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
1051{ 1021{
1052 struct ceph_osd *osd; 1022 struct ceph_osd *osd;
1053 1023
1054 osd = kzalloc(sizeof(*osd), GFP_NOFS); 1024 WARN_ON(onum == CEPH_HOMELESS_OSD);
1055 if (!osd)
1056 return NULL;
1057 1025
1058 atomic_set(&osd->o_ref, 1); 1026 osd = kzalloc(sizeof(*osd), GFP_NOIO | __GFP_NOFAIL);
1027 osd_init(osd);
1059 osd->o_osdc = osdc; 1028 osd->o_osdc = osdc;
1060 osd->o_osd = onum; 1029 osd->o_osd = onum;
1061 RB_CLEAR_NODE(&osd->o_node);
1062 INIT_LIST_HEAD(&osd->o_requests);
1063 INIT_LIST_HEAD(&osd->o_linger_requests);
1064 INIT_LIST_HEAD(&osd->o_osd_lru);
1065 osd->o_incarnation = 1;
1066 1030
1067 ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr); 1031 ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
1068 1032
1069 INIT_LIST_HEAD(&osd->o_keepalive_item);
1070 return osd; 1033 return osd;
1071} 1034}
1072 1035
@@ -1087,114 +1050,115 @@ static void put_osd(struct ceph_osd *osd)
1087 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), 1050 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
1088 atomic_read(&osd->o_ref) - 1); 1051 atomic_read(&osd->o_ref) - 1);
1089 if (atomic_dec_and_test(&osd->o_ref)) { 1052 if (atomic_dec_and_test(&osd->o_ref)) {
1090 if (osd->o_auth.authorizer) 1053 osd_cleanup(osd);
1091 ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
1092 kfree(osd); 1054 kfree(osd);
1093 } 1055 }
1094} 1056}
1095 1057
1096/* 1058DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node)
1097 * remove an osd from our map
1098 */
1099static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
1100{
1101 dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
1102 WARN_ON(!list_empty(&osd->o_requests));
1103 WARN_ON(!list_empty(&osd->o_linger_requests));
1104 1059
1105 list_del_init(&osd->o_osd_lru); 1060static void __move_osd_to_lru(struct ceph_osd *osd)
1106 rb_erase(&osd->o_node, &osdc->osds);
1107 RB_CLEAR_NODE(&osd->o_node);
1108}
1109
1110static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
1111{ 1061{
1112 dout("%s %p osd%d\n", __func__, osd, osd->o_osd); 1062 struct ceph_osd_client *osdc = osd->o_osdc;
1113
1114 if (!RB_EMPTY_NODE(&osd->o_node)) {
1115 ceph_con_close(&osd->o_con);
1116 __remove_osd(osdc, osd);
1117 put_osd(osd);
1118 }
1119}
1120
1121static void remove_all_osds(struct ceph_osd_client *osdc)
1122{
1123 dout("%s %p\n", __func__, osdc);
1124 mutex_lock(&osdc->request_mutex);
1125 while (!RB_EMPTY_ROOT(&osdc->osds)) {
1126 struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
1127 struct ceph_osd, o_node);
1128 remove_osd(osdc, osd);
1129 }
1130 mutex_unlock(&osdc->request_mutex);
1131}
1132 1063
1133static void __move_osd_to_lru(struct ceph_osd_client *osdc, 1064 dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
1134 struct ceph_osd *osd)
1135{
1136 dout("%s %p\n", __func__, osd);
1137 BUG_ON(!list_empty(&osd->o_osd_lru)); 1065 BUG_ON(!list_empty(&osd->o_osd_lru));
1138 1066
1067 spin_lock(&osdc->osd_lru_lock);
1139 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); 1068 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
1069 spin_unlock(&osdc->osd_lru_lock);
1070
1140 osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl; 1071 osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
1141} 1072}
1142 1073
1143static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc, 1074static void maybe_move_osd_to_lru(struct ceph_osd *osd)
1144 struct ceph_osd *osd)
1145{ 1075{
1146 dout("%s %p\n", __func__, osd); 1076 if (RB_EMPTY_ROOT(&osd->o_requests) &&
1147 1077 RB_EMPTY_ROOT(&osd->o_linger_requests))
1148 if (list_empty(&osd->o_requests) && 1078 __move_osd_to_lru(osd);
1149 list_empty(&osd->o_linger_requests))
1150 __move_osd_to_lru(osdc, osd);
1151} 1079}
1152 1080
1153static void __remove_osd_from_lru(struct ceph_osd *osd) 1081static void __remove_osd_from_lru(struct ceph_osd *osd)
1154{ 1082{
1155 dout("__remove_osd_from_lru %p\n", osd); 1083 struct ceph_osd_client *osdc = osd->o_osdc;
1084
1085 dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
1086
1087 spin_lock(&osdc->osd_lru_lock);
1156 if (!list_empty(&osd->o_osd_lru)) 1088 if (!list_empty(&osd->o_osd_lru))
1157 list_del_init(&osd->o_osd_lru); 1089 list_del_init(&osd->o_osd_lru);
1090 spin_unlock(&osdc->osd_lru_lock);
1158} 1091}
1159 1092
1160static void remove_old_osds(struct ceph_osd_client *osdc) 1093/*
1094 * Close the connection and assign any leftover requests to the
1095 * homeless session.
1096 */
1097static void close_osd(struct ceph_osd *osd)
1161{ 1098{
1162 struct ceph_osd *osd, *nosd; 1099 struct ceph_osd_client *osdc = osd->o_osdc;
1100 struct rb_node *n;
1163 1101
1164 dout("__remove_old_osds %p\n", osdc); 1102 verify_osdc_wrlocked(osdc);
1165 mutex_lock(&osdc->request_mutex); 1103 dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
1166 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { 1104
1167 if (time_before(jiffies, osd->lru_ttl)) 1105 ceph_con_close(&osd->o_con);
1168 break; 1106
1169 remove_osd(osdc, osd); 1107 for (n = rb_first(&osd->o_requests); n; ) {
1108 struct ceph_osd_request *req =
1109 rb_entry(n, struct ceph_osd_request, r_node);
1110
1111 n = rb_next(n); /* unlink_request() */
1112
1113 dout(" reassigning req %p tid %llu\n", req, req->r_tid);
1114 unlink_request(osd, req);
1115 link_request(&osdc->homeless_osd, req);
1116 }
1117 for (n = rb_first(&osd->o_linger_requests); n; ) {
1118 struct ceph_osd_linger_request *lreq =
1119 rb_entry(n, struct ceph_osd_linger_request, node);
1120
1121 n = rb_next(n); /* unlink_linger() */
1122
1123 dout(" reassigning lreq %p linger_id %llu\n", lreq,
1124 lreq->linger_id);
1125 unlink_linger(osd, lreq);
1126 link_linger(&osdc->homeless_osd, lreq);
1170 } 1127 }
1171 mutex_unlock(&osdc->request_mutex); 1128
1129 __remove_osd_from_lru(osd);
1130 erase_osd(&osdc->osds, osd);
1131 put_osd(osd);
1172} 1132}
1173 1133
1174/* 1134/*
1175 * reset the osd connection 1135 * reset the osd connection
1176 */ 1136 */
1177static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) 1137static int reopen_osd(struct ceph_osd *osd)
1178{ 1138{
1179 struct ceph_entity_addr *peer_addr; 1139 struct ceph_entity_addr *peer_addr;
1180 1140
1181 dout("__reset_osd %p osd%d\n", osd, osd->o_osd); 1141 dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
1182 if (list_empty(&osd->o_requests) && 1142
1183 list_empty(&osd->o_linger_requests)) { 1143 if (RB_EMPTY_ROOT(&osd->o_requests) &&
1184 remove_osd(osdc, osd); 1144 RB_EMPTY_ROOT(&osd->o_linger_requests)) {
1145 close_osd(osd);
1185 return -ENODEV; 1146 return -ENODEV;
1186 } 1147 }
1187 1148
1188 peer_addr = &osdc->osdmap->osd_addr[osd->o_osd]; 1149 peer_addr = &osd->o_osdc->osdmap->osd_addr[osd->o_osd];
1189 if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) && 1150 if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
1190 !ceph_con_opened(&osd->o_con)) { 1151 !ceph_con_opened(&osd->o_con)) {
1191 struct ceph_osd_request *req; 1152 struct rb_node *n;
1192 1153
1193 dout("osd addr hasn't changed and connection never opened, " 1154 dout("osd addr hasn't changed and connection never opened, "
1194 "letting msgr retry\n"); 1155 "letting msgr retry\n");
1195 /* touch each r_stamp for handle_timeout()'s benefit */ 1156 /* touch each r_stamp for handle_timeout()'s benefit */
1196 list_for_each_entry(req, &osd->o_requests, r_osd_item) 1157 for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
1158 struct ceph_osd_request *req =
1159 rb_entry(n, struct ceph_osd_request, r_node);
1197 req->r_stamp = jiffies; 1160 req->r_stamp = jiffies;
1161 }
1198 1162
1199 return -EAGAIN; 1163 return -EAGAIN;
1200 } 1164 }
@@ -1206,455 +1170,1370 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
1206 return 0; 1170 return 0;
1207} 1171}
1208 1172
1209static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new) 1173static struct ceph_osd *lookup_create_osd(struct ceph_osd_client *osdc, int o,
1174 bool wrlocked)
1210{ 1175{
1211 struct rb_node **p = &osdc->osds.rb_node; 1176 struct ceph_osd *osd;
1212 struct rb_node *parent = NULL;
1213 struct ceph_osd *osd = NULL;
1214 1177
1215 dout("__insert_osd %p osd%d\n", new, new->o_osd); 1178 if (wrlocked)
1216 while (*p) { 1179 verify_osdc_wrlocked(osdc);
1217 parent = *p; 1180 else
1218 osd = rb_entry(parent, struct ceph_osd, o_node); 1181 verify_osdc_locked(osdc);
1219 if (new->o_osd < osd->o_osd) 1182
1220 p = &(*p)->rb_left; 1183 if (o != CEPH_HOMELESS_OSD)
1221 else if (new->o_osd > osd->o_osd) 1184 osd = lookup_osd(&osdc->osds, o);
1222 p = &(*p)->rb_right; 1185 else
1223 else 1186 osd = &osdc->homeless_osd;
1224 BUG(); 1187 if (!osd) {
1188 if (!wrlocked)
1189 return ERR_PTR(-EAGAIN);
1190
1191 osd = create_osd(osdc, o);
1192 insert_osd(&osdc->osds, osd);
1193 ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
1194 &osdc->osdmap->osd_addr[osd->o_osd]);
1225 } 1195 }
1226 1196
1227 rb_link_node(&new->o_node, parent, p); 1197 dout("%s osdc %p osd%d -> osd %p\n", __func__, osdc, o, osd);
1228 rb_insert_color(&new->o_node, &osdc->osds); 1198 return osd;
1229} 1199}
1230 1200
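lookup_create_osd() is the single entry point for resolving a target OSD id to a session; CEPH_HOMELESS_OSD maps to the embedded osdc->homeless_osd instead of a real connection. Its contract is visible in __submit_request() below:

	osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked);
	if (IS_ERR(osd)) {
		/* -EAGAIN: creation needs the wrlock; retry under it */
		WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked);
		goto promote;
	}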
1231static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o) 1201/*
1202 * Create request <-> OSD session relation.
1203 *
1204 * @req has to be assigned a tid, @osd may be homeless.
1205 */
1206static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req)
1232{ 1207{
1233 struct ceph_osd *osd; 1208 verify_osd_locked(osd);
1234 struct rb_node *n = osdc->osds.rb_node; 1209 WARN_ON(!req->r_tid || req->r_osd);
1235 1210 dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
1236 while (n) { 1211 req, req->r_tid);
1237 osd = rb_entry(n, struct ceph_osd, o_node); 1212
1238 if (o < osd->o_osd) 1213 if (!osd_homeless(osd))
1239 n = n->rb_left; 1214 __remove_osd_from_lru(osd);
1240 else if (o > osd->o_osd) 1215 else
1241 n = n->rb_right; 1216 atomic_inc(&osd->o_osdc->num_homeless);
1242 else 1217
1243 return osd; 1218 get_osd(osd);
1244 } 1219 insert_request(&osd->o_requests, req);
1245 return NULL; 1220 req->r_osd = osd;
1246} 1221}
1247 1222
1248static void __schedule_osd_timeout(struct ceph_osd_client *osdc) 1223static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req)
1249{ 1224{
1250 schedule_delayed_work(&osdc->timeout_work, 1225 verify_osd_locked(osd);
1251 osdc->client->options->osd_keepalive_timeout); 1226 WARN_ON(req->r_osd != osd);
1227 dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
1228 req, req->r_tid);
1229
1230 req->r_osd = NULL;
1231 erase_request(&osd->o_requests, req);
1232 put_osd(osd);
1233
1234 if (!osd_homeless(osd))
1235 maybe_move_osd_to_lru(osd);
1236 else
1237 atomic_dec(&osd->o_osdc->num_homeless);
1252} 1238}
1253 1239
1254static void __cancel_osd_timeout(struct ceph_osd_client *osdc) 1240static bool __pool_full(struct ceph_pg_pool_info *pi)
1255{ 1241{
1256 cancel_delayed_work(&osdc->timeout_work); 1242 return pi->flags & CEPH_POOL_FLAG_FULL;
1257} 1243}
1258 1244
1259/* 1245static bool have_pool_full(struct ceph_osd_client *osdc)
1260 * Register request, assign tid. If this is the first request, set up
1261 * the timeout event.
1262 */
1263static void __register_request(struct ceph_osd_client *osdc,
1264 struct ceph_osd_request *req)
1265{ 1246{
1266 req->r_tid = ++osdc->last_tid; 1247 struct rb_node *n;
1267 req->r_request->hdr.tid = cpu_to_le64(req->r_tid); 1248
1268 dout("__register_request %p tid %lld\n", req, req->r_tid); 1249 for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
1269 __insert_request(osdc, req); 1250 struct ceph_pg_pool_info *pi =
1270 ceph_osdc_get_request(req); 1251 rb_entry(n, struct ceph_pg_pool_info, node);
1271 osdc->num_requests++; 1252
1272 if (osdc->num_requests == 1) { 1253 if (__pool_full(pi))
1273 dout(" first request, scheduling timeout\n"); 1254 return true;
1274 __schedule_osd_timeout(osdc);
1275 } 1255 }
1256
1257 return false;
1258}
1259
1260static bool pool_full(struct ceph_osd_client *osdc, s64 pool_id)
1261{
1262 struct ceph_pg_pool_info *pi;
1263
1264 pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
1265 if (!pi)
1266 return false;
1267
1268 return __pool_full(pi);
1276} 1269}
1277 1270
1278/* 1271/*
1279 * called under osdc->request_mutex 1272 * Returns whether a request should be blocked from being sent
1273 * based on the current osdmap and osd_client settings.
1280 */ 1274 */
1281static void __unregister_request(struct ceph_osd_client *osdc, 1275static bool target_should_be_paused(struct ceph_osd_client *osdc,
1282 struct ceph_osd_request *req) 1276 const struct ceph_osd_request_target *t,
1277 struct ceph_pg_pool_info *pi)
1283{ 1278{
1284 if (RB_EMPTY_NODE(&req->r_node)) { 1279 bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
1285 dout("__unregister_request %p tid %lld not registered\n", 1280 bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
1286 req, req->r_tid); 1281 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
1287 return; 1282 __pool_full(pi);
1283
1284 WARN_ON(pi->id != t->base_oloc.pool);
1285 return (t->flags & CEPH_OSD_FLAG_READ && pauserd) ||
1286 (t->flags & CEPH_OSD_FLAG_WRITE && pausewr);
1287}
1288
1289enum calc_target_result {
1290 CALC_TARGET_NO_ACTION = 0,
1291 CALC_TARGET_NEED_RESEND,
1292 CALC_TARGET_POOL_DNE,
1293};
1294
1295static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
1296 struct ceph_osd_request_target *t,
1297 u32 *last_force_resend,
1298 bool any_change)
1299{
1300 struct ceph_pg_pool_info *pi;
1301 struct ceph_pg pgid, last_pgid;
1302 struct ceph_osds up, acting;
1303 bool force_resend = false;
1304 bool need_check_tiering = false;
1305 bool need_resend = false;
1306 bool sort_bitwise = ceph_osdmap_flag(osdc->osdmap,
1307 CEPH_OSDMAP_SORTBITWISE);
1308 enum calc_target_result ct_res;
1309 int ret;
1310
1311 pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool);
1312 if (!pi) {
1313 t->osd = CEPH_HOMELESS_OSD;
1314 ct_res = CALC_TARGET_POOL_DNE;
1315 goto out;
1288 } 1316 }
1289 1317
1290 dout("__unregister_request %p tid %lld\n", req, req->r_tid); 1318 if (osdc->osdmap->epoch == pi->last_force_request_resend) {
1291 rb_erase(&req->r_node, &osdc->requests); 1319 if (last_force_resend &&
1292 RB_CLEAR_NODE(&req->r_node); 1320 *last_force_resend < pi->last_force_request_resend) {
1293 osdc->num_requests--; 1321 *last_force_resend = pi->last_force_request_resend;
1322 force_resend = true;
1323 } else if (!last_force_resend) {
1324 force_resend = true;
1325 }
1326 }
1327 if (ceph_oid_empty(&t->target_oid) || force_resend) {
1328 ceph_oid_copy(&t->target_oid, &t->base_oid);
1329 need_check_tiering = true;
1330 }
1331 if (ceph_oloc_empty(&t->target_oloc) || force_resend) {
1332 ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
1333 need_check_tiering = true;
1334 }
1294 1335
1295 if (req->r_osd) { 1336 if (need_check_tiering &&
1296 /* make sure the original request isn't in flight. */ 1337 (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
1297 ceph_msg_revoke(req->r_request); 1338 if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
1339 t->target_oloc.pool = pi->read_tier;
1340 if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
1341 t->target_oloc.pool = pi->write_tier;
1342 }
1298 1343
1299 list_del_init(&req->r_osd_item); 1344 ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid,
1300 maybe_move_osd_to_lru(osdc, req->r_osd); 1345 &t->target_oloc, &pgid);
1301 if (list_empty(&req->r_linger_osd_item)) 1346 if (ret) {
1302 req->r_osd = NULL; 1347 WARN_ON(ret != -ENOENT);
1348 t->osd = CEPH_HOMELESS_OSD;
1349 ct_res = CALC_TARGET_POOL_DNE;
1350 goto out;
1351 }
1352 last_pgid.pool = pgid.pool;
1353 last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask);
1354
1355 ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting);
1356 if (any_change &&
1357 ceph_is_new_interval(&t->acting,
1358 &acting,
1359 &t->up,
1360 &up,
1361 t->size,
1362 pi->size,
1363 t->min_size,
1364 pi->min_size,
1365 t->pg_num,
1366 pi->pg_num,
1367 t->sort_bitwise,
1368 sort_bitwise,
1369 &last_pgid))
1370 force_resend = true;
1371
1372 if (t->paused && !target_should_be_paused(osdc, t, pi)) {
1373 t->paused = false;
1374 need_resend = true;
1303 } 1375 }
1304 1376
1305 list_del_init(&req->r_req_lru_item); 1377 if (ceph_pg_compare(&t->pgid, &pgid) ||
1306 ceph_osdc_put_request(req); 1378 ceph_osds_changed(&t->acting, &acting, any_change) ||
1379 force_resend) {
1380 t->pgid = pgid; /* struct */
1381 ceph_osds_copy(&t->acting, &acting);
1382 ceph_osds_copy(&t->up, &up);
1383 t->size = pi->size;
1384 t->min_size = pi->min_size;
1385 t->pg_num = pi->pg_num;
1386 t->pg_num_mask = pi->pg_num_mask;
1387 t->sort_bitwise = sort_bitwise;
1388
1389 t->osd = acting.primary;
1390 need_resend = true;
1391 }
1392
1393 ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION;
1394out:
1395 dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd);
1396 return ct_res;
1397}
1398
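calc_target() folds pool tiering, forced-resend epochs, and up/acting interval changes into one verdict. A hedged sketch of how a caller dispatches on it (the NEED_RESEND arm is handled by the scan/resend paths, not shown here):

	switch (calc_target(osdc, &req->r_t, &req->r_last_force_resend,
			    false)) {
	case CALC_TARGET_NO_ACTION:
		break;
	case CALC_TARGET_NEED_RESEND:
		/* relink to the new session and resend */
		break;
	case CALC_TARGET_POOL_DNE:
		send_map_check(req);	/* confirm the pool is gone */
		break;
	}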
1399static void setup_request_data(struct ceph_osd_request *req,
1400 struct ceph_msg *msg)
1401{
1402 u32 data_len = 0;
1403 int i;
1404
1405 if (!list_empty(&msg->data))
1406 return;
1407
1408 WARN_ON(msg->data_length);
1409 for (i = 0; i < req->r_num_ops; i++) {
1410 struct ceph_osd_req_op *op = &req->r_ops[i];
1411
1412 switch (op->op) {
1413 /* request */
1414 case CEPH_OSD_OP_WRITE:
1415 case CEPH_OSD_OP_WRITEFULL:
1416 WARN_ON(op->indata_len != op->extent.length);
1417 ceph_osdc_msg_data_add(msg, &op->extent.osd_data);
1418 break;
1419 case CEPH_OSD_OP_SETXATTR:
1420 case CEPH_OSD_OP_CMPXATTR:
1421 WARN_ON(op->indata_len != op->xattr.name_len +
1422 op->xattr.value_len);
1423 ceph_osdc_msg_data_add(msg, &op->xattr.osd_data);
1424 break;
1425 case CEPH_OSD_OP_NOTIFY_ACK:
1426 ceph_osdc_msg_data_add(msg,
1427 &op->notify_ack.request_data);
1428 break;
1429
1430 /* reply */
1431 case CEPH_OSD_OP_STAT:
1432 ceph_osdc_msg_data_add(req->r_reply,
1433 &op->raw_data_in);
1434 break;
1435 case CEPH_OSD_OP_READ:
1436 ceph_osdc_msg_data_add(req->r_reply,
1437 &op->extent.osd_data);
1438 break;
1439
1440 /* both */
1441 case CEPH_OSD_OP_CALL:
1442 WARN_ON(op->indata_len != op->cls.class_len +
1443 op->cls.method_len +
1444 op->cls.indata_len);
1445 ceph_osdc_msg_data_add(msg, &op->cls.request_info);
1446 /* optional, can be NONE */
1447 ceph_osdc_msg_data_add(msg, &op->cls.request_data);
1448 /* optional, can be NONE */
1449 ceph_osdc_msg_data_add(req->r_reply,
1450 &op->cls.response_data);
1451 break;
1452 case CEPH_OSD_OP_NOTIFY:
1453 ceph_osdc_msg_data_add(msg,
1454 &op->notify.request_data);
1455 ceph_osdc_msg_data_add(req->r_reply,
1456 &op->notify.response_data);
1457 break;
1458 }
1459
1460 data_len += op->indata_len;
1461 }
1462
1463 WARN_ON(data_len != msg->data_length);
1464}
1465
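setup_request_data() only wires up buffers the caller has already attached to each op. A hedged example of that attachment for a one-op read, using the existing page-vector helper (the page array itself is illustrative):

	osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
					 false, false);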
1466static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
1467{
1468 void *p = msg->front.iov_base;
1469 void *const end = p + msg->front_alloc_len;
1470 u32 data_len = 0;
1471 int i;
1472
1473 if (req->r_flags & CEPH_OSD_FLAG_WRITE) {
1474 /* snapshots aren't writeable */
1475 WARN_ON(req->r_snapid != CEPH_NOSNAP);
1476 } else {
1477 WARN_ON(req->r_mtime.tv_sec || req->r_mtime.tv_nsec ||
1478 req->r_data_offset || req->r_snapc);
1479 }
1480
1481 setup_request_data(req, msg);
1482
1483 ceph_encode_32(&p, 1); /* client_inc, always 1 */
1484 ceph_encode_32(&p, req->r_osdc->osdmap->epoch);
1485 ceph_encode_32(&p, req->r_flags);
1486 ceph_encode_timespec(p, &req->r_mtime);
1487 p += sizeof(struct ceph_timespec);
1488 /* aka reassert_version */
1489 memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version));
1490 p += sizeof(req->r_replay_version);
1491
1492 /* oloc */
1493 ceph_encode_8(&p, 4);
1494 ceph_encode_8(&p, 4);
1495 ceph_encode_32(&p, 8 + 4 + 4);
1496 ceph_encode_64(&p, req->r_t.target_oloc.pool);
1497 ceph_encode_32(&p, -1); /* preferred */
1498 ceph_encode_32(&p, 0); /* key len */
1499
1500 /* pgid */
1501 ceph_encode_8(&p, 1);
1502 ceph_encode_64(&p, req->r_t.pgid.pool);
1503 ceph_encode_32(&p, req->r_t.pgid.seed);
1504 ceph_encode_32(&p, -1); /* preferred */
1505
1506 /* oid */
1507 ceph_encode_32(&p, req->r_t.target_oid.name_len);
1508 memcpy(p, req->r_t.target_oid.name, req->r_t.target_oid.name_len);
1509 p += req->r_t.target_oid.name_len;
1307 1510
1308 if (osdc->num_requests == 0) { 1511 /* ops, can imply data */
1309 dout(" no requests, canceling timeout\n"); 1512 ceph_encode_16(&p, req->r_num_ops);
1310 __cancel_osd_timeout(osdc); 1513 for (i = 0; i < req->r_num_ops; i++) {
1514 data_len += osd_req_encode_op(p, &req->r_ops[i]);
1515 p += sizeof(struct ceph_osd_op);
1311 } 1516 }
1517
1518 ceph_encode_64(&p, req->r_snapid); /* snapid */
1519 if (req->r_snapc) {
1520 ceph_encode_64(&p, req->r_snapc->seq);
1521 ceph_encode_32(&p, req->r_snapc->num_snaps);
1522 for (i = 0; i < req->r_snapc->num_snaps; i++)
1523 ceph_encode_64(&p, req->r_snapc->snaps[i]);
1524 } else {
1525 ceph_encode_64(&p, 0); /* snap_seq */
1526 ceph_encode_32(&p, 0); /* snaps len */
1527 }
1528
1529 ceph_encode_32(&p, req->r_attempts); /* retry_attempt */
1530
1531 BUG_ON(p > end);
1532 msg->front.iov_len = p - msg->front.iov_base;
1533 msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */
1534 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1535 msg->hdr.data_len = cpu_to_le32(data_len);
1536 /*
1537 * The header "data_off" is a hint to the receiver allowing it
1538 * to align received data into its buffers such that there's no
1539 * need to re-copy it before writing it to disk (direct I/O).
1540 */
1541 msg->hdr.data_off = cpu_to_le16(req->r_data_offset);
1542
1543 dout("%s req %p oid %*pE oid_len %d front %zu data %u\n", __func__,
1544 req, req->r_t.target_oid.name_len, req->r_t.target_oid.name,
1545 req->r_t.target_oid.name_len, msg->front.iov_len, data_len);
1312} 1546}
1313 1547
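For reference, a condensed sketch of the MOSDOp v4 front laid out above (field order per the ceph_encode_* calls):

	u32 client_inc (always 1)
	u32 osdmap epoch
	u32 flags
	ceph_timespec mtime
	eversion reassert_version
	object_locator oloc (v4: pool, preferred -1, empty key)
	pg_t pgid (v1: pool, seed, preferred -1)
	string oid
	u16 num_ops, then ops[] (data lengths summed into hdr.data_len)
	u64 snapid
	u64 snap_seq, u32 num_snaps, snaps[]
	u32 retry_attempt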
1314/* 1548/*
1315 * Cancel a previously queued request message 1549 * @req has to be assigned a tid and registered.
1316 */ 1550 */
1317static void __cancel_request(struct ceph_osd_request *req) 1551static void send_request(struct ceph_osd_request *req)
1318{ 1552{
1319 if (req->r_sent && req->r_osd) { 1553 struct ceph_osd *osd = req->r_osd;
1554
1555 verify_osd_locked(osd);
1556 WARN_ON(osd->o_osd != req->r_t.osd);
1557
1558 /*
1559 * We may have a previously queued request message hanging
1560 * around. Cancel it to avoid corrupting the msgr.
1561 */
1562 if (req->r_sent)
1320 ceph_msg_revoke(req->r_request); 1563 ceph_msg_revoke(req->r_request);
1321 req->r_sent = 0; 1564
1565 req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR;
1566 if (req->r_attempts)
1567 req->r_flags |= CEPH_OSD_FLAG_RETRY;
1568 else
1569 WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY);
1570
1571 encode_request(req, req->r_request);
1572
1573 dout("%s req %p tid %llu to pg %llu.%x osd%d flags 0x%x attempt %d\n",
1574 __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed,
1575 req->r_t.osd, req->r_flags, req->r_attempts);
1576
1577 req->r_t.paused = false;
1578 req->r_stamp = jiffies;
1579 req->r_attempts++;
1580
1581 req->r_sent = osd->o_incarnation;
1582 req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
1583 ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request));
1584}
1585
1586static void maybe_request_map(struct ceph_osd_client *osdc)
1587{
1588 bool continuous = false;
1589
1590 verify_osdc_locked(osdc);
1591 WARN_ON(!osdc->osdmap->epoch);
1592
1593 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
1594 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) ||
1595 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) {
1596 dout("%s osdc %p continuous\n", __func__, osdc);
1597 continuous = true;
1598 } else {
1599 dout("%s osdc %p onetime\n", __func__, osdc);
1322 } 1600 }
1601
1602 if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
1603 osdc->osdmap->epoch + 1, continuous))
1604 ceph_monc_renew_subs(&osdc->client->monc);
1323} 1605}
1324 1606
1325static void __register_linger_request(struct ceph_osd_client *osdc, 1607static void send_map_check(struct ceph_osd_request *req);
1326 struct ceph_osd_request *req) 1608
1609static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
1327{ 1610{
1328 dout("%s %p tid %llu\n", __func__, req, req->r_tid); 1611 struct ceph_osd_client *osdc = req->r_osdc;
1329 WARN_ON(!req->r_linger); 1612 struct ceph_osd *osd;
1613 enum calc_target_result ct_res;
1614 bool need_send = false;
1615 bool promoted = false;
1616
1617 WARN_ON(req->r_tid || req->r_got_reply);
1618 dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
1619
1620again:
1621 ct_res = calc_target(osdc, &req->r_t, &req->r_last_force_resend, false);
1622 if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
1623 goto promote;
1624
1625 osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked);
1626 if (IS_ERR(osd)) {
1627 WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked);
1628 goto promote;
1629 }
1630
1631 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
1632 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) {
1633 dout("req %p pausewr\n", req);
1634 req->r_t.paused = true;
1635 maybe_request_map(osdc);
1636 } else if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
1637 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) {
1638 dout("req %p pauserd\n", req);
1639 req->r_t.paused = true;
1640 maybe_request_map(osdc);
1641 } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
1642 !(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY |
1643 CEPH_OSD_FLAG_FULL_FORCE)) &&
1644 (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
1645 pool_full(osdc, req->r_t.base_oloc.pool))) {
1646 dout("req %p full/pool_full\n", req);
1647 pr_warn_ratelimited("FULL or reached pool quota\n");
1648 req->r_t.paused = true;
1649 maybe_request_map(osdc);
1650 } else if (!osd_homeless(osd)) {
1651 need_send = true;
1652 } else {
1653 maybe_request_map(osdc);
1654 }
1655
1656 mutex_lock(&osd->lock);
1657 /*
1658 * Assign the tid atomically with send_request() to protect
1659 * multiple writes to the same object from racing with each
1660 * other, resulting in out of order ops on the OSDs.
1661 */
1662 req->r_tid = atomic64_inc_return(&osdc->last_tid);
1663 link_request(osd, req);
1664 if (need_send)
1665 send_request(req);
1666 mutex_unlock(&osd->lock);
1330 1667
1668 if (ct_res == CALC_TARGET_POOL_DNE)
1669 send_map_check(req);
1670
1671 if (promoted)
1672 downgrade_write(&osdc->lock);
1673 return;
1674
1675promote:
1676 up_read(&osdc->lock);
1677 down_write(&osdc->lock);
1678 wrlocked = true;
1679 promoted = true;
1680 goto again;
1681}
1682
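The promote: label implements a read-to-write lock upgrade by hand, since rwsems have no atomic upgrade; skeleton of the pattern:

	down_read(&osdc->lock);
again:
	/* fast path; if a session must be created and !wrlocked: */
	up_read(&osdc->lock);
	down_write(&osdc->lock);
	wrlocked = true;
	goto again;	/* state may have changed; recompute the target */

with downgrade_write() returning to the read lock once the slow path has done its work.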
1683static void account_request(struct ceph_osd_request *req)
1684{
1685 unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
1686
1687 if (req->r_flags & CEPH_OSD_FLAG_READ) {
1688 WARN_ON(req->r_flags & mask);
1689 req->r_flags |= CEPH_OSD_FLAG_ACK;
1690 } else if (req->r_flags & CEPH_OSD_FLAG_WRITE)
1691 WARN_ON(!(req->r_flags & mask));
1692 else
1693 WARN_ON(1);
1694
1695 WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask);
1696 atomic_inc(&req->r_osdc->num_requests);
1697}
1698
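account_request() enforces the flag invariants rather than setting policy: a write submitted through this path is expected to already carry something like

	req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;

while reads must carry neither ACK nor ONDISK and get ACK added here.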
1699static void submit_request(struct ceph_osd_request *req, bool wrlocked)
1700{
1331 ceph_osdc_get_request(req); 1701 ceph_osdc_get_request(req);
1332 list_add_tail(&req->r_linger_item, &osdc->req_linger); 1702 account_request(req);
1333 if (req->r_osd) 1703 __submit_request(req, wrlocked);
1334 list_add_tail(&req->r_linger_osd_item,
1335 &req->r_osd->o_linger_requests);
1336} 1704}
1337 1705
1338static void __unregister_linger_request(struct ceph_osd_client *osdc, 1706static void __finish_request(struct ceph_osd_request *req)
1339 struct ceph_osd_request *req)
1340{ 1707{
1341 WARN_ON(!req->r_linger); 1708 struct ceph_osd_client *osdc = req->r_osdc;
1709 struct ceph_osd *osd = req->r_osd;
1342 1710
1343 if (list_empty(&req->r_linger_item)) { 1711 verify_osd_locked(osd);
1344 dout("%s %p tid %llu not registered\n", __func__, req, 1712 dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
1345 req->r_tid); 1713
1714 WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid));
1715 unlink_request(osd, req);
1716 atomic_dec(&osdc->num_requests);
1717
1718 /*
1719 * If an OSD has failed or returned and a request has been sent
1720 * twice, it's possible to get a reply and end up here while the
1721 * request message is queued for delivery. We will ignore the
1722 * reply, so not a big deal, but better to try and catch it.
1723 */
1724 ceph_msg_revoke(req->r_request);
1725 ceph_msg_revoke_incoming(req->r_reply);
1726}
1727
1728static void finish_request(struct ceph_osd_request *req)
1729{
1730 __finish_request(req);
1731 ceph_osdc_put_request(req);
1732}
1733
1734static void __complete_request(struct ceph_osd_request *req)
1735{
1736 if (req->r_callback)
1737 req->r_callback(req);
1738 else
1739 complete_all(&req->r_completion);
1740}
1741
1742/*
1743 * Note that this is open-coded in handle_reply(), which has to deal
1744 * with ack vs commit, dup acks, etc.
1745 */
1746static void complete_request(struct ceph_osd_request *req, int err)
1747{
1748 dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
1749
1750 req->r_result = err;
1751 __finish_request(req);
1752 __complete_request(req);
1753 complete_all(&req->r_safe_completion);
1754 ceph_osdc_put_request(req);
1755}
1756
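Completion fans out through r_callback when set, otherwise through r_completion. Synchronous users typically pair submission with a wait, along these lines (assuming the usual start/wait wrappers elsewhere in this file):

	ret = ceph_osdc_start_request(osdc, req, false);
	if (!ret)
		ret = ceph_osdc_wait_request(osdc, req);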
1757static void cancel_map_check(struct ceph_osd_request *req)
1758{
1759 struct ceph_osd_client *osdc = req->r_osdc;
1760 struct ceph_osd_request *lookup_req;
1761
1762 verify_osdc_wrlocked(osdc);
1763
1764 lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
1765 if (!lookup_req)
1346 return; 1766 return;
1767
1768 WARN_ON(lookup_req != req);
1769 erase_request_mc(&osdc->map_checks, req);
1770 ceph_osdc_put_request(req);
1771}
1772
1773static void cancel_request(struct ceph_osd_request *req)
1774{
1775 dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
1776
1777 cancel_map_check(req);
1778 finish_request(req);
1779}
1780
1781static void check_pool_dne(struct ceph_osd_request *req)
1782{
1783 struct ceph_osd_client *osdc = req->r_osdc;
1784 struct ceph_osdmap *map = osdc->osdmap;
1785
1786 verify_osdc_wrlocked(osdc);
1787 WARN_ON(!map->epoch);
1788
1789 if (req->r_attempts) {
1790 /*
1791 * We sent a request earlier, which means that
1792 * previously the pool existed, and now it does not
1793 * (i.e., it was deleted).
1794 */
1795 req->r_map_dne_bound = map->epoch;
1796 dout("%s req %p tid %llu pool disappeared\n", __func__, req,
1797 req->r_tid);
1798 } else {
1799 dout("%s req %p tid %llu map_dne_bound %u have %u\n", __func__,
1800 req, req->r_tid, req->r_map_dne_bound, map->epoch);
1347 } 1801 }
1348 1802
1349 dout("%s %p tid %llu\n", __func__, req, req->r_tid); 1803 if (req->r_map_dne_bound) {
1350 list_del_init(&req->r_linger_item); 1804 if (map->epoch >= req->r_map_dne_bound) {
1805 /* we had a new enough map */
1806 pr_info_ratelimited("tid %llu pool does not exist\n",
1807 req->r_tid);
1808 complete_request(req, -ENOENT);
1809 }
1810 } else {
1811 send_map_check(req);
1812 }
1813}
1351 1814
1352 if (req->r_osd) { 1815static void map_check_cb(struct ceph_mon_generic_request *greq)
1353 list_del_init(&req->r_linger_osd_item); 1816{
1354 maybe_move_osd_to_lru(osdc, req->r_osd); 1817 struct ceph_osd_client *osdc = &greq->monc->client->osdc;
1355 if (list_empty(&req->r_osd_item)) 1818 struct ceph_osd_request *req;
1356 req->r_osd = NULL; 1819 u64 tid = greq->private_data;
1820
1821 WARN_ON(greq->result || !greq->u.newest);
1822
1823 down_write(&osdc->lock);
1824 req = lookup_request_mc(&osdc->map_checks, tid);
1825 if (!req) {
1826 dout("%s tid %llu dne\n", __func__, tid);
1827 goto out_unlock;
1357 } 1828 }
1829
1830 dout("%s req %p tid %llu map_dne_bound %u newest %llu\n", __func__,
1831 req, req->r_tid, req->r_map_dne_bound, greq->u.newest);
1832 if (!req->r_map_dne_bound)
1833 req->r_map_dne_bound = greq->u.newest;
1834 erase_request_mc(&osdc->map_checks, req);
1835 check_pool_dne(req);
1836
1358 ceph_osdc_put_request(req); 1837 ceph_osdc_put_request(req);
1838out_unlock:
1839 up_write(&osdc->lock);
1359} 1840}
1360 1841
1361void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, 1842static void send_map_check(struct ceph_osd_request *req)
1362 struct ceph_osd_request *req)
1363{ 1843{
1364 if (!req->r_linger) { 1844 struct ceph_osd_client *osdc = req->r_osdc;
1365 dout("set_request_linger %p\n", req); 1845 struct ceph_osd_request *lookup_req;
1366 req->r_linger = 1; 1846 int ret;
1847
1848 verify_osdc_wrlocked(osdc);
1849
1850 lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
1851 if (lookup_req) {
1852 WARN_ON(lookup_req != req);
1853 return;
1367 } 1854 }
1855
1856 ceph_osdc_get_request(req);
1857 insert_request_mc(&osdc->map_checks, req);
1858 ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
1859 map_check_cb, req->r_tid);
1860 WARN_ON(ret);
1368} 1861}
1369EXPORT_SYMBOL(ceph_osdc_set_request_linger);
1370 1862
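Taken together, the map-check machinery double-checks an apparent pool deletion against the newest osdmap before failing anything; the flow, as implemented above:

	send_map_check(req)
	  -> ceph_monc_get_version_async(..., "osdmap", map_check_cb, r_tid)
	  -> map_check_cb(): set r_map_dne_bound to the newest epoch
	  -> check_pool_dne(): complete_request(req, -ENOENT) once
	     map->epoch >= r_map_dne_bound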
1371/* 1863/*
1372 * Returns whether a request should be blocked from being sent 1864 * lingering requests, watch/notify v2 infrastructure
1373 * based on the current osdmap and osd_client settings.
1374 *
1375 * Caller should hold map_sem for read.
1376 */ 1865 */
1377static bool __req_should_be_paused(struct ceph_osd_client *osdc, 1866static void linger_release(struct kref *kref)
1378 struct ceph_osd_request *req)
1379{ 1867{
1380 bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); 1868 struct ceph_osd_linger_request *lreq =
1381 bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || 1869 container_of(kref, struct ceph_osd_linger_request, kref);
1382 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); 1870
1383 return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) || 1871 dout("%s lreq %p reg_req %p ping_req %p\n", __func__, lreq,
1384 (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr); 1872 lreq->reg_req, lreq->ping_req);
1873 WARN_ON(!RB_EMPTY_NODE(&lreq->node));
1874 WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node));
1875 WARN_ON(!RB_EMPTY_NODE(&lreq->mc_node));
1876 WARN_ON(!list_empty(&lreq->scan_item));
1877 WARN_ON(!list_empty(&lreq->pending_lworks));
1878 WARN_ON(lreq->osd);
1879
1880 if (lreq->reg_req)
1881 ceph_osdc_put_request(lreq->reg_req);
1882 if (lreq->ping_req)
1883 ceph_osdc_put_request(lreq->ping_req);
1884 target_destroy(&lreq->t);
1885 kfree(lreq);
1385} 1886}
1386 1887
1888static void linger_put(struct ceph_osd_linger_request *lreq)
1889{
1890 if (lreq)
1891 kref_put(&lreq->kref, linger_release);
1892}
1893
1894static struct ceph_osd_linger_request *
1895linger_get(struct ceph_osd_linger_request *lreq)
1896{
1897 kref_get(&lreq->kref);
1898 return lreq;
1899}
1900
1901static struct ceph_osd_linger_request *
1902linger_alloc(struct ceph_osd_client *osdc)
1903{
1904 struct ceph_osd_linger_request *lreq;
1905
1906 lreq = kzalloc(sizeof(*lreq), GFP_NOIO);
1907 if (!lreq)
1908 return NULL;
1909
1910 kref_init(&lreq->kref);
1911 mutex_init(&lreq->lock);
1912 RB_CLEAR_NODE(&lreq->node);
1913 RB_CLEAR_NODE(&lreq->osdc_node);
1914 RB_CLEAR_NODE(&lreq->mc_node);
1915 INIT_LIST_HEAD(&lreq->scan_item);
1916 INIT_LIST_HEAD(&lreq->pending_lworks);
1917 init_completion(&lreq->reg_commit_wait);
1918 init_completion(&lreq->notify_finish_wait);
1919
1920 lreq->osdc = osdc;
1921 target_init(&lreq->t);
1922
1923 dout("%s lreq %p\n", __func__, lreq);
1924 return lreq;
1925}
1926
1927DEFINE_RB_INSDEL_FUNCS(linger, struct ceph_osd_linger_request, linger_id, node)
1928DEFINE_RB_FUNCS(linger_osdc, struct ceph_osd_linger_request, linger_id, osdc_node)
1929DEFINE_RB_FUNCS(linger_mc, struct ceph_osd_linger_request, linger_id, mc_node)
1930
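A linger request can thus sit in up to three trees at once (per-OSD via node, per-client via osdc_node, map checks via mc_node), all keyed on linger_id. Hedged lifecycle sketch built from the helpers above and below:

	lreq = linger_alloc(osdc);	/* kref = 1 */
	linger_register(lreq);		/* +ref, assigns linger_id */
	link_linger(osd, lreq);		/* enters the per-OSD tree */
	...
	unlink_linger(osd, lreq);
	linger_unregister(lreq);	/* -ref */
	linger_put(lreq);		/* last ref frees via linger_release() */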
1387/* 1931/*
1388 * Calculate mapping of a request to a PG. Takes tiering into account. 1932 * Create linger request <-> OSD session relation.
1933 *
1934 * @lreq has to be registered, @osd may be homeless.
1389 */ 1935 */
1390static int __calc_request_pg(struct ceph_osdmap *osdmap, 1936static void link_linger(struct ceph_osd *osd,
1391 struct ceph_osd_request *req, 1937 struct ceph_osd_linger_request *lreq)
1392 struct ceph_pg *pg_out)
1393{ 1938{
1394 bool need_check_tiering; 1939 verify_osd_locked(osd);
1940 WARN_ON(!lreq->linger_id || lreq->osd);
1941 dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
1942 osd->o_osd, lreq, lreq->linger_id);
1395 1943
1396 need_check_tiering = false; 1944 if (!osd_homeless(osd))
1397 if (req->r_target_oloc.pool == -1) { 1945 __remove_osd_from_lru(osd);
1398 req->r_target_oloc = req->r_base_oloc; /* struct */ 1946 else
1399 need_check_tiering = true; 1947 atomic_inc(&osd->o_osdc->num_homeless);
1948
1949 get_osd(osd);
1950 insert_linger(&osd->o_linger_requests, lreq);
1951 lreq->osd = osd;
1952}
1953
1954static void unlink_linger(struct ceph_osd *osd,
1955 struct ceph_osd_linger_request *lreq)
1956{
1957 verify_osd_locked(osd);
1958 WARN_ON(lreq->osd != osd);
1959 dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
1960 osd->o_osd, lreq, lreq->linger_id);
1961
1962 lreq->osd = NULL;
1963 erase_linger(&osd->o_linger_requests, lreq);
1964 put_osd(osd);
1965
1966 if (!osd_homeless(osd))
1967 maybe_move_osd_to_lru(osd);
1968 else
1969 atomic_dec(&osd->o_osdc->num_homeless);
1970}
1971
1972static bool __linger_registered(struct ceph_osd_linger_request *lreq)
1973{
1974 verify_osdc_locked(lreq->osdc);
1975
1976 return !RB_EMPTY_NODE(&lreq->osdc_node);
1977}
1978
1979static bool linger_registered(struct ceph_osd_linger_request *lreq)
1980{
1981 struct ceph_osd_client *osdc = lreq->osdc;
1982 bool registered;
1983
1984 down_read(&osdc->lock);
1985 registered = __linger_registered(lreq);
1986 up_read(&osdc->lock);
1987
1988 return registered;
1989}
1990
1991static void linger_register(struct ceph_osd_linger_request *lreq)
1992{
1993 struct ceph_osd_client *osdc = lreq->osdc;
1994
1995 verify_osdc_wrlocked(osdc);
1996 WARN_ON(lreq->linger_id);
1997
1998 linger_get(lreq);
1999 lreq->linger_id = ++osdc->last_linger_id;
2000 insert_linger_osdc(&osdc->linger_requests, lreq);
2001}
2002
2003static void linger_unregister(struct ceph_osd_linger_request *lreq)
2004{
2005 struct ceph_osd_client *osdc = lreq->osdc;
2006
2007 verify_osdc_wrlocked(osdc);
2008
2009 erase_linger_osdc(&osdc->linger_requests, lreq);
2010 linger_put(lreq);
2011}
2012
2013static void cancel_linger_request(struct ceph_osd_request *req)
2014{
2015 struct ceph_osd_linger_request *lreq = req->r_priv;
2016
2017 WARN_ON(!req->r_linger);
2018 cancel_request(req);
2019 linger_put(lreq);
2020}
2021
2022struct linger_work {
2023 struct work_struct work;
2024 struct ceph_osd_linger_request *lreq;
2025 struct list_head pending_item;
2026 unsigned long queued_stamp;
2027
2028 union {
2029 struct {
2030 u64 notify_id;
2031 u64 notifier_id;
2032 void *payload; /* points into @msg front */
2033 size_t payload_len;
2034
2035 struct ceph_msg *msg; /* for ceph_msg_put() */
2036 } notify;
2037 struct {
2038 int err;
2039 } error;
2040 };
2041};
2042
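Each linger_work carries exactly one deferred callback; the union is discriminated by the work function passed to lwork_alloc(). The error path below shows the pattern:

	lwork = lwork_alloc(lreq, do_watch_error);
	lwork->error.err = lreq->last_error;
	lwork_queue(lwork);	/* runs on osdc->notify_wq */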
2043static struct linger_work *lwork_alloc(struct ceph_osd_linger_request *lreq,
2044 work_func_t workfn)
2045{
2046 struct linger_work *lwork;
2047
2048 lwork = kzalloc(sizeof(*lwork), GFP_NOIO);
2049 if (!lwork)
2050 return NULL;
2051
2052 INIT_WORK(&lwork->work, workfn);
2053 INIT_LIST_HEAD(&lwork->pending_item);
2054 lwork->lreq = linger_get(lreq);
2055
2056 return lwork;
2057}
2058
2059static void lwork_free(struct linger_work *lwork)
2060{
2061 struct ceph_osd_linger_request *lreq = lwork->lreq;
2062
2063 mutex_lock(&lreq->lock);
2064 list_del(&lwork->pending_item);
2065 mutex_unlock(&lreq->lock);
2066
2067 linger_put(lreq);
2068 kfree(lwork);
2069}
2070
2071static void lwork_queue(struct linger_work *lwork)
2072{
2073 struct ceph_osd_linger_request *lreq = lwork->lreq;
2074 struct ceph_osd_client *osdc = lreq->osdc;
2075
2076 verify_lreq_locked(lreq);
2077 WARN_ON(!list_empty(&lwork->pending_item));
2078
2079 lwork->queued_stamp = jiffies;
2080 list_add_tail(&lwork->pending_item, &lreq->pending_lworks);
2081 queue_work(osdc->notify_wq, &lwork->work);
2082}
2083
2084static void do_watch_notify(struct work_struct *w)
2085{
2086 struct linger_work *lwork = container_of(w, struct linger_work, work);
2087 struct ceph_osd_linger_request *lreq = lwork->lreq;
2088
2089 if (!linger_registered(lreq)) {
2090 dout("%s lreq %p not registered\n", __func__, lreq);
2091 goto out;
1400 } 2092 }
1401 if (req->r_target_oid.name_len == 0) { 2093
1402 ceph_oid_copy(&req->r_target_oid, &req->r_base_oid); 2094 WARN_ON(!lreq->is_watch);
1403 need_check_tiering = true; 2095 dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n",
2096 __func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id,
2097 lwork->notify.payload_len);
2098 lreq->wcb(lreq->data, lwork->notify.notify_id, lreq->linger_id,
2099 lwork->notify.notifier_id, lwork->notify.payload,
2100 lwork->notify.payload_len);
2101
2102out:
2103 ceph_msg_put(lwork->notify.msg);
2104 lwork_free(lwork);
2105}
2106
2107static void do_watch_error(struct work_struct *w)
2108{
2109 struct linger_work *lwork = container_of(w, struct linger_work, work);
2110 struct ceph_osd_linger_request *lreq = lwork->lreq;
2111
2112 if (!linger_registered(lreq)) {
2113 dout("%s lreq %p not registered\n", __func__, lreq);
2114 goto out;
1404 } 2115 }
1405 2116
1406 if (need_check_tiering && 2117 dout("%s lreq %p err %d\n", __func__, lreq, lwork->error.err);
1407 (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { 2118 lreq->errcb(lreq->data, lreq->linger_id, lwork->error.err);
1408 struct ceph_pg_pool_info *pi; 2119
1409 2120out:
1410 pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool); 2121 lwork_free(lwork);
1411 if (pi) { 2122}
1412 if ((req->r_flags & CEPH_OSD_FLAG_READ) && 2123
1413 pi->read_tier >= 0) 2124static void queue_watch_error(struct ceph_osd_linger_request *lreq)
1414 req->r_target_oloc.pool = pi->read_tier; 2125{
1415 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && 2126 struct linger_work *lwork;
1416 pi->write_tier >= 0) 2127
1417 req->r_target_oloc.pool = pi->write_tier; 2128 lwork = lwork_alloc(lreq, do_watch_error);
2129 if (!lwork) {
2130 pr_err("failed to allocate error-lwork\n");
2131 return;
2132 }
2133
2134 lwork->error.err = lreq->last_error;
2135 lwork_queue(lwork);
2136}
2137
2138static void linger_reg_commit_complete(struct ceph_osd_linger_request *lreq,
2139 int result)
2140{
2141 if (!completion_done(&lreq->reg_commit_wait)) {
2142 lreq->reg_commit_error = (result <= 0 ? result : 0);
2143 complete_all(&lreq->reg_commit_wait);
2144 }
2145}
2146
2147static void linger_commit_cb(struct ceph_osd_request *req)
2148{
2149 struct ceph_osd_linger_request *lreq = req->r_priv;
2150
2151 mutex_lock(&lreq->lock);
2152 dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
2153 lreq->linger_id, req->r_result);
2154 WARN_ON(!__linger_registered(lreq));
2155 linger_reg_commit_complete(lreq, req->r_result);
2156 lreq->committed = true;
2157
2158 if (!lreq->is_watch) {
2159 struct ceph_osd_data *osd_data =
2160 osd_req_op_data(req, 0, notify, response_data);
2161 void *p = page_address(osd_data->pages[0]);
2162
2163 WARN_ON(req->r_ops[0].op != CEPH_OSD_OP_NOTIFY ||
2164 osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
2165
2166 /* make note of the notify_id */
2167 if (req->r_ops[0].outdata_len >= sizeof(u64)) {
2168 lreq->notify_id = ceph_decode_64(&p);
2169 dout("lreq %p notify_id %llu\n", lreq,
2170 lreq->notify_id);
2171 } else {
2172 dout("lreq %p no notify_id\n", lreq);
1418 } 2173 }
1419 /* !pi is caught in ceph_oloc_oid_to_pg() */
1420 } 2174 }
1421 2175
1422 return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc, 2176 mutex_unlock(&lreq->lock);
1423 &req->r_target_oid, pg_out); 2177 linger_put(lreq);
1424} 2178}
1425 2179
1426static void __enqueue_request(struct ceph_osd_request *req) 2180static int normalize_watch_error(int err)
1427{ 2181{
1428 struct ceph_osd_client *osdc = req->r_osdc; 2182 /*
2183 * Translate ENOENT -> ENOTCONN so that a delete->disconnection
2184 * notification and a failure to reconnect because we raced with
2185 * the delete appear the same to the user.
2186 */
2187 if (err == -ENOENT)
2188 err = -ENOTCONN;
2189
2190 return err;
2191}
2192
2193static void linger_reconnect_cb(struct ceph_osd_request *req)
2194{
2195 struct ceph_osd_linger_request *lreq = req->r_priv;
2196
2197 mutex_lock(&lreq->lock);
2198 dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__,
2199 lreq, lreq->linger_id, req->r_result, lreq->last_error);
2200 if (req->r_result < 0) {
2201 if (!lreq->last_error) {
2202 lreq->last_error = normalize_watch_error(req->r_result);
2203 queue_watch_error(lreq);
2204 }
2205 }
1429 2206
1430 dout("%s %p tid %llu to osd%d\n", __func__, req, req->r_tid, 2207 mutex_unlock(&lreq->lock);
1431 req->r_osd ? req->r_osd->o_osd : -1); 2208 linger_put(lreq);
2209}
2210
2211static void send_linger(struct ceph_osd_linger_request *lreq)
2212{
2213 struct ceph_osd_request *req = lreq->reg_req;
2214 struct ceph_osd_req_op *op = &req->r_ops[0];
1432 2215
1433 if (req->r_osd) { 2216 verify_osdc_wrlocked(req->r_osdc);
1434 __remove_osd_from_lru(req->r_osd); 2217 dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
1435 list_add_tail(&req->r_osd_item, &req->r_osd->o_requests); 2218
1436 list_move_tail(&req->r_req_lru_item, &osdc->req_unsent); 2219 if (req->r_osd)
2220 cancel_linger_request(req);
2221
2222 request_reinit(req);
2223 ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
2224 ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
2225 req->r_flags = lreq->t.flags;
2226 req->r_mtime = lreq->mtime;
2227
2228 mutex_lock(&lreq->lock);
2229 if (lreq->is_watch && lreq->committed) {
2230 WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
2231 op->watch.cookie != lreq->linger_id);
2232 op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT;
2233 op->watch.gen = ++lreq->register_gen;
2234 dout("lreq %p reconnect register_gen %u\n", lreq,
2235 op->watch.gen);
2236 req->r_callback = linger_reconnect_cb;
1437 } else { 2237 } else {
1438 list_move_tail(&req->r_req_lru_item, &osdc->req_notarget); 2238 if (!lreq->is_watch)
2239 lreq->notify_id = 0;
2240 else
2241 WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH);
2242 dout("lreq %p register\n", lreq);
2243 req->r_callback = linger_commit_cb;
1439 } 2244 }
2245 mutex_unlock(&lreq->lock);
2246
2247 req->r_priv = linger_get(lreq);
2248 req->r_linger = true;
2249
2250 submit_request(req, true);
1440} 2251}
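
send_linger() reuses one registration request for the lifetime of the watch: request_reinit() strips the old state, and the op is flipped between an initial WATCH and a RECONNECT with a bumped register_gen once the first registration has committed. A compilable sketch of just that decision (hypothetical names, not the libceph types):

#include <stdio.h>

enum watch_op { OP_WATCH, OP_RECONNECT };

struct linger {
	int committed;        /* initial registration acked */
	unsigned int gen;     /* bumped on every reconnect */
};

static enum watch_op prep_reg_op(struct linger *l)
{
	if (l->committed) {
		l->gen++;             /* lets pong handlers spot stale acks */
		return OP_RECONNECT;
	}
	return OP_WATCH;              /* first registration */
}

int main(void)
{
	struct linger l = { 0 };

	printf("op=%d gen=%u\n", prep_reg_op(&l), l.gen);  /* op=0 gen=0 */
	l.committed = 1;
	printf("op=%d gen=%u\n", prep_reg_op(&l), l.gen);  /* op=1 gen=1 */
	return 0;
}
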
1441 2252
1442/* 2253static void linger_ping_cb(struct ceph_osd_request *req)
1443 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
1444 * (as needed), and set the request r_osd appropriately. If there is
1445 * no up osd, set r_osd to NULL. Move the request to the appropriate list
1446 * (unsent, homeless) or leave it on the in-flight lru.
1447 *
1448 * Return 0 if unchanged, 1 if changed, or negative on error.
1449 *
1450 * Caller should hold map_sem for read and request_mutex.
1451 */
1452static int __map_request(struct ceph_osd_client *osdc,
1453 struct ceph_osd_request *req, int force_resend)
1454{ 2254{
1455 struct ceph_pg pgid; 2255 struct ceph_osd_linger_request *lreq = req->r_priv;
1456 int acting[CEPH_PG_MAX_SIZE]; 2256
1457 int num, o; 2257 mutex_lock(&lreq->lock);
1458 int err; 2258 dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n",
1459 bool was_paused; 2259 __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent,
1460 2260 lreq->last_error);
1461 dout("map_request %p tid %lld\n", req, req->r_tid); 2261 if (lreq->register_gen == req->r_ops[0].watch.gen) {
1462 2262 if (!req->r_result) {
1463 err = __calc_request_pg(osdc->osdmap, req, &pgid); 2263 lreq->watch_valid_thru = lreq->ping_sent;
1464 if (err) { 2264 } else if (!lreq->last_error) {
1465 list_move(&req->r_req_lru_item, &osdc->req_notarget); 2265 lreq->last_error = normalize_watch_error(req->r_result);
1466 return err; 2266 queue_watch_error(lreq);
1467 }
1468 req->r_pgid = pgid;
1469
1470 num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o);
1471 if (num < 0)
1472 num = 0;
1473
1474 was_paused = req->r_paused;
1475 req->r_paused = __req_should_be_paused(osdc, req);
1476 if (was_paused && !req->r_paused)
1477 force_resend = 1;
1478
1479 if ((!force_resend &&
1480 req->r_osd && req->r_osd->o_osd == o &&
1481 req->r_sent >= req->r_osd->o_incarnation &&
1482 req->r_num_pg_osds == num &&
1483 memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
1484 (req->r_osd == NULL && o == -1) ||
1485 req->r_paused)
1486 return 0; /* no change */
1487
1488 dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
1489 req->r_tid, pgid.pool, pgid.seed, o,
1490 req->r_osd ? req->r_osd->o_osd : -1);
1491
1492 /* record full pg acting set */
1493 memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
1494 req->r_num_pg_osds = num;
1495
1496 if (req->r_osd) {
1497 __cancel_request(req);
1498 list_del_init(&req->r_osd_item);
1499 list_del_init(&req->r_linger_osd_item);
1500 req->r_osd = NULL;
1501 }
1502
1503 req->r_osd = __lookup_osd(osdc, o);
1504 if (!req->r_osd && o >= 0) {
1505 err = -ENOMEM;
1506 req->r_osd = create_osd(osdc, o);
1507 if (!req->r_osd) {
1508 list_move(&req->r_req_lru_item, &osdc->req_notarget);
1509 goto out;
1510 } 2267 }
2268 } else {
2269 dout("lreq %p register_gen %u ignoring old pong %u\n", lreq,
2270 lreq->register_gen, req->r_ops[0].watch.gen);
2271 }
1511 2272
1512 dout("map_request osd %p is osd%d\n", req->r_osd, o); 2273 mutex_unlock(&lreq->lock);
1513 __insert_osd(osdc, req->r_osd); 2274 linger_put(lreq);
2275}
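
The counterpart of the register_gen bump above: linger_ping_cb() only trusts a pong whose generation matches the current registration, so an answer to a ping sent before a reconnect can neither extend watch_valid_thru nor raise an error. A self-contained sketch of that filter (illustrative names):

#include <stdio.h>

struct watch {
	unsigned int register_gen;   /* bumped on every reconnect */
	unsigned long valid_thru;    /* last ping known to be answered */
};

static void handle_pong(struct watch *w, unsigned int pong_gen,
			unsigned long ping_sent)
{
	if (pong_gen != w->register_gen) {
		printf("ignoring stale pong gen=%u (current %u)\n",
		       pong_gen, w->register_gen);
		return;
	}
	w->valid_thru = ping_sent;   /* the watch was live at ping time */
}

int main(void)
{
	struct watch w = { .register_gen = 2 };

	handle_pong(&w, 1, 100);     /* stale: sent before the reconnect */
	handle_pong(&w, 2, 200);     /* accepted */
	printf("valid_thru=%lu\n", w.valid_thru);  /* 200 */
	return 0;
}
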
2276
2277static void send_linger_ping(struct ceph_osd_linger_request *lreq)
2278{
2279 struct ceph_osd_client *osdc = lreq->osdc;
2280 struct ceph_osd_request *req = lreq->ping_req;
2281 struct ceph_osd_req_op *op = &req->r_ops[0];
1514 2282
1515 ceph_con_open(&req->r_osd->o_con, 2283 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) {
1516 CEPH_ENTITY_TYPE_OSD, o, 2284 dout("%s PAUSERD\n", __func__);
1517 &osdc->osdmap->osd_addr[o]); 2285 return;
1518 } 2286 }
1519 2287
1520 __enqueue_request(req); 2288 lreq->ping_sent = jiffies;
1521 err = 1; /* osd or pg changed */ 2289 dout("%s lreq %p linger_id %llu ping_sent %lu register_gen %u\n",
2290 __func__, lreq, lreq->linger_id, lreq->ping_sent,
2291 lreq->register_gen);
1522 2292
1523out: 2293 if (req->r_osd)
1524 return err; 2294 cancel_linger_request(req);
2295
2296 request_reinit(req);
2297 target_copy(&req->r_t, &lreq->t);
2298
2299 WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
2300 op->watch.cookie != lreq->linger_id ||
2301 op->watch.op != CEPH_OSD_WATCH_OP_PING);
2302 op->watch.gen = lreq->register_gen;
2303 req->r_callback = linger_ping_cb;
2304 req->r_priv = linger_get(lreq);
2305 req->r_linger = true;
2306
2307 ceph_osdc_get_request(req);
2308 account_request(req);
2309 req->r_tid = atomic64_inc_return(&osdc->last_tid);
2310 link_request(lreq->osd, req);
2311 send_request(req);
1525} 2312}
1526 2313
1527/* 2314static void linger_submit(struct ceph_osd_linger_request *lreq)
1528 * caller should hold map_sem (for read) and request_mutex
1529 */
1530static void __send_request(struct ceph_osd_client *osdc,
1531 struct ceph_osd_request *req)
1532{ 2315{
1533 void *p; 2316 struct ceph_osd_client *osdc = lreq->osdc;
2317 struct ceph_osd *osd;
1534 2318
1535 dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n", 2319 calc_target(osdc, &lreq->t, &lreq->last_force_resend, false);
1536 req, req->r_tid, req->r_osd->o_osd, req->r_flags, 2320 osd = lookup_create_osd(osdc, lreq->t.osd, true);
1537 (unsigned long long)req->r_pgid.pool, req->r_pgid.seed); 2321 link_linger(osd, lreq);
1538 2322
1539 /* fill in message content that changes each time we send it */ 2323 send_linger(lreq);
1540 put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); 2324}
1541 put_unaligned_le32(req->r_flags, req->r_request_flags);
1542 put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool);
1543 p = req->r_request_pgid;
1544 ceph_encode_64(&p, req->r_pgid.pool);
1545 ceph_encode_32(&p, req->r_pgid.seed);
1546 put_unaligned_le64(1, req->r_request_attempts); /* FIXME */
1547 memcpy(req->r_request_reassert_version, &req->r_reassert_version,
1548 sizeof(req->r_reassert_version));
1549 2325
1550 req->r_stamp = jiffies; 2326static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq)
1551 list_move_tail(&req->r_req_lru_item, &osdc->req_lru); 2327{
2328 struct ceph_osd_client *osdc = lreq->osdc;
2329 struct ceph_osd_linger_request *lookup_lreq;
1552 2330
1553 ceph_msg_get(req->r_request); /* send consumes a ref */ 2331 verify_osdc_wrlocked(osdc);
1554 2332
1555 req->r_sent = req->r_osd->o_incarnation; 2333 lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
2334 lreq->linger_id);
2335 if (!lookup_lreq)
2336 return;
1556 2337
1557 ceph_con_send(&req->r_osd->o_con, req->r_request); 2338 WARN_ON(lookup_lreq != lreq);
2339 erase_linger_mc(&osdc->linger_map_checks, lreq);
2340 linger_put(lreq);
1558} 2341}
1559 2342
1560/* 2343/*
1561 * Send any requests in the queue (req_unsent). 2344 * @lreq has to be both registered and linked.
1562 */ 2345 */
1563static void __send_queued(struct ceph_osd_client *osdc) 2346static void __linger_cancel(struct ceph_osd_linger_request *lreq)
2347{
2348 if (lreq->is_watch && lreq->ping_req->r_osd)
2349 cancel_linger_request(lreq->ping_req);
2350 if (lreq->reg_req->r_osd)
2351 cancel_linger_request(lreq->reg_req);
2352 cancel_linger_map_check(lreq);
2353 unlink_linger(lreq->osd, lreq);
2354 linger_unregister(lreq);
2355}
2356
2357static void linger_cancel(struct ceph_osd_linger_request *lreq)
1564{ 2358{
1565 struct ceph_osd_request *req, *tmp; 2359 struct ceph_osd_client *osdc = lreq->osdc;
1566 2360
1567 dout("__send_queued\n"); 2361 down_write(&osdc->lock);
1568 list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) 2362 if (__linger_registered(lreq))
1569 __send_request(osdc, req); 2363 __linger_cancel(lreq);
2364 up_write(&osdc->lock);
1570} 2365}
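
linger_cancel() must re-check __linger_registered() after taking the rwsem for write, since the watch may have been torn down by another path between the caller's decision and lock acquisition. The same check-then-act shape with a pthreads rwlock, runnable stand-alone:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static int registered = 1;

static void __cancel(void)         /* caller holds lock for write */
{
	registered = 0;
	printf("cancelled\n");
}

static void cancel(void)
{
	pthread_rwlock_wrlock(&lock);
	if (registered)                /* re-check under the lock */
		__cancel();
	pthread_rwlock_unlock(&lock);
}

int main(void)
{
	cancel();   /* performs the teardown */
	cancel();   /* no-op: already unregistered */
	return 0;
}
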
1571 2366
1572/* 2367static void send_linger_map_check(struct ceph_osd_linger_request *lreq);
1573 * Caller should hold map_sem for read and request_mutex. 2368
1574 */ 2369static void check_linger_pool_dne(struct ceph_osd_linger_request *lreq)
1575static int __ceph_osdc_start_request(struct ceph_osd_client *osdc, 2370{
1576 struct ceph_osd_request *req, 2371 struct ceph_osd_client *osdc = lreq->osdc;
1577 bool nofail) 2372 struct ceph_osdmap *map = osdc->osdmap;
1578{ 2373
1579 int rc; 2374 verify_osdc_wrlocked(osdc);
1580 2375 WARN_ON(!map->epoch);
1581 __register_request(osdc, req); 2376
1582 req->r_sent = 0; 2377 if (lreq->register_gen) {
1583 req->r_got_reply = 0; 2378 lreq->map_dne_bound = map->epoch;
1584 rc = __map_request(osdc, req, 0); 2379 dout("%s lreq %p linger_id %llu pool disappeared\n", __func__,
1585 if (rc < 0) { 2380 lreq, lreq->linger_id);
1586 if (nofail) { 2381 } else {
1587 dout("osdc_start_request failed map, " 2382 dout("%s lreq %p linger_id %llu map_dne_bound %u have %u\n",
1588 " will retry %lld\n", req->r_tid); 2383 __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
1589 rc = 0; 2384 map->epoch);
1590 } else {
1591 __unregister_request(osdc, req);
1592 }
1593 return rc;
1594 } 2385 }
1595 2386
1596 if (req->r_osd == NULL) { 2387 if (lreq->map_dne_bound) {
1597 dout("send_request %p no up osds in pg\n", req); 2388 if (map->epoch >= lreq->map_dne_bound) {
1598 ceph_monc_request_next_osdmap(&osdc->client->monc); 2389 /* we had a new enough map */
2390 pr_info("linger_id %llu pool does not exist\n",
2391 lreq->linger_id);
2392 linger_reg_commit_complete(lreq, -ENOENT);
2393 __linger_cancel(lreq);
2394 }
1599 } else { 2395 } else {
1600 __send_queued(osdc); 2396 send_linger_map_check(lreq);
1601 } 2397 }
2398}
1602 2399
1603 return 0; 2400static void linger_map_check_cb(struct ceph_mon_generic_request *greq)
2401{
2402 struct ceph_osd_client *osdc = &greq->monc->client->osdc;
2403 struct ceph_osd_linger_request *lreq;
2404 u64 linger_id = greq->private_data;
2405
2406 WARN_ON(greq->result || !greq->u.newest);
2407
2408 down_write(&osdc->lock);
2409 lreq = lookup_linger_mc(&osdc->linger_map_checks, linger_id);
2410 if (!lreq) {
2411 dout("%s linger_id %llu dne\n", __func__, linger_id);
2412 goto out_unlock;
2413 }
2414
2415 dout("%s lreq %p linger_id %llu map_dne_bound %u newest %llu\n",
2416 __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
2417 greq->u.newest);
2418 if (!lreq->map_dne_bound)
2419 lreq->map_dne_bound = greq->u.newest;
2420 erase_linger_mc(&osdc->linger_map_checks, lreq);
2421 check_linger_pool_dne(lreq);
2422
2423 linger_put(lreq);
2424out_unlock:
2425 up_write(&osdc->lock);
2426}
2427
2428static void send_linger_map_check(struct ceph_osd_linger_request *lreq)
2429{
2430 struct ceph_osd_client *osdc = lreq->osdc;
2431 struct ceph_osd_linger_request *lookup_lreq;
2432 int ret;
2433
2434 verify_osdc_wrlocked(osdc);
2435
2436 lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
2437 lreq->linger_id);
2438 if (lookup_lreq) {
2439 WARN_ON(lookup_lreq != lreq);
2440 return;
2441 }
2442
2443 linger_get(lreq);
2444 insert_linger_mc(&osdc->linger_map_checks, lreq);
2445 ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
2446 linger_map_check_cb, lreq->linger_id);
2447 WARN_ON(ret);
2448}
2449
2450static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq)
2451{
2452 int ret;
2453
2454 dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
2455 ret = wait_for_completion_interruptible(&lreq->reg_commit_wait);
2456 return ret ?: lreq->reg_commit_error;
2457}
2458
2459static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq)
2460{
2461 int ret;
2462
2463 dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
2464 ret = wait_for_completion_interruptible(&lreq->notify_finish_wait);
2465 return ret ?: lreq->notify_finish_error;
1604} 2466}
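
Both wait helpers end with the GNU C conditional-with-omitted-operand, ret ?: err: an interrupted wait reports the wait error, and only a clean wakeup reports the operation's own status. Portable equivalent, runnable:

#include <stdio.h>

/* GNU C writes this as: wait_err ?: op_err */
static int combine(int wait_err, int op_err)
{
	return wait_err ? wait_err : op_err;
}

int main(void)
{
	printf("%d\n", combine(-4, -2));  /* -4: interrupted wait wins */
	printf("%d\n", combine(0, -2));   /* -2: operation's own error */
	return 0;
}
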
1605 2467
1606/* 2468/*
1607 * Timeout callback, called every N seconds when one or more osd 2469 * Timeout callback, called every N seconds. When one or more OSD
1608 * requests have been active for more than N seconds. When this 2470 * requests have been active for more than N seconds, we send a keepalive
1609 * happens, we ping all OSDs with requests that have timed out to 2471 * (tag + timestamp) to the OSD to ensure any communications channel
1610 * ensure any communications channel reset is detected. Reset the 2472 * reset is detected.
1611 * request timeouts another N seconds in the future as we go.
1612 * Reschedule the timeout event another N seconds in future (unless
1613 * there are no open requests).
1614 */ 2473 */
1615static void handle_timeout(struct work_struct *work) 2474static void handle_timeout(struct work_struct *work)
1616{ 2475{
1617 struct ceph_osd_client *osdc = 2476 struct ceph_osd_client *osdc =
1618 container_of(work, struct ceph_osd_client, timeout_work.work); 2477 container_of(work, struct ceph_osd_client, timeout_work.work);
1619 struct ceph_options *opts = osdc->client->options; 2478 struct ceph_options *opts = osdc->client->options;
1620 struct ceph_osd_request *req; 2479 unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
1621 struct ceph_osd *osd; 2480 LIST_HEAD(slow_osds);
1622 struct list_head slow_osds; 2481 struct rb_node *n, *p;
1623 dout("timeout\n");
1624 down_read(&osdc->map_sem);
1625
1626 ceph_monc_request_next_osdmap(&osdc->client->monc);
1627 2482
1628 mutex_lock(&osdc->request_mutex); 2483 dout("%s osdc %p\n", __func__, osdc);
2484 down_write(&osdc->lock);
1629 2485
1630 /* 2486 /*
1631 * ping osds that are a bit slow. this ensures that if there 2487 * ping osds that are a bit slow. this ensures that if there
1632 * is a break in the TCP connection we will notice, and reopen 2488 * is a break in the TCP connection we will notice, and reopen
1633 * a connection with that osd (from the fault callback). 2489 * a connection with that osd (from the fault callback).
1634 */ 2490 */
1635 INIT_LIST_HEAD(&slow_osds); 2491 for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
1636 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) { 2492 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
1637 if (time_before(jiffies, 2493 bool found = false;
1638 req->r_stamp + opts->osd_keepalive_timeout)) 2494
1639 break; 2495 for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
2496 struct ceph_osd_request *req =
2497 rb_entry(p, struct ceph_osd_request, r_node);
2498
2499 if (time_before(req->r_stamp, cutoff)) {
2500 dout(" req %p tid %llu on osd%d is laggy\n",
2501 req, req->r_tid, osd->o_osd);
2502 found = true;
2503 }
2504 }
2505 for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) {
2506 struct ceph_osd_linger_request *lreq =
2507 rb_entry(p, struct ceph_osd_linger_request, node);
2508
2509 dout(" lreq %p linger_id %llu is served by osd%d\n",
2510 lreq, lreq->linger_id, osd->o_osd);
2511 found = true;
2512
2513 mutex_lock(&lreq->lock);
2514 if (lreq->is_watch && lreq->committed && !lreq->last_error)
2515 send_linger_ping(lreq);
2516 mutex_unlock(&lreq->lock);
2517 }
1640 2518
1641 osd = req->r_osd; 2519 if (found)
1642 BUG_ON(!osd); 2520 list_move_tail(&osd->o_keepalive_item, &slow_osds);
1643 dout(" tid %llu is slow, will send keepalive on osd%d\n",
1644 req->r_tid, osd->o_osd);
1645 list_move_tail(&osd->o_keepalive_item, &slow_osds);
1646 } 2521 }
2522
2523 if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds))
2524 maybe_request_map(osdc);
2525
1647 while (!list_empty(&slow_osds)) { 2526 while (!list_empty(&slow_osds)) {
1648 osd = list_entry(slow_osds.next, struct ceph_osd, 2527 struct ceph_osd *osd = list_first_entry(&slow_osds,
1649 o_keepalive_item); 2528 struct ceph_osd,
2529 o_keepalive_item);
1650 list_del_init(&osd->o_keepalive_item); 2530 list_del_init(&osd->o_keepalive_item);
1651 ceph_con_keepalive(&osd->o_con); 2531 ceph_con_keepalive(&osd->o_con);
1652 } 2532 }
1653 2533
1654 __schedule_osd_timeout(osdc); 2534 up_write(&osdc->lock);
1655 __send_queued(osdc); 2535 schedule_delayed_work(&osdc->timeout_work,
1656 mutex_unlock(&osdc->request_mutex); 2536 osdc->client->options->osd_keepalive_timeout);
1657 up_read(&osdc->map_sem);
1658} 2537}
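
The laggy scan computes a cutoff of jiffies minus the keepalive timeout and compares with time_before(), which subtracts first and tests the sign so the comparison survives jiffies wraparound. A standalone demonstration of why the naive comparison fails at the wrap point:

#include <limits.h>
#include <stdio.h>

/* kernel's time_before(a, b): true if a is earlier than b */
#define time_before(a, b)  ((long)((a) - (b)) < 0)

int main(void)
{
	unsigned long stamp = ULONG_MAX - 5;  /* just before wraparound */
	unsigned long cutoff = 10;            /* just after wraparound */

	/* naive "stamp < cutoff" is false here; the idiom is true,
	 * correctly treating stamp as 16 ticks earlier than cutoff */
	printf("naive: %d  time_before: %d\n",
	       stamp < cutoff, time_before(stamp, cutoff));
	return 0;
}
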
1659 2538
1660static void handle_osds_timeout(struct work_struct *work) 2539static void handle_osds_timeout(struct work_struct *work)
@@ -1663,12 +2542,20 @@ static void handle_osds_timeout(struct work_struct *work)
1663 container_of(work, struct ceph_osd_client, 2542 container_of(work, struct ceph_osd_client,
1664 osds_timeout_work.work); 2543 osds_timeout_work.work);
1665 unsigned long delay = osdc->client->options->osd_idle_ttl / 4; 2544 unsigned long delay = osdc->client->options->osd_idle_ttl / 4;
2545 struct ceph_osd *osd, *nosd;
1666 2546
1667 dout("osds timeout\n"); 2547 dout("%s osdc %p\n", __func__, osdc);
1668 down_read(&osdc->map_sem); 2548 down_write(&osdc->lock);
1669 remove_old_osds(osdc); 2549 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
1670 up_read(&osdc->map_sem); 2550 if (time_before(jiffies, osd->lru_ttl))
2551 break;
1671 2552
2553 WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
2554 WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
2555 close_osd(osd);
2556 }
2557
2558 up_write(&osdc->lock);
1672 schedule_delayed_work(&osdc->osds_timeout_work, 2559 schedule_delayed_work(&osdc->osds_timeout_work,
1673 round_jiffies_relative(delay)); 2560 round_jiffies_relative(delay));
1674} 2561}
@@ -1776,107 +2663,76 @@ e_inval:
1776 goto out; 2663 goto out;
1777} 2664}
1778 2665
1779static void complete_request(struct ceph_osd_request *req) 2666struct MOSDOpReply {
1780{ 2667 struct ceph_pg pgid;
1781 complete_all(&req->r_safe_completion); /* fsync waiter */ 2668 u64 flags;
1782} 2669 int result;
2670 u32 epoch;
2671 int num_ops;
2672 u32 outdata_len[CEPH_OSD_MAX_OPS];
2673 s32 rval[CEPH_OSD_MAX_OPS];
2674 int retry_attempt;
2675 struct ceph_eversion replay_version;
2676 u64 user_version;
2677 struct ceph_request_redirect redirect;
2678};
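
decode_MOSDOpReply() below leans on the ceph_decode_*_safe() helpers, which verify the remaining buffer length before every read and branch to e_inval on truncation. A minimal userspace sketch of one such bounds-checked reader (simplified: no little-endian conversion, unlike the real decoders):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Read a u32 from *p only if at least 4 bytes remain before end. */
static int decode_u32_safe(const uint8_t **p, const uint8_t *end,
			   uint32_t *val)
{
	if (end - *p < 4)
		return -22;                 /* -EINVAL: truncated message */
	memcpy(val, *p, sizeof(*val));      /* real code also converts from LE */
	*p += 4;
	return 0;
}

int main(void)
{
	uint8_t buf[6] = { 1, 0, 0, 0, 9, 9 };
	const uint8_t *p = buf, *end = buf + sizeof(buf);
	uint32_t v = 0;
	int ret;

	ret = decode_u32_safe(&p, end, &v);
	printf("%d v=%u\n", ret, v);               /* 0 v=1 on LE hosts */
	printf("%d\n", decode_u32_safe(&p, end, &v)); /* -22: 2 bytes left */
	return 0;
}
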
1783 2679
1784/* 2680static int decode_MOSDOpReply(const struct ceph_msg *msg, struct MOSDOpReply *m)
1785 * handle osd op reply. either call the callback if it is specified,
1786 * or do the completion to wake up the waiting thread.
1787 */
1788static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1789{ 2681{
1790 void *p, *end; 2682 void *p = msg->front.iov_base;
1791 struct ceph_osd_request *req; 2683 void *const end = p + msg->front.iov_len;
1792 struct ceph_request_redirect redir; 2684 u16 version = le16_to_cpu(msg->hdr.version);
1793 u64 tid; 2685 struct ceph_eversion bad_replay_version;
1794 int object_len;
1795 unsigned int numops;
1796 int payload_len, flags;
1797 s32 result;
1798 s32 retry_attempt;
1799 struct ceph_pg pg;
1800 int err;
1801 u32 reassert_epoch;
1802 u64 reassert_version;
1803 u32 osdmap_epoch;
1804 int already_completed;
1805 u32 bytes;
1806 u8 decode_redir; 2686 u8 decode_redir;
1807 unsigned int i; 2687 u32 len;
1808 2688 int ret;
1809 tid = le64_to_cpu(msg->hdr.tid); 2689 int i;
1810 dout("handle_reply %p tid %llu\n", msg, tid);
1811 2690
1812 p = msg->front.iov_base; 2691 ceph_decode_32_safe(&p, end, len, e_inval);
1813 end = p + msg->front.iov_len; 2692 ceph_decode_need(&p, end, len, e_inval);
2693 p += len; /* skip oid */
1814 2694
1815 ceph_decode_need(&p, end, 4, bad); 2695 ret = ceph_decode_pgid(&p, end, &m->pgid);
1816 object_len = ceph_decode_32(&p); 2696 if (ret)
1817 ceph_decode_need(&p, end, object_len, bad); 2697 return ret;
1818 p += object_len;
1819 2698
1820 err = ceph_decode_pgid(&p, end, &pg); 2699 ceph_decode_64_safe(&p, end, m->flags, e_inval);
1821 if (err) 2700 ceph_decode_32_safe(&p, end, m->result, e_inval);
1822 goto bad; 2701 ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval);
2702 memcpy(&bad_replay_version, p, sizeof(bad_replay_version));
2703 p += sizeof(bad_replay_version);
2704 ceph_decode_32_safe(&p, end, m->epoch, e_inval);
1823 2705
1824 ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad); 2706 ceph_decode_32_safe(&p, end, m->num_ops, e_inval);
1825 flags = ceph_decode_64(&p); 2707 if (m->num_ops > ARRAY_SIZE(m->outdata_len))
1826 result = ceph_decode_32(&p); 2708 goto e_inval;
1827 reassert_epoch = ceph_decode_32(&p);
1828 reassert_version = ceph_decode_64(&p);
1829 osdmap_epoch = ceph_decode_32(&p);
1830
1831 /* lookup */
1832 down_read(&osdc->map_sem);
1833 mutex_lock(&osdc->request_mutex);
1834 req = __lookup_request(osdc, tid);
1835 if (req == NULL) {
1836 dout("handle_reply tid %llu dne\n", tid);
1837 goto bad_mutex;
1838 }
1839 ceph_osdc_get_request(req);
1840 2709
1841 dout("handle_reply %p tid %llu req %p result %d\n", msg, tid, 2710 ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op),
1842 req, result); 2711 e_inval);
1843 2712 for (i = 0; i < m->num_ops; i++) {
1844 ceph_decode_need(&p, end, 4, bad_put);
1845 numops = ceph_decode_32(&p);
1846 if (numops > CEPH_OSD_MAX_OPS)
1847 goto bad_put;
1848 if (numops != req->r_num_ops)
1849 goto bad_put;
1850 payload_len = 0;
1851 ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad_put);
1852 for (i = 0; i < numops; i++) {
1853 struct ceph_osd_op *op = p; 2713 struct ceph_osd_op *op = p;
1854 int len;
1855 2714
1856 len = le32_to_cpu(op->payload_len); 2715 m->outdata_len[i] = le32_to_cpu(op->payload_len);
1857 req->r_ops[i].outdata_len = len;
1858 dout(" op %d has %d bytes\n", i, len);
1859 payload_len += len;
1860 p += sizeof(*op); 2716 p += sizeof(*op);
1861 } 2717 }
1862 bytes = le32_to_cpu(msg->hdr.data_len);
1863 if (payload_len != bytes) {
1864 pr_warn("sum of op payload lens %d != data_len %d\n",
1865 payload_len, bytes);
1866 goto bad_put;
1867 }
1868 2718
1869 ceph_decode_need(&p, end, 4 + numops * 4, bad_put); 2719 ceph_decode_32_safe(&p, end, m->retry_attempt, e_inval);
1870 retry_attempt = ceph_decode_32(&p); 2720 for (i = 0; i < m->num_ops; i++)
1871 for (i = 0; i < numops; i++) 2721 ceph_decode_32_safe(&p, end, m->rval[i], e_inval);
1872 req->r_ops[i].rval = ceph_decode_32(&p);
1873 2722
1874 if (le16_to_cpu(msg->hdr.version) >= 6) { 2723 if (version >= 5) {
1875 p += 8 + 4; /* skip replay_version */ 2724 ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval);
1876 p += 8; /* skip user_version */ 2725 memcpy(&m->replay_version, p, sizeof(m->replay_version));
2726 p += sizeof(m->replay_version);
2727 ceph_decode_64_safe(&p, end, m->user_version, e_inval);
2728 } else {
2729 m->replay_version = bad_replay_version; /* struct */
2730 m->user_version = le64_to_cpu(m->replay_version.version);
2731 }
1877 2732
1878 if (le16_to_cpu(msg->hdr.version) >= 7) 2733 if (version >= 6) {
1879 ceph_decode_8_safe(&p, end, decode_redir, bad_put); 2734 if (version >= 7)
2735 ceph_decode_8_safe(&p, end, decode_redir, e_inval);
1880 else 2736 else
1881 decode_redir = 1; 2737 decode_redir = 1;
1882 } else { 2738 } else {
@@ -1884,228 +2740,410 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1884 } 2740 }
1885 2741
1886 if (decode_redir) { 2742 if (decode_redir) {
1887 err = ceph_redirect_decode(&p, end, &redir); 2743 ret = ceph_redirect_decode(&p, end, &m->redirect);
1888 if (err) 2744 if (ret)
1889 goto bad_put; 2745 return ret;
1890 } else { 2746 } else {
1891 redir.oloc.pool = -1; 2747 ceph_oloc_init(&m->redirect.oloc);
1892 } 2748 }
1893 2749
1894 if (redir.oloc.pool != -1) { 2750 return 0;
1895 dout("redirect pool %lld\n", redir.oloc.pool);
1896
1897 __unregister_request(osdc, req);
1898
1899 req->r_target_oloc = redir.oloc; /* struct */
1900 2751
1901 /* 2752e_inval:
1902 * Start redirect requests with nofail=true. If 2753 return -EINVAL;
1903 * mapping fails, request will end up on the notarget 2754}
1904 * list, waiting for the new osdmap (which can take
1905 * a while), even though the original request mapped
1906 * successfully. In the future we might want to follow
1907 * original request's nofail setting here.
1908 */
1909 err = __ceph_osdc_start_request(osdc, req, true);
1910 BUG_ON(err);
1911 2755
1912 goto out_unlock; 2756/*
1913 } 2757 * We are done with @req if
2758 * - @m is a safe reply, or
2759 * - @m is an unsafe reply and we didn't want a safe one
2760 */
2761static bool done_request(const struct ceph_osd_request *req,
2762 const struct MOSDOpReply *m)
2763{
2764 return (m->result < 0 ||
2765 (m->flags & CEPH_OSD_FLAG_ONDISK) ||
2766 !(req->r_flags & CEPH_OSD_FLAG_ONDISK));
2767}
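
done_request() is the one predicate deciding whether a reply retires the request: an error always does, an ONDISK ("safe") reply always does, and a plain ACK does only if the submitter never asked for ONDISK. A toy evaluation over the four interesting cases (the flag value is invented for illustration, not the wire constant):

#include <stdio.h>

#define FLAG_ONDISK  0x20   /* illustrative value only */

static int done(int result, int reply_flags, int req_flags)
{
	return result < 0 ||
	       (reply_flags & FLAG_ONDISK) ||
	       !(req_flags & FLAG_ONDISK);
}

int main(void)
{
	printf("%d\n", done(-5, 0, FLAG_ONDISK));          /* 1: error */
	printf("%d\n", done(0, FLAG_ONDISK, FLAG_ONDISK)); /* 1: safe reply */
	printf("%d\n", done(0, 0, FLAG_ONDISK));           /* 0: ack, want safe */
	printf("%d\n", done(0, 0, 0));                     /* 1: ack suffices */
	return 0;
}
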
1914 2768
1915 already_completed = req->r_got_reply; 2769/*
1916 if (!req->r_got_reply) { 2770 * handle osd op reply. either call the callback if it is specified,
1917 req->r_result = result; 2771 * or do the completion to wake up the waiting thread.
1918 dout("handle_reply result %d bytes %d\n", req->r_result, 2772 *
1919 bytes); 2773 * ->r_unsafe_callback is set? yes no
1920 if (req->r_result == 0) 2774 *
1921 req->r_result = bytes; 2775 * first reply is OK (needed r_cb/r_completion, r_cb/r_completion,
2776 * any or needed/got safe) r_safe_completion r_safe_completion
2777 *
2778 * first reply is unsafe r_unsafe_cb(true) (nothing)
2779 *
2780 * when we get the safe reply r_unsafe_cb(false), r_cb/r_completion,
2781 * r_safe_completion r_safe_completion
2782 */
2783static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
2784{
2785 struct ceph_osd_client *osdc = osd->o_osdc;
2786 struct ceph_osd_request *req;
2787 struct MOSDOpReply m;
2788 u64 tid = le64_to_cpu(msg->hdr.tid);
2789 u32 data_len = 0;
2790 bool already_acked;
2791 int ret;
2792 int i;
1922 2793
1923 /* in case this is a write and we need to replay, */ 2794 dout("%s msg %p tid %llu\n", __func__, msg, tid);
1924 req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch);
1925 req->r_reassert_version.version = cpu_to_le64(reassert_version);
1926 2795
1927 req->r_got_reply = 1; 2796 down_read(&osdc->lock);
1928 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { 2797 if (!osd_registered(osd)) {
1929 dout("handle_reply tid %llu dup ack\n", tid); 2798 dout("%s osd%d unknown\n", __func__, osd->o_osd);
1930 goto out_unlock; 2799 goto out_unlock_osdc;
1931 } 2800 }
2801 WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));
1932 2802
1933 dout("handle_reply tid %llu flags %d\n", tid, flags); 2803 mutex_lock(&osd->lock);
2804 req = lookup_request(&osd->o_requests, tid);
2805 if (!req) {
2806 dout("%s osd%d tid %llu unknown\n", __func__, osd->o_osd, tid);
2807 goto out_unlock_session;
2808 }
1934 2809
1935 if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK)) 2810 ret = decode_MOSDOpReply(msg, &m);
1936 __register_linger_request(osdc, req); 2811 if (ret) {
2812 pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
2813 req->r_tid, ret);
2814 ceph_msg_dump(msg);
2815 goto fail_request;
2816 }
2817 dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n",
2818 __func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed,
2819 m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch),
2820 le64_to_cpu(m.replay_version.version), m.user_version);
2821
2822 if (m.retry_attempt >= 0) {
2823 if (m.retry_attempt != req->r_attempts - 1) {
2824 dout("req %p tid %llu retry_attempt %d != %d, ignoring\n",
2825 req, req->r_tid, m.retry_attempt,
2826 req->r_attempts - 1);
2827 goto out_unlock_session;
2828 }
2829 } else {
2830 WARN_ON(1); /* MOSDOpReply v4 is assumed */
2831 }
1937 2832
1938 /* either this is a read, or we got the safe response */ 2833 if (!ceph_oloc_empty(&m.redirect.oloc)) {
1939 if (result < 0 || 2834 dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid,
1940 (flags & CEPH_OSD_FLAG_ONDISK) || 2835 m.redirect.oloc.pool);
1941 ((flags & CEPH_OSD_FLAG_WRITE) == 0)) 2836 unlink_request(osd, req);
1942 __unregister_request(osdc, req); 2837 mutex_unlock(&osd->lock);
2838
2839 ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc);
2840 req->r_flags |= CEPH_OSD_FLAG_REDIRECTED;
2841 req->r_tid = 0;
2842 __submit_request(req, false);
2843 goto out_unlock_osdc;
2844 }
1943 2845
1944 mutex_unlock(&osdc->request_mutex); 2846 if (m.num_ops != req->r_num_ops) {
1945 up_read(&osdc->map_sem); 2847 pr_err("num_ops %d != %d for tid %llu\n", m.num_ops,
2848 req->r_num_ops, req->r_tid);
2849 goto fail_request;
2850 }
2851 for (i = 0; i < req->r_num_ops; i++) {
2852 dout(" req %p tid %llu op %d rval %d len %u\n", req,
2853 req->r_tid, i, m.rval[i], m.outdata_len[i]);
2854 req->r_ops[i].rval = m.rval[i];
2855 req->r_ops[i].outdata_len = m.outdata_len[i];
2856 data_len += m.outdata_len[i];
2857 }
2858 if (data_len != le32_to_cpu(msg->hdr.data_len)) {
2859 pr_err("sum of lens %u != %u for tid %llu\n", data_len,
2860 le32_to_cpu(msg->hdr.data_len), req->r_tid);
2861 goto fail_request;
2862 }
2863 dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__,
2864 req, req->r_tid, req->r_got_reply, m.result, data_len);
2865
2866 already_acked = req->r_got_reply;
2867 if (!already_acked) {
2868 req->r_result = m.result ?: data_len;
2869 req->r_replay_version = m.replay_version; /* struct */
2870 req->r_got_reply = true;
2871 } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
2872 dout("req %p tid %llu dup ack\n", req, req->r_tid);
2873 goto out_unlock_session;
2874 }
1946 2875
1947 if (!already_completed) { 2876 if (done_request(req, &m)) {
1948 if (req->r_unsafe_callback && 2877 __finish_request(req);
1949 result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK)) 2878 if (req->r_linger) {
1950 req->r_unsafe_callback(req, true); 2879 WARN_ON(req->r_unsafe_callback);
1951 if (req->r_callback) 2880 dout("req %p tid %llu cb (locked)\n", req, req->r_tid);
1952 req->r_callback(req, msg); 2881 __complete_request(req);
1953 else 2882 }
1954 complete_all(&req->r_completion);
1955 } 2883 }
1956 2884
1957 if (flags & CEPH_OSD_FLAG_ONDISK) { 2885 mutex_unlock(&osd->lock);
1958 if (req->r_unsafe_callback && already_completed) 2886 up_read(&osdc->lock);
2887
2888 if (done_request(req, &m)) {
2889 if (already_acked && req->r_unsafe_callback) {
2890 dout("req %p tid %llu safe-cb\n", req, req->r_tid);
1959 req->r_unsafe_callback(req, false); 2891 req->r_unsafe_callback(req, false);
1960 complete_request(req); 2892 } else if (!req->r_linger) {
2893 dout("req %p tid %llu cb\n", req, req->r_tid);
2894 __complete_request(req);
2895 }
2896 } else {
2897 if (req->r_unsafe_callback) {
2898 dout("req %p tid %llu unsafe-cb\n", req, req->r_tid);
2899 req->r_unsafe_callback(req, true);
2900 } else {
2901 WARN_ON(1);
2902 }
1961 } 2903 }
2904 if (m.flags & CEPH_OSD_FLAG_ONDISK)
2905 complete_all(&req->r_safe_completion);
1962 2906
1963out:
1964 dout("req=%p req->r_linger=%d\n", req, req->r_linger);
1965 ceph_osdc_put_request(req); 2907 ceph_osdc_put_request(req);
1966 return; 2908 return;
1967out_unlock:
1968 mutex_unlock(&osdc->request_mutex);
1969 up_read(&osdc->map_sem);
1970 goto out;
1971 2909
1972bad_put: 2910fail_request:
1973 req->r_result = -EIO; 2911 complete_request(req, -EIO);
1974 __unregister_request(osdc, req); 2912out_unlock_session:
1975 if (req->r_callback) 2913 mutex_unlock(&osd->lock);
1976 req->r_callback(req, msg); 2914out_unlock_osdc:
1977 else 2915 up_read(&osdc->lock);
1978 complete_all(&req->r_completion);
1979 complete_request(req);
1980 ceph_osdc_put_request(req);
1981bad_mutex:
1982 mutex_unlock(&osdc->request_mutex);
1983 up_read(&osdc->map_sem);
1984bad:
1985 pr_err("corrupt osd_op_reply got %d %d\n",
1986 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len));
1987 ceph_msg_dump(msg);
1988} 2916}
1989 2917
1990static void reset_changed_osds(struct ceph_osd_client *osdc) 2918static void set_pool_was_full(struct ceph_osd_client *osdc)
1991{ 2919{
1992 struct rb_node *p, *n; 2920 struct rb_node *n;
1993 2921
1994 dout("%s %p\n", __func__, osdc); 2922 for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
1995 for (p = rb_first(&osdc->osds); p; p = n) { 2923 struct ceph_pg_pool_info *pi =
1996 struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node); 2924 rb_entry(n, struct ceph_pg_pool_info, node);
1997 2925
1998 n = rb_next(p); 2926 pi->was_full = __pool_full(pi);
1999 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
2000 memcmp(&osd->o_con.peer_addr,
2001 ceph_osd_addr(osdc->osdmap,
2002 osd->o_osd),
2003 sizeof(struct ceph_entity_addr)) != 0)
2004 __reset_osd(osdc, osd);
2005 } 2927 }
2006} 2928}
2007 2929
2930static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id)
2931{
2932 struct ceph_pg_pool_info *pi;
2933
2934 pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
2935 if (!pi)
2936 return false;
2937
2938 return pi->was_full && !__pool_full(pi);
2939}
2940
2941static enum calc_target_result
2942recalc_linger_target(struct ceph_osd_linger_request *lreq)
2943{
2944 struct ceph_osd_client *osdc = lreq->osdc;
2945 enum calc_target_result ct_res;
2946
2947 ct_res = calc_target(osdc, &lreq->t, &lreq->last_force_resend, true);
2948 if (ct_res == CALC_TARGET_NEED_RESEND) {
2949 struct ceph_osd *osd;
2950
2951 osd = lookup_create_osd(osdc, lreq->t.osd, true);
2952 if (osd != lreq->osd) {
2953 unlink_linger(lreq->osd, lreq);
2954 link_linger(osd, lreq);
2955 }
2956 }
2957
2958 return ct_res;
2959}
2960
2008/* 2961/*
2009 * Requeue requests whose mapping to an OSD has changed. If requests map to 2962 * Requeue requests whose mapping to an OSD has changed.
2010 * no osd, request a new map.
2011 *
2012 * Caller should hold map_sem for read.
2013 */ 2963 */
2014static void kick_requests(struct ceph_osd_client *osdc, bool force_resend, 2964static void scan_requests(struct ceph_osd *osd,
2015 bool force_resend_writes) 2965 bool force_resend,
2966 bool cleared_full,
2967 bool check_pool_cleared_full,
2968 struct rb_root *need_resend,
2969 struct list_head *need_resend_linger)
2016{ 2970{
2017 struct ceph_osd_request *req, *nreq; 2971 struct ceph_osd_client *osdc = osd->o_osdc;
2018 struct rb_node *p; 2972 struct rb_node *n;
2019 int needmap = 0; 2973 bool force_resend_writes;
2020 int err; 2974
2021 bool force_resend_req; 2975 for (n = rb_first(&osd->o_linger_requests); n; ) {
2976 struct ceph_osd_linger_request *lreq =
2977 rb_entry(n, struct ceph_osd_linger_request, node);
2978 enum calc_target_result ct_res;
2979
2980 n = rb_next(n); /* recalc_linger_target() */
2981
2982 dout("%s lreq %p linger_id %llu\n", __func__, lreq,
2983 lreq->linger_id);
2984 ct_res = recalc_linger_target(lreq);
2985 switch (ct_res) {
2986 case CALC_TARGET_NO_ACTION:
2987 force_resend_writes = cleared_full ||
2988 (check_pool_cleared_full &&
2989 pool_cleared_full(osdc, lreq->t.base_oloc.pool));
2990 if (!force_resend && !force_resend_writes)
2991 break;
2992
2993 /* fall through */
2994 case CALC_TARGET_NEED_RESEND:
2995 cancel_linger_map_check(lreq);
2996 /*
2997 * scan_requests() for the previous epoch(s)
2998 * may have already added it to the list, since
2999 * it's not unlinked here.
3000 */
3001 if (list_empty(&lreq->scan_item))
3002 list_add_tail(&lreq->scan_item, need_resend_linger);
3003 break;
3004 case CALC_TARGET_POOL_DNE:
3005 check_linger_pool_dne(lreq);
3006 break;
3007 }
3008 }
2022 3009
2023 dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "", 3010 for (n = rb_first(&osd->o_requests); n; ) {
2024 force_resend_writes ? " (force resend writes)" : ""); 3011 struct ceph_osd_request *req =
2025 mutex_lock(&osdc->request_mutex); 3012 rb_entry(n, struct ceph_osd_request, r_node);
2026 for (p = rb_first(&osdc->requests); p; ) { 3013 enum calc_target_result ct_res;
2027 req = rb_entry(p, struct ceph_osd_request, r_node); 3014
2028 p = rb_next(p); 3015 n = rb_next(n); /* unlink_request(), check_pool_dne() */
3016
3017 dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
3018 ct_res = calc_target(osdc, &req->r_t,
3019 &req->r_last_force_resend, false);
3020 switch (ct_res) {
3021 case CALC_TARGET_NO_ACTION:
3022 force_resend_writes = cleared_full ||
3023 (check_pool_cleared_full &&
3024 pool_cleared_full(osdc, req->r_t.base_oloc.pool));
3025 if (!force_resend &&
3026 (!(req->r_flags & CEPH_OSD_FLAG_WRITE) ||
3027 !force_resend_writes))
3028 break;
3029
3030 /* fall through */
3031 case CALC_TARGET_NEED_RESEND:
3032 cancel_map_check(req);
3033 unlink_request(osd, req);
3034 insert_request(need_resend, req);
3035 break;
3036 case CALC_TARGET_POOL_DNE:
3037 check_pool_dne(req);
3038 break;
3039 }
3040 }
3041}
2029 3042
3043static int handle_one_map(struct ceph_osd_client *osdc,
3044 void *p, void *end, bool incremental,
3045 struct rb_root *need_resend,
3046 struct list_head *need_resend_linger)
3047{
3048 struct ceph_osdmap *newmap;
3049 struct rb_node *n;
3050 bool skipped_map = false;
3051 bool was_full;
3052
3053 was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
3054 set_pool_was_full(osdc);
3055
3056 if (incremental)
3057 newmap = osdmap_apply_incremental(&p, end, osdc->osdmap);
3058 else
3059 newmap = ceph_osdmap_decode(&p, end);
3060 if (IS_ERR(newmap))
3061 return PTR_ERR(newmap);
3062
3063 if (newmap != osdc->osdmap) {
2030 /* 3064 /*
2031 * For linger requests that have not yet been 3065 * Preserve ->was_full before destroying the old map.
2032 * registered, move them to the linger list; they'll 3066 * For pools that weren't in the old map, ->was_full
2033 * be sent to the osd in the loop below. Unregister 3067 * should be false.
2034 * the request before re-registering it as a linger
2035 * request to ensure the __map_request() below
2036 * will decide it needs to be sent.
2037 */ 3068 */
2038 if (req->r_linger && list_empty(&req->r_linger_item)) { 3069 for (n = rb_first(&newmap->pg_pools); n; n = rb_next(n)) {
2039 dout("%p tid %llu restart on osd%d\n", 3070 struct ceph_pg_pool_info *pi =
2040 req, req->r_tid, 3071 rb_entry(n, struct ceph_pg_pool_info, node);
2041 req->r_osd ? req->r_osd->o_osd : -1); 3072 struct ceph_pg_pool_info *old_pi;
2042 ceph_osdc_get_request(req); 3073
2043 __unregister_request(osdc, req); 3074 old_pi = ceph_pg_pool_by_id(osdc->osdmap, pi->id);
2044 __register_linger_request(osdc, req); 3075 if (old_pi)
2045 ceph_osdc_put_request(req); 3076 pi->was_full = old_pi->was_full;
2046 continue; 3077 else
3078 WARN_ON(pi->was_full);
2047 } 3079 }
2048 3080
2049 force_resend_req = force_resend || 3081 if (osdc->osdmap->epoch &&
2050 (force_resend_writes && 3082 osdc->osdmap->epoch + 1 < newmap->epoch) {
2051 req->r_flags & CEPH_OSD_FLAG_WRITE); 3083 WARN_ON(incremental);
2052 err = __map_request(osdc, req, force_resend_req); 3084 skipped_map = true;
2053 if (err < 0)
2054 continue; /* error */
2055 if (req->r_osd == NULL) {
2056 dout("%p tid %llu maps to no osd\n", req, req->r_tid);
2057 needmap++; /* request a newer map */
2058 } else if (err > 0) {
2059 if (!req->r_linger) {
2060 dout("%p tid %llu requeued on osd%d\n", req,
2061 req->r_tid,
2062 req->r_osd ? req->r_osd->o_osd : -1);
2063 req->r_flags |= CEPH_OSD_FLAG_RETRY;
2064 }
2065 } 3085 }
3086
3087 ceph_osdmap_destroy(osdc->osdmap);
3088 osdc->osdmap = newmap;
2066 } 3089 }
2067 3090
2068 list_for_each_entry_safe(req, nreq, &osdc->req_linger, 3091 was_full &= !ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
2069 r_linger_item) { 3092 scan_requests(&osdc->homeless_osd, skipped_map, was_full, true,
2070 dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); 3093 need_resend, need_resend_linger);
2071 3094
2072 err = __map_request(osdc, req, 3095 for (n = rb_first(&osdc->osds); n; ) {
2073 force_resend || force_resend_writes); 3096 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
2074 dout("__map_request returned %d\n", err); 3097
2075 if (err < 0) 3098 n = rb_next(n); /* close_osd() */
2076 continue; /* hrm! */ 3099
2077 if (req->r_osd == NULL || err > 0) { 3100 scan_requests(osd, skipped_map, was_full, true, need_resend,
2078 if (req->r_osd == NULL) { 3101 need_resend_linger);
2079 dout("lingering %p tid %llu maps to no osd\n", 3102 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
2080 req, req->r_tid); 3103 memcmp(&osd->o_con.peer_addr,
2081 /* 3104 ceph_osd_addr(osdc->osdmap, osd->o_osd),
2082 * A homeless lingering request makes 3105 sizeof(struct ceph_entity_addr)))
2083 * no sense, as its job is to keep 3106 close_osd(osd);
2084 * a particular OSD connection open. 3107 }
2085 * Request a newer map and kick the
2086 * request, knowing that it won't be
2087 * resent until we actually get a map
2088 * that can tell us where to send it.
2089 */
2090 needmap++;
2091 }
2092 3108
2093 dout("kicking lingering %p tid %llu osd%d\n", req, 3109 return 0;
2094 req->r_tid, req->r_osd ? req->r_osd->o_osd : -1); 3110}
2095 __register_request(osdc, req); 3111
2096 __unregister_linger_request(osdc, req); 3112static void kick_requests(struct ceph_osd_client *osdc,
3113 struct rb_root *need_resend,
3114 struct list_head *need_resend_linger)
3115{
3116 struct ceph_osd_linger_request *lreq, *nlreq;
3117 struct rb_node *n;
3118
3119 for (n = rb_first(need_resend); n; ) {
3120 struct ceph_osd_request *req =
3121 rb_entry(n, struct ceph_osd_request, r_node);
3122 struct ceph_osd *osd;
3123
3124 n = rb_next(n);
3125 erase_request(need_resend, req); /* before link_request() */
3126
3127 WARN_ON(req->r_osd);
3128 calc_target(osdc, &req->r_t, NULL, false);
3129 osd = lookup_create_osd(osdc, req->r_t.osd, true);
3130 link_request(osd, req);
3131 if (!req->r_linger) {
3132 if (!osd_homeless(osd) && !req->r_t.paused)
3133 send_request(req);
3134 } else {
3135 cancel_linger_request(req);
2097 } 3136 }
2098 } 3137 }
2099 reset_changed_osds(osdc);
2100 mutex_unlock(&osdc->request_mutex);
2101 3138
2102 if (needmap) { 3139 list_for_each_entry_safe(lreq, nlreq, need_resend_linger, scan_item) {
2103 dout("%d requests for down osds, need new map\n", needmap); 3140 if (!osd_homeless(lreq->osd))
2104 ceph_monc_request_next_osdmap(&osdc->client->monc); 3141 send_linger(lreq);
3142
3143 list_del_init(&lreq->scan_item);
2105 } 3144 }
2106} 3145}
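
Both scan loops above fetch rb_next() before touching the current entry (note the "n = rb_next(n);" comments), because the body may unlink the node, after which following its links would be unsafe. The same advance-then-act pattern on a plain singly linked list, runnable stand-alone:

#include <stdio.h>
#include <stdlib.h>

struct node { int id; struct node *next; };

static struct node *head;

static void unlink_node(struct node *n)   /* body may remove current */
{
	struct node **pp;

	for (pp = &head; *pp; pp = &(*pp)->next) {
		if (*pp == n) {
			*pp = n->next;
			free(n);
			return;
		}
	}
}

int main(void)
{
	struct node *n, *next;
	int i;

	for (i = 3; i >= 1; i--) {        /* build 1 -> 2 -> 3 */
		n = malloc(sizeof(*n));
		n->id = i;
		n->next = head;
		head = n;
	}

	for (n = head; n; n = next) {
		next = n->next;           /* grab the successor first */
		if (n->id == 2)
			unlink_node(n);   /* safe: iterator already advanced */
	}

	for (n = head; n; n = n->next)
		printf("%d ", n->id);     /* 1 3 */
	printf("\n");
	return 0;
}
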
2107 3146
2108
2109/* 3147/*
2110 * Process updated osd map. 3148 * Process updated osd map.
2111 * 3149 *
@@ -2115,27 +3153,31 @@ static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
2115 */ 3153 */
2116void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) 3154void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
2117{ 3155{
2118 void *p, *end, *next; 3156 void *p = msg->front.iov_base;
3157 void *const end = p + msg->front.iov_len;
2119 u32 nr_maps, maplen; 3158 u32 nr_maps, maplen;
2120 u32 epoch; 3159 u32 epoch;
2121 struct ceph_osdmap *newmap = NULL, *oldmap;
2122 int err;
2123 struct ceph_fsid fsid; 3160 struct ceph_fsid fsid;
2124 bool was_full; 3161 struct rb_root need_resend = RB_ROOT;
3162 LIST_HEAD(need_resend_linger);
3163 bool handled_incremental = false;
3164 bool was_pauserd, was_pausewr;
3165 bool pauserd, pausewr;
3166 int err;
2125 3167
2126 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0); 3168 dout("%s have %u\n", __func__, osdc->osdmap->epoch);
2127 p = msg->front.iov_base; 3169 down_write(&osdc->lock);
2128 end = p + msg->front.iov_len;
2129 3170
2130 /* verify fsid */ 3171 /* verify fsid */
2131 ceph_decode_need(&p, end, sizeof(fsid), bad); 3172 ceph_decode_need(&p, end, sizeof(fsid), bad);
2132 ceph_decode_copy(&p, &fsid, sizeof(fsid)); 3173 ceph_decode_copy(&p, &fsid, sizeof(fsid));
2133 if (ceph_check_fsid(osdc->client, &fsid) < 0) 3174 if (ceph_check_fsid(osdc->client, &fsid) < 0)
2134 return; 3175 goto bad;
2135 3176
2136 down_write(&osdc->map_sem); 3177 was_pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
2137 3178 was_pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
2138 was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); 3179 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
3180 have_pool_full(osdc);
2139 3181
2140 /* incremental maps */ 3182 /* incremental maps */
2141 ceph_decode_32_safe(&p, end, nr_maps, bad); 3183 ceph_decode_32_safe(&p, end, nr_maps, bad);
@@ -2145,34 +3187,23 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
2145 epoch = ceph_decode_32(&p); 3187 epoch = ceph_decode_32(&p);
2146 maplen = ceph_decode_32(&p); 3188 maplen = ceph_decode_32(&p);
2147 ceph_decode_need(&p, end, maplen, bad); 3189 ceph_decode_need(&p, end, maplen, bad);
2148 next = p + maplen; 3190 if (osdc->osdmap->epoch &&
2149 if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) { 3191 osdc->osdmap->epoch + 1 == epoch) {
2150 dout("applying incremental map %u len %d\n", 3192 dout("applying incremental map %u len %d\n",
2151 epoch, maplen); 3193 epoch, maplen);
2152 newmap = osdmap_apply_incremental(&p, next, 3194 err = handle_one_map(osdc, p, p + maplen, true,
2153 osdc->osdmap, 3195 &need_resend, &need_resend_linger);
2154 &osdc->client->msgr); 3196 if (err)
2155 if (IS_ERR(newmap)) {
2156 err = PTR_ERR(newmap);
2157 goto bad; 3197 goto bad;
2158 } 3198 handled_incremental = true;
2159 BUG_ON(!newmap);
2160 if (newmap != osdc->osdmap) {
2161 ceph_osdmap_destroy(osdc->osdmap);
2162 osdc->osdmap = newmap;
2163 }
2164 was_full = was_full ||
2165 ceph_osdmap_flag(osdc->osdmap,
2166 CEPH_OSDMAP_FULL);
2167 kick_requests(osdc, 0, was_full);
2168 } else { 3199 } else {
2169 dout("ignoring incremental map %u len %d\n", 3200 dout("ignoring incremental map %u len %d\n",
2170 epoch, maplen); 3201 epoch, maplen);
2171 } 3202 }
2172 p = next; 3203 p += maplen;
2173 nr_maps--; 3204 nr_maps--;
2174 } 3205 }
2175 if (newmap) 3206 if (handled_incremental)
2176 goto done; 3207 goto done;
2177 3208
2178 /* full maps */ 3209 /* full maps */
@@ -2186,455 +3217,647 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
2186 if (nr_maps > 1) { 3217 if (nr_maps > 1) {
2187 dout("skipping non-latest full map %u len %d\n", 3218 dout("skipping non-latest full map %u len %d\n",
2188 epoch, maplen); 3219 epoch, maplen);
2189 } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) { 3220 } else if (osdc->osdmap->epoch >= epoch) {
2190 dout("skipping full map %u len %d, " 3221 dout("skipping full map %u len %d, "
2191 "older than our %u\n", epoch, maplen, 3222 "older than our %u\n", epoch, maplen,
2192 osdc->osdmap->epoch); 3223 osdc->osdmap->epoch);
2193 } else { 3224 } else {
2194 int skipped_map = 0;
2195
2196 dout("taking full map %u len %d\n", epoch, maplen); 3225 dout("taking full map %u len %d\n", epoch, maplen);
2197 newmap = ceph_osdmap_decode(&p, p+maplen); 3226 err = handle_one_map(osdc, p, p + maplen, false,
2198 if (IS_ERR(newmap)) { 3227 &need_resend, &need_resend_linger);
2199 err = PTR_ERR(newmap); 3228 if (err)
2200 goto bad; 3229 goto bad;
2201 }
2202 BUG_ON(!newmap);
2203 oldmap = osdc->osdmap;
2204 osdc->osdmap = newmap;
2205 if (oldmap) {
2206 if (oldmap->epoch + 1 < newmap->epoch)
2207 skipped_map = 1;
2208 ceph_osdmap_destroy(oldmap);
2209 }
2210 was_full = was_full ||
2211 ceph_osdmap_flag(osdc->osdmap,
2212 CEPH_OSDMAP_FULL);
2213 kick_requests(osdc, skipped_map, was_full);
2214 } 3230 }
2215 p += maplen; 3231 p += maplen;
2216 nr_maps--; 3232 nr_maps--;
2217 } 3233 }
2218 3234
2219 if (!osdc->osdmap)
2220 goto bad;
2221done: 3235done:
2222 downgrade_write(&osdc->map_sem);
2223 ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
2224 osdc->osdmap->epoch);
2225
2226 /* 3236 /*
2227 * subscribe to subsequent osdmap updates if full to ensure 3237 * subscribe to subsequent osdmap updates if full to ensure
2228 * we find out when we are no longer full and stop returning 3238 * we find out when we are no longer full and stop returning
2229 * ENOSPC. 3239 * ENOSPC.
2230 */ 3240 */
2231 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || 3241 pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
2232 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) || 3242 pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
2233 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) 3243 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
2234 ceph_monc_request_next_osdmap(&osdc->client->monc); 3244 have_pool_full(osdc);
2235 3245 if (was_pauserd || was_pausewr || pauserd || pausewr)
2236 mutex_lock(&osdc->request_mutex); 3246 maybe_request_map(osdc);
2237 __send_queued(osdc); 3247
2238 mutex_unlock(&osdc->request_mutex); 3248 kick_requests(osdc, &need_resend, &need_resend_linger);
2239 up_read(&osdc->map_sem); 3249
3250 ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
3251 osdc->osdmap->epoch);
3252 up_write(&osdc->lock);
2240 wake_up_all(&osdc->client->auth_wq); 3253 wake_up_all(&osdc->client->auth_wq);
2241 return; 3254 return;
2242 3255
2243bad: 3256bad:
2244 pr_err("osdc handle_map corrupt msg\n"); 3257 pr_err("osdc handle_map corrupt msg\n");
2245 ceph_msg_dump(msg); 3258 ceph_msg_dump(msg);
2246 up_write(&osdc->map_sem); 3259 up_write(&osdc->lock);
2247} 3260}
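
ceph_osdc_handle_map() samples the pause/full state both before and after the new maps are applied, and asks for the next osdmap if any flag was set on either side; that keeps maps flowing until the client can actually observe the flags clearing. A condensed sketch of the decision (read/write pause collapsed into one predicate for brevity):

#include <stdbool.h>
#include <stdio.h>

struct map_state { bool pauserd, pausewr, full, pool_full; };

static bool paused(const struct map_state *s)
{
	/* PAUSEWR, cluster FULL and per-pool full all block writes */
	return s->pauserd || s->pausewr || s->full || s->pool_full;
}

int main(void)
{
	struct map_state before = { .full = true };
	struct map_state after  = { 0 };      /* new map cleared FULL */

	if (paused(&before) || paused(&after))
		printf("maybe_request_map(): keep osdmaps coming\n");
	return 0;
}
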
2248 3261
2249/* 3262/*
2250 * watch/notify callback event infrastructure 3263 * Resubmit requests pending on the given osd.
2251 *
2252 * These callbacks are used both for watch and notify operations.
2253 */ 3264 */
2254static void __release_event(struct kref *kref) 3265static void kick_osd_requests(struct ceph_osd *osd)
2255{ 3266{
2256 struct ceph_osd_event *event = 3267 struct rb_node *n;
2257 container_of(kref, struct ceph_osd_event, kref);
2258 3268
2259 dout("__release_event %p\n", event); 3269 for (n = rb_first(&osd->o_requests); n; ) {
2260 kfree(event); 3270 struct ceph_osd_request *req =
2261} 3271 rb_entry(n, struct ceph_osd_request, r_node);
2262 3272
2263static void get_event(struct ceph_osd_event *event) 3273 n = rb_next(n); /* cancel_linger_request() */
2264{
2265 kref_get(&event->kref);
2266}
2267 3274
2268void ceph_osdc_put_event(struct ceph_osd_event *event) 3275 if (!req->r_linger) {
2269{ 3276 if (!req->r_t.paused)
2270 kref_put(&event->kref, __release_event); 3277 send_request(req);
3278 } else {
3279 cancel_linger_request(req);
3280 }
3281 }
3282 for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
3283 struct ceph_osd_linger_request *lreq =
3284 rb_entry(n, struct ceph_osd_linger_request, node);
3285
3286 send_linger(lreq);
3287 }
2271} 3288}
2272EXPORT_SYMBOL(ceph_osdc_put_event);
2273 3289
2274static void __insert_event(struct ceph_osd_client *osdc, 3290/*
2275 struct ceph_osd_event *new) 3291 * If the osd connection drops, we need to resubmit all requests.
3292 */
3293static void osd_fault(struct ceph_connection *con)
2276{ 3294{
2277 struct rb_node **p = &osdc->event_tree.rb_node; 3295 struct ceph_osd *osd = con->private;
2278 struct rb_node *parent = NULL; 3296 struct ceph_osd_client *osdc = osd->o_osdc;
2279 struct ceph_osd_event *event = NULL;
2280 3297
2281 while (*p) { 3298 dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
2282 parent = *p; 3299
2283 event = rb_entry(parent, struct ceph_osd_event, node); 3300 down_write(&osdc->lock);
2284 if (new->cookie < event->cookie) 3301 if (!osd_registered(osd)) {
2285 p = &(*p)->rb_left; 3302 dout("%s osd%d unknown\n", __func__, osd->o_osd);
2286 else if (new->cookie > event->cookie) 3303 goto out_unlock;
2287 p = &(*p)->rb_right;
2288 else
2289 BUG();
2290 } 3304 }
2291 3305
2292 rb_link_node(&new->node, parent, p); 3306 if (!reopen_osd(osd))
2293 rb_insert_color(&new->node, &osdc->event_tree); 3307 kick_osd_requests(osd);
3308 maybe_request_map(osdc);
3309
3310out_unlock:
3311 up_write(&osdc->lock);
2294} 3312}
2295 3313
2296static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc, 3314/*
2297 u64 cookie) 3315 * Process osd watch notifications
3316 */
3317static void handle_watch_notify(struct ceph_osd_client *osdc,
3318 struct ceph_msg *msg)
2298{ 3319{
2299 struct rb_node **p = &osdc->event_tree.rb_node; 3320 void *p = msg->front.iov_base;
2300 struct rb_node *parent = NULL; 3321 void *const end = p + msg->front.iov_len;
2301 struct ceph_osd_event *event = NULL; 3322 struct ceph_osd_linger_request *lreq;
3323 struct linger_work *lwork;
3324 u8 proto_ver, opcode;
3325 u64 cookie, notify_id;
3326 u64 notifier_id = 0;
3327 s32 return_code = 0;
3328 void *payload = NULL;
3329 u32 payload_len = 0;
2302 3330
2303 while (*p) { 3331 ceph_decode_8_safe(&p, end, proto_ver, bad);
2304 parent = *p; 3332 ceph_decode_8_safe(&p, end, opcode, bad);
2305 event = rb_entry(parent, struct ceph_osd_event, node); 3333 ceph_decode_64_safe(&p, end, cookie, bad);
2306 if (cookie < event->cookie) 3334 p += 8; /* skip ver */
2307 p = &(*p)->rb_left; 3335 ceph_decode_64_safe(&p, end, notify_id, bad);
2308 else if (cookie > event->cookie) 3336
2309 p = &(*p)->rb_right; 3337 if (proto_ver >= 1) {
2310 else 3338 ceph_decode_32_safe(&p, end, payload_len, bad);
2311 return event; 3339 ceph_decode_need(&p, end, payload_len, bad);
3340 payload = p;
3341 p += payload_len;
2312 } 3342 }
2313 return NULL;
2314}
2315 3343
2316static void __remove_event(struct ceph_osd_event *event) 3344 if (le16_to_cpu(msg->hdr.version) >= 2)
2317{ 3345 ceph_decode_32_safe(&p, end, return_code, bad);
2318 struct ceph_osd_client *osdc = event->osdc;
2319 3346
2320 if (!RB_EMPTY_NODE(&event->node)) { 3347 if (le16_to_cpu(msg->hdr.version) >= 3)
2321 dout("__remove_event removed %p\n", event); 3348 ceph_decode_64_safe(&p, end, notifier_id, bad);
2322 rb_erase(&event->node, &osdc->event_tree); 3349
2323 ceph_osdc_put_event(event); 3350 down_read(&osdc->lock);
3351 lreq = lookup_linger_osdc(&osdc->linger_requests, cookie);
3352 if (!lreq) {
3353 dout("%s opcode %d cookie %llu dne\n", __func__, opcode,
3354 cookie);
3355 goto out_unlock_osdc;
3356 }
3357
3358 mutex_lock(&lreq->lock);
3359 dout("%s opcode %d cookie %llu lreq %p is_watch %d\n", __func__,
3360 opcode, cookie, lreq, lreq->is_watch);
3361 if (opcode == CEPH_WATCH_EVENT_DISCONNECT) {
3362 if (!lreq->last_error) {
3363 lreq->last_error = -ENOTCONN;
3364 queue_watch_error(lreq);
3365 }
3366 } else if (!lreq->is_watch) {
3367 /* CEPH_WATCH_EVENT_NOTIFY_COMPLETE */
3368 if (lreq->notify_id && lreq->notify_id != notify_id) {
3369 dout("lreq %p notify_id %llu != %llu, ignoring\n", lreq,
3370 lreq->notify_id, notify_id);
3371 } else if (!completion_done(&lreq->notify_finish_wait)) {
3372 struct ceph_msg_data *data =
3373 list_first_entry_or_null(&msg->data,
3374 struct ceph_msg_data,
3375 links);
3376
3377 if (data) {
3378 if (lreq->preply_pages) {
3379 WARN_ON(data->type !=
3380 CEPH_MSG_DATA_PAGES);
3381 *lreq->preply_pages = data->pages;
3382 *lreq->preply_len = data->length;
3383 } else {
3384 ceph_release_page_vector(data->pages,
3385 calc_pages_for(0, data->length));
3386 }
3387 }
3388 lreq->notify_finish_error = return_code;
3389 complete_all(&lreq->notify_finish_wait);
3390 }
2324 } else { 3391 } else {
2325 dout("__remove_event didn't remove %p\n", event); 3392 /* CEPH_WATCH_EVENT_NOTIFY */
3393 lwork = lwork_alloc(lreq, do_watch_notify);
3394 if (!lwork) {
3395 pr_err("failed to allocate notify-lwork\n");
3396 goto out_unlock_lreq;
3397 }
3398
3399 lwork->notify.notify_id = notify_id;
3400 lwork->notify.notifier_id = notifier_id;
3401 lwork->notify.payload = payload;
3402 lwork->notify.payload_len = payload_len;
3403 lwork->notify.msg = ceph_msg_get(msg);
3404 lwork_queue(lwork);
2326 } 3405 }
3406
3407out_unlock_lreq:
3408 mutex_unlock(&lreq->lock);
3409out_unlock_osdc:
3410 up_read(&osdc->lock);
3411 return;
3412
3413bad:
3414 pr_err("osdc handle_watch_notify corrupt msg\n");
2327} 3415}
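
handle_watch_notify() decodes optional fields by peer capability: the payload only when proto_ver >= 1, return_code only from message version 2, notifier_id only from version 3, with zero defaults so older peers parse cleanly. A small sketch of that forward-compatible style (the struct layout here is invented, not the wire format):

#include <stdint.h>
#include <stdio.h>

struct notify_msg {                /* hypothetical, not the wire format */
	uint16_t version;
	int32_t return_code_field;
	uint64_t notifier_id_field;
};

static void decode_notify(const struct notify_msg *m,
			  int32_t *return_code, uint64_t *notifier_id)
{
	*return_code = 0;              /* defaults for older peers */
	*notifier_id = 0;

	if (m->version >= 2)
		*return_code = m->return_code_field;
	if (m->version >= 3)
		*notifier_id = m->notifier_id_field;
}

int main(void)
{
	struct notify_msg old = { .version = 1, .return_code_field = -1,
				  .notifier_id_field = 42 };
	int32_t rc;
	uint64_t nid;

	decode_notify(&old, &rc, &nid);
	printf("rc=%d nid=%llu\n", rc, (unsigned long long)nid); /* 0 0 */
	return 0;
}
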
2328 3416
2329int ceph_osdc_create_event(struct ceph_osd_client *osdc, 3417/*
2330 void (*event_cb)(u64, u64, u8, void *), 3418 * Register request, send initial attempt.
2331 void *data, struct ceph_osd_event **pevent) 3419 */
3420int ceph_osdc_start_request(struct ceph_osd_client *osdc,
3421 struct ceph_osd_request *req,
3422 bool nofail)
2332{ 3423{
2333 struct ceph_osd_event *event; 3424 down_read(&osdc->lock);
2334 3425 submit_request(req, false);
2335 event = kmalloc(sizeof(*event), GFP_NOIO); 3426 up_read(&osdc->lock);
2336 if (!event)
2337 return -ENOMEM;
2338 3427
2339 dout("create_event %p\n", event);
2340 event->cb = event_cb;
2341 event->one_shot = 0;
2342 event->data = data;
2343 event->osdc = osdc;
2344 INIT_LIST_HEAD(&event->osd_node);
2345 RB_CLEAR_NODE(&event->node);
2346 kref_init(&event->kref); /* one ref for us */
2347 kref_get(&event->kref); /* one ref for the caller */
2348
2349 spin_lock(&osdc->event_lock);
2350 event->cookie = ++osdc->event_count;
2351 __insert_event(osdc, event);
2352 spin_unlock(&osdc->event_lock);
2353
2354 *pevent = event;
2355 return 0; 3428 return 0;
2356} 3429}
2357EXPORT_SYMBOL(ceph_osdc_create_event); 3430EXPORT_SYMBOL(ceph_osdc_start_request);
2358 3431
2359void ceph_osdc_cancel_event(struct ceph_osd_event *event) 3432/*
3433 * Unregister a registered request. The request is not completed (i.e.
3434 * no callbacks or wakeups) - higher layers are supposed to know what
3435 * they are canceling.
3436 */
3437void ceph_osdc_cancel_request(struct ceph_osd_request *req)
2360{ 3438{
2361 struct ceph_osd_client *osdc = event->osdc; 3439 struct ceph_osd_client *osdc = req->r_osdc;
2362 3440
2363 dout("cancel_event %p\n", event); 3441 down_write(&osdc->lock);
2364 spin_lock(&osdc->event_lock); 3442 if (req->r_osd)
2365 __remove_event(event); 3443 cancel_request(req);
2366 spin_unlock(&osdc->event_lock); 3444 up_write(&osdc->lock);
2367 ceph_osdc_put_event(event); /* caller's */
2368} 3445}
2369EXPORT_SYMBOL(ceph_osdc_cancel_event); 3446EXPORT_SYMBOL(ceph_osdc_cancel_request);
2370
2371 3447
2372static void do_event_work(struct work_struct *work) 3448/*
3449 * @timeout: in jiffies, 0 means "wait forever"
3450 */
3451static int wait_request_timeout(struct ceph_osd_request *req,
3452 unsigned long timeout)
2373{ 3453{
2374 struct ceph_osd_event_work *event_work = 3454 long left;
2375 container_of(work, struct ceph_osd_event_work, work);
2376 struct ceph_osd_event *event = event_work->event;
2377 u64 ver = event_work->ver;
2378 u64 notify_id = event_work->notify_id;
2379 u8 opcode = event_work->opcode;
2380 3455
2381 dout("do_event_work completing %p\n", event); 3456 dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
2382 event->cb(ver, notify_id, opcode, event->data); 3457 left = wait_for_completion_killable_timeout(&req->r_completion,
2383 dout("do_event_work completed %p\n", event); 3458 ceph_timeout_jiffies(timeout));
2384 ceph_osdc_put_event(event); 3459 if (left <= 0) {
2385 kfree(event_work); 3460 left = left ?: -ETIMEDOUT;
3461 ceph_osdc_cancel_request(req);
3462
3463 /* kludge - need to wake ceph_osdc_sync() */
3464 complete_all(&req->r_safe_completion);
3465 } else {
3466 left = req->r_result; /* completed */
3467 }
3468
3469 return left;
2386} 3470}
2387 3471
3472/*
3473 * wait for a request to complete
3474 */
3475int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
3476 struct ceph_osd_request *req)
3477{
3478 return wait_request_timeout(req, 0);
3479}
3480EXPORT_SYMBOL(ceph_osdc_wait_request);
2388 3481
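
Together, ceph_osdc_start_request() and ceph_osdc_wait_request() form the synchronous submit-and-wait pattern that the readpages/writepages helpers later in this file rely on. A minimal sketch of a caller (the function name is illustrative; the request is assumed to be already allocated and populated with ops):

	/* Sketch only -- mirrors the start/wait/put sequence used by
	 * ceph_osdc_readpages() further down in this patch. */
	static int sync_submit(struct ceph_osd_client *osdc,
			       struct ceph_osd_request *req)
	{
		int ret;

		ret = ceph_osdc_start_request(osdc, req, false); /* register + send */
		if (!ret)
			ret = ceph_osdc_wait_request(osdc, req); /* r_result or error */
		ceph_osdc_put_request(req);		/* drop caller's ref */
		return ret;
	}
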
2389/* 3482/*
2390 * Process osd watch notifications 3483 * sync - wait for all in-flight requests to flush. avoid starvation.
2391 */ 3484 */
2392static void handle_watch_notify(struct ceph_osd_client *osdc, 3485void ceph_osdc_sync(struct ceph_osd_client *osdc)
2393 struct ceph_msg *msg)
2394{ 3486{
2395 void *p, *end; 3487 struct rb_node *n, *p;
2396 u8 proto_ver; 3488 u64 last_tid = atomic64_read(&osdc->last_tid);
2397 u64 cookie, ver, notify_id;
2398 u8 opcode;
2399 struct ceph_osd_event *event;
2400 struct ceph_osd_event_work *event_work;
2401 3489
2402 p = msg->front.iov_base; 3490again:
2403 end = p + msg->front.iov_len; 3491 down_read(&osdc->lock);
3492 for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
3493 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
2404 3494
2405 ceph_decode_8_safe(&p, end, proto_ver, bad); 3495 mutex_lock(&osd->lock);
2406 ceph_decode_8_safe(&p, end, opcode, bad); 3496 for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
2407 ceph_decode_64_safe(&p, end, cookie, bad); 3497 struct ceph_osd_request *req =
2408 ceph_decode_64_safe(&p, end, ver, bad); 3498 rb_entry(p, struct ceph_osd_request, r_node);
2409 ceph_decode_64_safe(&p, end, notify_id, bad); 3499
3500 if (req->r_tid > last_tid)
3501 break;
3502
3503 if (!(req->r_flags & CEPH_OSD_FLAG_WRITE))
3504 continue;
2410 3505
2411 spin_lock(&osdc->event_lock); 3506 ceph_osdc_get_request(req);
2412 event = __find_event(osdc, cookie); 3507 mutex_unlock(&osd->lock);
2413 if (event) { 3508 up_read(&osdc->lock);
2414 BUG_ON(event->one_shot); 3509 dout("%s waiting on req %p tid %llu last_tid %llu\n",
2415 get_event(event); 3510 __func__, req, req->r_tid, last_tid);
2416 } 3511 wait_for_completion(&req->r_safe_completion);
2417 spin_unlock(&osdc->event_lock); 3512 ceph_osdc_put_request(req);
2418 dout("handle_watch_notify cookie %lld ver %lld event %p\n", 3513 goto again;
2419 cookie, ver, event);
2420 if (event) {
2421 event_work = kmalloc(sizeof(*event_work), GFP_NOIO);
2422 if (!event_work) {
2423 pr_err("couldn't allocate event_work\n");
2424 ceph_osdc_put_event(event);
2425 return;
2426 } 3514 }
2427 INIT_WORK(&event_work->work, do_event_work);
2428 event_work->event = event;
2429 event_work->ver = ver;
2430 event_work->notify_id = notify_id;
2431 event_work->opcode = opcode;
2432 3515
2433 queue_work(osdc->notify_wq, &event_work->work); 3516 mutex_unlock(&osd->lock);
2434 } 3517 }
2435 3518
2436 return; 3519 up_read(&osdc->lock);
3520 dout("%s done last_tid %llu\n", __func__, last_tid);
3521}
3522EXPORT_SYMBOL(ceph_osdc_sync);
2437 3523
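
Worth noting in ceph_osdc_sync() above: it never sleeps with osdc->lock or osd->lock held. Each write request found is pinned with ceph_osdc_get_request(), both locks are dropped before wait_for_completion() on r_safe_completion, and the scan restarts from rb_first() because the trees may have changed while unlocked; the last_tid snapshot taken on entry bounds the walk, so writes submitted after the call cannot starve it.
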
2438bad: 3524static struct ceph_osd_request *
2439 pr_err("osdc handle_watch_notify corrupt msg\n"); 3525alloc_linger_request(struct ceph_osd_linger_request *lreq)
3526{
3527 struct ceph_osd_request *req;
3528
3529 req = ceph_osdc_alloc_request(lreq->osdc, NULL, 1, false, GFP_NOIO);
3530 if (!req)
3531 return NULL;
3532
3533 ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
3534 ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
3535
3536 if (ceph_osdc_alloc_messages(req, GFP_NOIO)) {
3537 ceph_osdc_put_request(req);
3538 return NULL;
3539 }
3540
3541 return req;
2440} 3542}
2441 3543
2442/* 3544/*
2443 * build new request AND message 3545 * Returns a handle, caller owns a ref.
2444 *
2445 */ 3546 */
2446void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, 3547struct ceph_osd_linger_request *
2447 struct ceph_snap_context *snapc, u64 snap_id, 3548ceph_osdc_watch(struct ceph_osd_client *osdc,
2448 struct timespec *mtime) 3549 struct ceph_object_id *oid,
2449{ 3550 struct ceph_object_locator *oloc,
2450 struct ceph_msg *msg = req->r_request; 3551 rados_watchcb2_t wcb,
2451 void *p; 3552 rados_watcherrcb_t errcb,
2452 size_t msg_size; 3553 void *data)
2453 int flags = req->r_flags; 3554{
2454 u64 data_len; 3555 struct ceph_osd_linger_request *lreq;
2455 unsigned int i; 3556 int ret;
2456
2457 req->r_snapid = snap_id;
2458 req->r_snapc = ceph_get_snap_context(snapc);
2459
2460 /* encode request */
2461 msg->hdr.version = cpu_to_le16(4);
2462
2463 p = msg->front.iov_base;
2464 ceph_encode_32(&p, 1); /* client_inc is always 1 */
2465 req->r_request_osdmap_epoch = p;
2466 p += 4;
2467 req->r_request_flags = p;
2468 p += 4;
2469 if (req->r_flags & CEPH_OSD_FLAG_WRITE)
2470 ceph_encode_timespec(p, mtime);
2471 p += sizeof(struct ceph_timespec);
2472 req->r_request_reassert_version = p;
2473 p += sizeof(struct ceph_eversion); /* will get filled in */
2474
2475 /* oloc */
2476 ceph_encode_8(&p, 4);
2477 ceph_encode_8(&p, 4);
2478 ceph_encode_32(&p, 8 + 4 + 4);
2479 req->r_request_pool = p;
2480 p += 8;
2481 ceph_encode_32(&p, -1); /* preferred */
2482 ceph_encode_32(&p, 0); /* key len */
2483 3557
2484 ceph_encode_8(&p, 1); 3558 lreq = linger_alloc(osdc);
2485 req->r_request_pgid = p; 3559 if (!lreq)
2486 p += 8 + 4; 3560 return ERR_PTR(-ENOMEM);
2487 ceph_encode_32(&p, -1); /* preferred */
2488 3561
2489 /* oid */ 3562 lreq->is_watch = true;
2490 ceph_encode_32(&p, req->r_base_oid.name_len); 3563 lreq->wcb = wcb;
2491 memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len); 3564 lreq->errcb = errcb;
2492 dout("oid '%.*s' len %d\n", req->r_base_oid.name_len, 3565 lreq->data = data;
2493 req->r_base_oid.name, req->r_base_oid.name_len); 3566 lreq->watch_valid_thru = jiffies;
2494 p += req->r_base_oid.name_len; 3567
2495 3568 ceph_oid_copy(&lreq->t.base_oid, oid);
2496 /* ops--can imply data */ 3569 ceph_oloc_copy(&lreq->t.base_oloc, oloc);
2497 ceph_encode_16(&p, (u16)req->r_num_ops); 3570 lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
2498 data_len = 0; 3571 lreq->mtime = CURRENT_TIME;
2499 for (i = 0; i < req->r_num_ops; i++) { 3572
2500 data_len += osd_req_encode_op(req, p, i); 3573 lreq->reg_req = alloc_linger_request(lreq);
2501 p += sizeof(struct ceph_osd_op); 3574 if (!lreq->reg_req) {
3575 ret = -ENOMEM;
3576 goto err_put_lreq;
2502 } 3577 }
2503 3578
2504 /* snaps */ 3579 lreq->ping_req = alloc_linger_request(lreq);
2505 ceph_encode_64(&p, req->r_snapid); 3580 if (!lreq->ping_req) {
2506 ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0); 3581 ret = -ENOMEM;
2507 ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0); 3582 goto err_put_lreq;
2508 if (req->r_snapc) {
2509 for (i = 0; i < snapc->num_snaps; i++) {
2510 ceph_encode_64(&p, req->r_snapc->snaps[i]);
2511 }
2512 } 3583 }
2513 3584
2514 req->r_request_attempts = p; 3585 down_write(&osdc->lock);
2515 p += 4; 3586 linger_register(lreq); /* before osd_req_op_* */
2516 3587 osd_req_op_watch_init(lreq->reg_req, 0, lreq->linger_id,
2517 /* data */ 3588 CEPH_OSD_WATCH_OP_WATCH);
2518 if (flags & CEPH_OSD_FLAG_WRITE) { 3589 osd_req_op_watch_init(lreq->ping_req, 0, lreq->linger_id,
2519 u16 data_off; 3590 CEPH_OSD_WATCH_OP_PING);
2520 3591 linger_submit(lreq);
2521 /* 3592 up_write(&osdc->lock);
2522 * The header "data_off" is a hint to the receiver 3593
2523 * allowing it to align received data into its 3594 ret = linger_reg_commit_wait(lreq);
2524 * buffers such that there's no need to re-copy 3595 if (ret) {
2525 * it before writing it to disk (direct I/O). 3596 linger_cancel(lreq);
2526 */ 3597 goto err_put_lreq;
2527 data_off = (u16) (off & 0xffff);
2528 req->r_request->hdr.data_off = cpu_to_le16(data_off);
2529 } 3598 }
2530 req->r_request->hdr.data_len = cpu_to_le32(data_len);
2531 3599
2532 BUG_ON(p > msg->front.iov_base + msg->front.iov_len); 3600 return lreq;
2533 msg_size = p - msg->front.iov_base;
2534 msg->front.iov_len = msg_size;
2535 msg->hdr.front_len = cpu_to_le32(msg_size);
2536 3601
2537 dout("build_request msg_size was %d\n", (int)msg_size); 3602err_put_lreq:
3603 linger_put(lreq);
3604 return ERR_PTR(ret);
2538} 3605}
2539EXPORT_SYMBOL(ceph_osdc_build_request); 3606EXPORT_SYMBOL(ceph_osdc_watch);
2540 3607
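
A hedged sketch of registering a watch with this new interface. The context struct and function names here are hypothetical, and the callback prototypes are assumed to follow the rados_watchcb2_t / rados_watcherrcb_t typedefs as rbd uses them:

	/* Hypothetical watcher context -- not part of this patch. */
	struct my_watcher {
		struct ceph_osd_client *osdc;
		struct ceph_object_id oid;
		struct ceph_object_locator oloc;
		struct ceph_osd_linger_request *handle;
	};

	static void my_watch_cb(void *arg, u64 notify_id, u64 cookie,
				u64 notifier_id, void *data, size_t data_len);
	static void my_watch_errcb(void *arg, u64 cookie, int err);

	static int my_watch_register(struct my_watcher *w)
	{
		w->handle = ceph_osdc_watch(w->osdc, &w->oid, &w->oloc,
					    my_watch_cb, my_watch_errcb, w);
		return PTR_ERR_OR_ZERO(w->handle);	/* handle owns a ref */
	}
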
2541/* 3608/*
2542 * Register request, send initial attempt. 3609 * Releases a ref.
3610 *
3611 * Times out after mount_timeout to preserve rbd unmap behaviour
3612 * introduced in 2894e1d76974 ("rbd: timeout watch teardown on unmap
3613 * with mount_timeout").
2543 */ 3614 */
2544int ceph_osdc_start_request(struct ceph_osd_client *osdc, 3615int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
2545 struct ceph_osd_request *req, 3616 struct ceph_osd_linger_request *lreq)
2546 bool nofail)
2547{ 3617{
2548 int rc; 3618 struct ceph_options *opts = osdc->client->options;
3619 struct ceph_osd_request *req;
3620 int ret;
2549 3621
2550 down_read(&osdc->map_sem); 3622 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
2551 mutex_lock(&osdc->request_mutex); 3623 if (!req)
3624 return -ENOMEM;
3625
3626 ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
3627 ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
3628 req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
3629 req->r_mtime = CURRENT_TIME;
3630 osd_req_op_watch_init(req, 0, lreq->linger_id,
3631 CEPH_OSD_WATCH_OP_UNWATCH);
2552 3632
2553 rc = __ceph_osdc_start_request(osdc, req, nofail); 3633 ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
3634 if (ret)
3635 goto out_put_req;
2554 3636
2555 mutex_unlock(&osdc->request_mutex); 3637 ceph_osdc_start_request(osdc, req, false);
2556 up_read(&osdc->map_sem); 3638 linger_cancel(lreq);
3639 linger_put(lreq);
3640 ret = wait_request_timeout(req, opts->mount_timeout);
2557 3641
2558 return rc; 3642out_put_req:
3643 ceph_osdc_put_request(req);
3644 return ret;
2559} 3645}
2560EXPORT_SYMBOL(ceph_osdc_start_request); 3646EXPORT_SYMBOL(ceph_osdc_unwatch);
2561 3647
2562/* 3648static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which,
2563 * Unregister a registered request. The request is not completed (i.e. 3649 u64 notify_id, u64 cookie, void *payload,
2564 * no callbacks or wakeups) - higher layers are supposed to know what 3650 size_t payload_len)
2565 * they are canceling.
2566 */
2567void ceph_osdc_cancel_request(struct ceph_osd_request *req)
2568{ 3651{
2569 struct ceph_osd_client *osdc = req->r_osdc; 3652 struct ceph_osd_req_op *op;
3653 struct ceph_pagelist *pl;
3654 int ret;
2570 3655
2571 mutex_lock(&osdc->request_mutex); 3656 op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0);
2572 if (req->r_linger) 3657
2573 __unregister_linger_request(osdc, req); 3658 pl = kmalloc(sizeof(*pl), GFP_NOIO);
2574 __unregister_request(osdc, req); 3659 if (!pl)
2575 mutex_unlock(&osdc->request_mutex); 3660 return -ENOMEM;
3661
3662 ceph_pagelist_init(pl);
3663 ret = ceph_pagelist_encode_64(pl, notify_id);
3664 ret |= ceph_pagelist_encode_64(pl, cookie);
3665 if (payload) {
3666 ret |= ceph_pagelist_encode_32(pl, payload_len);
3667 ret |= ceph_pagelist_append(pl, payload, payload_len);
3668 } else {
3669 ret |= ceph_pagelist_encode_32(pl, 0);
3670 }
3671 if (ret) {
3672 ceph_pagelist_release(pl);
3673 return -ENOMEM;
3674 }
2576 3675
2577 dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid); 3676 ceph_osd_data_pagelist_init(&op->notify_ack.request_data, pl);
3677 op->indata_len = pl->length;
3678 return 0;
2578} 3679}
2579EXPORT_SYMBOL(ceph_osdc_cancel_request);
2580 3680
2581/* 3681int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
2582 * wait for a request to complete 3682 struct ceph_object_id *oid,
2583 */ 3683 struct ceph_object_locator *oloc,
2584int ceph_osdc_wait_request(struct ceph_osd_client *osdc, 3684 u64 notify_id,
2585 struct ceph_osd_request *req) 3685 u64 cookie,
3686 void *payload,
3687 size_t payload_len)
2586{ 3688{
2587 int rc; 3689 struct ceph_osd_request *req;
3690 int ret;
2588 3691
2589 dout("%s %p tid %llu\n", __func__, req, req->r_tid); 3692 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
3693 if (!req)
3694 return -ENOMEM;
2590 3695
2591 rc = wait_for_completion_interruptible(&req->r_completion); 3696 ceph_oid_copy(&req->r_base_oid, oid);
2592 if (rc < 0) { 3697 ceph_oloc_copy(&req->r_base_oloc, oloc);
2593 dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid); 3698 req->r_flags = CEPH_OSD_FLAG_READ;
2594 ceph_osdc_cancel_request(req); 3699
2595 complete_request(req); 3700 ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
2596 return rc; 3701 if (ret)
3702 goto out_put_req;
3703
3704 ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload,
3705 payload_len);
3706 if (ret)
3707 goto out_put_req;
3708
3709 ceph_osdc_start_request(osdc, req, false);
3710 ret = ceph_osdc_wait_request(osdc, req);
3711
3712out_put_req:
3713 ceph_osdc_put_request(req);
3714 return ret;
3715}
3716EXPORT_SYMBOL(ceph_osdc_notify_ack);
3717
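
Continuing the hypothetical watcher sketched above: a watch callback would normally acknowledge each notify so that the notifier's ceph_osdc_notify() completes instead of waiting out its timeout (a sketch, not this patch's code):

	static void my_watch_cb(void *arg, u64 notify_id, u64 cookie,
				u64 notifier_id, void *data, size_t data_len)
	{
		struct my_watcher *w = arg;

		/* act on the payload (data, data_len) here, then ack */
		ceph_osdc_notify_ack(w->osdc, &w->oid, &w->oloc,
				     notify_id, cookie, NULL, 0);
	}
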
3718static int osd_req_op_notify_init(struct ceph_osd_request *req, int which,
3719 u64 cookie, u32 prot_ver, u32 timeout,
3720 void *payload, size_t payload_len)
3721{
3722 struct ceph_osd_req_op *op;
3723 struct ceph_pagelist *pl;
3724 int ret;
3725
3726 op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0);
3727 op->notify.cookie = cookie;
3728
3729 pl = kmalloc(sizeof(*pl), GFP_NOIO);
3730 if (!pl)
3731 return -ENOMEM;
3732
3733 ceph_pagelist_init(pl);
3734 ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */
3735 ret |= ceph_pagelist_encode_32(pl, timeout);
3736 ret |= ceph_pagelist_encode_32(pl, payload_len);
3737 ret |= ceph_pagelist_append(pl, payload, payload_len);
3738 if (ret) {
3739 ceph_pagelist_release(pl);
3740 return -ENOMEM;
2597 } 3741 }
2598 3742
2599 dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid, 3743 ceph_osd_data_pagelist_init(&op->notify.request_data, pl);
2600 req->r_result); 3744 op->indata_len = pl->length;
2601 return req->r_result; 3745 return 0;
2602} 3746}
2603EXPORT_SYMBOL(ceph_osdc_wait_request);
2604 3747
2605/* 3748/*
2606 * sync - wait for all in-flight requests to flush. avoid starvation. 3749 * @timeout: in seconds
3750 *
3751 * @preply_{pages,len} are initialized both on success and error.
3752 * The caller is responsible for:
3753 *
3754 * ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len))
2607 */ 3755 */
2608void ceph_osdc_sync(struct ceph_osd_client *osdc) 3756int ceph_osdc_notify(struct ceph_osd_client *osdc,
3757 struct ceph_object_id *oid,
3758 struct ceph_object_locator *oloc,
3759 void *payload,
3760 size_t payload_len,
3761 u32 timeout,
3762 struct page ***preply_pages,
3763 size_t *preply_len)
2609{ 3764{
2610 struct ceph_osd_request *req; 3765 struct ceph_osd_linger_request *lreq;
2611 u64 last_tid, next_tid = 0; 3766 struct page **pages;
3767 int ret;
2612 3768
2613 mutex_lock(&osdc->request_mutex); 3769 WARN_ON(!timeout);
2614 last_tid = osdc->last_tid; 3770 if (preply_pages) {
2615 while (1) { 3771 *preply_pages = NULL;
2616 req = __lookup_request_ge(osdc, next_tid); 3772 *preply_len = 0;
2617 if (!req) 3773 }
2618 break;
2619 if (req->r_tid > last_tid)
2620 break;
2621 3774
2622 next_tid = req->r_tid + 1; 3775 lreq = linger_alloc(osdc);
2623 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0) 3776 if (!lreq)
2624 continue; 3777 return -ENOMEM;
2625 3778
2626 ceph_osdc_get_request(req); 3779 lreq->preply_pages = preply_pages;
2627 mutex_unlock(&osdc->request_mutex); 3780 lreq->preply_len = preply_len;
2628 dout("sync waiting on tid %llu (last is %llu)\n", 3781
2629 req->r_tid, last_tid); 3782 ceph_oid_copy(&lreq->t.base_oid, oid);
2630 wait_for_completion(&req->r_safe_completion); 3783 ceph_oloc_copy(&lreq->t.base_oloc, oloc);
2631 mutex_lock(&osdc->request_mutex); 3784 lreq->t.flags = CEPH_OSD_FLAG_READ;
2632 ceph_osdc_put_request(req); 3785
3786 lreq->reg_req = alloc_linger_request(lreq);
3787 if (!lreq->reg_req) {
3788 ret = -ENOMEM;
3789 goto out_put_lreq;
2633 } 3790 }
2634 mutex_unlock(&osdc->request_mutex); 3791
2635 dout("sync done (thru tid %llu)\n", last_tid); 3792 /* for notify_id */
3793 pages = ceph_alloc_page_vector(1, GFP_NOIO);
3794 if (IS_ERR(pages)) {
3795 ret = PTR_ERR(pages);
3796 goto out_put_lreq;
3797 }
3798
3799 down_write(&osdc->lock);
3800 linger_register(lreq); /* before osd_req_op_* */
3801 ret = osd_req_op_notify_init(lreq->reg_req, 0, lreq->linger_id, 1,
3802 timeout, payload, payload_len);
3803 if (ret) {
3804 linger_unregister(lreq);
3805 up_write(&osdc->lock);
3806 ceph_release_page_vector(pages, 1);
3807 goto out_put_lreq;
3808 }
3809 ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify,
3810 response_data),
3811 pages, PAGE_SIZE, 0, false, true);
3812 linger_submit(lreq);
3813 up_write(&osdc->lock);
3814
3815 ret = linger_reg_commit_wait(lreq);
3816 if (!ret)
3817 ret = linger_notify_finish_wait(lreq);
3818 else
3819 dout("lreq %p failed to initiate notify %d\n", lreq, ret);
3820
3821 linger_cancel(lreq);
3822out_put_lreq:
3823 linger_put(lreq);
3824 return ret;
3825}
3826EXPORT_SYMBOL(ceph_osdc_notify);
3827
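
On the notifier side, the contract documented above means the reply page vector must always be released, whether or not the notify succeeded. A hedged sketch (the caller name and the 5-second timeout are made up):

	static int my_notify(struct ceph_osd_client *osdc,
			     struct ceph_object_id *oid,
			     struct ceph_object_locator *oloc)
	{
		struct page **reply_pages;
		size_t reply_len;
		static const char payload[] = "ping";
		int ret;

		ret = ceph_osdc_notify(osdc, oid, oloc, (void *)payload,
				       sizeof(payload), 5 /* seconds */,
				       &reply_pages, &reply_len);
		/* initialized on success and error alike -- always release */
		if (reply_pages)
			ceph_release_page_vector(reply_pages,
						 calc_pages_for(0, reply_len));
		return ret;
	}
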
3828/*
3829 * Return the number of milliseconds since the watch was last
3830 * confirmed, or an error. If there is an error, the watch is no
3831 * longer valid, and should be destroyed with ceph_osdc_unwatch().
3832 */
3833int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
3834 struct ceph_osd_linger_request *lreq)
3835{
3836 unsigned long stamp, age;
3837 int ret;
3838
3839 down_read(&osdc->lock);
3840 mutex_lock(&lreq->lock);
3841 stamp = lreq->watch_valid_thru;
3842 if (!list_empty(&lreq->pending_lworks)) {
3843 struct linger_work *lwork =
3844 list_first_entry(&lreq->pending_lworks,
3845 struct linger_work,
3846 pending_item);
3847
3848 if (time_before(lwork->queued_stamp, stamp))
3849 stamp = lwork->queued_stamp;
3850 }
3851 age = jiffies - stamp;
3852 dout("%s lreq %p linger_id %llu age %lu last_error %d\n", __func__,
3853 lreq, lreq->linger_id, age, lreq->last_error);
3854 /* we are truncating to msecs, so return a safe upper bound */
3855 ret = lreq->last_error ?: 1 + jiffies_to_msecs(age);
3856
3857 mutex_unlock(&lreq->lock);
3858 up_read(&osdc->lock);
3859 return ret;
2636} 3860}
2637EXPORT_SYMBOL(ceph_osdc_sync);
2638 3861
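
A sketch of how a client might drive this check from a periodic timer, roughly as rbd's watch machinery does; the re-registration step is elided and the names are hypothetical:

	/* Hypothetical periodic check, e.g. from a delayed work item. */
	static void my_watch_timer(struct my_watcher *w)
	{
		int ret = ceph_osdc_watch_check(w->osdc, w->handle);

		if (ret < 0) {
			/* no longer valid -- destroy, then re-register */
			ceph_osdc_unwatch(w->osdc, w->handle);
			w->handle = NULL;
		}
		/* ret > 0: ms since the watch was last confirmed */
	}
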
2639/* 3862/*
2640 * Call all pending notify callbacks - for use after a watch is 3863 * Call all pending notify callbacks - for use after a watch is
@@ -2646,6 +3869,13 @@ void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
2646} 3869}
2647EXPORT_SYMBOL(ceph_osdc_flush_notifies); 3870EXPORT_SYMBOL(ceph_osdc_flush_notifies);
2648 3871
3872void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc)
3873{
3874 down_read(&osdc->lock);
3875 maybe_request_map(osdc);
3876 up_read(&osdc->lock);
3877}
3878EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
2649 3879
2650/* 3880/*
2651 * init, shutdown 3881 * init, shutdown
@@ -2656,43 +3886,35 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
2656 3886
2657 dout("init\n"); 3887 dout("init\n");
2658 osdc->client = client; 3888 osdc->client = client;
2659 osdc->osdmap = NULL; 3889 init_rwsem(&osdc->lock);
2660 init_rwsem(&osdc->map_sem);
2661 init_completion(&osdc->map_waiters);
2662 osdc->last_requested_map = 0;
2663 mutex_init(&osdc->request_mutex);
2664 osdc->last_tid = 0;
2665 osdc->osds = RB_ROOT; 3890 osdc->osds = RB_ROOT;
2666 INIT_LIST_HEAD(&osdc->osd_lru); 3891 INIT_LIST_HEAD(&osdc->osd_lru);
2667 osdc->requests = RB_ROOT; 3892 spin_lock_init(&osdc->osd_lru_lock);
2668 INIT_LIST_HEAD(&osdc->req_lru); 3893 osd_init(&osdc->homeless_osd);
2669 INIT_LIST_HEAD(&osdc->req_unsent); 3894 osdc->homeless_osd.o_osdc = osdc;
2670 INIT_LIST_HEAD(&osdc->req_notarget); 3895 osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD;
2671 INIT_LIST_HEAD(&osdc->req_linger); 3896 osdc->linger_requests = RB_ROOT;
2672 osdc->num_requests = 0; 3897 osdc->map_checks = RB_ROOT;
3898 osdc->linger_map_checks = RB_ROOT;
2673 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); 3899 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
2674 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); 3900 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
2675 spin_lock_init(&osdc->event_lock);
2676 osdc->event_tree = RB_ROOT;
2677 osdc->event_count = 0;
2678
2679 schedule_delayed_work(&osdc->osds_timeout_work,
2680 round_jiffies_relative(osdc->client->options->osd_idle_ttl));
2681 3901
2682 err = -ENOMEM; 3902 err = -ENOMEM;
3903 osdc->osdmap = ceph_osdmap_alloc();
3904 if (!osdc->osdmap)
3905 goto out;
3906
2683 osdc->req_mempool = mempool_create_slab_pool(10, 3907 osdc->req_mempool = mempool_create_slab_pool(10,
2684 ceph_osd_request_cache); 3908 ceph_osd_request_cache);
2685 if (!osdc->req_mempool) 3909 if (!osdc->req_mempool)
2686 goto out; 3910 goto out_map;
2687 3911
2688 err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP, 3912 err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
2689 OSD_OP_FRONT_LEN, 10, true, 3913 PAGE_SIZE, 10, true, "osd_op");
2690 "osd_op");
2691 if (err < 0) 3914 if (err < 0)
2692 goto out_mempool; 3915 goto out_mempool;
2693 err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY, 3916 err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
2694 OSD_OPREPLY_FRONT_LEN, 10, true, 3917 PAGE_SIZE, 10, true, "osd_op_reply");
2695 "osd_op_reply");
2696 if (err < 0) 3918 if (err < 0)
2697 goto out_msgpool; 3919 goto out_msgpool;
2698 3920
@@ -2701,6 +3923,11 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
2701 if (!osdc->notify_wq) 3923 if (!osdc->notify_wq)
2702 goto out_msgpool_reply; 3924 goto out_msgpool_reply;
2703 3925
3926 schedule_delayed_work(&osdc->timeout_work,
3927 osdc->client->options->osd_keepalive_timeout);
3928 schedule_delayed_work(&osdc->osds_timeout_work,
3929 round_jiffies_relative(osdc->client->options->osd_idle_ttl));
3930
2704 return 0; 3931 return 0;
2705 3932
2706out_msgpool_reply: 3933out_msgpool_reply:
@@ -2709,6 +3936,8 @@ out_msgpool:
2709 ceph_msgpool_destroy(&osdc->msgpool_op); 3936 ceph_msgpool_destroy(&osdc->msgpool_op);
2710out_mempool: 3937out_mempool:
2711 mempool_destroy(osdc->req_mempool); 3938 mempool_destroy(osdc->req_mempool);
3939out_map:
3940 ceph_osdmap_destroy(osdc->osdmap);
2712out: 3941out:
2713 return err; 3942 return err;
2714} 3943}
@@ -2719,11 +3948,25 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
2719 destroy_workqueue(osdc->notify_wq); 3948 destroy_workqueue(osdc->notify_wq);
2720 cancel_delayed_work_sync(&osdc->timeout_work); 3949 cancel_delayed_work_sync(&osdc->timeout_work);
2721 cancel_delayed_work_sync(&osdc->osds_timeout_work); 3950 cancel_delayed_work_sync(&osdc->osds_timeout_work);
2722 if (osdc->osdmap) { 3951
2723 ceph_osdmap_destroy(osdc->osdmap); 3952 down_write(&osdc->lock);
2724 osdc->osdmap = NULL; 3953 while (!RB_EMPTY_ROOT(&osdc->osds)) {
3954 struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
3955 struct ceph_osd, o_node);
3956 close_osd(osd);
2725 } 3957 }
2726 remove_all_osds(osdc); 3958 up_write(&osdc->lock);
3959 WARN_ON(atomic_read(&osdc->homeless_osd.o_ref) != 1);
3960 osd_cleanup(&osdc->homeless_osd);
3961
3962 WARN_ON(!list_empty(&osdc->osd_lru));
3963 WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_requests));
3964 WARN_ON(!RB_EMPTY_ROOT(&osdc->map_checks));
3965 WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_map_checks));
3966 WARN_ON(atomic_read(&osdc->num_requests));
3967 WARN_ON(atomic_read(&osdc->num_homeless));
3968
3969 ceph_osdmap_destroy(osdc->osdmap);
2727 mempool_destroy(osdc->req_mempool); 3970 mempool_destroy(osdc->req_mempool);
2728 ceph_msgpool_destroy(&osdc->msgpool_op); 3971 ceph_msgpool_destroy(&osdc->msgpool_op);
2729 ceph_msgpool_destroy(&osdc->msgpool_op_reply); 3972 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
@@ -2752,15 +3995,12 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
2752 return PTR_ERR(req); 3995 return PTR_ERR(req);
2753 3996
2754 /* it may be a short read due to an object boundary */ 3997 /* it may be a short read due to an object boundary */
2755
2756 osd_req_op_extent_osd_data_pages(req, 0, 3998 osd_req_op_extent_osd_data_pages(req, 0,
2757 pages, *plen, page_align, false, false); 3999 pages, *plen, page_align, false, false);
2758 4000
2759 dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", 4001 dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
2760 off, *plen, *plen, page_align); 4002 off, *plen, *plen, page_align);
2761 4003
2762 ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
2763
2764 rc = ceph_osdc_start_request(osdc, req, false); 4004 rc = ceph_osdc_start_request(osdc, req, false);
2765 if (!rc) 4005 if (!rc)
2766 rc = ceph_osdc_wait_request(osdc, req); 4006 rc = ceph_osdc_wait_request(osdc, req);
@@ -2786,7 +4026,6 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
2786 int rc = 0; 4026 int rc = 0;
2787 int page_align = off & ~PAGE_MASK; 4027 int page_align = off & ~PAGE_MASK;
2788 4028
2789 BUG_ON(vino.snap != CEPH_NOSNAP); /* snapshots aren't writeable */
2790 req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, 4029 req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
2791 CEPH_OSD_OP_WRITE, 4030 CEPH_OSD_OP_WRITE,
2792 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, 4031 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
@@ -2800,8 +4039,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
2800 false, false); 4039 false, false);
2801 dout("writepages %llu~%llu (%llu bytes)\n", off, len, len); 4040 dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
2802 4041
2803 ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime); 4042 req->r_mtime = *mtime;
2804
2805 rc = ceph_osdc_start_request(osdc, req, true); 4043 rc = ceph_osdc_start_request(osdc, req, true);
2806 if (!rc) 4044 if (!rc)
2807 rc = ceph_osdc_wait_request(osdc, req); 4045 rc = ceph_osdc_wait_request(osdc, req);
@@ -2841,19 +4079,15 @@ EXPORT_SYMBOL(ceph_osdc_cleanup);
2841static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) 4079static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
2842{ 4080{
2843 struct ceph_osd *osd = con->private; 4081 struct ceph_osd *osd = con->private;
2844 struct ceph_osd_client *osdc; 4082 struct ceph_osd_client *osdc = osd->o_osdc;
2845 int type = le16_to_cpu(msg->hdr.type); 4083 int type = le16_to_cpu(msg->hdr.type);
2846 4084
2847 if (!osd)
2848 goto out;
2849 osdc = osd->o_osdc;
2850
2851 switch (type) { 4085 switch (type) {
2852 case CEPH_MSG_OSD_MAP: 4086 case CEPH_MSG_OSD_MAP:
2853 ceph_osdc_handle_map(osdc, msg); 4087 ceph_osdc_handle_map(osdc, msg);
2854 break; 4088 break;
2855 case CEPH_MSG_OSD_OPREPLY: 4089 case CEPH_MSG_OSD_OPREPLY:
2856 handle_reply(osdc, msg); 4090 handle_reply(osd, msg);
2857 break; 4091 break;
2858 case CEPH_MSG_WATCH_NOTIFY: 4092 case CEPH_MSG_WATCH_NOTIFY:
2859 handle_watch_notify(osdc, msg); 4093 handle_watch_notify(osdc, msg);
@@ -2863,7 +4097,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
2863 pr_err("received unknown message type %d %s\n", type, 4097 pr_err("received unknown message type %d %s\n", type,
2864 ceph_msg_type_name(type)); 4098 ceph_msg_type_name(type));
2865 } 4099 }
2866out: 4100
2867 ceph_msg_put(msg); 4101 ceph_msg_put(msg);
2868} 4102}
2869 4103
@@ -2878,21 +4112,27 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
2878{ 4112{
2879 struct ceph_osd *osd = con->private; 4113 struct ceph_osd *osd = con->private;
2880 struct ceph_osd_client *osdc = osd->o_osdc; 4114 struct ceph_osd_client *osdc = osd->o_osdc;
2881 struct ceph_msg *m; 4115 struct ceph_msg *m = NULL;
2882 struct ceph_osd_request *req; 4116 struct ceph_osd_request *req;
2883 int front_len = le32_to_cpu(hdr->front_len); 4117 int front_len = le32_to_cpu(hdr->front_len);
2884 int data_len = le32_to_cpu(hdr->data_len); 4118 int data_len = le32_to_cpu(hdr->data_len);
2885 u64 tid; 4119 u64 tid = le64_to_cpu(hdr->tid);
2886 4120
2887 tid = le64_to_cpu(hdr->tid); 4121 down_read(&osdc->lock);
2888 mutex_lock(&osdc->request_mutex); 4122 if (!osd_registered(osd)) {
2889 req = __lookup_request(osdc, tid); 4123 dout("%s osd%d unknown, skipping\n", __func__, osd->o_osd);
4124 *skip = 1;
4125 goto out_unlock_osdc;
4126 }
4127 WARN_ON(osd->o_osd != le64_to_cpu(hdr->src.num));
4128
4129 mutex_lock(&osd->lock);
4130 req = lookup_request(&osd->o_requests, tid);
2890 if (!req) { 4131 if (!req) {
2891 dout("%s osd%d tid %llu unknown, skipping\n", __func__, 4132 dout("%s osd%d tid %llu unknown, skipping\n", __func__,
2892 osd->o_osd, tid); 4133 osd->o_osd, tid);
2893 m = NULL;
2894 *skip = 1; 4134 *skip = 1;
2895 goto out; 4135 goto out_unlock_session;
2896 } 4136 }
2897 4137
2898 ceph_msg_revoke_incoming(req->r_reply); 4138 ceph_msg_revoke_incoming(req->r_reply);
@@ -2904,7 +4144,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
2904 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS, 4144 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
2905 false); 4145 false);
2906 if (!m) 4146 if (!m)
2907 goto out; 4147 goto out_unlock_session;
2908 ceph_msg_put(req->r_reply); 4148 ceph_msg_put(req->r_reply);
2909 req->r_reply = m; 4149 req->r_reply = m;
2910 } 4150 }
@@ -2915,14 +4155,49 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
2915 req->r_reply->data_length); 4155 req->r_reply->data_length);
2916 m = NULL; 4156 m = NULL;
2917 *skip = 1; 4157 *skip = 1;
2918 goto out; 4158 goto out_unlock_session;
2919 } 4159 }
2920 4160
2921 m = ceph_msg_get(req->r_reply); 4161 m = ceph_msg_get(req->r_reply);
2922 dout("get_reply tid %lld %p\n", tid, m); 4162 dout("get_reply tid %lld %p\n", tid, m);
2923 4163
2924out: 4164out_unlock_session:
2925 mutex_unlock(&osdc->request_mutex); 4165 mutex_unlock(&osd->lock);
4166out_unlock_osdc:
4167 up_read(&osdc->lock);
4168 return m;
4169}
4170
4171/*
4172 * TODO: switch to a msg-owned pagelist
4173 */
4174static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
4175{
4176 struct ceph_msg *m;
4177 int type = le16_to_cpu(hdr->type);
4178 u32 front_len = le32_to_cpu(hdr->front_len);
4179 u32 data_len = le32_to_cpu(hdr->data_len);
4180
4181 m = ceph_msg_new(type, front_len, GFP_NOIO, false);
4182 if (!m)
4183 return NULL;
4184
4185 if (data_len) {
4186 struct page **pages;
4187 struct ceph_osd_data osd_data;
4188
4189 pages = ceph_alloc_page_vector(calc_pages_for(0, data_len),
4190 GFP_NOIO);
4191 if (IS_ERR(pages)) { /* ceph_alloc_page_vector() returns ERR_PTR, never NULL */
4192 ceph_msg_put(m);
4193 return NULL;
4194 }
4195
4196 ceph_osd_data_pages_init(&osd_data, pages, data_len, 0, false,
4197 false);
4198 ceph_osdc_msg_data_add(m, &osd_data);
4199 }
4200
2926 return m; 4201 return m;
2927} 4202}
2928 4203
@@ -2932,18 +4207,17 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
2932{ 4207{
2933 struct ceph_osd *osd = con->private; 4208 struct ceph_osd *osd = con->private;
2934 int type = le16_to_cpu(hdr->type); 4209 int type = le16_to_cpu(hdr->type);
2935 int front = le32_to_cpu(hdr->front_len);
2936 4210
2937 *skip = 0; 4211 *skip = 0;
2938 switch (type) { 4212 switch (type) {
2939 case CEPH_MSG_OSD_MAP: 4213 case CEPH_MSG_OSD_MAP:
2940 case CEPH_MSG_WATCH_NOTIFY: 4214 case CEPH_MSG_WATCH_NOTIFY:
2941 return ceph_msg_new(type, front, GFP_NOFS, false); 4215 return alloc_msg_with_page_vector(hdr);
2942 case CEPH_MSG_OSD_OPREPLY: 4216 case CEPH_MSG_OSD_OPREPLY:
2943 return get_reply(con, hdr, skip); 4217 return get_reply(con, hdr, skip);
2944 default: 4218 default:
2945 pr_info("alloc_msg unexpected msg type %d from osd%d\n", type, 4219 pr_warn("%s osd%d unknown msg type %d, skipping\n", __func__,
2946 osd->o_osd); 4220 osd->o_osd, type);
2947 *skip = 1; 4221 *skip = 1;
2948 return NULL; 4222 return NULL;
2949 } 4223 }
@@ -3047,5 +4321,5 @@ static const struct ceph_connection_operations osd_con_ops = {
3047 .alloc_msg = alloc_msg, 4321 .alloc_msg = alloc_msg,
3048 .sign_message = osd_sign_message, 4322 .sign_message = osd_sign_message,
3049 .check_message_signature = osd_check_message_signature, 4323 .check_message_signature = osd_check_message_signature,
3050 .fault = osd_reset, 4324 .fault = osd_fault,
3051}; 4325};
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 243574c8cf33..cde52e94732f 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -380,23 +380,24 @@ bad:
380 return ERR_PTR(err); 380 return ERR_PTR(err);
381} 381}
382 382
383/* 383int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs)
384 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
385 * to a set of osds) and primary_temp (explicit primary setting)
386 */
387static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
388{ 384{
389 if (l.pool < r.pool) 385 if (lhs->pool < rhs->pool)
390 return -1; 386 return -1;
391 if (l.pool > r.pool) 387 if (lhs->pool > rhs->pool)
392 return 1; 388 return 1;
393 if (l.seed < r.seed) 389 if (lhs->seed < rhs->seed)
394 return -1; 390 return -1;
395 if (l.seed > r.seed) 391 if (lhs->seed > rhs->seed)
396 return 1; 392 return 1;
393
397 return 0; 394 return 0;
398} 395}
399 396
397/*
398 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
399 * to a set of osds) and primary_temp (explicit primary setting)
400 */
400static int __insert_pg_mapping(struct ceph_pg_mapping *new, 401static int __insert_pg_mapping(struct ceph_pg_mapping *new,
401 struct rb_root *root) 402 struct rb_root *root)
402{ 403{
@@ -409,7 +410,7 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new,
409 while (*p) { 410 while (*p) {
410 parent = *p; 411 parent = *p;
411 pg = rb_entry(parent, struct ceph_pg_mapping, node); 412 pg = rb_entry(parent, struct ceph_pg_mapping, node);
412 c = pgid_cmp(new->pgid, pg->pgid); 413 c = ceph_pg_compare(&new->pgid, &pg->pgid);
413 if (c < 0) 414 if (c < 0)
414 p = &(*p)->rb_left; 415 p = &(*p)->rb_left;
415 else if (c > 0) 416 else if (c > 0)
@@ -432,7 +433,7 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
432 433
433 while (n) { 434 while (n) {
434 pg = rb_entry(n, struct ceph_pg_mapping, node); 435 pg = rb_entry(n, struct ceph_pg_mapping, node);
435 c = pgid_cmp(pgid, pg->pgid); 436 c = ceph_pg_compare(&pgid, &pg->pgid);
436 if (c < 0) { 437 if (c < 0) {
437 n = n->rb_left; 438 n = n->rb_left;
438 } else if (c > 0) { 439 } else if (c > 0) {
@@ -596,7 +597,9 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
596 *p += 4; /* skip crash_replay_interval */ 597 *p += 4; /* skip crash_replay_interval */
597 598
598 if (ev >= 7) 599 if (ev >= 7)
599 *p += 1; /* skip min_size */ 600 pi->min_size = ceph_decode_8(p);
601 else
602 pi->min_size = pi->size - pi->size / 2;
600 603
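
For pools encoded before v7 the fallback appears to follow the same default userspace uses: with integer division, min_size = size - size/2, so a size-3 replicated pool gets min_size 2, size 2 gives 1, and size 1 gives 1.
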
601 if (ev >= 8) 604 if (ev >= 8)
602 *p += 8 + 8; /* skip quota_max_* */ 605 *p += 8 + 8; /* skip quota_max_* */
@@ -616,6 +619,50 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
616 pi->write_tier = -1; 619 pi->write_tier = -1;
617 } 620 }
618 621
622 if (ev >= 10) {
623 /* skip properties */
624 num = ceph_decode_32(p);
625 while (num--) {
626 len = ceph_decode_32(p);
627 *p += len; /* key */
628 len = ceph_decode_32(p);
629 *p += len; /* val */
630 }
631 }
632
633 if (ev >= 11) {
634 /* skip hit_set_params */
635 *p += 1 + 1; /* versions */
636 len = ceph_decode_32(p);
637 *p += len;
638
639 *p += 4; /* skip hit_set_period */
640 *p += 4; /* skip hit_set_count */
641 }
642
643 if (ev >= 12)
644 *p += 4; /* skip stripe_width */
645
646 if (ev >= 13) {
647 *p += 8; /* skip target_max_bytes */
648 *p += 8; /* skip target_max_objects */
649 *p += 4; /* skip cache_target_dirty_ratio_micro */
650 *p += 4; /* skip cache_target_full_ratio_micro */
651 *p += 4; /* skip cache_min_flush_age */
652 *p += 4; /* skip cache_min_evict_age */
653 }
654
655 if (ev >= 14) {
656 /* skip erasure_code_profile */
657 len = ceph_decode_32(p);
658 *p += len;
659 }
660
661 if (ev >= 15)
662 pi->last_force_request_resend = ceph_decode_32(p);
663 else
664 pi->last_force_request_resend = 0;
665
619 /* ignore the rest */ 666 /* ignore the rest */
620 667
621 *p = pool_end; 668 *p = pool_end;
@@ -660,6 +707,23 @@ bad:
660/* 707/*
661 * osd map 708 * osd map
662 */ 709 */
710struct ceph_osdmap *ceph_osdmap_alloc(void)
711{
712 struct ceph_osdmap *map;
713
714 map = kzalloc(sizeof(*map), GFP_NOIO);
715 if (!map)
716 return NULL;
717
718 map->pg_pools = RB_ROOT;
719 map->pool_max = -1;
720 map->pg_temp = RB_ROOT;
721 map->primary_temp = RB_ROOT;
722 mutex_init(&map->crush_scratch_mutex);
723
724 return map;
725}
726
663void ceph_osdmap_destroy(struct ceph_osdmap *map) 727void ceph_osdmap_destroy(struct ceph_osdmap *map)
664{ 728{
665 dout("osdmap_destroy %p\n", map); 729 dout("osdmap_destroy %p\n", map);
@@ -1183,14 +1247,10 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
1183 struct ceph_osdmap *map; 1247 struct ceph_osdmap *map;
1184 int ret; 1248 int ret;
1185 1249
1186 map = kzalloc(sizeof(*map), GFP_NOFS); 1250 map = ceph_osdmap_alloc();
1187 if (!map) 1251 if (!map)
1188 return ERR_PTR(-ENOMEM); 1252 return ERR_PTR(-ENOMEM);
1189 1253
1190 map->pg_temp = RB_ROOT;
1191 map->primary_temp = RB_ROOT;
1192 mutex_init(&map->crush_scratch_mutex);
1193
1194 ret = osdmap_decode(p, end, map); 1254 ret = osdmap_decode(p, end, map);
1195 if (ret) { 1255 if (ret) {
1196 ceph_osdmap_destroy(map); 1256 ceph_osdmap_destroy(map);
@@ -1204,8 +1264,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
1204 * decode and apply an incremental map update. 1264 * decode and apply an incremental map update.
1205 */ 1265 */
1206struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, 1266struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
1207 struct ceph_osdmap *map, 1267 struct ceph_osdmap *map)
1208 struct ceph_messenger *msgr)
1209{ 1268{
1210 struct crush_map *newcrush = NULL; 1269 struct crush_map *newcrush = NULL;
1211 struct ceph_fsid fsid; 1270 struct ceph_fsid fsid;
@@ -1381,8 +1440,252 @@ bad:
1381 return ERR_PTR(err); 1440 return ERR_PTR(err);
1382} 1441}
1383 1442
1443void ceph_oid_copy(struct ceph_object_id *dest,
1444 const struct ceph_object_id *src)
1445{
1446 WARN_ON(!ceph_oid_empty(dest));
1447
1448 if (src->name != src->inline_name) {
1449 /* very rare, see ceph_object_id definition */
1450 dest->name = kmalloc(src->name_len + 1,
1451 GFP_NOIO | __GFP_NOFAIL);
1452 }
1453
1454 memcpy(dest->name, src->name, src->name_len + 1);
1455 dest->name_len = src->name_len;
1456}
1457EXPORT_SYMBOL(ceph_oid_copy);
1458
1459static __printf(2, 0)
1460int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap)
1461{
1462 int len;
1463
1464 WARN_ON(!ceph_oid_empty(oid));
1465
1466 len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap);
1467 if (len >= sizeof(oid->inline_name))
1468 return len;
1469
1470 oid->name_len = len;
1471 return 0;
1472}
1473
1474/*
1475 * If oid doesn't fit into inline buffer, BUG.
1476 */
1477void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...)
1478{
1479 va_list ap;
1480
1481 va_start(ap, fmt);
1482 BUG_ON(oid_printf_vargs(oid, fmt, ap));
1483 va_end(ap);
1484}
1485EXPORT_SYMBOL(ceph_oid_printf);
1486
1487static __printf(3, 0)
1488int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp,
1489 const char *fmt, va_list ap)
1490{
1491 va_list aq;
1492 int len;
1493
1494 va_copy(aq, ap);
1495 len = oid_printf_vargs(oid, fmt, aq);
1496 va_end(aq);
1497
1498 if (len) {
1499 char *external_name;
1500
1501 external_name = kmalloc(len + 1, gfp);
1502 if (!external_name)
1503 return -ENOMEM;
1504
1505 oid->name = external_name;
1506 WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len);
1507 oid->name_len = len;
1508 }
1509
1510 return 0;
1511}
1512
1513/*
1514 * If oid doesn't fit into inline buffer, allocate.
1515 */
1516int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
1517 const char *fmt, ...)
1518{
1519 va_list ap;
1520 int ret;
1521
1522 va_start(ap, fmt);
1523 ret = oid_aprintf_vargs(oid, gfp, fmt, ap);
1524 va_end(ap);
1525
1526 return ret;
1527}
1528EXPORT_SYMBOL(ceph_oid_aprintf);
1529
1530void ceph_oid_destroy(struct ceph_object_id *oid)
1531{
1532 if (oid->name != oid->inline_name)
1533 kfree(oid->name);
1534}
1535EXPORT_SYMBOL(ceph_oid_destroy);
1536
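
As a usage sketch: most object names fit the inline buffer, so ceph_oid_printf() suffices, while ceph_oid_aprintf() is the fallible variant for names that may need an external allocation (it returns -ENOMEM if that allocation fails). The format string below mirrors how file data objects are named in osd_client.c; ceph_oid_init() and the surrounding names are assumptions from the same series, not shown in this hunk:

	/* Sketch; ino/objnum stand in for real identifiers. */
	static void name_file_object(struct ceph_object_id *oid,
				     u64 ino, u64 objnum)
	{
		ceph_oid_init(oid);	/* points name at the inline buffer */
		ceph_oid_printf(oid, "%llx.%08llx", ino, objnum);
	}

Pair every oid with ceph_oid_destroy(), which only kfrees when the name was externally allocated.
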
1537/*
1538 * osds only
1539 */
1540static bool __osds_equal(const struct ceph_osds *lhs,
1541 const struct ceph_osds *rhs)
1542{
1543 if (lhs->size == rhs->size &&
1544 !memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0])))
1545 return true;
1546
1547 return false;
1548}
1549
1550/*
1551 * osds + primary
1552 */
1553static bool osds_equal(const struct ceph_osds *lhs,
1554 const struct ceph_osds *rhs)
1555{
1556 if (__osds_equal(lhs, rhs) &&
1557 lhs->primary == rhs->primary)
1558 return true;
1559
1560 return false;
1561}
1562
1563static bool osds_valid(const struct ceph_osds *set)
1564{
1565 /* non-empty set */
1566 if (set->size > 0 && set->primary >= 0)
1567 return true;
1568
1569 /* empty can_shift_osds set */
1570 if (!set->size && set->primary == -1)
1571 return true;
1572
1573 /* empty !can_shift_osds set - all NONE */
1574 if (set->size > 0 && set->primary == -1) {
1575 int i;
1576
1577 for (i = 0; i < set->size; i++) {
1578 if (set->osds[i] != CRUSH_ITEM_NONE)
1579 break;
1580 }
1581 if (i == set->size)
1582 return true;
1583 }
1584
1585 return false;
1586}
1587
1588void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src)
1589{
1590 memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0]));
1591 dest->size = src->size;
1592 dest->primary = src->primary;
1593}
1594
1595static bool is_split(const struct ceph_pg *pgid,
1596 u32 old_pg_num,
1597 u32 new_pg_num)
1598{
1599 int old_bits = calc_bits_of(old_pg_num);
1600 int old_mask = (1 << old_bits) - 1;
1601 int n;
1602
1603 WARN_ON(pgid->seed >= old_pg_num);
1604 if (new_pg_num <= old_pg_num)
1605 return false;
1606
1607 for (n = 1; ; n++) {
1608 int next_bit = n << (old_bits - 1);
1609 u32 s = next_bit | pgid->seed;
1610
1611 if (s < old_pg_num || s == pgid->seed)
1612 continue;
1613 if (s >= new_pg_num)
1614 break;
1615
1616 s = ceph_stable_mod(s, old_pg_num, old_mask);
1617 if (s == pgid->seed)
1618 return true;
1619 }
1620
1621 return false;
1622}
1623
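
A worked example for is_split(): growing a pool from old_pg_num = 4 (old_bits = 3, old_mask = 7) to new_pg_num = 8 and asking about seed 1. At n = 1 the candidate is s = 4 | 1 = 5, which is >= 4 and < 8, and ceph_stable_mod(5, 4, 7) folds back to 1, so PG x.1 is split (its new child is x.5) and ceph_is_new_interval() below treats that as an interval change.
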
1624bool ceph_is_new_interval(const struct ceph_osds *old_acting,
1625 const struct ceph_osds *new_acting,
1626 const struct ceph_osds *old_up,
1627 const struct ceph_osds *new_up,
1628 int old_size,
1629 int new_size,
1630 int old_min_size,
1631 int new_min_size,
1632 u32 old_pg_num,
1633 u32 new_pg_num,
1634 bool old_sort_bitwise,
1635 bool new_sort_bitwise,
1636 const struct ceph_pg *pgid)
1637{
1638 return !osds_equal(old_acting, new_acting) ||
1639 !osds_equal(old_up, new_up) ||
1640 old_size != new_size ||
1641 old_min_size != new_min_size ||
1642 is_split(pgid, old_pg_num, new_pg_num) ||
1643 old_sort_bitwise != new_sort_bitwise;
1644}
1645
1646static int calc_pg_rank(int osd, const struct ceph_osds *acting)
1647{
1648 int i;
1649
1650 for (i = 0; i < acting->size; i++) {
1651 if (acting->osds[i] == osd)
1652 return i;
1653 }
1654
1655 return -1;
1656}
1657
1658static bool primary_changed(const struct ceph_osds *old_acting,
1659 const struct ceph_osds *new_acting)
1660{
1661 if (!old_acting->size && !new_acting->size)
1662 return false; /* both still empty */
1384 1663
1664 if (!old_acting->size ^ !new_acting->size)
1665 return true; /* was empty, now not, or vice versa */
1385 1666
1667 if (old_acting->primary != new_acting->primary)
1668 return true; /* primary changed */
1669
1670 if (calc_pg_rank(old_acting->primary, old_acting) !=
1671 calc_pg_rank(new_acting->primary, new_acting))
1672 return true;
1673
1674 return false; /* same primary (tho replicas may have changed) */
1675}
1676
1677bool ceph_osds_changed(const struct ceph_osds *old_acting,
1678 const struct ceph_osds *new_acting,
1679 bool any_change)
1680{
1681 if (primary_changed(old_acting, new_acting))
1682 return true;
1683
1684 if (any_change && !__osds_equal(old_acting, new_acting))
1685 return true;
1686
1687 return false;
1688}
1386 1689
1387/* 1690/*
1388 * calculate file layout from given offset, length. 1691 * calculate file layout from given offset, length.
@@ -1455,30 +1758,71 @@ invalid:
1455EXPORT_SYMBOL(ceph_calc_file_object_mapping); 1758EXPORT_SYMBOL(ceph_calc_file_object_mapping);
1456 1759
1457/* 1760/*
1458 * Calculate mapping of a (oloc, oid) pair to a PG. Should only be 1761 * Map an object into a PG.
1459 * called with target's (oloc, oid), since tiering isn't taken into 1762 *
1460 * account. 1763 * Should only be called with target_oid and target_oloc (as opposed to
1764 * base_oid and base_oloc), since tiering isn't taken into account.
1461 */ 1765 */
1462int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, 1766int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
1463 struct ceph_object_locator *oloc, 1767 struct ceph_object_id *oid,
1464 struct ceph_object_id *oid, 1768 struct ceph_object_locator *oloc,
1465 struct ceph_pg *pg_out) 1769 struct ceph_pg *raw_pgid)
1466{ 1770{
1467 struct ceph_pg_pool_info *pi; 1771 struct ceph_pg_pool_info *pi;
1468 1772
1469 pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool); 1773 pi = ceph_pg_pool_by_id(osdmap, oloc->pool);
1470 if (!pi) 1774 if (!pi)
1471 return -EIO; 1775 return -ENOENT;
1472 1776
1473 pg_out->pool = oloc->pool; 1777 raw_pgid->pool = oloc->pool;
1474 pg_out->seed = ceph_str_hash(pi->object_hash, oid->name, 1778 raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
1475 oid->name_len); 1779 oid->name_len);
1476 1780
1477 dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name, 1781 dout("%s %*pE -> raw_pgid %llu.%x\n", __func__, oid->name_len,
1478 pg_out->pool, pg_out->seed); 1782 oid->name, raw_pgid->pool, raw_pgid->seed);
1479 return 0; 1783 return 0;
1480} 1784}
1481EXPORT_SYMBOL(ceph_oloc_oid_to_pg); 1785EXPORT_SYMBOL(ceph_object_locator_to_pg);
1786
1787/*
1788 * Map a raw PG (full precision ps) into an actual PG.
1789 */
1790static void raw_pg_to_pg(struct ceph_pg_pool_info *pi,
1791 const struct ceph_pg *raw_pgid,
1792 struct ceph_pg *pgid)
1793{
1794 pgid->pool = raw_pgid->pool;
1795 pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num,
1796 pi->pg_num_mask);
1797}
1798
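
For example, assuming the usual mask rule, pg_num = 12 gives pg_num_mask = 15 (next power of two, minus one). A raw seed of 5 is below pg_num and maps to itself, while seed 13 has 13 & 15 = 13 >= 12, so ceph_stable_mod() drops a bit and yields 13 & 7 = 5: seeds past the end of the pool fold back onto the lower half, which is what keeps PG membership stable as pg_num grows through non-power-of-two values.
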
1799/*
1800 * Map a raw PG (full precision ps) into a placement ps (placement
1801 * seed). Include pool id in that value so that different pools don't
1802 * use the same seeds.
1803 */
1804static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi,
1805 const struct ceph_pg *raw_pgid)
1806{
1807 if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
1808 /* hash pool id and seed so that pool PGs do not overlap */
1809 return crush_hash32_2(CRUSH_HASH_RJENKINS1,
1810 ceph_stable_mod(raw_pgid->seed,
1811 pi->pgp_num,
1812 pi->pgp_num_mask),
1813 raw_pgid->pool);
1814 } else {
1815 /*
1816 * legacy behavior: add ps and pool together. this is
1817 * not a great approach because the PGs from each pool
1818 * will overlap on top of each other: 0.5 == 1.4 ==
1819 * 2.3 == ...
1820 */
1821 return ceph_stable_mod(raw_pgid->seed, pi->pgp_num,
1822 pi->pgp_num_mask) +
1823 (unsigned)raw_pgid->pool;
1824 }
1825}
1482 1826
1483static int do_crush(struct ceph_osdmap *map, int ruleno, int x, 1827static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
1484 int *result, int result_max, 1828 int *result, int result_max,
@@ -1497,84 +1841,92 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
1497} 1841}
1498 1842
1499/* 1843/*
1500 * Calculate raw (crush) set for given pgid. 1844 * Calculate raw set (CRUSH output) for given PG. The result may
1845 * contain nonexistent OSDs. ->primary is undefined for a raw set.
1501 * 1846 *
1502 * Return raw set length, or error. 1847 * Placement seed (CRUSH input) is returned through @ppps.
1503 */ 1848 */
1504static int pg_to_raw_osds(struct ceph_osdmap *osdmap, 1849static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
1505 struct ceph_pg_pool_info *pool, 1850 struct ceph_pg_pool_info *pi,
1506 struct ceph_pg pgid, u32 pps, int *osds) 1851 const struct ceph_pg *raw_pgid,
1852 struct ceph_osds *raw,
1853 u32 *ppps)
1507{ 1854{
1855 u32 pps = raw_pg_to_pps(pi, raw_pgid);
1508 int ruleno; 1856 int ruleno;
1509 int len; 1857 int len;
1510 1858
1511 /* crush */ 1859 ceph_osds_init(raw);
1512 ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, 1860 if (ppps)
1513 pool->type, pool->size); 1861 *ppps = pps;
1862
1863 ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type,
1864 pi->size);
1514 if (ruleno < 0) { 1865 if (ruleno < 0) {
1515 pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n", 1866 pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
1516 pgid.pool, pool->crush_ruleset, pool->type, 1867 pi->id, pi->crush_ruleset, pi->type, pi->size);
1517 pool->size); 1868 return;
1518 return -ENOENT;
1519 } 1869 }
1520 1870
1521 len = do_crush(osdmap, ruleno, pps, osds, 1871 len = do_crush(osdmap, ruleno, pps, raw->osds,
1522 min_t(int, pool->size, CEPH_PG_MAX_SIZE), 1872 min_t(int, pi->size, ARRAY_SIZE(raw->osds)),
1523 osdmap->osd_weight, osdmap->max_osd); 1873 osdmap->osd_weight, osdmap->max_osd);
1524 if (len < 0) { 1874 if (len < 0) {
1525 pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", 1875 pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
1526 len, ruleno, pgid.pool, pool->crush_ruleset, 1876 len, ruleno, pi->id, pi->crush_ruleset, pi->type,
1527 pool->type, pool->size); 1877 pi->size);
1528 return len; 1878 return;
1529 } 1879 }
1530 1880
1531 return len; 1881 raw->size = len;
1532} 1882}
1533 1883
1534/* 1884/*
1535 * Given raw set, calculate up set and up primary. 1885 * Given raw set, calculate up set and up primary. By definition of an
1886 * up set, the result won't contain nonexistent or down OSDs.
1536 * 1887 *
1537 * Return up set length. *primary is set to up primary osd id, or -1 1888 * This is done in-place - on return @set is the up set. If it's
1538 * if up set is empty. 1889 * empty, ->primary will remain undefined.
1539 */ 1890 */
1540static int raw_to_up_osds(struct ceph_osdmap *osdmap, 1891static void raw_to_up_osds(struct ceph_osdmap *osdmap,
1541 struct ceph_pg_pool_info *pool, 1892 struct ceph_pg_pool_info *pi,
1542 int *osds, int len, int *primary) 1893 struct ceph_osds *set)
1543{ 1894{
1544 int up_primary = -1;
1545 int i; 1895 int i;
1546 1896
1547 if (ceph_can_shift_osds(pool)) { 1897 /* ->primary is undefined for a raw set */
1898 BUG_ON(set->primary != -1);
1899
1900 if (ceph_can_shift_osds(pi)) {
1548 int removed = 0; 1901 int removed = 0;
1549 1902
1550 for (i = 0; i < len; i++) { 1903 /* shift left */
1551 if (ceph_osd_is_down(osdmap, osds[i])) { 1904 for (i = 0; i < set->size; i++) {
1905 if (ceph_osd_is_down(osdmap, set->osds[i])) {
1552 removed++; 1906 removed++;
1553 continue; 1907 continue;
1554 } 1908 }
1555 if (removed) 1909 if (removed)
1556 osds[i - removed] = osds[i]; 1910 set->osds[i - removed] = set->osds[i];
1557 } 1911 }
1558 1912 set->size -= removed;
1559 len -= removed; 1913 if (set->size > 0)
1560 if (len > 0) 1914 set->primary = set->osds[0];
1561 up_primary = osds[0];
1562 } else { 1915 } else {
1563 for (i = len - 1; i >= 0; i--) { 1916 /* set down/dne devices to NONE */
1564 if (ceph_osd_is_down(osdmap, osds[i])) 1917 for (i = set->size - 1; i >= 0; i--) {
1565 osds[i] = CRUSH_ITEM_NONE; 1918 if (ceph_osd_is_down(osdmap, set->osds[i]))
1919 set->osds[i] = CRUSH_ITEM_NONE;
1566 else 1920 else
1567 up_primary = osds[i]; 1921 set->primary = set->osds[i];
1568 } 1922 }
1569 } 1923 }
1570
1571 *primary = up_primary;
1572 return len;
1573} 1924}
1574 1925
1575static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, 1926static void apply_primary_affinity(struct ceph_osdmap *osdmap,
1576 struct ceph_pg_pool_info *pool, 1927 struct ceph_pg_pool_info *pi,
1577 int *osds, int len, int *primary) 1928 u32 pps,
1929 struct ceph_osds *up)
1578{ 1930{
1579 int i; 1931 int i;
1580 int pos = -1; 1932 int pos = -1;
@@ -1586,8 +1938,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
1586 if (!osdmap->osd_primary_affinity) 1938 if (!osdmap->osd_primary_affinity)
1587 return; 1939 return;
1588 1940
1589 for (i = 0; i < len; i++) { 1941 for (i = 0; i < up->size; i++) {
1590 int osd = osds[i]; 1942 int osd = up->osds[i];
1591 1943
1592 if (osd != CRUSH_ITEM_NONE && 1944 if (osd != CRUSH_ITEM_NONE &&
1593 osdmap->osd_primary_affinity[osd] != 1945 osdmap->osd_primary_affinity[osd] !=
@@ -1595,7 +1947,7 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
1595 break; 1947 break;
1596 } 1948 }
1597 } 1949 }
1598 if (i == len) 1950 if (i == up->size)
1599 return; 1951 return;
1600 1952
1601 /* 1953 /*
@@ -1603,8 +1955,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
1603 * osd into the hash/rng so that a proportional fraction of an 1955 * osd into the hash/rng so that a proportional fraction of an
1604 * osd's pgs get rejected as primary. 1956 * osd's pgs get rejected as primary.
1605 */ 1957 */
1606 for (i = 0; i < len; i++) { 1958 for (i = 0; i < up->size; i++) {
1607 int osd = osds[i]; 1959 int osd = up->osds[i];
1608 u32 aff; 1960 u32 aff;
1609 1961
1610 if (osd == CRUSH_ITEM_NONE) 1962 if (osd == CRUSH_ITEM_NONE)
@@ -1629,135 +1981,110 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
 	if (pos < 0)
 		return;
 
-	*primary = osds[pos];
+	up->primary = up->osds[pos];
 
-	if (ceph_can_shift_osds(pool) && pos > 0) {
+	if (ceph_can_shift_osds(pi) && pos > 0) {
 		/* move the new primary to the front */
 		for (i = pos; i > 0; i--)
-			osds[i] = osds[i - 1];
-		osds[0] = *primary;
+			up->osds[i] = up->osds[i - 1];
+		up->osds[0] = up->primary;
 	}
 }
 
 /*
- * Given up set, apply pg_temp and primary_temp mappings.
+ * Get pg_temp and primary_temp mappings for given PG.
  *
- * Return acting set length. *primary is set to acting primary osd id,
- * or -1 if acting set is empty.
+ * Note that a PG may have none, only pg_temp, only primary_temp or
+ * both pg_temp and primary_temp mappings. This means @temp isn't
+ * always a valid OSD set on return: in the "only primary_temp" case,
+ * @temp will have its ->primary >= 0 but ->size == 0.
  */
-static int apply_temps(struct ceph_osdmap *osdmap,
-		       struct ceph_pg_pool_info *pool, struct ceph_pg pgid,
-		       int *osds, int len, int *primary)
+static void get_temp_osds(struct ceph_osdmap *osdmap,
+			  struct ceph_pg_pool_info *pi,
+			  const struct ceph_pg *raw_pgid,
+			  struct ceph_osds *temp)
 {
+	struct ceph_pg pgid;
 	struct ceph_pg_mapping *pg;
-	int temp_len;
-	int temp_primary;
 	int i;
 
-	/* raw_pg -> pg */
-	pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
-				    pool->pg_num_mask);
+	raw_pg_to_pg(pi, raw_pgid, &pgid);
+	ceph_osds_init(temp);
 
 	/* pg_temp? */
 	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
 	if (pg) {
-		temp_len = 0;
-		temp_primary = -1;
-
 		for (i = 0; i < pg->pg_temp.len; i++) {
 			if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
-				if (ceph_can_shift_osds(pool))
+				if (ceph_can_shift_osds(pi))
 					continue;
-				else
-					osds[temp_len++] = CRUSH_ITEM_NONE;
+
+				temp->osds[temp->size++] = CRUSH_ITEM_NONE;
 			} else {
-				osds[temp_len++] = pg->pg_temp.osds[i];
+				temp->osds[temp->size++] = pg->pg_temp.osds[i];
 			}
 		}
 
 		/* apply pg_temp's primary */
-		for (i = 0; i < temp_len; i++) {
-			if (osds[i] != CRUSH_ITEM_NONE) {
-				temp_primary = osds[i];
+		for (i = 0; i < temp->size; i++) {
+			if (temp->osds[i] != CRUSH_ITEM_NONE) {
+				temp->primary = temp->osds[i];
 				break;
 			}
 		}
-	} else {
-		temp_len = len;
-		temp_primary = *primary;
 	}
 
 	/* primary_temp? */
 	pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
 	if (pg)
-		temp_primary = pg->primary_temp.osd;
-
-	*primary = temp_primary;
-	return temp_len;
+		temp->primary = pg->primary_temp.osd;
 }
 
 /*
- * Calculate acting set for given pgid.
+ * Map a PG to its acting set as well as its up set.
  *
- * Return acting set length, or error. *primary is set to acting
- * primary osd id, or -1 if acting set is empty or on error.
+ * Acting set is used for data mapping purposes, while up set can be
+ * recorded for detecting interval changes and deciding whether to
+ * resend a request.
  */
-int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
-			int *osds, int *primary)
+void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
+			       const struct ceph_pg *raw_pgid,
+			       struct ceph_osds *up,
+			       struct ceph_osds *acting)
 {
-	struct ceph_pg_pool_info *pool;
+	struct ceph_pg_pool_info *pi;
 	u32 pps;
-	int len;
 
-	pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
-	if (!pool) {
-		*primary = -1;
-		return -ENOENT;
+	pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
+	if (!pi) {
+		ceph_osds_init(up);
+		ceph_osds_init(acting);
+		goto out;
 	}
 
-	if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
-		/* hash pool id and seed so that pool PGs do not overlap */
-		pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
-				     ceph_stable_mod(pgid.seed, pool->pgp_num,
-						     pool->pgp_num_mask),
-				     pgid.pool);
-	} else {
-		/*
-		 * legacy behavior: add ps and pool together. this is
-		 * not a great approach because the PGs from each pool
-		 * will overlap on top of each other: 0.5 == 1.4 ==
-		 * 2.3 == ...
-		 */
-		pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
-				      pool->pgp_num_mask) +
-			(unsigned)pgid.pool;
-	}
-
-	len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds);
-	if (len < 0) {
-		*primary = -1;
-		return len;
+	pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps);
+	raw_to_up_osds(osdmap, pi, up);
+	apply_primary_affinity(osdmap, pi, pps, up);
+	get_temp_osds(osdmap, pi, raw_pgid, acting);
+	if (!acting->size) {
+		memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0]));
+		acting->size = up->size;
+		if (acting->primary == -1)
+			acting->primary = up->primary;
 	}
-
-	len = raw_to_up_osds(osdmap, pool, osds, len, primary);
-
-	apply_primary_affinity(osdmap, pps, pool, osds, len, primary);
-
-	len = apply_temps(osdmap, pool, pgid, osds, len, primary);
-
-	return len;
+out:
+	WARN_ON(!osds_valid(up) || !osds_valid(acting));
 }
 
 /*
- * Return primary osd for given pgid, or -1 if none.
+ * Return acting primary for given PG, or -1 if none.
  */
-int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
+int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
+			      const struct ceph_pg *raw_pgid)
 {
-	int osds[CEPH_PG_MAX_SIZE];
-	int primary;
-
-	ceph_calc_pg_acting(osdmap, pgid, osds, &primary);
+	struct ceph_osds up, acting;
 
-	return primary;
+	ceph_pg_to_up_acting_osds(osdmap, raw_pgid, &up, &acting);
+	return acting.primary;
 }
-EXPORT_SYMBOL(ceph_calc_pg_primary);
+EXPORT_SYMBOL(ceph_pg_to_acting_primary);
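Taken together, the new entry points replace the int-array API with ceph_osds pairs. A minimal caller sketch using only the functions shown above, assuming a populated osdmap and a raw (pre-stable_mod) PG id, with error handling and locking elided:

	struct ceph_osds up, acting;

	/*
	 * up = CRUSH result filtered by osd state and primary affinity;
	 * acting = up with pg_temp/primary_temp applied, i.e. the set
	 * that I/O should actually target.
	 */
	ceph_pg_to_up_acting_osds(osdmap, &raw_pgid, &up, &acting);
	if (acting.primary >= 0)
		pr_info("pg %llu.%x -> primary osd%d (%d osds)\n",
			raw_pgid.pool, raw_pgid.seed,
			acting.primary, acting.size);

Because the up set is returned alongside the acting set in one call, a client can record both and later detect interval changes (and decide whether to resend a request) without recomputing the CRUSH mapping.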