-rw-r--r--  drivers/block/rbd.c                 305
-rw-r--r--  fs/ceph/addr.c                      214
-rw-r--r--  fs/ceph/cache.c                       2
-rw-r--r--  fs/ceph/caps.c                       51
-rw-r--r--  fs/ceph/debugfs.c                     2
-rw-r--r--  fs/ceph/dir.c                       376
-rw-r--r--  fs/ceph/file.c                       89
-rw-r--r--  fs/ceph/inode.c                     159
-rw-r--r--  fs/ceph/ioctl.c                      14
-rw-r--r--  fs/ceph/mds_client.c                140
-rw-r--r--  fs/ceph/mds_client.h                 17
-rw-r--r--  fs/ceph/mdsmap.c                     43
-rw-r--r--  fs/ceph/super.c                      47
-rw-r--r--  fs/ceph/super.h                      12
-rw-r--r--  fs/ceph/xattr.c                      25
-rw-r--r--  include/linux/ceph/ceph_frag.h        4
-rw-r--r--  include/linux/ceph/ceph_fs.h         20
-rw-r--r--  include/linux/ceph/decode.h           2
-rw-r--r--  include/linux/ceph/libceph.h         57
-rw-r--r--  include/linux/ceph/mon_client.h      23
-rw-r--r--  include/linux/ceph/osd_client.h     231
-rw-r--r--  include/linux/ceph/osdmap.h         158
-rw-r--r--  include/linux/ceph/rados.h           34
-rw-r--r--  net/ceph/ceph_common.c                2
-rw-r--r--  net/ceph/ceph_strings.c              16
-rw-r--r--  net/ceph/debugfs.c                  147
-rw-r--r--  net/ceph/mon_client.c               393
-rw-r--r--  net/ceph/osd_client.c              4032
-rw-r--r--  net/ceph/osdmap.c                   651
29 files changed, 4758 insertions, 2508 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 0ede6d7e2568..81666a56415e 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -350,12 +350,12 @@ struct rbd_device {
 	struct rbd_spec		*spec;
 	struct rbd_options	*opts;
 
-	char			*header_name;
+	struct ceph_object_id	header_oid;
+	struct ceph_object_locator header_oloc;
 
 	struct ceph_file_layout	layout;
 
-	struct ceph_osd_event	*watch_event;
-	struct rbd_obj_request	*watch_request;
+	struct ceph_osd_linger_request *watch_handle;
 
 	struct rbd_spec		*parent_spec;
 	u64			parent_overlap;
@@ -1596,12 +1596,6 @@ static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
 	return __rbd_obj_request_wait(obj_request, 0);
 }
 
-static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request,
-					unsigned long timeout)
-{
-	return __rbd_obj_request_wait(obj_request, timeout);
-}
-
 static void rbd_img_request_complete(struct rbd_img_request *img_request)
 {
 
@@ -1751,12 +1745,6 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
 	complete_all(&obj_request->completion);
 }
 
-static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
-{
-	dout("%s: obj %p\n", __func__, obj_request);
-	obj_request_done_set(obj_request);
-}
-
 static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
 {
 	struct rbd_img_request *img_request = NULL;
@@ -1828,13 +1816,12 @@ static void rbd_osd_call_callback(struct rbd_obj_request *obj_request)
 	obj_request_done_set(obj_request);
 }
 
-static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
-				 struct ceph_msg *msg)
+static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
 {
 	struct rbd_obj_request *obj_request = osd_req->r_priv;
 	u16 opcode;
 
-	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
+	dout("%s: osd_req %p\n", __func__, osd_req);
 	rbd_assert(osd_req == obj_request->osd_req);
 	if (obj_request_img_data_test(obj_request)) {
 		rbd_assert(obj_request->img_request);
@@ -1878,10 +1865,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
 	case CEPH_OSD_OP_CALL:
 		rbd_osd_call_callback(obj_request);
 		break;
-	case CEPH_OSD_OP_NOTIFY_ACK:
-	case CEPH_OSD_OP_WATCH:
-		rbd_osd_trivial_callback(obj_request);
-		break;
 	default:
 		rbd_warn(NULL, "%s: unsupported op %hu",
 			obj_request->object_name, (unsigned short) opcode);
@@ -1896,27 +1879,17 @@ static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
 {
 	struct rbd_img_request *img_request = obj_request->img_request;
 	struct ceph_osd_request *osd_req = obj_request->osd_req;
-	u64 snap_id;
 
-	rbd_assert(osd_req != NULL);
-
-	snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
-	ceph_osdc_build_request(osd_req, obj_request->offset,
-			NULL, snap_id, NULL);
+	if (img_request)
+		osd_req->r_snapid = img_request->snap_id;
 }
 
 static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
 {
-	struct rbd_img_request *img_request = obj_request->img_request;
 	struct ceph_osd_request *osd_req = obj_request->osd_req;
-	struct ceph_snap_context *snapc;
-	struct timespec mtime = CURRENT_TIME;
 
-	rbd_assert(osd_req != NULL);
-
-	snapc = img_request ? img_request->snapc : NULL;
-	ceph_osdc_build_request(osd_req, obj_request->offset,
-			snapc, CEPH_NOSNAP, &mtime);
+	osd_req->r_mtime = CURRENT_TIME;
+	osd_req->r_data_offset = obj_request->offset;
 }
 
 /*
@@ -1954,7 +1927,7 @@ static struct ceph_osd_request *rbd_osd_req_create(
 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
 					  GFP_NOIO);
 	if (!osd_req)
-		return NULL;	/* ENOMEM */
+		goto fail;
 
 	if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
@@ -1965,9 +1938,18 @@ static struct ceph_osd_request *rbd_osd_req_create(
 	osd_req->r_priv = obj_request;
 
 	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
-	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
+	if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
+			     obj_request->object_name))
+		goto fail;
+
+	if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
+		goto fail;
 
 	return osd_req;
+
+fail:
+	ceph_osdc_put_request(osd_req);
+	return NULL;
 }
 
 /*
@@ -2003,16 +1985,25 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops,
 					  false, GFP_NOIO);
 	if (!osd_req)
-		return NULL;	/* ENOMEM */
+		goto fail;
 
 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
 	osd_req->r_callback = rbd_osd_req_callback;
 	osd_req->r_priv = obj_request;
 
 	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
-	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
+	if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
+			     obj_request->object_name))
+		goto fail;
+
+	if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
+		goto fail;
 
 	return osd_req;
+
+fail:
+	ceph_osdc_put_request(osd_req);
+	return NULL;
 }
 
 
@@ -2973,17 +2964,20 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request)
 {
 	struct rbd_obj_request *obj_request;
 	struct rbd_obj_request *next_obj_request;
+	int ret = 0;
 
 	dout("%s: img %p\n", __func__, img_request);
-	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
-		int ret;
 
+	rbd_img_request_get(img_request);
+	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
 		ret = rbd_img_obj_request_submit(obj_request);
 		if (ret)
-			return ret;
+			goto out_put_ireq;
 	}
 
-	return 0;
+out_put_ireq:
+	rbd_img_request_put(img_request);
+	return ret;
 }
 
 static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
@@ -3090,45 +3084,18 @@ out_err:
 	obj_request_done_set(obj_request);
 }
 
-static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
-{
-	struct rbd_obj_request *obj_request;
-	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
-	int ret;
-
-	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
-					     OBJ_REQUEST_NODATA);
-	if (!obj_request)
-		return -ENOMEM;
-
-	ret = -ENOMEM;
-	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
-						  obj_request);
-	if (!obj_request->osd_req)
-		goto out;
-
-	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
-			      notify_id, 0, 0);
-	rbd_osd_req_format_read(obj_request);
+static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev);
+static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev);
 
-	ret = rbd_obj_request_submit(osdc, obj_request);
-	if (ret)
-		goto out;
-	ret = rbd_obj_request_wait(obj_request);
-out:
-	rbd_obj_request_put(obj_request);
-
-	return ret;
-}
-
-static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
+static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
			 u64 notifier_id, void *data, size_t data_len)
 {
-	struct rbd_device *rbd_dev = (struct rbd_device *)data;
+	struct rbd_device *rbd_dev = arg;
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
 	int ret;
 
-	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
-	     rbd_dev->header_name, (unsigned long long)notify_id,
-	     (unsigned int)opcode);
+	dout("%s rbd_dev %p cookie %llu notify_id %llu\n", __func__, rbd_dev,
+	     cookie, notify_id);
 
 	/*
 	 * Until adequate refresh error handling is in place, there is
@@ -3140,63 +3107,31 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 	if (ret)
 		rbd_warn(rbd_dev, "refresh failed: %d", ret);
 
-	ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id);
+	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
+				   &rbd_dev->header_oloc, notify_id, cookie,
+				   NULL, 0);
 	if (ret)
 		rbd_warn(rbd_dev, "notify_ack ret %d", ret);
 }
 
-/*
- * Send a (un)watch request and wait for the ack.  Return a request
- * with a ref held on success or error.
- */
-static struct rbd_obj_request *rbd_obj_watch_request_helper(
-						struct rbd_device *rbd_dev,
-						bool watch)
+static void rbd_watch_errcb(void *arg, u64 cookie, int err)
 {
-	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
-	struct ceph_options *opts = osdc->client->options;
-	struct rbd_obj_request *obj_request;
+	struct rbd_device *rbd_dev = arg;
 	int ret;
 
-	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
-					     OBJ_REQUEST_NODATA);
-	if (!obj_request)
-		return ERR_PTR(-ENOMEM);
-
-	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1,
-						  obj_request);
-	if (!obj_request->osd_req) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
-			      rbd_dev->watch_event->cookie, 0, watch);
-	rbd_osd_req_format_write(obj_request);
+	rbd_warn(rbd_dev, "encountered watch error: %d", err);
 
-	if (watch)
-		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
-
-	ret = rbd_obj_request_submit(osdc, obj_request);
-	if (ret)
-		goto out;
+	__rbd_dev_header_unwatch_sync(rbd_dev);
 
-	ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout);
-	if (ret)
-		goto out;
-
-	ret = obj_request->result;
+	ret = rbd_dev_header_watch_sync(rbd_dev);
 	if (ret) {
-		if (watch)
-			rbd_obj_request_end(obj_request);
-		goto out;
+		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
+		return;
 	}
 
-	return obj_request;
-
-out:
-	rbd_obj_request_put(obj_request);
-	return ERR_PTR(ret);
+	ret = rbd_dev_refresh(rbd_dev);
+	if (ret)
+		rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret);
 }
 
 /*
@@ -3205,35 +3140,33 @@ out:
 static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
 {
 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
-	struct rbd_obj_request *obj_request;
-	int ret;
+	struct ceph_osd_linger_request *handle;
 
-	rbd_assert(!rbd_dev->watch_event);
-	rbd_assert(!rbd_dev->watch_request);
+	rbd_assert(!rbd_dev->watch_handle);
 
-	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
-				     &rbd_dev->watch_event);
-	if (ret < 0)
-		return ret;
+	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, rbd_watch_cb,
				 rbd_watch_errcb, rbd_dev);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
 
-	obj_request = rbd_obj_watch_request_helper(rbd_dev, true);
-	if (IS_ERR(obj_request)) {
-		ceph_osdc_cancel_event(rbd_dev->watch_event);
-		rbd_dev->watch_event = NULL;
-		return PTR_ERR(obj_request);
-	}
+	rbd_dev->watch_handle = handle;
+	return 0;
+}
 
-	/*
-	 * A watch request is set to linger, so the underlying osd
-	 * request won't go away until we unregister it.  We retain
-	 * a pointer to the object request during that time (in
-	 * rbd_dev->watch_request), so we'll keep a reference to it.
-	 * We'll drop that reference after we've unregistered it in
-	 * rbd_dev_header_unwatch_sync().
-	 */
-	rbd_dev->watch_request = obj_request;
+static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	int ret;
 
-	return 0;
+	if (!rbd_dev->watch_handle)
+		return;
+
+	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
+	if (ret)
+		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
+
+	rbd_dev->watch_handle = NULL;
 }
 
 /*
@@ -3241,24 +3174,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
  */
 static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
 {
-	struct rbd_obj_request *obj_request;
-
-	rbd_assert(rbd_dev->watch_event);
-	rbd_assert(rbd_dev->watch_request);
-
-	rbd_obj_request_end(rbd_dev->watch_request);
-	rbd_obj_request_put(rbd_dev->watch_request);
-	rbd_dev->watch_request = NULL;
-
-	obj_request = rbd_obj_watch_request_helper(rbd_dev, false);
-	if (!IS_ERR(obj_request))
-		rbd_obj_request_put(obj_request);
-	else
-		rbd_warn(rbd_dev, "unable to tear down watch request (%ld)",
-			 PTR_ERR(obj_request));
-
-	ceph_osdc_cancel_event(rbd_dev->watch_event);
-	rbd_dev->watch_event = NULL;
+	__rbd_dev_header_unwatch_sync(rbd_dev);
 
 	dout("%s flushing notifies\n", __func__);
 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
@@ -3591,7 +3507,7 @@ static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
 	if (!ondisk)
 		return -ENOMEM;
 
-	ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_oid.name,
 				0, size, ondisk);
 	if (ret < 0)
 		goto out;
@@ -4033,6 +3949,8 @@ static void rbd_dev_release(struct device *dev)
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 	bool need_put = !!rbd_dev->opts;
 
+	ceph_oid_destroy(&rbd_dev->header_oid);
+
 	rbd_put_client(rbd_dev->rbd_client);
 	rbd_spec_put(rbd_dev->spec);
 	kfree(rbd_dev->opts);
@@ -4063,6 +3981,9 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
 	INIT_LIST_HEAD(&rbd_dev->node);
 	init_rwsem(&rbd_dev->header_rwsem);
 
+	ceph_oid_init(&rbd_dev->header_oid);
+	ceph_oloc_init(&rbd_dev->header_oloc);
+
 	rbd_dev->dev.bus = &rbd_bus_type;
 	rbd_dev->dev.type = &rbd_device_type;
 	rbd_dev->dev.parent = &rbd_root_dev;
@@ -4111,7 +4032,7 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
 		__le64 size;
 	} __attribute__ ((packed)) size_buf = { 0 };
 
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				"rbd", "get_size",
 				&snapid, sizeof (snapid),
 				&size_buf, sizeof (size_buf));
@@ -4151,7 +4072,7 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
 	if (!reply_buf)
 		return -ENOMEM;
 
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				"rbd", "get_object_prefix", NULL, 0,
 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
@@ -4186,7 +4107,7 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 	u64 unsup;
 	int ret;
 
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				"rbd", "get_features",
 				&snapid, sizeof (snapid),
 				&features_buf, sizeof (features_buf));
@@ -4248,7 +4169,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
 	}
 
 	snapid = cpu_to_le64(rbd_dev->spec->snap_id);
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				"rbd", "get_parent",
 				&snapid, sizeof (snapid),
 				reply_buf, size);
@@ -4351,7 +4272,7 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
 	u64 stripe_count;
 	int ret;
 
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				"rbd", "get_stripe_unit_count", NULL, 0,
 				(char *)&striping_info_buf, size);
 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
@@ -4599,7 +4520,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
 	if (!reply_buf)
 		return -ENOMEM;
 
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				"rbd", "get_snapcontext", NULL, 0,
 				reply_buf, size);
 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
@@ -4664,7 +4585,7 @@ static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
 		return ERR_PTR(-ENOMEM);
 
 	snapid = cpu_to_le64(snap_id);
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				"rbd", "get_snapshot_name",
 				&snapid, sizeof (snapid),
 				reply_buf, size);
@@ -4975,13 +4896,13 @@ static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
 again:
 	ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
 	if (ret == -ENOENT && tries++ < 1) {
-		ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap",
-					       &newest_epoch);
+		ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap",
+					    &newest_epoch);
 		if (ret < 0)
 			return ret;
 
 		if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
-			ceph_monc_request_next_osdmap(&rbdc->client->monc);
+			ceph_osdc_maybe_request_map(&rbdc->client->osdc);
 			(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
 					newest_epoch,
 					opts->mount_timeout);
@@ -5260,35 +5181,26 @@ err_out_unlock:
 static int rbd_dev_header_name(struct rbd_device *rbd_dev)
 {
 	struct rbd_spec *spec = rbd_dev->spec;
-	size_t size;
+	int ret;
 
 	/* Record the header object name for this rbd image. */
 
 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 
+	rbd_dev->header_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
 	if (rbd_dev->image_format == 1)
-		size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
+		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
				       spec->image_name, RBD_SUFFIX);
 	else
-		size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
-
-	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
-	if (!rbd_dev->header_name)
-		return -ENOMEM;
+		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
				       RBD_HEADER_PREFIX, spec->image_id);
 
-	if (rbd_dev->image_format == 1)
-		sprintf(rbd_dev->header_name, "%s%s",
-			spec->image_name, RBD_SUFFIX);
-	else
-		sprintf(rbd_dev->header_name, "%s%s",
-			RBD_HEADER_PREFIX, spec->image_id);
-	return 0;
+	return ret;
 }
 
 static void rbd_dev_image_release(struct rbd_device *rbd_dev)
 {
 	rbd_dev_unprobe(rbd_dev);
-	kfree(rbd_dev->header_name);
-	rbd_dev->header_name = NULL;
 	rbd_dev->image_format = 0;
 	kfree(rbd_dev->spec->image_id);
 	rbd_dev->spec->image_id = NULL;
@@ -5327,7 +5239,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
 			pr_info("image %s/%s does not exist\n",
 				rbd_dev->spec->pool_name,
 				rbd_dev->spec->image_name);
-			goto out_header_name;
+			goto err_out_format;
 		}
 	}
 
@@ -5373,7 +5285,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
 		goto err_out_probe;
 
 	dout("discovered format %u image, header name is %s\n",
-		rbd_dev->image_format, rbd_dev->header_name);
+		rbd_dev->image_format, rbd_dev->header_oid.name);
 	return 0;
 
 err_out_probe:
@@ -5381,9 +5293,6 @@
 err_out_watch:
 	if (!depth)
 		rbd_dev_header_unwatch_sync(rbd_dev);
-out_header_name:
-	kfree(rbd_dev->header_name);
-	rbd_dev->header_name = NULL;
 err_out_format:
 	rbd_dev->image_format = 0;
 	kfree(rbd_dev->spec->image_id);
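
Note: the rbd.c changes above replace the old watch_event/watch_request
machinery with libceph's linger API.  A minimal sketch of the lifecycle as
rbd now drives it, restating only calls that appear in this diff (the
surrounding control flow is abbreviated and illustrative):

	/* register: one call returns an opaque linger handle */
	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, rbd_watch_cb,
				 rbd_watch_errcb, rbd_dev);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* in rbd_watch_cb: acknowledge each notify by (notify_id, cookie) */
	ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
			     &rbd_dev->header_oloc, notify_id, cookie,
			     NULL, 0);

	/* teardown: one call replaces the unwatch request + cancel_event */
	ceph_osdc_unwatch(osdc, handle);
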
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 43098cd9602b..eeb71e5de27a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -257,12 +257,12 @@ static int ceph_readpage(struct file *filp, struct page *page)
 /*
  * Finish an async read(ahead) op.
  */
-static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
+static void finish_read(struct ceph_osd_request *req)
 {
 	struct inode *inode = req->r_inode;
 	struct ceph_osd_data *osd_data;
-	int rc = req->r_result;
-	int bytes = le32_to_cpu(msg->hdr.data_len);
+	int rc = req->r_result <= 0 ? req->r_result : 0;
+	int bytes = req->r_result >= 0 ? req->r_result : 0;
 	int num_pages;
 	int i;
 
@@ -376,8 +376,6 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
 	req->r_callback = finish_read;
 	req->r_inode = inode;
 
-	ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
-
 	dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
 	ret = ceph_osdc_start_request(osdc, req, false);
 	if (ret < 0)
@@ -546,11 +544,21 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 				   truncate_seq, truncate_size,
 				   &inode->i_mtime, &page, 1);
 	if (err < 0) {
-		dout("writepage setting page/mapping error %d %p\n", err, page);
+		struct writeback_control tmp_wbc;
+		if (!wbc)
+			wbc = &tmp_wbc;
+		if (err == -ERESTARTSYS) {
+			/* killed by SIGKILL */
+			dout("writepage interrupted page %p\n", page);
+			redirty_page_for_writepage(wbc, page);
+			end_page_writeback(page);
+			goto out;
+		}
+		dout("writepage setting page/mapping error %d %p\n",
+		     err, page);
 		SetPageError(page);
 		mapping_set_error(&inode->i_data, err);
-		if (wbc)
-			wbc->pages_skipped++;
+		wbc->pages_skipped++;
 	} else {
 		dout("writepage cleaned page %p\n", page);
 		err = 0;  /* vfs expects us to return 0 */
@@ -571,12 +579,16 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
 	BUG_ON(!inode);
 	ihold(inode);
 	err = writepage_nounlock(page, wbc);
+	if (err == -ERESTARTSYS) {
+		/* direct memory reclaimer was killed by SIGKILL. return 0
+		 * to prevent caller from setting mapping/page error */
+		err = 0;
+	}
 	unlock_page(page);
 	iput(inode);
 	return err;
 }
 
-
 /*
  * lame release_pages helper.  release_pages() isn't exported to
  * modules.
@@ -600,8 +612,7 @@ static void ceph_release_pages(struct page **pages, int num)
  * If we get an error, set the mapping error bit, but not the individual
  * page error bits.
  */
-static void writepages_finish(struct ceph_osd_request *req,
-			      struct ceph_msg *msg)
+static void writepages_finish(struct ceph_osd_request *req)
 {
 	struct inode *inode = req->r_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
@@ -615,7 +626,6 @@ static void writepages_finish(struct ceph_osd_request *req,
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	bool remove_page;
 
-
 	dout("writepages_finish %p rc %d\n", inode, rc);
 	if (rc < 0)
 		mapping_set_error(mapping, rc);
@@ -650,6 +660,9 @@
 			clear_bdi_congested(&fsc->backing_dev_info,
 					    BLK_RW_ASYNC);
 
+		if (rc < 0)
+			SetPageError(page);
+
 		ceph_put_snap_context(page_snap_context(page));
 		page->private = 0;
 		ClearPagePrivate(page);
@@ -718,8 +731,11 @@ static int ceph_writepages_start(struct address_space *mapping,
 	     (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
 
 	if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
-		pr_warn("writepage_start %p on forced umount\n", inode);
-		truncate_pagecache(inode, 0);
+		if (ci->i_wrbuffer_ref > 0) {
+			pr_warn_ratelimited(
+				"writepage_start %p %lld forced umount\n",
+				inode, ceph_ino(inode));
+		}
 		mapping_set_error(mapping, -EIO);
 		return -EIO; /* we're in a forced umount, don't write! */
 	}
@@ -1063,10 +1079,7 @@ new_request:
 			pages = NULL;
 		}
 
-		vino = ceph_vino(inode);
-		ceph_osdc_build_request(req, offset, snapc, vino.snap,
-					&inode->i_mtime);
-
+		req->r_mtime = inode->i_mtime;
 		rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
 		BUG_ON(rc);
 		req = NULL;
@@ -1099,8 +1112,7 @@ release_pvec_pages:
 		mapping->writeback_index = index;
 
 out:
-	if (req)
-		ceph_osdc_put_request(req);
+	ceph_osdc_put_request(req);
 	ceph_put_snap_context(snapc);
 	dout("writepages done, rc = %d\n", rc);
 	return rc;
@@ -1134,6 +1146,7 @@ static int ceph_update_writeable_page(struct file *file,
 			    struct page *page)
 {
 	struct inode *inode = file_inode(file);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	loff_t page_off = pos & PAGE_MASK;
 	int pos_in_page = pos & ~PAGE_MASK;
@@ -1142,6 +1155,12 @@
 	int r;
 	struct ceph_snap_context *snapc, *oldest;
 
+	if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+		dout(" page %p forced umount\n", page);
+		unlock_page(page);
+		return -EIO;
+	}
+
retry_locked:
 	/* writepages currently holds page lock, but if we change that later, */
 	wait_on_page_writeback(page);
@@ -1165,7 +1184,7 @@ retry_locked:
 		snapc = ceph_get_snap_context(snapc);
 		unlock_page(page);
 		ceph_queue_writeback(inode);
-		r = wait_event_interruptible(ci->i_cap_wq,
+		r = wait_event_killable(ci->i_cap_wq,
		       context_is_writeable_or_written(inode, snapc));
 		ceph_put_snap_context(snapc);
 		if (r == -ERESTARTSYS)
@@ -1311,6 +1330,17 @@ const struct address_space_operations ceph_aops = {
 	.direct_IO = ceph_direct_io,
 };
 
+static void ceph_block_sigs(sigset_t *oldset)
+{
+	sigset_t mask;
+	siginitsetinv(&mask, sigmask(SIGKILL));
+	sigprocmask(SIG_BLOCK, &mask, oldset);
+}
+
+static void ceph_restore_sigs(sigset_t *oldset)
+{
+	sigprocmask(SIG_SETMASK, oldset, NULL);
+}
 
 /*
  * vm ops
@@ -1323,6 +1353,9 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct page *pinned_page = NULL;
 	loff_t off = vmf->pgoff << PAGE_SHIFT;
 	int want, got, ret;
+	sigset_t oldset;
+
+	ceph_block_sigs(&oldset);
 
 	dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n",
 	     inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE);
@@ -1330,17 +1363,12 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
 	else
 		want = CEPH_CAP_FILE_CACHE;
-	while (1) {
-		got = 0;
-		ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want,
-				    -1, &got, &pinned_page);
-		if (ret == 0)
-			break;
-		if (ret != -ERESTARTSYS) {
-			WARN_ON(1);
-			return VM_FAULT_SIGBUS;
-		}
-	}
+
+	got = 0;
+	ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page);
+	if (ret < 0)
+		goto out_restore;
+
 	dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
 	     inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got));
 
@@ -1357,7 +1385,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	ceph_put_cap_refs(ci, got);
 
 	if (ret != -EAGAIN)
-		return ret;
+		goto out_restore;
 
 	/* read inline data */
 	if (off >= PAGE_SIZE) {
@@ -1371,15 +1399,18 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 					~__GFP_FS));
 		if (!page) {
 			ret = VM_FAULT_OOM;
-			goto out;
+			goto out_inline;
 		}
 		ret1 = __ceph_do_getattr(inode, page,
 					 CEPH_STAT_CAP_INLINE_DATA, true);
 		if (ret1 < 0 || off >= i_size_read(inode)) {
 			unlock_page(page);
 			put_page(page);
-			ret = VM_FAULT_SIGBUS;
-			goto out;
+			if (ret1 < 0)
+				ret = ret1;
+			else
+				ret = VM_FAULT_SIGBUS;
+			goto out_inline;
 		}
 		if (ret1 < PAGE_SIZE)
 			zero_user_segment(page, ret1, PAGE_SIZE);
@@ -1388,10 +1419,15 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		SetPageUptodate(page);
 		vmf->page = page;
 		ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
+out_inline:
+		dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
+		     inode, off, (size_t)PAGE_SIZE, ret);
 	}
-out:
-	dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
-	     inode, off, (size_t)PAGE_SIZE, ret);
+out_restore:
+	ceph_restore_sigs(&oldset);
+	if (ret < 0)
+		ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
+
 	return ret;
 }
 
@@ -1409,10 +1445,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	loff_t size = i_size_read(inode);
 	size_t len;
 	int want, got, ret;
+	sigset_t oldset;
 
 	prealloc_cf = ceph_alloc_cap_flush();
 	if (!prealloc_cf)
-		return VM_FAULT_SIGBUS;
+		return VM_FAULT_OOM;
+
+	ceph_block_sigs(&oldset);
 
 	if (ci->i_inline_version != CEPH_INLINE_NONE) {
 		struct page *locked_page = NULL;
@@ -1423,10 +1462,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		ret = ceph_uninline_data(vma->vm_file, locked_page);
 		if (locked_page)
 			unlock_page(locked_page);
-		if (ret < 0) {
-			ret = VM_FAULT_SIGBUS;
+		if (ret < 0)
 			goto out_free;
-		}
 	}
 
 	if (off + PAGE_SIZE <= size)
@@ -1440,45 +1477,36 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
 	else
 		want = CEPH_CAP_FILE_BUFFER;
-	while (1) {
-		got = 0;
-		ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
-				    &got, NULL);
-		if (ret == 0)
-			break;
-		if (ret != -ERESTARTSYS) {
-			WARN_ON(1);
-			ret = VM_FAULT_SIGBUS;
-			goto out_free;
-		}
-	}
+
+	got = 0;
+	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
+			    &got, NULL);
+	if (ret < 0)
+		goto out_free;
+
 	dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
 	     inode, off, len, ceph_cap_string(got));
 
 	/* Update time before taking page lock */
 	file_update_time(vma->vm_file);
 
-	lock_page(page);
+	do {
+		lock_page(page);
 
-	ret = VM_FAULT_NOPAGE;
-	if ((off > size) ||
-	    (page->mapping != inode->i_mapping)) {
-		unlock_page(page);
-		goto out;
-	}
+		if ((off > size) || (page->mapping != inode->i_mapping)) {
+			unlock_page(page);
+			ret = VM_FAULT_NOPAGE;
+			break;
+		}
+
+		ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
+		if (ret >= 0) {
+			/* success.  we'll keep the page locked. */
+			set_page_dirty(page);
+			ret = VM_FAULT_LOCKED;
+		}
+	} while (ret == -EAGAIN);
 
-	ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
-	if (ret >= 0) {
-		/* success.  we'll keep the page locked. */
-		set_page_dirty(page);
-		ret = VM_FAULT_LOCKED;
-	} else {
-		if (ret == -ENOMEM)
-			ret = VM_FAULT_OOM;
-		else
-			ret = VM_FAULT_SIGBUS;
-	}
-out:
 	if (ret == VM_FAULT_LOCKED ||
 	    ci->i_inline_version != CEPH_INLINE_NONE) {
 		int dirty;
@@ -1495,8 +1523,10 @@ out:
 	     inode, off, len, ceph_cap_string(got), ret);
 	ceph_put_cap_refs(ci, got);
out_free:
+	ceph_restore_sigs(&oldset);
 	ceph_free_cap_flush(prealloc_cf);
-
+	if (ret < 0)
+		ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
 	return ret;
 }
 
@@ -1614,7 +1644,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
 		goto out;
 	}
 
-	ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
+	req->r_mtime = inode->i_mtime;
 	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
 	if (!err)
 		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1657,7 +1687,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
 		goto out_put;
 	}
 
-	ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
+	req->r_mtime = inode->i_mtime;
 	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
 	if (!err)
 		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1758,9 +1788,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 	rd_req->r_flags = CEPH_OSD_FLAG_READ;
 	osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
 	rd_req->r_base_oloc.pool = pool;
-	snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name),
-		 "%llx.00000000", ci->i_vino.ino);
-	rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name);
+	ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino);
+
+	err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS);
+	if (err)
+		goto out_unlock;
 
 	wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
 					 1, false, GFP_NOFS);
@@ -1769,11 +1801,14 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 		goto out_unlock;
 	}
 
-	wr_req->r_flags = CEPH_OSD_FLAG_WRITE |
-			  CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+	wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK;
 	osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
-	wr_req->r_base_oloc.pool = pool;
-	wr_req->r_base_oid = rd_req->r_base_oid;
+	ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
+	ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
+
+	err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS);
+	if (err)
+		goto out_unlock;
 
 	/* one page should be large enough for STAT data */
 	pages = ceph_alloc_page_vector(1, GFP_KERNEL);
@@ -1784,12 +1819,9 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 
 	osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
 				     0, false, true);
-	ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP,
-				&ci->vfs_inode.i_mtime);
 	err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
 
-	ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP,
-				&ci->vfs_inode.i_mtime);
+	wr_req->r_mtime = ci->vfs_inode.i_mtime;
 	err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
 
 	if (!err)
@@ -1823,10 +1855,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
out_unlock:
 	up_write(&mdsc->pool_perm_rwsem);
 
-	if (rd_req)
-		ceph_osdc_put_request(rd_req);
-	if (wr_req)
-		ceph_osdc_put_request(wr_req);
+	ceph_osdc_put_request(rd_req);
+	ceph_osdc_put_request(wr_req);
out:
 	if (!err)
 		err = have;
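
Note: the recurring change in fs/ceph/addr.c is the removal of
ceph_osdc_build_request(): its arguments become request fields (r_mtime,
r_snapid, r_data_offset), and callers that fill in r_base_oid/r_base_oloc
themselves must call ceph_osdc_alloc_messages() before submitting.  A
condensed sketch of the new pattern, modeled on __ceph_pool_perm_get()
above (osdc, pool and ino are assumed to be in scope; error paths
abbreviated):

	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOFS);
	if (!req)
		return -ENOMEM;

	req->r_flags = CEPH_OSD_FLAG_READ;
	osd_req_op_init(req, 0, CEPH_OSD_OP_STAT, 0);
	req->r_base_oloc.pool = pool;
	ceph_oid_printf(&req->r_base_oid, "%llx.00000000", ino);

	/* explicit message allocation replaces ceph_osdc_build_request() */
	err = ceph_osdc_alloc_messages(req, GFP_NOFS);
	if (err)
		goto out_put;

	/* a write request would set the mtime directly:
	 *	req->r_mtime = inode->i_mtime;			*/
	err = ceph_osdc_start_request(osdc, req, false);
	if (!err)
		err = ceph_osdc_wait_request(osdc, req);
out_put:
	ceph_osdc_put_request(req);
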
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index a351480dbabc..c052b5bf219b 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -236,7 +236,7 @@ static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int
 	unlock_page(page);
 }
 
-static inline int cache_valid(struct ceph_inode_info *ci)
+static inline bool cache_valid(struct ceph_inode_info *ci)
 {
 	return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) &&
 		(ci->i_fscache_gen == ci->i_rdcache_gen));
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index cfaeef18cbca..c17b5d76d75e 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1656,7 +1656,7 @@ retry_locked:
 	 */
 	if ((!is_delayed || mdsc->stopping) &&
 	    !S_ISDIR(inode->i_mode) &&		/* ignore readdir cache */
-	    ci->i_wrbuffer_ref == 0 &&		/* no dirty pages... */
+	    !(ci->i_wb_ref || ci->i_wrbuffer_ref) &&  /* no dirty pages... */
 	    inode->i_data.nrpages &&		/* have cached pages */
 	    (revoking & (CEPH_CAP_FILE_CACHE|
 			 CEPH_CAP_FILE_LAZYIO)) && /*  or revoking cache */
@@ -1698,8 +1698,8 @@
 
 		revoking = cap->implemented & ~cap->issued;
 		dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
-		     cap->mds, cap, ceph_cap_string(cap->issued),
-		     ceph_cap_string(cap_used),
+		     cap->mds, cap, ceph_cap_string(cap_used),
+		     ceph_cap_string(cap->issued),
 		     ceph_cap_string(cap->implemented),
 		     ceph_cap_string(revoking));
 
@@ -2317,7 +2317,7 @@ again:
 
 	/* make sure file is actually open */
 	file_wanted = __ceph_caps_file_wanted(ci);
-	if ((file_wanted & need) == 0) {
+	if ((file_wanted & need) != need) {
 		dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
 		     ceph_cap_string(need), ceph_cap_string(file_wanted));
 		*err = -EBADF;
@@ -2412,12 +2412,26 @@ again:
 		goto out_unlock;
 	}
 
-	if (!__ceph_is_any_caps(ci) &&
-	    ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
-		dout("get_cap_refs %p forced umount\n", inode);
-		*err = -EIO;
-		ret = 1;
-		goto out_unlock;
+	if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
+		int mds_wanted;
+		if (ACCESS_ONCE(mdsc->fsc->mount_state) ==
+		    CEPH_MOUNT_SHUTDOWN) {
+			dout("get_cap_refs %p forced umount\n", inode);
+			*err = -EIO;
+			ret = 1;
+			goto out_unlock;
+		}
+		mds_wanted = __ceph_caps_mds_wanted(ci);
+		if ((mds_wanted & need) != need) {
+			dout("get_cap_refs %p caps were dropped"
+			     " (session killed?)\n", inode);
+			*err = -ESTALE;
+			ret = 1;
+			goto out_unlock;
+		}
+		if ((mds_wanted & file_wanted) ==
+		    (file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
+			ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
 	}
 
 	dout("get_cap_refs %p have %s needed %s\n", inode,
@@ -2487,7 +2501,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
 			if (err == -EAGAIN)
 				continue;
 			if (err < 0)
-				return err;
+				ret = err;
 		} else {
 			ret = wait_event_interruptible(ci->i_cap_wq,
 					try_get_cap_refs(ci, need, want, endoff,
@@ -2496,8 +2510,15 @@
 				continue;
 			if (err < 0)
 				ret = err;
-			if (ret < 0)
-				return ret;
+		}
+		if (ret < 0) {
+			if (err == -ESTALE) {
+				/* session was killed, try renew caps */
+				ret = ceph_renew_caps(&ci->vfs_inode);
+				if (ret == 0)
+					continue;
+			}
+			return ret;
 		}
 
 		if (ci->i_inline_version != CEPH_INLINE_NONE &&
@@ -2807,7 +2828,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
 	if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
 	    ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
 	    (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
-	    !ci->i_wrbuffer_ref) {
+	    !(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
 		if (try_nonblocking_invalidate(inode)) {
 			/* there were locked pages.. invalidate later
 			   in a separate thread. */
@@ -3226,6 +3247,8 @@ retry:
 
 	if (target < 0) {
 		__ceph_remove_cap(cap, false);
+		if (!ci->i_auth_cap)
+			ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
 		goto out_unlock;
 	}
 
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 31f831471ed2..39ff678e567f 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -109,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p)
 			   path ? path : "");
 		spin_unlock(&req->r_old_dentry->d_lock);
 		kfree(path);
-	} else if (req->r_path2) {
+	} else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) {
 		if (req->r_ino2.ino)
 			seq_printf(s, " #%llx/%s", req->r_ino2.ino,
 				   req->r_path2);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 3ab1192d2029..6e0fedf6713b 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -70,16 +70,42 @@ out_unlock:
 }
 
 /*
- * for readdir, we encode the directory frag and offset within that
- * frag into f_pos.
+ * for f_pos for readdir:
+ * - hash order:
+ *	(0xff << 52) | ((24 bits hash) << 28) |
+ *	(the nth entry has hash collision);
+ * - frag+name order;
+ *	((frag value) << 28) | (the nth entry in frag);
 */
+#define OFFSET_BITS	28
+#define OFFSET_MASK	((1 << OFFSET_BITS) - 1)
+#define HASH_ORDER	(0xffull << (OFFSET_BITS + 24))
+loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order)
+{
+	loff_t fpos = ((loff_t)high << 28) | (loff_t)off;
+	if (hash_order)
+		fpos |= HASH_ORDER;
+	return fpos;
+}
+
+static bool is_hash_order(loff_t p)
+{
+	return (p & HASH_ORDER) == HASH_ORDER;
+}
+
 static unsigned fpos_frag(loff_t p)
 {
-	return p >> 32;
+	return p >> OFFSET_BITS;
 }
+
+static unsigned fpos_hash(loff_t p)
+{
+	return ceph_frag_value(fpos_frag(p));
+}
+
 static unsigned fpos_off(loff_t p)
 {
-	return p & 0xffffffff;
+	return p & OFFSET_MASK;
 }
 
 static int fpos_cmp(loff_t l, loff_t r)
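
Note: a worked example of the new f_pos encoding above (values illustrative):

	/*
	 * frag+name order: frag 0x2, 3rd entry within the frag:
	 *	ceph_make_fpos(0x2, 3, false)
	 *		== (0x2ULL << 28) | 3 == 0x20000003
	 *
	 * hash order: 24-bit name hash 0xc0ffee, 1st hash collision:
	 *	ceph_make_fpos(0xc0ffee, 1, true)
	 *		== (0xffULL << 52) | (0xc0ffeeULL << 28) | 1
	 *		== 0xffc0ffee0000001
	 *
	 * is_hash_order() just tests the 0xff marker in the top bits.
	 */
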
@@ -111,6 +137,50 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name,
111 return 0; 137 return 0;
112} 138}
113 139
140
141static struct dentry *
142__dcache_find_get_entry(struct dentry *parent, u64 idx,
143 struct ceph_readdir_cache_control *cache_ctl)
144{
145 struct inode *dir = d_inode(parent);
146 struct dentry *dentry;
147 unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1;
148 loff_t ptr_pos = idx * sizeof(struct dentry *);
149 pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT;
150
151 if (ptr_pos >= i_size_read(dir))
152 return NULL;
153
154 if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) {
155 ceph_readdir_cache_release(cache_ctl);
156 cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff);
157 if (!cache_ctl->page) {
158 dout(" page %lu not found\n", ptr_pgoff);
159 return ERR_PTR(-EAGAIN);
160 }
161 /* reading/filling the cache are serialized by
162 i_mutex, no need to use page lock */
163 unlock_page(cache_ctl->page);
164 cache_ctl->dentries = kmap(cache_ctl->page);
165 }
166
167 cache_ctl->index = idx & idx_mask;
168
169 rcu_read_lock();
170 spin_lock(&parent->d_lock);
171 /* check i_size again here, because empty directory can be
172 * marked as complete while not holding the i_mutex. */
173 if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir))
174 dentry = cache_ctl->dentries[cache_ctl->index];
175 else
176 dentry = NULL;
177 spin_unlock(&parent->d_lock);
178 if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
179 dentry = NULL;
180 rcu_read_unlock();
181 return dentry ? : ERR_PTR(-EAGAIN);
182}
183
114/* 184/*
115 * When possible, we try to satisfy a readdir by peeking at the 185 * When possible, we try to satisfy a readdir by peeking at the
116 * dcache. We make this work by carefully ordering dentries on 186 * dcache. We make this work by carefully ordering dentries on
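__dcache_find_get_entry() treats the directory's page cache as one flat array of dentry pointers, so a cache index maps to a page and an in-page slot with shift/mask arithmetic. A quick standalone check of that math (assuming 4 KiB pages and 8-byte pointers):

    #include <stdio.h>

    #define PAGE_SIZE  4096UL
    #define PAGE_SHIFT 12

    int main(void)
    {
            unsigned long ptr_size = sizeof(void *);            /* 8 on 64-bit */
            unsigned long idx_mask = PAGE_SIZE / ptr_size - 1;  /* 511 */
            unsigned long idx = 1000;                           /* cache index */
            unsigned long ptr_pos = idx * ptr_size;             /* byte 8000 */
            unsigned long pgoff = ptr_pos >> PAGE_SHIFT;        /* page 1 */
            unsigned long slot = idx & idx_mask;                /* slot 488 */

            printf("idx %lu -> page %lu, slot %lu\n", idx, pgoff, slot);
            return 0;
    }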
@@ -130,75 +200,68 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
130 struct inode *dir = d_inode(parent); 200 struct inode *dir = d_inode(parent);
131 struct dentry *dentry, *last = NULL; 201 struct dentry *dentry, *last = NULL;
132 struct ceph_dentry_info *di; 202 struct ceph_dentry_info *di;
133 unsigned nsize = PAGE_SIZE / sizeof(struct dentry *);
134 int err = 0;
135 loff_t ptr_pos = 0;
136 struct ceph_readdir_cache_control cache_ctl = {}; 203 struct ceph_readdir_cache_control cache_ctl = {};
204 u64 idx = 0;
205 int err = 0;
137 206
138 dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos); 207 dout("__dcache_readdir %p v%u at %llx\n", dir, shared_gen, ctx->pos);
208
209 /* search start position */
210 if (ctx->pos > 2) {
211 u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *));
212 while (count > 0) {
213 u64 step = count >> 1;
214 dentry = __dcache_find_get_entry(parent, idx + step,
215 &cache_ctl);
216 if (!dentry) {
 217 /* use linear search */
218 idx = 0;
219 break;
220 }
221 if (IS_ERR(dentry)) {
222 err = PTR_ERR(dentry);
223 goto out;
224 }
225 di = ceph_dentry(dentry);
226 spin_lock(&dentry->d_lock);
227 if (fpos_cmp(di->offset, ctx->pos) < 0) {
228 idx += step + 1;
229 count -= step + 1;
230 } else {
231 count = step;
232 }
233 spin_unlock(&dentry->d_lock);
234 dput(dentry);
235 }
139 236
140 /* we can calculate cache index for the first dirfrag */ 237 dout("__dcache_readdir %p cache idx %llu\n", dir, idx);
141 if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) {
142 cache_ctl.index = fpos_off(ctx->pos) - 2;
143 BUG_ON(cache_ctl.index < 0);
144 ptr_pos = cache_ctl.index * sizeof(struct dentry *);
145 } 238 }
146 239
147 while (true) {
148 pgoff_t pgoff;
149 bool emit_dentry;
150 240
151 if (ptr_pos >= i_size_read(dir)) { 241 for (;;) {
242 bool emit_dentry = false;
243 dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl);
244 if (!dentry) {
152 fi->flags |= CEPH_F_ATEND; 245 fi->flags |= CEPH_F_ATEND;
153 err = 0; 246 err = 0;
154 break; 247 break;
155 } 248 }
156 249 if (IS_ERR(dentry)) {
157 err = -EAGAIN; 250 err = PTR_ERR(dentry);
158 pgoff = ptr_pos >> PAGE_SHIFT; 251 goto out;
159 if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) {
160 ceph_readdir_cache_release(&cache_ctl);
161 cache_ctl.page = find_lock_page(&dir->i_data, pgoff);
162 if (!cache_ctl.page) {
163 dout(" page %lu not found\n", pgoff);
164 break;
165 }
166 /* reading/filling the cache are serialized by
167 * i_mutex, no need to use page lock */
168 unlock_page(cache_ctl.page);
169 cache_ctl.dentries = kmap(cache_ctl.page);
170 } 252 }
171 253
172 rcu_read_lock();
173 spin_lock(&parent->d_lock);
174 /* check i_size again here, because empty directory can be
175 * marked as complete while not holding the i_mutex. */
176 if (ceph_dir_is_complete_ordered(dir) &&
177 ptr_pos < i_size_read(dir))
178 dentry = cache_ctl.dentries[cache_ctl.index % nsize];
179 else
180 dentry = NULL;
181 spin_unlock(&parent->d_lock);
182 if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
183 dentry = NULL;
184 rcu_read_unlock();
185 if (!dentry)
186 break;
187
188 emit_dentry = false;
189 di = ceph_dentry(dentry); 254 di = ceph_dentry(dentry);
190 spin_lock(&dentry->d_lock); 255 spin_lock(&dentry->d_lock);
191 if (di->lease_shared_gen == shared_gen && 256 if (di->lease_shared_gen == shared_gen &&
192 d_really_is_positive(dentry) && 257 d_really_is_positive(dentry) &&
193 ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR &&
194 ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH &&
195 fpos_cmp(ctx->pos, di->offset) <= 0) { 258 fpos_cmp(ctx->pos, di->offset) <= 0) {
196 emit_dentry = true; 259 emit_dentry = true;
197 } 260 }
198 spin_unlock(&dentry->d_lock); 261 spin_unlock(&dentry->d_lock);
199 262
200 if (emit_dentry) { 263 if (emit_dentry) {
201 dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, 264 dout(" %llx dentry %p %pd %p\n", di->offset,
202 dentry, dentry, d_inode(dentry)); 265 dentry, dentry, d_inode(dentry));
203 ctx->pos = di->offset; 266 ctx->pos = di->offset;
204 if (!dir_emit(ctx, dentry->d_name.name, 267 if (!dir_emit(ctx, dentry->d_name.name,
@@ -218,10 +281,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
218 } else { 281 } else {
219 dput(dentry); 282 dput(dentry);
220 } 283 }
221
222 cache_ctl.index++;
223 ptr_pos += sizeof(struct dentry *);
224 } 284 }
285out:
225 ceph_readdir_cache_release(&cache_ctl); 286 ceph_readdir_cache_release(&cache_ctl);
226 if (last) { 287 if (last) {
227 int ret; 288 int ret;
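The start-position search added above is a plain lower bound over the monotonically increasing dentry offsets, and the identical loop shape reappears below over rinfo->dir_entries in ceph_readdir(). In isolation, as a sketch:

    /* First index i in sorted off[0..n) with off[i] >= pos, else n;
     * e.g. off = {2, 3, 7, 9}, pos = 5 -> returns 2. */
    static int lower_bound(const long long *off, int n, long long pos)
    {
            int i = 0;
            while (n > 0) {
                    int step = n >> 1;
                    if (off[i + step] < pos) {
                            i += step + 1;
                            n -= step + 1;
                    } else {
                            n = step;
                    }
            }
            return i;
    }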
@@ -235,6 +296,16 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
235 return err; 296 return err;
236} 297}
237 298
299static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos)
300{
301 if (!fi->last_readdir)
302 return true;
303 if (is_hash_order(pos))
304 return !ceph_frag_contains_value(fi->frag, fpos_hash(pos));
305 else
306 return fi->frag != fpos_frag(pos);
307}
308
238static int ceph_readdir(struct file *file, struct dir_context *ctx) 309static int ceph_readdir(struct file *file, struct dir_context *ctx)
239{ 310{
240 struct ceph_file_info *fi = file->private_data; 311 struct ceph_file_info *fi = file->private_data;
@@ -242,13 +313,12 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
242 struct ceph_inode_info *ci = ceph_inode(inode); 313 struct ceph_inode_info *ci = ceph_inode(inode);
243 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 314 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
244 struct ceph_mds_client *mdsc = fsc->mdsc; 315 struct ceph_mds_client *mdsc = fsc->mdsc;
245 unsigned frag = fpos_frag(ctx->pos); 316 int i;
246 int off = fpos_off(ctx->pos);
247 int err; 317 int err;
248 u32 ftype; 318 u32 ftype;
249 struct ceph_mds_reply_info_parsed *rinfo; 319 struct ceph_mds_reply_info_parsed *rinfo;
250 320
251 dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off); 321 dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
252 if (fi->flags & CEPH_F_ATEND) 322 if (fi->flags & CEPH_F_ATEND)
253 return 0; 323 return 0;
254 324
@@ -260,7 +330,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
260 inode->i_mode >> 12)) 330 inode->i_mode >> 12))
261 return 0; 331 return 0;
262 ctx->pos = 1; 332 ctx->pos = 1;
263 off = 1;
264 } 333 }
265 if (ctx->pos == 1) { 334 if (ctx->pos == 1) {
266 ino_t ino = parent_ino(file->f_path.dentry); 335 ino_t ino = parent_ino(file->f_path.dentry);
@@ -270,7 +339,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
270 inode->i_mode >> 12)) 339 inode->i_mode >> 12))
271 return 0; 340 return 0;
272 ctx->pos = 2; 341 ctx->pos = 2;
273 off = 2;
274 } 342 }
275 343
276 /* can we use the dcache? */ 344 /* can we use the dcache? */
@@ -285,8 +353,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
285 err = __dcache_readdir(file, ctx, shared_gen); 353 err = __dcache_readdir(file, ctx, shared_gen);
286 if (err != -EAGAIN) 354 if (err != -EAGAIN)
287 return err; 355 return err;
288 frag = fpos_frag(ctx->pos);
289 off = fpos_off(ctx->pos);
290 } else { 356 } else {
291 spin_unlock(&ci->i_ceph_lock); 357 spin_unlock(&ci->i_ceph_lock);
292 } 358 }
@@ -294,8 +360,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
294 /* proceed with a normal readdir */ 360 /* proceed with a normal readdir */
295more: 361more:
296 /* do we have the correct frag content buffered? */ 362 /* do we have the correct frag content buffered? */
297 if (fi->frag != frag || fi->last_readdir == NULL) { 363 if (need_send_readdir(fi, ctx->pos)) {
298 struct ceph_mds_request *req; 364 struct ceph_mds_request *req;
365 unsigned frag;
299 int op = ceph_snap(inode) == CEPH_SNAPDIR ? 366 int op = ceph_snap(inode) == CEPH_SNAPDIR ?
300 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; 367 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
301 368
@@ -305,6 +372,13 @@ more:
305 fi->last_readdir = NULL; 372 fi->last_readdir = NULL;
306 } 373 }
307 374
375 if (is_hash_order(ctx->pos)) {
376 frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
377 NULL, NULL);
378 } else {
379 frag = fpos_frag(ctx->pos);
380 }
381
308 dout("readdir fetching %llx.%llx frag %x offset '%s'\n", 382 dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
309 ceph_vinop(inode), frag, fi->last_name); 383 ceph_vinop(inode), frag, fi->last_name);
310 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); 384 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
@@ -331,6 +405,8 @@ more:
331 req->r_readdir_cache_idx = fi->readdir_cache_idx; 405 req->r_readdir_cache_idx = fi->readdir_cache_idx;
332 req->r_readdir_offset = fi->next_offset; 406 req->r_readdir_offset = fi->next_offset;
333 req->r_args.readdir.frag = cpu_to_le32(frag); 407 req->r_args.readdir.frag = cpu_to_le32(frag);
408 req->r_args.readdir.flags =
409 cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);
334 410
335 req->r_inode = inode; 411 req->r_inode = inode;
336 ihold(inode); 412 ihold(inode);
@@ -340,22 +416,26 @@ more:
340 ceph_mdsc_put_request(req); 416 ceph_mdsc_put_request(req);
341 return err; 417 return err;
342 } 418 }
343 dout("readdir got and parsed readdir result=%d" 419 dout("readdir got and parsed readdir result=%d on "
344 " on frag %x, end=%d, complete=%d\n", err, frag, 420 "frag %x, end=%d, complete=%d, hash_order=%d\n",
421 err, frag,
345 (int)req->r_reply_info.dir_end, 422 (int)req->r_reply_info.dir_end,
346 (int)req->r_reply_info.dir_complete); 423 (int)req->r_reply_info.dir_complete,
347 424 (int)req->r_reply_info.hash_order);
348 425
349 /* note next offset and last dentry name */
350 rinfo = &req->r_reply_info; 426 rinfo = &req->r_reply_info;
351 if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { 427 if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
352 frag = le32_to_cpu(rinfo->dir_dir->frag); 428 frag = le32_to_cpu(rinfo->dir_dir->frag);
353 off = req->r_readdir_offset; 429 if (!rinfo->hash_order) {
354 fi->next_offset = off; 430 fi->next_offset = req->r_readdir_offset;
431 /* adjust ctx->pos to beginning of frag */
432 ctx->pos = ceph_make_fpos(frag,
433 fi->next_offset,
434 false);
435 }
355 } 436 }
356 437
357 fi->frag = frag; 438 fi->frag = frag;
358 fi->offset = fi->next_offset;
359 fi->last_readdir = req; 439 fi->last_readdir = req;
360 440
361 if (req->r_did_prepopulate) { 441 if (req->r_did_prepopulate) {
@@ -363,7 +443,8 @@ more:
363 if (fi->readdir_cache_idx < 0) { 443 if (fi->readdir_cache_idx < 0) {
364 /* preclude from marking dir ordered */ 444 /* preclude from marking dir ordered */
365 fi->dir_ordered_count = 0; 445 fi->dir_ordered_count = 0;
366 } else if (ceph_frag_is_leftmost(frag) && off == 2) { 446 } else if (ceph_frag_is_leftmost(frag) &&
447 fi->next_offset == 2) {
367 /* note dir version at start of readdir so 448 /* note dir version at start of readdir so
368 * we can tell if any dentries get dropped */ 449 * we can tell if any dentries get dropped */
369 fi->dir_release_count = req->r_dir_release_cnt; 450 fi->dir_release_count = req->r_dir_release_cnt;
@@ -377,65 +458,87 @@ more:
377 fi->dir_release_count = 0; 458 fi->dir_release_count = 0;
378 } 459 }
379 460
380 if (req->r_reply_info.dir_end) { 461 /* note next offset and last dentry name */
381 kfree(fi->last_name); 462 if (rinfo->dir_nr > 0) {
382 fi->last_name = NULL; 463 struct ceph_mds_reply_dir_entry *rde =
383 if (ceph_frag_is_rightmost(frag)) 464 rinfo->dir_entries + (rinfo->dir_nr-1);
384 fi->next_offset = 2; 465 unsigned next_offset = req->r_reply_info.dir_end ?
385 else 466 2 : (fpos_off(rde->offset) + 1);
386 fi->next_offset = 0; 467 err = note_last_dentry(fi, rde->name, rde->name_len,
387 } else { 468 next_offset);
388 err = note_last_dentry(fi,
389 rinfo->dir_dname[rinfo->dir_nr-1],
390 rinfo->dir_dname_len[rinfo->dir_nr-1],
391 fi->next_offset + rinfo->dir_nr);
392 if (err) 469 if (err)
393 return err; 470 return err;
471 } else if (req->r_reply_info.dir_end) {
472 fi->next_offset = 2;
473 /* keep last name */
394 } 474 }
395 } 475 }
396 476
397 rinfo = &fi->last_readdir->r_reply_info; 477 rinfo = &fi->last_readdir->r_reply_info;
398 dout("readdir frag %x num %d off %d chunkoff %d\n", frag, 478 dout("readdir frag %x num %d pos %llx chunk first %llx\n",
399 rinfo->dir_nr, off, fi->offset); 479 fi->frag, rinfo->dir_nr, ctx->pos,
400 480 rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);
401 ctx->pos = ceph_make_fpos(frag, off); 481
402 while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { 482 i = 0;
403 struct ceph_mds_reply_inode *in = 483 /* search start position */
404 rinfo->dir_in[off - fi->offset].in; 484 if (rinfo->dir_nr > 0) {
485 int step, nr = rinfo->dir_nr;
486 while (nr > 0) {
487 step = nr >> 1;
488 if (rinfo->dir_entries[i + step].offset < ctx->pos) {
489 i += step + 1;
490 nr -= step + 1;
491 } else {
492 nr = step;
493 }
494 }
495 }
496 for (; i < rinfo->dir_nr; i++) {
497 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
405 struct ceph_vino vino; 498 struct ceph_vino vino;
406 ino_t ino; 499 ino_t ino;
407 500
408 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", 501 BUG_ON(rde->offset < ctx->pos);
409 off, off - fi->offset, rinfo->dir_nr, ctx->pos, 502
410 rinfo->dir_dname_len[off - fi->offset], 503 ctx->pos = rde->offset;
411 rinfo->dir_dname[off - fi->offset], in); 504 dout("readdir (%d/%d) -> %llx '%.*s' %p\n",
412 BUG_ON(!in); 505 i, rinfo->dir_nr, ctx->pos,
413 ftype = le32_to_cpu(in->mode) >> 12; 506 rde->name_len, rde->name, &rde->inode.in);
414 vino.ino = le64_to_cpu(in->ino); 507
415 vino.snap = le64_to_cpu(in->snapid); 508 BUG_ON(!rde->inode.in);
509 ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
510 vino.ino = le64_to_cpu(rde->inode.in->ino);
511 vino.snap = le64_to_cpu(rde->inode.in->snapid);
416 ino = ceph_vino_to_ino(vino); 512 ino = ceph_vino_to_ino(vino);
417 if (!dir_emit(ctx, 513
418 rinfo->dir_dname[off - fi->offset], 514 if (!dir_emit(ctx, rde->name, rde->name_len,
419 rinfo->dir_dname_len[off - fi->offset], 515 ceph_translate_ino(inode->i_sb, ino), ftype)) {
420 ceph_translate_ino(inode->i_sb, ino), ftype)) {
421 dout("filldir stopping us...\n"); 516 dout("filldir stopping us...\n");
422 return 0; 517 return 0;
423 } 518 }
424 off++;
425 ctx->pos++; 519 ctx->pos++;
426 } 520 }
427 521
428 if (fi->last_name) { 522 if (fi->next_offset > 2) {
429 ceph_mdsc_put_request(fi->last_readdir); 523 ceph_mdsc_put_request(fi->last_readdir);
430 fi->last_readdir = NULL; 524 fi->last_readdir = NULL;
431 goto more; 525 goto more;
432 } 526 }
433 527
434 /* more frags? */ 528 /* more frags? */
435 if (!ceph_frag_is_rightmost(frag)) { 529 if (!ceph_frag_is_rightmost(fi->frag)) {
436 frag = ceph_frag_next(frag); 530 unsigned frag = ceph_frag_next(fi->frag);
437 off = 0; 531 if (is_hash_order(ctx->pos)) {
438 ctx->pos = ceph_make_fpos(frag, off); 532 loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
533 fi->next_offset, true);
534 if (new_pos > ctx->pos)
535 ctx->pos = new_pos;
536 /* keep last_name */
537 } else {
538 ctx->pos = ceph_make_fpos(frag, fi->next_offset, false);
539 kfree(fi->last_name);
540 fi->last_name = NULL;
541 }
439 dout("readdir next frag is %x\n", frag); 542 dout("readdir next frag is %x\n", frag);
440 goto more; 543 goto more;
441 } 544 }
@@ -467,7 +570,7 @@ more:
467 return 0; 570 return 0;
468} 571}
469 572
470static void reset_readdir(struct ceph_file_info *fi, unsigned frag) 573static void reset_readdir(struct ceph_file_info *fi)
471{ 574{
472 if (fi->last_readdir) { 575 if (fi->last_readdir) {
473 ceph_mdsc_put_request(fi->last_readdir); 576 ceph_mdsc_put_request(fi->last_readdir);
@@ -477,18 +580,38 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
477 fi->last_name = NULL; 580 fi->last_name = NULL;
478 fi->dir_release_count = 0; 581 fi->dir_release_count = 0;
479 fi->readdir_cache_idx = -1; 582 fi->readdir_cache_idx = -1;
480 if (ceph_frag_is_leftmost(frag)) 583 fi->next_offset = 2; /* compensate for . and .. */
481 fi->next_offset = 2; /* compensate for . and .. */
482 else
483 fi->next_offset = 0;
484 fi->flags &= ~CEPH_F_ATEND; 584 fi->flags &= ~CEPH_F_ATEND;
485} 585}
486 586
587/*
 588 * discard buffered readdir content on seekdir(0), on a seek to a new
 589 * frag, or on a seek prior to the current chunk
590 */
591static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
592{
593 struct ceph_mds_reply_info_parsed *rinfo;
594 loff_t chunk_offset;
595 if (new_pos == 0)
596 return true;
597 if (is_hash_order(new_pos)) {
598 /* no need to reset last_name for a forward seek when
 599 * dentries are sorted in hash order */
 600 } else if (fi->frag != fpos_frag(new_pos)) {
601 return true;
602 }
603 rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL;
604 if (!rinfo || !rinfo->dir_nr)
605 return true;
606 chunk_offset = rinfo->dir_entries[0].offset;
607 return new_pos < chunk_offset ||
608 is_hash_order(new_pos) != is_hash_order(chunk_offset);
609}
610
487static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) 611static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
488{ 612{
489 struct ceph_file_info *fi = file->private_data; 613 struct ceph_file_info *fi = file->private_data;
490 struct inode *inode = file->f_mapping->host; 614 struct inode *inode = file->f_mapping->host;
491 loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset);
492 loff_t retval; 615 loff_t retval;
493 616
494 inode_lock(inode); 617 inode_lock(inode);
@@ -505,25 +628,22 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
505 } 628 }
506 629
507 if (offset >= 0) { 630 if (offset >= 0) {
631 if (need_reset_readdir(fi, offset)) {
632 dout("dir_llseek dropping %p content\n", file);
633 reset_readdir(fi);
634 } else if (is_hash_order(offset) && offset > file->f_pos) {
635 /* for hash offset, we don't know if a forward seek
636 * is within same frag */
637 fi->dir_release_count = 0;
638 fi->readdir_cache_idx = -1;
639 }
640
508 if (offset != file->f_pos) { 641 if (offset != file->f_pos) {
509 file->f_pos = offset; 642 file->f_pos = offset;
510 file->f_version = 0; 643 file->f_version = 0;
511 fi->flags &= ~CEPH_F_ATEND; 644 fi->flags &= ~CEPH_F_ATEND;
512 } 645 }
513 retval = offset; 646 retval = offset;
514
515 if (offset == 0 ||
516 fpos_frag(offset) != fi->frag ||
517 fpos_off(offset) < fi->offset) {
518 /* discard buffered readdir content on seekdir(0), or
519 * seek to new frag, or seek prior to current chunk */
520 dout("dir_llseek dropping %p content\n", file);
521 reset_readdir(fi, fpos_frag(offset));
522 } else if (fpos_cmp(offset, old_offset) > 0) {
523 /* reset dir_release_count if we did a forward seek */
524 fi->dir_release_count = 0;
525 fi->readdir_cache_idx = -1;
526 }
527 } 647 }
528out: 648out:
529 inode_unlock(inode); 649 inode_unlock(inode);
@@ -591,7 +711,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
591 return dentry; 711 return dentry;
592} 712}
593 713
594static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) 714static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
595{ 715{
596 return ceph_ino(inode) == CEPH_INO_ROOT && 716 return ceph_ino(inode) == CEPH_INO_ROOT &&
597 strncmp(dentry->d_name.name, ".ceph", 5) == 0; 717 strncmp(dentry->d_name.name, ".ceph", 5) == 0;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 4f1dc7120916..a888df6f2d71 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -192,6 +192,59 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
192} 192}
193 193
194/* 194/*
 195 * try to renew caps after the MDS session gets killed.
196 */
197int ceph_renew_caps(struct inode *inode)
198{
199 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
200 struct ceph_inode_info *ci = ceph_inode(inode);
201 struct ceph_mds_request *req;
202 int err, flags, wanted;
203
204 spin_lock(&ci->i_ceph_lock);
205 wanted = __ceph_caps_file_wanted(ci);
206 if (__ceph_is_any_real_caps(ci) &&
 207 (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
208 int issued = __ceph_caps_issued(ci, NULL);
209 spin_unlock(&ci->i_ceph_lock);
210 dout("renew caps %p want %s issued %s updating mds_wanted\n",
211 inode, ceph_cap_string(wanted), ceph_cap_string(issued));
212 ceph_check_caps(ci, 0, NULL);
213 return 0;
214 }
215 spin_unlock(&ci->i_ceph_lock);
216
217 flags = 0;
218 if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
219 flags = O_RDWR;
220 else if (wanted & CEPH_CAP_FILE_RD)
221 flags = O_RDONLY;
222 else if (wanted & CEPH_CAP_FILE_WR)
223 flags = O_WRONLY;
224#ifdef O_LAZY
225 if (wanted & CEPH_CAP_FILE_LAZYIO)
226 flags |= O_LAZY;
227#endif
228
229 req = prepare_open_request(inode->i_sb, flags, 0);
230 if (IS_ERR(req)) {
231 err = PTR_ERR(req);
232 goto out;
233 }
234
235 req->r_inode = inode;
236 ihold(inode);
237 req->r_num_caps = 1;
238 req->r_fmode = -1;
239
240 err = ceph_mdsc_do_request(mdsc, NULL, req);
241 ceph_mdsc_put_request(req);
242out:
243 dout("renew caps %p open result=%d\n", inode, err);
244 return err < 0 ? err : 0;
245}
246
247/*
195 * If we already have the requisite capabilities, we can satisfy 248 * If we already have the requisite capabilities, we can satisfy
196 * the open request locally (no need to request new caps from the 249 * the open request locally (no need to request new caps from the
197 * MDS). We do, however, need to inform the MDS (asynchronously) 250 * MDS). We do, however, need to inform the MDS (asynchronously)
@@ -616,8 +669,7 @@ static void ceph_aio_complete(struct inode *inode,
616 kfree(aio_req); 669 kfree(aio_req);
617} 670}
618 671
619static void ceph_aio_complete_req(struct ceph_osd_request *req, 672static void ceph_aio_complete_req(struct ceph_osd_request *req)
620 struct ceph_msg *msg)
621{ 673{
622 int rc = req->r_result; 674 int rc = req->r_result;
623 struct inode *inode = req->r_inode; 675 struct inode *inode = req->r_inode;
@@ -714,14 +766,21 @@ static void ceph_aio_retry_work(struct work_struct *work)
714 req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | 766 req->r_flags = CEPH_OSD_FLAG_ORDERSNAP |
715 CEPH_OSD_FLAG_ONDISK | 767 CEPH_OSD_FLAG_ONDISK |
716 CEPH_OSD_FLAG_WRITE; 768 CEPH_OSD_FLAG_WRITE;
717 req->r_base_oloc = orig_req->r_base_oloc; 769 ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
718 req->r_base_oid = orig_req->r_base_oid; 770 ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
771
772 ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
773 if (ret) {
774 ceph_osdc_put_request(req);
775 req = orig_req;
776 goto out;
777 }
719 778
720 req->r_ops[0] = orig_req->r_ops[0]; 779 req->r_ops[0] = orig_req->r_ops[0];
721 osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); 780 osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
722 781
723 ceph_osdc_build_request(req, req->r_ops[0].extent.offset, 782 req->r_mtime = aio_req->mtime;
724 snapc, CEPH_NOSNAP, &aio_req->mtime); 783 req->r_data_offset = req->r_ops[0].extent.offset;
725 784
726 ceph_osdc_put_request(orig_req); 785 ceph_osdc_put_request(orig_req);
727 786
@@ -733,7 +792,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
733out: 792out:
734 if (ret < 0) { 793 if (ret < 0) {
735 req->r_result = ret; 794 req->r_result = ret;
736 ceph_aio_complete_req(req, NULL); 795 ceph_aio_complete_req(req);
737 } 796 }
738 797
739 ceph_put_snap_context(snapc); 798 ceph_put_snap_context(snapc);
@@ -764,6 +823,8 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
764 list_add_tail(&req->r_unsafe_item, 823 list_add_tail(&req->r_unsafe_item,
765 &ci->i_unsafe_writes); 824 &ci->i_unsafe_writes);
766 spin_unlock(&ci->i_unsafe_lock); 825 spin_unlock(&ci->i_unsafe_lock);
826
827 complete_all(&req->r_completion);
767 } else { 828 } else {
768 spin_lock(&ci->i_unsafe_lock); 829 spin_lock(&ci->i_unsafe_lock);
769 list_del_init(&req->r_unsafe_item); 830 list_del_init(&req->r_unsafe_item);
@@ -875,14 +936,12 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
875 (pos+len) | (PAGE_SIZE - 1)); 936 (pos+len) | (PAGE_SIZE - 1));
876 937
877 osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); 938 osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
939 req->r_mtime = mtime;
878 } 940 }
879 941
880
881 osd_req_op_extent_osd_data_pages(req, 0, pages, len, start, 942 osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
882 false, false); 943 false, false);
883 944
884 ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
885
886 if (aio_req) { 945 if (aio_req) {
887 aio_req->total_len += len; 946 aio_req->total_len += len;
888 aio_req->num_reqs++; 947 aio_req->num_reqs++;
@@ -956,7 +1015,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
956 req, false); 1015 req, false);
957 if (ret < 0) { 1016 if (ret < 0) {
958 req->r_result = ret; 1017 req->r_result = ret;
959 ceph_aio_complete_req(req, NULL); 1018 ceph_aio_complete_req(req);
960 } 1019 }
961 } 1020 }
962 return -EIOCBQUEUED; 1021 return -EIOCBQUEUED;
@@ -1067,9 +1126,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
1067 osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, 1126 osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
1068 false, true); 1127 false, true);
1069 1128
1070 /* BUG_ON(vino.snap != CEPH_NOSNAP); */ 1129 req->r_mtime = mtime;
1071 ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
1072
1073 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 1130 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
1074 if (!ret) 1131 if (!ret)
1075 ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 1132 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1524,9 +1581,7 @@ static int ceph_zero_partial_object(struct inode *inode,
1524 goto out; 1581 goto out;
1525 } 1582 }
1526 1583
1527 ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap, 1584 req->r_mtime = inode->i_mtime;
1528 &inode->i_mtime);
1529
1530 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 1585 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
1531 if (!ret) { 1586 if (!ret) {
1532 ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 1587 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e669cfa9d793..f059b5997072 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -11,6 +11,7 @@
11#include <linux/xattr.h> 11#include <linux/xattr.h>
12#include <linux/posix_acl.h> 12#include <linux/posix_acl.h>
13#include <linux/random.h> 13#include <linux/random.h>
14#include <linux/sort.h>
14 15
15#include "super.h" 16#include "super.h"
16#include "mds_client.h" 17#include "mds_client.h"
@@ -254,6 +255,9 @@ static int ceph_fill_dirfrag(struct inode *inode,
254 diri_auth = ci->i_auth_cap->mds; 255 diri_auth = ci->i_auth_cap->mds;
255 spin_unlock(&ci->i_ceph_lock); 256 spin_unlock(&ci->i_ceph_lock);
256 257
258 if (mds == -1) /* CDIR_AUTH_PARENT */
259 mds = diri_auth;
260
257 mutex_lock(&ci->i_fragtree_mutex); 261 mutex_lock(&ci->i_fragtree_mutex);
258 if (ndist == 0 && mds == diri_auth) { 262 if (ndist == 0 && mds == diri_auth) {
259 /* no delegation info needed. */ 263 /* no delegation info needed. */
@@ -300,20 +304,38 @@ out:
300 return err; 304 return err;
301} 305}
302 306
307static int frag_tree_split_cmp(const void *l, const void *r)
308{
309 struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l;
310 struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r;
311 return ceph_frag_compare(ls->frag, rs->frag);
312}
313
314static bool is_frag_child(u32 f, struct ceph_inode_frag *frag)
315{
316 if (!frag)
317 return f == ceph_frag_make(0, 0);
318 if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by)
319 return false;
320 return ceph_frag_contains_value(frag->frag, ceph_frag_value(f));
321}
322
303static int ceph_fill_fragtree(struct inode *inode, 323static int ceph_fill_fragtree(struct inode *inode,
304 struct ceph_frag_tree_head *fragtree, 324 struct ceph_frag_tree_head *fragtree,
305 struct ceph_mds_reply_dirfrag *dirinfo) 325 struct ceph_mds_reply_dirfrag *dirinfo)
306{ 326{
307 struct ceph_inode_info *ci = ceph_inode(inode); 327 struct ceph_inode_info *ci = ceph_inode(inode);
308 struct ceph_inode_frag *frag; 328 struct ceph_inode_frag *frag, *prev_frag = NULL;
309 struct rb_node *rb_node; 329 struct rb_node *rb_node;
310 int i; 330 unsigned i, split_by, nsplits;
311 u32 id, nsplits; 331 u32 id;
312 bool update = false; 332 bool update = false;
313 333
314 mutex_lock(&ci->i_fragtree_mutex); 334 mutex_lock(&ci->i_fragtree_mutex);
315 nsplits = le32_to_cpu(fragtree->nsplits); 335 nsplits = le32_to_cpu(fragtree->nsplits);
316 if (nsplits) { 336 if (nsplits != ci->i_fragtree_nsplits) {
337 update = true;
338 } else if (nsplits) {
317 i = prandom_u32() % nsplits; 339 i = prandom_u32() % nsplits;
318 id = le32_to_cpu(fragtree->splits[i].frag); 340 id = le32_to_cpu(fragtree->splits[i].frag);
319 if (!__ceph_find_frag(ci, id)) 341 if (!__ceph_find_frag(ci, id))
@@ -332,10 +354,22 @@ static int ceph_fill_fragtree(struct inode *inode,
332 if (!update) 354 if (!update)
333 goto out_unlock; 355 goto out_unlock;
334 356
357 if (nsplits > 1) {
358 sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]),
359 frag_tree_split_cmp, NULL);
360 }
361
335 dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode)); 362 dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode));
336 rb_node = rb_first(&ci->i_fragtree); 363 rb_node = rb_first(&ci->i_fragtree);
337 for (i = 0; i < nsplits; i++) { 364 for (i = 0; i < nsplits; i++) {
338 id = le32_to_cpu(fragtree->splits[i].frag); 365 id = le32_to_cpu(fragtree->splits[i].frag);
366 split_by = le32_to_cpu(fragtree->splits[i].by);
367 if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) {
368 pr_err("fill_fragtree %llx.%llx invalid split %d/%u, "
369 "frag %x split by %d\n", ceph_vinop(inode),
370 i, nsplits, id, split_by);
371 continue;
372 }
339 frag = NULL; 373 frag = NULL;
340 while (rb_node) { 374 while (rb_node) {
341 frag = rb_entry(rb_node, struct ceph_inode_frag, node); 375 frag = rb_entry(rb_node, struct ceph_inode_frag, node);
@@ -347,8 +381,14 @@ static int ceph_fill_fragtree(struct inode *inode,
347 break; 381 break;
348 } 382 }
349 rb_node = rb_next(rb_node); 383 rb_node = rb_next(rb_node);
350 rb_erase(&frag->node, &ci->i_fragtree); 384 /* delete stale split/leaf node */
351 kfree(frag); 385 if (frag->split_by > 0 ||
386 !is_frag_child(frag->frag, prev_frag)) {
387 rb_erase(&frag->node, &ci->i_fragtree);
388 if (frag->split_by > 0)
389 ci->i_fragtree_nsplits--;
390 kfree(frag);
391 }
352 frag = NULL; 392 frag = NULL;
353 } 393 }
354 if (!frag) { 394 if (!frag) {
@@ -356,14 +396,23 @@ static int ceph_fill_fragtree(struct inode *inode,
356 if (IS_ERR(frag)) 396 if (IS_ERR(frag))
357 continue; 397 continue;
358 } 398 }
359 frag->split_by = le32_to_cpu(fragtree->splits[i].by); 399 if (frag->split_by == 0)
400 ci->i_fragtree_nsplits++;
401 frag->split_by = split_by;
360 dout(" frag %x split by %d\n", frag->frag, frag->split_by); 402 dout(" frag %x split by %d\n", frag->frag, frag->split_by);
403 prev_frag = frag;
361 } 404 }
362 while (rb_node) { 405 while (rb_node) {
363 frag = rb_entry(rb_node, struct ceph_inode_frag, node); 406 frag = rb_entry(rb_node, struct ceph_inode_frag, node);
364 rb_node = rb_next(rb_node); 407 rb_node = rb_next(rb_node);
365 rb_erase(&frag->node, &ci->i_fragtree); 408 /* delete stale split/leaf node */
366 kfree(frag); 409 if (frag->split_by > 0 ||
410 !is_frag_child(frag->frag, prev_frag)) {
411 rb_erase(&frag->node, &ci->i_fragtree);
412 if (frag->split_by > 0)
413 ci->i_fragtree_nsplits--;
414 kfree(frag);
415 }
367 } 416 }
368out_unlock: 417out_unlock:
369 mutex_unlock(&ci->i_fragtree_mutex); 418 mutex_unlock(&ci->i_fragtree_mutex);
@@ -513,6 +562,7 @@ void ceph_destroy_inode(struct inode *inode)
513 rb_erase(n, &ci->i_fragtree); 562 rb_erase(n, &ci->i_fragtree);
514 kfree(frag); 563 kfree(frag);
515 } 564 }
565 ci->i_fragtree_nsplits = 0;
516 566
517 __ceph_destroy_xattrs(ci); 567 __ceph_destroy_xattrs(ci);
518 if (ci->i_xattrs.blob) 568 if (ci->i_xattrs.blob)
@@ -533,6 +583,11 @@ int ceph_drop_inode(struct inode *inode)
533 return 1; 583 return 1;
534} 584}
535 585
586static inline blkcnt_t calc_inode_blocks(u64 size)
587{
588 return (size + (1<<9) - 1) >> 9;
589}
590
536/* 591/*
537 * Helpers to fill in size, ctime, mtime, and atime. We have to be 592 * Helpers to fill in size, ctime, mtime, and atime. We have to be
538 * careful because either the client or MDS may have more up to date 593 * careful because either the client or MDS may have more up to date
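i_blocks counts 512-byte sectors, so the new helper simply rounds the byte size up: calc_inode_blocks(4096) = (4096 + 511) >> 9 = 8 sectors, and even a 1-byte file accounts for one sector.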
@@ -555,7 +610,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
555 size = 0; 610 size = 0;
556 } 611 }
557 i_size_write(inode, size); 612 i_size_write(inode, size);
558 inode->i_blocks = (size + (1<<9) - 1) >> 9; 613 inode->i_blocks = calc_inode_blocks(size);
559 ci->i_reported_size = size; 614 ci->i_reported_size = size;
560 if (truncate_seq != ci->i_truncate_seq) { 615 if (truncate_seq != ci->i_truncate_seq) {
561 dout("truncate_seq %u -> %u\n", 616 dout("truncate_seq %u -> %u\n",
@@ -814,9 +869,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
814 869
815 spin_unlock(&ci->i_ceph_lock); 870 spin_unlock(&ci->i_ceph_lock);
816 871
817 err = -EINVAL; 872 if (symlen != i_size_read(inode)) {
818 if (WARN_ON(symlen != i_size_read(inode))) 873 pr_err("fill_inode %llx.%llx BAD symlink "
819 goto out; 874 "size %lld\n", ceph_vinop(inode),
875 i_size_read(inode));
876 i_size_write(inode, symlen);
877 inode->i_blocks = calc_inode_blocks(symlen);
878 }
820 879
821 err = -ENOMEM; 880 err = -ENOMEM;
822 sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); 881 sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
@@ -1309,12 +1368,13 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
1309 int i, err = 0; 1368 int i, err = 0;
1310 1369
1311 for (i = 0; i < rinfo->dir_nr; i++) { 1370 for (i = 0; i < rinfo->dir_nr; i++) {
1371 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
1312 struct ceph_vino vino; 1372 struct ceph_vino vino;
1313 struct inode *in; 1373 struct inode *in;
1314 int rc; 1374 int rc;
1315 1375
1316 vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); 1376 vino.ino = le64_to_cpu(rde->inode.in->ino);
1317 vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); 1377 vino.snap = le64_to_cpu(rde->inode.in->snapid);
1318 1378
1319 in = ceph_get_inode(req->r_dentry->d_sb, vino); 1379 in = ceph_get_inode(req->r_dentry->d_sb, vino);
1320 if (IS_ERR(in)) { 1380 if (IS_ERR(in)) {
@@ -1322,14 +1382,14 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
1322 dout("new_inode badness got %d\n", err); 1382 dout("new_inode badness got %d\n", err);
1323 continue; 1383 continue;
1324 } 1384 }
1325 rc = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, 1385 rc = fill_inode(in, NULL, &rde->inode, NULL, session,
1326 req->r_request_started, -1, 1386 req->r_request_started, -1,
1327 &req->r_caps_reservation); 1387 &req->r_caps_reservation);
1328 if (rc < 0) { 1388 if (rc < 0) {
1329 pr_err("fill_inode badness on %p got %d\n", in, rc); 1389 pr_err("fill_inode badness on %p got %d\n", in, rc);
1330 err = rc; 1390 err = rc;
1331 continue;
1332 } 1391 }
1392 iput(in);
1333 } 1393 }
1334 1394
1335 return err; 1395 return err;
@@ -1387,6 +1447,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1387 struct ceph_mds_session *session) 1447 struct ceph_mds_session *session)
1388{ 1448{
1389 struct dentry *parent = req->r_dentry; 1449 struct dentry *parent = req->r_dentry;
1450 struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
1390 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; 1451 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1391 struct qstr dname; 1452 struct qstr dname;
1392 struct dentry *dn; 1453 struct dentry *dn;
@@ -1394,22 +1455,27 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1394 int err = 0, skipped = 0, ret, i; 1455 int err = 0, skipped = 0, ret, i;
1395 struct inode *snapdir = NULL; 1456 struct inode *snapdir = NULL;
1396 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; 1457 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
1397 struct ceph_dentry_info *di;
1398 u32 frag = le32_to_cpu(rhead->args.readdir.frag); 1458 u32 frag = le32_to_cpu(rhead->args.readdir.frag);
1459 u32 last_hash = 0;
1460 u32 fpos_offset;
1399 struct ceph_readdir_cache_control cache_ctl = {}; 1461 struct ceph_readdir_cache_control cache_ctl = {};
1400 1462
1401 if (req->r_aborted) 1463 if (req->r_aborted)
1402 return readdir_prepopulate_inodes_only(req, session); 1464 return readdir_prepopulate_inodes_only(req, session);
1403 1465
1466 if (rinfo->hash_order && req->r_path2) {
1467 last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
1468 req->r_path2, strlen(req->r_path2));
1469 last_hash = ceph_frag_value(last_hash);
1470 }
1471
1404 if (rinfo->dir_dir && 1472 if (rinfo->dir_dir &&
1405 le32_to_cpu(rinfo->dir_dir->frag) != frag) { 1473 le32_to_cpu(rinfo->dir_dir->frag) != frag) {
1406 dout("readdir_prepopulate got new frag %x -> %x\n", 1474 dout("readdir_prepopulate got new frag %x -> %x\n",
1407 frag, le32_to_cpu(rinfo->dir_dir->frag)); 1475 frag, le32_to_cpu(rinfo->dir_dir->frag));
1408 frag = le32_to_cpu(rinfo->dir_dir->frag); 1476 frag = le32_to_cpu(rinfo->dir_dir->frag);
1409 if (ceph_frag_is_leftmost(frag)) 1477 if (!rinfo->hash_order)
1410 req->r_readdir_offset = 2; 1478 req->r_readdir_offset = 2;
1411 else
1412 req->r_readdir_offset = 0;
1413 } 1479 }
1414 1480
1415 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { 1481 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
@@ -1427,24 +1493,37 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1427 if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) { 1493 if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) {
1428 /* note dir version at start of readdir so we can tell 1494 /* note dir version at start of readdir so we can tell
1429 * if any dentries get dropped */ 1495 * if any dentries get dropped */
1430 struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
1431 req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); 1496 req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
1432 req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count); 1497 req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count);
1433 req->r_readdir_cache_idx = 0; 1498 req->r_readdir_cache_idx = 0;
1434 } 1499 }
1435 1500
1436 cache_ctl.index = req->r_readdir_cache_idx; 1501 cache_ctl.index = req->r_readdir_cache_idx;
1502 fpos_offset = req->r_readdir_offset;
1437 1503
1438 /* FIXME: release caps/leases if error occurs */ 1504 /* FIXME: release caps/leases if error occurs */
1439 for (i = 0; i < rinfo->dir_nr; i++) { 1505 for (i = 0; i < rinfo->dir_nr; i++) {
1506 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
1440 struct ceph_vino vino; 1507 struct ceph_vino vino;
1441 1508
1442 dname.name = rinfo->dir_dname[i]; 1509 dname.name = rde->name;
1443 dname.len = rinfo->dir_dname_len[i]; 1510 dname.len = rde->name_len;
1444 dname.hash = full_name_hash(dname.name, dname.len); 1511 dname.hash = full_name_hash(dname.name, dname.len);
1445 1512
1446 vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); 1513 vino.ino = le64_to_cpu(rde->inode.in->ino);
1447 vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); 1514 vino.snap = le64_to_cpu(rde->inode.in->snapid);
1515
1516 if (rinfo->hash_order) {
1517 u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
1518 rde->name, rde->name_len);
1519 hash = ceph_frag_value(hash);
1520 if (hash != last_hash)
1521 fpos_offset = 2;
1522 last_hash = hash;
1523 rde->offset = ceph_make_fpos(hash, fpos_offset++, true);
1524 } else {
1525 rde->offset = ceph_make_fpos(frag, fpos_offset++, false);
1526 }
1448 1527
1449retry_lookup: 1528retry_lookup:
1450 dn = d_lookup(parent, &dname); 1529 dn = d_lookup(parent, &dname);
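With hash_order set, the name's 24-bit hash supplies the high part of each offset and fpos_offset restarts at 2 whenever the hash changes, so colliding names occupy consecutive low slots. For hypothetical hashes H("a") = H("b") = 0x1234 and H("c") = 0x5678, the three entries get (0xff << 52) | (0x1234 << 28) | 2, the same with | 3, and (0xff << 52) | (0x5678 << 28) | 2.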
@@ -1490,7 +1569,7 @@ retry_lookup:
1490 } 1569 }
1491 } 1570 }
1492 1571
1493 ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, 1572 ret = fill_inode(in, NULL, &rde->inode, NULL, session,
1494 req->r_request_started, -1, 1573 req->r_request_started, -1,
1495 &req->r_caps_reservation); 1574 &req->r_caps_reservation);
1496 if (ret < 0) { 1575 if (ret < 0) {
@@ -1523,11 +1602,9 @@ retry_lookup:
1523 dn = realdn; 1602 dn = realdn;
1524 } 1603 }
1525 1604
1526 di = dn->d_fsdata; 1605 ceph_dentry(dn)->offset = rde->offset;
1527 di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
1528 1606
1529 update_dentry_lease(dn, rinfo->dir_dlease[i], 1607 update_dentry_lease(dn, rde->lease, req->r_session,
1530 req->r_session,
1531 req->r_request_started); 1608 req->r_request_started);
1532 1609
1533 if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { 1610 if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
@@ -1562,7 +1639,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
1562 spin_lock(&ci->i_ceph_lock); 1639 spin_lock(&ci->i_ceph_lock);
1563 dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); 1640 dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
1564 i_size_write(inode, size); 1641 i_size_write(inode, size);
1565 inode->i_blocks = (size + (1 << 9) - 1) >> 9; 1642 inode->i_blocks = calc_inode_blocks(size);
1566 1643
1567 /* tell the MDS if we are approaching max_size */ 1644 /* tell the MDS if we are approaching max_size */
1568 if ((size << 1) >= ci->i_max_size && 1645 if ((size << 1) >= ci->i_max_size &&
@@ -1624,10 +1701,21 @@ static void ceph_invalidate_work(struct work_struct *work)
1624 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, 1701 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1625 i_pg_inv_work); 1702 i_pg_inv_work);
1626 struct inode *inode = &ci->vfs_inode; 1703 struct inode *inode = &ci->vfs_inode;
1704 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1627 u32 orig_gen; 1705 u32 orig_gen;
1628 int check = 0; 1706 int check = 0;
1629 1707
1630 mutex_lock(&ci->i_truncate_mutex); 1708 mutex_lock(&ci->i_truncate_mutex);
1709
1710 if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
1711 pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n",
1712 inode, ceph_ino(inode));
1713 mapping_set_error(inode->i_mapping, -EIO);
1714 truncate_pagecache(inode, 0);
1715 mutex_unlock(&ci->i_truncate_mutex);
1716 goto out;
1717 }
1718
1631 spin_lock(&ci->i_ceph_lock); 1719 spin_lock(&ci->i_ceph_lock);
1632 dout("invalidate_pages %p gen %d revoking %d\n", inode, 1720 dout("invalidate_pages %p gen %d revoking %d\n", inode,
1633 ci->i_rdcache_gen, ci->i_rdcache_revoking); 1721 ci->i_rdcache_gen, ci->i_rdcache_revoking);
@@ -1641,7 +1729,9 @@ static void ceph_invalidate_work(struct work_struct *work)
1641 orig_gen = ci->i_rdcache_gen; 1729 orig_gen = ci->i_rdcache_gen;
1642 spin_unlock(&ci->i_ceph_lock); 1730 spin_unlock(&ci->i_ceph_lock);
1643 1731
1644 truncate_pagecache(inode, 0); 1732 if (invalidate_inode_pages2(inode->i_mapping) < 0) {
1733 pr_err("invalidate_pages %p fails\n", inode);
1734 }
1645 1735
1646 spin_lock(&ci->i_ceph_lock); 1736 spin_lock(&ci->i_ceph_lock);
1647 if (orig_gen == ci->i_rdcache_gen && 1737 if (orig_gen == ci->i_rdcache_gen &&
@@ -1920,8 +2010,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
1920 if ((issued & CEPH_CAP_FILE_EXCL) && 2010 if ((issued & CEPH_CAP_FILE_EXCL) &&
1921 attr->ia_size > inode->i_size) { 2011 attr->ia_size > inode->i_size) {
1922 i_size_write(inode, attr->ia_size); 2012 i_size_write(inode, attr->ia_size);
1923 inode->i_blocks = 2013 inode->i_blocks = calc_inode_blocks(attr->ia_size);
1924 (attr->ia_size + (1 << 9) - 1) >> 9;
1925 inode->i_ctime = attr->ia_ctime; 2014 inode->i_ctime = attr->ia_ctime;
1926 ci->i_reported_size = attr->ia_size; 2015 ci->i_reported_size = attr->ia_size;
1927 dirtied |= CEPH_CAP_FILE_EXCL; 2016 dirtied |= CEPH_CAP_FILE_EXCL;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index f851d8d70158..be6b1657b1af 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -193,12 +193,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
193 if (copy_from_user(&dl, arg, sizeof(dl))) 193 if (copy_from_user(&dl, arg, sizeof(dl)))
194 return -EFAULT; 194 return -EFAULT;
195 195
196 down_read(&osdc->map_sem); 196 down_read(&osdc->lock);
197 r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, 197 r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
198 &dl.object_no, &dl.object_offset, 198 &dl.object_no, &dl.object_offset,
199 &olen); 199 &olen);
200 if (r < 0) { 200 if (r < 0) {
201 up_read(&osdc->map_sem); 201 up_read(&osdc->lock);
202 return -EIO; 202 return -EIO;
203 } 203 }
204 dl.file_offset -= dl.object_offset; 204 dl.file_offset -= dl.object_offset;
@@ -213,15 +213,15 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
213 ceph_ino(inode), dl.object_no); 213 ceph_ino(inode), dl.object_no);
214 214
215 oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); 215 oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
216 ceph_oid_set_name(&oid, dl.object_name); 216 ceph_oid_printf(&oid, "%s", dl.object_name);
217 217
218 r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); 218 r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid);
219 if (r < 0) { 219 if (r < 0) {
220 up_read(&osdc->map_sem); 220 up_read(&osdc->lock);
221 return r; 221 return r;
222 } 222 }
223 223
224 dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); 224 dl.osd = ceph_pg_to_acting_primary(osdc->osdmap, &pgid);
225 if (dl.osd >= 0) { 225 if (dl.osd >= 0) {
226 struct ceph_entity_addr *a = 226 struct ceph_entity_addr *a =
227 ceph_osd_addr(osdc->osdmap, dl.osd); 227 ceph_osd_addr(osdc->osdmap, dl.osd);
@@ -230,7 +230,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
230 } else { 230 } else {
231 memset(&dl.osd_addr, 0, sizeof(dl.osd_addr)); 231 memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
232 } 232 }
233 up_read(&osdc->map_sem); 233 up_read(&osdc->lock);
234 234
235 /* send result back to user */ 235 /* send result back to user */
236 if (copy_to_user(arg, &dl, sizeof(dl))) 236 if (copy_to_user(arg, &dl, sizeof(dl)))
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 85b8517f17a0..2103b823bec0 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -181,17 +181,18 @@ static int parse_reply_info_dir(void **p, void *end,
181 181
182 ceph_decode_need(p, end, sizeof(num) + 2, bad); 182 ceph_decode_need(p, end, sizeof(num) + 2, bad);
183 num = ceph_decode_32(p); 183 num = ceph_decode_32(p);
184 info->dir_end = ceph_decode_8(p); 184 {
185 info->dir_complete = ceph_decode_8(p); 185 u16 flags = ceph_decode_16(p);
186 info->dir_end = !!(flags & CEPH_READDIR_FRAG_END);
187 info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
188 info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER);
189 }
186 if (num == 0) 190 if (num == 0)
187 goto done; 191 goto done;
188 192
189 BUG_ON(!info->dir_in); 193 BUG_ON(!info->dir_entries);
190 info->dir_dname = (void *)(info->dir_in + num); 194 if ((unsigned long)(info->dir_entries + num) >
191 info->dir_dname_len = (void *)(info->dir_dname + num); 195 (unsigned long)info->dir_entries + info->dir_buf_size) {
192 info->dir_dlease = (void *)(info->dir_dname_len + num);
193 if ((unsigned long)(info->dir_dlease + num) >
194 (unsigned long)info->dir_in + info->dir_buf_size) {
195 pr_err("dir contents are larger than expected\n"); 196 pr_err("dir contents are larger than expected\n");
196 WARN_ON(1); 197 WARN_ON(1);
197 goto bad; 198 goto bad;
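The two old u8 fields are re-read as one little-endian u16 bitfield; this stays wire-compatible because the end and complete bits land on the same byte positions the old fields occupied, while freeing bits for new flags such as hash order. A sketch of the decode, with the bit values assumed to match what this series defines in include/linux/ceph/ceph_fs.h:

    /* assumed bit layout: end in byte 0, complete/hash_order in byte 1 */
    #define CEPH_READDIR_FRAG_END      (1 << 0)
    #define CEPH_READDIR_FRAG_COMPLETE (1 << 8)
    #define CEPH_READDIR_HASH_ORDER    (1 << 9)

    struct dir_flags { int end, complete, hash_order; };

    /* flags: host-endian value of the le16 read off the wire */
    static struct dir_flags decode_dir_flags(unsigned int flags)
    {
            struct dir_flags f = {
                    .end        = !!(flags & CEPH_READDIR_FRAG_END),
                    .complete   = !!(flags & CEPH_READDIR_FRAG_COMPLETE),
                    .hash_order = !!(flags & CEPH_READDIR_HASH_ORDER),
            };
            return f;
    }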
@@ -199,21 +200,23 @@ static int parse_reply_info_dir(void **p, void *end,
199 200
200 info->dir_nr = num; 201 info->dir_nr = num;
201 while (num) { 202 while (num) {
203 struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
202 /* dentry */ 204 /* dentry */
203 ceph_decode_need(p, end, sizeof(u32)*2, bad); 205 ceph_decode_need(p, end, sizeof(u32)*2, bad);
204 info->dir_dname_len[i] = ceph_decode_32(p); 206 rde->name_len = ceph_decode_32(p);
205 ceph_decode_need(p, end, info->dir_dname_len[i], bad); 207 ceph_decode_need(p, end, rde->name_len, bad);
206 info->dir_dname[i] = *p; 208 rde->name = *p;
207 *p += info->dir_dname_len[i]; 209 *p += rde->name_len;
208 dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i], 210 dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name);
209 info->dir_dname[i]); 211 rde->lease = *p;
210 info->dir_dlease[i] = *p;
211 *p += sizeof(struct ceph_mds_reply_lease); 212 *p += sizeof(struct ceph_mds_reply_lease);
212 213
213 /* inode */ 214 /* inode */
214 err = parse_reply_info_in(p, end, &info->dir_in[i], features); 215 err = parse_reply_info_in(p, end, &rde->inode, features);
215 if (err < 0) 216 if (err < 0)
216 goto out_bad; 217 goto out_bad;
218 /* ceph_readdir_prepopulate() will update it */
219 rde->offset = 0;
217 i++; 220 i++;
218 num--; 221 num--;
219 } 222 }
@@ -345,9 +348,9 @@ out_bad:
345 348
346static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) 349static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
347{ 350{
348 if (!info->dir_in) 351 if (!info->dir_entries)
349 return; 352 return;
350 free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size)); 353 free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
351} 354}
352 355
353 356
@@ -567,51 +570,23 @@ void ceph_mdsc_release_request(struct kref *kref)
567 kfree(req); 570 kfree(req);
568} 571}
569 572
573DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)
574
570/* 575/*
571 * lookup session, bump ref if found. 576 * lookup session, bump ref if found.
572 * 577 *
573 * called under mdsc->mutex. 578 * called under mdsc->mutex.
574 */ 579 */
575static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc, 580static struct ceph_mds_request *
576 u64 tid) 581lookup_get_request(struct ceph_mds_client *mdsc, u64 tid)
577{ 582{
578 struct ceph_mds_request *req; 583 struct ceph_mds_request *req;
579 struct rb_node *n = mdsc->request_tree.rb_node;
580
581 while (n) {
582 req = rb_entry(n, struct ceph_mds_request, r_node);
583 if (tid < req->r_tid)
584 n = n->rb_left;
585 else if (tid > req->r_tid)
586 n = n->rb_right;
587 else {
588 ceph_mdsc_get_request(req);
589 return req;
590 }
591 }
592 return NULL;
593}
594 584
595static void __insert_request(struct ceph_mds_client *mdsc, 585 req = lookup_request(&mdsc->request_tree, tid);
596 struct ceph_mds_request *new) 586 if (req)
597{ 587 ceph_mdsc_get_request(req);
598 struct rb_node **p = &mdsc->request_tree.rb_node;
599 struct rb_node *parent = NULL;
600 struct ceph_mds_request *req = NULL;
601 588
602 while (*p) { 589 return req;
603 parent = *p;
604 req = rb_entry(parent, struct ceph_mds_request, r_node);
605 if (new->r_tid < req->r_tid)
606 p = &(*p)->rb_left;
607 else if (new->r_tid > req->r_tid)
608 p = &(*p)->rb_right;
609 else
610 BUG();
611 }
612
613 rb_link_node(&new->r_node, parent, p);
614 rb_insert_color(&new->r_node, &mdsc->request_tree);
615} 590}
616 591
617/* 592/*
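DEFINE_RB_FUNCS comes from include/linux/ceph/libceph.h (touched elsewhere in this series, per the diffstat) and stamps out the per-tree rbtree boilerplate from a key field and a node field. Roughly, a simplified kernel-context sketch of what the request instantiation expands to (assumes <linux/rbtree.h>; not the macro's exact text):

    static void insert_request(struct rb_root *root, struct ceph_mds_request *new)
    {
            struct rb_node **n = &root->rb_node, *parent = NULL;

            while (*n) {
                    struct ceph_mds_request *cur =
                            rb_entry(*n, struct ceph_mds_request, r_node);
                    parent = *n;
                    if (new->r_tid < cur->r_tid)
                            n = &(*n)->rb_left;
                    else if (new->r_tid > cur->r_tid)
                            n = &(*n)->rb_right;
                    else
                            BUG();        /* duplicate tid */
            }
            rb_link_node(&new->r_node, parent, n);
            rb_insert_color(&new->r_node, root);
    }

    static struct ceph_mds_request *lookup_request(struct rb_root *root, u64 tid)
    {
            struct rb_node *n = root->rb_node;

            while (n) {
                    struct ceph_mds_request *cur =
                            rb_entry(n, struct ceph_mds_request, r_node);
                    if (tid < cur->r_tid)
                            n = n->rb_left;
                    else if (tid > cur->r_tid)
                            n = n->rb_right;
                    else
                            return cur;
            }
            return NULL;
    }

    static void erase_request(struct rb_root *root, struct ceph_mds_request *req)
    {
            rb_erase(&req->r_node, root);
            RB_CLEAR_NODE(&req->r_node);
    }

The RB_CLEAR_NODE() added to ceph_mdsc_create_request() in a later hunk initializes r_node to the same cleared state erase_request() leaves behind.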
@@ -630,7 +605,7 @@ static void __register_request(struct ceph_mds_client *mdsc,
630 req->r_num_caps); 605 req->r_num_caps);
631 dout("__register_request %p tid %lld\n", req, req->r_tid); 606 dout("__register_request %p tid %lld\n", req, req->r_tid);
632 ceph_mdsc_get_request(req); 607 ceph_mdsc_get_request(req);
633 __insert_request(mdsc, req); 608 insert_request(&mdsc->request_tree, req);
634 609
635 req->r_uid = current_fsuid(); 610 req->r_uid = current_fsuid();
636 req->r_gid = current_fsgid(); 611 req->r_gid = current_fsgid();
@@ -663,8 +638,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
663 } 638 }
664 } 639 }
665 640
666 rb_erase(&req->r_node, &mdsc->request_tree); 641 erase_request(&mdsc->request_tree, req);
667 RB_CLEAR_NODE(&req->r_node);
668 642
669 if (req->r_unsafe_dir && req->r_got_unsafe) { 643 if (req->r_unsafe_dir && req->r_got_unsafe) {
670 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); 644 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
@@ -868,12 +842,14 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
868 int metadata_bytes = 0; 842 int metadata_bytes = 0;
869 int metadata_key_count = 0; 843 int metadata_key_count = 0;
870 struct ceph_options *opt = mdsc->fsc->client->options; 844 struct ceph_options *opt = mdsc->fsc->client->options;
845 struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
871 void *p; 846 void *p;
872 847
873 const char* metadata[][2] = { 848 const char* metadata[][2] = {
874 {"hostname", utsname()->nodename}, 849 {"hostname", utsname()->nodename},
875 {"kernel_version", utsname()->release}, 850 {"kernel_version", utsname()->release},
876 {"entity_id", opt->name ? opt->name : ""}, 851 {"entity_id", opt->name ? : ""},
852 {"root", fsopt->server_path ? : "/"},
877 {NULL, NULL} 853 {NULL, NULL}
878 }; 854 };
879 855
@@ -1149,9 +1125,11 @@ out:
1149static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, 1125static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1150 void *arg) 1126 void *arg)
1151{ 1127{
1128 struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg;
1152 struct ceph_inode_info *ci = ceph_inode(inode); 1129 struct ceph_inode_info *ci = ceph_inode(inode);
1153 LIST_HEAD(to_remove); 1130 LIST_HEAD(to_remove);
1154 int drop = 0; 1131 bool drop = false;
1132 bool invalidate = false;
1155 1133
1156 dout("removing cap %p, ci is %p, inode is %p\n", 1134 dout("removing cap %p, ci is %p, inode is %p\n",
1157 cap, ci, &ci->vfs_inode); 1135 cap, ci, &ci->vfs_inode);
@@ -1159,8 +1137,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1159 __ceph_remove_cap(cap, false); 1137 __ceph_remove_cap(cap, false);
1160 if (!ci->i_auth_cap) { 1138 if (!ci->i_auth_cap) {
1161 struct ceph_cap_flush *cf; 1139 struct ceph_cap_flush *cf;
1162 struct ceph_mds_client *mdsc = 1140 struct ceph_mds_client *mdsc = fsc->mdsc;
1163 ceph_sb_to_client(inode->i_sb)->mdsc; 1141
1142 ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
1143
1144 if (ci->i_wrbuffer_ref > 0 &&
1145 ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
1146 invalidate = true;
1164 1147
1165 while (true) { 1148 while (true) {
1166 struct rb_node *n = rb_first(&ci->i_cap_flush_tree); 1149 struct rb_node *n = rb_first(&ci->i_cap_flush_tree);
@@ -1183,7 +1166,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1183 inode, ceph_ino(inode)); 1166 inode, ceph_ino(inode));
1184 ci->i_dirty_caps = 0; 1167 ci->i_dirty_caps = 0;
1185 list_del_init(&ci->i_dirty_item); 1168 list_del_init(&ci->i_dirty_item);
1186 drop = 1; 1169 drop = true;
1187 } 1170 }
1188 if (!list_empty(&ci->i_flushing_item)) { 1171 if (!list_empty(&ci->i_flushing_item)) {
1189 pr_warn_ratelimited( 1172 pr_warn_ratelimited(
@@ -1193,7 +1176,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1193 ci->i_flushing_caps = 0; 1176 ci->i_flushing_caps = 0;
1194 list_del_init(&ci->i_flushing_item); 1177 list_del_init(&ci->i_flushing_item);
1195 mdsc->num_cap_flushing--; 1178 mdsc->num_cap_flushing--;
1196 drop = 1; 1179 drop = true;
1197 } 1180 }
1198 spin_unlock(&mdsc->cap_dirty_lock); 1181 spin_unlock(&mdsc->cap_dirty_lock);
1199 1182
@@ -1210,7 +1193,11 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1210 list_del(&cf->list); 1193 list_del(&cf->list);
1211 ceph_free_cap_flush(cf); 1194 ceph_free_cap_flush(cf);
1212 } 1195 }
1213 while (drop--) 1196
1197 wake_up_all(&ci->i_cap_wq);
1198 if (invalidate)
1199 ceph_queue_invalidate(inode);
1200 if (drop)
1214 iput(inode); 1201 iput(inode);
1215 return 0; 1202 return 0;
1216} 1203}
@@ -1220,12 +1207,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1220 */ 1207 */
1221static void remove_session_caps(struct ceph_mds_session *session) 1208static void remove_session_caps(struct ceph_mds_session *session)
1222{ 1209{
1210 struct ceph_fs_client *fsc = session->s_mdsc->fsc;
1211 struct super_block *sb = fsc->sb;
1223 dout("remove_session_caps on %p\n", session); 1212 dout("remove_session_caps on %p\n", session);
1224 iterate_session_caps(session, remove_session_caps_cb, NULL); 1213 iterate_session_caps(session, remove_session_caps_cb, fsc);
1225 1214
1226 spin_lock(&session->s_cap_lock); 1215 spin_lock(&session->s_cap_lock);
1227 if (session->s_nr_caps > 0) { 1216 if (session->s_nr_caps > 0) {
1228 struct super_block *sb = session->s_mdsc->fsc->sb;
1229 struct inode *inode; 1217 struct inode *inode;
1230 struct ceph_cap *cap, *prev = NULL; 1218 struct ceph_cap *cap, *prev = NULL;
1231 struct ceph_vino vino; 1219 struct ceph_vino vino;
@@ -1270,13 +1258,13 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
1270{ 1258{
1271 struct ceph_inode_info *ci = ceph_inode(inode); 1259 struct ceph_inode_info *ci = ceph_inode(inode);
1272 1260
1273 wake_up_all(&ci->i_cap_wq);
1274 if (arg) { 1261 if (arg) {
1275 spin_lock(&ci->i_ceph_lock); 1262 spin_lock(&ci->i_ceph_lock);
1276 ci->i_wanted_max_size = 0; 1263 ci->i_wanted_max_size = 0;
1277 ci->i_requested_max_size = 0; 1264 ci->i_requested_max_size = 0;
1278 spin_unlock(&ci->i_ceph_lock); 1265 spin_unlock(&ci->i_ceph_lock);
1279 } 1266 }
1267 wake_up_all(&ci->i_cap_wq);
1280 return 0; 1268 return 0;
1281} 1269}
1282 1270
@@ -1671,8 +1659,7 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
1671 struct ceph_inode_info *ci = ceph_inode(dir); 1659 struct ceph_inode_info *ci = ceph_inode(dir);
1672 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; 1660 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1673 struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; 1661 struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
1674 size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) + 1662 size_t size = sizeof(struct ceph_mds_reply_dir_entry);
1675 sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease);
1676 int order, num_entries; 1663 int order, num_entries;
1677 1664
1678 spin_lock(&ci->i_ceph_lock); 1665 spin_lock(&ci->i_ceph_lock);
@@ -1683,14 +1670,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
1683 1670
1684 order = get_order(size * num_entries); 1671 order = get_order(size * num_entries);
1685 while (order >= 0) { 1672 while (order >= 0) {
1686 rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL | 1673 rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL |
1687 __GFP_NOWARN, 1674 __GFP_NOWARN,
1688 order); 1675 order);
1689 if (rinfo->dir_in) 1676 if (rinfo->dir_entries)
1690 break; 1677 break;
1691 order--; 1678 order--;
1692 } 1679 }
1693 if (!rinfo->dir_in) 1680 if (!rinfo->dir_entries)
1694 return -ENOMEM; 1681 return -ENOMEM;
1695 1682
1696 num_entries = (PAGE_SIZE << order) / size; 1683 num_entries = (PAGE_SIZE << order) / size;
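
The hunk above sizes the readdir reply buffer from a single stride, sizeof(struct ceph_mds_reply_dir_entry), instead of summing four parallel-array strides, then retries at smaller page orders until an allocation succeeds. A minimal standalone sketch of that sizing logic, assuming a 64-byte entry struct and modeling the kernel's get_order() for illustration:

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* toy stand-in for the kernel's get_order(): smallest order such
 * that (PAGE_SIZE << order) covers len */
static int get_order(unsigned long len)
{
        int order = 0;

        while ((PAGE_SIZE << order) < len)
                order++;
        return order;
}

int main(void)
{
        unsigned long size = 64; /* assumed sizeof(struct ceph_mds_reply_dir_entry) */
        int num_entries = 1024;  /* requested readdir batch size */
        int order = get_order(size * num_entries);

        /* the kernel walks order downward on allocation failure; whatever
         * order is actually obtained caps the usable entry count */
        num_entries = (PAGE_SIZE << order) / size;
        printf("order %d -> %d entries per reply buffer\n", order, num_entries);
        return 0;
}
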
@@ -1722,6 +1709,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1722 INIT_LIST_HEAD(&req->r_unsafe_target_item); 1709 INIT_LIST_HEAD(&req->r_unsafe_target_item);
1723 req->r_fmode = -1; 1710 req->r_fmode = -1;
1724 kref_init(&req->r_kref); 1711 kref_init(&req->r_kref);
1712 RB_CLEAR_NODE(&req->r_node);
1725 INIT_LIST_HEAD(&req->r_wait); 1713 INIT_LIST_HEAD(&req->r_wait);
1726 init_completion(&req->r_completion); 1714 init_completion(&req->r_completion);
1727 init_completion(&req->r_safe_completion); 1715 init_completion(&req->r_safe_completion);
@@ -2414,7 +2402,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2414 /* get request, session */ 2402 /* get request, session */
2415 tid = le64_to_cpu(msg->hdr.tid); 2403 tid = le64_to_cpu(msg->hdr.tid);
2416 mutex_lock(&mdsc->mutex); 2404 mutex_lock(&mdsc->mutex);
2417 req = __lookup_request(mdsc, tid); 2405 req = lookup_get_request(mdsc, tid);
2418 if (!req) { 2406 if (!req) {
2419 dout("handle_reply on unknown tid %llu\n", tid); 2407 dout("handle_reply on unknown tid %llu\n", tid);
2420 mutex_unlock(&mdsc->mutex); 2408 mutex_unlock(&mdsc->mutex);
@@ -2604,7 +2592,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
2604 fwd_seq = ceph_decode_32(&p); 2592 fwd_seq = ceph_decode_32(&p);
2605 2593
2606 mutex_lock(&mdsc->mutex); 2594 mutex_lock(&mdsc->mutex);
2607 req = __lookup_request(mdsc, tid); 2595 req = lookup_get_request(mdsc, tid);
2608 if (!req) { 2596 if (!req) {
2609 dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); 2597 dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
2610 goto out; /* dup reply? */ 2598 goto out; /* dup reply? */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index ee69a537dba5..e7d38aac7109 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -47,6 +47,14 @@ struct ceph_mds_reply_info_in {
47 u32 pool_ns_len; 47 u32 pool_ns_len;
48}; 48};
49 49
50struct ceph_mds_reply_dir_entry {
51 char *name;
52 u32 name_len;
53 struct ceph_mds_reply_lease *lease;
54 struct ceph_mds_reply_info_in inode;
55 loff_t offset;
56};
57
50/* 58/*
51 * parsed info about an mds reply, including information about 59 * parsed info about an mds reply, including information about
52 * either: 1) the target inode and/or its parent directory and dentry, 60 * either: 1) the target inode and/or its parent directory and dentry,
@@ -73,11 +81,10 @@ struct ceph_mds_reply_info_parsed {
73 struct ceph_mds_reply_dirfrag *dir_dir; 81 struct ceph_mds_reply_dirfrag *dir_dir;
74 size_t dir_buf_size; 82 size_t dir_buf_size;
75 int dir_nr; 83 int dir_nr;
76 char **dir_dname; 84 bool dir_complete;
77 u32 *dir_dname_len; 85 bool dir_end;
78 struct ceph_mds_reply_lease **dir_dlease; 86 bool hash_order;
79 struct ceph_mds_reply_info_in *dir_in; 87 struct ceph_mds_reply_dir_entry *dir_entries;
80 u8 dir_complete, dir_end;
81 }; 88 };
82 89
83 /* for create results */ 90 /* for create results */
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 261531e55e9d..8c3591a7fbae 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -54,16 +54,21 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
54 const void *start = *p; 54 const void *start = *p;
55 int i, j, n; 55 int i, j, n;
56 int err = -EINVAL; 56 int err = -EINVAL;
57 u16 version; 57 u8 mdsmap_v, mdsmap_cv;
58 58
59 m = kzalloc(sizeof(*m), GFP_NOFS); 59 m = kzalloc(sizeof(*m), GFP_NOFS);
60 if (m == NULL) 60 if (m == NULL)
61 return ERR_PTR(-ENOMEM); 61 return ERR_PTR(-ENOMEM);
62 62
63 ceph_decode_16_safe(p, end, version, bad); 63 ceph_decode_need(p, end, 1 + 1, bad);
64 if (version > 3) { 64 mdsmap_v = ceph_decode_8(p);
65 pr_warn("got mdsmap version %d > 3, failing", version); 65 mdsmap_cv = ceph_decode_8(p);
66 goto bad; 66 if (mdsmap_v >= 4) {
67 u32 mdsmap_len;
68 ceph_decode_32_safe(p, end, mdsmap_len, bad);
69 if (end < *p + mdsmap_len)
70 goto bad;
71 end = *p + mdsmap_len;
67 } 72 }
68 73
69 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); 74 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
@@ -87,16 +92,29 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
87 u32 namelen; 92 u32 namelen;
88 s32 mds, inc, state; 93 s32 mds, inc, state;
89 u64 state_seq; 94 u64 state_seq;
90 u8 infoversion; 95 u8 info_v;
96 void *info_end = NULL;
91 struct ceph_entity_addr addr; 97 struct ceph_entity_addr addr;
92 u32 num_export_targets; 98 u32 num_export_targets;
93 void *pexport_targets = NULL; 99 void *pexport_targets = NULL;
94 struct ceph_timespec laggy_since; 100 struct ceph_timespec laggy_since;
95 struct ceph_mds_info *info; 101 struct ceph_mds_info *info;
96 102
97 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); 103 ceph_decode_need(p, end, sizeof(u64) + 1, bad);
98 global_id = ceph_decode_64(p); 104 global_id = ceph_decode_64(p);
 99 infoversion = ceph_decode_8(p); 105 info_v = ceph_decode_8(p);
106 if (info_v >= 4) {
107 u32 info_len;
108 u8 info_cv;
109 ceph_decode_need(p, end, 1 + sizeof(u32), bad);
110 info_cv = ceph_decode_8(p);
111 info_len = ceph_decode_32(p);
112 info_end = *p + info_len;
113 if (info_end > end)
114 goto bad;
115 }
116
117 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
100 *p += sizeof(u64); 118 *p += sizeof(u64);
101 namelen = ceph_decode_32(p); /* skip mds name */ 119 namelen = ceph_decode_32(p); /* skip mds name */
102 *p += namelen; 120 *p += namelen;
@@ -115,7 +133,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
115 *p += sizeof(u32); 133 *p += sizeof(u32);
116 ceph_decode_32_safe(p, end, namelen, bad); 134 ceph_decode_32_safe(p, end, namelen, bad);
117 *p += namelen; 135 *p += namelen;
118 if (infoversion >= 2) { 136 if (info_v >= 2) {
119 ceph_decode_32_safe(p, end, num_export_targets, bad); 137 ceph_decode_32_safe(p, end, num_export_targets, bad);
120 pexport_targets = *p; 138 pexport_targets = *p;
121 *p += num_export_targets * sizeof(u32); 139 *p += num_export_targets * sizeof(u32);
@@ -123,6 +141,12 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
123 num_export_targets = 0; 141 num_export_targets = 0;
124 } 142 }
125 143
144 if (info_end && *p != info_end) {
145 if (*p > info_end)
146 goto bad;
147 *p = info_end;
148 }
149
126 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", 150 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
127 i+1, n, global_id, mds, inc, 151 i+1, n, global_id, mds, inc,
128 ceph_pr_addr(&addr.in_addr), 152 ceph_pr_addr(&addr.in_addr),
@@ -163,6 +187,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
163 m->m_cas_pg_pool = ceph_decode_64(p); 187 m->m_cas_pg_pool = ceph_decode_64(p);
164 188
165 /* ok, we don't care about the rest. */ 189 /* ok, we don't care about the rest. */
190 *p = end;
166 dout("mdsmap_decode success epoch %u\n", m->m_epoch); 191 dout("mdsmap_decode success epoch %u\n", m->m_epoch);
167 return m; 192 return m;
168 193
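
fs/ceph/mdsmap.c now reads a (version, compat version) byte pair up front and, for v4+ encodings, a 32-bit payload length that bounds the rest of the map; each per-MDS info blob gets the same struct_v/struct_cv/struct_len framing, and fields added by newer servers are skipped by clamping *p to the recorded end. A standalone sketch of that skip-unknown-tail pattern, with simplified framing (always length-prefixed) in place of the ceph_decode_* helpers:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* decode one length-framed, versioned blob; 0 on success, -1 if truncated */
static int decode_versioned(const uint8_t **p, const uint8_t *end)
{
        uint8_t v, cv;
        uint32_t len;

        if (end - *p < 2 + 4)
                return -1;
        v = *(*p)++;                    /* struct_v */
        cv = *(*p)++;                   /* struct_cv */
        memcpy(&len, *p, 4);            /* struct_len, little-endian host assumed */
        *p += 4;
        if ((uint32_t)(end - *p) < len)
                return -1;              /* truncated encoding */

        /* ... parse only the fields this decoder understands, gated on v/cv ... */
        (void)v;
        (void)cv;

        *p += len;                      /* skip whatever newer versions appended */
        return 0;
}

int main(void)
{
        /* v=4, cv=1, len=3, then three payload bytes */
        const uint8_t buf[] = { 4, 1, 3, 0, 0, 0, 0xaa, 0xbb, 0xcc };
        const uint8_t *p = buf;

        printf("ret %d, consumed %zu of %zu bytes\n",
               decode_versioned(&p, buf + sizeof(buf)),
               (size_t)(p - buf), sizeof(buf));
        return 0;
}
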
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index f12d5e2955c2..91e02481ce06 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -108,6 +108,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
108 * mount options 108 * mount options
109 */ 109 */
110enum { 110enum {
111 Opt_mds_namespace,
111 Opt_wsize, 112 Opt_wsize,
112 Opt_rsize, 113 Opt_rsize,
113 Opt_rasize, 114 Opt_rasize,
@@ -143,6 +144,7 @@ enum {
143}; 144};
144 145
145static match_table_t fsopt_tokens = { 146static match_table_t fsopt_tokens = {
147 {Opt_mds_namespace, "mds_namespace=%d"},
146 {Opt_wsize, "wsize=%d"}, 148 {Opt_wsize, "wsize=%d"},
147 {Opt_rsize, "rsize=%d"}, 149 {Opt_rsize, "rsize=%d"},
148 {Opt_rasize, "rasize=%d"}, 150 {Opt_rasize, "rasize=%d"},
@@ -212,6 +214,9 @@ static int parse_fsopt_token(char *c, void *private)
212 break; 214 break;
213 215
214 /* misc */ 216 /* misc */
217 case Opt_mds_namespace:
218 fsopt->mds_namespace = intval;
219 break;
215 case Opt_wsize: 220 case Opt_wsize:
216 fsopt->wsize = intval; 221 fsopt->wsize = intval;
217 break; 222 break;
@@ -297,6 +302,7 @@ static void destroy_mount_options(struct ceph_mount_options *args)
297{ 302{
298 dout("destroy_mount_options %p\n", args); 303 dout("destroy_mount_options %p\n", args);
299 kfree(args->snapdir_name); 304 kfree(args->snapdir_name);
305 kfree(args->server_path);
300 kfree(args); 306 kfree(args);
301} 307}
302 308
@@ -328,14 +334,17 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
328 if (ret) 334 if (ret)
329 return ret; 335 return ret;
330 336
337 ret = strcmp_null(fsopt1->server_path, fsopt2->server_path);
338 if (ret)
339 return ret;
340
331 return ceph_compare_options(new_opt, fsc->client); 341 return ceph_compare_options(new_opt, fsc->client);
332} 342}
333 343
334static int parse_mount_options(struct ceph_mount_options **pfsopt, 344static int parse_mount_options(struct ceph_mount_options **pfsopt,
335 struct ceph_options **popt, 345 struct ceph_options **popt,
336 int flags, char *options, 346 int flags, char *options,
337 const char *dev_name, 347 const char *dev_name)
338 const char **path)
339{ 348{
340 struct ceph_mount_options *fsopt; 349 struct ceph_mount_options *fsopt;
341 const char *dev_name_end; 350 const char *dev_name_end;
@@ -367,6 +376,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
367 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; 376 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
368 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; 377 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
369 fsopt->congestion_kb = default_congestion_kb(); 378 fsopt->congestion_kb = default_congestion_kb();
379 fsopt->mds_namespace = CEPH_FS_CLUSTER_ID_NONE;
370 380
371 /* 381 /*
372 * Distinguish the server list from the path in "dev_name". 382 * Distinguish the server list from the path in "dev_name".
@@ -380,12 +390,13 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
380 */ 390 */
381 dev_name_end = strchr(dev_name, '/'); 391 dev_name_end = strchr(dev_name, '/');
382 if (dev_name_end) { 392 if (dev_name_end) {
383 /* skip over leading '/' for path */ 393 fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL);
384 *path = dev_name_end + 1; 394 if (!fsopt->server_path) {
395 err = -ENOMEM;
396 goto out;
397 }
385 } else { 398 } else {
386 /* path is empty */
387 dev_name_end = dev_name + strlen(dev_name); 399 dev_name_end = dev_name + strlen(dev_name);
388 *path = dev_name_end;
389 } 400 }
390 err = -EINVAL; 401 err = -EINVAL;
391 dev_name_end--; /* back up to ':' separator */ 402 dev_name_end--; /* back up to ':' separator */
@@ -395,7 +406,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
395 goto out; 406 goto out;
396 } 407 }
397 dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); 408 dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
398 dout("server path '%s'\n", *path); 409 if (fsopt->server_path)
410 dout("server path '%s'\n", fsopt->server_path);
399 411
400 *popt = ceph_parse_options(options, dev_name, dev_name_end, 412 *popt = ceph_parse_options(options, dev_name, dev_name_end,
401 parse_fsopt_token, (void *)fsopt); 413 parse_fsopt_token, (void *)fsopt);
@@ -457,6 +469,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
457 seq_puts(m, ",noacl"); 469 seq_puts(m, ",noacl");
458#endif 470#endif
459 471
472 if (fsopt->mds_namespace != CEPH_FS_CLUSTER_ID_NONE)
473 seq_printf(m, ",mds_namespace=%d", fsopt->mds_namespace);
460 if (fsopt->wsize) 474 if (fsopt->wsize)
461 seq_printf(m, ",wsize=%d", fsopt->wsize); 475 seq_printf(m, ",wsize=%d", fsopt->wsize);
462 if (fsopt->rsize != CEPH_RSIZE_DEFAULT) 476 if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
@@ -511,9 +525,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
511{ 525{
512 struct ceph_fs_client *fsc; 526 struct ceph_fs_client *fsc;
513 const u64 supported_features = 527 const u64 supported_features =
514 CEPH_FEATURE_FLOCK | 528 CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH |
515 CEPH_FEATURE_DIRLAYOUTHASH | 529 CEPH_FEATURE_MDSENC | CEPH_FEATURE_MDS_INLINE_DATA;
516 CEPH_FEATURE_MDS_INLINE_DATA;
517 const u64 required_features = 0; 530 const u64 required_features = 0;
518 int page_count; 531 int page_count;
519 size_t size; 532 size_t size;
@@ -530,6 +543,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
530 goto fail; 543 goto fail;
531 } 544 }
532 fsc->client->extra_mon_dispatch = extra_mon_dispatch; 545 fsc->client->extra_mon_dispatch = extra_mon_dispatch;
546 fsc->client->monc.fs_cluster_id = fsopt->mds_namespace;
533 ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true); 547 ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true);
534 548
535 fsc->mount_options = fsopt; 549 fsc->mount_options = fsopt;
@@ -785,8 +799,7 @@ out:
785/* 799/*
786 * mount: join the ceph cluster, and open root directory. 800 * mount: join the ceph cluster, and open root directory.
787 */ 801 */
788static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, 802static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
789 const char *path)
790{ 803{
791 int err; 804 int err;
792 unsigned long started = jiffies; /* note the start time */ 805 unsigned long started = jiffies; /* note the start time */
@@ -815,11 +828,12 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
815 goto fail; 828 goto fail;
816 } 829 }
817 830
818 if (path[0] == 0) { 831 if (!fsc->mount_options->server_path) {
819 root = fsc->sb->s_root; 832 root = fsc->sb->s_root;
820 dget(root); 833 dget(root);
821 } else { 834 } else {
822 dout("mount opening base mountpoint\n"); 835 const char *path = fsc->mount_options->server_path + 1;
836 dout("mount opening path %s\n", path);
823 root = open_root_dentry(fsc, path, started); 837 root = open_root_dentry(fsc, path, started);
824 if (IS_ERR(root)) { 838 if (IS_ERR(root)) {
825 err = PTR_ERR(root); 839 err = PTR_ERR(root);
@@ -935,7 +949,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
935 struct dentry *res; 949 struct dentry *res;
936 int err; 950 int err;
937 int (*compare_super)(struct super_block *, void *) = ceph_compare_super; 951 int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
938 const char *path = NULL;
939 struct ceph_mount_options *fsopt = NULL; 952 struct ceph_mount_options *fsopt = NULL;
940 struct ceph_options *opt = NULL; 953 struct ceph_options *opt = NULL;
941 954
@@ -944,7 +957,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
944#ifdef CONFIG_CEPH_FS_POSIX_ACL 957#ifdef CONFIG_CEPH_FS_POSIX_ACL
945 flags |= MS_POSIXACL; 958 flags |= MS_POSIXACL;
946#endif 959#endif
947 err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); 960 err = parse_mount_options(&fsopt, &opt, flags, data, dev_name);
948 if (err < 0) { 961 if (err < 0) {
949 res = ERR_PTR(err); 962 res = ERR_PTR(err);
950 goto out_final; 963 goto out_final;
@@ -987,7 +1000,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
987 } 1000 }
988 } 1001 }
989 1002
990 res = ceph_real_mount(fsc, path); 1003 res = ceph_real_mount(fsc);
991 if (IS_ERR(res)) 1004 if (IS_ERR(res))
992 goto out_splat; 1005 goto out_splat;
993 dout("root %p inode %p ino %llx.%llx\n", res, 1006 dout("root %p inode %p ino %llx.%llx\n", res,
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 7b99eb756477..0130a8592191 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -62,6 +62,7 @@ struct ceph_mount_options {
62 int cap_release_safety; 62 int cap_release_safety;
 63 int max_readdir; /* max readdir result (entries) */ 63 int max_readdir; /* max readdir result (entries) */
64 int max_readdir_bytes; /* max readdir result (bytes) */ 64 int max_readdir_bytes; /* max readdir result (bytes) */
65 int mds_namespace;
65 66
66 /* 67 /*
67 * everything above this point can be memcmp'd; everything below 68 * everything above this point can be memcmp'd; everything below
@@ -69,6 +70,7 @@ struct ceph_mount_options {
69 */ 70 */
70 71
71 char *snapdir_name; /* default ".snap" */ 72 char *snapdir_name; /* default ".snap" */
73 char *server_path; /* default "/" */
72}; 74};
73 75
74struct ceph_fs_client { 76struct ceph_fs_client {
@@ -295,6 +297,7 @@ struct ceph_inode_info {
295 u64 i_files, i_subdirs; 297 u64 i_files, i_subdirs;
296 298
297 struct rb_root i_fragtree; 299 struct rb_root i_fragtree;
300 int i_fragtree_nsplits;
298 struct mutex i_fragtree_mutex; 301 struct mutex i_fragtree_mutex;
299 302
300 struct ceph_inode_xattrs_info i_xattrs; 303 struct ceph_inode_xattrs_info i_xattrs;
@@ -469,6 +472,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
469#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ 472#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */
470#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ 473#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */
471#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ 474#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */
475#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */
472 476
473static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, 477static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
474 long long release_count, 478 long long release_count,
@@ -537,11 +541,6 @@ static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
537 return (struct ceph_dentry_info *)dentry->d_fsdata; 541 return (struct ceph_dentry_info *)dentry->d_fsdata;
538} 542}
539 543
540static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
541{
542 return ((loff_t)frag << 32) | (loff_t)off;
543}
544
545/* 544/*
546 * caps helpers 545 * caps helpers
547 */ 546 */
@@ -632,7 +631,6 @@ struct ceph_file_info {
632 struct ceph_mds_request *last_readdir; 631 struct ceph_mds_request *last_readdir;
633 632
634 /* readdir: position within a frag */ 633 /* readdir: position within a frag */
635 unsigned offset; /* offset of last chunk, adjusted for . and .. */
636 unsigned next_offset; /* offset of next chunk (last_name's + 1) */ 634 unsigned next_offset; /* offset of next chunk (last_name's + 1) */
637 char *last_name; /* last entry in previous chunk */ 635 char *last_name; /* last entry in previous chunk */
638 long long dir_release_count; 636 long long dir_release_count;
@@ -927,6 +925,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
927/* file.c */ 925/* file.c */
928extern const struct file_operations ceph_file_fops; 926extern const struct file_operations ceph_file_fops;
929 927
928extern int ceph_renew_caps(struct inode *inode);
930extern int ceph_open(struct inode *inode, struct file *file); 929extern int ceph_open(struct inode *inode, struct file *file);
931extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, 930extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
932 struct file *file, unsigned flags, umode_t mode, 931 struct file *file, unsigned flags, umode_t mode,
@@ -942,6 +941,7 @@ extern const struct inode_operations ceph_snapdir_iops;
942extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, 941extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
943 ceph_snapdir_dentry_ops; 942 ceph_snapdir_dentry_ops;
944 943
944extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order);
945extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); 945extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
946extern int ceph_handle_snapdir(struct ceph_mds_request *req, 946extern int ceph_handle_snapdir(struct ceph_mds_request *req,
947 struct dentry *dentry, int err); 947 struct dentry *dentry, int err);
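
ceph_make_fpos() stops being a fixed "(frag << 32) | off" inline helper and becomes a dir.c function that also takes a hash_order flag, since readdir positions can now be ordered by name hash rather than by fragment (see CEPH_READDIR_HASH_ORDER below). The retired encoding, restated as a standalone sketch; the new hash-order bit layout lives in dir.c and is not shown here:

#include <stdio.h>

/* the retired inline helper: fragment in the high 32 bits, offset
 * within that fragment in the low 32 bits */
static long long old_make_fpos(unsigned frag, unsigned off)
{
        return ((long long)frag << 32) | (long long)off;
}

int main(void)
{
        unsigned frag = 0x01000000;     /* a ceph frag_t value */
        unsigned off = 42;

        printf("frag %#x off %u -> fpos %#llx\n",
               frag, off, old_make_fpos(frag, off));
        return 0;
}
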
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 0d66722c6a52..dacc1bd85629 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -77,7 +77,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
77 char buf[128]; 77 char buf[128];
78 78
79 dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); 79 dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
80 down_read(&osdc->map_sem); 80 down_read(&osdc->lock);
81 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); 81 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
82 if (pool_name) { 82 if (pool_name) {
83 size_t len = strlen(pool_name); 83 size_t len = strlen(pool_name);
@@ -109,7 +109,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
109 ret = -ERANGE; 109 ret = -ERANGE;
110 } 110 }
111 } 111 }
112 up_read(&osdc->map_sem); 112 up_read(&osdc->lock);
113 return ret; 113 return ret;
114} 114}
115 115
@@ -143,13 +143,13 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
143 s64 pool = ceph_file_layout_pg_pool(ci->i_layout); 143 s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
144 const char *pool_name; 144 const char *pool_name;
145 145
146 down_read(&osdc->map_sem); 146 down_read(&osdc->lock);
147 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); 147 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
148 if (pool_name) 148 if (pool_name)
149 ret = snprintf(val, size, "%s", pool_name); 149 ret = snprintf(val, size, "%s", pool_name);
150 else 150 else
151 ret = snprintf(val, size, "%lld", (unsigned long long)pool); 151 ret = snprintf(val, size, "%lld", (unsigned long long)pool);
152 up_read(&osdc->map_sem); 152 up_read(&osdc->lock);
153 return ret; 153 return ret;
154} 154}
155 155
@@ -862,6 +862,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
862 struct ceph_mds_request *req; 862 struct ceph_mds_request *req;
863 struct ceph_mds_client *mdsc = fsc->mdsc; 863 struct ceph_mds_client *mdsc = fsc->mdsc;
864 struct ceph_pagelist *pagelist = NULL; 864 struct ceph_pagelist *pagelist = NULL;
865 int op = CEPH_MDS_OP_SETXATTR;
865 int err; 866 int err;
866 867
867 if (size > 0) { 868 if (size > 0) {
@@ -875,20 +876,21 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
875 if (err) 876 if (err)
876 goto out; 877 goto out;
877 } else if (!value) { 878 } else if (!value) {
878 flags |= CEPH_XATTR_REMOVE; 879 if (flags & CEPH_XATTR_REPLACE)
880 op = CEPH_MDS_OP_RMXATTR;
881 else
882 flags |= CEPH_XATTR_REMOVE;
879 } 883 }
880 884
881 dout("setxattr value=%.*s\n", (int)size, value); 885 dout("setxattr value=%.*s\n", (int)size, value);
882 886
883 /* do request */ 887 /* do request */
884 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, 888 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
885 USE_AUTH_MDS);
886 if (IS_ERR(req)) { 889 if (IS_ERR(req)) {
887 err = PTR_ERR(req); 890 err = PTR_ERR(req);
888 goto out; 891 goto out;
889 } 892 }
890 893
891 req->r_args.setxattr.flags = cpu_to_le32(flags);
892 req->r_path2 = kstrdup(name, GFP_NOFS); 894 req->r_path2 = kstrdup(name, GFP_NOFS);
893 if (!req->r_path2) { 895 if (!req->r_path2) {
894 ceph_mdsc_put_request(req); 896 ceph_mdsc_put_request(req);
@@ -896,8 +898,11 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
896 goto out; 898 goto out;
897 } 899 }
898 900
899 req->r_pagelist = pagelist; 901 if (op == CEPH_MDS_OP_SETXATTR) {
900 pagelist = NULL; 902 req->r_args.setxattr.flags = cpu_to_le32(flags);
903 req->r_pagelist = pagelist;
904 pagelist = NULL;
905 }
901 906
902 req->r_inode = inode; 907 req->r_inode = inode;
903 ihold(inode); 908 ihold(inode);
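
ceph_sync_setxattr() previously mapped every NULL-value call to SETXATTR with CEPH_XATTR_REMOVE; with this change a NULL value plus CEPH_XATTR_REPLACE becomes a true RMXATTR operation, which is why the flags and pagelist assignment moves under the SETXATTR-only branch. The dispatch, restated as a standalone sketch:

#include <stdio.h>

#define CEPH_XATTR_REPLACE (1u << 1)
#define CEPH_XATTR_REMOVE  (1u << 31)

enum { OP_SETXATTR, OP_RMXATTR };

/* value == NULL means "delete this xattr" */
static int pick_op(const void *value, unsigned int *flags)
{
        if (!value) {
                if (*flags & CEPH_XATTR_REPLACE)
                        return OP_RMXATTR;      /* real removal op */
                *flags |= CEPH_XATTR_REMOVE;    /* remove via SETXATTR */
        }
        return OP_SETXATTR;
}

int main(void)
{
        unsigned int flags = CEPH_XATTR_REPLACE;

        printf("NULL value + REPLACE -> %s\n",
               pick_op(NULL, &flags) == OP_RMXATTR ? "RMXATTR" : "SETXATTR");
        return 0;
}
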
diff --git a/include/linux/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h
index b827e066e55a..146507df8650 100644
--- a/include/linux/ceph/ceph_frag.h
+++ b/include/linux/ceph/ceph_frag.h
@@ -51,11 +51,11 @@ static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
51 return ceph_frag_make(newbits, 51 return ceph_frag_make(newbits,
52 ceph_frag_value(f) | (i << (24 - newbits))); 52 ceph_frag_value(f) | (i << (24 - newbits)));
53} 53}
54static inline int ceph_frag_is_leftmost(__u32 f) 54static inline bool ceph_frag_is_leftmost(__u32 f)
55{ 55{
56 return ceph_frag_value(f) == 0; 56 return ceph_frag_value(f) == 0;
57} 57}
58static inline int ceph_frag_is_rightmost(__u32 f) 58static inline bool ceph_frag_is_rightmost(__u32 f)
59{ 59{
60 return ceph_frag_value(f) == ceph_frag_mask(f); 60 return ceph_frag_value(f) == ceph_frag_mask(f);
61} 61}
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 37f28bf55ce4..dfce616002ad 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -153,8 +153,9 @@ struct ceph_dir_layout {
153 153
154/* watch-notify operations */ 154/* watch-notify operations */
155enum { 155enum {
156 WATCH_NOTIFY = 1, /* notifying watcher */ 156 CEPH_WATCH_EVENT_NOTIFY = 1, /* notifying watcher */
157 WATCH_NOTIFY_COMPLETE = 2, /* notifier notified when done */ 157 CEPH_WATCH_EVENT_NOTIFY_COMPLETE = 2, /* notifier notified when done */
158 CEPH_WATCH_EVENT_DISCONNECT = 3, /* we were disconnected */
158}; 159};
159 160
160 161
@@ -207,6 +208,8 @@ struct ceph_mon_subscribe_ack {
207 struct ceph_fsid fsid; 208 struct ceph_fsid fsid;
208} __attribute__ ((packed)); 209} __attribute__ ((packed));
209 210
211#define CEPH_FS_CLUSTER_ID_NONE -1
212
210/* 213/*
211 * mdsmap flags 214 * mdsmap flags
212 */ 215 */
@@ -344,6 +347,18 @@ extern const char *ceph_mds_op_name(int op);
344#define CEPH_XATTR_REPLACE (1 << 1) 347#define CEPH_XATTR_REPLACE (1 << 1)
345#define CEPH_XATTR_REMOVE (1 << 31) 348#define CEPH_XATTR_REMOVE (1 << 31)
346 349
350/*
 351 * readdir request flags.
352 */
353#define CEPH_READDIR_REPLY_BITFLAGS (1<<0)
354
355/*
356 * readdir reply flags.
357 */
358#define CEPH_READDIR_FRAG_END (1<<0)
359#define CEPH_READDIR_FRAG_COMPLETE (1<<8)
360#define CEPH_READDIR_HASH_ORDER (1<<9)
361
347union ceph_mds_request_args { 362union ceph_mds_request_args {
348 struct { 363 struct {
349 __le32 mask; /* CEPH_CAP_* */ 364 __le32 mask; /* CEPH_CAP_* */
@@ -361,6 +376,7 @@ union ceph_mds_request_args {
361 __le32 frag; /* which dir fragment */ 376 __le32 frag; /* which dir fragment */
362 __le32 max_entries; /* how many dentries to grab */ 377 __le32 max_entries; /* how many dentries to grab */
363 __le32 max_bytes; 378 __le32 max_bytes;
379 __le16 flags;
364 } __attribute__ ((packed)) readdir; 380 } __attribute__ ((packed)) readdir;
365 struct { 381 struct {
366 __le32 mode; 382 __le32 mode;
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index a6ef9cc267ec..19e9932f3e77 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -47,7 +47,7 @@ static inline void ceph_decode_copy(void **p, void *pv, size_t n)
47/* 47/*
48 * bounds check input. 48 * bounds check input.
49 */ 49 */
50static inline int ceph_has_room(void **p, void *end, size_t n) 50static inline bool ceph_has_room(void **p, void *end, size_t n)
51{ 51{
52 return end >= *p && n <= end - *p; 52 return end >= *p && n <= end - *p;
53} 53}
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index db92a8d4926e..690985daad1c 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -180,6 +180,63 @@ static inline int calc_pages_for(u64 off, u64 len)
180 (off >> PAGE_SHIFT); 180 (off >> PAGE_SHIFT);
181} 181}
182 182
183/*
184 * These are not meant to be generic - an integer key is assumed.
185 */
186#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \
187static void insert_##name(struct rb_root *root, type *t) \
188{ \
189 struct rb_node **n = &root->rb_node; \
190 struct rb_node *parent = NULL; \
191 \
192 BUG_ON(!RB_EMPTY_NODE(&t->nodefld)); \
193 \
194 while (*n) { \
195 type *cur = rb_entry(*n, type, nodefld); \
196 \
197 parent = *n; \
198 if (t->keyfld < cur->keyfld) \
199 n = &(*n)->rb_left; \
200 else if (t->keyfld > cur->keyfld) \
201 n = &(*n)->rb_right; \
202 else \
203 BUG(); \
204 } \
205 \
206 rb_link_node(&t->nodefld, parent, n); \
207 rb_insert_color(&t->nodefld, root); \
208} \
209static void erase_##name(struct rb_root *root, type *t) \
210{ \
211 BUG_ON(RB_EMPTY_NODE(&t->nodefld)); \
212 rb_erase(&t->nodefld, root); \
213 RB_CLEAR_NODE(&t->nodefld); \
214}
215
216#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \
217static type *lookup_##name(struct rb_root *root, \
218 typeof(((type *)0)->keyfld) key) \
219{ \
220 struct rb_node *n = root->rb_node; \
221 \
222 while (n) { \
223 type *cur = rb_entry(n, type, nodefld); \
224 \
225 if (key < cur->keyfld) \
226 n = n->rb_left; \
227 else if (key > cur->keyfld) \
228 n = n->rb_right; \
229 else \
230 return cur; \
231 } \
232 \
233 return NULL; \
234}
235
236#define DEFINE_RB_FUNCS(name, type, keyfld, nodefld) \
237DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \
238DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)
239
183extern struct kmem_cache *ceph_inode_cachep; 240extern struct kmem_cache *ceph_inode_cachep;
184extern struct kmem_cache *ceph_cap_cachep; 241extern struct kmem_cache *ceph_cap_cachep;
185extern struct kmem_cache *ceph_cap_flush_cachep; 242extern struct kmem_cache *ceph_cap_flush_cachep;
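
DEFINE_RB_FUNCS() stamps out insert_<name>(), erase_<name>() and lookup_<name>() for any struct keyed by a scalar field, replacing the hand-rolled rb-tree walkers in osd_client.c and mon_client.c. A kernel-context sketch (not standalone; struct example_request is hypothetical) of instantiating it:

/* a request keyed by its tid */
struct example_request {
        u64 e_tid;
        struct rb_node e_node;
};

/* expands to insert_request(), erase_request() and lookup_request(),
 * all operating on a struct rb_root ordered by e_tid */
DEFINE_RB_FUNCS(request, struct example_request, e_tid, e_node)

static struct example_request *find_or_warn(struct rb_root *root, u64 tid)
{
        struct example_request *req = lookup_request(root, tid);

        if (!req)
                pr_warn("no request with tid %llu\n", tid);
        return req;
}

Note that insert_##name() BUG()s on a node that is not RB_EMPTY_NODE(), which is why ceph_mdsc_create_request() above now calls RB_CLEAR_NODE() on the freshly allocated r_node.
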
diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index e230e7ed60d3..e2a92df08b47 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -39,20 +39,31 @@ struct ceph_mon_request {
39 ceph_monc_request_func_t do_request; 39 ceph_monc_request_func_t do_request;
40}; 40};
41 41
42typedef void (*ceph_monc_callback_t)(struct ceph_mon_generic_request *);
43
42/* 44/*
43 * ceph_mon_generic_request is being used for the statfs and 45 * ceph_mon_generic_request is being used for the statfs and
44 * mon_get_version requests which are being done a bit differently 46 * mon_get_version requests which are being done a bit differently
45 * because we need to get data back to the caller 47 * because we need to get data back to the caller
46 */ 48 */
47struct ceph_mon_generic_request { 49struct ceph_mon_generic_request {
50 struct ceph_mon_client *monc;
48 struct kref kref; 51 struct kref kref;
49 u64 tid; 52 u64 tid;
50 struct rb_node node; 53 struct rb_node node;
51 int result; 54 int result;
52 void *buf; 55
53 struct completion completion; 56 struct completion completion;
57 ceph_monc_callback_t complete_cb;
58 u64 private_data; /* r_tid/linger_id */
59
54 struct ceph_msg *request; /* original request */ 60 struct ceph_msg *request; /* original request */
55 struct ceph_msg *reply; /* and reply */ 61 struct ceph_msg *reply; /* and reply */
62
63 union {
64 struct ceph_statfs *st;
65 u64 newest;
66 } u;
56}; 67};
57 68
58struct ceph_mon_client { 69struct ceph_mon_client {
@@ -77,7 +88,6 @@ struct ceph_mon_client {
77 88
78 /* pending generic requests */ 89 /* pending generic requests */
79 struct rb_root generic_request_tree; 90 struct rb_root generic_request_tree;
80 int num_generic_requests;
81 u64 last_tid; 91 u64 last_tid;
82 92
83 /* subs, indexed with CEPH_SUB_* */ 93 /* subs, indexed with CEPH_SUB_* */
@@ -86,6 +96,7 @@ struct ceph_mon_client {
86 bool want; 96 bool want;
87 u32 have; /* epoch */ 97 u32 have; /* epoch */
88 } subs[3]; 98 } subs[3];
99 int fs_cluster_id; /* "mdsmap.<id>" sub */
89 100
90#ifdef CONFIG_DEBUG_FS 101#ifdef CONFIG_DEBUG_FS
91 struct dentry *debugfs_file; 102 struct dentry *debugfs_file;
@@ -116,16 +127,18 @@ extern const char *ceph_sub_str[];
116bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch, 127bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch,
117 bool continuous); 128 bool continuous);
118void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch); 129void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch);
130void ceph_monc_renew_subs(struct ceph_mon_client *monc);
119 131
120extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
121extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, 132extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
122 unsigned long timeout); 133 unsigned long timeout);
123 134
124extern int ceph_monc_do_statfs(struct ceph_mon_client *monc, 135extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
125 struct ceph_statfs *buf); 136 struct ceph_statfs *buf);
126 137
127extern int ceph_monc_do_get_version(struct ceph_mon_client *monc, 138int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
128 const char *what, u64 *newest); 139 u64 *newest);
140int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
141 ceph_monc_callback_t cb, u64 private_data);
129 142
130extern int ceph_monc_open_session(struct ceph_mon_client *monc); 143extern int ceph_monc_open_session(struct ceph_mon_client *monc);
131 144
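
ceph_monc_do_get_version() splits into a blocking ceph_monc_get_version() and an async variant whose callback reads req->result and req->u.newest, with private_data ferrying the caller's tid or linger id through the round trip. A kernel-context sketch (not standalone) of the async form:

/* runs when the monitor answers the "osdmap" version query */
static void have_osdmap_version(struct ceph_mon_generic_request *req)
{
        u64 my_tid = req->private_data;   /* whatever the caller stashed */

        if (req->result)
                pr_err("get_version failed: %d\n", req->result);
        else
                pr_info("tid %llu: newest osdmap epoch %llu\n",
                        my_tid, req->u.newest);
}

/* caller side */
static int kick_map_check(struct ceph_mon_client *monc, u64 tid)
{
        return ceph_monc_get_version_async(monc, "osdmap",
                                           have_osdmap_version, tid);
}
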
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index cbf460927c42..19b14862d3e0 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -20,10 +20,11 @@ struct ceph_osd_client;
20/* 20/*
21 * completion callback for async writepages 21 * completion callback for async writepages
22 */ 22 */
23typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, 23typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
24 struct ceph_msg *);
25typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool); 24typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool);
26 25
26#define CEPH_HOMELESS_OSD -1
27
27/* a given osd we're communicating with */ 28/* a given osd we're communicating with */
28struct ceph_osd { 29struct ceph_osd {
29 atomic_t o_ref; 30 atomic_t o_ref;
@@ -32,16 +33,15 @@ struct ceph_osd {
32 int o_incarnation; 33 int o_incarnation;
33 struct rb_node o_node; 34 struct rb_node o_node;
34 struct ceph_connection o_con; 35 struct ceph_connection o_con;
35 struct list_head o_requests; 36 struct rb_root o_requests;
36 struct list_head o_linger_requests; 37 struct rb_root o_linger_requests;
37 struct list_head o_osd_lru; 38 struct list_head o_osd_lru;
38 struct ceph_auth_handshake o_auth; 39 struct ceph_auth_handshake o_auth;
39 unsigned long lru_ttl; 40 unsigned long lru_ttl;
40 int o_marked_for_keepalive;
41 struct list_head o_keepalive_item; 41 struct list_head o_keepalive_item;
42 struct mutex lock;
42}; 43};
43 44
44
45#define CEPH_OSD_SLAB_OPS 2 45#define CEPH_OSD_SLAB_OPS 2
46#define CEPH_OSD_MAX_OPS 16 46#define CEPH_OSD_MAX_OPS 16
47 47
@@ -104,76 +104,95 @@ struct ceph_osd_req_op {
104 struct ceph_osd_data response_data; 104 struct ceph_osd_data response_data;
105 __u8 class_len; 105 __u8 class_len;
106 __u8 method_len; 106 __u8 method_len;
107 __u8 argc; 107 u32 indata_len;
108 } cls; 108 } cls;
109 struct { 109 struct {
110 u64 cookie; 110 u64 cookie;
111 u64 ver; 111 __u8 op; /* CEPH_OSD_WATCH_OP_ */
112 u32 prot_ver; 112 u32 gen;
113 u32 timeout;
114 __u8 flag;
115 } watch; 113 } watch;
116 struct { 114 struct {
115 struct ceph_osd_data request_data;
116 } notify_ack;
117 struct {
118 u64 cookie;
119 struct ceph_osd_data request_data;
120 struct ceph_osd_data response_data;
121 } notify;
122 struct {
117 u64 expected_object_size; 123 u64 expected_object_size;
118 u64 expected_write_size; 124 u64 expected_write_size;
119 } alloc_hint; 125 } alloc_hint;
120 }; 126 };
121}; 127};
122 128
129struct ceph_osd_request_target {
130 struct ceph_object_id base_oid;
131 struct ceph_object_locator base_oloc;
132 struct ceph_object_id target_oid;
133 struct ceph_object_locator target_oloc;
134
135 struct ceph_pg pgid;
136 u32 pg_num;
137 u32 pg_num_mask;
138 struct ceph_osds acting;
139 struct ceph_osds up;
140 int size;
141 int min_size;
142 bool sort_bitwise;
143
144 unsigned int flags; /* CEPH_OSD_FLAG_* */
145 bool paused;
146
147 int osd;
148};
149
123/* an in-flight request */ 150/* an in-flight request */
124struct ceph_osd_request { 151struct ceph_osd_request {
125 u64 r_tid; /* unique for this client */ 152 u64 r_tid; /* unique for this client */
126 struct rb_node r_node; 153 struct rb_node r_node;
127 struct list_head r_req_lru_item; 154 struct rb_node r_mc_node; /* map check */
128 struct list_head r_osd_item;
129 struct list_head r_linger_item;
130 struct list_head r_linger_osd_item;
131 struct ceph_osd *r_osd; 155 struct ceph_osd *r_osd;
132 struct ceph_pg r_pgid; 156
133 int r_pg_osds[CEPH_PG_MAX_SIZE]; 157 struct ceph_osd_request_target r_t;
134 int r_num_pg_osds; 158#define r_base_oid r_t.base_oid
159#define r_base_oloc r_t.base_oloc
160#define r_flags r_t.flags
135 161
136 struct ceph_msg *r_request, *r_reply; 162 struct ceph_msg *r_request, *r_reply;
137 int r_flags; /* any additional flags for the osd */
138 u32 r_sent; /* >0 if r_request is sending/sent */ 163 u32 r_sent; /* >0 if r_request is sending/sent */
139 164
140 /* request osd ops array */ 165 /* request osd ops array */
141 unsigned int r_num_ops; 166 unsigned int r_num_ops;
142 167
143 /* these are updated on each send */
144 __le32 *r_request_osdmap_epoch;
145 __le32 *r_request_flags;
146 __le64 *r_request_pool;
147 void *r_request_pgid;
148 __le32 *r_request_attempts;
149 bool r_paused;
150 struct ceph_eversion *r_request_reassert_version;
151
152 int r_result; 168 int r_result;
153 int r_got_reply; 169 bool r_got_reply;
154 int r_linger;
155 170
156 struct ceph_osd_client *r_osdc; 171 struct ceph_osd_client *r_osdc;
157 struct kref r_kref; 172 struct kref r_kref;
158 bool r_mempool; 173 bool r_mempool;
159 struct completion r_completion, r_safe_completion; 174 struct completion r_completion;
175 struct completion r_safe_completion; /* fsync waiter */
160 ceph_osdc_callback_t r_callback; 176 ceph_osdc_callback_t r_callback;
161 ceph_osdc_unsafe_callback_t r_unsafe_callback; 177 ceph_osdc_unsafe_callback_t r_unsafe_callback;
162 struct ceph_eversion r_reassert_version;
163 struct list_head r_unsafe_item; 178 struct list_head r_unsafe_item;
164 179
165 struct inode *r_inode; /* for use by callbacks */ 180 struct inode *r_inode; /* for use by callbacks */
166 void *r_priv; /* ditto */ 181 void *r_priv; /* ditto */
167 182
168 struct ceph_object_locator r_base_oloc; 183 /* set by submitter */
169 struct ceph_object_id r_base_oid; 184 u64 r_snapid; /* for reads, CEPH_NOSNAP o/w */
170 struct ceph_object_locator r_target_oloc; 185 struct ceph_snap_context *r_snapc; /* for writes */
171 struct ceph_object_id r_target_oid; 186 struct timespec r_mtime; /* ditto */
172 187 u64 r_data_offset; /* ditto */
173 u64 r_snapid; 188 bool r_linger; /* don't resend on failure */
174 unsigned long r_stamp; /* send OR check time */
175 189
176 struct ceph_snap_context *r_snapc; /* snap context for writes */ 190 /* internal */
191 unsigned long r_stamp; /* jiffies, send or check time */
192 int r_attempts;
193 struct ceph_eversion r_replay_version; /* aka reassert_version */
194 u32 r_last_force_resend;
195 u32 r_map_dne_bound;
177 196
178 struct ceph_osd_req_op r_ops[]; 197 struct ceph_osd_req_op r_ops[];
179}; 198};
@@ -182,44 +201,70 @@ struct ceph_request_redirect {
182 struct ceph_object_locator oloc; 201 struct ceph_object_locator oloc;
183}; 202};
184 203
185struct ceph_osd_event { 204typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie,
186 u64 cookie; 205 u64 notifier_id, void *data, size_t data_len);
187 int one_shot; 206typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err);
207
208struct ceph_osd_linger_request {
188 struct ceph_osd_client *osdc; 209 struct ceph_osd_client *osdc;
189 void (*cb)(u64, u64, u8, void *); 210 u64 linger_id;
190 void *data; 211 bool committed;
191 struct rb_node node; 212 bool is_watch; /* watch or notify */
192 struct list_head osd_node; 213
214 struct ceph_osd *osd;
215 struct ceph_osd_request *reg_req;
216 struct ceph_osd_request *ping_req;
217 unsigned long ping_sent;
218 unsigned long watch_valid_thru;
219 struct list_head pending_lworks;
220
221 struct ceph_osd_request_target t;
222 u32 last_force_resend;
223 u32 map_dne_bound;
224
225 struct timespec mtime;
226
193 struct kref kref; 227 struct kref kref;
194}; 228 struct mutex lock;
229 struct rb_node node; /* osd */
230 struct rb_node osdc_node; /* osdc */
231 struct rb_node mc_node; /* map check */
232 struct list_head scan_item;
233
234 struct completion reg_commit_wait;
235 struct completion notify_finish_wait;
236 int reg_commit_error;
237 int notify_finish_error;
238 int last_error;
239
240 u32 register_gen;
241 u64 notify_id;
242
243 rados_watchcb2_t wcb;
244 rados_watcherrcb_t errcb;
245 void *data;
195 246
196struct ceph_osd_event_work { 247 struct page ***preply_pages;
197 struct work_struct work; 248 size_t *preply_len;
198 struct ceph_osd_event *event;
199 u64 ver;
200 u64 notify_id;
201 u8 opcode;
202}; 249};
203 250
204struct ceph_osd_client { 251struct ceph_osd_client {
205 struct ceph_client *client; 252 struct ceph_client *client;
206 253
207 struct ceph_osdmap *osdmap; /* current map */ 254 struct ceph_osdmap *osdmap; /* current map */
208 struct rw_semaphore map_sem; 255 struct rw_semaphore lock;
209 struct completion map_waiters;
210 u64 last_requested_map;
211 256
212 struct mutex request_mutex;
213 struct rb_root osds; /* osds */ 257 struct rb_root osds; /* osds */
214 struct list_head osd_lru; /* idle osds */ 258 struct list_head osd_lru; /* idle osds */
215 u64 timeout_tid; /* tid of timeout triggering rq */ 259 spinlock_t osd_lru_lock;
216 u64 last_tid; /* tid of last request */ 260 struct ceph_osd homeless_osd;
217 struct rb_root requests; /* pending requests */ 261 atomic64_t last_tid; /* tid of last request */
218 struct list_head req_lru; /* in-flight lru */ 262 u64 last_linger_id;
219 struct list_head req_unsent; /* unsent/need-resend queue */ 263 struct rb_root linger_requests; /* lingering requests */
220 struct list_head req_notarget; /* map to no osd */ 264 struct rb_root map_checks;
221 struct list_head req_linger; /* lingering requests */ 265 struct rb_root linger_map_checks;
222 int num_requests; 266 atomic_t num_requests;
267 atomic_t num_homeless;
223 struct delayed_work timeout_work; 268 struct delayed_work timeout_work;
224 struct delayed_work osds_timeout_work; 269 struct delayed_work osds_timeout_work;
225#ifdef CONFIG_DEBUG_FS 270#ifdef CONFIG_DEBUG_FS
@@ -231,10 +276,6 @@ struct ceph_osd_client {
231 struct ceph_msgpool msgpool_op; 276 struct ceph_msgpool msgpool_op;
232 struct ceph_msgpool msgpool_op_reply; 277 struct ceph_msgpool msgpool_op_reply;
233 278
234 spinlock_t event_lock;
235 struct rb_root event_tree;
236 u64 event_count;
237
238 struct workqueue_struct *notify_wq; 279 struct workqueue_struct *notify_wq;
239}; 280};
240 281
@@ -271,9 +312,6 @@ extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
271extern struct ceph_osd_data *osd_req_op_extent_osd_data( 312extern struct ceph_osd_data *osd_req_op_extent_osd_data(
272 struct ceph_osd_request *osd_req, 313 struct ceph_osd_request *osd_req,
273 unsigned int which); 314 unsigned int which);
274extern struct ceph_osd_data *osd_req_op_cls_response_data(
275 struct ceph_osd_request *osd_req,
276 unsigned int which);
277 315
278extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *, 316extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *,
279 unsigned int which, 317 unsigned int which,
@@ -309,9 +347,6 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
309extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, 347extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
310 u16 opcode, const char *name, const void *value, 348 u16 opcode, const char *name, const void *value,
311 size_t size, u8 cmp_op, u8 cmp_mode); 349 size_t size, u8 cmp_op, u8 cmp_mode);
312extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
313 unsigned int which, u16 opcode,
314 u64 cookie, u64 version, int flag);
315extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, 350extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
316 unsigned int which, 351 unsigned int which,
317 u64 expected_object_size, 352 u64 expected_object_size,
@@ -322,11 +357,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *
322 unsigned int num_ops, 357 unsigned int num_ops,
323 bool use_mempool, 358 bool use_mempool,
324 gfp_t gfp_flags); 359 gfp_t gfp_flags);
325 360int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp);
326extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
327 struct ceph_snap_context *snapc,
328 u64 snap_id,
329 struct timespec *mtime);
330 361
331extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, 362extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
332 struct ceph_file_layout *layout, 363 struct ceph_file_layout *layout,
@@ -338,9 +369,6 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
338 u32 truncate_seq, u64 truncate_size, 369 u32 truncate_seq, u64 truncate_size,
339 bool use_mempool); 370 bool use_mempool);
340 371
341extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
342 struct ceph_osd_request *req);
343
344extern void ceph_osdc_get_request(struct ceph_osd_request *req); 372extern void ceph_osdc_get_request(struct ceph_osd_request *req);
345extern void ceph_osdc_put_request(struct ceph_osd_request *req); 373extern void ceph_osdc_put_request(struct ceph_osd_request *req);
346 374
@@ -353,6 +381,7 @@ extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
353extern void ceph_osdc_sync(struct ceph_osd_client *osdc); 381extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
354 382
355extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc); 383extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc);
384void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc);
356 385
357extern int ceph_osdc_readpages(struct ceph_osd_client *osdc, 386extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
358 struct ceph_vino vino, 387 struct ceph_vino vino,
@@ -371,11 +400,33 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
371 struct timespec *mtime, 400 struct timespec *mtime,
372 struct page **pages, int nr_pages); 401 struct page **pages, int nr_pages);
373 402
374/* watch/notify events */ 403/* watch/notify */
375extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, 404struct ceph_osd_linger_request *
376 void (*event_cb)(u64, u64, u8, void *), 405ceph_osdc_watch(struct ceph_osd_client *osdc,
377 void *data, struct ceph_osd_event **pevent); 406 struct ceph_object_id *oid,
378extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); 407 struct ceph_object_locator *oloc,
379extern void ceph_osdc_put_event(struct ceph_osd_event *event); 408 rados_watchcb2_t wcb,
409 rados_watcherrcb_t errcb,
410 void *data);
411int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
412 struct ceph_osd_linger_request *lreq);
413
414int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
415 struct ceph_object_id *oid,
416 struct ceph_object_locator *oloc,
417 u64 notify_id,
418 u64 cookie,
419 void *payload,
420 size_t payload_len);
421int ceph_osdc_notify(struct ceph_osd_client *osdc,
422 struct ceph_object_id *oid,
423 struct ceph_object_locator *oloc,
424 void *payload,
425 size_t payload_len,
426 u32 timeout,
427 struct page ***preply_pages,
428 size_t *preply_len);
429int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
430 struct ceph_osd_linger_request *lreq);
380#endif 431#endif
381 432
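
The cookie-based event API gives way to linger requests: ceph_osdc_watch() returns a handle the caller keeps for the lifetime of the watch (rbd stores it as watch_handle), wcb runs for each incoming notify, errcb reports watch breakage, and notifies are answered with ceph_osdc_notify_ack(). A kernel-context sketch (not standalone; struct my_dev and its fields are hypothetical):

static void my_watch_cb(void *arg, u64 notify_id, u64 cookie,
                        u64 notifier_id, void *data, size_t data_len)
{
        struct my_dev *dev = arg;

        /* handle the notify payload, then ack so the notifier's
         * ceph_osdc_notify() call can complete */
        ceph_osdc_notify_ack(dev->osdc, &dev->oid, &dev->oloc,
                             notify_id, cookie, NULL, 0);
}

static void my_watch_errcb(void *arg, u64 cookie, int err)
{
        pr_err("watch error %d on cookie %llu\n", err, cookie);
        /* a real client would tear down and re-establish the watch */
}

/* setup: returns a linger handle on success, ERR_PTR() on failure */
dev->handle = ceph_osdc_watch(osdc, &dev->oid, &dev->oloc,
                              my_watch_cb, my_watch_errcb, dev);
/* ... */
ceph_osdc_unwatch(osdc, dev->handle);
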
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index e55c08bc3a96..ddc426b22d81 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -24,21 +24,29 @@ struct ceph_pg {
24 uint32_t seed; 24 uint32_t seed;
25}; 25};
26 26
27#define CEPH_POOL_FLAG_HASHPSPOOL 1 27int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
28
29#define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id
30 together */
31#define CEPH_POOL_FLAG_FULL (1ULL << 1) /* pool is full */
28 32
29struct ceph_pg_pool_info { 33struct ceph_pg_pool_info {
30 struct rb_node node; 34 struct rb_node node;
31 s64 id; 35 s64 id;
32 u8 type; 36 u8 type; /* CEPH_POOL_TYPE_* */
33 u8 size; 37 u8 size;
38 u8 min_size;
34 u8 crush_ruleset; 39 u8 crush_ruleset;
35 u8 object_hash; 40 u8 object_hash;
41 u32 last_force_request_resend;
36 u32 pg_num, pgp_num; 42 u32 pg_num, pgp_num;
37 int pg_num_mask, pgp_num_mask; 43 int pg_num_mask, pgp_num_mask;
38 s64 read_tier; 44 s64 read_tier;
39 s64 write_tier; /* wins for read+write ops */ 45 s64 write_tier; /* wins for read+write ops */
40 u64 flags; 46 u64 flags; /* CEPH_POOL_FLAG_* */
41 char *name; 47 char *name;
48
49 bool was_full; /* for handle_one_map() */
42}; 50};
43 51
44static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool) 52static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
@@ -57,6 +65,22 @@ struct ceph_object_locator {
57 s64 pool; 65 s64 pool;
58}; 66};
59 67
68static inline void ceph_oloc_init(struct ceph_object_locator *oloc)
69{
70 oloc->pool = -1;
71}
72
73static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc)
74{
75 return oloc->pool == -1;
76}
77
78static inline void ceph_oloc_copy(struct ceph_object_locator *dest,
79 const struct ceph_object_locator *src)
80{
81 dest->pool = src->pool;
82}
83
60/* 84/*
61 * Maximum supported by kernel client object name length 85 * Maximum supported by kernel client object name length
62 * 86 *
@@ -64,11 +88,47 @@ struct ceph_object_locator {
64 */ 88 */
65#define CEPH_MAX_OID_NAME_LEN 100 89#define CEPH_MAX_OID_NAME_LEN 100
66 90
91/*
92 * 51-char inline_name is long enough for all cephfs and all but one
93 * rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be
94 * arbitrarily long (~PAGE_SIZE). It's done once during rbd map; all
95 * other rbd requests fit into inline_name.
96 *
97 * Makes ceph_object_id 64 bytes on 64-bit.
98 */
99#define CEPH_OID_INLINE_LEN 52
100
101/*
102 * Both inline and external buffers have space for a NUL-terminator,
103 * which is carried around. It's not required though - RADOS object
104 * names don't have to be NUL-terminated and may contain NULs.
105 */
67struct ceph_object_id { 106struct ceph_object_id {
68 char name[CEPH_MAX_OID_NAME_LEN]; 107 char *name;
108 char inline_name[CEPH_OID_INLINE_LEN];
69 int name_len; 109 int name_len;
70}; 110};
71 111
112static inline void ceph_oid_init(struct ceph_object_id *oid)
113{
114 oid->name = oid->inline_name;
115 oid->name_len = 0;
116}
117
118static inline bool ceph_oid_empty(const struct ceph_object_id *oid)
119{
120 return oid->name == oid->inline_name && !oid->name_len;
121}
122
123void ceph_oid_copy(struct ceph_object_id *dest,
124 const struct ceph_object_id *src);
125__printf(2, 3)
126void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...);
127__printf(3, 4)
128int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
129 const char *fmt, ...);
130void ceph_oid_destroy(struct ceph_object_id *oid);
131
72struct ceph_pg_mapping { 132struct ceph_pg_mapping {
73 struct rb_node node; 133 struct rb_node node;
74 struct ceph_pg pgid; 134 struct ceph_pg pgid;
@@ -87,7 +147,6 @@ struct ceph_pg_mapping {
87struct ceph_osdmap { 147struct ceph_osdmap {
88 struct ceph_fsid fsid; 148 struct ceph_fsid fsid;
89 u32 epoch; 149 u32 epoch;
90 u32 mkfs_epoch;
91 struct ceph_timespec created, modified; 150 struct ceph_timespec created, modified;
92 151
93 u32 flags; /* CEPH_OSDMAP_* */ 152 u32 flags; /* CEPH_OSDMAP_* */
@@ -113,43 +172,19 @@ struct ceph_osdmap {
113 int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3]; 172 int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
114}; 173};
115 174
116static inline void ceph_oid_set_name(struct ceph_object_id *oid, 175static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
117 const char *name)
118{
119 int len;
120
121 len = strlen(name);
122 if (len > sizeof(oid->name)) {
123 WARN(1, "ceph_oid_set_name '%s' len %d vs %zu, truncating\n",
124 name, len, sizeof(oid->name));
125 len = sizeof(oid->name);
126 }
127
128 memcpy(oid->name, name, len);
129 oid->name_len = len;
130}
131
132static inline void ceph_oid_copy(struct ceph_object_id *dest,
133 struct ceph_object_id *src)
134{
135 BUG_ON(src->name_len > sizeof(dest->name));
136 memcpy(dest->name, src->name, src->name_len);
137 dest->name_len = src->name_len;
138}
139
140static inline int ceph_osd_exists(struct ceph_osdmap *map, int osd)
141{ 176{
142 return osd >= 0 && osd < map->max_osd && 177 return osd >= 0 && osd < map->max_osd &&
143 (map->osd_state[osd] & CEPH_OSD_EXISTS); 178 (map->osd_state[osd] & CEPH_OSD_EXISTS);
144} 179}
145 180
146static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) 181static inline bool ceph_osd_is_up(struct ceph_osdmap *map, int osd)
147{ 182{
148 return ceph_osd_exists(map, osd) && 183 return ceph_osd_exists(map, osd) &&
149 (map->osd_state[osd] & CEPH_OSD_UP); 184 (map->osd_state[osd] & CEPH_OSD_UP);
150} 185}
151 186
152static inline int ceph_osd_is_down(struct ceph_osdmap *map, int osd) 187static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd)
153{ 188{
154 return !ceph_osd_is_up(map, osd); 189 return !ceph_osd_is_up(map, osd);
155} 190}
@@ -192,28 +227,59 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
192 return 0; 227 return 0;
193} 228}
194 229
230struct ceph_osdmap *ceph_osdmap_alloc(void);
195extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end); 231extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end);
196extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, 232struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
197 struct ceph_osdmap *map, 233 struct ceph_osdmap *map);
198 struct ceph_messenger *msgr);
199extern void ceph_osdmap_destroy(struct ceph_osdmap *map); 234extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
200 235
236struct ceph_osds {
237 int osds[CEPH_PG_MAX_SIZE];
238 int size;
239 int primary; /* id, NOT index */
240};
241
242static inline void ceph_osds_init(struct ceph_osds *set)
243{
244 set->size = 0;
245 set->primary = -1;
246}
247
248void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src);
249
250bool ceph_is_new_interval(const struct ceph_osds *old_acting,
251 const struct ceph_osds *new_acting,
252 const struct ceph_osds *old_up,
253 const struct ceph_osds *new_up,
254 int old_size,
255 int new_size,
256 int old_min_size,
257 int new_min_size,
258 u32 old_pg_num,
259 u32 new_pg_num,
260 bool old_sort_bitwise,
261 bool new_sort_bitwise,
262 const struct ceph_pg *pgid);
263bool ceph_osds_changed(const struct ceph_osds *old_acting,
264 const struct ceph_osds *new_acting,
265 bool any_change);
266
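Both helpers feed interval detection: when an incoming osdmap changes a PG's up or acting set (or its pg_num, min_size, sort_bitwise, etc.), in-flight requests targeting that PG have to be re-examined. Conceptually, a sketch rather than the exact call site:

    struct ceph_osds old_up, old_acting, up, acting;

    ceph_pg_to_up_acting_osds(old_map, &pgid, &old_up, &old_acting);
    ceph_pg_to_up_acting_osds(new_map, &pgid, &up, &acting);

    if (ceph_osds_changed(&old_acting, &acting, any_change))
            force_resend = true;    /* hypothetical flag: re-target and re-send */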
201/* calculate mapping of a file extent to an object */ 267/* calculate mapping of a file extent to an object */
202extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, 268extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
203 u64 off, u64 len, 269 u64 off, u64 len,
204 u64 *bno, u64 *oxoff, u64 *oxlen); 270 u64 *bno, u64 *oxoff, u64 *oxlen);
205 271
206/* calculate mapping of object to a placement group */ 272int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
207extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, 273 struct ceph_object_id *oid,
208 struct ceph_object_locator *oloc, 274 struct ceph_object_locator *oloc,
209 struct ceph_object_id *oid, 275 struct ceph_pg *raw_pgid);
210 struct ceph_pg *pg_out); 276
211 277void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
212extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, 278 const struct ceph_pg *raw_pgid,
213 struct ceph_pg pgid, 279 struct ceph_osds *up,
214 int *osds, int *primary); 280 struct ceph_osds *acting);
215extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, 281int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
216 struct ceph_pg pgid); 282 const struct ceph_pg *raw_pgid);
217 283
218extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, 284extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map,
219 u64 id); 285 u64 id);
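Together with the renames, mapping becomes an explicit two-step pipeline instead of the old ceph_oloc_oid_to_pg()/ceph_calc_pg_acting() pair. A sketch of a caller, error handling abbreviated:

    struct ceph_pg raw_pgid;
    struct ceph_osds up, acting;
    int ret;

    ret = ceph_object_locator_to_pg(osdmap, &oid, &oloc, &raw_pgid);
    if (ret)
            return ret;             /* e.g. pool no longer exists */

    ceph_pg_to_up_acting_osds(osdmap, &raw_pgid, &up, &acting);
    if (acting.primary < 0)
            return -EAGAIN;         /* illustrative: no usable primary */

ceph_pg_to_acting_primary() is the shorthand when only the primary's id is wanted (an osd id, not an index into acting.osds[]).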
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 2f822dca1046..5c0da61cb763 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -114,8 +114,8 @@ struct ceph_object_layout {
114 * compound epoch+version, used by storage layer to serialize mutations 114 * compound epoch+version, used by storage layer to serialize mutations
115 */ 115 */
116struct ceph_eversion { 116struct ceph_eversion {
117 __le32 epoch;
118 __le64 version; 117 __le64 version;
118 __le32 epoch;
119} __attribute__ ((packed)); 119} __attribute__ ((packed));
120 120
121/* 121/*
@@ -153,6 +153,11 @@ extern const char *ceph_osd_state_name(int s);
153#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */ 153#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */
154#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */ 154#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */
155#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */ 155#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */
156#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */
157#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */
158#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */
159#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */
160#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */
156 161
157/* 162/*
158 * The error code to return when an OSD can't handle a write 163 * The error code to return when an OSD can't handle a write
@@ -389,6 +394,13 @@ enum {
389 CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */ 394 CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */
390 CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */ 395 CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */
391 CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */ 396 CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */
397 CEPH_OSD_FLAG_MAP_SNAP_CLONE = 0x80000, /* map snap direct to clone id */
398 CEPH_OSD_FLAG_ENFORCE_SNAPC = 0x100000, /* use snapc provided even if
399 pool uses pool snaps */
400 CEPH_OSD_FLAG_REDIRECTED = 0x200000, /* op has been redirected */
401 CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */
402 CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */
403 CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */
392}; 404};
393 405
394enum { 406enum {
@@ -415,7 +427,17 @@ enum {
415 CEPH_OSD_CMPXATTR_MODE_U64 = 2 427 CEPH_OSD_CMPXATTR_MODE_U64 = 2
416}; 428};
417 429
418#define RADOS_NOTIFY_VER 1 430enum {
431 CEPH_OSD_WATCH_OP_UNWATCH = 0,
432 CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1,
433 /* note: use only ODD ids to prevent pre-giant code from
434 interpreting the op as UNWATCH */
435 CEPH_OSD_WATCH_OP_WATCH = 3,
436 CEPH_OSD_WATCH_OP_RECONNECT = 5,
437 CEPH_OSD_WATCH_OP_PING = 7,
438};
439
440const char *ceph_osd_watch_op_name(int o);
419 441
420/* 442/*
421 * an individual object operation. each may be accompanied by some data 443 * an individual object operation. each may be accompanied by some data
@@ -450,10 +472,14 @@ struct ceph_osd_op {
450 } __attribute__ ((packed)) snap; 472 } __attribute__ ((packed)) snap;
451 struct { 473 struct {
452 __le64 cookie; 474 __le64 cookie;
453 __le64 ver; 475 __le64 ver; /* no longer used */
454 __u8 flag; /* 0 = unwatch, 1 = watch */ 476 __u8 op; /* CEPH_OSD_WATCH_OP_* */
477 __le32 gen; /* registration generation */
455 } __attribute__ ((packed)) watch; 478 } __attribute__ ((packed)) watch;
456 struct { 479 struct {
480 __le64 cookie;
481 } __attribute__ ((packed)) notify;
482 struct {
457 __le64 offset, length; 483 __le64 offset, length;
458 __le64 src_offset; 484 __le64 src_offset;
459 } __attribute__ ((packed)) clonerange; 485 } __attribute__ ((packed)) clonerange;
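On the wire, the watch union member trades its single flag byte for an op code plus a registration generation; the encode side (osd_req_encode_op() later in this patch) becomes:

    dst->watch.cookie = cpu_to_le64(src->watch.cookie);
    dst->watch.ver = cpu_to_le64(0);        /* no longer used */
    dst->watch.op = src->watch.op;          /* CEPH_OSD_WATCH_OP_* */
    dst->watch.gen = cpu_to_le32(src->watch.gen);

gen is bumped each time a linger request is re-registered, letting the OSD tell a fresh CEPH_OSD_WATCH_OP_WATCH apart from a RECONNECT of the same cookie.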
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index dcc18c6f7cf9..55d2bfee16d7 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -651,7 +651,7 @@ EXPORT_SYMBOL(ceph_destroy_client);
651/* 651/*
652 * true if we have the mon map (and have thus joined the cluster) 652 * true if we have the mon map (and have thus joined the cluster)
653 */ 653 */
654static int have_mon_and_osd_map(struct ceph_client *client) 654static bool have_mon_and_osd_map(struct ceph_client *client)
655{ 655{
656 return client->monc.monmap && client->monc.monmap->epoch && 656 return client->monc.monmap && client->monc.monmap->epoch &&
657 client->osdc.osdmap && client->osdc.osdmap->epoch; 657 client->osdc.osdmap && client->osdc.osdmap->epoch;
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
index 139a9cb19b0c..3773a4fa11e3 100644
--- a/net/ceph/ceph_strings.c
+++ b/net/ceph/ceph_strings.c
@@ -27,6 +27,22 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE)
27 } 27 }
28} 28}
29 29
30const char *ceph_osd_watch_op_name(int o)
31{
32 switch (o) {
33 case CEPH_OSD_WATCH_OP_UNWATCH:
34 return "unwatch";
35 case CEPH_OSD_WATCH_OP_WATCH:
36 return "watch";
37 case CEPH_OSD_WATCH_OP_RECONNECT:
38 return "reconnect";
39 case CEPH_OSD_WATCH_OP_PING:
40 return "ping";
41 default:
42 return "???";
43 }
44}
45
30const char *ceph_osd_state_name(int s) 46const char *ceph_osd_state_name(int s)
31{ 47{
32 switch (s) { 48 switch (s) {
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index b902fbc7863e..e77b04ca7802 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -54,24 +54,25 @@ static int osdmap_show(struct seq_file *s, void *p)
54{ 54{
55 int i; 55 int i;
56 struct ceph_client *client = s->private; 56 struct ceph_client *client = s->private;
57 struct ceph_osdmap *map = client->osdc.osdmap; 57 struct ceph_osd_client *osdc = &client->osdc;
58 struct ceph_osdmap *map = osdc->osdmap;
58 struct rb_node *n; 59 struct rb_node *n;
59 60
60 if (map == NULL) 61 if (map == NULL)
61 return 0; 62 return 0;
62 63
63 seq_printf(s, "epoch %d\n", map->epoch); 64 down_read(&osdc->lock);
64 seq_printf(s, "flags%s%s\n", 65 seq_printf(s, "epoch %d flags 0x%x\n", map->epoch, map->flags);
65 (map->flags & CEPH_OSDMAP_NEARFULL) ? " NEARFULL" : "",
66 (map->flags & CEPH_OSDMAP_FULL) ? " FULL" : "");
67 66
68 for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) { 67 for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) {
69 struct ceph_pg_pool_info *pool = 68 struct ceph_pg_pool_info *pi =
70 rb_entry(n, struct ceph_pg_pool_info, node); 69 rb_entry(n, struct ceph_pg_pool_info, node);
71 70
72 seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n", 71 seq_printf(s, "pool %lld '%s' type %d size %d min_size %d pg_num %u pg_num_mask %d flags 0x%llx lfor %u read_tier %lld write_tier %lld\n",
73 pool->id, pool->pg_num, pool->pg_num_mask, 72 pi->id, pi->name, pi->type, pi->size, pi->min_size,
74 pool->read_tier, pool->write_tier); 73 pi->pg_num, pi->pg_num_mask, pi->flags,
74 pi->last_force_request_resend, pi->read_tier,
75 pi->write_tier);
75 } 76 }
76 for (i = 0; i < map->max_osd; i++) { 77 for (i = 0; i < map->max_osd; i++) {
77 struct ceph_entity_addr *addr = &map->osd_addr[i]; 78 struct ceph_entity_addr *addr = &map->osd_addr[i];
@@ -103,6 +104,7 @@ static int osdmap_show(struct seq_file *s, void *p)
103 pg->pgid.seed, pg->primary_temp.osd); 104 pg->pgid.seed, pg->primary_temp.osd);
104 } 105 }
105 106
107 up_read(&osdc->lock);
106 return 0; 108 return 0;
107} 109}
108 110
@@ -126,6 +128,7 @@ static int monc_show(struct seq_file *s, void *p)
126 CEPH_SUBSCRIBE_ONETIME ? "" : "+")); 128 CEPH_SUBSCRIBE_ONETIME ? "" : "+"));
127 seq_putc(s, '\n'); 129 seq_putc(s, '\n');
128 } 130 }
131 seq_printf(s, "fs_cluster_id %d\n", monc->fs_cluster_id);
129 132
130 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { 133 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
131 __u16 op; 134 __u16 op;
@@ -143,43 +146,113 @@ static int monc_show(struct seq_file *s, void *p)
143 return 0; 146 return 0;
144} 147}
145 148
146static int osdc_show(struct seq_file *s, void *pp) 149static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t)
147{ 150{
148 struct ceph_client *client = s->private; 151 int i;
149 struct ceph_osd_client *osdc = &client->osdc;
150 struct rb_node *p;
151 152
152 mutex_lock(&osdc->request_mutex); 153 seq_printf(s, "osd%d\t%llu.%x\t[", t->osd, t->pgid.pool, t->pgid.seed);
153 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { 154 for (i = 0; i < t->up.size; i++)
154 struct ceph_osd_request *req; 155 seq_printf(s, "%s%d", (!i ? "" : ","), t->up.osds[i]);
155 unsigned int i; 156 seq_printf(s, "]/%d\t[", t->up.primary);
156 int opcode; 157 for (i = 0; i < t->acting.size; i++)
158 seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]);
159 seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary,
160 t->target_oid.name_len, t->target_oid.name, t->flags);
161 if (t->paused)
162 seq_puts(s, "\tP");
163}
157 164
158 req = rb_entry(p, struct ceph_osd_request, r_node); 165static void dump_request(struct seq_file *s, struct ceph_osd_request *req)
166{
167 int i;
159 168
160 seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid, 169 seq_printf(s, "%llu\t", req->r_tid);
161 req->r_osd ? req->r_osd->o_osd : -1, 170 dump_target(s, &req->r_t);
162 req->r_pgid.pool, req->r_pgid.seed);
163 171
164 seq_printf(s, "%.*s", req->r_base_oid.name_len, 172 seq_printf(s, "\t%d\t%u'%llu", req->r_attempts,
165 req->r_base_oid.name); 173 le32_to_cpu(req->r_replay_version.epoch),
174 le64_to_cpu(req->r_replay_version.version));
166 175
167 if (req->r_reassert_version.epoch) 176 for (i = 0; i < req->r_num_ops; i++) {
168 seq_printf(s, "\t%u'%llu", 177 struct ceph_osd_req_op *op = &req->r_ops[i];
169 (unsigned int)le32_to_cpu(req->r_reassert_version.epoch), 178
170 le64_to_cpu(req->r_reassert_version.version)); 179 seq_printf(s, "%s%s", (i == 0 ? "\t" : ","),
171 else 180 ceph_osd_op_name(op->op));
172 seq_printf(s, "\t"); 181 if (op->op == CEPH_OSD_OP_WATCH)
182 seq_printf(s, "-%s",
183 ceph_osd_watch_op_name(op->watch.op));
184 }
185
186 seq_putc(s, '\n');
187}
188
189static void dump_requests(struct seq_file *s, struct ceph_osd *osd)
190{
191 struct rb_node *n;
192
193 mutex_lock(&osd->lock);
194 for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
195 struct ceph_osd_request *req =
196 rb_entry(n, struct ceph_osd_request, r_node);
197
198 dump_request(s, req);
199 }
200
201 mutex_unlock(&osd->lock);
202}
173 203
174 for (i = 0; i < req->r_num_ops; i++) { 204static void dump_linger_request(struct seq_file *s,
175 opcode = req->r_ops[i].op; 205 struct ceph_osd_linger_request *lreq)
176 seq_printf(s, "%s%s", (i == 0 ? "\t" : ","), 206{
177 ceph_osd_op_name(opcode)); 207 seq_printf(s, "%llu\t", lreq->linger_id);
178 } 208 dump_target(s, &lreq->t);
209
210 seq_printf(s, "\t%u\t%s%s/%d\n", lreq->register_gen,
211 lreq->is_watch ? "W" : "N", lreq->committed ? "C" : "",
212 lreq->last_error);
213}
214
215static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd)
216{
217 struct rb_node *n;
218
219 mutex_lock(&osd->lock);
220 for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
221 struct ceph_osd_linger_request *lreq =
222 rb_entry(n, struct ceph_osd_linger_request, node);
223
224 dump_linger_request(s, lreq);
225 }
226
227 mutex_unlock(&osd->lock);
228}
179 229
180 seq_printf(s, "\n"); 230static int osdc_show(struct seq_file *s, void *pp)
231{
232 struct ceph_client *client = s->private;
233 struct ceph_osd_client *osdc = &client->osdc;
234 struct rb_node *n;
235
236 down_read(&osdc->lock);
237 seq_printf(s, "REQUESTS %d homeless %d\n",
238 atomic_read(&osdc->num_requests),
239 atomic_read(&osdc->num_homeless));
240 for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
241 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
242
243 dump_requests(s, osd);
181 } 244 }
182 mutex_unlock(&osdc->request_mutex); 245 dump_requests(s, &osdc->homeless_osd);
246
247 seq_puts(s, "LINGER REQUESTS\n");
248 for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
249 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
250
251 dump_linger_requests(s, osd);
252 }
253 dump_linger_requests(s, &osdc->homeless_osd);
254
255 up_read(&osdc->lock);
183 return 0; 256 return 0;
184} 257}
185 258
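With dump_target()/dump_request() in place, each osdc line carries the tid, target osd, pgid, up and acting sets (each with its primary), target oid, flags, attempt count, replay version and the op vector. An illustrative excerpt, all values invented:

    REQUESTS 1 homeless 0
    1234    osd1    2.5e    [1,4,7]/1    [1,4,7]/1    rbd_data.1003    0x10    0    0'0    read
    LINGER REQUESTS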
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index cf638c009cfa..37c38a7fb5c5 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -260,20 +260,26 @@ static void __send_subscribe(struct ceph_mon_client *monc)
260 BUG_ON(num < 1); /* monmap sub is always there */ 260 BUG_ON(num < 1); /* monmap sub is always there */
261 ceph_encode_32(&p, num); 261 ceph_encode_32(&p, num);
262 for (i = 0; i < ARRAY_SIZE(monc->subs); i++) { 262 for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
263 const char *s = ceph_sub_str[i]; 263 char buf[32];
264 int len;
264 265
265 if (!monc->subs[i].want) 266 if (!monc->subs[i].want)
266 continue; 267 continue;
267 268
268 dout("%s %s start %llu flags 0x%x\n", __func__, s, 269 len = sprintf(buf, "%s", ceph_sub_str[i]);
270 if (i == CEPH_SUB_MDSMAP &&
271 monc->fs_cluster_id != CEPH_FS_CLUSTER_ID_NONE)
272 len += sprintf(buf + len, ".%d", monc->fs_cluster_id);
273
274 dout("%s %s start %llu flags 0x%x\n", __func__, buf,
269 le64_to_cpu(monc->subs[i].item.start), 275 le64_to_cpu(monc->subs[i].item.start),
270 monc->subs[i].item.flags); 276 monc->subs[i].item.flags);
271 ceph_encode_string(&p, end, s, strlen(s)); 277 ceph_encode_string(&p, end, buf, len);
272 memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item)); 278 memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item));
273 p += sizeof(monc->subs[i].item); 279 p += sizeof(monc->subs[i].item);
274 } 280 }
275 281
276 BUG_ON(p != (end - 35 - (ARRAY_SIZE(monc->subs) - num) * 19)); 282 BUG_ON(p > end);
277 msg->front.iov_len = p - msg->front.iov_base; 283 msg->front.iov_len = p - msg->front.iov_base;
278 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 284 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
279 ceph_msg_revoke(msg); 285 ceph_msg_revoke(msg);
@@ -376,19 +382,13 @@ void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch)
376} 382}
377EXPORT_SYMBOL(ceph_monc_got_map); 383EXPORT_SYMBOL(ceph_monc_got_map);
378 384
379/* 385void ceph_monc_renew_subs(struct ceph_mon_client *monc)
380 * Register interest in the next osdmap
381 */
382void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
383{ 386{
384 dout("%s have %u\n", __func__, monc->subs[CEPH_SUB_OSDMAP].have);
385 mutex_lock(&monc->mutex); 387 mutex_lock(&monc->mutex);
386 if (__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP, 388 __send_subscribe(monc);
387 monc->subs[CEPH_SUB_OSDMAP].have + 1, false))
388 __send_subscribe(monc);
389 mutex_unlock(&monc->mutex); 389 mutex_unlock(&monc->mutex);
390} 390}
391EXPORT_SYMBOL(ceph_monc_request_next_osdmap); 391EXPORT_SYMBOL(ceph_monc_renew_subs);
392 392
393/* 393/*
394 * Wait for an osdmap with a given epoch. 394 * Wait for an osdmap with a given epoch.
@@ -478,51 +478,17 @@ out:
478/* 478/*
479 * generic requests (currently statfs, mon_get_version) 479 * generic requests (currently statfs, mon_get_version)
480 */ 480 */
481static struct ceph_mon_generic_request *__lookup_generic_req( 481DEFINE_RB_FUNCS(generic_request, struct ceph_mon_generic_request, tid, node)
482 struct ceph_mon_client *monc, u64 tid)
483{
484 struct ceph_mon_generic_request *req;
485 struct rb_node *n = monc->generic_request_tree.rb_node;
486
487 while (n) {
488 req = rb_entry(n, struct ceph_mon_generic_request, node);
489 if (tid < req->tid)
490 n = n->rb_left;
491 else if (tid > req->tid)
492 n = n->rb_right;
493 else
494 return req;
495 }
496 return NULL;
497}
498
499static void __insert_generic_request(struct ceph_mon_client *monc,
500 struct ceph_mon_generic_request *new)
501{
502 struct rb_node **p = &monc->generic_request_tree.rb_node;
503 struct rb_node *parent = NULL;
504 struct ceph_mon_generic_request *req = NULL;
505
506 while (*p) {
507 parent = *p;
508 req = rb_entry(parent, struct ceph_mon_generic_request, node);
509 if (new->tid < req->tid)
510 p = &(*p)->rb_left;
511 else if (new->tid > req->tid)
512 p = &(*p)->rb_right;
513 else
514 BUG();
515 }
516
517 rb_link_node(&new->node, parent, p);
518 rb_insert_color(&new->node, &monc->generic_request_tree);
519}
520 482
521static void release_generic_request(struct kref *kref) 483static void release_generic_request(struct kref *kref)
522{ 484{
523 struct ceph_mon_generic_request *req = 485 struct ceph_mon_generic_request *req =
524 container_of(kref, struct ceph_mon_generic_request, kref); 486 container_of(kref, struct ceph_mon_generic_request, kref);
525 487
488 dout("%s greq %p request %p reply %p\n", __func__, req, req->request,
489 req->reply);
490 WARN_ON(!RB_EMPTY_NODE(&req->node));
491
526 if (req->reply) 492 if (req->reply)
527 ceph_msg_put(req->reply); 493 ceph_msg_put(req->reply);
528 if (req->request) 494 if (req->request)
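The open-coded insert/lookup pair deleted above is exactly the pattern that DEFINE_RB_FUNCS() (added to include/linux/ceph/libceph.h in this series) stamps out from a name, a type, a key field and an rb_node field. Roughly, the one-liner expands to something like this simplified sketch:

    static void insert_generic_request(struct rb_root *root,
                                       struct ceph_mon_generic_request *t)
    {
            struct rb_node **n = &root->rb_node, *parent = NULL;

            while (*n) {
                    struct ceph_mon_generic_request *cur =
                        rb_entry(*n, struct ceph_mon_generic_request, node);

                    parent = *n;
                    if (t->tid < cur->tid)
                            n = &(*n)->rb_left;
                    else if (t->tid > cur->tid)
                            n = &(*n)->rb_right;
                    else
                            BUG();
            }
            rb_link_node(&t->node, parent, n);
            rb_insert_color(&t->node, root);
    }

lookup_generic_request() walks the same comparisons and returns the node or NULL; erase_generic_request() does rb_erase() plus RB_CLEAR_NODE(), which is what makes the WARN_ON(!RB_EMPTY_NODE(&req->node)) in release_generic_request() meaningful.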
@@ -533,7 +499,8 @@ static void release_generic_request(struct kref *kref)
533 499
534static void put_generic_request(struct ceph_mon_generic_request *req) 500static void put_generic_request(struct ceph_mon_generic_request *req)
535{ 501{
536 kref_put(&req->kref, release_generic_request); 502 if (req)
503 kref_put(&req->kref, release_generic_request);
537} 504}
538 505
539static void get_generic_request(struct ceph_mon_generic_request *req) 506static void get_generic_request(struct ceph_mon_generic_request *req)
@@ -541,6 +508,103 @@ static void get_generic_request(struct ceph_mon_generic_request *req)
541 kref_get(&req->kref); 508 kref_get(&req->kref);
542} 509}
543 510
511static struct ceph_mon_generic_request *
512alloc_generic_request(struct ceph_mon_client *monc, gfp_t gfp)
513{
514 struct ceph_mon_generic_request *req;
515
516 req = kzalloc(sizeof(*req), gfp);
517 if (!req)
518 return NULL;
519
520 req->monc = monc;
521 kref_init(&req->kref);
522 RB_CLEAR_NODE(&req->node);
523 init_completion(&req->completion);
524
525 dout("%s greq %p\n", __func__, req);
526 return req;
527}
528
529static void register_generic_request(struct ceph_mon_generic_request *req)
530{
531 struct ceph_mon_client *monc = req->monc;
532
533 WARN_ON(req->tid);
534
535 get_generic_request(req);
536 req->tid = ++monc->last_tid;
537 insert_generic_request(&monc->generic_request_tree, req);
538}
539
540static void send_generic_request(struct ceph_mon_client *monc,
541 struct ceph_mon_generic_request *req)
542{
543 WARN_ON(!req->tid);
544
545 dout("%s greq %p tid %llu\n", __func__, req, req->tid);
546 req->request->hdr.tid = cpu_to_le64(req->tid);
547 ceph_con_send(&monc->con, ceph_msg_get(req->request));
548}
549
550static void __finish_generic_request(struct ceph_mon_generic_request *req)
551{
552 struct ceph_mon_client *monc = req->monc;
553
554 dout("%s greq %p tid %llu\n", __func__, req, req->tid);
555 erase_generic_request(&monc->generic_request_tree, req);
556
557 ceph_msg_revoke(req->request);
558 ceph_msg_revoke_incoming(req->reply);
559}
560
561static void finish_generic_request(struct ceph_mon_generic_request *req)
562{
563 __finish_generic_request(req);
564 put_generic_request(req);
565}
566
567static void complete_generic_request(struct ceph_mon_generic_request *req)
568{
569 if (req->complete_cb)
570 req->complete_cb(req);
571 else
572 complete_all(&req->completion);
573 put_generic_request(req);
574}
575
576static void cancel_generic_request(struct ceph_mon_generic_request *req)
577{
578 struct ceph_mon_client *monc = req->monc;
579 struct ceph_mon_generic_request *lookup_req;
580
581 dout("%s greq %p tid %llu\n", __func__, req, req->tid);
582
583 mutex_lock(&monc->mutex);
584 lookup_req = lookup_generic_request(&monc->generic_request_tree,
585 req->tid);
586 if (lookup_req) {
587 WARN_ON(lookup_req != req);
588 finish_generic_request(req);
589 }
590
591 mutex_unlock(&monc->mutex);
592}
593
594static int wait_generic_request(struct ceph_mon_generic_request *req)
595{
596 int ret;
597
598 dout("%s greq %p tid %llu\n", __func__, req, req->tid);
599 ret = wait_for_completion_interruptible(&req->completion);
600 if (ret)
601 cancel_generic_request(req);
602 else
603 ret = req->result; /* completed */
604
605 return ret;
606}
607
544static struct ceph_msg *get_generic_reply(struct ceph_connection *con, 608static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
545 struct ceph_msg_header *hdr, 609 struct ceph_msg_header *hdr,
546 int *skip) 610 int *skip)
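The new helpers compose into one fixed lifecycle, which the reworked ceph_monc_do_statfs() below follows. In outline:

    req = alloc_generic_request(monc, GFP_NOFS);
    /* ... allocate req->request and req->reply messages ... */

    mutex_lock(&monc->mutex);
    register_generic_request(req);          /* assigns tid, inserts into tree */
    /* ... encode any tid-dependent payload into req->request ... */
    send_generic_request(monc, req);
    mutex_unlock(&monc->mutex);

    ret = wait_generic_request(req);        /* cancels if interrupted */
    put_generic_request(req);

Note that register_generic_request() takes an extra reference, dropped by complete_generic_request() or finish_generic_request(), so the request stays valid for as long as it sits in the tree.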
@@ -551,7 +615,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
551 struct ceph_msg *m; 615 struct ceph_msg *m;
552 616
553 mutex_lock(&monc->mutex); 617 mutex_lock(&monc->mutex);
554 req = __lookup_generic_req(monc, tid); 618 req = lookup_generic_request(&monc->generic_request_tree, tid);
555 if (!req) { 619 if (!req) {
556 dout("get_generic_reply %lld dne\n", tid); 620 dout("get_generic_reply %lld dne\n", tid);
557 *skip = 1; 621 *skip = 1;
@@ -570,42 +634,6 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
570 return m; 634 return m;
571} 635}
572 636
573static int __do_generic_request(struct ceph_mon_client *monc, u64 tid,
574 struct ceph_mon_generic_request *req)
575{
576 int err;
577
578 /* register request */
579 req->tid = tid != 0 ? tid : ++monc->last_tid;
580 req->request->hdr.tid = cpu_to_le64(req->tid);
581 __insert_generic_request(monc, req);
582 monc->num_generic_requests++;
583 ceph_con_send(&monc->con, ceph_msg_get(req->request));
584 mutex_unlock(&monc->mutex);
585
586 err = wait_for_completion_interruptible(&req->completion);
587
588 mutex_lock(&monc->mutex);
589 rb_erase(&req->node, &monc->generic_request_tree);
590 monc->num_generic_requests--;
591
592 if (!err)
593 err = req->result;
594 return err;
595}
596
597static int do_generic_request(struct ceph_mon_client *monc,
598 struct ceph_mon_generic_request *req)
599{
600 int err;
601
602 mutex_lock(&monc->mutex);
603 err = __do_generic_request(monc, 0, req);
604 mutex_unlock(&monc->mutex);
605
606 return err;
607}
608
609/* 637/*
610 * statfs 638 * statfs
611 */ 639 */
@@ -616,22 +644,24 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
616 struct ceph_mon_statfs_reply *reply = msg->front.iov_base; 644 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
617 u64 tid = le64_to_cpu(msg->hdr.tid); 645 u64 tid = le64_to_cpu(msg->hdr.tid);
618 646
647 dout("%s msg %p tid %llu\n", __func__, msg, tid);
648
619 if (msg->front.iov_len != sizeof(*reply)) 649 if (msg->front.iov_len != sizeof(*reply))
620 goto bad; 650 goto bad;
621 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
622 651
623 mutex_lock(&monc->mutex); 652 mutex_lock(&monc->mutex);
624 req = __lookup_generic_req(monc, tid); 653 req = lookup_generic_request(&monc->generic_request_tree, tid);
625 if (req) { 654 if (!req) {
626 *(struct ceph_statfs *)req->buf = reply->st; 655 mutex_unlock(&monc->mutex);
627 req->result = 0; 656 return;
628 get_generic_request(req);
629 } 657 }
658
659 req->result = 0;
660 *req->u.st = reply->st; /* struct */
661 __finish_generic_request(req);
630 mutex_unlock(&monc->mutex); 662 mutex_unlock(&monc->mutex);
631 if (req) { 663
632 complete_all(&req->completion); 664 complete_generic_request(req);
633 put_generic_request(req);
634 }
635 return; 665 return;
636 666
637bad: 667bad:
@@ -646,38 +676,38 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
646{ 676{
647 struct ceph_mon_generic_request *req; 677 struct ceph_mon_generic_request *req;
648 struct ceph_mon_statfs *h; 678 struct ceph_mon_statfs *h;
649 int err; 679 int ret = -ENOMEM;
650 680
651 req = kzalloc(sizeof(*req), GFP_NOFS); 681 req = alloc_generic_request(monc, GFP_NOFS);
652 if (!req) 682 if (!req)
653 return -ENOMEM; 683 goto out;
654
655 kref_init(&req->kref);
656 req->buf = buf;
657 init_completion(&req->completion);
658 684
659 err = -ENOMEM;
660 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS, 685 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS,
661 true); 686 true);
662 if (!req->request) 687 if (!req->request)
663 goto out; 688 goto out;
664 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS, 689
665 true); 690 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 64, GFP_NOFS, true);
666 if (!req->reply) 691 if (!req->reply)
667 goto out; 692 goto out;
668 693
694 req->u.st = buf;
695
696 mutex_lock(&monc->mutex);
697 register_generic_request(req);
669 /* fill out request */ 698 /* fill out request */
670 h = req->request->front.iov_base; 699 h = req->request->front.iov_base;
671 h->monhdr.have_version = 0; 700 h->monhdr.have_version = 0;
672 h->monhdr.session_mon = cpu_to_le16(-1); 701 h->monhdr.session_mon = cpu_to_le16(-1);
673 h->monhdr.session_mon_tid = 0; 702 h->monhdr.session_mon_tid = 0;
674 h->fsid = monc->monmap->fsid; 703 h->fsid = monc->monmap->fsid;
704 send_generic_request(monc, req);
705 mutex_unlock(&monc->mutex);
675 706
676 err = do_generic_request(monc, req); 707 ret = wait_generic_request(req);
677
678out: 708out:
679 put_generic_request(req); 709 put_generic_request(req);
680 return err; 710 return ret;
681} 711}
682EXPORT_SYMBOL(ceph_monc_do_statfs); 712EXPORT_SYMBOL(ceph_monc_do_statfs);
683 713
@@ -690,7 +720,7 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
690 void *end = p + msg->front_alloc_len; 720 void *end = p + msg->front_alloc_len;
691 u64 handle; 721 u64 handle;
692 722
693 dout("%s %p tid %llu\n", __func__, msg, tid); 723 dout("%s msg %p tid %llu\n", __func__, msg, tid);
694 724
695 ceph_decode_need(&p, end, 2*sizeof(u64), bad); 725 ceph_decode_need(&p, end, 2*sizeof(u64), bad);
696 handle = ceph_decode_64(&p); 726 handle = ceph_decode_64(&p);
@@ -698,77 +728,111 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
698 goto bad; 728 goto bad;
699 729
700 mutex_lock(&monc->mutex); 730 mutex_lock(&monc->mutex);
701 req = __lookup_generic_req(monc, handle); 731 req = lookup_generic_request(&monc->generic_request_tree, handle);
702 if (req) { 732 if (!req) {
703 *(u64 *)req->buf = ceph_decode_64(&p); 733 mutex_unlock(&monc->mutex);
704 req->result = 0; 734 return;
705 get_generic_request(req);
706 } 735 }
736
737 req->result = 0;
738 req->u.newest = ceph_decode_64(&p);
739 __finish_generic_request(req);
707 mutex_unlock(&monc->mutex); 740 mutex_unlock(&monc->mutex);
708 if (req) {
709 complete_all(&req->completion);
710 put_generic_request(req);
711 }
712 741
742 complete_generic_request(req);
713 return; 743 return;
744
714bad: 745bad:
715 pr_err("corrupt mon_get_version reply, tid %llu\n", tid); 746 pr_err("corrupt mon_get_version reply, tid %llu\n", tid);
716 ceph_msg_dump(msg); 747 ceph_msg_dump(msg);
717} 748}
718 749
719/* 750static struct ceph_mon_generic_request *
720 * Send MMonGetVersion and wait for the reply. 751__ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
721 * 752 ceph_monc_callback_t cb, u64 private_data)
722 * @what: one of "mdsmap", "osdmap" or "monmap"
723 */
724int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what,
725 u64 *newest)
726{ 753{
727 struct ceph_mon_generic_request *req; 754 struct ceph_mon_generic_request *req;
728 void *p, *end;
729 u64 tid;
730 int err;
731 755
732 req = kzalloc(sizeof(*req), GFP_NOFS); 756 req = alloc_generic_request(monc, GFP_NOIO);
733 if (!req) 757 if (!req)
734 return -ENOMEM; 758 goto err_put_req;
735
736 kref_init(&req->kref);
737 req->buf = newest;
738 init_completion(&req->completion);
739 759
740 req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION, 760 req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION,
741 sizeof(u64) + sizeof(u32) + strlen(what), 761 sizeof(u64) + sizeof(u32) + strlen(what),
742 GFP_NOFS, true); 762 GFP_NOIO, true);
743 if (!req->request) { 763 if (!req->request)
744 err = -ENOMEM; 764 goto err_put_req;
745 goto out;
746 }
747 765
748 req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024, 766 req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 32, GFP_NOIO,
749 GFP_NOFS, true); 767 true);
750 if (!req->reply) { 768 if (!req->reply)
751 err = -ENOMEM; 769 goto err_put_req;
752 goto out;
753 }
754 770
755 p = req->request->front.iov_base; 771 req->complete_cb = cb;
756 end = p + req->request->front_alloc_len; 772 req->private_data = private_data;
757 773
758 /* fill out request */
759 mutex_lock(&monc->mutex); 774 mutex_lock(&monc->mutex);
760 tid = ++monc->last_tid; 775 register_generic_request(req);
761 ceph_encode_64(&p, tid); /* handle */ 776 {
762 ceph_encode_string(&p, end, what, strlen(what)); 777 void *p = req->request->front.iov_base;
778 void *const end = p + req->request->front_alloc_len;
779
780 ceph_encode_64(&p, req->tid); /* handle */
781 ceph_encode_string(&p, end, what, strlen(what));
782 WARN_ON(p != end);
783 }
784 send_generic_request(monc, req);
785 mutex_unlock(&monc->mutex);
763 786
764 err = __do_generic_request(monc, tid, req); 787 return req;
765 788
766 mutex_unlock(&monc->mutex); 789err_put_req:
767out:
768 put_generic_request(req); 790 put_generic_request(req);
769 return err; 791 return ERR_PTR(-ENOMEM);
792}
793
794/*
795 * Send MMonGetVersion and wait for the reply.
796 *
797 * @what: one of "mdsmap", "osdmap" or "monmap"
798 */
799int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
800 u64 *newest)
801{
802 struct ceph_mon_generic_request *req;
803 int ret;
804
805 req = __ceph_monc_get_version(monc, what, NULL, 0);
806 if (IS_ERR(req))
807 return PTR_ERR(req);
808
809 ret = wait_generic_request(req);
810 if (!ret)
811 *newest = req->u.newest;
812
813 put_generic_request(req);
814 return ret;
770} 815}
771EXPORT_SYMBOL(ceph_monc_do_get_version); 816EXPORT_SYMBOL(ceph_monc_get_version);
817
818/*
819 * Send MMonGetVersion,
820 *
821 * @what: one of "mdsmap", "osdmap" or "monmap"
822 */
823int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
824 ceph_monc_callback_t cb, u64 private_data)
825{
826 struct ceph_mon_generic_request *req;
827
828 req = __ceph_monc_get_version(monc, what, cb, private_data);
829 if (IS_ERR(req))
830 return PTR_ERR(req);
831
832 put_generic_request(req);
833 return 0;
834}
835EXPORT_SYMBOL(ceph_monc_get_version_async);
772 836
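A hypothetical caller of the async variant, assuming (per the mon_client.h changes in this patch) that ceph_monc_callback_t receives the completed generic request, with the outcome in req->result and the version in req->u.newest:

    static void have_osdmap_version(struct ceph_mon_generic_request *req)
    {
            if (!req->result)
                    pr_info("osdmap version %llu\n", req->u.newest);
            /* req->private_data carries the u64 passed in below */
    }

    ret = ceph_monc_get_version_async(monc, "osdmap", have_osdmap_version, 0);

The request self-registers and is put immediately, so the callback is the only place the result can be observed.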
773/* 837/*
774 * Resend pending generic requests. 838 * Resend pending generic requests.
@@ -890,7 +954,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
890 if (!monc->m_subscribe_ack) 954 if (!monc->m_subscribe_ack)
891 goto out_auth; 955 goto out_auth;
892 956
893 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS, 957 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 128, GFP_NOFS,
894 true); 958 true);
895 if (!monc->m_subscribe) 959 if (!monc->m_subscribe)
896 goto out_subscribe_ack; 960 goto out_subscribe_ack;
@@ -914,9 +978,10 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
914 978
915 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); 979 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
916 monc->generic_request_tree = RB_ROOT; 980 monc->generic_request_tree = RB_ROOT;
917 monc->num_generic_requests = 0;
918 monc->last_tid = 0; 981 monc->last_tid = 0;
919 982
983 monc->fs_cluster_id = CEPH_FS_CLUSTER_ID_NONE;
984
920 return 0; 985 return 0;
921 986
922out_auth_reply: 987out_auth_reply:
@@ -954,6 +1019,8 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
954 1019
955 ceph_auth_destroy(monc->auth); 1020 ceph_auth_destroy(monc->auth);
956 1021
1022 WARN_ON(!RB_EMPTY_ROOT(&monc->generic_request_tree));
1023
957 ceph_msg_put(monc->m_auth); 1024 ceph_msg_put(monc->m_auth);
958 ceph_msg_put(monc->m_auth_reply); 1025 ceph_msg_put(monc->m_auth_reply);
959 ceph_msg_put(monc->m_subscribe); 1026 ceph_msg_put(monc->m_subscribe);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 40a53a70efdf..0160d7d09a1e 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -19,25 +19,12 @@
19#include <linux/ceph/auth.h> 19#include <linux/ceph/auth.h>
20#include <linux/ceph/pagelist.h> 20#include <linux/ceph/pagelist.h>
21 21
22#define OSD_OP_FRONT_LEN 4096
23#define OSD_OPREPLY_FRONT_LEN 512 22#define OSD_OPREPLY_FRONT_LEN 512
24 23
25static struct kmem_cache *ceph_osd_request_cache; 24static struct kmem_cache *ceph_osd_request_cache;
26 25
27static const struct ceph_connection_operations osd_con_ops; 26static const struct ceph_connection_operations osd_con_ops;
28 27
29static void __send_queued(struct ceph_osd_client *osdc);
30static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd);
31static void __register_request(struct ceph_osd_client *osdc,
32 struct ceph_osd_request *req);
33static void __unregister_request(struct ceph_osd_client *osdc,
34 struct ceph_osd_request *req);
35static void __unregister_linger_request(struct ceph_osd_client *osdc,
36 struct ceph_osd_request *req);
37static void __enqueue_request(struct ceph_osd_request *req);
38static void __send_request(struct ceph_osd_client *osdc,
39 struct ceph_osd_request *req);
40
41/* 28/*
42 * Implement client access to distributed object storage cluster. 29 * Implement client access to distributed object storage cluster.
43 * 30 *
@@ -56,6 +43,52 @@ static void __send_request(struct ceph_osd_client *osdc,
56 * channel with an OSD is reset. 43 * channel with an OSD is reset.
57 */ 44 */
58 45
46static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req);
47static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req);
48static void link_linger(struct ceph_osd *osd,
49 struct ceph_osd_linger_request *lreq);
50static void unlink_linger(struct ceph_osd *osd,
51 struct ceph_osd_linger_request *lreq);
52
53#if 1
54static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
55{
56 bool wrlocked = true;
57
58 if (unlikely(down_read_trylock(sem))) {
59 wrlocked = false;
60 up_read(sem);
61 }
62
63 return wrlocked;
64}
65static inline void verify_osdc_locked(struct ceph_osd_client *osdc)
66{
67 WARN_ON(!rwsem_is_locked(&osdc->lock));
68}
69static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc)
70{
71 WARN_ON(!rwsem_is_wrlocked(&osdc->lock));
72}
73static inline void verify_osd_locked(struct ceph_osd *osd)
74{
75 struct ceph_osd_client *osdc = osd->o_osdc;
76
77 WARN_ON(!(mutex_is_locked(&osd->lock) &&
78 rwsem_is_locked(&osdc->lock)) &&
79 !rwsem_is_wrlocked(&osdc->lock));
80}
81static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq)
82{
83 WARN_ON(!mutex_is_locked(&lreq->lock));
84}
85#else
86static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { }
87static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { }
88static inline void verify_osd_locked(struct ceph_osd *osd) { }
89static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) { }
90#endif
91
59/* 92/*
60 * calculate the mapping of a file extent onto an object, and fill out the 93 * calculate the mapping of a file extent onto an object, and fill out the
61 * request accordingly. shorten extent as necessary if it crosses an 94 * request accordingly. shorten extent as necessary if it crosses an
@@ -144,14 +177,6 @@ osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req,
144} 177}
145EXPORT_SYMBOL(osd_req_op_extent_osd_data); 178EXPORT_SYMBOL(osd_req_op_extent_osd_data);
146 179
147struct ceph_osd_data *
148osd_req_op_cls_response_data(struct ceph_osd_request *osd_req,
149 unsigned int which)
150{
151 return osd_req_op_data(osd_req, which, cls, response_data);
152}
153EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ??? */
154
155void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req, 180void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req,
156 unsigned int which, struct page **pages, 181 unsigned int which, struct page **pages,
157 u64 length, u32 alignment, 182 u64 length, u32 alignment,
@@ -218,6 +243,8 @@ void osd_req_op_cls_request_data_pagelist(
218 243
219 osd_data = osd_req_op_data(osd_req, which, cls, request_data); 244 osd_data = osd_req_op_data(osd_req, which, cls, request_data);
220 ceph_osd_data_pagelist_init(osd_data, pagelist); 245 ceph_osd_data_pagelist_init(osd_data, pagelist);
246 osd_req->r_ops[which].cls.indata_len += pagelist->length;
247 osd_req->r_ops[which].indata_len += pagelist->length;
221} 248}
222EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist); 249EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist);
223 250
@@ -230,6 +257,8 @@ void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
230 osd_data = osd_req_op_data(osd_req, which, cls, request_data); 257 osd_data = osd_req_op_data(osd_req, which, cls, request_data);
231 ceph_osd_data_pages_init(osd_data, pages, length, alignment, 258 ceph_osd_data_pages_init(osd_data, pages, length, alignment,
232 pages_from_pool, own_pages); 259 pages_from_pool, own_pages);
260 osd_req->r_ops[which].cls.indata_len += length;
261 osd_req->r_ops[which].indata_len += length;
233} 262}
234EXPORT_SYMBOL(osd_req_op_cls_request_data_pages); 263EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);
235 264
@@ -302,14 +331,76 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
302 case CEPH_OSD_OP_STAT: 331 case CEPH_OSD_OP_STAT:
303 ceph_osd_data_release(&op->raw_data_in); 332 ceph_osd_data_release(&op->raw_data_in);
304 break; 333 break;
334 case CEPH_OSD_OP_NOTIFY_ACK:
335 ceph_osd_data_release(&op->notify_ack.request_data);
336 break;
337 case CEPH_OSD_OP_NOTIFY:
338 ceph_osd_data_release(&op->notify.request_data);
339 ceph_osd_data_release(&op->notify.response_data);
340 break;
305 default: 341 default:
306 break; 342 break;
307 } 343 }
308} 344}
309 345
310/* 346/*
347 * Assumes @t is zero-initialized.
348 */
349static void target_init(struct ceph_osd_request_target *t)
350{
351 ceph_oid_init(&t->base_oid);
352 ceph_oloc_init(&t->base_oloc);
353 ceph_oid_init(&t->target_oid);
354 ceph_oloc_init(&t->target_oloc);
355
356 ceph_osds_init(&t->acting);
357 ceph_osds_init(&t->up);
358 t->size = -1;
359 t->min_size = -1;
360
361 t->osd = CEPH_HOMELESS_OSD;
362}
363
364static void target_copy(struct ceph_osd_request_target *dest,
365 const struct ceph_osd_request_target *src)
366{
367 ceph_oid_copy(&dest->base_oid, &src->base_oid);
368 ceph_oloc_copy(&dest->base_oloc, &src->base_oloc);
369 ceph_oid_copy(&dest->target_oid, &src->target_oid);
370 ceph_oloc_copy(&dest->target_oloc, &src->target_oloc);
371
372 dest->pgid = src->pgid; /* struct */
373 dest->pg_num = src->pg_num;
374 dest->pg_num_mask = src->pg_num_mask;
375 ceph_osds_copy(&dest->acting, &src->acting);
376 ceph_osds_copy(&dest->up, &src->up);
377 dest->size = src->size;
378 dest->min_size = src->min_size;
379 dest->sort_bitwise = src->sort_bitwise;
380
381 dest->flags = src->flags;
382 dest->paused = src->paused;
383
384 dest->osd = src->osd;
385}
386
387static void target_destroy(struct ceph_osd_request_target *t)
388{
389 ceph_oid_destroy(&t->base_oid);
390 ceph_oid_destroy(&t->target_oid);
391}
392
393/*
311 * requests 394 * requests
312 */ 395 */
396static void request_release_checks(struct ceph_osd_request *req)
397{
398 WARN_ON(!RB_EMPTY_NODE(&req->r_node));
399 WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node));
400 WARN_ON(!list_empty(&req->r_unsafe_item));
401 WARN_ON(req->r_osd);
402}
403
313static void ceph_osdc_release_request(struct kref *kref) 404static void ceph_osdc_release_request(struct kref *kref)
314{ 405{
315 struct ceph_osd_request *req = container_of(kref, 406 struct ceph_osd_request *req = container_of(kref,
@@ -318,24 +409,19 @@ static void ceph_osdc_release_request(struct kref *kref)
318 409
319 dout("%s %p (r_request %p r_reply %p)\n", __func__, req, 410 dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
320 req->r_request, req->r_reply); 411 req->r_request, req->r_reply);
321 WARN_ON(!RB_EMPTY_NODE(&req->r_node)); 412 request_release_checks(req);
322 WARN_ON(!list_empty(&req->r_req_lru_item));
323 WARN_ON(!list_empty(&req->r_osd_item));
324 WARN_ON(!list_empty(&req->r_linger_item));
325 WARN_ON(!list_empty(&req->r_linger_osd_item));
326 WARN_ON(req->r_osd);
327 413
328 if (req->r_request) 414 if (req->r_request)
329 ceph_msg_put(req->r_request); 415 ceph_msg_put(req->r_request);
330 if (req->r_reply) { 416 if (req->r_reply)
331 ceph_msg_revoke_incoming(req->r_reply);
332 ceph_msg_put(req->r_reply); 417 ceph_msg_put(req->r_reply);
333 }
334 418
335 for (which = 0; which < req->r_num_ops; which++) 419 for (which = 0; which < req->r_num_ops; which++)
336 osd_req_op_data_release(req, which); 420 osd_req_op_data_release(req, which);
337 421
422 target_destroy(&req->r_t);
338 ceph_put_snap_context(req->r_snapc); 423 ceph_put_snap_context(req->r_snapc);
424
339 if (req->r_mempool) 425 if (req->r_mempool)
340 mempool_free(req, req->r_osdc->req_mempool); 426 mempool_free(req, req->r_osdc->req_mempool);
341 else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS) 427 else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
@@ -354,12 +440,66 @@ EXPORT_SYMBOL(ceph_osdc_get_request);
354 440
355void ceph_osdc_put_request(struct ceph_osd_request *req) 441void ceph_osdc_put_request(struct ceph_osd_request *req)
356{ 442{
357 dout("%s %p (was %d)\n", __func__, req, 443 if (req) {
358 atomic_read(&req->r_kref.refcount)); 444 dout("%s %p (was %d)\n", __func__, req,
359 kref_put(&req->r_kref, ceph_osdc_release_request); 445 atomic_read(&req->r_kref.refcount));
446 kref_put(&req->r_kref, ceph_osdc_release_request);
447 }
360} 448}
361EXPORT_SYMBOL(ceph_osdc_put_request); 449EXPORT_SYMBOL(ceph_osdc_put_request);
362 450
451static void request_init(struct ceph_osd_request *req)
452{
453 /* req only, each op is zeroed in _osd_req_op_init() */
454 memset(req, 0, sizeof(*req));
455
456 kref_init(&req->r_kref);
457 init_completion(&req->r_completion);
458 init_completion(&req->r_safe_completion);
459 RB_CLEAR_NODE(&req->r_node);
460 RB_CLEAR_NODE(&req->r_mc_node);
461 INIT_LIST_HEAD(&req->r_unsafe_item);
462
463 target_init(&req->r_t);
464}
465
466/*
467 * This is ugly, but it allows us to reuse linger registration and ping
468 * requests, keeping the structure of the code around send_linger{_ping}()
469 * reasonable. Setting up a min_nr=2 mempool for each linger request
470 * and dealing with copying ops (this blasts req only, watch op remains
471 * intact) isn't any better.
472 */
473static void request_reinit(struct ceph_osd_request *req)
474{
475 struct ceph_osd_client *osdc = req->r_osdc;
476 bool mempool = req->r_mempool;
477 unsigned int num_ops = req->r_num_ops;
478 u64 snapid = req->r_snapid;
479 struct ceph_snap_context *snapc = req->r_snapc;
480 bool linger = req->r_linger;
481 struct ceph_msg *request_msg = req->r_request;
482 struct ceph_msg *reply_msg = req->r_reply;
483
484 dout("%s req %p\n", __func__, req);
485 WARN_ON(atomic_read(&req->r_kref.refcount) != 1);
486 request_release_checks(req);
487
488 WARN_ON(atomic_read(&request_msg->kref.refcount) != 1);
489 WARN_ON(atomic_read(&reply_msg->kref.refcount) != 1);
490 target_destroy(&req->r_t);
491
492 request_init(req);
493 req->r_osdc = osdc;
494 req->r_mempool = mempool;
495 req->r_num_ops = num_ops;
496 req->r_snapid = snapid;
497 req->r_snapc = snapc;
498 req->r_linger = linger;
499 req->r_request = request_msg;
500 req->r_reply = reply_msg;
501}
502
363struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, 503struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
364 struct ceph_snap_context *snapc, 504 struct ceph_snap_context *snapc,
365 unsigned int num_ops, 505 unsigned int num_ops,
@@ -367,8 +507,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
367 gfp_t gfp_flags) 507 gfp_t gfp_flags)
368{ 508{
369 struct ceph_osd_request *req; 509 struct ceph_osd_request *req;
370 struct ceph_msg *msg;
371 size_t msg_size;
372 510
373 if (use_mempool) { 511 if (use_mempool) {
374 BUG_ON(num_ops > CEPH_OSD_SLAB_OPS); 512 BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
@@ -383,73 +521,65 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
383 if (unlikely(!req)) 521 if (unlikely(!req))
384 return NULL; 522 return NULL;
385 523
386 /* req only, each op is zeroed in _osd_req_op_init() */ 524 request_init(req);
387 memset(req, 0, sizeof(*req));
388
389 req->r_osdc = osdc; 525 req->r_osdc = osdc;
390 req->r_mempool = use_mempool; 526 req->r_mempool = use_mempool;
391 req->r_num_ops = num_ops; 527 req->r_num_ops = num_ops;
528 req->r_snapid = CEPH_NOSNAP;
529 req->r_snapc = ceph_get_snap_context(snapc);
392 530
393 kref_init(&req->r_kref); 531 dout("%s req %p\n", __func__, req);
394 init_completion(&req->r_completion); 532 return req;
395 init_completion(&req->r_safe_completion); 533}
396 RB_CLEAR_NODE(&req->r_node); 534EXPORT_SYMBOL(ceph_osdc_alloc_request);
397 INIT_LIST_HEAD(&req->r_unsafe_item);
398 INIT_LIST_HEAD(&req->r_linger_item);
399 INIT_LIST_HEAD(&req->r_linger_osd_item);
400 INIT_LIST_HEAD(&req->r_req_lru_item);
401 INIT_LIST_HEAD(&req->r_osd_item);
402
403 req->r_base_oloc.pool = -1;
404 req->r_target_oloc.pool = -1;
405 535
406 msg_size = OSD_OPREPLY_FRONT_LEN; 536int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
407 if (num_ops > CEPH_OSD_SLAB_OPS) { 537{
408 /* ceph_osd_op and rval */ 538 struct ceph_osd_client *osdc = req->r_osdc;
409 msg_size += (num_ops - CEPH_OSD_SLAB_OPS) * 539 struct ceph_msg *msg;
410 (sizeof(struct ceph_osd_op) + 4); 540 int msg_size;
411 }
412 541
413 /* create reply message */ 542 WARN_ON(ceph_oid_empty(&req->r_base_oid));
414 if (use_mempool)
415 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
416 else
417 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size,
418 gfp_flags, true);
419 if (!msg) {
420 ceph_osdc_put_request(req);
421 return NULL;
422 }
423 req->r_reply = msg;
424 543
544 /* create request message */
425 msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */ 545 msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
426 msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */ 546 msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
427 msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ 547 msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
428 msg_size += 1 + 8 + 4 + 4; /* pgid */ 548 msg_size += 1 + 8 + 4 + 4; /* pgid */
429 msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */ 549 msg_size += 4 + req->r_base_oid.name_len; /* oid */
430 msg_size += 2 + num_ops * sizeof(struct ceph_osd_op); 550 msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
431 msg_size += 8; /* snapid */ 551 msg_size += 8; /* snapid */
432 msg_size += 8; /* snap_seq */ 552 msg_size += 8; /* snap_seq */
433 msg_size += 4 + 8 * (snapc ? snapc->num_snaps : 0); /* snaps */ 553 msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0);
434 msg_size += 4; /* retry_attempt */ 554 msg_size += 4; /* retry_attempt */
435 555
436 /* create request message; allow space for oid */ 556 if (req->r_mempool)
437 if (use_mempool)
438 msg = ceph_msgpool_get(&osdc->msgpool_op, 0); 557 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
439 else 558 else
440 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true); 559 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true);
441 if (!msg) { 560 if (!msg)
442 ceph_osdc_put_request(req); 561 return -ENOMEM;
443 return NULL;
444 }
445 562
446 memset(msg->front.iov_base, 0, msg->front.iov_len); 563 memset(msg->front.iov_base, 0, msg->front.iov_len);
447
448 req->r_request = msg; 564 req->r_request = msg;
449 565
450 return req; 566 /* create reply message */
567 msg_size = OSD_OPREPLY_FRONT_LEN;
568 msg_size += req->r_base_oid.name_len;
569 msg_size += req->r_num_ops * sizeof(struct ceph_osd_op);
570
571 if (req->r_mempool)
572 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
573 else
574 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true);
575 if (!msg)
576 return -ENOMEM;
577
578 req->r_reply = msg;
579
580 return 0;
451} 581}
452EXPORT_SYMBOL(ceph_osdc_alloc_request); 582EXPORT_SYMBOL(ceph_osdc_alloc_messages);
453 583
454static bool osd_req_opcode_valid(u16 opcode) 584static bool osd_req_opcode_valid(u16 opcode)
455{ 585{
@@ -587,8 +717,6 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
587 717
588 osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); 718 osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
589 719
590 op->cls.argc = 0; /* currently unused */
591
592 op->indata_len = payload_len; 720 op->indata_len = payload_len;
593} 721}
594EXPORT_SYMBOL(osd_req_op_cls_init); 722EXPORT_SYMBOL(osd_req_op_cls_init);
@@ -627,21 +755,19 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
627} 755}
628EXPORT_SYMBOL(osd_req_op_xattr_init); 756EXPORT_SYMBOL(osd_req_op_xattr_init);
629 757
630void osd_req_op_watch_init(struct ceph_osd_request *osd_req, 758/*
631 unsigned int which, u16 opcode, 759 * @watch_opcode: CEPH_OSD_WATCH_OP_*
632 u64 cookie, u64 version, int flag) 760 */
761static void osd_req_op_watch_init(struct ceph_osd_request *req, int which,
762 u64 cookie, u8 watch_opcode)
633{ 763{
634 struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, 764 struct ceph_osd_req_op *op;
635 opcode, 0);
636
637 BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH);
638 765
766 op = _osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0);
639 op->watch.cookie = cookie; 767 op->watch.cookie = cookie;
640 op->watch.ver = version; 768 op->watch.op = watch_opcode;
641 if (opcode == CEPH_OSD_OP_WATCH && flag) 769 op->watch.gen = 0;
642 op->watch.flag = (u8)1;
643} 770}
644EXPORT_SYMBOL(osd_req_op_watch_init);
645 771
646void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, 772void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
647 unsigned int which, 773 unsigned int which,
@@ -686,16 +812,9 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
686 } 812 }
687} 813}
688 814
689static u64 osd_req_encode_op(struct ceph_osd_request *req, 815static u32 osd_req_encode_op(struct ceph_osd_op *dst,
690 struct ceph_osd_op *dst, unsigned int which) 816 const struct ceph_osd_req_op *src)
691{ 817{
692 struct ceph_osd_req_op *src;
693 struct ceph_osd_data *osd_data;
694 u64 request_data_len = 0;
695 u64 data_length;
696
697 BUG_ON(which >= req->r_num_ops);
698 src = &req->r_ops[which];
699 if (WARN_ON(!osd_req_opcode_valid(src->op))) { 818 if (WARN_ON(!osd_req_opcode_valid(src->op))) {
700 pr_err("unrecognized osd opcode %d\n", src->op); 819 pr_err("unrecognized osd opcode %d\n", src->op);
701 820
@@ -704,57 +823,36 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
704 823
705 switch (src->op) { 824 switch (src->op) {
706 case CEPH_OSD_OP_STAT: 825 case CEPH_OSD_OP_STAT:
707 osd_data = &src->raw_data_in;
708 ceph_osdc_msg_data_add(req->r_reply, osd_data);
709 break; 826 break;
710 case CEPH_OSD_OP_READ: 827 case CEPH_OSD_OP_READ:
711 case CEPH_OSD_OP_WRITE: 828 case CEPH_OSD_OP_WRITE:
712 case CEPH_OSD_OP_WRITEFULL: 829 case CEPH_OSD_OP_WRITEFULL:
713 case CEPH_OSD_OP_ZERO: 830 case CEPH_OSD_OP_ZERO:
714 case CEPH_OSD_OP_TRUNCATE: 831 case CEPH_OSD_OP_TRUNCATE:
715 if (src->op == CEPH_OSD_OP_WRITE ||
716 src->op == CEPH_OSD_OP_WRITEFULL)
717 request_data_len = src->extent.length;
718 dst->extent.offset = cpu_to_le64(src->extent.offset); 832 dst->extent.offset = cpu_to_le64(src->extent.offset);
719 dst->extent.length = cpu_to_le64(src->extent.length); 833 dst->extent.length = cpu_to_le64(src->extent.length);
720 dst->extent.truncate_size = 834 dst->extent.truncate_size =
721 cpu_to_le64(src->extent.truncate_size); 835 cpu_to_le64(src->extent.truncate_size);
722 dst->extent.truncate_seq = 836 dst->extent.truncate_seq =
723 cpu_to_le32(src->extent.truncate_seq); 837 cpu_to_le32(src->extent.truncate_seq);
724 osd_data = &src->extent.osd_data;
725 if (src->op == CEPH_OSD_OP_WRITE ||
726 src->op == CEPH_OSD_OP_WRITEFULL)
727 ceph_osdc_msg_data_add(req->r_request, osd_data);
728 else
729 ceph_osdc_msg_data_add(req->r_reply, osd_data);
730 break; 838 break;
731 case CEPH_OSD_OP_CALL: 839 case CEPH_OSD_OP_CALL:
732 dst->cls.class_len = src->cls.class_len; 840 dst->cls.class_len = src->cls.class_len;
733 dst->cls.method_len = src->cls.method_len; 841 dst->cls.method_len = src->cls.method_len;
734 osd_data = &src->cls.request_info; 842 dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
735 ceph_osdc_msg_data_add(req->r_request, osd_data);
736 BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST);
737 request_data_len = osd_data->pagelist->length;
738
739 osd_data = &src->cls.request_data;
740 data_length = ceph_osd_data_length(osd_data);
741 if (data_length) {
742 BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE);
743 dst->cls.indata_len = cpu_to_le32(data_length);
744 ceph_osdc_msg_data_add(req->r_request, osd_data);
745 src->indata_len += data_length;
746 request_data_len += data_length;
747 }
748 osd_data = &src->cls.response_data;
749 ceph_osdc_msg_data_add(req->r_reply, osd_data);
750 break; 843 break;
751 case CEPH_OSD_OP_STARTSYNC: 844 case CEPH_OSD_OP_STARTSYNC:
752 break; 845 break;
753 case CEPH_OSD_OP_NOTIFY_ACK:
754 case CEPH_OSD_OP_WATCH: 846 case CEPH_OSD_OP_WATCH:
755 dst->watch.cookie = cpu_to_le64(src->watch.cookie); 847 dst->watch.cookie = cpu_to_le64(src->watch.cookie);
756 dst->watch.ver = cpu_to_le64(src->watch.ver); 848 dst->watch.ver = cpu_to_le64(0);
757 dst->watch.flag = src->watch.flag; 849 dst->watch.op = src->watch.op;
850 dst->watch.gen = cpu_to_le32(src->watch.gen);
851 break;
852 case CEPH_OSD_OP_NOTIFY_ACK:
853 break;
854 case CEPH_OSD_OP_NOTIFY:
855 dst->notify.cookie = cpu_to_le64(src->notify.cookie);
758 break; 856 break;
759 case CEPH_OSD_OP_SETALLOCHINT: 857 case CEPH_OSD_OP_SETALLOCHINT:
760 dst->alloc_hint.expected_object_size = 858 dst->alloc_hint.expected_object_size =
@@ -768,9 +866,6 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
768 dst->xattr.value_len = cpu_to_le32(src->xattr.value_len); 866 dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
769 dst->xattr.cmp_op = src->xattr.cmp_op; 867 dst->xattr.cmp_op = src->xattr.cmp_op;
770 dst->xattr.cmp_mode = src->xattr.cmp_mode; 868 dst->xattr.cmp_mode = src->xattr.cmp_mode;
771 osd_data = &src->xattr.osd_data;
772 ceph_osdc_msg_data_add(req->r_request, osd_data);
773 request_data_len = osd_data->pagelist->length;
774 break; 869 break;
775 case CEPH_OSD_OP_CREATE: 870 case CEPH_OSD_OP_CREATE:
776 case CEPH_OSD_OP_DELETE: 871 case CEPH_OSD_OP_DELETE:
@@ -787,7 +882,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
787 dst->flags = cpu_to_le32(src->flags); 882 dst->flags = cpu_to_le32(src->flags);
788 dst->payload_len = cpu_to_le32(src->indata_len); 883 dst->payload_len = cpu_to_le32(src->indata_len);
789 884
790 return request_data_len; 885 return src->indata_len;
791} 886}
792 887
793/* 888/*
@@ -824,17 +919,15 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
824 919
825 req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool, 920 req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
826 GFP_NOFS); 921 GFP_NOFS);
827 if (!req) 922 if (!req) {
828 return ERR_PTR(-ENOMEM); 923 r = -ENOMEM;
829 924 goto fail;
830 req->r_flags = flags; 925 }
831 926
832 /* calculate max write size */ 927 /* calculate max write size */
833 r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen); 928 r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
834 if (r < 0) { 929 if (r)
835 ceph_osdc_put_request(req); 930 goto fail;
836 return ERR_PTR(r);
837 }
838 931
839 if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) { 932 if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
840 osd_req_op_init(req, which, opcode, 0); 933 osd_req_op_init(req, which, opcode, 0);
@@ -854,194 +947,71 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
854 truncate_size, truncate_seq); 947 truncate_size, truncate_seq);
855 } 948 }
856 949
950 req->r_flags = flags;
857 req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); 951 req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
952 ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);
858 953
859 snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name), 954 req->r_snapid = vino.snap;
860 "%llx.%08llx", vino.ino, objnum); 955 if (flags & CEPH_OSD_FLAG_WRITE)
861 req->r_base_oid.name_len = strlen(req->r_base_oid.name); 956 req->r_data_offset = off;
957
958 r = ceph_osdc_alloc_messages(req, GFP_NOFS);
959 if (r)
960 goto fail;
862 961
863 return req; 962 return req;
963
964fail:
965 ceph_osdc_put_request(req);
966 return ERR_PTR(r);
864} 967}
865EXPORT_SYMBOL(ceph_osdc_new_request); 968EXPORT_SYMBOL(ceph_osdc_new_request);
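
A hedged sketch of how a filesystem caller typically drives this constructor (modeled on the fs/ceph read path; ci, vino, off and count stand in for per-inode state and are assumptions here):

	u64 len = count;	/* clamped to one object by the call */
	struct ceph_osd_request *req;

	req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len,
				    0, 1, CEPH_OSD_OP_READ,
				    CEPH_OSD_FLAG_READ, NULL,
				    ci->i_truncate_seq, ci->i_truncate_size,
				    false);
	if (IS_ERR(req))
		return PTR_ERR(req);	/* now covers message allocation too */
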
866 969
867/* 970/*
868 * We keep osd requests in an rbtree, sorted by ->r_tid. 971 * We keep osd requests in an rbtree, sorted by ->r_tid.
869 */ 972 */
870static void __insert_request(struct ceph_osd_client *osdc, 973DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node)
871 struct ceph_osd_request *new) 974DEFINE_RB_FUNCS(request_mc, struct ceph_osd_request, r_tid, r_mc_node)
872{
873 struct rb_node **p = &osdc->requests.rb_node;
874 struct rb_node *parent = NULL;
875 struct ceph_osd_request *req = NULL;
876
877 while (*p) {
878 parent = *p;
879 req = rb_entry(parent, struct ceph_osd_request, r_node);
880 if (new->r_tid < req->r_tid)
881 p = &(*p)->rb_left;
882 else if (new->r_tid > req->r_tid)
883 p = &(*p)->rb_right;
884 else
885 BUG();
886 }
887
888 rb_link_node(&new->r_node, parent, p);
889 rb_insert_color(&new->r_node, &osdc->requests);
890}
891
892static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
893 u64 tid)
894{
895 struct ceph_osd_request *req;
896 struct rb_node *n = osdc->requests.rb_node;
897
898 while (n) {
899 req = rb_entry(n, struct ceph_osd_request, r_node);
900 if (tid < req->r_tid)
901 n = n->rb_left;
902 else if (tid > req->r_tid)
903 n = n->rb_right;
904 else
905 return req;
906 }
907 return NULL;
908}
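
DEFINE_RB_FUNCS(request, ...) above replaces exactly the kind of open-coded tid walk being deleted here: it presumably expands to insert_request()/lookup_request()/erase_request() helpers generated from the key and node fields. A sketch of the generated insert, not the macro's verbatim expansion:

	static void insert_request(struct rb_root *root,
				   struct ceph_osd_request *req)
	{
		struct rb_node **n = &root->rb_node;
		struct rb_node *parent = NULL;

		while (*n) {
			struct ceph_osd_request *cur =
			    rb_entry(*n, struct ceph_osd_request, r_node);

			parent = *n;
			if (req->r_tid < cur->r_tid)
				n = &(*n)->rb_left;
			else if (req->r_tid > cur->r_tid)
				n = &(*n)->rb_right;
			else
				BUG();	/* tids are unique */
		}

		rb_link_node(&req->r_node, parent, n);
		rb_insert_color(&req->r_node, root);
	}
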
909 975
910static struct ceph_osd_request * 976static bool osd_homeless(struct ceph_osd *osd)
911__lookup_request_ge(struct ceph_osd_client *osdc,
912 u64 tid)
913{ 977{
914 struct ceph_osd_request *req; 978 return osd->o_osd == CEPH_HOMELESS_OSD;
915 struct rb_node *n = osdc->requests.rb_node;
916
917 while (n) {
918 req = rb_entry(n, struct ceph_osd_request, r_node);
919 if (tid < req->r_tid) {
920 if (!n->rb_left)
921 return req;
922 n = n->rb_left;
923 } else if (tid > req->r_tid) {
924 n = n->rb_right;
925 } else {
926 return req;
927 }
928 }
929 return NULL;
930} 979}
931 980
932static void __kick_linger_request(struct ceph_osd_request *req) 981static bool osd_registered(struct ceph_osd *osd)
933{ 982{
934 struct ceph_osd_client *osdc = req->r_osdc; 983 verify_osdc_locked(osd->o_osdc);
935 struct ceph_osd *osd = req->r_osd;
936
937 /*
938 * Linger requests need to be resent with a new tid to avoid
939 * the dup op detection logic on the OSDs. Achieve this with
940 * a re-register dance instead of open-coding.
941 */
942 ceph_osdc_get_request(req);
943 if (!list_empty(&req->r_linger_item))
944 __unregister_linger_request(osdc, req);
945 else
946 __unregister_request(osdc, req);
947 __register_request(osdc, req);
948 ceph_osdc_put_request(req);
949
950 /*
951 * Unless request has been registered as both normal and
952 * lingering, __unregister{,_linger}_request clears r_osd.
953 * However, here we need to preserve r_osd to make sure we
954 * requeue on the same OSD.
955 */
956 WARN_ON(req->r_osd || !osd);
957 req->r_osd = osd;
958 984
959 dout("%s requeueing %p tid %llu\n", __func__, req, req->r_tid); 985 return !RB_EMPTY_NODE(&osd->o_node);
960 __enqueue_request(req);
961} 986}
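
osd_registered() relies on the RB_CLEAR_NODE()/RB_EMPTY_NODE() idiom: a cleared rb_node points at itself, so tree membership is an O(1) test rather than a lookup. In outline (insert_osd() is generated by DEFINE_RB_FUNCS(osd, ...) further down):

	RB_CLEAR_NODE(&osd->o_node);		/* marked "in no tree" */
	WARN_ON(!RB_EMPTY_NODE(&osd->o_node));	/* not registered yet */
	insert_osd(&osdc->osds, osd);		/* links o_node */
	WARN_ON(RB_EMPTY_NODE(&osd->o_node));	/* now registered */
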
962 987
963/* 988/*
964 * Resubmit requests pending on the given osd. 989 * Assumes @osd is zero-initialized.
965 */ 990 */
966static void __kick_osd_requests(struct ceph_osd_client *osdc, 991static void osd_init(struct ceph_osd *osd)
967 struct ceph_osd *osd)
968{ 992{
969 struct ceph_osd_request *req, *nreq; 993 atomic_set(&osd->o_ref, 1);
970 LIST_HEAD(resend); 994 RB_CLEAR_NODE(&osd->o_node);
971 LIST_HEAD(resend_linger); 995 osd->o_requests = RB_ROOT;
972 int err; 996 osd->o_linger_requests = RB_ROOT;
973 997 INIT_LIST_HEAD(&osd->o_osd_lru);
974 dout("%s osd%d\n", __func__, osd->o_osd); 998 INIT_LIST_HEAD(&osd->o_keepalive_item);
975 err = __reset_osd(osdc, osd); 999 osd->o_incarnation = 1;
976 if (err) 1000 mutex_init(&osd->lock);
977 return;
978
979 /*
980 * Build up a list of requests to resend by traversing the
981 * osd's list of requests. Requests for a given object are
982 * sent in tid order, and that is also the order they're
983 * kept on this list. Therefore all requests that are in
984 * flight will be found first, followed by all requests that
985 * have not yet been sent. And to resend requests while
986 * preserving this order we will want to put any sent
987 * requests back on the front of the osd client's unsent
988 * list.
989 *
990 * So we build a separate ordered list of already-sent
991 * requests for the affected osd and splice it onto the
992 * front of the osd client's unsent list. Once we've seen a
993 * request that has not yet been sent we're done. Those
994 * requests are already sitting right where they belong.
995 */
996 list_for_each_entry(req, &osd->o_requests, r_osd_item) {
997 if (!req->r_sent)
998 break;
999
1000 if (!req->r_linger) {
1001 dout("%s requeueing %p tid %llu\n", __func__, req,
1002 req->r_tid);
1003 list_move_tail(&req->r_req_lru_item, &resend);
1004 req->r_flags |= CEPH_OSD_FLAG_RETRY;
1005 } else {
1006 list_move_tail(&req->r_req_lru_item, &resend_linger);
1007 }
1008 }
1009 list_splice(&resend, &osdc->req_unsent);
1010
1011 /*
1012 * Both registered and not yet registered linger requests are
1013 * enqueued with a new tid on the same OSD. We add/move them
1014 * to req_unsent/o_requests at the end to keep things in tid
1015 * order.
1016 */
1017 list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
1018 r_linger_osd_item) {
1019 WARN_ON(!list_empty(&req->r_req_lru_item));
1020 __kick_linger_request(req);
1021 }
1022
1023 list_for_each_entry_safe(req, nreq, &resend_linger, r_req_lru_item)
1024 __kick_linger_request(req);
1025} 1001}
1026 1002
1027/* 1003static void osd_cleanup(struct ceph_osd *osd)
1028 * If the osd connection drops, we need to resubmit all requests.
1029 */
1030static void osd_reset(struct ceph_connection *con)
1031{ 1004{
1032 struct ceph_osd *osd = con->private; 1005 WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
1033 struct ceph_osd_client *osdc; 1006 WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
1034 1007 WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
1035 if (!osd) 1008 WARN_ON(!list_empty(&osd->o_osd_lru));
1036 return; 1009 WARN_ON(!list_empty(&osd->o_keepalive_item));
1037 dout("osd_reset osd%d\n", osd->o_osd); 1010
1038 osdc = osd->o_osdc; 1011 if (osd->o_auth.authorizer) {
1039 down_read(&osdc->map_sem); 1012 WARN_ON(osd_homeless(osd));
1040 mutex_lock(&osdc->request_mutex); 1013 ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
1041 __kick_osd_requests(osdc, osd); 1014 }
1042 __send_queued(osdc);
1043 mutex_unlock(&osdc->request_mutex);
1044 up_read(&osdc->map_sem);
1045} 1015}
1046 1016
1047/* 1017/*
@@ -1051,22 +1021,15 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
1051{ 1021{
1052 struct ceph_osd *osd; 1022 struct ceph_osd *osd;
1053 1023
1054 osd = kzalloc(sizeof(*osd), GFP_NOFS); 1024 WARN_ON(onum == CEPH_HOMELESS_OSD);
1055 if (!osd)
1056 return NULL;
1057 1025
1058 atomic_set(&osd->o_ref, 1); 1026 osd = kzalloc(sizeof(*osd), GFP_NOIO | __GFP_NOFAIL);
1027 osd_init(osd);
1059 osd->o_osdc = osdc; 1028 osd->o_osdc = osdc;
1060 osd->o_osd = onum; 1029 osd->o_osd = onum;
1061 RB_CLEAR_NODE(&osd->o_node);
1062 INIT_LIST_HEAD(&osd->o_requests);
1063 INIT_LIST_HEAD(&osd->o_linger_requests);
1064 INIT_LIST_HEAD(&osd->o_osd_lru);
1065 osd->o_incarnation = 1;
1066 1030
1067 ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr); 1031 ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
1068 1032
1069 INIT_LIST_HEAD(&osd->o_keepalive_item);
1070 return osd; 1033 return osd;
1071} 1034}
1072 1035
@@ -1087,114 +1050,115 @@ static void put_osd(struct ceph_osd *osd)
1087 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), 1050 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
1088 atomic_read(&osd->o_ref) - 1); 1051 atomic_read(&osd->o_ref) - 1);
1089 if (atomic_dec_and_test(&osd->o_ref)) { 1052 if (atomic_dec_and_test(&osd->o_ref)) {
1090 if (osd->o_auth.authorizer) 1053 osd_cleanup(osd);
1091 ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
1092 kfree(osd); 1054 kfree(osd);
1093 } 1055 }
1094} 1056}
1095 1057
1096/* 1058DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node)
1097 * remove an osd from our map
1098 */
1099static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
1100{
1101 dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
1102 WARN_ON(!list_empty(&osd->o_requests));
1103 WARN_ON(!list_empty(&osd->o_linger_requests));
1104 1059
1105 list_del_init(&osd->o_osd_lru); 1060static void __move_osd_to_lru(struct ceph_osd *osd)
1106 rb_erase(&osd->o_node, &osdc->osds);
1107 RB_CLEAR_NODE(&osd->o_node);
1108}
1109
1110static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
1111{ 1061{
1112 dout("%s %p osd%d\n", __func__, osd, osd->o_osd); 1062 struct ceph_osd_client *osdc = osd->o_osdc;
1113
1114 if (!RB_EMPTY_NODE(&osd->o_node)) {
1115 ceph_con_close(&osd->o_con);
1116 __remove_osd(osdc, osd);
1117 put_osd(osd);
1118 }
1119}
1120
1121static void remove_all_osds(struct ceph_osd_client *osdc)
1122{
1123 dout("%s %p\n", __func__, osdc);
1124 mutex_lock(&osdc->request_mutex);
1125 while (!RB_EMPTY_ROOT(&osdc->osds)) {
1126 struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
1127 struct ceph_osd, o_node);
1128 remove_osd(osdc, osd);
1129 }
1130 mutex_unlock(&osdc->request_mutex);
1131}
1132 1063
1133static void __move_osd_to_lru(struct ceph_osd_client *osdc, 1064 dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
1134 struct ceph_osd *osd)
1135{
1136 dout("%s %p\n", __func__, osd);
1137 BUG_ON(!list_empty(&osd->o_osd_lru)); 1065 BUG_ON(!list_empty(&osd->o_osd_lru));
1138 1066
1067 spin_lock(&osdc->osd_lru_lock);
1139 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); 1068 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
1069 spin_unlock(&osdc->osd_lru_lock);
1070
1140 osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl; 1071 osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
1141} 1072}
1142 1073
1143static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc, 1074static void maybe_move_osd_to_lru(struct ceph_osd *osd)
1144 struct ceph_osd *osd)
1145{ 1075{
1146 dout("%s %p\n", __func__, osd); 1076 if (RB_EMPTY_ROOT(&osd->o_requests) &&
1147 1077 RB_EMPTY_ROOT(&osd->o_linger_requests))
1148 if (list_empty(&osd->o_requests) && 1078 __move_osd_to_lru(osd);
1149 list_empty(&osd->o_linger_requests))
1150 __move_osd_to_lru(osdc, osd);
1151} 1079}
1152 1080
1153static void __remove_osd_from_lru(struct ceph_osd *osd) 1081static void __remove_osd_from_lru(struct ceph_osd *osd)
1154{ 1082{
1155 dout("__remove_osd_from_lru %p\n", osd); 1083 struct ceph_osd_client *osdc = osd->o_osdc;
1084
1085 dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
1086
1087 spin_lock(&osdc->osd_lru_lock);
1156 if (!list_empty(&osd->o_osd_lru)) 1088 if (!list_empty(&osd->o_osd_lru))
1157 list_del_init(&osd->o_osd_lru); 1089 list_del_init(&osd->o_osd_lru);
1090 spin_unlock(&osdc->osd_lru_lock);
1158} 1091}
1159 1092
1160static void remove_old_osds(struct ceph_osd_client *osdc) 1093/*
1094 * Close the connection and assign any leftover requests to the
1095 * homeless session.
1096 */
1097static void close_osd(struct ceph_osd *osd)
1161{ 1098{
1162 struct ceph_osd *osd, *nosd; 1099 struct ceph_osd_client *osdc = osd->o_osdc;
1100 struct rb_node *n;
1163 1101
1164 dout("__remove_old_osds %p\n", osdc); 1102 verify_osdc_wrlocked(osdc);
1165 mutex_lock(&osdc->request_mutex); 1103 dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
1166 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { 1104
1167 if (time_before(jiffies, osd->lru_ttl)) 1105 ceph_con_close(&osd->o_con);
1168 break; 1106
1169 remove_osd(osdc, osd); 1107 for (n = rb_first(&osd->o_requests); n; ) {
1108 struct ceph_osd_request *req =
1109 rb_entry(n, struct ceph_osd_request, r_node);
1110
1111 n = rb_next(n); /* unlink_request() */
1112
1113 dout(" reassigning req %p tid %llu\n", req, req->r_tid);
1114 unlink_request(osd, req);
1115 link_request(&osdc->homeless_osd, req);
1116 }
1117 for (n = rb_first(&osd->o_linger_requests); n; ) {
1118 struct ceph_osd_linger_request *lreq =
1119 rb_entry(n, struct ceph_osd_linger_request, node);
1120
1121 n = rb_next(n); /* unlink_linger() */
1122
1123 dout(" reassigning lreq %p linger_id %llu\n", lreq,
1124 lreq->linger_id);
1125 unlink_linger(osd, lreq);
1126 link_linger(&osdc->homeless_osd, lreq);
1170 } 1127 }
1171 mutex_unlock(&osdc->request_mutex); 1128
1129 __remove_osd_from_lru(osd);
1130 erase_osd(&osdc->osds, osd);
1131 put_osd(osd);
1172} 1132}
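
The reassignment loops above use the standard pattern for walking an rbtree whose nodes are unlinked during the walk: advance with rb_next() before erasing the current node. In isolation, with a hypothetical entry type:

	struct rb_node *n = rb_first(root);

	while (n) {
		struct item *it = rb_entry(n, struct item, node);

		n = rb_next(n);		/* still safe: it not yet erased */
		rb_erase(&it->node, root);
		RB_CLEAR_NODE(&it->node);
	}
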
1173 1133
1174/* 1134/*
1175 * reset osd connect 1135 * reset osd connect
1176 */ 1136 */
1177static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) 1137static int reopen_osd(struct ceph_osd *osd)
1178{ 1138{
1179 struct ceph_entity_addr *peer_addr; 1139 struct ceph_entity_addr *peer_addr;
1180 1140
1181 dout("__reset_osd %p osd%d\n", osd, osd->o_osd); 1141 dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
1182 if (list_empty(&osd->o_requests) && 1142
1183 list_empty(&osd->o_linger_requests)) { 1143 if (RB_EMPTY_ROOT(&osd->o_requests) &&
1184 remove_osd(osdc, osd); 1144 RB_EMPTY_ROOT(&osd->o_linger_requests)) {
1145 close_osd(osd);
1185 return -ENODEV; 1146 return -ENODEV;
1186 } 1147 }
1187 1148
1188 peer_addr = &osdc->osdmap->osd_addr[osd->o_osd]; 1149 peer_addr = &osd->o_osdc->osdmap->osd_addr[osd->o_osd];
1189 if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) && 1150 if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
1190 !ceph_con_opened(&osd->o_con)) { 1151 !ceph_con_opened(&osd->o_con)) {
1191 struct ceph_osd_request *req; 1152 struct rb_node *n;
1192 1153
1193 dout("osd addr hasn't changed and connection never opened, " 1154 dout("osd addr hasn't changed and connection never opened, "
1194 "letting msgr retry\n"); 1155 "letting msgr retry\n");
1195 /* touch each r_stamp for handle_timeout()'s benefit */ 1156 /* touch each r_stamp for handle_timeout()'s benefit */
1196 list_for_each_entry(req, &osd->o_requests, r_osd_item) 1157 for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
1158 struct ceph_osd_request *req =
1159 rb_entry(n, struct ceph_osd_request, r_node);
1197 req->r_stamp = jiffies; 1160 req->r_stamp = jiffies;
1161 }
1198 1162
1199 return -EAGAIN; 1163 return -EAGAIN;
1200 } 1164 }
@@ -1206,455 +1170,1370 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
1206 return 0; 1170 return 0;
1207} 1171}
1208 1172
1209static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new) 1173static struct ceph_osd *lookup_create_osd(struct ceph_osd_client *osdc, int o,
1174 bool wrlocked)
1210{ 1175{
1211 struct rb_node **p = &osdc->osds.rb_node; 1176 struct ceph_osd *osd;
1212 struct rb_node *parent = NULL;
1213 struct ceph_osd *osd = NULL;
1214 1177
1215 dout("__insert_osd %p osd%d\n", new, new->o_osd); 1178 if (wrlocked)
1216 while (*p) { 1179 verify_osdc_wrlocked(osdc);
1217 parent = *p; 1180 else
1218 osd = rb_entry(parent, struct ceph_osd, o_node); 1181 verify_osdc_locked(osdc);
1219 if (new->o_osd < osd->o_osd) 1182
1220 p = &(*p)->rb_left; 1183 if (o != CEPH_HOMELESS_OSD)
1221 else if (new->o_osd > osd->o_osd) 1184 osd = lookup_osd(&osdc->osds, o);
1222 p = &(*p)->rb_right; 1185 else
1223 else 1186 osd = &osdc->homeless_osd;
1224 BUG(); 1187 if (!osd) {
1188 if (!wrlocked)
1189 return ERR_PTR(-EAGAIN);
1190
1191 osd = create_osd(osdc, o);
1192 insert_osd(&osdc->osds, osd);
1193 ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
1194 &osdc->osdmap->osd_addr[osd->o_osd]);
1225 } 1195 }
1226 1196
1227 rb_link_node(&new->o_node, parent, p); 1197 dout("%s osdc %p osd%d -> osd %p\n", __func__, osdc, o, osd);
1228 rb_insert_color(&new->o_node, &osdc->osds); 1198 return osd;
1229} 1199}
1230 1200
1231static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o) 1201/*
1202 * Create request <-> OSD session relation.
1203 *
1204 * @req has to be assigned a tid, @osd may be homeless.
1205 */
1206static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req)
1232{ 1207{
1233 struct ceph_osd *osd; 1208 verify_osd_locked(osd);
1234 struct rb_node *n = osdc->osds.rb_node; 1209 WARN_ON(!req->r_tid || req->r_osd);
1235 1210 dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
1236 while (n) { 1211 req, req->r_tid);
1237 osd = rb_entry(n, struct ceph_osd, o_node); 1212
1238 if (o < osd->o_osd) 1213 if (!osd_homeless(osd))
1239 n = n->rb_left; 1214 __remove_osd_from_lru(osd);
1240 else if (o > osd->o_osd) 1215 else
1241 n = n->rb_right; 1216 atomic_inc(&osd->o_osdc->num_homeless);
1242 else 1217
1243 return osd; 1218 get_osd(osd);
1244 } 1219 insert_request(&osd->o_requests, req);
1245 return NULL; 1220 req->r_osd = osd;
1246} 1221}
1247 1222
1248static void __schedule_osd_timeout(struct ceph_osd_client *osdc) 1223static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req)
1249{ 1224{
1250 schedule_delayed_work(&osdc->timeout_work, 1225 verify_osd_locked(osd);
1251 osdc->client->options->osd_keepalive_timeout); 1226 WARN_ON(req->r_osd != osd);
1227 dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
1228 req, req->r_tid);
1229
1230 req->r_osd = NULL;
1231 erase_request(&osd->o_requests, req);
1232 put_osd(osd);
1233
1234 if (!osd_homeless(osd))
1235 maybe_move_osd_to_lru(osd);
1236 else
1237 atomic_dec(&osd->o_osdc->num_homeless);
1252} 1238}
1253 1239
1254static void __cancel_osd_timeout(struct ceph_osd_client *osdc) 1240static bool __pool_full(struct ceph_pg_pool_info *pi)
1255{ 1241{
1256 cancel_delayed_work(&osdc->timeout_work); 1242 return pi->flags & CEPH_POOL_FLAG_FULL;
1257} 1243}
1258 1244
1259/* 1245static bool have_pool_full(struct ceph_osd_client *osdc)
1260 * Register request, assign tid. If this is the first request, set up
1261 * the timeout event.
1262 */
1263static void __register_request(struct ceph_osd_client *osdc,
1264 struct ceph_osd_request *req)
1265{ 1246{
1266 req->r_tid = ++osdc->last_tid; 1247 struct rb_node *n;
1267 req->r_request->hdr.tid = cpu_to_le64(req->r_tid); 1248
1268 dout("__register_request %p tid %lld\n", req, req->r_tid); 1249 for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
1269 __insert_request(osdc, req); 1250 struct ceph_pg_pool_info *pi =
1270 ceph_osdc_get_request(req); 1251 rb_entry(n, struct ceph_pg_pool_info, node);
1271 osdc->num_requests++; 1252
1272 if (osdc->num_requests == 1) { 1253 if (__pool_full(pi))
1273 dout(" first request, scheduling timeout\n"); 1254 return true;
1274 __schedule_osd_timeout(osdc);
1275 } 1255 }
1256
1257 return false;
1258}
1259
1260static bool pool_full(struct ceph_osd_client *osdc, s64 pool_id)
1261{
1262 struct ceph_pg_pool_info *pi;
1263
1264 pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
1265 if (!pi)
1266 return false;
1267
1268 return __pool_full(pi);
1276} 1269}
1277 1270
1278/* 1271/*
1279 * called under osdc->request_mutex 1272 * Returns whether a request should be blocked from being sent
1273 * based on the current osdmap and osd_client settings.
1280 */ 1274 */
1281static void __unregister_request(struct ceph_osd_client *osdc, 1275static bool target_should_be_paused(struct ceph_osd_client *osdc,
1282 struct ceph_osd_request *req) 1276 const struct ceph_osd_request_target *t,
1277 struct ceph_pg_pool_info *pi)
1283{ 1278{
1284 if (RB_EMPTY_NODE(&req->r_node)) { 1279 bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
1285 dout("__unregister_request %p tid %lld not registered\n", 1280 bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
1286 req, req->r_tid); 1281 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
1287 return; 1282 __pool_full(pi);
1283
1284 WARN_ON(pi->id != t->base_oloc.pool);
1285 return (t->flags & CEPH_OSD_FLAG_READ && pauserd) ||
1286 (t->flags & CEPH_OSD_FLAG_WRITE && pausewr);
1287}
1288
1289enum calc_target_result {
1290 CALC_TARGET_NO_ACTION = 0,
1291 CALC_TARGET_NEED_RESEND,
1292 CALC_TARGET_POOL_DNE,
1293};
1294
1295static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
1296 struct ceph_osd_request_target *t,
1297 u32 *last_force_resend,
1298 bool any_change)
1299{
1300 struct ceph_pg_pool_info *pi;
1301 struct ceph_pg pgid, last_pgid;
1302 struct ceph_osds up, acting;
1303 bool force_resend = false;
1304 bool need_check_tiering = false;
1305 bool need_resend = false;
1306 bool sort_bitwise = ceph_osdmap_flag(osdc->osdmap,
1307 CEPH_OSDMAP_SORTBITWISE);
1308 enum calc_target_result ct_res;
1309 int ret;
1310
1311 pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool);
1312 if (!pi) {
1313 t->osd = CEPH_HOMELESS_OSD;
1314 ct_res = CALC_TARGET_POOL_DNE;
1315 goto out;
1288 } 1316 }
1289 1317
1290 dout("__unregister_request %p tid %lld\n", req, req->r_tid); 1318 if (osdc->osdmap->epoch == pi->last_force_request_resend) {
1291 rb_erase(&req->r_node, &osdc->requests); 1319 if (last_force_resend &&
1292 RB_CLEAR_NODE(&req->r_node); 1320 *last_force_resend < pi->last_force_request_resend) {
1293 osdc->num_requests--; 1321 *last_force_resend = pi->last_force_request_resend;
1322 force_resend = true;
1323 } else if (!last_force_resend) {
1324 force_resend = true;
1325 }
1326 }
1327 if (ceph_oid_empty(&t->target_oid) || force_resend) {
1328 ceph_oid_copy(&t->target_oid, &t->base_oid);
1329 need_check_tiering = true;
1330 }
1331 if (ceph_oloc_empty(&t->target_oloc) || force_resend) {
1332 ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
1333 need_check_tiering = true;
1334 }
1294 1335
1295 if (req->r_osd) { 1336 if (need_check_tiering &&
1296 /* make sure the original request isn't in flight. */ 1337 (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
1297 ceph_msg_revoke(req->r_request); 1338 if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
1339 t->target_oloc.pool = pi->read_tier;
1340 if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
1341 t->target_oloc.pool = pi->write_tier;
1342 }
1298 1343
1299 list_del_init(&req->r_osd_item); 1344 ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid,
1300 maybe_move_osd_to_lru(osdc, req->r_osd); 1345 &t->target_oloc, &pgid);
1301 if (list_empty(&req->r_linger_osd_item)) 1346 if (ret) {
1302 req->r_osd = NULL; 1347 WARN_ON(ret != -ENOENT);
1348 t->osd = CEPH_HOMELESS_OSD;
1349 ct_res = CALC_TARGET_POOL_DNE;
1350 goto out;
1351 }
1352 last_pgid.pool = pgid.pool;
1353 last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask);
1354
1355 ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting);
1356 if (any_change &&
1357 ceph_is_new_interval(&t->acting,
1358 &acting,
1359 &t->up,
1360 &up,
1361 t->size,
1362 pi->size,
1363 t->min_size,
1364 pi->min_size,
1365 t->pg_num,
1366 pi->pg_num,
1367 t->sort_bitwise,
1368 sort_bitwise,
1369 &last_pgid))
1370 force_resend = true;
1371
1372 if (t->paused && !target_should_be_paused(osdc, t, pi)) {
1373 t->paused = false;
1374 need_resend = true;
1303 } 1375 }
1304 1376
1305 list_del_init(&req->r_req_lru_item); 1377 if (ceph_pg_compare(&t->pgid, &pgid) ||
1306 ceph_osdc_put_request(req); 1378 ceph_osds_changed(&t->acting, &acting, any_change) ||
1379 force_resend) {
1380 t->pgid = pgid; /* struct */
1381 ceph_osds_copy(&t->acting, &acting);
1382 ceph_osds_copy(&t->up, &up);
1383 t->size = pi->size;
1384 t->min_size = pi->min_size;
1385 t->pg_num = pi->pg_num;
1386 t->pg_num_mask = pi->pg_num_mask;
1387 t->sort_bitwise = sort_bitwise;
1388
1389 t->osd = acting.primary;
1390 need_resend = true;
1391 }
1392
1393 ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION;
1394out:
1395 dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd);
1396 return ct_res;
1397}
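
Callers branch on the tri-state result; a hedged sketch of the dispatch (the real submission path is __submit_request() below, and send_map_check() needs osdc->lock held for write):

	switch (calc_target(osdc, &req->r_t, NULL, false)) {
	case CALC_TARGET_NO_ACTION:
		break;			/* mapping unchanged */
	case CALC_TARGET_NEED_RESEND:
		/* unlink from the old session, relink, resend */
		break;
	case CALC_TARGET_POOL_DNE:
		send_map_check(req);	/* pool may have been deleted */
		break;
	}
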
1398
1399static void setup_request_data(struct ceph_osd_request *req,
1400 struct ceph_msg *msg)
1401{
1402 u32 data_len = 0;
1403 int i;
1404
1405 if (!list_empty(&msg->data))
1406 return;
1407
1408 WARN_ON(msg->data_length);
1409 for (i = 0; i < req->r_num_ops; i++) {
1410 struct ceph_osd_req_op *op = &req->r_ops[i];
1411
1412 switch (op->op) {
1413 /* request */
1414 case CEPH_OSD_OP_WRITE:
1415 case CEPH_OSD_OP_WRITEFULL:
1416 WARN_ON(op->indata_len != op->extent.length);
1417 ceph_osdc_msg_data_add(msg, &op->extent.osd_data);
1418 break;
1419 case CEPH_OSD_OP_SETXATTR:
1420 case CEPH_OSD_OP_CMPXATTR:
1421 WARN_ON(op->indata_len != op->xattr.name_len +
1422 op->xattr.value_len);
1423 ceph_osdc_msg_data_add(msg, &op->xattr.osd_data);
1424 break;
1425 case CEPH_OSD_OP_NOTIFY_ACK:
1426 ceph_osdc_msg_data_add(msg,
1427 &op->notify_ack.request_data);
1428 break;
1429
1430 /* reply */
1431 case CEPH_OSD_OP_STAT:
1432 ceph_osdc_msg_data_add(req->r_reply,
1433 &op->raw_data_in);
1434 break;
1435 case CEPH_OSD_OP_READ:
1436 ceph_osdc_msg_data_add(req->r_reply,
1437 &op->extent.osd_data);
1438 break;
1439
1440 /* both */
1441 case CEPH_OSD_OP_CALL:
1442 WARN_ON(op->indata_len != op->cls.class_len +
1443 op->cls.method_len +
1444 op->cls.indata_len);
1445 ceph_osdc_msg_data_add(msg, &op->cls.request_info);
1446 /* optional, can be NONE */
1447 ceph_osdc_msg_data_add(msg, &op->cls.request_data);
1448 /* optional, can be NONE */
1449 ceph_osdc_msg_data_add(req->r_reply,
1450 &op->cls.response_data);
1451 break;
1452 case CEPH_OSD_OP_NOTIFY:
1453 ceph_osdc_msg_data_add(msg,
1454 &op->notify.request_data);
1455 ceph_osdc_msg_data_add(req->r_reply,
1456 &op->notify.response_data);
1457 break;
1458 }
1459
1460 data_len += op->indata_len;
1461 }
1462
1463 WARN_ON(data_len != msg->data_length);
1464}
1465
1466static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
1467{
1468 void *p = msg->front.iov_base;
1469 void *const end = p + msg->front_alloc_len;
1470 u32 data_len = 0;
1471 int i;
1472
1473 if (req->r_flags & CEPH_OSD_FLAG_WRITE) {
1474 /* snapshots aren't writeable */
1475 WARN_ON(req->r_snapid != CEPH_NOSNAP);
1476 } else {
1477 WARN_ON(req->r_mtime.tv_sec || req->r_mtime.tv_nsec ||
1478 req->r_data_offset || req->r_snapc);
1479 }
1480
1481 setup_request_data(req, msg);
1482
1483 ceph_encode_32(&p, 1); /* client_inc, always 1 */
1484 ceph_encode_32(&p, req->r_osdc->osdmap->epoch);
1485 ceph_encode_32(&p, req->r_flags);
1486 ceph_encode_timespec(p, &req->r_mtime);
1487 p += sizeof(struct ceph_timespec);
1488 /* aka reassert_version */
1489 memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version));
1490 p += sizeof(req->r_replay_version);
1491
1492 /* oloc */
1493 ceph_encode_8(&p, 4);
1494 ceph_encode_8(&p, 4);
1495 ceph_encode_32(&p, 8 + 4 + 4);
1496 ceph_encode_64(&p, req->r_t.target_oloc.pool);
1497 ceph_encode_32(&p, -1); /* preferred */
1498 ceph_encode_32(&p, 0); /* key len */
1499
1500 /* pgid */
1501 ceph_encode_8(&p, 1);
1502 ceph_encode_64(&p, req->r_t.pgid.pool);
1503 ceph_encode_32(&p, req->r_t.pgid.seed);
1504 ceph_encode_32(&p, -1); /* preferred */
1505
1506 /* oid */
1507 ceph_encode_32(&p, req->r_t.target_oid.name_len);
1508 memcpy(p, req->r_t.target_oid.name, req->r_t.target_oid.name_len);
1509 p += req->r_t.target_oid.name_len;
1307 1510
1308 if (osdc->num_requests == 0) { 1511 /* ops, can imply data */
1309 dout(" no requests, canceling timeout\n"); 1512 ceph_encode_16(&p, req->r_num_ops);
1310 __cancel_osd_timeout(osdc); 1513 for (i = 0; i < req->r_num_ops; i++) {
1514 data_len += osd_req_encode_op(p, &req->r_ops[i]);
1515 p += sizeof(struct ceph_osd_op);
1311 } 1516 }
1517
1518 ceph_encode_64(&p, req->r_snapid); /* snapid */
1519 if (req->r_snapc) {
1520 ceph_encode_64(&p, req->r_snapc->seq);
1521 ceph_encode_32(&p, req->r_snapc->num_snaps);
1522 for (i = 0; i < req->r_snapc->num_snaps; i++)
1523 ceph_encode_64(&p, req->r_snapc->snaps[i]);
1524 } else {
1525 ceph_encode_64(&p, 0); /* snap_seq */
1526 ceph_encode_32(&p, 0); /* snaps len */
1527 }
1528
1529 ceph_encode_32(&p, req->r_attempts); /* retry_attempt */
1530
1531 BUG_ON(p > end);
1532 msg->front.iov_len = p - msg->front.iov_base;
1533 msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */
1534 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1535 msg->hdr.data_len = cpu_to_le32(data_len);
1536 /*
1537 * The header "data_off" is a hint to the receiver allowing it
1538 * to align received data into its buffers such that there's no
1539 * need to re-copy it before writing it to disk (direct I/O).
1540 */
1541 msg->hdr.data_off = cpu_to_le16(req->r_data_offset);
1542
1543 dout("%s req %p oid %*pE oid_len %d front %zu data %u\n", __func__,
1544 req, req->r_t.target_oid.name_len, req->r_t.target_oid.name,
1545 req->r_t.target_oid.name_len, msg->front.iov_len, data_len);
1312} 1546}
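
encode_request() is built on the ceph_encode_*() cursor helpers from include/linux/ceph/decode.h: each writes a fixed-width little-endian value and advances the position pointer. Roughly (buf, epoch and pool_id are placeholders):

	void *p = buf;

	ceph_encode_8(&p, 4);		/* u8,   p += 1 */
	ceph_encode_32(&p, epoch);	/* le32, p += 4 */
	ceph_encode_64(&p, pool_id);	/* le64, p += 8 */
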
1313 1547
1314/* 1548/*
1315 * Cancel a previously queued request message 1549 * @req has to be assigned a tid and registered.
1316 */ 1550 */
1317static void __cancel_request(struct ceph_osd_request *req) 1551static void send_request(struct ceph_osd_request *req)
1318{ 1552{
1319 if (req->r_sent && req->r_osd) { 1553 struct ceph_osd *osd = req->r_osd;
1554
1555 verify_osd_locked(osd);
1556 WARN_ON(osd->o_osd != req->r_t.osd);
1557
1558 /*
1559 * We may have a previously queued request message hanging
1560 * around. Cancel it to avoid corrupting the msgr.
1561 */
1562 if (req->r_sent)
1320 ceph_msg_revoke(req->r_request); 1563 ceph_msg_revoke(req->r_request);
1321 req->r_sent = 0; 1564
1565 req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR;
1566 if (req->r_attempts)
1567 req->r_flags |= CEPH_OSD_FLAG_RETRY;
1568 else
1569 WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY);
1570
1571 encode_request(req, req->r_request);
1572
1573 dout("%s req %p tid %llu to pg %llu.%x osd%d flags 0x%x attempt %d\n",
1574 __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed,
1575 req->r_t.osd, req->r_flags, req->r_attempts);
1576
1577 req->r_t.paused = false;
1578 req->r_stamp = jiffies;
1579 req->r_attempts++;
1580
1581 req->r_sent = osd->o_incarnation;
1582 req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
1583 ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request));
1584}
1585
1586static void maybe_request_map(struct ceph_osd_client *osdc)
1587{
1588 bool continuous = false;
1589
1590 verify_osdc_locked(osdc);
1591 WARN_ON(!osdc->osdmap->epoch);
1592
1593 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
1594 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) ||
1595 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) {
1596 dout("%s osdc %p continuous\n", __func__, osdc);
1597 continuous = true;
1598 } else {
1599 dout("%s osdc %p onetime\n", __func__, osdc);
1322 } 1600 }
1601
1602 if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
1603 osdc->osdmap->epoch + 1, continuous))
1604 ceph_monc_renew_subs(&osdc->client->monc);
1323} 1605}
1324 1606
1325static void __register_linger_request(struct ceph_osd_client *osdc, 1607static void send_map_check(struct ceph_osd_request *req);
1326 struct ceph_osd_request *req) 1608
1609static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
1327{ 1610{
1328 dout("%s %p tid %llu\n", __func__, req, req->r_tid); 1611 struct ceph_osd_client *osdc = req->r_osdc;
1329 WARN_ON(!req->r_linger); 1612 struct ceph_osd *osd;
1613 enum calc_target_result ct_res;
1614 bool need_send = false;
1615 bool promoted = false;
1616
1617 WARN_ON(req->r_tid || req->r_got_reply);
1618 dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
1619
1620again:
1621 ct_res = calc_target(osdc, &req->r_t, &req->r_last_force_resend, false);
1622 if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
1623 goto promote;
1624
1625 osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked);
1626 if (IS_ERR(osd)) {
1627 WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked);
1628 goto promote;
1629 }
1630
1631 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
1632 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) {
1633 dout("req %p pausewr\n", req);
1634 req->r_t.paused = true;
1635 maybe_request_map(osdc);
1636 } else if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
1637 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) {
1638 dout("req %p pauserd\n", req);
1639 req->r_t.paused = true;
1640 maybe_request_map(osdc);
1641 } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
1642 !(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY |
1643 CEPH_OSD_FLAG_FULL_FORCE)) &&
1644 (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
1645 pool_full(osdc, req->r_t.base_oloc.pool))) {
1646 dout("req %p full/pool_full\n", req);
1647 pr_warn_ratelimited("FULL or reached pool quota\n");
1648 req->r_t.paused = true;
1649 maybe_request_map(osdc);
1650 } else if (!osd_homeless(osd)) {
1651 need_send = true;
1652 } else {
1653 maybe_request_map(osdc);
1654 }
1655
1656 mutex_lock(&osd->lock);
1657 /*
1658 * Assign the tid atomically with send_request() to protect
1659 * multiple writes to the same object from racing with each
1660 * other, resulting in out of order ops on the OSDs.
1661 */
1662 req->r_tid = atomic64_inc_return(&osdc->last_tid);
1663 link_request(osd, req);
1664 if (need_send)
1665 send_request(req);
1666 mutex_unlock(&osd->lock);
1330 1667
1668 if (ct_res == CALC_TARGET_POOL_DNE)
1669 send_map_check(req);
1670
1671 if (promoted)
1672 downgrade_write(&osdc->lock);
1673 return;
1674
1675promote:
1676 up_read(&osdc->lock);
1677 down_write(&osdc->lock);
1678 wrlocked = true;
1679 promoted = true;
1680 goto again;
1681}
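
The promote: label implements the usual rwsem idiom for a reader that discovers it needs write access: an rw_semaphore cannot be upgraded in place, so the path drops the read lock, takes the write lock, and retries from the top since the world may have changed in between. Schematically (do_work() is hypothetical):

	down_read(&osdc->lock);
	/* ... notice that write access is required ... */
	up_read(&osdc->lock);
	down_write(&osdc->lock);
	/* re-check: state may have changed while unlocked */
	do_work(osdc);
	downgrade_write(&osdc->lock);	/* atomically back to reader */
	up_read(&osdc->lock);
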
1682
1683static void account_request(struct ceph_osd_request *req)
1684{
1685 unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
1686
1687 if (req->r_flags & CEPH_OSD_FLAG_READ) {
1688 WARN_ON(req->r_flags & mask);
1689 req->r_flags |= CEPH_OSD_FLAG_ACK;
1690 } else if (req->r_flags & CEPH_OSD_FLAG_WRITE)
1691 WARN_ON(!(req->r_flags & mask));
1692 else
1693 WARN_ON(1);
1694
1695 WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask);
1696 atomic_inc(&req->r_osdc->num_requests);
1697}
1698
1699static void submit_request(struct ceph_osd_request *req, bool wrlocked)
1700{
1331 ceph_osdc_get_request(req); 1701 ceph_osdc_get_request(req);
1332 list_add_tail(&req->r_linger_item, &osdc->req_linger); 1702 account_request(req);
1333 if (req->r_osd) 1703 __submit_request(req, wrlocked);
1334 list_add_tail(&req->r_linger_osd_item,
1335 &req->r_osd->o_linger_requests);
1336} 1704}
1337 1705
1338static void __unregister_linger_request(struct ceph_osd_client *osdc, 1706static void __finish_request(struct ceph_osd_request *req)
1339 struct ceph_osd_request *req)
1340{ 1707{
1341 WARN_ON(!req->r_linger); 1708 struct ceph_osd_client *osdc = req->r_osdc;
1709 struct ceph_osd *osd = req->r_osd;
1342 1710
1343 if (list_empty(&req->r_linger_item)) { 1711 verify_osd_locked(osd);
1344 dout("%s %p tid %llu not registered\n", __func__, req, 1712 dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
1345 req->r_tid); 1713
1714 WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid));
1715 unlink_request(osd, req);
1716 atomic_dec(&osdc->num_requests);
1717
1718 /*
1719 * If an OSD has failed or returned and a request has been sent
1720 * twice, it's possible to get a reply and end up here while the
1721 * request message is queued for delivery. We will ignore the
1722 * reply, so not a big deal, but better to try and catch it.
1723 */
1724 ceph_msg_revoke(req->r_request);
1725 ceph_msg_revoke_incoming(req->r_reply);
1726}
1727
1728static void finish_request(struct ceph_osd_request *req)
1729{
1730 __finish_request(req);
1731 ceph_osdc_put_request(req);
1732}
1733
1734static void __complete_request(struct ceph_osd_request *req)
1735{
1736 if (req->r_callback)
1737 req->r_callback(req);
1738 else
1739 complete_all(&req->r_completion);
1740}
1741
1742/*
1743 * Note that this is open-coded in handle_reply(), which has to deal
1744 * with ack vs commit, dup acks, etc.
1745 */
1746static void complete_request(struct ceph_osd_request *req, int err)
1747{
1748 dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
1749
1750 req->r_result = err;
1751 __finish_request(req);
1752 __complete_request(req);
1753 complete_all(&req->r_safe_completion);
1754 ceph_osdc_put_request(req);
1755}
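
A synchronous caller pairs submission with a wait on r_completion; in rough form this is what ceph_osdc_wait_request() amounts to (sketch only, the real wait also cancels the request when interrupted):

	submit_request(req, false);
	if (wait_for_completion_interruptible(&req->r_completion))
		return -ERESTARTSYS;
	return req->r_result;		/* filled in from the reply */
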
1756
1757static void cancel_map_check(struct ceph_osd_request *req)
1758{
1759 struct ceph_osd_client *osdc = req->r_osdc;
1760 struct ceph_osd_request *lookup_req;
1761
1762 verify_osdc_wrlocked(osdc);
1763
1764 lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
1765 if (!lookup_req)
1346 return; 1766 return;
1767
1768 WARN_ON(lookup_req != req);
1769 erase_request_mc(&osdc->map_checks, req);
1770 ceph_osdc_put_request(req);
1771}
1772
1773static void cancel_request(struct ceph_osd_request *req)
1774{
1775 dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
1776
1777 cancel_map_check(req);
1778 finish_request(req);
1779}
1780
1781static void check_pool_dne(struct ceph_osd_request *req)
1782{
1783 struct ceph_osd_client *osdc = req->r_osdc;
1784 struct ceph_osdmap *map = osdc->osdmap;
1785
1786 verify_osdc_wrlocked(osdc);
1787 WARN_ON(!map->epoch);
1788
1789 if (req->r_attempts) {
1790 /*
1791 * We sent a request earlier, which means that
1792 * previously the pool existed, and now it does not
1793 * (i.e., it was deleted).
1794 */
1795 req->r_map_dne_bound = map->epoch;
1796 dout("%s req %p tid %llu pool disappeared\n", __func__, req,
1797 req->r_tid);
1798 } else {
1799 dout("%s req %p tid %llu map_dne_bound %u have %u\n", __func__,
1800 req, req->r_tid, req->r_map_dne_bound, map->epoch);
1347 } 1801 }
1348 1802
1349 dout("%s %p tid %llu\n", __func__, req, req->r_tid); 1803 if (req->r_map_dne_bound) {
1350 list_del_init(&req->r_linger_item); 1804 if (map->epoch >= req->r_map_dne_bound) {
1805 /* we had a new enough map */
1806 pr_info_ratelimited("tid %llu pool does not exist\n",
1807 req->r_tid);
1808 complete_request(req, -ENOENT);
1809 }
1810 } else {
1811 send_map_check(req);
1812 }
1813}
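
Worked example with made-up epochs: a request was sent at epoch 5, the pool is deleted, and at epoch 7 calc_target() reports CALC_TARGET_POOL_DNE. Because r_attempts is non-zero, map_dne_bound is pinned to the current epoch (7), the epoch >= bound test passes immediately and the request completes with -ENOENT. If the request was never sent, the client cannot tell a deleted pool from one its lagging map has not seen yet, so the bound is fetched from the monitor via send_map_check()/map_check_cb().
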
1351 1814
1352 if (req->r_osd) { 1815static void map_check_cb(struct ceph_mon_generic_request *greq)
1353 list_del_init(&req->r_linger_osd_item); 1816{
1354 maybe_move_osd_to_lru(osdc, req->r_osd); 1817 struct ceph_osd_client *osdc = &greq->monc->client->osdc;
1355 if (list_empty(&req->r_osd_item)) 1818 struct ceph_osd_request *req;
1356 req->r_osd = NULL; 1819 u64 tid = greq->private_data;
1820
1821 WARN_ON(greq->result || !greq->u.newest);
1822
1823 down_write(&osdc->lock);
1824 req = lookup_request_mc(&osdc->map_checks, tid);
1825 if (!req) {
1826 dout("%s tid %llu dne\n", __func__, tid);
1827 goto out_unlock;
1357 } 1828 }
1829
1830 dout("%s req %p tid %llu map_dne_bound %u newest %llu\n", __func__,
1831 req, req->r_tid, req->r_map_dne_bound, greq->u.newest);
1832 if (!req->r_map_dne_bound)
1833 req->r_map_dne_bound = greq->u.newest;
1834 erase_request_mc(&osdc->map_checks, req);
1835 check_pool_dne(req);
1836
1358 ceph_osdc_put_request(req); 1837 ceph_osdc_put_request(req);
1838out_unlock:
1839 up_write(&osdc->lock);
1359} 1840}
1360 1841
1361void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, 1842static void send_map_check(struct ceph_osd_request *req)
1362 struct ceph_osd_request *req)
1363{ 1843{
1364 if (!req->r_linger) { 1844 struct ceph_osd_client *osdc = req->r_osdc;
1365 dout("set_request_linger %p\n", req); 1845 struct ceph_osd_request *lookup_req;
1366 req->r_linger = 1; 1846 int ret;
1847
1848 verify_osdc_wrlocked(osdc);
1849
1850 lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
1851 if (lookup_req) {
1852 WARN_ON(lookup_req != req);
1853 return;
1367 } 1854 }
1855
1856 ceph_osdc_get_request(req);
1857 insert_request_mc(&osdc->map_checks, req);
1858 ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
1859 map_check_cb, req->r_tid);
1860 WARN_ON(ret);
1368} 1861}
1369EXPORT_SYMBOL(ceph_osdc_set_request_linger);
1370 1862
1371/* 1863/*
1372 * Returns whether a request should be blocked from being sent 1864 * lingering requests, watch/notify v2 infrastructure
1373 * based on the current osdmap and osd_client settings.
1374 *
1375 * Caller should hold map_sem for read.
1376 */ 1865 */
1377static bool __req_should_be_paused(struct ceph_osd_client *osdc, 1866static void linger_release(struct kref *kref)
1378 struct ceph_osd_request *req)
1379{ 1867{
1380 bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); 1868 struct ceph_osd_linger_request *lreq =
1381 bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || 1869 container_of(kref, struct ceph_osd_linger_request, kref);
1382 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); 1870
1383 return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) || 1871 dout("%s lreq %p reg_req %p ping_req %p\n", __func__, lreq,
1384 (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr); 1872 lreq->reg_req, lreq->ping_req);
1873 WARN_ON(!RB_EMPTY_NODE(&lreq->node));
1874 WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node));
1875 WARN_ON(!RB_EMPTY_NODE(&lreq->mc_node));
1876 WARN_ON(!list_empty(&lreq->scan_item));
1877 WARN_ON(!list_empty(&lreq->pending_lworks));
1878 WARN_ON(lreq->osd);
1879
1880 if (lreq->reg_req)
1881 ceph_osdc_put_request(lreq->reg_req);
1882 if (lreq->ping_req)
1883 ceph_osdc_put_request(lreq->ping_req);
1884 target_destroy(&lreq->t);
1885 kfree(lreq);
1385} 1886}
1386 1887
1888static void linger_put(struct ceph_osd_linger_request *lreq)
1889{
1890 if (lreq)
1891 kref_put(&lreq->kref, linger_release);
1892}
1893
1894static struct ceph_osd_linger_request *
1895linger_get(struct ceph_osd_linger_request *lreq)
1896{
1897 kref_get(&lreq->kref);
1898 return lreq;
1899}
1900
1901static struct ceph_osd_linger_request *
1902linger_alloc(struct ceph_osd_client *osdc)
1903{
1904 struct ceph_osd_linger_request *lreq;
1905
1906 lreq = kzalloc(sizeof(*lreq), GFP_NOIO);
1907 if (!lreq)
1908 return NULL;
1909
1910 kref_init(&lreq->kref);
1911 mutex_init(&lreq->lock);
1912 RB_CLEAR_NODE(&lreq->node);
1913 RB_CLEAR_NODE(&lreq->osdc_node);
1914 RB_CLEAR_NODE(&lreq->mc_node);
1915 INIT_LIST_HEAD(&lreq->scan_item);
1916 INIT_LIST_HEAD(&lreq->pending_lworks);
1917 init_completion(&lreq->reg_commit_wait);
1918 init_completion(&lreq->notify_finish_wait);
1919
1920 lreq->osdc = osdc;
1921 target_init(&lreq->t);
1922
1923 dout("%s lreq %p\n", __func__, lreq);
1924 return lreq;
1925}
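
Lifetime follows the standard kref pattern: linger_alloc() returns with one reference, every additional user takes its own with linger_get(), and the final linger_put() lands in linger_release(). A sketch:

	struct ceph_osd_linger_request *lreq = linger_alloc(osdc);

	if (!lreq)
		return -ENOMEM;
	req->r_priv = linger_get(lreq);	/* reference for the callback */
	/* ... */
	linger_put(lreq);		/* callback side done */
	linger_put(lreq);		/* initial ref; may free lreq */
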
1926
1927DEFINE_RB_INSDEL_FUNCS(linger, struct ceph_osd_linger_request, linger_id, node)
1928DEFINE_RB_FUNCS(linger_osdc, struct ceph_osd_linger_request, linger_id, osdc_node)
1929DEFINE_RB_FUNCS(linger_mc, struct ceph_osd_linger_request, linger_id, mc_node)
1930
1387/* 1931/*
1388 * Calculate mapping of a request to a PG. Takes tiering into account. 1932 * Create linger request <-> OSD session relation.
1933 *
1934 * @lreq has to be registered, @osd may be homeless.
1389 */ 1935 */
1390static int __calc_request_pg(struct ceph_osdmap *osdmap, 1936static void link_linger(struct ceph_osd *osd,
1391 struct ceph_osd_request *req, 1937 struct ceph_osd_linger_request *lreq)
1392 struct ceph_pg *pg_out)
1393{ 1938{
1394 bool need_check_tiering; 1939 verify_osd_locked(osd);
1940 WARN_ON(!lreq->linger_id || lreq->osd);
1941 dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
1942 osd->o_osd, lreq, lreq->linger_id);
1395 1943
1396 need_check_tiering = false; 1944 if (!osd_homeless(osd))
1397 if (req->r_target_oloc.pool == -1) { 1945 __remove_osd_from_lru(osd);
1398 req->r_target_oloc = req->r_base_oloc; /* struct */ 1946 else
1399 need_check_tiering = true; 1947 atomic_inc(&osd->o_osdc->num_homeless);
1948
1949 get_osd(osd);
1950 insert_linger(&osd->o_linger_requests, lreq);
1951 lreq->osd = osd;
1952}
1953
1954static void unlink_linger(struct ceph_osd *osd,
1955 struct ceph_osd_linger_request *lreq)
1956{
1957 verify_osd_locked(osd);
1958 WARN_ON(lreq->osd != osd);
1959 dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
1960 osd->o_osd, lreq, lreq->linger_id);
1961
1962 lreq->osd = NULL;
1963 erase_linger(&osd->o_linger_requests, lreq);
1964 put_osd(osd);
1965
1966 if (!osd_homeless(osd))
1967 maybe_move_osd_to_lru(osd);
1968 else
1969 atomic_dec(&osd->o_osdc->num_homeless);
1970}
1971
1972static bool __linger_registered(struct ceph_osd_linger_request *lreq)
1973{
1974 verify_osdc_locked(lreq->osdc);
1975
1976 return !RB_EMPTY_NODE(&lreq->osdc_node);
1977}
1978
1979static bool linger_registered(struct ceph_osd_linger_request *lreq)
1980{
1981 struct ceph_osd_client *osdc = lreq->osdc;
1982 bool registered;
1983
1984 down_read(&osdc->lock);
1985 registered = __linger_registered(lreq);
1986 up_read(&osdc->lock);
1987
1988 return registered;
1989}
1990
1991static void linger_register(struct ceph_osd_linger_request *lreq)
1992{
1993 struct ceph_osd_client *osdc = lreq->osdc;
1994
1995 verify_osdc_wrlocked(osdc);
1996 WARN_ON(lreq->linger_id);
1997
1998 linger_get(lreq);
1999 lreq->linger_id = ++osdc->last_linger_id;
2000 insert_linger_osdc(&osdc->linger_requests, lreq);
2001}
2002
2003static void linger_unregister(struct ceph_osd_linger_request *lreq)
2004{
2005 struct ceph_osd_client *osdc = lreq->osdc;
2006
2007 verify_osdc_wrlocked(osdc);
2008
2009 erase_linger_osdc(&osdc->linger_requests, lreq);
2010 linger_put(lreq);
2011}
2012
2013static void cancel_linger_request(struct ceph_osd_request *req)
2014{
2015 struct ceph_osd_linger_request *lreq = req->r_priv;
2016
2017 WARN_ON(!req->r_linger);
2018 cancel_request(req);
2019 linger_put(lreq);
2020}
2021
2022struct linger_work {
2023 struct work_struct work;
2024 struct ceph_osd_linger_request *lreq;
2025 struct list_head pending_item;
2026 unsigned long queued_stamp;
2027
2028 union {
2029 struct {
2030 u64 notify_id;
2031 u64 notifier_id;
2032 void *payload; /* points into @msg front */
2033 size_t payload_len;
2034
2035 struct ceph_msg *msg; /* for ceph_msg_put() */
2036 } notify;
2037 struct {
2038 int err;
2039 } error;
2040 };
2041};
2042
2043static struct linger_work *lwork_alloc(struct ceph_osd_linger_request *lreq,
2044 work_func_t workfn)
2045{
2046 struct linger_work *lwork;
2047
2048 lwork = kzalloc(sizeof(*lwork), GFP_NOIO);
2049 if (!lwork)
2050 return NULL;
2051
2052 INIT_WORK(&lwork->work, workfn);
2053 INIT_LIST_HEAD(&lwork->pending_item);
2054 lwork->lreq = linger_get(lreq);
2055
2056 return lwork;
2057}
2058
2059static void lwork_free(struct linger_work *lwork)
2060{
2061 struct ceph_osd_linger_request *lreq = lwork->lreq;
2062
2063 mutex_lock(&lreq->lock);
2064 list_del(&lwork->pending_item);
2065 mutex_unlock(&lreq->lock);
2066
2067 linger_put(lreq);
2068 kfree(lwork);
2069}
2070
2071static void lwork_queue(struct linger_work *lwork)
2072{
2073 struct ceph_osd_linger_request *lreq = lwork->lreq;
2074 struct ceph_osd_client *osdc = lreq->osdc;
2075
2076 verify_lreq_locked(lreq);
2077 WARN_ON(!list_empty(&lwork->pending_item));
2078
2079 lwork->queued_stamp = jiffies;
2080 list_add_tail(&lwork->pending_item, &lreq->pending_lworks);
2081 queue_work(osdc->notify_wq, &lwork->work);
2082}
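
A producer packages one event and hands it to notify_wq; queue_watch_error() below is the in-tree user, and in outline (the payload value is illustrative):

	struct linger_work *lwork;

	lwork = lwork_alloc(lreq, do_watch_error);
	if (!lwork)
		return;			/* event dropped, nothing to undo */
	lwork->error.err = -ENOTCONN;	/* illustrative */
	lwork_queue(lwork);		/* caller holds lreq->lock */
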
2083
2084static void do_watch_notify(struct work_struct *w)
2085{
2086 struct linger_work *lwork = container_of(w, struct linger_work, work);
2087 struct ceph_osd_linger_request *lreq = lwork->lreq;
2088
2089 if (!linger_registered(lreq)) {
2090 dout("%s lreq %p not registered\n", __func__, lreq);
2091 goto out;
1400 } 2092 }
1401 if (req->r_target_oid.name_len == 0) { 2093
1402 ceph_oid_copy(&req->r_target_oid, &req->r_base_oid); 2094 WARN_ON(!lreq->is_watch);
1403 need_check_tiering = true; 2095 dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n",
2096 __func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id,
2097 lwork->notify.payload_len);
2098 lreq->wcb(lreq->data, lwork->notify.notify_id, lreq->linger_id,
2099 lwork->notify.notifier_id, lwork->notify.payload,
2100 lwork->notify.payload_len);
2101
2102out:
2103 ceph_msg_put(lwork->notify.msg);
2104 lwork_free(lwork);
2105}
2106
2107static void do_watch_error(struct work_struct *w)
2108{
2109 struct linger_work *lwork = container_of(w, struct linger_work, work);
2110 struct ceph_osd_linger_request *lreq = lwork->lreq;
2111
2112 if (!linger_registered(lreq)) {
2113 dout("%s lreq %p not registered\n", __func__, lreq);
2114 goto out;
1404 } 2115 }
1405 2116
1406 if (need_check_tiering && 2117 dout("%s lreq %p err %d\n", __func__, lreq, lwork->error.err);
1407 (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { 2118 lreq->errcb(lreq->data, lreq->linger_id, lwork->error.err);
1408 struct ceph_pg_pool_info *pi; 2119
1409 2120out:
1410 pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool); 2121 lwork_free(lwork);
1411 if (pi) { 2122}
1412 if ((req->r_flags & CEPH_OSD_FLAG_READ) && 2123
1413 pi->read_tier >= 0) 2124static void queue_watch_error(struct ceph_osd_linger_request *lreq)
1414 req->r_target_oloc.pool = pi->read_tier; 2125{
1415 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && 2126 struct linger_work *lwork;
1416 pi->write_tier >= 0) 2127
1417 req->r_target_oloc.pool = pi->write_tier; 2128 lwork = lwork_alloc(lreq, do_watch_error);
2129 if (!lwork) {
2130 pr_err("failed to allocate error-lwork\n");
2131 return;
2132 }
2133
2134 lwork->error.err = lreq->last_error;
2135 lwork_queue(lwork);
2136}
2137
2138static void linger_reg_commit_complete(struct ceph_osd_linger_request *lreq,
2139 int result)
2140{
2141 if (!completion_done(&lreq->reg_commit_wait)) {
2142 lreq->reg_commit_error = (result <= 0 ? result : 0);
2143 complete_all(&lreq->reg_commit_wait);
2144 }
2145}
2146
2147static void linger_commit_cb(struct ceph_osd_request *req)
2148{
2149 struct ceph_osd_linger_request *lreq = req->r_priv;
2150
2151 mutex_lock(&lreq->lock);
2152 dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
2153 lreq->linger_id, req->r_result);
2154 WARN_ON(!__linger_registered(lreq));
2155 linger_reg_commit_complete(lreq, req->r_result);
2156 lreq->committed = true;
2157
2158 if (!lreq->is_watch) {
2159 struct ceph_osd_data *osd_data =
2160 osd_req_op_data(req, 0, notify, response_data);
2161 void *p = page_address(osd_data->pages[0]);
2162
2163 WARN_ON(req->r_ops[0].op != CEPH_OSD_OP_NOTIFY ||
2164 osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
2165
2166 /* make note of the notify_id */
2167 if (req->r_ops[0].outdata_len >= sizeof(u64)) {
2168 lreq->notify_id = ceph_decode_64(&p);
2169 dout("lreq %p notify_id %llu\n", lreq,
2170 lreq->notify_id);
2171 } else {
2172 dout("lreq %p no notify_id\n", lreq);
1418 } 2173 }
1419 /* !pi is caught in ceph_oloc_oid_to_pg() */
1420 } 2174 }
1421 2175
1422 return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc, 2176 mutex_unlock(&lreq->lock);
1423 &req->r_target_oid, pg_out); 2177 linger_put(lreq);
1424} 2178}
1425 2179
1426static void __enqueue_request(struct ceph_osd_request *req) 2180static int normalize_watch_error(int err)
1427{ 2181{
1428 struct ceph_osd_client *osdc = req->r_osdc; 2182 /*
2183 * Translate ENOENT -> ENOTCONN so that a delete->disconnection
2184 * notification and a failure to reconnect because we raced with
2185 * the delete appear the same to the user.
2186 */
2187 if (err == -ENOENT)
2188 err = -ENOTCONN;
2189
2190 return err;
2191}
2192
2193static void linger_reconnect_cb(struct ceph_osd_request *req)
2194{
2195 struct ceph_osd_linger_request *lreq = req->r_priv;
2196
2197 mutex_lock(&lreq->lock);
2198 dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__,
2199 lreq, lreq->linger_id, req->r_result, lreq->last_error);
2200 if (req->r_result < 0) {
2201 if (!lreq->last_error) {
2202 lreq->last_error = normalize_watch_error(req->r_result);
2203 queue_watch_error(lreq);
2204 }
2205 }
1429 2206
1430 dout("%s %p tid %llu to osd%d\n", __func__, req, req->r_tid, 2207 mutex_unlock(&lreq->lock);
1431 req->r_osd ? req->r_osd->o_osd : -1); 2208 linger_put(lreq);
2209}
2210
2211static void send_linger(struct ceph_osd_linger_request *lreq)
2212{
2213 struct ceph_osd_request *req = lreq->reg_req;
2214 struct ceph_osd_req_op *op = &req->r_ops[0];
1432 2215
1433 if (req->r_osd) { 2216 verify_osdc_wrlocked(req->r_osdc);
1434 __remove_osd_from_lru(req->r_osd); 2217 dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
1435 list_add_tail(&req->r_osd_item, &req->r_osd->o_requests); 2218
1436 list_move_tail(&req->r_req_lru_item, &osdc->req_unsent); 2219 if (req->r_osd)
2220 cancel_linger_request(req);
2221
2222 request_reinit(req);
2223 ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
2224 ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
2225 req->r_flags = lreq->t.flags;
2226 req->r_mtime = lreq->mtime;
2227
2228 mutex_lock(&lreq->lock);
2229 if (lreq->is_watch && lreq->committed) {
2230 WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
2231 op->watch.cookie != lreq->linger_id);
2232 op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT;
2233 op->watch.gen = ++lreq->register_gen;
2234 dout("lreq %p reconnect register_gen %u\n", lreq,
2235 op->watch.gen);
2236 req->r_callback = linger_reconnect_cb;
1437 } else { 2237 } else {
1438 list_move_tail(&req->r_req_lru_item, &osdc->req_notarget); 2238 if (!lreq->is_watch)
2239 lreq->notify_id = 0;
2240 else
2241 WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH);
2242 dout("lreq %p register\n", lreq);
2243 req->r_callback = linger_commit_cb;
1439 } 2244 }
2245 mutex_unlock(&lreq->lock);
2246
2247 req->r_priv = linger_get(lreq);
2248 req->r_linger = true;
2249
2250 submit_request(req, true);
1440} 2251}
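
Together with send_linger_ping() below, this gives the watch op three variants over a linger request's life, roughly: CEPH_OSD_WATCH_OP_WATCH for the initial registration, CEPH_OSD_WATCH_OP_RECONNECT (with watch.gen incremented) when the request is re-sent after a map change, and CEPH_OSD_WATCH_OP_PING for periodic liveness checks, whose replies are discarded if they carry a stale gen (see linger_ping_cb()).
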
1441 2252
1442/* 2253static void linger_ping_cb(struct ceph_osd_request *req)
1443 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
1444 * (as needed), and set the request r_osd appropriately. If there is
1445 * no up osd, set r_osd to NULL. Move the request to the appropriate list
1446 * (unsent, homeless) or leave on in-flight lru.
1447 *
1448 * Return 0 if unchanged, 1 if changed, or negative on error.
1449 *
1450 * Caller should hold map_sem for read and request_mutex.
1451 */
1452static int __map_request(struct ceph_osd_client *osdc,
1453 struct ceph_osd_request *req, int force_resend)
1454{ 2254{
1455 struct ceph_pg pgid; 2255 struct ceph_osd_linger_request *lreq = req->r_priv;
1456 int acting[CEPH_PG_MAX_SIZE]; 2256
1457 int num, o; 2257 mutex_lock(&lreq->lock);
1458 int err; 2258 dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n",
1459 bool was_paused; 2259 __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent,
1460 2260 lreq->last_error);
1461 dout("map_request %p tid %lld\n", req, req->r_tid); 2261 if (lreq->register_gen == req->r_ops[0].watch.gen) {
1462 2262 if (!req->r_result) {
1463 err = __calc_request_pg(osdc->osdmap, req, &pgid); 2263 lreq->watch_valid_thru = lreq->ping_sent;
1464 if (err) { 2264 } else if (!lreq->last_error) {
1465 list_move(&req->r_req_lru_item, &osdc->req_notarget); 2265 lreq->last_error = normalize_watch_error(req->r_result);
1466 return err; 2266 queue_watch_error(lreq);
1467 }
1468 req->r_pgid = pgid;
1469
1470 num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o);
1471 if (num < 0)
1472 num = 0;
1473
1474 was_paused = req->r_paused;
1475 req->r_paused = __req_should_be_paused(osdc, req);
1476 if (was_paused && !req->r_paused)
1477 force_resend = 1;
1478
1479 if ((!force_resend &&
1480 req->r_osd && req->r_osd->o_osd == o &&
1481 req->r_sent >= req->r_osd->o_incarnation &&
1482 req->r_num_pg_osds == num &&
1483 memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
1484 (req->r_osd == NULL && o == -1) ||
1485 req->r_paused)
1486 return 0; /* no change */
1487
1488 dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
1489 req->r_tid, pgid.pool, pgid.seed, o,
1490 req->r_osd ? req->r_osd->o_osd : -1);
1491
1492 /* record full pg acting set */
1493 memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
1494 req->r_num_pg_osds = num;
1495
1496 if (req->r_osd) {
1497 __cancel_request(req);
1498 list_del_init(&req->r_osd_item);
1499 list_del_init(&req->r_linger_osd_item);
1500 req->r_osd = NULL;
1501 }
1502
1503 req->r_osd = __lookup_osd(osdc, o);
1504 if (!req->r_osd && o >= 0) {
1505 err = -ENOMEM;
1506 req->r_osd = create_osd(osdc, o);
1507 if (!req->r_osd) {
1508 list_move(&req->r_req_lru_item, &osdc->req_notarget);
1509 goto out;
1510 } 2267 }
2268 } else {
2269 dout("lreq %p register_gen %u ignoring old pong %u\n", lreq,
2270 lreq->register_gen, req->r_ops[0].watch.gen);
2271 }
1511 2272
1512 dout("map_request osd %p is osd%d\n", req->r_osd, o); 2273 mutex_unlock(&lreq->lock);
1513 __insert_osd(osdc, req->r_osd); 2274 linger_put(lreq);
2275}
2276
2277static void send_linger_ping(struct ceph_osd_linger_request *lreq)
2278{
2279 struct ceph_osd_client *osdc = lreq->osdc;
2280 struct ceph_osd_request *req = lreq->ping_req;
2281 struct ceph_osd_req_op *op = &req->r_ops[0];
1514 2282
1515 ceph_con_open(&req->r_osd->o_con, 2283 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) {
1516 CEPH_ENTITY_TYPE_OSD, o, 2284 dout("%s PAUSERD\n", __func__);
1517 &osdc->osdmap->osd_addr[o]); 2285 return;
1518 } 2286 }
1519 2287
1520 __enqueue_request(req); 2288 lreq->ping_sent = jiffies;
1521 err = 1; /* osd or pg changed */ 2289 dout("%s lreq %p linger_id %llu ping_sent %lu register_gen %u\n",
2290 __func__, lreq, lreq->linger_id, lreq->ping_sent,
2291 lreq->register_gen);
1522 2292
1523out: 2293 if (req->r_osd)
1524 return err; 2294 cancel_linger_request(req);
2295
2296 request_reinit(req);
2297 target_copy(&req->r_t, &lreq->t);
2298
2299 WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
2300 op->watch.cookie != lreq->linger_id ||
2301 op->watch.op != CEPH_OSD_WATCH_OP_PING);
2302 op->watch.gen = lreq->register_gen;
2303 req->r_callback = linger_ping_cb;
2304 req->r_priv = linger_get(lreq);
2305 req->r_linger = true;
2306
2307 ceph_osdc_get_request(req);
2308 account_request(req);
2309 req->r_tid = atomic64_inc_return(&osdc->last_tid);
2310 link_request(lreq->osd, req);
2311 send_request(req);
1525} 2312}
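
send_linger_ping() stamps each ping with the current register_gen, and linger_ping_cb() only advances watch_valid_thru when the pong carries that same generation, so pongs answering pings from a superseded registration are ignored. The guard, restated as a hedged standalone helper (illustrative only):

#include <stdbool.h>

/* A pong extends watch validity only if it answers a ping sent under
 * the current registration generation and reports success. */
static bool pong_extends_validity(unsigned int current_gen,
				  unsigned int pong_gen, int result)
{
	return current_gen == pong_gen && result == 0;
}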
1526 2313
1527/* 2314static void linger_submit(struct ceph_osd_linger_request *lreq)
1528 * caller should hold map_sem (for read) and request_mutex
1529 */
1530static void __send_request(struct ceph_osd_client *osdc,
1531 struct ceph_osd_request *req)
1532{ 2315{
1533 void *p; 2316 struct ceph_osd_client *osdc = lreq->osdc;
2317 struct ceph_osd *osd;
1534 2318
1535 dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n", 2319 calc_target(osdc, &lreq->t, &lreq->last_force_resend, false);
1536 req, req->r_tid, req->r_osd->o_osd, req->r_flags, 2320 osd = lookup_create_osd(osdc, lreq->t.osd, true);
1537 (unsigned long long)req->r_pgid.pool, req->r_pgid.seed); 2321 link_linger(osd, lreq);
1538 2322
1539 /* fill in message content that changes each time we send it */ 2323 send_linger(lreq);
1540 put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); 2324}
1541 put_unaligned_le32(req->r_flags, req->r_request_flags);
1542 put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool);
1543 p = req->r_request_pgid;
1544 ceph_encode_64(&p, req->r_pgid.pool);
1545 ceph_encode_32(&p, req->r_pgid.seed);
1546 put_unaligned_le64(1, req->r_request_attempts); /* FIXME */
1547 memcpy(req->r_request_reassert_version, &req->r_reassert_version,
1548 sizeof(req->r_reassert_version));
1549 2325
1550 req->r_stamp = jiffies; 2326static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq)
1551 list_move_tail(&req->r_req_lru_item, &osdc->req_lru); 2327{
2328 struct ceph_osd_client *osdc = lreq->osdc;
2329 struct ceph_osd_linger_request *lookup_lreq;
1552 2330
1553 ceph_msg_get(req->r_request); /* send consumes a ref */ 2331 verify_osdc_wrlocked(osdc);
1554 2332
1555 req->r_sent = req->r_osd->o_incarnation; 2333 lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
2334 lreq->linger_id);
2335 if (!lookup_lreq)
2336 return;
1556 2337
1557 ceph_con_send(&req->r_osd->o_con, req->r_request); 2338 WARN_ON(lookup_lreq != lreq);
2339 erase_linger_mc(&osdc->linger_map_checks, lreq);
2340 linger_put(lreq);
1558} 2341}
1559 2342
1560/* 2343/*
1561 * Send any requests in the queue (req_unsent). 2344 * @lreq has to be both registered and linked.
1562 */ 2345 */
1563static void __send_queued(struct ceph_osd_client *osdc) 2346static void __linger_cancel(struct ceph_osd_linger_request *lreq)
2347{
2348 if (lreq->is_watch && lreq->ping_req->r_osd)
2349 cancel_linger_request(lreq->ping_req);
2350 if (lreq->reg_req->r_osd)
2351 cancel_linger_request(lreq->reg_req);
2352 cancel_linger_map_check(lreq);
2353 unlink_linger(lreq->osd, lreq);
2354 linger_unregister(lreq);
2355}
2356
2357static void linger_cancel(struct ceph_osd_linger_request *lreq)
1564{ 2358{
1565 struct ceph_osd_request *req, *tmp; 2359 struct ceph_osd_client *osdc = lreq->osdc;
1566 2360
1567 dout("__send_queued\n"); 2361 down_write(&osdc->lock);
1568 list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) 2362 if (__linger_registered(lreq))
1569 __send_request(osdc, req); 2363 __linger_cancel(lreq);
2364 up_write(&osdc->lock);
1570} 2365}
1571 2366
1572/* 2367static void send_linger_map_check(struct ceph_osd_linger_request *lreq);
1573 * Caller should hold map_sem for read and request_mutex. 2368
1574 */ 2369static void check_linger_pool_dne(struct ceph_osd_linger_request *lreq)
1575static int __ceph_osdc_start_request(struct ceph_osd_client *osdc, 2370{
1576 struct ceph_osd_request *req, 2371 struct ceph_osd_client *osdc = lreq->osdc;
1577 bool nofail) 2372 struct ceph_osdmap *map = osdc->osdmap;
1578{ 2373
1579 int rc; 2374 verify_osdc_wrlocked(osdc);
1580 2375 WARN_ON(!map->epoch);
1581 __register_request(osdc, req); 2376
1582 req->r_sent = 0; 2377 if (lreq->register_gen) {
1583 req->r_got_reply = 0; 2378 lreq->map_dne_bound = map->epoch;
1584 rc = __map_request(osdc, req, 0); 2379 dout("%s lreq %p linger_id %llu pool disappeared\n", __func__,
1585 if (rc < 0) { 2380 lreq, lreq->linger_id);
1586 if (nofail) { 2381 } else {
1587 dout("osdc_start_request failed map, " 2382 dout("%s lreq %p linger_id %llu map_dne_bound %u have %u\n",
1588 " will retry %lld\n", req->r_tid); 2383 __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
1589 rc = 0; 2384 map->epoch);
1590 } else {
1591 __unregister_request(osdc, req);
1592 }
1593 return rc;
1594 } 2385 }
1595 2386
1596 if (req->r_osd == NULL) { 2387 if (lreq->map_dne_bound) {
1597 dout("send_request %p no up osds in pg\n", req); 2388 if (map->epoch >= lreq->map_dne_bound) {
1598 ceph_monc_request_next_osdmap(&osdc->client->monc); 2389 /* we had a new enough map */
2390 pr_info("linger_id %llu pool does not exist\n",
2391 lreq->linger_id);
2392 linger_reg_commit_complete(lreq, -ENOENT);
2393 __linger_cancel(lreq);
2394 }
1599 } else { 2395 } else {
1600 __send_queued(osdc); 2396 send_linger_map_check(lreq);
1601 } 2397 }
2398}
1602 2399
1603 return 0; 2400static void linger_map_check_cb(struct ceph_mon_generic_request *greq)
2401{
2402 struct ceph_osd_client *osdc = &greq->monc->client->osdc;
2403 struct ceph_osd_linger_request *lreq;
2404 u64 linger_id = greq->private_data;
2405
2406 WARN_ON(greq->result || !greq->u.newest);
2407
2408 down_write(&osdc->lock);
2409 lreq = lookup_linger_mc(&osdc->linger_map_checks, linger_id);
2410 if (!lreq) {
2411 dout("%s linger_id %llu dne\n", __func__, linger_id);
2412 goto out_unlock;
2413 }
2414
2415 dout("%s lreq %p linger_id %llu map_dne_bound %u newest %llu\n",
2416 __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
2417 greq->u.newest);
2418 if (!lreq->map_dne_bound)
2419 lreq->map_dne_bound = greq->u.newest;
2420 erase_linger_mc(&osdc->linger_map_checks, lreq);
2421 check_linger_pool_dne(lreq);
2422
2423 linger_put(lreq);
2424out_unlock:
2425 up_write(&osdc->lock);
2426}
2427
2428static void send_linger_map_check(struct ceph_osd_linger_request *lreq)
2429{
2430 struct ceph_osd_client *osdc = lreq->osdc;
2431 struct ceph_osd_linger_request *lookup_lreq;
2432 int ret;
2433
2434 verify_osdc_wrlocked(osdc);
2435
2436 lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
2437 lreq->linger_id);
2438 if (lookup_lreq) {
2439 WARN_ON(lookup_lreq != lreq);
2440 return;
2441 }
2442
2443 linger_get(lreq);
2444 insert_linger_mc(&osdc->linger_map_checks, lreq);
2445 ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
2446 linger_map_check_cb, lreq->linger_id);
2447 WARN_ON(ret);
2448}
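
check_linger_pool_dne() and send_linger_map_check() together decide when a pool is definitively gone: the monitor reports the newest osdmap epoch, which becomes map_dne_bound, and once the client's own map has caught up to that epoch without the pool reappearing, the -ENOENT verdict is final. The bound test, as an illustrative sketch (assumed names):

#include <stdbool.h>

/* The pool is known to be gone once our map is at least as new as the
 * epoch by which it was already absent (map_dne_bound != 0). */
static bool pool_definitely_gone(unsigned int have_epoch,
				 unsigned int map_dne_bound)
{
	return map_dne_bound && have_epoch >= map_dne_bound;
}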
2449
2450static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq)
2451{
2452 int ret;
2453
2454 dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
2455 ret = wait_for_completion_interruptible(&lreq->reg_commit_wait);
2456 return ret ?: lreq->reg_commit_error;
2457}
2458
2459static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq)
2460{
2461 int ret;
2462
2463 dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
2464 ret = wait_for_completion_interruptible(&lreq->notify_finish_wait);
2465 return ret ?: lreq->notify_finish_error;
1604} 2466}
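
Both wait helpers above rely on the GNU `ret ?: err` shorthand: an error from the interruptible wait itself (typically -ERESTARTSYS on a signal) takes precedence over the error recorded by the completion callback. The expanded equivalent:

/* `a ?: b` is GNU C for `a ? a : b`, with `a` evaluated once. */
static int first_nonzero(int wait_ret, int recorded_error)
{
	return wait_ret ? wait_ret : recorded_error;
}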
1605 2467
1606/* 2468/*
1607 * Timeout callback, called every N seconds when 1 or more osd 2469 * Timeout callback, called every N seconds. When 1 or more OSD
 1608 * requests has been active for more than N seconds. When this 2470 * requests have been active for more than N seconds, we send a keepalive
1609 * happens, we ping all OSDs with requests who have timed out to 2471 * (tag + timestamp) to its OSD to ensure any communications channel
1610 * ensure any communications channel reset is detected. Reset the 2472 * reset is detected.
1611 * request timeouts another N seconds in the future as we go.
1612 * Reschedule the timeout event another N seconds in future (unless
1613 * there are no open requests).
1614 */ 2473 */
1615static void handle_timeout(struct work_struct *work) 2474static void handle_timeout(struct work_struct *work)
1616{ 2475{
1617 struct ceph_osd_client *osdc = 2476 struct ceph_osd_client *osdc =
1618 container_of(work, struct ceph_osd_client, timeout_work.work); 2477 container_of(work, struct ceph_osd_client, timeout_work.work);
1619 struct ceph_options *opts = osdc->client->options; 2478 struct ceph_options *opts = osdc->client->options;
1620 struct ceph_osd_request *req; 2479 unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
1621 struct ceph_osd *osd; 2480 LIST_HEAD(slow_osds);
1622 struct list_head slow_osds; 2481 struct rb_node *n, *p;
1623 dout("timeout\n");
1624 down_read(&osdc->map_sem);
1625
1626 ceph_monc_request_next_osdmap(&osdc->client->monc);
1627 2482
1628 mutex_lock(&osdc->request_mutex); 2483 dout("%s osdc %p\n", __func__, osdc);
2484 down_write(&osdc->lock);
1629 2485
1630 /* 2486 /*
1631 * ping osds that are a bit slow. this ensures that if there 2487 * ping osds that are a bit slow. this ensures that if there
1632 * is a break in the TCP connection we will notice, and reopen 2488 * is a break in the TCP connection we will notice, and reopen
1633 * a connection with that osd (from the fault callback). 2489 * a connection with that osd (from the fault callback).
1634 */ 2490 */
1635 INIT_LIST_HEAD(&slow_osds); 2491 for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
1636 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) { 2492 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
1637 if (time_before(jiffies, 2493 bool found = false;
1638 req->r_stamp + opts->osd_keepalive_timeout)) 2494
1639 break; 2495 for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
2496 struct ceph_osd_request *req =
2497 rb_entry(p, struct ceph_osd_request, r_node);
2498
2499 if (time_before(req->r_stamp, cutoff)) {
2500 dout(" req %p tid %llu on osd%d is laggy\n",
2501 req, req->r_tid, osd->o_osd);
2502 found = true;
2503 }
2504 }
2505 for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) {
2506 struct ceph_osd_linger_request *lreq =
2507 rb_entry(p, struct ceph_osd_linger_request, node);
2508
2509 dout(" lreq %p linger_id %llu is served by osd%d\n",
2510 lreq, lreq->linger_id, osd->o_osd);
2511 found = true;
2512
2513 mutex_lock(&lreq->lock);
2514 if (lreq->is_watch && lreq->committed && !lreq->last_error)
2515 send_linger_ping(lreq);
2516 mutex_unlock(&lreq->lock);
2517 }
1640 2518
1641 osd = req->r_osd; 2519 if (found)
1642 BUG_ON(!osd); 2520 list_move_tail(&osd->o_keepalive_item, &slow_osds);
1643 dout(" tid %llu is slow, will send keepalive on osd%d\n",
1644 req->r_tid, osd->o_osd);
1645 list_move_tail(&osd->o_keepalive_item, &slow_osds);
1646 } 2521 }
2522
2523 if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds))
2524 maybe_request_map(osdc);
2525
1647 while (!list_empty(&slow_osds)) { 2526 while (!list_empty(&slow_osds)) {
1648 osd = list_entry(slow_osds.next, struct ceph_osd, 2527 struct ceph_osd *osd = list_first_entry(&slow_osds,
1649 o_keepalive_item); 2528 struct ceph_osd,
2529 o_keepalive_item);
1650 list_del_init(&osd->o_keepalive_item); 2530 list_del_init(&osd->o_keepalive_item);
1651 ceph_con_keepalive(&osd->o_con); 2531 ceph_con_keepalive(&osd->o_con);
1652 } 2532 }
1653 2533
1654 __schedule_osd_timeout(osdc); 2534 up_write(&osdc->lock);
1655 __send_queued(osdc); 2535 schedule_delayed_work(&osdc->timeout_work,
1656 mutex_unlock(&osdc->request_mutex); 2536 osdc->client->options->osd_keepalive_timeout);
1657 up_read(&osdc->map_sem);
1658} 2537}
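
handle_timeout() above computes one cutoff (jiffies minus osd_keepalive_timeout) and treats any request whose r_stamp falls before it as laggy, then sends a single keepalive per affected OSD rather than one per request. The wrap-safe comparison can be sketched outside the kernel as:

#include <stdbool.h>

/* time_before(a, b) in the kernel expands to a wrap-safe signed
 * subtraction; the same idea applied to the laggy test above. */
static bool request_is_laggy(unsigned long stamp, unsigned long now,
			     unsigned long keepalive_timeout)
{
	unsigned long cutoff = now - keepalive_timeout;

	return (long)(stamp - cutoff) < 0;	/* stamp is before cutoff */
}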
1659 2538
1660static void handle_osds_timeout(struct work_struct *work) 2539static void handle_osds_timeout(struct work_struct *work)
@@ -1663,12 +2542,20 @@ static void handle_osds_timeout(struct work_struct *work)
1663 container_of(work, struct ceph_osd_client, 2542 container_of(work, struct ceph_osd_client,
1664 osds_timeout_work.work); 2543 osds_timeout_work.work);
1665 unsigned long delay = osdc->client->options->osd_idle_ttl / 4; 2544 unsigned long delay = osdc->client->options->osd_idle_ttl / 4;
2545 struct ceph_osd *osd, *nosd;
1666 2546
1667 dout("osds timeout\n"); 2547 dout("%s osdc %p\n", __func__, osdc);
1668 down_read(&osdc->map_sem); 2548 down_write(&osdc->lock);
1669 remove_old_osds(osdc); 2549 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
1670 up_read(&osdc->map_sem); 2550 if (time_before(jiffies, osd->lru_ttl))
2551 break;
1671 2552
2553 WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
2554 WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
2555 close_osd(osd);
2556 }
2557
2558 up_write(&osdc->lock);
1672 schedule_delayed_work(&osdc->osds_timeout_work, 2559 schedule_delayed_work(&osdc->osds_timeout_work,
1673 round_jiffies_relative(delay)); 2560 round_jiffies_relative(delay));
1674} 2561}
@@ -1776,107 +2663,76 @@ e_inval:
1776 goto out; 2663 goto out;
1777} 2664}
1778 2665
1779static void complete_request(struct ceph_osd_request *req) 2666struct MOSDOpReply {
1780{ 2667 struct ceph_pg pgid;
1781 complete_all(&req->r_safe_completion); /* fsync waiter */ 2668 u64 flags;
1782} 2669 int result;
2670 u32 epoch;
2671 int num_ops;
2672 u32 outdata_len[CEPH_OSD_MAX_OPS];
2673 s32 rval[CEPH_OSD_MAX_OPS];
2674 int retry_attempt;
2675 struct ceph_eversion replay_version;
2676 u64 user_version;
2677 struct ceph_request_redirect redirect;
2678};
1783 2679
1784/* 2680static int decode_MOSDOpReply(const struct ceph_msg *msg, struct MOSDOpReply *m)
1785 * handle osd op reply. either call the callback if it is specified,
1786 * or do the completion to wake up the waiting thread.
1787 */
1788static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1789{ 2681{
1790 void *p, *end; 2682 void *p = msg->front.iov_base;
1791 struct ceph_osd_request *req; 2683 void *const end = p + msg->front.iov_len;
1792 struct ceph_request_redirect redir; 2684 u16 version = le16_to_cpu(msg->hdr.version);
1793 u64 tid; 2685 struct ceph_eversion bad_replay_version;
1794 int object_len;
1795 unsigned int numops;
1796 int payload_len, flags;
1797 s32 result;
1798 s32 retry_attempt;
1799 struct ceph_pg pg;
1800 int err;
1801 u32 reassert_epoch;
1802 u64 reassert_version;
1803 u32 osdmap_epoch;
1804 int already_completed;
1805 u32 bytes;
1806 u8 decode_redir; 2686 u8 decode_redir;
1807 unsigned int i; 2687 u32 len;
1808 2688 int ret;
1809 tid = le64_to_cpu(msg->hdr.tid); 2689 int i;
1810 dout("handle_reply %p tid %llu\n", msg, tid);
1811 2690
1812 p = msg->front.iov_base; 2691 ceph_decode_32_safe(&p, end, len, e_inval);
1813 end = p + msg->front.iov_len; 2692 ceph_decode_need(&p, end, len, e_inval);
2693 p += len; /* skip oid */
1814 2694
1815 ceph_decode_need(&p, end, 4, bad); 2695 ret = ceph_decode_pgid(&p, end, &m->pgid);
1816 object_len = ceph_decode_32(&p); 2696 if (ret)
1817 ceph_decode_need(&p, end, object_len, bad); 2697 return ret;
1818 p += object_len;
1819 2698
1820 err = ceph_decode_pgid(&p, end, &pg); 2699 ceph_decode_64_safe(&p, end, m->flags, e_inval);
1821 if (err) 2700 ceph_decode_32_safe(&p, end, m->result, e_inval);
1822 goto bad; 2701 ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval);
2702 memcpy(&bad_replay_version, p, sizeof(bad_replay_version));
2703 p += sizeof(bad_replay_version);
2704 ceph_decode_32_safe(&p, end, m->epoch, e_inval);
1823 2705
1824 ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad); 2706 ceph_decode_32_safe(&p, end, m->num_ops, e_inval);
1825 flags = ceph_decode_64(&p); 2707 if (m->num_ops > ARRAY_SIZE(m->outdata_len))
1826 result = ceph_decode_32(&p); 2708 goto e_inval;
1827 reassert_epoch = ceph_decode_32(&p);
1828 reassert_version = ceph_decode_64(&p);
1829 osdmap_epoch = ceph_decode_32(&p);
1830
1831 /* lookup */
1832 down_read(&osdc->map_sem);
1833 mutex_lock(&osdc->request_mutex);
1834 req = __lookup_request(osdc, tid);
1835 if (req == NULL) {
1836 dout("handle_reply tid %llu dne\n", tid);
1837 goto bad_mutex;
1838 }
1839 ceph_osdc_get_request(req);
1840 2709
1841 dout("handle_reply %p tid %llu req %p result %d\n", msg, tid, 2710 ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op),
1842 req, result); 2711 e_inval);
1843 2712 for (i = 0; i < m->num_ops; i++) {
1844 ceph_decode_need(&p, end, 4, bad_put);
1845 numops = ceph_decode_32(&p);
1846 if (numops > CEPH_OSD_MAX_OPS)
1847 goto bad_put;
1848 if (numops != req->r_num_ops)
1849 goto bad_put;
1850 payload_len = 0;
1851 ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad_put);
1852 for (i = 0; i < numops; i++) {
1853 struct ceph_osd_op *op = p; 2713 struct ceph_osd_op *op = p;
1854 int len;
1855 2714
1856 len = le32_to_cpu(op->payload_len); 2715 m->outdata_len[i] = le32_to_cpu(op->payload_len);
1857 req->r_ops[i].outdata_len = len;
1858 dout(" op %d has %d bytes\n", i, len);
1859 payload_len += len;
1860 p += sizeof(*op); 2716 p += sizeof(*op);
1861 } 2717 }
1862 bytes = le32_to_cpu(msg->hdr.data_len);
1863 if (payload_len != bytes) {
1864 pr_warn("sum of op payload lens %d != data_len %d\n",
1865 payload_len, bytes);
1866 goto bad_put;
1867 }
1868 2718
1869 ceph_decode_need(&p, end, 4 + numops * 4, bad_put); 2719 ceph_decode_32_safe(&p, end, m->retry_attempt, e_inval);
1870 retry_attempt = ceph_decode_32(&p); 2720 for (i = 0; i < m->num_ops; i++)
1871 for (i = 0; i < numops; i++) 2721 ceph_decode_32_safe(&p, end, m->rval[i], e_inval);
1872 req->r_ops[i].rval = ceph_decode_32(&p);
1873 2722
1874 if (le16_to_cpu(msg->hdr.version) >= 6) { 2723 if (version >= 5) {
1875 p += 8 + 4; /* skip replay_version */ 2724 ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval);
1876 p += 8; /* skip user_version */ 2725 memcpy(&m->replay_version, p, sizeof(m->replay_version));
2726 p += sizeof(m->replay_version);
2727 ceph_decode_64_safe(&p, end, m->user_version, e_inval);
2728 } else {
2729 m->replay_version = bad_replay_version; /* struct */
2730 m->user_version = le64_to_cpu(m->replay_version.version);
2731 }
1877 2732
1878 if (le16_to_cpu(msg->hdr.version) >= 7) 2733 if (version >= 6) {
1879 ceph_decode_8_safe(&p, end, decode_redir, bad_put); 2734 if (version >= 7)
2735 ceph_decode_8_safe(&p, end, decode_redir, e_inval);
1880 else 2736 else
1881 decode_redir = 1; 2737 decode_redir = 1;
1882 } else { 2738 } else {
@@ -1884,228 +2740,410 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1884 } 2740 }
1885 2741
1886 if (decode_redir) { 2742 if (decode_redir) {
1887 err = ceph_redirect_decode(&p, end, &redir); 2743 ret = ceph_redirect_decode(&p, end, &m->redirect);
1888 if (err) 2744 if (ret)
1889 goto bad_put; 2745 return ret;
1890 } else { 2746 } else {
1891 redir.oloc.pool = -1; 2747 ceph_oloc_init(&m->redirect.oloc);
1892 } 2748 }
1893 2749
1894 if (redir.oloc.pool != -1) { 2750 return 0;
1895 dout("redirect pool %lld\n", redir.oloc.pool);
1896
1897 __unregister_request(osdc, req);
1898
1899 req->r_target_oloc = redir.oloc; /* struct */
1900 2751
1901 /* 2752e_inval:
1902 * Start redirect requests with nofail=true. If 2753 return -EINVAL;
1903 * mapping fails, request will end up on the notarget 2754}
1904 * list, waiting for the new osdmap (which can take
1905 * a while), even though the original request mapped
1906 * successfully. In the future we might want to follow
1907 * original request's nofail setting here.
1908 */
1909 err = __ceph_osdc_start_request(osdc, req, true);
1910 BUG_ON(err);
1911 2755
1912 goto out_unlock; 2756/*
1913 } 2757 * We are done with @req if
2758 * - @m is a safe reply, or
2759 * - @m is an unsafe reply and we didn't want a safe one
2760 */
2761static bool done_request(const struct ceph_osd_request *req,
2762 const struct MOSDOpReply *m)
2763{
2764 return (m->result < 0 ||
2765 (m->flags & CEPH_OSD_FLAG_ONDISK) ||
2766 !(req->r_flags & CEPH_OSD_FLAG_ONDISK));
2767}
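
done_request() condenses the ack/commit protocol into a single predicate. A standalone restatement with the flag tests reduced to booleans (illustrative, not the kernel's types):

#include <stdbool.h>

/* Finished when the op failed, the reply is an on-disk commit, or the
 * request never asked for an on-disk commit in the first place. */
static bool reply_finishes_request(int result, bool reply_ondisk,
				   bool wanted_ondisk)
{
	return result < 0 || reply_ondisk || !wanted_ondisk;
}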
1914 2768
1915 already_completed = req->r_got_reply; 2769/*
1916 if (!req->r_got_reply) { 2770 * handle osd op reply. either call the callback if it is specified,
1917 req->r_result = result; 2771 * or do the completion to wake up the waiting thread.
1918 dout("handle_reply result %d bytes %d\n", req->r_result, 2772 *
1919 bytes); 2773 * ->r_unsafe_callback is set? yes no
1920 if (req->r_result == 0) 2774 *
1921 req->r_result = bytes; 2775 * first reply is OK (needed r_cb/r_completion, r_cb/r_completion,
2776 * any or needed/got safe) r_safe_completion r_safe_completion
2777 *
2778 * first reply is unsafe r_unsafe_cb(true) (nothing)
2779 *
2780 * when we get the safe reply r_unsafe_cb(false), r_cb/r_completion,
2781 * r_safe_completion r_safe_completion
2782 */
2783static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
2784{
2785 struct ceph_osd_client *osdc = osd->o_osdc;
2786 struct ceph_osd_request *req;
2787 struct MOSDOpReply m;
2788 u64 tid = le64_to_cpu(msg->hdr.tid);
2789 u32 data_len = 0;
2790 bool already_acked;
2791 int ret;
2792 int i;
1922 2793
1923 /* in case this is a write and we need to replay, */ 2794 dout("%s msg %p tid %llu\n", __func__, msg, tid);
1924 req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch);
1925 req->r_reassert_version.version = cpu_to_le64(reassert_version);
1926 2795
1927 req->r_got_reply = 1; 2796 down_read(&osdc->lock);
1928 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { 2797 if (!osd_registered(osd)) {
1929 dout("handle_reply tid %llu dup ack\n", tid); 2798 dout("%s osd%d unknown\n", __func__, osd->o_osd);
1930 goto out_unlock; 2799 goto out_unlock_osdc;
1931 } 2800 }
2801 WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));
1932 2802
1933 dout("handle_reply tid %llu flags %d\n", tid, flags); 2803 mutex_lock(&osd->lock);
2804 req = lookup_request(&osd->o_requests, tid);
2805 if (!req) {
2806 dout("%s osd%d tid %llu unknown\n", __func__, osd->o_osd, tid);
2807 goto out_unlock_session;
2808 }
1934 2809
1935 if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK)) 2810 ret = decode_MOSDOpReply(msg, &m);
1936 __register_linger_request(osdc, req); 2811 if (ret) {
2812 pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
2813 req->r_tid, ret);
2814 ceph_msg_dump(msg);
2815 goto fail_request;
2816 }
2817 dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n",
2818 __func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed,
2819 m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch),
2820 le64_to_cpu(m.replay_version.version), m.user_version);
2821
2822 if (m.retry_attempt >= 0) {
2823 if (m.retry_attempt != req->r_attempts - 1) {
2824 dout("req %p tid %llu retry_attempt %d != %d, ignoring\n",
2825 req, req->r_tid, m.retry_attempt,
2826 req->r_attempts - 1);
2827 goto out_unlock_session;
2828 }
2829 } else {
2830 WARN_ON(1); /* MOSDOpReply v4 is assumed */
2831 }
1937 2832
1938 /* either this is a read, or we got the safe response */ 2833 if (!ceph_oloc_empty(&m.redirect.oloc)) {
1939 if (result < 0 || 2834 dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid,
1940 (flags & CEPH_OSD_FLAG_ONDISK) || 2835 m.redirect.oloc.pool);
1941 ((flags & CEPH_OSD_FLAG_WRITE) == 0)) 2836 unlink_request(osd, req);
1942 __unregister_request(osdc, req); 2837 mutex_unlock(&osd->lock);
2838
2839 ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc);
2840 req->r_flags |= CEPH_OSD_FLAG_REDIRECTED;
2841 req->r_tid = 0;
2842 __submit_request(req, false);
2843 goto out_unlock_osdc;
2844 }
1943 2845
1944 mutex_unlock(&osdc->request_mutex); 2846 if (m.num_ops != req->r_num_ops) {
1945 up_read(&osdc->map_sem); 2847 pr_err("num_ops %d != %d for tid %llu\n", m.num_ops,
2848 req->r_num_ops, req->r_tid);
2849 goto fail_request;
2850 }
2851 for (i = 0; i < req->r_num_ops; i++) {
2852 dout(" req %p tid %llu op %d rval %d len %u\n", req,
2853 req->r_tid, i, m.rval[i], m.outdata_len[i]);
2854 req->r_ops[i].rval = m.rval[i];
2855 req->r_ops[i].outdata_len = m.outdata_len[i];
2856 data_len += m.outdata_len[i];
2857 }
2858 if (data_len != le32_to_cpu(msg->hdr.data_len)) {
2859 pr_err("sum of lens %u != %u for tid %llu\n", data_len,
2860 le32_to_cpu(msg->hdr.data_len), req->r_tid);
2861 goto fail_request;
2862 }
2863 dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__,
2864 req, req->r_tid, req->r_got_reply, m.result, data_len);
2865
2866 already_acked = req->r_got_reply;
2867 if (!already_acked) {
2868 req->r_result = m.result ?: data_len;
2869 req->r_replay_version = m.replay_version; /* struct */
2870 req->r_got_reply = true;
2871 } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
2872 dout("req %p tid %llu dup ack\n", req, req->r_tid);
2873 goto out_unlock_session;
2874 }
1946 2875
1947 if (!already_completed) { 2876 if (done_request(req, &m)) {
1948 if (req->r_unsafe_callback && 2877 __finish_request(req);
1949 result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK)) 2878 if (req->r_linger) {
1950 req->r_unsafe_callback(req, true); 2879 WARN_ON(req->r_unsafe_callback);
1951 if (req->r_callback) 2880 dout("req %p tid %llu cb (locked)\n", req, req->r_tid);
1952 req->r_callback(req, msg); 2881 __complete_request(req);
1953 else 2882 }
1954 complete_all(&req->r_completion);
1955 } 2883 }
1956 2884
1957 if (flags & CEPH_OSD_FLAG_ONDISK) { 2885 mutex_unlock(&osd->lock);
1958 if (req->r_unsafe_callback && already_completed) 2886 up_read(&osdc->lock);
2887
2888 if (done_request(req, &m)) {
2889 if (already_acked && req->r_unsafe_callback) {
2890 dout("req %p tid %llu safe-cb\n", req, req->r_tid);
1959 req->r_unsafe_callback(req, false); 2891 req->r_unsafe_callback(req, false);
1960 complete_request(req); 2892 } else if (!req->r_linger) {
2893 dout("req %p tid %llu cb\n", req, req->r_tid);
2894 __complete_request(req);
2895 }
2896 } else {
2897 if (req->r_unsafe_callback) {
2898 dout("req %p tid %llu unsafe-cb\n", req, req->r_tid);
2899 req->r_unsafe_callback(req, true);
2900 } else {
2901 WARN_ON(1);
2902 }
1961 } 2903 }
2904 if (m.flags & CEPH_OSD_FLAG_ONDISK)
2905 complete_all(&req->r_safe_completion);
1962 2906
1963out:
1964 dout("req=%p req->r_linger=%d\n", req, req->r_linger);
1965 ceph_osdc_put_request(req); 2907 ceph_osdc_put_request(req);
1966 return; 2908 return;
1967out_unlock:
1968 mutex_unlock(&osdc->request_mutex);
1969 up_read(&osdc->map_sem);
1970 goto out;
1971 2909
1972bad_put: 2910fail_request:
1973 req->r_result = -EIO; 2911 complete_request(req, -EIO);
1974 __unregister_request(osdc, req); 2912out_unlock_session:
1975 if (req->r_callback) 2913 mutex_unlock(&osd->lock);
1976 req->r_callback(req, msg); 2914out_unlock_osdc:
1977 else 2915 up_read(&osdc->lock);
1978 complete_all(&req->r_completion);
1979 complete_request(req);
1980 ceph_osdc_put_request(req);
1981bad_mutex:
1982 mutex_unlock(&osdc->request_mutex);
1983 up_read(&osdc->map_sem);
1984bad:
1985 pr_err("corrupt osd_op_reply got %d %d\n",
1986 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len));
1987 ceph_msg_dump(msg);
1988} 2916}
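
The table in the handle_reply() comment above reads as a small decision procedure: linger requests aside, which notification fires depends on whether the reply finishes the request (per done_request()), whether an ack was already seen, and whether an unsafe callback is registered. A hedged restatement with hypothetical names:

#include <stdio.h>
#include <stdbool.h>

/* Prints which notification the dispatch above would issue; purely
 * illustrative, ignoring the linger special case and all locking. */
static void dispatch_reply(bool done, bool already_acked, bool has_unsafe_cb)
{
	if (done) {
		if (already_acked && has_unsafe_cb)
			printf("r_unsafe_callback(req, false)\n");
		else
			printf("r_callback / complete r_completion\n");
	} else {
		/* first, unsafe reply: acked but not yet on disk */
		printf("r_unsafe_callback(req, true)\n");
	}
}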
1989 2917
1990static void reset_changed_osds(struct ceph_osd_client *osdc) 2918static void set_pool_was_full(struct ceph_osd_client *osdc)
1991{ 2919{
1992 struct rb_node *p, *n; 2920 struct rb_node *n;
1993 2921
1994 dout("%s %p\n", __func__, osdc); 2922 for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
1995 for (p = rb_first(&osdc->osds); p; p = n) { 2923 struct ceph_pg_pool_info *pi =
1996 struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node); 2924 rb_entry(n, struct ceph_pg_pool_info, node);
1997 2925
1998 n = rb_next(p); 2926 pi->was_full = __pool_full(pi);
1999 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
2000 memcmp(&osd->o_con.peer_addr,
2001 ceph_osd_addr(osdc->osdmap,
2002 osd->o_osd),
2003 sizeof(struct ceph_entity_addr)) != 0)
2004 __reset_osd(osdc, osd);
2005 } 2927 }
2006} 2928}
2007 2929
2930static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id)
2931{
2932 struct ceph_pg_pool_info *pi;
2933
2934 pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
2935 if (!pi)
2936 return false;
2937
2938 return pi->was_full && !__pool_full(pi);
2939}
2940
2941static enum calc_target_result
2942recalc_linger_target(struct ceph_osd_linger_request *lreq)
2943{
2944 struct ceph_osd_client *osdc = lreq->osdc;
2945 enum calc_target_result ct_res;
2946
2947 ct_res = calc_target(osdc, &lreq->t, &lreq->last_force_resend, true);
2948 if (ct_res == CALC_TARGET_NEED_RESEND) {
2949 struct ceph_osd *osd;
2950
2951 osd = lookup_create_osd(osdc, lreq->t.osd, true);
2952 if (osd != lreq->osd) {
2953 unlink_linger(lreq->osd, lreq);
2954 link_linger(osd, lreq);
2955 }
2956 }
2957
2958 return ct_res;
2959}
2960
2008/* 2961/*
2009 * Requeue requests whose mapping to an OSD has changed. If requests map to 2962 * Requeue requests whose mapping to an OSD has changed.
2010 * no osd, request a new map.
2011 *
2012 * Caller should hold map_sem for read.
2013 */ 2963 */
2014static void kick_requests(struct ceph_osd_client *osdc, bool force_resend, 2964static void scan_requests(struct ceph_osd *osd,
2015 bool force_resend_writes) 2965 bool force_resend,
2966 bool cleared_full,
2967 bool check_pool_cleared_full,
2968 struct rb_root *need_resend,
2969 struct list_head *need_resend_linger)
2016{ 2970{
2017 struct ceph_osd_request *req, *nreq; 2971 struct ceph_osd_client *osdc = osd->o_osdc;
2018 struct rb_node *p; 2972 struct rb_node *n;
2019 int needmap = 0; 2973 bool force_resend_writes;
2020 int err; 2974
2021 bool force_resend_req; 2975 for (n = rb_first(&osd->o_linger_requests); n; ) {
2976 struct ceph_osd_linger_request *lreq =
2977 rb_entry(n, struct ceph_osd_linger_request, node);
2978 enum calc_target_result ct_res;
2979
2980 n = rb_next(n); /* recalc_linger_target() */
2981
2982 dout("%s lreq %p linger_id %llu\n", __func__, lreq,
2983 lreq->linger_id);
2984 ct_res = recalc_linger_target(lreq);
2985 switch (ct_res) {
2986 case CALC_TARGET_NO_ACTION:
2987 force_resend_writes = cleared_full ||
2988 (check_pool_cleared_full &&
2989 pool_cleared_full(osdc, lreq->t.base_oloc.pool));
2990 if (!force_resend && !force_resend_writes)
2991 break;
2992
2993 /* fall through */
2994 case CALC_TARGET_NEED_RESEND:
2995 cancel_linger_map_check(lreq);
2996 /*
2997 * scan_requests() for the previous epoch(s)
2998 * may have already added it to the list, since
2999 * it's not unlinked here.
3000 */
3001 if (list_empty(&lreq->scan_item))
3002 list_add_tail(&lreq->scan_item, need_resend_linger);
3003 break;
3004 case CALC_TARGET_POOL_DNE:
3005 check_linger_pool_dne(lreq);
3006 break;
3007 }
3008 }
2022 3009
2023 dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "", 3010 for (n = rb_first(&osd->o_requests); n; ) {
2024 force_resend_writes ? " (force resend writes)" : ""); 3011 struct ceph_osd_request *req =
2025 mutex_lock(&osdc->request_mutex); 3012 rb_entry(n, struct ceph_osd_request, r_node);
2026 for (p = rb_first(&osdc->requests); p; ) { 3013 enum calc_target_result ct_res;
2027 req = rb_entry(p, struct ceph_osd_request, r_node); 3014
2028 p = rb_next(p); 3015 n = rb_next(n); /* unlink_request(), check_pool_dne() */
3016
3017 dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
3018 ct_res = calc_target(osdc, &req->r_t,
3019 &req->r_last_force_resend, false);
3020 switch (ct_res) {
3021 case CALC_TARGET_NO_ACTION:
3022 force_resend_writes = cleared_full ||
3023 (check_pool_cleared_full &&
3024 pool_cleared_full(osdc, req->r_t.base_oloc.pool));
3025 if (!force_resend &&
3026 (!(req->r_flags & CEPH_OSD_FLAG_WRITE) ||
3027 !force_resend_writes))
3028 break;
3029
3030 /* fall through */
3031 case CALC_TARGET_NEED_RESEND:
3032 cancel_map_check(req);
3033 unlink_request(osd, req);
3034 insert_request(need_resend, req);
3035 break;
3036 case CALC_TARGET_POOL_DNE:
3037 check_pool_dne(req);
3038 break;
3039 }
3040 }
3041}
2029 3042
3043static int handle_one_map(struct ceph_osd_client *osdc,
3044 void *p, void *end, bool incremental,
3045 struct rb_root *need_resend,
3046 struct list_head *need_resend_linger)
3047{
3048 struct ceph_osdmap *newmap;
3049 struct rb_node *n;
3050 bool skipped_map = false;
3051 bool was_full;
3052
3053 was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
3054 set_pool_was_full(osdc);
3055
3056 if (incremental)
3057 newmap = osdmap_apply_incremental(&p, end, osdc->osdmap);
3058 else
3059 newmap = ceph_osdmap_decode(&p, end);
3060 if (IS_ERR(newmap))
3061 return PTR_ERR(newmap);
3062
3063 if (newmap != osdc->osdmap) {
2030 /* 3064 /*
2031 * For linger requests that have not yet been 3065 * Preserve ->was_full before destroying the old map.
2032 * registered, move them to the linger list; they'll 3066 * For pools that weren't in the old map, ->was_full
2033 * be sent to the osd in the loop below. Unregister 3067 * should be false.
2034 * the request before re-registering it as a linger
2035 * request to ensure the __map_request() below
2036 * will decide it needs to be sent.
2037 */ 3068 */
2038 if (req->r_linger && list_empty(&req->r_linger_item)) { 3069 for (n = rb_first(&newmap->pg_pools); n; n = rb_next(n)) {
2039 dout("%p tid %llu restart on osd%d\n", 3070 struct ceph_pg_pool_info *pi =
2040 req, req->r_tid, 3071 rb_entry(n, struct ceph_pg_pool_info, node);
2041 req->r_osd ? req->r_osd->o_osd : -1); 3072 struct ceph_pg_pool_info *old_pi;
2042 ceph_osdc_get_request(req); 3073
2043 __unregister_request(osdc, req); 3074 old_pi = ceph_pg_pool_by_id(osdc->osdmap, pi->id);
2044 __register_linger_request(osdc, req); 3075 if (old_pi)
2045 ceph_osdc_put_request(req); 3076 pi->was_full = old_pi->was_full;
2046 continue; 3077 else
3078 WARN_ON(pi->was_full);
2047 } 3079 }
2048 3080
2049 force_resend_req = force_resend || 3081 if (osdc->osdmap->epoch &&
2050 (force_resend_writes && 3082 osdc->osdmap->epoch + 1 < newmap->epoch) {
2051 req->r_flags & CEPH_OSD_FLAG_WRITE); 3083 WARN_ON(incremental);
2052 err = __map_request(osdc, req, force_resend_req); 3084 skipped_map = true;
2053 if (err < 0)
2054 continue; /* error */
2055 if (req->r_osd == NULL) {
2056 dout("%p tid %llu maps to no osd\n", req, req->r_tid);
2057 needmap++; /* request a newer map */
2058 } else if (err > 0) {
2059 if (!req->r_linger) {
2060 dout("%p tid %llu requeued on osd%d\n", req,
2061 req->r_tid,
2062 req->r_osd ? req->r_osd->o_osd : -1);
2063 req->r_flags |= CEPH_OSD_FLAG_RETRY;
2064 }
2065 } 3085 }
3086
3087 ceph_osdmap_destroy(osdc->osdmap);
3088 osdc->osdmap = newmap;
2066 } 3089 }
2067 3090
2068 list_for_each_entry_safe(req, nreq, &osdc->req_linger, 3091 was_full &= !ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
2069 r_linger_item) { 3092 scan_requests(&osdc->homeless_osd, skipped_map, was_full, true,
2070 dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); 3093 need_resend, need_resend_linger);
2071 3094
2072 err = __map_request(osdc, req, 3095 for (n = rb_first(&osdc->osds); n; ) {
2073 force_resend || force_resend_writes); 3096 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
2074 dout("__map_request returned %d\n", err); 3097
2075 if (err < 0) 3098 n = rb_next(n); /* close_osd() */
2076 continue; /* hrm! */ 3099
2077 if (req->r_osd == NULL || err > 0) { 3100 scan_requests(osd, skipped_map, was_full, true, need_resend,
2078 if (req->r_osd == NULL) { 3101 need_resend_linger);
2079 dout("lingering %p tid %llu maps to no osd\n", 3102 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
2080 req, req->r_tid); 3103 memcmp(&osd->o_con.peer_addr,
2081 /* 3104 ceph_osd_addr(osdc->osdmap, osd->o_osd),
2082 * A homeless lingering request makes 3105 sizeof(struct ceph_entity_addr)))
2083 * no sense, as it's job is to keep 3106 close_osd(osd);
2084 * a particular OSD connection open. 3107 }
2085 * Request a newer map and kick the
2086 * request, knowing that it won't be
2087 * resent until we actually get a map
2088 * that can tell us where to send it.
2089 */
2090 needmap++;
2091 }
2092 3108
2093 dout("kicking lingering %p tid %llu osd%d\n", req, 3109 return 0;
2094 req->r_tid, req->r_osd ? req->r_osd->o_osd : -1); 3110}
2095 __register_request(osdc, req); 3111
2096 __unregister_linger_request(osdc, req); 3112static void kick_requests(struct ceph_osd_client *osdc,
3113 struct rb_root *need_resend,
3114 struct list_head *need_resend_linger)
3115{
3116 struct ceph_osd_linger_request *lreq, *nlreq;
3117 struct rb_node *n;
3118
3119 for (n = rb_first(need_resend); n; ) {
3120 struct ceph_osd_request *req =
3121 rb_entry(n, struct ceph_osd_request, r_node);
3122 struct ceph_osd *osd;
3123
3124 n = rb_next(n);
3125 erase_request(need_resend, req); /* before link_request() */
3126
3127 WARN_ON(req->r_osd);
3128 calc_target(osdc, &req->r_t, NULL, false);
3129 osd = lookup_create_osd(osdc, req->r_t.osd, true);
3130 link_request(osd, req);
3131 if (!req->r_linger) {
3132 if (!osd_homeless(osd) && !req->r_t.paused)
3133 send_request(req);
3134 } else {
3135 cancel_linger_request(req);
2097 } 3136 }
2098 } 3137 }
2099 reset_changed_osds(osdc);
2100 mutex_unlock(&osdc->request_mutex);
2101 3138
2102 if (needmap) { 3139 list_for_each_entry_safe(lreq, nlreq, need_resend_linger, scan_item) {
2103 dout("%d requests for down osds, need new map\n", needmap); 3140 if (!osd_homeless(lreq->osd))
2104 ceph_monc_request_next_osdmap(&osdc->client->monc); 3141 send_linger(lreq);
3142
3143 list_del_init(&lreq->scan_item);
2105 } 3144 }
2106} 3145}
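
scan_requests() reduces each osdmap change to one of three per-request outcomes, and kick_requests() then re-targets and resends whatever landed on the need_resend lists. The outcome handling, compressed into a sketch (enum values mirror the CALC_TARGET_* constants; not kernel code):

#include <stdbool.h>

enum ct_result { CT_NO_ACTION, CT_NEED_RESEND, CT_POOL_DNE };

static const char *map_change_action(enum ct_result ct_res, bool force_resend)
{
	switch (ct_res) {
	case CT_NO_ACTION:
		return force_resend ? "resend anyway" : "leave in place";
	case CT_NEED_RESEND:
		return "unlink and queue for resend";	/* target moved */
	case CT_POOL_DNE:
		return "check whether the pool is gone";
	}
	return "unreachable";
}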
2107 3146
2108
2109/* 3147/*
2110 * Process updated osd map. 3148 * Process updated osd map.
2111 * 3149 *
@@ -2115,27 +3153,31 @@ static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
2115 */ 3153 */
2116void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) 3154void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
2117{ 3155{
2118 void *p, *end, *next; 3156 void *p = msg->front.iov_base;
3157 void *const end = p + msg->front.iov_len;
2119 u32 nr_maps, maplen; 3158 u32 nr_maps, maplen;
2120 u32 epoch; 3159 u32 epoch;
2121 struct ceph_osdmap *newmap = NULL, *oldmap;
2122 int err;
2123 struct ceph_fsid fsid; 3160 struct ceph_fsid fsid;
2124 bool was_full; 3161 struct rb_root need_resend = RB_ROOT;
3162 LIST_HEAD(need_resend_linger);
3163 bool handled_incremental = false;
3164 bool was_pauserd, was_pausewr;
3165 bool pauserd, pausewr;
3166 int err;
2125 3167
2126 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0); 3168 dout("%s have %u\n", __func__, osdc->osdmap->epoch);
2127 p = msg->front.iov_base; 3169 down_write(&osdc->lock);
2128 end = p + msg->front.iov_len;
2129 3170
2130 /* verify fsid */ 3171 /* verify fsid */
2131 ceph_decode_need(&p, end, sizeof(fsid), bad); 3172 ceph_decode_need(&p, end, sizeof(fsid), bad);
2132 ceph_decode_copy(&p, &fsid, sizeof(fsid)); 3173 ceph_decode_copy(&p, &fsid, sizeof(fsid));
2133 if (ceph_check_fsid(osdc->client, &fsid) < 0) 3174 if (ceph_check_fsid(osdc->client, &fsid) < 0)
2134 return; 3175 goto bad;
2135 3176
2136 down_write(&osdc->map_sem); 3177 was_pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
2137 3178 was_pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
2138 was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); 3179 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
3180 have_pool_full(osdc);
2139 3181
2140 /* incremental maps */ 3182 /* incremental maps */
2141 ceph_decode_32_safe(&p, end, nr_maps, bad); 3183 ceph_decode_32_safe(&p, end, nr_maps, bad);
@@ -2145,34 +3187,23 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
2145 epoch = ceph_decode_32(&p); 3187 epoch = ceph_decode_32(&p);
2146 maplen = ceph_decode_32(&p); 3188 maplen = ceph_decode_32(&p);
2147 ceph_decode_need(&p, end, maplen, bad); 3189 ceph_decode_need(&p, end, maplen, bad);
2148 next = p + maplen; 3190 if (osdc->osdmap->epoch &&
2149 if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) { 3191 osdc->osdmap->epoch + 1 == epoch) {
2150 dout("applying incremental map %u len %d\n", 3192 dout("applying incremental map %u len %d\n",
2151 epoch, maplen); 3193 epoch, maplen);
2152 newmap = osdmap_apply_incremental(&p, next, 3194 err = handle_one_map(osdc, p, p + maplen, true,
2153 osdc->osdmap, 3195 &need_resend, &need_resend_linger);
2154 &osdc->client->msgr); 3196 if (err)
2155 if (IS_ERR(newmap)) {
2156 err = PTR_ERR(newmap);
2157 goto bad; 3197 goto bad;
2158 } 3198 handled_incremental = true;
2159 BUG_ON(!newmap);
2160 if (newmap != osdc->osdmap) {
2161 ceph_osdmap_destroy(osdc->osdmap);
2162 osdc->osdmap = newmap;
2163 }
2164 was_full = was_full ||
2165 ceph_osdmap_flag(osdc->osdmap,
2166 CEPH_OSDMAP_FULL);
2167 kick_requests(osdc, 0, was_full);
2168 } else { 3199 } else {
2169 dout("ignoring incremental map %u len %d\n", 3200 dout("ignoring incremental map %u len %d\n",
2170 epoch, maplen); 3201 epoch, maplen);
2171 } 3202 }
2172 p = next; 3203 p += maplen;
2173 nr_maps--; 3204 nr_maps--;
2174 } 3205 }
2175 if (newmap) 3206 if (handled_incremental)
2176 goto done; 3207 goto done;
2177 3208
2178 /* full maps */ 3209 /* full maps */
@@ -2186,455 +3217,647 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
2186 if (nr_maps > 1) { 3217 if (nr_maps > 1) {
2187 dout("skipping non-latest full map %u len %d\n", 3218 dout("skipping non-latest full map %u len %d\n",
2188 epoch, maplen); 3219 epoch, maplen);
2189 } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) { 3220 } else if (osdc->osdmap->epoch >= epoch) {
2190 dout("skipping full map %u len %d, " 3221 dout("skipping full map %u len %d, "
2191 "older than our %u\n", epoch, maplen, 3222 "older than our %u\n", epoch, maplen,
2192 osdc->osdmap->epoch); 3223 osdc->osdmap->epoch);
2193 } else { 3224 } else {
2194 int skipped_map = 0;
2195
2196 dout("taking full map %u len %d\n", epoch, maplen); 3225 dout("taking full map %u len %d\n", epoch, maplen);
2197 newmap = ceph_osdmap_decode(&p, p+maplen); 3226 err = handle_one_map(osdc, p, p + maplen, false,
2198 if (IS_ERR(newmap)) { 3227 &need_resend, &need_resend_linger);
2199 err = PTR_ERR(newmap); 3228 if (err)
2200 goto bad; 3229 goto bad;
2201 }
2202 BUG_ON(!newmap);
2203 oldmap = osdc->osdmap;
2204 osdc->osdmap = newmap;
2205 if (oldmap) {
2206 if (oldmap->epoch + 1 < newmap->epoch)
2207 skipped_map = 1;
2208 ceph_osdmap_destroy(oldmap);
2209 }
2210 was_full = was_full ||
2211 ceph_osdmap_flag(osdc->osdmap,
2212 CEPH_OSDMAP_FULL);
2213 kick_requests(osdc, skipped_map, was_full);
2214 } 3230 }
2215 p += maplen; 3231 p += maplen;
2216 nr_maps--; 3232 nr_maps--;
2217 } 3233 }
2218 3234
2219 if (!osdc->osdmap)
2220 goto bad;
2221done: 3235done:
2222 downgrade_write(&osdc->map_sem);
2223 ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
2224 osdc->osdmap->epoch);
2225
2226 /* 3236 /*
2227 * subscribe to subsequent osdmap updates if full to ensure 3237 * subscribe to subsequent osdmap updates if full to ensure
2228 * we find out when we are no longer full and stop returning 3238 * we find out when we are no longer full and stop returning
2229 * ENOSPC. 3239 * ENOSPC.
2230 */ 3240 */
2231 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || 3241 pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
2232 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) || 3242 pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
2233 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) 3243 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
2234 ceph_monc_request_next_osdmap(&osdc->client->monc); 3244 have_pool_full(osdc);
2235 3245 if (was_pauserd || was_pausewr || pauserd || pausewr)
2236 mutex_lock(&osdc->request_mutex); 3246 maybe_request_map(osdc);
2237 __send_queued(osdc); 3247
2238 mutex_unlock(&osdc->request_mutex); 3248 kick_requests(osdc, &need_resend, &need_resend_linger);
2239 up_read(&osdc->map_sem); 3249
3250 ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
3251 osdc->osdmap->epoch);
3252 up_write(&osdc->lock);
2240 wake_up_all(&osdc->client->auth_wq); 3253 wake_up_all(&osdc->client->auth_wq);
2241 return; 3254 return;
2242 3255
2243bad: 3256bad:
2244 pr_err("osdc handle_map corrupt msg\n"); 3257 pr_err("osdc handle_map corrupt msg\n");
2245 ceph_msg_dump(msg); 3258 ceph_msg_dump(msg);
2246 up_write(&osdc->map_sem); 3259 up_write(&osdc->lock);
2247} 3260}
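
ceph_osdc_handle_map() compares pause state before and after applying the maps: reads pause on PAUSERD alone, writes pause on PAUSEWR, a FULL cluster, or any full pool, and a transition in either direction triggers maybe_request_map(). Condensed into two hedged helpers:

#include <stdbool.h>

/* Per the flag computations above: reads and writes pause on
 * different conditions. */
static bool reads_paused(bool pauserd)
{
	return pauserd;
}

static bool writes_paused(bool pausewr, bool map_full, bool any_pool_full)
{
	return pausewr || map_full || any_pool_full;
}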
2248 3261
2249/* 3262/*
2250 * watch/notify callback event infrastructure 3263 * Resubmit requests pending on the given osd.
2251 *
2252 * These callbacks are used both for watch and notify operations.
2253 */ 3264 */
2254static void __release_event(struct kref *kref) 3265static void kick_osd_requests(struct ceph_osd *osd)
2255{ 3266{
2256 struct ceph_osd_event *event = 3267 struct rb_node *n;
2257 container_of(kref, struct ceph_osd_event, kref);
2258 3268
2259 dout("__release_event %p\n", event); 3269 for (n = rb_first(&osd->o_requests); n; ) {
2260 kfree(event); 3270 struct ceph_osd_request *req =
2261} 3271 rb_entry(n, struct ceph_osd_request, r_node);
2262 3272
2263static void get_event(struct ceph_osd_event *event) 3273 n = rb_next(n); /* cancel_linger_request() */
2264{
2265 kref_get(&event->kref);
2266}
2267 3274
2268void ceph_osdc_put_event(struct ceph_osd_event *event) 3275 if (!req->r_linger) {
2269{ 3276 if (!req->r_t.paused)
2270 kref_put(&event->kref, __release_event); 3277 send_request(req);
3278 } else {
3279 cancel_linger_request(req);
3280 }
3281 }
3282 for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
3283 struct ceph_osd_linger_request *lreq =
3284 rb_entry(n, struct ceph_osd_linger_request, node);
3285
3286 send_linger(lreq);
3287 }
2271} 3288}
2272EXPORT_SYMBOL(ceph_osdc_put_event);
2273 3289
2274static void __insert_event(struct ceph_osd_client *osdc, 3290/*
2275 struct ceph_osd_event *new) 3291 * If the osd connection drops, we need to resubmit all requests.
3292 */
3293static void osd_fault(struct ceph_connection *con)
2276{ 3294{
2277 struct rb_node **p = &osdc->event_tree.rb_node; 3295 struct ceph_osd *osd = con->private;
2278 struct rb_node *parent = NULL; 3296 struct ceph_osd_client *osdc = osd->o_osdc;
2279 struct ceph_osd_event *event = NULL;
2280 3297
2281 while (*p) { 3298 dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
2282 parent = *p; 3299
2283 event = rb_entry(parent, struct ceph_osd_event, node); 3300 down_write(&osdc->lock);
2284 if (new->cookie < event->cookie) 3301 if (!osd_registered(osd)) {
2285 p = &(*p)->rb_left; 3302 dout("%s osd%d unknown\n", __func__, osd->o_osd);
2286 else if (new->cookie > event->cookie) 3303 goto out_unlock;
2287 p = &(*p)->rb_right;
2288 else
2289 BUG();
2290 } 3304 }
2291 3305
2292 rb_link_node(&new->node, parent, p); 3306 if (!reopen_osd(osd))
2293 rb_insert_color(&new->node, &osdc->event_tree); 3307 kick_osd_requests(osd);
3308 maybe_request_map(osdc);
3309
3310out_unlock:
3311 up_write(&osdc->lock);
2294} 3312}
2295 3313
2296static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc, 3314/*
2297 u64 cookie) 3315 * Process osd watch notifications
3316 */
3317static void handle_watch_notify(struct ceph_osd_client *osdc,
3318 struct ceph_msg *msg)
2298{ 3319{
2299 struct rb_node **p = &osdc->event_tree.rb_node; 3320 void *p = msg->front.iov_base;
2300 struct rb_node *parent = NULL; 3321 void *const end = p + msg->front.iov_len;
2301 struct ceph_osd_event *event = NULL; 3322 struct ceph_osd_linger_request *lreq;
3323 struct linger_work *lwork;
3324 u8 proto_ver, opcode;
3325 u64 cookie, notify_id;
3326 u64 notifier_id = 0;
3327 s32 return_code = 0;
3328 void *payload = NULL;
3329 u32 payload_len = 0;
2302 3330
2303 while (*p) { 3331 ceph_decode_8_safe(&p, end, proto_ver, bad);
2304 parent = *p; 3332 ceph_decode_8_safe(&p, end, opcode, bad);
2305 event = rb_entry(parent, struct ceph_osd_event, node); 3333 ceph_decode_64_safe(&p, end, cookie, bad);
2306 if (cookie < event->cookie) 3334 p += 8; /* skip ver */
2307 p = &(*p)->rb_left; 3335 ceph_decode_64_safe(&p, end, notify_id, bad);
2308 else if (cookie > event->cookie) 3336
2309 p = &(*p)->rb_right; 3337 if (proto_ver >= 1) {
2310 else 3338 ceph_decode_32_safe(&p, end, payload_len, bad);
2311 return event; 3339 ceph_decode_need(&p, end, payload_len, bad);
3340 payload = p;
3341 p += payload_len;
2312 } 3342 }
2313 return NULL;
2314}
2315 3343
2316static void __remove_event(struct ceph_osd_event *event) 3344 if (le16_to_cpu(msg->hdr.version) >= 2)
2317{ 3345 ceph_decode_32_safe(&p, end, return_code, bad);
2318 struct ceph_osd_client *osdc = event->osdc;
2319 3346
2320 if (!RB_EMPTY_NODE(&event->node)) { 3347 if (le16_to_cpu(msg->hdr.version) >= 3)
2321 dout("__remove_event removed %p\n", event); 3348 ceph_decode_64_safe(&p, end, notifier_id, bad);
2322 rb_erase(&event->node, &osdc->event_tree); 3349
2323 ceph_osdc_put_event(event); 3350 down_read(&osdc->lock);
3351 lreq = lookup_linger_osdc(&osdc->linger_requests, cookie);
3352 if (!lreq) {
3353 dout("%s opcode %d cookie %llu dne\n", __func__, opcode,
3354 cookie);
3355 goto out_unlock_osdc;
3356 }
3357
3358 mutex_lock(&lreq->lock);
3359 dout("%s opcode %d cookie %llu lreq %p is_watch %d\n", __func__,
3360 opcode, cookie, lreq, lreq->is_watch);
3361 if (opcode == CEPH_WATCH_EVENT_DISCONNECT) {
3362 if (!lreq->last_error) {
3363 lreq->last_error = -ENOTCONN;
3364 queue_watch_error(lreq);
3365 }
3366 } else if (!lreq->is_watch) {
3367 /* CEPH_WATCH_EVENT_NOTIFY_COMPLETE */
3368 if (lreq->notify_id && lreq->notify_id != notify_id) {
3369 dout("lreq %p notify_id %llu != %llu, ignoring\n", lreq,
3370 lreq->notify_id, notify_id);
3371 } else if (!completion_done(&lreq->notify_finish_wait)) {
3372 struct ceph_msg_data *data =
3373 list_first_entry_or_null(&msg->data,
3374 struct ceph_msg_data,
3375 links);
3376
3377 if (data) {
3378 if (lreq->preply_pages) {
3379 WARN_ON(data->type !=
3380 CEPH_MSG_DATA_PAGES);
3381 *lreq->preply_pages = data->pages;
3382 *lreq->preply_len = data->length;
3383 } else {
3384 ceph_release_page_vector(data->pages,
3385 calc_pages_for(0, data->length));
3386 }
3387 }
3388 lreq->notify_finish_error = return_code;
3389 complete_all(&lreq->notify_finish_wait);
3390 }
2324 } else { 3391 } else {
2325 dout("__remove_event didn't remove %p\n", event); 3392 /* CEPH_WATCH_EVENT_NOTIFY */
3393 lwork = lwork_alloc(lreq, do_watch_notify);
3394 if (!lwork) {
3395 pr_err("failed to allocate notify-lwork\n");
3396 goto out_unlock_lreq;
3397 }
3398
3399 lwork->notify.notify_id = notify_id;
3400 lwork->notify.notifier_id = notifier_id;
3401 lwork->notify.payload = payload;
3402 lwork->notify.payload_len = payload_len;
3403 lwork->notify.msg = ceph_msg_get(msg);
3404 lwork_queue(lwork);
2326 } 3405 }
3406
3407out_unlock_lreq:
3408 mutex_unlock(&lreq->lock);
3409out_unlock_osdc:
3410 up_read(&osdc->lock);
3411 return;
3412
3413bad:
3414 pr_err("osdc handle_watch_notify corrupt msg\n");
2327} 3415}
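
handle_watch_notify() distinguishes three message kinds: a DISCONNECT error against a watch, the completion of a notify this client initiated (is_watch is false), and an incoming notify event that gets queued as lwork. As an illustrative classification (hypothetical names):

#include <stdbool.h>

enum watch_msg { MSG_DISCONNECT, MSG_NOTIFY_COMPLETE, MSG_NOTIFY_EVENT };

static enum watch_msg classify_watch_msg(bool is_disconnect, bool is_watch)
{
	if (is_disconnect)
		return MSG_DISCONNECT;		/* record -ENOTCONN for caller */
	if (!is_watch)
		return MSG_NOTIFY_COMPLETE;	/* our own notify finished */
	return MSG_NOTIFY_EVENT;		/* queue work for the watch cb */
}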
2328 3416
2329int ceph_osdc_create_event(struct ceph_osd_client *osdc, 3417/*
2330 void (*event_cb)(u64, u64, u8, void *), 3418 * Register request, send initial attempt.
2331 void *data, struct ceph_osd_event **pevent) 3419 */
3420int ceph_osdc_start_request(struct ceph_osd_client *osdc,
3421 struct ceph_osd_request *req,
3422 bool nofail)
2332{ 3423{
2333 struct ceph_osd_event *event; 3424 down_read(&osdc->lock);
2334 3425 submit_request(req, false);
2335 event = kmalloc(sizeof(*event), GFP_NOIO); 3426 up_read(&osdc->lock);
2336 if (!event)
2337 return -ENOMEM;
2338 3427
2339 dout("create_event %p\n", event);
2340 event->cb = event_cb;
2341 event->one_shot = 0;
2342 event->data = data;
2343 event->osdc = osdc;
2344 INIT_LIST_HEAD(&event->osd_node);
2345 RB_CLEAR_NODE(&event->node);
2346 kref_init(&event->kref); /* one ref for us */
2347 kref_get(&event->kref); /* one ref for the caller */
2348
2349 spin_lock(&osdc->event_lock);
2350 event->cookie = ++osdc->event_count;
2351 __insert_event(osdc, event);
2352 spin_unlock(&osdc->event_lock);
2353
2354 *pevent = event;
2355 return 0; 3428 return 0;
2356} 3429}
2357EXPORT_SYMBOL(ceph_osdc_create_event); 3430EXPORT_SYMBOL(ceph_osdc_start_request);
2358 3431
2359void ceph_osdc_cancel_event(struct ceph_osd_event *event) 3432/*
3433 * Unregister a registered request. The request is not completed (i.e.
3434 * no callbacks or wakeups) - higher layers are supposed to know what
3435 * they are canceling.
3436 */
3437void ceph_osdc_cancel_request(struct ceph_osd_request *req)
2360{ 3438{
2361 struct ceph_osd_client *osdc = event->osdc; 3439 struct ceph_osd_client *osdc = req->r_osdc;
2362 3440
2363 dout("cancel_event %p\n", event); 3441 down_write(&osdc->lock);
2364 spin_lock(&osdc->event_lock); 3442 if (req->r_osd)
2365 __remove_event(event); 3443 cancel_request(req);
2366 spin_unlock(&osdc->event_lock); 3444 up_write(&osdc->lock);
2367 ceph_osdc_put_event(event); /* caller's */
2368} 3445}
2369EXPORT_SYMBOL(ceph_osdc_cancel_event); 3446EXPORT_SYMBOL(ceph_osdc_cancel_request);
2370
2371 3447
2372static void do_event_work(struct work_struct *work) 3448/*
3449 * @timeout: in jiffies, 0 means "wait forever"
3450 */
3451static int wait_request_timeout(struct ceph_osd_request *req,
3452 unsigned long timeout)
2373{ 3453{
2374 struct ceph_osd_event_work *event_work = 3454 long left;
2375 container_of(work, struct ceph_osd_event_work, work);
2376 struct ceph_osd_event *event = event_work->event;
2377 u64 ver = event_work->ver;
2378 u64 notify_id = event_work->notify_id;
2379 u8 opcode = event_work->opcode;
2380 3455
2381 dout("do_event_work completing %p\n", event); 3456 dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
2382 event->cb(ver, notify_id, opcode, event->data); 3457 left = wait_for_completion_killable_timeout(&req->r_completion,
2383 dout("do_event_work completed %p\n", event); 3458 ceph_timeout_jiffies(timeout));
2384 ceph_osdc_put_event(event); 3459 if (left <= 0) {
2385 kfree(event_work); 3460 left = left ?: -ETIMEDOUT;
3461 ceph_osdc_cancel_request(req);
3462
 3463 /* kludge - need to wake ceph_osdc_sync() */
3464 complete_all(&req->r_safe_completion);
3465 } else {
3466 left = req->r_result; /* completed */
3467 }
3468
3469 return left;
2386} 3470}
2387 3471
3472/*
3473 * wait for a request to complete
3474 */
3475int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
3476 struct ceph_osd_request *req)
3477{
3478 return wait_request_timeout(req, 0);
3479}
3480EXPORT_SYMBOL(ceph_osdc_wait_request);
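
The usual caller pattern for the pair above is submit-then-wait. A sketch of a hypothetical helper built on the real entry points (request construction and error paths abbreviated):

/* Assumes req is a fully built ceph_osd_request; drops the caller's
 * ref when done. */
static int submit_and_wait(struct ceph_osd_client *osdc,
			   struct ceph_osd_request *req)
{
	int ret;

	ret = ceph_osdc_start_request(osdc, req, false);
	if (!ret)
		ret = ceph_osdc_wait_request(osdc, req);  /* r_result or error */

	ceph_osdc_put_request(req);
	return ret;
}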
2388 3481
2389/* 3482/*
2390 * Process osd watch notifications 3483 * sync - wait for all in-flight requests to flush. avoid starvation.
2391 */ 3484 */
2392static void handle_watch_notify(struct ceph_osd_client *osdc, 3485void ceph_osdc_sync(struct ceph_osd_client *osdc)
2393 struct ceph_msg *msg)
2394{ 3486{
2395 void *p, *end; 3487 struct rb_node *n, *p;
2396 u8 proto_ver; 3488 u64 last_tid = atomic64_read(&osdc->last_tid);
2397 u64 cookie, ver, notify_id;
2398 u8 opcode;
2399 struct ceph_osd_event *event;
2400 struct ceph_osd_event_work *event_work;
2401 3489
2402 p = msg->front.iov_base; 3490again:
2403 end = p + msg->front.iov_len; 3491 down_read(&osdc->lock);
3492 for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
3493 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
2404 3494
2405 ceph_decode_8_safe(&p, end, proto_ver, bad); 3495 mutex_lock(&osd->lock);
2406 ceph_decode_8_safe(&p, end, opcode, bad); 3496 for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
2407 ceph_decode_64_safe(&p, end, cookie, bad); 3497 struct ceph_osd_request *req =
2408 ceph_decode_64_safe(&p, end, ver, bad); 3498 rb_entry(p, struct ceph_osd_request, r_node);
2409 ceph_decode_64_safe(&p, end, notify_id, bad); 3499
3500 if (req->r_tid > last_tid)
3501 break;
3502
3503 if (!(req->r_flags & CEPH_OSD_FLAG_WRITE))
3504 continue;
2410 3505
2411 spin_lock(&osdc->event_lock); 3506 ceph_osdc_get_request(req);
2412 event = __find_event(osdc, cookie); 3507 mutex_unlock(&osd->lock);
2413 if (event) { 3508 up_read(&osdc->lock);
2414 BUG_ON(event->one_shot); 3509 dout("%s waiting on req %p tid %llu last_tid %llu\n",
2415 get_event(event); 3510 __func__, req, req->r_tid, last_tid);
2416 } 3511 wait_for_completion(&req->r_safe_completion);
2417 spin_unlock(&osdc->event_lock); 3512 ceph_osdc_put_request(req);
2418 dout("handle_watch_notify cookie %lld ver %lld event %p\n", 3513 goto again;
2419 cookie, ver, event);
2420 if (event) {
2421 event_work = kmalloc(sizeof(*event_work), GFP_NOIO);
2422 if (!event_work) {
2423 pr_err("couldn't allocate event_work\n");
2424 ceph_osdc_put_event(event);
2425 return;
2426 } 3514 }
2427 INIT_WORK(&event_work->work, do_event_work);
2428 event_work->event = event;
2429 event_work->ver = ver;
2430 event_work->notify_id = notify_id;
2431 event_work->opcode = opcode;
2432 3515
2433 queue_work(osdc->notify_wq, &event_work->work); 3516 mutex_unlock(&osd->lock);
2434 } 3517 }
2435 3518
2436 return; 3519 up_read(&osdc->lock);
3520 dout("%s done last_tid %llu\n", __func__, last_tid);
3521}
3522EXPORT_SYMBOL(ceph_osdc_sync);
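A minimal usage sketch (the flush point is hypothetical). Only writes with tids at or below last_tid are waited on via r_safe_completion; requests submitted after the call begins are ignored, which is what avoids starvation:

static void flush_all_writes(struct ceph_client *client)
{
	/* returns once every write in flight at this instant is safe */
	ceph_osdc_sync(&client->osdc);
}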
2437 3523
2438bad: 3524static struct ceph_osd_request *
2439 pr_err("osdc handle_watch_notify corrupt msg\n"); 3525alloc_linger_request(struct ceph_osd_linger_request *lreq)
3526{
3527 struct ceph_osd_request *req;
3528
3529 req = ceph_osdc_alloc_request(lreq->osdc, NULL, 1, false, GFP_NOIO);
3530 if (!req)
3531 return NULL;
3532
3533 ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
3534 ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
3535
3536 if (ceph_osdc_alloc_messages(req, GFP_NOIO)) {
3537 ceph_osdc_put_request(req);
3538 return NULL;
3539 }
3540
3541 return req;
2440} 3542}
2441 3543
2442/* 3544/*
2443 * build new request AND message 3545 * Returns a handle, caller owns a ref.
2444 *
2445 */ 3546 */
2446void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, 3547struct ceph_osd_linger_request *
2447 struct ceph_snap_context *snapc, u64 snap_id, 3548ceph_osdc_watch(struct ceph_osd_client *osdc,
2448 struct timespec *mtime) 3549 struct ceph_object_id *oid,
2449{ 3550 struct ceph_object_locator *oloc,
2450 struct ceph_msg *msg = req->r_request; 3551 rados_watchcb2_t wcb,
2451 void *p; 3552 rados_watcherrcb_t errcb,
2452 size_t msg_size; 3553 void *data)
2453 int flags = req->r_flags; 3554{
2454 u64 data_len; 3555 struct ceph_osd_linger_request *lreq;
2455 unsigned int i; 3556 int ret;
2456
2457 req->r_snapid = snap_id;
2458 req->r_snapc = ceph_get_snap_context(snapc);
2459
2460 /* encode request */
2461 msg->hdr.version = cpu_to_le16(4);
2462
2463 p = msg->front.iov_base;
2464 ceph_encode_32(&p, 1); /* client_inc is always 1 */
2465 req->r_request_osdmap_epoch = p;
2466 p += 4;
2467 req->r_request_flags = p;
2468 p += 4;
2469 if (req->r_flags & CEPH_OSD_FLAG_WRITE)
2470 ceph_encode_timespec(p, mtime);
2471 p += sizeof(struct ceph_timespec);
2472 req->r_request_reassert_version = p;
2473 p += sizeof(struct ceph_eversion); /* will get filled in */
2474
2475 /* oloc */
2476 ceph_encode_8(&p, 4);
2477 ceph_encode_8(&p, 4);
2478 ceph_encode_32(&p, 8 + 4 + 4);
2479 req->r_request_pool = p;
2480 p += 8;
2481 ceph_encode_32(&p, -1); /* preferred */
2482 ceph_encode_32(&p, 0); /* key len */
2483 3557
2484 ceph_encode_8(&p, 1); 3558 lreq = linger_alloc(osdc);
2485 req->r_request_pgid = p; 3559 if (!lreq)
2486 p += 8 + 4; 3560 return ERR_PTR(-ENOMEM);
2487 ceph_encode_32(&p, -1); /* preferred */
2488 3561
2489 /* oid */ 3562 lreq->is_watch = true;
2490 ceph_encode_32(&p, req->r_base_oid.name_len); 3563 lreq->wcb = wcb;
2491 memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len); 3564 lreq->errcb = errcb;
2492 dout("oid '%.*s' len %d\n", req->r_base_oid.name_len, 3565 lreq->data = data;
2493 req->r_base_oid.name, req->r_base_oid.name_len); 3566 lreq->watch_valid_thru = jiffies;
2494 p += req->r_base_oid.name_len; 3567
2495 3568 ceph_oid_copy(&lreq->t.base_oid, oid);
2496 /* ops--can imply data */ 3569 ceph_oloc_copy(&lreq->t.base_oloc, oloc);
2497 ceph_encode_16(&p, (u16)req->r_num_ops); 3570 lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
2498 data_len = 0; 3571 lreq->mtime = CURRENT_TIME;
2499 for (i = 0; i < req->r_num_ops; i++) { 3572
2500 data_len += osd_req_encode_op(req, p, i); 3573 lreq->reg_req = alloc_linger_request(lreq);
2501 p += sizeof(struct ceph_osd_op); 3574 if (!lreq->reg_req) {
3575 ret = -ENOMEM;
3576 goto err_put_lreq;
2502 } 3577 }
2503 3578
2504 /* snaps */ 3579 lreq->ping_req = alloc_linger_request(lreq);
2505 ceph_encode_64(&p, req->r_snapid); 3580 if (!lreq->ping_req) {
2506 ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0); 3581 ret = -ENOMEM;
2507 ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0); 3582 goto err_put_lreq;
2508 if (req->r_snapc) {
2509 for (i = 0; i < snapc->num_snaps; i++) {
2510 ceph_encode_64(&p, req->r_snapc->snaps[i]);
2511 }
2512 } 3583 }
2513 3584
2514 req->r_request_attempts = p; 3585 down_write(&osdc->lock);
2515 p += 4; 3586 linger_register(lreq); /* before osd_req_op_* */
2516 3587 osd_req_op_watch_init(lreq->reg_req, 0, lreq->linger_id,
2517 /* data */ 3588 CEPH_OSD_WATCH_OP_WATCH);
2518 if (flags & CEPH_OSD_FLAG_WRITE) { 3589 osd_req_op_watch_init(lreq->ping_req, 0, lreq->linger_id,
2519 u16 data_off; 3590 CEPH_OSD_WATCH_OP_PING);
2520 3591 linger_submit(lreq);
2521 /* 3592 up_write(&osdc->lock);
2522 * The header "data_off" is a hint to the receiver 3593
2523 * allowing it to align received data into its 3594 ret = linger_reg_commit_wait(lreq);
2524 * buffers such that there's no need to re-copy 3595 if (ret) {
2525 * it before writing it to disk (direct I/O). 3596 linger_cancel(lreq);
2526 */ 3597 goto err_put_lreq;
2527 data_off = (u16) (off & 0xffff);
2528 req->r_request->hdr.data_off = cpu_to_le16(data_off);
2529 } 3598 }
2530 req->r_request->hdr.data_len = cpu_to_le32(data_len);
2531 3599
2532 BUG_ON(p > msg->front.iov_base + msg->front.iov_len); 3600 return lreq;
2533 msg_size = p - msg->front.iov_base;
2534 msg->front.iov_len = msg_size;
2535 msg->hdr.front_len = cpu_to_le32(msg_size);
2536 3601
2537 dout("build_request msg_size was %d\n", (int)msg_size); 3602err_put_lreq:
3603 linger_put(lreq);
3604 return ERR_PTR(ret);
2538} 3605}
2539EXPORT_SYMBOL(ceph_osdc_build_request); 3606EXPORT_SYMBOL(ceph_osdc_watch);
2540 3607
2541/* 3608/*
2542 * Register request, send initial attempt. 3609 * Releases a ref.
3610 *
3611 * Times out after mount_timeout to preserve rbd unmap behaviour
3612 * introduced in 2894e1d76974 ("rbd: timeout watch teardown on unmap
3613 * with mount_timeout").
2543 */ 3614 */
2544int ceph_osdc_start_request(struct ceph_osd_client *osdc, 3615int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
2545 struct ceph_osd_request *req, 3616 struct ceph_osd_linger_request *lreq)
2546 bool nofail)
2547{ 3617{
2548 int rc; 3618 struct ceph_options *opts = osdc->client->options;
3619 struct ceph_osd_request *req;
3620 int ret;
2549 3621
2550 down_read(&osdc->map_sem); 3622 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
2551 mutex_lock(&osdc->request_mutex); 3623 if (!req)
3624 return -ENOMEM;
3625
3626 ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
3627 ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
3628 req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
3629 req->r_mtime = CURRENT_TIME;
3630 osd_req_op_watch_init(req, 0, lreq->linger_id,
3631 CEPH_OSD_WATCH_OP_UNWATCH);
2552 3632
2553 rc = __ceph_osdc_start_request(osdc, req, nofail); 3633 ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
3634 if (ret)
3635 goto out_put_req;
2554 3636
2555 mutex_unlock(&osdc->request_mutex); 3637 ceph_osdc_start_request(osdc, req, false);
2556 up_read(&osdc->map_sem); 3638 linger_cancel(lreq);
3639 linger_put(lreq);
3640 ret = wait_request_timeout(req, opts->mount_timeout);
2557 3641
2558 return rc; 3642out_put_req:
3643 ceph_osdc_put_request(req);
3644 return ret;
2559} 3645}
2560EXPORT_SYMBOL(ceph_osdc_start_request); 3646EXPORT_SYMBOL(ceph_osdc_unwatch);
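Together, ceph_osdc_watch() and ceph_osdc_unwatch() replace the old create_event/cancel_event pair. A hypothetical lifecycle, loosely modelled on rbd's header watch; callback names and bodies are illustrative:

static void my_watch_cb(void *arg, u64 notify_id, u64 cookie,
			u64 notifier_id, void *data, size_t data_len)
{
	/* delivered from osdc->notify_wq; usually answered with
	 * ceph_osdc_notify_ack() */
}

static void my_watch_errcb(void *arg, u64 cookie, int err)
{
	/* watch is no longer valid; re-watch or tear down */
}

static int start_watch(struct ceph_osd_client *osdc,
		       struct ceph_object_id *oid,
		       struct ceph_object_locator *oloc,
		       struct ceph_osd_linger_request **phandle)
{
	struct ceph_osd_linger_request *handle;

	handle = ceph_osdc_watch(osdc, oid, oloc, my_watch_cb,
				 my_watch_errcb, NULL);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	*phandle = handle;	/* owns a ref until ceph_osdc_unwatch() */
	return 0;
}

The matching teardown is a single ceph_osdc_unwatch(osdc, *phandle), which drops that ref and, per the comment above, bounds the wait by mount_timeout.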
2561 3647
2562/* 3648static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which,
2563 * Unregister a registered request. The request is not completed (i.e. 3649 u64 notify_id, u64 cookie, void *payload,
2564 * no callbacks or wakeups) - higher layers are supposed to know what 3650 size_t payload_len)
2565 * they are canceling.
2566 */
2567void ceph_osdc_cancel_request(struct ceph_osd_request *req)
2568{ 3651{
2569 struct ceph_osd_client *osdc = req->r_osdc; 3652 struct ceph_osd_req_op *op;
3653 struct ceph_pagelist *pl;
3654 int ret;
2570 3655
2571 mutex_lock(&osdc->request_mutex); 3656 op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0);
2572 if (req->r_linger) 3657
2573 __unregister_linger_request(osdc, req); 3658 pl = kmalloc(sizeof(*pl), GFP_NOIO);
2574 __unregister_request(osdc, req); 3659 if (!pl)
2575 mutex_unlock(&osdc->request_mutex); 3660 return -ENOMEM;
3661
3662 ceph_pagelist_init(pl);
3663 ret = ceph_pagelist_encode_64(pl, notify_id);
3664 ret |= ceph_pagelist_encode_64(pl, cookie);
3665 if (payload) {
3666 ret |= ceph_pagelist_encode_32(pl, payload_len);
3667 ret |= ceph_pagelist_append(pl, payload, payload_len);
3668 } else {
3669 ret |= ceph_pagelist_encode_32(pl, 0);
3670 }
3671 if (ret) {
3672 ceph_pagelist_release(pl);
3673 return -ENOMEM;
3674 }
2576 3675
2577 dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid); 3676 ceph_osd_data_pagelist_init(&op->notify_ack.request_data, pl);
3677 op->indata_len = pl->length;
3678 return 0;
2578} 3679}
2579EXPORT_SYMBOL(ceph_osdc_cancel_request);
2580 3680
2581/* 3681int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
2582 * wait for a request to complete 3682 struct ceph_object_id *oid,
2583 */ 3683 struct ceph_object_locator *oloc,
2584int ceph_osdc_wait_request(struct ceph_osd_client *osdc, 3684 u64 notify_id,
2585 struct ceph_osd_request *req) 3685 u64 cookie,
3686 void *payload,
3687 size_t payload_len)
2586{ 3688{
2587 int rc; 3689 struct ceph_osd_request *req;
3690 int ret;
2588 3691
2589 dout("%s %p tid %llu\n", __func__, req, req->r_tid); 3692 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
3693 if (!req)
3694 return -ENOMEM;
2590 3695
2591 rc = wait_for_completion_interruptible(&req->r_completion); 3696 ceph_oid_copy(&req->r_base_oid, oid);
2592 if (rc < 0) { 3697 ceph_oloc_copy(&req->r_base_oloc, oloc);
2593 dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid); 3698 req->r_flags = CEPH_OSD_FLAG_READ;
2594 ceph_osdc_cancel_request(req); 3699
2595 complete_request(req); 3700 ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
2596 return rc; 3701 if (ret)
3702 goto out_put_req;
3703
3704 ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload,
3705 payload_len);
3706 if (ret)
3707 goto out_put_req;
3708
3709 ceph_osdc_start_request(osdc, req, false);
3710 ret = ceph_osdc_wait_request(osdc, req);
3711
3712out_put_req:
3713 ceph_osdc_put_request(req);
3714 return ret;
3715}
3716EXPORT_SYMBOL(ceph_osdc_notify_ack);
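A watcher typically acks from its notify callback; a sketch with an empty payload (names are illustrative):

static void ack_notify(struct ceph_osd_client *osdc,
		       struct ceph_object_id *oid,
		       struct ceph_object_locator *oloc,
		       u64 notify_id, u64 cookie)
{
	int ret;

	/* empty payload - the notifier only waits for the ack */
	ret = ceph_osdc_notify_ack(osdc, oid, oloc, notify_id, cookie,
				   NULL, 0);
	if (ret)
		pr_warn("notify_ack failed: %d\n", ret);
}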
3717
3718static int osd_req_op_notify_init(struct ceph_osd_request *req, int which,
3719 u64 cookie, u32 prot_ver, u32 timeout,
3720 void *payload, size_t payload_len)
3721{
3722 struct ceph_osd_req_op *op;
3723 struct ceph_pagelist *pl;
3724 int ret;
3725
3726 op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0);
3727 op->notify.cookie = cookie;
3728
3729 pl = kmalloc(sizeof(*pl), GFP_NOIO);
3730 if (!pl)
3731 return -ENOMEM;
3732
3733 ceph_pagelist_init(pl);
3734 ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */
3735 ret |= ceph_pagelist_encode_32(pl, timeout);
3736 ret |= ceph_pagelist_encode_32(pl, payload_len);
3737 ret |= ceph_pagelist_append(pl, payload, payload_len);
3738 if (ret) {
3739 ceph_pagelist_release(pl);
3740 return -ENOMEM;
2597 } 3741 }
2598 3742
2599 dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid, 3743 ceph_osd_data_pagelist_init(&op->notify.request_data, pl);
2600 req->r_result); 3744 op->indata_len = pl->length;
2601 return req->r_result; 3745 return 0;
2602} 3746}
2603EXPORT_SYMBOL(ceph_osdc_wait_request);
2604 3747
2605/* 3748/*
2606 * sync - wait for all in-flight requests to flush. avoid starvation. 3749 * @timeout: in seconds
3750 *
3751 * @preply_{pages,len} are initialized both on success and error.
3752 * The caller is responsible for:
3753 *
3754 * ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len))
2607 */ 3755 */
2608void ceph_osdc_sync(struct ceph_osd_client *osdc) 3756int ceph_osdc_notify(struct ceph_osd_client *osdc,
3757 struct ceph_object_id *oid,
3758 struct ceph_object_locator *oloc,
3759 void *payload,
3760 size_t payload_len,
3761 u32 timeout,
3762 struct page ***preply_pages,
3763 size_t *preply_len)
2609{ 3764{
2610 struct ceph_osd_request *req; 3765 struct ceph_osd_linger_request *lreq;
2611 u64 last_tid, next_tid = 0; 3766 struct page **pages;
3767 int ret;
2612 3768
2613 mutex_lock(&osdc->request_mutex); 3769 WARN_ON(!timeout);
2614 last_tid = osdc->last_tid; 3770 if (preply_pages) {
2615 while (1) { 3771 *preply_pages = NULL;
2616 req = __lookup_request_ge(osdc, next_tid); 3772 *preply_len = 0;
2617 if (!req) 3773 }
2618 break;
2619 if (req->r_tid > last_tid)
2620 break;
2621 3774
2622 next_tid = req->r_tid + 1; 3775 lreq = linger_alloc(osdc);
2623 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0) 3776 if (!lreq)
2624 continue; 3777 return -ENOMEM;
2625 3778
2626 ceph_osdc_get_request(req); 3779 lreq->preply_pages = preply_pages;
2627 mutex_unlock(&osdc->request_mutex); 3780 lreq->preply_len = preply_len;
2628 dout("sync waiting on tid %llu (last is %llu)\n", 3781
2629 req->r_tid, last_tid); 3782 ceph_oid_copy(&lreq->t.base_oid, oid);
2630 wait_for_completion(&req->r_safe_completion); 3783 ceph_oloc_copy(&lreq->t.base_oloc, oloc);
2631 mutex_lock(&osdc->request_mutex); 3784 lreq->t.flags = CEPH_OSD_FLAG_READ;
2632 ceph_osdc_put_request(req); 3785
3786 lreq->reg_req = alloc_linger_request(lreq);
3787 if (!lreq->reg_req) {
3788 ret = -ENOMEM;
3789 goto out_put_lreq;
2633 } 3790 }
2634 mutex_unlock(&osdc->request_mutex); 3791
2635 dout("sync done (thru tid %llu)\n", last_tid); 3792 /* for notify_id */
3793 pages = ceph_alloc_page_vector(1, GFP_NOIO);
3794 if (IS_ERR(pages)) {
3795 ret = PTR_ERR(pages);
3796 goto out_put_lreq;
3797 }
3798
3799 down_write(&osdc->lock);
3800 linger_register(lreq); /* before osd_req_op_* */
3801 ret = osd_req_op_notify_init(lreq->reg_req, 0, lreq->linger_id, 1,
3802 timeout, payload, payload_len);
3803 if (ret) {
3804 linger_unregister(lreq);
3805 up_write(&osdc->lock);
3806 ceph_release_page_vector(pages, 1);
3807 goto out_put_lreq;
3808 }
3809 ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify,
3810 response_data),
3811 pages, PAGE_SIZE, 0, false, true);
3812 linger_submit(lreq);
3813 up_write(&osdc->lock);
3814
3815 ret = linger_reg_commit_wait(lreq);
3816 if (!ret)
3817 ret = linger_notify_finish_wait(lreq);
3818 else
3819 dout("lreq %p failed to initiate notify %d\n", lreq, ret);
3820
3821 linger_cancel(lreq);
3822out_put_lreq:
3823 linger_put(lreq);
3824 return ret;
3825}
3826EXPORT_SYMBOL(ceph_osdc_notify);
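A sketch of the reply-page contract spelled out above (the caller and the 5-second timeout are illustrative):

static int send_notify(struct ceph_osd_client *osdc,
		       struct ceph_object_id *oid,
		       struct ceph_object_locator *oloc,
		       void *payload, size_t payload_len)
{
	struct page **reply_pages;
	size_t reply_len;
	int ret;

	ret = ceph_osdc_notify(osdc, oid, oloc, payload, payload_len,
			       5 /* seconds */, &reply_pages, &reply_len);

	/* initialized on success and on error - always release */
	if (reply_pages)
		ceph_release_page_vector(reply_pages,
					 calc_pages_for(0, reply_len));
	return ret;
}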
3827
3828/*
3829 * Return the number of milliseconds since the watch was last
3830 * confirmed, or an error. If there is an error, the watch is no
3831 * longer valid and should be destroyed with ceph_osdc_unwatch().
3832 */
3833int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
3834 struct ceph_osd_linger_request *lreq)
3835{
3836 unsigned long stamp, age;
3837 int ret;
3838
3839 down_read(&osdc->lock);
3840 mutex_lock(&lreq->lock);
3841 stamp = lreq->watch_valid_thru;
3842 if (!list_empty(&lreq->pending_lworks)) {
3843 struct linger_work *lwork =
3844 list_first_entry(&lreq->pending_lworks,
3845 struct linger_work,
3846 pending_item);
3847
3848 if (time_before(lwork->queued_stamp, stamp))
3849 stamp = lwork->queued_stamp;
3850 }
3851 age = jiffies - stamp;
3852 dout("%s lreq %p linger_id %llu age %lu last_error %d\n", __func__,
3853 lreq, lreq->linger_id, age, lreq->last_error);
3854 /* we are truncating to msecs, so return a safe upper bound */
3855 ret = lreq->last_error ?: 1 + jiffies_to_msecs(age);
3856
3857 mutex_unlock(&lreq->lock);
3858 up_read(&osdc->lock);
3859 return ret;
2636} 3860}
2637EXPORT_SYMBOL(ceph_osdc_sync);
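A hypothetical periodic health check built on ceph_osdc_watch_check() (the 30-second threshold is illustrative):

static bool watch_is_fresh(struct ceph_osd_client *osdc,
			   struct ceph_osd_linger_request *handle)
{
	int ret = ceph_osdc_watch_check(osdc, handle);

	/* negative: watch is dead; positive: ms since last confirmation */
	return ret >= 0 && ret < 30 * MSEC_PER_SEC;
}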
2638 3861
2639/* 3862/*
2640 * Call all pending notify callbacks - for use after a watch is 3863 * Call all pending notify callbacks - for use after a watch is
@@ -2646,6 +3869,13 @@ void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
2646} 3869}
2647EXPORT_SYMBOL(ceph_osdc_flush_notifies); 3870EXPORT_SYMBOL(ceph_osdc_flush_notifies);
2648 3871
3872void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc)
3873{
3874 down_read(&osdc->lock);
3875 maybe_request_map(osdc);
3876 up_read(&osdc->lock);
3877}
3878EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
2649 3879
2650/* 3880/*
2651 * init, shutdown 3881 * init, shutdown
@@ -2656,43 +3886,35 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
2656 3886
2657 dout("init\n"); 3887 dout("init\n");
2658 osdc->client = client; 3888 osdc->client = client;
2659 osdc->osdmap = NULL; 3889 init_rwsem(&osdc->lock);
2660 init_rwsem(&osdc->map_sem);
2661 init_completion(&osdc->map_waiters);
2662 osdc->last_requested_map = 0;
2663 mutex_init(&osdc->request_mutex);
2664 osdc->last_tid = 0;
2665 osdc->osds = RB_ROOT; 3890 osdc->osds = RB_ROOT;
2666 INIT_LIST_HEAD(&osdc->osd_lru); 3891 INIT_LIST_HEAD(&osdc->osd_lru);
2667 osdc->requests = RB_ROOT; 3892 spin_lock_init(&osdc->osd_lru_lock);
2668 INIT_LIST_HEAD(&osdc->req_lru); 3893 osd_init(&osdc->homeless_osd);
2669 INIT_LIST_HEAD(&osdc->req_unsent); 3894 osdc->homeless_osd.o_osdc = osdc;
2670 INIT_LIST_HEAD(&osdc->req_notarget); 3895 osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD;
2671 INIT_LIST_HEAD(&osdc->req_linger); 3896 osdc->linger_requests = RB_ROOT;
2672 osdc->num_requests = 0; 3897 osdc->map_checks = RB_ROOT;
3898 osdc->linger_map_checks = RB_ROOT;
2673 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); 3899 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
2674 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); 3900 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
2675 spin_lock_init(&osdc->event_lock);
2676 osdc->event_tree = RB_ROOT;
2677 osdc->event_count = 0;
2678
2679 schedule_delayed_work(&osdc->osds_timeout_work,
2680 round_jiffies_relative(osdc->client->options->osd_idle_ttl));
2681 3901
2682 err = -ENOMEM; 3902 err = -ENOMEM;
3903 osdc->osdmap = ceph_osdmap_alloc();
3904 if (!osdc->osdmap)
3905 goto out;
3906
2683 osdc->req_mempool = mempool_create_slab_pool(10, 3907 osdc->req_mempool = mempool_create_slab_pool(10,
2684 ceph_osd_request_cache); 3908 ceph_osd_request_cache);
2685 if (!osdc->req_mempool) 3909 if (!osdc->req_mempool)
2686 goto out; 3910 goto out_map;
2687 3911
2688 err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP, 3912 err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
2689 OSD_OP_FRONT_LEN, 10, true, 3913 PAGE_SIZE, 10, true, "osd_op");
2690 "osd_op");
2691 if (err < 0) 3914 if (err < 0)
2692 goto out_mempool; 3915 goto out_mempool;
2693 err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY, 3916 err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
2694 OSD_OPREPLY_FRONT_LEN, 10, true, 3917 PAGE_SIZE, 10, true, "osd_op_reply");
2695 "osd_op_reply");
2696 if (err < 0) 3918 if (err < 0)
2697 goto out_msgpool; 3919 goto out_msgpool;
2698 3920
@@ -2701,6 +3923,11 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
2701 if (!osdc->notify_wq) 3923 if (!osdc->notify_wq)
2702 goto out_msgpool_reply; 3924 goto out_msgpool_reply;
2703 3925
3926 schedule_delayed_work(&osdc->timeout_work,
3927 osdc->client->options->osd_keepalive_timeout);
3928 schedule_delayed_work(&osdc->osds_timeout_work,
3929 round_jiffies_relative(osdc->client->options->osd_idle_ttl));
3930
2704 return 0; 3931 return 0;
2705 3932
2706out_msgpool_reply: 3933out_msgpool_reply:
@@ -2709,6 +3936,8 @@ out_msgpool:
2709 ceph_msgpool_destroy(&osdc->msgpool_op); 3936 ceph_msgpool_destroy(&osdc->msgpool_op);
2710out_mempool: 3937out_mempool:
2711 mempool_destroy(osdc->req_mempool); 3938 mempool_destroy(osdc->req_mempool);
3939out_map:
3940 ceph_osdmap_destroy(osdc->osdmap);
2712out: 3941out:
2713 return err; 3942 return err;
2714} 3943}
@@ -2719,11 +3948,25 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
2719 destroy_workqueue(osdc->notify_wq); 3948 destroy_workqueue(osdc->notify_wq);
2720 cancel_delayed_work_sync(&osdc->timeout_work); 3949 cancel_delayed_work_sync(&osdc->timeout_work);
2721 cancel_delayed_work_sync(&osdc->osds_timeout_work); 3950 cancel_delayed_work_sync(&osdc->osds_timeout_work);
2722 if (osdc->osdmap) { 3951
2723 ceph_osdmap_destroy(osdc->osdmap); 3952 down_write(&osdc->lock);
2724 osdc->osdmap = NULL; 3953 while (!RB_EMPTY_ROOT(&osdc->osds)) {
3954 struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
3955 struct ceph_osd, o_node);
3956 close_osd(osd);
2725 } 3957 }
2726 remove_all_osds(osdc); 3958 up_write(&osdc->lock);
3959 WARN_ON(atomic_read(&osdc->homeless_osd.o_ref) != 1);
3960 osd_cleanup(&osdc->homeless_osd);
3961
3962 WARN_ON(!list_empty(&osdc->osd_lru));
3963 WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_requests));
3964 WARN_ON(!RB_EMPTY_ROOT(&osdc->map_checks));
3965 WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_map_checks));
3966 WARN_ON(atomic_read(&osdc->num_requests));
3967 WARN_ON(atomic_read(&osdc->num_homeless));
3968
3969 ceph_osdmap_destroy(osdc->osdmap);
2727 mempool_destroy(osdc->req_mempool); 3970 mempool_destroy(osdc->req_mempool);
2728 ceph_msgpool_destroy(&osdc->msgpool_op); 3971 ceph_msgpool_destroy(&osdc->msgpool_op);
2729 ceph_msgpool_destroy(&osdc->msgpool_op_reply); 3972 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
@@ -2752,15 +3995,12 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
2752 return PTR_ERR(req); 3995 return PTR_ERR(req);
2753 3996
2754 /* it may be a short read due to an object boundary */ 3997 /* it may be a short read due to an object boundary */
2755
2756 osd_req_op_extent_osd_data_pages(req, 0, 3998 osd_req_op_extent_osd_data_pages(req, 0,
2757 pages, *plen, page_align, false, false); 3999 pages, *plen, page_align, false, false);
2758 4000
2759 dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", 4001 dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
2760 off, *plen, *plen, page_align); 4002 off, *plen, *plen, page_align);
2761 4003
2762 ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
2763
2764 rc = ceph_osdc_start_request(osdc, req, false); 4004 rc = ceph_osdc_start_request(osdc, req, false);
2765 if (!rc) 4005 if (!rc)
2766 rc = ceph_osdc_wait_request(osdc, req); 4006 rc = ceph_osdc_wait_request(osdc, req);
@@ -2786,7 +4026,6 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
2786 int rc = 0; 4026 int rc = 0;
2787 int page_align = off & ~PAGE_MASK; 4027 int page_align = off & ~PAGE_MASK;
2788 4028
2789 BUG_ON(vino.snap != CEPH_NOSNAP); /* snapshots aren't writeable */
2790 req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, 4029 req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
2791 CEPH_OSD_OP_WRITE, 4030 CEPH_OSD_OP_WRITE,
2792 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, 4031 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
@@ -2800,8 +4039,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
2800 false, false); 4039 false, false);
2801 dout("writepages %llu~%llu (%llu bytes)\n", off, len, len); 4040 dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
2802 4041
2803 ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime); 4042 req->r_mtime = *mtime;
2804
2805 rc = ceph_osdc_start_request(osdc, req, true); 4043 rc = ceph_osdc_start_request(osdc, req, true);
2806 if (!rc) 4044 if (!rc)
2807 rc = ceph_osdc_wait_request(osdc, req); 4045 rc = ceph_osdc_wait_request(osdc, req);
@@ -2841,19 +4079,15 @@ EXPORT_SYMBOL(ceph_osdc_cleanup);
2841static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) 4079static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
2842{ 4080{
2843 struct ceph_osd *osd = con->private; 4081 struct ceph_osd *osd = con->private;
2844 struct ceph_osd_client *osdc; 4082 struct ceph_osd_client *osdc = osd->o_osdc;
2845 int type = le16_to_cpu(msg->hdr.type); 4083 int type = le16_to_cpu(msg->hdr.type);
2846 4084
2847 if (!osd)
2848 goto out;
2849 osdc = osd->o_osdc;
2850
2851 switch (type) { 4085 switch (type) {
2852 case CEPH_MSG_OSD_MAP: 4086 case CEPH_MSG_OSD_MAP:
2853 ceph_osdc_handle_map(osdc, msg); 4087 ceph_osdc_handle_map(osdc, msg);
2854 break; 4088 break;
2855 case CEPH_MSG_OSD_OPREPLY: 4089 case CEPH_MSG_OSD_OPREPLY:
2856 handle_reply(osdc, msg); 4090 handle_reply(osd, msg);
2857 break; 4091 break;
2858 case CEPH_MSG_WATCH_NOTIFY: 4092 case CEPH_MSG_WATCH_NOTIFY:
2859 handle_watch_notify(osdc, msg); 4093 handle_watch_notify(osdc, msg);
@@ -2863,7 +4097,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
2863 pr_err("received unknown message type %d %s\n", type, 4097 pr_err("received unknown message type %d %s\n", type,
2864 ceph_msg_type_name(type)); 4098 ceph_msg_type_name(type));
2865 } 4099 }
2866out: 4100
2867 ceph_msg_put(msg); 4101 ceph_msg_put(msg);
2868} 4102}
2869 4103
@@ -2878,21 +4112,27 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
2878{ 4112{
2879 struct ceph_osd *osd = con->private; 4113 struct ceph_osd *osd = con->private;
2880 struct ceph_osd_client *osdc = osd->o_osdc; 4114 struct ceph_osd_client *osdc = osd->o_osdc;
2881 struct ceph_msg *m; 4115 struct ceph_msg *m = NULL;
2882 struct ceph_osd_request *req; 4116 struct ceph_osd_request *req;
2883 int front_len = le32_to_cpu(hdr->front_len); 4117 int front_len = le32_to_cpu(hdr->front_len);
2884 int data_len = le32_to_cpu(hdr->data_len); 4118 int data_len = le32_to_cpu(hdr->data_len);
2885 u64 tid; 4119 u64 tid = le64_to_cpu(hdr->tid);
2886 4120
2887 tid = le64_to_cpu(hdr->tid); 4121 down_read(&osdc->lock);
2888 mutex_lock(&osdc->request_mutex); 4122 if (!osd_registered(osd)) {
2889 req = __lookup_request(osdc, tid); 4123 dout("%s osd%d unknown, skipping\n", __func__, osd->o_osd);
4124 *skip = 1;
4125 goto out_unlock_osdc;
4126 }
4127 WARN_ON(osd->o_osd != le64_to_cpu(hdr->src.num));
4128
4129 mutex_lock(&osd->lock);
4130 req = lookup_request(&osd->o_requests, tid);
2890 if (!req) { 4131 if (!req) {
2891 dout("%s osd%d tid %llu unknown, skipping\n", __func__, 4132 dout("%s osd%d tid %llu unknown, skipping\n", __func__,
2892 osd->o_osd, tid); 4133 osd->o_osd, tid);
2893 m = NULL;
2894 *skip = 1; 4134 *skip = 1;
2895 goto out; 4135 goto out_unlock_session;
2896 } 4136 }
2897 4137
2898 ceph_msg_revoke_incoming(req->r_reply); 4138 ceph_msg_revoke_incoming(req->r_reply);
@@ -2904,7 +4144,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
2904 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS, 4144 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
2905 false); 4145 false);
2906 if (!m) 4146 if (!m)
2907 goto out; 4147 goto out_unlock_session;
2908 ceph_msg_put(req->r_reply); 4148 ceph_msg_put(req->r_reply);
2909 req->r_reply = m; 4149 req->r_reply = m;
2910 } 4150 }
@@ -2915,14 +4155,49 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
2915 req->r_reply->data_length); 4155 req->r_reply->data_length);
2916 m = NULL; 4156 m = NULL;
2917 *skip = 1; 4157 *skip = 1;
2918 goto out; 4158 goto out_unlock_session;
2919 } 4159 }
2920 4160
2921 m = ceph_msg_get(req->r_reply); 4161 m = ceph_msg_get(req->r_reply);
2922 dout("get_reply tid %lld %p\n", tid, m); 4162 dout("get_reply tid %lld %p\n", tid, m);
2923 4163
2924out: 4164out_unlock_session:
2925 mutex_unlock(&osdc->request_mutex); 4165 mutex_unlock(&osd->lock);
4166out_unlock_osdc:
4167 up_read(&osdc->lock);
4168 return m;
4169}
4170
4171/*
4172 * TODO: switch to a msg-owned pagelist
4173 */
4174static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
4175{
4176 struct ceph_msg *m;
4177 int type = le16_to_cpu(hdr->type);
4178 u32 front_len = le32_to_cpu(hdr->front_len);
4179 u32 data_len = le32_to_cpu(hdr->data_len);
4180
4181 m = ceph_msg_new(type, front_len, GFP_NOIO, false);
4182 if (!m)
4183 return NULL;
4184
4185 if (data_len) {
4186 struct page **pages;
4187 struct ceph_osd_data osd_data;
4188
4189 pages = ceph_alloc_page_vector(calc_pages_for(0, data_len),
4190 GFP_NOIO);
4191 if (!pages) {
4192 ceph_msg_put(m);
4193 return NULL;
4194 }
4195
4196 ceph_osd_data_pages_init(&osd_data, pages, data_len, 0, false,
4197 false);
4198 ceph_osdc_msg_data_add(m, &osd_data);
4199 }
4200
2926 return m; 4201 return m;
2927} 4202}
2928 4203
@@ -2932,18 +4207,17 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
2932{ 4207{
2933 struct ceph_osd *osd = con->private; 4208 struct ceph_osd *osd = con->private;
2934 int type = le16_to_cpu(hdr->type); 4209 int type = le16_to_cpu(hdr->type);
2935 int front = le32_to_cpu(hdr->front_len);
2936 4210
2937 *skip = 0; 4211 *skip = 0;
2938 switch (type) { 4212 switch (type) {
2939 case CEPH_MSG_OSD_MAP: 4213 case CEPH_MSG_OSD_MAP:
2940 case CEPH_MSG_WATCH_NOTIFY: 4214 case CEPH_MSG_WATCH_NOTIFY:
2941 return ceph_msg_new(type, front, GFP_NOFS, false); 4215 return alloc_msg_with_page_vector(hdr);
2942 case CEPH_MSG_OSD_OPREPLY: 4216 case CEPH_MSG_OSD_OPREPLY:
2943 return get_reply(con, hdr, skip); 4217 return get_reply(con, hdr, skip);
2944 default: 4218 default:
2945 pr_info("alloc_msg unexpected msg type %d from osd%d\n", type, 4219 pr_warn("%s osd%d unknown msg type %d, skipping\n", __func__,
2946 osd->o_osd); 4220 osd->o_osd, type);
2947 *skip = 1; 4221 *skip = 1;
2948 return NULL; 4222 return NULL;
2949 } 4223 }
@@ -3047,5 +4321,5 @@ static const struct ceph_connection_operations osd_con_ops = {
3047 .alloc_msg = alloc_msg, 4321 .alloc_msg = alloc_msg,
3048 .sign_message = osd_sign_message, 4322 .sign_message = osd_sign_message,
3049 .check_message_signature = osd_check_message_signature, 4323 .check_message_signature = osd_check_message_signature,
3050 .fault = osd_reset, 4324 .fault = osd_fault,
3051}; 4325};
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 243574c8cf33..cde52e94732f 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -380,23 +380,24 @@ bad:
380 return ERR_PTR(err); 380 return ERR_PTR(err);
381} 381}
382 382
383/* 383int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs)
384 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
385 * to a set of osds) and primary_temp (explicit primary setting)
386 */
387static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
388{ 384{
389 if (l.pool < r.pool) 385 if (lhs->pool < rhs->pool)
390 return -1; 386 return -1;
391 if (l.pool > r.pool) 387 if (lhs->pool > rhs->pool)
392 return 1; 388 return 1;
393 if (l.seed < r.seed) 389 if (lhs->seed < rhs->seed)
394 return -1; 390 return -1;
395 if (l.seed > r.seed) 391 if (lhs->seed > rhs->seed)
396 return 1; 392 return 1;
393
397 return 0; 394 return 0;
398} 395}
399 396
397/*
398 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
399 * to a set of osds) and primary_temp (explicit primary setting)
400 */
400static int __insert_pg_mapping(struct ceph_pg_mapping *new, 401static int __insert_pg_mapping(struct ceph_pg_mapping *new,
401 struct rb_root *root) 402 struct rb_root *root)
402{ 403{
@@ -409,7 +410,7 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new,
409 while (*p) { 410 while (*p) {
410 parent = *p; 411 parent = *p;
411 pg = rb_entry(parent, struct ceph_pg_mapping, node); 412 pg = rb_entry(parent, struct ceph_pg_mapping, node);
412 c = pgid_cmp(new->pgid, pg->pgid); 413 c = ceph_pg_compare(&new->pgid, &pg->pgid);
413 if (c < 0) 414 if (c < 0)
414 p = &(*p)->rb_left; 415 p = &(*p)->rb_left;
415 else if (c > 0) 416 else if (c > 0)
@@ -432,7 +433,7 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
432 433
433 while (n) { 434 while (n) {
434 pg = rb_entry(n, struct ceph_pg_mapping, node); 435 pg = rb_entry(n, struct ceph_pg_mapping, node);
435 c = pgid_cmp(pgid, pg->pgid); 436 c = ceph_pg_compare(&pgid, &pg->pgid);
436 if (c < 0) { 437 if (c < 0) {
437 n = n->rb_left; 438 n = n->rb_left;
438 } else if (c > 0) { 439 } else if (c > 0) {
@@ -596,7 +597,9 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
596 *p += 4; /* skip crash_replay_interval */ 597 *p += 4; /* skip crash_replay_interval */
597 598
598 if (ev >= 7) 599 if (ev >= 7)
599 *p += 1; /* skip min_size */ 600 pi->min_size = ceph_decode_8(p);
601 else
602 pi->min_size = pi->size - pi->size / 2;
600 603
601 if (ev >= 8) 604 if (ev >= 8)
602 *p += 8 + 8; /* skip quota_max_* */ 605 *p += 8 + 8; /* skip quota_max_* */
@@ -616,6 +619,50 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
616 pi->write_tier = -1; 619 pi->write_tier = -1;
617 } 620 }
618 621
622 if (ev >= 10) {
623 /* skip properties */
624 num = ceph_decode_32(p);
625 while (num--) {
626 len = ceph_decode_32(p);
627 *p += len; /* key */
628 len = ceph_decode_32(p);
629 *p += len; /* val */
630 }
631 }
632
633 if (ev >= 11) {
634 /* skip hit_set_params */
635 *p += 1 + 1; /* versions */
636 len = ceph_decode_32(p);
637 *p += len;
638
639 *p += 4; /* skip hit_set_period */
640 *p += 4; /* skip hit_set_count */
641 }
642
643 if (ev >= 12)
644 *p += 4; /* skip stripe_width */
645
646 if (ev >= 13) {
647 *p += 8; /* skip target_max_bytes */
648 *p += 8; /* skip target_max_objects */
649 *p += 4; /* skip cache_target_dirty_ratio_micro */
650 *p += 4; /* skip cache_target_full_ratio_micro */
651 *p += 4; /* skip cache_min_flush_age */
652 *p += 4; /* skip cache_min_evict_age */
653 }
654
655 if (ev >= 14) {
656 /* skip erasure_code_profile */
657 len = ceph_decode_32(p);
658 *p += len;
659 }
660
661 if (ev >= 15)
662 pi->last_force_request_resend = ceph_decode_32(p);
663 else
664 pi->last_force_request_resend = 0;
665
619 /* ignore the rest */ 666 /* ignore the rest */
620 667
621 *p = pool_end; 668 *p = pool_end;
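For ev < 7 encodings the decoder now derives the default rather than skipping the field: min_size = size - size / 2 with integer division is a majority rounded up, e.g. size 2 -> 1, size 3 -> 2, size 4 -> 2, size 5 -> 3.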
@@ -660,6 +707,23 @@ bad:
660/* 707/*
661 * osd map 708 * osd map
662 */ 709 */
710struct ceph_osdmap *ceph_osdmap_alloc(void)
711{
712 struct ceph_osdmap *map;
713
714 map = kzalloc(sizeof(*map), GFP_NOIO);
715 if (!map)
716 return NULL;
717
718 map->pg_pools = RB_ROOT;
719 map->pool_max = -1;
720 map->pg_temp = RB_ROOT;
721 map->primary_temp = RB_ROOT;
722 mutex_init(&map->crush_scratch_mutex);
723
724 return map;
725}
726
663void ceph_osdmap_destroy(struct ceph_osdmap *map) 727void ceph_osdmap_destroy(struct ceph_osdmap *map)
664{ 728{
665 dout("osdmap_destroy %p\n", map); 729 dout("osdmap_destroy %p\n", map);
@@ -1183,14 +1247,10 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
1183 struct ceph_osdmap *map; 1247 struct ceph_osdmap *map;
1184 int ret; 1248 int ret;
1185 1249
1186 map = kzalloc(sizeof(*map), GFP_NOFS); 1250 map = ceph_osdmap_alloc();
1187 if (!map) 1251 if (!map)
1188 return ERR_PTR(-ENOMEM); 1252 return ERR_PTR(-ENOMEM);
1189 1253
1190 map->pg_temp = RB_ROOT;
1191 map->primary_temp = RB_ROOT;
1192 mutex_init(&map->crush_scratch_mutex);
1193
1194 ret = osdmap_decode(p, end, map); 1254 ret = osdmap_decode(p, end, map);
1195 if (ret) { 1255 if (ret) {
1196 ceph_osdmap_destroy(map); 1256 ceph_osdmap_destroy(map);
@@ -1204,8 +1264,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
1204 * decode and apply an incremental map update. 1264 * decode and apply an incremental map update.
1205 */ 1265 */
1206struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, 1266struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
1207 struct ceph_osdmap *map, 1267 struct ceph_osdmap *map)
1208 struct ceph_messenger *msgr)
1209{ 1268{
1210 struct crush_map *newcrush = NULL; 1269 struct crush_map *newcrush = NULL;
1211 struct ceph_fsid fsid; 1270 struct ceph_fsid fsid;
@@ -1381,8 +1440,252 @@ bad:
1381 return ERR_PTR(err); 1440 return ERR_PTR(err);
1382} 1441}
1383 1442
1443void ceph_oid_copy(struct ceph_object_id *dest,
1444 const struct ceph_object_id *src)
1445{
1446 WARN_ON(!ceph_oid_empty(dest));
1447
1448 if (src->name != src->inline_name) {
1449 /* very rare, see ceph_object_id definition */
1450 dest->name = kmalloc(src->name_len + 1,
1451 GFP_NOIO | __GFP_NOFAIL);
1452 }
1453
1454 memcpy(dest->name, src->name, src->name_len + 1);
1455 dest->name_len = src->name_len;
1456}
1457EXPORT_SYMBOL(ceph_oid_copy);
1458
1459static __printf(2, 0)
1460int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap)
1461{
1462 int len;
1463
1464 WARN_ON(!ceph_oid_empty(oid));
1465
1466 len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap);
1467 if (len >= sizeof(oid->inline_name))
1468 return len;
1469
1470 oid->name_len = len;
1471 return 0;
1472}
1473
1474/*
1475 * If oid doesn't fit into inline buffer, BUG.
1476 */
1477void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...)
1478{
1479 va_list ap;
1480
1481 va_start(ap, fmt);
1482 BUG_ON(oid_printf_vargs(oid, fmt, ap));
1483 va_end(ap);
1484}
1485EXPORT_SYMBOL(ceph_oid_printf);
1486
1487static __printf(3, 0)
1488int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp,
1489 const char *fmt, va_list ap)
1490{
1491 va_list aq;
1492 int len;
1493
1494 va_copy(aq, ap);
1495 len = oid_printf_vargs(oid, fmt, aq);
1496 va_end(aq);
1497
1498 if (len) {
1499 char *external_name;
1500
1501 external_name = kmalloc(len + 1, gfp);
1502 if (!external_name)
1503 return -ENOMEM;
1504
1505 oid->name = external_name;
1506 WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len);
1507 oid->name_len = len;
1508 }
1509
1510 return 0;
1511}
1512
1513/*
1514 * If oid doesn't fit into inline buffer, allocate.
1515 */
1516int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
1517 const char *fmt, ...)
1518{
1519 va_list ap;
1520 int ret;
1521
1522 va_start(ap, fmt);
1523 ret = oid_aprintf_vargs(oid, gfp, fmt, ap);
1524 va_end(ap);
1525
1526 return ret;
1527}
1528EXPORT_SYMBOL(ceph_oid_aprintf);
1529
1530void ceph_oid_destroy(struct ceph_object_id *oid)
1531{
1532 if (oid->name != oid->inline_name)
1533 kfree(oid->name);
1534}
1535EXPORT_SYMBOL(ceph_oid_destroy);
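A hypothetical caller tying the oid helpers together; ceph_oid_init() is assumed to be the matching initializer (it points name at inline_name), and the name format is illustrative:

static int set_object_name(struct ceph_object_id *oid, const char *image_id)
{
	int ret;

	ceph_oid_init(oid);

	/* long names spill into a heap buffer; short ones stay inline */
	ret = ceph_oid_aprintf(oid, GFP_KERNEL, "rbd_header.%s", image_id);
	if (ret)
		return ret;

	/* ... use oid ... */

	ceph_oid_destroy(oid);	/* frees the heap buffer, if any */
	return 0;
}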
1536
1537/*
1538 * osds only
1539 */
1540static bool __osds_equal(const struct ceph_osds *lhs,
1541 const struct ceph_osds *rhs)
1542{
1543 if (lhs->size == rhs->size &&
1544 !memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0])))
1545 return true;
1546
1547 return false;
1548}
1549
1550/*
1551 * osds + primary
1552 */
1553static bool osds_equal(const struct ceph_osds *lhs,
1554 const struct ceph_osds *rhs)
1555{
1556 if (__osds_equal(lhs, rhs) &&
1557 lhs->primary == rhs->primary)
1558 return true;
1559
1560 return false;
1561}
1562
1563static bool osds_valid(const struct ceph_osds *set)
1564{
1565 /* non-empty set */
1566 if (set->size > 0 && set->primary >= 0)
1567 return true;
1568
1569 /* empty can_shift_osds set */
1570 if (!set->size && set->primary == -1)
1571 return true;
1572
1573 /* empty !can_shift_osds set - all NONE */
1574 if (set->size > 0 && set->primary == -1) {
1575 int i;
1576
1577 for (i = 0; i < set->size; i++) {
1578 if (set->osds[i] != CRUSH_ITEM_NONE)
1579 break;
1580 }
1581 if (i == set->size)
1582 return true;
1583 }
1584
1585 return false;
1586}
1587
1588void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src)
1589{
1590 memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0]));
1591 dest->size = src->size;
1592 dest->primary = src->primary;
1593}
1594
1595static bool is_split(const struct ceph_pg *pgid,
1596 u32 old_pg_num,
1597 u32 new_pg_num)
1598{
1599 int old_bits = calc_bits_of(old_pg_num);
1600 int old_mask = (1 << old_bits) - 1;
1601 int n;
1602
1603 WARN_ON(pgid->seed >= old_pg_num);
1604 if (new_pg_num <= old_pg_num)
1605 return false;
1606
1607 for (n = 1; ; n++) {
1608 int next_bit = n << (old_bits - 1);
1609 u32 s = next_bit | pgid->seed;
1610
1611 if (s < old_pg_num || s == pgid->seed)
1612 continue;
1613 if (s >= new_pg_num)
1614 break;
1615
1616 s = ceph_stable_mod(s, old_pg_num, old_mask);
1617 if (s == pgid->seed)
1618 return true;
1619 }
1620
1621 return false;
1622}
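A worked case: growing a pool from pg_num 4 to 8 with seed 1. calc_bits_of(4) = 3 gives old_mask = 7; the first candidate is s = (1 << 2) | 1 = 5, which is below new_pg_num, and ceph_stable_mod(5, 4, 7) = 5 & 3 = 1 matches the seed, so the PG is split (x.5 becomes a child of x.1).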
1623
1624bool ceph_is_new_interval(const struct ceph_osds *old_acting,
1625 const struct ceph_osds *new_acting,
1626 const struct ceph_osds *old_up,
1627 const struct ceph_osds *new_up,
1628 int old_size,
1629 int new_size,
1630 int old_min_size,
1631 int new_min_size,
1632 u32 old_pg_num,
1633 u32 new_pg_num,
1634 bool old_sort_bitwise,
1635 bool new_sort_bitwise,
1636 const struct ceph_pg *pgid)
1637{
1638 return !osds_equal(old_acting, new_acting) ||
1639 !osds_equal(old_up, new_up) ||
1640 old_size != new_size ||
1641 old_min_size != new_min_size ||
1642 is_split(pgid, old_pg_num, new_pg_num) ||
1643 old_sort_bitwise != new_sort_bitwise;
1644}
1645
1646static int calc_pg_rank(int osd, const struct ceph_osds *acting)
1647{
1648 int i;
1649
1650 for (i = 0; i < acting->size; i++) {
1651 if (acting->osds[i] == osd)
1652 return i;
1653 }
1654
1655 return -1;
1656}
1657
1658static bool primary_changed(const struct ceph_osds *old_acting,
1659 const struct ceph_osds *new_acting)
1660{
1661 if (!old_acting->size && !new_acting->size)
1662 return false; /* both still empty */
1384 1663
1664 if (!old_acting->size ^ !new_acting->size)
1665 return true; /* was empty, now not, or vice versa */
1385 1666
1667 if (old_acting->primary != new_acting->primary)
1668 return true; /* primary changed */
1669
1670 if (calc_pg_rank(old_acting->primary, old_acting) !=
1671 calc_pg_rank(new_acting->primary, new_acting))
1672 return true;
1673
1674 return false; /* same primary (tho replicas may have changed) */
1675}
1676
1677bool ceph_osds_changed(const struct ceph_osds *old_acting,
1678 const struct ceph_osds *new_acting,
1679 bool any_change)
1680{
1681 if (primary_changed(old_acting, new_acting))
1682 return true;
1683
1684 if (any_change && !__osds_equal(old_acting, new_acting))
1685 return true;
1686
1687 return false;
1688}
1386 1689
1387/* 1690/*
1388 * calculate file layout from given offset, length. 1691 * calculate file layout from given offset, length.
@@ -1455,30 +1758,71 @@ invalid:
1455EXPORT_SYMBOL(ceph_calc_file_object_mapping); 1758EXPORT_SYMBOL(ceph_calc_file_object_mapping);
1456 1759
1457/* 1760/*
1458 * Calculate mapping of a (oloc, oid) pair to a PG. Should only be 1761 * Map an object into a PG.
1459 * called with target's (oloc, oid), since tiering isn't taken into 1762 *
1460 * account. 1763 * Should only be called with target_oid and target_oloc (as opposed to
1764 * base_oid and base_oloc), since tiering isn't taken into account.
1461 */ 1765 */
1462int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, 1766int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
1463 struct ceph_object_locator *oloc, 1767 struct ceph_object_id *oid,
1464 struct ceph_object_id *oid, 1768 struct ceph_object_locator *oloc,
1465 struct ceph_pg *pg_out) 1769 struct ceph_pg *raw_pgid)
1466{ 1770{
1467 struct ceph_pg_pool_info *pi; 1771 struct ceph_pg_pool_info *pi;
1468 1772
1469 pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool); 1773 pi = ceph_pg_pool_by_id(osdmap, oloc->pool);
1470 if (!pi) 1774 if (!pi)
1471 return -EIO; 1775 return -ENOENT;
1472 1776
1473 pg_out->pool = oloc->pool; 1777 raw_pgid->pool = oloc->pool;
1474 pg_out->seed = ceph_str_hash(pi->object_hash, oid->name, 1778 raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
1475 oid->name_len); 1779 oid->name_len);
1476 1780
1477 dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name, 1781 dout("%s %*pE -> raw_pgid %llu.%x\n", __func__, oid->name_len,
1478 pg_out->pool, pg_out->seed); 1782 oid->name, raw_pgid->pool, raw_pgid->seed);
1479 return 0; 1783 return 0;
1480} 1784}
1481EXPORT_SYMBOL(ceph_oloc_oid_to_pg); 1785EXPORT_SYMBOL(ceph_object_locator_to_pg);
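A minimal lookup sketch; per the comment above, oid/oloc must already describe the target (tiering resolved):

static int print_raw_pg(struct ceph_osdmap *map,
			struct ceph_object_id *oid,
			struct ceph_object_locator *oloc)
{
	struct ceph_pg raw_pgid;
	int ret;

	ret = ceph_object_locator_to_pg(map, oid, oloc, &raw_pgid);
	if (ret)
		return ret;	/* -ENOENT: pool doesn't exist */

	pr_info("raw pgid %llu.%x\n", raw_pgid.pool, raw_pgid.seed);
	return 0;
}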
1786
1787/*
1788 * Map a raw PG (full precision ps) into an actual PG.
1789 */
1790static void raw_pg_to_pg(struct ceph_pg_pool_info *pi,
1791 const struct ceph_pg *raw_pgid,
1792 struct ceph_pg *pgid)
1793{
1794 pgid->pool = raw_pgid->pool;
1795 pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num,
1796 pi->pg_num_mask);
1797}
1798
1799/*
1800 * Map a raw PG (full precision ps) into a placement ps (placement
1801 * seed). Include pool id in that value so that different pools don't
1802 * use the same seeds.
1803 */
1804static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi,
1805 const struct ceph_pg *raw_pgid)
1806{
1807 if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
1808 /* hash pool id and seed so that pool PGs do not overlap */
1809 return crush_hash32_2(CRUSH_HASH_RJENKINS1,
1810 ceph_stable_mod(raw_pgid->seed,
1811 pi->pgp_num,
1812 pi->pgp_num_mask),
1813 raw_pgid->pool);
1814 } else {
1815 /*
1816 * legacy behavior: add ps and pool together. this is
1817 * not a great approach because the PGs from each pool
1818 * will overlap on top of each other: 0.5 == 1.4 ==
1819 * 2.3 == ...
1820 */
1821 return ceph_stable_mod(raw_pgid->seed, pi->pgp_num,
1822 pi->pgp_num_mask) +
1823 (unsigned)raw_pgid->pool;
1824 }
1825}
1482 1826
1483static int do_crush(struct ceph_osdmap *map, int ruleno, int x, 1827static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
1484 int *result, int result_max, 1828 int *result, int result_max,
@@ -1497,84 +1841,92 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
1497} 1841}
1498 1842
1499/* 1843/*
1500 * Calculate raw (crush) set for given pgid. 1844 * Calculate raw set (CRUSH output) for given PG. The result may
1845 * contain nonexistent OSDs. ->primary is undefined for a raw set.
1501 * 1846 *
1502 * Return raw set length, or error. 1847 * Placement seed (CRUSH input) is returned through @ppps.
1503 */ 1848 */
1504static int pg_to_raw_osds(struct ceph_osdmap *osdmap, 1849static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
1505 struct ceph_pg_pool_info *pool, 1850 struct ceph_pg_pool_info *pi,
1506 struct ceph_pg pgid, u32 pps, int *osds) 1851 const struct ceph_pg *raw_pgid,
1852 struct ceph_osds *raw,
1853 u32 *ppps)
1507{ 1854{
1855 u32 pps = raw_pg_to_pps(pi, raw_pgid);
1508 int ruleno; 1856 int ruleno;
1509 int len; 1857 int len;
1510 1858
1511 /* crush */ 1859 ceph_osds_init(raw);
1512 ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, 1860 if (ppps)
1513 pool->type, pool->size); 1861 *ppps = pps;
1862
1863 ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type,
1864 pi->size);
1514 if (ruleno < 0) { 1865 if (ruleno < 0) {
1515 pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n", 1866 pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
1516 pgid.pool, pool->crush_ruleset, pool->type, 1867 pi->id, pi->crush_ruleset, pi->type, pi->size);
1517 pool->size); 1868 return;
1518 return -ENOENT;
1519 } 1869 }
1520 1870
1521 len = do_crush(osdmap, ruleno, pps, osds, 1871 len = do_crush(osdmap, ruleno, pps, raw->osds,
1522 min_t(int, pool->size, CEPH_PG_MAX_SIZE), 1872 min_t(int, pi->size, ARRAY_SIZE(raw->osds)),
1523 osdmap->osd_weight, osdmap->max_osd); 1873 osdmap->osd_weight, osdmap->max_osd);
1524 if (len < 0) { 1874 if (len < 0) {
1525 pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", 1875 pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
1526 len, ruleno, pgid.pool, pool->crush_ruleset, 1876 len, ruleno, pi->id, pi->crush_ruleset, pi->type,
1527 pool->type, pool->size); 1877 pi->size);
1528 return len; 1878 return;
1529 } 1879 }
1530 1880
1531 return len; 1881 raw->size = len;
1532} 1882}
1533 1883
1534/* 1884/*
1535 * Given raw set, calculate up set and up primary. 1885 * Given raw set, calculate up set and up primary. By definition of an
1886 * up set, the result won't contain nonexistent or down OSDs.
1536 * 1887 *
1537 * Return up set length. *primary is set to up primary osd id, or -1 1888 * This is done in-place - on return @set is the up set. If it's
1538 * if up set is empty. 1889 * empty, ->primary will remain undefined.
1539 */ 1890 */
1540static int raw_to_up_osds(struct ceph_osdmap *osdmap, 1891static void raw_to_up_osds(struct ceph_osdmap *osdmap,
1541 struct ceph_pg_pool_info *pool, 1892 struct ceph_pg_pool_info *pi,
1542 int *osds, int len, int *primary) 1893 struct ceph_osds *set)
1543{ 1894{
1544 int up_primary = -1;
1545 int i; 1895 int i;
1546 1896
1547 if (ceph_can_shift_osds(pool)) { 1897 /* ->primary is undefined for a raw set */
1898 BUG_ON(set->primary != -1);
1899
1900 if (ceph_can_shift_osds(pi)) {
1548 int removed = 0; 1901 int removed = 0;
1549 1902
1550 for (i = 0; i < len; i++) { 1903 /* shift left */
1551 if (ceph_osd_is_down(osdmap, osds[i])) { 1904 for (i = 0; i < set->size; i++) {
1905 if (ceph_osd_is_down(osdmap, set->osds[i])) {
1552 removed++; 1906 removed++;
1553 continue; 1907 continue;
1554 } 1908 }
1555 if (removed) 1909 if (removed)
1556 osds[i - removed] = osds[i]; 1910 set->osds[i - removed] = set->osds[i];
1557 } 1911 }
1558 1912 set->size -= removed;
1559 len -= removed; 1913 if (set->size > 0)
1560 if (len > 0) 1914 set->primary = set->osds[0];
1561 up_primary = osds[0];
1562 } else { 1915 } else {
1563 for (i = len - 1; i >= 0; i--) { 1916 /* set down/dne devices to NONE */
1564 if (ceph_osd_is_down(osdmap, osds[i])) 1917 for (i = set->size - 1; i >= 0; i--) {
1565 osds[i] = CRUSH_ITEM_NONE; 1918 if (ceph_osd_is_down(osdmap, set->osds[i]))
1919 set->osds[i] = CRUSH_ITEM_NONE;
1566 else 1920 else
1567 up_primary = osds[i]; 1921 set->primary = set->osds[i];
1568 } 1922 }
1569 } 1923 }
1570
1571 *primary = up_primary;
1572 return len;
1573} 1924}
1574 1925
1575static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, 1926static void apply_primary_affinity(struct ceph_osdmap *osdmap,
1576 struct ceph_pg_pool_info *pool, 1927 struct ceph_pg_pool_info *pi,
1577 int *osds, int len, int *primary) 1928 u32 pps,
1929 struct ceph_osds *up)
1578{ 1930{
1579 int i; 1931 int i;
1580 int pos = -1; 1932 int pos = -1;
@@ -1586,8 +1938,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
1586 if (!osdmap->osd_primary_affinity) 1938 if (!osdmap->osd_primary_affinity)
1587 return; 1939 return;
1588 1940
1589 for (i = 0; i < len; i++) { 1941 for (i = 0; i < up->size; i++) {
1590 int osd = osds[i]; 1942 int osd = up->osds[i];
1591 1943
1592 if (osd != CRUSH_ITEM_NONE && 1944 if (osd != CRUSH_ITEM_NONE &&
1593 osdmap->osd_primary_affinity[osd] != 1945 osdmap->osd_primary_affinity[osd] !=
@@ -1595,7 +1947,7 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
1595 break; 1947 break;
1596 } 1948 }
1597 } 1949 }
1598 if (i == len) 1950 if (i == up->size)
1599 return; 1951 return;
1600 1952
1601 /* 1953 /*
@@ -1603,8 +1955,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
1603 * osd into the hash/rng so that a proportional fraction of an 1955 * osd into the hash/rng so that a proportional fraction of an
1604 * osd's pgs get rejected as primary. 1956 * osd's pgs get rejected as primary.
1605 */ 1957 */
1606 for (i = 0; i < len; i++) { 1958 for (i = 0; i < up->size; i++) {
1607 int osd = osds[i]; 1959 int osd = up->osds[i];
1608 u32 aff; 1960 u32 aff;
1609 1961
1610 if (osd == CRUSH_ITEM_NONE) 1962 if (osd == CRUSH_ITEM_NONE)
@@ -1629,135 +1981,110 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
1629 if (pos < 0) 1981 if (pos < 0)
1630 return; 1982 return;
1631 1983
1632 *primary = osds[pos]; 1984 up->primary = up->osds[pos];
1633 1985
1634 if (ceph_can_shift_osds(pool) && pos > 0) { 1986 if (ceph_can_shift_osds(pi) && pos > 0) {
1635 /* move the new primary to the front */ 1987 /* move the new primary to the front */
1636 for (i = pos; i > 0; i--) 1988 for (i = pos; i > 0; i--)
1637 osds[i] = osds[i - 1]; 1989 up->osds[i] = up->osds[i - 1];
1638 osds[0] = *primary; 1990 up->osds[0] = up->primary;
1639 } 1991 }
1640} 1992}
1641 1993
1642/* 1994/*
1643 * Given up set, apply pg_temp and primary_temp mappings. 1995 * Get pg_temp and primary_temp mappings for given PG.
1644 * 1996 *
1645 * Return acting set length. *primary is set to acting primary osd id, 1997 * Note that a PG may have none, only pg_temp, only primary_temp or
1646 * or -1 if acting set is empty. 1998 * both pg_temp and primary_temp mappings. This means @temp isn't
1999 * always a valid OSD set on return: in the "only primary_temp" case,
2000 * @temp will have its ->primary >= 0 but ->size == 0.
1647 */ 2001 */
1648static int apply_temps(struct ceph_osdmap *osdmap, 2002static void get_temp_osds(struct ceph_osdmap *osdmap,
1649 struct ceph_pg_pool_info *pool, struct ceph_pg pgid, 2003 struct ceph_pg_pool_info *pi,
1650 int *osds, int len, int *primary) 2004 const struct ceph_pg *raw_pgid,
2005 struct ceph_osds *temp)
1651{ 2006{
2007 struct ceph_pg pgid;
1652 struct ceph_pg_mapping *pg; 2008 struct ceph_pg_mapping *pg;
1653 int temp_len;
1654 int temp_primary;
1655 int i; 2009 int i;
1656 2010
1657 /* raw_pg -> pg */ 2011 raw_pg_to_pg(pi, raw_pgid, &pgid);
1658 pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, 2012 ceph_osds_init(temp);
1659 pool->pg_num_mask);
1660 2013
1661 /* pg_temp? */ 2014 /* pg_temp? */
1662 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); 2015 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
1663 if (pg) { 2016 if (pg) {
1664 temp_len = 0;
1665 temp_primary = -1;
1666
1667 for (i = 0; i < pg->pg_temp.len; i++) { 2017 for (i = 0; i < pg->pg_temp.len; i++) {
1668 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { 2018 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
1669 if (ceph_can_shift_osds(pool)) 2019 if (ceph_can_shift_osds(pi))
1670 continue; 2020 continue;
1671 else 2021
1672 osds[temp_len++] = CRUSH_ITEM_NONE; 2022 temp->osds[temp->size++] = CRUSH_ITEM_NONE;
1673 } else { 2023 } else {
1674 osds[temp_len++] = pg->pg_temp.osds[i]; 2024 temp->osds[temp->size++] = pg->pg_temp.osds[i];
1675 } 2025 }
1676 } 2026 }
1677 2027
1678 /* apply pg_temp's primary */ 2028 /* apply pg_temp's primary */
1679 for (i = 0; i < temp_len; i++) { 2029 for (i = 0; i < temp->size; i++) {
1680 if (osds[i] != CRUSH_ITEM_NONE) { 2030 if (temp->osds[i] != CRUSH_ITEM_NONE) {
1681 temp_primary = osds[i]; 2031 temp->primary = temp->osds[i];
1682 break; 2032 break;
1683 } 2033 }
1684 } 2034 }
1685 } else {
1686 temp_len = len;
1687 temp_primary = *primary;
1688 } 2035 }
1689 2036
1690 /* primary_temp? */ 2037 /* primary_temp? */
1691 pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid); 2038 pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
1692 if (pg) 2039 if (pg)
1693 temp_primary = pg->primary_temp.osd; 2040 temp->primary = pg->primary_temp.osd;
1694
1695 *primary = temp_primary;
1696 return temp_len;
1697} 2041}
1698 2042
1699/* 2043/*
1700 * Calculate acting set for given pgid. 2044 * Map a PG to its acting set as well as its up set.
1701 * 2045 *
1702 * Return acting set length, or error. *primary is set to acting 2046 * Acting set is used for data mapping purposes, while up set can be
1703 * primary osd id, or -1 if acting set is empty or on error. 2047 * recorded for detecting interval changes and deciding whether to
2048 * resend a request.
1704 */ 2049 */
1705int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, 2050void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
1706 int *osds, int *primary) 2051 const struct ceph_pg *raw_pgid,
2052 struct ceph_osds *up,
2053 struct ceph_osds *acting)
1707{ 2054{
1708 struct ceph_pg_pool_info *pool; 2055 struct ceph_pg_pool_info *pi;
1709 u32 pps; 2056 u32 pps;
1710 int len;
1711 2057
1712 pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); 2058 pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
1713 if (!pool) { 2059 if (!pi) {
1714 *primary = -1; 2060 ceph_osds_init(up);
1715 return -ENOENT; 2061 ceph_osds_init(acting);
2062 goto out;
1716 } 2063 }
1717 2064
1718 if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { 2065 pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps);
1719 /* hash pool id and seed so that pool PGs do not overlap */ 2066 raw_to_up_osds(osdmap, pi, up);
1720 pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, 2067 apply_primary_affinity(osdmap, pi, pps, up);
1721 ceph_stable_mod(pgid.seed, pool->pgp_num, 2068 get_temp_osds(osdmap, pi, raw_pgid, acting);
1722 pool->pgp_num_mask), 2069 if (!acting->size) {
1723 pgid.pool); 2070 memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0]));
1724 } else { 2071 acting->size = up->size;
1725 /* 2072 if (acting->primary == -1)
1726 * legacy behavior: add ps and pool together. this is 2073 acting->primary = up->primary;
1727 * not a great approach because the PGs from each pool
1728 * will overlap on top of each other: 0.5 == 1.4 ==
1729 * 2.3 == ...
1730 */
1731 pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
1732 pool->pgp_num_mask) +
1733 (unsigned)pgid.pool;
1734 }
1735
1736 len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds);
1737 if (len < 0) {
1738 *primary = -1;
1739 return len;
1740 } 2074 }
1741 2075out:
1742 len = raw_to_up_osds(osdmap, pool, osds, len, primary); 2076 WARN_ON(!osds_valid(up) || !osds_valid(acting));
1743
1744 apply_primary_affinity(osdmap, pps, pool, osds, len, primary);
1745
1746 len = apply_temps(osdmap, pool, pgid, osds, len, primary);
1747
1748 return len;
1749} 2077}
1750 2078
1751/* 2079/*
1752 * Return primary osd for given pgid, or -1 if none. 2080 * Return acting primary for given PG, or -1 if none.
1753 */ 2081 */
1754int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) 2082int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
2083 const struct ceph_pg *raw_pgid)
1755{ 2084{
1756 int osds[CEPH_PG_MAX_SIZE]; 2085 struct ceph_osds up, acting;
1757 int primary;
1758
1759 ceph_calc_pg_acting(osdmap, pgid, osds, &primary);
1760 2086
1761 return primary; 2087 ceph_pg_to_up_acting_osds(osdmap, raw_pgid, &up, &acting);
2088 return acting.primary;
1762} 2089}
1763EXPORT_SYMBOL(ceph_calc_pg_primary); 2090EXPORT_SYMBOL(ceph_pg_to_acting_primary);
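A sketch dumping both sets for one raw PG, following the same call sequence as ceph_pg_to_acting_primary() above (names are illustrative):

static void dump_pg_mapping(struct ceph_osdmap *map,
			    const struct ceph_pg *raw_pgid)
{
	struct ceph_osds up, acting;

	ceph_pg_to_up_acting_osds(map, raw_pgid, &up, &acting);

	/* acting is used for mapping; up for interval-change detection */
	pr_info("up.primary %d acting.primary %d acting.size %d\n",
		up.primary, acting.primary, acting.size);
}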